这是一个令人难以置信的 hacky 答案,我欢迎任何更好的解决方案
所以,我们从这张图片开始:
我阅读了这个文件并使用以下方法获取所有白色像素:
from PIL import Image
im = Image.open('image.png')
pix = im.load()
width,height = im.size
values = []
for w in range(width):
for h in range(height):
if sum(pix[w, h]) > 255 * 1.5:
values.append((width - w, height - h))
X = [[i[0]] for i in values]
y = [[i[1]] for i in values]
分散这个结果:
import matplotlib.pyplot as plt
plt.scatter(X, y)
plt.show()
现在我为另一个项目写了很多代码,我不打算解释它是如何工作的,但是你可以找到原始代码here:
from sklearn.linear_model import LinearRegression
from numpy import median, array
class DecisionTreeError(Exception):
pass
class DecisionTreeLinearNodes:
def __init__(self):
self.decisions = None
def fit(self, X, y, min_R2):
self.decisions = {}
X = array(X)
y = array(y)
self._fit(X=X, y=y, min_R2=min_R2, decisions=[])
def _fit(self, X, y, min_R2, decisions):
clf = LinearRegression()
clf.fit(X, y)
if clf.score(X, y) > min_R2 or len(X) <= 3:
self._add_decisions(decisions, (X, y))
else:
try:
x_1, x_2, y_1, y_2, split_param = self._split(X, y)
if min(len(x_1), len(x_2)) == 0:
raise DecisionTreeError
print(split_param, len(x_1), len(x_2))
self._fit(X=x_1, y=y_1, min_R2=min_R2, decisions=decisions + [(split_param, True)])
self._fit(X=x_2, y=y_2, min_R2=min_R2, decisions=decisions + [(split_param, False)])
except DecisionTreeError:
print("Couldn't find a better fit")
self._add_decisions(decisions, clf)
def _split(self, X, y):
best_r2 = 0
best_param = None
best_val = None
for split_param in range(len(X[0])):
lst = X[:, split_param]
val = self._get_val(X, y, lst)
if val is None:
continue
choices = lst >= val
not_choices = lst < val
x_1 = X[choices]
y_1 = y[choices]
x_2 = X[not_choices]
y_2 = y[not_choices]
if len(x_1) > 0:
l1 = LinearRegression()
l1.fit(x_1, y_1)
acc1 = l1.score(x_1, y_1)
else:
acc1 = 0
if len(x_2) > 0:
l2 = LinearRegression()
l2.fit(x_2, y_2)
acc2 = l2.score(x_2, y_2)
else:
acc2 = 0
acc = max(acc1, acc2)
if acc > best_r2:
best_r2 = acc
best_param = split_param
best_val = val
if best_param is None:
raise DecisionTreeError("No param fit")
indexes = X[:, best_param] >= best_val
indexes_false = X[:, best_param] < best_val
return X[indexes], X[indexes_false], y[indexes], y[indexes_false], (best_param, best_val)
def _get_val(self, X, y, lst):
if len(set(lst)) > 2:
val_guess_1 = median(lst)
choices = lst >= val_guess_1
not_choices = lst < val_guess_1
x_1 = X[choices]
y_1 = y[choices]
x_2 = X[not_choices]
y_2 = y[not_choices]
l1 = LinearRegression()
l1.fit(x_1, y_1)
l2 = LinearRegression()
l2.fit(x_2, y_2)
predictions1 = l1.predict(X)
predictions2 = l2.predict(X)
errors1 = (y - predictions1) ** 2
errors2 = (y - predictions2) ** 2
prefered_choice = [0 if errors1[i] < errors2[i] else 1 for i in range(len(errors1))]
lst_vals = sorted(list(zip(lst, prefered_choice)))
total_1s = sum(prefered_choice)
total_0s = len(prefered_choice) - total_1s
seen_0s = 0
seen_1s = 0
val = 0
best_score = 0
for i in lst_vals:
if i[1] == 0:
seen_0s += 1
else:
seen_1s += 1
score = seen_1s + total_0s - seen_0s
if score > best_score:
best_score = score
val = i[0]
if val == min(lst) or max(lst):
return val_guess_1
return val
elif len(set(lst)) == 2:
return (min(lst) + max(lst)) * 0.5
else:
return None
def _add_decisions(self, decisions, clf):
decisions_dict = self.decisions
for decision, bool in decisions[:-1]:
if decision not in decisions_dict:
decisions_dict[decision] = {True: {}, False: {}}
decisions_dict = decisions_dict[decision][bool]
if len(decisions) > 0:
decision, bool = decisions[-1]
if decision not in decisions_dict:
decisions_dict[decision] = {True: {}, False: {}}
decisions_dict[decision][bool] = clf
else:
self.decisions = clf
现在,使用它我们可以将代码分组为直线:
decision_tree = DecisionTreeLinearNodes()
decision_tree.fit(X, y, min_R2=0.6)
我使用 0.6 是因为这似乎可行,您可能需要对此进行调整。为了获得树分组它们所在的组,我使用了:
def flatten_array(array):
x = [i[0] for i in array[0]]
y = [i[0] for i in array[1]]
return sorted(list(zip(x, y)))
def get_arrays(decision_tree):
arrays = []
for j in decision_tree.values():
for i in [True, False]:
if isinstance(j[i], tuple):
arrays.append(flatten_array(j[i]))
else:
arrays += get_arrays(j[i])
return arrays
arrays = get_arrays(decision_tree.decisions)
我警告过你,这太骇人听闻了 ;)
现在我们可以获得描述每个组的点列表:
points = []
for i in arrays:
points.append(i[0])
points.append(i[-1])
但是,如果您这样做,您将在一组开始而另一组结束时获得“额外”分数。您可以使用(又一个 hacky 解决方案)删除这些:
def get_distance(point1, point2):
return ((point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2)**0.5
def remove_unnecesary_points(points, min_distance):
to_delete = []
for i in range(len(points)):
for j in range(i+1, len(points)):
distance = get_distance(points[i], points[j])
if distance <= min_distance:
to_delete.append(j)
return [points[i] for i in range(len(points)) if i not in to_delete]
points = remove_unnecesary_points(points, 100)
我使用 100 是因为 80 似乎是您线条的粗细,所以对于最坏的情况应该足够了。现在绘制这个:
for i in arrays:
plt.scatter([j[0] for j in i], [j[1] for j in i])
plt.scatter([i[0] for i in points], [i[1] for i in points])
plt.show()
结果:
这里橙色是一组,蓝色是另一组,绿点表示三个点。要最终获得所需的 number 分,请使用:
print(len(points))
>>> 3