Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision | Next revision Both sides next revision | ||
spark:spark-introduction [2014/11/03 18:23] straka |
spark:spark-introduction [2014/11/03 19:00] straka |
||
---|---|---|---|
Line 64: | Line 64: | ||
===== K-Means Example ===== | ===== K-Means Example ===== | ||
To show an example of an iterative algorithm, consider [[http:// | To show an example of an iterative algorithm, consider [[http:// | ||
+ | <file python> | ||
+ | import numpy as np | ||
+ | |||
+ | def closestPoint(point, | ||
+ | return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1] | ||
+ | |||
+ | lines = sc.textFile("/ | ||
+ | data = lines.map(lambda line: np.array([float(x) for x in line.split()])).cache() | ||
+ | |||
+ | K = 50 | ||
+ | epsilon = 1e-3 | ||
+ | |||
+ | centers = data.takeSample(False, | ||
+ | for i in range(5): | ||
+ | old_centers = sc.broadcast(centers) | ||
+ | centers = (data | ||
+ | # For each point, find its closest center index. | ||
+ | | ||
+ | # Sum points and counts in each cluster. | ||
+ | | ||
+ | # Sort by cluster index. | ||
+ | | ||
+ | # Compute the new centers by averaging points in clusters. | ||
+ | | ||
+ | | ||
+ | |||
+ | # If the change in center positions is less than epsilon, stop. | ||
+ | centers_change = sum(np.sqrt(np.sum((a - b)**2)) for (a, b) in zip(centers, | ||
+ | old_centers.unpersist() | ||
+ | if centers_change < epsilon: | ||
+ | break | ||
+ | |||
+ | print "Final centers: " + str(centers) | ||
+ | </ | ||
+ | The implementation starts by loading the data and caching them in memory using '' | ||
+ |