[ Skip to the content ]

Institute of Formal and Applied Linguistics Wiki


[ Back to the navigation ]

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision Both sides next revision
spark:spark-introduction [2014/11/03 20:31]
straka
spark:spark-introduction [2014/11/03 20:35]
straka
Line 70: Line 70:
     return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1]     return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1]
  
-lines = sc.textFile("/net/projects/hadoop/examples/inputs/points-small/points.txt")
+lines = sc.textFile("/net/projects/hadoop/examples/inputs/points-small/points.txt", sc.defaultParallelism)
 data = lines.map(lambda line: np.array([float(x) for x in line.split()])).cache() data = lines.map(lambda line: np.array([float(x) for x in line.split()])).cache()
  
Line 76: Line 76:
 epsilon = 1e-3 epsilon = 1e-3
  
-centers = data.takeSample(False,     # Sample K random points
+centers = data.takeSample(False, K)       # Sample K random points
 for i in range(5):                        # Perform 5 iterations for i in range(5):                        # Perform 5 iterations
     old_centers = sc.broadcast(centers)     old_centers = sc.broadcast(centers)
Line 89: Line 89:
                .map(lambda (index, (sum, count)): sum / count)                .map(lambda (index, (sum, count)): sum / count)
                .collect())                .collect())
- 
     # If the change in center positions is less than epsilon, stop.     # If the change in center positions is less than epsilon, stop.
     centers_change = sum(np.sqrt(np.sum((a - b)**2)) for (a, b) in zip(centers, old_centers.value))     centers_change = sum(np.sqrt(np.sum((a - b)**2)) for (a, b) in zip(centers, old_centers.value))

[ Back to the navigation ] [ Back to the content ]