[ Skip to the content ]

Institute of Formal and Applied Linguistics Wiki


[ Back to the navigation ]

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
Next revision Both sides next revision
spark:spark-introduction [2014/11/03 20:31]
straka
spark:spark-introduction [2014/11/03 20:37]
straka
Line 70: Line 70:
     return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1]     return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1]
  
-lines = sc.textFile("/net/projects/hadoop/examples/inputs/points-small/points.txt")
+lines = sc.textFile("/net/projects/hadoop/examples/inputs/points-small/points.txt", sc.defaultParallelism)
 data = lines.map(lambda line: np.array([float(x) for x in line.split()])).cache() data = lines.map(lambda line: np.array([float(x) for x in line.split()])).cache()
  
Line 76: Line 76:
 epsilon = 1e-3 epsilon = 1e-3
  
-centers = data.takeSample(False,     # Sample K random points
+centers = data.takeSample(False, K)       # Sample K random points
 for i in range(5):                        # Perform 5 iterations for i in range(5):                        # Perform 5 iterations
     old_centers = sc.broadcast(centers)     old_centers = sc.broadcast(centers)
Line 89: Line 89:
                .map(lambda (index, (sum, count)): sum / count)                .map(lambda (index, (sum, count)): sum / count)
                .collect())                .collect())
- 
     # If the change in center positions is less than epsilon, stop.     # If the change in center positions is less than epsilon, stop.
     centers_change = sum(np.sqrt(np.sum((a - b)**2)) for (a, b) in zip(centers, old_centers.value))     centers_change = sum(np.sqrt(np.sum((a - b)**2)) for (a, b) in zip(centers, old_centers.value))
Line 112: Line 111:
   centers.map(center => (center-point).norm(2)).zipWithIndex.min._2   centers.map(center => (center-point).norm(2)).zipWithIndex.min._2
  
-val lines = sc.textFile("/net/projects/hadoop/examples/inputs/points-small/points.txt")
+val lines = sc.textFile("/net/projects/hadoop/examples/inputs/points-small/points.txt", sc.defaultParallelism)
 val data = lines.map(line => Vector(line.split("\\s+").map(_.toDouble))).cache() val data = lines.map(line => Vector(line.split("\\s+").map(_.toDouble))).cache()
  

[ Back to the navigation ] [ Back to the content ]