Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Next revision Both sides next revision | ||
spark:spark-introduction [2022/12/14 12:34] straka [Word Count Example] |
spark:spark-introduction [2022/12/14 12:42] straka [K-Means Example] |
||
---|---|---|---|
Line 43: | Line 43: | ||
The output of ' | The output of ' | ||
- | Note that 'map' and ' | + | Note that '' |
The Scala version is quite similar: | The Scala version is quite similar: | ||
<file scala> | <file scala> | ||
- | val wiki = sc.textFile("/ | + | val wiki = sc.textFile("/ |
val words = wiki.flatMap(line => line.split(" | val words = wiki.flatMap(line => line.split(" | ||
- | val counts = words.map(word => (word, | + | val counts = words.map(word => (word, 1)).reduceByKey((c1, |
val sorted = counts.sortBy({case (word, count) => count}, ascending=false) | val sorted = counts.sortBy({case (word, count) => count}, ascending=false) | ||
sorted.saveAsTextFile(" | sorted.saveAsTextFile(" | ||
// Alternatively without variables and using placeholders in lambda parameters: | // Alternatively without variables and using placeholders in lambda parameters: | ||
- | (sc.textFile("/ | + | (sc.textFile("/ |
| | ||
| | ||
Line 63: | Line 63: | ||
===== K-Means Example ===== | ===== K-Means Example ===== | ||
- | An example implementing [[http:// | + | An example implementing [[http:// |
<file python> | <file python> | ||
import numpy as np | import numpy as np | ||
Line 70: | Line 70: | ||
return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1] | return min((np.sum((point - centers[i]) ** 2), i) for i in range(len(centers)))[1] | ||
- | lines = sc.textFile("/ | + | lines = sc.textFile("/ |
data = lines.map(lambda line: np.array(map(float, | data = lines.map(lambda line: np.array(map(float, | ||
- | K = 50 | + | K = 100 |
epsilon = 1e-3 | epsilon = 1e-3 | ||
Line 111: | Line 111: | ||
centers.map(center => (center-point).norm(2)).zipWithIndex.min._2 | centers.map(center => (center-point).norm(2)).zipWithIndex.min._2 | ||
- | val lines = sc.textFile("/ | + | val lines = sc.textFile("/ |
val data = lines.map(line => Vector(line.split(" | val data = lines.map(line => Vector(line.split(" | ||
- | val K = 50 | + | val K = 100 |
val epsilon = 1e-3 | val epsilon = 1e-3 | ||