在 IPython Notebook 下運(yùn)行 pyspark
from pyspark import SparkConf, SparkContext
appName = 'testSpark'
def main(sc):
pass
if __name__ == '__main__':
#Configure Spark
conf = SparkConf().setAppName(appName).setMaster('local[2]')
# sc.stop()
sc = SparkContext(conf=conf)
print sc.version
main(sc)
2.0.2
在瀏覽器輸入 ip:4040 進(jìn)入到 Spark 的任務(wù) UI 界面,查看各任務(wù)的信息
參數(shù) preservesPartitioning 表示是否保留父 RDD 的 partitioner 分區(qū)信息
map
map(f, preservesPartitioning=False)
Return a new RDD by applying a function to each element of this RDD.
# map: apply a function to every element; exactly one output per input.
x = sc.parallelize([1,2,3,4])
# pair each element with its cube
y = x.map(lambda x:(x, x**3))
print y.collect()
[(1, 1), (2, 8), (3, 27), (4, 64)]
flatMap
flatMap(f, preservesPartitioning=False)
Return a new RDD by first applying a function to all elements of this RDD, and then flattening the results.
# flatMap: f returns an iterable per element; the results are flattened
# into a single RDD.  Reuses `x` from the map example above.
z = x.flatMap(lambda x: (x, 100*x, x**2))
print z.collect()
[1, 100, 1, 2, 200, 4, 3, 300, 9, 4, 400, 16]
glom
glom()
Return an RDD created by coalescing all elements within each partition into a list.
# glom: coalesce each partition's elements into a list (2 partitions here).
rdd = sc.parallelize([1, 2, 3, 4], 2)
print sorted(rdd.glom().collect())
[[1, 2], [3, 4]]
mapPartitions
mapPartitions(f, preservesPartitioning=False)
Return a new RDD by applying a function to each partition of this RDD.
x = sc.parallelize([1,2,3,4], 2)
def f(iter):
yield sum(iter)
y = x.mapPartitions(f)
# glom() flattens elements on the same partition
print 'x原來分區(qū)信息:{0}'.format(x.glom().collect())
print 'x經(jīng)過f計(jì)算后的結(jié)果:{}'.format(y.glom().collect())
x原來分區(qū)信息:[[1, 2], [3, 4]]
x經(jīng)過f計(jì)算后的結(jié)果:[[3], [7]]
mapPartitionsWithIndex
mapPartitionsWithIndex(f, preservesPartitioning=False)
Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition.
# mapPartitionsWithIndex: f gets (partition index, iterator over elements).
x = sc.parallelize([1, 2, 3, 4], 2)
# emit one (index, sum-of-partition) pair per partition
def f(splitIndex, iterator): yield (splitIndex, sum(iterator))
y = x.mapPartitionsWithIndex(f)
print 'x原來分區(qū)信息:{0}'.format(x.glom().collect())
print 'x經(jīng)過f計(jì)算后的結(jié)果:{}'.format(y.glom().collect())
x原來分區(qū)信息:[[1, 2], [3, 4]]
x經(jīng)過f計(jì)算后的結(jié)果:[[(0, 3)], [(1, 7)]]
getNumPartitions
getNumPartitions()
Returns the number of partitions in RDD
# getNumPartitions: report how many partitions the RDD has (2 requested).
rdd = sc.parallelize([1, 2, 3, 4], 2)
print '分區(qū)有{}個(gè)'.format(rdd.getNumPartitions())
分區(qū)有2個(gè)
filter
filter(f)
Return a new RDD containing only the elements that satisfy a predicate.
# filter: keep only elements satisfying the predicate (even numbers here).
rdd = sc.parallelize([1, 2, 3, 4, 5])
res = rdd.filter(lambda x: x % 2 == 0).collect()
print '符合條件的數(shù)據(jù)是:{}'.format(res)
符合條件的數(shù)據(jù)是:[2, 4]
distinct
distinct(numPartitions=None)
Return a new RDD containing the distinct elements in this RDD.
# distinct: deduplicate; sorted() gives a deterministic display order.
res = sorted(sc.parallelize([1, 1, 1, 2, 3, 2, 3]).distinct().collect())
print '去重后的結(jié)果:{}'.format(res)
去重后的結(jié)果:[1, 2, 3]
sample
sample(withReplacement, fraction, seed=None)
Return a sampled subset of this RDD.
Parameters:
withReplacement – can elements be sampled multiple times (replaced when sampled out)
fraction – expected size of the sample as a fraction of this RDD's size. Without replacement: the probability that each element is chosen; fraction must be in [0, 1]. With replacement: the expected number of times each element is chosen; fraction must be >= 0.
seed – seed for the random number generator
rdd = sc.parallelize(range(7), 2)
samList = [rdd.sample(False, 0.5) for i in range(5)]
print 'rdd.collect()的值是{}'.format(rdd.collect())
for index, d in zip(range(len(samList)), samList):
print 'sample: {0} y = {1}'.format(index, d.collect())
takeSample
takeSample(withReplacement, num, seed=None)
Return a fixed-size sampled subset of this RDD.
Note that this method should only be used if the resulting array is expected to be small, as all the data is loaded into the driver’s memory.
rdd = sc.parallelize(range(15), 2)
samList = [rdd.takeSample(False, 4) for i in range(5)]
print 'rdd.collect()的值是{}'.format(rdd.glom().collect())
for index, d in zip(range(len(samList)), samList):
print 'sample: {0} y = {1}'.format(index, d)
rdd.collect()的值是[[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12, 13, 14]]
sample: 0 y = [8, 9, 7, 2]
sample: 1 y = [12, 1, 10, 4]
sample: 2 y = [8, 12, 2, 6]
sample: 3 y = [9, 8, 12, 14]
sample: 4 y = [10, 4, 8, 2]
union
union(other)
Return the union of this RDD and another one.
# union: concatenation of both RDDs; duplicates are kept.
rdd = sc.parallelize([1, 1, 2, 3])
rdd1 = sc.parallelize([5, 3, 4, 6])
print rdd.union(rdd1).collect()
[1, 1, 2, 3, 5, 3, 4, 6]
intersection
intersection(other)
Return the intersection of this RDD and another one. The output will not contain any duplicate elements, even if the input RDDs did.
Note that this method performs a shuffle internally.
# intersection: common elements only, deduplicated; shuffles internally.
rdd = sc.parallelize([1, 1, 2, 3])
rdd1 = sc.parallelize([5, 3, 4, 6])
print rdd.intersection(rdd1).collect()
[3]
sortByKey
sortByKey(ascending=True, numPartitions=None, keyfunc=func)
Sorts this RDD, which is assumed to consist of (key, value) pairs.
# sortByKey: sort (key, value) pairs by key; the numPartitions argument
# controls how the sorted output is split (range-partitioned by key).
tmp = [('a', 1), ('f', 2), ('d', 3), ('c', 4), ('b', 5)]
rdd = sc.parallelize(tmp, 2)
print rdd.glom().collect()
# ascending sort into 1 partition vs. 3 partitions
sort1 = rdd.sortByKey(True,1).glom().collect()
sort2 = rdd.sortByKey(True,3).glom().collect()
print sort1
print sort2
[[('a', 1), ('f', 2)], [('d', 3), ('c', 4), ('b', 5)]]
[[('a', 1), ('b', 5), ('c', 4), ('d', 3), ('f', 2)]]
[[('a', 1), ('b', 5)], [('c', 4), ('d', 3)], [('f', 2)]]