0.前次作业:从文件创建DataFrame
1.pandas df 与 spark df的相互转换 df_s=spark.createDataFrame(df_p) df_p=df_s.toPandas()
# 从数组创建pandas dataframe
import pandas as pd
import numpy as np
arr = np.arange(6).reshape(-1,3)
arr
df_p = pd.DataFrame(arr)
df_p
df_p.columns = ['a','b','c']
df_p
# pandas df 转为spark df
df_s = spark.createDataFrame(df_p)
df_s.show()
df_s.collect()
# spark df 转为pandas df
df_s.show()
df_s.toPandas()
2. Spark与Pandas中DataFrame对比
http://www.lining0806.com/spark%E4%B8%8Epandas%E4%B8%ADdataframe%E5%AF%B9%E6%AF%94/
3.1 利用反射机制推断RDD模式
spark.sparkContext.textFile("file:///D:/Spark/spark-2.4.7-bin-hadoop2.7/examples/src/main/resources/people.txt").first()
spark.sparkContext.textFile("file:///D:/Spark/spark-2.4.7-bin-hadoop2.7/examples/src/main/resources/people.txt")\
.map(lambda line:line.split(',')).first()
from pyspark.sql import Row
people = spark.sparkContext.textFile("file:///D:/Spark/spark-2.4.7-bin-hadoop2.7/examples/src/main/resources/people.txt")\
.map(lambda line:line.split(','))\
.map(lambda p:Row(name=p[0],age=int(p[1])))
schemaPeople = spark.createDataFrame(people)
schemaPeople.show()
schemaPeople.printSchema()
3.2 使用编程方式定义RDD模式
生成“表头”
from pyspark.sql.types import StringType,StructField,StructType
from pyspark.sql import Row
#生成“表头”
schemaString = "name age"
fields = [StructField(field_name,StringType(),True) for field_name in schemaString.split(" ")]
schema = StructType(fields)
生成“表中的记录”
# 生成“表中的记录”
lines = spark.sparkContext.textFile("file:///D:/Spark/spark-2.4.7-bin-hadoop2.7/examples/src/main/resources/people.txt")
parts = lines.map(lambda x:x.split(","))
people = parts.map(lambda p:Row(p[0],p[1].strip()))
people.collect()
把“表头”和“表中的记录”拼装在一起
# 把“表头”和“表中的记录”拼接在一起
schemaPeople = spark.createDataFrame(people,schema)
schemaPeople.show()
schemaPeople.printSchema()
4. DataFrame保存为文件
df.write.json(dir)
schemaPeople.write.json("file:///D:/Demo/schemaPeople")
预练习:
读 学生课程分数文件chapter4-data01.txt,创建DataFrame。并尝试用DataFrame的操作完成实验三的数据分析要求。
1.利用反射机制推断RDD模式
from pyspark.sql import Row
people = spark.sparkContext.textFile("file:///D:/chapter4-data01.txt")\
.map(lambda line:line.split(','))\
.map(lambda p:Row(name=p[0],course=p[1],score=int(p[2])))
df = spark.createDataFrame(people)
people
df
people.first()
df.show()
df.printSchema()
2.使用编程方式定义RDD模式
url = "file:///D:/chapter4-data01.txt"
rdd = sc.textFile(url).map(lambda line:line.split(','))
rdd.take(3)
from pyspark.sql.types import IntegerType,StringType,StructField,StructType
from pyspark.sql import Row
#生成“表头”
schemaString = "name course score"
fields = [StructField(field_name,StringType(),True) for field_name in schemaString.split(" ")]
schema = StructType(fields)
fields
schema
# 生成“表中的记录”
lines = spark.sparkContext.textFile("file:///D:/chapter4-data01.txt")
parts = lines.map(lambda x:x.split(","))
people = parts.map(lambda p:Row(p[0],p[1],p[2].strip()))
people.collect()
# 把“表头”和“表中的记录”拼接在一起
schemaPeople = spark.createDataFrame(people,schema)
schemaPeople.show()
schemaPeople.printSchema()
手机扫一扫
移动阅读更方便
你可能感兴趣的文章