[Week 13 - Day 3] Spark
Spark 2
Spark SQL commands uploaded
# Hands-on practice
# Register the DataFrame as a temporary view and query it with Spark SQL
namegender_df.createOrReplaceTempView("namegender")
namegender_group_df = spark.sql("SELECT gender, count(1) FROM namegender GROUP BY 1")
namegender_group_df.collect()
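For comparison, the same aggregation can also be written with the DataFrame API instead of SQL. This is a minimal sketch, assuming the namegender_df defined above:

# Same group-by count via the DataFrame API instead of Spark SQL
from pyspark.sql import functions as F

namegender_group_df2 = namegender_df.groupBy("gender").agg(F.count("*").alias("count"))
namegender_group_df2.show()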
# Connect to Redshift and load its tables into DataFrames
df_user_session_channel = spark.read \
    .format("jdbc") \
    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
    .option("url", "jdbc:redshift://learnde.cduaw970ssvt.ap-northeast-2.redshift.amazonaws.com:5439/dev?user=guest&password=Guest1234") \
    .option("dbtable", "raw_data.user_session_channel") \
    .load()
df_user_session_channel.createOrReplaceTempView("user_session_channel")
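The query below also joins session_timestamp, so that table has to be loaded and registered as a view in the same way. A sketch, assuming the table is raw_data.session_timestamp on the same Redshift cluster:

# Load session_timestamp over the same JDBC connection and register it as a temp view
# (assumption: the table name is raw_data.session_timestamp)
df_session_timestamp = spark.read \
    .format("jdbc") \
    .option("driver", "com.amazon.redshift.jdbc42.Driver") \
    .option("url", "jdbc:redshift://learnde.cduaw970ssvt.ap-northeast-2.redshift.amazonaws.com:5439/dev?user=guest&password=Guest1234") \
    .option("dbtable", "raw_data.session_timestamp") \
    .load()
df_session_timestamp.createOrReplaceTempView("session_timestamp")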
channel_count_df = spark.sql("""
SELECT channel, count(distinct userId) uniqueUsers
FROM session_timestamp st
JOIN user_session_channel usc ON st.sessionID = usc.sessionID
GROUP BY 1
ORDER BY 1
""")