• 利用 Python 将两张表连接




    from pyspark.sql import SparkSession
    from pyspark.sql.types import *
    import os


    def getUser(spark,path):
        """Load the user file into a DataFrame and register it as a temp view.

        The file is read as a space-separated CSV with a header row, using an
        explicit three-column schema instead of schema inference.

        Args:
            spark: The active SparkSession used for reading and for SQL.
            path: Location of the user file.

        Returns:
            A DataFrame with every row of the ``users1`` temporary view.
        """
        # Explicit schema: two nullable string columns and a nullable integer id.
        # The column names are runtime identifiers and are kept as-is.
        struct1 = StructType([
            StructField("user", StringType(), True),
            StructField("vedios", StringType(), True),
            StructField("id", IntegerType(), True),
        ])
        df = spark.read.csv(path, schema=struct1, sep=" ", header=True)
        # Registering the view is a side effect: later spark.sql() calls can
        # refer to the data by the name "users1".
        df.createOrReplaceTempView("users1")
        df = spark.sql("select * from users1")
        return df


    def getMovies(spark,path):
        """Load the movies file into a DataFrame and register it as a temp view.

        The file is read as a CSV with a header row. No schema or separator is
        passed, so the reader's defaults apply.

        Args:
            spark: The active SparkSession used for reading and for SQL.
            path: Location of the movies file.

        Returns:
            A DataFrame with every row of the ``movies`` temporary view.
        """
        df = spark.read.csv(path, header=True)
        # Registering the view is a side effect: later spark.sql() calls can
        # refer to the data by the name "movies".
        df.createOrReplaceTempView("movies")
        df = spark.sql("select * from movies ")
        return df


    if __name__ == '__main__':
        # Machine-specific JDK location. A raw string keeps the Windows path
        # separators intact; without them Spark cannot locate the JVM.
        # NOTE(review): path reconstructed from a value whose backslashes were
        # lost -- confirm it matches the local JDK install.
        os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk1.8.0_211'
        # Debug output carried over from the original script: prints the
        # os.path module object.
        print(os.path)
        # Parentheses let the builder chain span several lines without
        # backslash continuations.
        spark = (
            SparkSession
            .builder
            .appName("Python Spark SQL basic example")
            .config("spark.some.config.option", "some-value")
            .getOrCreate()
        )
        try:
            path_user = "C:/Users/Administrator/Desktop/guiliVideo/user/2008/0903/user.txt"
            path_movies = "C:/Users/Administrator/Desktop/vedios.txt"
            df1 = getUser(spark, path_user)
            df2 = getMovies(spark, path_movies)
            # Inner join: keep only users that appear as an uploader in the
            # movies data (the movies file must provide an "uploader" column).
            df3 = df1.join(df2, df1.user == df2.uploader, how='inner')
            df3.createOrReplaceTempView('table1')
            df4 = spark.sql('select * from table1 limit 10')
            df4.show()
        finally:
            # Release the session's resources even if a read or the join fails.
            spark.stop()
     
    ---------------------

  • 相关阅读:
    Unity Technologies-提供全面的技术支持服务
    Unity 大中华区核心业务
    帕斯卡(pascal)命名法:
    骆驼命名法
    匈牙利命名法
    软件分类
    模型规范
    命名规范
    22. Generate Parentheses 生成括号
    421. Maximum XOR of Two Numbers in an Array 数组中两个数的最大异或
  • 原文地址:https://www.cnblogs.com/ly570/p/11357427.html
Copyright © 2020-2023  润新知