• titlesplit


    /**
    * Created by lkl on 2017/6/26.
    *///spark-shell --driver-class-path /home/hadoop/test/mysqljdbc.jar
    import java.sql.{DriverManager, ResultSet}
    import org.apache.spark.SparkContext
    import org.apache.spark.SparkConf
    import java.util.Date
    /**
      * Spark job that loads the current day's `Crawler_Common_WebPageNews` JSON logs from HDFS,
      * splits every record's `title` column on `|`, splits each resulting segment on a single
      * space, and writes one row into the MySQL table `titlesplit` for every segment that has
      * exactly four space-separated fields.
      *
      * Rows are written with one JDBC connection and one prepared statement per partition; both
      * are closed when the partition has been processed. A failing insert is reported with a
      * stack trace and does not stop the remaining rows.
      */
    object titlesplit {

      import scala.util.control.NonFatal

      // NOTE(review): the JDBC URL embeds the database user and password in source;
      // consider moving them to job configuration.
      val rl= "jdbc:mysql://10.19.65.17:54321/emotion?user=emotion&password=qingxu&useUnicode=true&characterEncoding=utf8&autoReconnect=true&failOverReadOnly=false"

      // Load the MySQL driver class explicitly so it registers itself with DriverManager.
      // (A bare `classOf[...]` expression does not initialize the class.)
      Class.forName("com.mysql.jdbc.Driver")

      // Lazy so that merely loading this object (which also happens on every executor JVM)
      // does not open a connection; they are only created if `insert`/`statement` is used.
      lazy val conn: java.sql.Connection = DriverManager.getConnection(rl)
      lazy val statement: java.sql.Statement =
        conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE)

      /** Parameterized insert shared by [[insert]] and the per-partition writer in [[main]]. */
      private val InsertSql =
        "INSERT INTO titlesplit(innserSessionid,times,channelType,sourcetitle,title,words,characters,refer,role,Nowtime) VALUES (?,?,?,?,?,?,?,?,?,?) "

      /** Regex matching a literal `|`; the pipe must be escaped because `split` takes a regex. */
      private val SegmentSeparator = "\\|"

      /** Opens a fresh JDBC connection to the database addressed by [[rl]]. */
      private def openConnection(): java.sql.Connection = DriverManager.getConnection(rl)

      /**
        * Current time as whole seconds since the epoch.
        *
        * Epoch milliseconds do not fit in an `Int` (`getTime.toInt` silently truncates to the
        * low 32 bits), so seconds are stored instead.
        */
      private def epochSeconds(): Int = (System.currentTimeMillis() / 1000L).toInt

      /**
        * Binds the nine string columns followed by the integer `Nowtime` column to `prep`
        * and executes it.
        *
        * @param prep    statement prepared from [[InsertSql]]
        * @param strings values for placeholders 1..9, in column order
        * @param nowtime value for placeholder 10
        */
      private def bindAndExecute(prep: java.sql.PreparedStatement, strings: Seq[String], nowtime: Int): Unit = {
        strings.zipWithIndex.foreach { case (value, idx) => prep.setString(idx + 1, value) }
        prep.setInt(strings.size + 1, nowtime)
        prep.executeUpdate()
      }

      /**
        * Job entry point.
        *
        * @param args command-line arguments (unused)
        */
      def main(args: Array[String]): Unit = {
        val conf = new SparkConf()
        // For a local run: new SparkConf().setAppName("test").setMaster("local")
        val sc = new SparkContext(conf)
        try {
          val sqlContext = new org.apache.spark.sql.SQLContext(sc)

          // The input directory is partitioned by date as yyyy/MM/dd.
          val format2 = new java.text.SimpleDateFormat("yyyy/MM/dd")
          val dat = format2.format(new Date())
          val st = sqlContext.read.json("hdfs://ns1/user/datacenter/home/datacenter/datacollect/logs/dataplatform/Crawler/Crawler_Common_WebPageNews/" + dat + "/*.gz")
          st.toDF().registerTempTable("job")

          val ed = sqlContext.sql("select `innerSessionId`,SUBSTR(`time`,1,10) AS time,`channelType`,`sourcetitle`,`title` from job")

          // One connection and one prepared statement per partition, created on the executor
          // that processes it and always released afterwards.
          ed.rdd.foreachPartition { rows =>
            val partitionConn = openConnection()
            try {
              val prep = partitionConn.prepareStatement(InsertSql)
              try {
                rows.foreach { row =>
                  val title = row.getString(4)
                  // A record without a title has nothing to split.
                  if (title != null) {
                    val sessionId = row.getString(0)
                    // JDBC maps a null string to SQL NULL, so a missing time is passed through.
                    val time = Option(row.get(1)).map(_.toString).orNull
                    val channelType = row.getString(2)
                    val sourceTitle = row.getString(3)

                    title.split(SegmentSeparator).foreach { segment =>
                      val fields = segment.split(" ")
                      // Only segments of the form "words characters refer role" are stored.
                      if (fields.length == 4) {
                        try {
                          bindAndExecute(
                            prep,
                            Seq(sessionId, time, channelType, sourceTitle, title,
                              fields(0), fields(1), fields(2), fields(3)),
                            epochSeconds())
                        } catch {
                          // Best effort: report the failed row and continue with the next one.
                          case NonFatal(e) => e.printStackTrace()
                        }
                      }
                    }
                  }
                }
              } finally {
                prep.close()
              }
            } finally {
              partitionConn.close()
            }
          }
        } finally {
          sc.stop()
        }
      }

      /**
        * Inserts a single row into `titlesplit` using the shared [[conn]].
        *
        * Any non-fatal failure is printed and swallowed, so the caller is not interrupted.
        *
        * @param value0 innserSessionid
        * @param value1 times
        * @param value2 channelType
        * @param value3 sourcetitle
        * @param value4 title
        * @param value5 words
        * @param value6 characters
        * @param value7 refer
        * @param value8 role
        * @param value9 Nowtime
        */
      def insert(value0: String, value1: String, value2: String, value3: String, value4: String, value5: String,
                 value6: String, value7: String, value8: String, value9: Int): Unit = {
        try {
          val prep = conn.prepareStatement(InsertSql)
          try {
            bindAndExecute(
              prep,
              Seq(value0, value1, value2, value3, value4, value5, value6, value7, value8),
              value9)
          } finally {
            // Release the statement; previously every call leaked one.
            prep.close()
          }
        } catch {
          case NonFatal(e) => e.printStackTrace()
        }
      }
    }

  • 相关阅读:
    Python操作Excel表格
    Python爬虫实战:爬取美食节川菜信息
    超级实用的Python网络爬虫反反爬策略之构造UA池及IP代理池
    Python 静态方法
    IDEA连接远程服务器Docker部署Spring Boot项目
    Dockerfile 解析
    Python爬虫实战-统计博客园阅读量问题
    Docker 容器数据卷
    Docker镜像
    PL/SQL
  • 原文地址:https://www.cnblogs.com/canyangfeixue/p/8566851.html
Copyright © 2020-2023  润新知