• Targeting JD.com product comments, the architecture uses a crawler + Flume + Kafka + Spark Streaming + MySQL to collect, analyze, and display the data dynamically and in real time.


    # -*- coding: utf-8 -*-
    # @Time : 2022/3/16 20:15
    # @Author : huaobin
    # @File : new.py
    # @Software: PyCharm
    import gzip
    import urllib.request
    import json
    import time
    import random
    import demjson as dj
    import requests
    import itertools
    
    headers = {
        "Cookie": "",
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36 Edg/99.0.1150.39"
    }
    
    headers2 = {
        "accept": "*/*",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
        "cookie": "",
        "referer": "https://search.jd.com/"
    }
    
    
    def crawlProductComment(url):
        # Read the raw data (note: choose the gbk encoding)
        try:
            req = requests.get(url=url, headers=headers2).text
            # The response is JSONP wrapped in fetchJSON_comment98(...); strip the wrapper
            reqs = req.replace("fetchJSON_comment98(", "").strip(');')
            print(reqs)
            jsondata = json.loads(reqs)
            # Walk the product comment list
            comments = jsondata['comments']
            return comments
        except IOError:
            print("Error: gbk encoding is not suitable")
    
    
    # Extract the JSON data from the raw search response and collect every product's sku_id
    def getProduct(url):
        ids = []
        req = requests.get(url=url, headers=headers2).text
        reqs = req.replace("jQuery1544821(", "").strip(')')
        jsondata = json.loads(reqs)['291']
        for i in range(0, len(jsondata)):
            ids.append(jsondata[i]['sku_id'])
        print(ids)
        return ids
    
    
    import paramiko
    
    # Server info: hostname (IP address), port, username, and password
    hostname = "hadoop102"
    port = 22
    username = "root"
    password = "123456"
    
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname, port, username, password, compress=True)
    sftp_client = client.open_sftp()
    remote_file = sftp_client.open("/opt/a.log", 'a+')  # set the remote file path yourself
    
    ids = []
    for i in range(2, 3):
        product_id = getProduct(
            "https://search-x.jd.com/Search?callback=jQuery1544821&area=5&enc=utf-8&keyword=%E7%94%B7%E5%A3%AB%E8%BF%90%E5%8A%A8%E9%9E%8B&adType=7&page=" + str(i) + "&ad_ids=291%3A33&xtest=new_search&_=1647325621019")
        time.sleep(random.randint(1, 3))
        ids.append(product_id)
    
    data = []
    count = 0
    for k in list(set(itertools.chain.from_iterable(ids))):
        for i in range(0, 100):
            url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=' + str(k) \
                  + '&score=0&sortType=5&page=' + str(i) + '&pageSize=10&isShadowSku=0&fold=1'
            comments = crawlProductComment(url)
            if len(comments) <= 0:
                break
            print(comments)
            remote_file.writelines(str(len(comments)) + "\n")
            data.extend(comments)
            # Sleep between requests
            time.sleep(random.randint(1, 5))
            print('-------', i)
        print("This is product category {}".format(count))
        count += 1

    The Python crawler scrapes JD product comments and uploads them to the virtual machine.
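
    The streaming job later splits each record on commas and takes the third field, so it can help to write each comment out as a comma-separated line rather than only the comment count. Below is a minimal sketch of such a formatter; the keys content, creationTime, and referenceName are assumptions about JD's comment JSON and may need to be adapted to whatever crawlProductComment() actually returns.

    # Minimal sketch: turn one comment dict into a comma-separated log line.
    # The keys used here (content, creationTime, referenceName) are assumptions about
    # the JSON returned by JD's API and should be checked against the real payload.
    def comment_to_line(comment):
        content = str(comment.get("content", "")).replace(",", " ").replace("\n", " ")
        created = str(comment.get("creationTime", ""))
        product = str(comment.get("referenceName", "")).replace(",", " ")
        return "{},{},{}\n".format(created, product, content)

    # Possible usage inside the crawl loop, instead of writing only len(comments):
    # for c in comments:
    #     remote_file.writelines(comment_to_line(c))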

    Flume: write a conf file to configure the agent (connecting to Kafka)

    a1.sources=r1
    a1.sinks=k1 k2
    a1.channels=c1 c2
    #define source
    a1.sources.r1.type=exec
    a1.sources.r1.channels=c1 c2
    #the log file being tailed (the file the crawler keeps appending to)
    a1.sources.r1.command=tail -F /training/testData/test-flume.log
    a1.sources.r1.shell=/bin/sh -c
    a1.sources.r1.selector.type=replicating
    #sink1toKafka
    #the Kafka topic to write to (it is subscribed to in the IDEA code)
    a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
    a1.sinks.k1.topic = test02
    #in non-cluster mode a single broker ip:port is enough for the two properties below;
    #keep comments on their own lines, because inline comments after a value can cause errors at runtime
    a1.sinks.k1.brokerList = 192.168.63.139:9092,192.168.63.140:9092,192.168.63.134:9092
    a1.sinks.k1.kafka.bootstrap.servers = 192.168.63.139:9092,192.168.63.140:9092,192.168.63.134:9092
    a1.sinks.k1.producer.requiredAcks = 1
    a1.sinks.k1.batchSize = 5
    a1.sinks.k1.channel=c1
    #sink2toHDFS
    a1.sinks.k2.type=hdfs
    a1.sinks.k2.channel=c2
    #the HDFS directory you want the output files written to
    a1.sinks.k2.hdfs.path=hdfs://192.168.63.139:9000/DataTest/datatest/test02
    #channel1
    a1.channels.c1.type=memory
    #channel2
    a1.channels.c2.type=memory

    Flume start command:

     bin/flume-ng agent --name a1 --conf conf  --conf-file conf/flume.conf  -Dflume.root.logger=INFO,console
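
    Once the agent is running, it is worth checking that events actually arrive on the test02 topic before starting the Spark job. A minimal sketch using the kafka-python client (an assumption; the post does not prescribe a particular client) could look like this:

    # Quick sanity check that Flume is delivering the tailed log lines to Kafka.
    # Assumes the kafka-python package is installed (pip install kafka-python) and that
    # the broker address matches the Flume sink configuration.
    from kafka import KafkaConsumer

    consumer = KafkaConsumer(
        "test02",
        bootstrap_servers=["192.168.63.139:9092"],
        auto_offset_reset="earliest",
    )
    for message in consumer:
        # each value is one line appended to the crawler's log file
        print(message.value.decode("utf-8"))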

    IDEA part: write the Spark Streaming code that receives the data and stores it in MySQL

    package com.niit.spark
    
    import java.sql.{Connection, DriverManager, PreparedStatement}
    
    
    import org.apache.kafka.common.serialization.StringDeserializer
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.dstream.DStream
    import org.apache.spark.streaming.kafka010.KafkaUtils
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
    import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
    
    
    object BigWork{
    
      def main(args:Array[String]): Unit = {
        val sparkConf = new SparkConf().setAppName("BigWork").setMaster("local[2]")
        val streamingContext = new StreamingContext(sparkConf, Seconds(2))
    
        val kafkaParams = Map[String, Object](
          "bootstrap.servers" -> "flume:9092",//修个成自己的虚拟机主机
          "key.deserializer" -> classOf[StringDeserializer],
          "value.deserializer" -> classOf[StringDeserializer],
          "group.id" -> "use_a_separate_group_id_for_each_stream",
          "auto.offset.reset" -> "latest",
          "enable.auto.commit" -> (false: java.lang.Boolean)
        )
    
        val topics = Array("test02", "t100") // Kafka topics to subscribe to
        val stream = KafkaUtils.createDirectStream[String, String](
          // subscribe to the topics through a direct stream
          streamingContext,
          PreferConsistent,
          Subscribe[String, String](topics, kafkaParams)
        )
        val mapDStream: DStream[(String, String)] = stream.map(record => (record.key, record.value))
    
        // process the data: take the third comma-separated field of each record and count occurrences
        val resultRDD: DStream[(String, Int)] = mapDStream.map(_._2.split(",")(2)).map(word => (word, 1)).reduceByKey(_ + _)
    
        resultRDD.print() // print each batch
        
        // for each RDD, open a JDBC connection and store the results in MySQL
        resultRDD.foreachRDD(rdd => {
          def func(records: Iterator[(String,Int)]) {
            var conn: Connection = null
            var stmt: PreparedStatement = null
            try {
    
              val url = "jdbc:mysql://localhost:3306/bigjob?useUnicode=true&characterEncoding=UTF-8"
              val user = "root"
              val password = "123456"
              conn = DriverManager.getConnection(url, user, password)
    
              records.foreach(p => {
                val sql = "insert into bigwork(name,count) values (?,?)"
                stmt = conn.prepareStatement(sql);
                stmt.setString(1, p._1.trim)
                stmt.setInt(2,p._2.toInt)
                stmt.executeUpdate()
              })
            } catch {
              case e: Exception => e.printStackTrace()
            } finally {
              if (stmt != null) {
                stmt.close()
              }
              if (conn != null) {
                conn.close()
              }
            }
          }
    
          val repartitionedRDD = rdd.repartition(3)
          repartitionedRDD.foreachPartition(func)
        })
    
        streamingContext.start()
    
        streamingContext.awaitTermination()
      }
    }

    Finally, read the data back from the database and visualize it.
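
    The post leaves the visualization step open. Below is a minimal sketch of reading the aggregated results back with pymysql and plotting them with matplotlib (both libraries are assumptions, not part of the original project); the table and column names follow the insert statement in the Scala code:

    # Minimal sketch: read the aggregated results back from MySQL and draw a bar chart.
    # pymysql and matplotlib are assumptions; the original post does not say which
    # visualization stack is used. Table and column names follow the Scala insert statement.
    import pymysql
    import matplotlib.pyplot as plt

    conn = pymysql.connect(host="localhost", user="root", password="123456",
                           database="bigjob", charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            # sum the per-batch counts the streaming job wrote for each name
            cursor.execute(
                "SELECT name, SUM(`count`) FROM bigwork GROUP BY name ORDER BY SUM(`count`) DESC LIMIT 10")
            rows = cursor.fetchall()
    finally:
        conn.close()

    names = [row[0] for row in rows]
    counts = [int(row[1]) for row in rows]
    plt.bar(names, counts)
    plt.title("Top results from the JD comment stream")
    plt.show()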

  • Original post: https://www.cnblogs.com/huaobin/p/16020267.html