• 投诉网站爬虫


     1 # -*- coding: utf-8 -*-
     2 import scrapy
     3 from yg.items import YgItem
     4 
     5 class YgSpiderSpider(scrapy.Spider):
     6     name = 'yg_spider'
     7     allowed_domains = ['wz.sun0769.com']
     8     start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']
     9 
    10     def parse(self, response):
    11         tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
    12         for tr in tr_list:
    13             item = YgItem()
    14             item["title"] = tr.xpath("./td[2]/a[2]/@title").extract_first()
    15             item["href"] = tr.xpath("./td[2]/a[2]/@href").extract_first()
    16             item["update_time"] = tr.xpath("./td[last()]/text()").extract_first()
    17             # print(item)
    18 
    19             yield scrapy.Request(
    20                 item["href"],
    21                 callback=self.parse_detail,
    22                 meta={"item":item}
    23             )
    24 
    25         next_url = response.xpath("//a[text()='>']/@href").extract_first()
    26         if next_url is not None:
    27             yield scrapy.Request(
    28                 next_url,
    29                 callback=self.parse
    30             )
    31 
    32     def parse_detail(self,response): #处理详情页
    33         item = response.meta["item"]
    34         item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
    35         item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
    36         item["content_img"] = ["http://wz.sun0769.com"+i for i in item["content_img"]]
    37         # print(item)
    38         yield item
     1 # -*- coding: utf-8 -*-
     2 
     3 # Define your item pipelines here
     4 #
     5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
     6 # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
     7 import re
     8 import json
     9 
    10 class YgPipeline(object):
    11     def process_item(self, item, spider):
    12         item["content"] = self.process_content(item["content"])
    13         with open("yg.txt", "a", encoding="utf-8") as f:
    14             f.write(json.dumps(dict(item), ensure_ascii=False, indent=4))
    15             f.write("
    ")
    16         return item
    17 
    18     def process_content(self, content):
    19         content = [re.sub(r'xa0|s',"",i) for i in content]
    20         content = [i for i in content if len(i)>0]
    21         return content
     1 # -*- coding: utf-8 -*-
     2 
     3 # Define here the models for your scraped items
     4 #
     5 # See documentation in:
     6 # https://doc.scrapy.org/en/latest/topics/items.html
     7 
     8 import scrapy
     9 
    10 
    11 class YgItem(scrapy.Item):
    12     # define the fields for your item here like:
    13     title = scrapy.Field()
    14     update_time = scrapy.Field()
    15     href = scrapy.Field()
    16     content = scrapy.Field()
    17     content_img = scrapy.Field()
    18     # pass
  • 相关阅读:
    SSH、SCP和SFTP 解析(转)
    SQL Server数据库partition by 与ROW_NUMBER()函数使用详解 (转载)
    Git版本控制与工作流详解(转)
    IQueryable,IEnumberable,.AsEnumerable() 和 .AsQueryable() (转载)
    ASP.NET 中Http处理流程与 HttpModule,HttpHandler学习之初步认知
    xml Node 是否存在
    MVC-前台调用后台action 传递upload file 参数问题
    ResXResourceWriter 与ResourceWriter
    "= ="与 equals 的区别 摘录
    jpg文件格式分析
  • 原文地址:https://www.cnblogs.com/sure-feng/p/10092283.html
Copyright © 2020-2023  润新知