• Storing data crawled with Scrapy in a MySQL database


    items.py

    import scrapy


    class InsistItem(scrapy.Item):
        # One field per column stored in MySQL (see pipelines.py)
        positionname = scrapy.Field()  # job title
        type = scrapy.Field()          # business group name
        place = scrapy.Field()         # work location
        mian = scrapy.Field()          # job category
        time = scrapy.Field()          # last update time
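
    Scrapy Items behave like dicts: the fields declared above are set and read with subscript syntax. A quick sketch (the value here is made up for illustration):

    item = InsistItem()
    item['positionname'] = 'Backend Engineer'  # hypothetical value
    print(dict(item))  # {'positionname': 'Backend Engineer'}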
    
    pipelines.py

    import pymysql


    class InsistPipeline(object):
        def __init__(self):
            # Open the MySQL connection once, when Scrapy creates the pipeline
            self.db = pymysql.connect(host='localhost', user='dsuser', passwd='badpassword',
                                      db='dsdb', charset='utf8', port=3306)
            self.cur = self.db.cursor()

        def process_item(self, item, spider):
            # Parameterized INSERT: pymysql escapes the values, so no manual quoting
            sql = 'INSERT INTO job(name, type, place, mian, time) VALUES(%s, %s, %s, %s, %s)'
            self.cur.execute(sql, (item['positionname'], item['type'], item['place'],
                                   item['mian'], item['time']))
            self.db.commit()
            return item

        def close_spider(self, spider):
            # Release the cursor and connection when the spider finishes
            self.cur.close()
            self.db.close()
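
    For the pipeline to run at all, it has to be registered in the project's settings.py (the project is named insist, matching the import in the spider below):

    settings.py

    ITEM_PIPELINES = {
        'insist.pipelines.InsistPipeline': 300,
    }

    The INSERT above also assumes a job table already exists in dsdb. A minimal one-off setup sketch, with column types guessed from the scraped data (adjust to taste):

    # create_table.py -- hypothetical setup script, not part of the Scrapy project
    import pymysql

    db = pymysql.connect(host='localhost', user='dsuser', passwd='badpassword',
                         db='dsdb', charset='utf8', port=3306)
    cur = db.cursor()
    cur.execute('''
        CREATE TABLE IF NOT EXISTS job(
            name  VARCHAR(255),
            type  VARCHAR(64),
            place VARCHAR(64),
            mian  VARCHAR(64),
            time  VARCHAR(32)
        )
    ''')
    db.commit()
    cur.close()
    db.close()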
    
    insists.py
    # Spider: pages through Tencent's careers JSON API
    import scrapy
    from insist.items import InsistItem
    import json
    class InsistsSpider(scrapy.Spider):
        name = 'insists'
        allowed_domains = ['careers.tencent.com']
        # start_urls = ['https://careers.tencent.com/search.html?index=']  # HTML page; we query the JSON API instead
        baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
        offset = 1  # pageIndex of the next request, starting from 1
        start_urls = [baseURL + str(offset)]
    
        def parse(self, response):
            # The endpoint returns JSON, so parse the body instead of using selectors
            contents = json.loads(response.text)
            jobs = contents['Data']['Posts']
            for job in jobs:
                item = InsistItem()  # fresh item per posting, so earlier yields are not overwritten
                item['positionname'] = job['RecruitPostName']
                item['type'] = job['BGName']
                item['place'] = job['LocationName']
                item['mian'] = job['CategoryName']
                item['time'] = job['LastUpdateTime']
                yield item  # hand the item to the pipeline, then continue the loop
            # Request the next page until six pages have been fetched
            if self.offset <= 5:
                self.offset += 1
                url = self.baseURL + str(self.offset)
                yield scrapy.Request(url, callback=self.parse)
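
    With all three files in place and the pipeline registered, the crawl is started from the project root with scrapy crawl insists; each yielded item is inserted into the job table as it arrives.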
