• scrapy将爬取的数据存入MySQL数据库


    items.py
    
    import scrapy
    
    
    class InsistItem(scrapy.Item):
        """Container for one Tencent job posting scraped by the spider.

        Field names are the keys the pipeline reads in ``process_item``;
        renaming any of them would break the MySQL INSERT mapping.
        """

        positionname=scrapy.Field()  # job title (API key: RecruitPostName)
        type=scrapy.Field()          # business group (API key: BGName); note: shadows built-in name, kept for compatibility
        place=scrapy.Field()         # work location (API key: LocationName)
        mian=scrapy.Field()          # job category (API key: CategoryName); "mian" spelling kept — it matches the DB column
        time=scrapy.Field()          # last update timestamp (API key: LastUpdateTime)
    
    pipelines.py
    
    import json
    import scrapy
    import pymysql
    from scrapy.pipelines.images import ImagesPipeline
    class InsistPipeline(object):
        """Scrapy item pipeline that inserts each scraped job into MySQL.

        Opens one connection for the spider's lifetime; each item becomes one
        row in the ``job`` table, committed immediately.
        """

        def __init__(self):
            # NOTE(review): credentials are hard-coded; consider moving them to
            # Scrapy settings and wiring them in via from_crawler() — verify
            # against the project's settings.py.
            self.db = pymysql.connect(host='localhost', user='dsuser',
                                      passwd='badpassword', db='dsdb',
                                      charset='utf8', port=3306)
            self.cur = self.db.cursor()

        def process_item(self, item, spider):
            """Insert one item into the ``job`` table.

            Rolls back on a failed INSERT so the connection stays usable for
            subsequent items, then re-raises so Scrapy logs the failure.
            """
            sql='INSERT INTO job(name,type,place,mian,time) VALUES(%s,%s,%s,%s,%s) '
            try:
                self.cur.execute(sql, (item['positionname'], item['type'],
                                       item['place'], item['mian'], item['time']))
                self.db.commit()
            except Exception:
                # Undo the failed statement; without this, the dirty
                # transaction can poison every later insert.
                self.db.rollback()
                raise
            return item

        def close_spider(self, spider):
            """Release the cursor and close the connection when the spider ends."""
            self.cur.close()
            self.db.close()
    
    insists.py
    # The spider itself
    import scrapy
    from insist.items import InsistItem
    import json
    class InsistsSpider(scrapy.Spider):
        """Crawl Tencent's careers JSON API, yielding one item per job post.

        Pages through the API by incrementing ``pageIndex`` (``offset``) up to
        page 6; each page returns 10 posts under ``Data.Posts``.
        """
        name = 'insists'
        allowed_domains = ['careers.tencent.com']
        baseURL='https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
        offset=1
        start_urls=[baseURL+str(offset)]

        def parse(self, response):
            """Parse one API page: yield job items, then request the next page."""
            contents = json.loads(response.text)
            jobs = contents['Data']['Posts']
            for job in jobs:
                # Bug fix: create a fresh item per job. The original reused one
                # mutable InsistItem across all yields, so later iterations
                # overwrote fields of items still queued in the (potentially
                # asynchronous) pipeline, duplicating the last job's data.
                item = InsistItem()
                item['positionname'] = job['RecruitPostName']
                item['type'] = job['BGName']
                item['place'] = job['LocationName']
                item['mian'] = job['CategoryName']
                item['time'] = job['LastUpdateTime']
                yield item  # hand the item to the pipeline, then keep looping
            # Follow pagination: offset is 1-based, so this fetches pages 2..6.
            if self.offset <= 5:
                self.offset += 1
                url = self.baseURL + str(self.offset)
                yield scrapy.Request(url, callback=self.parse)

  • 相关阅读:
    15 react ajax 请求 github 用户信息
    14 react fetch
    13 React axios
    12 脚手架编写React项目(评论管理)---
    gitlab init project
    为什么是2MSL而不是MSL?
    mac python install zlib not available
    Laravel 传递数据到视图
    sleep(0)作用
    ping错误详解
  • 原文地址:https://www.cnblogs.com/persistence-ok/p/11647296.html
Copyright © 2020-2023  润新知