• Two notes from learning pyspider, a Python crawler framework


    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-11-08 09:56:40
    # Project: product

    from pyspider.libs.base_handler import *
    import re
    import base64
    import os
    import urllib.request
    import requests
    import json


    class Handler(BaseHandler):

        crawl_config = {
        }

        def default(self, obj):
            # Looks like a leftover json.JSONEncoder.default hook for bytes;
            # it is never called on a BaseHandler (see the sketch after the script).
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

        @every(minutes=24 * 60)
        def on_start(self):
            self.crawl('http://www.yunjinet.com/sell/list/7934/', callback=self.index_page)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            for each in response.doc('a[href^="http"]').items():
                if re.match('http://www.yunjinet.com/sell/show.+', each.attr.href):
                    self.crawl(each.attr.href, callback=self.detail_page)
                else:
                    self.crawl(each.attr.href, callback=self.index_page)

        @config(priority=2)
        def detail_page(self, response):
            image_url_list = []  # image URLs
            tags = []            # category tags
            images = []          # base64-encoded images
            x = 0
            imageresult = []     # one dict per image
            results = []         # final results; every JSON record goes here
            result = dict()      # the JSON record for this page
            headers = {"Content-Type": "application/json"}
            path = r'D:\pythonlianxi\testimg'  # raw string, otherwise \t is read as a tab

            if not os.path.isdir(path):
                os.makedirs(path)
            paths = path + '\\'

            for img in response.doc('div[class="vertical-img"] img').items():
                image_url_list.append(img.attr.src)
                urllib.request.urlretrieve(img.attr.src, '{0}{1}.jpg'.format(paths, x))
                with open(paths + str(x) + ".jpg", "rb") as f:
                    base64_data = base64.b64encode(f.read()).decode()
                images.append(base64_data)
                img_item = dict()  # holds the base64 string
                img_item['imgBase64'] = base64_data
                imageresult.append(img_item)
                x = x + 1

            for each in response.doc('div[class="location_an mt_10"]').items('a'):
                tags.append(each.text())

            pricebefore = response.doc('p[class="s"]').text()
            findlist = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)  # first number in the price text
            if not len(findlist):
                findlist = [0]
            print(findlist[0])

            result['originalLink'] = response.url
            result['productName'] = response.doc('h1').text()
            result['price'] = findlist[0]
            result['productDescription'] = response.doc('div[class="product_content"]').text()
            # assumes the breadcrumb has at least five links
            result['category1'] = tags[2]
            result['category2'] = tags[3]
            result['category3'] = tags[4]
            result['images'] = imageresult

            results.append(result)
            print(result)

            payload = json.dumps(result)
            r = requests.post('http://192.168.1.115/es/index/product', data=payload, headers=headers)

            return {
                "originalLink": response.url,
                "productName": response.doc('h1').text(),
                "price": response.doc('p[class="s"]').text(),
                "productDescription": response.doc('div[class="product_content"]').text(),
                "category1": tags[2],
                "category2": tags[3],
                "category3": tags[4],
                "images": images,
            }
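
A side note on the image step above: the script writes every image to disk with urlretrieve only to read it straight back for base64 encoding. The filesystem round trip is not needed; below is a minimal sketch of doing the same in memory with requests (the helper name fetch_image_base64 is mine, not from the original post):

    import base64
    import requests

    def fetch_image_base64(url, timeout=10):
        """Download an image and return its base64 text, with no temp file."""
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()  # fail loudly on HTTP errors
        # b64encode returns bytes; decode so json.dumps accepts the value
        return base64.b64encode(resp.content).decode('ascii')

    # usage inside detail_page:
    # imageresult.append({'imgBase64': fetch_image_base64(img.attr.src)})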

    -------------------------------

    #!/usr/bin/env python
    # -*- encoding: utf-8 -*-
    # Created on 2018-11-08 09:56:40
    # Project: product

    from pyspider.libs.base_handler import *
    import re
    import base64
    import os
    import urllib.request
    import requests
    import json


    class Handler(BaseHandler):

        crawl_config = {
        }

        def default(self, obj):
            # Same leftover JSON-encoder hook as in the first script; never called here.
            if isinstance(obj, bytes):
                return str(obj, encoding='utf-8')
            return json.JSONEncoder.default(self, obj)

        @every(minutes=24 * 60)
        def on_start(self):
            self.crawl('https://product.suning.com/0000000000/10629204175.html#?safp=d488778a_10004_0_daa73474ac', callback=self.index_page, validate_cert=False)

        @config(age=10 * 24 * 60 * 60)
        def index_page(self, response):
            for each in response.doc('a[href^="http"]').items():
                if re.match('https://product.suning.com/+', each.attr.href):
                    self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
                else:
                    self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)

        @config(priority=2)
        def detail_page(self, response):
            image_url_list = []  # image URLs
            tags = []            # category tags
            images = []          # base64-encoded images
            x = 0
            imageresult = []     # one dict per image
            results = []         # final results; every JSON record goes here
            result = dict()      # the JSON record for this page
            #headers = {"Content-Type": "application/json"}
            path = r'D:\pythonlianxi\testimg'  # raw string, otherwise \t is read as a tab

            if not os.path.isdir(path):
                os.makedirs(path)
            paths = path + '\\'

            for img in response.doc('div[moduleId="R1901001_3"]').items('img'):
                # Suning lazy-loads images, so the real URL lives in the src2 attribute
                if re.match('http.+', img.attr.src2):
                    imgurl = img.attr.src2
                else:
                    imgurl = 'https://' + img.attr.src2
                # image_url_list.append(img.attr.src)
                urllib.request.urlretrieve(imgurl, '{0}{1}.jpg'.format(paths, x))
                with open(paths + str(x) + ".jpg", "rb") as f:
                    base64_data = base64.b64encode(f.read()).decode()
                #images.append(base64_data)
                img_item = dict()  # holds the base64 string (renamed so it no longer shadows imgurl)
                img_item['imgBase64'] = base64_data
                imageresult.append(img_item)
                x = x + 1

            for each in response.doc('a[class="ft"]').items():
                tags.append(each.text())

            #pricebefore = response.doc('p[class="s"]').text()
            #findlist = re.findall(r'[0-9]*\.?[0-9]+', pricebefore)
            #if not len(findlist):
            #    findlist = [0]

            result['originalLink'] = response.url
            result['productName'] = response.doc('h1').text()
            result['price'] = 3000  # hard-coded placeholder; the price selector above is disabled
            result['productDescription'] = response.doc('meta[name="description"]').attr.content
            # assumes the breadcrumb has at least three links
            result['category1'] = tags[0]
            result['category2'] = tags[1]
            result['category3'] = tags[2]
            result['images'] = imageresult

            #results.append(result)
            #print(result)

            #payload = json.dumps(result)
            #r = requests.post('http://192.168.1.115/es/index/product', data=payload, headers=headers)

            return {
                "originalLink": response.url,
                "productName": response.doc('h1').text(),
                #"price": response.doc('p[class="s"]').text(),
                "productDescription": response.doc('meta[name="description"]').attr.content,
                "category1": tags[0],
                "category2": tags[1],
                "category3": tags[2],
                "images": imageresult,
            }
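
Both scripts also carry a default method on the handler that is never called there; it reads like a json.JSONEncoder.default hook for bytes values pasted into the wrong class. A minimal sketch of the pattern it presumably came from (the class name BytesEncoder is my own, not from the original post):

    import json

    class BytesEncoder(json.JSONEncoder):
        """JSON encoder that serializes bytes values as UTF-8 strings."""
        def default(self, obj):
            if isinstance(obj, bytes):
                return obj.decode('utf-8')
            return json.JSONEncoder.default(self, obj)

    # usage: payload = json.dumps(result, cls=BytesEncoder)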

• Original post: https://www.cnblogs.com/lely/p/9936455.html