• Python scraper for JD product data


    # -*- coding: utf-8 -*-
    # ---
    # @Software: PyCharm
    # @Site:
    # @File: day1.py
    # @Author: ---SamXu
    # @E-mail: ---xuhongwu1993@gmail.com
    # @Time: May 22, 2020

    # Imports
    from bs4 import BeautifulSoup # HTML parsing
    import re # regular expressions, for text matching
    import urllib.request,urllib.error,urllib.parse # build URLs, fetch pages, percent-encode Chinese keywords
    import xlwt # Excel output
    import sqlite3 # SQLite support (imported but unused in this script)
    import ssl # work around certificate-verification problems




    #ssl._create_default_https_context = ssl._create_unverified_context # would disable certificate verification globally

    context = ssl._create_unverified_context() # unverified context, passed only to this script's requests



    def main():
        User_input = input("Enter the product to search for: ")
        baseurl = userinput(User_input)
        datalist = GetData(baseurl)
        savepath = "JD_data.xls"
        Savedata(datalist, savepath)




    # Patterns applied to each product card's HTML (backslashes restored where
    # the blog engine stripped them: \n, \d, \.)
    findList = re.compile(r'<em>(.*?) (.*?)class="promo-words"', re.S) # block from the title <em> up to the promo text
    findLink = re.compile(r'href="//(.*?)"') # protocol-relative product link
    findName = re.compile(r'\n<em>(.*?)<font') # product name
    findMoney = re.compile(r'</em><i>(\d+\.?\d*)</i>') # price digits


    # Build the search URL from user input
    def userinput(User_input):
        enter = urllib.parse.quote(User_input) # percent-encode the Chinese keyword into an ASCII-safe form
        baseurl = "https://search.jd.com/Search?keyword=" + enter + "&wq=" + enter + "&page="
        return baseurl
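
    # For example (illustrative keyword), userinput("手机") returns:
    #   https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&wq=%E6%89%8B%E6%9C%BA&page=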


    # Scrape and parse the search-result pages
    def GetData(baseurl):
        datalist = []
        for i in range(1,2,2): # page numbers; widen the range to fetch more (JD search uses odd pages: 1, 3, 5, ...)
            url1 = baseurl + str(i)
            html = askURL(url1) # page source as a string

            # Parse the HTML
            soup = BeautifulSoup(html,"lxml") # BeautifulSoup(markup, parser)
            for item in soup.find_all('div', class_="gl-i-wrap"): # one div per product card; note the trailing '_' in class_
                data = []
                item = str(item)
                FindList = re.findall(findList,item)
                Link = ''.join(re.findall(findLink,str(FindList)))
                name = ''.join(re.findall(findName,str(FindList)))
                money = ''.join(re.findall(findMoney,str(FindList)))
                data.append(name)
                data.append(money)
                data.append(Link)
                datalist.append(data)
        return datalist
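
    # Each datalist entry is [name, price, link]; e.g. (illustrative values):
    #   ["Some Phone 8GB+256GB", "5999.00", "item.jd.com/100012345.html"]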


    # Fetch the page content of a given URL

    def askURL(url):
        head = {
            "user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
        } # user agent: present the request as coming from a normal browser
        request = urllib.request.Request(url,headers=head)
        try:
            response = urllib.request.urlopen(request,context=context)
            html = response.read().decode("utf-8")
            return html
        except urllib.error.URLError as e:
            if hasattr(e,"code"):
                print(e.code)
            if hasattr(e,"reason"):
                print(e.reason)



    # Save the data to an Excel file
    def Savedata(datalist,savepath):
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('JD data')
        col = ('Name','Price','Link')
        for i in range(0,3):
            sheet.write(0,i,col[i])
        for i in range(0,len(datalist)): # was hard-coded to 30, which raises IndexError when fewer items are found
            data = datalist[i]
            for j in range(0,3):
                sheet.write(i+1,j,data[j])
        book.save(savepath) # use the savepath argument instead of a second hard-coded filename


    if __name__ == "__main__":
        main()
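
Because JD's markup changes over time, it is worth sanity-checking the reconstructed link and price patterns against a small hand-written HTML fragment before scraping live pages. The sketch below does that; the sample markup is invented for illustration and only approximates the structure the script expects.

    # -*- coding: utf-8 -*-
    # Sanity check for the link/price regexes; the sample HTML is invented
    # for illustration and only approximates real JD search markup.
    import re

    findLink = re.compile(r'href="//(.*?)"')
    findMoney = re.compile(r'</em><i>(\d+\.?\d*)</i>')

    sample = (
        '<div class="gl-i-wrap">'
        '<a href="//item.jd.com/100012345.html">'
        '<em>¥</em><i>5999.00</i>'
        '</a>'
        '</div>'
    )

    print(re.findall(findLink, sample))   # ['item.jd.com/100012345.html']
    print(re.findall(findMoney, sample))  # ['5999.00']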


  • Original source: https://www.cnblogs.com/donglian1/p/12945809.html