• Some crawler code


    XPath-based crawler

    Crawls the titles, authors, monthly-ticket counts, and synopses of popular books on Qidian, saving the results to xiaoshuo.txt.

    import requests
    from lxml import etree
    # Python 3: open the output file as UTF-8 directly; this replaces the old
    # Python 2 reload(sys)/setdefaultencoding hack for encoding errors

    fo = open("xiaoshuo.txt", "w", encoding="utf-8")
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"}
    for i in range(1, 6):    # the listing's page parameter is 1-based; fetch pages 1-5
        url = "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=%d" % i
        data = requests.get(url, headers=header).text
        f = etree.HTML(data)

        # detail-page links of every book on this listing page
        hrefs = f.xpath('/html/body/div[1]/div[5]/div[2]/div[2]/div/ul/li/div[2]/h4/a/@href')
        for href in hrefs:
            href = "https:" + href
            book = requests.get(href, headers=header).text
            e = etree.HTML(book)
            title = e.xpath('/html/body/div/div[6]/div[1]/div[2]/h1/em/text()')[0]
            zuozhe = e.xpath('/html/body/div/div[6]/div[1]/div[2]/h1/span/a/text()')[0]      # author
            jieshao = e.xpath('/html/body/div/div[6]/div[4]/div[1]/div[1]/div[1]/p/text()')  # synopsis paragraphs
            yuepiao = e.xpath('//*[@id="monthCount"]/text()')[0]                             # monthly tickets
            # renamed from str, which shadowed the builtin; '\n' was a literal line break
            line = '<----->' + title + '<----->' + zuozhe + '<----->' + yuepiao + '\n'
            fo.write(line)
            for te in jieshao:
                fo.write(te)

    fo.close()
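
    The absolute /html/body/... paths above break as soon as Qidian changes its layout; anchoring the XPath on a class or id attribute is more robust. A minimal sketch of that idea follows; the book-mid-info class name is an assumption about Qidian's markup, so check it in the browser inspector before relying on it.

    # Sketch: relative, attribute-anchored XPath instead of an absolute path.
    # NOTE: "book-mid-info" is an assumed class name for the book cards --
    # inspect the live listing page and substitute the real one.
    hrefs = f.xpath('//div[@class="book-mid-info"]/h4/a/@href')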
    

    Selenium-based crawler

    Crawls basic personal information from the campus portal; unfinished. The eventual goal is batch queries (student IDs and passwords follow a fixed format).

    from selenium import webdriver
    from selenium.webdriver.common.by import By

    # find_element_by_* never managed to locate the button that has to be
    # clicked, so the next page cannot be reached; the next step is to try
    # combining this with the requests library

    driver = webdriver.Chrome()
    driver.get("http://cas.hdu.edu.cn/cas/login?service=http%3A%2F%2Fonce.hdu.edu.cn%2Fdcp%2Findex.jsp")
    elem1 = driver.find_element(By.ID, "un")    # username input
    elem2 = driver.find_element(By.ID, "pd")    # password input
    elem1.send_keys("学号")    # replace with your real student ID
    elem2.send_keys("密码")    # replace with your real password
    driver.find_element(By.ID, 'index_login_btn').click()
    driver.find_element(By.CLASS_NAME, 'quickhome_item_link').click()
    print(driver.page_source)
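
    As the comment above says, the plan is to combine Selenium with requests. A common pattern for that is to log in through Selenium, copy its session cookies into a requests.Session, and fetch the remaining pages over plain HTTP. A minimal sketch, assuming the login above succeeded; info_url is a hypothetical placeholder, not the real address of the personal-info page.

    import requests

    # Hand the authenticated Selenium session over to requests by copying
    # the browser cookies into a requests.Session.
    session = requests.Session()
    for cookie in driver.get_cookies():
        session.cookies.set(cookie['name'], cookie['value'])

    info_url = "http://once.hdu.edu.cn/dcp/..."    # hypothetical placeholder URL
    resp = session.get(info_url)
    print(resp.text)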
    

    Regex-based crawler

    Batch-downloads images from a Tieba page.

    import re
    from urllib.request import urlopen, urlretrieve    # Python 3 locations of urlopen/urlretrieve

    def gethtml(url):
        # fetch the page and decode the response bytes to text
        page = urlopen(url)
        html = page.read().decode('utf-8')
        return html

    def getimg(html):
        # capture the src of .jpg images that are followed by a size attribute
        # (the dot before jpg must be escaped, otherwise it matches any character)
        reg = r'src="(.+?\.jpg)" size'
        imglist = re.findall(reg, html)
        return imglist

    def downimg(imglist):
        # save the images as 0.jpg, 1.jpg, ... (the target directory must exist)
        x = 0
        local = 'D:/VScode/image/'
        for img in imglist:
            urlretrieve(img, local + '%s.jpg' % x)
            x += 1

    html = gethtml("https://movie.douban.com/subject/26942674/")
    downimg(getimg(html))    # extract the image URLs and download them
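
    A quick self-contained check of the pattern against a made-up <img> tag (the sample markup below is illustrative, not taken from a real page):

    sample = '<img src="https://example.com/pic/001.jpg" size="170">'
    print(getimg(sample))    # -> ['https://example.com/pic/001.jpg']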
    
    