• lxml 和 pyquery 示例:爬取卡牌图片


    import requests
    from pyquery import PyQuery as pq
    import json
    import jsonpath
    from lxml import etree
    import os
    
    # Sample HTML snippet. It is overwritten by the live page fetched just below,
    # so neither scraper function ever sees this value.
    html = '''
    <div>
        <ul>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
    '''
    # Fetch the listing page when the module is loaded and decode the raw response
    # bytes as GBK; both scraper functions read this module-level string.
    html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
    # Running counter shared by both scraper functions (via ``global num``); it is
    # used to number the saved image files.
    num = 0
    def pq方法():
        """Download the image of every list entry on the page, using pyquery.

        Parses the module-level ``html`` string, walks each ``<li>`` directly
        under ``#dq_list`` and downloads the URL held in the ``lz_src``
        attribute of its ``<img>``.  Each image is written to ``e:/py3/002/``
        as ``<4-digit counter><text of .kp-name>.jpg``.  The counter is the
        module-level ``num``, advanced once per downloaded entry.

        Entries whose ``<img>`` has no ``lz_src`` attribute are skipped, and
        the output directory is created if it does not exist yet.
        """
        global num
        out_dir = 'e:/py3/002/'
        # Create the target directory up front so open() cannot fail on a
        # missing path.
        os.makedirs(out_dir, exist_ok=True)
        doc = pq(html)
        for item in doc('#dq_list > li').items():
            url = item.find('img').attr('lz_src')
            if not url:
                # attr() yields None when the attribute is absent, and
                # requests.get(None) would raise; skip such entries.
                continue
            num += 1
            print(str(num), url)
            # Bound the wait so one stalled download cannot hang the whole run.
            url_content = requests.get(url, timeout=30).content
            name = item.find('.kp-name').text()

            with open(out_dir + '{:0>4}'.format(str(num)) + name + '.jpg', 'wb') as file:
                file.write(url_content)
    
    def lxml方法():
        """Download the image of every list entry on the page, using lxml XPath.

        Prints the module-level ``html`` string, parses it, and selects every
        ``<a>`` under ``ul#dq_list`` inside ``div.box10-content``.  For each
        anchor the first text node of its child ``<div>`` is used as the name
        and the ``lz_src`` attribute of its child ``<img>`` as the image URL.
        Each image is written to ``e:/py3/003/`` as
        ``<4-digit counter>_<name>.jpg``.  The counter is the module-level
        ``num``, advanced once per downloaded entry.

        Anchors lacking either the name text or the ``lz_src`` attribute are
        skipped, and the output directory is created if it does not exist yet.
        """
        global num
        # Dump the fetched page source to stdout before parsing it.
        print(html)
        out_dir = 'e:/py3/003/'
        # Create the target directory up front so open() cannot fail on a
        # missing path.
        os.makedirs(out_dir, exist_ok=True)
        r = etree.HTML(html)
        items = r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a")
        for item in items:
            names = item.xpath("./div/text()")
            sources = item.xpath("./img/@lz_src")
            if not names or not sources:
                # xpath() returns an empty list when nothing matches; indexing
                # it with [0] would raise IndexError, so skip the entry.
                continue
            kpname, lzsrc = names[0], sources[0]
            num += 1
            print(kpname, lzsrc)
            # Bound the wait so one stalled download cannot hang the whole run.
            lzcontent = requests.get(lzsrc, timeout=30).content
            with open(out_dir + '{:0>4}'.format(str(num)) + '_' + kpname + '.jpg', 'wb') as file:
                file.write(lzcontent)
    
    
    
    
    
    if __name__ == '__main__':
        # Script entry point: only the lxml-based scraper runs; the pyquery
        # variant is left disabled.
        # pq方法()
        lxml方法()

        # Disabled helper, kept for reference: create the numbered directories
        # e:\py3\001\ ... e:\py3\099\, printing '已存在!' for any that already
        # exist.  It is held in comments rather than a bare string literal so
        # the backslashes in the path no longer trigger invalid-escape warnings.
        # for dirnum in range(1, 100):
        #     dirnum2 = '{:0>3}'.format(str(dirnum))
        #     mkpath = "e:\\py3\\{}\\".format(dirnum2)
        #     print(mkpath)
        #     print('已存在!') if os.path.exists(mkpath) else os.makedirs(mkpath)
    
    # Unused BeautifulSoup example stored as a bare string literal; it is never
    # executed.  NOTE(review): ``html_doc`` is not defined anywhere in the visible
    # code, so the snippet would need adapting before it could run.
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_doc)
    
    print(soup.prettify()) # 结构化输出文档
    print(soup.title) # 获取title标签
    print(soup.title.name) # 获取title标签名称 
    print(soup.title.parent.name)
    print(soup.p['class'])
    '''
  • 相关阅读:
    利用ssh传输文件
    linux 终端常用快捷键
    ubuntu 下关闭apache服务自动启动
    linux ps命令介绍
    virtualenv 使用
    startuml 2.6注册
    三代组装小基因组研究综述
    畅想未来的测序
    测序简史
    纳米孔测序技术介绍
  • 原文地址:https://www.cnblogs.com/pscc/p/9866194.html
Copyright © 2020-2023  润新知