• pyquery 抓取优图


    案例

    import requests
    import os
    from requests.packages import urllib3
    from pyquery import PyQuery as pq
    import re  # 解析数据
    import ssl
    
    # Switch to the download directory: get_url1 builds its output paths
    # relative to the current working directory.
    os.chdir(r"E:/pics22223/")
    def get_url1(url):
        """
        Download the photos of every gallery linked from one listing page.

        Each gallery is stored in a sub-directory of the current working
        directory named after the first run of Chinese characters in the
        gallery title. A photo keeps the file name it has in its URL, and
        files that already exist locally are not downloaded again.

        :param url: URL of the listing page to crawl.
        """
        # All requests below are sent with certificate verification disabled,
        # so the matching urllib3 warnings are silenced as well.
        ssl._create_default_https_context = ssl._create_unverified_context
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
        urllib3.disable_warnings()

        listing = pq(_fetch_html(url, headers))
        for item in listing('.TypeList .TypeBigPics').items():
            first_page = item.attr('href')
            if not first_page:
                # An entry without a link cannot be followed.
                continue
            doc2 = pq(_fetch_html(first_page, headers))

            # The directory name is the first run of CJK characters in the title.
            title_match = re.search(r'[\u4e00-\u9fa5]+', doc2('body > div.wrap > div.ArticleTitle > strong').text())
            if title_match is None:
                print("爬取失败")
                continue
            title = title_match.group()

            # The first pager entry is read as the total number of pages.
            # NOTE(review): presumably a "N pages in total" label - confirm on the site.
            pager_text = doc2('body > div.wrap > div.NewPages > ul > li:nth-child(1) > a').text()
            count_match = re.search(r'\d+', pager_text)
            page_count = int(count_match.group()) if count_match else 1

            # Page 1 is the gallery URL itself; page k is "<name>_k.htm".
            # The upper bound is inclusive so the last page is fetched too.
            stem = first_page.replace(".htm", '')
            pages = [first_page]
            pages.extend('{}_{}.htm'.format(stem, number) for number in range(2, page_count + 1))

            for page_url in pages:
                _save_page_image(page_url, title, headers)


    def _fetch_html(url, headers):
        """Return the decoded body of url, using the encoding detected from its content."""
        response = requests.get(url, headers=headers, verify=False)
        response.encoding = response.apparent_encoding
        return response.text


    def _save_page_image(page_url, title, headers):
        """Save the photo shown on one gallery page into the directory named title."""
        try:
            src = pq(_fetch_html(page_url, headers))('#ArticleId0 > p > a > img').attr('src')
            if not src:
                # The page has no photo at the expected place.
                print("爬取失败")
                return
            # Directory plus the last "/"-separated part of the URL, so the
            # photo is stored locally under its original file name.
            path = str(title) + '/' + src.split('/')[-1]
            if not os.path.exists(str(title)):
                os.mkdir(str(title))
            if os.path.exists(path):
                print("文件已存在")
                return
            r = requests.get(src, headers=headers, verify=False)
            # Do not store an HTTP error page as if it were the photo.
            r.raise_for_status()
            with open(path, 'wb') as f:
                f.write(r.content)
            print("文件保存成功", '\n')
        except (OSError, requests.RequestException) as exc:
            # Best effort: report the failure and carry on with the next page.
            print("爬取失败", exc)
    
    if __name__ == '__main__':
        # Crawl listing pages 21 to 25 of the gallery index, one after another.
        url = 'https://www.umei.cc/p/gaoqing/cn/'
        for page_no in range(21, 26):
            get_url1('{}{}.htm'.format(url, page_no))

     案例抓取-->剪切板-->发送

    # -*- coding:utf-8 -*-
    import os
    import re
    import requests
    import time
    import win32api
    import win32con
    from PIL import Image
    from io import BytesIO
    import win32clipboard
    
    # Switch to the download directory: crawl creates the gallery folders
    # relative to the current working directory.
    os.chdir(r"E:/ntmssFile/umei/")
    def paste_img(file_img):
        """
        Put an image file on the clipboard as a device-independent bitmap.

        The image is opened with Pillow, re-encoded as BMP into an in-memory
        buffer, and the encoded bytes minus the BMP file header are handed to
        send_msg_to_clip with the CF_DIB clipboard format.

        :param file_img: path of the image file
        """
        # The context managers close the image file and the buffer even when
        # encoding fails; the original left the image file open.
        # NOTE(review): Pillow's BMP writer may reject some image modes -
        # confirm whether a conversion is needed for the crawled photos.
        with Image.open(file_img) as image, BytesIO() as output:
            image.save(output, 'BMP')
            # A BMP file starts with a 14-byte file header that is not part of
            # the DIB data, so it is stripped.
            data = output.getvalue()[14:]
        # The clipboard format has to match the data that is written to it.
        send_msg_to_clip(win32clipboard.CF_DIB, data)
    
    
    def send_msg_to_clip(type_data, msg):
        """
        Replace the clipboard contents with msg.

        The clipboard is used in four steps:
        1. open it: OpenClipboard()
        2. empty it so the new data can be written: EmptyClipboard()
        3. write the data: SetClipboardData()
        4. close it: CloseClipboard()

        :param type_data: clipboard format of the data; for unicode text this
            is usually win32con.CF_UNICODETEXT
        :param msg: the data to write to the clipboard
        """
        win32clipboard.OpenClipboard()
        try:
            win32clipboard.EmptyClipboard()
            win32clipboard.SetClipboardData(type_data, msg)
        finally:
            # Always close the clipboard, also when writing fails, so it is
            # never left open by this function.
            win32clipboard.CloseClipboard()
    
    
    def pasteInfo():
        """Send the key events for Ctrl+V followed by Enter."""
        ctrl_key, v_key, enter_key = 17, 86, 13  # virtual-key codes
        press, release = 0, win32con.KEYEVENTF_KEYUP
        # Ctrl and V are pressed, then released in reverse order; Enter is
        # pressed and released afterwards.
        key_events = (
            (ctrl_key, press),
            (v_key, press),
            (v_key, release),
            (ctrl_key, release),
            (enter_key, press),
            (enter_key, release),
        )
        for key_code, event_flags in key_events:
            win32api.keybd_event(key_code, 0, event_flags, 0)
    
    
    def crawl(start_url, req_headers):
        """
        Download every gallery linked from one listing page.

        For each gallery a directory named after its title is created in the
        current working directory. Pages 2 up to and including the last page
        are saved as "<title>/<page>.jpg"; each of those photos is then copied
        to the clipboard and pasted with pasteInfo. Page 1 is saved last as
        "<title>/1.jpg" and is not pasted.

        Failures are reported with print and do not stop the crawl: a failing
        page is skipped, a failing gallery is abandoned in favour of the next.

        :param start_url: URL of the listing page
        :param req_headers: HTTP headers sent with every request
        """
        # Compiled once here instead of once per gallery or page.
        href_re = re.compile(r'<li>.*?<a href="(.*?)" class="TypeBigPics" .*?>.*?</li>', flags=re.DOTALL)
        # Non-greedy groups: with DOTALL a greedy ".*" would run up to the
        # last matching tag of the whole document.
        title_re = re.compile(r'<strong>(.*?)</strong>', flags=re.DOTALL)
        total_re = re.compile(r'<li><a>共(\d*)页: </a></li>', flags=re.DOTALL)
        photo_re = re.compile(r'<p align="center">.*?<a .*?>.*?<img alt=".*?" src="(.*?)" /></a>', flags=re.DOTALL)

        try:
            listing = requests.get(start_url, headers=req_headers).content.decode("utf8")
        except (requests.RequestException, ValueError) as exc:
            print("爬去失败", exc)
            return

        for href in href_re.findall(listing):
            try:
                _crawl_gallery(href, req_headers, title_re, total_re, photo_re)
            except Exception as exc:
                # Best-effort boundary: whatever went wrong in this gallery,
                # report it and continue with the next one.
                print("爬去失败", exc)


    def _crawl_gallery(href, req_headers, title_re, total_re, photo_re):
        """Download all photos of the gallery at href and paste pages 2 and up."""
        content_href = requests.get(href, headers=req_headers).content.decode("utf8")
        title_match = title_re.search(content_href)
        total_match = total_re.search(content_href)
        if title_match is None or total_match is None:
            raise ValueError('title or page count not found: {}'.format(href))
        title = title_match.group(1)
        os.makedirs(title, exist_ok=True)

        # Page k of a gallery is "<name>_k.htm"; only the trailing ".htm" is
        # removed (the dot is escaped so it matches a literal dot).
        stem = re.sub(r'\.htm$', '', href)
        # The upper bound is inclusive so the last page is downloaded too.
        for page in range(2, int(total_match.group(1)) + 1):
            file_path = '{}/{}.{}'.format(title, page, 'jpg')
            try:
                _save_photo('{}_{}.htm'.format(stem, page), file_path, req_headers, photo_re)
            except (requests.RequestException, ValueError, OSError) as exc:
                print("爬去失败", exc)
                continue
            time.sleep(5)
            # Absolute path with forward slashes only.
            paste_img(os.path.abspath(file_path).replace("\\", '/'))
            time.sleep(5)
            pasteInfo()

        # The first page is the gallery URL itself; it is saved but not pasted.
        _save_photo(href, '{}/{}.{}'.format(title, '1', 'jpg'), req_headers, photo_re)


    def _save_photo(page_url, file_path, req_headers, photo_re):
        """Download the photo embedded in the page at page_url to file_path."""
        page_html = requests.get(page_url, headers=req_headers).content.decode("utf8")
        photo_match = photo_re.search(page_html)
        if photo_match is None:
            raise ValueError('no photo found on {}'.format(page_url))
        img_ = requests.get(photo_match.group(1), headers=req_headers)
        with open(file_path, 'wb') as f:
            f.write(img_.content)
    
    
    if __name__ == '__main__':
        # Read the number of listing pages from the index, then crawl each
        # listing page in turn.
        start_url = 'https://www.umei.cc/p/gaoqing/cn/'
        req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
        res = requests.get(start_url, headers=req_headers)
        content = res.content.decode("utf8")
        # The "last page" link carries the number of the final listing page.
        # "\d" and the escaped dot restore the intended pattern.
        count_com = re.compile(r"<li><a href='(\d+)\.htm'>末页</a></li>", flags=re.DOTALL)
        count_match = re.search(count_com, content)
        if count_match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise SystemExit('last-page link not found on ' + start_url)
        count = count_match.group(1)
        for i in range(1, int(count) + 1):
            url = start_url + str(i) + '.htm'
            crawl(url, req_headers)
    故乡明
  • 相关阅读:
    C++成员变量与函数内存分配
    Sqlite ContentProvider Loader 上下文 对话框
    好书好人生--读书的步骤
    小智慧40
    流媒体开发之-直播界面切换电视台频道
    HDU 4617Weapon(两条异面直线的距离)
    BON取代半岛电视,美国人要“换口味”了吗?
    【Todo】Lucene系统学习
    Zookeeper学习 & Paxos
    C++中的虚继承 & 重载隐藏覆盖的讨论
  • 原文地址:https://www.cnblogs.com/luweiweicode/p/14335683.html
Copyright © 2020-2023  润新知