Example: scraping image albums with requests + PyQuery
import os
import re  # regular expressions for parsing the scraped data
import ssl

import requests
from requests.packages import urllib3
from pyquery import PyQuery as pq

os.chdir(r"E:/pics22223/")


def get_url1(url):
    # Accept unverified HTTPS certificates and silence the resulting warnings
    ssl._create_default_https_context = ssl._create_unverified_context
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    urllib3.disable_warnings()
    response = requests.get(url, headers=headers, verify=False)
    response.encoding = response.apparent_encoding
    html = response.text
    doc = pq(html)
    a = doc('.TypeList .TypeBigPics')  # one link per album on the list page
    for item in a.items():
        lists = []
        b = item.attr('href')  # first page of the album
        lists.append(b)
        response2 = requests.get(b, headers=headers, verify=False)
        response2.encoding = response2.apparent_encoding
        html2 = response2.text
        doc2 = pq(html2)
        # The first pager item reads "共N页" (N pages in total)
        text = doc2('body > div.wrap > div.NewPages > ul > li:nth-child(1) > a').text()
        group = re.search(r'\d+', text).group()
        for item2 in range(2, int(group) + 1):  # pages 2..N; page 1 is the album URL itself
            htm_ = b.replace(".htm", '') + '_' + str(item2) + '.htm'
            lists.append(htm_)
        # Keep only the CJK characters of the album title for the directory name
        title = re.search(r'[\u4e00-\u9fa5]+', doc2('body > div.wrap > div.ArticleTitle > strong').text()).group()
        for element in lists:
            response3 = requests.get(element, headers=headers, verify=False)
            response3.encoding = response3.apparent_encoding
            html3 = response3.text
            doc3 = pq(html3)
            attr = doc3('#ArticleId0 > p > a > img').attr('src')
            # Directory name plus the last slash-separated segment of the URL,
            # so the image keeps its original file name locally
            path = str(title) + '/' + attr.split('/')[-1]
            try:
                if not os.path.exists(str(title)):  # create the album directory on first use
                    os.mkdir(str(title))
                if not os.path.exists(path):  # skip images that were already downloaded
                    r = requests.get(attr)
                    with open(path, 'wb') as f:
                        f.write(r.content)
                    print("File saved")
                else:
                    print("File already exists")
            except:
                print("Download failed")


if __name__ == '__main__':
    url = 'https://www.umei.cc/p/gaoqing/cn/'
    for i in range(21, 26):  # list pages 21..25
        url1 = url + str(i) + '.htm'
        get_url1(url1)
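The pagination scheme both examples rely on is easy to misread inside the loop above, so here it is in isolation: page 1 of an album lives at <id>.htm, and pages 2..N at <id>_2.htm .. <id>_N.htm. A minimal sketch; the album URL and page count are made-up values for illustration:

def page_urls(first_page, total_pages):
    """Yield every page URL of an album whose first page is '<id>.htm'."""
    yield first_page
    stem = first_page.replace('.htm', '')
    for n in range(2, total_pages + 1):
        yield stem + '_' + str(n) + '.htm'

# Hypothetical album with 4 pages:
print(list(page_urls('https://www.umei.cc/p/gaoqing/cn/12345.htm', 4)))
# ['.../12345.htm', '.../12345_2.htm', '.../12345_3.htm', '.../12345_4.htm']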
Example: scrape --> clipboard --> send
# -*- coding:utf-8 -*-
import os
import re
import time
from io import BytesIO

import requests
import win32api
import win32clipboard
import win32con
from PIL import Image

os.chdir(r"E:/ntmssFile/umei/")


def paste_img(file_img):
    """
    Convert an image to a binary string and write it to the clipboard as a bitmap.
    The idea: open the image with the Image module and use BytesIO to hold the
    converted binary data.
    :param file_img: path to the image
    """
    # Load the image; after Image.open() the image object is in RGB mode
    image = Image.open(file_img)
    # Byte buffer that receives the converted image
    output = BytesIO()
    # Store it in BMP (bitmap) format
    image.save(output, 'BMP')
    # BMP data starts with a 14-byte file header that must be stripped
    data = output.getvalue()[14:]
    output.close()
    # DIB: device-independent bitmap, as the name suggests.
    # BMP images sometimes also use .DIB and .RLE as extensions.
    # The clipboard only accepts data whose format matches the declared type.
    send_msg_to_clip(win32clipboard.CF_DIB, data)


def send_msg_to_clip(type_data, msg):
    """
    Working with the clipboard takes four steps:
    1. Open it: OpenClipboard()
    2. Empty it so the new data can be written: EmptyClipboard()
    3. Write the data: SetClipboardData()
    4. Close it: CloseClipboard()
    :param type_data: data format; for unicode text usually win32con.CF_UNICODETEXT
    :param msg: the data to write to the clipboard
    """
    win32clipboard.OpenClipboard()
    win32clipboard.EmptyClipboard()
    win32clipboard.SetClipboardData(type_data, msg)
    win32clipboard.CloseClipboard()


def pasteInfo():
    # Simulate Ctrl+V followed by Enter in the focused window
    win32api.keybd_event(17, 0, 0, 0)  # 17 is the virtual-key code of Ctrl
    win32api.keybd_event(86, 0, 0, 0)  # 86 is the virtual-key code of V
    win32api.keybd_event(86, 0, win32con.KEYEVENTF_KEYUP, 0)  # release V
    win32api.keybd_event(17, 0, win32con.KEYEVENTF_KEYUP, 0)  # release Ctrl
    win32api.keybd_event(13, 0, 0, 0)  # 13 is Enter
    win32api.keybd_event(13, 0, win32con.KEYEVENTF_KEYUP, 0)  # release Enter


def crawl(start_url, req_headers):
    try:
        res = requests.get(start_url, headers=req_headers)
        content = res.content.decode("utf8")
        pattern_href = re.compile(r'<li>.*?<a href="(.*?)" class="TypeBigPics" .*?>.*?</li>', flags=re.DOTALL)
        hrefs = re.findall(pattern_href, content)
        for href in hrefs:
            res = requests.get(href, headers=req_headers)
            content_href = res.content.decode("utf8")
            pattern_title = re.compile(r'<strong>(.*)</strong>', flags=re.DOTALL)
            title = re.search(pattern_title, content_href).group(1)
            if not os.path.exists(title):
                os.makedirs(title)
            # The pager item reads "共N页" (N pages in total)
            total_compile = re.compile(r'<li><a>共(\d*)页: </a></li>', flags=re.DOTALL)
            total_page = re.search(total_compile, content_href).group(1)
            for page in range(2, int(total_page) + 1):  # pages 2..N; page 1 is handled below
                sub = re.sub(r'\.htm', '', href)
                url_page = sub + '_' + str(page) + '.htm'
                res_ = requests.get(url_page)
                content_ = res_.content.decode("utf8")
                photo_compile = re.compile(r'<p align="center">.*?<a .*?>.*?<img alt=".*" src="(.*?)" /></a>', flags=re.DOTALL)
                photo_url = re.search(photo_compile, content_).group(1)
                img_ = requests.get(photo_url)
                file_path = '{}/{}.{}'.format(title, page, 'jpg')
                with open(file_path, 'wb') as f:
                    f.write(img_.content)
                curryDir = os.getcwd()
                time.sleep(5)
                absPath = curryDir + '/' + file_path
                replace = absPath.replace('\\', '/')
                paste_img(replace)  # put the image on the clipboard
                time.sleep(5)
                pasteInfo()  # Ctrl+V + Enter into the focused window
            # Page 1 of the album (the original href) is downloaded last
            res_1 = requests.get(href)
            content_ = res_1.content.decode("utf8")
            photo_compile = re.compile(r'<p align="center">.*?<a .*?>.*?<img alt=".*" src="(.*?)" /></a>', flags=re.DOTALL)
            photo_url = re.search(photo_compile, content_).group(1)
            img_ = requests.get(photo_url)
            file_path = '{}/{}.{}'.format(title, '1', 'jpg')
            with open(file_path, 'wb') as f:
                f.write(img_.content)
    except:
        print("Crawl failed")


if __name__ == '__main__':
    start_url = 'https://www.umei.cc/p/gaoqing/cn/'
    req_headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    res = requests.get(start_url, headers=req_headers)
    content = res.content.decode("utf8")
    # The pager's "末页" (last page) link reveals how many list pages exist
    count_com = re.compile(r"<li><a href='(\d+)\.htm'>末页</a></li>", flags=re.DOTALL)
    count = re.search(count_com, content).group(1)
    for i in range(1, int(count) + 1):
        url = start_url + str(i) + '.htm'
        crawl(url, req_headers)
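The four-step OpenClipboard/EmptyClipboard/SetClipboardData/CloseClipboard pattern documented in send_msg_to_clip works the same way for plain text. A minimal round-trip sketch, assuming a Windows machine with pywin32 installed; set_clipboard_text and get_clipboard_text are illustrative helpers, not part of the script above:

import win32clipboard
import win32con

def set_clipboard_text(text):
    win32clipboard.OpenClipboard()   # 1. open the clipboard
    win32clipboard.EmptyClipboard()  # 2. empty it so new data can be written
    win32clipboard.SetClipboardData(win32con.CF_UNICODETEXT, text)  # 3. write
    win32clipboard.CloseClipboard()  # 4. close it

def get_clipboard_text():
    win32clipboard.OpenClipboard()
    try:
        return win32clipboard.GetClipboardData(win32con.CF_UNICODETEXT)
    finally:
        win32clipboard.CloseClipboard()  # close even if the read fails

set_clipboard_text('hello from the clipboard')
print(get_clipboard_text())  # -> hello from the clipboard

The same holds for images: paste_img differs only in passing CF_DIB and the header-stripped BMP bytes instead of CF_UNICODETEXT and a string.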