Today I studied the basics of writing a simple web crawler in Python.
1. Creating a directory and writing a file:
import os

def mkdir(path):
    if os.path.exists(path):  # check whether the directory already exists; makedirs would raise an error otherwise
        print("The path already exists.")
    else:
        os.makedirs(path)  # create the directory
        print("Done.")

def write(path, text):
    with open(path, "w+") as file:  # write the file
        file.write(text)

def main():
    mkdir("test")
    write("test/test.txt", "hello world")

if __name__ == "__main__":
    main()
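As an aside, on Python 3.2+ `os.makedirs` can do the existence check itself via `exist_ok=True`; a minimal sketch of the same create-and-write flow (the helper names here are my own, not from the snippet above):

import os

def mkdir_simple(path):
    os.makedirs(path, exist_ok=True)  # no error if the directory already exists (Python 3.2+)

def write_simple(path, text):
    with open(path, "w", encoding="utf-8") as file:  # "w" truncates; encoding made explicit
        file.write(text)

mkdir_simple("test")
write_simple("test/test.txt", "hello world")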
2. Fetching the source of a site (if it can be reached):
from bs4 import BeautifulSoup
import requests

def main():
    html = requests.get("https://www.baidu.com")  # fetch the page at this URL
    html.encoding = "utf-8"  # Chinese-language site, so switch the character encoding
    soup = BeautifulSoup(html.text, "lxml")  # BeautifulSoup parses the HTML into a tree (it is a parser, not a regular-expression engine)
    print(soup.prettify())  # pretty-print the source (formatting only, no data is removed)

if __name__ == "__main__":
    main()
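`requests` does not raise on HTTP error codes by itself, so it can be worth checking the response before parsing it; a small sketch, reusing the same URL (the 10-second timeout is an arbitrary choice of mine):

import requests

response = requests.get("https://www.baidu.com", timeout=10)  # timeout so a dead site doesn't hang the script
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx responses
response.encoding = "utf-8"
print(response.status_code, len(response.text))  # quick sanity check before parsing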
3. Extracting the elements of particular tags from a site's source (if it can be reached):
import requests
from bs4 import BeautifulSoup

def write_to_file(content):
    with open("save.txt", "a", encoding="utf-8") as f:
        f.write(content)

def get_blog_info(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "lxml")
    print(soup.title)  # all kinds of elements are reachable as attributes of the soup
    print("=" * 100)
    print(type(soup.title))
    print("=" * 100)
    print(type(soup.title.string))
    print("=" * 100)
    print(soup.title.string)
    print("=" * 100)
    print(soup.head)
    print("=" * 100)
    print(soup.p)

def main():
    blog_url = "https://www.cnblogs.com/sgh1023"
    get_blog_info(blog_url)

if __name__ == "__main__":
    main()
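`soup.title` and `soup.p` only return the first matching element; `find_all` and the CSS-selector interface `select` return every match. A hedged sketch against the same blog URL (the selectors are illustrative and depend on the actual page structure):

import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.cnblogs.com/sgh1023")
soup = BeautifulSoup(response.text, "lxml")

for a in soup.find_all("a"):       # every <a> tag on the page
    print(a.get("href"))           # .get returns None instead of raising if the attribute is missing
for node in soup.select("div a"):  # CSS selector: <a> tags nested inside <div> tags
    print(node.text)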
4. Downloading a single image (if it can be reached):
import requests
import os

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:  # the with block closes the file by itself
        file.write(content)
    tot = tot + 1

def download_image(url):  # download one image; not guaranteed to succeed
    print("Now downloading...", tot)
    response = requests.get(url)
    save(response.content)
    print("Done!")

def main():
    download_image("https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png")

if __name__ == "__main__":
    main()
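For large images, `requests` can also stream the body in chunks instead of buffering it all in memory; a minimal sketch (the chunk size and the output filename are arbitrary choices of mine):

import requests

def download_image_streamed(url, filename):
    response = requests.get(url, stream=True)  # defer the body download until it is iterated
    with open(filename, "wb") as file:
        for chunk in response.iter_content(chunk_size=8192):  # write 8 KiB at a time
            file.write(chunk)

download_image_streamed(
    "https://www.baidu.com/img/pc_1c6e30772d5e4103103bd460913332f9.png",
    "baidu_logo.png")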
5. Downloading all the images on a page:
import requests
import urllib.request
import urllib.parse
import os
from bs4 import BeautifulSoup

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".png", "wb+") as file:
        file.write(content)
    tot = tot + 1

######################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to masquerade as Chrome (a snippet copied from elsewhere)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        requests.get(url)
        return True
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  # check whether the URL is reachable first
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")

######################################################################
def process(base, src):  # crude normalization of the src attribute
    if src.startswith("http"):  # already an absolute URL
        return src
    if src.startswith("//"):  # protocol-relative URL
        return "https:" + src
    return urllib.parse.urljoin(base, src)  # root-relative or relative path

def get_image(url):
    soup = BeautifulSoup(get_html_content(url), "lxml")
    items = soup.find_all("img", {"src": True})
    for i in items:
        download_image(process(url, i["src"]))

def main():
    url = "https://www.bilibili.com"
    get_image(url)

if __name__ == "__main__":
    main()
Of course, the arguments to find_all depend on the specific page being scraped; a few common filter variants are sketched below.
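For instance, a sketch of some common `find_all` filters, run on a made-up two-image snippet (the class name `cover` and the `data-src` attribute are hypothetical):

import re
from bs4 import BeautifulSoup

soup = BeautifulSoup('<img src="a.png" class="cover"><img data-src="b.png">', "lxml")

print(soup.find_all("img", {"src": True}))       # only <img> tags that have a src attribute
print(soup.find_all("img", {"data-src": True}))  # lazy-loaded images often use data-src instead
print(soup.find_all("img", class_="cover"))      # filter by class (class_ avoids the Python keyword clash)
print(soup.find_all(src=re.compile(r"\.png$")))  # attribute values can also be matched with a regex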
6. Downloading .jpg files with a regular expression:
import requests
import urllib.request
import os
import re

tot = 0
path = "save"

def mkdir(path):
    if os.path.exists(path):
        return
    os.makedirs(path)

def save(content):
    global tot, path
    mkdir(path)
    with open(path + "/" + str(tot) + ".jpg", "wb+") as file:
        file.write(content)
    tot = tot + 1

################################################################################
def get_html_content(url):  # fetch the page source
    req = urllib.request.Request(url)  # add a header to masquerade as Chrome (a snippet copied from elsewhere)
    req.add_header('user-agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read().decode("utf-8", errors="ignore")  # decode the bytes so the regex runs on real text
    return html

def url_exist(url):  # check whether the URL is reachable
    try:
        requests.get(url)
        return True
    except requests.RequestException:
        return False

def download_image(url):  # download one image
    print("Now downloading...", tot, url)
    if url_exist(url):  # check whether the URL is reachable first
        response = requests.get(url)
        save(response.content)
        print("Done!")
    else:
        print("Unavailable!")

################################################################################
def get_image(url):
    html = get_html_content(url)
    items = re.findall(r'https://(.*?)\.jpg', html, re.S)  # regex: capture everything between https:// and .jpg
    for name in items:
        download_image("https://" + name + ".jpg")
        if tot == 100:
            print("Too many images! Stop what you are doing!")
            break

def main():
    url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%D1%BC%D7%D3"
    get_image(url)

if __name__ == "__main__":
    main()
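A caveat on the pattern above: `https://(.*?)\.jpg` can match across unrelated text and misses protocol-relative URLs. A small sketch of a slightly tighter pattern, run against a made-up snippet:

import re

html = '<img src="https://example.com/a.jpg"><img src="//example.com/b.jpg">'
# [^"'\s]+? stops at quotes and whitespace, so one match cannot span two attributes
pattern = re.compile(r"(?:https?:)?//[^\"'\s]+?\.jpg")
for match in pattern.findall(html):
    print(match)  # prints https://example.com/a.jpg and //example.com/b.jpg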