• 最基础网页爬虫


    第一个网页文本爬虫程序(没有添加下载器):

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import os
     4 
     5 headers={'User-Agent':"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTMl,like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
     6 url_begin= 'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
     7 start_url=requests.get(url_begin,headers=headers)
     8 #print(start_url.text)
     9 Soup=BeautifulSoup(start_url.text,'lxml')
    10 link_list=Soup.find('div',class_="x-sidebar-left-content").find_all('a')
    11 #print(link_list)
    12 
    13 for link in link_list:
    14         url='http://www.liaoxuefeng.com' + link['href']
    15         html=requests.get(url,headers=headers)
    16         html_Soup=BeautifulSoup(html.text,'lxml')
    17         title_list = html_Soup.find('div', class_="x-content").find_all('h4')
    18         # print(title_list)
    19         for title in title_list:
    20             titlereal = title.get_text()
    21             print(titlereal)
    22 
    23         content_list = html_Soup.find("div", class_="x-wiki-content").find_all('p')
    24         for content in content_list:
    25             # print(content)
    26             contentreal = content.get_text()
    27             print(contentreal)

    第二个网页图片爬虫(引入os模块,可以将网页内容爬取到本地文件夹)

     1 import requests
     2 from bs4 import BeautifulSoup
     3 import os
     4 import urllib
     5 import urllib3
     6 
     7 url= 'http://www.dbmeinv.com/?pager_oofset=1'
     8 x=0
     9 
    10 def crawl(url):
    11     headers = {
    12         'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTMl,like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    13     req=requests.get(url,headers=headers)
    14 
    15     Soup=BeautifulSoup(req.text,'lxml')
    16     link_list=Soup.find_all('img')
    17     for girl in link_list:
    18         link= girl.get('src')
    19         print(link)
    20 
    21         global x
    22         path=r'/Users/wangxitao/Desktop/douban'
    23         local=os.path.join(path,'image\%s.jpg'%x)
    24         urllib.request.urlretrieve(link,local)
    25                                          #'image\%s.jpg'%x
    26 
    27         x+=1
    28         print("正在下载第%s张"%x)
    29 
    # Crawl result pages 2 through 10.  (The original iterated range(1, 10)
    # and bumped `page` by one at the top of the loop body, so page 1 was
    # skipped — that behavior is preserved here, just written directly.)
    for page_no in range(2, 11):
        crawl('http://www.dbmeinv.com/?pager_offset=%d' % page_no)

    print('爬取完毕')
  • 相关阅读:
    2017沈阳站 Tree
    P2146 [NOI2015]软件包管理器
    hdu3307 欧拉函数
    poj 3126 Prime Path bfs
    CSL的字符串
    P1045 麦森数
    洛谷 P1338 末日的传说
    树链剖分
    SQL[Err] ORA-00933: SQL command not properly ended
    Postman 快速入门之脚本
  • 原文地址:https://www.cnblogs.com/jidongdeatao/p/6943286.html
Copyright © 2020-2023  润新知