• A quick note: writing a simple crawler


    Goal: scrape the list of upcoming shows from damai.cn.

    #!/usr/bin/python
    # -*- coding: utf-8 -*-

    import requests
    from bs4 import BeautifulSoup

    DOWNLOAD_URL = "http://www.damai.cn/bj/"

    # Fetch the raw HTML of the given page, sending a browser-like User-Agent
    def download_page(url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/51.0.2704.63 Safari/537.36'}
        data = requests.get(url, headers=headers).content
        return data

    # Parse the HTML and pull out title, price, time and place for every show
    def x_page(html):
        soup = BeautifulSoup(html, 'html.parser')
        li_lst = soup.find('div', class_='index-con').next_sibling.next_sibling.find_all('li')
        titles = [i.find('dt').find('a').string for i in li_lst]
        prices = [i.find('p', class_='price').find('strong').text
                  if i.find('p', class_='price').find('strong') is not None else 'none'
                  for i in li_lst]
        times = [i.find('p', class_='time').string for i in li_lst]
        places = [i.find('p', class_='place').string for i in li_lst]
        return titles, prices, times, places

    if __name__ == '__main__':
        html = download_page(DOWNLOAD_URL)
        titles, prices, times, places = x_page(html)
        info_lst = zip(titles, prices, times, places)
        # Write the results to a text file, one show per line
        with open('damai.txt', 'w+') as f:
            for j in info_lst:
                f.write(' '.join(j) + '\n')
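    The parsing in x_page() is tied to the page layout damai.cn used at the time: the chained next_sibling.next_sibling hops over a whitespace text node to reach the list that follows div.index-con, and any li that is missing one of the expected tags raises AttributeError. A slightly more defensive variant is sketched below; parse_shows is a name introduced here, and the class names ('index-con', 'price', 'time', 'place') are simply taken from the code above, assuming the markup has not changed.

        # Hypothetical, more defensive variant of x_page():
        # find_next_sibling() skips the whitespace text node that the two
        # chained next_sibling calls had to step over, and missing
        # sub-elements no longer raise AttributeError.
        from bs4 import BeautifulSoup

        def parse_shows(html):
            soup = BeautifulSoup(html, 'html.parser')
            container = soup.find('div', class_='index-con').find_next_sibling()
            shows = []
            for li in container.find_all('li'):
                title = li.select_one('dt a')
                if title is None:   # skip list items without the expected structure
                    continue
                price = li.select_one('p.price strong')
                when = li.select_one('p.time')
                where = li.select_one('p.place')
                shows.append((
                    title.get_text(strip=True),
                    price.get_text(strip=True) if price else 'none',
                    when.get_text(strip=True) if when else '',
                    where.get_text(strip=True) if where else '',
                ))
            return shows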

     

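    One practical caveat not covered in the original post: the titles and venues are Chinese text, so under Python 3 on a system whose default locale encoding is not UTF-8 the final write may raise UnicodeEncodeError. Passing the encoding explicitly avoids this; a minimal sketch of the write loop, reusing the names from the script above:

        # Write the scraped rows as UTF-8 regardless of the platform default
        # (Python 3 only; Python 2's built-in open() takes no encoding argument).
        with open('damai.txt', 'w', encoding='utf-8') as f:
            for row in zip(titles, prices, times, places):
                f.write(' '.join(row) + '\n')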
  • Original post: https://www.cnblogs.com/fuzzier/p/5929453.html