• Python web scraping with BeautifulSoup

The script below fetches product pages from mdl.com with requests, pulls the title, introduce, effect, and crowd fields out of each page with BeautifulSoup CSS selectors, and appends one record per product to a local text file. The product sub-URLs are collected from two listing pages saved locally as HTML.


    # -*- coding: UTF-8 -*-
    import re
    from bs4 import BeautifulSoup
    import requests
    import codecs
    import sys
    reload(sys)
    sys.setdefaultencoding('utf8')  # Python 2 only: make UTF-8 the default string encoding
    
    # Fetch the product listing page and return it as a BeautifulSoup object
    def mei_url():
        url = 'http://mdl.com/product'
        web_data = requests.get(url)
        web_data.encoding = 'utf-8'
        soup = BeautifulSoup(web_data.text, 'lxml')
        return soup
        
    # Fetch a single product page, extract the title, introduce, effect and crowd
    # fields with CSS selectors, and append one record to a local text file
    def mei_info(sub_url='/product/item/293410'):
        url = 'http://mdl.com'+sub_url
        web_data = requests.get(url)
        web_data.encoding = 'utf-8'
        soup = BeautifulSoup(web_data.text, 'lxml')
        title=soup.select('#main > div.boundary > div > div.container__main > div.section.section-info.clearfix > h2')[0].get_text()
        introduce=soup.select('#main > div.boundary > div > div.container__main > div.section.section-intro.clearfix > div > div.section-intro__item__body.rich-text')[0].get_text()
        effect=soup.select('#main > div.boundary > div > div.container__main > div.section.section-intro.clearfix > div > div.section-intro__item__body.rich-text > span')[0].get_text()
        crowd=soup.select('#main > div.boundary > div > div.container__main > div.section.section-intro.clearfix > div > div.section-intro__item__body.rich-text')[2].get_text()
        print title
        # Append one record: fields joined by '&', then a newline and a '$' marker
        with codecs.open(r'E:\notemei_infov3.txt', "a+", 'utf8') as file:
            file.write('&'.join(map(lambda x:str(x),[title,introduce,effect,crowd])))
            file.write('\n')
            file.write('$')
    if __name__=='__main__':
        
        # items=mei_url()
        # items=str(items)
        # Collect product sub-URLs from two locally saved listing pages
        soup1 = BeautifulSoup(open(r'E:\notemei.htm'),'lxml')
        items1=str(soup1)
        url_list1=re.findall(r'/product/item/\d{6}',items1)
        soup2 = BeautifulSoup(open(r'E:\notemei2.htm'),'lxml')
        items2=str(soup2)
        url_list2=re.findall(r'/product/item/\d{6}',items2)
        url_list3=url_list1+url_list2
        print len(url_list3)
        for sub_url in url_list3:
            mei_info(sub_url)
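
Each field above is pulled with soup.select(...)[i].get_text(), which raises an IndexError as soon as a product page lacks one of those sections. A small helper makes that failure mode explicit; this is a minimal sketch that reuses the selectors above, and select_text is a hypothetical name rather than part of the original script:

    def select_text(soup, css, index=0, default=''):
        # Return the text of the index-th element matching a CSS selector,
        # or a default value when the page has fewer matches, so one missing
        # section does not abort the whole crawl with an IndexError.
        matches = soup.select(css)
        return matches[index].get_text() if len(matches) > index else default

    # e.g. title = select_text(soup, '#main > div.boundary > div > div.container__main > div.section.section-info.clearfix > h2')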
    
    
        
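The __main__ block turns each locally saved listing page back into a string and regexes it for product links. BeautifulSoup can also filter on the href attribute directly, which skips the str(soup) round trip; the sketch below reuses the file paths and link pattern from the script above (collect_product_links is a hypothetical helper name, and the paths should be adapted to your own machine):

    import re
    from bs4 import BeautifulSoup

    def collect_product_links(html_path):
        # Parse a saved listing page and return every /product/item/NNNNNN
        # sub-URL by matching the href attribute against a compiled regex.
        with open(html_path) as f:
            soup = BeautifulSoup(f.read(), 'lxml')
        pattern = re.compile(r'/product/item/\d{6}')
        return [pattern.search(a['href']).group() for a in soup.find_all('a', href=pattern)]

    url_list = collect_product_links(r'E:\notemei.htm') + collect_product_links(r'E:\notemei2.htm')
    print(len(url_list))
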
  • Original article: https://www.cnblogs.com/wangbin2188/p/6555137.html