• 爬取北京小猪短租网前2页信息的内容


    参考网址:http://bj.xiaozhu.com/

    需要爬取的信息包括:标题、地址、价格、房东名称、房东性别和房东头像的链接,将数据分别使用TXT、JSON、CSV存储。

    import csv
    import time
    import json
    import requests
    from bs4 import BeautifulSoup
    from requests import RequestException
        
        
    def get_one_page(url):
        """Fetch *url* and return its HTML text, or None on failure.

        None is returned both for non-200 responses and for any
        requests-level exception (connection error, timeout, ...).
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        try:
            response = requests.get(url, headers=headers)
        except RequestException:
            return None
        if response.status_code != 200:
            return None
        return response.text
    
    def get_detailurl(text):
        """Extract the detail-page URLs from one search-result page.

        text: HTML of a listing (search-result) page.
        Returns the 'detailurl' attribute of every
        <div class="result_btm_con lodgeunitname"> element, in page order.
        """
        soup = BeautifulSoup(text, 'lxml')
        cons = soup.find_all(name='div', class_='result_btm_con lodgeunitname')
        # Comprehension replaces the index-based range(len(...)) loop.
        return [con['detailurl'] for con in cons]
        
    def parse_one_page(text):
        """Parse one listing detail page and yield its record.

        text: HTML of a room detail page.
        Yields a single dict with title, address, price, landlord name,
        landlord sex and avatar image URL.
        """
        soup = BeautifulSoup(text, 'lxml')  # lxml parser backend
        title = soup.select('.pho_info > h4 > em')
        address = soup.select('.pho_info > p')
        price = soup.find_all(name='span', class_='detail_avgprice')
        name = soup.find_all(name='a', class_='lorder_name')
        sex = soup.find_all(name='div', class_='member_ico')
        sex1 = soup.find_all(name='div', class_='member_ico1')
        # Gender is encoded by which avatar-badge class is present.
        # The original `'' if len(sex) else ''` returned '' in both
        # branches and never used sex1, so the field was always empty.
        # NOTE(review): assuming member_ico = 男 (male) and
        # member_ico1 = 女 (female) — confirm against the live page.
        if sex:
            ssex = '男'
        elif sex1:
            ssex = '女'
        else:
            ssex = ''
        img = soup.select('.member_pic img')
        yield {
            'title': title[0].string,
            'address': address[0]['title'],
            'price': price[0].string,
            'name': name[0]['title'],
            'sex': ssex,
            'img': img[0]['src']
        }
            
    def write_to_file(content):
        """Append one record to xiaozhu.txt as a JSON line.

        content: dict with the listing fields; written as one JSON
        object per line. ensure_ascii=False keeps Chinese text readable
        instead of \\uXXXX escapes.
        """
        with open('xiaozhu.txt', 'a', encoding='utf-8') as f:
            # The trailing '\n' was garbled into a literal line break in
            # the original source; restore the JSON-lines format.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
    
    def write_to_json(content):
        """Append *content* to xiaozhu.json as one JSON line.

        content: any JSON-serializable object (here: the list of all
        scraped records). ensure_ascii=False keeps Chinese text readable.
        """
        with open('xiaozhu.json', 'a', encoding='utf-8') as f:
            # The trailing '\n' was garbled into a literal line break in
            # the original source; restore it.
            f.write(json.dumps(content, ensure_ascii=False) + '\n')
            
    def write_to_csv(content):
        """Write all records to xiaozhu.csv with a header row.

        content: list of dicts whose keys match *fieldnames*.
        Intended to be called once per run; because the file is opened
        in append mode, a rerun adds another header+rows section.
        """
        fieldnames = ['title', 'address', 'price', 'name', 'sex', 'img']
        # newline='' is required by the csv module to avoid blank lines
        # between rows on Windows.
        with open('xiaozhu.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(content)
    
    if __name__ == '__main__':
        # Crawl the first two search-result pages of the Beijing Xiaozhu
        # short-rent site; save each record to TXT as it arrives, then
        # dump the full collection to CSV and JSON at the end.
        base = 'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'
        records = []
        for page_url in [base.format(page) for page in range(1, 3)]:
            listing_html = get_one_page(page_url)
            for link in get_detailurl(listing_html):
                detail_html = get_one_page(link)
                for record in parse_one_page(detail_html):
                    print(record)
                    write_to_file(record)
                    records.append(record)
        write_to_csv(records)
        write_to_json(records)
  • 相关阅读:
    【Mysql】了解Mysql中的启动参数和系统变量
    【Mysql】初识MySQL
    【分布式搜索引擎】Elasticsearch之开启Elasticsearch的用户名密码验证
    SpringBoot------整合MyBatis
    SpringBoot------自定义拦截器
    SpringBoot------Servlet3.0的注解自定义原生Listener监听器
    SpringBoot------Servlet3.0的注解自定义原生Servlet
    SpringBoot------拦截器Filter的使用
    SpringBoot------如何将项目打成war包
    SpringBoot------Maven Install报错
  • 原文地址:https://www.cnblogs.com/oeong/p/11687056.html
Copyright © 2020-2023  润新知