• 简单抓取安居客房产数据,并保存到Oracle数据库


    思路和上一篇差不多,先获取网站html文件,使用BeautifulSoup进行解析,将对应属性取出,逐一处理,最后把整理出的记录保存到oracle中,持久化储存。

    '''
    Created on 2017年2月20日

    @author: Administrator
    '''
    from urllib import parse, request
    from bs4 import BeautifulSoup
    from sqlalchemy import create_engine
    from datetime import *

    import numpy as np
    import pandas as pd
    import time
    import re
    import socket
    import traceback
    import logging

    def get_page(url):
        """Fetch *url* and return the response body decoded as UTF-8.

        Sends browser-like headers (User-Agent / Referer / Host) so the
        listing site does not reject the request as a bot. Undecodable
        bytes are dropped ('ignore') rather than raising.

        :param url: absolute http(s) URL of the listing page
        :return: decoded HTML text of the response
        """
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Referer': r'http://jinan.anjuke.com/sale/b151-m161-o5-p1/',
            'Host': r'jinan.anjuke.com',
            'Connection': 'keep-alive'
        }
        req = request.Request(url, headers=headers)
        # FIX: pass the timeout to urlopen() instead of calling
        # socket.setdefaulttimeout(), which mutates process-global state;
        # use a context manager so the response is always closed.
        with request.urlopen(req, timeout=60) as response:
            return response.read().decode('utf-8', 'ignore')
    if __name__ == '__main__':

        # One log file per day, appended to on every run.
        curDate = date.strftime(date.today(), '%Y%m%d')
        logName = 'Anjuke_%s.log' % curDate
        logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S',
                    filename=logName,
                    filemode='a')

        # BUG FIX: the original patterns were r"(d+.*d+)" -- a literal letter
        # "d", not the digit class \d -- so re.findall never matched anything.
        # Compile once and reuse for area, total price and unit price.
        num_re = re.compile(r"(\d+\.?\d*)")

        url = 'http://jinan.anjuke.com/sale/b151-m161-o5-p1/?from_price=150&to_price=250&from_area=120&to_area=200'
        html = get_page(url)
        soup = BeautifulSoup(html, "lxml")
        table = soup.find_all('li', 'list-item')

        # Collect dicts and build the DataFrame once at the end:
        # DataFrame.append() in a loop is deprecated and quadratic.
        rows = []
        for tr in table:
            # listing title
            str_name = tr.find("div", "house-title").find('a').string.strip()
            # link to the detail page
            str_href = tr.find("a", "houseListTitle")["href"]

            # property attributes, in page order: rooms, area, floor, year
            str_ts = [s.string for s in tr.find("div", "details-item").find_all('span')]
            room = str_ts[0]
            m2 = num_re.findall(str_ts[1])
            floor = str_ts[2]
            year = str_ts[3]

            # address; BUG FIX: strip the actual non-breaking spaces (\xa0),
            # not the literal characters "xa0" as the original pattern did
            str_add = tr.find("span", "comm-address").string.strip()
            str_add = re.sub("\xa0\xa0 ", "", str_add)

            # prices: total (万) and per-square-meter
            str_price = num_re.findall(tr.find("div", "pro-price").find('span', 'price-det').text)
            str_unit_price = num_re.findall(tr.find("div", "pro-price").find('span', 'unit-price').text)

            rows.append({'web': '安居客',
                         'house_name': str_name,
                         'room': room,
                         # findall returns a list; keep the first match (or
                         # None when the page layout changed) as a scalar
                         'm2': m2[0] if m2 else None,
                         'price': str_price[0] if str_price else None,
                         'unit_price': str_unit_price[0] if str_unit_price else None,
                         'floor': floor,
                         'year': year,
                         'address': str_add,
                         'href': str_href})

        df = pd.DataFrame(rows, columns=["address", "floor", "house_name", "href", "m2", "price", "room", "unit_price", "web", "year", "op_time"])
        df["op_time"] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        # Go through float first so values like "120.5" do not crash
        # a direct string->int astype.
        df['m2'] = df['m2'].astype('float').astype('int')
        df['price'] = df['price'].astype('float').astype('int')
        df['unit_price'] = df['unit_price'].astype('float').astype('int')

        ## Persist to Oracle; always release the connection, even when
        ## to_sql fails (the original leaked it on any other exception).
        engine = create_engine('oracle+cx_oracle://user:pass@localhost/orcl')
        cnx = engine.connect()
        try:
            df.to_sql('anju_house', cnx, if_exists='append', index=False)
        except Exception:
            logging.error(traceback.format_exc())
        finally:
            cnx.close()

  • 相关阅读:
    根据大小生成对应尺寸网络图片的网址 狼人:
    如何建立高效的质量保障机制
    全链路压测(8):构建三大模型
    聊聊我对敏捷项目交付的理解
    分享最近做的一个中文 wordle 的游戏《词影》
    面试突击34:如何使用线程池执行定时任务?
    面试突击37:线程安全问题的解决方案有哪些?
    面试突击36:线程安全问题是怎么产生的?
    面试突击35:如何判断线程池已经执行完所有任务了?
    面试突击33:线程池有哪些状态?状态是如何转换的?
  • 原文地址:https://www.cnblogs.com/iHqq/p/6897247.html
Copyright © 2020-2023  润新知