Data Collection Technology: Assignment 2


    No.1——WeatherForecast

    Requirements

    Scrape the 7-day weather forecasts for a given set of cities from the China Weather site (http://www.weather.com.cn) and save them to a database.

    Code

    from bs4 import BeautifulSoup
    from bs4 import UnicodeDammit
    import urllib.request
    import sqlite3
    class weatherDB:
        def openDB(self):
            self.con = sqlite3.connect("weathers.db")
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute(
                    "create table weathers (wcity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key(wCity,wDate))")
            except Exception:
                # table already exists: clear old rows so reruns start fresh
                self.cursor.execute("delete from weathers")
        def closeDB(self):
            self.con.commit()
            self.con.close()
        def insert(self, city, date, weather, temp):
            try:
                self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values(?,?,?,?)",(city, date, weather, temp))
            except Exception as err:
                print(err)
        def show(self):
            self.cursor.execute("select * from weathers")
            rows = self.cursor.fetchall()
            print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
            for row in rows:
                print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))
    class weatherforecast():
        def __init__(self):
            self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows;U;Windows NT 6.0 x64;en-US;rv:1.9pre) Gecko/200807242 Minefield/3.0.2pre"}
            self.citycode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}
        def forecastcity(self, city):
            if city not in self.citycode.keys():
                print(city + " code not found")
                return
            url = "http://www.weather.com.cn/weather/" + self.citycode[city] + ".shtml"
            try:
                req = urllib.request.Request(url, headers=self.headers)
                data = urllib.request.urlopen(req)
                data = data.read()
                dammit = UnicodeDammit(data, ["utf-8", "gbk"])
                data = dammit.unicode_markup
                soup = BeautifulSoup(data, 'lxml')
                lis = soup.select("ul[class='t clearfix'] li")
                for li in lis:
                    try:
                        date = li.select('h1')[0].text
                        weather = li.select('p[class="wea"]')[0].text
                        temp= li.select('p[class="tem"] span')[0].text + "/" + li.select("p[class='tem'] i")[0].text
                        #print(city,date,weather,temp)
                        self.db.insert(city,date,weather,temp)
                    except Exception:
                        # the first day's entry can lack the high-temperature span; skip it
                        pass
            except Exception as err:
                print(err)
        def process(self, cities):
            self.db = weatherDB()
            self.db.openDB()
            for city in cities:
                self.forecastcity(city)
            self.db.show()
            self.db.closeDB()
    ws = weatherforecast()
    ws.process(["北京", "上海", "广州", "深圳"])

    Output

    Reflections

    Reproducing this weather-forecast crawler reinforced how to work with the database and reviewed fetching HTML and selecting elements with select. Even though it was largely a faithful reproduction, I still feel I picked up a few things.
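
    One detail worth a second look: insert above silently swallows the duplicate-key error that the (wCity, wDate) primary key raises when the same forecast is written twice. A minimal alternative sketch, assuming the same weathers schema, lets SQLite resolve the conflict itself with INSERT OR REPLACE, so a rerun refreshes stale rows instead of relying on the error being ignored:

    import sqlite3

    con = sqlite3.connect("weathers.db")
    cursor = con.cursor()
    cursor.execute(
        "create table if not exists weathers (wCity varchar(16),wDate varchar(16),"
        "wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key(wCity,wDate))")
    # INSERT OR REPLACE overwrites the row whose (wCity, wDate) key already exists,
    # so re-running the crawler updates old forecasts in place
    cursor.execute(
        "insert or replace into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
        ("北京", "7日(今天)", "晴", "26℃/15℃"))  # sample row for illustration only
    con.commit()
    con.close()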

    No.2——Stock Scraping

    Requirements

    Use the requests and BeautifulSoup libraries to scrape stock information. (The code below performs the fetch with urllib.request; a requests-based sketch follows after the code.)

    Candidate site: Eastmoney, https://www.eastmoney.com/

    Code

    
    import urllib.request
    import re
    from bs4 import UnicodeDammit, BeautifulSoup
    import prettytable as pt
    tb = pt.PrettyTable(["序号", "代码", "名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅"])  # build the display table
    def getHtml(page,fs,fields):
        # send a browser User-Agent to avoid being blocked
        headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"}
        # fetch the raw document
        url = ('http://13.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124001030397983465936_1601816496595&pn=' + str(page) + '&pz=20&po=1&np=1'
               '&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=' + fs +
               '&fields=' + fields + '&_=1601816496603')
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        # parse the response
        soup = BeautifulSoup(data, 'lxml')
        # pull the "diff" array out of the JSONP response with a regex
        data = re.findall(r'"diff":\[(.*?)\]', soup.text)
        return data
    def getOnePageStock(count,page,fs,fields):
        data=getHtml(page,fs,fields)
        datas=data[0].split("},{")  # split into one string per stock
        datas[0]=datas[0].replace("{","")  # strip the leading {
        datas[len(datas)-1]=datas[len(datas)-1].replace("}","")  # strip the trailing }
        for i in range(len(datas)):
            # re.split with a character class splits on ":" and "," in one pass
            stock=re.split('[:,]',datas[i].replace('"',""))
            # odd indices hold the field values (even indices hold the names f2,f3,...)
            tb.add_row([count,stock[13],stock[15],stock[1],stock[3],stock[5],stock[7],stock[9],stock[11]])
            count=count+1  # advance the running row number
        return count
    def main():
        count=1
        page=1
        fields = "f12,f14,f2,f3,f4,f5,f6,f7"#(f12:代码,f14:名称,f2:最新价,f3:涨跌幅,f4:涨跌额,f5:成交量,f6:成交额,f7:涨幅)
        fs = {
            "沪深A股": "m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23",
            "上证A股": "m:1+t:2,m:1+t:23",
            "深证A股": "m:0+t:6,m:0+t:13,m:0+t:80",
        }
        for i in fs.keys():
            count = getOnePageStock(count,page,fs[i],fields)
        print(tb)
    main()
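
    The requirement mentions the requests library, while the code above fetches with urllib.request. For comparison, here is a minimal requests-based sketch. The parameter values are copied from the URL built in getHtml; it also assumes (unverified) that omitting the cb callback parameter makes the endpoint return plain JSON rather than JSONP:

    import requests

    # query parameters lifted from the URL in getHtml above; "cb" is omitted
    # on the assumption that the endpoint then returns plain JSON
    params = {
        "pn": 1, "pz": 20, "po": 1, "np": 1,
        "ut": "bd1d9ddb04089700cf9c27f6f7426281",
        "fltt": 2, "invt": 2, "fid": "f3",
        "fs": "m:1+t:2,m:1+t:23",  # same value as fs["上证A股"] above
        "fields": "f12,f14,f2,f3,f4,f5,f6,f7",
    }
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get("http://13.push2.eastmoney.com/api/qt/clist/get",
                        params=params, headers=headers)
    payload = resp.json()
    for row in payload["data"]["diff"]:  # each row is a dict keyed by field code
        print(row["f12"], row["f14"], row["f2"])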
    

    Output

    Reflections

    Following the reference tutorial to track down the right JS file really takes patience, and comparing the JSON against the site's data to map out the field codes was hard on the eyes. Still, it took me from zero to one, and I got quite a lot out of it.
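
    Since the payload inside the JSONP wrapper is ordinary JSON, the split/replace string surgery above can also be avoided by cutting out the body and handing it to the json module. A minimal sketch; the resp_text literal below is an illustrative stand-in for the raw response, not real data:

    import json

    # stand-in for the raw JSONP text returned by the API
    resp_text = ('jQuery1124001030397983465936_1601816496595('
                 '{"data":{"diff":[{"f2":10.5,"f12":"000001","f14":"平安银行"}]}});')

    # strip the callback wrapper: keep everything between the first "(" and the last ")"
    body = resp_text[resp_text.index("(") + 1 : resp_text.rindex(")")]
    payload = json.loads(body)

    for row in payload["data"]["diff"]:
        # each row is now a dict keyed by field code, no index arithmetic needed
        print(row["f12"], row["f14"], row["f2"])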

    No.3——Scraping a Stock by a Custom Code

    Requirements

    Select a stock whose code is 3 digits of my own choosing plus the last 3 digits of my student ID, and print the stock's information to the screen.

    Code

    
    import urllib.request
    import re
    from bs4 import UnicodeDammit, BeautifulSoup
    import prettytable as pt
    tb = pt.PrettyTable(["股票代码号", "名称", "今日开", "今日最高", "今日最低"])
    def getHtml(number,fields):
        # send a browser User-Agent to avoid being blocked
        headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400"}
        # fetch the raw document
        url = ('http://push2.eastmoney.com/api/qt/stock/get?ut=fa5fd1943c7b386f172d6893dbfba10b&invt=2&fltt=2&'
               'fields=' + fields + '&secid=0.' + str(number) + '&cb=jQuery1124012344986700569804_1601825123071&_=1601825123079')
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        # parse the response
        soup = BeautifulSoup(data, 'lxml')
        # grab the {"f...} object out of the JSONP response with a regex
        data = re.findall(r'{"f.*?}', soup.text)
        return data
    def getOnePageStock(number,fields):
        data=getHtml(number,fields)
        datas=data[0].split("},{")  # a single-stock query returns just one object
        datas[0]=datas[0].replace("{","")  # strip the leading {
        datas[0]=datas[0].replace("}","")  # strip the trailing }
        for i in range(len(datas)):
            # re.split with a character class splits on ":" and "," in one pass
            stock=re.split('[:,]',datas[i].replace('"',""))
            # odd indices hold the values: f44=high, f45=low, f46=open, f57=code, f58=name
            tb.add_row([stock[7],stock[9],stock[5],stock[1],stock[3]])
    def main():
        number=300140  # the stock code to query (3 chosen digits + last 3 of my student ID)
        fields = "f44,f45,f46,f57,f58"  # f44: today's high, f45: today's low, f46: today's open, f57: stock code, f58: stock name
        try:
            getOnePageStock(number,fields)
            print(tb)
        except Exception:
            print("目标不存在")  # target not found
    main()

    Output

    Reflections

    This only needed small changes on top of Task 2; the fs dictionary used there to pick markets can simply be dropped. Finding the dynamic JS and the field codes works the same way as in the previous task. There is still room for optimization: some of the URL parameters clearly influence the query results, but I could not work out exactly how (the sketch below records one guess about secid).
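
    For the record, here is a sketch of one such guess about the secid parameter. It rests on an assumption, not on anything the tutorial confirms: judging by the m:0 / m:1 market ids in Task 2's fs strings, the "0." prefix in secid=0.<code> appears to address a Shenzhen-listed code and "1." a Shanghai-listed one:

    # assumption: in "secid", prefix "0." = Shenzhen-listed and "1." = Shanghai-listed,
    # mirroring the m:0 / m:1 market ids seen in the fs strings of Task 2
    def make_secid(code):
        # Shanghai A-share codes start with 6; everything else is treated as Shenzhen here
        market = "1" if code.startswith("6") else "0"
        return market + "." + code

    print(make_secid("300140"))  # 0.300140 (Shenzhen ChiNext)
    print(make_secid("600519"))  # 1.600519 (Shanghai)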
