• Python3 | 爬取百度地图信息的代码,我更改了城市、关键词、页码等,完成了获取有关“筛网”店铺的信息。


    参考一篇爬取百度地图信息的代码,我更改了城市、关键词、页码等,完成了获取有关“筛网”店铺的信息。

    代码如下:

    import requests
    import re
    import csv
    import time
    
    
    def BusinessFromBaiduDitu(citycode='287', key_word='筛网', pageno=0):
        """Fetch one page of Baidu Maps search results and save mobile numbers.

        Sends a search request for ``key_word`` in the city given by
        ``citycode``, cuts the response text into per-shop segments with regular
        expressions and, for every phone number that starts with ``1``, prints
        the record and appends ``(name, address, number)`` to the module-level
        ``writer`` (a ``csv.writer`` that must exist before this is called).

        Args:
            citycode: City code, sent as the ``c`` query parameter.
            key_word: Search keyword, sent as the ``wd`` query parameter.
            pageno: Page number, sent as ``pn``; ``nn`` is derived as
                ``pageno * 10``.

        Returns:
            None. A failed request is reported on stdout and the page is skipped.
        """
        parameter = {
            "newmap": "1",
            "reqflag": "pcmap",
            "biz": "1",
            "from": "webmap",
            "da_par": "direct",
            "pcevaname": "pc4.1",
            "qt": "con",
            "c": citycode,        # city code
            "wd": key_word,       # search keyword
            "wd2": "",
            "pn": pageno,         # page number
            "nn": pageno * 10,
            "db": "0",
            "sug": "0",
            "addr": "0",
            "da_src": "pcmappg.poi.page",
            "on_gel": "1",
            "src": "7",
            "gr": "3",
            "l": "12",
            "tn": "B_NORMAL_MAP",
            # "u_loc": "12621219.536556,2630747.285024",
            "ie": "utf-8",
            # "b": "(11845157.18,3047692.2;11922085.18,3073932.2)",  # presumably map coordinates; can be omitted
            "t": "1468896652886",
        }

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36(KHTML, like Gecko) Chrome/56.0.2924.87Safari/537.36'}

        url = 'http://map.baidu.com/'
        try:
            # A timeout keeps one stalled connection from hanging a long crawl.
            response = requests.get(url, params=parameter, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print(f'{citycode}  {key_word}  {pageno}  request failed: {exc}')
            return

        # Decode the backslash escapes contained in the response body.
        text = response.text.encode('latin-1').decode('unicode_escape')

        # One segment per shop: from just after 'address_norm":"[' up to '"ty":'.
        segments = re.findall(r'(?<=address_norm":"\[).+?(?="ty":)', text)

        for segment in segments:
            names = re.findall(r'(?<=,"name":").+?(?=")', segment)
            # Each segment starts with the address text, which runs up to the
            # first double quote.
            leading = re.findall(r'.+?(?=")', segment)
            phones = re.findall(r'(?<="phone":").+?(?=")', segment)
            if not (names and leading and phones):
                continue

            # Replace the '(...[' and '(...]' runs inside the address with a space.
            address = re.sub(r'\(.+?\[', ' ', leading[0])
            address = re.sub(r'\(.+?\]', ' ', address)

            # An empty phone field makes the pattern capture '",' instead of a number.
            if phones[0] == '",':
                continue

            for number in phones[0].split(','):
                # Only numbers beginning with '1' are kept.
                if number.startswith('1'):
                    print(f'{citycode}{names[0]},{address},{number}')
                    writer.writerow((names[0], address, number))
        print(f'{citycode}  {key_word}  {pageno}')
    
    

    现在开始写我搜“丝网”“筛网”(key_word)的代码获取想要的数据,也要改城市代码(citycode)

    # citynumlist holds the Baidu Maps city codes to crawl.
    citynumlist = ['33', '34', '35',
                   # ... the remaining city codes were elided in the original listing ...
                   '370', '371', '372']
    keywordlist = ['丝网', '筛网']

    start = time.time()
    num = 1

    # Open the CSV file that collects the results. The with-block closes (and
    # flushes) it even if the crawl stops with an error part-way through.
    with open(r'/Users/apple888/PycharmProjects/百度地图/Data/%s.csv' % 'CityData',
              'a+', newline='', encoding='utf-8') as csvFile:
        # BusinessFromBaiduDitu writes its rows through this module-level name.
        writer = csv.writer(csvFile)
        writer.writerow(('name', 'address', 'number'))

        for citycode in citynumlist:
            for kw in keywordlist:
                for page in range(10):
                    BusinessFromBaiduDitu(citycode=citycode, key_word=kw, pageno=page)

                    # Throttle the requests so the crawler is not blocked:
                    # 1 s after every call, plus longer pauses at every 20th,
                    # 100th and 200th call.
                    time.sleep(1)
                    if num % 20 == 0:
                        time.sleep(2)
                    if num % 100 == 0:
                        time.sleep(3)
                    if num % 200 == 0:
                        time.sleep(7)
                    num = num + 1

    end = time.time()
    lasttime = int(end - start)
    print('耗时' + str(lasttime) + 's')
    

    程序运行了大约三个小时,抓取了1085条有用信息。

    python爬取上市公司办公地址

    IDLE编辑器,python3.8版本。
    import requests
    from bs4 import BeautifulSoup
    import re
    import xlwt

    def getHTMLText(url, code="utf-8"):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address to fetch.
            code: Kept so existing callers keep working; the body does not use
                it, because the encoding is taken from ``r.apparent_encoding``.

        Returns:
            The page text, or ``""`` when the request fails, times out or the
            server answers with an error status.
        """
        kv = {'user-agent': 'Mozilla/5.0'}
        try:
            # A timeout keeps one stalled connection from blocking the crawl.
            r = requests.get(url, headers=kv, timeout=10)
            r.raise_for_status()  # turn 4xx/5xx answers into an exception
            r.encoding = r.apparent_encoding  # use the encoding detected from the content
            return r.text
        except requests.RequestException:
            return ""

    def getStockList(lst, stockURL):
        """Append the stock codes found in the links of ``stockURL`` to ``lst``.

        Every ``<a>`` tag of the page is inspected; when its ``href`` contains
        ``sh`` or ``sz`` followed by six digits whose first digit is 0, 3 or 6,
        the first such code is appended to ``lst``.

        Args:
            lst: List that receives the codes (mutated in place).
            stockURL: Address of the page that lists the stocks.
        """
        html = getHTMLText(stockURL, "gb2312")  # fetch only the HTML text
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a'):
            href = anchor.attrs.get('href')
            if not href:
                continue
            # The character class is [036]; the former "[0,3,6]" also accepted
            # a literal comma as the first "digit".
            codes = re.findall(r"[s][hz][036][0-9]{5}", href)
            if codes:
                lst.append(codes[0])

    def getStockInfo(lst, stockURL):
        """Fetch the profile page of each stock in ``lst[3500:3814]`` and record it.

        For every stock the page ``stockURL + 's' + <six-digit code> + '.shtml'``
        is downloaded and cells 0, 1, 15 and 8 of its ``<td class="">`` elements
        are written to columns 0-3 of the module-level worksheet ``sheet01``
        (which must exist before this is called). Rows are written at index
        ``count + 1``, so the first stock lands on row 2.

        Args:
            lst: Stock codes such as ``'sh600000'``.
            stockURL: Base address of the profile pages.
        """
        batch = lst[3500:3814]
        count = 0
        for stock in batch:
            url = stockURL + 's' + stock[2:8] + ".shtml"
            html = getHTMLText(url)  # one request per stock
            soup = BeautifulSoup(html, 'html.parser')
            tds = soup.find_all('td', attrs={'class': ''})
            count = count + 1
            if len(tds) > 15:
                sheet01.write(count + 1, 0, tds[0].string)
                sheet01.write(count + 1, 1, tds[1].string)
                sheet01.write(count + 1, 2, tds[15].string)
                sheet01.write(count + 1, 3, tds[8].string)
            else:
                # A failed download or an unexpected layout yields too few cells;
                # leave the row empty instead of aborting the whole run.
                print("skipped " + stock + ": page has fewer than 16 cells")
            print(" 当前进度: {:.2f}%".format(count * 100 / len(batch)), end="")
            print(stock)
    # Driver: collect the stock codes, then write one worksheet row per stock.
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'
    stock_info_url = 'http://stockdata.stock.hexun.com/gszl/'
    slist = []
    inf = []
    getStockList(slist, stock_list_url)
    print(len(slist))
    f = xlwt.Workbook(encoding='utf-8')
    # getStockInfo writes into this module-level worksheet.
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    # Header row; the page script reads the 股票简称 and 注册地址 columns by name.
    sheet01.write(0, 0, "股票简称")
    sheet01.write(0, 1, "股票代码")
    sheet01.write(0, 2, "注册地址")
    sheet01.write(0, 3, "所属地域")
    getStockInfo(slist, stock_info_url)
    f.save(u'E:股票基本资料(3500-3814).xls')

    因数据量较大,可分多次爬取。

    二、在百度地图上标注。

    首先在百度地图开放平台申请个人密钥(AK)。
    vscode编辑器代码:

    <!DOCTYPE html>
    <html>
    
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf8" />
        <meta name="viewport" content="initial-scale=1.0, user-scalable=no" />
        <!-- Local copies of SheetJS (Excel parsing) and jQuery -->
        <script src="e:\xlsx.full.min.js "></script>
        <script src="e:\jquery-3.5.0.min.js "></script>
        <title>批量地址</title>
        <style type="text/css">
            body,
            html {
                width: 100%;
                height: 100%;
                margin: 0;
                font-family: "微软雅黑";
            }
            
            /* Map container */
            #l-map {
                height: 550px;
                width: 100%;
            }
            
            /* Controls and geocoding output */
            #r-result {
                width: 100%;
                font-size: 14px;
                line-height: 20px;
            }
        </style>
    </head>
    
    <body>
        <div id="l-map"></div>
        <div id="r-result">
            <input type="button" value="批量地址解析" onclick="bdGEO()" />
            <input type="file" id="excel-file">
            <div id="result"></div>
        </div>
    </body>
    
    </html>
    <script type="text/javascript" src="http://api.map.baidu.com/api?v=2.0&ak=申请到的密钥"></script>
    
    <script type="text/javascript">
        // Shared state: position of the next address to geocode, the geocoder,
        // and the parallel lists of address strings and stock short names.
        var index = 0,
            myGeo = new BMap.Geocoder(),
            adds = [],
            jianchengs = [];
        // Baidu Maps setup: create the map in #l-map, centre it and allow
        // zooming with the mouse wheel.
        var map = new BMap.Map("l-map");
        var startPoint = new BMap.Point(116.402831, 39.914271);
        map.centerAndZoom(startPoint, 13);
        map.enableScrollWheelZoom(true);
        // Bind a change handler to the file input: as soon as a workbook is
        // selected it is read and its rows are queued for geocoding.
        $('#excel-file').change(function(e) {
            var files = e.target.files;
            var fileReader = new FileReader();
            fileReader.onload = function(ev) {
                var workbook;
                var persons = []; // rows collected from every sheet
                try {
                    // Parse the whole workbook from the binary string.
                    workbook = XLSX.read(ev.target.result, {
                        type: 'binary'
                    });
                } catch (e) {
                    console.log('文件类型不正确');
                    return;
                }
                // Cell range of the sheet; usable to check the header count.
                var fromTo = '';
                // Walk every sheet (stop after the first one if only it is wanted).
                Object.keys(workbook.Sheets).forEach(function(sheet) {
                    fromTo = workbook.Sheets[sheet]['!ref'];
                    console.log(fromTo);
                    persons = persons.concat(XLSX.utils.sheet_to_json(workbook.Sheets[sheet]));
                });
                // Log the rows that were read.
                console.log(persons);
                persons.forEach(function(person, i) {
                    adds.push(i + "," + person.注册地址 + ",");
                    jianchengs.push(person.股票简称);
                });
                console.log(adds);
            };
            // Read the file as a binary string.
            fileReader.readAsBinaryString(files[0]);
        });
           function bdGEO() {
            // Geocode the address at the current position and advance the
            // shared counter. Do nothing when no address is left, i.e. before
            // a file has been loaded or after the last address was sent.
            if (index >= adds.length) {
                return;
            }
            var add = adds[index];
            console.log(add);
            geocodeSearch(add);
            index++;
        }
    
        // Resolve one address string to a point, print its coordinates and put a
        // labelled marker on the map; also schedules the next lookup.
        function geocodeSearch(add) {
            if (!add) {
                // No address at this position (nothing loaded, or past the end).
                return;
            }
            // Snapshot the position now: `index` is a shared counter that the
            // caller advances right after this call, so reading it later inside
            // the geocoder callback would label the marker with a wrong number.
            var current = index;
            var jiancheng = jianchengs[current];
            console.log(jiancheng);
            // Queue the next lookup only while another address remains.
            if (current + 1 < adds.length) {
                setTimeout(window.bdGEO, 100);
            }
            myGeo.getPoint(add, function(point) {
                if (point) {
                    document.getElementById("result").innerHTML += "longitude = " + point.lng + ", latitude =" + point.lat + "</br>";
                    var address = new BMap.Point(point.lng, point.lat);
                    addMarker(address, new BMap.Label(current + ":" + jiancheng, {
                        offset: new BMap.Size(20, -10)
                    }));
                }
            }, "北京市");
        }
        // Create a marker at `point`, add it to the map and attach `label` to it.
        function addMarker(point, label) {
            var marker = new BMap.Marker(point);
            map.addOverlay(marker);
            marker.setLabel(label);
            // Register the map click handler only once instead of once per
            // marker, so a click cannot trigger showInfo repeatedly.
            if (!addMarker.clickBound) {
                map.addEventListener("click", showInfo);
                addMarker.clickBound = true;
            }
        }
        // Map click handler: show the clicked longitude and latitude.
        function showInfo(e) {
            var message = e.point.lng + ", " + e.point.lat;
            alert(message);
        }
    	
    </script>
    

    运行代码,open in default browser.如图:

  • 相关阅读:
    Spring MVC+FreeMarker简介
    集合框架
    异常处理
    c语言中的一些注意点
    在ScrollView中自定义GridView无法显示全部的问题的解决
    Android 发送request请求在服务器端解析时乱码
    Android 4.1 APP中的static变量即使在APP退出后仍然不会被擦除
    关于Android的asynctask-threads-limits问题:asynctask开启的线程是否有极限
    ListView中各组件点击事件冲突,ListView不响应OnItemClickListener事件
    异常:java.lang.NoClassDefFoundError: com.android.volley.toolbox.Volley
  • 原文地址:https://www.cnblogs.com/xinxihua/p/14390797.html
Copyright © 2020-2023  润新知