• python 爬取媒体文件(使用chrome代理,启动客户端,有防火墙)


    #coding = utf-8
    '''
    Convert Chinese-language addresses to longitude/latitude coordinates.

    The addresses are read from a CSV file, looked up one by one through the
    Baidu geocoder in a Selenium-driven Chrome session, and the collected
    [address, lng, lat] rows are written to a result file.
    '''
    import time,json
    import urllib.request
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import pandas as pd
    import numpy as np
    
    # Baidu Map API key, sent as the `ak` query parameter of every geocoder request.
    # NOTE(review): this is a hard-coded credential; consider loading it from the
    # environment or a config file instead of committing it.
    AK ='C2hKkyF9fHbmzESq6dmSArZIzw8wEiS1'
    # Input data, loaded at import time; LoadData.main reads its 'ADDRESS' column.
    table = pd.read_csv('./data/test.csv',encoding='utf-8')
    # Output file, opened (and truncated, mode 'w') at import time; LoadData.main
    # writes the collected results to it.
    outfp = open('./data/result_test.csv','w',encoding='utf-8')
    class LoadData:
        """Geocode addresses through the Baidu geocoder using a Chrome session.

        Each looked-up address is appended to ``self.loc_result`` as
        ``[addr, lng, lat]``; ``lng``/``lat`` are ``None`` when the lookup
        did not yield a location.
        """

        # Default chromedriver location. The path separators were missing in
        # the original literal ('D:Program Files (x86)ChromeDriver...'), which
        # cannot resolve to the intended executable; they are restored here.
        # NOTE(review): confirm this matches the local installation.
        DEFAULT_DRIVER_PATH = r'D:\Program Files (x86)\ChromeDriver\chromedriver.exe'

        def __init__(self, driver_path=None):
            """Start the Chrome session.

            Args:
                driver_path: Optional path to the chromedriver executable;
                    falls back to ``DEFAULT_DRIVER_PATH`` when omitted.
            """
            print("start")
            self.m_driver = webdriver.Chrome(driver_path or self.DEFAULT_DRIVER_PATH)
            self.loc_result = []

        @staticmethod
        def _parse_location(payload):
            """Extract ``(lng, lat)`` from a geocoder JSON response body.

            Returns ``(None, None)`` when the text is not valid JSON or when
            it carries no ``result.location`` object.
            """
            try:
                data = json.loads(payload)
            except ValueError:
                return None, None

            result = data.get('result') if isinstance(data, dict) else None
            if not isinstance(result, dict):
                return None, None

            location = result.get('location')
            if not isinstance(location, dict):
                return None, None

            return location.get('lng'), location.get('lat')

        def get_uri(self, addr, city = ''):
            """Look up one address and return its ``(lng, lat)`` pair.

            Args:
                addr: Address text sent as the ``address`` query parameter.
                city: Optional city sent as the ``city`` query parameter.

            Returns:
                ``(lng, lat)`` from the response, or ``(None, None)`` when the
                response contains no usable location.
            """
            # Imported locally: the file only imports urllib.request, so
            # urllib.parse was previously reachable only as a side effect.
            from urllib.parse import urlencode

            server = 'http://api.map.baidu.com/geocoder/v2/?'
            params = urlencode({'address':addr,'city':city,'ak':AK,'output':'json'})
            self.m_driver.get(server + params)

            # The JSON body is read from the <pre> element of the rendered page.
            bs = BeautifulSoup(self.m_driver.page_source,'lxml')
            if bs.pre is None:
                return None, None
            return self._parse_location(bs.pre.get_text())

        def get_lng_lat(self, addr):
            """Geocode ``addr`` and record ``[addr, lng, lat]`` in ``loc_result``.

            Prints ``"error"`` when either coordinate is missing; the row is
            recorded regardless so the output stays aligned with the input.
            """
            lng, lat = self.get_uri(addr)
            if lng is None or lat is None:
                print("error")
            self.loc_result.append([addr, lng, lat])

        def main(self):
            """Geocode every entry of the ``ADDRESS`` column and write the results.

            The collected list is written to ``outfp`` as its ``str()``
            representation. The Chrome session is shut down afterwards, even
            if a lookup raises, so no browser process is left running.
            """
            try:
                for addr in table['ADDRESS'].tolist():
                    self.get_lng_lat(addr)

                outfp.write(str(self.loc_result))
                # Push the data to disk now; outfp is a module-level handle
                # that this class does not own and therefore does not close.
                outfp.flush()
            finally:
                self.m_driver.quit()
    
    if __name__ == '__main__':
        # Script entry point: run the full geocoding job and report how long
        # it took. time.clock() was removed in Python 3.8, so the elapsed time
        # is measured with time.perf_counter() instead.
        tStart = time.perf_counter()

        LD = LoadData()
        LD.main()

        tEnd = time.perf_counter()
        print("%s s"%(tEnd - tStart))

    附录:

    chromedriver.exe与chrome版本映射及下载链接

    https://blog.csdn.net/mmayanshuo/article/details/78962398

  • 相关阅读:
    使用 Python 编码和解码 JSON 对象
    搞定github下载加速
    git错误:fatal: Could not read from remote repository.解决
    webstorm安装配置
    node.js下载安装
    IDEA安装小配置
    JAVA软件安装
    关于升级一般软件的一些想法
    linux 的 逻辑卷管理
    记一次内核升级。
  • 原文地址:https://www.cnblogs.com/smuxiaolei/p/10847381.html
Copyright © 2020-2023  润新知