不分类别的效果
不同分类的分布效果图
从海友网获取各个企业名单保存进mysql
cmfishhelper.py
从下列网址得到各个企业名片的网址,保存进mysql的card表
cds = get_cds()
http://www.cmfish.com/cd/cd_style.php?pageNum_Recordset1=%d&totalRows_Recordset1=191&id=%d
访问企业名片页面获得名称联系人地址保存进数据库
update_cds()
取出地址,从百度地图获得经纬度保存进数据库
http://api.map.baidu.com/geocoder/v2/
update_lnglat()
把名称,经纬度取出来生成个json文件
gen_json()
#encoding=utf-8 import sys reload(sys) sys.setdefaultencoding('utf-8') sys.path.append('..') import requests from bs4 import BeautifulSoup import pprint from utils.db import sqlhelper from utils import setting import re import pymysql import traceback from requests.utils import get_encoding_from_headers, get_encodings_from_content import urllib2 import json typeto,pageto = 7,5 cardurls = "http://www.cmfish.com/cd/cd_style.php?id=%d" cdpages = "http://www.cmfish.com/cd/cd_style.php?pageNum_Recordset1=%d&totalRows_Recordset1=191&id=1" mysqldb = setting.YAMLDATA.get('mysqldb2') host,user,pwd,db=mysqldb.get('host'),mysqldb.get('user'),mysqldb.get('pwd'),mysqldb.get('cmfishdb') sh=sqlhelper.SqlHelper(host,user,pwd,db,'mysql') cardidrec=re.compile('((id=)([^&][^&]*))', re.IGNORECASE) hotrec=re.compile('((hot=)([^&][^&]*))', re.IGNORECASE) daterec=re.compile('(xbcxd3xc8xebxc8xd5xc6xda: ([\s\S]*?)</td>)', re.IGNORECASE) namerec=re.compile('(</a> > <strong>([\s\S]*?)</strong>)', re.IGNORECASE) contactrec=re.compile('(xc1xaa xcfxb5 xc8xcb</td> <td>([\s\S]*?)</td>)', re.IGNORECASE) mobilerec=re.compile('(xc1xaaxcfxb5xb5xe7xbbxb0</td> <td>([\s\S]*?)</td> )', re.IGNORECASE) mailrec=re.compile('(xb5xe7xd7xd3xd3xcaxcfxe4</td> <td>([\s\S]*?)</td> )', re.IGNORECASE) addressrec=re.compile('(xc1xaaxcfxb5xb5xd8xd6xb7</td> <td>([\s\S]*?)</td> )', re.IGNORECASE) noterec=re.compile('(<td> ([\s\S]*?)</td> )', re.IGNORECASE) def get_cds(): cds = [] for i in range(1,typeto+1): for j in range(0,pageto): url = 'http://www.cmfish.com/cd/cd_style.php?pageNum_Recordset1=%d&totalRows_Recordset1=191&id=%d' % (j,i) req=requests.get(url) soup = BeautifulSoup(req.text, 'html.parser') links=soup.find_all('a') for link in links: href = link.attrs['href'] if "cd.php?id=" in href and href<>'cd.php?id=&hot=': cdlink = "http://www.cmfish.com/cd/"+href pprint.pprint(cdlink) save_cd(i,j,cdlink) cds.append(cdlink) return cds def save_cd(typeid,pageid,url): sql = r"insert into card(typeid,pageid,url) 
values(%d,%d,'%s')" % (typeid,pageid,url) sh.ExecNonQuery(sql) def re_result(strrec,str,value): searched = strrec.findall(str) if searched <> None and len(searched)>0: pprint.pprint(searched[0][1]) try: return searched[0][1].decode('gb2312') except: return searched[0][1].decode('gbk') else: return value def url_result(strrec,str,value): searched = strrec.findall(str) if searched <> None and len(searched)>0: print searched[0][2] return searched[0][2] else: return value def get_detail(selsql,updsql): results=sh.ExecQuery(selsql) for result in results: try: id,url=result[0],result[1] cardid,hot = url_result(cardidrec,url,''),url_result(hotrec,url,0) #req = requests.get(url) #req.encoding=get_encodings_from_content(req.content) request = urllib2.Request(url) response = urllib2.urlopen(request) content = response.read() date = re_result(daterec,content,'') name = re_result(namerec,content,'') contact = re_result(contactrec,content,'') mobile=re_result(mobilerec,content,'') mail = re_result(mailrec,content,'') address = re_result(addressrec,content,'') note = re_result(noterec,content,'') sql = updsql % (cardid,hot,pymysql.escape_string(date),pymysql.escape_string(name),pymysql.escape_string(contact),pymysql.escape_string(mobile),pymysql.escape_string(mail),pymysql.escape_string(address),pymysql.escape_string(note),id) sh.ExecNonQuery(sql) except Exception,e: print 'error:',e.message,traceback.format_exc() def update_cds(): selsql= "select id,url from card where cardid is null" updsql = "update card set cardid=%s,hot=%s,date='%s',name='%s',contact='%s',mobile='%s',mail='%s',address='%s',note='%s' where id=%d" get_detail(selsql,updsql) def get_lnglat(address): print address url = 'http://api.map.baidu.com/geocoder/v2/' output = 'json' ak = 'c7aBgFWD6cMDPOe4BSiG8HLNlvXNKvCW' uri = url + '?' 
+ 'address=' + address + '&output=' + output + '&ak=' + ak temp = urllib2.urlopen(uri) temp = json.loads(temp.read()) return temp def save_lnglat(selsql,updsql): results=sh.ExecQuery(selsql) for result in results: try: id,address=result[0],result[1] if '例如' not in address: address = address.replace(' ',',') result = get_lnglat(address) if result.get('result') <> None: lat,lng = result.get('result').get('location').get('lat'),result.get('result').get('location').get('lng') sql = updsql % (lat,lng,id) sh.ExecNonQuery(sql) except Exception,e: print 'error:',e.message,traceback.format_exc() #result[1] def update_lnglat(): selsql= "select id,address from card where lat is null" updsql = "update card set lat=%f,lng=%f where id=%d" save_lnglat(selsql,updsql) def gen_json(ofile='./../json/cards.json'): selsql = "select name,lat,lng,typeid from card where lat is not null" results=sh.ExecQuery(selsql) objs = [] for res in results: objs.append({"name":res[0],"lat":float(res[1]),"lng":float(res[2]),"typeid":int(res[3])}) jsonstr =json.dumps(objs) with open(ofile,'w') as f: f.write(jsonstr) print jsonstr if __name__ == '__main__': #cds = get_cds() #print len(cds) #update_cds() #print get_lnglat('台灣省桃園縣龜山鄉振興路1089巷15-1號') #update_lnglat() gen_json()
读取生成的json文件,显示在地图上
用file://直接打开html时,读取json文件会报跨域错误;发布成网站后通过http访问就可以
cmfish.html
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>海友网企业全国分布</title>
</head>
<body>
<!-- ECharts needs a container with an explicit size (width and height). -->
<div id="main" style="width: 1200px;height:1200px;"></div>
<script type="text/javascript" src="../js/jquery-3.3.1.min.js"></script>
<script type="text/javascript" src="../js/echarts-all-3.js"></script>
<script type="text/javascript" src="../js/china.js"></script>
<script type="text/javascript">
    // Category names in typeid order (typeid 1 maps to index 0). They are
    // used for both the legend and the series names.
    var CATEGORY_NAMES = ['生产厂商', '进出口商', '代理商', '店铺', '繁殖', '个人', '其他'];

    // One bucket of scatter points per category.
    var cards_arr = CATEGORY_NAMES.map(function () {
        return [];
    });

    $.ajax({
        url: "../json/cards.json", // location of the generated JSON file
        type: "GET",
        dataType: "json",
        success: function (data) {
            $.each(data, function (i, item) {
                var bucket = cards_arr[item.typeid - 1];
                if (!bucket) {
                    // Unknown category id: skip the record instead of throwing.
                    return true;
                }
                bucket.push({
                    name: item.name,
                    value: [item.lng, item.lat]
                });
            });
            // Render once, after all records have been bucketed.
            loaddata(cards_arr);
        },
        error: function (xhr, status, err) {
            // Loading via file:// fails with a cross-origin error; serve the
            // page over HTTP instead.
            if (window.console) {
                console.error('failed to load cards.json:', status, err);
            }
        }
    });

    // Build the scatter series for one category; every category shares the
    // same styling and differs only in name and data.
    function buildSeries(name, points) {
        return {
            name: name,
            type: 'scatter',
            coordinateSystem: 'geo', // plot on the geo component configured below
            data: points,
            symbolSize: 10, // scatter point size
            label: {
                normal: {
                    formatter: '{b}',
                    show: false
                },
                emphasis: {
                    show: true
                }
            },
            hoverAnimation: true,
            silent: false,
            animation: false,
            z: 3
        };
    }

    // Draw the China map with one scatter series per category.
    // data: array of point arrays, indexed like CATEGORY_NAMES.
    function loaddata(data) {
        var chart = echarts.init(document.getElementById('main'));
        chart.setOption({
            backgroundColor: '#404a59',
            title: {
                text: '海友网企业全国分布',
                subtext: 'data from cmfish',
                sublink: 'http://www.cmfish.com',
                x: 'center',
                textStyle: {
                    color: '#fff'
                }
            },
            tooltip: {
                trigger: 'item'
            },
            legend: {
                orient: 'vertical',
                x: 'left',
                data: CATEGORY_NAMES,
                textStyle: {
                    color: 'orange'
                }
            },
            // Geo coordinate system required for plotting on the map.
            geo: {
                silent: false,
                map: 'china',
                label: {
                    normal: {
                        show: false
                    },
                    emphasis: {
                        show: true
                    }
                },
                itemStyle: {
                    hoverAnimation: true,
                    normal: {
                        areaColor: '#323c48',
                        borderColor: '#000'
                    },
                    emphasis: {
                        areaColor: '#2a333d',
                        opacity: 0
                    }
                }
            },
            series: CATEGORY_NAMES.map(function (name, idx) {
                return buildSeries(name, data[idx]);
            })
        });
    }
</script>
</body>
</html>
以上代码提交在github上,可以下载所用到的echarts的js文件
https://github.com/sui84/pytest