C:\Users\IBM_ADMIN>python -V
Python 2.7.13
查Python 工资的网站 :
http://www.jobui.com/salary/%E5%8C%97%E4%BA%AC-python%E5%B7%A5%E7%A8%8B%E5%B8%88/
# -*- coding:utf-8 -*-
import re,urllib2
# Index page that is downloaded and scanned for story links.
url = 'http://daily.zhihu.com/'
# Request headers passed to urllib2.Request; identifies the client as Firefox.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)'
        'Gecko/20091201 Firefox/3.5.6'
    ),
}
#https://v.qq.com/x/page/w05097k8olz.html
def getHtml(url):
    """Download ``url`` and return the raw response body.

    The request carries the module-level ``headers`` dict.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever ``response.read()`` yields for the page.

    Raises:
        urllib2.URLError: Propagated unchanged from ``urllib2.urlopen``.
    """
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails part-way.
        response.close()
# Fetch the index page; this runs as soon as the module is executed.
html = getHtml(url)
def getUrls(html, base_url=None):
    """Collect the absolute URL of every story linked from ``html``.

    Args:
        html: Page markup to scan for ``<a href="/story/ID"`` anchors.
        base_url: Site root placed in front of ``story/ID``. When omitted,
            the module-level ``url`` is used, as before. A trailing slash
            is optional.

    Returns:
        A list of story URLs in first-seen order. Each story appears once,
        so a link repeated on the page is not downloaded twice.
    """
    if base_url is None:
        base_url = url
    # Normalise the joint so the result is the same with or without a
    # trailing slash on the base.
    prefix = base_url.rstrip('/') + '/story/'

    seen = set()
    story_urls = []
    for story_id in re.findall(r'<a href="/story/(.*?)"', html):
        if story_id in seen:
            continue
        seen.add(story_id)
        story_urls.append(prefix + story_id)
    return story_urls
# Extract the story URLs from the index page downloaded into ``html``.
urls = getUrls(html)
def getContent(urls):
    """Download each story in ``urls`` and print its title and body text.

    Args:
        urls: Iterable of story page URLs.

    Returns:
        None. The title banner and every matched content fragment are
        written to standard output.
    """
    patternTitle = re.compile('<h1 class="headline-title">(.*?)</h1>')
    # re.S lets '.' cross newlines so a multi-line body is captured whole.
    patternContent = re.compile('<div class="content">\n<p>(.*?)</p>\n</div>', re.S)
    separator = '-----------------------------------------' + '-----------------------------------------'

    for story_url in urls:
        html = getHtml(story_url)
        titles = patternTitle.findall(html)
        # A page without a headline must not abort the remaining downloads;
        # fall back to the URL so the banner still identifies the page.
        title = titles[0] if titles else story_url
        print(separator)
        print(separator)
        print('***************' + title + '***************')
        print(separator)
        for con in patternContent.findall(html):
            print(con)
# Download and print every story in ``urls``; runs when the module executes.
getContent(urls)
# Remove unneeded markup ("filter out impurities") from story content.
def characterProcessing(content):
    """Return the text held in ``<p>`` and ``<li>`` elements of ``content``.

    NOTE(review): the original body was an unfinished stub that compiled a
    pattern and did nothing with it. Returning the matched fragments is the
    presumed intent; confirm before relying on it.

    Args:
        content: Markup string to scan.

    Returns:
        A list of the inner text of every ``<p>...</p>`` and
        ``<li>...</li>`` match, in document order. '.' does not cross
        newlines here, so only elements that sit on one line are matched.
    """
    # The alternation must live inside the pattern string: applying ``|``
    # to two str objects raises TypeError before re.compile is reached.
    pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>')
    # With two groups findall yields (p_text, li_text) pairs; exactly one
    # side is filled per match.
    return [para or item for para, item in pattern.findall(content)]