• 最简单的python 爬虫



    C:\Users\IBM_ADMIN>python -V
    Python 2.7.13

    查Python 工资的网站 :

    http://www.jobui.com/salary/%E5%8C%97%E4%BA%AC-python%E5%B7%A5%E7%A8%8B%E5%B8%88/

    # -*- coding:utf-8 -*-
    import re,urllib2
    # Landing page that is downloaded first; story links are extracted from it
    # and this value is also used as the prefix when building story URLs.
    url = 'http://daily.zhihu.com/'
    # Browser-like User-Agent header sent with every request made by getHtml().
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6'}
    #https://v.qq.com/x/page/w05097k8olz.html
    def getHtml(url):
        """Download *url* and return the raw response body.

        The request is sent with the module-level ``headers`` dict. Any error
        raised by urllib2 while opening or reading propagates to the caller.
        """
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Python 2 response objects are not context managers, so close
            # explicitly to release the socket even when read() fails.
            response.close()
    
    # Fetch the landing page once when the script runs; getUrls() parses it.
    html = getHtml(url)
        
    def getUrls(html):
        """Return the absolute story URLs linked from the markup in *html*.

        Every ``<a href="/story/...">`` target is captured and appended to the
        module-level ``url`` followed by ``story/``; the order of appearance
        in the page is kept.
        """
        storyIds = re.compile('<a href="/story/(.*?)"').findall(html)
        return [url + 'story/' + storyId for storyId in storyIds]
            
    # Build the list of story page URLs found on the landing page.
    urls = getUrls(html)
    
    def getContent(urls):
        """Fetch each story page in *urls* and print its title and content.

        For every URL the page is downloaded with getHtml(), the headline is
        printed inside a banner, and each fragment matched by the content
        pattern is printed on its own line. Nothing is returned.
        """
        patternTitle = re.compile('<h1 class="headline-title">(.*?)</h1>')
        # re.S lets '.' match newlines so paragraphs spanning lines are captured.
        patternContent = re.compile('<div class="content">\n<p>(.*?)</p>\n</div>', re.S)
        separator = '-----------------------------------------' + '-----------------------------------------'
        for url in urls:
            html = getHtml(url)
            titles = re.findall(patternTitle, html)
            # A page without a headline used to raise IndexError and abort the
            # whole crawl; fall back to the URL so the entry stays identifiable.
            title = titles[0] if titles else url
            # Single-argument print(...) behaves exactly like the print
            # statement on Python 2.
            print(separator)
            print(separator)
            print('***************' + title + '***************')
            print(separator)
            for con in re.findall(patternContent, html):
                print(con)
                                    
            
    # Run the crawl: download and print every story collected in `urls`.
    getContent(urls)
    
    # Remove unneeded things (strip impurities from the extracted content).
    def characterProcessing(content):
        """Placeholder for cleaning unwanted markup out of *content*.

        Only the pattern matching ``<p>...</p>`` or ``<li>...</li>`` fragments
        is prepared so far; the cleaning step itself is not implemented and
        the function returns None.
        """
        # The alternation must live inside the regex string: applying `|` to
        # two str objects raises TypeError before re.compile is even called.
        pattern = re.compile('<p>(.*?)</p>|<li>(.*?)</li>')
        # TODO: apply `pattern` to `content` once the cleaning rules are decided.
        return None
    


  • 相关阅读:
    第一阶段冲刺4
    用户场景分析
    最小不重复数
    BOM
    虚拟机下ubuntu系统设置分辨率
    富文本编辑器KindEditor使用
    页面路径设置
    VMware虚拟机不能上网的问题
    Apache Tomcat/7.0.42配置用户
    JFreeChart 横轴文字竖着显示
  • 原文地址:https://www.cnblogs.com/TendToBigData/p/10501215.html
Copyright © 2020-2023  润新知