• Scraping movie download links from 66ys (66影视) with Python, with a search feature


    The highlight of this script is BeautifulSoup's select method, which lets you pull data out of an HTML page directly by the hierarchical tag path (a CSS selector) that leads to it.
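
    As a quick illustration of that select call, here is a minimal, self-contained sketch (the HTML snippet is made up for this example and only mirrors the home-page structure the spider assumes):

    from bs4 import BeautifulSoup

    demo_html = """
    <html><body><div class="wrap"><div class="tnlist"><ul>
      <li><a href="/movie/12345.html">Example Film</a></li>
      <li><a href="/movie/67890.html">Another Film</a></li>
    </ul></div></div></body></html>
    """

    soup = BeautifulSoup(demo_html, "lxml")
    # Each ">" step in the selector descends one direct-child level of the tag hierarchy
    for a in soup.select("html > body > div.wrap > div.tnlist > ul > li > a"):
        print a['href'], a.getText().strip()

    The complete spider: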

    # -*- coding: gb18030 -*-
    
    __author__ = 'vincent'
    
    import sys
    import urllib2
    import urllib
    import cookielib
    from bs4 import BeautifulSoup
    
    
    class Spider66ys:
        headers = None
        home_url = None
    
        def __init__(self):
            self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:50.0) Gecko/20100101 Firefox/50.0'
            }
            self.home_url = "http://www.66ys.tv"
    
        #   Fetch the raw HTML of a page
        def get_html(self, url):
            print "正在获取网页[", url, "]的信息..."
            if len(url) == 0:
                print "Input url is null!"
                sys.exit(0)
    
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            html = response.read()
            #   print "获取首页信息(", url, ")完毕."
            return html
    
        # Extract the download links from a film's detail page
        def get_download_url(self, film):
            print "开始从网页[", film[0], "]中获取电影[", film[1], "]的下载链接..."
            html = self.get_html(film[0])
    
            # fp = open("film.html", "w")
            # fp.write(html)
            # fp.close()
    
            soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
            # print soup.prettify()
            results = soup.select("html > body > div.wrap > div.mainleft 
                > div.contentinfo > div#text > table > tbody > tr > td > a")
            for result in results:
                film.append(result['href'])
    
        # Fetch the most recently updated films from the home page
        def get_new_update(self):
            new_film_list = []
    
            print "正在获取[", self.home_url, "]更新电影..."
            html = self.get_html(self.home_url)
    
            # fp = open("66ys.html", "w")
            # fp.write(html)
            # fp.close()
    
            soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
            results = soup.select("html > body > div.wrap > div.tnlist > ul > li > a")
            for result in results:
                film = []
                film.append(result['href'])
                film.append(result.getText().encode('gb18030').strip())
                self.get_download_url(film)
                new_film_list.append(film)
    
            return new_film_list
    
        # Search 66ys for films matching a keyword
        def search_film(self, content):
            search_film_list = []
            search_url = self.home_url + "/e/search/index.php"
            print "开始搜索电影[", content, "]..."
            # print search_url
            postDict = {
                "keyboard": content,
                "show": "title,smalltext",
                "submit": "",
                "tbname": "Article",
                "tempid": "1"
            }
            postData = urllib.urlencode(postDict)
            # print postData
            cookie_jar = cookielib.LWPCookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
            request = urllib2.Request(search_url, postData, headers=self.headers)
            # Send the POST once through the cookie-aware opener (the original code
            # issued the request twice: once via urlopen and again via opener.open)
            response = opener.open(request)
            html = response.read()
            # fp = open("search.html", "w")
            # fp.write(html)
            # fp.close()
            # print content
            soup = BeautifulSoup(html, "lxml", from_encoding="gb18030")
            results = soup.select("html > body > table.tableborder > tr > td > div > b")
            if len(results) == 1:
                print "没有搜索到相关的内容"
                return None
    
            results = soup.select("html > body > div > div.wrap > div.mainleft > div.channellist > div.listBox > ul > li 
                                  div.listInfo > h3 > a")
            # print results
            for result in results:
                film = []
                film.append(result['href'])
                film.append(result.getText().encode('gb18030').strip())
                self.get_download_url(film)
                search_film_list.append(film)
            print "共搜索到[", len(results), "]部电影。"
            return search_film_list
    
    
    if __name__ == "__main__":
        spider = Spider66ys()
        # new_film_list = spider.get_new_update()
        # for film in new_film_list:
        #     for info in film:
        #         print info, "\t"
        #     print ""
        content = "冰与火之歌"
        search_film_list = spider.search_film(content)
        # search_film returns None when nothing matched, so guard before iterating
        if search_film_list:
            for film in search_film_list:
                print film[1], ":"
                for info in film[2:]:
                    print info
                print "-" * 200
• Original post: https://www.cnblogs.com/stupid-vincent/p/6279794.html