• Python——爬虫(一定要看下)

    #!/usr/bin/env python3.5
    # -*- coding: utf-8 -*-
    # @Time   : 2018/1/26
    # @Author : Lyrichu
    # @Email  : 919987476@qq.com
    # @File   : NetCloudAnalyse.py
    """
    Simple analysis of NetCloud music data, including song comments, users info, etc.
    pyecharts is used for the visual analysis.
    """
    import json
    import os
    import re
    import time
    from collections import Counter
    from operator import itemgetter
    from threading import Lock
    from threading import Thread

    import jieba
    import pandas as pd
    import requests
    from pyecharts import Bar,Geo
    from scipy.misc import imread
    from wordcloud import WordCloud

    try:
        from NetCloudCrawler import NetCloudCrawl
    except ImportError:
        from .NetCloudCrawler import NetCloudCrawl
    class NetCloudAnalyse(NetCloudCrawl):
        """
        Analyse NetCloud song comments and the home pages of the commenting users.

        The methods read ``comments_file_path``, ``users_info_file_path``,
        ``song_path``, ``singer_path``, ``song_name``, ``singer_name`` and
        ``headers`` from ``self``; none of them is assigned in this class, so
        they are presumably provided by ``NetCloudCrawl`` -- confirm there.
        """

        # Header row of the users info csv file, in the order the rows are written.
        # The last seven names are the columns core_visual_analyse() reads.
        # NOTE(review): the first two names are not read anywhere in this class;
        # they were chosen here and should be checked against existing csv files.
        USERS_INFO_COLUMNS = ("用户ID", "爬取时间", "动态总数", "关注人数", "粉丝人数",
                              "用户所在地区", "用户简介", "年龄", "累计听歌数量")

        # Seconds to wait for one user home page before giving up on it.
        REQUEST_TIMEOUT = 10

        # data-age holds a millisecond timestamp; one (365 day) year in ms.
        _MS_PER_YEAR = 1000 * 365 * 24 * 3600

        # Patterns used to scrape a user home page (compiled once, not per user).
        _USER_ID_PATTERN = re.compile(r'.*id=(\d+)')
        _EVENT_COUNT_PATTERN = re.compile(r'<strong id="event_count">(\d+?)</strong>')
        _FOLLOW_COUNT_PATTERN = re.compile(r'<strong id="follow_count">(\d+?)</strong>')
        _FAN_COUNT_PATTERN = re.compile(r'<strong id="fan_count">(\d+?)</strong>')
        # Both the ASCII and the full-width colon are accepted after the label.
        _LOCATION_PATTERN = re.compile(r'<span>所在地区[::](.+?)</span>')
        _DESCRIPTION_PATTERN = re.compile(r'<div class="inf s-fc3 f-brk">个人介绍[::](.*?)</div>')
        _AGE_PATTERN = re.compile(r'<span.*?data-age="(\d+)">')
        _LISTENING_SONGS_PATTERN = re.compile(r'<h4>累积听歌(\d+?)首</h4>')

        def __init__(self,song_name,singer_name,song_id = 1,singer_id = 1):
            """
            :param song_name: name of the song to analyse
            :param singer_name: name of the singer
            :param song_id: id of the song, forwarded to the parent class
            :param singer_id: id of the singer, forwarded to the parent class
            """
            super(NetCloudAnalyse, self).__init__(song_name = song_name,song_id = song_id,
                                                singer_name = singer_name,singer_id = singer_id)
            self.threading_count = 0 # users processed so far by the worker threads
            self.unknown = "" # blank str written for info missing from a page
            # Serialises csv writes and threading_count updates between workers.
            self._write_lock = Lock()

        # ------------------------------------------------------------------
        # csv loading
        # ------------------------------------------------------------------
        def load_comments_csv(self):
            """
            load the crawled comments csv file
            :return: the comments as a dataframe
            """
            return pd.read_csv(self.comments_file_path, engine='python', encoding='utf-8')

        def load_users_info_csv(self):
            """
            load users info from file
            :return: the users info as a dataframe
            """
            return pd.read_csv(self.users_info_file_path, engine='python', encoding='utf-8')

        def count_comments_lines(self):
            """
            count the lines of the comments file
            :return: the number of lines (0 for an empty file)
            """
            with open(self.comments_file_path, "r", encoding="utf-8") as fin:
                return sum(1 for _ in fin)

        def from_timestamp_to_date(self,time_stamp,format = "%Y-%m-%d %H:%M:%S"):
            """
            convert a timestamp to a local date string
            :param time_stamp: the time stamp, in seconds
            :param format: the strftime format of the result
            :return: the formatted local time
            """
            return time.strftime(format, time.localtime(time_stamp))

        def load_users_url(self):
            """
            build the home page url of every commenting user
            :return: list of unique urls, in order of first appearance
            """
            comments_df = self.load_comments_csv()
            users_url = []
            seen_ids = set()
            for raw_id in comments_df['用户ID'].dropna():
                # Only ids that start with digits are usable; the leading digit
                # run is kept, so an id pandas parsed as 123.0 becomes "123".
                id_match = re.match(r'\d+', str(raw_id))
                if id_match is None:
                    continue
                user_id = id_match.group(0)
                if user_id in seen_ids:
                    continue
                seen_ids.add(user_id)
                users_url.append('http://music.163.com/user/home?id={user_id}'.format(user_id=user_id))
            return users_url

        # ------------------------------------------------------------------
        # users info crawling
        # ------------------------------------------------------------------
        @staticmethod
        def _search_group(pattern, text):
            """Return group 1 of the first match of ``pattern`` in ``text``, or None."""
            found = pattern.search(text)
            return found.group(1) if found else None

        def _users_info_header(self):
            """Return the header line of the users info csv file."""
            return ",".join(self.USERS_INFO_COLUMNS) + "\n"

        def _parse_user_info(self, html):
            """
            extract the profile fields from the html of a user home page
            :param html: the page source
            :return: [event_count,follow_count,fan_count,location,description,age,
                      listening_songs_num]; a field missing from the page is
                      ``self.unknown``
            """
            def field(pattern):
                value = self._search_group(pattern, html)
                return self.unknown if value is None else value

            event_count = field(self._EVENT_COUNT_PATTERN)
            follow_count = field(self._FOLLOW_COUNT_PATTERN)
            fan_count = field(self._FAN_COUNT_PATTERN)
            # Rows are written as plain comma separated text, so commas inside
            # free-text fields would shift the columns.
            location = field(self._LOCATION_PATTERN).replace(",", " ")
            description = field(self._DESCRIPTION_PATTERN).replace(",", " ")
            listening_songs_num = field(self._LISTENING_SONGS_PATTERN)

            birth_ms = self._search_group(self._AGE_PATTERN, html)
            if birth_ms is None:
                age = self.unknown
            else:
                # data-age is a timestamp in milliseconds; turn it into an age in years
                current_year = int(self.from_timestamp_to_date(time_stamp=time.time(), format="%Y"))
                age = (current_year - 1970) - int(birth_ms) // self._MS_PER_YEAR
            return [event_count, follow_count, fan_count, location, description,
                    age, listening_songs_num]

        def _build_user_info_line(self, user_url):
            """
            download one user home page and format its csv line
            :param user_url: the home page url, containing ``id=<digits>``
            :return: one csv line ending with a newline
            :raises ValueError: if the url holds no user id
            """
            id_match = self._USER_ID_PATTERN.search(user_url)
            if id_match is None:
                raise ValueError("no user id found in url {url!r}".format(url=user_url))
            user_id = id_match.group(1)
            crawler_time = self.from_timestamp_to_date(time_stamp=time.time())
            # A timeout keeps one stalled connection from blocking the whole crawl.
            html = requests.get(user_url, headers=self.headers, timeout=self.REQUEST_TIMEOUT).text
            fields = [user_id, crawler_time] + self._parse_user_info(html)
            return ",".join(str(value) for value in fields) + "\n"

        def save_users_info_to_file(self):
            """
            crawl every user home page one by one and write the users info file
            (the file is overwritten); a user that fails is reported and skipped
            """
            users_url = self.load_users_url()
            num = len(users_url)
            with open(self.users_info_file_path, "w", encoding="utf-8") as fout:
                fout.write(self._users_info_header())
                for index, user_url in enumerate(users_url, 1):
                    try:
                        fout.write(self._build_user_info_line(user_url))
                    except Exception as e: # best effort: one bad page must not stop the crawl
                        print("Fail to get No.{index} comment user's info:{error}"
                              .format(index=index, error=e))
                    else:
                        print("Write {current}/{total} user info to file successfully!".format(current=index, total=num))

        def threading_save_users_info_to_file(self,threads = 10):
            """
            using multithreads to save users info to file (the file is overwritten)
            :param threads: the threads count; clamped to 1..number of urls
            """
            start_time = time.time()
            # Start a fresh file with the header; the workers append to it.
            with open(self.users_info_file_path, "w", encoding="utf-8") as fout:
                fout.write(self._users_info_header())
            users_url = self.load_users_url()
            num = len(users_url)
            self.threading_count = 0
            workers = max(1, min(int(threads), num))
            # Worker i takes every workers-th url, so the load is spread evenly
            # even when the url count is not a multiple of the thread count.
            threads_list = [Thread(target=self.save_users_info, args=(users_url[i::workers], num))
                            for i in range(workers)]
            for worker in threads_list:
                worker.start()
            for worker in threads_list:
                worker.join()
            end_time = time.time()
            print("Using {threads} threads to save users info done,costs {cost_time} seconds"
                    .format(threads=workers, cost_time=(end_time - start_time)))

        def save_users_info(self,users_url,total):
            """
            add users info to file,this function will be called in threadings
            :param users_url: the processing users url list
            :param total: total users url count (only used in the progress message)
            """
            # note that we use append mode: the header was written by the caller
            with open(self.users_info_file_path, "a", encoding="utf-8") as fout:
                for user_url in users_url:
                    try:
                        line = self._build_user_info_line(user_url)
                    except Exception as e: # best effort: report and go on with the next user
                        with self._write_lock:
                            self.threading_count += 1
                            current = self.threading_count
                        print("Fail to get No.{index} comment user's info:{error}"
                              .format(index=current, error=e))
                        continue
                    with self._write_lock:
                        # Write and flush while holding the lock so that lines
                        # from different threads never interleave in the file.
                        fout.write(line)
                        fout.flush()
                        self.threading_count += 1
                        current = self.threading_count
                    print("Get {current}/{total} user info to file successfully!".format(current=current, total=total))

        # ------------------------------------------------------------------
        # resources shipped next to this module
        # ------------------------------------------------------------------
        @staticmethod
        def _module_dir():
            """Return the directory that contains this source file."""
            return os.path.dirname(os.path.realpath(__file__))

        def load_stopwords(self):
            """
            load stopwords list from source/stopwords.txt
            :return: list of unique stopwords
            """
            stopwords_path = os.path.join(self._module_dir(), "source", "stopwords.txt")
            with open(stopwords_path, "r", encoding="utf-8") as f:
                return list({line.strip() for line in f})

        def load_all_cities(self):
            """
            load all cities from province_cities.json file,
            to match city from location text
            :return: list of city names
            """
            province_cities_file = os.path.join(self._module_dir(), "source", "province_cities.json")
            with open(province_cities_file, "r", encoding="utf-8") as fin:
                provinces = json.load(fin)
            all_cities = []
            for province in provinces:
                for city in province['city']:
                    # NOTE(review): the shape of a city entry is not visible in
                    # this file; both {"name": ...} objects and plain strings
                    # are accepted -- confirm against province_cities.json.
                    all_cities.append(city['name'] if isinstance(city, dict) else str(city))
            return all_cities

        # ------------------------------------------------------------------
        # visualisation
        # ------------------------------------------------------------------
        def draw_wordcloud(self,full_comments = True,background_path = "source/JayChou.jpg",font_path = "source/simsun.ttc"):
            """
            draw wordcloud of full comments of one song or hot comments of a singer
            :param full_comments: True means full comments,False means hot comments
            :param background_path: background image path, relative to this module
            :param font_path: font path, relative to this module
            """
            module_dir = self._module_dir()
            background_path = os.path.join(module_dir, background_path)
            font_path = os.path.join(module_dir, font_path)
            if full_comments:
                file_path = self.comments_file_path
                save_path = os.path.join(self.song_path, self.song_name + ".jpg")
            else:
                file_path = os.path.join(self.singer_path, "hot_comments.csv")
                save_path = os.path.join(self.singer_path, self.singer_name + ".jpg")
            comments = pd.read_csv(file_path, engine='python', encoding='utf-8')["评论内容"]
            # Empty cells are dropped so they do not show up as the word "nan".
            comments_text = "".join(str(comment) for comment in comments.dropna())
            cut_text = " ".join(jieba.cut(comments_text)) # keywords separated by blanks
            # NOTE(review): scipy.misc.imread was removed from recent SciPy
            # releases; this relies on the version the module already imports.
            color_mask = imread(background_path) # read the background image
            cloud = WordCloud(font_path=font_path, background_color='white', mask=color_mask,
                              max_words=2000, max_font_size=40)
            word_cloud = cloud.generate(cut_text) # generate the word cloud
            word_cloud.to_file(save_path)
            print("Successfully generate {save_path}".format(save_path=save_path))

        @staticmethod
        def _split_pairs(pairs):
            """Split (label, count) pairs into a labels list and a counts list."""
            return [pair[0] for pair in pairs], [pair[1] for pair in pairs]

        def _counts_by_value(self, values):
            """Count ``values``; return (labels, counts) ordered by ascending label."""
            return self._split_pairs(sorted(Counter(values).items(), key=itemgetter(0)))

        def _counts_by_frequency(self, values):
            """Count ``values``; return (labels, counts) ordered by descending count."""
            return self._split_pairs(Counter(values).most_common())

        @staticmethod
        def _cut_keywords(texts, stopwords):
            """Cut the joined ``texts`` with jieba; drop stopwords and 1-char words."""
            joined = "".join(str(text) for text in texts)
            return [word for word in jieba.cut(joined) if word not in stopwords and len(word) > 1]

        def _render_bar(self, title, labels, counts, save_path):
            """
            render one bar chart to an html file; nothing is written without data
            NOTE(review): the add()/render() calls were missing from the listing
            and are reconstructed without any axis options -- confirm.
            """
            if not labels:
                print("No data for {save_path},skip it".format(save_path=save_path))
                return
            bar = Bar(title=title)
            bar.add("", labels, counts)
            bar.render(save_path)

        def core_visual_analyse(self):
            """
            core visual analyse for comments and users info,including:
            1. The distribution of comments time,both for months and days(bar to show)
            2. The distribution of comments agree count(bar to show)
            3. The distribution of comment keywords,excluded stopwords(bar to show)
            4. The distribution of users location,using geo to show(geo to show)
            5. The distribution of users location,using bar to show(bar to show)
            6. The distribution of events count(bar to show)
            7. The distribution of follow people count(bar to show)
            8. The distribution of fans count(bar to show)
            9. The distribution of description keywords(excluded stopwords)(bar to show)
            10. The distribution of users age(bar to show)
            11. The distribution of listening songs total count(bar to show)
            Every chart is saved as an html file in <song_path>/plots.
            """
            plot_save_path = os.path.join(self.song_path, "plots")
            os.makedirs(plot_save_path, exist_ok=True)
            comments_df = self.load_comments_csv()
            users_info_df = self.load_users_info_csv()
            stopwords = set(self.load_stopwords()) # set: O(1) membership tests

            def titled(template):
                return template.format(song_name=self.song_name)

            def plot_path(file_name):
                return os.path.join(plot_save_path, file_name)

            # 1. comments time, per month and per day
            comments_date_year_month = []
            comments_date_year_month_day = []
            for comment_time in comments_df['评论时间'].dropna():
                # note that the timestamp should divide by 1000 first
                seconds = comment_time * 0.001
                comments_date_year_month.append(self.from_timestamp_to_date(seconds, format="%Y-%m"))
                comments_date_year_month_day.append(self.from_timestamp_to_date(seconds, format="%Y-%m-%d"))
            labels, counts = self._counts_by_value(comments_date_year_month)
            self._render_bar(titled("歌曲<{song_name}>评论时间(年-月)数量分布"), labels, counts,
                             plot_path("comments_year_month_bar.html"))
            labels, counts = self._counts_by_value(comments_date_year_month_day)
            self._render_bar(titled("歌曲<{song_name}>评论时间(年-月-日)数量分布"), labels, counts,
                             plot_path("comments_year_month_day_bar.html"))

            # 2. comments agree count
            labels, counts = self._counts_by_value(comments_df['点赞总数'].dropna())
            self._render_bar(titled("歌曲<{song_name}>评论点赞数量分布"), labels, counts,
                             plot_path("agree_count_bar.html"))

            # 3. comment keywords, most frequent first
            comments_keywords = self._cut_keywords(comments_df['评论内容'].dropna(), stopwords)
            labels, counts = self._counts_by_frequency(comments_keywords)
            self._render_bar(titled("歌曲<{song_name}>评论关键词数量分布(已去除停用词)"), labels, counts,
                             plot_path("comments_keywords_bar.html"))

            # 4. users location on a map: each location counts for the first
            #    known city whose name it contains
            users_location = [str(location) for location in users_info_df['用户所在地区'].dropna()]
            all_cities = self.load_all_cities()
            users_city = []
            for location in users_location:
                city = next((name for name in all_cities if name in location), None)
                if city is not None:
                    users_city.append(city)
            users_city_data = list(Counter(users_city).items())
            users_city_save_path = plot_path("users_city_geo.html")
            if users_city_data:
                users_city_geo = Geo(titled("歌曲<{song_name}>评论用户所在地区分布"), title_color="#fff", title_pos="left",
                                     width=1200, height=600, background_color='#404a59')
                attr, value = users_city_geo.cast(users_city_data)
                users_city_geo.add("", attr, value, visual_range=[0, 200], visual_text_color="#fff",
                                   symbol_size=15, is_visualmap=True)
                users_city_geo.render(users_city_save_path)
            else:
                print("No data for {save_path},skip it".format(save_path=users_city_save_path))

            # 5. users location as a bar chart, most frequent first
            labels, counts = self._counts_by_frequency(users_location)
            self._render_bar(titled("歌曲<{song_name}>评论用户所在地区分布"), labels, counts,
                             plot_path("users_location_bar.html"))

            # 6. events count
            labels, counts = self._counts_by_value(users_info_df['动态总数'].dropna())
            self._render_bar(titled("歌曲<{song_name}>评论用户动态总数分布"), labels, counts,
                             plot_path("events_count_bar.html"))

            # 7. follow people count
            labels, counts = self._counts_by_value(users_info_df['关注人数'].dropna())
            self._render_bar(titled("歌曲<{song_name}>评论用户关注人数分布"), labels, counts,
                             plot_path("follow_count_bar.html"))

            # 8. fans count
            labels, counts = self._counts_by_value(users_info_df['粉丝人数'].dropna())
            self._render_bar(titled("歌曲<{song_name}>评论用户粉丝人数分布"), labels, counts,
                             plot_path("fans_count_bar.html"))

            # 9. description keywords, most frequent first
            description_keywords = self._cut_keywords(users_info_df['用户简介'].dropna(), stopwords)
            labels, counts = self._counts_by_frequency(description_keywords)
            self._render_bar(titled("歌曲<{song_name}>评论用户简介关键词数量分布(已去除停用词)"), labels, counts,
                             plot_path("description_keywords_bar.html"))

            # 10. users age, negative ages are filtered out
            age_count = [age for age in users_info_df['年龄'].dropna() if age >= 0]
            labels, counts = self._counts_by_value(age_count)
            self._render_bar(titled("歌曲<{song_name}>评论用户年龄分布"), labels, counts,
                             plot_path("age_count_bar.html"))

            # 11. listening songs total count, grouped in four ranges
            listening_songs = {'0-100': 0, '100-1000': 0, '1000-10000': 0, '>10000': 0}
            for songs_num in users_info_df['累计听歌数量'].dropna():
                if songs_num < 100:
                    listening_songs['0-100'] += 1
                elif songs_num < 1000:
                    listening_songs['100-1000'] += 1
                elif songs_num < 10000:
                    listening_songs['1000-10000'] += 1
                else:
                    listening_songs['>10000'] += 1
            labels, counts = self._split_pairs(Counter(listening_songs).most_common())
            self._render_bar(titled("歌曲<{song_name}>评论用户听歌总数分布"), labels, counts,
                             plot_path("listening_songs_count_bar.html"))

        def generate_all_analyse_files(self,threads = 10):
            """
            generate all analyse files,including:
            1. generate users info file
            2. generate wordcloud picture
            3. generate core analyse files
            :param threads: the threads count used to crawl the users info
            """
            self.threading_save_users_info_to_file(threads=threads)
            self.draw_wordcloud()
            self.core_visual_analyse()

        # ------------------------------------------------------------------
        # manual smoke tests
        # NOTE(review): several bodies below were missing from the listing and
        # are reconstructed from the method names -- confirm.
        # ------------------------------------------------------------------
        def _test_load_all_cities(self):
            """print how many cities load_all_cities() returns"""
            all_cities = self.load_all_cities()
            print("There are %d cities." % len(all_cities))

        def _test_load_stopwords(self):
            """print the stopwords count and the first 100 stopwords"""
            stopwords = self.load_stopwords()
            print('There are %d stopwords.' % len(stopwords))
            # print first 100 stopwords
            print(stopwords[:100])

        def _test_load_comments_csv(self):
            """print the first rows of the comments dataframe"""
            df = self.load_comments_csv()
            print(df.head())

        def _test_count_comments_lines(self):
            """print the line count of the comments file"""
            total = self.count_comments_lines()
            print("{file} has {total} comments.".format(file=self.comments_file_path, total=total))

        def _test_from_timestamp_to_date(self):
            """print up to 10 comment timestamps next to their converted dates"""
            comments_df = self.load_comments_csv()
            comments_timestamp = comments_df['评论时间'].dropna() # drop na value
            show_num = 10 # lines to show
            print("timestamp           real_date")
            for time_stamp in comments_timestamp.head(show_num):
                if time_stamp:
                    # the column is in milliseconds, the converter wants seconds
                    real_date = self.from_timestamp_to_date(time_stamp * 0.001)
                    print("%s       %s" % (time_stamp, real_date))

        def _test_load_users_url(self):
            """print the users url count and up to 10 of the urls"""
            users_url = self.load_users_url()
            print("There are %d users ulr." % len(users_url))
            num = 10
            print("Top %d users ulr are:" % num)
            for index, url in enumerate(users_url[:num], 1):
                print("{index}:{url}".format(index=index, url=url))

        def _test_load_users_info_csv(self):
            """print the first rows of the users info dataframe"""
            users_info_df = self.load_users_info_csv()
            print(users_info_df.head())

        def _test_save_users_info_to_file(self):
            """crawl the users info with a single thread"""
            self.save_users_info_to_file()

        def _test_draw_wordcloud(self):
            """draw the word cloud of the singer's hot comments"""
            full_comments = False
            self.draw_wordcloud(full_comments = full_comments)

        def _test_core_visual_analyse(self):
            """render every chart"""
            self.core_visual_analyse()

        def _test_threading_save_users_info_to_file(self,threads = 10):
            """crawl the users info with ``threads`` threads"""
            self.threading_save_users_info_to_file(threads=threads)

        def _test_netcloudanalyse_all(self):
            """
            run the smoke tests above in dependency order; the single-threaded
            crawl is left out so that every page is only downloaded once
            """
            self._test_load_all_cities()
            self._test_load_stopwords()
            self._test_load_comments_csv()
            self._test_count_comments_lines()
            self._test_from_timestamp_to_date()
            self._test_load_users_url()
            self._test_threading_save_users_info_to_file()
            self._test_load_users_info_csv()
            self._test_draw_wordcloud()
            self._test_core_visual_analyse()
    # if __name__ == '__main__':
    #     song_name = '晴天'
    #     song_id = 186016
    #     singer_name = '周杰伦'
    #     singer_id = 6452
    #     netcloud_analyse = NetCloudAnalyse(song_name = song_name,song_id = song_id,singer_name = singer_name,
    #                                         singer_id = singer_id)
    #     #netcloud_analyse._test_netcloudanalyse_all()
    #     netcloud_analyse.generate_all_analyse_files(100)


