• 网页行为分析


    去掉重复的id,并且存入数据库:

    import MySQLdb
    # 打开数据库连接
    db = MySQLdb.connect(host='localhost', user='root', passwd='123456', port=3306, charset="utf8", db="db_websiterecommend")
    cur = db.cursor()
    # db = MySQLdb.connect(host="localhost",user="root",passwd="123456",db="db_websiterecommend",port='3306' )
    # # 使用cursor()方法获取操作游标
    # cursor = db.cursor()
    # 使用execute方法执行SQL语句
    sql='select ip from t_useripvisittrace'
    
    cur.execute(sql)
    # 使用 fetchone() 方法获取一条数据库。
    data = cur.fetchall()
    #print(data)
    user_ip=[]
    for l in data:
        user_ip.append(l[0])
    #print(type(user_ip))
    #print(user_ip)
    # 关闭数据库连接
    user_ip_cai=set(user_ip)
    # print(user_ip_cai)
    userip=list(user_ip_cai)
    value=[]
    for i in range(len(userip)):
        value.append((i,userip[i]))
    cur.executemany('insert into t_userIP_list values(%s,%s)',value)
    db.commit()

     根据网站情况,进行网站的分类:

    # -*- coding: utf-8 -*-
    import MySQLdb
    import re
    import requests
    from lxml import etree
    # 打开数据库连接
    count=0
    url='http://www.tipdm.org'
    db = MySQLdb.connect(host='localhost', user='root', passwd='123456', port=3306, charset="utf8", db="db_websiterecommend")
    cur = db.cursor()
    cur1=db.cursor()
    sql='select page_path from t_useripvisittrace'
    sql1='select url_list from urls'
    cur.execute(sql)
    cur1.execute(sql1)
    # 使用 fetchone() 方法获取一条数据库。
    value=[]
    data = cur.fetchall()
    print(len(data))
    for each in data:
        #print(type(each[0]))
        if each[0]=='/':
            print('2222')
            value.append((each[0],'首页'))
            cur.executemany('insert into t_url_classify values(%s,%s)', value)
            db.commit()
            count+=1
            print(count)
            #print(value)
        elif each[0]=='/index.jhtml':
            print('3333')
            value.append((each[0],'首页'))
            cur.executemany('insert into t_url_classify values(%s,%s)', value)
            db.commit()
            count += 1
            print(count)
            #print(value)
        elif 'index' in each[0]:
            print('4444')
            urls=url+each[0]
            html = requests.get(urls)
            selector = etree.HTML(html.text)
            content=selector.xpath('/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
            value.append((each[0],content))
            cur.executemany('insert into t_url_classify values(%s,%s)', value)
            db.commit()
            count += 1
            print(count)
            #print(value)
        elif '.jhtml' in each[0]:
            print('5555')
            url1=url+each[0]
            html = requests.get(url1)
            selector = etree.HTML(html.text)
            content=selector.xpath('/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
            value.append((each[0],content))
            cur.executemany('insert into t_url_classify values(%s,%s)', value)
            db.commit()
            count += 1
            print(count)
        else:
            print('666')
            value.append((each[0],'其他'))
            print(each[0])
            cur.executemany('insert into t_url_classify values(%s,%s)', value)
            db.commit()
            count += 1
            print(count)
    print(value)
    print('finish')

     使用pandas读取数据库进行统计

    import pandas as pd
    from sqlalchemy import create_engine
    engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8')
    sql = pd.read_sql('t_useripvisittrace', engine, chunksize = 10000)
    output='C:\Users\lenovo\Desktop\count_.xls'
    '''z
    用create_engine建立连接,连接地址的意思依次为“数据库格式(mysql)+程序名(pymysql)+账号密码@地址端口/数据库名(test)”,最后指定编码为utf8;
    all_gzdata是表名,engine是连接数据的引擎,chunksize指定每次读取1万条记录。这时候sql是一个容器,未真正读取数据。
    '''
    import MySQLdb
    # 打开数据库连接
    db = MySQLdb.connect(host='localhost', user='root', passwd='123456', port=3306, charset="utf8", db="db_websiterecommend")
    cur = db.cursor()
    value=[]
    # for j in range(len(list(sql))):
    #     s=sql[j]['ip'].value_counts()
    #     value.append((j,s))
    #     print value
    for i in sql:
        s=i['ip'].value_counts()
        print type(s)
        value.append((list(i['ip']),list(s)))
        cur.executemany('insert into userip values(%,%s)', value)
        db.commit()
    print value
  • 相关阅读:
    c/c++生成预编译文件
    (转)vim 访问系统剪贴板
    vim 使用、设置笔记
    使用cygwin注意事项二
    (转)使用cygwin注意事项一
    cygwin Could not create directory '/home/Administrator/.ssh'
    Linux 学习笔记 文件权限
    Linux 学习笔记 Linux环境变量初稿
    Linux 学习笔记 更多的bash shell命令
    Linux 学习笔记 查看文件内容诸多命令
  • 原文地址:https://www.cnblogs.com/caicaihong/p/5869653.html
Copyright © 2020-2023  润新知