• 将 Book-Crossing Dataset 书籍推荐算法中 CVS 格式测试数据集导入到MySQL数据库


    本文内容

    最近看《写给程序员的数据挖掘指南》,研究推荐算法,书中的测试数据集是 Book-Crossing Dataset 提供的亚马逊用户对书籍评分的真实数据。推荐大家看本书,写得不错,立刻就能对推荐算法上手,甚至应用到你的项目中。

    Book-Crossing Dataset 提供两种格式的数据集:CVS 格式SQL dump,问题是:

    如果你有 UE 打开 cvs 文件,有乱码。无论如何转换编码,都不行~因为,这个文件是亚马逊通过程序持久化后,再导出来的。你还会发现,文件中有 html 标记,另外,关于用户名,书名等等信息,基本都是德文的(看域名就知道了)~

    虽然,作者提供了加载测试数据集的 python 代码,不过不能导入到 MySQL 数据库中,其中,作者只是简单地按分号来分割字段内容(虽然推荐算法并不需要全部字段),可数据集中包含类似“ऩ”或““”这样的字符,不可能导入到 MySQL 数据库中~

    你也许会问,作者都不导入到数据库,你为什么要导?因为,作者提供的推荐算法属于内存模型,也就是一次性把数据加载到内存,但之前,总还是要持久化吧~

    因此,只能改造一下作者的 Python 代码~

    Github Demo

    改造后测试数据集

    Python

    # -*- coding: utf-8 -*-
     
    import mysql.connector
    import codecs
    import string
    import os
    import sys
    import ConfigParser
    from collections import OrderedDict
    import re
     
    class MysqlPythonFacotry(object):
        """
            Python Class for connecting  with MySQL server.
        """
     
        __instance = None
        __host = None
        __user = None
        __password = None
        __database = None
        __session = None
        __connection = None
     
        def __init__(self, host='localhost', user='root', password='', database=''):
            self.__host = host
            self.__user = user
            self.__password = password
            self.__database = database
        ## End def __init__
     
        def open(self):
            try:
                cnx = mysql.connector.connect(host=self.__host,
                    user= self.__user,
                    password= self.__password,
                    database= self.__database)
                self.__connection = cnx
                self.__session = cnx.cursor()
            except mysql.connector.Error as e:
                print('connect fails!{}'.format(e))
        ## End def open
     
        def close(self):
            self.__session.close()
            self.__connection.close()
        ## End def close
     
        def select(self, table, where=None, *args, **kwargs):
            result = None
            query = 'SELECT '
            keys = args
            values = tuple(kwargs.values())
            l = len(keys) - 1
     
            for i, key in enumerate(keys):
                query += "`" + key + "`"
                if i <; l:
                    query += ","
            ## End for keys
     
            query += 'FROM %s' % table
     
            if where:
                query += " WHERE %s" % where
            ## End if where
     
            self.__session.execute(query, values)
            number_rows = self.__session.rowcount
            number_columns = len(self.__session.description)
            result = self.__session.fetchall()
     
            return result
        ## End def select
     
        def update(self, table, where=None, *args, **kwargs):
            try:
                query = "UPDATE %s SET " % table
                keys = kwargs.keys()
                values = tuple(kwargs.values()) + tuple(args)
                l = len(keys) - 1
                for i, key in enumerate(keys):
                    query += "`" + key + "` = %s"
                    if i <; l:
                        query += ","
                    ## End if i less than 1
                ## End for keys
                query += " WHERE %s" % where
     
                self.__session.execute(query, values)
                self.__connection.commit()
     
                # Obtain rows affected
                update_rows = self.__session.rowcount
     
            except mysql.connector.Error as e:
                print(e.value)
     
            return update_rows
        ## End function update
     
        def insert(self, table, *args, **kwargs):
            values = None
            query = "INSERT INTO %s " % table
            if kwargs:
                keys = kwargs.keys()
                values = tuple(kwargs.values())
                query += "(" + ",".join(["`%s`"] * len(keys)) % tuple(keys) + ") VALUES (" + ",".join(["%s"] * len(values)) + ")"
            elif args:
                values = args
                query += " VALUES(" + ",".join(["%s"] * len(values)) + ")"
     
            self.__session.execute(query, values)
            self.__connection.commit()
            cnt = self.__session.rowcount
            return cnt
        ## End def insert
     
        def delete(self, table, where=None, *args):
            query = "DELETE FROM %s" % table
            if where:
                query += ' WHERE %s' % where
     
            values = tuple(args)
     
            self.__session.execute(query, values)
            self.__connection.commit()
            delete_rows = self.__session.rowcount
            return delete_rows
        ## End def delete
     
        def select_advanced(self, sql, *args):
            od = OrderedDict(args)
            query = sql
            values = tuple(od.values())
            self.__session.execute(query, values)
            number_rows = self.__session.rowcount
            number_columns = len(self.__session.description)
            result = self.__session.fetchall()
            return result
        ## End def select_advanced
    ## End class
     
     
    class ErrorMyProgram(Exception):
        """
            My Exception Error Class
        """
        def __init__(self, value):
            self.value = value
        ##End def __init__
            
        def __str__(self):
            return repr(self.value)
        ##End def __str__
     ## End class ErrorMyProgram
        
        
    class LoadAppConf(object):
        """
            Load app.conf Config File Class
        """
        __configFileName = "app.conf"
     
        def __init__(self):
            config = ConfigParser.ConfigParser()
            config.read(self.__configFileName)
     
            self.biz_db_host = config.get("biz_db","host") 
            self.biz_db_user = config.get("biz_db","user") 
            self.biz_db_password = config.get("biz_db","password")
            self.biz_db_database = config.get("biz_db","database")
        ## End def __init__
     ## End class LoadAppConf    
            
    class Biz_Base(object):
        """
            biz base class
        """
        def __init__(self, db):
            self.db = db
        ## End def __init__
     ## End class Biz_Base
            
     
    class Biz_bx_book_ratings(Biz_Base):
        """
            bx_book_ratings table
        """
     
        __tableName = "bx_book_ratings"
     
        def __init__(self, db):
            Biz_Base.__init__(self, db)
        ## End def __init__
            
        def insert(self, userid, isbn, bookrating):
            cnt = self.db.insert(self.__tableName,
                userid = userid, 
                isbn = isbn,
                bookrating = bookrating)
            return cnt >; 0
        ## End def insert
     ## End class Biz_bx_book_ratings    
     
     
    class Biz_bx_books(Biz_Base):
        """
            bx_books table
        """
     
        __tableName = "bx_books"
     
        def __init__(self, db):
             Biz_Base.__init__(self, db)
        ## End def __init__
             
        def insert(self, isbn, booktitle, bookauthor, yearofpublication, publisher, imageurls, imageurlm, imageurll):
            cnt = self.db.insert(self.__tableName,
                isbn = isbn, 
                booktitle = booktitle, 
                bookauthor = bookauthor,
                yearofpublication = yearofpublication, 
                publisher = publisher, 
                imageurls = imageurls, 
                imageurlm = imageurlm, 
                imageurll = imageurll)
            return cnt >; 0
        ## End def insert
    ## End class Biz_bx_books 
     
    class Biz_bx_users(Biz_Base):
        """
            bx_users table
        """
     
        __tableName = "bx_users"
     
        def __init__(self, db):
             Biz_Base.__init__(self, db)
        ## End def __init__
             
        def insert(self, userid, location, age):
            cnt = self.db.insert(self.__tableName,
                userid = userid, 
                location = location,
                age = age)
            return cnt >; 0
        ## End def insert
    ## End class Biz_bx_users 
     
    def regx(l):
        """
            split line by regex
        """
        p = re.compile(r'"[^"]*"')
        return p.findall(l)
    ## End def regx 
     
    class LoadDataset(object):
        """
            bx_books table
        """
        
        __loadConf = None
        
        __users = None
        __books = None
        __book_ratings = None
        
        __bizDb = None    
     
        def __init__(self):
            self.__loadConf = LoadAppConf()
            
            self.__bizDb = MysqlPythonFacotry(self.__loadConf.biz_db_host,
                     self.__loadConf.biz_db_user, 
                     self.__loadConf.biz_db_password,
                     self.__loadConf.biz_db_database)
     
            self.__users = Biz_bx_users(self.__bizDb)
            self.__books = Biz_bx_books(self.__bizDb)
            self.__book_ratings = Biz_bx_book_ratings(self.__bizDb)
        
            self.__bizDb.open()
        ## End def __init__
            
        def toDB(self, path=''):
            """
                loads the BX book dataset. Path is where the BX files are
                located
            """
            
            self.data = {}
            i = 0
            j = 0
            try:
                #
                # First load book ratings into self.data
                #
                f = codecs.open(path + "BX-Book-Ratings.csv", 'r', 'utf8')
                for line in f:
                    i += 1
                    j += 1
                    
                    print(j)
                    print(line)
                    
                    #separate line into fields
                    fields = line.split(';')
                    user = fields[0].strip('"')
                    book = fields[1].strip('"')
                    rating = int(fields[2].strip().strip('"'))
     
                    self.__book_ratings.insert(user, book, rating)
     
                f.close()
                #
                # Now load books into self.productid2name
                # Books contains isbn, title, and author among other fields
                #
                j = 0
                f = codecs.open(path + "BX-Books.csv", 'r', 'utf8')
                for line in f:
                    i += 1
                    j += 1
     
                    print(j)
                    print(line)
                    
                    #separate line into fields
                    fields = regx(line)
                    isbn = fields[0].strip('"')
                    title = fields[1].strip('"')
                    author = fields[2].strip().strip('"')
                    yearOfPublication = fields[3].strip().strip('"')
                    publisher = fields[4].strip().strip('"')
                    imageUrlS = fields[5].strip().strip('"')
                    imageUrlM = fields[6].strip().strip('"')
                    imageUrlL = fields[7].strip().strip('"')
     
                    self.__books.insert(isbn, title, author, yearOfPublication, publisher, imageUrlS, imageUrlM, imageUrlL)
                f.close()
                #
                #  Now load user info into both self.userid2name and
                #  self.username2id
                #
                j = 0
                f = codecs.open(path + "BX-Users.csv", 'r', 'utf8')
                for line in f:
                    i += 1
                    j += 1
                    
                    print(j)
                    print(line)
                    
                    #separate line into fields                
                    fields = regx(line)
                    userid = fields[0].strip('"')
                    location = fields[1].strip('"')
                    if len(fields) >; 2:
                        age = fields[2].strip().strip('"')
                    else:
                        age = None
                    if age != None:
                        value = location + '  (age: ' + age + ')'
                    else:
                        value = location
     
                    if age == None:
                        age =0
       
                    self.__users.insert(userid, location, age)
                                        
                f.close()
            except  ErrorMyProgram as e:
                print(e.value)
            finally:
                self.__bizDb.close()
     
            print(i)
        ## End def toDB
    ## End class LoadData    

    Github Demo

    测试数据集

  • 相关阅读:
    Oracle自带的sql developer导入导出数据
    LSMW批处理工具操作手册
    JS控制TABLE表格在任意一行下面添加一行(有待完善)
    SAP销售模块塑工常见问题和解决方案(自己收藏)
    SAP采购订单历史明细报表源代码(自己收藏)
    SAP公司间采购订单关联交货单报表源代码(自己收藏)
    《嵌入式系统可靠性设计技术及案例解析》读书笔记(七)
    《嵌入式系统可靠性设计技术及案例解析》读书笔记(六)
    《嵌入式系统可靠性设计技术及案例解析》读书笔记(五)
    《嵌入式系统可靠性设计技术及案例解析》读书笔记(四)
  • 原文地址:https://www.cnblogs.com/liuning8023/p/5412200.html
Copyright © 2020-2023  润新知