一个简单的按行数对大文件进行分片的Python程序

项目中碰到了这么一个需求：
有一个record文件，每行一个item，整个文件大小在2GB左右。根据要求，需要每天向其他系统提供100000个item，怎么处理比较好？
考虑之后觉得分片的主意不错，先根据每片的item数对这个大文件进行分片，然后每天投放一片即可。
具体python代码如下：

View Code

# -*- coding: utf-8 -*-

import os
import sys
import shutil
import time
# import linecache
import hashlib
import zlib
import binascii
import urllib2

import logging


# Output directory for the generated "samplelist_N.list" page files;
# initialize() deletes and recreates it on every run.
datas_dir = "./datas/"
# Number of lines (items) per page, passed to split_file() as lines_per_page.
# NOTE(review): the accompanying article mentions 100000 items per day while
# this value is 10000 -- confirm which one is intended.
items_per_page = 10000
# Prefix prepended to every input line to build a download URL.
url_prefix = "http://172.16.1.110:80/download/"
# Output directory for the "notintergrity_N.list" files written by do_check().
check_result_dir  = "./results/"

# Module-level logger used for error reporting by the check functions.
logger = logging.getLogger(__name__)

def initialize():
    """
    @summary: recreate an empty datas directory for the generated pages

    If datas_dir already exists as a directory it is removed together with
    everything inside it; afterwards a fresh, empty directory is created.
    """
    # isdir() is only true for an existing path, so one test is enough
    if os.path.isdir(datas_dir):
        print("begin to remove old datas directory")
        shutil.rmtree(datas_dir)

    print("begin to make datas directory")
    # pause for one second between rmtree and mkdir to avoid the two
    # operations conflicting with each other
    time.sleep(1)
    os.mkdir(datas_dir)
    
       
def read_specific_lines(file, lines_to_read):
    """
    @summary: yield the lines of file whose 1-based numbers are requested
    @param file: any iterable of lines (an open file, a list, ...)
    @param lines_to_read: iterable of 1-based int line numbers; order and
        duplicates do not matter
    @return: generator yielding the selected lines in the order they occur
        in file; yields nothing when lines_to_read is empty
    """
    wanted = set(lines_to_read)
    if not wanted:
        # nothing requested: max() on an empty set would raise ValueError
        return
    last = max(wanted)
    for n, line in enumerate(file, 1):
        if n in wanted:
            yield line
        if n >= last:
            # stop immediately after the last wanted line so that no extra
            # line is consumed from the underlying iterable
            return
def split_file(filename, lines_per_page):
    """
    @summary: split the file into pages of at most lines_per_page lines
    @param filename: path of the text file to read
    @param lines_per_page: number of lines per page; values <= 0 are
        treated as 1
    @return: generator yielding one list of lines per page; the last page
        may be shorter, and an empty file yields no page at all
    """
    if lines_per_page <= 0:
        lines_per_page = 1

    with open(filename, 'r') as fp:
        page = []
        for line in fp:
            page.append(line)
            if len(page) == lines_per_page:
                yield page
                # start a new list so the page already handed out is not
                # modified afterwards
                page = []
        # emit the trailing partial page only if it holds something;
        # yielding unconditionally would repeat the last full page when the
        # line count is an exact multiple of lines_per_page
        if page:
            yield page

def write_to_file(lines, filename):
    """
    @summary: save lines to filename, each one prefixed with url_prefix
    @param lines: iterable of strings; they are written as-is after the
        prefix, so each should carry its own line ending
    @param filename: path of the file to create or overwrite
    """
    with open(filename, 'w') as out:
        # build every output line lazily: download prefix + original item
        out.writelines(url_prefix + item for item in lines)

def calculate_md5_crc32(msg):
    """
    @summary: build the "MD5.CRC32" fingerprint string of msg
    @param msg: the raw data to fingerprint
    @return: upper-case hex MD5 digest, a dot, then the CRC32 as 8
        upper-case hex digits
    """
    md5_hex = hashlib.md5(msg).hexdigest().upper()
    # mask to 32 bits so the checksum is always formatted as unsigned
    crc32_hex = "%08X" % (binascii.crc32(msg) & 0xffffffff)
    return md5_hex + '.' + crc32_hex

def check_file_integrity(download_url):
    """
    @summary: download a file and check its integrity

    The last path component of download_url is compared with the
    "MD5.CRC32" string that calculate_md5_crc32() computes over the
    downloaded content.
    @param download_url: URL of the file to download
    @return: True when the two strings are equal; False when they differ
        or when any error occurs (the error is logged, not raised)
    """
    try:
        file_name = download_url.rsplit("/", 1)[1]
        response = urllib2.urlopen(download_url)
        try:
            content = response.read()
        finally:
            # always release the connection, even when read() fails
            response.close()
        md5_crc32 = calculate_md5_crc32(content)
        print("file_name = %s, md5_crc32 = %s" % (file_name, md5_crc32))
        return file_name == md5_crc32
    except Exception as ex:
        # best-effort check: any failure counts as "not intact"
        logger.exception(ex)
        return False
    
def do_check():
    """
    @summary: check the integrity of the files listed in alive_sample.log

    The result directory is recreated from scratch, then alive_sample.log
    is read page by page (items_per_page lines each). Only the first page
    is processed: the loop stops as soon as the second page is reached.
    Every line whose download fails the integrity check is written to
    "notintergrity_<page>.list" inside check_result_dir and logged.
    """
    if os.path.exists(check_result_dir) and os.path.isdir(check_result_dir):
        # clear the old result dir
        print("begin to remove old result directory")
        shutil.rmtree(check_result_dir)
    print("begin to make result directory")
    # pause for one second between rmtree and mkdir to avoid the two
    # operations conflicting with each other
    time.sleep(1)
    os.mkdir(check_result_dir)

    for n, lines in enumerate(split_file("alive_sample.log", items_per_page)):
        print("begin to check %d sample list" % (n + 1))
        if n >= 1:
            # only the first page is checked
            break
        filename = os.path.join(check_result_dir, "notintergrity_" + str(n + 1) + ".list")
        # the with-block closes the result file exactly once, even if an
        # unexpected error escapes the loop below
        with open(filename, 'w') as fp:
            for line in lines:
                try:
                    download_url = url_prefix + line.strip()
                    if check_file_integrity(download_url):
                        print("%s check OK" % line)
                    else:
                        fp.write(line)
                        # flush per failure so results survive an abort
                        fp.flush()
                        logger.error("check integrity error, download_url = %s", download_url)
                except Exception as ex:
                    # one bad line must not stop the remaining checks
                    logger.exception(ex)
# Script entry point: a one-off integrity check followed by the code that
# splits "20120106.rpt" into page files.
if __name__ == "__main__":
    # project-local module not visible here; presumably it configures
    # logging as an import side effect -- TODO confirm
    import myloggingconfig
    #do_check()
    #assert False
    # ad-hoc check of a single known download URL; prints True or False
    print check_file_integrity("http://172.16.1.110:80/download/B4D2EF861106F6812668D5163EA9CD58.4F38C168")
    # NOTE(review): this assert always fails, so unless Python runs with -O
    # the script stops here with AssertionError and nothing below executes.
    # It looks like a debugging leftover -- confirm before removing.
    assert False
    initialize()
    # write every page of the report to its own numbered sample list file
    for n, lines in enumerate(split_file("20120106.rpt", items_per_page)):
        print "begin construct %d sample list" %( n+1)
##        if n > 4:
##            break
        # construct file name (page numbers start at 1)
        filename = os.path.join(datas_dir, "samplelist_" + str(n + 1) + ".list")
        write_to_file(lines, filename)

上述代码中还包含了计算MD5和CRC32的工具函数（用于校验下载文件的完整性），而整个分片功能都由split_file函数实现：

def split_file(filename, lines_per_page):
    """
    @summary: split the file into pages of at most lines_per_page lines
    @param filename: path of the text file to read
    @param lines_per_page: number of lines per page; values <= 0 are
        treated as 1
    @return: generator yielding one list of lines per page; the last page
        may be shorter, and an empty file yields no page at all
    """
    if lines_per_page <= 0:
        lines_per_page = 1

    with open(filename, 'r') as fp:
        page = []
        for line in fp:
            page.append(line)
            if len(page) == lines_per_page:
                yield page
                # start a new list so the page already handed out is not
                # modified afterwards
                page = []
        # emit the trailing partial page only if it holds something;
        # yielding unconditionally would repeat the last full page when the
        # line count is an exact multiple of lines_per_page
        if page:
            yield page

相关阅读:
机器学习系列-tensorflow-01-急切执行API
Python3基础-代码阅读系列—优惠码生成
 英语口语练习系列-C01-好棒
 Python3基础系列——枚举类型大揭秘
 Python3字符串-最容易理解的方式
 英语词性系列-B02-动词
 专业方向系列-02-基于深度学习的诊断方法
 英语词性系列-B01-名词
 英语进阶系列-A06-本周总结
 Python数据可视化系列-02-pyecharts可视化非常cool
原文地址：https://www.cnblogs.com/Jerryshome/p/2334437.html