• 一个简单的根据行数对大文件进行分片的 Python 程序


    项目中碰到了这么一个需求:
    有一个 record 文件,每行一个 item,整个文件大小在 2 GB 左右。根据要求,需要每天向其他系统提供 100000 个 item,怎么处理比较好?
    考虑之后觉得分片的主意不错:先根据每片的 item 数对这个大文件进行分片,然后每天投放一片即可。
    具体 Python 代码如下:

    View Code
    # -*- coding: utf-8 -*-

    import os
    import sys
    import shutil
    import time
    # import linecache
    import hashlib
    import zlib
    import binascii
    import urllib2

    import logging


    # Directory that receives the generated page files; initialize() deletes
    # and recreates it.
    datas_dir = "./datas/"
    # Number of lines (items) placed in each page by split_file().
    # NOTE(review): the accompanying article text mentions 100000 items per
    # day - confirm which value is intended.
    items_per_page = 10000
    # Prefix prepended to each record to form its download URL.
    url_prefix = "http://172.16.1.110:80/download/"
    # Directory where do_check() writes the records that failed the
    # integrity check.
    check_result_dir = "./results/"

    # Module-level logger used for error reporting.
    logger = logging.getLogger(__name__)

    def initialize():
        """
        @summary: recreate ``datas_dir`` as an empty working directory,
        removing any previous copy first
        """
        # isdir() is already False for a missing path, so one call covers
        # both the "exists" and the "is a directory" condition
        stale_dir_present = os.path.isdir(datas_dir)
        if stale_dir_present:
            # drop the pages left over from a previous run
            print("begin to remove old datas directory")
            shutil.rmtree(datas_dir)

        print("begin to make datas directory")
        # wait one second between rmtree and mkdir: the two were observed to
        # conflict when run back to back
        time.sleep(1)
        os.mkdir(datas_dir)


    def read_specific_lines(file, lines_to_read):
        """
        @summary: read specific lines from file

        Generator yielding, in file order, the lines whose 1-based line
        number appears in ``lines_to_read``. Iteration stops as soon as the
        highest requested line has been handled, so nothing beyond it is
        consumed from ``file``.

        @param file: any iterable of lines (an open file, a list, ...)
        @param lines_to_read: iterable of 1-based int line numbers; may be
        empty, in which case nothing is read and nothing is yielded
        @return: generator over the selected lines
        """
        wanted = set(lines_to_read)
        if not wanted:
            # max() of an empty set would raise ValueError
            return

        last = max(wanted)
        if last < 1:
            # line numbers start at 1, so nothing can match; do not touch file
            return

        for number, line in enumerate(file, 1):
            if number in wanted:
                yield line
            if number >= last:
                # stop right after the last wanted line instead of pulling one
                # extra item out of the underlying iterator
                return
    def split_file(filename, lines_per_page):
        """
        @summary: split the file into pages of n lines each

        Generator: reads ``filename`` line by line and yields one list of
        lines per page. Every page holds exactly ``lines_per_page`` lines
        except possibly the last one, which holds the remainder. Each page is
        yielded exactly once and nothing is yielded for an empty file.

        @param filename: path of the text file to split
        @param lines_per_page: number of lines per page; values <= 0 are
        treated as 1
        @return: generator of lists of lines (line endings are preserved)
        """
        if lines_per_page <= 0:
            lines_per_page = 1

        with open(filename, 'r') as fp:
            page = []
            for line in fp:
                page.append(line)
                if len(page) == lines_per_page:
                    yield page
                    # start a fresh list so the page just handed out is
                    # neither mutated nor yielded a second time
                    page = []
            # trailing partial page, if any
            if page:
                yield page

    def write_to_file(lines, filename):
        """
        @summary: write every line to ``filename`` with ``url_prefix``
        prepended; the file is created or truncated
        """
        with open(filename, 'w') as out:
            # lines are written as-is after the prefix: no separator is added
            out.writelines(url_prefix + entry for entry in lines)

    def calculate_md5_crc32(msg):
        """
        @summary: build the "MD5.CRC32" signature of msg
        @return: the upper-case hex MD5 digest, a dot, then the CRC32 as
        eight upper-case hex digits
        """
        md5_hex = hashlib.md5(msg).hexdigest().upper()
        # mask to 32 bits so the checksum is formatted as an unsigned value
        crc32_hex = "%08X" % (binascii.crc32(msg) & 0xffffffff)
        return ".".join((md5_hex, crc32_hex))

    def check_file_integrity(download_url):
        """
        @summary: download a file and check its integrity

        The last path segment of ``download_url`` is taken as the expected
        signature. The downloaded content is hashed with
        calculate_md5_crc32() and the result is compared with that segment.

        @param download_url: full URL of the file to verify
        @return: True when the computed signature equals the file name;
        False on a mismatch or on any error (the error is logged)
        """
        try:
            file_name = download_url.rsplit("/", 1)[1]
            response = urllib2.urlopen(download_url)
            try:
                content = response.read()
            finally:
                # release the connection even when read() fails; this runs
                # once per record, so leaked handles would pile up
                response.close()
            md5_crc32 = calculate_md5_crc32(content)
            print("file_name = %s, md5_crc32 = %s" % (file_name, md5_crc32))
            return file_name == md5_crc32
        except Exception as ex:
            logger.exception(ex)
            return False

    def do_check():
        """
        @summary: verify the records of "alive_sample.log" and list the
        failures

        Recreates ``check_result_dir``, pages the sample file with
        split_file() and runs check_file_integrity() on every record of the
        first page. Records that fail are written to
        "notintergrity_<page>.list" inside ``check_result_dir`` and logged.
        Only the first page is processed: the loop stops when it reaches the
        second one.
        """
        if os.path.isdir(check_result_dir):
            # clear the results of the previous run
            print("begin to remove old result directory")
            shutil.rmtree(check_result_dir)
        print("begin to make result directory")
        # wait one second between rmtree and mkdir: the two were observed to
        # conflict when run back to back
        time.sleep(1)
        os.mkdir(check_result_dir)

        for n, lines in enumerate(split_file("alive_sample.log", items_per_page)):
            print("begin to check %d sample list" % (n + 1))
            if n >= 1:
                # only the first page is checked
                break
            filename = os.path.join(check_result_dir, "notintergrity_" + str(n + 1) + ".list")
            # the with-block closes the result file exactly once, and no
            # close() is attempted when no page was ever opened
            with open(filename, 'w') as fp:
                for line in lines:
                    try:
                        download_url = url_prefix + line.strip()
                        if check_file_integrity(download_url):
                            print("%s check OK" % line)
                        else:
                            fp.write(line)
                            # flush per failure so the list is usable while
                            # the check is still running
                            fp.flush()
                            logger.error("check integrity error, download_url = %s", download_url)
                    except Exception as ex:
                        # keep going: one bad record must not abort the page
                        logger.exception(ex)
    if __name__ == "__main__":
        # project-local module imported for its side effects
        # NOTE(review): presumably it configures logging - confirm
        import myloggingconfig

        # No debugging halt here: an `assert False` placed before this point
        # would make the split below unreachable.
        initialize()
        for n, lines in enumerate(split_file("20120106.rpt", items_per_page)):
            print("begin construct %d sample list" % (n + 1))
            # one output file per page, numbered from 1
            filename = os.path.join(datas_dir, "samplelist_" + str(n + 1) + ".list")
            write_to_file(lines, filename)

    上述代码中包含了计算 MD5 和 CRC32 的工具,整个分片功能包含在 split_file 函数中:

    def split_file(filename, lines_per_page):
        """
        @summary: split the file into pages of n lines each

        Generator: reads ``filename`` line by line and yields one list of
        lines per page. Every page holds exactly ``lines_per_page`` lines
        except possibly the last one, which holds the remainder. Each page is
        yielded exactly once and nothing is yielded for an empty file.

        @param filename: path of the text file to split
        @param lines_per_page: number of lines per page; values <= 0 are
        treated as 1
        @return: generator of lists of lines (line endings are preserved)
        """
        if lines_per_page <= 0:
            lines_per_page = 1

        with open(filename, 'r') as fp:
            page = []
            for line in fp:
                page.append(line)
                if len(page) == lines_per_page:
                    yield page
                    # start a fresh list so the page just handed out is
                    # neither mutated nor yielded a second time
                    page = []
            # trailing partial page, if any
            if page:
                yield page



  • 相关阅读:
    机器学习系列-tensorflow-01-急切执行API
    Python3基础-代码阅读系列—优惠码生成
    英语口语练习系列-C01-好棒
    Python3基础系列——枚举类型大揭秘
    Python3字符串-最容易理解的方式
    英语词性系列-B02-动词
    专业方向系列-02-基于深度学习的诊断方法
    英语词性系列-B01-名词
    英语进阶系列-A06-本周总结
    Python数据可视化系列-02-pyecharts可视化非常cool
  • 原文地址:https://www.cnblogs.com/Jerryshome/p/2334437.html
Copyright © 2020-2023  润新知