• LZ77.py


    import math
    from bitarray import bitarray
    
    class LZ77Compressor:
    	"""
    	A simplified implementation of the LZ77 Compression Algorithm
    	"""
    	MAX_WINDOW_SIZE = 400
    
    	def __init__(self, window_size=20):
    		self.window_size = min(window_size, self.MAX_WINDOW_SIZE) 
    		self.lookahead_buffer_size = 15 # length of match is at most 4 bits
    
    	def compress(self, input_file_path, output_file_path=None, verbose=False):
    		"""
    		Given the path of an input file, its content is compressed by applying a simple 
    		LZ77 compression algorithm. 
    
    		The compressed format is:
    		0 bit followed by 8 bits (1 byte character) when there are no previous matches
    			within window
    		1 bit followed by 12 bits pointer (distance to the start of the match from the 
    			current position) and 4 bits (length of the match)
    		
    		If a path to the output file is provided, the compressed data is written into 
    		a binary file. Otherwise, it is returned as a bitarray
    
    		if verbose is enabled, the compression description is printed to standard output
    		"""
    		data = None
    		i = 0
    		output_buffer = bitarray(endian='big')
    
    		# read the input file 
    		try:
    			with open(input_file_path, 'rb') as input_file:
    				data = input_file.read()
    		except IOError:
    			print 'Could not open input file ...'
    			raise
    
    		while i < len(data):
    			#print i
    
    			match = self.findLongestMatch(data, i)
    
    			if match: 
    				# Add 1 bit flag, followed by 12 bit for distance, and 4 bit for the length
    				# of the match 
    				(bestMatchDistance, bestMatchLength) = match
    
    				output_buffer.append(True)
    				output_buffer.frombytes(chr(bestMatchDistance >> 4))
    				output_buffer.frombytes(chr(((bestMatchDistance & 0xf) << 4) | bestMatchLength))
    
    				if verbose:
    					print "<1, %i, %i>" % (bestMatchDistance, bestMatchLength),
    
    				i += bestMatchLength
    
    			else:
    				# No useful match was found. Add 0 bit flag, followed by 8 bit for the character
    				output_buffer.append(False)
    				output_buffer.frombytes(data[i])
    				
    				if verbose:
    					print "<0, %s>" % data[i],
    
    				i += 1
    
    		# fill the buffer with zeros if the number of bits is not a multiple of 8		
    		output_buffer.fill()
    
    		# write the compressed data into a binary file if a path is provided
    		if output_file_path:
    			try:
    				with open(output_file_path, 'wb') as output_file:
    					output_file.write(output_buffer.tobytes())
    					print "File was compressed successfully and saved to output path ..."
    					return None
    			except IOError:
    				print 'Could not write to output file path. Please check if the path is correct ...'
    				raise
    
    		# an output file path was not provided, return the compressed data
    		return output_buffer
    
    
    	def decompress(self, input_file_path, output_file_path=None):
    		"""
    		Given a string of the compressed file path, the data is decompressed back to its 
    		original form, and written into the output file path if provided. If no output 
    		file path is provided, the decompressed data is returned as a string
    		"""
    		data = bitarray(endian='big')
    		output_buffer = []
    
    		# read the input file
    		try:
    			with open(input_file_path, 'rb') as input_file:
    				data.fromfile(input_file)
    		except IOError:
    			print 'Could not open input file ...'
    			raise
    
    		while len(data) >= 9:
    
    			flag = data.pop(0)
    
    			if not flag:
    				byte = data[0:8].tobytes()
    
    				output_buffer.append(byte)
    				del data[0:8]
    			else:
    				byte1 = ord(data[0:8].tobytes())
    				byte2 = ord(data[8:16].tobytes())
    
    				del data[0:16]
    				distance = (byte1 << 4) | (byte2 >> 4)
    				length = (byte2 & 0xf)
    
    				for i in range(length):
    					output_buffer.append(output_buffer[-distance])
    		out_data =  ''.join(output_buffer)
    
    		if output_file_path:
    			try:
    				with open(output_file_path, 'wb') as output_file:
    					output_file.write(out_data)
    					print 'File was decompressed successfully and saved to output path ...'
    					return None 
    			except IOError:
    				print 'Could not write to output file path. Please check if the path is correct ...'
    				raise 
    		return out_data
    
    
    	def findLongestMatch(self, data, current_position):
    		""" 
    		Finds the longest match to a substring starting at the current_position 
    		in the lookahead buffer from the history window
    		"""
    		end_of_buffer = min(current_position + self.lookahead_buffer_size, len(data) + 1)
    
    		best_match_distance = -1
    		best_match_length = -1
    
    		# Optimization: Only consider substrings of length 2 and greater, and just 
    		# output any substring of length 1 (8 bits uncompressed is better than 13 bits
    		# for the flag, distance, and length)
    		for j in range(current_position + 2, end_of_buffer):
    
    			start_index = max(0, current_position - self.window_size)
    			substring = data[current_position:j]
    
    			for i in range(start_index, current_position):
    
    				repetitions = len(substring) / (current_position - i)
    
    				last = len(substring) % (current_position - i)
    
    				matched_string = data[i:current_position] * repetitions + data[i:i+last]
    
    				if matched_string == substring and len(substring) > best_match_length:
    					best_match_distance = current_position - i 
    					best_match_length = len(substring)
    
    		if best_match_distance > 0 and best_match_length > 0:
    			return (best_match_distance, best_match_length)
    		return None
    

      

  • 相关阅读:
    正则表达式
    模块的初始
    装饰器 1
    匿名函数
    内置函数一(待跟新)
    python 学习迭代器的认识
    python 学习笔记 —— 函数的认识
    搭建yum 源
    python 学习第六天 文件的处理方式
    python 学习第五天 字典得 增删改查
  • 原文地址:https://www.cnblogs.com/rmthy/p/6238200.html
Copyright © 2020-2023  润新知