新版字符截取函数

新版字符截取函数

#!/usr/bin/env python
#encoding=utf-8
import re
def find_tag_idx(html, tag, count=1, use_re=False):
    """Locate the ``count``-th occurrence of ``tag`` in ``html``, scanning forward.

    Args:
        html: Text to search.
        tag: Literal substring, or a regular-expression pattern when
            ``use_re`` is true.
        count: 1-based number of the occurrence wanted.
        use_re: Treat ``tag`` as a regular expression.

    Returns:
        A ``(start, end)`` tuple such that ``html[start:end]`` is the match.
        ``(-1, 0)`` is returned when fewer than ``count`` occurrences exist
        (or ``count`` is smaller than 1), in both literal and regex mode.

    Example:
        find_tag_idx("abcabc", "e")      # -> (-1, 0)
        find_tag_idx("abcabc", "b", 2)   # -> (4, 5)
    """
    not_found = (-1, 0)
    if count < 1:
        return not_found

    if use_re:
        for seen, match in enumerate(re.finditer(tag, html), 1):
            if seen == count:
                return match.span()
        return not_found

    position = -1
    search_from = 0
    remaining = count
    while remaining > 0:
        position = html.find(tag, search_from)
        if position == -1:
            # Ran out of occurrences before reaching the requested one.
            return not_found
        # Continue after this hit so occurrences never overlap.
        search_from = position + len(tag)
        remaining -= 1
    return (position, position + len(tag))
def rfind_tag_idx(html, tag, count=1, use_re=False):
    """Locate the ``count``-th occurrence of ``tag`` counted from the end of ``html``.

    Args:
        html: Text to search.
        tag: Literal substring, or a regular-expression pattern when
            ``use_re`` is true.
        count: 1-based number of the occurrence wanted, counted backwards
            (1 is the last occurrence).
        use_re: Treat ``tag`` as a regular expression. The matches are the
            forward, non-overlapping ones produced by ``re.finditer``,
            numbered from the last one.

    Returns:
        A ``(start, end)`` tuple such that ``html[start:end]`` is the match.
        ``(-1, 0)`` is returned when fewer than ``count`` occurrences exist
        (or ``count`` is smaller than 1), in both literal and regex mode.

    Example:
        a, b = rfind_tag_idx("abcabc", "a", count=1, use_re=True)  # (3, 4)
        "abcabc"[a:b]                                              # "a"
    """
    not_found = (-1, 0)
    if count < 1:
        return not_found

    if use_re:
        matches = list(re.finditer(tag, html))
        if count > len(matches):
            return not_found
        return matches[-count].span()

    position = -1
    search_end = len(html)
    remaining = count
    while remaining > 0:
        # str.rfind treats its end bound as exclusive, so passing the start
        # of the previous hit restricts the search to the text before it.
        position = html.rfind(tag, 0, search_end)
        if position == -1:
            return not_found
        search_end = position
        remaining -= 1
    return (position, position + len(tag))
# Sample input for the commented-out manual check of rfind_tag_idx below.
html="abcabc"
#a,b=rfind_tag_idx(html,r"""" id="aa""",count=1,use_re=True)
#print a,b
#print html[a:b]

def get_part2(html, start, end, start_count=1, end_count=1,
              start_re=False, end_re=False, reverseDirection=False):
    """Return the text between a start marker and an end marker.

    Neither marker is included in the result. An empty string is returned
    when either marker cannot be located.

    Args:
        html: Text to cut from.
        start: Marker that precedes the wanted text.
        end: Marker that follows the wanted text.
        start_count: Which occurrence of ``start`` to use (1-based).
        end_count: Which occurrence of ``end`` to use (1-based).
        start_re: Treat ``start`` as a regular expression.
        end_re: Treat ``end`` as a regular expression.
        reverseDirection: When false, ``start`` is located first (scanning
            forward) and ``end`` is searched in the text after it. When
            true, ``end`` is located first (counting from the end of
            ``html``) and ``start`` is searched backwards in the text
            before it.

    Example:
        get_part2("abcabc", "a", "c", start_count=2)   # -> "b"
    """
    if reverseDirection:
        end_begin, _end_stop = rfind_tag_idx(html, end, end_count, use_re=end_re)
        if end_begin == -1:
            return ""
        head = html[:end_begin]
        start_begin, start_stop = rfind_tag_idx(head, start, start_count, use_re=start_re)
        if start_begin == -1:
            return ""
        return head[start_stop:]

    start_begin, start_stop = find_tag_idx(html, start, start_count, use_re=start_re)
    if start_begin == -1:
        return ""
    tail = html[start_stop:]
    end_begin, _end_stop = find_tag_idx(tail, end, end_count, use_re=end_re)
    if end_begin == -1:
        return ""
    return tail[:end_begin]
# Demo: pull out the text that sits between the last double quote before an
# end marker and the end marker itself, searching backwards
# (reverseDirection=True).
html="""
   <table width="770" border="0" align="center" cellpadding="0" cellspacing="0">
   <tr>
   <td width="400" height="400" valign="top">

   <table width="400" border="0" cellspacing="0" cellpadding="0">
   <tr>
   <td width="400" height="320">
<div align="center">

<a href="/PicB/201010161245413204.jpg" id="aa" class="jqzoom" title="zoom">

<img style="CURSOR: hand" src="/PicB/201010161245413204.jpg" name="rImage" width="350" height="350" border="0" align="center"> </a> </div>

"""
#html="abcabc"
# Non-raw literal: the trailing \" is an escaped quote, so s ends with a plain
# double quote and contains no backslash.
s="""" id="aa" class="jqzoom\""""#----------> this form is the more compatible one
print s
# Raw literal: the backslash is kept, so s2 ends with backslash + quote.
s2=r"""" id="aa" class="jqzoom\""""
print s2
# The same extraction with each marker, first as a literal end marker
# (end_re=False), then as a regular expression (end_re=True).
print get_part2(html,'"',s,end_re=False,reverseDirection=True)
print get_part2(html,'"',s2,end_re=False,reverseDirection=True)
print get_part2(html,'"',s,end_re=True,reverseDirection=True)
print get_part2(html,'"',s2,end_re=True,reverseDirection=True)
#print get_part2(html,'"',r"""id="aa""",end_re=True,reverseDirection=True)
下面是老版的代码：
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.17,v0.31
2010.10.16,v0.30
2010.10.15,v0.29
2010.10.14,v0.27
2010.10.13,v0.26
2010.10.12,v0.25
2010.10.10,v0.23
2010.10.10,v0.22
2010.10.09,v0.2
2010.10.07,v0.1
批量抽取详细页数据
"""
import sys
#print sys.getdefaultencoding()
####reload(sys)#生成代码时,####的会自动去除
#sys.setdefaultencoding(sys.stdout.encoding)
####sys.setdefaultencoding('utf-8')
import re
import time
import urllib2
import os.path
from urlparse import urljoin
from pyquery import PyQuery as pq
from xml.dom import minidom,Node
import types
# "${host}" and "${headers}" are template placeholders; NOTE(review): they
# look like they are substituted by a code generator before this script runs.
g_host = "${host}"

# Working directories and bookkeeping files, all relative to the current dir.
g_details_folder = os.path.join("./","details")
g_xmls_folder = os.path.join("./","xmls")
g_xmls_infos_folder = os.path.join("./","xmls_infos")
g_success_file = os.path.join("./","xmls_infos/success.txt")
g_error_file = os.path.join("./","xmls_infos/error.txt")
g_extract_links_file = os.path.join("./","details_infos/success.txt")

# HTTP request headers, parsed from "Name: value" lines of the template text.
g_headers={}
headers = """${headers}"""
headers = headers.strip().replace("\r\n","\n")
for elem in headers.split("\n"):
    # Blank lines and lines without a "Name: value" separator are skipped
    # instead of aborting the whole script with an unpacking error.
    if ":" not in elem:
        continue
    name, value = elem.split(":", 1)
    g_headers[name.strip()] = value.strip()

# Holds the extracted field values. The name shadows the builtin ``dict``,
# but it is kept because gen_xml reads this module-level name.
dict={}
#功能函数
def init():
    """Create the xmls, xmls_infos and details folders when they are missing."""
    print("初始数据")
    for folder in (g_xmls_folder, g_xmls_infos_folder, g_details_folder):
        if os.path.exists(folder):
            continue
        os.makedirs(folder)
def delete(src):
    """Best-effort recursive removal of a file or directory tree.

    Args:
        src: Path of the file or directory to remove. A path that does not
            exist is ignored.

    Removal failures (``OSError``) are ignored so that one undeletable
    entry does not stop the cleanup. Symbolic links are removed as links:
    a link that points at a directory is unlinked, and the directory it
    points to is left untouched.
    """
    # islink is checked first because isdir/isfile follow links; recursing
    # through a directory link would delete files outside of ``src``.
    if os.path.islink(src) or os.path.isfile(src):
        try:
            os.remove(src)
        except OSError:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            delete(os.path.join(src, item))
        try:
            os.rmdir(src)
        except OSError:
            # Directory not empty (something inside could not be removed).
            pass
def clear():
    """Delete the xmls and xmls_infos folders left over from earlier runs."""
    print("清除以前数据")
    for stale in (g_xmls_folder, g_xmls_infos_folder):
        delete(stale)
def size(src):
    """Print the size of a file, or the total size of the files under a folder.

    Args:
        src: Path of a file or a directory. For a directory the sizes of all
            files found by ``os.walk`` are summed.

    The value is printed in GiB, MiB or KiB with two decimals once it
    reaches the corresponding power of 1024; smaller values are printed as
    a plain byte count. Nothing is returned.
    """
    if os.path.isfile(src):
        total = os.path.getsize(src)
    else:
        total = 0
        for root, dirs, files in os.walk(src):
            total += sum(os.path.getsize(os.path.join(root, name)) for name in files)

    text = total
    for exponent, unit in ((3, "GiB"), (2, "MiB"), (1, "KiB")):
        threshold = 1024 ** exponent
        if total >= threshold:
            # float() keeps the fraction under Python 2, where int / int
            # truncates and would always print ".00".
            text = "%.2f %s" % (total / float(threshold), unit)
            break
    print("%s 大小为:%s" % (src, text))
def error(url, ex):
    """Append ``url`` as one line to the error list file (``g_error_file``).

    Args:
        url: URL whose processing failed.
        ex: The exception that was raised. It is accepted for the callers'
            convenience but is not written to the file.
    """
    # ``with`` closes the file even when the write itself fails.
    with open(g_error_file, "a") as log:
        log.write("%s\n" % (url,))
def success(url):
    """Append ``url`` as one line to the success list file (``g_success_file``).

    Args:
        url: URL that was processed successfully.
    """
    # ``with`` closes the file even when the write itself fails. The value is
    # wrapped in a 1-tuple (as in ``error``) so a tuple argument is formatted
    # as a single value instead of breaking the ``%`` operator.
    with open(g_success_file, "a") as log:
        log.write("%s\n" % (url,))

def statistics(func):
    """Decorator that prints progress counters before and after ``func`` runs.

    The counters are the number of distinct lines in
    ``g_extract_links_file`` (total), the number of distinct lines in
    ``g_success_file`` (done) and their difference (left). A missing file
    counts as zero lines.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same call signature and return value as ``func``;
        the wrapper keeps ``func``'s name and docstring.
    """
    import functools

    def count_unique_lines(path):
        """Return the number of distinct lines in ``path`` (0 if missing)."""
        if not os.path.exists(path):
            return 0
        with open(path, "r") as handle:
            return len(set(handle.readlines()))

    def tongji():
        """Print the total / successful / remaining line counters."""
        total = count_unique_lines(g_extract_links_file)
        print("total lines:%s" % total)
        successed = count_unique_lines(g_success_file)
        print("successed lines:%s" % successed)
        print("left lines:%s" % (total - successed))

    # wraps() keeps func.__name__, which cost_time prints when both
    # decorators are stacked on the same function.
    @functools.wraps(func)
    def newFunc(*args, **args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc
def cost_time(func):
    """Decorator that prints start time, end time and duration of each call.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func`` and returns its
        result; the wrapper keeps ``func``'s name and docstring.
    """
    import functools

    @functools.wraps(func)
    def newFunc(*args, **args2):
        t0 = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
        back = func(*args, **args2)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
        return back
    return newFunc
def url2filename(url):
    """Return the URL-safe Base64 encoding of ``url``, used as a file name."""
    from base64 import urlsafe_b64encode
    encoded = urlsafe_b64encode(url)
    return encoded
def url2filename2(url):
    """Derive a file name from the last path segment of ``url``.

    The text after the final "/" is returned. When the URL has no "/" or
    ends with one, the last run of digits found in the URL is used instead
    (an ``IndexError`` is raised if the URL contains no digits).
    """
    trimmed = url.strip()
    _head, slash, name = trimmed.rpartition("/")
    if slash and name:
        return name
    # raise ValueError("url2filename function parser error")
    print("启用特殊url2filename")
    return re.findall(r"\d+", trimmed)[-1]
def get_html(url):
    """Return the body of ``url``, caching it as a file in ``g_details_folder``.

    The cache file name is ``url2filename(url)``. When that file already
    exists its content is returned and no request is made; otherwise the
    page is downloaded with the ``g_headers`` request headers, stored in
    the cache file and returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        The raw page content as read from the network or from the cache.
    """
    init()
    cache_path = os.path.join(g_details_folder, url2filename(url))
    print(cache_path)

    if os.path.exists(cache_path):
        print("直接利用本地")
        # Binary mode returns the bytes exactly as they were stored.
        with open(cache_path, "rb") as cached:
            return cached.read()

    print(url)
    req = urllib2.Request(url = url,headers = g_headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    #html=urllib2.urlopen(url).read()
    with open(cache_path, "wb") as cached:
        cached.write(html)
    print("从网络抓取")
    return html
def get_part(html, start, end):
    """Return the text between the first ``start`` and the next ``end``.

    ``end`` is searched only after the end of ``start``. Neither marker is
    included in the result; an empty string is returned when either marker
    is not found.
    """
    begin = html.find(start)
    content_from = begin + len(start)
    stop = html.find(end, content_from)
    if -1 in (begin, stop):
        return ""
    return html[content_from:stop]
def find_tag_idx(html, tag, count=1, use_re=False):
    """Locate the ``count``-th occurrence of ``tag`` in ``html``, scanning forward.

    Args:
        html: Text to search.
        tag: Literal substring, or a regular-expression pattern when
            ``use_re`` is true.
        count: 1-based number of the occurrence wanted.
        use_re: Treat ``tag`` as a regular expression.

    Returns:
        A ``(start, end)`` tuple such that ``html[start:end]`` is the match.
        ``(-1, 0)`` is returned when fewer than ``count`` occurrences exist
        (or ``count`` is smaller than 1), in both literal and regex mode.

    Example:
        find_tag_idx("abcabc", "e")      # -> (-1, 0)
        find_tag_idx("abcabc", "b", 2)   # -> (4, 5)
    """
    not_found = (-1, 0)
    if count < 1:
        return not_found

    if use_re:
        for seen, match in enumerate(re.finditer(tag, html), 1):
            if seen == count:
                return match.span()
        return not_found

    position = -1
    search_from = 0
    remaining = count
    while remaining > 0:
        position = html.find(tag, search_from)
        if position == -1:
            # Ran out of occurrences before reaching the requested one.
            return not_found
        # Continue after this hit so occurrences never overlap.
        search_from = position + len(tag)
        remaining -= 1
    return (position, position + len(tag))

def get_part2(html, start, end, start_count=1, end_count=1,
              start_re=False, end_re=False):
    """Return the text between a start marker and an end marker.

    ``start`` is located first; ``end`` is then searched in the text that
    follows it. Neither marker is included in the result. An empty string
    is returned when either marker cannot be located.

    Args:
        html: Text to cut from.
        start: Marker that precedes the wanted text.
        end: Marker that follows the wanted text.
        start_count: Which occurrence of ``start`` to use (1-based).
        end_count: Which occurrence of ``end`` after ``start`` to use.
        start_re: Treat ``start`` as a regular expression.
        end_re: Treat ``end`` as a regular expression.

    Example:
        get_part2("abcabc", "a", "c", start_count=2)   # -> "b"
    """
    start_begin, start_stop = find_tag_idx(html, start, start_count, use_re=start_re)
    if start_begin == -1:
        return ""

    tail = html[start_stop:]
    end_begin, _end_stop = find_tag_idx(tail, end, end_count, use_re=end_re)
    if end_begin == -1:
        return ""
    return tail[:end_begin]

def filter_tags(html, tags=("em", "dd", "input", "h1", "h2", "h3", "br", "a",
                            "b", "span", "strong", "p", "hr", "font", "div",
                            "td", "tr", "img", "form", "table")):
    """Remove the opening and closing markup of the given HTML tags.

    Only the tags themselves are removed; the text between them is kept.
    Matching is case-insensitive and requires the complete tag name, so
    asking for ``b`` removes ``<b>`` and ``</b>`` but leaves ``<body>`` or
    ``<br>`` alone.

    Args:
        html: Markup to clean.
        tags: Iterable of tag names to strip. Empty names are ignored.

    Returns:
        ``html`` without the listed tags.
    """
    names = [re.escape(name) for name in tags if name]
    if not names:
        return html
    # "<" or "</ " + one of the names + a character that ends a tag name
    # (whitespace, "/" or ">"), then everything up to the closing ">".
    pattern = r"(?i)<(?:/ *)?(?:%s)(?=[\s/>])[\s\S]*?>" % "|".join(names)
    return re.sub(pattern, "", html)
def filter_comment(html):
    """Remove HTML comments (``<!-- ... -->``) from ``html``.

    Comments may span several lines; each comment ends at the first
    ``-->`` that follows its ``<!--``.
    """
    # NOTE(review): the pattern in the published listing was an empty
    # string, which made this function a no-op. An HTML comment pattern is
    # what the function name calls for and is restored here.
    return re.sub(r"<!--[\s\S]*?-->", "", html)
def filter_characters(html, tags=("￥"," ","]","：")):
    """Remove every occurrence of the given substrings from ``html``.

    Args:
        html: Text to clean.
        tags: Iterable of substrings to delete. The default is an immutable
            tuple so it cannot be altered between calls.

    Returns:
        ``html`` with all listed substrings removed.
    """
    cleaned = html
    for unwanted in tags:
        cleaned = cleaned.replace(unwanted, "")
    return cleaned
def filter_int(html):
    """Extract the digits of ``html`` and return them as an integer string.

    All non-digit characters are dropped and the remaining digits are read
    as one number, so "1,234" gives "1234" and "007" gives "7".

    Returns:
        The number as a string, or "0" when ``html`` contains no digits.
    """
    digits = re.sub(r"[^\d]+", "", html).strip()
    try:
        return str(int(digits))
    except ValueError:
        # No digits were left to convert.
        return "0"
def filter_price(html):
    """Extract a decimal number from ``html`` and return it as a float string.

    Every character other than digits and "." is dropped and the rest is
    parsed with ``float``, so "￥1,234.50" gives "1234.5" and "12" gives
    "12.0".

    Returns:
        The number as a string, or "0" when what remains is not a valid
        number (nothing left, or more than one ".").
    """
    number = re.sub(r"[^\d\.]+", "", html).strip()
    try:
        return str(float(number))
    except ValueError:
        # Nothing numeric left, or a malformed value such as "1.2.3".
        return "0"
def _(u):
    """Return ``u`` as a text (unicode) string.

    A value that already is a text string is returned unchanged; anything
    else is decoded as UTF-8.
    """
    try:
        text_type = unicode  # Python 2 text type
    except NameError:
        text_type = str  # the builtin ``unicode`` does not exist on Python 3
    if isinstance(u, text_type):
        return u
    return text_type(u, "utf8")
def gen_xml(url):
    """Write the extracted fields in the module-level ``dict`` to an XML file.

    The document has the shape ``<add><doc><field name="...">value</field>
    ...</doc></add>``. It is written, preceded by a UTF-8 byte order mark,
    to ``<g_xmls_folder>/<url2filename(url)>.xml``.

    Side effect: entries of ``dict`` whose value is ``None`` or a blank
    string are deleted from ``dict`` before the document is built.

    Args:
        url: URL of the page the fields were extracted from; only used to
            derive the output file name.
    """
    xml_filename=os.path.join(g_xmls_folder,url2filename(url)+".xml")
    xml=minidom.Document()
    add=xml.createElement("add")
    xml.appendChild(add)
    doc=xml.createElement("doc")
    add.appendChild(doc)

    def c(na,va):
        "create field node"
        field=xml.createElement("field")
        field.setAttribute("name",na)
        field.appendChild(xml.createTextNode(va))
        doc.appendChild(field)

    # Drop empty fields. The keys are copied first because entries are
    # deleted while looping.
    for k in list(dict.keys()):
        value = dict[k]
        if value is None or (isinstance(value, (types.StringType, types.UnicodeType)) and value.strip()==""):
            del dict[k]

    for k,v in dict.items():
        c(k,str(v))  # numeric fields such as stock are written as text too

    import codecs
    f=codecs.open(xml_filename,"w")
    try:
        f.write(codecs.BOM_UTF8)
        f.write(xml.toxml("utf-8"))
    finally:
        # Close the file even when serialising or writing fails.
        f.close()
    print("生成文件%s"%xml_filename)
相关阅读:
程序员需要的各种PDF格式电子书【附网盘免费下载资源地址】
Web安全大揭秘
 tar 压缩解压命令详解
 django开发项目的部署nginx
CentOS7安装mysql-python模块
 我的博客站点上线了
 2006
centos7安装pip
mysql删除匿名用户
 FilenameFilter 文件名过滤
原文地址：https://www.cnblogs.com/lexus/p/1853821.html