#!/usr/bin/env python
#encoding=utf-8
import re
def find_tag_idx(html, tag, count=1, use_re=False):
    """Locate the `count`-th occurrence of `tag` in `html`, left to right.

    Returns a (start, end) index pair for the match, or (-1, 0) when the
    requested occurrence does not exist.  With use_re=True, `tag` is treated
    as a regular expression instead of a literal substring.

    >>> find_tag_idx("abcabc", "b")
    (1, 2)
    >>> find_tag_idx("abcabc", "e")
    (-1, 0)
    """
    r = (-1, 0)
    if not use_re:
        idx = -1
        pos = 0
        found = 0
        while found < count:
            idx = html.find(tag, pos)
            if idx == -1:
                break
            found += 1
            pos = idx + len(tag)
        # Only report a hit when the occurrence really exists.  The original
        # computed (idx, idx+len(tag)) unconditionally, so a miss returned
        # (-1, len(tag)-1) instead of the documented (-1, 0).
        if idx > -1:
            r = (idx, idx + len(tag))
    else:
        for nth, match in enumerate(re.finditer(tag, html), 1):
            if nth == count:
                r = match.span()
                break
    return r
def rfind_tag_idx(html, tag, count=1, use_re=False):
    """Locate the `count`-th occurrence of `tag`, counting from the END of `html`.

    Returns a (start, end) index pair for the match, or (-1, 0) when the
    requested occurrence does not exist.  With use_re=True, `tag` is treated
    as a regular expression.

    >>> rfind_tag_idx("abcabc", "a", count=1)
    (3, 4)
    """
    r = (-1, 0)
    if not use_re:
        idx = -1
        end = len(html)
        found = 0
        while found < count:
            # rfind's `end` bound is exclusive, so setting end=idx below
            # skips the hit we just found on the next iteration.
            idx = html.rfind(tag, 0, end)
            if idx == -1:
                break
            found += 1
            end = idx
        # Only report a hit when the occurrence really exists.  The original
        # computed (idx, idx+len(tag)) unconditionally, so a miss returned
        # (-1, len(tag)-1) instead of the documented (-1, 0).
        if idx > -1:
            r = (idx, idx + len(tag))
    else:
        matches = list(re.finditer(tag, html))[::-1]
        for nth, match in enumerate(matches, 1):
            if nth == count:
                r = match.span()
                break
    return r
# Scratch data for manually exercising rfind_tag_idx (left over from development).
html="abcabc"
#a,b=rfind_tag_idx(html,r"""" id="aa""",count=1,use_re=True)
#print a,b
#print html[a:b]
def get_part2(html,start,end,start_count=1,end_count=1,start_re=False,end_re=False,reverseDirection=False):
    """Return the text strictly between the `start` and `end` markers
    (neither marker is included in the result).

    Forward mode locates the start_count-th `start`, then the end_count-th
    `end` after it.  With reverseDirection=True the scan runs right-to-left:
    the end_count-th `end` from the tail first, then `start` to its left.
    Returns "" whenever either marker is missing.

    >>> get_part2("abcabc", "a", "c", start_count=2)
    'b'
    """
    if reverseDirection:
        e_begin, e_end = rfind_tag_idx(html, end, end_count, use_re=end_re)
        if e_begin == -1:
            return ""
        head = html[:e_begin]
        s_begin, s_end = rfind_tag_idx(head, start, start_count, use_re=start_re)
        if s_begin == -1:
            return ""
        return head[s_end:]
    s_begin, s_end = find_tag_idx(html, start, start_count, use_re=start_re)
    if s_begin == -1:
        return ""
    tail = html[s_end:]
    e_begin, e_end = find_tag_idx(tail, end, end_count, use_re=end_re)
    if e_begin == -1:
        return ""
    return tail[:e_begin]
# Sample detail-page fragment used by the smoke tests below; the
# <a ... id="aa"> anchor is the extraction target.
html="""
<table width="770" border="0" align="center" cellpadding="0" cellspacing="0">
<tr>
<td width="400" height="400" valign="top">
<table width="400" border="0" cellspacing="0" cellpadding="0">
<tr>
<td width="400" height="320">
<div align="center">
<a href="/PicB/201010161245413204.jpg" id="aa" class="jqzoom" title="zoom">
<img style="CURSOR: hand" src="/PicB/201010161245413204.jpg" name="rImage" width="350" height="350" border="0" align="center"> </a> </div>
"""
#html="abcabc"
# Smoke test: extract the quoted image URL preceding the `id="aa"` anchor by
# scanning right-to-left; compares a regular string marker vs. a raw string.
s="""" id="aa" class="jqzoom\""""#----------> this form (regular string) has the better compatibility
print s
s2=r"""" id="aa" class="jqzoom\""""
print s2
print get_part2(html,'"',s,end_re=False,reverseDirection=True)
print get_part2(html,'"',s2,end_re=False,reverseDirection=True)
print get_part2(html,'"',s,end_re=True,reverseDirection=True)
print get_part2(html,'"',s2,end_re=True,reverseDirection=True)
#print get_part2(html,'"',r"""id="aa""",end_re=True,reverseDirection=True)
# Below is the old version (legacy code kept for reference)
#!/usr/bin/env python
#encoding=utf-8
"""
Change log:
2010.10.17,v0.31
2010.10.16,v0.30
2010.10.15,v0.29
2010.10.14,v0.27
2010.10.13,v0.26
2010.10.12,v0.25
2010.10.10,v0.23
2010.10.10,v0.22
2010.10.09,v0.2
2010.10.07,v0.1
Batch extraction of detail-page data.
"""
import sys
#print sys.getdefaultencoding()
####reload(sys)#生成代码时,####的会自动去除
#sys.setdefaultencoding(sys.stdout.encoding)
####sys.setdefaultencoding('utf-8')
import re
import time
import urllib2
import os.path
from urlparse import urljoin
from pyquery import PyQuery as pq
from xml.dom import minidom,Node
import types
# ${host} is a template placeholder -- presumably substituted by a code
# generator before this script runs; confirm against the generator.
g_host = "${host}"
# Working folders: fetched detail pages, generated XML, and bookkeeping files.
g_details_folder = os.path.join("./","details")
g_xmls_folder = os.path.join("./","xmls")
g_xmls_infos_folder = os.path.join("./","xmls_infos")
g_success_file = os.path.join("./","xmls_infos/success.txt")
g_error_file = os.path.join("./","xmls_infos/error.txt")
g_extract_links_file = os.path.join("./","details_infos/success.txt")
# Parse the "Header-Name: value" lines substituted into the ${headers}
# template placeholder into the g_headers dict used for HTTP requests.
g_headers={}
headers = """${headers}"""
headers = headers.strip().replace("\r\n","\n")
if headers != "":  # `<>` was deprecated Python 2 syntax; `!=` is equivalent
    for elem in headers.split("\n"):
        if elem.strip()=="":
            continue
        a,b=elem.split(":",1)
        a=a.strip()
        b=b.strip()
        g_headers[a]=b
# Stores the extracted field values (note: shadows the builtin `dict`;
# the name is kept because the generated code references it).
dict={}
# Utility functions
def init():
    """Create the working folders for XML output and cached pages if missing."""
    print("初始数据")
    for folder in (g_xmls_folder, g_xmls_infos_folder, g_details_folder):
        if not os.path.exists(folder):
            os.makedirs(folder)
def delete(src):
    """Best-effort recursive delete of a file or directory tree.

    Failures (permissions, races) are deliberately ignored, but only at the
    OS level: the original bare `except:` would also have swallowed
    KeyboardInterrupt and SystemExit.
    """
    if os.path.isfile(src):
        try:
            os.remove(src)
        except OSError:
            pass
    elif os.path.isdir(src):
        for item in os.listdir(src):
            delete(os.path.join(src, item))
        try:
            os.rmdir(src)  # fails harmlessly if a child could not be removed
        except OSError:
            pass
def clear():
    """Remove the XML output and bookkeeping folders from previous runs."""
    print("清除以前数据")
    for folder in (g_xmls_folder, g_xmls_infos_folder):
        delete(folder)
def size(src):
    """Print the size of a file or directory tree in human-readable units.

    Directory sizes are the sum of all contained file sizes.  The unit is
    picked from the decimal digit count of the byte total.
    """
    r = 0  # `0L` was Python 2 long syntax; plain 0 promotes automatically
    if os.path.isfile(src):
        r = os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
            r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l = len(str(r))
    # Use float division: the original integer division made "%.2f" always
    # print ".00", losing the fractional part it clearly intended to show.
    if l > 9:
        r = r / 1024.0 / 1024.0 / 1024.0
        r = "%.2f GiB" % r
    elif l > 6:
        r = r / 1024.0 / 1024.0
        r = "%.2f MiB" % r
    elif l > 3:
        r = r / 1024.0
        r = "%.2f KiB" % r
    print("%s 大小为:%s" % (src, r))
def error(url, ex):
    """Append `url` to the error bookkeeping file (one URL per line).

    `ex` (the triggering exception) is accepted but not currently recorded,
    keeping the file format one-URL-per-line for downstream consumers.
    """
    # `with` guarantees the handle is closed even if write() raises.
    with open(g_error_file, "a") as f:
        f.write("%s\n" % (url,))
def success(url):
    """Append `url` to the success bookkeeping file (one URL per line)."""
    # `with` guarantees the handle is closed even if write() raises.
    with open(g_success_file, "a") as f:
        f.write("%s\n" % url)
def statistics(func):
    """Decorator: print extracted/successful line counts before and after `func`.

    Counts are the number of UNIQUE lines in the bookkeeping files (a file
    that does not exist counts as 0).
    """
    import functools  # local import keeps the file's top-level imports untouched
    def tongji():
        total, successed = 0, 0
        if os.path.exists(g_extract_links_file):
            with open(g_extract_links_file, "r") as f:
                total = len(set(f.readlines()))
        print("total lines:%s" % total)
        if os.path.exists(g_success_file):
            with open(g_success_file, "r") as f:
                successed = len(set(f.readlines()))
        print("successed lines:%s" % successed)
        print("left lines:%s" % (total - successed))
    @functools.wraps(func)  # preserve func's name/docstring for other decorators
    def newFunc(*args, **args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc
def cost_time(func):
    """Decorator: print start/end wall-clock timestamps and elapsed seconds."""
    import functools  # local import keeps the file's top-level imports untouched
    @functools.wraps(func)  # original lost func.__name__/__doc__ on the wrapper
    def newFunc(*args, **args2):
        t0 = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
        back = func(*args, **args2)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
        return back
    return newFunc
def url2filename(url):
    """Return a filesystem-safe name for `url` via URL-safe base64.

    Generalized to accept both byte and text strings; the result is always
    the platform's native `str`.  On Python 2 (`bytes is str`) the byte-string
    path is taken and behavior is byte-identical to the original.
    """
    import base64
    data = url if isinstance(url, bytes) else url.encode("utf-8")
    encoded = base64.urlsafe_b64encode(data)
    return encoded if isinstance(encoded, str) else encoded.decode("ascii")
def url2filename2(url):
    """Return the last path segment of `url` as a filename.

    Falls back to the last run of digits in the URL when there is no usable
    segment (no "/" or a trailing "/").  Raises ValueError when neither
    strategy yields a name -- the original crashed with a cryptic IndexError
    here (a commented-out `raise ValueError` shows this was the intent).
    """
    url = url.strip()
    idx = url.rfind("/")
    r = url[idx + 1:]
    if idx == -1 or len(r) == 0:
        print("启用特殊url2filename")
        digit_runs = re.findall(r"\d+", url)
        if not digit_runs:
            raise ValueError("url2filename2: cannot derive a filename from %r" % url)
        r = digit_runs[-1]
    return r
def get_html(url):
    """Return the HTML for `url`, caching fetched pages under g_details_folder.

    The page is downloaded (with g_headers) only when no cached copy exists;
    otherwise the cached file is read back.  Side effect: init() creates the
    working folders.
    """
    init()
    html = ""
    cache_file = url2filename(url)  # renamed: `file` shadowed the builtin
    cache_file = os.path.join(g_details_folder, cache_file)
    print(cache_file)
    if not os.path.exists(cache_file):
        print(url)
        req = urllib2.Request(url=url, headers=g_headers)
        html = urllib2.urlopen(req).read()
        #html=urllib2.urlopen(url).read()
        # `with` guarantees the handle is closed even if write() fails
        with open(cache_file, "w") as f:
            f.write(html)
        print("从网络抓取")
    else:
        print("直接利用本地")
        with open(cache_file, "r") as f:
            html = f.read()
    return html
def get_part(html, start, end):
    """Return the substring strictly between the first `start` marker and the
    next `end` marker after it; "" when either marker is missing."""
    begin = html.find(start)
    stop = html.find(end, begin + len(start))
    if begin == -1 or stop == -1:
        return ""
    return html[begin + len(start):stop]
def find_tag_idx(html, tag, count=1, use_re=False):
    """Locate the `count`-th occurrence of `tag` in `html`, left to right.

    Returns a (start, end) index pair for the match, or (-1, 0) when the
    requested occurrence does not exist.  With use_re=True, `tag` is treated
    as a regular expression instead of a literal substring.

    >>> find_tag_idx("abcabc", "b")
    (1, 2)
    >>> find_tag_idx("abcabc", "e")
    (-1, 0)
    """
    r = (-1, 0)
    if not use_re:
        idx = -1
        pos = 0
        found = 0
        while found < count:
            idx = html.find(tag, pos)
            if idx == -1:
                break
            found += 1
            pos = idx + len(tag)
        # Only report a hit when the occurrence really exists.  The original
        # computed (idx, idx+len(tag)) unconditionally, so a miss returned
        # (-1, len(tag)-1) instead of the documented (-1, 0).
        if idx > -1:
            r = (idx, idx + len(tag))
    else:
        for nth, match in enumerate(re.finditer(tag, html), 1):
            if nth == count:
                r = match.span()
                break
    return r
def get_part2(html,start,end,start_count=1,end_count=1,start_re=False,end_re=False):
    """Return the text strictly between the start_count-th `start` marker and
    the end_count-th `end` marker after it (markers excluded); "" when either
    marker is missing.

    >>> get_part2("abcabc", "a", "c", start_count=2)
    'b'
    """
    s_begin, s_end = find_tag_idx(html, start, start_count, use_re=start_re)
    if s_begin == -1:
        return ""
    remainder = html[s_end:]
    e_begin, e_end = find_tag_idx(remainder, end, end_count, use_re=end_re)
    return remainder[:e_begin] if e_begin != -1 else ""
def filter_tags(html,tags=("em","dd","input","h1","h2","h3","br","a","b","span","strong","p","hr","font","div","td","tr","img","form","table")):
    """Strip the opening and closing forms of the listed HTML tags from `html`.

    Matching is case-insensitive and tolerates attributes and whitespace.
    The default was a mutable list with duplicate entries ("strong", "p",
    "hr" appeared twice); it is now an immutable de-duplicated tuple --
    output is unchanged since re-running a substitution is a no-op.
    """
    result = html
    for elem in tags:
        # opening (and self-closing) tags, e.g. <div class="x">, <br/>
        result = re.sub(r"(?i)<%s[\s\S]*?>" % elem, "", result)
        # closing tags, allowing stray spaces after the slash: </ div>
        result = re.sub(r"(?i)</ *%s[\s\S]*?>" % elem, "", result)
    return result
def filter_comment(html):
    """Remove every HTML comment (<!-- ... -->, including multi-line) from `html`."""
    return re.sub(r"<!--[\s\S]*?-->", '', html)
def filter_characters(html,tags=["¥"," ","]",":"]):
    """Delete every character/string listed in `tags` from `html`.

    The default list covers full-width punctuation commonly pasted in from
    scraped pages.
    """
    for unwanted in tags:
        html = html.replace(unwanted, "")
    return html
def filter_int(html):
    """Extract all digits from `html` and return them as a normalized integer
    string; "0" when `html` contains no digits.
    """
    digits = re.sub(r"(?m)[^\d]+", '', html).strip()
    try:
        return str(int(digits))
    except ValueError:  # no digits at all: int("") raises; bare except hid real bugs
        return "0"
def filter_price(html):
    """Extract a price from `html` by keeping only digits and dots, returning
    it as a normalized float string; "0" when no valid number remains
    (e.g. no digits, or multiple dots).
    """
    number = re.sub(r"(?m)[^\d\.]*", '', html).strip()
    try:
        return str(float(number))
    except ValueError:  # float("") or malformed like "1.2.3"; bare except hid real bugs
        return "0"
def _(u):
    """Coerce a byte string to unicode (decoded as UTF-8); unicode passes through."""
    return u if isinstance(u, unicode) else unicode(u, "utf8")
def gen_xml(url):
    """Write the extracted fields in the module-level `dict` as a Solr-style
    <add><doc><field name=...> XML file named after `url` in g_xmls_folder.

    Side effects: removes None/blank entries from the global `dict`; writes a
    UTF-8 file with a BOM.  Python 2 only (`dict.iteritems`, `types.StringType`).
    """
    xml_filename=os.path.join(g_xmls_folder,url2filename(url)+".xml")
    xml=minidom.Document()
    add=xml.createElement("add")
    xml.appendChild(add)
    doc=xml.createElement("doc")
    add.appendChild(doc)
    def c(na,va):
        "Create a <field name=na>va</field> node under <doc>."
        field=xml.createElement("field")
        field.setAttribute("name",na)
        field.appendChild(xml.createTextNode(va))
        doc.appendChild(field)
    # Drop None/blank fields first.  On Python 2 .keys() returns a list, so
    # deleting from `dict` while looping over it is safe here.
    for k in dict.keys():
        if dict[k] is None or ((type(dict[k])==types.StringType or type(dict[k])==types.UnicodeType) and dict[k].strip()==""):
            del dict[k]
    for k,v in dict.iteritems():
        c(k,str(v))# numeric fields such as stock are also extracted/emitted as strings
    import codecs
    f=codecs.open(xml_filename,"w")
    f.write(codecs.BOM_UTF8)
    f.write(xml.toxml("utf-8"))
    f.close()
    print "生成文件%s"%xml_filename