I recently spent about a month building a news app. Its functionality is very simple: news items are periodically crawled from the web version of the site into a backend database, and the app displays them.
1. Client
The client uses the DCloud framework. I'm basically a JavaScript beginner who has never written any serious code, and I'm even newer to HTML5, so I simply went with an off-the-shelf front-end framework. I tried AppCan and APICloud, and in the end chose DCloud; its HBuilder editor really is quite good.
Part of the key code is pasted below. It uses mui's pull-to-refresh and fetches the JSON list returned by the backend via ajax:
<!DOCTYPE html>
<html>

<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width,initial-scale=1,minimum-scale=1,maximum-scale=1,user-scalable=no" />
    <title></title>
    <script src="js/mui.min.js"></script>
    <link href="css/mui.min.css" rel="stylesheet" />
    <script type="text/javascript" charset="utf-8">
        var t; // keeps the latest JSON list so the tap handler can read the tapped entry

        mui.init({
            pullRefresh: {
                container: "#pullMine", // pull-to-refresh container; any selector querySelector can resolve works, e.g. an id or .class
                down: {
                    contentdown: "下拉可以刷新",    // optional: caption shown while the control can still be pulled down
                    contentover: "释放立即刷新",    // optional: caption shown once releasing will trigger a refresh
                    contentrefresh: "正在刷新...", // optional: caption shown while refreshing
                    callback: pulldownRefresh     // required: refresh callback, here an ajax call that fetches new data from the server
                }
            }
        });

        mui.plusReady(function() {
            console.log("current page URL: " + plus.webview.currentWebview().getURL());
            mui.ajax('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', {
                dataType: 'json',
                type: 'get',
                timeout: 10000,
                success: function(data) {
                    t = data;
                    var list = document.getElementById("list");
                    var finallist = '';
                    for (var i = data.length - 1; i >= 0; i--) {
                        finallist += '<li data-id="' + i + '" class="mui-table-view-cell"><a class="mui-navigate-right"><div class="mui-media-body">' + data[i].title + '<p class="mui-ellipsis">' + data[i].pubtime + '</p></div></a></li>';
                    }
                    list.innerHTML = finallist;
                    // open the detail page, passing the tapped entry along as extras
                    mui('#list').on('tap', 'li', function() {
                        var id = this.getAttribute('data-id');
                        mui.openWindow({
                            url: 'detail_sd.html',
                            id: 'detail_sd',
                            extras: {
                                title: t[id].title,
                                author: t[id].author,
                                pubtime: t[id].pubtime,
                                content: t[id].content
                            }
                        });
                    });
                },
                error: function() {}
            });
        });

        /**
         * pull-to-refresh business logic
         */
        function pulldownRefresh() {
            setTimeout(function() {
                console.log("refreshing....");
                mui.ajax('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', {
                    dataType: 'json',
                    type: 'get',
                    timeout: 10000,
                    success: function(data) {
                        t = data;
                        var list = document.getElementById("list");
                        var finallist = '';
                        for (var i = data.length - 1; i >= 0; i--) {
                            finallist += '<li data-id="' + i + '" class="mui-table-view-cell"><a class="mui-navigate-right"><div class="mui-media-body">' + data[i].title + '<p class="mui-ellipsis">' + data[i].pubtime + '</p></div></a></li>';
                        }
                        list.innerHTML = finallist;
                    },
                    error: function() {}
                });
                mui('#pullMine').pullRefresh().endPulldownToRefresh(); // refresh completed
            }, 1500);
        }
    </script>
</head>

<body>

    <div id="pullMine" class="mui-content mui-scroll-wrapper">
        <div class="mui-scroll">
            <ul class="mui-table-view" id="list">
            </ul>
        </div>
    </div>

</body>

</html>
2. Backend: PHP publishing endpoint
It uses the ThinkPHP framework; the two controller actions below just return up to 30 news rows, ordered by pubtime, as JSON.
<?php
namespace Home\Controller;
use Think\Controller;

class NewsController extends Controller {
    public function getlist() {
        $newsList = M('news')->order('pubtime asc')->limit(30)->select();
        echo json_encode($newsList);
    }
    public function getlist_sd() {
        $newsList = M('newssd')->order('pubtime asc')->limit(30)->select();
        echo json_encode($newsList);
    }
}
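For a quick sanity check of what the app will actually receive from this endpoint, the URL can be requested directly. A minimal sketch in Python (the requests library is an assumed extra dependency; the URL is the one already used in the client code above):

# check_api.py -- manual check of the JSON list served by getlist_sd
import requests

resp = requests.get('http://202.110.123.123:801/newssystem/index.php/Home/News/getlist_sd', timeout=10)
news = resp.json()  # json_encode() output parses into a list of dicts
for entry in news[:5]:
    print(entry['pubtime'] + ' ' + entry['title'])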
3. Backend crawler
It uses Scrapy to crawl the news content and write it into the database. The MySQL pipeline reads its connection parameters from the project settings; a sketch of that settings.py follows the pipeline code below.
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy import signals
import json
import codecs
from twisted.enterprise import adbapi
from twisted.python import log  # needed for log.err() in the error handler
from datetime import datetime
from hashlib import md5
import MySQLdb
import MySQLdb.cursors


class JsonWithEncodingtutorialPipeline(object):
    """Dumps every item as one JSON line into qdnews.json."""
    def __init__(self):
        self.file = codecs.open('qdnews.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def spider_closed(self, spider):
        # note: this is not wired up to the spider_closed signal here
        self.file.close()


class MySQLStoretutorialPipeline(object):
    """Upserts each item into the tp_news table through a Twisted adbapi connection pool."""
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbargs = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)
        return cls(dbpool)

    # called by Scrapy for every item
    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self._do_upinsert, item, spider)
        d.addErrback(self._handle_error, item, spider)
        d.addBoth(lambda _: item)
        return d

    # update the row if the link is already known, otherwise insert a new one
    def _do_upinsert(self, conn, item, spider):
        linkmd5id = self._get_linkmd5id(item)
        now = datetime.now().replace(microsecond=0).isoformat(' ')
        conn.execute("""
            select 1 from tp_news where linkmd5id = %s
        """, (linkmd5id, ))
        ret = conn.fetchone()
        if ret:
            conn.execute("""
                update tp_news set title = %s, content = %s, author = %s, pubtime = %s, pubtime2 = %s, link = %s, updated = %s where linkmd5id = %s
            """, (item['title'][0][4:-5], item['content'][0], item['pubtime'][0][16:-4], item['pubtime'][0][-14:-4], item['pubtime'][0][-14:-4], item['link'][0], now, linkmd5id))
        else:
            conn.execute("""
                insert into tp_news(linkmd5id, title, content, author, link, updated, pubtime, pubtime2)
                values(%s, %s, %s, %s, %s, %s, %s, %s)
            """, (linkmd5id, item['title'][0][4:-5], item['content'][0], item['pubtime'][0][16:-4], item['link'][0], now, item['pubtime'][0][-14:-4], item['pubtime'][0][-14:-4]))

    # MD5 of the article URL, used to avoid storing the same article twice
    def _get_linkmd5id(self, item):
        return md5(item['link'][0]).hexdigest()

    # error handling
    def _handle_error(self, failure, item, spider):
        log.err(failure)
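The MySQL pipeline above only runs if it is registered in ITEM_PIPELINES, and its credentials come from the Scrapy settings. A minimal settings.py sketch, assuming the project module is named tutorial (as in the spider imports below) and using placeholder values:

# settings.py (sketch) -- register the pipelines and supply the MySQL parameters read in from_settings()
ITEM_PIPELINES = {
    'tutorial.pipelines.JsonWithEncodingtutorialPipeline': 300,
    'tutorial.pipelines.MySQLStoretutorialPipeline': 800,
}

MYSQL_HOST = '127.0.0.1'      # placeholder
MYSQL_DBNAME = 'newssystem'   # placeholder
MYSQL_USER = 'root'           # placeholder
MYSQL_PASSWD = 'secret'       # placeholder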
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pubtime = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    content = scrapy.Field()
    id = scrapy.Field()
    # the two fields below are only used by the earlier, experimental spiders in spiders.py
    date = scrapy.Field()
    detail = scrapy.Field()
spiders.py
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import DmozItem
from scrapy.http import Request
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup

from scrapy.spiders import CrawlSpider
from scrapy.loader import ItemLoader
from scrapy.linkextractors.sgml import SgmlLinkExtractor

import scrapy


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


class DmozSpider2(BaseSpider):
    name = "dmoz2"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        "http://10.60.32.179/Site/Site1/myindex.shtml",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['date'] = site.select('span/text()').extract()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()
            items.append(item)
        return items


class MySpider(BaseSpider):
    name = "myspider"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    def parse(self, response):
        # collect `item_urls`
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        base_url = get_base_url(response)
        items = []
        for item_url in item_urls:
            yield Request(url=response.url, callback=self.parse_item, meta={'items': items})

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')

        item = DmozItem()
        items = response.meta['items']
        item['date'] = item_urls.select('span/text()').extract()
        item['title'] = item_urls.select('a/text()').extract()
        item['link'] = item_urls.select('a/@href').extract()
        item['desc'] = item_urls.select('text()').extract()

        # populate `item` fields, then follow the detail link
        relative_url = item_urls.select('a/@href').extract()
        print(relative_url[0])
        base_url = get_base_url(response)
        item_details_url = urljoin_rfc(base_url, relative_url[0])
        yield Request(url=item_details_url, callback=self.parse_details, dont_filter=True, meta={'item': item, 'items': items})

    def parse_details(self, response):
        # populate more `item` fields
        print("***********************In parse_details()***************")
        hxs = HtmlXPathSelector(response)
        print(response.url)
        item_detail = hxs.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()
        print("________________", item_detail)
        item = response.meta['item']
        item['detail'] = item_detail
        items = response.meta['items']
        items.append(item)
        return items


class DmozSpider3(BaseSpider):
    name = "dmoz3"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        items = []
        for site in sites:
            item = DmozItem()
            item['date'] = site.select('span/text()').extract()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            item['desc'] = site.select('text()').extract()

            print(item['link'][0])
            base_url = get_base_url(response)
            relative_url = item['link'][0]
            item_details_url = urljoin_rfc(base_url, relative_url)
            print("*********************", item_details_url)
            # response2 = BeautifulSoup(urlopen(item_details_url).read())
            # note: building a Response from just a URL does not actually fetch the page; this was an abandoned attempt
            response2 = scrapy.http.Response(item_details_url)
            hxs2 = HtmlXPathSelector(response2)
            item['detail'] = hxs2.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()

            items.append(item)
        return items


class MySpider5(BaseSpider):
    name = "myspider5"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]

    items = []
    item = DmozItem()

    def parse(self, response):
        # collect `item_urls`
        hxs = HtmlXPathSelector(response)
        item_urls = hxs.select('//*[@id="_ctl0_LblContent"]/div/div//ul/li')
        base_url = get_base_url(response)

        for item_url in item_urls:
            MySpider5.item['date'] = item_url.select('span/text()').extract()
            MySpider5.item['title'] = item_url.select('a/text()').extract()
            MySpider5.item['link'] = item_url.select('a/@href').extract()
            MySpider5.item['desc'] = item_url.select('text()').extract()

            relative_url = MySpider5.item['link']
            print(relative_url[0])
            base_url = get_base_url(response)
            item_details_url = urljoin_rfc(base_url, relative_url[0])
            print 'detail url =', str(item_details_url)

            yield Request(url=item_details_url, callback=self.parse_details)

    def parse_details(self, response):
        # populate more `item` fields
        print("***********************In parse_details()***************")
        hxs = HtmlXPathSelector(response)
        print(response.url)
        item_detail = hxs.select('/html/body/center/div/div[4]/div[1]/p[1]').extract()
        print("________________", item_detail)
        MySpider5.item['detail'] = item_detail
        MySpider5.items.append(MySpider5.item)
        return MySpider5.item

    def parse_details2(self, response):
        # populate more `item` fields via an ItemLoader
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('title', MySpider5.item['title'])
        abc = {'detail': '/html/body/center/div/div[4]/div[1]/p[1]'}
        bbsItem_loader.add_xpath('detail', abc['detail'])
        return bbsItem_loader.load_item()


class MySpider6(CrawlSpider):
    name = "myspider6"
    allowed_domains = ["10.60.32.179"]
    start_urls = [
        'http://10.60.32.179/Site/Site1/myindex.shtml',
    ]
    link_extractor = {
        # 'page': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+\.html$'),
        # 'page_down': SgmlLinkExtractor(allow=r'/bbsdoc,board,\w+,page,\d+\.html$'),
        'page': SgmlLinkExtractor(allow=r'/Article/\w+/\w+\.shtml$'),
    }

    _x_query = {
        'date': 'span/text()',
        'date2': '/html/body/center/div/div[4]/p',
        'title': 'a/text()',
        'title2': '/html/body/center/div/div[4]/h2'
    }
    _y_query = {
        'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
    }

    def parse(self, response):
        self.t = 0
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)
            self.t = self.t + 1

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_value('link', url)
        bbsItem_loader.add_xpath('title', self._x_query['title2'])
        bbsItem_loader.add_xpath('pubtime', self._x_query['date2'])
        bbsItem_loader.add_xpath('content', self._y_query['detail'])
        bbsItem_loader.add_value('id', self.t)  # why is this not working as expected?
        return bbsItem_loader.load_item()


class MySpider6SD(CrawlSpider):
    name = "myspider6sd"
    allowed_domains = ["10.60.7.45"]
    start_urls = [
        'http://10.60.7.45/SITE_sdyc_WEB/Site1219/index.shtml',
    ]
    link_extractor = {
        'page': SgmlLinkExtractor(allow=r'/Article/\w+/\w+\.shtml$'),
        # e.g. http://10.60.32.179/Site/Col411/Article/201510/35770_2015_10_29_8058797.shtml
        # e.g. http://10.60.7.45/SITE_sdyc_WEB/Col1527/Article/201510/sdnw_2110280_2015_10_29_91353216.shtml
    }

    _x_query = {
        'date': 'span/text()',
        'date2': '/html/body/center/div/div[4]/p',
        'title': 'a/text()',
        # 'title2': '/html/body/center/div/div[4]/h2'
        'title2': '/html/body/div[4]/div[1]/div[2]/div[1]/h1[2]/font'
        # 'author': '/html/body/div[4]/div[1]/div[2]/div[1]/div/span[1]'
        # 'pubtime2': '/html/body/div[4]/div[1]/div[2]/div[1]/div/span[2]'
    }
    _y_query = {
        # 'detail': '/html/body/center/div/div[4]/div[1]/p[1]',
        'detail': '//*[@id="Zoom"]'
    }

    def parse(self, response):
        self.t = 0
        for link in self.link_extractor['page'].extract_links(response):
            yield Request(url=link.url, callback=self.parse_content)
            self.t = self.t + 1

    def parse_content(self, response):
        bbsItem_loader = ItemLoader(item=DmozItem(), response=response)
        url = str(response.url)
        bbsItem_loader.add_value('desc', url)
        bbsItem_loader.add_value('link', url)
        bbsItem_loader.add_xpath('title', self._x_query['title2'])
        bbsItem_loader.add_xpath('pubtime', self._x_query['date2'])
        bbsItem_loader.add_xpath('content', self._y_query['detail'])
        bbsItem_loader.add_value('id', self.t)  # why is this not working as expected?
        return bbsItem_loader.load_item()
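Since the whole point is to crawl the news periodically, the spider has to be run on a schedule. One way is a small driver script invoked from cron (or simply `scrapy crawl myspider6sd` in a crontab entry). A minimal sketch, assuming it is run from inside the Scrapy project so that get_project_settings() picks up the settings sketched earlier; the interval and path are placeholders:

# run_crawl.py -- run one crawl; schedule this (or `scrapy crawl myspider6sd`) from cron
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads ITEM_PIPELINES / MYSQL_* from settings.py
process.crawl('myspider6sd')                      # spider name as defined above
process.start()                                   # blocks until the crawl finishes

# example crontab entry (every 30 minutes; path is a placeholder):
# */30 * * * * cd /path/to/project && scrapy crawl myspider6sd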