• 微博Tag的抓取



    title: 微博Tag的抓取
    date: 2017-09-12 04:49:18
    tags: [爬虫]

    问题

    抓取微博用户资料页（/info 页面）中的标签内容

    方案

    使用Fiddler抓包获取登录后的Cookie，再用HttpClient带上该Cookie请求用户资料页，最后用正则表达式提取标签

    package crwaler;
    
    import java.io.BufferedReader;
    import java.io.BufferedWriter;
    import java.io.File;
    import java.io.FileNotFoundException;
    import java.io.FileReader;
    import java.io.FileWriter;
    import java.nio.charset.Charset;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    
    import org.apache.http.HttpHost;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.conn.params.ConnRouteParams;
    import org.apache.http.impl.client.HttpClients;
    
    import util.PostUtils;
    
    public class TagUtil {
    	public static void main(String[] args) throws Exception {
    //		Thread.sleep(1000*60*30);
    		BufferedReader br = new BufferedReader(new FileReader(new File("detial.csv")));
    		BufferedWriter bw = new BufferedWriter(new FileWriter(new File("tag.csv")));
    		String temp;
    		while((temp=br.readLine())!=null){
    			String[] strs = temp.split(",");
    			String id = strs[0];
    			String tag = getTags(id);
    			System.out.println(id);
    			System.out.println(tag);
    			bw.write(temp + "," + tag + "
    ");
    			Thread.sleep(4*1000);
    		}
    		br.close();
    		bw.close();
    	}
    	private static String getTags(String id){
    		String url = "http://weibo.com/p/100306" + id + "/info?mod=pedit_more";
    		HttpClient httpClient = HttpClients.createDefault();
    		Map<String, String> map = new HashMap<>();
    		String ret = PostUtils.sendPost(httpClient, url, map, Charset.forName("utf-8"));
    		try{
    			ret = ret.split("标签")[2];
    		}catch(Exception e){
    			return "";
    		}
    		Pattern pattern = Pattern.compile("\\t([0-9a-zA-Z\u4e00-\u9fa5]+?)\\t");
    		Matcher matcher = pattern.matcher(ret);
    		StringBuilder sb = new StringBuilder();
    		while(matcher.find()){
    			sb.append(matcher.group(1)+" ");
    		}
    		if(sb.length()>0)
    			return sb.substring(0, sb.length()-1);
    		return "";
    	}
    }
    
    package util;
    
    import java.io.IOException;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.http.HttpHost;
    import org.apache.http.NameValuePair;
    import org.apache.http.client.HttpClient;
    import org.apache.http.client.config.RequestConfig;
    import org.apache.http.client.entity.UrlEncodedFormEntity;
    import org.apache.http.client.methods.CloseableHttpResponse;
    import org.apache.http.client.methods.HttpGet;
    import org.apache.http.client.methods.HttpPost;
    import org.apache.http.message.BasicNameValuePair;
    import org.apache.http.util.EntityUtils;
    
    
    /**
     * Small helpers that send an HTTP request and return the response body as a string.
     *
     * <p>Both methods cast the response to {@link CloseableHttpResponse}, so the supplied client is
     * expected to return closeable responses. Failures are reported on standard error and result in
     * an empty string rather than an exception.
     */
    public class PostUtils {
    	/**
    	 * Sends a POST request with browser-like headers through the proxy at 127.0.0.1:8888 and
    	 * returns the response body. The HTTP status code is printed to standard output.
    	 *
    	 * @param httpClient
    	 *            client used to execute the request
    	 * @param url
    	 *            request URL; also sent as the {@code Referer} header
    	 * @param params
    	 *            form parameters; when {@code null} or empty the request carries no body
    	 * @param encoding
    	 *            charset used to encode the form and as the default for decoding the response
    	 * @return the response body, or an empty string if the request fails or has no body
    	 */
    	public static String sendPost(HttpClient httpClient, String url, Map<String, String> params, Charset encoding) {
    		String resp = "";
    		HttpPost httpPost = new HttpPost(url);
    		// NOTE(review): presumably a local debugging proxy (e.g. Fiddler's default port); requests fail if none is listening.
    		HttpHost proxy = new HttpHost("127.0.0.1",8888);
    		RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
    		httpPost.setConfig(config);

    		httpPost.addHeader("Host", "weibo.com");
    		httpPost.addHeader("Upgrade-Insecure-Requests","1");
    		httpPost.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36");
    		httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    		httpPost.addHeader("DNT","1");
    		httpPost.addHeader("Referer",url);
    		httpPost.addHeader("Accept-Language","zh-CN,zh;q=0.8");
    		httpPost.addHeader("Accept-Encoding", "gzip, deflate");
    		// Placeholder value: replace it with the Cookie header of a logged-in session before running.
    		httpPost.addHeader("Cookie","你的Cookie");

    		if (params != null && params.size() > 0) {
    			List<NameValuePair> formParams = new ArrayList<NameValuePair>();
    			for (Map.Entry<String, String> entry : params.entrySet()) {
    				formParams.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
    			}
    			httpPost.setEntity(new UrlEncodedFormEntity(formParams, encoding));
    		}

    		CloseableHttpResponse response = null;
    		try {
    			response = (CloseableHttpResponse) httpClient.execute(httpPost);
    			System.out.println(response.getStatusLine().getStatusCode());
    			// A response without an entity has no body to decode.
    			if (response.getEntity() != null) {
    				String body = EntityUtils.toString(response.getEntity(), encoding);
    				if (body != null) {
    					resp = body;
    				}
    			}
    		} catch (Exception e) {
    			e.printStackTrace();
    		} finally {
    			closeQuietly(response);
    		}
    		return resp;
    	}

    	/**
    	 * Sends a GET request with a browser-like {@code User-Agent} and returns the response body.
    	 *
    	 * @param httpClient
    	 *            client used to execute the request
    	 * @param url
    	 *            request URL
    	 * @param encoding
    	 *            charset used to decode the response when the response does not declare one
    	 * @return the response body, or an empty string if the request fails or has no body
    	 */
    	public static String sendGet(HttpClient httpClient, String url,Charset encoding) {
    		String resp = "";
    		HttpGet httpGet = new HttpGet(url);
    		httpGet.setHeader("User-Agent",
    				"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36");
    		CloseableHttpResponse response = null;
    		try {
    			response = (CloseableHttpResponse) httpClient.execute(httpGet);
    			if (response.getEntity() != null) {
    				// Honour the caller's charset instead of falling back to the library default.
    				String body = EntityUtils.toString(response.getEntity(), encoding);
    				if (body != null) {
    					resp = body;
    				}
    			}
    		} catch (Exception e) {
    			// Report the failure the same way sendPost does instead of hiding it.
    			e.printStackTrace();
    		} finally {
    			closeQuietly(response);
    		}
    		return resp;
    	}

    	/** Closes the response if there is one; a failure to close is ignored because the body has already been read. */
    	private static void closeQuietly(CloseableHttpResponse response) {
    		if (response == null) {
    			return;
    		}
    		try {
    			response.close();
    		} catch (IOException ignored) {
    			// Nothing useful can be done at this point.
    		}
    	}
    }
    
    
  • 相关阅读:
    SQLite基础-7.子句(一)
    SQLite基础-8.子句(二)
    SQLite基础-6.运算符
    SQLite基础-5.数据操作语言
    SQLite基础-4.数据定义语言(DDL)
    SQLite基础-3.语法与数据类型
    IDEA操作之FileHeager设置
    IDEA操作之test case coverage的方法
    IDEA插件之JavaDoc
    IDEA插件之JProfiler
  • 原文地址:https://www.cnblogs.com/yanximin/p/10982229.html
Copyright © 2020-2023  润新知