• 网页抓取 (2)


      1 #include <stdlib.h>
      2 #include <stdio.h>
      3 #include <string.h>
      4 #include "lyGetHttpResult.h"
      5 #include "lyPublic/lyCodeConvert.c"
      /*
       * Download `url` and store the converted page text in `page`.
       *
       * The body returned by GetDataFromWeb() is run through CodeConvert() with
       * mode 1 (presumably UTF-8 -> GBK, going by the author's notes -- confirm
       * against lyCodeConvert.c) and then released with free().  `page` is
       * cleared first and is always NUL-terminated afterwards.
       *
       * Returns 0 on success, -1 when the download returned NULL.
       */
      static int fetch_page(char *url, char *page, size_t page_size)
      {
          char *raw = GetDataFromWeb(url, NULL, NULL, 1, 5);

          if (raw == NULL) {
              return -1;
          }
          memset(page, 0, page_size);
          CodeConvert(raw, page, page_size, 1);
          page[page_size - 1] = '\0';
          free(raw);
          return 0;
      }

      /*
       * Write `page` to Text.txt for inspection.
       *
       * An existing file is opened with "r+" (overwritten from the start, not
       * truncated); if that fails the file is created with "w+".  Failure to
       * open the file is ignored because the dump is only a debugging aid.
       */
      static void dump_page(const char *page)
      {
          FILE *fp = fopen("Text.txt", "r+");

          if (fp == NULL) {
              fp = fopen("Text.txt", "w+");
          }
          if (fp == NULL) {
              return;
          }
          fputs(page, fp);
          fclose(fp);
      }

      /*
       * Locate the link that leads to the entry for `key` inside `page`.
       *
       * `key` is searched for in `page`.  When the match is directly preceded
       * by '/', `key` is kept as it is.  Otherwise the scan walks back to the
       * nearest '/' and `key` is replaced by the path segment that ends at that
       * '/'.  In both cases `url` receives the text between the preceding '"'
       * and the start of the key/segment, i.e. the link prefix.
       *
       * Every backward scan is bounded by the start of `page`.
       *
       * Returns 0 on success, -1 when the word, the slashes or the quote cannot
       * be found or a result does not fit its buffer.
       */
      static int locate_entry_link(const char *page, char *key, size_t key_size,
                                   char *url, size_t url_size)
      {
          const char *hit = strstr(page, key);
          const char *slash;
          const char *seg;
          const char *quote;
          size_t n;

          if (hit == NULL || hit == page) {
              return -1;
          }

          if (hit[-1] == '/') {
              slash = hit - 1;
              seg = hit;
          } else {
              /* Walk back to the '/' that closes the path segment. */
              slash = hit - 1;
              while (slash > page && *slash != '/') {
                  slash--;
              }
              if (*slash != '/' || slash == page) {
                  return -1;
              }

              /* Walk back again to the '/' that opens the segment. */
              seg = slash - 1;
              while (seg > page && *seg != '/') {
                  seg--;
              }
              if (*seg != '/') {
                  return -1;
              }
              seg++;

              n = (size_t)(slash - seg);
              if (n >= key_size) {
                  return -1;
              }
              memcpy(key, seg, n);
              key[n] = '\0';
          }

          /* The link prefix starts right after the opening double quote. */
          quote = slash;
          while (quote > page && *quote != '"') {
              quote--;
          }
          if (*quote != '"' || quote >= seg) {
              return -1;
          }

          n = (size_t)(seg - quote - 1);
          if (n >= url_size) {
              return -1;
          }
          memcpy(url, quote + 1, n);
          url[n] = '\0';
          return 0;
      }

      /*
       * Copy the text of a dictionary entry, beginning at `start`, into `out`.
       *
       * Copying stops at the first '4' byte (the end sentinel used by the
       * original scan), at the end of the string, or when `out` is full.  The
       * bytes '<', '/', '>', '-' and the lowercase ASCII letters are dropped,
       * which strips tag text such as "</span>".  `out` is always
       * NUL-terminated.
       *
       * NOTE(review): the filter works byte by byte, so a multi-byte character
       * whose trailing byte falls in 'a'..'z' would be damaged -- confirm
       * against the encoding CodeConvert() produces.
       */
      static void extract_entry(const char *start, char *out, size_t out_size)
      {
          size_t len = 0;
          const char *p;

          if (out_size == 0) {
              return;
          }
          for (p = start; *p != '\0' && *p != '4' && len + 1 < out_size; p++) {
              if (*p != '<' && (*p < 'a' || *p > 'z') && *p != '/' && *p != '>' && *p != '-') {
                  out[len++] = *p;
              }
          }
          out[len] = '\0';
      }

      /*
       * Print `text` with the first occurrence of `word` removed.
       *
       * When `word` is at the very start, up to two further bytes (the two
       * spaces left over from the "</span> - " marker) are skipped as well.
       * When `word` does not occur, `text` is printed unchanged.
       */
      static void print_without_word(const char *text, const char *word)
      {
          const char *hit = strstr(text, word);
          size_t word_len = strlen(word);
          const char *rest;
          int i;

          if (hit == NULL) {
              fputs(text, stdout);
              return;
          }
          if (hit == text) {
              rest = hit + word_len;
              for (i = 0; i < 2 && *rest != '\0'; i++) {
                  rest++;
              }
              fputs(rest, stdout);
              return;
          }
          fwrite(text, 1, (size_t)(hit - text), stdout);
          fputs(hit + word_len, stdout);
      }

      /*
       * Look up the synonyms of a word on chazidian.com.
       *
       * Reads one word from stdin, fetches the search page for it, follows the
       * link to the matching entry, and prints the entry text with the looked-up
       * word itself removed.  An earlier revision also followed the "next page"
       * links until the word was found; that loop had already been disabled and
       * is no longer kept here.
       *
       * Returns EXIT_SUCCESS when an entry was printed, EXIT_FAILURE otherwise.
       */
      int main(void)
      {
          static const char base_url[] = "http://www.chazidian.com/jinyici/";
          static const char andStr[] = "</span> - ";   /* marker after the headword */
          char szUrl[512] = "";
          char svData[1024 * 40] = "";
          char strFrom[100] = "", strTo[100] = "";
          char findStr[sizeof strFrom + sizeof andStr] = "";
          char outStr[100] = "", reStr[100] = "";
          char str2[100] = "";
          const char *p;

          /* Read the word to look up; fgets() bounds the input, unlike gets(). */
          if (fgets(strFrom, sizeof strFrom, stdin) == NULL) {
              fputs("no input\n", stderr);
              return EXIT_FAILURE;
          }
          strFrom[strcspn(strFrom, "\r\n")] = '\0';
          if (strFrom[0] == '\0') {
              fputs("empty word\n", stderr);
              return EXIT_FAILURE;
          }
          strcpy(reStr, strFrom);   /* same size as strFrom, cannot overflow */

          /* Convert the word (mode 2) and append it to the search URL. */
          CodeConvert(strFrom, str2, sizeof(str2), 2);
          snprintf(szUrl, sizeof szUrl, "%s%s", base_url, str2);

          if (fetch_page(szUrl, svData, sizeof svData) != 0) {
              fputs("download failed\n", stderr);
              return EXIT_FAILURE;
          }
          dump_page(svData);

          /* Find the entry link; strFrom may be replaced by the entry's key. */
          if (locate_entry_link(svData, strFrom, sizeof strFrom, szUrl, sizeof szUrl) != 0) {
              fputs("entry link not found\n", stderr);
              return EXIT_FAILURE;
          }

          /* Build the search marker: headword followed by "</span> - ". */
          snprintf(findStr, sizeof findStr, "%s%s", strFrom, andStr);
          puts(findStr);

          /* Convert the key (mode 2) and complete the entry URL. */
          CodeConvert(strFrom, strTo, sizeof(strTo), 2);
          if (strlen(szUrl) + strlen(strTo) >= sizeof szUrl) {
              fputs("entry URL too long\n", stderr);
              return EXIT_FAILURE;
          }
          strcat(szUrl, strTo);
          puts(szUrl);

          if (fetch_page(szUrl, svData, sizeof svData) != 0) {
              fputs("download failed\n", stderr);
              return EXIT_FAILURE;
          }

          p = strstr(svData, findStr);
          if (p == NULL) {
              fputs("entry not found on page\n", stderr);
              return EXIT_FAILURE;
          }
          extract_entry(p, outStr, sizeof outStr);
          puts(outStr);

          /* Remove the looked-up word itself from the printed result. */
          print_without_word(outStr, reStr);
          return EXIT_SUCCESS;
      }

     早上写的版本是一页一页抓取的,页数太多时会变得很慢。后来老韦建议我在写第二个网站时直接用网址检索,但那个网站用的是内码,不能直接获取信息,可能还要先检索它的内码。由此我想,前面这个查字典的网站是不是也可以用网址去检索?后来改了一下,还好可以,这样就快多了。

    改后的代码:

     1 #include <stdlib.h>
     2 #include <stdio.h>
     3 #include <string.h>
     4 #include "lyGetHttpResult.h"
     5 #include "lyPublic/lyCodeConvert.c"
     /*
      * Download `url` and store the converted page text in `page`.
      *
      * The body returned by GetDataFromWeb() is run through CodeConvert() with
      * mode 1 (UTF-8 back to GBK according to the author's notes -- confirm
      * against lyCodeConvert.c) and then released with free().  `page` is
      * cleared first and is always NUL-terminated afterwards.
      *
      * Returns 0 on success, -1 when the download returned NULL.
      */
     static int fetch_page(char *url, char *page, size_t page_size)
     {
         char *raw = GetDataFromWeb(url, NULL, NULL, 1, 5);

         if (raw == NULL) {
             return -1;
         }
         memset(page, 0, page_size);
         CodeConvert(raw, page, page_size, 1);
         page[page_size - 1] = '\0';
         free(raw);
         return 0;
     }

     /*
      * Copy the text of a dictionary entry, beginning at `start`, into `out`.
      *
      * Copying stops at the first '4' byte (the end sentinel used by the
      * original scan), at the end of the string, or when `out` is full.  The
      * bytes '<', '/', '>', '-' and the lowercase ASCII letters are dropped,
      * which strips tag text such as "</span>".  `out` is always
      * NUL-terminated.
      *
      * NOTE(review): the filter works byte by byte, so a GBK character whose
      * trailing byte falls in 'a'..'z' would be damaged -- confirm whether
      * that matters for the pages being scraped.
      */
     static void extract_entry(const char *start, char *out, size_t out_size)
     {
         size_t len = 0;
         const char *p;

         if (out_size == 0) {
             return;
         }
         for (p = start; *p != '\0' && *p != '4' && len + 1 < out_size; p++) {
             if (*p != '<' && (*p < 'a' || *p > 'z') && *p != '/' && *p != '>' && *p != '-') {
                 out[len++] = *p;
             }
         }
         out[len] = '\0';
     }

     /*
      * Print `text` with the first occurrence of `word` removed.
      *
      * When `word` is at the very start, up to two further bytes (the two
      * spaces left over from the "</span> - " marker) are skipped as well.
      * When `word` does not occur, `text` is printed unchanged.
      */
     static void print_without_word(const char *text, const char *word)
     {
         const char *hit = strstr(text, word);
         size_t word_len = strlen(word);
         const char *rest;
         int i;

         if (hit == NULL) {
             fputs(text, stdout);
             return;
         }
         if (hit == text) {
             rest = hit + word_len;
             for (i = 0; i < 2 && *rest != '\0'; i++) {
                 rest++;
             }
             fputs(rest, stdout);
             return;
         }
         fwrite(text, 1, (size_t)(hit - text), stdout);
         fputs(hit + word_len, stdout);
     }

     /*
      * Look up the synonyms of a word on chazidian.com by URL.
      *
      * Reads one word from stdin, converts it (GBK -> UTF-8 per the author's
      * notes) and appends it to the site's base URL, fetches that page, and
      * prints the entry that follows "<word></span> - " with the word itself
      * removed.
      *
      * Returns EXIT_SUCCESS when an entry was printed, EXIT_FAILURE otherwise.
      */
     int main(void)
     {
         static const char base_url[] = "http://www.chazidian.com/jinyici/";
         static const char andStr[] = "</span> - ";   /* marker after the headword */
         char szUrl[512] = "";
         char svData[1024 * 40] = "";
         char strFrom[100] = "";
         char str2[100] = "";
         char findStr[sizeof strFrom + sizeof andStr] = "";
         char outStr[100] = "";
         const char *p;

         /* Read the word to look up; fgets() bounds the input, unlike gets(). */
         if (fgets(strFrom, sizeof strFrom, stdin) == NULL) {
             fputs("no input\n", stderr);
             return EXIT_FAILURE;
         }
         strFrom[strcspn(strFrom, "\r\n")] = '\0';
         if (strFrom[0] == '\0') {
             fputs("empty word\n", stderr);
             return EXIT_FAILURE;
         }

         /* Convert the word before appending it to the URL. */
         CodeConvert(strFrom, str2, sizeof(str2), 2);
         snprintf(szUrl, sizeof szUrl, "%s%s", base_url, str2);

         /* The page is converted back so the word can be searched for as typed. */
         if (fetch_page(szUrl, svData, sizeof svData) != 0) {
             fputs("download failed\n", stderr);
             return EXIT_FAILURE;
         }

         /* Build the search marker: headword followed by "</span> - ". */
         snprintf(findStr, sizeof findStr, "%s%s", strFrom, andStr);
         p = strstr(svData, findStr);
         if (p == NULL) {
             fputs("entry not found on page\n", stderr);
             return EXIT_FAILURE;
         }
         extract_entry(p, outStr, sizeof outStr);
         puts(outStr);

         /* Remove the looked-up word itself from the printed result. */
         print_without_word(outStr, strFrom);
         return EXIT_SUCCESS;
     }
  • 相关阅读:
    【Mysql学习笔记】浅析mysql的binlog
    HBase 学习笔记---守护进程及内存调优
    字符集例子-同一字符不同字符集编码不同及导入导出的乱码
    随机访问
    格式化的代价
    读写文本文件
    缓冲
    加速I/O的基本规则
    序列化再探讨
    数据库I/O:CMP、Hibernate
  • 原文地址:https://www.cnblogs.com/zibuyu/p/3196161.html
Copyright © 2020-2023  润新知