• Linux下socket实现网页抓取 Unicorn 博客频道 CSDN.NET


    Linux下socket实现网页抓取 - Unicorn - 博客频道 - CSDN.NET


    Linux下socket实现网页抓取


    分类:
    C/C++学习点滴
    DO spiders DO
    linux编程


    951人阅读
    评论(0)
    收藏
    举报

     主要用来和WinSock进行下比较:

    --WinSock--

    需要初始化:

    if( (Ret = WSAStartup(MAKEWORD(1,1), &wsaData) ) != 0 )

    {

     printf("WSAStartup failed with error %d\n", Ret);

     return FALSE;

    }

    头文件:

    --WinSock--

    #include <winsock2.h> //header

    #pragma comment (lib, "ws2_32.lib") //lib

    --Linux--

    #include <sys/socket.h>

    #include <netinet/in.h>

    #include <arpa/inet.h>

    #include <netdb.h>

    各个头文件的作用还需要进一步研究

    gethostbyname(host)://从主机名返回地址

    这个都是一样的,返回一个struct hostent *的指针。

    地址结构:

    --WinSock--

    SOCKADDR_IN

    --Linux--

    sockaddr_in

    实际上是一样的都是

    struct sockaddr_in{

       shortsin_family;

       unsigned short sin_port;

       struct in_addr sin_addr;

       charsin_zero[8];

    };

    (

      这个结构是sockaddr的等价结构

      struct sockaddr

      {

       unsigned short sa_family; // address family, AF_XXX

     char sa_data[14];   //14 bytes of protocol address

      };

    )

    其中IP地址结构struct in_addr定义如下:

    /*
     * IPv4 address holder. The union exposes three overlapping views of the
     * same storage: four unsigned chars, two unsigned shorts, or a single
     * unsigned long.
     *
     * NOTE(review): the Linux example program later in this article reads the
     * address through `.s_addr`, which is not a direct member of this layout;
     * this union form looks like the WinSock definition -- confirm against the
     * platform header.
     */
    struct in_addr {
        union {
            /* Byte-wise view. */
            struct {
                unsigned char s_b1;
                unsigned char s_b2;
                unsigned char s_b3;
                unsigned char s_b4;
            } S_un_b;

            /* View as two unsigned shorts. */
            struct {
                unsigned short s_w1;
                unsigned short s_w2;
            } S_un_w;

            /* View as one unsigned long. */
            unsigned long S_addr;
        } S_un;
    };

    Socket:

    --WinSock--

    返回句柄SOCKET,就是socket描述符

    --Linux--

    比较直接返回int型socket描述符

    函数接口都一样

    函数例子:

    socket (AF_INET, SOCK_STREAM, 0); //TCP

    connect(sock, (const sockaddr * )&tcpaddr, sizeof(tcpaddr)); //返回值有不同

    --WinSock--

    If no error occurs, connect returns zero. Otherwise, it returns SOCKET_ERROR, and a specific error code can be retrieved by calling

    WSAGetLastError.

    --Linux--

    错误返回-1

    send(sock_description, message, strlen(message), 0); //返回值不同

    --WinSock--

    If no error occurs, send returns the total number of bytes sent, which can be less than the number indicated by len. Otherwise, a value of

    SOCKET_ERROR is returned, and a specific error code can be retrieved by calling WSAGetLastError.

    --Linux--

    错误返回-1

    recv(sock_description, buffer, sizeof(buffer), 0);//返回值不同

    --WinSock--

    If no error occurs, recv returns the number of bytes received. If the connection has been gracefully closed, the return value is zero. Otherwise, a

    value of SOCKET_ERROR is returned, and a specific error code can be retrieved by calling WSAGetLastError.

    --Linux--

    错误返回-1

    结束:

    --WinSock--

    closesocket(sock);

    if( WSACleanup() == SOCKET_ERROR )

    {

     printf("WSACleanup failed with error %d \n", WSAGetLastError() );

    }

    --Linux--

    close(sock);

    下面是一个Linux下socket一个HTTP协议GET方法的应用:

    #include <stdio.h>

    #include <stdlib.h>

    #include <string.h>

    #include <unistd.h>

    #include <sys/socket.h>

    #include <netinet/in.h>

    #include <arpa/inet.h>

    #include <netdb.h>

    /* Name of the HTTP server to fetch from; also sent in the Host header. */
    const char *host = "www.hao123.com";

    /* TCP port the HTTP server listens on. */
    int port = 80;

    /*
     * Write all `len` bytes of `data` to socket `fd`.
     *
     * send() may accept fewer bytes than requested, so it is called repeatedly
     * until the whole buffer has been handed over.
     *
     * Returns 0 on success, -1 if send() fails or makes no progress.
     */
    static int send_all(int fd, const char *data, size_t len)
    {
        while (len > 0) {
            ssize_t sent = send(fd, data, len, 0);

            if (sent <= 0) {
                return -1;
            }
            data += sent;
            len -= (size_t)sent;
        }
        return 0;
    }

    /*
     * Fetch "/" from `host`:`port` with an HTTP GET request.
     *
     * The request is echoed to stdout, followed by the response headers and
     * the response body exactly as received. Diagnostics go to stderr.
     *
     * Returns 0 after the response has been read; exits with status 1 if the
     * host cannot be resolved, the request cannot be built, or the socket
     * cannot be opened, connected, or written to.
     */
    int main(void)
    {
        char buffer[512];
        char message[512];
        struct sockaddr_in pin;
        struct hostent *remoteHost;
        int isock;
        int len;
        int chars = 0;   /* characters seen so far on the current header line */
        int done = 0;    /* set once the blank line ending the headers is seen */
        ssize_t l;

        remoteHost = gethostbyname(host);
        if (remoteHost == NULL || remoteHost->h_addrtype != AF_INET
            || remoteHost->h_addr_list[0] == NULL) {
            fprintf(stderr, "Error resolving host\n");
            exit(1);
        }

        memset(&pin, 0, sizeof pin);
        pin.sin_family = AF_INET;
        pin.sin_port = htons((unsigned short)port);
        /* Copy the raw address bytes rather than dereferencing a cast pointer. */
        memcpy(&pin.sin_addr, remoteHost->h_addr_list[0], sizeof pin.sin_addr);

        /*
         * "Connection: close" asks the server to close the connection after
         * the response, which is what terminates the body-reading loop below.
         * The header section ends with exactly one empty line.
         */
        len = snprintf(message, sizeof message,
                       "GET / HTTP/1.1\r\n"
                       "Host: %s\r\n"
                       "Accept: */*\r\n"
                       "User-Agent: Mozilla/4.0(compatible)\r\n"
                       "Connection: close\r\n"
                       "\r\n",
                       host);
        if (len < 0 || (size_t)len >= sizeof message) {
            fprintf(stderr, "Error building request\n");
            exit(1);
        }
        fputs(message, stdout);

        isock = socket(AF_INET, SOCK_STREAM, 0);
        if (isock == -1) {
            fprintf(stderr, "Error opening socket!\n");
            exit(1);
        }

        if (connect(isock, (struct sockaddr *)&pin, sizeof pin) == -1) {
            fprintf(stderr, "Error connecting to socket\n");
            close(isock);
            exit(1);
        }

        if (send_all(isock, message, (size_t)len) == -1) {
            fprintf(stderr, "Error in send\n");
            close(isock);
            exit(1);
        }

        /*
         * Read the headers one byte at a time so that nothing past the blank
         * line is consumed. A '\n' arriving while the current line is empty
         * ('\r' is not counted) marks the end of the header section.
         */
        while (!done) {
            l = recv(isock, buffer, 1, 0);
            if (l <= 0) {
                /* Error or peer closed before the headers were complete. */
                break;
            }

            switch (buffer[0]) {
            case '\r':
                break;
            case '\n':
                if (chars == 0) {
                    done = 1;
                }
                chars = 0;
                break;
            default:
                chars++;
                break;
            }
            putchar(buffer[0]);
        }

        /*
         * Copy the body to stdout until the server closes the connection.
         * fwrite() with an explicit length keeps embedded NUL bytes intact.
         */
        if (done) {
            while ((l = recv(isock, buffer, sizeof buffer, 0)) > 0) {
                fwrite(buffer, 1, (size_t)l, stdout);
            }
        }

        close(isock);
        return 0;
    }

  • 相关阅读:
    HMM (隐马尔可夫) 推导 (上)
    图模型初识
    K-means 和 EM 比较
    EM-高斯混合模型
    EM算法-完整推导
    EM算法直观认识
    pandas 之 时间序列索引
    K-Means 算法
    接口。
    第一册:lesson ninety-nine。
  • 原文地址:https://www.cnblogs.com/lexus/p/2616303.html
Copyright © 2020-2023  润新知