#ifndef _URL_H_030728_
#define _URL_H_030728_

#include <pthread.h>   // pthread_mutex_t, needed by the mutexMemory declaration below

#include <string>

// Size limits (in bytes) used for URL and host-name strings.
const unsigned int URL_LEN = 256;
const unsigned int HOST_LEN = 256;

// NOTE(review): a using-directive in a header leaks into every includer.
// It is kept because code including this header may rely on unqualified
// standard-library names.
using namespace std;


// URL schemes known to the parser.
// The enumerator order must match the order of the scheme table in the
// implementation file, which converts a table index into this enum.
enum url_scheme {
    SCHEME_HTTP,
    SCHEME_FTP,
    SCHEME_INVALID
};

const int DEFAULT_HTTP_PORT = 80;
const int DEFAULT_FTP_PORT = 21;

// Holds one URL broken down into scheme, host, port and path, plus a set of
// host/URL classification helpers.
class CUrl
{
public:
    string m_sUrl;                  // the original URL
    enum url_scheme m_eScheme;      // scheme detected by ParseScheme()

    string m_sHost;                 // host extracted from the URL
    int m_nPort;                    // port of the host
    string m_sPath;                 // request path


public:
    CUrl();
    ~CUrl();

    //bool ParseUrl(string strUrl);

    // Break a URL into scheme, host, port and request and store the result
    // in the member variables. Returns false when the URL is not accepted.
    bool ParseUrlEx(string strUrl);

    // Break a URL into scheme, host, port and request and store the result
    // in the caller-supplied buffers. Each l* argument is the size in bytes
    // of the buffer that precedes it.
    void ParseUrlEx(const char *url, char *protocol, int lprotocol,
            char *host, int lhost,
            char *request, int lrequest, int *port);

    // Get the IP address for a host name.
    // Returns a string allocated with new[] that the caller must release
    // with delete[], or NULL on failure.
    char *GetIpByHost(const char *host);

    bool IsValidHost(const char *ip);       // heuristic host-name check
    bool IsForeignHost(string host);        // heuristic based on the top-level domain
    bool IsImageUrl(string url);            // true when the URL ends in a known image extension
    bool IsValidIp(const char *ip);         // checks the IP against the configured IP blocks
    bool IsVisitedUrl(const char *url);     // true when the URL has already been visited
    // NOTE(review): no definition of IsUnReachedUrl was found next to the
    // other member definitions - confirm it is defined before calling it.
    bool IsUnReachedUrl(const char *url);
    bool IsValidHostChar(char ch);          // true for characters allowed in "host[:port]"

//private:
    // Detect the scheme of url and store it in m_eScheme.
    void ParseScheme (const char *url);
};

extern pthread_mutex_t mutexMemory;

#endif /* _URL_H_030728_ */
1 /* URL handling 2 */ 3 4 #include <iostream> 5 #include <string.h> 6 #include <sys/socket.h> 7 #include <netdb.h> 8 #include <map> 9 #include "Url.h" 10 #include <stdlib.h> 11 #include <arpa/inet.h> 12 13 //#include "Tse.h" 14 //#include "Url.h" 15 //#include "Http.h" 16 //#include "Md5.h" 17 //#include "StrFun.h" 18 19 20 21 // 22 ///* Is X "."? */ 23 #define DOTP(x) ((*(x) == '.') && (!*(sdfx + 1))) 24 ///* Is X ".."? */ 25 #define DDOTP(x) ((*(x) == '.') && (*(x + 1) == '.') && (!*(x + 2))) 26 27 map<string,string> mapCacheHostLookup; 28 //extern vector<string> vsUnreachHost; 29 //pthread_mutex_t mutexCacheHost = PTHREAD_MUTEX_INITIALIZER; 30 //extern set<string> setVisitedUrlMD5; 31 //extern map<unsigned long,unsigned long> mapIpBlock; 32 typedef map<string,string>::value_type valTypeCHL; 33 34 struct scheme_data 35 { 36 const char *leading_string; 37 int default_port; 38 int enabled; 39 }; 40 41 /* 支持的网页类型 */ 42 static struct scheme_data supported_schemes[] = 43 { 44 { "http://", DEFAULT_HTTP_PORT, 1 }, 45 { "ftp://", DEFAULT_FTP_PORT, 1 }, 46 47 /* 不合法的网页 */ 48 { NULL, -1, 0 } 49 }; 50 51 /* 分析类型,若是合法则返回正确的,否则是不合法的 */ 52 void CUrl::ParseScheme (const char *url) 53 { 54 int i; 55 56 for (i = 0; supported_schemes[i].leading_string; i++) 57 if (0 == strncasecmp (url, supported_schemes[i].leading_string,strlen (supported_schemes[i].leading_string)))//比较url的前几个字母 58 { 59 if (supported_schemes[i].enabled) 60 { 61 this->m_eScheme = (enum url_scheme) i; 62 return; 63 } 64 else 65 { 66 this->m_eScheme = SCHEME_INVALID; 67 return; 68 } 69 } 70 71 this->m_eScheme = SCHEME_INVALID; 72 return; 73 } 74 75 /************************************************************************ 76 * Function name: ParseUrlEx 77 * Input argv: 78 * -- strUrl: url 79 * Output argv: 80 * -- 81 * Return: 82 true: success 83 false: fail 84 * Fucntion Description: break an URL into scheme, host, port and request. 
85 * result as member variants 86 * Be careful: release the memory by the client 87 ************************************************************************/ 88 89 bool CUrl::ParseUrlEx(string strUrl) 90 { 91 char protocol[10]; 92 char host[HOST_LEN]; 93 char request[256]; 94 int port = -1; 95 96 memset( protocol, 0, sizeof(protocol) ); 97 memset( host, 0, sizeof(host) ); 98 memset( request, 0, sizeof(request) ); 99 100 this->ParseScheme(strUrl.c_str()); 101 if( this->m_eScheme != SCHEME_HTTP ) 102 { 103 return false; 104 } 105 106 ParseUrlEx(strUrl.c_str(), 107 protocol, sizeof(protocol), 108 host, sizeof(host), 109 request, sizeof(request), 110 &port); 111 112 m_sUrl = strUrl; 113 m_sHost = host; 114 m_sPath = request; 115 116 if( port > 0 ){ 117 m_nPort = port; 118 } 119 120 return true; 121 } 122 123 /************************************************************************ 124 * Function name: ParseUrlEx 125 * Input argv: 126 * -- url: host name 127 * -- protocol: result protocol 128 * -- lprotocol: protocol length 129 * -- host: result host 130 * -- lhost: host length 131 * -- request: result request 132 * -- lrequest: request length 133 * Output argv: 134 * -- 135 * Return: 136 true: success 137 false: fail 138 * Fucntion Description: break an URL into scheme, host, port and request. 
139 * result as argvs 140 * Be careful: 141 ************************************************************************/ 142 void CUrl::ParseUrlEx(const char *url, 143 char *protocol, int lprotocol, 144 char *host, int lhost, 145 char *request, int lrequest, 146 int *port) 147 { 148 char *work,*ptr,*ptr2; 149 150 *protocol = *host = *request = 0; 151 *port = 80; 152 153 int len = strlen(url); 154 //pthread_mutex_lock(&mutexMemory); 155 work = new char[len + 1]; 156 //pthread_mutex_unlock(&mutexMemory); 157 memset(work, 0, len+1); 158 strncpy(work, url, len); 159 //把url的内容复制到work中 160 161 // find protocol if any 162 //在work中查找:(默认的是http) 163 ptr = strchr(work, ':'); 164 if( ptr != NULL ) 165 { 166 *(ptr++) = 0; 167 strncpy( protocol, work, lprotocol ); 168 } else { 169 strncpy( protocol, "HTTP", lprotocol ); 170 ptr = work; 171 } 172 173 // skip past opening /'s 174 //调过 // 175 if( (*ptr=='/') && (*(ptr+1)=='/') ) 176 ptr+=2; 177 178 // 查找主机地址 179 ptr2 = ptr; 180 while( IsValidHostChar(*ptr2) && *ptr2 ) 181 ptr2++; 182 *ptr2 = 0;//保证合法的字符串 183 strncpy( host, ptr, lhost ); 184 185 //查找请求的网页 186 int offset = ptr2 - work; 187 const char *pStr = url + offset; 188 strncpy( request, pStr, lrequest ); 189 190 //找到主机的端口 191 ptr = strchr( host, ':' ); 192 if( ptr != NULL ){ 193 *ptr = 0; 194 *port = atoi(ptr+1); 195 } 196 197 //pthread_mutex_lock(&mutexMemory); 198 delete [] work; 199 //pthread_mutex_unlock(&mutexMemory); 200 work = NULL; 201 } 202 203 204 205 206 207 208 /* scheme://user:pass@host[:port]... 209 * ^ 210 * We attempt to break down the URL into the components path, 211 * params, query, and fragment. 
They are ordered like this: 212 * scheme://host[:port][/path][;params][?query][#fragment] 213 */ 214 215 /* 216 bool CUrl::ParseUrl(string strUrl) 217 { 218 string::size_type idx; 219 220 this->ParseScheme(strUrl.c_str()); 221 if( this->m_eScheme != SCHEME_HTTP ) 222 return false; 223 224 // get host name 225 this->m_sHost = strUrl.substr(7); 226 idx = m_sHost.find('/'); 227 if(idx != string::npos){ 228 m_sHost = m_sHost.substr(0,idx); 229 } 230 231 this->m_sUrl = strUrl; 232 233 return true; 234 } 235 */ 236 //CUrl的构造函数 237 CUrl::CUrl() 238 { 239 this->m_sUrl = ""; 240 this->m_eScheme= SCHEME_INVALID; 241 242 this->m_sHost = ""; 243 this->m_nPort = DEFAULT_HTTP_PORT; //默认端口 244 245 this->m_sPath = ""; 246 /* 247 this->m_sParams = ""; 248 this->m_sQuery = ""; 249 this->m_sFragment = ""; 250 251 this->m_sDir = ""; 252 this->m_sFile = ""; 253 254 this->m_sUser = ""; 255 this->m_sPasswd = ""; 256 */ 257 258 } 259 260 CUrl::~CUrl() 261 { 262 263 } 264 265 266 /**************************************************************************** 267 * Function name: GetIpByHost 268 * Input argv: 269 * -- host: host name 270 * Output argv: 271 * -- 272 * Return: 273 ip: sucess 274 NULL: fail 275 * Function Description: get the ip address by host name 276 * Be careful: release the memory by the client 277 ****************************************************************************/ 278 //通过主机地址获得IP地址 279 char * CUrl::GetIpByHost(const char *host) 280 { 281 282 if( !host ){ // null pointer 283 return NULL; 284 cout<<"f1"; 285 } 286 287 if( !IsValidHost(host) ){ // invalid host 288 return NULL; 289 cout<<"f2"; 290 } 291 unsigned long inaddr = 0; 292 char *result = NULL; 293 int len = 0; 294 295 296 inaddr = (unsigned long)inet_addr( host );//将字符串IP转化为32二进制的网络字节序 297 //if ( (int)inaddr != -1){ 298 if ( inaddr != INADDR_NONE) 299 { // 主机地址就是用IP地址表示的 300 len = strlen(host); 301 //pthread_mutex_lock(&mutexMemory); 302 result = new char[len+1]; 303 cout<<result; 304 
//pthread_mutex_unlock(&mutexMemory); 305 memset(result, 0, len+1); 306 memcpy(result, host, len); 307 308 return result; 309 } 310 else 311 { 312 //firt find from cache 313 314 map<string,string>::iterator it = mapCacheHostLookup.find(host); 315 //可以在DNS缓存中找到 316 if( it != mapCacheHostLookup.end() ) 317 { //如果在cache中找到IP地址 318 const char * strHostIp; 319 320 strHostIp = (*it).second.c_str(); 321 322 inaddr = (unsigned long)inet_addr( strHostIp ); 323 //if ( (int)inaddr != -1){ 324 if ( inaddr != INADDR_NONE ) 325 { 326 len = strlen(strHostIp); 327 //pthread_mutex_lock(&mutexMemory); 328 result = new char[len+1]; 329 //pthread_mutex_unlock(&mutexMemory); 330 memset( result, 0, len+1 ); 331 memcpy( result, strHostIp, len ); 332 333 //cout << ":)" ; 334 335 return result; 336 } 337 } 338 } 339 340 //通过上面的方法我们都没有查找,这个时候我们只能通过DNS server查找了,这种带宽的消耗是必要的! 341 struct hostent *hp; /* Host entity */ 342 hp = gethostbyname(host); 343 //通过主机号或者说是域名得到hostent结构,这个结构包含主机号或者说域名的很多信息,例如我们要找的IP字符串就在其中 344 if(hp == NULL) { 345 //cout << "gethostbyname() error in GetIpByHost: " << host << endl; 346 return NULL; 347 } 348 349 // cache host lookup 350 struct in_addr in; 351 352 bcopy(*(hp->h_addr_list), (caddr_t)&in, hp->h_length); 353 /*功能:将字符串src的前n个字节复制到dest中 354 说明:bcopy不检查字符串中的空字节NULL,函数没有返回值。*/ 355 356 char abuf[INET_ADDRSTRLEN]; 357 if( inet_ntop(AF_INET, (void *)&in,abuf, sizeof(abuf)) == NULL ) 358 { 359 cout << "inet_ntop() return error in GetIpByHost" << endl; 360 return NULL; 361 362 } 363 else 364 { 365 366 //if( mapCacheHostLookup.count(host) == 0){ 367 if( mapCacheHostLookup.find(host) == mapCacheHostLookup.end() ){ 368 369 //cout << endl << host << " and " << abuf << endl; 370 mapCacheHostLookup.insert( valTypeCHL ( host, abuf)); 371 //更新DNS缓存 372 //cout<<((*mapCacheHostLookup.find("home.ustc.edu.cn")).second.c_str()); 373 374 } 375 376 } 377 378 // return result 379 len = strlen(abuf); 380 //pthread_mutex_lock(&mutexMemory); 381 result = new char[len + 1]; 382 
//pthread_mutex_unlock(&mutexMemory); 383 memset( result, 0, len+1 ); 384 memcpy( result, abuf, len ); 385 386 return result; 387 } 388 389 /********************************************************************************** 390 * Function name: IsValidHostChar 391 * Input argv: 392 * -- ch: the character for testing 393 * Output argv: 394 * -- 395 * Return: 396 true: is valid 397 false: is invalid 398 * Function Description: test the specified character valid 399 * for a host name, i.e. A-Z or 0-9 or -.: 400 **********************************************************************************/ 401 bool CUrl::IsValidHostChar(char ch) 402 { 403 return( isalpha(ch) || isdigit(ch) 404 || ch=='-' || ch=='.' || ch==':' || ch=='_'); 405 } 406 407 /********************************************************************************** 408 * Function name: IsValidHost 409 * Input argv: 410 * -- ch: the character for testing 411 * Output argv: 412 * -- 413 * Return: 414 true: is valid 415 false: is invalid 416 * Function Description: test the specified character valid 417 * for a host name, i.e. 
A-Z or 0-9 or -.: 418 * Be careful: 419 **********************************************************************************/ 420 bool CUrl::IsValidHost(const char *host) 421 { 422 if( !host ){//空的主机号,我们认为是无效的主机号 423 return false; 424 } 425 426 if( strlen(host) < 6 ){ //主机号长度小于6,我们认为ieshi无效的主机号 427 return false; 428 } 429 430 char ch; 431 for(unsigned int i=0; i<strlen(host); i++){ 432 ch = *(host++); 433 if( !IsValidHostChar(ch) ){ 434 return false; 435 } 436 } 437 438 return true; 439 } 440 441 /********************************************************************************** 442 * Function name: IsVisitedUrl 443 * Input argv: 444 * -- url: url 445 * Output argv: 446 * -- 447 * Return: 448 true: is visited 449 false: not visited 450 * Function Description: test the url visited by the MD5 451 * Be careful: 452 **********************************************************************************/ 453 bool CUrl::IsVisitedUrl(const char *url)//判断该URL是否访问过 454 { 455 if( !url ){ 456 return true; // if be null, we think it have been visited 457 } 458 459 CMD5 iMD5; 460 iMD5.GenerateMD5( (unsigned char*)url, strlen(url) ); 461 string strDigest = iMD5.ToString(); 462 463 if( setVisitedUrlMD5.find(strDigest) != setVisitedUrlMD5.end() ) { 464 return true; 465 } else { 466 return false; 467 } 468 469 } 470 471 472 /********************************************************************************** 473 * Function name: IsValidIp 474 * Input argv: 475 * -- ip: ip 476 * Output argv: 477 * -- 478 * Return: 479 true: inside the ip block 480 false: outside the ip block 481 * Function Description: decide teh ip whether or not inside the ip block 482 * Be careful: 483 **********************************************************************************/ 484 bool CUrl::IsValidIp(const char *ip) 485 { 486 if( ip == NULL ) 487 { 488 return false; 489 } 490 491 unsigned long inaddr = (unsigned long)inet_addr(ip); 492 if( inaddr == INADDR_NONE ){//显然该IP参数不是正确的字符串IP 493 return false; 494 } 495 
496 if (mapIpBlock.size() > 0) { //判断是否要过滤掉 497 map<unsigned long, unsigned long>::iterator pos; 498 for (pos = mapIpBlock.begin(); pos != mapIpBlock.end(); ++pos) { 499 unsigned long ret; 500 501 ret = inaddr & ~((*pos).second); 502 if (ret == (*pos).first) { // inside 503 return true; 504 } 505 } 506 507 // outside 508 return false; 509 } 510 511 512 // if block range is not given, we think it inside also 513 return true; 514 } 515 /* 516 * If it is, return true; otherwise false 517 * not very precise 518 */ 519 bool CUrl::IsForeignHost(string host) 520 { 521 if( host.empty() ) return true; 522 if( host.size() > HOST_LEN ) return true; 523 524 unsigned long inaddr = 0; 525 526 inaddr = (unsigned long)inet_addr( host.c_str() ); 527 if ( inaddr != INADDR_NONE){ // host is just ip 528 return false; 529 } 530 531 string::size_type idx = host.rfind('.'); 532 string tmp; 533 if( idx != string::npos ){ 534 tmp = host.substr(idx+1); 535 } 536 537 CStrFun::Str2Lower( tmp, tmp.size() ); 538 const char *home_host[] ={ 539 "cn","com","net","org","info", 540 "biz","tv","cc", "hk", "tw" 541 }; 542 543 int home_host_num = 10; 544 545 for(int i=0; i<home_host_num; i++){ 546 if( tmp == home_host[i] ) 547 return false; 548 } 549 550 return true; 551 } 552 553 554 bool CUrl::IsImageUrl(string url) 555 { 556 if( url.empty() ) return false; 557 if( url.size() > HOST_LEN ) return false; 558 559 string::size_type idx = url.rfind('.'); 560 string tmp; 561 if( idx != string::npos ){ 562 tmp = url.substr(idx+1); 563 } 564 565 CStrFun::Str2Lower( tmp, tmp.size() ); 566 const char *image_type[] ={ 567 "gif","jpg","jpeg","png","bmp", 568 "tif","psd" 569 }; 570 571 int image_type_num = 7; 572 573 for (int i=0; i<image_type_num; i++) 574 { 575 if( tmp == image_type[i] ) 576 return true; 577 } 578 579 return false; 580 }