• 百度题目TOP K问题


    某天有1千万条查询,其中大部分是重复的,去重后可能只有300万条不同的查询,每条查询的长度为1-255字节,请设计算法查找出最热门的10条查询

    哈希 + 最小堆:时间复杂度为O(n lg k),n为数据量,k为需要找出的热门查询条数,这里为10;

    #include <stdio.h>
    #include <cstring>
    #include <algorithm>
    using namespace std;
    /* Number of buckets in the chained hash table `head`; also the modulus
     * applied by the hash function. */
    #define HASHLEN 2807303
    /* Capacity, including the terminating NUL, of every fixed-size word
     * buffer. The problem statement allows queries of up to 255 bytes, so
     * the buffer must hold 255 characters plus the terminator. */
    #define CHARLEN 256
    typedef struct node_no_space* ptr_no_space;
    typedef struct node_has_space * ptr_has_space;
    /* Bucket heads of the hash table; each entry is the first node of a
     * singly linked chain, or NULL when the bucket is empty. */
     ptr_no_space  head[HASHLEN];
     
    /* One entry of a hash-table chain: a distinct word and how many times
     * it has been seen so far. */
    struct node_no_space
    {
    	char* word;             /* NUL-terminated text, allocated separately to fit the word exactly */
    	int count;              /* number of occurrences recorded for this word */
    	node_no_space * next;   /* next node in the same bucket, or NULL at the end of the chain */
    };
    /* A (word, count) record with inline storage for the word; this is the
     * element type of the top-10 heap built in main. */
    struct node_has_space
    {
    	char word[CHARLEN];   /* NUL-terminated word, at most CHARLEN - 1 characters */
    	int count;            /* occurrence count associated with the word */
    };
    // Ordering predicate for the heap algorithms: true when `a` has the
    // larger count. Used as the comparator, it keeps the record with the
    // smallest count at the front of the heap (a min-heap on count).
    bool cmp(const node_has_space a ,const node_has_space b )
    {
        return b.count < a.count;
    }
      /*
       * Map a NUL-terminated string to a bucket index.
       *
       * Uses the multiplicative scheme h = h * 31 + byte, reduced modulo
       * HASHLEN after every step.
       *
       * p: string to hash; must not be NULL.
       * Returns an index in the range [0, HASHLEN - 1].
       */
      int hash_funtion(char *p)
      {
          unsigned int value = 0;

          while (*p != '\0')
          {
              /* Read each byte as unsigned char so that bytes >= 0x80 cannot
               * make the hash negative, and reduce unconditionally so the
               * result can never equal HASHLEN (one past the table end).
               * value < HASHLEN keeps value * 31 + 255 far below UINT_MAX. */
              value = (value * 31u + (unsigned char)*p++) % HASHLEN;
          }
          return (int)value;
      }
      void addwordToTable(char * str)
      {
    		int index = hash_funtion(str);
    	    ptr_no_space temp = head[index]; //判断头结点
    		while ( temp != NULL ) 
    		{
    			if ( !strcmp(temp->word,str))
    			{
    				temp->count ++;
    				return ;
    			}
    			temp = temp->next;
    		}
    		//不在任意的index里面,新开一条记录
    		ptr_no_space new_list = new node_no_space;
    		new_list->count =1;
    		new_list->word = new char[strlen(str ) +1 ];
    		strcpy(new_list->word , str);
    		new_list->next = head[index];
    		head[index] = new_list;
      }
    /* Return nonzero when `c` is an ASCII digit or letter. */
    static int is_ascii_alnum(char c)
    {
        return (c >= '0' && c <= '9')
            || (c >= 'A' && c <= 'Z')
            || (c >= 'a' && c <= 'z');
    }

    /*
     * Strip leading and trailing characters that are not ASCII letters or
     * digits from `str`, in place. Characters in the middle are untouched.
     *
     * str: NUL-terminated, writable string.
     * n:   index of the last character of `str` (strlen(str) - 1).
     *
     * A string made only of symbols becomes the empty string. A NULL `str`
     * or a negative `n` leaves the input untouched.
     */
    void handle_symbol(char *str, int n)
    {
        if (str == NULL || n < 0)
            return;

        /* Trim the tail, never stepping before the start of the buffer. */
        int last = n;
        while (last >= 0 && !is_ascii_alnum(str[last]))
        {
            str[last] = '\0';
            last--;
        }
        if (last < 0)
            return; /* nothing but symbols: str is now "" */

        /* str[last] is alphanumeric, so this scan stops at or before it. */
        int first = 0;
        while (!is_ascii_alnum(str[first]))
            first++;

        if (first > 0)
        {
            /* Shift the kept part to the front in one pass; the regions
             * overlap, hence memmove rather than memcpy. */
            int kept = last - first + 1;
            memmove(str, str + first, (size_t)kept);
            str[kept] = '\0';
        }
    }
    void write_to_file()
    {
    	FILE *fp = fopen("result.txt","w");
    	for ( int i = 0 ; i < HASHLEN; i++)
    	{
    		ptr_no_space tmp = head[i];
    		while (  tmp != NULL )
    		{
    			
    			fprintf(fp,"%s %d\n" ,tmp->word , tmp->count);
    			tmp = tmp->next ;
    		}
    	}
    	fclose(fp);
    }
    int main()
    {
    	FILE *fp_read  = fopen("string.txt","r");
    	
    	char str[CHARLEN];
    	for ( int i = 0 ; i < HASHLEN ; i++)
    		head[i] = NULL;
    	while ( fscanf(fp_read,"%s" , &str) != EOF)
    	{
    		 int n = strlen(str) - 1;  
            if (n > 0)  
                handle_symbol(str, n);  
            addwordToTable(str);//往哈希表中添加str
    	}
    	fclose(fp_read);
    	write_to_file();//写入文件
    	ptr_has_space heap = new node_has_space [10];
    	FILE *fp_result = fopen("result.txt","r");
    	int c;
    	for ( int i = 0 ; i < 10 ; i++)
    	{
    		fscanf(fp_result,"%s %d" ,&str  ,&c);
    		heap[i].count = c;
    		strcpy(heap[i].word , str);
    	}
    	//建立最小堆
    	make_heap(heap,heap+10,cmp);
    	ptr_has_space p = new node_has_space;
    	//不断读入result.txt中数据 , 维护最小堆
    	while ( fscanf(fp_result,"%s %d" ,&p->word , &p->count) != EOF)
    	{
    		if ( p->count > heap[0].count)
    		{
    			heap[0].count = p->count;
    			strcpy(heap[0].word , p->word);
    			make_heap(heap , heap+10 , cmp);
    		}
    	}
    	fclose(fp_result);
    	//输出堆中结果
    	sort_heap(heap,heap+10 ,cmp);
    	for ( int i = 0 ; i < 10  ; i++)
    		printf("%s %d\n", heap[i].word , heap[i].count);
    	return 0 ;
    }
    

      

  • 相关阅读:
    【转】开发人员一定要加入收藏夹的网站
    ASP.NET页面之间传递值的几种方式
    查询数据库中字段内容相同的记录
    将csv文件导入到数据库中
    XMLHttpRequest对象(三)
    Ajax基础(一)
    Ajax浏览器支持(二)
    javascript获取浏览器的
    SQL SERVER 通过链接服务器访问ORACLE 包中的存储过程 带参数
    SQL 添加链接服务器
  • 原文地址:https://www.cnblogs.com/lzhenf/p/2410521.html
Copyright © 2020-2023  润新知