• 进阶实验5-3.3 基于词频的文件相似度 (30分)-哈希


     

     解题思路:

    1、存储:用一张哈希表存储单词以及对应所在的文件,再用一张文件表,存储每个文件的词汇量以及单词在哈希表中的位置

    2、查询:先在文件表中找到待比较的两个文件,取其中词汇量较少的一个 -> 遍历该文件的单词位置列表 -> 根据每个单词的位置到哈希表中查找该单词所在的文件列表 -> 从而判断该单词是否是两文件的公共词汇

    重复步骤2,直至文件中的单词全部查询完毕

    #include <ctype.h>
    #include <malloc.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    /* Number of cells in the hash table (passed to InitialHashTable in main). */
    #define MAXSIZE 500009
    /* Maximum number of letters kept per word; longer words are truncated. */
    #define MAXS 10
    /* Minimum number of letters a word must have to be counted. */
    #define MINS 3
    /* Left-shift amount applied per character in Hash(). */
    #define MAXB 5
    
    /* Buffer for one word: up to MAXS letters plus the terminating NUL. */
    typedef char Element[MAXS+1];
    /* Node of an inverted-index list: one file number in which a word occurs. */
    typedef struct FileEntry *FList;
    struct FileEntry{
        short FileNo;
        FList Next;
    };
    /*
     * Entry of the per-file word table.
     * In the array element for a file (the list header), `words` is the number
     * of distinct words recorded for that file. In the list nodes hanging off
     * the header, `words` is the hash-table position of one word.
     */
    typedef struct WordEntry *WList;
    struct WordEntry{
        int words;
        WList Next;
    };
    
    /*
     * One cell of the hash table.
     * FileNo is 0 while the cell is empty; otherwise it holds the number of the
     * file that most recently inserted this word (used to skip repeated words
     * within the same file). InvIndex is the list of files containing the word.
     */
    struct HashEntry{
        short FileNo;
        Element Word;
        FList InvIndex;
    };
    /* Hash table: an array of TableSize cells. */
    typedef struct HashTbl *HashTable;
    struct HashTbl{
        int TableSize;
        struct HashEntry *TheCells;
    };
    
    HashTable InitialHashTable(int size);// initialize the hash table
    WList CreateWordList(int size);// initialize the per-file word table
    int Hash(Element Key,int P);// hash function
    int Find(HashTable H,Element Key);// locate the storage position of a word
    int FindAndInsert(HashTable H,Element Key,int FileNo);// insert into the hash table (prepend to the file list)
    void FileInsert(WList File,int Pos,int FileNo);// insert into a file's word list (prepend)
    int GetWord(Element Word);// read the next word from input
    double CalSim(HashTable H,WList File,int F1,int F2);// percentage of common words among the total distinct words of two files
    
    /*
     * Program entry point.
     *
     * Reads the number of files N, then the words of files 1..N (each file ends
     * when GetWord() returns 0), building the hash table and the per-file word
     * lists. Then reads M queries, each a pair of file numbers, and prints the
     * similarity of each pair as a percentage with one decimal place.
     *
     * Returns 0 on success, 1 if the file count cannot be read or the tables
     * cannot be allocated.
     */
    int main(void)
    {
        int i, N, M, F1, F2;
        HashTable H;
        WList File;
        Element Word;

        if (scanf("%d", &N) != 1 || N < 0)
            return 1;

        H = InitialHashTable(MAXSIZE);
        File = CreateWordList(N + 1);   /* files are numbered from 1, so N+1 slots */
        if (H == NULL || File == NULL)
            return 1;

        for (i = 1; i <= N; i++) {
            /* FindAndInsert yields -1 for a word already seen in file i;
               FileInsert ignores negative positions. */
            while (GetWord(Word))
                FileInsert(File, FindAndInsert(H, Word, i), i);
        }

        if (scanf("%d", &M) != 1)
            return 0;
        for (i = 0; i < M; i++) {
            if (scanf("%d%d", &F1, &F2) != 2)
                break;
            /* Skip queries that would index outside the file table. */
            if (F1 < 1 || F1 > N || F2 < 1 || F2 > N)
                continue;
            printf("%.1lf%%\n", CalSim(H, File, F1, F2));
        }
        return 0;
    }
    
    HashTable InitialHashTable(int size)
    {
        HashTable H=malloc(sizeof(struct HashTbl));
        H->TheCells=malloc(sizeof(struct HashEntry)*size);
        H->TableSize=size;
        while(size)
        {
            H->TheCells[--size].InvIndex=NULL;
            H->TheCells[size].FileNo=0;
        }
        return H;
    }
    
    WList CreateWordList(int size)
    {
        WList F=malloc(sizeof(struct WordEntry)*size);
        while(size)
        {
            F[--size].words=0;
            F[size].Next=NULL;
        }
        return F;
    }
    
    /*
     * Hash a NUL-terminated word into the range [0, P).
     *
     * Each character contributes its offset from 'a' after the running value
     * is shifted left by MAXB bits; unsigned arithmetic makes overflow wrap.
     * P must be non-zero.
     */
    int Hash(Element Key,int P)
    {
        unsigned int h = 0;

        while (*Key != '\0')
            h = (h << MAXB) + (*Key++ - 'a');
        return h % P;
    }
    
    /*
     * Locate the cell for Key using linear probing.
     *
     * Starts at Hash(Key) and steps forward, wrapping at the end of the table,
     * until it reaches either an empty cell (FileNo == 0) or a cell that already
     * stores Key. Returns that cell's index.
     */
    int Find(HashTable H,Element Key)
    {
        struct HashEntry *cells = H->TheCells;
        int slot = Hash(Key, H->TableSize);

        for (;;) {
            if (cells[slot].FileNo == 0)
                break;                          /* empty cell: Key is not stored */
            if (strcmp(cells[slot].Word, Key) == 0)
                break;                          /* Key found */
            slot++;
            if (slot == H->TableSize)
                slot = 0;                       /* wrap around */
        }
        return slot;
    }
    /*
     * Record that word Key occurs in file FileNo.
     *
     * If the cell for Key was last touched by this same file number, nothing
     * is changed and -1 is returned. Otherwise the word is stored (when the
     * cell was empty), the cell is stamped with FileNo, a new node for FileNo
     * is prepended to the cell's file list, and the cell index is returned.
     */
    int FindAndInsert(HashTable H,Element Key,int FileNo)
    {
        int slot = Find(H, Key);
        struct HashEntry *cell = &H->TheCells[slot];
        FList head;

        if (cell->FileNo == FileNo)
            return -1;                  /* already recorded for this file */

        if (cell->FileNo == 0)
            strcpy(cell->Word, Key);    /* first time this word is seen */
        cell->FileNo = FileNo;

        head = malloc(sizeof(struct FileEntry));
        head->FileNo = FileNo;
        head->Next = cell->InvIndex;
        cell->InvIndex = head;
        return slot;
    }
    /*
     * Prepend hash-table position Pos to the word list of file FileNo and
     * increment that file's word count. A negative Pos is ignored.
     */
    void FileInsert(WList File,int Pos,int FileNo)
    {
        if (Pos >= 0) {
            WList header = &File[FileNo];
            WList entry = malloc(sizeof(struct WordEntry));

            entry->words = Pos;
            entry->Next = header->Next;
            header->Next = entry;
            header->words += 1;
        }
    }
    
    /*
     * Read the next word from standard input into Word.
     *
     * A word is a maximal run of letters. It is stored lower-cased, only its
     * first MAXS letters are kept, and runs shorter than MINS letters are
     * skipped. Reading stops at '#' (end of the current file) or at end of
     * input.
     *
     * Returns 1 when a word was stored (NUL-terminated) in Word, 0 when '#'
     * or end of input was reached first.
     */
    int GetWord(Element Word)
    {
        for (;;) {
            int c = getchar();  /* int, so EOF is distinguishable and safe for <ctype.h> */
            int p = 0;

            /* Skip separators until a letter, the terminator, or end of input. */
            while (c != EOF && c != '#' && !isalpha(c))
                c = getchar();
            if (c == EOF || c == '#')
                return 0;

            /* Consume the whole run of letters, keeping at most MAXS of them. */
            while (c != EOF && isalpha(c)) {
                if (p < MAXS)
                    Word[p++] = (char)tolower(c);
                c = getchar();
            }

            /* Do not swallow a terminator that directly follows the word. */
            if (c == '#')
                ungetc(c, stdin);

            if (p >= MINS) {
                Word[p] = '\0';
                return 1;
            }
            /* Too short: loop and look for the next word. */
        }
    }
    
    /*
     * Compute the similarity of files F1 and F2 as a percentage:
     * 100 * (words common to both) / (distinct words in either).
     *
     * The word list of the file with fewer words is walked; for each word, the
     * word's file list in the hash table is searched for the other file.
     * Returns 0.0 when neither file has any words (avoids dividing 0 by 0).
     */
    double CalSim(HashTable H,WList File,int F1,int F2)
    {
        int common = 0;
        int total;
        WList W;

        /* Make F1 the file with the smaller vocabulary so the outer loop is shorter. */
        if (File[F1].words > File[F2].words) {
            int tmp = F1;
            F1 = F2;
            F2 = tmp;
        }

        for (W = File[F1].Next; W; W = W->Next) {
            FList F = H->TheCells[W->words].InvIndex;

            while (F && F->FileNo != F2)
                F = F->Next;
            if (F)
                common++;
        }

        total = File[F1].words + File[F2].words - common;
        if (total == 0)
            return 0.0;
        return (double)(common * 100) / (double)total;
    }
  • 相关阅读:
    flask_日期和时间
    使用SQLAlchemy对博客文章进行分页
    P2725 邮票 Stamps
    P2679 子串
    P3396 哈希冲突
    P1754 球迷购票问题
    P1504 积木城堡
    P1244 青蛙过河
    CSP-S 2019 考试分析
    2019.11.11 模拟赛 T2 乘积求和
  • 原文地址:https://www.cnblogs.com/snzhong/p/12662503.html
Copyright © 2020-2023  润新知