• segmenter_worker.go


    package engine

    import (
        "github.com/huichen/wukong/types"
    )

    // segmenterRequest is the message that segmenterWorker receives from
    // engine.segmenterChannel.
    type segmenterRequest struct {
        // docId identifies the document. segmenterWorker treats a value of 0 as
        // a control message that carries no document to index.
        docId       uint64
        // hash is passed to engine.getShard to pick the shard that receives the
        // document.
        hash        uint32
        // data holds the document payload; segmenterWorker reads its Content,
        // Tokens, Labels and Fields.
        data        types.DocumentIndexData
        // forceUpdate is copied into the indexer request for the document, and
        // when true segmenterWorker additionally sends forceUpdate-only requests
        // to the indexer shards.
        forceUpdate bool
    }

    func (engine *Engine) segmenterWorker() {
        for {
            request := <-engine.segmenterChannel
            if request.docId == 0 {
                if request.forceUpdate {
                    for i := 0; i < engine.initOptions.NumShards; i++ {
                        engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
                    }
                }
                continue
            }

            shard := engine.getShard(request.hash)
            tokensMap := make(map[string][]int)
            numTokens := 0
            if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
                // 当文档正文不为空时,优先从内容分词中得到关键词
                segments := engine.segmenter.Segment([]byte(request.data.Content))
                for _, segment := range segments {
                    token := segment.Token().Text()
                    if !engine.stopTokens.IsStopToken(token) {
                        tokensMap[token] = append(tokensMap[token], segment.Start())
                    }
                }
                numTokens = len(segments)
            } else {
                // 否则载入用户输入的关键词
                for _, t := range request.data.Tokens {
                    if !engine.stopTokens.IsStopToken(t.Text) {
                        tokensMap[t.Text] = t.Locations
                    }
                }
                numTokens = len(request.data.Tokens)
            }

            // 加入非分词的文档标签
            for _, label := range request.data.Labels {
                if !engine.initOptions.NotUsingSegmenter {
                    if !engine.stopTokens.IsStopToken(label) {
                        //当正文中已存在关键字时,若不判断,位置信息将会丢失
                        if _, ok := tokensMap[label]; !ok {
                            tokensMap[label] = []int{}
                        }
                    }
                } else {
                    //当正文中已存在关键字时,若不判断,位置信息将会丢失
                    if _, ok := tokensMap[label]; !ok {
                        tokensMap[label] = []int{}
                    }
                }
            }

            indexerRequest := indexerAddDocumentRequest{
                document: &types.DocumentIndex{
                    DocId:       request.docId,
                    TokenLength: float32(numTokens),
                    Keywords:    make([]types.KeywordIndex, len(tokensMap)),
                },
                forceUpdate: request.forceUpdate,
            }
            iTokens := 0
            for k, v := range tokensMap {
                indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
                    Text: k,
                    // 非分词标注的词频设置为0,不参与tf-idf计算
                    Frequency: float32(len(v)),
                    Starts:    v}
                iTokens++
            }

            engine.indexerAddDocChannels[shard] <- indexerRequest
            if request.forceUpdate {
                for i := 0; i < engine.initOptions.NumShards; i++ {
                    if i == shard {
                        continue
                    }
                    engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
                }
            }
            rankerRequest := rankerAddDocRequest{
                docId: request.docId, fields: request.data.Fields}
            engine.rankerAddDocChannels[shard] <- rankerRequest
        }
    }

  • 相关阅读:
    Jenkins的插件管理(安装和更新插件)
    [Flutter] MacOS/Windows Flutter 环境走一遍
    [Sw] 使用 Swoole Server task/协程 处理大数据量异步任务时注意
    [Sw] Swoole-4.2.9 可以尝试愉快应用 Swoole 协程
    [PHP] 常备的现代 PHP 项目开发准备
    [SF] Symfony 标准 HttpFoundationRequest 实现分析
    [Linux] umask 从三类人群的权限中拿走权限数字
    [Design] 后端程序的高并发与异步
    [Linux]系统管理: 进程管理(ps/top/pstree/kill/pkill), 工作管理, 系统资源查看, 系统定时任务
    [FE] 有效开展一个前端项目-V2 (vuejs-templates/webpack)
  • 原文地址:https://www.cnblogs.com/zhangboyu/p/7461678.html
Copyright © 2020-2023  润新知