• solr6.6 solrJ索引富文本(word/pdf)文件


      1、文件配置

        在core下面新建lib文件夹,存放相关的jar包,如图所示:

        

        

        修改solrconfig.xml

       

    <lib dir="${solr.install.dir:../../../..}/contrib/extraction/lib" regex=".*\.jar" />
      <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-cell-\d.*\.jar" />
    
      <lib dir="${solr.install.dir:../../../..}/contrib/clustering/lib/" regex=".*\.jar" />
      <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-clustering-\d.*\.jar" />
    
      <lib dir="${solr.install.dir:../../../..}/contrib/langid/lib/" regex=".*\.jar" />
      <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-langid-\d.*\.jar" />
    
      <lib dir="${solr.install.dir:../../../..}/contrib/velocity/lib" regex=".*\.jar" />
      <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-velocity-\d.*\.jar" />
      <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" />
      <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-.*\.jar" />
      <lib dir="./lib" regex=".*\.jar"/>

        增加如下配置(如果solrconfig.xml中已存在该requestHandler,则无需重复添加):

        

     <requestHandler name="/update/extract"
                      startup="lazy"
                      class="solr.extraction.ExtractingRequestHandler" >
        <lst name="defaults">
          <str name="fmap.content">text</str>
          <str name="fmap.meta">ignored_</str>
          <str name="lowernames">true</str>
          <str name="uprefix">attr_</str>
          <str name="captureAttr">true</str>
        </lst>
      </requestHandler>

       配置managed-schema文件:

       

      

      修改managed-schema文件,增加字段:

      <field name="path"      type="string"   indexed="true"  stored="true"  multiValued="false" />
      <field name="pathftype"      type="string"   indexed="true"  stored="true"  multiValued="false" />
      <field name="pathuploaddate"      type="string"   indexed="true"  stored="true"  multiValued="false" />
      <field name="pathsummary"      type="string"   indexed="true"  stored="true"  multiValued="false" />
      <field name="attr_content"      type="text_general"   indexed="true"  stored="true"  multiValued="false" />

      2、Java代码solrj操作(6.6.0版本) 

    import java.io.File;
    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.Date;
    
    import org.apache.solr.client.solrj.SolrClient;
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.client.solrj.request.AbstractUpdateRequest.ACTION;
    import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
    import org.apache.solr.client.solrj.response.QueryResponse;
    
    /**
     * Indexes rich-text documents (PDF, Word, Excel, PowerPoint, plain text) into
     * Solr by posting the file to the {@code /update/extract} request handler
     * through SolrJ.
     *
     * @author sks
     */
    public class solr_pdf {

        /** Base URL of the Solr core that receives the documents. */
        private static final String SOLR_URL = "http://localhost:8983/solr/test";

        /** Returned by {@link #getFileContentType(String)} when the extension is not recognised. */
        private static final String UNKNOWN_CONTENT_TYPE = "othertype";

        /**
         * Demo entry point: indexes one hard-coded PDF file.
         *
         * @param args ignored
         */
        public static void main(String[] args) {
            String path = "D:/work/Solr/ImportData/20160229001cn.pdf";
            String fileName = "20160229001cn.pdf";
            // In this example the file name doubles as the unique Solr document id.
            String solrId = fileName;

            try {
                indexFilesSolrCell(fileName, solrId, path);
            } catch (IOException | SolrServerException e) {
                e.printStackTrace();
            }
        }

        /**
         * Returns today's date formatted as {@code yyyy-MM-dd}.
         *
         * @return the current date, e.g. {@code 2017-12-13}
         */
        private static String getCurrentDate() {
            // Upper-case "MM" is the month; lower-case "mm" would be minutes.
            return new SimpleDateFormat("yyyy-MM-dd").format(new Date());
        }

        /**
         * Returns the text after the last {@code '.'} of a file name.
         *
         * @param fileName file name to inspect
         * @return the extension without the dot, or an empty string when the name
         *         contains no dot
         */
        private static String extensionOf(String fileName) {
            int dot = fileName.lastIndexOf('.');
            return dot < 0 ? "" : fileName.substring(dot + 1);
        }

        /**
         * Sends a file to the {@code /update/extract} handler and commits it.
         *
         * <p>The request carries these literal fields: {@code id}, {@code path},
         * {@code pathuploaddate} (today's date) and {@code pathftype} (the file
         * extension). The extracted body is mapped to {@code attr_content}.
         *
         * @param fileName file name, used to derive the content type and extension
         * @param solrId   unique document id; when {@code null} or empty the file
         *                 name is used instead
         * @param path     location of the file on disk
         * @throws IOException         if the file cannot be read or the client
         *                             cannot be closed
         * @throws SolrServerException if Solr reports an error for the request
         */
        public static void indexFilesSolrCell(String fileName, String solrId, String path)
                throws IOException, SolrServerException {
            // Fall back to the file name so callers that never supplied an id keep working.
            String id = (solrId == null || solrId.isEmpty()) ? fileName : solrId;

            // try-with-resources releases the underlying HTTP client even when the request fails.
            try (SolrClient solr = new HttpSolrClient.Builder(SOLR_URL).build()) {
                ContentStreamUpdateRequest up = new ContentStreamUpdateRequest("/update/extract");
                up.addFile(new File(path), getFileContentType(fileName));

                up.setParam("literal.id", id);
                up.setParam("literal.path", path); // file path
                up.setParam("literal.pathuploaddate", getCurrentDate()); // upload date
                up.setParam("literal.pathftype", extensionOf(fileName)); // file type, e.g. doc, pdf
                up.setParam("fmap.content", "attr_content"); // extracted file content
                // Commit as part of this request.
                up.setAction(ACTION.COMMIT, true, true);
                solr.request(up);
            }
        }

        /**
         * Maps a file name to a content type based on its extension. The extension
         * is compared case-insensitively, so {@code report.PDF} and
         * {@code report.pdf} give the same result.
         *
         * @param filename file name whose extension decides the content type
         * @return the matching MIME type, or {@code "othertype"} when the extension
         *         is missing or not recognised
         */
        public static String getFileContentType(String filename) {
            // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
            String extension = extensionOf(filename).toLowerCase(java.util.Locale.ROOT);
            switch (extension) {
                case "xlsx":
                    return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
                case "pdf":
                    return "application/pdf";
                case "doc":
                    return "application/msword";
                case "txt":
                    return "text/plain";
                case "xls":
                    return "application/vnd.ms-excel";
                case "docx":
                    return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
                case "ppt":
                    return "application/vnd.ms-powerpoint";
                case "pptx":
                    return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
                default:
                    return UNKNOWN_CONTENT_TYPE;
            }
        }
    }
  • 相关阅读:
    final
    职场语句
    故事
    三个关键字
    关于重读字母去掉的代码
    Java书
    docker私库harbor的搭建
    配置允许匿名用户登录访问vsftpd服务,进行文档的上传下载、文档的新建删除等操作
    docker容器内外相互拷贝数据
    docker
  • 原文地址:https://www.cnblogs.com/shaosks/p/8033362.html
Copyright © 2020-2023  润新知