• solr dataimport 数据导入源码分析 补充


    上一部分的代码还可以进一步优化:主要思路是构建 Collection<SolrInputDocument> 集合,分批次提交文档,从而提高新增索引的速度。

    其实分页方式也是分批次提交的,不过这种方式更优雅。

    参考如下代码 

    import java.io.IOException;
    import java.net.MalformedURLException;
    import java.sql.ResultSet;
    import java.sql.ResultSetMetaData;
    import java.sql.SQLException;
    import java.sql.Types;
    import java.util.ArrayList;
    import java.util.Collection;

    import org.apache.solr.client.solrj.SolrServerException;
    import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
    import org.apache.solr.common.SolrInputDocument;

    public class Test
    {
        private static int fetchSize = 1000;
        private static String url = "http://localhost:8983/solr/core1/";
        private static CommonsHttpSolrServer solrCore;

        public Test() throws MalformedURLException
        {
             solrCore = new CommonsHttpSolrServer(url);
        }

        /**
         * Takes an SQL ResultSet and adds the documents to solr. Does it in batches
         * of fetchSize.
         * 
         * 
    @param rs
         *            A ResultSet from the database.
         * 
    @return The number of documents added to solr.
         * 
    @throws SQLException
         * 
    @throws SolrServerException
         * 
    @throws IOException
         
    */
        public long addResultSet(ResultSet rs) throws SQLException,
                SolrServerException, IOException
        {
            long count = 0;
            int innerCount = 0;
            Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
            ResultSetMetaData rsm = rs.getMetaData();
            int numColumns = rsm.getColumnCount();
            String[] colNames = new String[numColumns + 1];

            /**
             * JDBC numbers the columns starting at 1, so the normal java convention
             * of starting at zero won't work.
             
    */
            for (int i = 1; i < (numColumns + 1); i++)
            {
                colNames[i] = rsm.getColumnName(i);
                /**
                 * If there are fields that you want to handle manually, check for
                 * them here and change that entry in colNames to null. This will
                 * cause the loop in the next section to skip that database column.
                 
    */
                // //Example:
                
    // if (rsm.getColumnName(i) == "db_id")
                
    // {
                
    // colNames[i] = null;
                
    // }
            }

            while (rs.next())
            {
                count++;
                innerCount++;

                SolrInputDocument doc = new SolrInputDocument();

                /**
                 * At this point, take care of manual document field assignments for
                 * which you previously assigned the colNames entry to null.
                 
    */
                // //Example:
                
    // doc.addField("solr_db_id", rs.getLong("db_id"));

                for (int j = 1; j < (numColumns + 1); j++)
                {
                    if (colNames[j] != null)
                    {
                        Object f;
                        switch (rsm.getColumnType(j))
                        {
                            case Types.BIGINT:
                            {
                                f = rs.getLong(j);
                                break;
                            }
                            case Types.INTEGER:
                            {
                                f = rs.getInt(j);
                                break;
                            }
                            case Types.DATE:
                            {
                                f = rs.getDate(j);
                                break;
                            }
                            case Types.FLOAT:
                            {
                                f = rs.getFloat(j);
                                break;
                            }
                            case Types.DOUBLE:
                            {
                                f = rs.getDouble(j);
                                break;
                            }
                            case Types.TIME:
                            {
                                f = rs.getDate(j);
                                break;
                            }
                            case Types.BOOLEAN:
                            {
                                f = rs.getBoolean(j);
                                break;
                            }
                            default:
                            {
                                f = rs.getString(j);
                            }
                        }
                        doc.addField(colNames[j], f);
                    }
                }
                docs.add(doc);

                /**
                 * When we reach fetchSize, index the documents and reset the inner
                 * counter.
                 
    */
                if (innerCount == fetchSize)
                {
                    solrCore.add(docs);
                    docs.clear();
                    innerCount = 0;
                }
            }

            /**
             * If the outer loop ended before the inner loop reset, index the
             * remaining documents.
             
    */
            if (innerCount != 0)
            {
                solrCore.add(docs);
            }
            return count;
        }
    }
  • 相关阅读:
    【论文阅读】Transformer及其在计算机视觉领域上的应用
    【学习笔记】asyncio的使用
    【论文阅读】Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization
    【论文阅读】Bag of Tricks for Image Classification with Convolutional Neural Networks
    【论文阅读】Bag of Tricks and A Strong Baseline for Deep Person Re-identification
    【论文阅读】主动学习 (Active Learning)
    可能有点用的东西
    .vimrc
    莫比乌斯反演 学习笔记
    对拍
  • 原文地址:https://www.cnblogs.com/chenying99/p/2678175.html
Copyright © 2020-2023  润新知