• MongoSpark error 28799


    Exception in thread "main" com.mongodb.MongoCommandException: Command failed with error 28799: 'Received error in response from 192.168.12.161:27018: { $err: "$sample stage could not find a non-duplicate document after 100 while using a random cursor. This is likely a sporadic failure, please try again.", code: 28799 }' on server 192.168.12.161:27017. The full response is { "ok" : 0.0, "errmsg" : "Received error in response from 192.168.12.161:27018: { $err: "$sample stage could not find a non-duplicate document after 100 while using a random cursor. This is likely a sporadic failure, please try again.", code: 28799 }", "code" : 28799, "codeName" : "Location28799" }
        at com.mongodb.connection.ProtocolHelper.getCommandFailureException(ProtocolHelper.java:115)
        at com.mongodb.connection.CommandProtocol.execute(CommandProtocol.java:114)
        at com.mongodb.connection.DefaultServer$DefaultServerProtocolExecutor.execute(DefaultServer.java:168)
        at com.mongodb.connection.DefaultServerConnection.executeProtocol(DefaultServerConnection.java:289)
        at com.mongodb.connection.DefaultServerConnection.command(DefaultServerConnection.java:176)
        at com.mongodb.operation.CommandOperationHelper.executeWrappedCommandProtocol(CommandOperationHelper.java:216)
        at com.mongodb.operation.CommandOperationHelper.executeWrappedCommandProtocol(CommandOperationHelper.java:207)
        at com.mongodb.operation.CommandOperationHelper.executeWrappedCommandProtocol(CommandOperationHelper.java:113)
        at com.mongodb.operation.AggregateOperation$1.call(AggregateOperation.java:257)
        at com.mongodb.operation.AggregateOperation$1.call(AggregateOperation.java:253)
        at com.mongodb.operation.OperationHelper.withConnectionSource(OperationHelper.java:431)
        at com.mongodb.operation.OperationHelper.withConnection(OperationHelper.java:404)
        at com.mongodb.operation.AggregateOperation.execute(AggregateOperation.java:253)
        at com.mongodb.operation.AggregateOperation.execute(AggregateOperation.java:67)
        at com.mongodb.Mongo.execute(Mongo.java:836)
        at com.mongodb.Mongo$2.execute(Mongo.java:823)
        at com.mongodb.OperationIterable.iterator(OperationIterable.java:47)
        at com.mongodb.OperationIterable.forEach(OperationIterable.java:70)
        at com.mongodb.OperationIterable.into(OperationIterable.java:82)
        at com.mongodb.AggregateIterableImpl.into(AggregateIterableImpl.java:143)
        at com.mongodb.spark.rdd.partitioner.MongoSamplePartitioner$$anonfun$8.apply(MongoSamplePartitioner.scala:103)
        at com.mongodb.spark.rdd.partitioner.MongoSamplePartitioner$$anonfun$8.apply(MongoSamplePartitioner.scala:97)
        at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:186)
        at com.mongodb.spark.MongoConnector$$anonfun$withCollectionDo$1.apply(MongoConnector.scala:184)
        at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
        at com.mongodb.spark.MongoConnector$$anonfun$withDatabaseDo$1.apply(MongoConnector.scala:171)
        at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
        at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
        at com.mongodb.spark.MongoConnector.withCollectionDo(MongoConnector.scala:184)
        at com.mongodb.spark.rdd.partitioner.MongoSamplePartitioner.partitions(MongoSamplePartitioner.scala:96)
        at com.mongodb.spark.rdd.partitioner.DefaultMongoPartitioner.partitions(DefaultMongoPartitioner.scala:34)
        at com.mongodb.spark.rdd.MongoRDD.getPartitions(MongoRDD.scala:137)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
        at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
        at scala.Option.getOrElse(Option.scala:121)
        at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
        at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
        at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
        at org.jh.TestSpark$.doTest(DocHandler.scala:17)
        at org.jh.TestSpark$.main(DocHandler.scala:29)
        at org.jh.TestSpark.main(DocHandler.scala)

    The error is shown above and the fix below. From reading the connector source (admittedly not fully understood), the problem is triggered by the following branch in MongoSamplePartitioner:

    if (numDocumentsPerPartition >= count) {
      // One partition is enough to hold every matching document.
      MongoSinglePartitioner.partitions(connector, readConfig, pipeline)
    } else {
      // Sample numberOfSamples documents via a $sample stage and use the sampled
      // partition-key values as right-hand partition boundaries.
      val samples = connector.withCollectionDo(readConfig, {
        coll: MongoCollection[BsonDocument] =>
          coll.aggregate(List(
            Aggregates.`match`(matchQuery),
            Aggregates.sample(numberOfSamples),
            Aggregates.project(Projections.include(partitionKey)),
            Aggregates.sort(Sorts.ascending(partitionKey))
          ).asJava).allowDiskUse(true).into(new util.ArrayList[BsonDocument]()).asScala
      })
      def collectSplit(i: Int): Boolean = (i % samplesPerPartition == 0) || !matchQuery.isEmpty && i == count - 1
      val rightHandBoundaries = samples.zipWithIndex.collect {
        case (field, i) if collectSplit(i) => field.get(partitionKey)
      }
      PartitionerHelper.createPartitions(partitionKey, rightHandBoundaries, PartitionerHelper.locations(connector))
    }
    

      Because numDocumentsPerPartition < count, the else branch runs and issues the $sample aggregation above. The two values involved are computed as:

    // Documents that fit into one partition of partitionSizeInBytes:
    val numDocumentsPerPartition: Int = math.floor(partitionSizeInBytes.toFloat / avgObjSizeInBytes).toInt
    // Total documents requested from the $sample stage:
    val numberOfSamples = math.floor(samplesPerPartition * count / numDocumentsPerPartition.toFloat).toInt

      To avoid the error, numberOfSamples has to come down, which means lowering samplesPerPartition and raising numDocumentsPerPartition. The former is controlled by spark.mongodb.input.partitionerOptions.samplesPerPartition, the latter by increasing spark.mongodb.input.partitionerOptions.partitionSizeMB. Raising partitionSizeMB also pushes numDocumentsPerPartition up, and once it reaches count the else block is skipped entirely. A rough worked example of the arithmetic is sketched below.
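
      The sketch below just replays the two formulas with made-up numbers (a 10,000,000-document collection with 512-byte documents; these figures, the object name SampleCountSketch, and the printed values are illustrative assumptions, not taken from the job above) to show how the two options move numberOfSamples:

    // Standalone sketch of the connector's sampling arithmetic; all inputs are hypothetical.
    object SampleCountSketch {
      val count = 10000000L        // assumed total documents in the collection
      val avgObjSizeInBytes = 512L // assumed average document size

      // Mirrors MongoSamplePartitioner's calculation for a given configuration.
      def numberOfSamples(partitionSizeMB: Int, samplesPerPartition: Int): Int = {
        val partitionSizeInBytes = partitionSizeMB * 1024 * 1024
        val numDocumentsPerPartition = math.floor(partitionSizeInBytes.toFloat / avgObjSizeInBytes).toInt
        math.floor(samplesPerPartition * count / numDocumentsPerPartition.toFloat).toInt
      }

      def main(args: Array[String]): Unit = {
        // Connector defaults (partitionSizeMB = 64, samplesPerPartition = 10): 762 samples requested.
        println(numberOfSamples(partitionSizeMB = 64, samplesPerPartition = 10))
        // Tuned values used in the fix below (128 MB, 1 sample per partition): 38 samples.
        println(numberOfSamples(partitionSizeMB = 128, samplesPerPartition = 1))
      }
    }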

      The fix applied was therefore:

    SparkSession.builder()
      // .master("local")
      .master(sparkURI)
      .config(new SparkConf().setJars(Array(
        s"${hdfsURI}/mongolib/mongo-spark-connector_2.11-2.2.1.jar",
        s"${hdfsURI}/mongolib/bson-3.4.2.jar",
        s"${hdfsURI}/mongolib/mongo-java-driver-3.4.2.jar",
        s"${hdfsURI}/mongolib/mongodb-driver-3.4.2.jar",
        s"${hdfsURI}/mongolib/mongodb-driver-core-3.4.2.jar",
        s"${hdfsURI}/mongolib/commons-io-2.5.jar",
        s"${hdfsURI}/mongolib/config-1.2.1.jar",
        s"${hdfsURI}/${jarName}") ++ extJars))
      .config("spark.cores.max", 80)
      .config("spark.executor.cores", 16)
      .config("spark.executor.memory", "32g")
      .config("spark.mongodb.input.uri", inp)
      .config("spark.mongodb.output.uri", oup)
      // Fewer samples per partition lowers numberOfSamples directly.
      .config("spark.mongodb.input.partitionerOptions.samplesPerPartition", 1)
      // Larger partitions raise numDocumentsPerPartition, lowering numberOfSamples further.
      .config("spark.mongodb.input.partitionerOptions.partitionSizeMB", 128)
      .getOrCreate()
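
      An alternative worth noting (a sketch under assumptions, not what this post did): the connector also lets you choose the partitioner explicitly via spark.mongodb.input.partitioner, so the $sample stage can be avoided altogether by switching away from the default sample-based partitioner, for example to MongoPaginateBySizePartitioner. A minimal sketch, reusing the same sparkURI/inp/oup values and assuming mongo-spark-connector 2.2.x option names:

    // Sketch: sidestep MongoSamplePartitioner (and its $sample stage) entirely by
    // selecting a partitioner that paginates the collection by size instead.
    SparkSession.builder()
      .master(sparkURI)
      .config("spark.mongodb.input.uri", inp)
      .config("spark.mongodb.output.uri", oup)
      .config("spark.mongodb.input.partitioner", "MongoPaginateBySizePartitioner")
      .config("spark.mongodb.input.partitionerOptions.partitionSizeMB", 128)
      .getOrCreate()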
    

      

  • Original post: https://www.cnblogs.com/gaoze/p/8383802.html