• Spark Distributed Matrix 分布式矩阵


    RowMatrix行矩阵

    import org.apache.spark.rdd.RDD
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.linalg.distributed.RowMatrix
    
    // Build a small 3x3 sample DataFrame with columns c1, c2 and c3.
    // NOTE(review): `toDF` on a Seq presumably relies on the implicits that spark-shell
    // pre-imports — confirm before running this outside the shell.
    val df1 = Seq(
         |       (1.0, 2.0, 3.0),
         |       (1.1, 2.1, 3.1),
         |       (1.2, 2.2, 3.2)).toDF("c1", "c2", "c3")
    df1: org.apache.spark.sql.DataFrame = [c1: double, c2: double ... 1 more field]
    
    // Print the DataFrame to verify its contents.
    df1.show
    +---+---+---+
    | c1| c2| c3|
    +---+---+---+
    |1.0|2.0|3.0|
    |1.1|2.1|3.1|
    |1.2|2.2|3.2|
    +---+---+---+
                           
    // Convert the DataFrame into an RDD[Vector], one dense vector per row.
    // The columns are built from Double literals (schema: double), so read them with the
    // typed Row.getDouble getter instead of round-tripping every cell through a String.
    val rowsVector = df1.rdd.map {
         |       x =>
         |         Vectors.dense(
         |           x.getDouble(0),
         |           x.getDouble(1),
         |           x.getDouble(2))
         |     }
    rowsVector: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[4] at map
    
    // Create a RowMatrix from an RDD[Vector]; each vector becomes one row of the matrix.
    val mat1: RowMatrix = new RowMatrix(rowsVector)
    mat1: org.apache.spark.mllib.linalg.distributed.RowMatrix = org.apache.spark.mllib.linalg.distributed.RowMatrix@7ba821ef
    
    // Get its size: number of rows (returned as a Long).
    val m = mat1.numRows()
    m: Long = 3                                                                     
    
    // Number of columns of the matrix (returned as a Long).
    val n = mat1.numCols()
    n: Long = 3
    
    // Convert the RowMatrix back into a DataFrame.
    // Vector elements are already Double, so no per-element conversion is needed.
    // The fixed indices 0..2 match the 3 columns of this sample matrix.
    val resDF = mat1.rows.map {
         |       x =>
         |         (x(0), x(1), x(2))
         |     }.toDF("c1", "c2", "c3")
    resDF: org.apache.spark.sql.DataFrame = [c1: double, c2: double ... 1 more field]
    
    resDF.show
    +---+---+---+
    | c1| c2| c3|
    +---+---+---+
    |1.0|2.0|3.0|
    |1.1|2.1|3.1|
    |1.2|2.2|3.2|
    +---+---+---+
    
    
    // Preview at most 10 rows. RDD.take(10) fetches only the rows it needs, whereas
    // collect().take(10) would first pull the entire matrix onto the driver.
    mat1.rows.take(10)
    res3: Array[org.apache.spark.mllib.linalg.Vector] = Array([1.0,2.0,3.0], [1.1,2.1,3.1], [1.2,2.2,3.2])
    

    CoordinateMatrix坐标矩阵

    import org.apache.spark.rdd.RDD
    import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
    
    // Column 1: row index; column 2: column index; column 3: matrix element value.
    // The 12 tuples describe a dense 4x3 matrix in coordinate (row, col, value) form.
    val df = Seq(
         |       (0, 0, 1.1), (0, 1, 1.2), (0, 2, 1.3),
         |       (1, 0, 2.1), (1, 1, 2.2), (1, 2, 2.3),
         |       (2, 0, 3.1), (2, 1, 3.2), (2, 2, 3.3),
         |       (3, 0, 4.1), (3, 1, 4.2), (3, 2, 4.3)).toDF("row", "col", "value")
    df: org.apache.spark.sql.DataFrame = [row: int, col: int ... 1 more field]
    
    // Print the DataFrame to verify its contents.
    df.show
    +---+---+-----+
    |row|col|value|
    +---+---+-----+
    |  0|  0|  1.1|
    |  0|  1|  1.2|
    |  0|  2|  1.3|
    |  1|  0|  2.1|
    |  1|  1|  2.2|
    |  1|  2|  2.3|
    |  2|  0|  3.1|
    |  2|  1|  3.2|
    |  2|  2|  3.3|
    |  3|  0|  4.1|
    |  3|  1|  4.2|
    |  3|  2|  4.3|
    +---+---+-----+
    
    // Build an RDD of MatrixEntry(row, col, value) from the DataFrame rows.
    // `row` and `col` are int columns and `value` holds Double literals, so use the typed
    // Row getters instead of parsing each cell's toString; MatrixEntry takes Long indices,
    // hence the widening .toLong.
    val entr = df.rdd.map { x =>
         |       MatrixEntry(x.getInt(0).toLong, x.getInt(1).toLong, x.getDouble(2))
         |     }
    entr: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.distributed.MatrixEntry] = MapPartitionsRDD[20] at map
    
    // Build the coordinate matrix from the RDD of entries.
    val mat: CoordinateMatrix = new CoordinateMatrix(entr)
    mat: org.apache.spark.mllib.linalg.distributed.CoordinateMatrix = org.apache.spark.mllib.linalg.distributed.CoordinateMatrix@5381deec
    
    // Matrix dimensions (both returned as Long).
    mat.numRows()
    res5: Long = 4
    
    mat.numCols()
    res6: Long = 3
    
    // Preview at most 10 entries. RDD.take(10) fetches only what it needs, whereas
    // collect().take(10) would first pull every entry onto the driver.
    mat.entries.take(10)
    res7: Array[org.apache.spark.mllib.linalg.distributed.MatrixEntry] = Array(MatrixEntry(0,0,1.1), MatrixEntry(0,1,1.2), MatrixEntry(0,2,1.3), MatrixEntry(1,0,2.1), MatrixEntry(1,1,2.2), MatrixEntry(1,2,2.3), MatrixEntry(2,0,3.1), MatrixEntry(2,1,3.2), MatrixEntry(2,2,3.3), MatrixEntry(3,0,4.1))
    
    // Convert the coordinate matrix into a DataFrame that keeps the row index:
    // the first tuple field (_1) is the matrix row coordinate, the rest are the row's values.
    // Vector elements are already Double, so no per-element conversion is needed.
    // The fixed indices 0..2 match the 3 columns of this sample matrix.
    val t = mat.toIndexedRowMatrix().rows.map { x =>
         |       val v = x.vector
         |       (x.index, v(0), v(1), v(2))
         |     }
    t: org.apache.spark.rdd.RDD[(Long, Double, Double, Double)] = MapPartitionsRDD[33] at map
    
    t.toDF().show
    +---+---+---+---+
    | _1| _2| _3| _4|
    +---+---+---+---+
    |  0|1.1|1.2|1.3|
    |  1|2.1|2.2|2.3|
    |  2|3.1|3.2|3.3|
    |  3|4.1|4.2|4.3|
    +---+---+---+---+
    
    // Convert the coordinate matrix into a DataFrame without row indices.
    // toRowMatrix() discards the row index, and the rows do not come back in index order
    // (note 3.1 before 2.1 in the output below); use toIndexedRowMatrix() when row order matters.
    // Vector elements are already Double, so no per-element conversion is needed.
    val t1 = mat.toRowMatrix().rows.map { x =>
         |       (x(0), x(1), x(2))
         |     }
    t1: org.apache.spark.rdd.RDD[(Double, Double, Double)] = MapPartitionsRDD[26] at map
    
    t1.toDF().show
    +---+---+---+
    | _1| _2| _3|
    +---+---+---+
    |1.1|1.2|1.3|
    |3.1|3.2|3.3|
    |2.1|2.2|2.3|
    |4.1|4.2|4.3|
    +---+---+---+
    
  • 相关阅读:
    Java内部类
    Java创建对象的初始化顺序
    Java多态与动态绑定
    Java访问修饰符
    Django框架学习----视图与模板(详情页的上下篇文章跳转跳转)
    Django框架学习----视图与模板(首页与详情页的跳转)
    Django框架学习----视图与模板(显示数据库数据到页面)
    Django框架学习----视图与模板(网站页面设计)
    Django框架学习----模型层
    Logstash同步mysql数据库信息到ES
  • 原文地址:https://www.cnblogs.com/wwxbi/p/6815685.html
Copyright © 2020-2023  润新知