• Spark之GraphX的Graph_scala学习


    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *    http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
    
    package org.apache.spark.graphx
    
    import scala.language.implicitConversions
    import scala.reflect.ClassTag
    
    import org.apache.spark.graphx.impl._
    import org.apache.spark.rdd.RDD
    import org.apache.spark.storage.StorageLevel
    
    
    /**
     * The Graph abstractly represents a graph with arbitrary objects
     * associated with vertices and edges.  The graph provides basic
     * operations to access and manipulate the data associated with
     * vertices and edges as well as the underlying structure.  Like Spark
     * RDDs, the graph is a functional data-structure in which mutating
     * operations return new graphs.
     *
     * @note [[GraphOps]] contains additional convenience operations and graph algorithms.
     *
     * @tparam VD the vertex attribute type
     * @tparam ED the edge attribute type
     */
    abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializable {
    
      /**
       * An RDD containing the vertices and their associated attributes.
       *
       * @note vertex ids are unique.
       * @return an RDD containing the vertices in this graph
       */
      @transient val vertices: VertexRDD[VD]
    
      /**
       * An RDD containing the edges and their associated attributes.  The entries in the RDD contain
       * just the source id and target id along with the edge data.
       *
       * @return an RDD containing the edges in this graph
       *
       * @see [[Edge]] for the edge type.
       * @see [[triplets]] to get an RDD which contains all the edges
       * along with their vertex data.
       *
       */
      @transient val edges: EdgeRDD[ED, VD]
    
      /**
       * An RDD containing the edge triplets, which are edges along with the vertex data associated with
       * the adjacent vertices. The caller should use [[edges]] if the vertex data are not needed, i.e.
       * if only the edge data and adjacent vertex ids are needed.
       *
       * @return an RDD containing edge triplets
       *
       * @example This operation might be used to evaluate a graph
       * coloring where we would like to check that both vertices are a
       * different color.
       * {{{
       * type Color = Int
       * val graph: Graph[Color, Int] = GraphLoader.edgeListFile("hdfs://file.tsv")
       * val numInvalid = graph.triplets.map(e => if (e.src.data == e.dst.data) 1 else 0).sum
       * }}}
       *
       * NOTE(review): the `src.data` / `dst.data` accessors used in the snippet are not defined in
       * this file and may predate the current [[EdgeTriplet]] API -- confirm the member names
       * before copying the example.
       */
      @transient val triplets: RDD[EdgeTriplet[VD, ED]]
    
      /**
       * Caches the vertices and edges associated with this graph at the specified storage level,
       * ignoring any target storage levels previously set.
       *
       * @param newLevel the level at which to cache the graph; defaults to `MEMORY_ONLY`.
       *
       * @return A reference to this graph for convenience.
       */
      def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED]
    
      /**
       * Caches the vertices and edges associated with this graph at the previously-specified target
       * storage levels, which default to `MEMORY_ONLY`. This is used to pin a graph in memory enabling
       * multiple queries to reuse the same construction process.
       */
      def cache(): Graph[VD, ED]
    
      /**
       * Uncaches only the vertices of this graph, leaving the edges alone. This is useful in iterative
       * algorithms that modify the vertex attributes but reuse the edges. This method can be used to
       * uncache the vertex attributes of previous iterations once they are no longer needed, improving
       * GC performance.
       *
       * @param blocking defaults to `true`. NOTE(review): presumably controls whether the call waits
       * until the vertices have actually been uncached -- confirm against the implementation.
       */
      def unpersistVertices(blocking: Boolean = true): Graph[VD, ED]
    
      /**
       * Repartitions the edges in the graph according to `partitionStrategy`.
       *
       * @param partitionStrategy the partitioning strategy to use when partitioning the edges
       * in the graph.
       */
      def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED]
    
      /**
       * Repartitions the edges in the graph according to `partitionStrategy`.
       *
       * @param partitionStrategy the partitioning strategy to use when partitioning the edges
       * in the graph.
       * @param numPartitions the number of edge partitions in the new graph.
       */
      def partitionBy(partitionStrategy: PartitionStrategy, numPartitions: Int): Graph[VD, ED]
    
      /**
       * Transforms each vertex attribute in the graph using the map function.
       *
       * @note The new graph has the same structure.  As a consequence the underlying index structures
       * can be reused.
       *
       * @param map the function from a vertex object to a new vertex value
       * @param eq optional implicit evidence that `VD` and `VD2` are the same type; it is `null`
       * when no such evidence is available.  NOTE(review): presumably lets implementations
       * special-case the same-type mapping -- confirm against the implementation.
       *
       * @tparam VD2 the new vertex data type
       *
       * @example We might use this operation to change the vertex values
       * from one type to another to initialize an algorithm.
       * {{{
       * val rawGraph: Graph[Unit, Unit] = someLoadFunction()
       * val root = 42
       * var bfsGraph = rawGraph.mapVertices[Int]((vid, data) => if (vid == root) 0 else Int.MaxValue)
       * }}}
       *
       */
      def mapVertices[VD2: ClassTag](map: (VertexId, VD) => VD2)
        (implicit eq: VD =:= VD2 = null): Graph[VD2, ED]
    
      /**
       * Transforms each edge attribute in the graph using the map function.  The map function is not
       * passed the vertex value for the vertices adjacent to the edge.  If vertex values are desired,
       * use `mapTriplets`.
       *
       * @note This graph is not changed, and the new graph has the
       * same structure.  As a consequence the underlying index structures
       * can be reused.
       *
       * @param map the function from an edge object to a new edge value.
       *
       * @tparam ED2 the new edge data type
       *
       * @example This function might be used to initialize edge
       * attributes.
       *
       */
      def mapEdges[ED2: ClassTag](map: Edge[ED] => ED2): Graph[VD, ED2] = {
        // Delegate to the partition-wise overload; the partition id is not needed here.
        mapEdges((pid, iter) => iter.map(map))
      }
    
      /**
       * Transforms each edge attribute using the map function, passing it a whole partition at a
       * time. The map function is given an iterator over edges within a logical partition as well as
       * the partition's ID, and it should return a new iterator over the new values of each edge. The
       * new iterator's elements must correspond one-to-one with the old iterator's elements. If
       * adjacent vertex values are desired, use `mapTriplets`.
       *
       * @note This does not change the structure of the
       * graph or modify the values of this graph.  As a consequence
       * the underlying index structures can be reused.
       *
       * @param map a function that takes a partition id and an iterator
       * over all the edges in the partition, and must return an iterator over
       * the new values for each edge in the order of the input iterator
       *
       * @tparam ED2 the new edge data type
       *
       */
      def mapEdges[ED2: ClassTag](map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2])
        : Graph[VD, ED2]
    
      /**
       * Transforms each edge attribute using the map function, passing it the adjacent vertex
       * attributes as well. If adjacent vertex values are not required,
       * consider using `mapEdges` instead.
       *
       * @note This does not change the structure of the
       * graph or modify the values of this graph.  As a consequence
       * the underlying index structures can be reused.
       *
       * @param map the function from an edge triplet to a new edge value.
       *
       * @tparam ED2 the new edge data type
       *
       * @example This function might be used to initialize edge
       * attributes based on the attributes associated with each vertex.
       * {{{
       * val rawGraph: Graph[Int, Int] = someLoadFunction()
       * val graph = rawGraph.mapTriplets[Int]( edge =>
       *   edge.src.data - edge.dst.data)
       * }}}
       *
       * NOTE(review): the `src.data` / `dst.data` accessors in the snippet are not defined in this
       * file -- confirm the member names against [[EdgeTriplet]].
       *
       */
      def mapTriplets[ED2: ClassTag](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] = {
        // Delegate to the partition-wise overload; the partition id is not needed here.
        mapTriplets((pid, iter) => iter.map(map))
      }
    
      /**
       * Transforms each edge attribute a partition at a time using the map function, passing it the
       * adjacent vertex attributes as well. The map function is given an iterator over edge triplets
       * within a logical partition and should yield a new iterator over the new values of each edge in
       * the order in which they are provided.  If adjacent vertex values are not required, consider
       * using `mapEdges` instead.
       *
       * @note This does not change the structure of the
       * graph or modify the values of this graph.  As a consequence
       * the underlying index structures can be reused.
       *
       * @param map the iterator transform; it receives the partition id together with the
       * iterator over that partition's edge triplets
       *
       * @tparam ED2 the new edge data type
       *
       */
      def mapTriplets[ED2: ClassTag](map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2])
        : Graph[VD, ED2]
    
      /**
       * Reverses all edges in the graph.  If this graph contains an edge from a to b then the returned
       * graph contains an edge from b to a.
       */
      def reverse: Graph[VD, ED]
    
      /**
       * Restricts the graph to only the vertices and edges satisfying the predicates. The resulting
       * subgraph satisfies
       *
       * {{{
       * V' = {v : for all v in V where vpred(v)}
       * E' = {(u,v): for all (u,v) in E where epred((u,v)) && vpred(u) && vpred(v)}
       * }}}
       *
       * @param epred the edge predicate, which takes a triplet and
       * evaluates to true if the edge is to remain in the subgraph.  Note
       * that only edges where both vertices satisfy the vertex
       * predicate are considered.  Defaults to a predicate that accepts every edge.
       *
       * @param vpred the vertex predicate, which takes a vertex object and
       * evaluates to true if the vertex is to be included in the subgraph.
       * Defaults to a predicate that accepts every vertex.
       *
       * @return the subgraph containing only the vertices and edges that
       * satisfy the predicates
       */
      def subgraph(
          epred: EdgeTriplet[VD,ED] => Boolean = (x => true),
          vpred: (VertexId, VD) => Boolean = ((v, d) => true))
        : Graph[VD, ED]
    
      /**
       * Restricts the graph to only the vertices and edges that are also in `other`, but keeps the
       * attributes from this graph.
       * @param other the graph to project this graph onto
       * @return a graph with vertices and edges that exist in both the current graph and `other`,
       * with vertex and edge data from the current graph
       */
      def mask[VD2: ClassTag, ED2: ClassTag](other: Graph[VD2, ED2]): Graph[VD, ED]
    
      /**
       * Merges multiple edges between two vertices into a single edge. For correct results, the graph
       * must have been partitioned using [[partitionBy]].
       *
       * @param merge the user-supplied commutative associative function to merge edge attributes
       *              for duplicate edges.
       *
       * @return The resulting graph with a single edge for each (source, dest) vertex pair.
       */
      def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED]
    
      /**
       * Aggregates values from the neighboring edges and vertices of each vertex.  The user supplied
       * `mapFunc` function is invoked on each edge of the graph, generating 0 or more "messages" to be
       * "sent" to either vertex in the edge.  The `reduceFunc` is then used to combine the output of
       * the map phase destined to each vertex.
       *
       * @tparam A the type of "message" to be sent to each vertex
       *
       * @param mapFunc the user defined map function which returns 0 or
       * more messages to neighboring vertices
       *
       * @param reduceFunc the user defined reduce function which should
       * be commutative and associative and is used to combine the output
       * of the map phase
       *
       * @param activeSetOpt optionally, a set of "active" vertices and a direction of edges to
       * consider when running `mapFunc`. If the direction is `In`, `mapFunc` will only be run on
       * edges with destination in the active set.  If the direction is `Out`,
       * `mapFunc` will only be run on edges originating from vertices in the active set. If the
       * direction is `Either`, `mapFunc` will be run on edges with *either* vertex in the active
       * set. If the direction is `Both`, `mapFunc` will be run on edges with *both* vertices in the
       * active set. The active set must have the same index as the graph's vertices.
       *
       * @example We can use this function to compute the in-degree of each
       * vertex
       * {{{
       * val rawGraph: Graph[Unit, Unit] = someLoadFunction()
       * val inDeg: RDD[(VertexId, Int)] =
       *   rawGraph.mapReduceTriplets[Int](et => Iterator((et.dst.id, 1)), _ + _)
       * }}}
       *
       * NOTE(review): the `dst.id` accessor in the snippet is not defined in this file -- confirm
       * the member name against [[EdgeTriplet]].
       *
       * @note By expressing computation at the edge level we achieve
       * maximum parallelism.  This is one of the core functions in the
       * Graph API that enables neighborhood level computation. For
       * example this function can be used to count neighbors satisfying a
       * predicate or implement PageRank.
       *
       */
      def mapReduceTriplets[A: ClassTag](
          mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)],
          reduceFunc: (A, A) => A,
          activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None)
        : VertexRDD[A]
    
      /**
       * Joins the vertices with entries in the `other` RDD and merges the results using `mapFunc`.  The
       * input table should contain at most one entry for each vertex.  If no entry in `other` is
       * provided for a particular vertex in the graph, the map function receives `None`.
       *
       * @tparam U the type of entry in the table of updates
       * @tparam VD2 the new vertex value type
       *
       * @param other the table to join with the vertices in the graph.
       *              The table should contain at most one entry for each vertex.
       * @param mapFunc the function used to compute the new vertex values.
       *                The map function is invoked for all vertices, even those
       *                that do not have a corresponding entry in the table.
       * @param eq optional implicit evidence that `VD` and `VD2` are the same type; it is `null`
       *           when no such evidence is available.
       *
       * @example This function is used to update the vertices with new values based on external data.
       *          For example we could add the out-degree to each vertex record:
       *
       * {{{
       * val rawGraph: Graph[_, _] = someLoadFunction()
       * val outDeg: RDD[(VertexId, Int)] = rawGraph.outDegrees
       * val graph = rawGraph.outerJoinVertices(outDeg) {
       *   (vid, data, optDeg) => optDeg.getOrElse(0)
       * }
       * }}}
       */
      def outerJoinVertices[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)])
          (mapFunc: (VertexId, VD, Option[U]) => VD2)(implicit eq: VD =:= VD2 = null)
        : Graph[VD2, ED]
    
      /**
       * The associated [[GraphOps]] object.
       */
      // Save a copy of the GraphOps object so there is always one unique GraphOps object
      // for a given Graph object, and thus the lazy vals in GraphOps would work as intended.
      // Note that `this` is handed to GraphOps while the Graph is still being constructed.
      val ops = new GraphOps(this)
    } // end of Graph
    
    
    /**
     * Companion of [[Graph]]: factory methods that build graphs from RDDs, plus the implicit
     * bridge from a graph to its [[GraphOps]].
     */
    object Graph {
    
      /**
       * Builds a graph from edges supplied as plain (source id, destination id) pairs.
       *
       * Every pair becomes an edge whose attribute starts at 1.  When a [[PartitionStrategy]] is
       * supplied through `uniqueEdges`, the graph is repartitioned with it and edges between the
       * same pair of vertices are merged by summing their attributes; otherwise duplicates stay
       * as separate edges.
       *
       * @param rawEdges the edges, each in (src, dst) form
       * @param defaultValue the attribute given to the vertices referenced by the edges
       * @param uniqueEdges the partition strategy to use for merging identical edges, or `None`
       * to leave duplicate edges untouched
       * @param edgeStorageLevel the desired storage level at which to cache the edges if necessary
       * @param vertexStorageLevel the desired storage level at which to cache the vertices if necessary
       *
       * @return a graph whose edge attributes hold the number of duplicate edges (or 1 when
       * `uniqueEdges` is `None`) and whose vertices are created with `defaultValue`
       */
      def fromEdgeTuples[VD: ClassTag](
          rawEdges: RDD[(VertexId, VertexId)],
          defaultValue: VD,
          uniqueEdges: Option[PartitionStrategy] = None,
          edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
          vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, Int] =
      {
        // Each raw pair starts out as an edge with a count of one.
        val countedEdges = rawEdges.map(pair => Edge(pair._1, pair._2, 1))
        val baseGraph: Graph[VD, Int] =
          GraphImpl(countedEdges, defaultValue, edgeStorageLevel, vertexStorageLevel)
        // Merge duplicates only when the caller supplied a partition strategy.
        uniqueEdges
          .map(strategy => baseGraph.partitionBy(strategy).groupEdges(_ + _))
          .getOrElse(baseGraph)
      }
    
      /**
       * Builds a graph from an RDD of edges; every vertex mentioned by an edge receives
       * `defaultValue` as its attribute.
       *
       * @param edges the RDD holding the edges of the graph
       * @param defaultValue the attribute assigned to each vertex
       * @param edgeStorageLevel the desired storage level at which to cache the edges if necessary
       * @param vertexStorageLevel the desired storage level at which to cache the vertices if necessary
       *
       * @return a graph whose edges are those in `edges` and whose vertices are all the
       *         vertices referenced by `edges`, each carrying `defaultValue`
       */
      def fromEdges[VD: ClassTag, ED: ClassTag](
          edges: RDD[Edge[ED]],
          defaultValue: VD,
          edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
          vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] =
        GraphImpl(edges, defaultValue, edgeStorageLevel, vertexStorageLevel)
    
      /**
       * Builds a graph from explicit vertex and edge collections.  When a vertex appears more
       * than once, one of its entries is picked arbitrarily; vertices that occur only in the
       * edge collection are given the default attribute.
       *
       * @tparam VD the vertex attribute type
       * @tparam ED the edge attribute type
       * @param vertices the "set" of vertices together with their attributes
       * @param edges the edges of the graph
       * @param defaultVertexAttr the attribute for vertices that are referenced by an edge but
       *                          missing from `vertices`
       * @param edgeStorageLevel the desired storage level at which to cache the edges if necessary
       * @param vertexStorageLevel the desired storage level at which to cache the vertices if necessary
       */
      def apply[VD: ClassTag, ED: ClassTag](
          vertices: RDD[(VertexId, VD)],
          edges: RDD[Edge[ED]],
          defaultVertexAttr: VD = null.asInstanceOf[VD],
          edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
          vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] =
        GraphImpl(vertices, edges, defaultVertexAttr, edgeStorageLevel, vertexStorageLevel)
    
      /**
       * Implicit view from a graph to its [[GraphOps]] member.
       *
       * [[Graph]] itself exposes only a small core of operations; the convenience operations
       * live in [[GraphOps]], which can be shared by multiple graph implementations.
       */
      implicit def graphToGraphOps[VD: ClassTag, ED: ClassTag](
          g: Graph[VD, ED]): GraphOps[VD, ED] = g.ops
    } // end of Graph object
  • 相关阅读:
    phpStudy for Linux (lnmp+lamp一键安装包)
    eq,neq,gt,lt等表达式缩写
    lnmp环境的使用
    lnmp环境的搭建
    箭头函数中的this
    Vue中实现路由懒加载及组件懒加载
    Vue项目中实现路由按需加载(路由懒加载)的3种方式:
    判断数据类型的方式以及各自的优缺点
    最近工作中踩的坑
    7种方法实现CSS左侧固定,右侧自适应布局
  • 原文地址:https://www.cnblogs.com/likai198981/p/4166815.html
Copyright © 2020-2023  润新知