The simple script below uses Spark/Scala to compute the connected components of a graph, then writes the resulting node-to-component assignments to a Hive table.
Command Line – execute Spark job calling Scala file
spark-shell --conf "spark.ui.port=1081" --driver-memory 20G --executor-memory 20G -i graph_test.scala
<graph_test.scala>
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.GraphLoader
import org.apache.spark.SparkContext
// Load an edge-list file (one "srcId dstId" pair per line) into a GraphX graph.
// NOTE: the original used typographic quotes (”…”), which do not compile in
// Scala — all string literals below use plain ASCII quotes.
val graph = GraphLoader.edgeListFile(sc, "/tmp/graph_edges_small.csv")

// Run connected components ONCE and reuse the result. The original recomputed
// graph.connectedComponents() a second time for the DataFrame conversion —
// an expensive iterative job executed twice for no benefit.
// `cc` is a VertexRDD of (vertexId, componentId) pairs.
val cc = graph.connectedComponents().vertices

// Convert the (vertexId, componentId) pairs to a DataFrame.
// spark-shell pre-imports spark.implicits._, which provides .toDF().
val df = cc.toDF()

// Persist the component assignments to the Hive table, replacing any
// existing contents.
df.write.mode("overwrite").saveAsTable("mr2_raw.graph")

// Print the result of connected components.
// WARNING: collect() pulls the full RDD onto the driver — acceptable for
// this small sample graph; avoid on large inputs.
println(cc.collect().mkString("\n"))

// Quit spark-shell
:q