
Commit 13b1de7

Merge pull request #30 in BACIBBD/spline from feature/SL-43-new-data-model to release/0.2
* commit '9c5cd9d43b8866d7be9646b8a8c66b8195107a7f':
  SL-62 + scala doc
  SL-62 Fix REST serialization by switching from Salat JSON to Json4s impl
  SL-61 Redo loading indicator
  SL-61 Redo nodes highlighting
  SL-61 Redo attribute selection
  SL-61 Redo full schema view
  SL-60 - Moving reference to salat-core from api to mongo module.
  SL-60 - Fixing binary incompatibility of json4s-native between hdfs and atlas persistence layers.
  SL-59 Updating comments of Dataset converter.
  SL-59 Fixing Atlas persistence layer
  SL-61 Angular/material: Replace 'MD' suffixes to 'MAT' according to the recent change in the library (see angular/components#7241)
  SL-61 Redo graph visualization + fix types + fix server side unit tests
  SL-59 Adding comment parameters
  SL-59 Refactoring Atlas persistence layer according to the latest Spline data model.
  SL-57 Refactoring harvester according to the latest Spline data model
  SL-61 Start updating Web UI layer according to the new data model
  Fix SL-58 compilation errors
  SL-59 Renaming AtlasDataLineagePersistor to AtlasDataLineageWriter
  SL-58 Changing property name according to the naming convention for ids (yyyID -> yyyId)
  SL-58 Updating Mongo persistence layer according to the new data model
  SL-43 Data Set Oriented Data Model
2 parents 87f349c + 9c5cd9d commit 13b1de7

149 files changed (+9439 / -100401 lines)


commons/pom.xml

Lines changed: 4 additions & 0 deletions
@@ -33,6 +33,10 @@
       <groupId>commons-configuration</groupId>
       <artifactId>commons-configuration</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-reflect</artifactId>
+    </dependency>
   </dependencies>

 </project>
Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
/*
 * Copyright 2017 Barclays Africa Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.spline.common

import scala.reflect.runtime.{universe => ru}

/**
  * Reflection utils
  */
object ReflectionUtils {

  private val mirror: ru.Mirror = ru.runtimeMirror(getClass.getClassLoader)

  /**
    * Lists all direct sub-classes of the given trait T
    *
    * @tparam T sealed trait type
    * @return List of Class[_] instances
    */
  def subClassesOf[T: ru.TypeTag]: List[Class[_]] = {
    val clazz: ru.ClassSymbol = ru.typeOf[T].typeSymbol.asClass
    require(clazz.isTrait && clazz.isSealed)
    clazz.knownDirectSubclasses.toList map ((s: ru.Symbol) => mirror runtimeClass s.asClass)
  }
}
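For reference, a minimal usage sketch of the new utility; the sealed trait and its subclasses below are made up for illustration and are not part of this commit:

import za.co.absa.spline.common.ReflectionUtils

// Hypothetical sealed hierarchy, only to demonstrate ReflectionUtils.subClassesOf
sealed trait Shape
case class Circle(radius: Double) extends Shape
case class Square(side: Double) extends Shape

object ReflectionUtilsExample extends App {
  // Both case classes are direct subclasses of the sealed trait Shape,
  // so their runtime classes are expected in the result.
  val subclasses: List[Class[_]] = ReflectionUtils.subClassesOf[Shape]
  println(subclasses.map(_.getSimpleName)) // e.g. List(Circle, Square)
}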
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
/*
 * Copyright 2017 Barclays Africa Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.spline.core

import java.util.UUID
import java.util.UUID.randomUUID

import org.apache.spark.sql
import za.co.absa.spline.model.Attribute

import scala.collection.mutable.{Map, ListBuffer}

/**
  * The class is responsible for creation of [[za.co.absa.spline.model.Attribute attributes]] and assigning them unique identifiers.
  */
class AttributeFactory extends DataTypeMapper {
  private val mapById: Map[UUID, Attribute] = Map()
  private val mapBySparkId: Map[Long, Attribute] = Map()
  private val allAttributes: ListBuffer[Attribute] = ListBuffer[Attribute]()

  /**
    * The method fills up the internal collection with an initial sequence of attributes.
    * @param sparkIds A sequence of unique identifiers provided by Spark
    * @param attributes A sequence of attributes
    */
  def initialize(sparkIds: Seq[Long], attributes: Seq[Attribute]): Unit =
    mapById.synchronized {
      mapById.clear()
      mapBySparkId.clear()
      sparkIds.zip(attributes).foreach {
        case (k, a) =>
          mapBySparkId.put(k, a)
          mapById.put(a.id, a)
          allAttributes += a
      }
    }

  /**
    * The method creates an attribute if it does not exist yet, and returns the identifier of the attribute matching the input criteria.
    * @param sparkAttributeId A unique identifier of the attribute assigned by Spark
    * @param name A name of the attribute
    * @param sparkDataType A Spark data type related to the attribute
    * @param nullable A flag expressing whether the attribute is nullable or not
    * @return A unique identifier of the created attribute
    */
  def getOrCreate(sparkAttributeId: Long, name: String, sparkDataType: sql.types.DataType, nullable: Boolean): UUID =
    mapById.synchronized(
      mapBySparkId.get(sparkAttributeId) match {
        case Some(x) => x.id
        case None =>
          val a = Attribute(randomUUID, name, fromSparkDataType(sparkDataType, nullable))
          mapBySparkId.put(sparkAttributeId, a)
          mapById.put(a.id, a)
          allAttributes += a
          a.id
      }
    )

  /**
    * The method returns an attribute for a specific identifier if it has already been created by the factory. Otherwise, returns None.
    * @param id An identifier of the attribute
    * @return An option
    */
  def getById(id: UUID): Option[Attribute] = mapById.synchronized(mapById.get(id))

  /**
    * The method returns all attributes created by the factory.
    * @return A sequence of attributes
    */
  def getAll(): Seq[Attribute] = mapById.synchronized(allAttributes.toList)
}
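A rough sketch of the factory's contract, assuming the Attribute model class used in this commit (id, name, data type); the column names and Spark expression ids below are invented:

import org.apache.spark.sql.types.{LongType, StringType}
import za.co.absa.spline.core.AttributeFactory

val attributeFactory = new AttributeFactory()

// Register (or look up) attributes keyed by their Spark expression ids.
val idAttr = attributeFactory.getOrCreate(1L, "id", LongType, nullable = false)
val nameAttr = attributeFactory.getOrCreate(2L, "name", StringType, nullable = true)

// Repeating the call with the same Spark id returns the same UUID instead of creating a duplicate.
assert(attributeFactory.getOrCreate(1L, "id", LongType, nullable = false) == idAttr)
assert(attributeFactory.getAll().size == 2)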

core/src/main/scala/za/co/absa/spline/core/DataLineageHarvester.scala

Lines changed: 25 additions & 17 deletions
@@ -19,17 +19,19 @@ package za.co.absa.spline.core
 import java.util.UUID
 
 import org.apache.hadoop.conf.Configuration
-import za.co.absa.spline.model.{DataLineage, OperationNode}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.QueryExecution
 import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand
 import za.co.absa.spline.common.transformations.TransformationPipeline
-import za.co.absa.spline.core.transformations.ProjectionMerger
+import za.co.absa.spline.core.transformations.{ProjectionMerger, ReferenceConsolidator}
+import za.co.absa.spline.model.DataLineage
+import za.co.absa.spline.model.op.Operation
 
 import scala.collection.mutable
+import scala.language.postfixOps
 
-/** The object is responsible for gathering lineage information from Spark internal structures (logical plan, physical plan, etc.) */
-object DataLineageHarvester {
+/** The class is responsible for gathering lineage information from Spark internal structures (logical plan, physical plan, etc.) */
+class DataLineageHarvester(hadoopConfiguration: Configuration) {
 
   val transformationPipeline = new TransformationPipeline(Seq(ProjectionMerger))
 
@@ -38,18 +40,29 @@
    * @param queryExecution An instance holding Spark internal structures (logical plan, physical plan, etc.)
    * @return A lineage representation
    */
-  def harvestLineage(queryExecution: QueryExecution, hadoopConfiguration: Configuration): DataLineage = {
-    val nodes = harvestOperationNodes(queryExecution.analyzed, hadoopConfiguration)
+  def harvestLineage(queryExecution: QueryExecution): DataLineage = {
+    val attributeFactory = new AttributeFactory()
+    val metaDatasetFactory = new MetaDatasetFactory(attributeFactory)
+    val operationNodeBuilderFactory = new OperationNodeBuilderFactory(hadoopConfiguration, metaDatasetFactory)
+    val nodes = harvestOperationNodes(queryExecution.analyzed, operationNodeBuilderFactory)
     val transformedNodes = transformationPipeline.apply(nodes)
 
-    DataLineage(
+    val sparkContext = queryExecution.sparkSession.sparkContext
+
+    val lineage = DataLineage(
       UUID.randomUUID,
-      queryExecution.sparkSession.sparkContext.appName,
-      transformedNodes
+      sparkContext.applicationId,
+      sparkContext.appName,
+      System.currentTimeMillis(),
+      transformedNodes,
+      metaDatasetFactory.getAll(),
+      attributeFactory.getAll()
    )
+
+    ReferenceConsolidator(lineage)
   }
 
-  private def harvestOperationNodes(logicalPlan: LogicalPlan, hadoopConfiguration: Configuration): Seq[OperationNode] = {
+  private def harvestOperationNodes(logicalPlan: LogicalPlan, operationNodeBuilderFactory: OperationNodeBuilderFactory): Seq[Operation] = {
     val result = mutable.ArrayBuffer[OperationNodeBuilder[_]]()
     val stack = mutable.Stack[(LogicalPlan, Int)]((logicalPlan, -1))
     val visitedNodes = mutable.Map[LogicalPlan, Int]()
@@ -60,7 +73,7 @@
       val currentNode: OperationNodeBuilder[_] = currentPosition match {
         case Some(pos) => result(pos)
         case None =>
-          val newNode = OperationNodeBuilderFactory.create(currentOperation, hadoopConfiguration)
+          val newNode = operationNodeBuilderFactory.create(currentOperation)
           visitedNodes += (currentOperation -> result.size)
           currentPosition = Some(result.size)
           result += newNode
@@ -73,14 +86,9 @@
 
       if (parentPosition >= 0) {
         val parent = result(parentPosition)
-        parent.childRefs += currentPosition.get
-        currentNode.output foreach (parent.input +=)
-        currentNode.parentRefs += parentPosition
-
+        parent.inputMetaDatasets += currentNode.outputMetaDataset
       }
     }
     result.map(i => i.build())
   }
-
-
 }
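A rough sketch of how the reworked harvester could be driven directly; the SparkSession and DataFrame are illustrative, and in the real flow the QueryExecution arrives via the listener below:

import org.apache.spark.sql.SparkSession
import za.co.absa.spline.core.DataLineageHarvester

val spark = SparkSession.builder().appName("lineage-demo").master("local[*]").getOrCreate()
val df = spark.range(10).toDF("id")

// The harvester is now constructed with a Hadoop configuration instead of taking it per call,
// and pulls everything else (application id, app name, analyzed plan) from the QueryExecution itself.
val harvester = new DataLineageHarvester(spark.sparkContext.hadoopConfiguration)
val lineage = harvester.harvestLineage(df.queryExecution)
println(lineage)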

core/src/main/scala/za/co/absa/spline/core/DataLineageListener.scala

Lines changed: 9 additions & 21 deletions
@@ -16,24 +16,20 @@
 
 package za.co.absa.spline.core
 
-import java.util.UUID
-
 import org.apache.hadoop.conf.Configuration
-import za.co.absa.spline.model.Execution
-import za.co.absa.spline.persistence.api.PersistenceFactory
 import org.apache.spark.sql.execution.QueryExecution
 import org.apache.spark.sql.util.QueryExecutionListener
-import za.co.absa.spline.common.FutureImplicits._
-import scala.concurrent.Future
+import za.co.absa.spline.persistence.api.PersistenceWriterFactory
 
 /**
   * The class represents a handler listening on events that Spark triggers when an execution any action is performed. It can be considered as an entry point to Spline library.
   *
-  * @param dataStorageFactory A factory of persistence layers
+  * @param persistenceWriterFactory A factory of persistence writers
+  * @param hadoopConfiguration A hadoop configuration
   */
-class DataLineageListener(dataStorageFactory: PersistenceFactory, hadoopConfiguration: Configuration) extends QueryExecutionListener {
-  private lazy val dataLineagePersistor = dataStorageFactory.createDataLineagePersistor()
-  private lazy val executionPersistor = dataStorageFactory.createExecutionPersistor()
+class DataLineageListener(persistenceWriterFactory: PersistenceWriterFactory, hadoopConfiguration: Configuration) extends QueryExecutionListener {
+  private lazy val persistenceWriter = persistenceWriterFactory.createDataLineageWriter()
+  private lazy val harvester = new DataLineageHarvester(hadoopConfiguration)
 
   /**
     * The method is executed when an action execution is successful.
@@ -59,16 +55,8 @@
 
   private def processQueryExecution(funcName: String, qe: QueryExecution): Unit = {
     if (funcName == "save") {
-      val lineage = DataLineageHarvester.harvestLineage(qe, hadoopConfiguration)
-      dataLineagePersistor.exists(lineage).flatMap( lineageIdOption =>
-        lineageIdOption match
-        {
-          case None => dataLineagePersistor.store(lineage).map(_ => lineage.id)
-          case Some(x) => Future.successful(x)
-        }).flatMap(lineageId => {
-        val execution = Execution(UUID.randomUUID(), lineageId, qe.sparkSession.sparkContext.applicationId, System.currentTimeMillis())
-        executionPersistor.store(execution)
-      })
-    }
+      val lineage = harvester harvestLineage qe
+      persistenceWriter store lineage
+    }
   }
 }
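For context, a listener like this is registered on the session's listener manager; a minimal sketch, assuming some concrete PersistenceWriterFactory implementation is available on the classpath (the writerFactory value is a placeholder, not an API introduced by this commit):

import org.apache.spark.sql.SparkSession
import za.co.absa.spline.core.DataLineageListener
import za.co.absa.spline.persistence.api.PersistenceWriterFactory

val spark = SparkSession.builder().appName("spline-demo").getOrCreate()

// Placeholder: obtain a concrete factory from whichever persistence module is in use (e.g. Mongo or Atlas).
val writerFactory: PersistenceWriterFactory = ???

// Once registered, the listener harvests and stores lineage whenever a DataFrame "save" action completes.
spark.listenerManager.register(
  new DataLineageListener(writerFactory, spark.sparkContext.hadoopConfiguration))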

core/src/main/scala/za/co/absa/spline/core/DataTypeMapper.scala

Lines changed: 5 additions & 6 deletions
@@ -16,9 +16,8 @@
 
 package za.co.absa.spline.core
 
-import za.co.absa.spline.model.StructField
-import za.co.absa.spline.model
 import org.apache.spark.sql
+import za.co.absa.spline.model.dt._
 
 /**
   * The trait represents a mapper translating Spark data types to data types specified by Spline library.
@@ -32,9 +31,9 @@ trait DataTypeMapper {
    * @param nullable A flag specifying whether result data type will be nullable or not
    * @return A Spline data type
    */
-  def fromSparkDataType(sparkDataType: sql.types.DataType, nullable: Boolean): model.DataType = sparkDataType match {
-    case s: sql.types.StructType => model.StructType(s.fields.map(i => StructField(i.name, fromSparkDataType(i.dataType, i.nullable))), nullable)
-    case a: sql.types.ArrayType => model.ArrayType(fromSparkDataType(a.elementType, a.containsNull), nullable)
-    case x => model.SimpleType(x.typeName, nullable)
+  def fromSparkDataType(sparkDataType: sql.types.DataType, nullable: Boolean): DataType = sparkDataType match {
+    case s: sql.types.StructType => Struct(s.fields.map(i => StructField(i.name, fromSparkDataType(i.dataType, i.nullable))), nullable)
+    case a: sql.types.ArrayType => Array(fromSparkDataType(a.elementType, a.containsNull), nullable)
+    case x => Simple(x.typeName, nullable)
   }
 }
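To illustrate the recursion on a nested schema (the schema below is made up; the resulting Struct/Array/Simple values follow the constructors used in the diff above):

import org.apache.spark.sql.types._
import za.co.absa.spline.core.DataTypeMapper

// A hypothetical nested Spark schema: a numeric id plus an optional list of e-mail addresses.
val sparkSchema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("emails", ArrayType(StringType, containsNull = true), nullable = true)))

// DataTypeMapper has no abstract members, so an anonymous instance suffices here.
val mapper = new DataTypeMapper {}

// Descends through the struct and the array, bottoming out in Simple("long", ...) and Simple("string", ...).
val splineType = mapper.fromSparkDataType(sparkSchema, nullable = false)
println(splineType)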

core/src/main/scala/za/co/absa/spline/core/ExpressionMapper.scala

Lines changed: 11 additions & 7 deletions
@@ -16,25 +16,29 @@
 
 package za.co.absa.spline.core
 
-import za.co.absa.spline.model._
 import org.apache.spark.sql.catalyst.expressions
+import za.co.absa.spline.model._
+
+import scala.language.implicitConversions
 
 /**
   * The trait represents a mapper translating Spark expressions to expressions specified by Spline library.
   */
 trait ExpressionMapper extends DataTypeMapper {
 
+  val attributeFactory: AttributeFactory
+
   /**
     * The method translates a Spark expression to an expression specified by Spline library.
     *
     * @param sparkExpr An input Spark expression
     * @return A Spline expression
     */
-  implicit def fromSparkExpression(sparkExpr: org.apache.spark.sql.catalyst.expressions.Expression): Expression = sparkExpr match {
-    case a: expressions.Alias => AliasExpression(a.name, a.simpleString, fromSparkDataType(a.dataType, a.nullable), a.children map fromSparkExpression)
-    case a: expressions.AttributeReference => AttributeReference(a.exprId.id, a.name, fromSparkDataType(a.dataType, a.nullable))
-    case bo: expressions.BinaryOperator => BinaryOperator(bo.nodeName, bo.symbol, bo.simpleString, fromSparkDataType(bo.dataType, bo.nullable), bo.children map fromSparkExpression)
-    case u: expressions.ScalaUDF => UserDefinedFunction(u.udfName getOrElse u.function.getClass.getName, u.simpleString, fromSparkDataType(u.dataType, u.nullable), u.children map fromSparkExpression)
-    case x => GenericExpression(x.nodeName, x.simpleString, fromSparkDataType(x.dataType, x.nullable), x.children map fromSparkExpression)
+  implicit def fromSparkExpression(sparkExpr: org.apache.spark.sql.catalyst.expressions.Expression): expr.Expression = sparkExpr match {
+    case a: expressions.Alias => expr.Alias(a.name, a.simpleString, fromSparkDataType(a.dataType, a.nullable), a.children map fromSparkExpression)
+    case a: expressions.AttributeReference => expr.AttributeReference(attributeFactory.getOrCreate(a.exprId.id, a.name, a.dataType, a.nullable), a.name, fromSparkDataType(a.dataType, a.nullable))
+    case bo: expressions.BinaryOperator => expr.Binary(bo.nodeName, bo.symbol, bo.simpleString, fromSparkDataType(bo.dataType, bo.nullable), bo.children map fromSparkExpression)
+    case u: expressions.ScalaUDF => expr.UserDefinedFunction(u.udfName getOrElse u.function.getClass.getName, u.simpleString, fromSparkDataType(u.dataType, u.nullable), u.children map fromSparkExpression)
+    case x => expr.Generic(x.nodeName, x.simpleString, fromSparkDataType(x.dataType, x.nullable), x.children map fromSparkExpression)
   }
 }
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
/*
 * Copyright 2017 Barclays Africa Group Limited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package za.co.absa.spline.core

import java.util.UUID
import java.util.UUID.randomUUID

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import za.co.absa.spline.model.{MetaDataset, Schema}

import scala.collection.mutable.{ListBuffer, Map}

/**
  * The class is responsible for creation of [[za.co.absa.spline.model.MetaDataset meta data sets]] and assigning them unique identifiers.
  * @param attributeFactory An attribute factory
  */
class MetaDatasetFactory(val attributeFactory: AttributeFactory) {
  private val datasets: Map[UUID, MetaDataset] = Map()
  private val allDatasets: ListBuffer[MetaDataset] = ListBuffer[MetaDataset]()

  /**
    * The method creates a meta data set for a specific Spark operation and returns its identifier.
    * @param operation A Spark operation
    * @return An identifier of the created meta data set
    */
  def create(operation: LogicalPlan): UUID = datasets.synchronized {
    val attributeIds = operation.output.map(i => attributeFactory.getOrCreate(i.exprId.id, i.name, i.dataType, i.nullable))
    val metaDataset = MetaDataset(randomUUID, Schema(attributeIds))
    datasets.put(metaDataset.id, metaDataset)
    allDatasets += metaDataset
    metaDataset.id
  }

  /**
    * The method returns a meta data set for a specific identifier if it has already been created by the factory. Otherwise, returns None.
    * @param id An identifier of the meta data set
    * @return An option
    */
  def getById(id: UUID): Option[MetaDataset] = datasets.synchronized(datasets.get(id))

  /**
    * The method returns all meta data sets created by the factory.
    * @return A sequence of meta data sets
    */
  def getAll(): Seq[MetaDataset] = datasets.synchronized(allDatasets)
}
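A quick sketch of how the two factories cooperate; the DataFrame is illustrative only, and in the real flow the logical plan comes from the harvested QueryExecution:

import org.apache.spark.sql.SparkSession
import za.co.absa.spline.core.{AttributeFactory, MetaDatasetFactory}

val spark = SparkSession.builder().appName("factories-demo").master("local[*]").getOrCreate()
val plan = spark.range(5).toDF("id").queryExecution.analyzed

val attributeFactory = new AttributeFactory()
val metaDatasetFactory = new MetaDatasetFactory(attributeFactory)

// Creating a meta data set registers one attribute per output column of the plan.
val datasetId = metaDatasetFactory.create(plan)

println(metaDatasetFactory.getById(datasetId)) // Some(MetaDataset(...)) with a single-attribute schema
println(attributeFactory.getAll())             // the single "id" attribute created above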
