[SPARK-4912][SQL] Persistent tables for the Spark SQL data sources api #3960
Changes from all commits
@@ -20,10 +20,11 @@ package org.apache.spark.sql.hive
import java.io.IOException
import java.util.{List => JList}

import com.google.common.cache.{LoadingCache, CacheLoader, CacheBuilder}

import org.apache.hadoop.util.ReflectionUtils
import org.apache.hadoop.hive.metastore.TableType
import org.apache.hadoop.hive.metastore.api.FieldSchema
import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition}
import org.apache.hadoop.hive.metastore.api.{Table => TTable, Partition => TPartition, FieldSchema}
import org.apache.hadoop.hive.ql.metadata.{Hive, Partition, Table, HiveException}
import org.apache.hadoop.hive.ql.metadata.InvalidTableException
import org.apache.hadoop.hive.ql.plan.CreateTableDesc

@@ -39,6 +40,7 @@ import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.sources.{LogicalRelation, ResolvedDataSource}
import org.apache.spark.util.Utils

/* Implicit conversions */

@@ -50,8 +52,76 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
  /** Connection to hive metastore. Usages should lock on `this`. */
  protected[hive] val client = Hive.get(hive.hiveconf)

  // TODO: Use this everywhere instead of tuples or databaseName, tableName,.
  /** A fully qualified identifier for a table (i.e., database.tableName) */
  case class QualifiedTableName(database: String, name: String) {
Review comment: This doesn't really match the rest of the API any more now that we have the concept of a
    def toLowerCase = QualifiedTableName(database.toLowerCase, name.toLowerCase)
  }

  /** A cache of Spark SQL data source tables that have been accessed. */
  protected[hive] val cachedDataSourceTables: LoadingCache[QualifiedTableName, LogicalPlan] = {
    val cacheLoader = new CacheLoader[QualifiedTableName, LogicalPlan]() {
      override def load(in: QualifiedTableName): LogicalPlan = {
        logDebug(s"Creating new cached data source for $in")
        val table = client.getTable(in.database, in.name)
        val schemaString = table.getProperty("spark.sql.sources.schema")
        val userSpecifiedSchema =
          if (schemaString == null) {
            None
          } else {
            Some(DataType.fromJson(schemaString).asInstanceOf[StructType])
          }
        // It does not appear that the ql client for the metastore has a way to enumerate all the
        // SerDe properties directly...
        val options = table.getTTable.getSd.getSerdeInfo.getParameters.toMap

        val resolvedRelation =
          ResolvedDataSource(
            hive,
            userSpecifiedSchema,
            table.getProperty("spark.sql.sources.provider"),
            options)

        LogicalRelation(resolvedRelation.relation)
      }
    }

    CacheBuilder.newBuilder().maximumSize(1000).build(cacheLoader)
  }

  def refreshTable(databaseName: String, tableName: String): Unit = {
    cachedDataSourceTables.refresh(QualifiedTableName(databaseName, tableName).toLowerCase)
  }

  def invalidateTable(databaseName: String, tableName: String): Unit = {
Review comment: These should probably also take
    cachedDataSourceTables.invalidate(QualifiedTableName(databaseName, tableName).toLowerCase)
  }

  val caseSensitive: Boolean = false

  def createDataSourceTable(
      tableName: String,
      userSpecifiedSchema: Option[StructType],
      provider: String,
      options: Map[String, String]) = {
    val (dbName, tblName) = processDatabaseAndTableName("default", tableName)
    val tbl = new Table(dbName, tblName)

    tbl.setProperty("spark.sql.sources.provider", provider)
    if (userSpecifiedSchema.isDefined) {
      tbl.setProperty("spark.sql.sources.schema", userSpecifiedSchema.get.json)
    }
    options.foreach { case (key, value) => tbl.setSerdeParam(key, value) }

    tbl.setProperty("EXTERNAL", "TRUE")
    tbl.setTableType(TableType.EXTERNAL_TABLE)

    // create the table
    synchronized {
      client.createTable(tbl, false)
    }
  }

  def tableExists(tableIdentifier: Seq[String]): Boolean = {
    val tableIdent = processTableIdentifier(tableIdentifier)
    val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse(

@@ -72,7 +142,10 @@ private[hive] class HiveMetastoreCatalog(hive: HiveContext) extends Catalog with
      hive.sessionState.getCurrentDatabase)
    val tblName = tableIdent.last
    val table = client.getTable(databaseName, tblName)
    if (table.isView) {

    if (table.getProperty("spark.sql.sources.provider") != null) {
      cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase)
    } else if (table.isView) {
      // if the unresolved relation is from hive view
      // parse the text into logic node.
      HiveQl.createPlanForView(table, alias)
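
For readers unfamiliar with Guava's `LoadingCache`, the caching pattern behind `cachedDataSourceTables` boils down to the self-contained sketch below. The key type and the size limit of 1000 mirror the patch, while the string value is only a stand-in for the `LogicalPlan` the real loader builds from the metastore properties.

```scala
import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}

object DataSourceTableCacheSketch {
  case class QualifiedTableName(database: String, name: String) {
    def toLowerCase: QualifiedTableName =
      QualifiedTableName(database.toLowerCase, name.toLowerCase)
  }

  // The loader runs on first access for a key; refresh() re-runs it in place,
  // and invalidate() drops the entry so the next access loads it again.
  val cache: LoadingCache[QualifiedTableName, String] = {
    val loader = new CacheLoader[QualifiedTableName, String]() {
      override def load(key: QualifiedTableName): String =
        s"plan for ${key.database}.${key.name}" // stand-in for building a LogicalPlan
    }
    CacheBuilder.newBuilder().maximumSize(1000).build(loader)
  }

  def main(args: Array[String]): Unit = {
    val key = QualifiedTableName("Default", "Events").toLowerCase
    println(cache.get(key)) // first access: load() is called and the result cached
    cache.refresh(key)      // what refreshTable does: eagerly reload the entry
    cache.invalidate(key)   // what invalidateTable does: drop the entry entirely
  }
}
```

The `refreshTable` and `invalidateTable` methods in the diff map directly onto `refresh` and `invalidate` here, which keeps repeated `lookupRelation` calls cheap while still allowing external changes to be picked up.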
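Taken together, `createDataSourceTable` records the provider and optional schema as table properties (`spark.sql.sources.provider`, `spark.sql.sources.schema`) plus the options as SerDe parameters, and the `lookupRelation` change reads them back through `ResolvedDataSource`. A rough end-to-end sketch of how this might be exercised is below; the table name, provider class, and `path` option are illustrative assumptions rather than something taken from this diff.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Hypothetical usage sketch; assumes Parquet data exists at the given path.
object PersistentTableSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sketch").setMaster("local"))
    val hiveContext = new HiveContext(sc)

    // Persist a data source table: the provider and options end up in the metastore
    // (spark.sql.sources.provider plus SerDe parameters); the schema can be inferred.
    hiveContext.sql(
      """CREATE TABLE persisted_events
        |USING org.apache.spark.sql.parquet
        |OPTIONS (path '/data/events.parquet')""".stripMargin)

    // A later session can resolve the table purely from that stored metadata.
    hiveContext.sql("SELECT COUNT(*) FROM persisted_events").collect().foreach(println)

    sc.stop()
  }
}
```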
Review discussion:

Can we avoid making this a strategy?

It was originally in `CommandStrategy`. I was trying to find a good place for these, but I did not find a suitable Strategy. Any suggestion?

@scwf Actually, I think it is better to put all rules for the data source API in the same place.

@yhuai, I mean that since CreateTableUsing and CreateTempTableUsing are commands, we'd better make them follow the strategy; I will try this.

Actually, I am not sure we should put them in BasicOperators. We cannot just create a `RunnableCommand` in `ddl.scala`, since `SQLContext` does not allow persistent tables and we need to throw the error in `SparkStrategies`. Also, I feel the code is clearer when we put everything related to the data source API together.

@yhuai, I wrote a draft version for this, can you have a look? (https://github.com/scwf/spark/compare/apache:master...scwf:createDataSourceTable?expand=1)

The reason we put `case r: RunnableCommand => ExecutedCommand(r)` in BasicOperators is that we don't need to make a new strategy for only one rule. And after we refactored the command implementation in Spark SQL, newly added commands should follow `RunnableCommand` where possible, so we can avoid adding a new strategy for each new command. /cc @marmbrus
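
To make the pattern under discussion concrete: planning a `RunnableCommand` into an `ExecutedCommand` boils down to a single rule, shown below as a self-contained toy model. The types are simplified stand-ins, not Spark SQL's real classes.

```scala
// Toy model of the planning rule debated above: a logical RunnableCommand is
// wrapped in a physical ExecutedCommand node that runs it on execution.
object CommandStrategySketch {
  trait LogicalPlan
  trait SparkPlan { def execute(): Unit }

  trait RunnableCommand extends LogicalPlan { def run(): Unit }
  case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan {
    def execute(): Unit = cmd.run()
  }

  // The single rule: any runnable command is planned as an ExecutedCommand.
  def plan(p: LogicalPlan): Seq[SparkPlan] = p match {
    case r: RunnableCommand => ExecutedCommand(r) :: Nil
    case _ => Nil
  }

  def main(args: Array[String]): Unit = {
    val createTable = new RunnableCommand { def run(): Unit = println("creating table") }
    plan(createTable).foreach(_.execute())
  }
}
```

Whether that one rule lives in BasicOperators or in a dedicated data source strategy is exactly the trade-off the reviewers weigh above.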