From a49ea784639bcedaee61172918627bc4ea2389c7 Mon Sep 17 00:00:00 2001 From: Satya Kommula Date: Mon, 17 Jan 2022 18:35:47 +0530 Subject: [PATCH 1/5] bump spark version "3.0.0" -> "3.2.0" --- build.sbt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 2303e62d..68e31d3c 100644 --- a/build.sbt +++ b/build.sbt @@ -14,7 +14,7 @@ sparkPackageName := "databricks/spark-sql-perf" // All Spark Packages need a license licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) -sparkVersion := "3.0.0" +sparkVersion := "3.2.0" sparkComponents ++= Seq("sql", "hive", "mllib") From 229b5fff9078c8733d9301e58c5d71352904c2df Mon Sep 17 00:00:00 2001 From: Satya Kommula Date: Tue, 3 Dec 2024 15:49:48 +0530 Subject: [PATCH 2/5] Upgrade Spark to version 3.5.1, update dependencies, and replace the Bintray URL --- bin/run | 2 +- build.sbt | 6 +++--- build/sbt-launch-lib.bash | 17 +++++++++-------- project/plugins.sbt | 19 +++++++++++-------- .../databricks/spark/sql/perf/Benchmark.scala | 3 ++- .../spark/sql/perf/Benchmarkable.scala | 8 +++++--- .../com/databricks/spark/sql/perf/Query.scala | 3 ++- .../mllib/MLPipelineStageBenchmarkable.scala | 3 ++- version.sbt | 2 +- 9 files changed, 36 insertions(+), 27 deletions(-) diff --git a/bin/run b/bin/run index 7d28227c..f8923ffc 100755 --- a/bin/run +++ b/bin/run @@ -3,4 +3,4 @@ # runs spark-sql-perf from the current directory ARGS="runBenchmark $@" -build/sbt "$ARGS" \ No newline at end of file +sbt "$ARGS" \ No newline at end of file diff --git a/build.sbt b/build.sbt index 68e31d3c..1a2b09f2 100644 --- a/build.sbt +++ b/build.sbt @@ -5,16 +5,16 @@ name := "spark-sql-perf" organization := "com.databricks" -scalaVersion := "2.12.10" +scalaVersion := "2.12.18" -crossScalaVersions := Seq("2.12.10") +crossScalaVersions := Seq("2.12.18") sparkPackageName := "databricks/spark-sql-perf" // All Spark Packages need a license licenses := Seq("Apache-2.0" -> 
url("http://opensource.org/licenses/Apache-2.0")) -sparkVersion := "3.2.0" +sparkVersion := "3.5.1" sparkComponents ++= Seq("sql", "hive", "mllib") diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash index 2a399365..707f70ef 100755 --- a/build/sbt-launch-lib.bash +++ b/build/sbt-launch-lib.bash @@ -45,9 +45,8 @@ dlog () { acquire_sbt_jar () { SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties` - URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar + URL1=https://github.com/sbt/sbt/releases/download/v${SBT_VERSION}/sbt-${SBT_VERSION}.zip JAR=build/sbt-launch-${SBT_VERSION}.jar - sbt_jar=$JAR if [[ ! -f "$sbt_jar" ]]; then @@ -55,13 +54,15 @@ acquire_sbt_jar () { if [ ! -f "${JAR}" ]; then # Download printf "Attempting to fetch sbt\n" - JAR_DL="${JAR}.part" + COMPLETE_SBT="build/sbt.zip" if [ $(command -v curl) ]; then - curl --fail --location --silent ${URL1} > "${JAR_DL}" &&\ - mv "${JAR_DL}" "${JAR}" + curl --fail --location --silent ${URL1} > "${COMPLETE_SBT}" &&\ + unzip ${COMPLETE_SBT} &&\ + cp "sbt/bin/sbt-launch.jar" "${JAR}" elif [ $(command -v wget) ]; then - wget --quiet ${URL1} -O "${JAR_DL}" &&\ - mv "${JAR_DL}" "${JAR}" + wget --quiet ${URL1} -O "${COMPLETE_SBT}" &&\ + unzip ${COMPLETE_SBT} &&\ + cp "sbt/bin/sbt-launch.jar" "${JAR}" else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" exit -1 @@ -195,4 +196,4 @@ run() { -jar "$sbt_jar" \ "${sbt_commands[@]}" \ "${residual_args[@]}" -} +} \ No newline at end of file diff --git a/project/plugins.sbt b/project/plugins.sbt index d2473b61..c76851f6 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,17 +1,20 @@ // You may use this file to add plugin dependencies for sbt. 
-resolvers += "Spark Packages repo" at "https://repos.spark-packages.org/" +resolvers ++= Seq( + Resolver.mavenLocal, + Resolver.sonatypeRepo("releases"), + "Maven Central" at "https://repo1.maven.org/maven2/", + "Spark Packages Repo" at "https://repos.spark-packages.org/" +) -resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" - -addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.1.1") +addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3") addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") -addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.0") +addSbtPlugin("com.github.sbt" % "sbt-release" % "1.0.15") -addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.3") +addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.5") -addSbtPlugin("me.lessis" % "bintray-sbt" % "0.3.0") +addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6") -addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala index ebb49353..6098f353 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala @@ -240,7 +240,8 @@ abstract class Benchmark( protected override def doBenchmark( includeBreakdown: Boolean, description: String = "", - messages: ArrayBuffer[String]): BenchmarkResult = { + messages: ArrayBuffer[String], + iteration: Int = 1): BenchmarkResult = { try { val timeMs = measureTimeMs(run()) BenchmarkResult( diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala index 24efef70..b36850fc 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala @@ -43,14 +43,15 @@ 
trait Benchmarkable { description: String = "", messages: ArrayBuffer[String], timeout: Long, - forkThread: Boolean = true): BenchmarkResult = { + forkThread: Boolean = true, + iteration: Int = 1): BenchmarkResult = { logger.info(s"$this: benchmark") sparkContext.setJobDescription(s"Execution: $name, $description") beforeBenchmark() val result = if (forkThread) { runBenchmarkForked(includeBreakdown, description, messages, timeout) } else { - doBenchmark(includeBreakdown, description, messages) + doBenchmark(includeBreakdown, description, messages, iteration) } afterBenchmark(sqlContext.sparkContext) result @@ -107,7 +108,8 @@ trait Benchmarkable { protected def doBenchmark( includeBreakdown: Boolean, description: String = "", - messages: ArrayBuffer[String]): BenchmarkResult + messages: ArrayBuffer[String], + iteration: Int = 1): BenchmarkResult protected def measureTimeMs[A](f: => A): Double = { val startTime = System.nanoTime() diff --git a/src/main/scala/com/databricks/spark/sql/perf/Query.scala b/src/main/scala/com/databricks/spark/sql/perf/Query.scala index babc63f0..48c0e880 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Query.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Query.scala @@ -62,7 +62,8 @@ class Query( protected override def doBenchmark( includeBreakdown: Boolean, description: String = "", - messages: ArrayBuffer[String]): BenchmarkResult = { + messages: ArrayBuffer[String], + iteration: Int = 1): BenchmarkResult = { try { val dataFrame = buildDataFrame val queryExecution = dataFrame.queryExecution diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala index 8296f46b..58b58919 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala @@ -45,7 +45,8 @@ class 
MLPipelineStageBenchmarkable( override protected def doBenchmark( includeBreakdown: Boolean, description: String, - messages: ArrayBuffer[String]): BenchmarkResult = { + messages: ArrayBuffer[String], + iteration: Int = 1): BenchmarkResult = { try { val (trainingTime, model: Transformer) = measureTime { logger.info(s"$this: train: trainingSet=${trainingData.schema}") diff --git a/version.sbt b/version.sbt index 7338ce76..f9436171 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := "0.5.1-SNAPSHOT" +version in ThisBuild := "0.5.2-SNAPSHOT" From 9b8d6531dab1044148421d78d733488a33c3064d Mon Sep 17 00:00:00 2001 From: Satya Kommula Date: Tue, 3 Dec 2024 17:24:42 +0530 Subject: [PATCH 3/5] initial commit for github workflows --- .github/workflows/scala.yml | 58 +++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 .github/workflows/scala.yml diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml new file mode 100644 index 00000000..c1d767bc --- /dev/null +++ b/.github/workflows/scala.yml @@ -0,0 +1,58 @@ +name: Build Spark sql perf + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'adopt' + + - name: Install SDKMAN! 
and sbt 0.13.18 + run: | + curl -s "https://get.sdkman.io" | bash + source "$HOME/.sdkman/bin/sdkman-init.sh" + sdk install sbt 0.13.18 + + - name: Cache sbt + uses: actions/cache@v4 + with: + path: | + ~/.ivy2/cache + ~/.sbt + ~/.coursier + key: ${{ runner.os }}-sbt-${{ hashFiles('**/build.sbt') }} + restore-keys: | + ${{ runner.os }}-sbt- + + - name: Build with sbt + run: sbt compile + + - name: Package with sbt + run: sbt package + + - name: Extract version + id: extract_version + run: | + version=$(cat version.sbt | grep 'version in ThisBuild :=' | awk -F'\"' '{print $2}') + echo "version=$version" >> $GITHUB_ENV + + - name: Upload JAR artifact + uses: actions/upload-artifact@v4 + with: + name: spark-sql-perf_2.12-${{ env.version }}.jar + path: target/scala-2.12/*.jar \ No newline at end of file From dcfeaf1a18c0bb294c324d2d58bedaa4d4675043 Mon Sep 17 00:00:00 2001 From: Satya Kommula Date: Tue, 25 Nov 2025 13:20:47 +0530 Subject: [PATCH 4/5] Update build configuration and dependencies; remove incompatible plugins and Databricks settings (#3) --- .github/workflows/scala.yml | 10 +-- .gitignore | 2 + build.sbt | 62 ++++++++++--------- build/sbt | 2 +- project/build.properties | 3 +- project/plugins.sbt | 17 +++-- .../databricks/spark/sql/perf/Benchmark.scala | 2 +- .../spark/sql/perf/Benchmarkable.scala | 2 +- .../spark/sql/perf/DatasetPerformance.scala | 2 +- .../com/databricks/spark/sql/perf/Query.scala | 2 +- version.sbt | 2 +- 11 files changed, 51 insertions(+), 55 deletions(-) diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index c1d767bc..cb480db0 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -22,12 +22,6 @@ jobs: java-version: '11' distribution: 'adopt' - - name: Install SDKMAN! 
and sbt 0.13.18 - run: | - curl -s "https://get.sdkman.io" | bash - source "$HOME/.sdkman/bin/sdkman-init.sh" - sdk install sbt 0.13.18 - - name: Cache sbt uses: actions/cache@v4 with: @@ -40,10 +34,10 @@ jobs: ${{ runner.os }}-sbt- - name: Build with sbt - run: sbt compile + run: ./build/sbt compile - name: Package with sbt - run: sbt package + run: ./build/sbt package - name: Extract version id: extract_version diff --git a/.gitignore b/.gitignore index 1bcb62a0..fec77466 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ src_managed/ project/boot/ project/plugins/project/ performance/ +/.bloop/ +/build/*.zip diff --git a/build.sbt b/build.sbt index 1a2b09f2..ba97caa9 100644 --- a/build.sbt +++ b/build.sbt @@ -9,61 +9,60 @@ scalaVersion := "2.12.18" crossScalaVersions := Seq("2.12.18") -sparkPackageName := "databricks/spark-sql-perf" +// Remove publishing configuration for now - focus on compilation +// sparkPackageName := "databricks/spark-sql-perf" // All Spark Packages need a license licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) -sparkVersion := "3.5.1" +// Spark version - define it manually since we removed the spark-packages plugin +val sparkVersion = "3.5.1" -sparkComponents ++= Seq("sql", "hive", "mllib") +// Add Spark dependencies manually +libraryDependencies ++= Seq( + "org.apache.spark" %% "spark-core" % sparkVersion % "provided", + "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", + "org.apache.spark" %% "spark-hive" % sparkVersion % "provided", + "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided" +) -initialCommands in console := +initialCommands / console := """ |import org.apache.spark.sql._ |import org.apache.spark.sql.functions._ |import org.apache.spark.sql.types._ - |import org.apache.spark.sql.hive.test.TestHive - |import TestHive.implicits - |import TestHive.sql + |import org.apache.spark.sql.SparkSession | - |val sqlContext = TestHive + |val spark = 
SparkSession.builder().appName("spark-sql-perf").getOrCreate() + |val sqlContext = spark.sqlContext |import sqlContext.implicits._ """.stripMargin -libraryDependencies += "com.github.scopt" %% "scopt" % "3.7.1" +libraryDependencies += "com.github.scopt" %% "scopt" % "4.1.0" -libraryDependencies += "com.twitter" %% "util-jvm" % "6.45.0" % "provided" +libraryDependencies += "com.twitter" %% "util-jvm" % "24.2.0" % "provided" -libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.5" % "test" +libraryDependencies += "org.scalatest" %% "scalatest" % "3.2.19" % "test" -libraryDependencies += "org.yaml" % "snakeyaml" % "1.23" +libraryDependencies += "org.yaml" % "snakeyaml" % "2.5" fork := true -// Your username to login to Databricks Cloud -dbcUsername := sys.env.getOrElse("DBC_USERNAME", "") - -// Your password (Can be set as an environment variable) -dbcPassword := sys.env.getOrElse("DBC_PASSWORD", "") - -// The URL to the Databricks Cloud DB Api. Don't forget to set the port number to 34563! -dbcApiUrl := sys.env.getOrElse ("DBC_URL", sys.error("Please set DBC_URL")) - -// Add any clusters that you would like to deploy your work to. e.g. 
"My Cluster" -// or run dbcExecuteCommand -dbcClusters += sys.env.getOrElse("DBC_USERNAME", "") - -dbcLibraryPath := s"/Users/${sys.env.getOrElse("DBC_USERNAME", "")}/lib" +// Remove Databricks Cloud configuration for now +// dbcUsername := sys.env.getOrElse("DBC_USERNAME", "") +// dbcPassword := sys.env.getOrElse("DBC_PASSWORD", "") +// dbcApiUrl := sys.env.getOrElse ("DBC_URL", sys.error("Please set DBC_URL")) +// dbcClusters += sys.env.getOrElse("DBC_USERNAME", "") +// dbcLibraryPath := s"/Users/${sys.env.getOrElse("DBC_USERNAME", "")}/lib" val runBenchmark = inputKey[Unit]("runs a benchmark") runBenchmark := { import complete.DefaultParsers._ val args = spaceDelimited("[args]").parsed - val scalaRun = (runner in run).value - val classpath = (fullClasspath in Compile).value + val scalaRun = (Compile / run / runner).value + val classpath = (Compile / fullClasspath).value scalaRun.run("com.databricks.spark.sql.perf.RunBenchmark", classpath.map(_.data), args, streams.value.log) } @@ -74,13 +73,15 @@ val runMLBenchmark = inputKey[Unit]("runs an ML benchmark") runMLBenchmark := { import complete.DefaultParsers._ val args = spaceDelimited("[args]").parsed - val scalaRun = (runner in run).value - val classpath = (fullClasspath in Compile).value + val scalaRun = (Compile / run / runner).value + val classpath = (Compile / fullClasspath).value scalaRun.run("com.databricks.spark.sql.perf.mllib.MLLib", classpath.map(_.data), args, streams.value.log) } +// Comment out release configuration for now +/* import ReleaseTransformations._ /** Push to the team directory instead of the user's homedir for releases. */ @@ -159,3 +160,4 @@ releaseProcess := Seq[ReleaseStep]( commitNextVersion, pushChanges ) +*/ \ No newline at end of file diff --git a/build/sbt b/build/sbt index cc3203d7..7d26b548 100755 --- a/build/sbt +++ b/build/sbt @@ -153,4 +153,4 @@ trap onExit INT run "$@" exit_status=$? 
-onExit +onExit \ No newline at end of file diff --git a/project/build.properties b/project/build.properties index 5c4bcd91..e88a0d81 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1,2 +1 @@ -// This file should only contain the version of sbt to use. -sbt.version=0.13.18 +sbt.version=1.10.6 diff --git a/project/plugins.sbt b/project/plugins.sbt index c76851f6..1b633aee 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,20 +1,19 @@ // You may use this file to add plugin dependencies for sbt. resolvers ++= Seq( - Resolver.mavenLocal, - Resolver.sonatypeRepo("releases"), - "Maven Central" at "https://repo1.maven.org/maven2/", + "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", "Spark Packages Repo" at "https://repos.spark-packages.org/" ) -addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3") +// Remove incompatible plugins for now +// addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.3") -addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") +// addSbtPlugin("com.github.mpeltonen" % "sbt-idea" % "1.6.0") -addSbtPlugin("com.github.sbt" % "sbt-release" % "1.0.15") +// addSbtPlugin("com.github.sbt" % "sbt-release" % "1.0.15") -addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.5") +// addSbtPlugin("com.databricks" %% "sbt-databricks" % "0.1.5") -addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6") +// addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6") -addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") +// addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala index 6098f353..e214dff1 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala @@ -212,7 +212,7 @@ abstract class Benchmark( new SparkPerfExecution( name, Map.empty, - 
() => Unit, + () => (), () => rdd.count(), rdd.toDebugString) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala index b36850fc..6acb520a 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala @@ -86,7 +86,7 @@ trait Benchmarkable { mode = executionMode.toString, parameters = Map.empty, failure = Some(Failure(e.getClass.getSimpleName, - e.getMessage + ":\n" + e.getStackTraceString))) + e.getMessage + ":\n" + e.getStackTrace.mkString("\n")))) } } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala b/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala index 0aaa6296..b3d25d44 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala @@ -133,7 +133,7 @@ class DatasetPerformance extends Benchmark { new SparkPerfExecution( "RDD: average", Map.empty, - prepare = () => Unit, + prepare = () => (), run = () => { val sumAndCount = smallrdd.map(i => (i, 1)).reduce((a, b) => (a._1 + b._1, a._2 + b._2)) diff --git a/src/main/scala/com/databricks/spark/sql/perf/Query.scala b/src/main/scala/com/databricks/spark/sql/perf/Query.scala index 48c0e880..c694225e 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Query.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Query.scala @@ -93,7 +93,7 @@ class Query( messages += s"Breakdown: ${node.simpleString(maxFields)}" val newNode = buildDataFrame.queryExecution.executedPlan.p(index) val executionTime = measureTimeMs { - newNode.execute().foreach((row: Any) => Unit) + newNode.execute().foreach((row: Any) => ()) } timeMap += ((index, executionTime)) diff --git a/version.sbt b/version.sbt index f9436171..f13c2095 100644 --- a/version.sbt +++ b/version.sbt @@ -1 +1 @@ -version in ThisBuild := 
"0.5.2-SNAPSHOT" +ThisBuild / version := "0.5.2-SNAPSHOT" From dd8833af777bd803e8880a40e1c58dfed9922272 Mon Sep 17 00:00:00 2001 From: Satya Kommula Date: Sat, 4 Apr 2026 12:17:16 +0530 Subject: [PATCH 5/5] feat: Migrate spark-sql-perf to Spark 3.x with Iceberg/Delta Lake support (#4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: migrate spark-sql-perf to Spark 3.x with Iceberg/Delta support BREAKING CHANGES (Spark 2.x → 3.x): - Replace all SQLContext APIs with SparkSession equivalents - Replace sqlContext.implicits with spark.implicits across all modules - Replace deprecated createExternalTable with spark.catalog.createTable - Replace setConf/getAllConfs with spark.conf.set/spark.conf.getAll - Replace createDataFrame/range/sparkContext calls with spark.* equivalents Tables.scala (core changes): - Rebuild createExternalTable() using explicit SQL DDL (CREATE EXTERNAL TABLE) to correctly handle partitioned and non-partitioned external tables - Add isPartitioned flag (default: false) to createExternalTable/createExternalTables - isPartitioned=false: flat external table, reads all files (safe default) - isPartitioned=true: adds PARTITIONED BY + runs MSCK REPAIR TABLE - Add Delta Lake and Iceberg to supported formats - Skip MSCK REPAIR TABLE for delta/iceberg (they manage their own partitioning) - Add try/catch around partition discovery for graceful degradation Benchmark.scala / Benchmarkable.scala: - Add val spark = sqlContext.sparkSession accessor throughout - Replace sqlContext.read.json with spark.read.json MLLib / MLBenchContext / dataGeneration: - Add SparkSession accessor (val spark = sqlContext.sparkSession) - Replace sql.sparkContext / sql.createDataFrame with spark.sparkSession.* calls - Modernize constructor initialization GenTPCDSData.scala: - Update --format help text to include Delta and Iceberg as valid options Tooling: - Add sbt-scalafmt 2.5.2 plugin to project/plugins.sbt - Add .scalafmt.conf (Scala 
2.12, maxColumn=100, import sorting) - Apply scalafmt formatting to all 63 Scala sources Build: verified sbt compile + sbt assembly succeed on Spark 3.5.1 * ci: add scalafmtCheck job to GitHub Actions workflow --- .github/workflows/scala.yml | 104 +- .scalafmt.conf | 18 + build.sbt | 7 +- project/plugins.sbt | 4 + .../sql/perf/AggregationPerformance.scala | 58 +- .../databricks/spark/sql/perf/Benchmark.scala | 302 +++--- .../spark/sql/perf/Benchmarkable.scala | 46 +- .../spark/sql/perf/CpuProfile.scala | 68 +- .../spark/sql/perf/DatasetPerformance.scala | 65 +- .../spark/sql/perf/ExecutionMode.scala | 17 +- .../spark/sql/perf/JoinPerformance.scala | 61 +- .../com/databricks/spark/sql/perf/Query.scala | 53 +- .../spark/sql/perf/RunBenchmark.scala | 113 +- .../databricks/spark/sql/perf/Tables.scala | 236 +++-- .../spark/sql/perf/bigdata/BigData.scala | 1 + .../spark/sql/perf/bigdata/Queries.scala | 61 +- .../spark/sql/perf/bigdata/Tables.scala | 1 + .../spark/sql/perf/handleResults.scala | 4 +- .../sql/perf/mllib/BenchmarkAlgorithm.scala | 97 +- .../spark/sql/perf/mllib/MLBenchContext.scala | 41 +- .../spark/sql/perf/mllib/MLBenchmarks.scala | 31 +- .../spark/sql/perf/mllib/MLLib.scala | 62 +- .../mllib/MLPipelineStageBenchmarkable.scala | 80 +- .../sql/perf/mllib/OptionImplicits.scala | 37 +- .../sql/perf/mllib/ReflectionUtils.scala | 24 +- .../perf/mllib/TreeOrForestEstimator.scala | 73 +- .../classification/GBTClassification.scala | 8 +- .../perf/mllib/classification/LinearSVC.scala | 11 +- .../classification/LogisticRegression.scala | 12 +- .../mllib/classification/NaiveBayes.scala | 30 +- .../RandomForestClassification.scala | 1 - .../mllib/clustering/GaussianMixture.scala | 11 +- .../sql/perf/mllib/clustering/KMeans.scala | 13 +- .../spark/sql/perf/mllib/clustering/LDA.scala | 7 +- .../perf/mllib/data/ItemSetGenerator.scala | 24 +- .../sql/perf/mllib/data/RatingGenerator.scala | 12 +- .../sql/perf/mllib/data/dataGeneration.scala | 218 ++-- 
.../sql/perf/mllib/feature/Bucketizer.scala | 28 +- .../sql/perf/mllib/feature/HashingTF.scala | 14 +- .../perf/mllib/feature/OneHotEncoder.scala | 26 +- .../mllib/feature/QuantileDiscretizer.scala | 23 +- .../perf/mllib/feature/StringIndexer.scala | 6 +- .../sql/perf/mllib/feature/Tokenizer.scala | 6 +- .../perf/mllib/feature/UnaryTransformer.scala | 2 +- .../perf/mllib/feature/VectorAssembler.scala | 18 +- .../sql/perf/mllib/feature/Word2Vec.scala | 14 +- .../spark/sql/perf/mllib/fpm/FPGrowth.scala | 14 +- .../sql/perf/mllib/recommendation/ALS.scala | 51 +- .../regression/DecisionTreeRegression.scala | 1 - .../perf/mllib/regression/GBTRegression.scala | 9 +- .../perf/mllib/regression/GLMRegression.scala | 13 +- .../mllib/regression/LinearRegression.scala | 11 +- .../regression/RandomForestRegression.scala | 7 +- .../spark/sql/perf/mllib/yaml.scala | 147 +-- .../databricks/spark/sql/perf/package.scala | 6 +- .../databricks/spark/sql/perf/results.scala | 189 ++-- .../spark/sql/perf/tpcds/GenTPCDSData.scala | 30 +- .../sql/perf/tpcds/ImpalaKitQueries.scala | 305 +++--- .../spark/sql/perf/tpcds/SimpleQueries.scala | 95 +- .../spark/sql/perf/tpcds/TPCDS.scala | 47 +- .../spark/sql/perf/tpcds/TPCDSTables.scala | 974 +++++++++--------- .../sql/perf/tpcds/TPCDS_1_4_Queries.scala | 837 +++++++++++---- .../sql/perf/tpcds/TPCDS_2_4_Queries.scala | 130 ++- .../databricks/spark/sql/perf/tpch/TPCH.scala | 106 +- .../org/apache/spark/ml/ModelBuilderSSP.scala | 252 +++-- .../scala/org/apache/spark/ml/TreeUtils.scala | 42 +- .../ClassificationModelBuilder.scala | 6 +- 67 files changed, 3209 insertions(+), 2211 deletions(-) create mode 100644 .scalafmt.conf diff --git a/.github/workflows/scala.yml b/.github/workflows/scala.yml index cb480db0..dc412fd9 100644 --- a/.github/workflows/scala.yml +++ b/.github/workflows/scala.yml @@ -9,44 +9,74 @@ on: - master jobs: + scalafmt-check: + name: Scalafmt Check + runs-on: ubuntu-22.04 + + steps: + - name: Checkout code + uses: 
actions/checkout@v4 + + - name: Set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'adopt' + + - name: Cache sbt + uses: actions/cache@v4 + with: + path: | + ~/.ivy2/cache + ~/.sbt + ~/.coursier + key: ${{ runner.os }}-sbt-${{ hashFiles('**/build.sbt') }} + restore-keys: | + ${{ runner.os }}-sbt- + + - name: Check formatting with scalafmt + run: sbt scalafmtCheck + build: + name: Build & Package runs-on: ubuntu-22.04 + needs: scalafmt-check steps: - - name: Checkout code - uses: actions/checkout@v2 - - - name: Set up JDK 11 - uses: actions/setup-java@v3 - with: - java-version: '11' - distribution: 'adopt' - - - name: Cache sbt - uses: actions/cache@v4 - with: - path: | - ~/.ivy2/cache - ~/.sbt - ~/.coursier - key: ${{ runner.os }}-sbt-${{ hashFiles('**/build.sbt') }} - restore-keys: | - ${{ runner.os }}-sbt- - - - name: Build with sbt - run: ./build/sbt compile - - - name: Package with sbt - run: ./build/sbt package - - - name: Extract version - id: extract_version - run: | - version=$(cat version.sbt | grep 'version in ThisBuild :=' | awk -F'\"' '{print $2}') - echo "version=$version" >> $GITHUB_ENV - - - name: Upload JAR artifact - uses: actions/upload-artifact@v4 - with: - name: spark-sql-perf_2.12-${{ env.version }}.jar - path: target/scala-2.12/*.jar \ No newline at end of file + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up JDK 11 + uses: actions/setup-java@v3 + with: + java-version: '11' + distribution: 'adopt' + + - name: Cache sbt + uses: actions/cache@v4 + with: + path: | + ~/.ivy2/cache + ~/.sbt + ~/.coursier + key: ${{ runner.os }}-sbt-${{ hashFiles('**/build.sbt') }} + restore-keys: | + ${{ runner.os }}-sbt- + + - name: Compile + run: sbt compile + + - name: Package + run: sbt package + + - name: Extract version + id: extract_version + run: | + version=$(cat version.sbt | grep 'ThisBuild / version :=' | awk -F'"' '{print $2}') + echo "version=$version" >> $GITHUB_ENV + + - name: Upload JAR
artifact + uses: actions/upload-artifact@v4 + with: + name: spark-sql-perf_2.12-${{ env.version }}.jar + path: target/scala-2.12/*.jar \ No newline at end of file diff --git a/.scalafmt.conf b/.scalafmt.conf new file mode 100644 index 00000000..8a313d91 --- /dev/null +++ b/.scalafmt.conf @@ -0,0 +1,18 @@ +version = "3.8.1" +runner.dialect = scala212 + +maxColumn = 100 + +indent.main = 2 +indent.significant = 2 + +align.preset = more +align.tokens."+" = [ + { code = "->", owner = "Term.ApplyInfix" } +] + +newlines.alwaysBeforeElseAfterCurlyIf = false +newlines.beforeCurlyLambdaParams = multilineWithCaseOnly + +rewrite.rules = [RedundantBraces, SortImports] +rewrite.redundantBraces.stringInterpolation = true diff --git a/build.sbt b/build.sbt index ba97caa9..d44fca83 100644 --- a/build.sbt +++ b/build.sbt @@ -160,4 +160,9 @@ releaseProcess := Seq[ReleaseStep]( commitNextVersion, pushChanges ) -*/ \ No newline at end of file +*/ + +assembly / assemblyMergeStrategy := { + case PathList("META-INF", xs @ _*) => MergeStrategy.discard + case x => MergeStrategy.first +} \ No newline at end of file diff --git a/project/plugins.sbt b/project/plugins.sbt index 1b633aee..cd448d78 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -17,3 +17,7 @@ resolvers ++= Seq( // addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.6") // addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.1") + +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2") diff --git a/src/main/scala/com/databricks/spark/sql/perf/AggregationPerformance.scala b/src/main/scala/com/databricks/spark/sql/perf/AggregationPerformance.scala index 0ba3930a..48880753 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/AggregationPerformance.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/AggregationPerformance.scala @@ -2,30 +2,30 @@ package com.databricks.spark.sql.perf class AggregationPerformance extends Benchmark { - import 
sqlContext.implicits._ + import spark.implicits._ import ExecutionMode._ - val sizes = (1 to 6).map(math.pow(10, _).toInt) val x = Table( "1milints", { - val df = sqlContext.range(0, 1000000).repartition(1) + val df = spark.range(0, 1000000).repartition(1) df.createTempView("1milints") df - }) + } + ) val joinTables = Seq( Table( "100milints", { - val df = sqlContext.range(0, 100000000).repartition(10) + val df = spark.range(0, 100000000).repartition(10) df.createTempView("100milints") df - }), - + } + ), Table( "1bilints", { - val df = sqlContext.range(0, 1000000000).repartition(10) + val df = spark.range(0, 1000000000).repartition(10) df.createTempView("1bilints") df } @@ -33,28 +33,34 @@ class AggregationPerformance extends Benchmark { ) val variousCardinality = sizes.map { size => - Table(s"ints$size", { - val df = sparkContext.parallelize(1 to size).flatMap { group => - (1 to 10000).map(i => (group, i)) - }.toDF("a", "b") - df.createTempView(s"ints$size") - df - }) + Table( + s"ints$size", { + val df = spark.sparkContext + .parallelize(1 to size) + .flatMap { group => + (1 to 10000).map(i => (group, i)) + } + .toDF("a", "b") + df.createTempView(s"ints$size") + df + } + ) } val lowCardinality = sizes.map { size => val fullSize = size * 10000L Table( s"twoGroups$fullSize", { - val df = sqlContext.range(0, fullSize).select($"id" % 2 as 'a, $"id" as 'b) + val df = spark.range(0, fullSize).select($"id" % 2 as 'a, $"id" as 'b) df.createTempView(s"twoGroups$fullSize") df - }) + } + ) } val newAggreation = Variation("aggregationType", Seq("new", "old")) { - case "old" => sqlContext.setConf("spark.sql.useAggregate2", "false") - case "new" => sqlContext.setConf("spark.sql.useAggregate2", "true") + case "old" => spark.conf.set("spark.sql.useAggregate2", "false") + case "new" => spark.conf.set("spark.sql.useAggregate2", "true") } val varyNumGroupsAvg: Seq[Benchmarkable] = variousCardinality.map(_.name).map { table => @@ -62,7 +68,8 @@ class AggregationPerformance extends 
Benchmark { s"avg-$table", s"SELECT AVG(b) FROM $table GROUP BY a", "an average with a varying number of groups", - executionMode = ForeachResults) + executionMode = ForeachResults + ) } val twoGroupsAvg: Seq[Benchmarkable] = lowCardinality.map(_.name).map { table => @@ -70,7 +77,8 @@ class AggregationPerformance extends Benchmark { s"avg-$table", s"SELECT AVG(b) FROM $table GROUP BY a", "an average on an int column with only two groups", - executionMode = ForeachResults) + executionMode = ForeachResults + ) } val complexInput: Seq[Benchmarkable] = @@ -79,7 +87,8 @@ class AggregationPerformance extends Benchmark { s"aggregation-complex-input-$table", s"SELECT SUM(id + id + id + id + id + id + id + id + id + id) FROM $table", "Sum of 9 columns added together", - executionMode = CollectResults) + executionMode = CollectResults + ) } val aggregates: Seq[Benchmarkable] = @@ -89,7 +98,8 @@ class AggregationPerformance extends Benchmark { s"single-aggregate-$agg-$table", s"SELECT $agg(id) FROM $table", "aggregation of a single column", - executionMode = CollectResults) + executionMode = CollectResults + ) } } -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala index e214dff1..3cd4249b 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmark.scala @@ -21,97 +21,111 @@ import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent._ import scala.concurrent.duration._ import scala.language.implicitConversions -import scala.util.{Success, Try, Failure => SFailure} +import scala.util.{Failure => SFailure, Success, Try} import scala.util.control.NonFatal import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, DataFrame, SQLContext, SparkSession} +import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession} import 
org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.SparkContext import com.databricks.spark.sql.perf.cpu._ -/** - * A collection of queries that test a particular aspect of Spark SQL. - * - * @param sqlContext An existing SQLContext. - */ -abstract class Benchmark( - @transient val sqlContext: SQLContext) - extends Serializable { +/** A collection of queries that test a particular aspect of Spark SQL. + * + * @param sqlContext + * An existing SQLContext. + */ +abstract class Benchmark(@transient val sqlContext: SQLContext) extends Serializable { + + @transient val spark = sqlContext.sparkSession import Benchmark._ def this() = this(SparkSession.builder.getOrCreate().sqlContext) val resultsLocation = - sqlContext.getAllConfs.getOrElse( - "spark.sql.perf.results", - "/spark/sql/performance") + spark.conf.getAll.getOrElse("spark.sql.perf.results", "/spark/sql/performance") protected def sparkContext = sqlContext.sparkContext protected implicit def toOption[A](a: A): Option[A] = Option(a) - val buildInfo = Try(getClass.getClassLoader.loadClass("org.apache.spark.BuildInfo")).map { cls => - cls.getMethods - .filter(_.getReturnType == classOf[String]) + val buildInfo = Try(getClass.getClassLoader.loadClass("org.apache.spark.BuildInfo")) + .map { cls => + cls.getMethods + .filter(_.getReturnType == classOf[String]) .filterNot(_.getName == "toString") .map(m => m.getName -> m.invoke(cls).asInstanceOf[String]) .toMap - }.getOrElse(Map.empty) + } + .getOrElse(Map.empty) def currentConfiguration = BenchmarkConfiguration( - sqlConf = sqlContext.getAllConfs, + sqlConf = spark.conf.getAll, sparkConf = sparkContext.getConf.getAll.toMap, defaultParallelism = sparkContext.defaultParallelism, - buildInfo = buildInfo) - + buildInfo = buildInfo + ) val codegen = Variation("codegen", Seq("on", "off")) { - case "off" => sqlContext.setConf("spark.sql.codegen", "false") - case "on" => sqlContext.setConf("spark.sql.codegen", "true") + case "off" => 
spark.conf.set("spark.sql.codegen", "false") + case "on" => spark.conf.set("spark.sql.codegen", "true") } val unsafe = Variation("unsafe", Seq("on", "off")) { - case "off" => sqlContext.setConf("spark.sql.unsafe.enabled", "false") - case "on" => sqlContext.setConf("spark.sql.unsafe.enabled", "true") + case "off" => spark.conf.set("spark.sql.unsafe.enabled", "false") + case "on" => spark.conf.set("spark.sql.unsafe.enabled", "true") } val tungsten = Variation("tungsten", Seq("on", "off")) { - case "off" => sqlContext.setConf("spark.sql.tungsten.enabled", "false") - case "on" => sqlContext.setConf("spark.sql.tungsten.enabled", "true") + case "off" => spark.conf.set("spark.sql.tungsten.enabled", "false") + case "on" => spark.conf.set("spark.sql.tungsten.enabled", "true") } - /** - * Starts an experiment run with a given set of executions to run. - * - * @param executionsToRun a list of executions to run. - * @param includeBreakdown If it is true, breakdown results of an execution will be recorded. - * Setting it to true may significantly increase the time used to - * run an execution. - * @param iterations The number of iterations to run of each execution. - * @param variations [[Variation]]s used in this run. The cross product of all variations will be - * run for each execution * iteration. - * @param tags Tags of this run. - * @param timeout wait at most timeout milliseconds for each query, 0 means wait forever - * @return It returns a ExperimentStatus object that can be used to - * track the progress of this experiment run. - */ + /** Starts an experiment run with a given set of executions to run. + * + * @param executionsToRun + * a list of executions to run. + * @param includeBreakdown + * If it is true, breakdown results of an execution will be recorded. Setting it to true may + * significantly increase the time used to run an execution. + * @param iterations + * The number of iterations to run of each execution. 
+ * @param variations + * [[Variation]]s used in this run. The cross product of all variations will be run for each + * execution * iteration. + * @param tags + * Tags of this run. + * @param timeout + * wait at most timeout milliseconds for each query, 0 means wait forever + * @return + * It returns a ExperimentStatus object that can be used to track the progress of this + * experiment run. + */ def runExperiment( executionsToRun: Seq[Benchmarkable], includeBreakdown: Boolean = false, iterations: Int = 3, - variations: Seq[Variation[_]] = Seq(Variation("StandardRun", Seq("true")) { _ => {} }), + variations: Seq[Variation[_]] = Seq(Variation("StandardRun", Seq("true")) { _ => }), tags: Map[String, String] = Map.empty, timeout: Long = 0L, resultLocation: String = resultsLocation, - forkThread: Boolean = true) = { - - new ExperimentStatus(executionsToRun, includeBreakdown, iterations, variations, tags, - timeout, resultLocation, sqlContext, allTables, currentConfiguration, forkThread = forkThread) - } - + forkThread: Boolean = true + ) = + new ExperimentStatus( + executionsToRun, + includeBreakdown, + iterations, + variations, + tags, + timeout, + resultLocation, + sqlContext, + allTables, + currentConfiguration, + forkThread = forkThread + ) import reflect.runtime._, universe._ import reflect.runtime._ @@ -135,7 +149,9 @@ abstract class Benchmark( .filter(m => m.isMethod) .map(_.asMethod) .filter(_.asMethod.returnType =:= typeOf[Seq[Table]]) - .flatMap(method => runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Seq[Table]]) + .flatMap(method => + runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Seq[Table]] + ) @transient lazy val allTables: Seq[Table] = (singleTables ++ groupedTables).toSeq @@ -145,14 +161,18 @@ abstract class Benchmark( .filter(m => m.isMethod) .map(_.asMethod) .filter(_.asMethod.returnType =:= typeOf[Benchmarkable]) - .map(method => 
runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Benchmarkable]) + .map(method => + runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Benchmarkable] + ) def groupedQueries = myType.declarations .filter(m => m.isMethod) .map(_.asMethod) .filter(_.asMethod.returnType =:= typeOf[Seq[Benchmarkable]]) - .flatMap(method => runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Seq[Benchmarkable]]) + .flatMap(method => + runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Seq[Benchmarkable]] + ) @transient lazy val allQueries = (singleQueries ++ groupedQueries).toSeq @@ -163,21 +183,25 @@ abstract class Benchmark( .filter(m => m.isMethod) .map(_.asMethod) .filter(_.asMethod.returnType =:= typeOf[Query]) - .map(method => runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Query]) + .map(method => + runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Query] + ) .mkString(",") val queries = myType.declarations - .filter(m => m.isMethod) - .map(_.asMethod) - .filter(_.asMethod.returnType =:= typeOf[Seq[Query]]) - .map { method => - val queries = runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Seq[Query]] - val queryList = queries.map(_.name).mkString(", ") - s""" + .filter(m => m.isMethod) + .map(_.asMethod) + .filter(_.asMethod.returnType =:= typeOf[Seq[Query]]) + .map { method => + val queries = + runtimeMirror.reflect(this).reflectMethod(method).apply().asInstanceOf[Seq[Query]] + val queryList = queries.map(_.name).mkString(", ") + s""" |

${method.name}

|
    $queryList
""".stripMargin - }.mkString("\n") + } + .mkString("\n") s""" |

Spark SQL Performance Benchmarking

@@ -193,29 +217,17 @@ abstract class Benchmark( name: String, sqlText: String, description: String, - executionMode: ExecutionMode = ExecutionMode.ForeachResults): Query = { - new Query(name, sqlContext.sql(sqlText), description, Some(sqlText), executionMode) - } + executionMode: ExecutionMode = ExecutionMode.ForeachResults + ): Query = + new Query(name, spark.sql(sqlText), description, Some(sqlText), executionMode) - def apply( - name: String, - dataFrameBuilder: => DataFrame, - description: String): Query = { + def apply(name: String, dataFrameBuilder: => DataFrame, description: String): Query = new Query(name, dataFrameBuilder, description, None, ExecutionMode.CollectResults) - } } object RDDCount { - def apply( - name: String, - rdd: RDD[_]) = { - new SparkPerfExecution( - name, - Map.empty, - () => (), - () => rdd.count(), - rdd.toDebugString) - } + def apply(name: String, rdd: RDD[_]) = + new SparkPerfExecution(name, Map.empty, () => (), () => rdd.count(), rdd.toDebugString) } /** A class for benchmarking Spark perf results. 
*/ @@ -224,8 +236,8 @@ abstract class Benchmark( parameters: Map[String, String], prepare: () => Unit, run: () => Unit, - description: String = "") - extends Benchmarkable { + description: String = "" + ) extends Benchmarkable { override def toString: String = s""" @@ -235,56 +247,52 @@ abstract class Benchmark( protected override val executionMode: ExecutionMode = ExecutionMode.SparkPerfResults - protected override def beforeBenchmark(): Unit = { prepare() } + protected override def beforeBenchmark(): Unit = prepare() protected override def doBenchmark( includeBreakdown: Boolean, description: String = "", messages: ArrayBuffer[String], - iteration: Int = 1): BenchmarkResult = { + iteration: Int = 1 + ): BenchmarkResult = try { val timeMs = measureTimeMs(run()) BenchmarkResult( name = name, mode = executionMode.toString, parameters = parameters, - executionTime = Some(timeMs)) + executionTime = Some(timeMs) + ) } catch { case e: Exception => BenchmarkResult( name = name, mode = executionMode.toString, parameters = parameters, - failure = Some(Failure(e.getClass.getSimpleName, e.getMessage))) + failure = Some(Failure(e.getClass.getSimpleName, e.getMessage)) + ) } - } } } -/** - * A Variation represents a setting (e.g. the number of shuffle partitions or if tables - * are cached in memory) that we want to change in a experiment run. - * A Variation has three parts, `name`, `options`, and `setup`. - * The `name` is the identifier of a Variation. `options` is a Seq of options that - * will be used for a query. Basically, a query will be executed with every option - * defined in the list of `options`. `setup` defines the needed action for every - * option. For example, the following Variation is used to change the number of shuffle - * partitions of a query. The name of the Variation is "shufflePartitions". There are - * two options, 200 and 2000. The setup is used to set the value of property - * "spark.sql.shuffle.partitions". 
- * - * {{{ - * Variation("shufflePartitions", Seq("200", "2000")) { - * case num => sqlContext.setConf("spark.sql.shuffle.partitions", num) - * } - * }}} - */ +/** A Variation represents a setting (e.g. the number of shuffle partitions or if tables are cached + * in memory) that we want to change in a experiment run. A Variation has three parts, `name`, + * `options`, and `setup`. The `name` is the identifier of a Variation. `options` is a Seq of + * options that will be used for a query. Basically, a query will be executed with every option + * defined in the list of `options`. `setup` defines the needed action for every option. For + * example, the following Variation is used to change the number of shuffle partitions of a query. + * The name of the Variation is "shufflePartitions". There are two options, 200 and 2000. The setup + * is used to set the value of property "spark.sql.shuffle.partitions". + * + * {{{ + * Variation("shufflePartitions", Seq("200", "2000")) { + * case num => spark.conf.set("spark.sql.shuffle.partitions", num) + * } + * }}} + */ case class Variation[T](name: String, options: Seq[T])(val setup: T => Unit) -case class Table( - name: String, - data: Dataset[_]) - +case class Table(name: String, data: Dataset[_]) object Benchmark { @@ -299,9 +307,11 @@ object Benchmark { sqlContext: SQLContext, allTables: Seq[Table], currentConfiguration: BenchmarkConfiguration, - forkThread: Boolean = true) { - val currentResults = new collection.mutable.ArrayBuffer[BenchmarkResult]() - val currentRuns = new collection.mutable.ArrayBuffer[ExperimentRun]() + forkThread: Boolean = true + ) { + val spark = sqlContext.sparkSession + val currentResults = new collection.mutable.ArrayBuffer[BenchmarkResult]() + val currentRuns = new collection.mutable.ArrayBuffer[ExperimentRun]() val currentMessages = new collection.mutable.ArrayBuffer[String]() def logMessage(msg: String) = { @@ -311,22 +321,21 @@ object Benchmark { // Stats for HTML status message. 
@volatile var currentExecution = "" - @volatile var currentPlan = "" // for queries only - @volatile var currentConfig = "" - @volatile var failures = 0 - @volatile var startTime = 0L + @volatile var currentPlan = "" // for queries only + @volatile var currentConfig = "" + @volatile var failures = 0 + @volatile var startTime = 0L /** An optional log collection task that will run after the experiment. */ @volatile var logCollection: () => Unit = () => {} - def cartesianProduct[T](xss: List[List[T]]): List[List[T]] = xss match { - case Nil => List(Nil) - case h :: t => for(xh <- h; xt <- cartesianProduct(t)) yield xh :: xt + case Nil => List(Nil) + case h :: t => for (xh <- h; xt <- cartesianProduct(t)) yield xh :: xt } - val timestamp = System.currentTimeMillis() - val resultPath = s"$resultsLocation/timestamp=$timestamp" + val timestamp = System.currentTimeMillis() + val resultPath = s"$resultsLocation/timestamp=$timestamp" val combinations = cartesianProduct(variations.map(l => (0 until l.options.size).toList).toList) val resultsFuture = Future { @@ -334,11 +343,11 @@ object Benchmark { executionsToRun .collect { case query: Query => query } .flatMap { query => - try { + try query.newDataFrame().queryExecution.logical.collect { case r: UnresolvedRelation => r.tableName } - } catch { + catch { // ignore the queries that can't be parsed case e: Exception => Seq() } @@ -346,7 +355,7 @@ object Benchmark { .distinct .foreach { name => try { - sqlContext.table(name) + spark.table(name) logMessage(s"Table $name exists.") } catch { case ae: Exception => @@ -354,8 +363,7 @@ object Benchmark { .find(_.name == name) if (table.isDefined) { logMessage(s"Creating table: $name") - table.get.data - .write + table.get.data.write .mode("overwrite") .saveAsTable(name) } else { @@ -373,18 +381,19 @@ object Benchmark { v.setup(v.options(idx)) v.name -> v.options(idx).toString } - currentConfig = currentOptions.map { case (k,v) => s"$k: $v" }.mkString(", ") + currentConfig = 
currentOptions.map { case (k, v) => s"$k: $v" }.mkString(", ") val res = executionsToRun.flatMap { q => - val setup = s"iteration: $i, ${currentOptions.map { case (k, v) => s"$k=$v"}.mkString(", ")}" + val setup = + s"iteration: $i, ${currentOptions.map { case (k, v) => s"$k=$v" }.mkString(", ")}" logMessage(s"Running execution ${q.name} $setup") currentExecution = q.name currentPlan = q match { case query: Query => - try { + try query.newDataFrame().queryExecution.executedPlan.toString() - } catch { + catch { case e: Exception => s"failed to parse: $e" } @@ -393,8 +402,13 @@ object Benchmark { startTime = System.currentTimeMillis() val singleResultT = Try { - q.benchmark(includeBreakdown, setup, currentMessages, timeout, - forkThread=forkThread) + q.benchmark( + includeBreakdown, + setup, + currentMessages, + timeout, + forkThread = forkThread + ) } singleResultT match { @@ -410,7 +424,7 @@ object Benchmark { singleResult :: Nil case SFailure(e) => failures += 1 - logMessage(s"Execution '${q.name}' failed: ${e}") + logMessage(s"Execution '${q.name}' failed: $e") Nil } } @@ -420,7 +434,8 @@ object Benchmark { iteration = i, tags = currentOptions.toMap ++ tags, configuration = currentConfiguration, - res) + res + ) currentRuns += result @@ -429,7 +444,7 @@ object Benchmark { } try { - val resultsTable = sqlContext.createDataFrame(results) + val resultsTable = spark.createDataFrame(results) logMessage(s"Results written to table: 'sqlPerformance' at $resultPath") resultsTable .coalesce(1) @@ -445,7 +460,7 @@ object Benchmark { logCollection() } - def scheduleCpuCollection(fs: FS) = { + def scheduleCpuCollection(fs: FS) = logCollection = () => { logMessage(s"Begining CPU log collection") try { @@ -457,40 +472,36 @@ object Benchmark { throw e } } - } - def cpuProfile = new Profile(sqlContext, sqlContext.read.json(getCpuLocation(timestamp))) + def cpuProfile = new Profile(spark.sqlContext, spark.read.json(getCpuLocation(timestamp))) - def cpuProfileHtml(fs: FS) = { + def 
cpuProfileHtml(fs: FS) = s""" |

CPU Profile

|Permalink: sqlContext.read.json("${getCpuLocation(timestamp)}")
|${cpuProfile.buildGraph(fs)} """.stripMargin - } /** Waits for the finish of the experiment. */ - def waitForFinish(timeoutInSeconds: Int) = { + def waitForFinish(timeoutInSeconds: Int) = Await.result(resultsFuture, timeoutInSeconds.seconds) - } /** Returns results from an actively running experiment. */ def getCurrentResults() = { - val tbl = sqlContext.createDataFrame(currentResults) + val tbl = spark.createDataFrame(currentResults) tbl.createOrReplaceTempView("currentResults") tbl } /** Returns full iterations from an actively running experiment. */ def getCurrentRuns() = { - val tbl = sqlContext.createDataFrame(currentRuns) + val tbl = spark.createDataFrame(currentRuns) tbl.createOrReplaceTempView("currentRuns") tbl } - def tail(n: Int = 20) = { + def tail(n: Int = 20) = currentMessages.takeRight(n).mkString("\n") - } def status = if (resultsFuture.isCompleted) { @@ -502,7 +513,6 @@ object Benchmark { override def toString = s"""Permalink: table("sqlPerformance").where('timestamp === ${timestamp}L)""" - def html: String = { val maybeQueryPlan: String = if (currentPlan.nonEmpty) { @@ -517,7 +527,7 @@ object Benchmark { } s""" |

$status Experiment

- |Permalink: sqlContext.read.json("$resultPath")
+ |Permalink: spark.read.json("$resultPath")
|Iterations complete: ${currentRuns.size / combinations.size} / $iterations
|Failures: $failures
|Executions run: ${currentResults.size} / ${iterations * combinations.size * executionsToRun.size} diff --git a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala index 6acb520a..3aacd315 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Benchmarkable.scala @@ -24,15 +24,14 @@ import scala.concurrent.duration._ import scala.collection.mutable.ArrayBuffer import scala.util.control.NonFatal -import org.apache.spark.sql.{SQLContext,SparkSession} -import org.apache.spark.{SparkEnv, SparkContext} - +import org.apache.spark.sql.{SQLContext, SparkSession} +import org.apache.spark.{SparkContext, SparkEnv} /** A trait to describe things that can be benchmarked. */ trait Benchmarkable { - @transient protected[this] val sqlSession = SparkSession.builder.getOrCreate() - @transient protected[this] val sqlContext = sqlSession.sqlContext - @transient protected[this] val sparkContext = sqlSession.sparkContext + @transient protected[this] val spark = SparkSession.builder.getOrCreate() + @transient protected[this] val sqlContext = spark.sqlContext + @transient protected[this] val sparkContext = spark.sparkContext val name: String protected val executionMode: ExecutionMode @@ -44,7 +43,8 @@ trait Benchmarkable { messages: ArrayBuffer[String], timeout: Long, forkThread: Boolean = true, - iteration: Int = 1): BenchmarkResult = { + iteration: Int = 1 + ): BenchmarkResult = { logger.info(s"$this: benchmark") sparkContext.setJobDescription(s"Execution: $name, $description") beforeBenchmark() @@ -57,27 +57,27 @@ trait Benchmarkable { result } - protected def beforeBenchmark(): Unit = { } + protected def beforeBenchmark(): Unit = {} - protected def afterBenchmark(sc: SparkContext): Unit = { + protected def afterBenchmark(sc: SparkContext): Unit = System.gc() - } private def runBenchmarkForked( includeBreakdown: Boolean, description: 
String = "", messages: ArrayBuffer[String], - timeout: Long): BenchmarkResult = { - val jobgroup = UUID.randomUUID().toString - val that = this + timeout: Long + ): BenchmarkResult = { + val jobgroup = UUID.randomUUID().toString + val that = this var result: BenchmarkResult = null val thread = new Thread("benchmark runner") { override def run(): Unit = { logger.info(s"$that running $this") sparkContext.setJobGroup(jobgroup, s"benchmark $name", true) - try { + try result = doBenchmark(includeBreakdown, description, messages) - } catch { + catch { case e: Throwable => logger.info(s"$that: failure in runBenchmark: $e") println(s"$that: failure in runBenchmark: $e") @@ -85,8 +85,13 @@ trait Benchmarkable { name = name, mode = executionMode.toString, parameters = Map.empty, - failure = Some(Failure(e.getClass.getSimpleName, - e.getMessage + ":\n" + e.getStackTrace.mkString("\n")))) + failure = Some( + Failure( + e.getClass.getSimpleName, + e.getMessage + ":\n" + e.getStackTrace.mkString("\n") + ) + ) + ) } } } @@ -109,7 +114,8 @@ trait Benchmarkable { includeBreakdown: Boolean, description: String = "", messages: ArrayBuffer[String], - iteration: Int = 1): BenchmarkResult + iteration: Int = 1 + ): BenchmarkResult protected def measureTimeMs[A](f: => A): Double = { val startTime = System.nanoTime() @@ -120,8 +126,8 @@ trait Benchmarkable { protected def measureTime[A](f: => A): (Duration, A) = { val startTime = System.nanoTime() - val res = f - val endTime = System.nanoTime() + val res = f + val endTime = System.nanoTime() (endTime - startTime).nanos -> res } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/CpuProfile.scala b/src/main/scala/com/databricks/spark/sql/perf/CpuProfile.scala index 901563a2..bdf493f9 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/CpuProfile.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/CpuProfile.scala @@ -16,10 +16,10 @@ package com.databricks.spark.sql.perf -import java.io.{FileOutputStream, File} +import 
java.io.{File, FileOutputStream} import org.apache.hadoop.conf.Configuration -import org.apache.spark.sql.{DataFrame, SQLContext, Row} +import org.apache.spark.sql.{DataFrame, Row, SQLContext, SparkSession} import org.apache.spark.sql.functions._ import scala.language.reflectiveCalls @@ -29,10 +29,9 @@ import org.apache.hadoop.fs.{FileSystem, Path} import com.twitter.jvm.CpuProfile -/** - * A collection of utilities for parsing stacktraces that have been recorded in JSON and generating visualizations - * on where time is being spent. - */ +/** A collection of utilities for parsing stacktraces that have been recorded in JSON and generating + * visualizations on where time is being spent. + */ package object cpu { // Placeholder for DBFS. @@ -44,10 +43,7 @@ package object cpu { private val resultsLocation = "/spark/sql/cpu" lazy val pprof = { - run( - "sudo apt-get install -y graphviz", - "cp /dbfs/home/michael/pprof ./", - "chmod 755 pprof") + run("sudo apt-get install -y graphviz", "cp /dbfs/home/michael/pprof ./", "chmod 755 pprof") "./pprof" } @@ -55,23 +51,27 @@ package object cpu { def getCpuLocation(timestamp: Long) = s"$resultsLocation/timestamp=$timestamp" def collectLogs(sqlContext: SQLContext, fs: FS, timestamp: Long): String = { - import sqlContext.implicits._ + val spark = sqlContext.sparkSession + import spark.implicits._ def sc = sqlContext.sparkContext def copyLogFiles() = { - val path = "pwd".!!.trim + val path = "pwd".!!.trim val hostname = "hostname".!!.trim val conf = new Configuration() - val fs = FileSystem.get(conf) - fs.copyFromLocalFile(new Path(s"$path/logs/cpu.json"), new Path(s"$resultsLocation/timestamp=$timestamp/$hostname")) + val fs = FileSystem.get(conf) + fs.copyFromLocalFile( + new Path(s"$path/logs/cpu.json"), + new Path(s"$resultsLocation/timestamp=$timestamp/$hostname") + ) } fs.rm(getCpuLocation(timestamp), true) copyLogFiles() - sc.parallelize((1 to 100)).foreach { i => copyLogFiles() } + sc.parallelize((1 to 100)).foreach(i => 
copyLogFiles()) getCpuLocation(timestamp) } @@ -92,7 +92,8 @@ package object cpu { } class Profile(private val sqlContext: SQLContext, cpuLogs: DataFrame) { - import sqlContext.implicits._ + val spark = sqlContext.sparkSession + import spark.implicits._ def hosts = cpuLogs.select($"tags.hostName").distinct.collect().map(_.getString(0)) @@ -100,24 +101,39 @@ package object cpu { val stackLine = """(.*)\.([^\(]+)\(([^:]+)(:{0,1}\d*)\)""".r def toStackElement(s: String) = s match { case stackLine(cls, method, file, "") => new StackTraceElement(cls, method, file, 0) - case stackLine(cls, method, file, line) => new StackTraceElement(cls, method, file, line.stripPrefix(":").toInt) + case stackLine(cls, method, file, line) => + new StackTraceElement(cls, method, file, line.stripPrefix(":").toInt) } - val counts = cpuLogs.groupBy($"stack").agg(count($"*")).collect().flatMap { - case Row(stackLines: Array[String], count: Long) => stackLines.toSeq.map(toStackElement) -> count :: Nil - case other => println(s"Failed to parse $other"); Nil - }.toMap - val profile = new com.twitter.jvm.CpuProfile(counts, com.twitter.util.Duration.fromSeconds(10), cpuLogs.count().toInt, 0) + val counts = cpuLogs + .groupBy($"stack") + .agg(count($"*")) + .collect() + .flatMap { + case Row(stackLines: Array[String], count: Long) => + stackLines.toSeq.map(toStackElement) -> count :: Nil + case other => println(s"Failed to parse $other"); Nil + } + .toMap + val profile = new com.twitter.jvm.CpuProfile( + counts, + com.twitter.util.Duration.fromSeconds(10), + cpuLogs.count().toInt, + 0 + ) val outfile = File.createTempFile("cpu", "profile") val svgFile = File.createTempFile("cpu", "svg") profile.writeGoogleProfile(new FileOutputStream(outfile)) - println(run( - "cp /dbfs/home/michael/pprof ./", - "chmod 755 pprof", - s"$pprof --svg ${outfile.getCanonicalPath} > ${svgFile.getCanonicalPath}")) + println( + run( + "cp /dbfs/home/michael/pprof ./", + "chmod 755 pprof", + s"$pprof --svg 
${outfile.getCanonicalPath} > ${svgFile.getCanonicalPath}" + ) + ) val timestamp = System.currentTimeMillis() fs.cp(s"file://$svgFile", s"/FileStore/cpu.profiles/$timestamp.svg", false) diff --git a/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala b/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala index b3d25d44..c25c6df0 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/DatasetPerformance.scala @@ -28,7 +28,7 @@ object TypedAverage extends Aggregator[Long, SumAndCount, Double] { b } - override def bufferEncoder = Encoders.product + override def bufferEncoder = Encoders.product override def outputEncoder = Encoders.scalaDouble @@ -47,30 +47,22 @@ case class SumAndCount(var sum: Long, var count: Int) class DatasetPerformance extends Benchmark { - import sqlContext.implicits._ + import spark.implicits._ val numLongs = 100000000 - val ds = sqlContext.range(1, numLongs) - val rdd = sparkContext.range(1, numLongs) + val ds = spark.range(1, numLongs) + val rdd = spark.sparkContext.range(1, numLongs) val smallNumLongs = 1000000 - val smallds = sqlContext.range(1, smallNumLongs).as[Long] - val smallrdd = sparkContext.range(1, smallNumLongs) + val smallds = spark.range(1, smallNumLongs).as[Long] + val smallrdd = spark.sparkContext.range(1, smallNumLongs) - def allBenchmarks = range ++ backToBackFilters ++ backToBackMaps ++ computeAverage + def allBenchmarks = range ++ backToBackFilters ++ backToBackMaps ++ computeAverage val range = Seq( - new Query( - "DS: range", - ds.as[Data].toDF(), - executionMode = ExecutionMode.ForeachResults), - new Query( - "DF: range", - ds.toDF(), - executionMode = ExecutionMode.ForeachResults), - RDDCount( - "RDD: range", - rdd.map(Data(_))) + new Query("DS: range", ds.as[Data].toDF(), executionMode = ExecutionMode.ForeachResults), + new Query("DF: range", ds.toDF(), executionMode = ExecutionMode.ForeachResults), + 
RDDCount("RDD: range", rdd.map(Data(_))) ) val backToBackFilters = Seq( @@ -80,21 +72,26 @@ class DatasetPerformance extends Benchmark { .filter(_.id % 100 != 0) .filter(_.id % 101 != 0) .filter(_.id % 102 != 0) - .filter(_.id % 103 != 0).toDF()), + .filter(_.id % 103 != 0) + .toDF() + ), new Query( "DF: back-to-back filters", ds.toDF() .filter("id % 100 != 0") .filter("id % 101 != 0") .filter("id % 102 != 0") - .filter("id % 103 != 0")), + .filter("id % 103 != 0") + ), RDDCount( "RDD: back-to-back filters", - rdd.map(Data(_)) + rdd + .map(Data(_)) .filter(_.id % 100 != 0) .filter(_.id % 101 != 0) .filter(_.id % 102 != 0) - .filter(_.id % 103 != 0)) + .filter(_.id % 103 != 0) + ) ) val backToBackMaps = Seq( @@ -104,32 +101,39 @@ class DatasetPerformance extends Benchmark { .map(d => Data(d.id + 1L)) .map(d => Data(d.id + 1L)) .map(d => Data(d.id + 1L)) - .map(d => Data(d.id + 1L)).toDF()), + .map(d => Data(d.id + 1L)) + .toDF() + ), new Query( "DF: back-to-back maps", ds.toDF() .select($"id" + 1 as 'id) .select($"id" + 1 as 'id) .select($"id" + 1 as 'id) - .select($"id" + 1 as 'id)), + .select($"id" + 1 as 'id) + ), RDDCount( "RDD: back-to-back maps", - rdd.map(Data) + rdd + .map(Data) + .map(d => Data(d.id + 1L)) .map(d => Data(d.id + 1L)) .map(d => Data(d.id + 1L)) .map(d => Data(d.id + 1L)) - .map(d => Data(d.id + 1L))) + ) ) val computeAverage = Seq( new Query( "DS: average", smallds.select(TypedAverage.toColumn).toDF(), - executionMode = ExecutionMode.CollectResults), + executionMode = ExecutionMode.CollectResults + ), new Query( "DF: average", smallds.toDF().selectExpr("avg(id)"), - executionMode = ExecutionMode.CollectResults), + executionMode = ExecutionMode.CollectResults + ), new SparkPerfExecution( "RDD: average", Map.empty, @@ -138,6 +142,7 @@ class DatasetPerformance extends Benchmark { val sumAndCount = smallrdd.map(i => (i, 1)).reduce((a, b) => (a._1 + b._1, a._2 + b._2)) sumAndCount._1.toDouble / sumAndCount._2 - }) + } + ) ) -} \ No newline at end 
of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/ExecutionMode.scala b/src/main/scala/com/databricks/spark/sql/perf/ExecutionMode.scala index e44bd87c..dd7f8e5d 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/ExecutionMode.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/ExecutionMode.scala @@ -16,12 +16,12 @@ package com.databricks.spark.sql.perf -/** - * Describes how a given Spark benchmark should be run (i.e. should the results be collected to - * the driver or just computed on the executors. - */ +/** Describes how a given Spark benchmark should be run (i.e. should the results be collected to the + * driver or just computed on the executors. + */ trait ExecutionMode extends Serializable case object ExecutionMode { + /** Benchmark run by collecting queries results (e.g. rdd.collect()) */ case object CollectResults extends ExecutionMode { override def toString: String = "collect" @@ -37,10 +37,9 @@ case object ExecutionMode { override def toString: String = "saveToParquet" } - /** - * Benchmark run by calculating the sum of the hash value of all rows. This is used to check - * query results do not change. - */ + /** Benchmark run by calculating the sum of the hash value of all rows. This is used to check + * query results do not change. 
+ */ case object HashResults extends ExecutionMode { override def toString: String = "hash" } @@ -49,4 +48,4 @@ case object ExecutionMode { case object SparkPerfResults extends ExecutionMode { override def toString: String = "sparkPerf" } -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/JoinPerformance.scala b/src/main/scala/com/databricks/spark/sql/perf/JoinPerformance.scala index 8c587066..c27a0d3a 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/JoinPerformance.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/JoinPerformance.scala @@ -5,30 +5,30 @@ import org.apache.spark.sql.types._ class JoinPerformance extends Benchmark { - import ExecutionMode._ - import sqlContext.implicits._ + import spark.implicits._ - private val table = sqlContext.table _ + private val table = (s: String) => spark.table(s) val x = Table( - "1milints", { // 1.5 mb, 1 file - val df = sqlContext.range(0, 1000000).repartition(1) + "1milints", { // 1.5 mb, 1 file + val df = spark.range(0, 1000000).repartition(1) df.createTempView("1milints") df - }) + } + ) val joinTables = Seq( Table( - "100milints", { // 143.542mb, 10 files - val df = sqlContext.range(0, 100000000).repartition(10) + "100milints", { // 143.542mb, 10 files + val df = spark.range(0, 100000000).repartition(10) df.createTempView("100milints") df - }), - + } + ), Table( - "1bilints", { // 143.542mb, 10 files - val df = sqlContext.range(0, 1000000000).repartition(10) + "1bilints", { // 143.542mb, 10 files + val df = spark.range(0, 1000000000).repartition(10) df.createTempView("1bilints") df } @@ -36,41 +36,46 @@ class JoinPerformance extends Benchmark { ) val sortMergeJoin = Variation("sortMergeJoin", Seq("on", "off")) { - case "off" => sqlContext.setConf("spark.sql.planner.sortMergeJoin", "false") - case "on" => sqlContext.setConf("spark.sql.planner.sortMergeJoin", "true") + case "off" => spark.conf.set("spark.sql.planner.sortMergeJoin", "false") + case "on" => 
spark.conf.set("spark.sql.planner.sortMergeJoin", "true") } - val singleKeyJoins: Seq[Benchmarkable] = Seq("1milints", "100milints", "1bilints").flatMap { table1 => - Seq("1milints", "100milints", "1bilints").flatMap { table2 => - Seq("JOIN", "RIGHT JOIN", "LEFT JOIN", "FULL OUTER JOIN").map { join => - Query( - s"singleKey-$join-$table1-$table2", - s"SELECT COUNT(*) FROM $table1 a $join $table2 b ON a.id = b.id", - "equi-inner join a small table with a big table using a single key.", - executionMode = CollectResults) + val singleKeyJoins: Seq[Benchmarkable] = Seq("1milints", "100milints", "1bilints").flatMap { + table1 => + Seq("1milints", "100milints", "1bilints").flatMap { table2 => + Seq("JOIN", "RIGHT JOIN", "LEFT JOIN", "FULL OUTER JOIN").map { join => + Query( + s"singleKey-$join-$table1-$table2", + s"SELECT COUNT(*) FROM $table1 a $join $table2 b ON a.id = b.id", + "equi-inner join a small table with a big table using a single key.", + executionMode = CollectResults + ) + } } - } } val varyDataSize = Seq(1, 128, 256, 512, 1024).map { dataSize => val intsWithData = table("100milints").select($"id", lit("*" * dataSize).as(s"data$dataSize")) new Query( s"join - datasize: $dataSize", - intsWithData.as("a").join(intsWithData.as("b"), $"a.id" === $"b.id")) + intsWithData.as("a").join(intsWithData.as("b"), $"a.id" === $"b.id") + ) } val varyKeyType = Seq(StringType, IntegerType, LongType, DoubleType).map { keyType => val convertedInts = table("100milints").select($"id".cast(keyType).as("id")) new Query( s"join - keytype: $keyType", - convertedInts.as("a").join(convertedInts.as("b"), $"a.id" === $"b.id")) + convertedInts.as("a").join(convertedInts.as("b"), $"a.id" === $"b.id") + ) } val varyNumMatches = Seq(1, 2, 4, 8, 16).map { numCopies => - val ints = table("100milints") + val ints = table("100milints") val copiedInts = Seq.fill(numCopies)(ints).reduce(_ union _) new Query( s"join - numMatches: $numCopies", - copiedInts.as("a").join(ints.as("b"), $"a.id" === 
$"b.id")) + copiedInts.as("a").join(ints.as("b"), $"a.id" === $"b.id") + ) } -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/Query.scala b/src/main/scala/com/databricks/spark/sql/perf/Query.scala index c694225e..4339fb4e 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Query.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Query.scala @@ -24,25 +24,25 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.execution.SparkPlan - /** Holds one benchmark query and its metadata. */ class Query( override val name: String, buildDataFrame: => DataFrame, val description: String = "", val sqlText: Option[String] = None, - override val executionMode: ExecutionMode = ExecutionMode.ForeachResults) - extends Benchmarkable with Serializable { + override val executionMode: ExecutionMode = ExecutionMode.ForeachResults +) extends Benchmarkable + with Serializable { private implicit def toOption[A](a: A): Option[A] = Option(a) - override def toString: String = { - try { + override def toString: String = + try s""" |== Query: $name == |${buildDataFrame.queryExecution.analyzed} """.stripMargin - } catch { + catch { case e: Exception => s""" |== Query: $name == @@ -51,7 +51,6 @@ class Query( | $description """.stripMargin } - } lazy val tablesInvolved = buildDataFrame.queryExecution.logical collect { case r: UnresolvedRelation => r.tableName @@ -63,9 +62,10 @@ class Query( includeBreakdown: Boolean, description: String = "", messages: ArrayBuffer[String], - iteration: Int = 1): BenchmarkResult = { + iteration: Int = 1 + ): BenchmarkResult = try { - val dataFrame = buildDataFrame + val dataFrame = buildDataFrame val queryExecution = dataFrame.queryExecution // We are not counting the time of ScalaReflection.convertRowToScala. 
val parsingTime = measureTimeMs { @@ -82,11 +82,11 @@ class Query( } val breakdownResults = if (includeBreakdown) { - val depth = queryExecution.executedPlan.collect { case p: SparkPlan => p }.size + val depth = queryExecution.executedPlan.collect { case p: SparkPlan => p }.size val physicalOperators = (0 until depth).map(i => (i, queryExecution.executedPlan.p(i))) - val indexMap = physicalOperators.map { case (index, op) => (op, index) }.toMap - val timeMap = new mutable.HashMap[Int, Double] - val maxFields = 999 // Maximum number of fields that will be converted to strings + val indexMap = physicalOperators.map { case (index, op) => (op, index) }.toMap + val timeMap = new mutable.HashMap[Int, Double] + val maxFields = 999 // Maximum number of fields that will be converted to strings physicalOperators.reverse.map { case (index, node) => @@ -98,7 +98,7 @@ class Query( timeMap += ((index, executionTime)) val childIndexes = node.children.map(indexMap) - val childTime = childIndexes.map(timeMap).sum + val childTime = childIndexes.map(timeMap).sum messages += s"Breakdown time: $executionTime (+${executionTime - childTime})" BreakdownResult( @@ -107,7 +107,8 @@ class Query( index, childIndexes, executionTime, - executionTime - childTime) + executionTime - childTime + ) } } else { Seq.empty[BreakdownResult] @@ -121,7 +122,7 @@ class Query( val executionTime = measureTimeMs { executionMode match { case ExecutionMode.CollectResults => dataFrame.collect() - case ExecutionMode.ForeachResults => dataFrame.foreach { _ => ():Unit } + case ExecutionMode.ForeachResults => dataFrame.foreach(_ => (): Unit) case ExecutionMode.WriteParquet(location) => dataFrame.write.parquet(s"$location/$name.parquet") case ExecutionMode.HashResults => @@ -150,18 +151,20 @@ class Query( executionTime = executionTime, result = result, queryExecution = dataFrame.queryExecution.toString, - breakDown = breakdownResults) + breakDown = breakdownResults + ) } catch { case e: Exception => - BenchmarkResult( 
- name = name, - mode = executionMode.toString, - failure = Failure(e.getClass.getName, e.getMessage)) + BenchmarkResult( + name = name, + mode = executionMode.toString, + failure = Failure(e.getClass.getName, e.getMessage) + ) } - } - /** Change the ExecutionMode of this Query to HashResults, which is used to check the query result. */ - def checkResult: Query = { + /** Change the ExecutionMode of this Query to HashResults, which is used to check the query + * result. + */ + def checkResult: Query = new Query(name, buildDataFrame, description, sqlText, ExecutionMode.HashResults) - } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala b/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala index ed367e7f..857f4df4 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/RunBenchmark.scala @@ -20,7 +20,7 @@ import java.net.InetAddress import java.io.File import org.apache.spark.sql.{SQLContext, SparkSession} import org.apache.spark.sql.functions._ -import org.apache.spark.{SparkContext, SparkConf} +import org.apache.spark.{SparkConf, SparkContext} import scala.util.Try case class RunConfig( @@ -28,20 +28,20 @@ case class RunConfig( benchmarkName: String = null, filter: Option[String] = None, iterations: Int = 3, - baseline: Option[Long] = None) + baseline: Option[Long] = None +) -/** - * Runs a benchmark locally and prints the results to the screen. - */ +/** Runs a benchmark locally and prints the results to the screen. 
+ */ object RunBenchmark { def main(args: Array[String]): Unit = { val parser = new scopt.OptionParser[RunConfig]("spark-sql-perf") { head("spark-sql-perf", "0.2.0") opt[String]('m', "master") - .action { (x, c) => c.copy(master = x) } + .action((x, c) => c.copy(master = x)) .text("the Spark master to use, default to local[*]") opt[String]('b', "benchmark") - .action { (x, c) => c.copy(benchmarkName = x) } + .action((x, c) => c.copy(benchmarkName = x)) .text("the name of the benchmark to run") .required() opt[String]('f', "filter") @@ -51,8 +51,8 @@ object RunBenchmark { .action((x, c) => c.copy(iterations = x)) .text("the number of iterations to run") opt[Long]('c', "compare") - .action((x, c) => c.copy(baseline = Some(x))) - .text("the timestamp of the baseline experiment to compare with") + .action((x, c) => c.copy(baseline = Some(x))) + .text("the timestamp of the baseline experiment to compare with") help("help") .text("prints this usage text") } @@ -71,21 +71,22 @@ object RunBenchmark { .setAppName(getClass.getName) val sparkSession = SparkSession.builder.config(conf).getOrCreate() - val sc = sparkSession.sparkContext - val sqlContext = sparkSession.sqlContext - import sqlContext.implicits._ + val sc = sparkSession.sparkContext + val sqlContext = sparkSession.sqlContext + import sparkSession.implicits._ - sqlContext.setConf("spark.sql.perf.results", - new File("performance").toURI.toString) + sparkSession.conf.set("spark.sql.perf.results", new File("performance").toURI.toString) val benchmark = Try { - Class.forName(config.benchmarkName) - .newInstance() - .asInstanceOf[Benchmark] + Class + .forName(config.benchmarkName) + .newInstance() + .asInstanceOf[Benchmark] } getOrElse { - Class.forName("com.databricks.spark.sql.perf." + config.benchmarkName) - .newInstance() - .asInstanceOf[Benchmark] + Class + .forName("com.databricks.spark.sql.perf." 
+ config.benchmarkName) + .newInstance() + .asInstanceOf[Benchmark] } val allQueries = config.filter.map { f => @@ -100,49 +101,55 @@ object RunBenchmark { val experiment = benchmark.runExperiment( executionsToRun = allQueries, iterations = config.iterations, - tags = Map( - "runtype" -> "local", - "host" -> InetAddress.getLocalHost().getHostName())) + tags = Map("runtype" -> "local", "host" -> InetAddress.getLocalHost().getHostName()) + ) println("== STARTING EXPERIMENT ==") experiment.waitForFinish(1000 * 60 * 30) - sqlContext.setConf("spark.sql.shuffle.partitions", "1") - - val toShow = experiment.getCurrentRuns() - .withColumn("result", explode($"results")) - .select("result.*") - .groupBy("name") - .agg( - min($"executionTime") as 'minTimeMs, - max($"executionTime") as 'maxTimeMs, - avg($"executionTime") as 'avgTimeMs, - stddev($"executionTime") as 'stdDev, - (stddev($"executionTime") / avg($"executionTime") * 100) as 'stdDevPercent) - .orderBy("name") - + sparkSession.conf.set("spark.sql.shuffle.partitions", "1") + + val toShow = experiment + .getCurrentRuns() + .withColumn("result", explode($"results")) + .select("result.*") + .groupBy("name") + .agg( + min($"executionTime") as 'minTimeMs, + max($"executionTime") as 'maxTimeMs, + avg($"executionTime") as 'avgTimeMs, + stddev($"executionTime") as 'stdDev, + (stddev($"executionTime") / avg($"executionTime") * 100) as 'stdDevPercent + ) + .orderBy("name") + println("Showing at most 100 query results now") toShow.show(100) - + println(s"""Results: sqlContext.read.json("${experiment.resultPath}")""") config.baseline.foreach { baseTimestamp => val baselineTime = when($"timestamp" === baseTimestamp, $"executionTime").otherwise(null) - val thisRunTime = when($"timestamp" === experiment.timestamp, $"executionTime").otherwise(null) - - val data = sqlContext.read.json(benchmark.resultsLocation) - .coalesce(1) - .where(s"timestamp IN ($baseTimestamp, ${experiment.timestamp})") - .withColumn("result", 
explode($"results")) - .select("timestamp", "result.*") - .groupBy("name") - .agg( - avg(baselineTime) as 'baselineTimeMs, - avg(thisRunTime) as 'thisRunTimeMs, - stddev(baselineTime) as 'stddev) - .withColumn( - "percentChange", ($"baselineTimeMs" - $"thisRunTimeMs") / $"baselineTimeMs" * 100) - .filter('thisRunTimeMs.isNotNull) + val thisRunTime = + when($"timestamp" === experiment.timestamp, $"executionTime").otherwise(null) + + val data = sparkSession.read + .json(benchmark.resultsLocation) + .coalesce(1) + .where(s"timestamp IN ($baseTimestamp, ${experiment.timestamp})") + .withColumn("result", explode($"results")) + .select("timestamp", "result.*") + .groupBy("name") + .agg( + avg(baselineTime) as 'baselineTimeMs, + avg(thisRunTime) as 'thisRunTimeMs, + stddev(baselineTime) as 'stddev + ) + .withColumn( + "percentChange", + ($"baselineTimeMs" - $"thisRunTimeMs") / $"baselineTimeMs" * 100 + ) + .filter('thisRunTimeMs.isNotNull) data.show(truncate = false) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/Tables.scala b/src/main/scala/com/databricks/spark/sql/perf/Tables.scala index 177d38ce..e76ec372 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/Tables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/Tables.scala @@ -29,21 +29,19 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext, SaveMode} - -/** - * Using ProcessBuilder.lineStream produces a stream, that uses - * a LinkedBlockingQueue with a default capacity of Integer.MAX_VALUE. - * - * This causes OOM if the consumer cannot keep up with the producer. - * - * See scala.sys.process.ProcessBuilderImpl.lineStream - */ +/** Using ProcessBuilder.lineStream produces a stream, that uses a LinkedBlockingQueue with a + * default capacity of Integer.MAX_VALUE. + * + * This causes OOM if the consumer cannot keep up with the producer. 
+ * + * See scala.sys.process.ProcessBuilderImpl.lineStream + */ object BlockingLineStream { // See scala.sys.process.Streamed private final class BlockingStreamed[T]( - val process: T => Unit, - val done: Int => Unit, - val stream: () => Stream[T] + val process: T => Unit, + val done: Int => Unit, + val stream: () => Stream[T] ) // See scala.sys.process.Streamed @@ -70,7 +68,7 @@ object BlockingLineStream { private object Spawn { def apply(f: => Unit): Thread = apply(f, daemon = false) def apply(f: => Unit, daemon: Boolean): Thread = { - val thread = new Thread() { override def run() = { f } } + val thread = new Thread() { override def run() = f } thread.setDaemon(daemon) thread.start() thread @@ -79,7 +77,7 @@ object BlockingLineStream { def apply(command: Seq[String]): Stream[String] = { val streamed = BlockingStreamed[String](true) - val process = command.run(BasicIO(false, streamed.process, None)) + val process = command.run(BasicIO(false, streamed.process, None)) Spawn(streamed.done(process.exitValue())) streamed.stream() } @@ -87,16 +85,19 @@ object BlockingLineStream { trait DataGenerator extends Serializable { def generate( - sparkContext: SparkContext, - name: String, - partitions: Int, - scaleFactor: String): RDD[String] + sparkContext: SparkContext, + name: String, + partitions: Int, + scaleFactor: String + ): RDD[String] } - -abstract class Tables(sqlContext: SQLContext, scaleFactor: String, - useDoubleForDecimal: Boolean = false, useStringForDate: Boolean = false) - extends Serializable { +abstract class Tables( + sqlContext: SQLContext, + scaleFactor: String, + useDoubleForDecimal: Boolean = false, + useStringForDate: Boolean = false +) extends Serializable { def dataGenerator: DataGenerator def tables: Seq[Table] @@ -104,18 +105,17 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, private val log = LoggerFactory.getLogger(getClass) def sparkContext = sqlContext.sparkContext + val spark = sqlContext.sparkSession case class 
Table(name: String, partitionColumns: Seq[String], fields: StructField*) { val schema = StructType(fields) - def nonPartitioned: Table = { - Table(name, Nil, fields : _*) - } + def nonPartitioned: Table = + Table(name, Nil, fields: _*) - /** - * If convertToSchema is true, the data from generator will be parsed into columns and - * converted to `schema`. Otherwise, it just outputs the raw data (as a single STRING column). - */ + /** If convertToSchema is true, the data from generator will be parsed into columns and + * converted to `schema`. Otherwise, it just outputs the raw data (as a single STRING column). + */ def df(convertToSchema: Boolean, numPartition: Int) = { val generatedData = dataGenerator.generate(sparkContext, name, numPartition, scaleFactor) val rows = generatedData.mapPartitions { iter => @@ -138,9 +138,10 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, if (convertToSchema) { val stringData = - sqlContext.createDataFrame( + spark.createDataFrame( rows, - StructType(schema.fields.map(f => StructField(f.name, StringType)))) + StructType(schema.fields.map(f => StructField(f.name, StringType))) + ) val convertedData = { val columns = schema.fields.map { f => @@ -151,7 +152,7 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, convertedData } else { - sqlContext.createDataFrame(rows, StructType(Seq(StructField("value", StringType)))) + spark.createDataFrame(rows, StructType(Seq(StructField("value", StringType)))) } } @@ -159,33 +160,36 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, val newFields = fields.map { field => val newDataType = field.dataType match { case decimal: DecimalType if useDoubleForDecimal => DoubleType - case date: DateType if useStringForDate => StringType - case other => other + case date: DateType if useStringForDate => StringType + case other => other } field.copy(dataType = newDataType) } - Table(name, partitionColumns, newFields:_*) + Table(name, partitionColumns, 
newFields: _*) } def genData( - location: String, - format: String, - overwrite: Boolean, - clusterByPartitionColumns: Boolean, - filterOutNullPartitionValues: Boolean, - numPartitions: Int): Unit = { + location: String, + format: String, + overwrite: Boolean, + clusterByPartitionColumns: Boolean, + filterOutNullPartitionValues: Boolean, + numPartitions: Int + ): Unit = { val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Ignore - val data = df(format != "text", numPartitions) + val data = df(format != "text", numPartitions) val tempTableName = s"${name}_text" data.createOrReplaceTempView(tempTableName) val writer = if (partitionColumns.nonEmpty) { if (clusterByPartitionColumns) { - val columnString = data.schema.fields.map { field => - field.name - }.mkString(",") + val columnString = data.schema.fields + .map { field => + field.name + } + .mkString(",") val partitionColumnString = partitionColumns.mkString(",") val predicates = if (filterOutNullPartitionValues) { partitionColumns.map(col => s"$col IS NOT NULL").mkString("WHERE ", " AND ", "") @@ -203,7 +207,7 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, |DISTRIBUTE BY | $partitionColumnString """.stripMargin - val grouped = sqlContext.sql(query) + val grouped = spark.sql(query) println(s"Pre-clustering with partitioning columns with query $query.") log.info(s"Pre-clustering with partitioning columns with query $query.") grouped.write @@ -216,13 +220,18 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, // in case data has more than maxRecordsPerFile, split into multiple writers to improve datagen speed // files will be truncated to maxRecordsPerFile value, so the final result will be the same val numRows = data.count - val maxRecordPerFile = util.Try(sqlContext.getConf("spark.sql.files.maxRecordsPerFile").toInt).getOrElse(0) + val maxRecordPerFile = + util.Try(spark.conf.get("spark.sql.files.maxRecordsPerFile").toInt).getOrElse(0) - println(s"Data has $numRows 
rows clustered $clusterByPartitionColumns for $maxRecordPerFile") - log.info(s"Data has $numRows rows clustered $clusterByPartitionColumns for $maxRecordPerFile") + println( + s"Data has $numRows rows clustered $clusterByPartitionColumns for $maxRecordPerFile" + ) + log.info( + s"Data has $numRows rows clustered $clusterByPartitionColumns for $maxRecordPerFile" + ) if (maxRecordPerFile > 0 && numRows > maxRecordPerFile) { - val numFiles = (numRows.toDouble/maxRecordPerFile).ceil.toInt + val numFiles = (numRows.toDouble / maxRecordPerFile).ceil.toInt println(s"Coalescing into $numFiles files") log.info(s"Coalescing into $numFiles files") data.coalesce(numFiles).write @@ -235,49 +244,96 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, } writer.format(format).mode(mode) if (partitionColumns.nonEmpty) { - writer.partitionBy(partitionColumns : _*) + writer.partitionBy(partitionColumns: _*) } println(s"Generating table $name in database to $location with save mode $mode.") log.info(s"Generating table $name in database to $location with save mode $mode.") writer.save(location) - sqlContext.dropTempTable(tempTableName) + spark.catalog.dropTempView(tempTableName) } - def createExternalTable(location: String, format: String, databaseName: String, - overwrite: Boolean, discoverPartitions: Boolean = true): Unit = { - - val qualifiedTableName = databaseName + "." 
+ name - val tableExists = sqlContext.tableNames(databaseName).contains(name) + def createExternalTable( + location: String, + format: String, + databaseName: String, + overwrite: Boolean, + discoverPartitions: Boolean = true, + isPartitioned: Boolean = false + ): Unit = { + + val qualifiedTableName = s"`$databaseName`.`$name`" + val tableExists = spark.catalog.tableExists(databaseName, name) if (overwrite) { - sqlContext.sql(s"DROP TABLE IF EXISTS $databaseName.$name") + spark.sql(s"DROP TABLE IF EXISTS $qualifiedTableName") } if (!tableExists || overwrite) { - println(s"Creating external table $name in database $databaseName using data stored in $location.") - log.info(s"Creating external table $name in database $databaseName using data stored in $location.") - sqlContext.createExternalTable(qualifiedTableName, location, format) + println( + s"Creating external table $name in database $databaseName using data stored in $location." + ) + log.info( + s"Creating external table $name in database $databaseName using data stored in $location." + ) + + val ddlSchema = schema.toDDL + + // Only add PARTITIONED BY when the caller explicitly signals that data is stored + // in Hive-style col=value/ directories. For flat files (e.g. JSON, Parquet without + // partition directories), keep isPartitioned=false (the default) to avoid 0-row tables. 
+ val partitioningClause = if (isPartitioned && partitionColumns.nonEmpty) { + s"PARTITIONED BY (${partitionColumns.mkString("`", "`, `", "`")})" + } else { + "" + } + + val ddl = + s"""CREATE EXTERNAL TABLE IF NOT EXISTS $qualifiedTableName ($ddlSchema) + |USING $format + |$partitioningClause + |LOCATION '$location' + """.stripMargin + + spark.sql(ddl) } - if (partitionColumns.nonEmpty && discoverPartitions) { - println(s"Discovering partitions for table $name.") - log.info(s"Discovering partitions for table $name.") - sqlContext.sql(s"ALTER TABLE $databaseName.$name RECOVER PARTITIONS") + + val formatLower = format.toLowerCase + val skipRecover = Set("delta", "iceberg") + if ( + isPartitioned && partitionColumns.nonEmpty && discoverPartitions && !skipRecover.contains( + formatLower + ) + ) { + println(s"Attempting partition discovery for table $name.") + log.info(s"Attempting partition discovery for table $name.") + try { + spark.sql(s"MSCK REPAIR TABLE $qualifiedTableName") + println(s"Partition discovery succeeded for table $name.") + log.info(s"Partition discovery succeeded for table $name.") + } catch { + case e: Exception => + println( + s"[INFO] Partition discovery skipped for table $name " + + s"(data may be in flat files, not Hive-style col=value/ directories)." 
+ ) + log.info(s"Partition discovery skipped for $name: ${e.getMessage}") + } } } def createTemporaryTable(location: String, format: String): Unit = { println(s"Creating temporary table $name using data stored in $location.") log.info(s"Creating temporary table $name using data stored in $location.") - sqlContext.read.format(format).load(location).createOrReplaceTempView(name) + spark.read.format(format).load(location).createOrReplaceTempView(name) } def analyzeTable(databaseName: String, analyzeColumns: Boolean = false): Unit = { println(s"Analyzing table $name.") log.info(s"Analyzing table $name.") - sqlContext.sql(s"ANALYZE TABLE $databaseName.$name COMPUTE STATISTICS") + spark.sql(s"ANALYZE TABLE $databaseName.$name COMPUTE STATISTICS") if (analyzeColumns) { val allColumns = fields.map(_.name).mkString(", ") println(s"Analyzing table $name columns $allColumns.") log.info(s"Analyzing table $name columns $allColumns.") - sqlContext.sql(s"ANALYZE TABLE $databaseName.$name COMPUTE STATISTICS FOR COLUMNS $allColumns") + spark.sql(s"ANALYZE TABLE $databaseName.$name COMPUTE STATISTICS FOR COLUMNS $allColumns") } } } @@ -290,7 +346,8 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, clusterByPartitionColumns: Boolean, filterOutNullPartitionValues: Boolean, tableFilter: String = "", - numPartitions: Int = 100): Unit = { + numPartitions: Int = 100 + ): Unit = { var tablesToBeGenerated = if (partitionTables) { tables } else { @@ -306,13 +363,26 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, tablesToBeGenerated.foreach { table => val tableLocation = s"$location/${table.name}" - table.genData(tableLocation, format, overwrite, clusterByPartitionColumns, - filterOutNullPartitionValues, numPartitions) + table.genData( + tableLocation, + format, + overwrite, + clusterByPartitionColumns, + filterOutNullPartitionValues, + numPartitions + ) } } - def createExternalTables(location: String, format: String, databaseName: String, - 
overwrite: Boolean, discoverPartitions: Boolean, tableFilter: String = ""): Unit = { + def createExternalTables( + location: String, + format: String, + databaseName: String, + overwrite: Boolean, + discoverPartitions: Boolean, + tableFilter: String = "", + isPartitioned: Boolean = false + ): Unit = { val filtered = if (tableFilter.isEmpty) { tables @@ -320,12 +390,19 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, tables.filter(_.name == tableFilter) } - sqlContext.sql(s"CREATE DATABASE IF NOT EXISTS $databaseName") + spark.sql(s"CREATE DATABASE IF NOT EXISTS $databaseName") filtered.foreach { table => val tableLocation = s"$location/${table.name}" - table.createExternalTable(tableLocation, format, databaseName, overwrite, discoverPartitions) + table.createExternalTable( + tableLocation, + format, + databaseName, + overwrite, + discoverPartitions, + isPartitioned + ) } - sqlContext.sql(s"USE $databaseName") + spark.sql(s"USE $databaseName") println(s"The current database has been set to $databaseName.") log.info(s"The current database has been set to $databaseName.") } @@ -342,7 +419,11 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, } } - def analyzeTables(databaseName: String, analyzeColumns: Boolean = false, tableFilter: String = ""): Unit = { + def analyzeTables( + databaseName: String, + analyzeColumns: Boolean = false, + tableFilter: String = "" + ): Unit = { val filtered = if (tableFilter.isEmpty) { tables } else { @@ -353,5 +434,4 @@ abstract class Tables(sqlContext: SQLContext, scaleFactor: String, } } - } diff --git a/src/main/scala/com/databricks/spark/sql/perf/bigdata/BigData.scala b/src/main/scala/com/databricks/spark/sql/perf/bigdata/BigData.scala index e69de29b..454276f2 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/bigdata/BigData.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/bigdata/BigData.scala @@ -0,0 +1 @@ +package com.databricks.spark.sql.perf.bigdata diff --git 
a/src/main/scala/com/databricks/spark/sql/perf/bigdata/Queries.scala b/src/main/scala/com/databricks/spark/sql/perf/bigdata/Queries.scala index e1b1c69b..43bd70bd 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/bigdata/Queries.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/bigdata/Queries.scala @@ -16,7 +16,7 @@ package com.databricks.spark.sql.perf.bigdata -import com.databricks.spark.sql.perf.{ExecutionMode, Benchmark} +import com.databricks.spark.sql.perf.{Benchmark, ExecutionMode} trait Queries extends Benchmark { @@ -25,8 +25,7 @@ trait Queries extends Benchmark { val queries1to3 = Seq( Query( name = "q1A", - sqlText = - """ + sqlText = """ |SELECT | pageURL, | pageRank @@ -35,12 +34,11 @@ trait Queries extends Benchmark { | pageRank > 1000 """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q1B", - sqlText = - """ + sqlText = """ |SELECT | pageURL, | pageRank @@ -49,12 +47,11 @@ trait Queries extends Benchmark { | pageRank > 100 """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q1C", - sqlText = - """ + sqlText = """ |SELECT | pageURL, | pageRank @@ -63,12 +60,11 @@ trait Queries extends Benchmark { | pageRank > 10 """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q2A", - sqlText = - """ + sqlText = """ |SELECT | SUBSTR(sourceIP, 1, 8), | SUM(adRevenue) @@ -77,12 +73,11 @@ trait Queries extends Benchmark { | SUBSTR(sourceIP, 1, 8) """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q2B", - sqlText = - """ + sqlText = """ |SELECT | SUBSTR(sourceIP, 1, 10), | SUM(adRevenue) @@ -91,12 +86,11 @@ trait Queries extends Benchmark { | SUBSTR(sourceIP, 1, 10) """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = 
ForeachResults + ), Query( name = "q2C", - sqlText = - """ + sqlText = """ |SELECT | SUBSTR(sourceIP, 1, 12), | SUM(adRevenue) @@ -105,12 +99,11 @@ trait Queries extends Benchmark { | SUBSTR(sourceIP, 1, 12) """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q3A", - sqlText = - """ + sqlText = """ |SELECT sourceIP, totalRevenue, avgPageRank |FROM | (SELECT sourceIP, @@ -124,12 +117,11 @@ trait Queries extends Benchmark { |ORDER BY totalRevenue DESC LIMIT 1 """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q3B", - sqlText = - """ + sqlText = """ |SELECT sourceIP, totalRevenue, avgPageRank |FROM | (SELECT sourceIP, @@ -143,8 +135,8 @@ trait Queries extends Benchmark { |ORDER BY totalRevenue DESC LIMIT 1 """.stripMargin, description = "", - executionMode = ForeachResults), - + executionMode = ForeachResults + ), Query( name = "q3C", sqlText = """ @@ -161,6 +153,7 @@ trait Queries extends Benchmark { |ORDER BY totalRevenue DESC LIMIT 1 """.stripMargin, description = "", - executionMode = ForeachResults) + executionMode = ForeachResults + ) ) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/bigdata/Tables.scala b/src/main/scala/com/databricks/spark/sql/perf/bigdata/Tables.scala index e69de29b..454276f2 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/bigdata/Tables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/bigdata/Tables.scala @@ -0,0 +1 @@ +package com.databricks.spark.sql.perf.bigdata diff --git a/src/main/scala/com/databricks/spark/sql/perf/handleResults.scala b/src/main/scala/com/databricks/spark/sql/perf/handleResults.scala index a1c07de7..a5311d01 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/handleResults.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/handleResults.scala @@ -19,7 +19,7 @@ package com.databricks.spark.sql.perf import 
org.apache.spark.sql.SQLContext case class Results(resultsLocation: String, @transient sqlContext: SQLContext) { + val spark = sqlContext.sparkSession def allResults = - sqlContext.read.json( - sqlContext.sparkContext.textFile(s"$resultsLocation/*/")) + spark.read.json(spark.sparkContext.textFile(s"$resultsLocation/*/")) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/BenchmarkAlgorithm.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/BenchmarkAlgorithm.scala index 9e00a45e..dc7adf71 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/BenchmarkAlgorithm.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/BenchmarkAlgorithm.scala @@ -8,67 +8,58 @@ import org.apache.spark.sql.functions._ import com.databricks.spark.sql.perf._ -/** - * The description of a benchmark for an ML algorithm. It follows a simple, standard proceduce: - * - generate some test and training data - * - generate a model against the training data - * - score the model against the training data - * - score the model against the test data - * - * You should not assume that your implementation can carry state around. If some state is needed, - * consider adding it to the context. - * - * It is assumed that the implementation is going to be an object. - */ +/** The description of a benchmark for an ML algorithm. It follows a simple, standard proceduce: + * - generate some test and training data + * - generate a model against the training data + * - score the model against the training data + * - score the model against the test data + * + * You should not assume that your implementation can carry state around. If some state is needed, + * consider adding it to the context. + * + * It is assumed that the implementation is going to be an object. 
+ */ trait BenchmarkAlgorithm { def trainingDataSet(ctx: MLBenchContext): DataFrame def testDataSet(ctx: MLBenchContext): DataFrame - /** - * Create an [[Estimator]] or [[Transformer]] with params set from the given [[MLBenchContext]]. - */ + /** Create an [[Estimator]] or [[Transformer]] with params set from the given [[MLBenchContext]]. + */ def getPipelineStage(ctx: MLBenchContext): PipelineStage - /** - * The unnormalized score of the training procedure on a dataset. The normalization is - * performed by the caller. - * This calls `count()` on the transformed data to attempt to materialize the result for - * recording timing metrics. - */ + /** The unnormalized score of the training procedure on a dataset. The normalization is performed + * by the caller. This calls `count()` on the transformed data to attempt to materialize the + * result for recording timing metrics. + */ @throws[Exception]("if scoring fails") - def score( - ctx: MLBenchContext, - testSet: DataFrame, - model: Transformer): MLMetric = { + def score(ctx: MLBenchContext, testSet: DataFrame, model: Transformer): MLMetric = { val output = model.transform(testSet) // We create a useless UDF to make sure the entire DataFrame is instantiated. - val fakeUDF = udf { (_: Any) => 0 } + val fakeUDF = udf((_: Any) => 0) val columns = testSet.columns - output.select(sum(fakeUDF(struct(columns.map(col) : _*)))).first() + output.select(sum(fakeUDF(struct(columns.map(col): _*)))).first() MLMetric.Invalid } - def name: String = { + def name: String = this.getClass.getCanonicalName.replace("$", "") - } - /** - * Test additional methods for some algorithms. - * - * @param transformer The transformer which includes additional methods. - * @return A map which key is the additional method name, and value is a function which runs - * the corresponding method. 
- */ - def testAdditionalMethods( - ctx: MLBenchContext, - transformer: Transformer): Map[String, () => _] = Map.empty[String, () => _] + /** Test additional methods for some algorithms. + * + * @param transformer + * The transformer which includes additional methods. + * @return + * A map which key is the additional method name, and value is a function which runs the + * corresponding method. + */ + def testAdditionalMethods(ctx: MLBenchContext, transformer: Transformer): Map[String, () => _] = + Map.empty[String, () => _] } -/** - * Uses an evaluator to perform the scoring. - */ +/** Uses an evaluator to perform the scoring. + */ trait ScoringWithEvaluator { self: BenchmarkAlgorithm => @@ -77,9 +68,10 @@ trait ScoringWithEvaluator { final override def score( ctx: MLBenchContext, testSet: DataFrame, - model: Transformer): MLMetric = { + model: Transformer + ): MLMetric = { val results = model.transform(testSet) - val eval = evaluator(ctx) + val eval = evaluator(ctx) val metricName = if (eval.hasParam("metricName")) { val param = eval.getParam("metricName") eval.getOrDefault(param).toString @@ -91,10 +83,9 @@ trait ScoringWithEvaluator { } } -/** - * Builds the training set for an initial dataset and an initial model. Useful for validating a - * trained model against a given model. - */ +/** Builds the training set for an initial dataset and an initial model. Useful for validating a + * trained model against a given model. + */ trait TrainingSetFromTransformer { self: BenchmarkAlgorithm => @@ -104,8 +95,8 @@ trait TrainingSetFromTransformer { final override def trainingDataSet(ctx: MLBenchContext): DataFrame = { val initial = initialData(ctx) - val model = trueModel(ctx) - val fCol = col("features") + val model = trueModel(ctx) + val fCol = col("features") // Special case for the trees: we need to set the number of labels. // numClasses is set? We will add the number of classes to the final column. 
val lCol = ctx.params.numClasses match { @@ -124,9 +115,8 @@ trait TrainingSetFromTransformer { } } -/** - * The test data is the same as the training data. - */ +/** The test data is the same as the training data. + */ trait TestFromTraining { self: BenchmarkAlgorithm => @@ -145,4 +135,3 @@ trait TestFromTraining { self.trainingDataSet(ctx2) } } - diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchContext.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchContext.scala index b8971fe4..22a405bd 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchContext.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchContext.scala @@ -2,38 +2,33 @@ package com.databricks.spark.sql.perf.mllib import java.util.Random -import com.databricks.spark.sql.perf.{MLParams} -import org.apache.spark.sql.SQLContext +import com.databricks.spark.sql.perf.MLParams +import org.apache.spark.sql.{SQLContext, SparkSession} +/** All the information required to run a test. + * + * @param params + * @param sqlContext + */ +case class MLBenchContext(params: MLParams, sqlContext: SQLContext) { -/** - * All the information required to run a test. - * - * @param params - * @param sqlContext - */ -case class MLBenchContext( - params: MLParams, - sqlContext: SQLContext) { + val spark = sqlContext.sparkSession // Some seed fixed for the context. - private val internalSeed: Long = { + private val internalSeed: Long = params.randomSeed.map(_.toLong).getOrElse { throw new Exception("You need te specify the random seed") } - } - /** - * A fixed seed for this class. This function will always return the same value. - * - * @return - */ + /** A fixed seed for this class. This function will always return the same value. + * + * @return + */ def seed(): Long = internalSeed - /** - * Creates a new generator. The generator will always start with the same state. - * - * @return - */ + /** Creates a new generator. 
The generator will always start with the same state. + * + * @return + */ def newGenerator(): Random = new Random(seed()) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchmarks.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchmarks.scala index 13b5d143..87395675 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchmarks.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLBenchmarks.scala @@ -2,34 +2,33 @@ package com.databricks.spark.sql.perf.mllib import com.databricks.spark.sql.perf.mllib.classification.LogisticRegression import org.apache.spark.SparkContext -import org.apache.spark.sql.{SQLContext,SparkSession} +import org.apache.spark.sql.{SQLContext, SparkSession} -import com.databricks.spark.sql.perf.{MLParams} +import com.databricks.spark.sql.perf.MLParams import OptionImplicits._ -case class MLTest( - benchmark: BenchmarkAlgorithm, - params: MLParams) +case class MLTest(benchmark: BenchmarkAlgorithm, params: MLParams) // Example on how to create benchmarks using the API. object MLBenchmarks { // The list of standard benchmarks that we are going to run for ML. 
val benchmarks: Seq[MLTest] = List( - MLTest( - LogisticRegression, - new MLParams( - numFeatures = 10, - numExamples = 10, - numTestExamples = 10, - numPartitions = 3, - regParam = 1, - tol = 0.2) + MLTest( + LogisticRegression, + new MLParams( + numFeatures = 10, + numExamples = 10, + numTestExamples = 10, + numPartitions = 3, + regParam = 1, + tol = 0.2 ) + ) ) - val sparkSession = SparkSession.builder.getOrCreate() + val sparkSession = SparkSession.builder.getOrCreate() val sqlContext: SQLContext = sparkSession.sqlContext - val context = sqlContext.sparkContext + val context = sqlContext.sparkContext def benchmarkObjects: Seq[MLPipelineStageBenchmarkable] = benchmarks.map { mlb => new MLPipelineStageBenchmarkable(mlb.params, mlb.benchmark, sqlContext) diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala index c0bf70e0..e139bc32 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLLib.scala @@ -1,39 +1,35 @@ package com.databricks.spark.sql.perf.mllib - import scala.io.Source import scala.language.implicitConversions import org.slf4j.LoggerFactory -import org.apache.spark.sql.{DataFrame, SQLContext} +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} import org.apache.spark.{SparkConf, SparkContext} import com.databricks.spark.sql.perf._ +class MLLib(sqlContext: SQLContext) extends Benchmark(sqlContext) with Serializable { -class MLLib(sqlContext: SQLContext) - extends Benchmark(sqlContext) with Serializable { - - def this() = this(SQLContext.getOrCreate(SparkContext.getOrCreate())) + def this() = this(SparkSession.builder.getOrCreate().sqlContext) } object MLLib { - /** - * Runs a set of preprogrammed experiments and blocks on completion. - * - * @param runConfig a configuration that is av - * @return - */ + /** Runs a set of preprogrammed experiments and blocks on completion. 
+ * + * @param runConfig + * a configuration that is av + * @return + */ lazy val logger = LoggerFactory.getLogger(this.getClass.getName) def runDefault(runConfig: RunConfig): DataFrame = { - val ml = new MLLib() + val ml = new MLLib() val benchmarks = MLBenchmarks.benchmarkObjects - val e = ml.runExperiment( - executionsToRun = benchmarks) + val e = ml.runExperiment(executionsToRun = benchmarks) e.waitForFinish(1000 * 60 * 30) logger.info("Run finished") e.getCurrentResults() @@ -47,44 +43,43 @@ object MLLib { val smallConfig: String = getConfig("config/mllib-small.yaml") val largeConfig: String = getConfig("config/mllib-large.yaml") - /** - * Entry point for running ML tests. Expects a single command-line argument: the path to - * a YAML config file specifying which ML tests to run and their parameters. - * @param args command line args - */ + /** Entry point for running ML tests. Expects a single command-line argument: the path to a YAML + * config file specifying which ML tests to run and their parameters. 
+ * @param args + * command line args + */ def main(args: Array[String]): Unit = { val configFile = args(0) run(yamlFile = configFile) } - private[mllib] def getConf(yamlFile: String = null, yamlConfig: String = null): YamlConfig = { + private[mllib] def getConf(yamlFile: String = null, yamlConfig: String = null): YamlConfig = Option(yamlFile).map(YamlConfig.readFile).getOrElse { require(yamlConfig != null) YamlConfig.readString(yamlConfig) } - } private[mllib] def getBenchmarks(conf: YamlConfig): Seq[MLPipelineStageBenchmarkable] = { - val sqlContext = com.databricks.spark.sql.perf.mllib.MLBenchmarks.sqlContext + val sqlContext = com.databricks.spark.sql.perf.mllib.MLBenchmarks.sqlContext val benchmarksDescriptions = conf.runnableBenchmarks benchmarksDescriptions.map { mlb => new MLPipelineStageBenchmarkable(mlb.params, mlb.benchmark, sqlContext) } } - /** - * Runs all the experiments and blocks on completion - * - * @param yamlFile a file name - * @return - */ + /** Runs all the experiments and blocks on completion + * + * @param yamlFile + * a file name + * @return + */ def run(yamlFile: String = null, yamlConfig: String = null): DataFrame = { logger.info("Starting run") - val conf = getConf(yamlFile, yamlConfig) + val conf = getConf(yamlFile, yamlConfig) val sparkConf = new SparkConf().setAppName("MLlib QA").setMaster("local[2]") - val sc = SparkContext.getOrCreate(sparkConf) + val sc = SparkContext.getOrCreate(sparkConf) sc.setLogLevel("INFO") - val b = new com.databricks.spark.sql.perf.mllib.MLLib() + val b = new com.databricks.spark.sql.perf.mllib.MLLib() val benchmarks = getBenchmarks(conf) println(s"${benchmarks.size} benchmarks identified:") val str = benchmarks.map(_.prettyPrint).mkString("\n") @@ -94,7 +89,8 @@ object MLLib { executionsToRun = benchmarks, iterations = 1, // If you want to increase the number of iterations, add more seeds resultLocation = conf.output, - forkThread = false) + forkThread = false + ) 
e.waitForFinish(conf.timeout.toSeconds.toInt) logger.info("Run finished") e.getCurrentResults() diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala index 58b58919..7bb7965e 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/MLPipelineStageBenchmarkable.scala @@ -12,15 +12,16 @@ import com.databricks.spark.sql.perf._ class MLPipelineStageBenchmarkable( params: MLParams, test: BenchmarkAlgorithm, - sqlContext: SQLContext) - extends Benchmarkable with Serializable { + sqlContext: SQLContext +) extends Benchmarkable + with Serializable { import MLPipelineStageBenchmarkable._ - private var testData: DataFrame = null - private var trainingData: DataFrame = null + private var testData: DataFrame = null + private var trainingData: DataFrame = null private var testDataCount: Option[Long] = None - private val param = MLBenchContext(params, sqlContext) + private val param = MLBenchContext(params, sqlContext) override val name = test.name @@ -43,10 +44,11 @@ class MLPipelineStageBenchmarkable( } override protected def doBenchmark( - includeBreakdown: Boolean, - description: String, - messages: ArrayBuffer[String], - iteration: Int = 1): BenchmarkResult = { + includeBreakdown: Boolean, + description: String, + messages: ArrayBuffer[String], + iteration: Int = 1 + ): BenchmarkResult = try { val (trainingTime, model: Transformer) = measureTime { logger.info(s"$this: train: trainingSet=${trainingData.schema}") @@ -55,8 +57,11 @@ class MLPipelineStageBenchmarkable( case transformer: Transformer => transformer.transform(trainingData) transformer - case other: Any => throw new UnsupportedOperationException("Algorithm to benchmark must" + - s" be an estimator or transformer, found ${other.getClass} instead.") + case other: Any => + throw new 
UnsupportedOperationException( + "Algorithm to benchmark must" + + s" be an estimator or transformer, found ${other.getClass} instead." + ) } } logger.info(s"model: $model") @@ -64,30 +69,38 @@ class MLPipelineStageBenchmarkable( test.score(param, trainingData, model) } val metricTrainingTime = MLMetric("training.time", trainingTime.toMillis, false) - val metricTraining = MLMetric("training."+scoreTraining.metricName, + val metricTraining = MLMetric( + "training." + scoreTraining.metricName, scoreTraining.metricValue, - scoreTraining.isLargerBetter) + scoreTraining.isLargerBetter + ) val (scoreTestTime, scoreTest) = measureTime { test.score(param, testData, model) } val metricTestTime = MLMetric("test.time", scoreTestTime.toMillis, false) - val metricTest = MLMetric("test."+scoreTraining.metricName, + val metricTest = MLMetric( + "test." + scoreTraining.metricName, scoreTraining.metricValue, - scoreTraining.isLargerBetter) - - logger.info(s"$this doBenchmark: Trained model in ${trainingTime.toMillis / 1000.0}" + - s" s, Scored training dataset in ${scoreTrainTime.toMillis / 1000.0} s," + - s" test dataset in ${scoreTestTime.toMillis / 1000.0} s") - - val additionalTests = test.testAdditionalMethods(param, model).map { - tuple => - val (additionalMethodTime, _) = measureTime { tuple._2() } + scoreTraining.isLargerBetter + ) + + logger.info( + s"$this doBenchmark: Trained model in ${trainingTime.toMillis / 1000.0}" + + s" s, Scored training dataset in ${scoreTrainTime.toMillis / 1000.0} s," + + s" test dataset in ${scoreTestTime.toMillis / 1000.0} s" + ) + + val additionalTests = test + .testAdditionalMethods(param, model) + .map { tuple => + val (additionalMethodTime, _) = measureTime(tuple._2()) MLMetric(tuple._1, additionalMethodTime.toMillis, false) - }.toArray + } + .toArray val mlMetrics = Array(metricTrainingTime, metricTraining, metricTestTime, metricTest) ++ additionalTests - val paramsMap = params.toMap + val paramsMap = params.toMap val benchmarkId = 
name.split('.').last + "_" + paramsMap.hashCode.abs BenchmarkResult( @@ -96,27 +109,27 @@ class MLPipelineStageBenchmarkable( parameters = paramsMap, executionTime = Some(trainingTime.toMillis), mlResult = Some(mlMetrics), - benchmarkId = Some(benchmarkId)) + benchmarkId = Some(benchmarkId) + ) } catch { case e: Exception => BenchmarkResult( name = name, mode = executionMode.toString, parameters = params.toMap, - failure = Some(Failure(e.getClass.getSimpleName, - e.getMessage + ":\n" + e.getStackTraceString))) + failure = + Some(Failure(e.getClass.getSimpleName, e.getMessage + ":\n" + e.getStackTraceString)) + ) } finally { Option(testData).map(_.unpersist()) Option(trainingData).map(_.unpersist()) } - } def prettyPrint: String = { val paramString = pprint(params).mkString("\n") s"$test\n$paramString" } - } object MLPipelineStageBenchmarkable { @@ -124,15 +137,14 @@ object MLPipelineStageBenchmarkable { val m = getCCParams(p) m.flatMap { case (key, Some(value: Any)) => Some(s" $key=$value") - case _ => None - } .toSeq + case _ => None + }.toSeq } // From http://stackoverflow.com/questions/1226555/case-class-to-map-in-scala private def getCCParams(cc: AnyRef) = - (Map[String, Any]() /: cc.getClass.getDeclaredFields) {(a, f) => + (Map[String, Any]() /: cc.getClass.getDeclaredFields) { (a, f) => f.setAccessible(true) a + (f.getName -> f.get(cc)) } } - diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/OptionImplicits.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/OptionImplicits.scala index ef905258..15169c5b 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/OptionImplicits.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/OptionImplicits.scala @@ -2,32 +2,29 @@ package com.databricks.spark.sql.perf.mllib import scala.language.implicitConversions -/** - * Implicits to transparently convert some Option[X] to X and vice-versa. 
- * - * This is usually dangerous to do, but in our case, the config is expressed through Options and - * it alleviates the need to manually box values. - */ +/** Implicits to transparently convert some Option[X] to X and vice-versa. + * + * This is usually dangerous to do, but in our case, the config is expressed through Options and it + * alleviates the need to manually box values. + */ object OptionImplicits { // The following implicits are unrolled for safety: private def oX2X[A](x: Option[A]): A = x.get - def checkLong(x: Option[Long]): Option[Long] = { + def checkLong(x: Option[Long]): Option[Long] = x.asInstanceOf[Option[Any]] match { case Some(u: java.lang.Integer) => Some(u.toLong) - case Some(u: java.lang.Long) => Some(u.toLong) - case _ => x + case Some(u: java.lang.Long) => Some(u.toLong) + case _ => x } - } - def checkDouble(x: Option[Double]): Option[Double] = { + def checkDouble(x: Option[Double]): Option[Double] = x.asInstanceOf[Option[Any]] match { case Some(u: java.lang.Integer) => Some(u.toDouble) - case Some(u: java.lang.Long) => Some(u.toDouble) - case Some(u: java.lang.Double) => Some(u.toDouble) - case _ => x + case Some(u: java.lang.Long) => Some(u.toDouble) + case Some(u: java.lang.Double) => Some(u.toDouble) + case _ => x } - } implicit def oD2D(x: Option[Double]): Double = oX2X(x) @@ -37,9 +34,9 @@ object OptionImplicits { implicit def oL2L(x: Option[Long]): Long = oX2X(x) - implicit def l2lo(x: Long): Option[Long] = checkLong(Option(x)) - implicit def i2lo(x: Int): Option[Long] = Option(x.toLong) - implicit def i2io(x: Int): Option[Int] = Option(x) + implicit def l2lo(x: Long): Option[Long] = checkLong(Option(x)) + implicit def i2lo(x: Int): Option[Long] = Option(x.toLong) + implicit def i2io(x: Int): Option[Int] = Option(x) implicit def d2do(x: Double): Option[Double] = Option(x) - implicit def i2do(x: Int): Option[Double] = Option(x) -} \ No newline at end of file + implicit def i2do(x: Int): Option[Double] = Option(x) +} diff --git 
a/src/main/scala/com/databricks/spark/sql/perf/mllib/ReflectionUtils.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/ReflectionUtils.scala index 75a29496..7e688a84 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/ReflectionUtils.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/ReflectionUtils.scala @@ -6,16 +6,13 @@ import scala.reflect.runtime.universe._ /** Exposes methods to simplify implementation of classes like MLParams. */ private[perf] object ReflectionUtils { - private def getConstructor[T: TypeTag: ClassTag](obj: T): MethodSymbol = { + private def getConstructor[T: TypeTag: ClassTag](obj: T): MethodSymbol = typeOf[T].declaration(nme.CONSTRUCTOR).asMethod - } - /** - * Given an instance [[obj]] of a class whose constructor arguments are all of type Option[Any], - * returns a map of key-value pairs (argName -> argValue) where argName is the name - * of a constructor argument with a defined (not None) value and argValue is the corresponding - * value. - */ + /** Given an instance [[obj]] of a class whose constructor arguments are all of type Option[Any], + * returns a map of key-value pairs (argName -> argValue) where argName is the name of a + * constructor argument with a defined (not None) value and argValue is the corresponding value. 
+ */ def getConstructorArgs[T: TypeTag: ClassTag](obj: T): Map[String, Any] = { // Get constructor of passed-in instance val constructor = getConstructor(obj) @@ -23,15 +20,18 @@ private[perf] object ReflectionUtils { constructor.paramss.flatten.flatMap { (param: Symbol) => // Get name and value of the constructor argument val paramName = param.name.toString - val getter = obj.getClass.getDeclaredField(paramName) + val getter = obj.getClass.getDeclaredField(paramName) getter.setAccessible(true) val paramValue = getter.get(obj) // If the constructor argument is defined, include it in our output map paramValue match { case value: Option[Any] => if (value.isDefined) Seq(paramName -> paramValue) else Seq.empty - case _ => throw new UnsupportedOperationException("ReflectionUtils.getConstructorArgs " + - "can only be called on instances of classes whose constructor arguments are all of " + - s"type Option[Any]; constructor argument ${paramName} had invalid type.") + case _ => + throw new UnsupportedOperationException( + "ReflectionUtils.getConstructorArgs " + + "can only be called on instances of classes whose constructor arguments are all of " + + s"type Option[Any]; constructor argument $paramName had invalid type." 
+ ) } }.toMap } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/TreeOrForestEstimator.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/TreeOrForestEstimator.scala index 0bf8b536..e43728d6 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/TreeOrForestEstimator.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/TreeOrForestEstimator.scala @@ -1,8 +1,11 @@ package com.databricks.spark.sql.perf.mllib import org.apache.spark.ml.{ModelBuilderSSP, Transformer, TreeUtils} -import org.apache.spark.ml.evaluation.{Evaluator, MulticlassClassificationEvaluator, - RegressionEvaluator} +import org.apache.spark.ml.evaluation.{ + Evaluator, + MulticlassClassificationEvaluator, + RegressionEvaluator +} import org.apache.spark.sql.DataFrame import com.databricks.spark.sql.perf.mllib.OptionImplicits._ @@ -10,14 +13,21 @@ import com.databricks.spark.sql.perf.mllib.data.DataGenerator /** Base trait for BenchmarkAlgorithm objects testing a tree or forest estimator */ private[mllib] trait TreeOrForestEstimator - extends TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { + extends TestFromTraining + with TrainingSetFromTransformer + with ScoringWithEvaluator { self: BenchmarkAlgorithm => override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ val featureArity: Array[Int] = TreeOrForestEstimator.getFeatureArity(ctx) - val data: DataFrame = DataGenerator.generateMixedFeatures(ctx.sqlContext, numExamples, - ctx.seed(), numPartitions, featureArity) + val data: DataFrame = DataGenerator.generateMixedFeatures( + ctx.sqlContext, + numExamples, + ctx.seed(), + numPartitions, + featureArity + ) TreeUtils.setMetadata(data, "features", featureArity) } } @@ -26,49 +36,50 @@ private[mllib] trait TreeOrForestEstimator private[mllib] trait TreeOrForestClassifier extends TreeOrForestEstimator { self: BenchmarkAlgorithm => - override protected def evaluator(ctx: MLBenchContext): Evaluator = { + override 
protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() - } - override protected def trueModel(ctx: MLBenchContext): Transformer = { - ModelBuilderSSP.newDecisionTreeClassificationModel(ctx.params.depth, ctx.params.numClasses, - TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed()) - } + override protected def trueModel(ctx: MLBenchContext): Transformer = + ModelBuilderSSP.newDecisionTreeClassificationModel( + ctx.params.depth, + ctx.params.numClasses, + TreeOrForestEstimator.getFeatureArity(ctx), + ctx.seed() + ) } /** Base trait for BenchmarkAlgorithm objects testing a tree or forest regressor */ private[mllib] trait TreeOrForestRegressor extends TreeOrForestEstimator { self: BenchmarkAlgorithm => - override protected def evaluator(ctx: MLBenchContext): Evaluator = { + override protected def evaluator(ctx: MLBenchContext): Evaluator = new RegressionEvaluator() - } - override protected def trueModel(ctx: MLBenchContext): Transformer = { - ModelBuilderSSP.newDecisionTreeRegressionModel(ctx.params.depth, - TreeOrForestEstimator.getFeatureArity(ctx), ctx.seed()) - } + override protected def trueModel(ctx: MLBenchContext): Transformer = + ModelBuilderSSP.newDecisionTreeRegressionModel( + ctx.params.depth, + TreeOrForestEstimator.getFeatureArity(ctx), + ctx.seed() + ) } private[mllib] object TreeOrForestEstimator { - /** - * Get feature arity for tree and tree ensemble tests. - * Currently, this is hard-coded as: - * - 1/4 binary features - * - 1/4 high-arity (20-category) features - * - 1/2 continuous features - * - * @return Array of length numFeatures, where 0 indicates continuous feature and - * value > 0 indicates a categorical feature of that arity. - */ + /** Get feature arity for tree and tree ensemble tests. 
Currently, this is hard-coded as: + * - 1/4 binary features + * - 1/4 high-arity (20-category) features + * - 1/2 continuous features + * + * @return + * Array of length numFeatures, where 0 indicates continuous feature and value > 0 indicates a + * categorical feature of that arity. + */ def getFeatureArity(ctx: MLBenchContext): Array[Int] = { - val numFeatures = ctx.params.numFeatures + val numFeatures = ctx.params.numFeatures val fourthFeatures = numFeatures / 4 - Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical - Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical + Array.fill[Int](fourthFeatures)(2) ++ // low-arity categorical + Array.fill[Int](fourthFeatures)(20) ++ // high-arity categorical Array.fill[Int](numFeatures - 2 * fourthFeatures)(0) // continuous } } - diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/GBTClassification.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/GBTClassification.scala index 9580fea7..4f14633e 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/GBTClassification.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/GBTClassification.scala @@ -14,8 +14,12 @@ object GBTClassification extends BenchmarkAlgorithm with TreeOrForestClassifier import ctx.params._ // We add +1 to the depth to make it more likely that many iterations of boosting are needed // to model the true tree. 
- ModelBuilderSSP.newDecisionTreeClassificationModel(depth + 1, numClasses, getFeatureArity(ctx), - ctx.seed()) + ModelBuilderSSP.newDecisionTreeClassificationModel( + depth + 1, + numClasses, + getFeatureArity(ctx), + ctx.seed() + ) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LinearSVC.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LinearSVC.scala index 08f139c2..1d536b4a 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LinearSVC.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LinearSVC.scala @@ -9,8 +9,11 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator -object LinearSVC extends BenchmarkAlgorithm - with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { +object LinearSVC + extends BenchmarkAlgorithm + with TestFromTraining + with TrainingSetFromTransformer + with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ @@ -19,7 +22,8 @@ object LinearSVC extends BenchmarkAlgorithm numExamples, ctx.seed(), numPartitions, - numFeatures) + numFeatures + ) } override protected def trueModel(ctx: MLBenchContext): Transformer = { @@ -42,4 +46,3 @@ object LinearSVC extends BenchmarkAlgorithm override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() } - diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LogisticRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LogisticRegression.scala index 67f0ef62..a382e855 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LogisticRegression.scala +++ 
b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/LogisticRegression.scala @@ -8,9 +8,11 @@ import org.apache.spark.ml.{Estimator, ModelBuilderSSP, PipelineStage, Transform import org.apache.spark.ml import org.apache.spark.ml.linalg.Vectors - -object LogisticRegression extends BenchmarkAlgorithm - with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { +object LogisticRegression + extends BenchmarkAlgorithm + with TestFromTraining + with TrainingSetFromTransformer + with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ @@ -19,7 +21,8 @@ object LogisticRegression extends BenchmarkAlgorithm numExamples, ctx.seed(), numPartitions, - numFeatures) + numFeatures + ) } override protected def trueModel(ctx: MLBenchContext): Transformer = { @@ -42,4 +45,3 @@ object LogisticRegression extends BenchmarkAlgorithm override protected def evaluator(ctx: MLBenchContext): Evaluator = new MulticlassClassificationEvaluator() } - diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/NaiveBayes.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/NaiveBayes.scala index 6d648f52..34be5ce4 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/NaiveBayes.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/NaiveBayes.scala @@ -10,8 +10,11 @@ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator /** Object containing methods used in performance tests for (multinomial) NaiveBayesModels */ -object NaiveBayes extends BenchmarkAlgorithm - with TestFromTraining with TrainingSetFromTransformer with ScoringWithEvaluator { +object NaiveBayes + extends BenchmarkAlgorithm + with TestFromTraining + with TrainingSetFromTransformer + with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ @@ -25,7 +28,8 @@ 
object NaiveBayes extends BenchmarkAlgorithm numExamples, ctx.seed(), numPartitions, - featureArity) + featureArity + ) } override protected def trueModel(ctx: MLBenchContext): Transformer = { @@ -35,21 +39,23 @@ object NaiveBayes extends BenchmarkAlgorithm // theta = log of class conditional probabilities, whose dimension is C (number of classes) // by D (number of features) val unnormalizedProbs = 0.until(numClasses).map(_ => rng.nextDouble() + 1e-5).toArray - val logProbSum = math.log(unnormalizedProbs.sum) - val piArray = unnormalizedProbs.map(prob => math.log(prob) - logProbSum) + val logProbSum = math.log(unnormalizedProbs.sum) + val piArray = unnormalizedProbs.map(prob => math.log(prob) - logProbSum) // For class i, set the class-conditional probability of feature i to 0.7, and split up the // remaining probability mass across the other features val currClassProb = 0.7 - val thetaArray = Array.tabulate(numClasses) { i: Int => - val baseProbMass = (1 - currClassProb) / (numFeatures - 1) - val probs = Array.fill[Double](numFeatures)(baseProbMass) - probs(i) = currClassProb - probs - }.map(_.map(math.log)) + val thetaArray = Array + .tabulate(numClasses) { i: Int => + val baseProbMass = (1 - currClassProb) / (numFeatures - 1) + val probs = Array.fill[Double](numFeatures)(baseProbMass) + probs(i) = currClassProb + probs + } + .map(_.map(math.log)) // Initialize new Naive Bayes model - val pi = Vectors.dense(piArray) + val pi = Vectors.dense(piArray) val theta = new DenseMatrix(numClasses, numFeatures, thetaArray.flatten, true) ModelBuilderSSP.newNaiveBayesModel(pi, theta) diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/RandomForestClassification.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/RandomForestClassification.scala index cfb1a953..1985d5b6 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/RandomForestClassification.scala +++ 
b/src/main/scala/com/databricks/spark/sql/perf/mllib/classification/RandomForestClassification.scala @@ -6,7 +6,6 @@ import org.apache.spark.ml.classification.RandomForestClassifier import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ - object RandomForestClassification extends BenchmarkAlgorithm with TreeOrForestClassifier { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/GaussianMixture.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/GaussianMixture.scala index 3c684a7b..10f77065 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/GaussianMixture.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/GaussianMixture.scala @@ -12,9 +12,14 @@ object GaussianMixture extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ - DataGenerator.generateGaussianMixtureData(ctx.sqlContext, numCenters = k, - numExamples = numExamples, seed = ctx.seed(), numPartitions = numPartitions, - numFeatures = numFeatures) + DataGenerator.generateGaussianMixtureData( + ctx.sqlContext, + numCenters = k, + numExamples = numExamples, + seed = ctx.seed(), + numPartitions = numPartitions, + numFeatures = numFeatures + ) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/KMeans.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/KMeans.scala index 9b2f2331..a8d43427 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/KMeans.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/KMeans.scala @@ -1,20 +1,25 @@ package com.databricks.spark.sql.perf.mllib.clustering import org.apache.spark.ml -import org.apache.spark.ml.{PipelineStage} +import 
org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} - object KMeans extends BenchmarkAlgorithm with TestFromTraining { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ - DataGenerator.generateGaussianMixtureData(ctx.sqlContext, k, numExamples, ctx.seed(), - numPartitions, numFeatures) + DataGenerator.generateGaussianMixtureData( + ctx.sqlContext, + k, + numExamples, + ctx.seed(), + numPartitions, + numFeatures + ) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala index dfe9a2bc..ce4cea13 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/clustering/LDA.scala @@ -13,7 +13,6 @@ import org.apache.spark.ml.linalg.{Vector, Vectors} import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ - object LDA extends BenchmarkAlgorithm with TestFromTraining { // The LDA model is package private, no need to expose it. 
@@ -24,13 +23,13 @@ object LDA extends BenchmarkAlgorithm with TestFromTraining { numPartitions ) val seed: Int = randomSeed - val docLen = docLength.get - val numVocab = vocabSize.get + val docLen = docLength.get + val numVocab = vocabSize.get val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) => val rng = new Well19937c(seed ^ idx) partition.map { docIndex => var currentSize = 0 - val entries = MHashMap[Int, Int]() + val entries = MHashMap[Int, Int]() while (currentSize < docLen) { val index = rng.nextInt(numVocab) entries(index) = entries.getOrElse(index, 0) + 1 diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/ItemSetGenerator.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/ItemSetGenerator.scala index ec47b873..c04ba75d 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/ItemSetGenerator.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/ItemSetGenerator.scala @@ -4,17 +4,15 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark.mllib.random.{PoissonGenerator, RandomDataGenerator} -class ItemSetGenerator( - val numItems: Int, - val avgItemSetSize: Int) - extends RandomDataGenerator[Array[String]] { +class ItemSetGenerator(val numItems: Int, val avgItemSetSize: Int) + extends RandomDataGenerator[Array[String]] { assert(avgItemSetSize > 2) assert(numItems > 2) - private val rng = new java.util.Random() + private val rng = new java.util.Random() private val itemSetSizeRng = new PoissonGenerator(avgItemSetSize - 2) - private val itemRng = new PoissonGenerator(numItems / 2.0) + private val itemRng = new PoissonGenerator(numItems / 2.0) override def setSeed(seed: Long) { rng.setSeed(seed) @@ -24,15 +22,18 @@ class ItemSetGenerator( override def nextValue(): Array[String] = { // 1. 
generate size of itemset - val size = DataGenUtil.nextPoisson(itemSetSizeRng, v => v >= 1 && v <= numItems).toInt + val size = DataGenUtil.nextPoisson(itemSetSizeRng, v => v >= 1 && v <= numItems).toInt val arrayBuff = new ArrayBuffer[Int](size + 2) // 2. generate items in the itemset var i = 0 while (i < size) { - val nextVal = DataGenUtil.nextPoisson(itemRng, (item: Double) => { - item >= 0 && item < numItems && !arrayBuff.contains(item) - }).toInt + val nextVal = DataGenUtil + .nextPoisson( + itemRng, + (item: Double) => item >= 0 && item < numItems && !arrayBuff.contains(item) + ) + .toInt arrayBuff.append(nextVal) i += 1 } @@ -54,6 +55,5 @@ class ItemSetGenerator( arrayBuff.map(_.toString).toArray } - override def copy(): ItemSetGenerator - = new ItemSetGenerator(numItems, avgItemSetSize) + override def copy(): ItemSetGenerator = new ItemSetGenerator(numItems, avgItemSetSize) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/RatingGenerator.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/RatingGenerator.scala index b1b197ac..58aac5d9 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/RatingGenerator.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/RatingGenerator.scala @@ -8,20 +8,20 @@ import scala.collection.mutable class RatingGenerator( private val numUsers: Int, private val numProducts: Int, - private val implicitPrefs: Boolean) extends RandomDataGenerator[Rating[Int]] { + private val implicitPrefs: Boolean +) extends RandomDataGenerator[Rating[Int]] { private val rng = new java.util.Random() private val observed = new mutable.HashMap[(Int, Int), Boolean]() override def nextValue(): Rating[Int] = { - var tuple = (rng.nextInt(numUsers),rng.nextInt(numProducts)) - while (observed.getOrElse(tuple,false)){ - tuple = (rng.nextInt(numUsers),rng.nextInt(numProducts)) - } + var tuple = (rng.nextInt(numUsers), rng.nextInt(numProducts)) + while (observed.getOrElse(tuple, false)) + tuple = 
(rng.nextInt(numUsers), rng.nextInt(numProducts)) observed += (tuple -> true) - val rating = if (implicitPrefs) rng.nextInt(2)*1.0 else rng.nextDouble()*5 + val rating = if (implicitPrefs) rng.nextInt(2) * 1.0 else rng.nextDouble() * 5 new Rating(tuple._1, tuple._2, rating.toFloat) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala index d2838156..7606096b 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/data/dataGeneration.scala @@ -14,44 +14,61 @@ object DataGenerator { numExamples: Long, seed: Long, numPartitions: Int, - numFeatures: Int): DataFrame = { + numFeatures: Int + ): DataFrame = { val featureArity = Array.fill[Int](numFeatures)(0) - val rdd: RDD[Vector] = RandomRDDs.randomRDD(sql.sparkContext, - new FeaturesGenerator(featureArity), numExamples, numPartitions, seed) - sql.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") + val rdd: RDD[Vector] = RandomRDDs.randomRDD( + sql.sparkSession.sparkContext, + new FeaturesGenerator(featureArity), + numExamples, + numPartitions, + seed + ) + sql.sparkSession.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") } - /** - * Generate a mix of continuous and categorical features. - * @param featureArity Array of length numFeatures, where 0 indicates a continuous feature and - * a value > 0 indicates a categorical feature with that arity. - */ + /** Generate a mix of continuous and categorical features. + * @param featureArity + * Array of length numFeatures, where 0 indicates a continuous feature and a value > 0 + * indicates a categorical feature with that arity. 
+ */ def generateMixedFeatures( sql: SQLContext, numExamples: Long, seed: Long, numPartitions: Int, - featureArity: Array[Int]): DataFrame = { - val rdd: RDD[Vector] = RandomRDDs.randomRDD(sql.sparkContext, - new FeaturesGenerator(featureArity), numExamples, numPartitions, seed) - sql.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") + featureArity: Array[Int] + ): DataFrame = { + val rdd: RDD[Vector] = RandomRDDs.randomRDD( + sql.sparkSession.sparkContext, + new FeaturesGenerator(featureArity), + numExamples, + numPartitions, + seed + ) + sql.sparkSession.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") } - /** - * Generate data from a Gaussian mixture model. - * @param numCenters Number of clusters in mixture - */ + /** Generate data from a Gaussian mixture model. + * @param numCenters + * Number of clusters in mixture + */ def generateGaussianMixtureData( sql: SQLContext, numCenters: Int, numExamples: Long, seed: Long, numPartitions: Int, - numFeatures: Int): DataFrame = { - val rdd: RDD[Vector] = RandomRDDs.randomRDD(sql.sparkContext, - new GaussianMixtureDataGenerator(numCenters, numFeatures, seed), numExamples, numPartitions, - seed) - sql.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") + numFeatures: Int + ): DataFrame = { + val rdd: RDD[Vector] = RandomRDDs.randomRDD( + sql.sparkSession.sparkContext, + new GaussianMixtureDataGenerator(numCenters, numFeatures, seed), + numExamples, + numPartitions, + seed + ) + sql.sparkSession.createDataFrame(rdd.map(Tuple1.apply)).toDF("features") } def generateRatings( @@ -62,20 +79,31 @@ object DataGenerator { numTestExamples: Long, implicitPrefs: Boolean, numPartitions: Int, - seed: Long): (DataFrame, DataFrame) = { - - val sc = sql.sparkContext - val train = RandomRDDs.randomRDD(sc, + seed: Long + ): (DataFrame, DataFrame) = { + + val sc = sql.sparkSession.sparkContext + val train = RandomRDDs + .randomRDD( + sc, + new RatingGenerator(numUsers, numProducts, implicitPrefs), + numExamples, + 
numPartitions, + seed + ) + .cache() + + val test = RandomRDDs.randomRDD( + sc, new RatingGenerator(numUsers, numProducts, implicitPrefs), - numExamples, numPartitions, seed).cache() - - val test = RandomRDDs.randomRDD(sc, - new RatingGenerator(numUsers, numProducts, implicitPrefs), - numTestExamples, numPartitions, seed + 24) + numTestExamples, + numPartitions, + seed + 24 + ) // Now get rid of duplicate ratings and remove non-existant userID's // and prodID's from the test set - val commons: PairRDDFunctions[(Int,Int),Rating[Int]] = + val commons: PairRDDFunctions[(Int, Int), Rating[Int]] = new PairRDDFunctions(train.keyBy(rating => (rating.user, rating.item)).cache()) val exact = commons.join(test.keyBy(rating => (rating.user, rating.item))) @@ -83,15 +111,15 @@ object DataGenerator { val trainPruned = commons.subtractByKey(exact).map(_._2).cache() // Now get rid of users that don't exist in the train set - val trainUsers: RDD[(Int,Rating[Int])] = trainPruned.keyBy(rating => rating.user) - val testUsers: PairRDDFunctions[Int,Rating[Int]] = + val trainUsers: RDD[(Int, Rating[Int])] = trainPruned.keyBy(rating => rating.user) + val testUsers: PairRDDFunctions[Int, Rating[Int]] = new PairRDDFunctions(test.keyBy(rating => rating.user)) val testWithAdditionalUsers = testUsers.subtractByKey(trainUsers) - val userPrunedTestProds: RDD[(Int,Rating[Int])] = + val userPrunedTestProds: RDD[(Int, Rating[Int])] = testUsers.subtractByKey(testWithAdditionalUsers).map(_._2).keyBy(rating => rating.item) - val trainProds: RDD[(Int,Rating[Int])] = trainPruned.keyBy(rating => rating.item) + val trainProds: RDD[(Int, Rating[Int])] = trainPruned.keyBy(rating => rating.item) val testWithAdditionalProds = new PairRDDFunctions[Int, Rating[Int]](userPrunedTestProds).subtractByKey(trainProds) @@ -100,7 +128,7 @@ object DataGenerator { .subtractByKey(testWithAdditionalProds) .map(_._2) - (sql.createDataFrame(trainPruned), sql.createDataFrame(finalTest)) + 
(sql.sparkSession.createDataFrame(trainPruned), sql.sparkSession.createDataFrame(finalTest)) } def generateRandString( @@ -109,10 +137,16 @@ object DataGenerator { seed: Long, numPartitions: Int, distinctCount: Int, - dataColName: String): DataFrame = { - val rdd: RDD[String] = RandomRDDs.randomRDD(sql.sparkContext, - new RandStringGenerator(distinctCount), numExamples, numPartitions, seed) - sql.createDataFrame(rdd.map(Tuple1.apply)).toDF(dataColName) + dataColName: String + ): DataFrame = { + val rdd: RDD[String] = RandomRDDs.randomRDD( + sql.sparkSession.sparkContext, + new RandStringGenerator(distinctCount), + numExamples, + numPartitions, + seed + ) + sql.sparkSession.createDataFrame(rdd.map(Tuple1.apply)).toDF(dataColName) } def generateDoc( @@ -122,10 +156,16 @@ object DataGenerator { numPartitions: Int, vocabSize: Int, avgDocLength: Int, - dataColName: String): DataFrame = { - val rdd: RDD[String] = RandomRDDs.randomRDD(sql.sparkContext, - new DocGenerator(vocabSize, avgDocLength), numExamples, numPartitions, seed) - sql.createDataFrame(rdd.map(Tuple1.apply)).toDF(dataColName) + dataColName: String + ): DataFrame = { + val rdd: RDD[String] = RandomRDDs.randomRDD( + sql.sparkSession.sparkContext, + new DocGenerator(vocabSize, avgDocLength), + numExamples, + numPartitions, + seed + ) + sql.sparkSession.createDataFrame(rdd.map(Tuple1.apply)).toDF(dataColName) } def generateItemSet( @@ -134,42 +174,44 @@ object DataGenerator { seed: Long, numPartitions: Int, numItems: Int, - avgItemSetSize: Int): DataFrame = { + avgItemSetSize: Int + ): DataFrame = { val rdd: RDD[Array[String]] = RandomRDDs.randomRDD( - sql.sparkContext, + sql.sparkSession.sparkContext, new ItemSetGenerator(numItems, avgItemSetSize), numExamples, numPartitions, - seed) - sql.createDataFrame(rdd.map(Tuple1.apply)).toDF("items") + seed + ) + sql.sparkSession.createDataFrame(rdd.map(Tuple1.apply)).toDF("items") } } - -/** - * Generator for a feature vector which can include a mix of categorical 
and continuous features. - * - * @param featureArity Length numFeatures, where 0 indicates continuous feature and > 0 - * indicates a categorical feature of that arity. - */ -class FeaturesGenerator(val featureArity: Array[Int]) - extends RandomDataGenerator[Vector] { +/** Generator for a feature vector which can include a mix of categorical and continuous features. + * + * @param featureArity + * Length numFeatures, where 0 indicates continuous feature and > 0 indicates a categorical + * feature of that arity. + */ +class FeaturesGenerator(val featureArity: Array[Int]) extends RandomDataGenerator[Vector] { featureArity.foreach { arity => - require(arity >= 0, s"FeaturesGenerator given categorical arity = $arity, " + - s"but arity should be >= 0.") + require( + arity >= 0, + s"FeaturesGenerator given categorical arity = $arity, " + + s"but arity should be >= 0." + ) } val numFeatures = featureArity.length private val rng = new java.util.Random() - /** - * Generates vector with features in the order given by [[featureArity]] - */ + /** Generates vector with features in the order given by [[featureArity]] + */ override def nextValue(): Vector = { val arr = new Array[Double](numFeatures) - var j = 0 + var j = 0 while (j < featureArity.length) { if (featureArity(j) == 0) arr(j) = 2 * rng.nextDouble() - 1 // centered uniform data @@ -187,37 +229,33 @@ class FeaturesGenerator(val featureArity: Array[Int]) override def copy(): FeaturesGenerator = new FeaturesGenerator(featureArity) } +/** Generate data from a Gaussian mixture model. + */ +class GaussianMixtureDataGenerator(val numCenters: Int, val numFeatures: Int, val seed: Long) + extends RandomDataGenerator[Vector] { -/** - * Generate data from a Gaussian mixture model. 
- */ -class GaussianMixtureDataGenerator( - val numCenters: Int, - val numFeatures: Int, - val seed: Long) extends RandomDataGenerator[Vector] { - - private val rng = new java.util.Random(seed) - private val rng2 = new java.util.Random(seed + 24) + private val rng = new java.util.Random(seed) + private val rng2 = new java.util.Random(seed + 24) private val scale_factors = Array.fill(numCenters)(rng.nextInt(20) - 10) // Have a random number of points around a cluster private val concentrations: Seq[Double] = { - val rand = Array.fill(numCenters)(rng.nextDouble()) + val rand = Array.fill(numCenters)(rng.nextDouble()) val randSum = rand.sum - val scaled = rand.map(x => x / randSum) + val scaled = rand.map(x => x / randSum) - (1 to numCenters).map{i => + (1 to numCenters).map { i => scaled.slice(0, i).sum } } - private val centers = (0 until numCenters).map{i => + private val centers = (0 until numCenters).map { i => Array.fill(numFeatures)((2 * rng.nextDouble() - 1) * scale_factors(i)) } override def nextValue(): Vector = { val pick_center_rand = rng2.nextDouble() - val center = centers(concentrations.indexWhere(p => pick_center_rand <= p)) + val center = centers(concentrations.indexWhere(p => pick_center_rand <= p)) Vectors.dense(Array.tabulate(numFeatures)(i => center(i) + rng2.nextGaussian())) } @@ -230,14 +268,12 @@ class GaussianMixtureDataGenerator( new GaussianMixtureDataGenerator(numCenters, numFeatures, seed) } -class RandStringGenerator( - distinctCount: Int) extends RandomDataGenerator[String] { +class RandStringGenerator(distinctCount: Int) extends RandomDataGenerator[String] { private val rng = new java.util.Random() - override def nextValue(): String = { + override def nextValue(): String = rng.nextInt(distinctCount).toString - } override def setSeed(seed: Long) { rng.setSeed(seed) @@ -246,12 +282,10 @@ class RandStringGenerator( override def copy(): RandStringGenerator = new RandStringGenerator(distinctCount) } -class DocGenerator( - vocabSize: Int, - 
avgDocLength: Int, - maxDocLength: Int = 65535) extends RandomDataGenerator[String] { +class DocGenerator(vocabSize: Int, avgDocLength: Int, maxDocLength: Int = 65535) + extends RandomDataGenerator[String] { - private val wordRng = new java.util.Random() + private val wordRng = new java.util.Random() private val docLengthRng = new PoissonGenerator(avgDocLength) override def setSeed(seed: Long) { @@ -261,7 +295,7 @@ class DocGenerator( override def nextValue(): String = { val docLength = DataGenUtil.nextPoisson(docLengthRng, v => v > 0 && v <= maxDocLength).toInt - val sb = new StringBuffer() + val sb = new StringBuffer() var i = 0 while (i < docLength) { @@ -279,9 +313,7 @@ class DocGenerator( object DataGenUtil { def nextPoisson(rng: PoissonGenerator, condition: Double => Boolean): Double = { var value = 0.0 - do { - value = rng.nextValue() - } while (!condition(value)) + do value = rng.nextValue() while (!condition(value)) value } -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Bucketizer.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Bucketizer.scala index 789aba9e..3a509d22 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Bucketizer.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Bucketizer.scala @@ -19,19 +19,33 @@ object Bucketizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTra import ctx.sqlContext.implicits._ val rng = ctx.newGenerator() // For a bucketizer, training data consists of a single column of random doubles - DataGenerator.generateContinuousFeatures(ctx.sqlContext, - numExamples, ctx.seed(), numPartitions, numFeatures = 1).rdd.map { case Row(vec: Vector) => - vec(0) // extract the single generated double value for each row - }.toDF(inputCol) + DataGenerator + .generateContinuousFeatures( + ctx.sqlContext, + numExamples, + ctx.seed(), + numPartitions, + numFeatures = 1 + ) + .rdd + .map { + case Row(vec: 
Vector) => + vec(0) // extract the single generated double value for each row + } + .toDF(inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ val rng = ctx.newGenerator() // Generate an array of (finite) splitting points in [-1, 1) for the Bucketizer - val splitPoints = 0.until(bucketizerNumBuckets - 1).map { _ => - 2 * rng.nextDouble() - 1 - }.sorted.toArray + val splitPoints = 0 + .until(bucketizerNumBuckets - 1) + .map { _ => + 2 * rng.nextDouble() - 1 + } + .sorted + .toArray // Final array of splits contains +/- infinity val splits = Array(Double.NegativeInfinity) ++ splitPoints ++ Array(Double.PositiveInfinity) new ml.feature.Bucketizer() diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/HashingTF.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/HashingTF.scala index 5fb7d76a..536a8740 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/HashingTF.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/HashingTF.scala @@ -15,7 +15,7 @@ object HashingTF extends BenchmarkAlgorithm with TestFromTraining with UnaryTran // Sample a random sentence of length up to maxLen from the provided array of words private def randomSentence(rng: Random, maxLen: Int, dictionary: Array[String]): Array[String] = { - val length = rng.nextInt(maxLen - 1) + 1 + val length = rng.nextInt(maxLen - 1) + 1 val dictLength = dictionary.length Array.tabulate[String](length)(_ => dictionary(rng.nextInt(dictLength))) } @@ -26,9 +26,15 @@ object HashingTF extends BenchmarkAlgorithm with TestFromTraining with UnaryTran // each string is selected from a pool of vocabSize strings // The expected # of occurrences of each word in our vocabulary is // (docLength * numExamples) / vocabSize - val df = DataGenerator.generateDoc(ctx.sqlContext, numExamples = numExamples, seed = ctx.seed(), - numPartitions = numPartitions, vocabSize = vocabSize, avgDocLength = docLength, - 
dataColName = inputCol) + val df = DataGenerator.generateDoc( + ctx.sqlContext, + numExamples = numExamples, + seed = ctx.seed(), + numPartitions = numPartitions, + vocabSize = vocabSize, + avgDocLength = docLength, + dataColName = inputCol + ) df.withColumn(inputCol, split(df(inputCol), " ")) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/OneHotEncoder.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/OneHotEncoder.scala index 9ad4ceba..aacd8882 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/OneHotEncoder.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/OneHotEncoder.scala @@ -16,19 +16,23 @@ object OneHotEncoder extends BenchmarkAlgorithm with TestFromTraining with Unary import ctx.params._ import ctx.sqlContext.implicits._ - DataGenerator.generateMixedFeatures( - ctx.sqlContext, - numExamples, - ctx.seed(), - numPartitions, - Array.fill(1)(featureArity.get) - ).rdd.map { case Row(vec: Vector) => - vec(0) // extract the single generated double value for each row - }.toDF(inputCol) + DataGenerator + .generateMixedFeatures( + ctx.sqlContext, + numExamples, + ctx.seed(), + numPartitions, + Array.fill(1)(featureArity.get) + ) + .rdd + .map { + case Row(vec: Vector) => + vec(0) // extract the single generated double value for each row + } + .toDF(inputCol) } - override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { + override def getPipelineStage(ctx: MLBenchContext): PipelineStage = new ml.feature.OneHotEncoder() .setInputCol(inputCol) - } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala index cf32b0f9..00d08256 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/QuantileDiscretizer.scala @@ -16,15 +16,20 @@ object 
QuantileDiscretizer extends BenchmarkAlgorithm with TestFromTraining with import ctx.params._ import ctx.sqlContext.implicits._ - DataGenerator.generateContinuousFeatures( - ctx.sqlContext, - numExamples, - ctx.seed(), - numPartitions, - 1 - ).rdd.map { case Row(vec: Vector) => - vec(0) // extract the single generated double value for each row - }.toDF(inputCol) + DataGenerator + .generateContinuousFeatures( + ctx.sqlContext, + numExamples, + ctx.seed(), + numPartitions, + 1 + ) + .rdd + .map { + case Row(vec: Vector) => + vec(0) // extract the single generated double value for each row + } + .toDF(inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/StringIndexer.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/StringIndexer.scala index 852cefa4..ca42f773 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/StringIndexer.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/StringIndexer.scala @@ -16,12 +16,14 @@ object StringIndexer extends BenchmarkAlgorithm with TestFromTraining with Unary import ctx.params._ import ctx.sqlContext.implicits._ - DataGenerator.generateRandString(ctx.sqlContext, + DataGenerator.generateRandString( + ctx.sqlContext, numExamples, ctx.seed(), numPartitions, vocabSize, - inputCol) + inputCol + ) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Tokenizer.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Tokenizer.scala index aa066661..b3e863e9 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Tokenizer.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Tokenizer.scala @@ -23,11 +23,11 @@ object Tokenizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTran numPartitions, vocabSize, docLength, - inputCol) + inputCol + ) } 
- override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { + override def getPipelineStage(ctx: MLBenchContext): PipelineStage = new ml.feature.Tokenizer() .setInputCol(inputCol) - } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/UnaryTransformer.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/UnaryTransformer.scala index bd7b3cc3..23c0afd1 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/UnaryTransformer.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/UnaryTransformer.scala @@ -2,6 +2,6 @@ package com.databricks.spark.sql.perf.mllib.feature /** Trait defining common state/methods for featurizers taking a single input col */ private[feature] trait UnaryTransformer { - private[feature] val inputCol = "inputCol" + private[feature] val inputCol = "inputCol" private[feature] val outputCol = "outputCol" } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/VectorAssembler.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/VectorAssembler.scala index 66897d97..dc16ee47 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/VectorAssembler.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/VectorAssembler.scala @@ -13,29 +13,31 @@ import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, /** Object for testing VectorAssembler performance */ object VectorAssembler extends BenchmarkAlgorithm with TestFromTraining { - private def getInputCols(numInputCols: Int): Array[String] = { - Array.tabulate(numInputCols)(i => s"c${i}") - } + private def getInputCols(numInputCols: Int): Array[String] = + Array.tabulate(numInputCols)(i => s"c$i") override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ - require(numInputCols.get <= numFeatures.get, - s"numInputCols (${numInputCols}) cannot be greater than numFeatures (${numFeatures}).") + require( + 
numInputCols.get <= numFeatures.get, + s"numInputCols ($numInputCols) cannot be greater than numFeatures ($numFeatures)." + ) val df = DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, - numFeatures) + numFeatures + ) val slice = udf { (v: Vector, numSlices: Int) => val data = v.toArray - val n = data.length.toLong + val n = data.length.toLong (0 until numSlices).map { i => val start = ((i * n) / numSlices).toInt - val end = ((i + 1) * n / numSlices).toInt + val end = ((i + 1) * n / numSlices).toInt Vectors.dense(data.slice(start, end)) } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala index a59d29e5..199baba4 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/feature/Word2Vec.scala @@ -31,22 +31,20 @@ object Word2Vec extends BenchmarkAlgorithm with TestFromTraining { df.select(split(col("text"), " ").as("text")) } - override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { + override def getPipelineStage(ctx: MLBenchContext): PipelineStage = new ml.feature.Word2Vec().setInputCol("text") - } override def testAdditionalMethods( ctx: MLBenchContext, - model: Transformer): Map[String, () => _] = { + model: Transformer + ): Map[String, () => _] = { import ctx.params._ - val rng = new Random(ctx.seed()) + val rng = new Random(ctx.seed()) val word2vecModel = model.asInstanceOf[Word2VecModel] - val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian())) + val testWord = Vectors.dense(Array.fill(word2vecModel.getVectorSize)(rng.nextGaussian())) - Map("findSynonyms" -> (() => { - word2vecModel.findSynonyms(testWord, numSynonymsToFind) - })) + Map("findSynonyms" -> (() => word2vecModel.findSynonyms(testWord, numSynonymsToFind))) } } diff --git 
a/src/main/scala/com/databricks/spark/sql/perf/mllib/fpm/FPGrowth.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/fpm/FPGrowth.scala index 691bf5bd..ec519e18 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/fpm/FPGrowth.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/fpm/FPGrowth.scala @@ -9,7 +9,6 @@ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator - /** Object containing methods used in performance tests for FPGrowth */ object FPGrowth extends BenchmarkAlgorithm with TestFromTraining { @@ -22,21 +21,20 @@ object FPGrowth extends BenchmarkAlgorithm with TestFromTraining { ctx.seed(), numPartitions, numItems, - itemSetSize) + itemSetSize + ) } - override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { + override def getPipelineStage(ctx: MLBenchContext): PipelineStage = new ml.fpm.FPGrowth() .setItemsCol("items") - } override def testAdditionalMethods( ctx: MLBenchContext, - model: Transformer): Map[String, () => _] = { + model: Transformer + ): Map[String, () => _] = { val fpModel = model.asInstanceOf[FPGrowthModel] - Map("associationRules" -> (() => { - fpModel.associationRules.count() - })) + Map("associationRules" -> (() => fpModel.associationRules.count())) } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/recommendation/ALS.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/recommendation/ALS.scala index 9c21947b..609fc515 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/recommendation/ALS.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/recommendation/ALS.scala @@ -7,34 +7,44 @@ import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator -import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, ScoringWithEvaluator} 
+import com.databricks.spark.sql.perf.mllib.{ + BenchmarkAlgorithm, + MLBenchContext, + ScoringWithEvaluator +} object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ - DataGenerator.generateRatings( - ctx.sqlContext, - numUsers, - numItems, - numExamples, - numTestExamples, - implicitPrefs = false, - numPartitions, - ctx.seed())._1 + DataGenerator + .generateRatings( + ctx.sqlContext, + numUsers, + numItems, + numExamples, + numTestExamples, + implicitPrefs = false, + numPartitions, + ctx.seed() + ) + ._1 } override def testDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ - DataGenerator.generateRatings( - ctx.sqlContext, - numUsers, - numItems, - numExamples, - numTestExamples, - implicitPrefs = false, - numPartitions, - ctx.seed())._2 + DataGenerator + .generateRatings( + ctx.sqlContext, + numUsers, + numItems, + numExamples, + numTestExamples, + implicitPrefs = false, + numPartitions, + ctx.seed() + ) + ._2 } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { @@ -47,7 +57,6 @@ object ALS extends BenchmarkAlgorithm with ScoringWithEvaluator { .setMaxIter(maxIter) } - override protected def evaluator(ctx: MLBenchContext): Evaluator = { + override protected def evaluator(ctx: MLBenchContext): Evaluator = new RegressionEvaluator().setLabelCol("rating") - } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/DecisionTreeRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/DecisionTreeRegression.scala index 126ffe4d..dbae2fde 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/DecisionTreeRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/DecisionTreeRegression.scala @@ -6,7 +6,6 @@ import org.apache.spark.ml.regression.DecisionTreeRegressor import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import 
com.databricks.spark.sql.perf.mllib._ - object DecisionTreeRegression extends BenchmarkAlgorithm with TreeOrForestRegressor { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GBTRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GBTRegression.scala index e78d2eb6..dbc9cc12 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GBTRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GBTRegression.scala @@ -4,8 +4,11 @@ import org.apache.spark.ml.PipelineStage import org.apache.spark.ml.regression.GBTRegressor import com.databricks.spark.sql.perf.mllib.OptionImplicits._ -import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, - TreeOrForestRegressor} +import com.databricks.spark.sql.perf.mllib.{ + BenchmarkAlgorithm, + MLBenchContext, + TreeOrForestRegressor +} object GBTRegression extends BenchmarkAlgorithm with TreeOrForestRegressor { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { @@ -15,4 +18,4 @@ object GBTRegression extends BenchmarkAlgorithm with TreeOrForestRegressor { .setMaxIter(maxIter) .setSeed(ctx.seed()) } -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala index c2761a0b..16fca1b2 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/GLMRegression.scala @@ -9,9 +9,11 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator - -object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with - TrainingSetFromTransformer with ScoringWithEvaluator { 
+object GLMRegression + extends BenchmarkAlgorithm + with TestFromTraining + with TrainingSetFromTransformer + with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ @@ -20,7 +22,8 @@ object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with numExamples, ctx.seed(), numPartitions, - numFeatures) + numFeatures + ) } override protected def trueModel(ctx: MLBenchContext): Transformer = { @@ -30,7 +33,7 @@ object GLMRegression extends BenchmarkAlgorithm with TestFromTraining with Vectors.dense(Array.fill[Double](ctx.params.numFeatures)(2 * rng.nextDouble() - 1)) // Small intercept to prevent some skew in the data. val intercept = 0.01 * (2 * rng.nextDouble - 1) - val m = ModelBuilderSSP.newGLR(coefficients, intercept) + val m = ModelBuilderSSP.newGLR(coefficients, intercept) m.set(m.link, link.get) m.set(m.family, family.get) m diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala index c2882ce8..29a3b735 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/LinearRegression.scala @@ -9,9 +9,11 @@ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator - -object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with - TrainingSetFromTransformer with ScoringWithEvaluator { +object LinearRegression + extends BenchmarkAlgorithm + with TestFromTraining + with TrainingSetFromTransformer + with ScoringWithEvaluator { override protected def initialData(ctx: MLBenchContext) = { import ctx.params._ @@ -20,7 +22,8 @@ object LinearRegression extends BenchmarkAlgorithm with TestFromTraining with numExamples, ctx.seed(), numPartitions, - 
numFeatures) + numFeatures + ) } override protected def trueModel(ctx: MLBenchContext): Transformer = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/RandomForestRegression.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/RandomForestRegression.scala index c9ed4e8d..33ab1f21 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/RandomForestRegression.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/regression/RandomForestRegression.scala @@ -4,8 +4,11 @@ import org.apache.spark.ml.PipelineStage import org.apache.spark.ml.regression.RandomForestRegressor import com.databricks.spark.sql.perf.mllib.OptionImplicits._ -import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, - TreeOrForestRegressor} +import com.databricks.spark.sql.perf.mllib.{ + BenchmarkAlgorithm, + MLBenchContext, + TreeOrForestRegressor +} object RandomForestRegression extends BenchmarkAlgorithm with TreeOrForestRegressor { override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { diff --git a/src/main/scala/com/databricks/spark/sql/perf/mllib/yaml.scala b/src/main/scala/com/databricks/spark/sql/perf/mllib/yaml.scala index edd54a7c..35c93a49 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/mllib/yaml.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/mllib/yaml.scala @@ -8,105 +8,99 @@ import scala.io.Source import scala.reflect._ import scala.reflect.runtime.universe._ -import scala.util.{Try => STry, Success, Failure} +import scala.util.{Failure, Success, Try => STry} import org.yaml.snakeyaml.Yaml -import com.databricks.spark.sql.perf.{MLParams} +import com.databricks.spark.sql.perf.MLParams - -/** - * The configuration information generated from reading a YAML file. - * - * @param output the output direct - */ +/** The configuration information generated from reading a YAML file. 
+ * + * @param output + * the output direct + */ case class YamlConfig( - output: String = "/tmp/result", - timeout: Duration = 20.minutes, - runnableBenchmarks: Seq[MLTest]) + output: String = "/tmp/result", + timeout: Duration = 20.minutes, + runnableBenchmarks: Seq[MLTest] +) object YamlConfig { - /** - * Reads a string (assumed to contain a yaml description) and returns the configuration. - */ + /** Reads a string (assumed to contain a yaml description) and returns the configuration. + */ def readString(s: String): YamlConfig = { println(s) - val yaml = new Yaml() - val m = dict(yaml.load(s)) + val yaml = new Yaml() + val m = dict(yaml.load(s)) val common = m.get("common").map(dict).getOrElse(Map.empty) println("common") println(m) val exps = m("benchmarks") - .asInstanceOf[AL[Map[String, Any]]].asScala.map(dict).toSeq + .asInstanceOf[AL[Map[String, Any]]] + .asScala + .map(dict) + .toSeq println("exps:") println(exps) val experiments = exps.flatMap { sd => - val name = sd("name").toString - val params = sd.get("params").map(dict).getOrElse(Map.empty) + val name = sd("name").toString + val params = sd.get("params").map(dict).getOrElse(Map.empty) val expParams = cartesian(common ++ params) for (c <- expParams) yield name -> c } println("exp parsed") println(experiments) - val e2 = experiments.map { case (n, e) => - val e2 = ccFromMap.fromMap[MLParams](e, strict=true) - val s = ccFromMap.loadExperiment(n).getOrElse { - throw new Exception(s"Cannot find algorithm $n in the standard benchmark algorithms") - } - MLTest(s, e2) + val e2 = experiments.map { + case (n, e) => + val e2 = ccFromMap.fromMap[MLParams](e, strict = true) + val s = ccFromMap.loadExperiment(n).getOrElse { + throw new Exception(s"Cannot find algorithm $n in the standard benchmark algorithms") + } + MLTest(s, e2) } var c = YamlConfig(runnableBenchmarks = e2) - for (output <- m.get("output")) { + for (output <- m.get("output")) c = c.copy(output = output.toString) - } - for (x <- 
m.get("timeoutSeconds")) { + for (x <- m.get("timeoutSeconds")) c = c.copy(timeout = x.toString.toInt.seconds) - } c } - /** - * Reads a file (assumed to contain a yaml config). - */ - def readFile(filename: String): YamlConfig = { + /** Reads a file (assumed to contain a yaml config). + */ + def readFile(filename: String): YamlConfig = readString(Source.fromFile(filename).mkString) - } // Converts a java dictionary to a scala map. - private def dict[T](d: T): Map[String, Any] = { + private def dict[T](d: T): Map[String, Any] = d.asInstanceOf[java.util.Map[String, Any]].asScala.toMap - } - /** - * Given keys that may be lists, builds the cartesian product of all the values into defined - * options. - * - * For example: {a: [1,2], b: [3,4]} -> {a: 1, b: 3}, {a: 1, b:4}, {a:2, b:3}, ... - * - * @return - */ - private def cartesian(m: Map[String, Any]): Seq[Map[String, Any]] = { + /** Given keys that may be lists, builds the cartesian product of all the values into defined + * options. + * + * For example: {a: [1,2], b: [3,4]} -> {a: 1, b: 3}, {a: 1, b:4}, {a:2, b:3}, ... + * + * @return + */ + private def cartesian(m: Map[String, Any]): Seq[Map[String, Any]] = if (m.isEmpty) { Seq(m) } else { - val k = m.keys.head + val k = m.keys.head val sub = m - k - val l = cartesian(sub) + val l = cartesian(sub) m(k) match { case a: AL[_] => for { - x <- a.asScala.toSeq + x <- a.asScala.toSeq m2 <- l - } yield { - m2 ++ Map(k -> x.asInstanceOf[Any]) - } + } yield m2 ++ Map(k -> x.asInstanceOf[Any]) case _ => val v = m(k) - l.map { m => m ++ Map(k -> v) } + l.map(m => m ++ Map(k -> v)) } } - } } @@ -115,35 +109,42 @@ object ccFromMap { // Builds a case class from a map. 
// (taken from stack overflow) // if strict, will report an error if some unknown arguments are passed to the constructor - def fromMap[T: TypeTag: ClassTag](m: Map[String,_], strict: Boolean) = { + def fromMap[T: TypeTag: ClassTag](m: Map[String, _], strict: Boolean) = { scala.reflect.runtime.universe - val rm = runtimeMirror(classTag[T].runtimeClass.getClassLoader) - val classTest = typeOf[T].typeSymbol.asClass - val classMirror = rm.reflectClass(classTest) - val constructor = typeOf[T].declaration(nme.CONSTRUCTOR).asMethod + val rm = runtimeMirror(classTag[T].runtimeClass.getClassLoader) + val classTest = typeOf[T].typeSymbol.asClass + val classMirror = rm.reflectClass(classTest) + val constructor = typeOf[T].declaration(nme.CONSTRUCTOR).asMethod val constructorMirror = classMirror.reflectConstructor(constructor) val constructorArgNames = constructor.paramss.flatten.map(_.name.toString).toSet - val extraElements = m.keySet -- constructorArgNames + val extraElements = m.keySet -- constructorArgNames if (extraElements.nonEmpty) { - throw new Exception(s"Found extra arguments when instantiating an object of " + - s"class ${classTest.asClass.toString}:" + - s" ${extraElements.toSeq.sorted}") + throw new Exception( + s"Found extra arguments when instantiating an object of " + + s"class ${classTest.asClass.toString}:" + + s" ${extraElements.toSeq.sorted}" + ) } - val constructorArgs = constructor.paramss.flatten.map( (param: Symbol) => { + val constructorArgs = constructor.paramss.flatten.map { (param: Symbol) => val paramName = param.name.toString - if(param.typeSignature <:< typeOf[Option[Long]]) + if (param.typeSignature <:< typeOf[Option[Long]]) OptionImplicits.checkLong(m.get(paramName).asInstanceOf[Option[Long]]) - else if(param.typeSignature <:< typeOf[Option[Double]]) + else if (param.typeSignature <:< typeOf[Option[Double]]) OptionImplicits.checkDouble(m.get(paramName).asInstanceOf[Option[Double]]) - else if(param.typeSignature <:< typeOf[Option[Any]]) + else 
if (param.typeSignature <:< typeOf[Option[Any]]) m.get(paramName) else - m.get(paramName).getOrElse(throw new IllegalArgumentException("Map is missing required parameter named " + paramName)) - }) + m.get(paramName) + .getOrElse( + throw new IllegalArgumentException( + "Map is missing required parameter named " + paramName + ) + ) + } - val res = constructorMirror(constructorArgs:_*).asInstanceOf[T] + val res = constructorMirror(constructorArgs: _*).asInstanceOf[T] res } @@ -152,7 +153,7 @@ object ccFromMap { val rm = runtimeMirror(getClass.getClassLoader) try { val module = rm.staticModule("com.databricks.spark.sql.perf.mllib." + name) - val obj = rm.reflectModule(module) + val obj = rm.reflectModule(module) Success(obj.instance.asInstanceOf[BenchmarkAlgorithm]) } catch { case x: scala.reflect.internal.MissingRequirementError => @@ -167,10 +168,10 @@ object ccFromMap { def loadExperiment( name: String, - searchPackages: Seq[String] = defaultPackages): Option[BenchmarkAlgorithm] = { + searchPackages: Seq[String] = defaultPackages + ): Option[BenchmarkAlgorithm] = searchPackages.view.flatMap { p => val n = if (p.isEmpty) name else s"$p.$name" load(n).toOption - } .headOption - } + }.headOption } diff --git a/src/main/scala/com/databricks/spark/sql/perf/package.scala b/src/main/scala/com/databricks/spark/sql/perf/package.scala index 080d0243..563b32d7 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/package.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/package.scala @@ -4,5 +4,7 @@ import org.apache.spark.sql.functions._ package object perf { val runtime = - (col("result.analysisTime") + col("result.optimizationTime") + col("result.planningTime") + col("result.executionTime")).as("runtime") -} \ No newline at end of file + (col("result.analysisTime") + col("result.optimizationTime") + col("result.planningTime") + col( + "result.executionTime" + )).as("runtime") +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/results.scala 
b/src/main/scala/com/databricks/spark/sql/perf/results.scala index 28d72263..9fcd7c91 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/results.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/results.scala @@ -18,62 +18,84 @@ package com.databricks.spark.sql.perf import com.databricks.spark.sql.perf.mllib.ReflectionUtils -/** - * The performance results of all given queries for a single iteration. - * - * @param timestamp The timestamp indicates when the entire experiment is started. - * @param iteration The index number of the current iteration. - * @param tags Tags of this iteration (variations are stored at here). - * @param configuration Configuration properties of this iteration. - * @param results The performance results of queries for this iteration. - */ +/** The performance results of all given queries for a single iteration. + * + * @param timestamp + * The timestamp indicates when the entire experiment is started. + * @param iteration + * The index number of the current iteration. + * @param tags + * Tags of this iteration (variations are stored at here). + * @param configuration + * Configuration properties of this iteration. + * @param results + * The performance results of queries for this iteration. + */ case class ExperimentRun( timestamp: Long, iteration: Int, tags: Map[String, String], configuration: BenchmarkConfiguration, - results: Seq[BenchmarkResult]) + results: Seq[BenchmarkResult] +) -/** - * The configuration used for an iteration of an experiment. - * - * @param sparkVersion The version of Spark. - * @param sqlConf All configuration properties related to Spark SQL. - * @param sparkConf All configuration properties of Spark. - * @param defaultParallelism The default parallelism of the cluster. - * Usually, it is the number of cores of the cluster. - */ +/** The configuration used for an iteration of an experiment. + * + * @param sparkVersion + * The version of Spark. 
+ * @param sqlConf + * All configuration properties related to Spark SQL. + * @param sparkConf + * All configuration properties of Spark. + * @param defaultParallelism + * The default parallelism of the cluster. Usually, it is the number of cores of the cluster. + */ case class BenchmarkConfiguration( sparkVersion: String = org.apache.spark.SPARK_VERSION, sqlConf: Map[String, String], sparkConf: Map[String, String], defaultParallelism: Int, - buildInfo: Map[String, String]) + buildInfo: Map[String, String] +) -/** - * The result of a query. - * - * @param name The name of the query. - * @param mode The ExecutionMode of this run. - * @param parameters Additional parameters that describe this query. - * @param joinTypes The type of join operations in the query. - * @param tables The tables involved in the query. - * @param parsingTime The time used to parse the query. - * @param analysisTime The time used to analyze the query. - * @param optimizationTime The time used to optimize the query. - * @param planningTime The time used to plan the query. - * @param executionTime The time used to execute the query. - * @param result the result of this run. It is not necessarily the result of the query. - * For example, it can be the number of rows generated by this query or - * the sum of hash values of rows generated by this query. - * @param breakDown The breakdown results of the query plan tree. - * @param queryExecution The query execution plan. - * @param failure The failure message. - * @param mlResult The result metrics specific to MLlib. - * @param benchmarkId An optional ID to identify a series of benchmark runs. - * In ML, this is generated based on the benchmark name and - * the hash value of params. - */ +/** The result of a query. + * + * @param name + * The name of the query. + * @param mode + * The ExecutionMode of this run. + * @param parameters + * Additional parameters that describe this query. 
+ * @param joinTypes + * The type of join operations in the query. + * @param tables + * The tables involved in the query. + * @param parsingTime + * The time used to parse the query. + * @param analysisTime + * The time used to analyze the query. + * @param optimizationTime + * The time used to optimize the query. + * @param planningTime + * The time used to plan the query. + * @param executionTime + * The time used to execute the query. + * @param result + * the result of this run. It is not necessarily the result of the query. For example, it can be + * the number of rows generated by this query or the sum of hash values of rows generated by this + * query. + * @param breakDown + * The breakdown results of the query plan tree. + * @param queryExecution + * The query execution plan. + * @param failure + * The failure message. + * @param mlResult + * The result metrics specific to MLlib. + * @param benchmarkId + * An optional ID to identify a series of benchmark runs. In ML, this is generated based on the + * benchmark name and the hash value of params. + */ case class BenchmarkResult( name: String, mode: String, @@ -90,34 +112,37 @@ case class BenchmarkResult( queryExecution: Option[String] = None, failure: Option[Failure] = None, mlResult: Option[Array[MLMetric]] = None, - benchmarkId: Option[String] = None) + benchmarkId: Option[String] = None +) -/** - * The execution time of a subtree of the query plan tree of a specific query. - * - * @param nodeName The name of the top physical operator of the subtree. - * @param nodeNameWithArgs The name and arguments of the top physical operator of the subtree. - * @param index The index of the top physical operator of the subtree - * in the original query plan tree. The index starts from 0 - * (0 represents the top physical operator of the original query plan tree). - * @param executionTime The execution time of the subtree. - */ +/** The execution time of a subtree of the query plan tree of a specific query. 
+ * + * @param nodeName + * The name of the top physical operator of the subtree. + * @param nodeNameWithArgs + * The name and arguments of the top physical operator of the subtree. + * @param index + * The index of the top physical operator of the subtree in the original query plan tree. The + * index starts from 0 (0 represents the top physical operator of the original query plan tree). + * @param executionTime + * The execution time of the subtree. + */ case class BreakdownResult( nodeName: String, nodeNameWithArgs: String, index: Int, children: Seq[Int], executionTime: Double, - delta: Double) + delta: Double +) case class Failure(className: String, message: String) -/** - * Class wrapping parameters for ML tests. - * - * KEEP CONSTRUCTOR ARGUMENTS SORTED BY NAME. - * It simplifies lookup when checking if a parameter is here already. - */ +/** Class wrapping parameters for ML tests. + * + * KEEP CONSTRUCTOR ARGUMENTS SORTED BY NAME. It simplifies lookup when checking if a parameter is + * here already. + */ class MLParams( // *** Common to all algorithms *** val randomSeed: Option[Int] = Some(42), @@ -148,12 +173,12 @@ class MLParams( val rank: Option[Int] = None, val smoothing: Option[Double] = None, val tol: Option[Double] = None, - val vocabSize: Option[Int] = None) { + val vocabSize: Option[Int] = None +) { - /** - * Returns a map of param names to string representations of their values. Only params that - * were defined (i.e., not equal to None) are included in the map. - */ + /** Returns a map of param names to string representations of their values. Only params that were + * defined (i.e., not equal to None) are included in the map. 
+ */ def toMap: Map[String, String] = { // Only outputs params that have values val allParams = ReflectionUtils.getConstructorArgs(this) @@ -196,7 +221,8 @@ class MLParams( rank: Option[Int] = rank, smoothing: Option[Double] = smoothing, tol: Option[Double] = tol, - vocabSize: Option[Int] = vocabSize): MLParams = { + vocabSize: Option[Int] = vocabSize + ): MLParams = new MLParams( randomSeed = randomSeed, numExamples = numExamples, @@ -225,26 +251,25 @@ class MLParams( rank = rank, smoothing = smoothing, tol = tol, - vocabSize = vocabSize) - } + vocabSize = vocabSize + ) } object MLParams { val empty = new MLParams() } -/** - * Metrics specific to MLlib benchmark. - * - * @param metricName the name of the metric - * @param metricValue the value of the metric - * @param isLargerBetter the indicator showing whether larger metric value is better - */ -case class MLMetric( - metricName: String, - metricValue: Double, - isLargerBetter: Boolean) +/** Metrics specific to MLlib benchmark. + * + * @param metricName + * the name of the metric + * @param metricValue + * the value of the metric + * @param isLargerBetter + * the indicator showing whether larger metric value is better + */ +case class MLMetric(metricName: String, metricValue: Double, isLargerBetter: Boolean) object MLMetric { val Invalid = MLMetric("Invalid", 0.0, false) -} \ No newline at end of file +} diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala index d3414844..332158b8 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/GenTPCDSData.scala @@ -31,23 +31,22 @@ case class GenTPCDSDataConfig( clusterByPartitionColumns: Boolean = true, filterOutNullPartitionValues: Boolean = true, tableFilter: String = "", - numPartitions: Int = 100) + numPartitions: Int = 100 +) -/** - * Gen TPCDS data. 
- * To run this: - * {{{ - * build/sbt "test:runMain -d -s -l -f " - * }}} - */ +/** Gen TPCDS data. To run this: + * {{{ + * build/sbt "test:runMain -d -s -l -f " + * }}} + */ object GenTPCDSData { def main(args: Array[String]): Unit = { val parser = new scopt.OptionParser[GenTPCDSDataConfig]("Gen-TPC-DS-data") { opt[String]('m', "master") - .action { (x, c) => c.copy(master = x) } + .action((x, c) => c.copy(master = x)) .text("the Spark master to use, default to local[*]") opt[String]('d', "dsdgenDir") - .action { (x, c) => c.copy(dsdgenDir = x) } + .action((x, c) => c.copy(dsdgenDir = x)) .text("location of dsdgen") .required() opt[String]('s', "scaleFactor") @@ -58,7 +57,7 @@ object GenTPCDSData { .text("root directory of location to create data in") opt[String]('f', "format") .action((x, c) => c.copy(format = x)) - .text("valid spark format, Parquet, ORC ...") + .text("valid spark format, Parquet, ORC, Delta, Iceberg ...") opt[Boolean]('i', "useDoubleForDecimal") .action((x, c) => c.copy(useDoubleForDecimal = x)) .text("true to replace DecimalType with DoubleType") @@ -102,11 +101,13 @@ object GenTPCDSData { .master(config.master) .getOrCreate() - val tables = new TPCDSTables(spark.sqlContext, + val tables = new TPCDSTables( + spark.sqlContext, dsdgenDir = config.dsdgenDir, scaleFactor = config.scaleFactor, useDoubleForDecimal = config.useDoubleForDecimal, - useStringForDate = config.useStringForDate) + useStringForDate = config.useStringForDate + ) tables.genData( location = config.location, @@ -116,6 +117,7 @@ object GenTPCDSData { clusterByPartitionColumns = config.clusterByPartitionColumns, filterOutNullPartitionValues = config.filterOutNullPartitionValues, tableFilter = config.tableFilter, - numPartitions = config.numPartitions) + numPartitions = config.numPartitions + ) } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/ImpalaKitQueries.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/ImpalaKitQueries.scala index 
5ef20344..d431d1a4 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/ImpalaKitQueries.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/ImpalaKitQueries.scala @@ -16,7 +16,7 @@ package com.databricks.spark.sql.perf.tpcds -import com.databricks.spark.sql.perf.{ExecutionMode, Benchmark} +import com.databricks.spark.sql.perf.{Benchmark, ExecutionMode} trait ImpalaKitQueries extends Benchmark { @@ -25,7 +25,9 @@ trait ImpalaKitQueries extends Benchmark { // Queries are from // https://github.com/cloudera/impala-tpcds-kit/tree/master/queries-sql92-modified/queries val queries = Seq( - ("q19", """ + ( + "q19", + """ |-- start query 1 in stream 0 using template query19.tpl |select | i_brand_id, @@ -60,9 +62,11 @@ trait ImpalaKitQueries extends Benchmark { | i_manufact |limit 100 |-- end query 1 in stream 0 using template query19.tpl - """.stripMargin), - - ("q27", """ + """.stripMargin + ), + ( + "q27", + """ |-- start query 1 in stream 0 using template query27.tpl |select | i_item_id, @@ -95,9 +99,11 @@ trait ImpalaKitQueries extends Benchmark { | s_state |limit 100 |-- end query 1 in stream 0 using template query27.tpl - """.stripMargin), - - ("q3", """ + """.stripMargin + ), + ( + "q3", + """ |-- start query 1 in stream 0 using template query3.tpl |select | dt.d_year, @@ -139,9 +145,11 @@ trait ImpalaKitQueries extends Benchmark { | brand_id |-- end query 1 in stream 0 using template query3.tpl |limit 100 - """.stripMargin), - - ("q34", """ + """.stripMargin + ), + ( + "q34", + """ |-- start query 1 in stream 0 using template query34.tpl |select | c_last_name, @@ -186,9 +194,11 @@ trait ImpalaKitQueries extends Benchmark { | cnt |limit 1000 |-- end query 1 in stream 0 using template query34.tpl - """.stripMargin), - - ("q42", """ + """.stripMargin + ), + ( + "q42", + """ |-- start query 1 in stream 0 using template query42.tpl |select | d_year, @@ -217,9 +227,11 @@ trait ImpalaKitQueries extends Benchmark { | i_category |limit 100 |-- end 
query 1 in stream 0 using template query42.tpl - """.stripMargin), - - ("q43", """ + """.stripMargin + ), + ( + "q43", + """ |-- start query 1 in stream 0 using template query43.tpl |select | s_store_name, @@ -255,9 +267,11 @@ trait ImpalaKitQueries extends Benchmark { | sat_sales |limit 100 |-- end query 1 in stream 0 using template query43.tpl - """.stripMargin), - - ("q46", """ + """.stripMargin + ), + ( + "q46", + """ |-- start query 1 in stream 0 using template query46.tpl |select | c_last_name, @@ -333,9 +347,11 @@ trait ImpalaKitQueries extends Benchmark { | ss_ticket_number |limit 100 |-- end query 1 in stream 0 using template query46.tpl - """.stripMargin), - - ("q52", """ + """.stripMargin + ), + ( + "q52", + """ |-- start query 1 in stream 0 using template query52.tpl |select | d_year, @@ -362,9 +378,11 @@ trait ImpalaKitQueries extends Benchmark { | i_brand_id |limit 100 |-- end query 1 in stream 0 using template query52.tpl - """.stripMargin), - - ("q53", """ + """.stripMargin + ), + ( + "q53", + """ |-- start query 1 in stream 0 using template query53.tpl |select | * @@ -405,9 +423,11 @@ trait ImpalaKitQueries extends Benchmark { | i_manufact_id |limit 100 |-- end query 1 in stream 0 using template query53.tpl - """.stripMargin), - - ("q55", """ + """.stripMargin + ), + ( + "q55", + """ |-- start query 1 in stream 0 using template query55.tpl |select | i_brand_id, @@ -431,9 +451,11 @@ trait ImpalaKitQueries extends Benchmark { | i_brand_id |limit 100 |-- end query 1 in stream 0 using template query55.tpl - """.stripMargin), - - ("q59", """ + """.stripMargin + ), + ( + "q59", + """ |-- start query 1 in stream 0 using template query59.tpl |select | s_store_name1, @@ -531,9 +553,11 @@ trait ImpalaKitQueries extends Benchmark { | d_week_seq1 |limit 100 |-- end query 1 in stream 0 using template query59.tpl - """.stripMargin), - - ("q63", """ + """.stripMargin + ), + ( + "q63", + """ |-- start query 1 in stream 0 using template query63.tpl |select | * @@ 
-574,9 +598,11 @@ trait ImpalaKitQueries extends Benchmark { | sum_sales |limit 100 |-- end query 1 in stream 0 using template query63.tpl - """.stripMargin), - - ("q65", """ + """.stripMargin + ), + ( + "q65", + """ |--q65 |-- start query 1 in stream 0 using template query65.tpl |select @@ -634,9 +660,11 @@ trait ImpalaKitQueries extends Benchmark { | i_item_desc |limit 100 |-- end query 1 in stream 0 using template query65.tpl - """.stripMargin), - - ("q68", """ + """.stripMargin + ), + ( + "q68", + """ |-- start query 1 in stream 0 using template query68.tpl |select | c_last_name, @@ -693,9 +721,11 @@ trait ImpalaKitQueries extends Benchmark { | ss_ticket_number |limit 100 |-- end query 1 in stream 0 using template query68.tpl - """.stripMargin), - - ("q7", """ + """.stripMargin + ), + ( + "q7", + """ |-- start query 1 in stream 0 using template query7.tpl |select | i_item_id, @@ -724,9 +754,11 @@ trait ImpalaKitQueries extends Benchmark { | i_item_id |limit 100 |-- end query 1 in stream 0 using template query7.tpl - """.stripMargin), - - ("q73", """ + """.stripMargin + ), + ( + "q73", + """ |-- start query 1 in stream 0 using template query73.tpl |select | c_last_name, @@ -775,9 +807,11 @@ trait ImpalaKitQueries extends Benchmark { | cnt desc |limit 1000 |-- end query 1 in stream 0 using template query73.tpl - """.stripMargin), - - ("q79", """ + """.stripMargin + ), + ( + "q79", + """ |-- start query 1 in stream 0 using template query79.tpl |select | c_last_name, @@ -823,9 +857,11 @@ trait ImpalaKitQueries extends Benchmark { | profit |limit 100 |-- end query 1 in stream 0 using template query79.tpl - """.stripMargin), - - ("q8", """ + """.stripMargin + ), + ( + "q8", + """ |-- start query 8 in stream 0 using template query8.tpl |select s_store_name | ,sum(ss_net_profit) @@ -885,9 +921,11 @@ trait ImpalaKitQueries extends Benchmark { | order by s_store_name |limit 100 |-- end query 8 in stream 0 using template query8.tpl - """.stripMargin), - - ("q82", """ + 
""".stripMargin + ), + ( + "q82", + """ |-- start query 1 in stream 0 using template query82.tpl |select | i_item_id, @@ -912,9 +950,11 @@ trait ImpalaKitQueries extends Benchmark { | i_item_id |limit 100 |-- end query 1 in stream 0 using template query82.tpl - """.stripMargin), - - ("q89", """ + """.stripMargin + ), + ( + "q89", + """ |-- start query 1 in stream 0 using template query89.tpl |select | * @@ -958,9 +998,11 @@ trait ImpalaKitQueries extends Benchmark { | s_store_name |limit 100 |-- end query 1 in stream 0 using template query89.tpl - """.stripMargin), - - ("q98", """ + """.stripMargin + ), + ( + "q98", + """ |-- start query 1 in stream 0 using template query98.tpl |select | i_item_desc, @@ -995,9 +1037,11 @@ trait ImpalaKitQueries extends Benchmark { | -- revenueratio |limit 1000 |-- end query 1 in stream 0 using template query98.tpl - """.stripMargin), - - ("ss_max", """ + """.stripMargin + ), + ( + "ss_max", + """ |select | count(*) as total, | count(ss_sold_date_sk) as not_null_total, @@ -1012,14 +1056,17 @@ trait ImpalaKitQueries extends Benchmark { | max(ss_store_sk) as max_ss_store_sk, | max(ss_promo_sk) as max_ss_promo_sk |from store_sales - """.stripMargin) + """.stripMargin + ) ).map { case (name, sqlText) => Query(name, sqlText, description = "", executionMode = CollectResults) } val queriesMap = queries.map(q => q.name -> q).toMap val originalQueries = Seq( - ("q3", """ + ( + "q3", + """ select d_year ,item.i_brand_id brand_id ,item.i_brand brand @@ -1036,9 +1083,11 @@ trait ImpalaKitQueries extends Benchmark { order by d_year ,sum_agg desc ,brand_id - limit 100"""), - - ("q7", """ + limit 100""" + ), + ( + "q7", + """ select i_item_id, avg(ss_quantity) agg1, avg(ss_list_price) agg2, @@ -1057,9 +1106,11 @@ trait ImpalaKitQueries extends Benchmark { d_year = 1998 group by i_item_id order by i_item_id - limit 100"""), - - ("q19", """ + limit 100""" + ), + ( + "q19", + """ select i_brand_id, i_brand, i_manufact_id, i_manufact, 
sum(ss_ext_sales_price) as ext_price from date_dim @@ -1082,9 +1133,11 @@ trait ImpalaKitQueries extends Benchmark { ,i_brand_id ,i_manufact_id ,i_manufact - limit 100"""), - - ("q27", """ + limit 100""" + ), + ( + "q27", + """ select i_item_id, s_state, avg(ss_quantity) agg1, @@ -1105,9 +1158,11 @@ trait ImpalaKitQueries extends Benchmark { group by i_item_id, s_state order by i_item_id ,s_state - limit 100"""), - - ("q34", """ + limit 100""" + ), + ( + "q34", + """ select c_last_name ,c_first_name ,c_salutation @@ -1143,9 +1198,11 @@ trait ImpalaKitQueries extends Benchmark { c_salutation, c_preferred_cust_flag desc, ss_ticket_number, - cnt"""), - - ("q42", """ + cnt""" + ), + ( + "q42", + """ select d_year ,item.i_category_id ,item.i_category @@ -1163,9 +1220,11 @@ trait ImpalaKitQueries extends Benchmark { order by s desc,d_year ,i_category_id ,i_category - limit 100"""), - - ("q43", """ + limit 100""" + ), + ( + "q43", + """ select s_store_name, s_store_id, sum(case when (d_day_name='Sunday') then ss_sales_price else null end) sun_sales, sum(case when (d_day_name='Monday') then ss_sales_price else null end) mon_sales, @@ -1182,9 +1241,11 @@ trait ImpalaKitQueries extends Benchmark { d_year = 1998 group by s_store_name, s_store_id order by s_store_name, s_store_id,sun_sales,mon_sales,tue_sales,wed_sales,thu_sales,fri_sales,sat_sales - limit 100"""), - - ("q46", """ + limit 100""" + ), + ( + "q46", + """ select c_last_name ,c_first_name ,ca_city @@ -1218,9 +1279,11 @@ trait ImpalaKitQueries extends Benchmark { ,ca_city ,bought_city ,ss_ticket_number - limit 100"""), - - ("q52", """ + limit 100""" + ), + ( + "q52", + """ select d_year ,item.i_brand_id brand_id ,item.i_brand brand @@ -1238,9 +1301,11 @@ trait ImpalaKitQueries extends Benchmark { order by d_year ,ext_price desc ,brand_id - limit 100"""), - - ("q55", """ + limit 100""" + ), + ( + "q55", + """ select i_brand_id as brand_id, i_brand as brand, sum(store_sales.ss_ext_sales_price) ext_price from date_dim 
@@ -1252,9 +1317,10 @@ trait ImpalaKitQueries extends Benchmark { and d_year=2001 group by i_brand, i_brand_id order by ext_price desc, brand_id - limit 100 """), - - ("q59", + limit 100 """ + ), + ( + "q59", """ |select | s_store_name1, @@ -1355,9 +1421,11 @@ trait ImpalaKitQueries extends Benchmark { | s_store_id1, | d_week_seq1 |limit 100 - """.stripMargin), - - ("q68", """ + """.stripMargin + ), + ( + "q68", + """ select c_last_name ,c_first_name ,ca_city ,bought_city ,ss_ticket_number ,extended_price ,extended_tax ,list_price @@ -1387,9 +1455,11 @@ trait ImpalaKitQueries extends Benchmark { customer_address.ca_city <> dn.bought_city order by c_last_name ,ss_ticket_number - limit 100"""), - - ("q73", """ + limit 100""" + ), + ( + "q73", + """ select c_last_name ,c_first_name ,c_salutation @@ -1416,9 +1486,11 @@ trait ImpalaKitQueries extends Benchmark { JOIN customer ON dj.ss_customer_sk = customer.c_customer_sk where cnt between 5 and 10 - order by cnt desc"""), - - ("q79", """ + order by cnt desc""" + ), + ( + "q79", + """ select c_last_name,c_first_name,substr(s_city,1,30) as s_city,ss_ticket_number,amt,profit from @@ -1439,9 +1511,10 @@ trait ImpalaKitQueries extends Benchmark { group by ss_ticket_number,ss_customer_sk,ss_addr_sk,store.s_city) ms JOIN customer on ms.ss_customer_sk = customer.c_customer_sk order by c_last_name,c_first_name,s_city, profit - limit 100"""), - - ("qSsMax", + limit 100""" + ), + ( + "qSsMax", """ |select | count(*) as total, @@ -1457,14 +1530,16 @@ trait ImpalaKitQueries extends Benchmark { | max(ss_store_sk) as max_ss_store_sk, | max(ss_promo_sk) as max_ss_promo_sk |from store_sales - """.stripMargin) - ).map { case (name, sqlText) => - Query(name, sqlText, description = "original query", executionMode = CollectResults) + """.stripMargin + ) + ).map { + case (name, sqlText) => + Query(name, sqlText, description = "original query", executionMode = CollectResults) } val interactiveQueries = Seq("q19", "q42", "q52", "q55", "q63", 
"q68", "q73", "q98").map(queriesMap) - val reportingQueries = Seq("q3","q7", "q27","q43", "q53", "q89").map(queriesMap) - val deepAnalyticQueries = Seq("q34", "q46", "q59", "q65", "q79", "ss_max").map(queriesMap) - val impalaKitQueries = interactiveQueries ++ reportingQueries ++ deepAnalyticQueries + val reportingQueries = Seq("q3", "q7", "q27", "q43", "q53", "q89").map(queriesMap) + val deepAnalyticQueries = Seq("q34", "q46", "q59", "q65", "q79", "ss_max").map(queriesMap) + val impalaKitQueries = interactiveQueries ++ reportingQueries ++ deepAnalyticQueries } diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/SimpleQueries.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/SimpleQueries.scala index 1f7f3554..1cb1644e 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/SimpleQueries.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/SimpleQueries.scala @@ -16,32 +16,38 @@ package com.databricks.spark.sql.perf.tpcds -import com.databricks.spark.sql.perf.{ExecutionMode, Benchmark} +import com.databricks.spark.sql.perf.{Benchmark, ExecutionMode} trait SimpleQueries extends Benchmark { import ExecutionMode._ - val targetedPerfQueries = Seq( - // Query to measure scan performance. - ("stores-sales-scan", - """ + val targetedPerfQueries = Seq( + // Query to measure scan performance. 
+ ( + "stores-sales-scan", + """ |select * from store_sales where ss_item_sk = 1 - """.stripMargin), - ("fact-fact-join", - """ + """.stripMargin + ), + ( + "fact-fact-join", + """ | select count(*) from store_sales | join store_returns | on store_sales.ss_item_sk = store_returns.sr_item_sk | and store_sales.ss_ticket_number = store_returns.sr_ticket_number - """.stripMargin) - ).map { case (name, sqlText) => - Query(name = name, sqlText = sqlText, description = "", executionMode = ForeachResults) - } + """.stripMargin + ) + ).map { + case (name, sqlText) => + Query(name = name, sqlText = sqlText, description = "", executionMode = ForeachResults) + } - val q7Derived = Seq( - ("q7-simpleScan", - """ + val q7Derived = Seq( + ( + "q7-simpleScan", + """ |select | ss_quantity, | ss_list_price, @@ -54,9 +60,11 @@ trait SimpleQueries extends Benchmark { |from store_sales |where | ss_sold_date_sk between 2450815 and 2451179 - """.stripMargin), - - ("q7-twoMapJoins", """ + """.stripMargin + ), + ( + "q7-twoMapJoins", + """ |select | i_item_id, | ss_quantity, @@ -74,9 +82,11 @@ trait SimpleQueries extends Benchmark { | and cd_marital_status = 'W' | and cd_education_status = 'Primary' | and ss_sold_date_sk between 2450815 and 2451179 -- partition key filter - """.stripMargin), - - ("q7-fourMapJoins", """ + """.stripMargin + ), + ( + "q7-fourMapJoins", + """ |select | i_item_id, | ss_quantity, @@ -98,9 +108,11 @@ trait SimpleQueries extends Benchmark { | and d_year = 1998 | -- and ss_date between '1998-01-01' and '1998-12-31' | and ss_sold_date_sk between 2450815 and 2451179 -- partition key filter - """.stripMargin), - - ("q7-noOrderBy", """ + """.stripMargin + ), + ( + "q7-noOrderBy", + """ |select | i_item_id, | avg(ss_quantity) agg1, @@ -124,9 +136,11 @@ trait SimpleQueries extends Benchmark { | and ss_sold_date_sk between 2450815 and 2451179 -- partition key filter |group by | i_item_id - """.stripMargin), - - ("q7", """ + """.stripMargin + ), + ( + "q7", + """ |-- start 
query 1 in stream 0 using template query7.tpl |select | i_item_id, @@ -155,9 +169,11 @@ trait SimpleQueries extends Benchmark { | i_item_id |limit 100 |-- end query 1 in stream 0 using template query7.tpl - """.stripMargin), - - ("store_sales-selfjoin-1", """ + """.stripMargin + ), + ( + "store_sales-selfjoin-1", + """ |-- The join condition will yield many matches. |select | t1.ss_quantity, @@ -170,10 +186,11 @@ trait SimpleQueries extends Benchmark { |from store_sales t1 join store_sales t2 on t1.ss_item_sk = t2.ss_item_sk |where | t1.ss_sold_date_sk between 2450815 and 2451179 - """.stripMargin), - - - ("store_sales-selfjoin-2", """ + """.stripMargin + ), + ( + "store_sales-selfjoin-2", + """ |-- We ust comound primary key as the join condition. The size of output is comparable with the input table. |select | t1.ss_quantity, @@ -186,8 +203,10 @@ trait SimpleQueries extends Benchmark { |from store_sales t1 join store_sales t2 on t1.ss_item_sk = t2.ss_item_sk and t1.ss_ticket_number = t2.ss_ticket_number |where | t1.ss_sold_date_sk between 2450815 and 2451179 - """.stripMargin) - ).map { case (name, sqlText) => - Query(name = name, sqlText = sqlText, description = "", executionMode = ForeachResults) - } + """.stripMargin + ) + ).map { + case (name, sqlText) => + Query(name = name, sqlText = sqlText, description = "", executionMode = ForeachResults) + } } diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS.scala index 2f173f0e..1fb69a01 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS.scala @@ -21,18 +21,18 @@ import com.databricks.spark.sql.perf._ import org.apache.spark.SparkContext import org.apache.spark.sql.{SQLContext, SparkSession} -/** - * TPC-DS benchmark's dataset. - * - * @param sqlContext An existing SQLContext. - */ +/** TPC-DS benchmark's dataset. 
+ * + * @param sqlContext + * An existing SQLContext. + */ class TPCDS(@transient sqlContext: SQLContext) - extends Benchmark(sqlContext) - with ImpalaKitQueries - with SimpleQueries - with Tpcds_1_4_Queries - with Tpcds_2_4_Queries - with Serializable { + extends Benchmark(sqlContext) + with ImpalaKitQueries + with SimpleQueries + with Tpcds_1_4_Queries + with Tpcds_2_4_Queries + with Serializable { def this() = this(SparkSession.builder.getOrCreate().sqlContext) @@ -50,17 +50,16 @@ class TPCDS(@transient sqlContext: SQLContext) println(setQuery) sql(setQuery) } - */ - - /** - * Simple utilities to run the queries without persisting the results. */ + + /** Simple utilities to run the queries without persisting the results. + */ def explain(queries: Seq[Query], showPlan: Boolean = false): Unit = { val succeeded = mutable.ArrayBuffer.empty[String] queries.foreach { q => println(s"Query: ${q.name}") try { - val df = sqlContext.sql(q.sqlText.get) + val df = spark.sql(q.sqlText.get) if (showPlan) { df.explain() } else { @@ -80,28 +79,27 @@ class TPCDS(@transient sqlContext: SQLContext) val succeeded = mutable.ArrayBuffer.empty[String] queries.foreach { q => println(s"Query: ${q.name}") - val start = System.currentTimeMillis() - val df = sqlContext.sql(q.sqlText.get) - var failed = false + val start = System.currentTimeMillis() + val df = spark.sql(q.sqlText.get) + var failed = false val jobgroup = s"benchmark ${q.name}" val t = new Thread("query runner") { - override def run(): Unit = { + override def run(): Unit = try { - sqlContext.sparkContext.setJobGroup(jobgroup, jobgroup, true) + sparkContext.setJobGroup(jobgroup, jobgroup, true) df.show(numRows) } catch { case e: Exception => println("Failed to run: " + e) failed = true } - } } t.setDaemon(true) t.start() t.join(timeout) if (t.isAlive) { println(s"Timeout after $timeout seconds") - sqlContext.sparkContext.cancelJobGroup(jobgroup) + sparkContext.cancelJobGroup(jobgroup) t.interrupt() } else { if (!failed) { @@ 
-115,6 +113,3 @@ class TPCDS(@transient sqlContext: SQLContext) println(succeeded.map("\"" + _ + "\"")) } } - - - diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala index 8243cd34..7e404df6 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDSTables.scala @@ -28,7 +28,7 @@ class DSDGEN(dsdgenDir: String) extends DataGenerator { val dsdgen = s"$dsdgenDir/dsdgen" def generate(sparkContext: SparkContext, name: String, partitions: Int, scaleFactor: String) = { - val generatedData = { + val generatedData = sparkContext.parallelize(1 to partitions, partitions).flatMap { i => val localToolsDir = if (new java.io.File(dsdgen).exists) { dsdgenDir @@ -41,502 +41,550 @@ class DSDGEN(dsdgenDir: String) extends DataGenerator { // Note: RNGSEED is the RNG seed used by the data generator. Right now, it is fixed to 100. 
val parallel = if (partitions > 1) s"-parallel $partitions -child $i" else "" val commands = Seq( - "bash", "-c", - s"cd $localToolsDir && ./dsdgen -table $name -filter Y -scale $scaleFactor -RNGSEED 100 $parallel") + "bash", + "-c", + s"cd $localToolsDir && ./dsdgen -table $name -filter Y -scale $scaleFactor -RNGSEED 100 $parallel" + ) println(commands) BlockingLineStream(commands) } - } generatedData.setName(s"$name, sf=$scaleFactor, strings") generatedData } } - class TPCDSTables( - sqlContext: SQLContext, - dsdgenDir: String, - scaleFactor: String, - useDoubleForDecimal: Boolean = false, - useStringForDate: Boolean = false) - extends Tables(sqlContext, scaleFactor, useDoubleForDecimal, useStringForDate) { - import sqlContext.implicits._ + sqlContext: SQLContext, + dsdgenDir: String, + scaleFactor: String, + useDoubleForDecimal: Boolean = false, + useStringForDate: Boolean = false +) extends Tables(sqlContext, scaleFactor, useDoubleForDecimal, useStringForDate) { + import spark.implicits._ val dataGenerator = new DSDGEN(dsdgenDir) val tables = Seq( - Table("catalog_sales", + Table( + "catalog_sales", partitionColumns = "cs_sold_date_sk" :: Nil, - 'cs_sold_date_sk .int, - 'cs_sold_time_sk .int, - 'cs_ship_date_sk .int, - 'cs_bill_customer_sk .int, - 'cs_bill_cdemo_sk .int, - 'cs_bill_hdemo_sk .int, - 'cs_bill_addr_sk .int, - 'cs_ship_customer_sk .int, - 'cs_ship_cdemo_sk .int, - 'cs_ship_hdemo_sk .int, - 'cs_ship_addr_sk .int, - 'cs_call_center_sk .int, - 'cs_catalog_page_sk .int, - 'cs_ship_mode_sk .int, - 'cs_warehouse_sk .int, - 'cs_item_sk .int, - 'cs_promo_sk .int, - 'cs_order_number .long, - 'cs_quantity .int, - 'cs_wholesale_cost .decimal(7,2), - 'cs_list_price .decimal(7,2), - 'cs_sales_price .decimal(7,2), - 'cs_ext_discount_amt .decimal(7,2), - 'cs_ext_sales_price .decimal(7,2), - 'cs_ext_wholesale_cost .decimal(7,2), - 'cs_ext_list_price .decimal(7,2), - 'cs_ext_tax .decimal(7,2), - 'cs_coupon_amt .decimal(7,2), - 'cs_ext_ship_cost .decimal(7,2), - 
'cs_net_paid .decimal(7,2), - 'cs_net_paid_inc_tax .decimal(7,2), - 'cs_net_paid_inc_ship .decimal(7,2), - 'cs_net_paid_inc_ship_tax .decimal(7,2), - 'cs_net_profit .decimal(7,2)), - Table("catalog_returns", + 'cs_sold_date_sk.int, + 'cs_sold_time_sk.int, + 'cs_ship_date_sk.int, + 'cs_bill_customer_sk.int, + 'cs_bill_cdemo_sk.int, + 'cs_bill_hdemo_sk.int, + 'cs_bill_addr_sk.int, + 'cs_ship_customer_sk.int, + 'cs_ship_cdemo_sk.int, + 'cs_ship_hdemo_sk.int, + 'cs_ship_addr_sk.int, + 'cs_call_center_sk.int, + 'cs_catalog_page_sk.int, + 'cs_ship_mode_sk.int, + 'cs_warehouse_sk.int, + 'cs_item_sk.int, + 'cs_promo_sk.int, + 'cs_order_number.long, + 'cs_quantity.int, + 'cs_wholesale_cost.decimal(7, 2), + 'cs_list_price.decimal(7, 2), + 'cs_sales_price.decimal(7, 2), + 'cs_ext_discount_amt.decimal(7, 2), + 'cs_ext_sales_price.decimal(7, 2), + 'cs_ext_wholesale_cost.decimal(7, 2), + 'cs_ext_list_price.decimal(7, 2), + 'cs_ext_tax.decimal(7, 2), + 'cs_coupon_amt.decimal(7, 2), + 'cs_ext_ship_cost.decimal(7, 2), + 'cs_net_paid.decimal(7, 2), + 'cs_net_paid_inc_tax.decimal(7, 2), + 'cs_net_paid_inc_ship.decimal(7, 2), + 'cs_net_paid_inc_ship_tax.decimal(7, 2), + 'cs_net_profit.decimal(7, 2) + ), + Table( + "catalog_returns", partitionColumns = "cr_returned_date_sk" :: Nil, - 'cr_returned_date_sk .int, - 'cr_returned_time_sk .int, - 'cr_item_sk .int, - 'cr_refunded_customer_sk .int, - 'cr_refunded_cdemo_sk .int, - 'cr_refunded_hdemo_sk .int, - 'cr_refunded_addr_sk .int, - 'cr_returning_customer_sk .int, - 'cr_returning_cdemo_sk .int, - 'cr_returning_hdemo_sk .int, - 'cr_returning_addr_sk .int, - 'cr_call_center_sk .int, - 'cr_catalog_page_sk .int, - 'cr_ship_mode_sk .int, - 'cr_warehouse_sk .int, - 'cr_reason_sk .int, - 'cr_order_number .long, - 'cr_return_quantity .int, - 'cr_return_amount .decimal(7,2), - 'cr_return_tax .decimal(7,2), - 'cr_return_amt_inc_tax .decimal(7,2), - 'cr_fee .decimal(7,2), - 'cr_return_ship_cost .decimal(7,2), - 'cr_refunded_cash .decimal(7,2), - 
'cr_reversed_charge .decimal(7,2), - 'cr_store_credit .decimal(7,2), - 'cr_net_loss .decimal(7,2)), - Table("inventory", + 'cr_returned_date_sk.int, + 'cr_returned_time_sk.int, + 'cr_item_sk.int, + 'cr_refunded_customer_sk.int, + 'cr_refunded_cdemo_sk.int, + 'cr_refunded_hdemo_sk.int, + 'cr_refunded_addr_sk.int, + 'cr_returning_customer_sk.int, + 'cr_returning_cdemo_sk.int, + 'cr_returning_hdemo_sk.int, + 'cr_returning_addr_sk.int, + 'cr_call_center_sk.int, + 'cr_catalog_page_sk.int, + 'cr_ship_mode_sk.int, + 'cr_warehouse_sk.int, + 'cr_reason_sk.int, + 'cr_order_number.long, + 'cr_return_quantity.int, + 'cr_return_amount.decimal(7, 2), + 'cr_return_tax.decimal(7, 2), + 'cr_return_amt_inc_tax.decimal(7, 2), + 'cr_fee.decimal(7, 2), + 'cr_return_ship_cost.decimal(7, 2), + 'cr_refunded_cash.decimal(7, 2), + 'cr_reversed_charge.decimal(7, 2), + 'cr_store_credit.decimal(7, 2), + 'cr_net_loss.decimal(7, 2) + ), + Table( + "inventory", partitionColumns = "inv_date_sk" :: Nil, - 'inv_date_sk .int, - 'inv_item_sk .int, - 'inv_warehouse_sk .int, - 'inv_quantity_on_hand .int), - Table("store_sales", + 'inv_date_sk.int, + 'inv_item_sk.int, + 'inv_warehouse_sk.int, + 'inv_quantity_on_hand.int + ), + Table( + "store_sales", partitionColumns = "ss_sold_date_sk" :: Nil, - 'ss_sold_date_sk .int, - 'ss_sold_time_sk .int, - 'ss_item_sk .int, - 'ss_customer_sk .int, - 'ss_cdemo_sk .int, - 'ss_hdemo_sk .int, - 'ss_addr_sk .int, - 'ss_store_sk .int, - 'ss_promo_sk .int, - 'ss_ticket_number .long, - 'ss_quantity .int, - 'ss_wholesale_cost .decimal(7,2), - 'ss_list_price .decimal(7,2), - 'ss_sales_price .decimal(7,2), - 'ss_ext_discount_amt .decimal(7,2), - 'ss_ext_sales_price .decimal(7,2), - 'ss_ext_wholesale_cost.decimal(7,2), - 'ss_ext_list_price .decimal(7,2), - 'ss_ext_tax .decimal(7,2), - 'ss_coupon_amt .decimal(7,2), - 'ss_net_paid .decimal(7,2), - 'ss_net_paid_inc_tax .decimal(7,2), - 'ss_net_profit .decimal(7,2)), - Table("store_returns", - partitionColumns = 
"sr_returned_date_sk" ::Nil, - 'sr_returned_date_sk .int, - 'sr_return_time_sk .int, - 'sr_item_sk .int, - 'sr_customer_sk .int, - 'sr_cdemo_sk .int, - 'sr_hdemo_sk .int, - 'sr_addr_sk .int, - 'sr_store_sk .int, - 'sr_reason_sk .int, - 'sr_ticket_number .long, - 'sr_return_quantity .int, - 'sr_return_amt .decimal(7,2), - 'sr_return_tax .decimal(7,2), - 'sr_return_amt_inc_tax.decimal(7,2), - 'sr_fee .decimal(7,2), - 'sr_return_ship_cost .decimal(7,2), - 'sr_refunded_cash .decimal(7,2), - 'sr_reversed_charge .decimal(7,2), - 'sr_store_credit .decimal(7,2), - 'sr_net_loss .decimal(7,2)), - Table("web_sales", + 'ss_sold_date_sk.int, + 'ss_sold_time_sk.int, + 'ss_item_sk.int, + 'ss_customer_sk.int, + 'ss_cdemo_sk.int, + 'ss_hdemo_sk.int, + 'ss_addr_sk.int, + 'ss_store_sk.int, + 'ss_promo_sk.int, + 'ss_ticket_number.long, + 'ss_quantity.int, + 'ss_wholesale_cost.decimal(7, 2), + 'ss_list_price.decimal(7, 2), + 'ss_sales_price.decimal(7, 2), + 'ss_ext_discount_amt.decimal(7, 2), + 'ss_ext_sales_price.decimal(7, 2), + 'ss_ext_wholesale_cost.decimal(7, 2), + 'ss_ext_list_price.decimal(7, 2), + 'ss_ext_tax.decimal(7, 2), + 'ss_coupon_amt.decimal(7, 2), + 'ss_net_paid.decimal(7, 2), + 'ss_net_paid_inc_tax.decimal(7, 2), + 'ss_net_profit.decimal(7, 2) + ), + Table( + "store_returns", + partitionColumns = "sr_returned_date_sk" :: Nil, + 'sr_returned_date_sk.int, + 'sr_return_time_sk.int, + 'sr_item_sk.int, + 'sr_customer_sk.int, + 'sr_cdemo_sk.int, + 'sr_hdemo_sk.int, + 'sr_addr_sk.int, + 'sr_store_sk.int, + 'sr_reason_sk.int, + 'sr_ticket_number.long, + 'sr_return_quantity.int, + 'sr_return_amt.decimal(7, 2), + 'sr_return_tax.decimal(7, 2), + 'sr_return_amt_inc_tax.decimal(7, 2), + 'sr_fee.decimal(7, 2), + 'sr_return_ship_cost.decimal(7, 2), + 'sr_refunded_cash.decimal(7, 2), + 'sr_reversed_charge.decimal(7, 2), + 'sr_store_credit.decimal(7, 2), + 'sr_net_loss.decimal(7, 2) + ), + Table( + "web_sales", partitionColumns = "ws_sold_date_sk" :: Nil, - 'ws_sold_date_sk .int, - 
'ws_sold_time_sk .int, - 'ws_ship_date_sk .int, - 'ws_item_sk .int, - 'ws_bill_customer_sk .int, - 'ws_bill_cdemo_sk .int, - 'ws_bill_hdemo_sk .int, - 'ws_bill_addr_sk .int, - 'ws_ship_customer_sk .int, - 'ws_ship_cdemo_sk .int, - 'ws_ship_hdemo_sk .int, - 'ws_ship_addr_sk .int, - 'ws_web_page_sk .int, - 'ws_web_site_sk .int, - 'ws_ship_mode_sk .int, - 'ws_warehouse_sk .int, - 'ws_promo_sk .int, - 'ws_order_number .long, - 'ws_quantity .int, - 'ws_wholesale_cost .decimal(7,2), - 'ws_list_price .decimal(7,2), - 'ws_sales_price .decimal(7,2), - 'ws_ext_discount_amt .decimal(7,2), - 'ws_ext_sales_price .decimal(7,2), - 'ws_ext_wholesale_cost .decimal(7,2), - 'ws_ext_list_price .decimal(7,2), - 'ws_ext_tax .decimal(7,2), - 'ws_coupon_amt .decimal(7,2), - 'ws_ext_ship_cost .decimal(7,2), - 'ws_net_paid .decimal(7,2), - 'ws_net_paid_inc_tax .decimal(7,2), - 'ws_net_paid_inc_ship .decimal(7,2), - 'ws_net_paid_inc_ship_tax .decimal(7,2), - 'ws_net_profit .decimal(7,2)), - Table("web_returns", - partitionColumns = "wr_returned_date_sk" ::Nil, - 'wr_returned_date_sk .int, - 'wr_returned_time_sk .int, - 'wr_item_sk .int, - 'wr_refunded_customer_sk .int, - 'wr_refunded_cdemo_sk .int, - 'wr_refunded_hdemo_sk .int, - 'wr_refunded_addr_sk .int, - 'wr_returning_customer_sk .int, - 'wr_returning_cdemo_sk .int, - 'wr_returning_hdemo_sk .int, - 'wr_returning_addr_sk .int, - 'wr_web_page_sk .int, - 'wr_reason_sk .int, - 'wr_order_number .long, - 'wr_return_quantity .int, - 'wr_return_amt .decimal(7,2), - 'wr_return_tax .decimal(7,2), - 'wr_return_amt_inc_tax .decimal(7,2), - 'wr_fee .decimal(7,2), - 'wr_return_ship_cost .decimal(7,2), - 'wr_refunded_cash .decimal(7,2), - 'wr_reversed_charge .decimal(7,2), - 'wr_account_credit .decimal(7,2), - 'wr_net_loss .decimal(7,2)), - Table("call_center", + 'ws_sold_date_sk.int, + 'ws_sold_time_sk.int, + 'ws_ship_date_sk.int, + 'ws_item_sk.int, + 'ws_bill_customer_sk.int, + 'ws_bill_cdemo_sk.int, + 'ws_bill_hdemo_sk.int, + 'ws_bill_addr_sk.int, + 
'ws_ship_customer_sk.int, + 'ws_ship_cdemo_sk.int, + 'ws_ship_hdemo_sk.int, + 'ws_ship_addr_sk.int, + 'ws_web_page_sk.int, + 'ws_web_site_sk.int, + 'ws_ship_mode_sk.int, + 'ws_warehouse_sk.int, + 'ws_promo_sk.int, + 'ws_order_number.long, + 'ws_quantity.int, + 'ws_wholesale_cost.decimal(7, 2), + 'ws_list_price.decimal(7, 2), + 'ws_sales_price.decimal(7, 2), + 'ws_ext_discount_amt.decimal(7, 2), + 'ws_ext_sales_price.decimal(7, 2), + 'ws_ext_wholesale_cost.decimal(7, 2), + 'ws_ext_list_price.decimal(7, 2), + 'ws_ext_tax.decimal(7, 2), + 'ws_coupon_amt.decimal(7, 2), + 'ws_ext_ship_cost.decimal(7, 2), + 'ws_net_paid.decimal(7, 2), + 'ws_net_paid_inc_tax.decimal(7, 2), + 'ws_net_paid_inc_ship.decimal(7, 2), + 'ws_net_paid_inc_ship_tax.decimal(7, 2), + 'ws_net_profit.decimal(7, 2) + ), + Table( + "web_returns", + partitionColumns = "wr_returned_date_sk" :: Nil, + 'wr_returned_date_sk.int, + 'wr_returned_time_sk.int, + 'wr_item_sk.int, + 'wr_refunded_customer_sk.int, + 'wr_refunded_cdemo_sk.int, + 'wr_refunded_hdemo_sk.int, + 'wr_refunded_addr_sk.int, + 'wr_returning_customer_sk.int, + 'wr_returning_cdemo_sk.int, + 'wr_returning_hdemo_sk.int, + 'wr_returning_addr_sk.int, + 'wr_web_page_sk.int, + 'wr_reason_sk.int, + 'wr_order_number.long, + 'wr_return_quantity.int, + 'wr_return_amt.decimal(7, 2), + 'wr_return_tax.decimal(7, 2), + 'wr_return_amt_inc_tax.decimal(7, 2), + 'wr_fee.decimal(7, 2), + 'wr_return_ship_cost.decimal(7, 2), + 'wr_refunded_cash.decimal(7, 2), + 'wr_reversed_charge.decimal(7, 2), + 'wr_account_credit.decimal(7, 2), + 'wr_net_loss.decimal(7, 2) + ), + Table( + "call_center", partitionColumns = Nil, - 'cc_call_center_sk .int, - 'cc_call_center_id .string, - 'cc_rec_start_date .date, - 'cc_rec_end_date .date, - 'cc_closed_date_sk .int, - 'cc_open_date_sk .int, - 'cc_name .string, - 'cc_class .string, - 'cc_employees .int, - 'cc_sq_ft .int, - 'cc_hours .string, - 'cc_manager .string, - 'cc_mkt_id .int, - 'cc_mkt_class .string, - 'cc_mkt_desc .string, - 
'cc_market_manager .string, - 'cc_division .int, - 'cc_division_name .string, - 'cc_company .int, - 'cc_company_name .string, - 'cc_street_number .string, - 'cc_street_name .string, - 'cc_street_type .string, - 'cc_suite_number .string, - 'cc_city .string, - 'cc_county .string, - 'cc_state .string, - 'cc_zip .string, - 'cc_country .string, - 'cc_gmt_offset .decimal(5,2), - 'cc_tax_percentage .decimal(5,2)), - Table("catalog_page", + 'cc_call_center_sk.int, + 'cc_call_center_id.string, + 'cc_rec_start_date.date, + 'cc_rec_end_date.date, + 'cc_closed_date_sk.int, + 'cc_open_date_sk.int, + 'cc_name.string, + 'cc_class.string, + 'cc_employees.int, + 'cc_sq_ft.int, + 'cc_hours.string, + 'cc_manager.string, + 'cc_mkt_id.int, + 'cc_mkt_class.string, + 'cc_mkt_desc.string, + 'cc_market_manager.string, + 'cc_division.int, + 'cc_division_name.string, + 'cc_company.int, + 'cc_company_name.string, + 'cc_street_number.string, + 'cc_street_name.string, + 'cc_street_type.string, + 'cc_suite_number.string, + 'cc_city.string, + 'cc_county.string, + 'cc_state.string, + 'cc_zip.string, + 'cc_country.string, + 'cc_gmt_offset.decimal(5, 2), + 'cc_tax_percentage.decimal(5, 2) + ), + Table( + "catalog_page", partitionColumns = Nil, - 'cp_catalog_page_sk .int, - 'cp_catalog_page_id .string, - 'cp_start_date_sk .int, - 'cp_end_date_sk .int, - 'cp_department .string, - 'cp_catalog_number .int, - 'cp_catalog_page_number .int, - 'cp_description .string, - 'cp_type .string), - Table("customer", + 'cp_catalog_page_sk.int, + 'cp_catalog_page_id.string, + 'cp_start_date_sk.int, + 'cp_end_date_sk.int, + 'cp_department.string, + 'cp_catalog_number.int, + 'cp_catalog_page_number.int, + 'cp_description.string, + 'cp_type.string + ), + Table( + "customer", partitionColumns = Nil, - 'c_customer_sk .int, - 'c_customer_id .string, - 'c_current_cdemo_sk .int, - 'c_current_hdemo_sk .int, - 'c_current_addr_sk .int, - 'c_first_shipto_date_sk .int, - 'c_first_sales_date_sk .int, - 'c_salutation .string, - 
'c_first_name .string, - 'c_last_name .string, - 'c_preferred_cust_flag .string, - 'c_birth_day .int, - 'c_birth_month .int, - 'c_birth_year .int, - 'c_birth_country .string, - 'c_login .string, - 'c_email_address .string, - 'c_last_review_date .string), - Table("customer_address", + 'c_customer_sk.int, + 'c_customer_id.string, + 'c_current_cdemo_sk.int, + 'c_current_hdemo_sk.int, + 'c_current_addr_sk.int, + 'c_first_shipto_date_sk.int, + 'c_first_sales_date_sk.int, + 'c_salutation.string, + 'c_first_name.string, + 'c_last_name.string, + 'c_preferred_cust_flag.string, + 'c_birth_day.int, + 'c_birth_month.int, + 'c_birth_year.int, + 'c_birth_country.string, + 'c_login.string, + 'c_email_address.string, + 'c_last_review_date.string + ), + Table( + "customer_address", partitionColumns = Nil, - 'ca_address_sk .int, - 'ca_address_id .string, - 'ca_street_number .string, - 'ca_street_name .string, - 'ca_street_type .string, - 'ca_suite_number .string, - 'ca_city .string, - 'ca_county .string, - 'ca_state .string, - 'ca_zip .string, - 'ca_country .string, - 'ca_gmt_offset .decimal(5,2), - 'ca_location_type .string), - Table("customer_demographics", + 'ca_address_sk.int, + 'ca_address_id.string, + 'ca_street_number.string, + 'ca_street_name.string, + 'ca_street_type.string, + 'ca_suite_number.string, + 'ca_city.string, + 'ca_county.string, + 'ca_state.string, + 'ca_zip.string, + 'ca_country.string, + 'ca_gmt_offset.decimal(5, 2), + 'ca_location_type.string + ), + Table( + "customer_demographics", partitionColumns = Nil, - 'cd_demo_sk .int, - 'cd_gender .string, - 'cd_marital_status .string, - 'cd_education_status .string, - 'cd_purchase_estimate .int, - 'cd_credit_rating .string, - 'cd_dep_count .int, - 'cd_dep_employed_count .int, - 'cd_dep_college_count .int), - Table("date_dim", + 'cd_demo_sk.int, + 'cd_gender.string, + 'cd_marital_status.string, + 'cd_education_status.string, + 'cd_purchase_estimate.int, + 'cd_credit_rating.string, + 'cd_dep_count.int, + 
'cd_dep_employed_count.int, + 'cd_dep_college_count.int + ), + Table( + "date_dim", partitionColumns = Nil, - 'd_date_sk .int, - 'd_date_id .string, - 'd_date .date, - 'd_month_seq .int, - 'd_week_seq .int, - 'd_quarter_seq .int, - 'd_year .int, - 'd_dow .int, - 'd_moy .int, - 'd_dom .int, - 'd_qoy .int, - 'd_fy_year .int, - 'd_fy_quarter_seq .int, - 'd_fy_week_seq .int, - 'd_day_name .string, - 'd_quarter_name .string, - 'd_holiday .string, - 'd_weekend .string, - 'd_following_holiday .string, - 'd_first_dom .int, - 'd_last_dom .int, - 'd_same_day_ly .int, - 'd_same_day_lq .int, - 'd_current_day .string, - 'd_current_week .string, - 'd_current_month .string, - 'd_current_quarter .string, - 'd_current_year .string), - Table("household_demographics", + 'd_date_sk.int, + 'd_date_id.string, + 'd_date.date, + 'd_month_seq.int, + 'd_week_seq.int, + 'd_quarter_seq.int, + 'd_year.int, + 'd_dow.int, + 'd_moy.int, + 'd_dom.int, + 'd_qoy.int, + 'd_fy_year.int, + 'd_fy_quarter_seq.int, + 'd_fy_week_seq.int, + 'd_day_name.string, + 'd_quarter_name.string, + 'd_holiday.string, + 'd_weekend.string, + 'd_following_holiday.string, + 'd_first_dom.int, + 'd_last_dom.int, + 'd_same_day_ly.int, + 'd_same_day_lq.int, + 'd_current_day.string, + 'd_current_week.string, + 'd_current_month.string, + 'd_current_quarter.string, + 'd_current_year.string + ), + Table( + "household_demographics", partitionColumns = Nil, - 'hd_demo_sk .int, - 'hd_income_band_sk .int, - 'hd_buy_potential .string, - 'hd_dep_count .int, - 'hd_vehicle_count .int), - Table("income_band", + 'hd_demo_sk.int, + 'hd_income_band_sk.int, + 'hd_buy_potential.string, + 'hd_dep_count.int, + 'hd_vehicle_count.int + ), + Table( + "income_band", partitionColumns = Nil, - 'ib_income_band_sk .int, - 'ib_lower_bound .int, - 'ib_upper_bound .int), - Table("item", + 'ib_income_band_sk.int, + 'ib_lower_bound.int, + 'ib_upper_bound.int + ), + Table( + "item", partitionColumns = Nil, - 'i_item_sk .int, - 'i_item_id .string, - 
'i_rec_start_date .date, - 'i_rec_end_date .date, - 'i_item_desc .string, - 'i_current_price .decimal(7,2), - 'i_wholesale_cost .decimal(7,2), - 'i_brand_id .int, - 'i_brand .string, - 'i_class_id .int, - 'i_class .string, - 'i_category_id .int, - 'i_category .string, - 'i_manufact_id .int, - 'i_manufact .string, - 'i_size .string, - 'i_formulation .string, - 'i_color .string, - 'i_units .string, - 'i_container .string, - 'i_manager_id .int, - 'i_product_name .string), - Table("promotion", + 'i_item_sk.int, + 'i_item_id.string, + 'i_rec_start_date.date, + 'i_rec_end_date.date, + 'i_item_desc.string, + 'i_current_price.decimal(7, 2), + 'i_wholesale_cost.decimal(7, 2), + 'i_brand_id.int, + 'i_brand.string, + 'i_class_id.int, + 'i_class.string, + 'i_category_id.int, + 'i_category.string, + 'i_manufact_id.int, + 'i_manufact.string, + 'i_size.string, + 'i_formulation.string, + 'i_color.string, + 'i_units.string, + 'i_container.string, + 'i_manager_id.int, + 'i_product_name.string + ), + Table( + "promotion", partitionColumns = Nil, - 'p_promo_sk .int, - 'p_promo_id .string, - 'p_start_date_sk .int, - 'p_end_date_sk .int, - 'p_item_sk .int, - 'p_cost .decimal(15,2), - 'p_response_target .int, - 'p_promo_name .string, - 'p_channel_dmail .string, - 'p_channel_email .string, - 'p_channel_catalog .string, - 'p_channel_tv .string, - 'p_channel_radio .string, - 'p_channel_press .string, - 'p_channel_event .string, - 'p_channel_demo .string, - 'p_channel_details .string, - 'p_purpose .string, - 'p_discount_active .string), - Table("reason", + 'p_promo_sk.int, + 'p_promo_id.string, + 'p_start_date_sk.int, + 'p_end_date_sk.int, + 'p_item_sk.int, + 'p_cost.decimal(15, 2), + 'p_response_target.int, + 'p_promo_name.string, + 'p_channel_dmail.string, + 'p_channel_email.string, + 'p_channel_catalog.string, + 'p_channel_tv.string, + 'p_channel_radio.string, + 'p_channel_press.string, + 'p_channel_event.string, + 'p_channel_demo.string, + 'p_channel_details.string, + 'p_purpose.string, 
+ 'p_discount_active.string + ), + Table( + "reason", partitionColumns = Nil, - 'r_reason_sk .int, - 'r_reason_id .string, - 'r_reason_desc .string), - Table("ship_mode", + 'r_reason_sk.int, + 'r_reason_id.string, + 'r_reason_desc.string + ), + Table( + "ship_mode", partitionColumns = Nil, - 'sm_ship_mode_sk .int, - 'sm_ship_mode_id .string, - 'sm_type .string, - 'sm_code .string, - 'sm_carrier .string, - 'sm_contract .string), - Table("store", + 'sm_ship_mode_sk.int, + 'sm_ship_mode_id.string, + 'sm_type.string, + 'sm_code.string, + 'sm_carrier.string, + 'sm_contract.string + ), + Table( + "store", partitionColumns = Nil, - 's_store_sk .int, - 's_store_id .string, - 's_rec_start_date .date, - 's_rec_end_date .date, - 's_closed_date_sk .int, - 's_store_name .string, - 's_number_employees .int, - 's_floor_space .int, - 's_hours .string, - 's_manager .string, - 's_market_id .int, - 's_geography_class .string, - 's_market_desc .string, - 's_market_manager .string, - 's_division_id .int, - 's_division_name .string, - 's_company_id .int, - 's_company_name .string, - 's_street_number .string, - 's_street_name .string, - 's_street_type .string, - 's_suite_number .string, - 's_city .string, - 's_county .string, - 's_state .string, - 's_zip .string, - 's_country .string, - 's_gmt_offset .decimal(5,2), - 's_tax_precentage .decimal(5,2)), - Table("time_dim", + 's_store_sk.int, + 's_store_id.string, + 's_rec_start_date.date, + 's_rec_end_date.date, + 's_closed_date_sk.int, + 's_store_name.string, + 's_number_employees.int, + 's_floor_space.int, + 's_hours.string, + 's_manager.string, + 's_market_id.int, + 's_geography_class.string, + 's_market_desc.string, + 's_market_manager.string, + 's_division_id.int, + 's_division_name.string, + 's_company_id.int, + 's_company_name.string, + 's_street_number.string, + 's_street_name.string, + 's_street_type.string, + 's_suite_number.string, + 's_city.string, + 's_county.string, + 's_state.string, + 's_zip.string, + 's_country.string, + 
's_gmt_offset.decimal(5, 2), + 's_tax_precentage.decimal(5, 2) + ), + Table( + "time_dim", partitionColumns = Nil, - 't_time_sk .int, - 't_time_id .string, - 't_time .int, - 't_hour .int, - 't_minute .int, - 't_second .int, - 't_am_pm .string, - 't_shift .string, - 't_sub_shift .string, - 't_meal_time .string), - Table("warehouse", + 't_time_sk.int, + 't_time_id.string, + 't_time.int, + 't_hour.int, + 't_minute.int, + 't_second.int, + 't_am_pm.string, + 't_shift.string, + 't_sub_shift.string, + 't_meal_time.string + ), + Table( + "warehouse", partitionColumns = Nil, - 'w_warehouse_sk .int, - 'w_warehouse_id .string, - 'w_warehouse_name .string, - 'w_warehouse_sq_ft .int, - 'w_street_number .string, - 'w_street_name .string, - 'w_street_type .string, - 'w_suite_number .string, - 'w_city .string, - 'w_county .string, - 'w_state .string, - 'w_zip .string, - 'w_country .string, - 'w_gmt_offset .decimal(5,2)), - Table("web_page", + 'w_warehouse_sk.int, + 'w_warehouse_id.string, + 'w_warehouse_name.string, + 'w_warehouse_sq_ft.int, + 'w_street_number.string, + 'w_street_name.string, + 'w_street_type.string, + 'w_suite_number.string, + 'w_city.string, + 'w_county.string, + 'w_state.string, + 'w_zip.string, + 'w_country.string, + 'w_gmt_offset.decimal(5, 2) + ), + Table( + "web_page", partitionColumns = Nil, - 'wp_web_page_sk .int, - 'wp_web_page_id .string, - 'wp_rec_start_date .date, - 'wp_rec_end_date .date, - 'wp_creation_date_sk .int, - 'wp_access_date_sk .int, - 'wp_autogen_flag .string, - 'wp_customer_sk .int, - 'wp_url .string, - 'wp_type .string, - 'wp_char_count .int, - 'wp_link_count .int, - 'wp_image_count .int, - 'wp_max_ad_count .int), - Table("web_site", + 'wp_web_page_sk.int, + 'wp_web_page_id.string, + 'wp_rec_start_date.date, + 'wp_rec_end_date.date, + 'wp_creation_date_sk.int, + 'wp_access_date_sk.int, + 'wp_autogen_flag.string, + 'wp_customer_sk.int, + 'wp_url.string, + 'wp_type.string, + 'wp_char_count.int, + 'wp_link_count.int, + 'wp_image_count.int, 
+ 'wp_max_ad_count.int + ), + Table( + "web_site", partitionColumns = Nil, - 'web_site_sk .int, - 'web_site_id .string, - 'web_rec_start_date .date, - 'web_rec_end_date .date, - 'web_name .string, - 'web_open_date_sk .int, - 'web_close_date_sk .int, - 'web_class .string, - 'web_manager .string, - 'web_mkt_id .int, - 'web_mkt_class .string, - 'web_mkt_desc .string, - 'web_market_manager .string, - 'web_company_id .int, - 'web_company_name .string, - 'web_street_number .string, - 'web_street_name .string, - 'web_street_type .string, - 'web_suite_number .string, - 'web_city .string, - 'web_county .string, - 'web_state .string, - 'web_zip .string, - 'web_country .string, - 'web_gmt_offset .decimal(5,2), - 'web_tax_percentage .decimal(5,2)) + 'web_site_sk.int, + 'web_site_id.string, + 'web_rec_start_date.date, + 'web_rec_end_date.date, + 'web_name.string, + 'web_open_date_sk.int, + 'web_close_date_sk.int, + 'web_class.string, + 'web_manager.string, + 'web_mkt_id.int, + 'web_mkt_class.string, + 'web_mkt_desc.string, + 'web_market_manager.string, + 'web_company_id.int, + 'web_company_name.string, + 'web_street_number.string, + 'web_street_name.string, + 'web_street_type.string, + 'web_suite_number.string, + 'web_city.string, + 'web_county.string, + 'web_state.string, + 'web_zip.string, + 'web_country.string, + 'web_gmt_offset.decimal(5, 2), + 'web_tax_percentage.decimal(5, 2) + ) ).map(_.convertTypes()) } diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_1_4_Queries.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_1_4_Queries.scala index 55196787..44ba25b6 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_1_4_Queries.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_1_4_Queries.scala @@ -18,11 +18,9 @@ package com.databricks.spark.sql.perf.tpcds import com.databricks.spark.sql.perf.{Benchmark, ExecutionMode, Query} -/** - * This implements the official TPCDS v1.4 queries with only cosmetic 
modifications - * (noted for each query). - * Don't modify this except for these kind of modifications. - */ +/** This implements the official TPCDS v1.4 queries with only cosmetic modifications (noted for each + * query). Don't modify this except for these kind of modifications. + */ trait Tpcds_1_4_Queries extends Benchmark { import ExecutionMode._ @@ -33,7 +31,9 @@ trait Tpcds_1_4_Queries extends Benchmark { // Queries the TPCDS 1.4 queries using the qualifcations values in the templates. val tpcds1_4Queries = Seq( - ("q1", """ + ( + "q1", + """ | WITH customer_total_return AS | (SELECT sr_customer_sk AS ctr_customer_sk, sr_store_sk AS ctr_store_sk, | sum(sr_return_amt) AS ctr_total_return @@ -50,8 +50,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | AND s_state = 'TN' | AND ctr1.ctr_customer_sk = c_customer_sk | ORDER BY c_customer_id LIMIT 100 - """.stripMargin), - ("q2", """ + """.stripMargin + ), + ( + "q2", + """ | WITH wscs as | (SELECT sold_date_sk, sales_price | FROM (SELECT ws_sold_date_sk sold_date_sk, ws_ext_sales_price sales_price @@ -102,8 +105,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | WHERE date_dim.d_week_seq = wswscs.d_week_seq AND d_year = 2001 + 1) z | WHERE d_week_seq1=d_week_seq2-53 | ORDER BY d_week_seq1 - """.stripMargin), - ("q3", """ + """.stripMargin + ), + ( + "q3", + """ | SELECT dt.d_year, item.i_brand_id brand_id, item.i_brand brand,SUM(ss_ext_sales_price) sum_agg | FROM date_dim dt, store_sales, item | WHERE dt.d_date_sk = store_sales.ss_sold_date_sk @@ -113,8 +119,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | GROUP BY dt.d_year, item.i_brand, item.i_brand_id | ORDER BY dt.d_year, sum_agg desc, brand_id | LIMIT 100 - """.stripMargin), - ("q4", """ + """.stripMargin + ), + ( + "q4", + """ |WITH year_total AS ( | SELECT c_customer_id customer_id, | c_first_name customer_first_name, @@ -221,10 +230,13 @@ trait Tpcds_1_4_Queries extends Benchmark { | t_s_secyear.customer_login, | t_s_secyear.customer_email_address | 
LIMIT 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add // Modifications: "||" -> concat - ("q5", """ + ( + "q5", + """ | WITH ssr AS | (SELECT s_store_id, | sum(sales_price) as sales, @@ -342,8 +354,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | GROUP BY ROLLUP (channel, id) | ORDER BY channel, id | LIMIT 100 - """.stripMargin), - ("q6", """ + """.stripMargin + ), + ( + "q6", + """ | SELECT a.ca_state state, count(*) cnt | FROM | customer_address a, customer c, store_sales s, date_dim d, item i @@ -360,8 +375,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | GROUP BY a.ca_state | HAVING count(*) >= 10 | ORDER BY cnt LIMIT 100 - """.stripMargin), - ("q7", """ + """.stripMargin + ), + ( + "q7", + """ | SELECT i_item_id, | avg(ss_quantity) agg1, | avg(ss_list_price) agg2, @@ -379,8 +397,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | d_year = 2000 | GROUP BY i_item_id | ORDER BY i_item_id LIMIT 100 - """.stripMargin), - ("q8", """ + """.stripMargin + ), + ( + "q8", + """ | select s_store_name, sum(ss_net_profit) | from store_sales, date_dim, store, | (SELECT ca_zip @@ -462,8 +483,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and (substr(s_zip,1,2) = substr(V1.ca_zip,1,2)) | group by s_store_name | order by s_store_name LIMIT 100 - """.stripMargin), - ("q9", s""" + """.stripMargin + ), + ( + "q9", + s""" |select case when (select count(*) from store_sales | where ss_quantity between 1 and 20) > ${rc(0)} | then (select avg(ss_ext_discount_amt) from store_sales @@ -496,8 +520,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where ss_quantity between 81 and 100) end bucket5 |from reason |where r_reason_sk = 1 - """.stripMargin), - ("q10", """ + """.stripMargin + ), + ( + "q10", + """ | select | cd_gender, cd_marital_status, cd_education_status, count(*) cnt1, | cd_purchase_estimate, count(*) cnt2, cd_credit_rating, count(*) cnt3, @@ -542,8 +569,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | 
cd_dep_employed_count, | cd_dep_college_count |LIMIT 100 - """.stripMargin), - ("q11", """ + """.stripMargin + ), + ( + "q11", + """ | with year_total as ( | select c_customer_id customer_id | ,c_first_name customer_first_name @@ -607,9 +637,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else null end | order by t_s_secyear.customer_preferred_cust_flag | LIMIT 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q12", """ + ( + "q12", + """ | select | i_item_desc, i_category, i_class, i_current_price, | sum(ws_ext_sales_price) as itemrevenue, @@ -628,8 +661,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | order by | i_category, i_class, i_item_id, i_item_desc, revenueratio | LIMIT 100 - """.stripMargin), - ("q13", """ + """.stripMargin + ), + ( + "q13", + """ | select avg(ss_quantity) | ,avg(ss_ext_sales_price) | ,avg(ss_ext_wholesale_cost) @@ -678,8 +714,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and ca_state in ('VA', 'TX', 'MS') | and ss_net_profit between 50 and 250 | )) - """.stripMargin), - ("q14a", """ + """.stripMargin + ), + ( + "q14a", + """ |with cross_items as | (select i_item_sk ss_item_sk | from item, @@ -758,8 +797,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by rollup (channel, i_brand_id,i_class_id,i_category_id) | order by channel,i_brand_id,i_class_id,i_category_id | limit 100 - """.stripMargin), - ("q14b", """ + """.stripMargin + ), + ( + "q14b", + """ | with cross_items as | (select i_item_sk ss_item_sk | from item, @@ -823,8 +865,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and this_year.i_category_id = last_year.i_category_id | order by this_year.channel, this_year.i_brand_id, this_year.i_class_id, this_year.i_category_id | limit 100 - """.stripMargin), - ("q15", """ + """.stripMargin + ), + ( + "q15", + """ | select ca_zip, sum(cs_sales_price) | from catalog_sales, customer, 
customer_address, date_dim | where cs_bill_customer_sk = c_customer_sk @@ -838,9 +883,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by ca_zip | order by ca_zip | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: " -> ` - ("q16", """ + ( + "q16", + """ | select | count(distinct cs_order_number) as `order count`, | sum(cs_ext_ship_cost) as `total shipping cost`, @@ -863,8 +911,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where cs1.cs_order_number = cr1.cr_order_number) | order by count(distinct cs_order_number) | limit 100 - """.stripMargin), - ("q17", """ + """.stripMargin + ), + ( + "q17", + """ | select i_item_id | ,i_item_desc | ,s_state @@ -896,9 +947,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id, i_item_desc, s_state | order by i_item_id, i_item_desc, s_state | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "numeric" -> "decimal" - ("q18", """ + ( + "q18", + """ | select i_item_id, | ca_country, | ca_state, @@ -926,8 +980,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by rollup (i_item_id, ca_country, ca_state, ca_county) | order by ca_country, ca_state, ca_county, i_item_id | LIMIT 100 - """.stripMargin), - ("q19", """ + """.stripMargin + ), + ( + "q19", + """ | select i_brand_id brand_id, i_brand brand, i_manufact_id, i_manufact, | sum(ss_ext_sales_price) ext_price | from date_dim, store_sales, item,customer,customer_address,store @@ -943,8 +1000,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_brand, i_brand_id, i_manufact_id, i_manufact | order by ext_price desc, brand, brand_id, i_manufact_id, i_manufact | limit 100 - """.stripMargin), - ("q20", """ + """.stripMargin + ), + ( + "q20", + """ |select i_item_desc | ,i_category | ,i_class @@ -961,9 +1021,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id, i_item_desc, i_category, i_class, i_current_price | order by i_category, i_class, i_item_id, i_item_desc, revenueratio | 
limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q21", """ + ( + "q21", + """ | select * from( | select w_warehouse_name, i_item_id, | sum(case when (cast(d_date as date) < cast ('2000-03-11' as date)) @@ -986,8 +1049,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | end) between 2.0/3.0 and 3.0/2.0 | order by w_warehouse_name, i_item_id | limit 100 - """.stripMargin), - ("q22", """ + """.stripMargin + ), + ( + "q22", + """ | select i_product_name, i_brand, i_class, i_category, avg(inv_quantity_on_hand) qoh | from inventory, date_dim, item, warehouse | where inv_date_sk=d_date_sk @@ -997,8 +1063,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by rollup(i_product_name, i_brand, i_class, i_category) | order by qoh, i_product_name, i_brand, i_class, i_category | limit 100 - """.stripMargin), - ("q23a", """ + """.stripMargin + ), + ( + "q23a", + """ | with frequent_ss_items as | (select substr(i_item_desc,1,30) itemdesc,i_item_sk item_sk,d_date solddate,count(*) cnt | from store_sales, date_dim, item @@ -1039,8 +1108,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and ws_item_sk in (select item_sk from frequent_ss_items) | and ws_bill_customer_sk in (select c_customer_sk from best_ss_customer))) y | limit 100 - """.stripMargin), - ("q23b", """ + """.stripMargin + ), + ( + "q23b", + """ | | with frequent_ss_items as | (select substr(i_item_desc,1,30) itemdesc,i_item_sk item_sk,d_date solddate,count(*) cnt @@ -1088,8 +1160,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by c_last_name,c_first_name)) y | order by c_last_name,c_first_name,sales | limit 100 - """.stripMargin), - ("q24a", """ + """.stripMargin + ), + ( + "q24a", + """ | with ssales as | (select c_last_name, c_first_name, s_store_name, ca_state, s_state, i_color, | i_current_price, i_manager_id, i_units, i_size, sum(ss_net_paid) netpaid @@ -1109,8 +1184,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where i_color = 'pale' | group by 
c_last_name, c_first_name, s_store_name | having sum(netpaid) > (select 0.05*avg(netpaid) from ssales) - """.stripMargin), - ("q24b", """ + """.stripMargin + ), + ( + "q24b", + """ | with ssales as | (select c_last_name, c_first_name, s_store_name, ca_state, s_state, i_color, | i_current_price, i_manager_id, i_units, i_size, sum(ss_net_paid) netpaid @@ -1130,8 +1208,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where i_color = 'chiffon' | group by c_last_name, c_first_name, s_store_name | having sum(netpaid) > (select 0.05*avg(netpaid) from ssales) - """.stripMargin), - ("q25", """ + """.stripMargin + ), + ( + "q25", + """ | select i_item_id, i_item_desc, s_store_id, s_store_name, | sum(ss_net_profit) as store_sales_profit, | sum(sr_net_loss) as store_returns_loss, @@ -1161,8 +1242,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | order by | i_item_id, i_item_desc, s_store_id, s_store_name | limit 100 - """.stripMargin), - ("q26", """ + """.stripMargin + ), + ( + "q26", + """ | select i_item_id, | avg(cs_quantity) agg1, | avg(cs_list_price) agg2, @@ -1181,8 +1265,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id | order by i_item_id | limit 100 - """.stripMargin), - ("q27", """ + """.stripMargin + ), + ( + "q27", + """ | select i_item_id, | s_state, grouping(s_state) g_state, | avg(ss_quantity) agg1, @@ -1202,8 +1289,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by rollup (i_item_id, s_state) | order by i_item_id, s_state | limit 100 - """.stripMargin), - ("q28", """ + """.stripMargin + ), + ( + "q28", + """ | select * | from (select avg(ss_list_price) B1_LP | ,count(ss_list_price) B1_CNT @@ -1254,8 +1344,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | or ss_coupon_amt between 7326 and 7326+1000 | or ss_wholesale_cost between 7 and 7+20)) B6 | limit 100 - """.stripMargin), - ("q29", """ + """.stripMargin + ), + ( + "q29", + """ | select | i_item_id | ,i_item_desc @@ -1288,8 +1381,11 @@ trait Tpcds_1_4_Queries extends 
Benchmark { | order by | i_item_id, i_item_desc, s_store_id, s_store_name | limit 100 - """.stripMargin), - ("q30", """ + """.stripMargin + ), + ( + "q30", + """ | with customer_total_return as | (select wr_returning_customer_sk as ctr_customer_sk | ,ca_state as ctr_state, @@ -1313,8 +1409,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,c_birth_day,c_birth_month,c_birth_year,c_birth_country,c_login,c_email_address | ,c_last_review_date,ctr_total_return | limit 100 - """.stripMargin), - ("q31", """ + """.stripMargin + ), + ( + "q31", + """ | with ss as | (select ca_county,d_qoy, d_year,sum(ss_ext_sales_price) as store_sales | from store_sales,date_dim,customer_address @@ -1359,9 +1458,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | and case when ws2.web_sales > 0 then ws3.web_sales/ws2.web_sales else null end | > case when ss2.store_sales > 0 then ss3.store_sales/ss2.store_sales else null end | order by ss1.ca_county - """.stripMargin), + """.stripMargin + ), // Modifications: " -> ` - ("q32", """ + ( + "q32", + """ | select sum(cs_ext_discount_amt) as `excess discount amount` | from | catalog_sales, item, date_dim @@ -1377,8 +1479,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and d_date between '2000-01-27]' and (cast('2000-01-27' as date) + interval 90 days) | and d_date_sk = cs_sold_date_sk) |limit 100 - """.stripMargin), - ("q33", """ + """.stripMargin + ), + ( + "q33", + """ | with ss as ( | select | i_manufact_id,sum(ss_ext_sales_price) total_sales @@ -1432,8 +1537,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_manufact_id | order by total_sales |limit 100 - """.stripMargin), - ("q34", """ + """.stripMargin + ), + ( + "q34", + """ | select c_last_name, c_first_name, c_salutation, c_preferred_cust_flag, ss_ticket_number, | cnt | FROM @@ -1457,8 +1565,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where ss_customer_sk = c_customer_sk | and cnt between 15 and 20 | order by 
c_last_name,c_first_name,c_salutation,c_preferred_cust_flag desc - """.stripMargin), - ("q35", """ + """.stripMargin + ), + ( + "q35", + """ | select | ca_state, | cd_gender, @@ -1502,8 +1613,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | order by ca_state, cd_gender, cd_marital_status, cd_dep_count, | cd_dep_employed_count, cd_dep_college_count | limit 100 - """.stripMargin), - ("q36", """ + """.stripMargin + ), + ( + "q36", + """ | select | sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin | ,i_category @@ -1527,9 +1641,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,case when lochierarchy = 0 then i_category end | ,rank_within_parent | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q37", """ + ( + "q37", + """ | select i_item_id, i_item_desc, i_current_price | from item, inventory, date_dim, catalog_sales | where i_current_price between 68 and 68 + 30 @@ -1542,8 +1659,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id,i_item_desc,i_current_price | order by i_item_id | limit 100 - """.stripMargin), - ("q38", """ + """.stripMargin + ), + ( + "q38", + """ | select count(*) from ( | select distinct c_last_name, c_first_name, d_date | from store_sales, date_dim, customer @@ -1564,8 +1684,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and d_month_seq between 1200 and 1200 + 11 | ) hot_cust | limit 100 - """.stripMargin), - ("q39a", """ + """.stripMargin + ), + ( + "q39a", + """ | with inv as | (select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy | ,stdev,mean, case mean when 0 then null else stdev/mean end cov @@ -1587,8 +1710,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and inv2.d_moy=1+1 | order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov | ,inv2.d_moy,inv2.mean, inv2.cov - """.stripMargin), - ("q39b", """ + """.stripMargin + ), + ( + "q39b", + """ | with inv as | (select w_warehouse_name,w_warehouse_sk,i_item_sk,d_moy | ,stdev,mean, case 
mean when 0 then null else stdev/mean end cov @@ -1611,9 +1737,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | and inv1.cov > 1.5 | order by inv1.w_warehouse_sk,inv1.i_item_sk,inv1.d_moy,inv1.mean,inv1.cov | ,inv2.d_moy,inv2.mean, inv2.cov - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q40", """ + ( + "q40", + """ | select | w_state | ,i_item_id @@ -1636,8 +1765,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by w_state,i_item_id | order by w_state,i_item_id | limit 100 - """.stripMargin), - ("q41", """ + """.stripMargin + ), + ( + "q41", + """ | select distinct(i_product_name) | from item i1 | where i_manufact_id between 738 and 738+40 @@ -1687,8 +1819,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | )))) > 0 | order by i_product_name | limit 100 - """.stripMargin), - ("q42", """ + """.stripMargin + ), + ( + "q42", + """ | select dt.d_year, item.i_category_id, item.i_category, sum(ss_ext_sales_price) | from date_dim dt, store_sales, item | where dt.d_date_sk = store_sales.ss_sold_date_sk @@ -1703,8 +1838,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,item.i_category_id | ,item.i_category | limit 100 - """.stripMargin), - ("q43", """ + """.stripMargin + ), + ( + "q43", + """ | select s_store_name, s_store_id, | sum(case when (d_day_name='Sunday') then ss_sales_price else null end) sun_sales, | sum(case when (d_day_name='Monday') then ss_sales_price else null end) mon_sales, @@ -1722,8 +1860,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | order by s_store_name, s_store_id,sun_sales,mon_sales,tue_sales,wed_sales, | thu_sales,fri_sales,sat_sales | limit 100 - """.stripMargin), - ("q44", """ + """.stripMargin + ), + ( + "q44", + """ | select asceding.rnk, i1.i_product_name best_performing, i2.i_product_name worst_performing | from(select * | from (select item_sk,rank() over (order by rank_col asc) rnk @@ -1755,8 +1896,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and i2.i_item_sk=descending.item_sk 
| order by asceding.rnk | limit 100 - """.stripMargin), - ("q45", """ + """.stripMargin + ), + ( + "q45", + """ | select ca_zip, ca_city, sum(ws_sales_price) | from web_sales, customer, customer_address, date_dim, item | where ws_bill_customer_sk = c_customer_sk @@ -1774,8 +1918,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by ca_zip, ca_city | order by ca_zip, ca_city | limit 100 - """.stripMargin), - ("q46", """ + """.stripMargin + ), + ( + "q46", + """ | select c_last_name, c_first_name, ca_city, bought_city, ss_ticket_number, amt,profit | from | (select ss_ticket_number @@ -1799,8 +1946,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and current_addr.ca_city <> bought_city | order by c_last_name, c_first_name, ca_city, bought_city, ss_ticket_number | limit 100 - """.stripMargin), - ("q47", """ + """.stripMargin + ), + ( + "q47", + """ | with v1 as( | select i_category, i_brand, | s_store_name, s_company_name, @@ -1847,8 +1997,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | case when avg_monthly_sales > 0 then abs(sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1 | order by sum_sales - avg_monthly_sales, 3 | limit 100 - """.stripMargin), - ("q48", """ + """.stripMargin + ), + ( + "q48", + """ | select sum (ss_quantity) | from store_sales, store, customer_demographics, customer_address, date_dim | where s_store_sk = ss_store_sk @@ -1912,9 +2065,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | and ss_net_profit between 50 and 25000 | ) | ) - """.stripMargin), + """.stripMargin + ), // Modifications: "dec" -> "decimal" - ("q49", """ + ( + "q49", + """ | select 'web' as channel, web.item, web.return_ratio, web.return_rank, web.currency_rank | from ( | select @@ -2010,9 +2166,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | where (store.return_rank <= 10 or store.currency_rank <= 10) | order by 1,4,5 | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: " -> ` - ("q50", """ + ( + "q50", + """ | select 
| s_store_name, s_company_id, s_street_number, s_street_name, s_street_type, | s_suite_number, s_city, s_county, s_state, s_zip @@ -2042,8 +2201,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | s_store_name, s_company_id, s_street_number, s_street_name, s_street_type, | s_suite_number, s_city, s_county, s_state, s_zip | limit 100 - """.stripMargin), - ("q51", """ + """.stripMargin + ), + ( + "q51", + """ | WITH web_v1 as ( | select | ws_item_sk item_sk, d_date, @@ -2080,8 +2242,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where web_cumulative > store_cumulative | order by item_sk, d_date | limit 100 - """.stripMargin), - ("q52", """ + """.stripMargin + ), + ( + "q52", + """ | select dt.d_year | ,item.i_brand_id brand_id | ,item.i_brand brand @@ -2095,8 +2260,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by dt.d_year, item.i_brand, item.i_brand_id | order by dt.d_year, ext_price desc, brand_id |limit 100 - """.stripMargin), - ("q53", """ + """.stripMargin + ), + ( + "q53", + """ | select * from | (select i_manufact_id, | sum(ss_sales_price) sum_sales, @@ -2124,8 +2292,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | sum_sales, | i_manufact_id | limit 100 - """.stripMargin), - ("q54", """ + """.stripMargin + ), + ( + "q54", + """ | with my_customers as ( | select distinct c_customer_sk | , c_current_addr_sk @@ -2177,8 +2348,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by segment | order by segment, num_customers | limit 100 - """.stripMargin), - ("q55", """ + """.stripMargin + ), + ( + "q55", + """ |select i_brand_id brand_id, i_brand brand, | sum(ss_ext_sales_price) ext_price | from date_dim, store_sales, item @@ -2190,8 +2364,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_brand, i_brand_id | order by ext_price desc, brand_id | limit 100 - """.stripMargin), - ("q56", """ + """.stripMargin + ), + ( + "q56", + """ | with ss as ( | select i_item_id,sum(ss_ext_sales_price) total_sales | from @@ -2240,8 +2417,11 @@ 
trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id | order by total_sales | limit 100 - """.stripMargin), - ("q57", """ + """.stripMargin + ), + ( + "q57", + """ | with v1 as( | select i_category, i_brand, | cc_name, @@ -2283,8 +2463,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | case when avg_monthly_sales > 0 then abs(sum_sales - avg_monthly_sales) / avg_monthly_sales else null end > 0.1 | order by sum_sales - avg_monthly_sales, 3 | limit 100 - """.stripMargin), - ("q58", """ + """.stripMargin + ), + ( + "q58", + """ | with ss_items as | (select i_item_id item_id, sum(ss_ext_sales_price) ss_item_rev | from store_sales, item, date_dim @@ -2338,8 +2521,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and ws_item_rev between 0.9 * cs_item_rev and 1.1 * cs_item_rev | order by item_id, ss_item_rev | limit 100 - """.stripMargin), - ("q59", """ + """.stripMargin + ), + ( + "q59", + """ | with wss as | (select d_week_seq, | ss_store_sk, @@ -2381,8 +2567,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and d_week_seq1=d_week_seq2-52 | order by s_store_name1,s_store_id1,d_week_seq1 | limit 100 - """.stripMargin), - ("q60", """ + """.stripMargin + ), + ( + "q60", + """ | with ss as ( | select i_item_id,sum(ss_ext_sales_price) total_sales | from store_sales, date_dim, customer_address, item @@ -2428,8 +2617,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id | order by i_item_id, total_sales | limit 100 - """.stripMargin), - ("q61", s""" + """.stripMargin + ), + ( + "q61", + s""" | select promotions,total,cast(promotions as decimal(15,4))/cast(total as decimal(15,4))*100 | from | (select sum(ss_ext_sales_price) promotions @@ -2460,9 +2652,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | and d_moy = 11) all_sales | order by promotions, total | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: " -> ` - ("q62", """ + ( + "q62", + """ | select | substr(w_warehouse_name,1,20) | ,sm_type @@ -2488,8 +2683,11 @@ 
trait Tpcds_1_4_Queries extends Benchmark { | order by | substr(w_warehouse_name,1,20), sm_type, web_name | limit 100 - """.stripMargin), - ("q63", """ + """.stripMargin + ), + ( + "q63", + """ | select * | from (select i_manager_id | ,sum(ss_sales_price) sum_sales @@ -2517,8 +2715,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,avg_monthly_sales | ,sum_sales | limit 100 - """.stripMargin), - ("q64", """ + """.stripMargin + ), + ( + "q64", + """ | with cs_ui as | (select cs_item_sk | ,sum(cs_ext_list_price) as sale,sum(cr_refunded_cash+cr_reversed_charge+cr_store_credit) as refund @@ -2576,8 +2777,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | cs1.store_name = cs2.store_name and | cs1.store_zip = cs2.store_zip | order by cs1.product_name, cs1.store_name, cs2.cnt - """.stripMargin), - ("q65", """ + """.stripMargin + ), + ( + "q65", + """ | select | s_store_name, i_item_desc, sc.revenue, i_current_price, i_wholesale_cost, i_brand | from store, item, @@ -2599,9 +2803,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | i_item_sk = sc.ss_item_sk | order by s_store_name, i_item_desc | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "||" -> concat - ("q66", """ + ( + "q66", + """ | select w_warehouse_name, w_warehouse_sq_ft, w_city, w_county, w_state, w_country, | ship_carriers, year | ,sum(jan_sales) as jan_sales @@ -2728,8 +2935,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ship_carriers, year | order by w_warehouse_name | limit 100 - """.stripMargin), - ("q67", """ + """.stripMargin + ), + ( + "q67", + """ | select * from | (select i_category, i_class, i_brand, i_product_name, d_year, d_qoy, d_moy, s_store_id, | sumsales, rank() over (partition by i_category order by sumsales desc) rk @@ -2748,8 +2958,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | i_category, i_class, i_brand, i_product_name, d_year, | d_qoy, d_moy, s_store_id, sumsales, rk | limit 100 - """.stripMargin), - ("q68", """ + """.stripMargin + ), + ( + "q68", + 
""" | select | c_last_name, c_first_name, ca_city, bought_city, ss_ticket_number, extended_price, | extended_tax, list_price @@ -2776,8 +2989,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and current_addr.ca_city <> bought_city | order by c_last_name, ss_ticket_number | limit 100 - """.stripMargin), - ("q69", """ + """.stripMargin + ), + ( + "q69", + """ | select | cd_gender, cd_marital_status, cd_education_status, count(*) cnt1, | cd_purchase_estimate, count(*) cnt2, cd_credit_rating, count(*) cnt3 @@ -2807,8 +3023,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | order by cd_gender, cd_marital_status, cd_education_status, | cd_purchase_estimate, cd_credit_rating | limit 100 - """.stripMargin), - ("q70", """ + """.stripMargin + ), + ( + "q70", + """ | select | sum(ss_net_profit) as total_sum, s_state, s_county | ,grouping(s_state)+grouping(s_county) as lochierarchy @@ -2838,8 +3057,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,case when lochierarchy = 0 then s_state end | ,rank_within_parent | limit 100 - """.stripMargin), - ("q71", """ + """.stripMargin + ), + ( + "q71", + """ | select i_brand_id brand_id, i_brand brand,t_hour,t_minute, | sum(ext_price) ext_price | from item, @@ -2880,9 +3102,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | and (t_meal_time = 'breakfast' or t_meal_time = 'dinner') | group by i_brand, i_brand_id,t_hour,t_minute | order by ext_price desc, brand_id - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q72", """ + ( + "q72", + """ | select i_item_desc | ,w_warehouse_name | ,d1.d_week_seq @@ -2911,8 +3136,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_desc,w_warehouse_name,d1.d_week_seq | order by total_cnt desc, i_item_desc, w_warehouse_name, d_week_seq | limit 100 - """.stripMargin), - ("q73", """ + """.stripMargin + ), + ( + "q73", + """ | select | c_last_name, c_first_name, c_salutation, c_preferred_cust_flag, | ss_ticket_number, cnt from @@ -2933,8 +3161,11 @@ 
trait Tpcds_1_4_Queries extends Benchmark { | where ss_customer_sk = c_customer_sk | and cnt between 1 and 5 | order by cnt desc - """.stripMargin), - ("q74", """ + """.stripMargin + ), + ( + "q74", + """ | with year_total as ( | select | c_customer_id customer_id, c_first_name customer_first_name, @@ -2981,8 +3212,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | > case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else null end | order by 1, 1, 1 | limit 100 - """.stripMargin), - ("q75", """ + """.stripMargin + ), + ( + "q75", + """ | WITH all_sales AS ( | SELECT | d_year, i_brand_id, i_class_id, i_category_id, i_manufact_id, @@ -3037,8 +3271,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | AND CAST(curr_yr.sales_cnt AS DECIMAL(17,2))/CAST(prev_yr.sales_cnt AS DECIMAL(17,2))<0.9 | ORDER BY sales_cnt_diff | LIMIT 100 - """.stripMargin), - ("q76", """ + """.stripMargin + ), + ( + "q76", + """ | SELECT | channel, col_name, d_year, d_qoy, i_category, COUNT(*) sales_cnt, | SUM(ext_sales_price) sales_amt @@ -3069,9 +3306,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | GROUP BY channel, col_name, d_year, d_qoy, i_category | ORDER BY channel, col_name, d_year, d_qoy, i_category | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q77", """ + ( + "q77", + """ | with ss as | (select s_store_sk, sum(ss_ext_sales_price) as sales, sum(ss_net_profit) as profit | from store_sales, date_dim, store @@ -3139,8 +3379,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by rollup(channel, id) | order by channel, id | limit 100 - """.stripMargin), - ("q78", """ + """.stripMargin + ), + ( + "q78", + """ | with ws as | (select d_year AS ws_sold_year, ws_item_sk, | ws_bill_customer_sk ws_customer_sk, @@ -3195,8 +3438,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | other_chan_sales_price, | round(ss_qty/(coalesce(ws_qty+cs_qty,1)),2) | limit 100 - """.stripMargin), - ("q79", """ + 
""".stripMargin + ), + ( + "q79", + """ | select | c_last_name,c_first_name,substr(s_city,1,30),ss_ticket_number,amt,profit | from @@ -3218,10 +3464,13 @@ trait Tpcds_1_4_Queries extends Benchmark { | where ss_customer_sk = c_customer_sk | order by c_last_name,c_first_name,substr(s_city,1,30), profit | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add // Modifications: "||" -> "concat" - ("q80", """ + ( + "q80", + """ | with ssr as | (select s_store_id as store_id, | sum(ss_ext_sales_price) as sales, @@ -3289,8 +3538,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by rollup (channel, id) | order by channel, id | limit 100 - """.stripMargin), - ("q81", """ + """.stripMargin + ), + ( + "q81", + """ | with customer_total_return as | (select | cr_returning_customer_sk as ctr_customer_sk, ca_state as ctr_state, @@ -3315,8 +3567,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,ca_street_type,ca_suite_number,ca_city,ca_county,ca_state,ca_zip,ca_country,ca_gmt_offset | ,ca_location_type,ctr_total_return | limit 100 - """.stripMargin), - ("q82", """ + """.stripMargin + ), + ( + "q82", + """ | select i_item_id, i_item_desc, i_current_price | from item, inventory, date_dim, store_sales | where i_current_price between 62 and 62+30 @@ -3329,8 +3584,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by i_item_id,i_item_desc,i_current_price | order by i_item_id | limit 100 - """.stripMargin), - ("q83", """ + """.stripMargin + ), + ( + "q83", + """ | with sr_items as | (select i_item_id item_id, sum(sr_return_quantity) sr_item_qty | from store_returns, item, date_dim @@ -3368,9 +3626,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | and sr_items.item_id=wr_items.item_id | order by sr_items.item_id, sr_item_qty | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "||" -> concat - ("q84", """ + ( + "q84", + """ | select c_customer_id as customer_id | ,concat(c_last_name, ', ', c_first_name) as 
customername | from customer @@ -3389,8 +3650,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and sr_cdemo_sk = cd_demo_sk | order by c_customer_id | limit 100 - """.stripMargin), - ("q85", """ + """.stripMargin + ), + ( + "q85", + """ | select | substr(r_reason_desc,1,20), avg(ws_quantity), avg(wr_refunded_cash), avg(wr_fee) | from web_sales, web_returns, web_page, customer_demographics cd1, @@ -3470,8 +3734,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ,avg(wr_refunded_cash) | ,avg(wr_fee) | limit 100 - """.stripMargin), - ("q86", """ + """.stripMargin + ), + ( + "q86", + """ | select sum(ws_net_paid) as total_sum, i_category, i_class, | grouping(i_category)+grouping(i_class) as lochierarchy, | rank() over ( @@ -3490,8 +3757,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | case when lochierarchy = 0 then i_category end, | rank_within_parent | limit 100 - """.stripMargin), - ("q87", """ + """.stripMargin + ), + ( + "q87", + """ | select count(*) | from ((select distinct c_last_name, c_first_name, d_date | from store_sales, date_dim, customer @@ -3511,8 +3781,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and web_sales.ws_bill_customer_sk = customer.c_customer_sk | and d_month_seq between 1200 and 1200+11) |) cool_cust - """.stripMargin), - ("q88", """ + """.stripMargin + ), + ( + "q88", + """ | select * | from | (select count(*) h8_30_to_9 @@ -3603,8 +3876,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | (household_demographics.hd_dep_count = 2 and household_demographics.hd_vehicle_count<=2+2) or | (household_demographics.hd_dep_count = 0 and household_demographics.hd_vehicle_count<=0+2)) | and store.s_store_name = 'ese') s8 - """.stripMargin), - ("q89", """ + """.stripMargin + ), + ( + "q89", + """ | select * | from( | select i_category, i_class, i_brand, @@ -3628,8 +3904,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where case when (avg_monthly_sales <> 0) then (abs(sum_sales - avg_monthly_sales) / avg_monthly_sales) else null end > 
0.1 | order by sum_sales - avg_monthly_sales, s_store_name | limit 100 - """.stripMargin), - ("q90", """ + """.stripMargin + ), + ( + "q90", + """ | select cast(amc as decimal(15,4))/cast(pmc as decimal(15,4)) am_pm_ratio | from ( select count(*) amc | from web_sales, household_demographics , time_dim, web_page @@ -3649,8 +3928,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and web_page.wp_char_count between 5000 and 5200) pt | order by am_pm_ratio | limit 100 - """.stripMargin), - ("q91", """ + """.stripMargin + ), + ( + "q91", + """ | select | cc_call_center_id Call_Center, cc_name Call_Center_Name, cc_manager Manager, | sum(cr_net_loss) Returns_Loss @@ -3672,10 +3954,13 @@ trait Tpcds_1_4_Queries extends Benchmark { | and ca_gmt_offset = -7 | group by cc_call_center_id,cc_name,cc_manager,cd_marital_status,cd_education_status | order by sum(cr_net_loss) desc - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add // Modifications: " -> ` - ("q92", """ + ( + "q92", + """ | select sum(ws_ext_discount_amt) as `Excess Discount Amount` | from web_sales, item, date_dim | where i_manufact_id = 350 @@ -3692,8 +3977,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | ) | order by sum(ws_ext_discount_amt) | limit 100 - """.stripMargin), - ("q93", """ + """.stripMargin + ), + ( + "q93", + """ | select ss_customer_sk, sum(act_sales) sumsales | from (select | ss_item_sk, ss_ticket_number, ss_customer_sk, @@ -3707,10 +3995,13 @@ trait Tpcds_1_4_Queries extends Benchmark { | group by ss_customer_sk | order by sumsales, ss_customer_sk | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add // Modifications: " -> ` - ("q94", """ + ( + "q94", + """ | select | count(distinct ws_order_number) as `order count` | ,sum(ws_ext_ship_cost) as `total shipping cost` @@ -3734,9 +4025,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | where ws1.ws_order_number = wr1.wr_order_number) | order by count(distinct 
ws_order_number) | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q95", """ + ( + "q95", + """ | with ws_wh as | (select ws1.ws_order_number,ws1.ws_warehouse_sk wh1,ws2.ws_warehouse_sk wh2 | from web_sales ws1,web_sales ws2 @@ -3763,8 +4057,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | where wr_order_number = ws_wh.ws_order_number) | order by count(distinct ws_order_number) | limit 100 - """.stripMargin), - ("q96", """ + """.stripMargin + ), + ( + "q96", + """ | select count(*) | from store_sales, household_demographics, time_dim, store | where ss_sold_time_sk = time_dim.t_time_sk @@ -3776,8 +4073,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | and store.s_store_name = 'ese' | order by count(*) | limit 100 - """.stripMargin), - ("q97", """ + """.stripMargin + ), + ( + "q97", + """ | with ssci as ( | select ss_customer_sk customer_sk, ss_item_sk item_sk | from store_sales,date_dim @@ -3796,9 +4096,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | from ssci full outer join csci on (ssci.customer_sk=csci.customer_sk | and ssci.item_sk = csci.item_sk) | limit 100 - """.stripMargin), + """.stripMargin + ), // Modifications: "+ days" -> date_add - ("q98", """ + ( + "q98", + """ |select i_item_desc, i_category, i_class, i_current_price | ,sum(ss_ext_sales_price) as itemrevenue | ,sum(ss_ext_sales_price)*100/sum(sum(ss_ext_sales_price)) over @@ -3815,9 +4118,12 @@ trait Tpcds_1_4_Queries extends Benchmark { | i_item_id, i_item_desc, i_category, i_class, i_current_price |order by | i_category, i_class, i_item_id, i_item_desc, revenueratio - """.stripMargin), + """.stripMargin + ), // Modifications: " -> ` - ("q99", """ + ( + "q99", + """ | select | substr(w_warehouse_name,1,20), sm_type, cc_name | ,sum(case when (cs_ship_date_sk - cs_sold_date_sk <= 30 ) then 1 else 0 end) as `30 days` @@ -3840,9 +4146,11 @@ trait Tpcds_1_4_Queries extends Benchmark { | substr(w_warehouse_name,1,20), sm_type, cc_name | order by 
substr(w_warehouse_name,1,20), sm_type, cc_name | limit 100 - """.stripMargin), - ("qSsMax", - """ + """.stripMargin + ), + ( + "qSsMax", + """ |select | count(*) as total, | count(ss_sold_date_sk) as not_null_total, @@ -3857,23 +4165,100 @@ trait Tpcds_1_4_Queries extends Benchmark { | max(ss_store_sk) as max_ss_store_sk, | max(ss_promo_sk) as max_ss_promo_sk |from store_sales - """.stripMargin) - ).map { case (name, sqlText) => - Query(name + "-v1.4", sqlText, description = "TPCDS 1.4 Query", executionMode = CollectResults) + """.stripMargin + ) + ).map { + case (name, sqlText) => + Query( + name + "-v1.4", + sqlText, + description = "TPCDS 1.4 Query", + executionMode = CollectResults + ) } val tpcds1_4QueriesMap = tpcds1_4Queries.map(q => q.name.split("-").get(0) -> q).toMap val runnable: Seq[Query] = Seq( - "q1", "q2", "q3", "q4", "q5", "q7", "q8", "q9", - "q11", "q12", "q13", "q15", "q17", "q18", "q19", - "q20", "q21", "q22", "q25", "q26", "q27", "q28", "q29", - "q31", "q34", "q36", "q37", "q38", "q39a", "q39b", - "q40", "q42", "q43", "q44", "q46", "q47", "q48", "q49", - "q50", "q51", "q52", "q53", "q54", "q55", "q57", "q59", - "q61", "q62", "q63", "q64", "q65", "q66", "q67", "q68", - "q71", "q72", "q73", "q74", "q75", "q76", "q77", "q78", "q79", - "q80", "q82", "q84", "q85", "q86", "q87", "q88", "q89", - "q90", "q91", "q93", "q96", "q97", "q98", "q99", "qSsMax").map(tpcds1_4QueriesMap) + "q1", + "q2", + "q3", + "q4", + "q5", + "q7", + "q8", + "q9", + "q11", + "q12", + "q13", + "q15", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q25", + "q26", + "q27", + "q28", + "q29", + "q31", + "q34", + "q36", + "q37", + "q38", + "q39a", + "q39b", + "q40", + "q42", + "q43", + "q44", + "q46", + "q47", + "q48", + "q49", + "q50", + "q51", + "q52", + "q53", + "q54", + "q55", + "q57", + "q59", + "q61", + "q62", + "q63", + "q64", + "q65", + "q66", + "q67", + "q68", + "q71", + "q72", + "q73", + "q74", + "q75", + "q76", + "q77", + "q78", + "q79", + "q80", + "q82", + 
"q84", + "q85", + "q86", + "q87", + "q88", + "q89", + "q90", + "q91", + "q93", + "q96", + "q97", + "q98", + "q99", + "qSsMax" + ).map(tpcds1_4QueriesMap) val all: Seq[Query] = tpcds1_4QueriesMap.values.toSeq } diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_2_4_Queries.scala b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_2_4_Queries.scala index f78dfe04..e2f997f4 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_2_4_Queries.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpcds/TPCDS_2_4_Queries.scala @@ -20,32 +20,128 @@ import org.apache.commons.io.IOUtils import com.databricks.spark.sql.perf.{Benchmark, ExecutionMode, Query} -/** - * This implements the official TPCDS v2.4 queries with only cosmetic modifications. - */ +/** This implements the official TPCDS v2.4 queries with only cosmetic modifications. + */ trait Tpcds_2_4_Queries extends Benchmark { import ExecutionMode._ val queryNames = Seq( - "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14a", "q14b", "q15", "q16", "q17", "q18", "q19", - "q20", "q21", "q22", "q23a", "q23b", "q24a", "q24b", "q25", "q26", "q27", - "q28", "q29", "q30", "q31", "q32", "q33", "q34", "q35", "q36", "q37", - "q38", "q39a", "q39b", "q40", "q41", "q42", "q43", "q44", "q45", "q46", "q47", - "q48", "q49", "q50", "q51", "q52", "q53", "q54", "q55", "q56", "q57", "q58", - "q59", "q60", "q61", "q62", "q63", "q64", "q65", "q66", "q67", "q68", "q69", - "q70", "q71", "q72", "q73", "q74", "q75", "q76", "q77", "q78", "q79", - "q80", "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", - "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14a", + "q14b", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23a", + "q23b", + "q24a", + "q24b", + "q25", + "q26", + "q27", + "q28", + "q29", 
+ "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39a", + "q39b", + "q40", + "q41", + "q42", + "q43", + "q44", + "q45", + "q46", + "q47", + "q48", + "q49", + "q50", + "q51", + "q52", + "q53", + "q54", + "q55", + "q56", + "q57", + "q58", + "q59", + "q60", + "q61", + "q62", + "q63", + "q64", + "q65", + "q66", + "q67", + "q68", + "q69", + "q70", + "q71", + "q72", + "q73", + "q74", + "q75", + "q76", + "q77", + "q78", + "q79", + "q80", + "q81", + "q82", + "q83", + "q84", + "q85", + "q86", + "q87", + "q88", + "q89", + "q90", + "q91", + "q92", + "q93", + "q94", + "q95", + "q96", + "q97", + "q98", + "q99", "ss_max" ) val tpcds2_4Queries = queryNames.map { queryName => - val queryContent: String = IOUtils.toString( - getClass().getClassLoader().getResourceAsStream(s"tpcds_2_4/$queryName.sql")) - Query(queryName + "-v2.4", queryContent, description = "TPCDS 2.4 Query", - executionMode = CollectResults) + val queryContent: String = + IOUtils.toString(getClass().getClassLoader().getResourceAsStream(s"tpcds_2_4/$queryName.sql")) + Query( + queryName + "-v2.4", + queryContent, + description = "TPCDS 2.4 Query", + executionMode = CollectResults + ) } val tpcds2_4QueriesMap = tpcds2_4Queries.map(q => q.name.split("-").get(0) -> q).toMap diff --git a/src/main/scala/com/databricks/spark/sql/perf/tpch/TPCH.scala b/src/main/scala/com/databricks/spark/sql/perf/tpch/TPCH.scala index 5a23edf9..6206729c 100644 --- a/src/main/scala/com/databricks/spark/sql/perf/tpch/TPCH.scala +++ b/src/main/scala/com/databricks/spark/sql/perf/tpch/TPCH.scala @@ -26,37 +26,41 @@ import org.apache.spark.sql.SQLContext class DBGEN(dbgenDir: String, params: Seq[String]) extends DataGenerator { val dbgen = s"$dbgenDir/dbgen" - def generate(sparkContext: SparkContext,name: String, partitions: Int, scaleFactor: String) = { - val smallTables = Seq("nation", "region") + def generate(sparkContext: SparkContext, name: String, partitions: Int, scaleFactor: String) = { + val 
smallTables = Seq("nation", "region") val numPartitions = if (partitions > 1 && !smallTables.contains(name)) partitions else 1 - val generatedData = { - sparkContext.parallelize(1 to numPartitions, numPartitions).flatMap { i => - val localToolsDir = if (new java.io.File(dbgen).exists) { - dbgenDir - } else if (new java.io.File(s"/$dbgenDir").exists) { - s"/$dbgenDir" - } else { - sys.error(s"Could not find dbgen at $dbgen or /$dbgenDir. Run install") + val generatedData = + sparkContext + .parallelize(1 to numPartitions, numPartitions) + .flatMap { i => + val localToolsDir = if (new java.io.File(dbgen).exists) { + dbgenDir + } else if (new java.io.File(s"/$dbgenDir").exists) { + s"/$dbgenDir" + } else { + sys.error(s"Could not find dbgen at $dbgen or /$dbgenDir. Run install") + } + val parallel = if (numPartitions > 1) s"-C $partitions -S $i" else "" + val shortTableNames = Map( + "customer" -> "c", + "lineitem" -> "L", + "nation" -> "n", + "orders" -> "O", + "part" -> "P", + "region" -> "r", + "supplier" -> "s", + "partsupp" -> "S" + ) + val paramsString = params.mkString(" ") + val commands = Seq( + "bash", + "-c", + s"cd $localToolsDir && ./dbgen -q $paramsString -T ${shortTableNames(name)} -s $scaleFactor $parallel" + ) + println(commands) + BlockingLineStream(commands) } - val parallel = if (numPartitions > 1) s"-C $partitions -S $i" else "" - val shortTableNames = Map( - "customer" -> "c", - "lineitem" -> "L", - "nation" -> "n", - "orders" -> "O", - "part" -> "P", - "region" -> "r", - "supplier" -> "s", - "partsupp" -> "S" - ) - val paramsString = params.mkString(" ") - val commands = Seq( - "bash", "-c", - s"cd $localToolsDir && ./dbgen -q $paramsString -T ${shortTableNames(name)} -s $scaleFactor $parallel") - println(commands) - BlockingLineStream(commands) - }.repartition(numPartitions) - } + .repartition(numPartitions) generatedData.setName(s"$name, sf=$scaleFactor, strings") generatedData @@ -69,14 +73,15 @@ class TPCHTables( scaleFactor: String, 
useDoubleForDecimal: Boolean = false, useStringForDate: Boolean = false, - generatorParams: Seq[String] = Nil) - extends Tables(sqlContext, scaleFactor, useDoubleForDecimal, useStringForDate) { - import sqlContext.implicits._ + generatorParams: Seq[String] = Nil +) extends Tables(sqlContext, scaleFactor, useDoubleForDecimal, useStringForDate) { + import spark.implicits._ val dataGenerator = new DBGEN(dbgenDir, generatorParams) val tables = Seq( - Table("part", + Table( + "part", partitionColumns = "p_brand" :: Nil, 'p_partkey.long, 'p_name.string, @@ -88,7 +93,8 @@ class TPCHTables( 'p_retailprice.decimal(12, 2), 'p_comment.string ), - Table("supplier", + Table( + "supplier", partitionColumns = Nil, 's_suppkey.long, 's_name.string, @@ -98,7 +104,8 @@ class TPCHTables( 's_acctbal.decimal(12, 2), 's_comment.string ), - Table("partsupp", + Table( + "partsupp", partitionColumns = Nil, 'ps_partkey.long, 'ps_suppkey.long, @@ -106,7 +113,8 @@ class TPCHTables( 'ps_supplycost.decimal(12, 2), 'ps_comment.string ), - Table("customer", + Table( + "customer", partitionColumns = "c_mktsegment" :: Nil, 'c_custkey.long, 'c_name.string, @@ -117,7 +125,8 @@ class TPCHTables( 'c_mktsegment.string, 'c_comment.string ), - Table("orders", + Table( + "orders", partitionColumns = "o_orderdate" :: Nil, 'o_orderkey.long, 'o_custkey.long, @@ -129,7 +138,8 @@ class TPCHTables( 'o_shippriority.int, 'o_comment.string ), - Table("lineitem", + Table( + "lineitem", partitionColumns = "l_shipdate" :: Nil, 'l_orderkey.long, 'l_partkey.long, @@ -148,30 +158,24 @@ class TPCHTables( 'l_shipmode.string, 'l_comment.string ), - Table("nation", + Table( + "nation", partitionColumns = Nil, 'n_nationkey.long, 'n_name.string, 'n_regionkey.long, 'n_comment.string ), - Table("region", - partitionColumns = Nil, - 'r_regionkey.long, - 'r_name.string, - 'r_comment.string - ) + Table("region", partitionColumns = Nil, 'r_regionkey.long, 'r_name.string, 'r_comment.string) ).map(_.convertTypes()) } -class 
TPCH(@transient sqlContext: SQLContext) - extends Benchmark(sqlContext) { +class TPCH(@transient sqlContext: SQLContext) extends Benchmark(sqlContext) { val queries = (1 to 22).map { q => - val queryContent: String = IOUtils.toString( - getClass().getClassLoader().getResourceAsStream(s"tpch/queries/$q.sql")) - Query(s"Q$q", queryContent, description = "TPCH Query", - executionMode = CollectResults) + val queryContent: String = + IOUtils.toString(getClass().getClassLoader().getResourceAsStream(s"tpch/queries/$q.sql")) + Query(s"Q$q", queryContent, description = "TPCH Query", executionMode = CollectResults) } val queriesMap = queries.map(q => q.name.split("-").get(0) -> q).toMap } diff --git a/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala b/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala index fa66e005..2abca96c 100644 --- a/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala +++ b/src/main/scala/org/apache/spark/ml/ModelBuilderSSP.scala @@ -1,29 +1,31 @@ package org.apache.spark.ml -import org.apache.spark.ml.classification.{ClassificationModelBuilder, DecisionTreeClassificationModel, LinearSVCModel, LogisticRegressionModel, NaiveBayesModel} +import org.apache.spark.ml.classification.{ + ClassificationModelBuilder, + DecisionTreeClassificationModel, + LinearSVCModel, + LogisticRegressionModel, + NaiveBayesModel +} import org.apache.spark.ml.linalg.{Matrix, Vector} -import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, GeneralizedLinearRegressionModel, LinearRegressionModel} +import org.apache.spark.ml.regression.{ + DecisionTreeRegressionModel, + GeneralizedLinearRegressionModel, + LinearRegressionModel +} import org.apache.spark.ml.tree._ import org.apache.spark.mllib.random.RandomDataGenerator import org.apache.spark.mllib.tree.impurity.ImpurityCalculator - -/** - * Helper for creating MLlib models which have private constructors. - */ +/** Helper for creating MLlib models which have private constructors. 
+ */ object ModelBuilderSSP { - def newLogisticRegressionModel( - coefficients: Vector, - intercept: Double): LogisticRegressionModel = { + def newLogisticRegressionModel(coefficients: Vector, intercept: Double): LogisticRegressionModel = new LogisticRegressionModel("lr", coefficients, intercept) .setThreshold(.5) - } - - def newLinearRegressionModel( - coefficients: Vector, - intercept: Double): LinearRegressionModel = { + def newLinearRegressionModel(coefficients: Vector, intercept: Double): LinearRegressionModel = { val model = new LinearRegressionModel("linr", coefficients, intercept) if (model.hasParam("loss")) { model.set(model.getParam("loss"), "squaredError") @@ -31,30 +33,44 @@ object ModelBuilderSSP { model } - def newGLR( - coefficients: Vector, - intercept: Double): GeneralizedLinearRegressionModel = + def newGLR(coefficients: Vector, intercept: Double): GeneralizedLinearRegressionModel = new GeneralizedLinearRegressionModel("glr-uid", coefficients, intercept) def newDecisionTreeClassificationModel( depth: Int, numClasses: Int, featureArity: Array[Int], - seed: Long): DecisionTreeClassificationModel = { - require(numClasses >= 2, s"DecisionTreeClassificationModel requires numClasses >= 2," + - s" but was given $numClasses") - val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = numClasses, - featureArity = featureArity, seed = seed) - new DecisionTreeClassificationModel(rootNode, numFeatures = featureArity.length, - numClasses = numClasses) + seed: Long + ): DecisionTreeClassificationModel = { + require( + numClasses >= 2, + s"DecisionTreeClassificationModel requires numClasses >= 2," + + s" but was given $numClasses" + ) + val rootNode = TreeBuilder.randomBalancedDecisionTree( + depth = depth, + labelType = numClasses, + featureArity = featureArity, + seed = seed + ) + new DecisionTreeClassificationModel( + rootNode, + numFeatures = featureArity.length, + numClasses = numClasses + ) } def newDecisionTreeRegressionModel( 
depth: Int, featureArity: Array[Int], - seed: Long): DecisionTreeRegressionModel = { - val rootNode = TreeBuilder.randomBalancedDecisionTree(depth = depth, labelType = 0, - featureArity = featureArity, seed = seed) + seed: Long + ): DecisionTreeRegressionModel = { + val rootNode = TreeBuilder.randomBalancedDecisionTree( + depth = depth, + labelType = 0, + featureArity = featureArity, + seed = seed + ) new DecisionTreeRegressionModel(rootNode, numFeatures = featureArity.length) } @@ -63,52 +79,46 @@ object ModelBuilderSSP { model.set(model.modelType, "multinomial") } - def newLinearSVCModel( - coefficients: Vector, - intercept: Double): LinearSVCModel = { + def newLinearSVCModel(coefficients: Vector, intercept: Double): LinearSVCModel = ClassificationModelBuilder.newLinearSVCModel(coefficients, intercept) - } } -/** - * Helpers for creating random decision trees. - */ +/** Helpers for creating random decision trees. + */ object TreeBuilder { - /** - * Generator for a pair of distinct class labels from the set {0,...,numClasses-1}. - * Pairs are useful for trees to make sure sibling leaf nodes make different predictions. - * @param numClasses Number of classes. - */ + /** Generator for a pair of distinct class labels from the set {0,...,numClasses-1}. Pairs are + * useful for trees to make sure sibling leaf nodes make different predictions. + * @param numClasses + * Number of classes. + */ private class ClassLabelPairGenerator(val numClasses: Int) - extends RandomDataGenerator[Pair[Double, Double]] { + extends RandomDataGenerator[Pair[Double, Double]] { - require(numClasses >= 2, - s"ClassLabelPairGenerator given label numClasses = $numClasses, but numClasses should be >= 2.") + require( + numClasses >= 2, + s"ClassLabelPairGenerator given label numClasses = $numClasses, but numClasses should be >= 2." 
+ ) private val rng = new java.util.Random() override def nextValue(): Pair[Double, Double] = { - val left = rng.nextInt(numClasses) + val left = rng.nextInt(numClasses) var right = rng.nextInt(numClasses) - while (right == left) { + while (right == left) right = rng.nextInt(numClasses) - } new Pair[Double, Double](left, right) } - override def setSeed(seed: Long): Unit = { + override def setSeed(seed: Long): Unit = rng.setSeed(seed) - } override def copy(): ClassLabelPairGenerator = new ClassLabelPairGenerator(numClasses) } - - /** - * Generator for a pair of real-valued labels. - * Pairs are useful for trees to make sure sibling leaf nodes make different predictions. - */ + /** Generator for a pair of real-valued labels. Pairs are useful for trees to make sure sibling + * leaf nodes make different predictions. + */ private class RealLabelPairGenerator() extends RandomDataGenerator[Pair[Double, Double]] { private val rng = new java.util.Random() @@ -116,34 +126,37 @@ object TreeBuilder { override def nextValue(): Pair[Double, Double] = new Pair[Double, Double](rng.nextDouble(), rng.nextDouble()) - override def setSeed(seed: Long): Unit = { + override def setSeed(seed: Long): Unit = rng.setSeed(seed) - } override def copy(): RealLabelPairGenerator = new RealLabelPairGenerator() } - /** - * Creates a random decision tree structure. - * @param depth Depth of tree to build. Must be <= numFeatures. - * @param labelType Value 0 indicates regression. Integers >= 2 indicate numClasses for - * classification. - * @param featureArity Array of length numFeatures indicating feature type. - * Value 0 indicates continuous feature. - * Other values >= 2 indicate a categorical feature, - * where the value is the number of categories. - * @return root node of tree - */ + /** Creates a random decision tree structure. + * @param depth + * Depth of tree to build. Must be <= numFeatures. + * @param labelType + * Value 0 indicates regression. 
Integers >= 2 indicate numClasses for classification. + * @param featureArity + * Array of length numFeatures indicating feature type. Value 0 indicates continuous feature. + * Other values >= 2 indicate a categorical feature, where the value is the number of + * categories. + * @return + * root node of tree + */ def randomBalancedDecisionTree( depth: Int, labelType: Int, featureArity: Array[Int], - seed: Long): Node = { + seed: Long + ): Node = { require(depth >= 0, s"randomBalancedDecisionTree given depth < 0.") val numFeatures = featureArity.length - require(depth <= numFeatures, + require( + depth <= numFeatures, s"randomBalancedDecisionTree requires depth <= featureArity.size," + - s" but depth = $depth and featureArity.size = $numFeatures") + s" but depth = $depth and featureArity.size = $numFeatures" + ) val isRegression = labelType == 0 if (!isRegression) { require(labelType >= 2, s"labelType must be >= 2 for classification. 0 indicates regression.") @@ -165,29 +178,38 @@ object TreeBuilder { ImpurityCalculator.getCalculator("gini", Array.fill[Double](labelType)(0.0), 0L) } - randomBalancedDecisionTreeHelper(depth, featureArity, impurityCalculator, - labelGenerator, Set.empty, rng) + randomBalancedDecisionTreeHelper( + depth, + featureArity, + impurityCalculator, + labelGenerator, + Set.empty, + rng + ) } - /** - * Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. - * @param subtreeDepth Depth of subtree to build. Depth 0 means this is a leaf node. - * @param featureArity Indicates feature type. Value 0 indicates continuous feature. - * Other values >= 2 indicate a categorical feature, - * where the value is the number of categories. - * @param impurityCalculator Dummy impurity calculator to use at all tree nodes - * @param usedFeatures Features appearing in the path from the tree root to the node - * being constructed. - * @param labelGenerator Generates pairs of distinct labels. 
- * @return - */ + /** Create an internal node. Either create the leaf nodes beneath it, or recurse as needed. + * @param subtreeDepth + * Depth of subtree to build. Depth 0 means this is a leaf node. + * @param featureArity + * Indicates feature type. Value 0 indicates continuous feature. Other values >= 2 indicate a + * categorical feature, where the value is the number of categories. + * @param impurityCalculator + * Dummy impurity calculator to use at all tree nodes + * @param usedFeatures + * Features appearing in the path from the tree root to the node being constructed. + * @param labelGenerator + * Generates pairs of distinct labels. + * @return + */ private def randomBalancedDecisionTreeHelper( subtreeDepth: Int, featureArity: Array[Int], impurityCalculator: ImpurityCalculator, labelGenerator: RandomDataGenerator[Pair[Double, Double]], usedFeatures: Set[Int], - rng: scala.util.Random): Node = { + rng: scala.util.Random + ): Node = { if (subtreeDepth == 0) { // This case only happens for a depth 0 tree. @@ -196,14 +218,16 @@ object TreeBuilder { val numFeatures = featureArity.length // Should not happen. - assert(usedFeatures.size < numFeatures, s"randomBalancedDecisionTreeSplitNode ran out of " + - s"features for splits.") + assert( + usedFeatures.size < numFeatures, + s"randomBalancedDecisionTreeSplitNode ran out of " + + s"features for splits." + ) // Make node internal. var feature: Int = rng.nextInt(numFeatures) - while (usedFeatures.contains(feature)) { + while (usedFeatures.contains(feature)) feature = rng.nextInt(numFeatures) - } val split: Split = if (featureArity(feature) == 0) { // continuous feature new ContinuousSplit(featureIndex = feature, threshold = rng.nextDouble()) @@ -213,27 +237,55 @@ object TreeBuilder { // nCatsSplit is in {1,...,arity-1}. 
val nCatsSplit = rng.nextInt(featureArity(feature) - 1) + 1 val splitCategories: Array[Double] = - rng.shuffle(Range(0,featureArity(feature)).toList).toArray.map(_.toDouble).take(nCatsSplit) - new CategoricalSplit(featureIndex = feature, - _leftCategories = splitCategories, numCategories = featureArity(feature)) + rng.shuffle(Range(0, featureArity(feature)).toList).toArray.map(_.toDouble).take(nCatsSplit) + new CategoricalSplit( + featureIndex = feature, + _leftCategories = splitCategories, + numCategories = featureArity(feature) + ) } val (leftChild: Node, rightChild: Node) = if (subtreeDepth == 1) { // Add leaf nodes. Assign these jointly so they make different predictions. val predictions = labelGenerator.nextValue() - val leftChild = new LeafNode(prediction = predictions._1, impurity = 0.0, - impurityStats = impurityCalculator) - val rightChild = new LeafNode(prediction = predictions._2, impurity = 0.0, - impurityStats = impurityCalculator) + val leftChild = new LeafNode( + prediction = predictions._1, + impurity = 0.0, + impurityStats = impurityCalculator + ) + val rightChild = new LeafNode( + prediction = predictions._2, + impurity = 0.0, + impurityStats = impurityCalculator + ) (leftChild, rightChild) } else { - val leftChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, - impurityCalculator, labelGenerator, usedFeatures + feature, rng) - val rightChild = randomBalancedDecisionTreeHelper(subtreeDepth - 1, featureArity, - impurityCalculator, labelGenerator, usedFeatures + feature, rng) + val leftChild = randomBalancedDecisionTreeHelper( + subtreeDepth - 1, + featureArity, + impurityCalculator, + labelGenerator, + usedFeatures + feature, + rng + ) + val rightChild = randomBalancedDecisionTreeHelper( + subtreeDepth - 1, + featureArity, + impurityCalculator, + labelGenerator, + usedFeatures + feature, + rng + ) (leftChild, rightChild) } - new InternalNode(prediction = 0.0, impurity = 0.0, gain = 0.0, leftChild = leftChild, - rightChild = 
rightChild, split = split, impurityStats = impurityCalculator) + new InternalNode( + prediction = 0.0, + impurity = 0.0, + gain = 0.0, + leftChild = leftChild, + rightChild = rightChild, + split = split, + impurityStats = impurityCalculator + ) } } diff --git a/src/main/scala/org/apache/spark/ml/TreeUtils.scala b/src/main/scala/org/apache/spark/ml/TreeUtils.scala index badef4fd..2dc1dcb5 100644 --- a/src/main/scala/org/apache/spark/ml/TreeUtils.scala +++ b/src/main/scala/org/apache/spark/ml/TreeUtils.scala @@ -4,26 +4,28 @@ import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericA import org.apache.spark.sql.DataFrame object TreeUtils { - /** - * Set label metadata (particularly the number of classes) on a DataFrame. - * - * @param data Dataset. Categorical features and labels must already have 0-based indices. - * This must be non-empty. - * @param featuresColName Name of the features column - * @param featureArity Array of length numFeatures, where 0 indicates continuous feature and - * value > 0 indicates a categorical feature of that arity. - * @return DataFrame with metadata - */ - def setMetadata( - data: DataFrame, - featuresColName: String, - featureArity: Array[Int]): DataFrame = { - val featuresAttributes = featureArity.zipWithIndex.map { case (arity: Int, feature: Int) => - if (arity > 0) { - NominalAttribute.defaultAttr.withIndex(feature).withNumValues(arity) - } else { - NumericAttribute.defaultAttr.withIndex(feature) - } + + /** Set label metadata (particularly the number of classes) on a DataFrame. + * + * @param data + * Dataset. Categorical features and labels must already have 0-based indices. This must be + * non-empty. + * @param featuresColName + * Name of the features column + * @param featureArity + * Array of length numFeatures, where 0 indicates continuous feature and value > 0 indicates a + * categorical feature of that arity. 
+ * @return + * DataFrame with metadata + */ + def setMetadata(data: DataFrame, featuresColName: String, featureArity: Array[Int]): DataFrame = { + val featuresAttributes = featureArity.zipWithIndex.map { + case (arity: Int, feature: Int) => + if (arity > 0) { + NominalAttribute.defaultAttr.withIndex(feature).withNumValues(arity) + } else { + NumericAttribute.defaultAttr.withIndex(feature) + } } val featuresMetadata = new AttributeGroup("features", featuresAttributes).toMetadata() data.select(data(featuresColName).as(featuresColName, featuresMetadata)) diff --git a/src/main/scala/org/apache/spark/ml/classification/ClassificationModelBuilder.scala b/src/main/scala/org/apache/spark/ml/classification/ClassificationModelBuilder.scala index 485b883f..00fe5155 100644 --- a/src/main/scala/org/apache/spark/ml/classification/ClassificationModelBuilder.scala +++ b/src/main/scala/org/apache/spark/ml/classification/ClassificationModelBuilder.scala @@ -2,12 +2,8 @@ package org.apache.spark.ml.classification import org.apache.spark.ml.linalg.{Matrix, Vector} - object ClassificationModelBuilder { - def newLinearSVCModel( - coefficients: Vector, - intercept: Double): LinearSVCModel = { + def newLinearSVCModel(coefficients: Vector, intercept: Double): LinearSVCModel = new LinearSVCModel("linearSVC", coefficients, intercept) - } }