From b400522774bd4a484d2ef7c03253b7db1b1d6ca1 Mon Sep 17 00:00:00 2001 From: Erik Erlandson Date: Fri, 19 Jun 2020 18:02:21 -0700 Subject: [PATCH] 0.4.0 - remove compiled python, build for spark 2.2 - 3.0, update sbt & packages, update README --- README.md | 41 ++++++++++++++---------- build.sbt | 68 ++++++++++------------------------------ project/build.properties | 2 +- project/plugins.sbt | 8 ++--- 4 files changed, 46 insertions(+), 73 deletions(-) diff --git a/README.md b/README.md index 51470d8..c7ec933 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,24 @@ https://isarn.github.io/isarn-sketches-spark/latest/api/#org.isarnproject.sketch ## How to use in your project ``` scala -// Note that the version of spark and python is part of the release name. -// This example is for spark 2.2 and python 2.7: -libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.3.1-sp2.2-py2.7" +// Note that the version of spark is part of the release name. +// This example is for spark 2.4: +libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.4" ``` -** Currently supported: python 2.7, 3.6 X spark 2.2, 2.3 X scala 2.11 ** +Currently supported: + +- spark 2.2, scala 2.11 +- spark 2.3, scala 2.11 +- spark 2.4, scala 2.11 and 2.12 +- spark 3.0, scala 2.12 If you are interested in a python/spark/scala build that is not listed above, please contact me and/or file an issue! +Python code is also packaged with all of the artifacts above. +Spark will automatically extract and compile Python components for use with PySpark. +Python 2 and 3 are supported. Note that Python 2 is EOL as of January 2020. + This package builds against some `% Provided` Apache Spark dependencies: ```scala libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion @@ -26,9 +35,9 @@ libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion ## How to use from the Spark CLI Several Spark CLI tools accept the `--packages` argument, as with this `spark-shell` example: ```bash -$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.11:0.3.1-sp2.3-py3.6" +$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4" ``` -Note that you need to explicitly include the scala version as part of the package name +Note that you need to explicitly include the scala version as part of the package name. 
## Examples @@ -244,7 +253,7 @@ scala> td.show() >>> from isarnproject.sketches.udaf.tdigest import * >>> from random import gauss >>> from pyspark.sql.types import * ->>> data = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())])) +>>> data = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())])) >>> agg = data.agg(tdigestDoubleUDAF("x")) >>> td = agg.first()[0] >>> td.cdfInverse(0.5) @@ -257,10 +266,10 @@ scala> td.show() >>> from isarnproject.sketches.udaf.tdigest import * >>> from random import gauss >>> from pyspark.sql.types import * ->>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in xrange(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))])) +>>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in range(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))])) >>> agg = data.agg(tdigestDoubleArrayUDAF("x")) >>> tds = agg.first()[0] ->>> [t.cdfInverse(0.5) for t in td] +>>> [t.cdfInverse(0.5) for t in tds] [0.046116924117141189, -0.011071666930287466, -0.019006033872431105] >>> ``` @@ -271,7 +280,7 @@ scala> td.show() >>> from random import gauss >>> from pyspark.ml.linalg import VectorUDT, Vectors >>> from pyspark.sql.types import * ->>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())])) +>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())])) >>> agg = data.agg(tdigestMLVecUDAF("x")) >>> tds = agg.first()[0] >>> [t.cdfInverse(0.5) for t in tds] @@ -285,7 +294,7 @@ scala> td.show() >>> from random import gauss >>> from pyspark.mllib.linalg import VectorUDT, Vectors >>> from pyspark.sql.types import * ->>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())])) +>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())])) >>> agg = data.agg(tdigestMLLibVecUDAF("x")) >>> tds = agg.first()[0] >>> [t.cdfInverse(0.5) for t in tds] @@ -298,8 +307,8 @@ scala> td.show() >>> from isarnproject.sketches.udaf.tdigest import * >>> from random import gauss >>> from pyspark.sql.types import * ->>> x = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())])) ->>> g = sc.parallelize([[1+x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())])) +>>> x = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())])) +>>> g = sc.parallelize([[1+x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())])) >>> data = g.crossJoin(x) >>> tds = data.groupBy("g").agg(tdigestDoubleUDAF("x").alias("tdigests")) >>> tds.show() @@ -330,8 +339,8 @@ scala> td.show() >>> from random import gauss >>> from pyspark.ml.linalg import VectorUDT, Vectors >>> from pyspark.sql.types import * ->>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())])) ->>> g = sc.parallelize([[1+x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())])) +>>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", 
VectorUDT())])) +>>> g = sc.parallelize([[1+x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())])) >>> data = g.crossJoin(x) >>> tds = data.groupBy("g").agg(tdigestMLVecUDAF("x").alias("tdigests")) >>> tds.show() @@ -422,7 +431,7 @@ scala> imp.show >>> fiMod = fi.fit(training) \ ... .setTargetModel(lrModel) \ ... .setDeviationMeasure("rms-dev") \ -... .setFeatureNames(["x%d" % (j) for j in xrange(10)]) +... .setFeatureNames(["x%d" % (j) for j in range(10)]) >>> imp = fiMod.transform(training) >>> imp.show() +----+-------------------+ diff --git a/build.sbt b/build.sbt index 7654c1e..0bd86fa 100644 --- a/build.sbt +++ b/build.sbt @@ -1,7 +1,8 @@ // xsbt clean unidoc previewSite // xsbt clean unidoc ghpagesPushSite -// xsbt -Dsbt.global.base=/home/eje/.sbt/sonatype +publish -// make sure sparkVersion and pythonVersion are set as you want them prior to +publish +// xsbt +publish +// https://oss.sonatype.org +// make sure sparkVersion is set as you want prior to +publish import scala.sys.process._ @@ -9,23 +10,17 @@ name := "isarn-sketches-spark" organization := "org.isarnproject" -val packageVersion = "0.3.1" +val packageVersion = "0.4.0" -val sparkVersion = "2.2.2" - -val pythonVersion = "2.7" +val sparkVersion = "3.0.0" val sparkSuffix = s"""sp${sparkVersion.split('.').take(2).mkString(".")}""" -val pythonSuffix = s"""py${pythonVersion.split('.').take(2).mkString(".")}""" - -val pythonCMD = s"""python${pythonVersion.split('.').head}""" - -version := s"${packageVersion}-${sparkSuffix}-${pythonSuffix}" +version := s"${packageVersion}-${sparkSuffix}" -scalaVersion := "2.11.12" +scalaVersion := "2.12.11" -crossScalaVersions := Seq("2.11.12") // scala 2.12 when spark supports it +crossScalaVersions := Seq("2.12.11") // scala 2.12.11 when spark supports it pomIncludeRepository := { _ => false } @@ -92,46 +87,15 @@ licenses += ("Apache-2.0", url("http://opensource.org/licenses/Apache-2.0")) scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature") -lazy val deletePYC = taskKey[Unit]("Delete .pyc files") - -deletePYC := { - val s: TaskStreams = streams.value - s.log.info("delete .pyc files...") - val cmd = "bash" :: "-c" :: "rm -f $(find python -name *.pyc)" :: Nil - val stat = (cmd !) - if (stat == 0) { - s.log.info("delete .pyc succeeded") - } else { - throw new IllegalStateException("delete .pyc failed") - } -} - -lazy val compilePython = taskKey[Unit]("Compile python files") - -compilePython := { - val s: TaskStreams = streams.value - s.log.info("compiling python...") - val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !) 
- if (stat == 0) { - s.log.info("python compile succeeded") - } else { - throw new IllegalStateException("python compile failed") - } -} - -compilePython := (compilePython.dependsOn(deletePYC)).value - -(packageBin in Compile) := ((packageBin in Compile).dependsOn(compilePython)).value - mappings in (Compile, packageBin) ++= Seq( - (baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc", - (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.pyc") -> "isarnproject/pipelines/__init__.pyc", - (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.pyc") -> "isarnproject/pipelines/fi.pyc", - (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc", - (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc", - (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc", - (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc", - (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc" + (baseDirectory.value / "python" / "isarnproject" / "__init__.py") -> "isarnproject/__init__.py", + (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.py") -> "isarnproject/pipelines/__init__.py", + (baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.py") -> "isarnproject/pipelines/fi.py", + (baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.py") -> "isarnproject/sketches/__init__.py", + (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.py") -> "isarnproject/sketches/udaf/__init__.py", + (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.py") -> "isarnproject/sketches/udaf/tdigest.py", + (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.py") -> "isarnproject/sketches/udt/__init__.py", + (baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.py") -> "isarnproject/sketches/udt/tdigest.py" ) test in assembly := {} diff --git a/project/build.properties b/project/build.properties index f59579f..654fe70 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=1.2.0 +sbt.version=1.3.12 diff --git a/project/plugins.sbt b/project/plugins.sbt index ffb7cd7..4f35abb 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -7,13 +7,13 @@ resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositori resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven" -addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.2") +addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") -addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.1") +addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") -addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.1.6") +addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1") -addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3") +addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2") // scoverage and coveralls deps are at old versions to avoid a bug in the current versions // update these when this fix is released: https://github.com/scoverage/sbt-coveralls/issues/73
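
Since this patch ships plain `.py` sources in the jar instead of compiled `.pyc` files, the Python API can be exercised directly from a PySpark session started with `--packages`. The sketch below is a minimal illustration, not part of the patch: it assumes the Spark 3.0 / Scala 2.12 artifact, whose coordinate `0.4.0-sp3.0` is inferred from the `${packageVersion}-${sparkSuffix}` scheme in `build.sbt`, and it mirrors the t-digest UDAF example already shown in the README under Python 3.

```python
# Launch PySpark with the artifact (coordinate assumed from the build.sbt version scheme):
#   pyspark --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp3.0"
from random import gauss
from pyspark.sql.types import StructType, StructField, DoubleType
# Spark extracts the packaged .py sources from the jar, so this import needs no pip install.
from isarnproject.sketches.udaf.tdigest import tdigestDoubleUDAF

# Build a one-column DataFrame of 1000 standard-normal samples, as in the README example.
data = sc.parallelize([[gauss(0, 1)] for _ in range(1000)]).toDF(
    StructType([StructField("x", DoubleType())]))

# Aggregate into a t-digest and query the approximate median (near 0 for a standard normal).
td = data.agg(tdigestDoubleUDAF("x")).first()[0]
print(td.cdfInverse(0.5))
```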