diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000000..ce4c880606 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,139 @@ +name: CI + +on: + pull_request: + branches: + - '**' + # it's annoying to have CI double run on PR/push + # limit push to only final integration branches like develop + push: + branches: + - 'develop' + +jobs: + run-base-tests: + runs-on: ubuntu-latest + strategy: + # it's helpful to see exactly which test modules passed/failed + # this finishes all matrix nodes even if one fails along the way + fail-fast: false + + # all combinations that we will run in parallel to increase throughput + matrix: + include: + + - scala-version: "2.11.12" + BUILD: "enforce code format" + script: './sbt ++$TRAVIS_SCALA_VERSION clean scalafmtSbtCheck scalafmtCheckAll' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-args scalding-base scalding-date maple scalding-quotation scalding-dagon" + script: './scripts/run_test.sh' + - scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-args scalding-base scalding-date maple scalding-quotation scalding-dagon" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-avro scalding-hraven scalding-commons scalding-parquet scalding-parquet-scrooge" + script: './scripts/run_test.sh' + - scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-avro scalding-hraven scalding-commons scalding-parquet scalding-parquet-scrooge" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-core scalding-json scalding-db scalding-cats" + script: './scripts/run_test.sh' + - scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-core scalding-json scalding-db scalding-cats" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-hadoop-test" + script: './scripts/run_test.sh' + - 
scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-hadoop-test" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-estimators-test" + script: './scripts/run_test.sh' + - scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-estimators-test" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-serialization scalding-spark scalding-beam" + script: './scripts/run_test.sh' + - scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-serialization scalding-spark scalding-beam" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "base" + TEST_TARGET: "scalding-thrift-macros" + script: './scripts/run_test.sh' + - scala-version: "2.12.14" + BUILD: "base" + TEST_TARGET: "scalding-thrift-macros" + script: './scripts/run_test.sh' + + - scala-version: "2.11.12" + BUILD: "test tutorials and matrix tutorials and repl" + TEST_TARGET: "scalding-repl" + script: "./scripts/run_test.sh && ./scripts/build_assembly_no_test.sh scalding-assembly && ./scripts/test_tutorials.sh && ./scripts/build_assembly_no_test.sh scalding-assembly && ./scripts/test_matrix_tutorials.sh" + - scala-version: "2.12.14" + BUILD: "test tutorials and matrix tutorials and repl" + TEST_TARGET: "scalding-repl" + script: "./scripts/run_test.sh && ./scripts/build_assembly_no_test.sh scalding-assembly && ./scripts/test_tutorials.sh && ./scripts/build_assembly_no_test.sh scalding-assembly && ./scripts/test_matrix_tutorials.sh" + + - scala-version: "2.11.12" + BUILD: "test repl and typed tutorials and microsite" + script: "./sbt ++$TRAVIS_SCALA_VERSION clean docs/makeMicrosite && ./scripts/build_assembly_no_test.sh scalding-repl && ./scripts/test_repl_tutorial.sh && ./scripts/build_assembly_no_test.sh scalding-core && ./scripts/test_typed_tutorials.sh && ./scripts/build_assembly_no_test.sh execution-tutorial && ./scripts/test_execution_tutorial.sh" + - 
scala-version: "2.12.14" + BUILD: "test repl and typed tutorials and microsite" + script: "./sbt ++$TRAVIS_SCALA_VERSION clean docs/makeMicrosite && ./scripts/build_assembly_no_test.sh scalding-repl && ./scripts/test_repl_tutorial.sh && ./scripts/build_assembly_no_test.sh scalding-core && ./scripts/test_typed_tutorials.sh && ./scripts/build_assembly_no_test.sh execution-tutorial && ./scripts/test_execution_tutorial.sh" + + + steps: + - name: Checkout Repo + uses: actions/checkout@v2 + with: + fetch-depth: 0 # fetch all tags for sbt-dynver to properly resolve scalding version + + - uses: actions/setup-java@v2 + with: + distribution: 'adopt-openj9' + java-version: '8.0.322+6' # non hadoop 3.3 versions build break https://issues.apache.org/jira/browse/HADOOP-16590 + + - uses: coursier/cache-action@v6 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 2.4 + + - name: Install Ruby Gems + run: | + gem install sass -v 3.7.4 + gem install jekyll -v 3.2.1 + + - name: "Run Test Variant" + env: + TRAVIS_SCALA_VERSION: ${{ matrix.scala-version }} + BUILD: ${{ matrix.BUILD }} + TEST_TARGET: ${{ matrix.TEST_TARGET }} + + run: | + mkdir -p /home/runner/.ivy2/cache/ # some scripts assume that this folder exists + ${{ matrix.script }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000000..53cc76507f --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,42 @@ +name: Publish + +on: + push: + branches: + - "develop" + tags: + - "v*" + +jobs: + publish: + runs-on: ubuntu-latest + + steps: + - name: Checkout Repo + uses: actions/checkout@v2 + with: + fetch-depth: 0 # fetch all tags for sbt-dynver to properly resolve scalding version + + - uses: actions/setup-java@v2 + with: + distribution: "adopt-openj9" + java-version: '8.0.322+6' # non hadoop 3.3 versions build break https://issues.apache.org/jira/browse/HADOOP-16590 + + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: 2.4 + + 
- name: Install Ruby Gems + run: | + gem install sass -v 3.7.4 + gem install jekyll -v 3.2.1 + + - name: "Publish" + env: + PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} + PGP_SECRET: ${{ secrets.PGP_SECRET }} + SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} + SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} + run: | + ./sbt "ci-release" diff --git a/.gitignore b/.gitignore index d3ad4e7030..0c4651bea3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +.bsp +.cache +.project +.settings +.classpath *.swp BUILD target/ @@ -9,7 +14,16 @@ project/plugins/lib_managed/ project/plugins/src_managed/ /.idea/ /.idea_modules/ +.project +.classpath +.cache-main +.cache-tests +.tmpBin +bin *.iml +sonatype.sbt +build.sbt-e # not sure where this comes from some kind of backup? +tutorial/data/execution_output.txt tutorial/data/cofollows.tsv tutorial/data/cosineSim.tsv tutorial/data/graphFiltered.tsv @@ -27,3 +41,10 @@ tutorial/data/rightDiff.tsv tutorial/data/tmp3.tsv tutorial/data/jsonoutput0.tsv tutorial/data/avrooutput0.avro +.scalding_repl +scalding-hadoop-test/NOTICE +NOTICE + +# Auto-copied by sbt-microsites +docs/src/main/tut/contributing.md +.DS_Store diff --git a/.scalafmt.conf b/.scalafmt.conf new file mode 100644 index 0000000000..9d5d5221ee --- /dev/null +++ b/.scalafmt.conf @@ -0,0 +1,17 @@ +version = 3.5.1 +maxColumn = 110 +docstrings.style = Asterisk +newlines.alwaysBeforeMultilineDef = false +newlines.penalizeSingleSelectMultiArgList = false +align.openParenCallSite = false +rewrite.rules = [AvoidInfix, SortImports, RedundantBraces, RedundantParens, PreferCurlyFors] +rewrite.redundantBraces.generalExpressions = false + +# scalafmt can only choose one scala version target per file to format +# we have to use 212 for build.sbt or else we get failures +runner.dialect = scala211 +fileOverride { + "glob:**build.sbt" { + runner.dialect = scala212 + } +} \ No newline at end of file diff --git a/.travis.blacklist b/.travis.blacklist new file mode 100644 
index 0000000000..a30e311bf7 --- /dev/null +++ b/.travis.blacklist @@ -0,0 +1,11 @@ +# This describes extra builds our validator will pretend to run in CI but won't +# Remember we run most builds twice, so if you want it disabled for both 2.10 and 2.11 it needs to be here twice +scalding-benchmarks +scalding-benchmarks +# These are just for fixtures, so blacklist for 2.10 and 2.11 +scalding-thrift-macros-fixtures +scalding-thrift-macros-fixtures +scalding-parquet-fixtures +scalding-parquet-fixtures +scalding-parquet-scrooge-fixtures +scalding-parquet-scrooge-fixtures diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 05c0228325..0000000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: scala -scala: - - 2.10.3 - - 2.9.3 -script: - - "sbt -Duser.name=$USER.$RANDOM -Dlog4j.configuration=file://$TRAVIS_BUILD_DIR/project/travis-log4j.properties ++$TRAVIS_SCALA_VERSION assembly" - - "scripts/test_tutorials.sh" -jdk: - - oraclejdk7 -notifications: - irc: "chat.freenode.net#scalding" diff --git a/CHANGES.md b/CHANGES.md index cd0f5bcbb8..eeb38c973e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,433 @@ # Scalding # +### Version 0.17.2 ### +This version is basically the same as 0.17.1 but backward compatible with 0.17.0. +* Revert memory estimator changes on 0.17.x branch: #1704 +* Turn on mima checks on 0.17.x branch: #1706 + +### Version 0.17.1 ### +This version breaks backward compatibility with 0.17.0, don't use it, use 0.17.2 instead. 
+* Request for Scalding release 0.17.0: #1641 +* make ordered serialization stable across compilations: #1664 +* Remove unnecessary semicolon: #1668 +* Add tailrec annotation: #1671 +* Be more paranoid about Kryo registration order: #1673 +* Update sbt version to 0.13.15: #1677 +* Register all Boxed classes in Kryo: #1678 +* Fix serialization of KryoHadoop: #1685 +* Merge pull request #1686 from ttim/cherry_pick_0.17.x_changes +* Fix stack overflow in typedPipeMonoid.zero: #1688 +* A couple of fixes into the 0.17.x branch: #1695 +* Memory estimator changes to 0.17.x branch: #1700 + +### Version 0.17.0 ### +This is the first Scalding release that publishes artifacts for Scala 2.12! +* 2.12 related updates: #1663, #1646 +* Use reflection over Jobs to find serialized classes: #1654, #1662 +* Simplify match statement and use collection.breakOut: #1661 +* Add explicit types to implicit methods and values: #1660 +* Reducer estimation size fixes: #1652, #1650, #1645, #1644 +* Use Combined*SequenceFile for VKVS, WritableSequenceFileScheme, SequenceFileScheme: #1647 +* Improve Vertica support in scalding-db: #1655 +* Add andThen to Mappable: #1656 +* Expand libjars globs in ScaldingShell to match the behavior of Tool: #1651 +* Use Batched in Sketch production: #1648 +* Pick up Algebird 0.13.0: #1640 +* Added API for Execution/Config to work with DistributedCache: #1635 +* Bump chill version to 0.8.3: #1634 +* Fixes a bug in how we use this stack: #1632 +* Upgrade build to sbt 0.13.13: #1629 +* Generate Scalding microsite via sbt-microsites: #1623 +* FileSource support for empty directories: #1622, #1618, #1613, #1611, #1591 +* Clean up temporary files created by forceToDiskExecution: #1621 +* Moving the repl in wonderland to a dedicated md file: #1614 +* Update Scala and sbt version: #1610 +* REFACTOR: Fixed some compilation warnings: #1604 +* REFACTOR: Rename parameter to reflect expectation: #1601 +* Add partitioned sources for Parquet thrift / scrooge: #1590 +* Add a 
test for sortBy: #1594 +* Create COMMITTERS.md: #1589 +* Use ExecutionContext in Execution.from/fromTry: #1587 +* Support custom parquet field name strategies: #1580 +* Deprecate reflection-based JobTest apply method: #1578 +* Use Caching for FlowDefExecution: #1581 +* [parquet tuple macros] listType was deprecated in favor of listOfElements: #1579 +* Use Batched to speed up CMS summing on mappers: #1575 +* Remove a TypedPipeFactory wrapper which seems unneeded: #1576 +* Make Writeable sources Mappable to get toIterator: #1573 +* case class implicit children: #1569 + +### Version 0.16.0 ### + +* Add tests around hashcode collisions : #1299 +* Fix performance bug in TypedPipeDiff : #1300 +* make serialization modules build on travis : #1301 +* Improve TypedParquetTuple : #1303 +* Add UnitOrderedSerialization : #1304 +* Revert "Add UnitOrderedSerialization" : #1306 +* Change groupRandomly & groupAll to use OrderedSerialization : #1307 +* Make test of Kmeans very very unlikely to fail : #1310 +* make LongThrift sources TypedSink : #1313 +* Fix testing VersionedKeyValSource#toIterator for non-Array[Byte] types : #1314 +* Make SketchJoin ordered serialization aware : #1316 +* Added a sealed trait ordered serializer. When it works its great. Not as reliable as we'd like. But hopefully restrictions on it will do the job : #1320 +* Add secondary sorting using ordered serialization : #1321 +* Bails out from the length calculation if we don't succeed often : #1322 +* increased number of box instances to 250 : #1323 +* Apply merge strategy for pom.properties files : #1325 +* Apply merge strategy for pom.xml files : #1327 +* Add a OrderedSerialization.viaTransform with no dependencies, and a BijectedOrderedSerialization in scalding core : #1329 +* Precompute int hashes : #1330 +* Hide the deprecated string error for getting ASCII bytes. 
: #1332 +* Change defaults for Scalding reducer estimator : #1333 +* Execution id code : #1334 +* Add line numbers at .group and .toPipe boundaries : #1335 +* Ordered Serialization macros for thrift : #1338 +* make some repl components extensible : #1342 +* Remove the bootstrap section : #1346 +* Fix the execution test : #1347 +* Implement flatMapValues method : #1348 +* Consistent style in homepage example : #1349 +* Serialization folding : #1351 +* Collapses scalding-db packages : #1353 +* Merge scalding-macros into scalding-core : #1355 +* Migrate typedtext : #1356 +* Runtime reducer estimator : #1358 +* Update Build.scala : #1361 +* Allow overriding of hadoop configuration options for a single source/sink : #1362 +* Missing an extends Serializable, causes issues if capture Config's anywhere : #1365 +* Fix TypedPipe.limit to be correct, if slightly slower : #1366 +* Fix scala.Function2 showing up in line numbers : #1367 +* Drop with MacroGenerated from Fields macros : #1370 +* Fix deprecation warnings in TypedDelimited : #1371 +* Ianoc/revert changes around making file systems : #1372 +* Revert typed tsv behavior : #1373 +* A serialization error we were seeing in repl usage : #1376 +* Add NullSink and test : #1378 +* Add monoid and semigroup for Execution : #1379 +* Upgrade parquet to 1.8.1 : #1380 +* Upgrade sbt launcher script (sbt-extras) : #1381 +* Just move whitespace, add comments, simplify a few methods : #1383 +* Don't publish maple when doing 2.11 so we only publish it once -- needed for cross publishing to maven repo's : #1386 +* Support nesting Options in TypeDescriptor : #1387 +* Enable Scalding-REPL for Scala 2.11 : #1388 +* Updates for some upstream fixes/changes : #1390 +* Remove use of hadoop version in estimators : #1391 +* Set hadoop version to dummy value : #1392 +* Handle no history case in RatioBasedEstimator : #1393 +* Inline parquet-scrooge : #1395 +* RatioBasedEstimator - fix threshold edge case, add tests : #1397 +* Fixes the scrooge 
generator tasks not to generate code in the compile target, we were publishing these : #1399 +* Ianoc/configure set converter : #1400 +* Change hash function in GroupRandomly : #1401 +* Improve logging in runtime reducer estimators : #1402 +* Add the type in ScroogeReadSupport : #1403 +* Adds a function to test if a sink exists at the version we created : #1404 +* add .groupWith method to TypedPipe : #1406 +* Add some return types : #1407 +* add counter verification logic : #1409 +* Runtime reducer estimator fixes : #1411 +* Make sure Execution.zip fails fast : #1412 +* When using WriteExecution and forceToDisk we can share the same flow def closer in construction : #1414 +* Cache the zipped up write executions : #1415 +* Fix DateOps "match may not be exhaustive" warning : #1416 +* Factor out repeated code into FutureCache : #1417 +* Fix lack of Externalizer in joins. : #1421 +* Adds much more line number information through the NoStackAndThen class : #1423 +* Requires a DateRange's "end" to be after its "start" : #1425 +* Scalding viz options : #1426 +* Fixes map-only jobs to accommodate both an lzo source and sink binary converter : #1431 +* Fix Readme travis link : #1432 +* Fixes docs wording : #1433 +* Don't squash the exception in history service when there's a failure : #1434 +* Log the exception in RatioBasedEstimator when there's a failure : #1435 +* make getBytesPerReducer support human readable values like 128m and 1g : #1436 +* Fixes minor KeyedList docs wording : #1437 +* Fix `readPathsFor` to use the `tz` argument : #1439 +* Scalding viz options : #1440 +* call Job.validate when running tests under JobTest : #1441 +* opt-in to calling Job.validate in JobTest : #1444 +* Fix bug with sketch joins and single keys : #1451 +* Fix incorrect usage of `percent`. : #1455 +* Add OrderedSerialization2 support in Matrix2. : #1457 +* Add InvalidSourceTap to catch all cases for no good path. 
: #1458 +* Cluster info and fs shell in repl : #1462 +* Update Scala version to 2.10.6 : #1463 +* Fix median estimation : #1464 +* Makes the config transient in the KryoHadoop instanciator : #1466 +* Moves the default to 2.11 : #1467 +* Adds Error Message to REPL when Current Directory Not Readable : #1468 +* SuccessFileSource: correctness for multi-dir globs : #1470 +* Limit task history fields consumed from hraven : #1472 +* Remove dependency on dfs-datastores : #1473 +* ScaldingILoop should enable one to pass in in/out : #1475 +* Switch Chat to Gitter : #1477 +* Add two functions that assist in testing a TypedPipe : #1478 +* Makes permission failures non-fatal when looking for .scalding_repl files : #1479 +* Update TypeDescriptor to explain that Option[String] is not supported : #1480 +* Remove a type parameter that doesn't seem to do anything : #1481 +* Utility for expanding libjars : #1483 +* Shouldn't skip hidden files, user can decide such things with their glob : #1485 +* Fix FileSystem.get issue : #1487 +* Remove dependency on parquet-cascading : #1488 +* Add withConfig api to allow running an execution with a transformed config : #1489 +* Call validateTaps in toIterator codepath : #1490 +* Update the build : #1491 +* Arg Descriptions/Help for Execution Apps : #1492 +* Fix issue #1429 : #1493 +* Cache counters for stat updates : #1495 +* Pulls the core ExecutionTests back into scalding-core : #1498 +* Add a liftToTry function to Execution : #1499 +* Small improvements to the `Boxed.scala` module : #1500 +* Cache boxed classes : #1501 +* Fix unnecessary use of `.get` in `Globifier.scala` : #1502 +* Replace unintentional use of `Unit` with `()` : #1503 +* Fix unnecessary uses of `Option.get` : #1506 +* Utility methods for running Executions in parallel : #1507 +* Typed Mapside Reduce : #1508 +* Use `wartremover` to guard against careless use of `_.get` : #1509 +* Add in an API around cache isolation : #1511 +* Add implicit Ordering[RichDate] : #1512 +* Fix 
MultipleTextLineFiles source in JobTest : #1513 +* Add's support for sealed abstract classes : #1518 +* Update FixedPathSource to strip out '*' in paths ending with '/*' for writes : #1520 +* support for more formats to work with RichDate : #1522 +* WIP: Add forceToDisk parameter to hashJoin in TypedPipe : #1529 +* Fixing comments on partitioned delimited source : #1530 +* Remove weakly typed Source : #1531 +* Maple fix for HBaseTap : #1532 +* Add an enrichment for TypedPipe.inMemoryToList and use it in TypedPipeDiff test : #1533 +* Because, because... fun, the scala compiler has special naming rules it appears when there are leading underscores : #1534 +* Fix README examples link : #1536 +* Fixes Config to accommodate spaces in argument values : #1537 +* Add before() and after() to RichDate : #1538 +* Adds late tap validation for cases where race conditions cause it to fail : #1540 +* Fix Rounding Bug in RatioBasedEstimator : #1542 + +### Version 0.15.0 ### +* Move OrderedSerialization into zero-dep scalding-serialization module #1289 +* bump elephantbird to 4.8 #1292 +* Fix OrderedSerialization for some forked graphs #1293 +* Add serialization modules to aggregate list #1298 + +### Version 0.14.0 ### +* add .unit to Execution object #1189 +* Override hashCode for Args #1190 +* Put a value in a exception message #1191 +* Add an exclusiveUpper method to DateRange #1194 +* Covert LzoTextDelimited to Cascading scheme. #1179 +* Remove Travis IRC notifications #1200 +* add LookupJoin and LookupJoinTest changes from summingbird #1199 +* Add a new ExecutionApp tutorial #1196 +* Move main simple example to be the typed API, and put the .'s at the sta... 
#1193 +* Add Execution.withArgs #1205 +* Config/Cascading updater #1197 +* Remove algebird serializers #1206 +* remove warnings in CumulativeSum #1215 +* Implicit execution context / easier switching between modes #1113 +* add row l1 normalize #1214 +* provide Args as an implicit val #1219 +* call sourceConfInit when reading from taps in local mode #1228 +* Add distinctCount and distinctValues helper methods to KeyedList. #1232 +* import hygiene: remove unused imports and remove JavaConversions use #1239 +* Swap hash and filename for filename-extension-sensitive code #1243 +* Remove more unused imports #1240 +* Provide useHdfsLocalMode for an easy switch to mapreduce local mode #1244 +* upgrade scalacheck and scalatest #1246 +* Optimize string and (hopefully) number comparisons a bit #1241 +* Note the active FlowProcess for Joiners #1235 +* Make sure Executions are executed at most once #1253 +* Fix Config.getUniqueIDs #1254 +* Add MustHasReducers trait. #1252 +* Make sure the EvalCache thread isDaemon #1255 +* Use non-regex split function #1251 +* make InputSizeReducerEstimator work for any CompositeTap #1256 +* TimePathedSource helper methods #1257 +* Fix for reducer estimation not working correctly if withReducers is set to 1 reducer #1263 +* Add make(dest) to TypedPipe #1217 +* Fix SimpleDateFormat caching by default #1265 +* upgrade sbt and sbt launcher script #1270 +* Add TypedPipeDiff for comparing typed pipes #1266 +* Change separator from \1 to \u0001 #1271 +* Disable reducer estimation for map-only steps #1276 +* Local sources support multiple paths #1275 +* fix the spelling of the cumulativeSumTest file #1281 +* Hydrate both sides of sampledCounts in skewJoinWithSmaller #1278 +* Bijection 0.8.0, algebird 0.10.0, chill 0.6.0, scala 2.10.5 #1287 +* Remove some deprecated items #1288 + +### Version 0.13.1 ### +* Back out 4 changes to be binary compatible: https://github.com/twitter/scalding/pull/1187 +* Use java.util.Random instead of scala.util.Random: 
https://github.com/twitter/scalding/pull/1186 +* Add Execution.failed: https://github.com/twitter/scalding/pull/1185 +* Using a ConcurrentHashMap instead of a WeakHashMap to make the Stats behave in a correct manner: https://github.com/twitter/scalding/pull/1184 +* Add applicative for Execution: https://github.com/twitter/scalding/pull/1181 + +### Version 0.13.0 ### +* Covert LzoTextDelimited to Cascading scheme.: https://github.com/twitter/scalding/pull/1179 +* Make TraceUtil support versions of cascading older than 2.6: https://github.com/twitter/scalding/pull/1180 +* Add support for more LzoTextDeilmited parameters in LzoTraits.: https://github.com/twitter/scalding/pull/1178 +* Use latest algebird, bijection, chill, elephantbird, and scala 2.11.5: https://github.com/twitter/scalding/pull/1174 +* Cascading 2.6 tracing: https://github.com/twitter/scalding/pull/1156 +* use Cascading 2.6.1 and cascading-jdbc 2.6.0: https://github.com/twitter/scalding/pull/1110 +* add reducer option to LookupJoin: https://github.com/twitter/scalding/pull/1160 +* Add dump to ValuePipe in the REPL: https://github.com/twitter/scalding/pull/1157 +* Ianoc/type descriptor: https://github.com/twitter/scalding/pull/1147 +* Refactor around the macro definitions into 3 files. 
Both converter and setter support Options: https://github.com/twitter/scalding/pull/1145 +* Fix a few random typos: https://github.com/twitter/scalding/pull/1144 +* Fix two issues found by static analysis: https://github.com/twitter/scalding/pull/1143 +* Add implicit helpers for numeric arguments: https://github.com/twitter/scalding/pull/1138 +* Add a fields macro: https://github.com/twitter/scalding/pull/1132 +* Ianoc/case class tuple converters: https://github.com/twitter/scalding/pull/1131 +* Some minor changes, cleanup pulled from jco's macro branch: https://github.com/twitter/scalding/pull/1130 +* Adds a typedjson source: https://github.com/twitter/scalding/pull/1129 +* Pulls all external 3rdparty versions up to the top of the build file: https://github.com/twitter/scalding/pull/1128 +* remove transitive pig and elephantbird dependencies for parquet-cascading: https://github.com/twitter/scalding/pull/1127 +* Some minor clean up in the build file: https://github.com/twitter/scalding/pull/1123 +* Ianoc/scalding 210: https://github.com/twitter/scalding/pull/1116 +* Decrease test count: https://github.com/twitter/scalding/pull/1117 +* Removes scala 2.9.3: https://github.com/twitter/scalding/pull/1106 +* Fix some typos in TypedPipe docs, expand flatMap docs: https://github.com/twitter/scalding/pull/1115 +* Implicit execution context / easier switching between modes: https://github.com/twitter/scalding/pull/1113 +* Add more documentation to TypedPipe: https://github.com/twitter/scalding/pull/1111 +* Update the README: https://github.com/twitter/scalding/pull/1114 +* Fixed comment in LookupJoin.scala: https://github.com/twitter/scalding/pull/1108 + +### Version 0.12.0 ### +* Fix long compile time for MultiJoin helpers: https://github.com/twitter/scalding/pull/1109 +* Allows reducer estimation to operate on all hfs taps: https://github.com/twitter/scalding/pull/1080 +* Fix bufferedTake: https://github.com/twitter/scalding/pull/1107 +* Generate methods for flattening 
the results of many joins: https://github.com/twitter/scalding/pull/1097 +* Make TimePathedSource more configurable: https://github.com/twitter/scalding/pull/1105 +* Adding DailyPrefixSuffixLzoTsv: https://github.com/twitter/scalding/pull/1082 +* Option to select the fields for output in templatesource: https://github.com/twitter/scalding/pull/1061 +* Add a DailySuffixMostRecentLzoProtobuf source: https://github.com/twitter/scalding/pull/1104 +* Updates default scala version to 2.10.4: https://github.com/twitter/scalding/pull/1081 +* MultiSourceTap hashcode: https://github.com/twitter/scalding/pull/1101 +* scalding-core: merge flow step strategies to allow reducer estimation combined with other strategies: https://github.com/twitter/scalding/pull/1094 +* Improve command line handling of the execution app: https://github.com/twitter/scalding/pull/1083 +* More testing around the globifier with new properties: https://github.com/twitter/scalding/pull/1092 +* Refactor JDBCSource to add compile-time info about type of DB: https://github.com/twitter/scalding/pull/1087 +* Add a cumulative sum to KeyedList: https://github.com/twitter/scalding/pull/1085 +* Add in failing test case: https://github.com/twitter/scalding/pull/1090 +* Adds ability to also get the mode inside the Execution monad.: https://github.com/twitter/scalding/pull/1088 +* Enforce invariant: mapGroup iterators all nonempty: https://github.com/twitter/scalding/pull/1072 +* Allow PartitionSource to limit the number of open files: https://github.com/twitter/scalding/pull/1078 +* append to Cascading frameworks system property instead of setting it directly: https://github.com/twitter/scalding/pull/1076 +* Adds some output while assembly is building to keep travis happy: https://github.com/twitter/scalding/pull/1084 +* Only request necessary hadoop configs in hraven reducer estimator: https://github.com/twitter/scalding/pull/1067 +* Add parquet-scrooge sources: https://github.com/twitter/scalding/pull/1064 +* 
Outer join handles case when both are empty: https://github.com/twitter/scalding/pull/1065 +* Fix race in merging: https://github.com/twitter/scalding/pull/1063 +* Add support for column projection to parquet sources: https://github.com/twitter/scalding/pull/1056 +* Add typed version of RichPipe 'using': https://github.com/twitter/scalding/pull/1049 +* Add getExecution/getOrElseExecution: https://github.com/twitter/scalding/pull/1062 +* Change toIteratorExecution to toIterableExecution: https://github.com/twitter/scalding/pull/1058 +* Cache Execution evaluations: https://github.com/twitter/scalding/pull/1057 +* Add support for push down filters in parquet sources: https://github.com/twitter/scalding/pull/1050 +* Add support for Fold: https://github.com/twitter/scalding/pull/1053 +* move to use JobConf(true) for hadoop crazyness that causes host not foun...: https://github.com/twitter/scalding/pull/1051 +* Disable Cascading update check.: https://github.com/twitter/scalding/pull/1048 +* Respects -Dmapred.job.name when passed in on the command line: https://github.com/twitter/scalding/pull/1045 +* Add some instances from Algebird: https://github.com/twitter/scalding/pull/1039 +* Fix join.mapGroup issue: https://github.com/twitter/scalding/pull/1038 +* Add a defensive .forceToDisk in Sketched: https://github.com/twitter/scalding/pull/1035 +* Override toIterator for all Mappable with transformForRead: https://github.com/twitter/scalding/pull/1034 +* Make sinkFields in TypedDelimited final.: https://github.com/twitter/scalding/pull/1032 +* Fixed type of exception thrown by validateTaps: https://github.com/twitter/scalding/pull/1033 +* Add default local maven repo to the resolver list: https://github.com/twitter/scalding/pull/1024 +* Add an ExecutionApp trait for objects to skip the Job class: https://github.com/twitter/scalding/pull/1027 +* Make each head pipe have a unique name: https://github.com/twitter/scalding/pull/1025 +* Run REPL from SBT: 
https://github.com/twitter/scalding/pull/1021 +* Add Config to openForRead: https://github.com/twitter/scalding/pull/1023 +* Fix replConfig merging and evaluate values in Config.fromHadoop: https://github.com/twitter/scalding/pull/1015 +* REPL Autoload file: https://github.com/twitter/scalding/pull/1009 +* Fix hRaven Reducer Estimator: https://github.com/twitter/scalding/pull/1018 +* Update Cascading JDBC Version.: https://github.com/twitter/scalding/pull/1016 +* Some Execution fixes: https://github.com/twitter/scalding/pull/1007 +* Refactor InputSizeReducerEstimator to correctly unroll MultiSourceTaps: https://github.com/twitter/scalding/pull/1017 +* Fix issue #1011: Building develop branch fails: https://github.com/twitter/scalding/pull/1012 +* hRaven Reducer Estimator: https://github.com/twitter/scalding/pull/996 +* JsonLine should handle empty lines: https://github.com/twitter/scalding/pull/966 +* Add comments for memory-related reduce operations.: https://github.com/twitter/scalding/pull/1006 +* Add the remaining odds and ends to Execution[T]: https://github.com/twitter/scalding/pull/985 +* Fix up the tests to run forked, and split across lots of travis builds: https://github.com/twitter/scalding/pull/993 +* Typedpipe partition: https://github.com/twitter/scalding/pull/987 +* Fix toIterator bug (#988): https://github.com/twitter/scalding/pull/990 +* Basic reducer estimator support: https://github.com/twitter/scalding/pull/973 +* Improve TypedSimilarity algorithm and update test.: https://github.com/twitter/scalding/pull/983 +* Adds support for Counters inside the Execution Monad.: https://github.com/twitter/scalding/pull/982 +* Make map/flatMap lazy on IterablePipe to address OOM: https://github.com/twitter/scalding/pull/981 +* JsonLine: enable read transformation in test to get correct fields in sourceTap: https://github.com/twitter/scalding/pull/971 +* Read and writable partitioned sources: https://github.com/twitter/scalding/pull/969 +* Make an Execution[T] 
type, which is a monad, which makes composing Jobs easy.: https://github.com/twitter/scalding/pull/974 +* Generalize handling of merged TypedPipes: https://github.com/twitter/scalding/pull/975 +* Do not inherit from FileSource in LzoTraits: https://github.com/twitter/scalding/pull/976 +* Make TypedPipe immutable: https://github.com/twitter/scalding/pull/968 +* Adds an optional source: https://github.com/twitter/scalding/pull/963 +* Add pipe1.join(pipe2) syntax in TypedAPI: https://github.com/twitter/scalding/pull/958 +* Extending BddDsl for Typed API: https://github.com/twitter/scalding/pull/956 +* VerticaJdbcDriver: https://github.com/twitter/scalding/pull/957 +* fix the example usage in JDBCSource: https://github.com/twitter/scalding/pull/955 +* Push back off ec2 requiring sudo, build failures are a nightmare: https://github.com/twitter/scalding/pull/953 +* Add ExecutionContextJob to interop execution style with Job style: https://github.com/twitter/scalding/pull/952 + +### Version 0.11.2 ### +* hadoop.tmp.dir for snapshot in config + +### Version 0.11.1 ### +* Fixes bad release portion where code wasn't updated for new scalding version number. 
+* use cascading-jdbc 2.5.3 for table exists fix and cascading 2.5.5: https://github.com/twitter/scalding/pull/951 +* Bump build properties and sbt launcher: https://github.com/twitter/scalding/pull/950 +* Fixes the travis build: https://github.com/twitter/scalding/pull/944 +* Making the README.md consistent with 0.11.0 changes for the REPL.: https://github.com/twitter/scalding/pull/941 +* Backport Meatlocker: https://github.com/twitter/scalding/pull/571 + +### Version 0.11.0 ### +* REPL: Add toIterator (and related methods): https://github.com/twitter/scalding/pull/929 +* Fix the build to use the shared module method: https://github.com/twitter/scalding/pull/938 +* Clean up the UniqueID stuff, to avoid plumbing it everywhere: https://github.com/twitter/scalding/pull/937 +* TypedPipe.from(List).distinct fails: https://github.com/twitter/scalding/pull/935 +* Clean up ExecutionContext a bit: https://github.com/twitter/scalding/pull/933 +* Fix Issue 932: no-op Jobs should not throw: https://github.com/twitter/scalding/pull/934 +* Use Execution to run flows in REPL: https://github.com/twitter/scalding/pull/928 +* Snapshot a pipe in the REPL: https://github.com/twitter/scalding/pull/918 +* Add support for AppJar in Config: https://github.com/twitter/scalding/pull/924 +* Fix LzoTextLine as a TypedSource: https://github.com/twitter/scalding/pull/921 +* Use externalizer in BijectedSourceSink: https://github.com/twitter/scalding/pull/926 +* Add an Executor to run flows without a Job: https://github.com/twitter/scalding/pull/915 +* This handles the case where scalding will save out a tsv and re-use it down stream leading to issues where the types are not strings: https://github.com/twitter/scalding/pull/913 +* Fix DailySuffixTsv for testability, remove leaked DailySuffixTsv: https://github.com/twitter/scalding/pull/919 +* Add a Config class to make configuration understandable: https://github.com/twitter/scalding/pull/914 +* Integrate the repl completely into scald.rb. 
Fixup scald-rb for better hdfs-local mode now with our provides: https://github.com/twitter/scalding/pull/902 +* Add some auto-reformats: https://github.com/twitter/scalding/pull/911 +* Update JDBCSource: https://github.com/twitter/scalding/pull/898 +* Allow tests for typed delimited by fixing swallowed bug: https://github.com/twitter/scalding/pull/910 +* Add Hadoop platform test to enable unit testing for Hadoop semantics: https://github.com/twitter/scalding/pull/858 +* Some minor improvements to typed joining code: https://github.com/twitter/scalding/pull/909 +* Fix #906: https://github.com/twitter/scalding/pull/908 +* Run the test target, so the tests are reformatted: https://github.com/twitter/scalding/pull/907 +* Enable scalariform: https://github.com/twitter/scalding/pull/905 +* Simplify "scald-repl.sh": https://github.com/twitter/scalding/pull/901 +* Typed Tutorial: https://github.com/twitter/scalding/pull/897 +* Adding a test for the scalding repl: https://github.com/twitter/scalding/pull/890 +* Properly close tuple iterator in test framework.: https://github.com/twitter/scalding/pull/896 +* Add constructors to ValuePipe: https://github.com/twitter/scalding/pull/893 +* contraMap and andThen on TypedSink/TypedSource: https://github.com/twitter/scalding/pull/892 +* Tiny fix to use an ImplicitBijection rather than Bijection: https://github.com/twitter/scalding/pull/887 +* Feature/bijected source sink: https://github.com/twitter/scalding/pull/886 +* Fix intersection equality error: https://github.com/twitter/scalding/pull/878 +* Add DailySuffixTypedTsv and HourlySuffixTypedTsv.: https://github.com/twitter/scalding/pull/873 +* add stepListner register support in Scalding: https://github.com/twitter/scalding/pull/875 +* Backport Meatlocker: https://github.com/twitter/scalding/pull/571 + +### Version 0.10.0 ### +* Upgrade cascading to 2.5.4, cascading jdbc to 2.5.2 +* Adding an hdfs mode for the Scalding REPL +* Added implementation of PartitionSource with tests 
+* Add helper methods to KeyedList and TypedPipe +* Add addTrap to TypedPipe + ### Version 0.9.0 ### * Add join operations to TypedPipe that do not require grouping beforehand * Fixed bug in size estimation of diagonal matrices diff --git a/COMMITTERS.md b/COMMITTERS.md new file mode 100644 index 0000000000..3ece57ba8f --- /dev/null +++ b/COMMITTERS.md @@ -0,0 +1,19 @@ +# Committers + +Please see our [Project Governance](https://github.com/twitter/analytics-infra-governance) page for more details. + +## Active + +| Name | Handle | +|------------------------|-----------------------------------------------------------| +| Alex Levenson | [@isnotinvain](https://github.com/isnotinvain) | +| Ben Pence | [@benpence](https://github.com/benpence) | +| Ian O'Connell | [@ianoc](https://github.com/ianoc) | +| Joe Nievelt | [@jnievelt](https://github.com/jnievelt) | +| Oscar Boykin | [@johnynek](https://github.com/johnynek) | +| Pankaj Gupta | [@pankajroark](https://github.com/pankajroark) | +| Piyush Narang | [@piyushnarang](https://github.com/piyushnarang) | +| Ruban Monu | [@rubanm](https://github.com/rubanm) | +| Sriram Krishnan | [@sriramkrishnan](https://github.com/sriramkrishnan) | + +## Emeritus diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9afa836db3..681b58c52c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,13 +1,75 @@ -Recommendations and requirements for how to best contribute to Scalding. We strive to obey these as best as possible. As always, thanks for contributing--we hope these guidelines make it easier and shed some light on our approach and processes. 
+--- +layout: page +title: "Contributing" +section: "contributing" +position: 4 +--- -### Key branches -- `master` is the latest, deployed version -- `develop` is where development happens and all pull requests should be submitted +# Contributing to Scalding -### Pull requests -- Submit pull requests against the `develop` branch -- Try not to pollute your pull request with unintended changes--keep them simple and small +This page lists recommendations and requirements for how to best contribute to Scalding. We strive to obey these as best as possible. As always, thanks for contributing--we hope these guidelines make it easier and shed some light on our approach and processes. -### License -By contributing your code, you agree to license your contribution under the terms of the APLv2: -https://github.com/twitter/scalding/blob/master/LICENSE +## Key branches + +- `master` is the latest, deployed version. +- `develop` is where development happens and all pull requests should be submitted. + +## Pull requests + +Submit pull requests against the `develop` branch. Try not to pollute your pull request with unintended changes. Keep it simple and small. + +## Contributing Documentation + +The documentation for Scalding's website is stored in the `docs/src/main/tut` directory of the [docs subproject](https://github.com/twitter/scalding/tree/develop/docs). + +Scalding's documentation is powered by [sbt-microsites](https://47deg.github.io/sbt-microsites/) and [tut](https://github.com/tpolecat/tut). `tut` compiles any code that appears in the documentation, ensuring that snippets and examples won't go out of date. + +We would love your help making our documentation better. If you see a page that's empty or needs work, please send us a pull request making it better. + +- Make sure to add some code examples! Any code block of this form in any documentation markdown file will get compiled using `tut`: + + ```toot:book + + ``` + +(Please replace `toot` with `tut`!) 
`tut` will evaluate your code as if you'd pasted it into a REPL and insert each line's results in the output. State persists across `tut` code blocks, so feel free to alternate code blocks with text discussion. See the [tut README](https://github.com/tpolecat/tut) for more information on the various options you can use to customize your code blocks. + +- Add your page to the appropriate section in [the menu](https://github.com/twitter/scalding/tree/develop/docs/src/main/resources/microsite/data/menu.yml) + +### Generating the Site + +Run `sbt docs/makeMicrosite` to generate a local copy of the microsite. + +### Previewing the site + +1. Install jekyll locally. Depending on your platform, you might do this with any of the following commands: + +``` +yum install jekyll +apt-get install jekyll +gem install jekyll +``` + +2. In a shell, navigate to the generated site directory in `docs/target/site` +3. Start jekyll with `jekyll serve --incremental` +4. Navigate to http://127.0.0.1:4000/scalding/ in your browser +5. Make changes to your site, and run `sbt docs/makeMicrosite` to regenerate the site. The changes should be reflected as soon as `sbt docs/makeMicrosite` completes. + +## Post-release + +After the release occurs, you will need to update the documentation. Here is a list of the places that will definitely need to be updated: + + * `README.md`: update version numbers + * `CHANGES.md`: summarize changes since last release + +(Other changes may be necessary, especially for large releases.) + +You can get a list of changes between release tags `v0.1.2` and `v0.2.0` via `git log v0.1.2..v0.2.0`. Scanning this list of commit messages is a good way to get a summary of what happened, although it does not account for conversations that occurred on Github. + +Once the relevant documentation changes have been committed, new [release notes](https://github.com/twitter/scalding/releases) should be added. 
You can add a release by clicking the "Draft a new release" button on that page, or if the relevant release already exists, you can click "Edit release". + +The website should then be updated via `sbt docs/publishMicrosite`. + +## License + +By contributing your code, you agree to license your contribution under the terms of the [APLv2](LICENSE). diff --git a/README.md b/README.md index 47f3359cab..afb0c18c02 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ # Scalding +[![Build status](https://github.com/twitter/scalding/actions/workflows/CI.yml/badge.svg?branch=develop)](https://github.com/twitter/scalding/actions) +[![Coverage Status](https://img.shields.io/codecov/c/github/twitter/scalding/develop.svg?maxAge=3600)](https://codecov.io/github/twitter/scalding) +[![Latest version](https://index.scala-lang.org/twitter/scalding/scalding-core/latest.svg?color=orange)](https://index.scala-lang.org/twitter/scalding/scalding-core) +[![Chat](https://badges.gitter.im/twitter/scalding.svg)](https://gitter.im/twitter/scalding?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) + Scalding is a Scala library that makes it easy to specify Hadoop MapReduce jobs. Scalding is built on top of [Cascading](http://www.cascading.org/), a Java library that abstracts away low-level Hadoop details. Scalding is comparable to [Pig](http://pig.apache.org/), but offers tight integration with Scala, bringing advantages of Scala to your MapReduce jobs. ![Scalding Logo](https://raw.github.com/twitter/scalding/develop/logo/scalding.png) -Current version: `0.9.0rc4` - ## Word Count Hadoop is a distributed system for counting words. Here is how it's done in Scalding. @@ -14,15 +17,17 @@ Hadoop is a distributed system for counting words. 
Here is how it's done in Scal package com.twitter.scalding.examples import com.twitter.scalding._ +import com.twitter.scalding.source.TypedText -class WordCountJob(args : Args) extends Job(args) { - TextLine( args("input") ) - .flatMap('line -> 'word) { line : String => tokenize(line) } - .groupBy('word) { _.size } - .write( Tsv( args("output") ) ) +class WordCountJob(args: Args) extends Job(args) { + TypedPipe.from(TextLine(args("input"))) + .flatMap { line => tokenize(line) } + .groupBy { word => word } // use each word for a key + .size // in each group, get the size + .write(TypedText.tsv[(String, Long)](args("output"))) // Split a piece of text into individual words. - def tokenize(text : String) : Array[String] = { + def tokenize(text: String): Array[String] = { // Lowercase each word and remove punctuation. text.toLowerCase.replaceAll("[^a-zA-Z0-9\\s]", "").split("\\s+") } @@ -31,22 +36,46 @@ class WordCountJob(args : Args) extends Job(args) { Notice that the `tokenize` function, which is standard Scala, integrates naturally with the rest of the MapReduce job. This is a very powerful feature of Scalding. (Compare it to the use of UDFs in Pig.) -You can find more example code under [examples/](https://github.com/twitter/scalding/tree/master/scalding-core/src/main/scala/com/twitter/scalding/examples). If you're interested in comparing Scalding to other languages, see our [Rosetta Code page](https://github.com/twitter/scalding/wiki/Rosetta-Code), which has several MapReduce tasks in Scalding and other frameworks (e.g., Pig and Hadoop Streaming). +You can find more example code under [examples/](https://github.com/twitter/scalding/tree/master/scalding-commons/src/main/scala/com/twitter/scalding/examples). If you're interested in comparing Scalding to other languages, see our [Rosetta Code page](https://github.com/twitter/scalding/wiki/Rosetta-Code), which has several MapReduce tasks in Scalding and other frameworks (e.g., Pig and Hadoop Streaming). 
## Documentation and Getting Started * [**Getting Started**](https://github.com/twitter/scalding/wiki/Getting-Started) page on the [Scalding Wiki](https://github.com/twitter/scalding/wiki) +* [Scalding Scaladocs](http://twitter.github.com/scalding) provide details beyond the API References. Prefer using this as it's always up to date. +* [**REPL in Wonderland**](tutorial/WONDERLAND.md) a hands-on tour of the scalding REPL requiring only git and java installed. * [**Runnable tutorials**](https://github.com/twitter/scalding/tree/master/tutorial) in the source. * The API Reference, including many example Scalding snippets: - * [Fields-based API Reference](https://github.com/twitter/scalding/wiki/Fields-based-API-Reference) * [Type-safe API Reference](https://github.com/twitter/scalding/wiki/Type-safe-api-reference) -* [Scalding Scaladocs](http://twitter.github.com/scalding) provide details beyond the API References + * [Fields-based API Reference](https://github.com/twitter/scalding/wiki/Fields-based-API-Reference) * The Matrix Library provides a way of working with key-attribute-value scalding pipes: * The [Introduction to Matrix Library](https://github.com/twitter/scalding/wiki/Introduction-to-Matrix-Library) contains an overview and a "getting started" example * The [Matrix API Reference](https://github.com/twitter/scalding/wiki/Matrix-API-Reference) contains the Matrix Library API reference with examples +* [**Introduction to Scalding Execution**](https://github.com/twitter/scalding/wiki/Calling-Scalding-from-inside-your-application) contains general rules and examples of calling Scalding from inside another application. Please feel free to use the beautiful [Scalding logo](https://drive.google.com/folderview?id=0B3i3pDi3yVgNbm9pMUdDcHFKVEk&usp=sharing) artwork anywhere. 
+## Contact +For user questions or scalding development (internals, extending, release planning): + (Google search also works as a first step) + +In the remote possibility that there exist bugs in this code, please report them to: + + +Follow [@Scalding](http://twitter.com/scalding) on Twitter for updates. + +Chat: [![Gitter](https://badges.gitter.im/twitter/scalding.svg)](https://gitter.im/twitter/scalding?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) + +## Get Involved + Code of Conduct +Pull requests and bug reports are always welcome! + +We use a lightweight form of project governance inspired by the one used by Apache projects. +Please see [Contributing and Committership](https://github.com/twitter/analytics-infra-governance#contributing-and-committership) for our code of conduct and our pull request review process. +The TL;DR is send us a pull request, iterate on the feedback + discussion, and get a +1 from a [Committer](COMMITTERS.md) in order to get your PR accepted. + +The current list of active committers (who can +1 a pull request) can be found here: [Committers](COMMITTERS.md) + +A list of contributors to the project can be found here: [Contributors](https://github.com/twitter/scalding/graphs/contributors) + ## Building There is a script (called sbt) in the root that loads the correct sbt version to build: @@ -60,25 +89,25 @@ The test suite takes a while to run. When you're in sbt, here's a shortcut to ru Please refer to [FAQ page](https://github.com/twitter/scalding/wiki/Frequently-asked-questions#issues-with-sbt) if you encounter problems when using sbt. 
-We use [Travis CI](http://travis-ci.org/) to verify the build: -[![Build Status](https://secure.travis-ci.org/twitter/scalding.png)](http://travis-ci.org/twitter/scalding) +We use Github Actions to verify the build: +[![Build Status](https://github.com/twitter/scalding/actions/workflows/CI.yml/badge.svg?branch=develop)](https://github.com/twitter/scalding/actions) + +We use [Coveralls](https://coveralls.io/r/twitter/scalding) for code coverage results: +[![Coverage Status](https://coveralls.io/repos/twitter/scalding/badge.png?branch=develop)](https://coveralls.io/r/twitter/scalding?branch=develop) Scalding modules are available from maven central. -The current groupid and version for all modules is, respectively, `"com.twitter"` and `0.8.11`. +The current groupid and version for all modules is, respectively, `"com.twitter"` and `0.17.2`. Current published artifacts are -* `scalding-core_2.9.2` -* `scalding-core_2.10` -* `scalding-args_2.9.2` -* `scalding-args_2.10` -* `scalding-date_2.9.2` -* `scalding-date_2.10` -* `scalding-commons_2.9.2` -* `scalding-commons_2.10` -* `scalding-avro_2.9.2` -* `scalding-avro_2.10` +* `scalding-core_2.11`, `scalding-core_2.12` +* `scalding-args_2.11`, `scalding-args_2.12` +* `scalding-date_2.11`, `scalding-date_2.12` +* `scalding-commons_2.11`, `scalding-commons_2.12` +* `scalding-avro_2.11`, `scalding-avro_2.12` +* `scalding-parquet_2.11`, `scalding-parquet_2.12` +* `scalding-repl_2.11`, `scalding-repl_2.12` The suffix denotes the scala version. @@ -94,21 +123,6 @@ The suffix denotes the scala version. 
To see a full list of users or to add yourself, see the [wiki](https://github.com/twitter/scalding/wiki/Powered-By) -## Contact - -For user questions, we are using the cascading-user mailing list for discussions: - - -For scalding development (internals, extending, release planning): - - -In the remote possibility that there exist bugs in this code, please report them to: - - -Follow [@Scalding](http://twitter.com/scalding) on Twitter for updates. - -Chat (IRC): [freenode](https://webchat.freenode.net/) channel: #scalding - ## Authors: * Avi Bryant * Oscar Boykin @@ -118,6 +132,10 @@ Thanks for assistance and contributions: * Sam Ritchie * Aaron Siegel: +* Ian O'Connell +* Alex Levenson +* Jonathan Coveney +* Kevin Lin * Brad Greenlee: * Edwin Chen * Arkajit Dey: @@ -127,13 +145,15 @@ Thanks for assistance and contributions: * Ning Liang * Dmitriy Ryaboy * Dong Wang -* Kevin Lin * Josh Attenberg -* Juliet Hougland +* Juliet Hougland +* Eddie Xie A full list of [contributors](https://github.com/twitter/scalding/graphs/contributors) can be found on GitHub. ## License -Copyright 2013 Twitter, Inc. -Licensed under the Apache License, Version 2.0: http://www.apache.org/licenses/LICENSE-2.0 +Copyright 2016 Twitter, Inc. 
+ +Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) + diff --git a/build.sbt b/build.sbt new file mode 100644 index 0000000000..abe4a0b9c4 --- /dev/null +++ b/build.sbt @@ -0,0 +1,778 @@ +import com.typesafe.tools.mima.plugin.MimaPlugin.mimaDefaultSettings +import scala.collection.JavaConverters._ +import microsites.ExtraMdFileConfig + +def scalaBinaryVersion(scalaVersion: String) = scalaVersion match { + case version if version.startsWith("2.11") => "2.11" + case version if version.startsWith("2.12") => "2.12" + case _ => sys.error("unknown error") +} +val algebirdVersion = "0.13.4" +val apacheCommonsVersion = "2.2" +val avroVersion = "1.8.2" +val bijectionVersion = "0.9.5" +val cascadingAvroVersion = "2.1.2" +val catsEffectVersion = "1.1.0" +val catsVersion = "1.5.0" +val chillVersion = "0.8.4" +val elephantbirdVersion = "4.15" +val hadoopLzoVersion = "0.4.19" +val hadoopVersion = "2.5.0" +val hbaseVersion = "1.2.4" +val hravenVersion = "1.0.1" +val jacksonVersion = "2.8.7" +val json4SVersion = "3.5.0" +val paradiseVersion = "2.1.1" +val parquetVersion = "1.10.0" +val protobufVersion = "2.4.1" +val scalameterVersion = "0.8.2" +val scalaCheckVersion = "1.13.4" +val scalaTestVersion = "3.0.1" +val scroogeVersion = "19.8.0" +val beamVersion = "2.29.0" +val slf4jVersion = "1.7.30" +val thriftVersion = "0.9.3" +val junitVersion = "4.10" +val jlineVersion = "2.14.3" + +val printDependencyClasspath = taskKey[Unit]("Prints location of the dependencies") + +// these are override functions for sbt-dynver (plugin for resolving project version from git tags) +// the default behaviour includes timestamps in SNAPSHOT versions which is incompatible with tests and unnecessary +// implementation based on source in https://github.com/sbt/sbt-dynver/blob/master/dynver/src/main/scala/sbtdynver/DynVer.scala +def versionFmt(out: sbtdynver.GitDescribeOutput): String = { + // head commit has a version tag, in this case we know that we have a 
final release + // we should publish in the form of + if (out.commitSuffix.isEmpty) { + return out.ref.dropPrefix + } + // head commit has no tag (ie. PR has been merged into develop) + // we should publish in the form of --SNAPSHOT + out.ref.dropPrefix + out.commitSuffix.mkString("-", "-", "") + "-SNAPSHOT" +} +// this edge case is not relevant as it is only triggered on non-git repos +def fallbackVersion(d: java.util.Date): String = "HEAD" + +val sharedSettings = Seq( + version := dynverGitDescribeOutput.value.mkVersion(versionFmt, fallbackVersion(dynverCurrentDate.value)), + dynver := dynverGitDescribeOutput.value.mkVersion(versionFmt, fallbackVersion(dynverCurrentDate.value)), + organization := "com.twitter", + scalaVersion := "2.11.12", + crossScalaVersions := Seq(scalaVersion.value, "2.12.14"), + javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), + doc / javacOptions := Seq("-source", "1.8"), + versionScheme := Some("early-semver"), + Compile / compile / wartremoverErrors ++= Seq( + // Wart.OptionPartial, // this kills the ability to use serialization macros + Wart.ExplicitImplicitTypes, + Wart.LeakingSealed, + Wart.Return, + Wart.EitherProjectionPartial + ), + libraryDependencies ++= Seq( + "org.mockito" % "mockito-all" % "1.8.5" % "test", + "org.scalacheck" %% "scalacheck" % scalaCheckVersion % "test", + "org.scalatest" %% "scalatest" % scalaTestVersion % "test", + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "test", + "com.novocode" % "junit-interface" % "0.10" % "test" + ), + resolvers ++= Seq( + Opts.resolver.sonatypeSnapshots, + Opts.resolver.sonatypeReleases, + "Concurrent Maven Repo".at("https://conjars.org/repo"), + "Twitter Maven".at("https://maven.twttr.com"), + "Cloudera".at("https://repository.cloudera.com/artifactory/cloudera-repos/") + ), + printDependencyClasspath := { + val cp = (Compile / dependencyClasspath).value + cp.foreach(f => println(s"${f.metadata.get(moduleID.key)} => ${f.data}")) + }, + Test / fork := true, + updateOptions 
:= updateOptions.value.withCachedResolution(true), + update / aggregate := false, + Test / javaOptions ++= Seq("-Xmx2048m", "-XX:ReservedCodeCacheSize=384m"), + Global / concurrentRestrictions := Seq( + Tags.limitAll(1) + ), + Test / parallelExecution := false, + scalacOptions ++= Seq( + "-unchecked", + "-deprecation", + "-language:implicitConversions", + "-language:higherKinds", + "-language:existentials", + "-Ywarn-unused-import" + ), + Compile / doc / scalacOptions ++= Seq(scalaVersion.value).flatMap { + case v if v.startsWith("2.12") => Seq("-no-java-comments") // workaround for scala/scala-dev#249 + case _ => Seq() + }, + + // Code coverage options + jacocoReportSettings := JacocoReportSettings( + "Jacoco Coverage Report", + None, + JacocoThresholds(), + Seq(JacocoReportFormats.ScalaHTML, JacocoReportFormats.XML), + "utf-8" + ), + + // Enables full stack traces in scalatest + Test / testOptions += Tests.Argument(TestFrameworks.ScalaTest, "-oF"), + + // Uncomment if you don't want to run all the tests before building assembly + // test in assembly := {}, + assembly / logLevel := Level.Warn, + + // Publishing options: + Test / publishArtifact := false, + pomIncludeRepository := { x => false }, + + // Janino includes a broken signature, and is not needed: + assembly / assemblyExcludedJars := { + val excludes = + Set("jsp-api-2.1-6.1.14.jar", "jsp-2.1-6.1.14.jar", "jasper-compiler-5.5.12.jar", "janino-2.5.16.jar") + (assembly / fullClasspath).value.filter { jar => + excludes(jar.data.getName) + } + }, + // Some of these files have duplicates, let's ignore: + assembly / assemblyMergeStrategy := { + case s if s.endsWith(".class") => MergeStrategy.last + case s if s.endsWith("project.clj") => MergeStrategy.concat + case s if s.endsWith(".html") => MergeStrategy.last + case s if s.endsWith(".dtd") => MergeStrategy.last + case s if s.endsWith(".xsd") => MergeStrategy.last + case s if s.endsWith("pom.properties") => MergeStrategy.last + case s if s.endsWith("pom.xml") 
=> MergeStrategy.last + case s if s.endsWith(".jnilib") => MergeStrategy.rename + case s if s.endsWith("jansi.dll") => MergeStrategy.rename + case s if s.endsWith("libjansi.so") => MergeStrategy.rename + case s if s.endsWith("properties") => MergeStrategy.filterDistinctLines + case x => (assembly / assemblyMergeStrategy).value(x) + }, + pomExtra := (https://github.com/twitter/scalding + + + Apache 2 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + A business-friendly OSS license + + + + git@github.com:twitter/scalding.git + scm:git:git@github.com:twitter/scalding.git + + + + posco + Oscar Boykin + https://twitter.com/posco + + + avibryant + Avi Bryant + https://twitter.com/avibryant + + + argyris + Argyris Zymnis + https://twitter.com/argyris + + ) +) ++ mimaDefaultSettings + +lazy val scalding = Project(id = "scalding", base = file(".")) + .settings(sharedSettings ++ noPublishSettings) + .aggregate( + scaldingArgs, + scaldingDate, + scaldingQuotation, + scaldingCats, + scaldingDagon, + scaldingBase, + scaldingCore, + scaldingCommons, + scaldingAvro, + scaldingParquet, + scaldingParquetScrooge, + scaldingHRaven, + scaldingRepl, + scaldingJson, + scaldingHadoopTest, + scaldingEstimatorsTest, + scaldingDb, + maple, + executionTutorial, + scaldingSerialization, + scaldingSpark, + scaldingBeam, + scaldingThriftMacros + ) + +lazy val scaldingAssembly = Project(id = "scalding-assembly", base = file("assembly")) + .settings(sharedSettings ++ noPublishSettings) + .aggregate( + scaldingArgs, + scaldingDate, + scaldingQuotation, + scaldingCore, + scaldingCommons, + scaldingAvro, + scaldingParquet, + scaldingParquetScrooge, + scaldingHRaven, + scaldingRepl, + scaldingJson, + maple, + scaldingSerialization + ) + +lazy val noPublishSettings = Seq( + publish := (()), + publishLocal := (()), + test := (()), + publishArtifact := false +) + +/** + * This returns the youngest jar we released that is compatible with the current. 
+ */ +val ignoredModules = Set[String]("benchmarks") + +def youngestForwardCompatible(subProj: String) = + None +// Enable mima binary check back after releasing 0.18.0 +// Some(subProj) +// .filterNot(ignoredModules.contains(_)) +// .map { +// s => "com.twitter" %% (s"scalding-$s") % "0.17.0" +// } + +def module(name: String) = { + val id = "scalding-%s".format(name) + Project(id = id, base = file(id)).settings( + sharedSettings ++ Seq(Keys.name := id, mimaPreviousArtifacts := youngestForwardCompatible(name).toSet) + ) +} + +lazy val scaldingArgs = module("args") + +lazy val scaldingDate = module("date") + +lazy val cascadingVersion = + System.getenv.asScala.getOrElse("SCALDING_CASCADING_VERSION", "2.6.1") + +lazy val scaldingBenchmarks = module("benchmarks") + .settings( + libraryDependencies ++= Seq( + "com.storm-enroute" %% "scalameter" % scalameterVersion % "test", + "org.scalacheck" %% "scalacheck" % scalaCheckVersion % "test" + ), + testFrameworks += new TestFramework("org.scalameter.ScalaMeterFramework"), + Test / parallelExecution := false + ) + .dependsOn(scaldingCore) + +lazy val scaldingQuotation = module("quotation").settings( + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value % "provided", + "org.scala-lang" % "scala-compiler" % scalaVersion.value % "provided" + ) +) + +lazy val scaldingDagon = module("dagon").settings( + addCompilerPlugin("org.typelevel" %% "kind-projector" % "0.13.0" cross CrossVersion.full), + Compile / unmanagedSourceDirectories ++= scaldingDagonSettings + .scalaVersionSpecificFolders("main", baseDirectory.value, scalaVersion.value), + Test / unmanagedSourceDirectories ++= scaldingDagonSettings + .scalaVersionSpecificFolders("test", baseDirectory.value, scalaVersion.value) +) + +lazy val scaldingBase = module("base") + .settings( + libraryDependencies ++= Seq( + "com.twitter" %% "algebird-core" % algebirdVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion + ), + // buildInfo here refers to 
https://github.com/sbt/sbt-buildinfo + // for logging purposes, src/main/scala/com/twitter/package.scala would like to know the scalding-version + buildInfoKeys := Seq[BuildInfoKey](version), + buildInfoPackage := "com.twitter.scalding" // the codegen would be under com.twitter.scalding.BuildInfo + ) + .enablePlugins(BuildInfoPlugin) + .dependsOn(scaldingArgs, scaldingDagon, scaldingSerialization) + +lazy val scaldingCore = module("core") + .settings( + libraryDependencies ++= Seq( + "cascading" % "cascading-core" % cascadingVersion, + "cascading" % "cascading-hadoop" % cascadingVersion, + "cascading" % "cascading-local" % cascadingVersion, + "com.twitter" % "chill-hadoop" % chillVersion, + "com.twitter" % "chill-java" % chillVersion, + "com.twitter" %% "chill-bijection" % chillVersion, + "com.twitter" %% "algebird-core" % algebirdVersion, + "com.twitter" %% "algebird-test" % algebirdVersion % "test", + "com.twitter" %% "bijection-core" % bijectionVersion, + "com.twitter" %% "bijection-macros" % bijectionVersion, + "com.twitter" %% "chill" % chillVersion, + "com.twitter" %% "chill-algebird" % chillVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided" + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn( + scaldingArgs, + scaldingBase, + scaldingDate, + scaldingSerialization, + maple, + scaldingQuotation, + scaldingDagon + ) + +lazy val scaldingCats = module("cats") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.typelevel" %% "cats-core" % catsVersion, + "org.typelevel" %% "cats-laws" % catsVersion % "test", + "org.typelevel" %% "cats-effect" % catsEffectVersion, + "org.typelevel" %% 
"cats-effect-laws" % catsEffectVersion % "test" + ) + ) + .dependsOn(scaldingArgs, scaldingDate, scaldingCore) + +lazy val scaldingSpark = module("spark") + .settings( + libraryDependencies ++= { + CrossVersion.partialVersion(Keys.scalaVersion.value) match { + case Some((2, 11)) => + Seq( + "org.apache.spark" %% "spark-core" % "2.4.8", + "org.apache.spark" %% "spark-sql" % "2.4.8" + ) + case Some((2, 12)) => + Seq( + "org.apache.spark" %% "spark-core" % "3.1.2", + "org.apache.spark" %% "spark-sql" % "3.1.2" + ) + case _ => ??? // not supported + } + } + ) + .dependsOn(scaldingCore) + +lazy val scaldingBeam = module("beam") + .settings( + libraryDependencies ++= Seq( + "com.twitter" % "chill-java" % chillVersion, + "com.twitter" %% "chill" % chillVersion, + "org.apache.beam" % "beam-sdks-java-core" % beamVersion, + "org.apache.beam" % "beam-sdks-java-extensions-google-cloud-platform-core" % beamVersion, + "org.apache.beam" % "beam-sdks-java-extensions-sorter" % beamVersion, + "org.apache.beam" % "beam-runners-direct-java" % beamVersion % "test", + // Including this dependency since scalding configuration depends on hadoop + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" + ), + // Useful for the BeamPlanner implementation so we know if anything is missing + scalacOptions ++= Seq("-Ypatmat-exhaust-depth", "200") + ) + .dependsOn(scaldingCore) + +lazy val scaldingCommons = module("commons") + .settings( + libraryDependencies ++= Seq( + // TODO: split into scalding-protobuf + "com.google.protobuf" % "protobuf-java" % protobufVersion, + "com.twitter" %% "bijection-core" % bijectionVersion, + "com.twitter" %% "algebird-core" % algebirdVersion, + "com.twitter" %% "chill" % chillVersion, + "com.twitter.elephantbird" % "elephant-bird-cascading2" % elephantbirdVersion, + "com.twitter.elephantbird" % "elephant-bird-core" % elephantbirdVersion, + "com.hadoop.gplcompression" % "hadoop-lzo" % hadoopLzoVersion, + // TODO: split this out into scalding-thrift + 
"org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.apache.thrift" % "libthrift" % thriftVersion, + // TODO: split this out into a scalding-scrooge + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided", + "junit" % "junit" % junitVersion % "test" + ) + ) + .dependsOn(scaldingArgs, scaldingDate, scaldingCore, scaldingHadoopTest % "test") + +lazy val scaldingAvro = module("avro") + .settings( + libraryDependencies ++= Seq( + "cascading.avro" % "avro-scheme" % cascadingAvroVersion, + "org.apache.avro" % "avro" % avroVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" + ) + ) + .dependsOn(scaldingCore) + +lazy val scaldingParquetFixtures = module("parquet-fixtures") + .settings( + Test / scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", + Test / scroogeLanguages := Seq("java", "scala"), + libraryDependencies ++= Seq( + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "commons-lang" % "commons-lang" % apacheCommonsVersion, // needed for HashCodeBuilder used in thriftjava + "org.apache.thrift" % "libthrift" % thriftVersion + ) + ) + +lazy val scaldingParquet = module("parquet") + .settings( + libraryDependencies ++= Seq( + "org.apache.parquet" % "parquet-column" % parquetVersion, + "org.apache.parquet" % "parquet-hadoop" % parquetVersion, + ("org.apache.parquet" % "parquet-thrift" % parquetVersion) + // see https://issues.apache.org/jira/browse/PARQUET-143 for exclusions + .exclude("org.apache.parquet", "parquet-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-core"), + "org.scala-lang" % "scala-compiler" % scalaVersion.value, + "org.apache.thrift" % "libthrift" % 
thriftVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.twitter" %% "bijection-macros" % bijectionVersion, + "com.twitter" %% "chill-bijection" % chillVersion, + "com.twitter.elephantbird" % "elephant-bird-core" % elephantbirdVersion % "test" + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn(scaldingCore, scaldingHadoopTest % "test", scaldingParquetFixtures % "test->test") + +lazy val scaldingParquetScroogeFixtures = module("parquet-scrooge-fixtures") + .settings( + Test / scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", + Test / scroogeLanguages := Seq("java", "scala"), + libraryDependencies ++= Seq( + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "commons-lang" % "commons-lang" % apacheCommonsVersion, // needed for HashCodeBuilder used in thriftjava + "org.apache.thrift" % "libthrift" % thriftVersion + ) + ) + +lazy val scaldingParquetScrooge = module("parquet-scrooge") + .settings( + libraryDependencies ++= Seq( + "org.slf4j" % "slf4j-api" % slf4jVersion, + // see https://issues.apache.org/jira/browse/PARQUET-143 for exclusions + ("org.apache.parquet" % "parquet-thrift" % parquetVersion % "test") + .classifier("tests") + .exclude("org.apache.parquet", "parquet-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-pig") + .exclude("com.twitter.elephantbird", "elephant-bird-core"), + ("com.twitter" %% "scrooge-serializer" % scroogeVersion) + .exclude("com.google.guava", "guava"), + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "com.novocode" % "junit-interface" % "0.11" % "test", + "junit" % "junit" % junitVersion % "test" + ) + ) + .dependsOn( + scaldingCore, + scaldingParquet % "compile->compile;test->test", + 
scaldingParquetScroogeFixtures % "test->test" + ) + +lazy val scaldingHRaven = module("hraven") + .settings( + libraryDependencies ++= Seq( + ("com.twitter.hraven" % "hraven-core" % hravenVersion) + // These transitive dependencies cause sbt to give a ResolveException + // because they're not available on Maven. We don't need them anyway. + // See https://github.com/twitter/cassie/issues/13 + .exclude("javax.jms", "jms") + .exclude("com.sun.jdmk", "jmxtools") + .exclude("com.sun.jmx", "jmxri") + + // These transitive dependencies of hRaven cause conflicts when + // running scalding-hraven/*assembly and aren't needed + // for the part of the hRaven API that we use anyway + .exclude("com.twitter.common", "application-module-log") + .exclude("com.twitter.common", "application-module-stats") + .exclude("com.twitter.common", "args") + .exclude("com.twitter.common", "application") + // Excluding this dependencies because they get resolved to incorrect version, + // and not needed during compilation. 
+ .exclude("com.twitter", "util-registry_2.10") + .exclude("com.twitter", "util-core_2.10") + .exclude("com.twitter", "util-jvm_2.10"), + "org.apache.hbase" % "hbase" % hbaseVersion, + "org.apache.hbase" % "hbase-client" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-common" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-server" % hbaseVersion % "provided", + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided" + ) + ) + .dependsOn(scaldingCore) + +lazy val scaldingRepl = module("repl") + .settings( + console / initialCommands := """ + import com.twitter.scalding._ + import com.twitter.scalding.ReplImplicits._ + import com.twitter.scalding.ReplImplicitContext._ + """, + libraryDependencies ++= Seq( + "jline" % "jline" % jlineVersion, + "org.scala-lang" % "scala-compiler" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided" + ) + ) + .dependsOn(scaldingCore) + .settings( + inConfig(Compile)( + Classpaths.configSettings ++ Seq( + // This is needed to make "provided" dependencies presented in repl, + // solution borrowed from: http://stackoverflow.com/a/18839656/1404395 + run := Defaults + .runTask(Compile / fullClasspath, Compile / run / mainClass, Compile / run / runner) + .evaluated, + // we need to fork repl task, because scala repl doesn't work well with sbt classloaders. 
+ run / fork := true, + run / connectInput := true, + run / outputStrategy := Some(OutputStrategy.StdoutOutput) + ) + ): _* + ) + +// zero dependency serialization module +lazy val scaldingSerialization = module("serialization").settings( + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) +) + +lazy val scaldingJson = module("json") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "com.fasterxml.jackson.module" %% "jackson-module-scala" % jacksonVersion, + "org.json4s" %% "json4s-native" % json4SVersion, + "com.twitter.elephantbird" % "elephant-bird-cascading2" % elephantbirdVersion % "provided" + ) + ) + .dependsOn(scaldingCore) + +lazy val scaldingHadoopTest = module("hadoop-test") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion, + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion).classifier("tests"), + "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-common" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion).classifier("tests"), + "com.twitter" %% "chill-algebird" % chillVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion, + "org.scalacheck" %% "scalacheck" % scalaCheckVersion, + "org.scalatest" %% "scalatest" % scalaTestVersion + ) + ) + .dependsOn(scaldingCore, scaldingSerialization) + +lazy val scaldingEstimatorsTest = module("estimators-test") + .settings( + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion, + "org.apache.hadoop" % "hadoop-minicluster" % 
hadoopVersion, + ("org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion).classifier("tests"), + "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion, + ("org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-common" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion).classifier("tests"), + "com.twitter" %% "chill-algebird" % chillVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion, + "org.scalacheck" %% "scalacheck" % scalaCheckVersion, + "org.scalatest" %% "scalatest" % scalaTestVersion + ) + ) + .dependsOn(scaldingHadoopTest % "test") + +// This one uses a different naming convention +lazy val maple = Project( + id = "maple", + base = file("maple") +).settings( + sharedSettings ++ Seq( + name := "maple", + mimaPreviousArtifacts := Set.empty, + crossPaths := false, + autoScalaLibrary := false, + publishArtifact := true, + libraryDependencies ++= Seq( + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "provided", + "org.apache.hbase" % "hbase" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-client" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-common" % hbaseVersion % "provided", + "org.apache.hbase" % "hbase-server" % hbaseVersion % "provided", + "cascading" % "cascading-hadoop" % cascadingVersion + ) + ) +) + +lazy val executionTutorial = Project( + id = "execution-tutorial", + base = file("tutorial/execution-tutorial") +).settings( + sharedSettings ++ Seq( + name := "execution-tutorial", + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion, + "org.slf4j" % "slf4j-api" % slf4jVersion, + "org.slf4j" % "slf4j-log4j12" % slf4jVersion, + "cascading" % "cascading-hadoop" % cascadingVersion + ) + ) 
+).dependsOn(scaldingCore) + +lazy val scaldingDb = module("db") + .settings( + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-library" % scalaVersion.value, + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.twitter" %% "bijection-macros" % bijectionVersion + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn(scaldingCore) + +lazy val scaldingThriftMacrosFixtures = module("thrift-macros-fixtures") + .settings( + Test / scroogeThriftSourceFolder := baseDirectory.value / "src/test/resources", + libraryDependencies ++= Seq( + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "org.apache.thrift" % "libthrift" % thriftVersion + ) + ) + +lazy val scaldingThriftMacros = module("thrift-macros") + .settings( + libraryDependencies ++= Seq( + "org.scala-lang" % "scala-reflect" % scalaVersion.value, + "com.twitter" %% "bijection-macros" % bijectionVersion, + "com.twitter" % "chill-thrift" % chillVersion % "test", + ("com.twitter" %% "scrooge-serializer" % scroogeVersion % "provided") + .exclude("com.google.guava", "guava"), + "org.apache.thrift" % "libthrift" % thriftVersion, + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "test", + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion % "test", + "org.apache.hadoop" % "hadoop-client" % hadoopVersion % "test", + "org.apache.hadoop" % "hadoop-minicluster" % hadoopVersion % "test", + ("org.apache.hadoop" % "hadoop-yarn-server-tests" % hadoopVersion).classifier("tests"), + "org.apache.hadoop" % "hadoop-yarn-server" % hadoopVersion % "test", + ("org.apache.hadoop" % "hadoop-hdfs" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-common" % hadoopVersion).classifier("tests"), + ("org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % hadoopVersion).classifier("tests") + ), + addCompilerPlugin(("org.scalamacros" % "paradise" % 
paradiseVersion).cross(CrossVersion.full)) + ) + .dependsOn( + scaldingCore, + scaldingHadoopTest % "test", + scaldingSerialization, + scaldingThriftMacrosFixtures % "test->test" + ) + +def docsSourcesAndProjects(sv: String): Seq[ProjectReference] = + Seq( + scaldingArgs, + scaldingDate, + scaldingCore + // scaldingCommons, + // scaldingAvro, + // scaldingParquet, + // scaldingParquetScrooge, + // scaldingHRaven, + // scaldingRepl, + // scaldingJson, + // scaldingDb, + // maple, + // scaldingSerialization, + // scaldingThriftMacros + ) + +lazy val docsMappingsAPIDir = settingKey[String]("Name of subdirectory in site target directory for api docs") + +lazy val docSettings = Seq( + micrositeName := "Scalding", + micrositeDescription := "Scala API for Cascading.", + micrositeAuthor := "Scalding's contributors", + micrositeHighlightTheme := "atom-one-light", + micrositeHomepage := "https://twitter.github.io/scalding", + micrositeBaseUrl := "scalding", + micrositeDocumentationUrl := "api", + micrositeGithubOwner := "twitter", + micrositeExtraMdFiles := Map(file("CONTRIBUTING.md") -> ExtraMdFileConfig("contributing.md", "home")), + micrositeGithubRepo := "scalding", + micrositePalette := Map( + "brand-primary" -> "#5B5988", + "brand-secondary" -> "#292E53", + "brand-tertiary" -> "#222749", + "gray-dark" -> "#49494B", + "gray" -> "#7B7B7E", + "gray-light" -> "#E5E5E6", + "gray-lighter" -> "#F4F3F4", + "white-color" -> "#FFFFFF" + ), + autoAPIMappings := true, + ScalaUnidoc / unidoc / unidocProjectFilter := + inProjects(docsSourcesAndProjects(scalaVersion.value): _*), + docsMappingsAPIDir := "api", + addMappingsToSiteDir(ScalaUnidoc / packageDoc / mappings, docsMappingsAPIDir), + ghpagesNoJekyll := false, + ScalaUnidoc / unidoc / fork := true, + ScalaUnidoc / unidoc / scalacOptions ++= Seq( + "-doc-source-url", + "https://github.com/twitter/scalding/tree/develop€{FILE_PATH}.scala", + "-sourcepath", + (LocalRootProject / baseDirectory).value.getAbsolutePath, + "-diagrams" + 
), + mdocIn := new File((LocalRootProject / baseDirectory).value, "docs/src"), + git.remoteRepo := "git@github.com:twitter/scalding.git", + makeSite / includeFilter := "*.html" | "*.css" | "*.png" | "*.jpg" | "*.gif" | "*.js" | "*.swf" | "*.yml" | "*.md" +) + +// Documentation is generated for projects defined in +// `docsSourcesAndProjects`. +lazy val docs = project + .enablePlugins(MdocPlugin) + .enablePlugins(MicrositesPlugin) + .enablePlugins(ScalaUnidocPlugin) + .enablePlugins(GhpagesPlugin) + .settings(moduleName := "scalding-docs") + .settings(sharedSettings) + .settings(noPublishSettings) + .settings(docSettings) + .settings(Compile / scalacOptions ~= (_.filterNot(Set("-Ywarn-unused-import", "-Ywarn-dead-code")))) + .dependsOn(scaldingCore) diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000000..17cc44412b --- /dev/null +++ b/codecov.yml @@ -0,0 +1,25 @@ +# note that if an org-wide global config is configured, it will be merged (with duplicate settings taking priority from this file) +# it's better to explicitly set all configs if you want consistency + +codecov: + require_ci_to_pass: yes + +coverage: + precision: 2 + round: down + range: "0...100" # acceptable coverage range + +# default behaviour +parsers: + gcov: + branch_detection: + conditional: yes + loop: yes + method: no + macro: no + +# can be configured in https://docs.codecov.com/docs/pull-request-comments +comment: + layout: "reach,diff,flags,files,footer" + behavior: default + require_changes: no diff --git a/docs/src/main/resources/microsite/img/favicon.png b/docs/src/main/resources/microsite/img/favicon.png new file mode 100644 index 0000000000..290604444a Binary files /dev/null and b/docs/src/main/resources/microsite/img/favicon.png differ diff --git a/docs/src/main/resources/microsite/img/navbar_brand.png b/docs/src/main/resources/microsite/img/navbar_brand.png new file mode 100644 index 0000000000..91091b7b7d Binary files /dev/null and 
b/docs/src/main/resources/microsite/img/navbar_brand.png differ diff --git a/docs/src/main/resources/microsite/img/navbar_brand2x.png b/docs/src/main/resources/microsite/img/navbar_brand2x.png new file mode 100644 index 0000000000..3de30ba8bd Binary files /dev/null and b/docs/src/main/resources/microsite/img/navbar_brand2x.png differ diff --git a/docs/src/main/resources/microsite/img/sidebar_brand.png b/docs/src/main/resources/microsite/img/sidebar_brand.png new file mode 100644 index 0000000000..947f241cb8 Binary files /dev/null and b/docs/src/main/resources/microsite/img/sidebar_brand.png differ diff --git a/docs/src/main/resources/microsite/img/sidebar_brand2x.png b/docs/src/main/resources/microsite/img/sidebar_brand2x.png new file mode 100644 index 0000000000..f3d13e23b8 Binary files /dev/null and b/docs/src/main/resources/microsite/img/sidebar_brand2x.png differ diff --git a/docs/src/main/tut/cookbook.md b/docs/src/main/tut/cookbook.md new file mode 100644 index 0000000000..8f660b9fa2 --- /dev/null +++ b/docs/src/main/tut/cookbook.md @@ -0,0 +1,16 @@ +--- +layout: docs +title: "Cookbook" +section: "cookbook" +position: 1 +--- + +{% include_relative cookbook/cookbook.md %} + +## Index + +{% for x in site.pages %} + {% if x.section == 'cookbook' %} +- [{{x.title}}]({{site.baseurl}}{{x.url}}) + {% endif %} +{% endfor %} diff --git a/docs/src/main/tut/cookbook/cookbook.md b/docs/src/main/tut/cookbook/cookbook.md new file mode 100644 index 0000000000..e32df6d324 --- /dev/null +++ b/docs/src/main/tut/cookbook/cookbook.md @@ -0,0 +1,3 @@ +# Cookbook + +In Progress - a cookbook of things you might like to do with Scalding. 
diff --git a/docs/src/main/tut/cookbook/hbase.md b/docs/src/main/tut/cookbook/hbase.md new file mode 100644 index 0000000000..b95c3bc163 --- /dev/null +++ b/docs/src/main/tut/cookbook/hbase.md @@ -0,0 +1,22 @@ +--- +layout: docs +title: "Scalding and HBase" +section: "cookbook" +--- + +# Using Scalding with HBase + +## Resources + +- [Running Scalding with HBase support](https://github.com/kianwilcox/hbase-scalding) a github example project. +- [Spy Glass](https://github.com/ParallelAI/SpyGlass) - Advanced featured HBase wrapper for Cascading and Scalding +- [Maple](https://github.com/Cascading/maple) a collection of Cascading Taps, including a simple HBase tap. Spy Glass appears to be the more advanced option. +- [KijiExpress](https://github.com/kijiproject/kiji-express) provides a full lifecycle for building predictive models using Scalding and HBase. + +## Example Code + +TODO: Please add links to example code here. + +### Documentation Help + +We'd love your help fleshing out this documentation! You can edit this page in your browser by clicking [this link](https://github.com/twitter/scalding/edit/develop/docs/src/main/tut/cookbook/hbase.md). diff --git a/docs/src/main/tut/faq.md b/docs/src/main/tut/faq.md new file mode 100644 index 0000000000..5b216d0bda --- /dev/null +++ b/docs/src/main/tut/faq.md @@ -0,0 +1,253 @@ +--- +layout: page +title: "FAQ" +section: "faq" +position: 3 +--- + +# Frequently Asked Questions + +Feel free to add new questions and to ping [@Scalding](http://twitter.com/scalding) for an answer. + +# Running Scalding + +### Who actually uses Scalding? + +Twitter uses it in production all over the place! + +Check out our [Powered By](powered_by.html) page for more examples. + +### I'm having trouble with scald.rb, and I just want to run jars in my own system: + +See this [conversation on Twitter](https://twitter.com/Joolz/status/264834261549457409). + +### Can Scalding be run on Amazon's Elastic MapReduce? + +Yes! 
See the [cascading-user group discussion](https://groups.google.com/forum/?fromgroups#!topic/cascading-user/5RfJa8n1JPo). We would like to see someone prepare a patch for scald.rb to handle submission of scalding jobs to EMR. + +### Scalding complains when I use a [TimePathedSource](https://github.com/twitter/scalding/blob/master/src/main/scala/com/twitter/scalding/FileSource.scala#L213) and some of the data is missing. How can I ignore that error? + +Pass the option `--tool.partialok` to your job and it will ignore any missing data. It's safer to work around by either filling with place-holder empty files, or writing sources thatxb will skip known-missing dates. Using that option by default is very dangerous. + +### I receive this error when running `sbt update`: Error occurred during initialization of VM. Incompatible minimum and maximum heap sizes specified + +In your sbt script, set `local min=$(( $mem / 2 ))` + +# Writing Jobs + +### How do I make simple records for use in my scalding job? + +We recommend cases classes **defined outside of your Job**. Case classes defined inside your job capture an $outer member variable that references the job that is wasteful for serialization. If you are having stack overflows during case class serialization this is likely your problem. If you have a use case this doesn't cover, email the cascading-user list or mention [@scalding](http://twitter.com/scalding). Dealing with serialization issues well in systems like Hadoop is tricky, and we're still improving our approaches. + +See the [discussion on cascading-user](https://groups.google.com/forum/?fromgroups#!topic/cascading-user/kjpohwyC03Y). + +### How do I pass parameters to my hadoop job (number of reducers , memory options , etc.) ? 
+ +``` +hadoop jar myjar \ +com.twitter.scalding.Tool \ +-D mapred.output.compress=false \ +-D mapred.child.java.opts=-Xmx2048m \ +-D mapred.reduce.tasks=20 \ +com.class.myclass \ +--hdfs \ +--input $input \ +--output $output +``` + +### How do I access the jobConf? + +If you want to update the jobConf in your job, the way to do it is to override the config method in Job: + +https://github.com/twitter/scalding/blob/cee3bb99ebb00db9622c387bee0b2718ab9cea61/scalding-core/src/main/scala/com/twitter/scalding/Job.scala#L163 + +If you really want to just read from the jobConf, you can do it with code like: + +```scala +implicitly[Mode] match { + case Hdfs(_, configuration) => { + // use the configuration which is an instance of Configuration + } + case _ => error("Not running on Hadoop! (maybe cascading local mode?)") +} +``` + +See this discussion: https://groups.google.com/forum/?fromgroups=#!topic/cascading-user/YppTLebWds8 + +### How do I append my parameters to jobConf? + +```scala +class WordCountJob(args : Args) extends Job(args) { + +// Prior to 0.9.0 we need the mode, after 0.9.0 mode is a def on Job. +override def config(implicit m: Mode): Map[AnyRef,AnyRef] = { + super.config ++ Map ("my.job.name" -> "my new job name") + + } +``` + +### What if I have more than 22 fields in my data-set? + +**TODO: this answer refers to the DEPRECATED Fields API.** + +Many of the examples (e.g. in the `tutorial/` directory) show that the fields argument is specified as a Scala Tuple when reading a delimited file. However Scala Tuples are currently limited to a maximum of 22 elements. To read-in a data-set with more than 22 fields, you can use a List of Symbols as fields specifier. E.g. 
+ +```scala +val mySchema = List('first, 'last, 'phone, 'age, 'country) + +val input = Csv("/path/to/file.txt", separator = ",", fields = mySchema) +val output = TextLine("/path/to/out.txt") +input.read + .project('age, 'country) + .write(Tsv(output)) +``` + +Another way to specify fields is using Scala Enumerations, which is available in the `develop` branch (as of Apr 2, 2013), as demonstrated in [Tutorial 6](https://github.com/twitter/scalding/blob/develop/tutorial/Tutorial6.scala): + +```scala +object Schema extends Enumeration { + val first, last, phone, age, country = Value // arbitrary number of fields +} + +import Schema._ + +Csv("tutorial/data/phones.txt", separator = " ", fields = Schema) + .read + .project(first,age) + .write(Tsv("tutorial/data/output6.tsv")) +``` + +### How do I increase the spill threshold? + +The spilling is controlled with the same hadoop option as cascading: + +``` +-Dcascading.spill.list.threshold=1000000 +``` + +Would keep 1 million items in memory. + +The rule of thumb is use as much as you can without getting OOM. + +### How do I increase the AggregateBy threshold value? + +You can't set a default for AggregateBy, you need to set it in each reducer by calling spillThreshold function on GroupBuilder. +https://github.com/twitter/scalding/blob/develop/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala#L97 + +### Q. My Hadoop job is erroring out with AbstractMethodError or IncompatibleClassChangeError. + +A. If your job has dependencies that clash with Hadoop's, Hadoop can replace your version of a library (like log4j or ASM) with its own native version. You can fix this with an environment flag that makes sure that your jars show up on the classpath before Hadoop's. Set these environment variables: + + bash + export HADOOP_CLASSPATH= + export HADOOP_USER_CLASSPATH_FIRST=true + +### Q. I'm getting a NotSerializableException on Hadoop job submission. + +A. All fields in Job get serialized and sent to Hadoop. 
Your job contains an +object that is not serializable, even with Kryo. This issue may exhibit itself +as other exceptions, such as `InvocationTargetException`, `KryoException`, or +`IllegalAccessException`. What all these potential exceptions have in common +is being related to serialization failures during Hadoop job submission. + +First, try to figure out which object is causing the problem. + +For a better stacktrace than the usual opaque dump, try submitting your job again with the `extendedDebugInfo` flag set: + + export HADOOP_OPTS="-Dsun.io.serialization.extendedDebugInfo=true"; hadoop + +You should see a much larger stacktrace, with many entries like this: + +``` + - field (class "com.twitter.scalding.MapsideReduce", name: "commutativeSemigroup", type: "interface com.twitter.algebird.Semigroup") + - object (class "com.twitter.scalding.MapsideReduce", MapsideReduce[decl:'key', 'value']) + - field (class "cascading.pipe.Operator", name: "operation", type: "interface cascading.operation.Operation") + - object (class "cascading.pipe.Each", Each(_pipe_2*_pipe_3)[MapsideReduce[decl:'key', 'value']]) + - field (class "org.jgrapht.graph.IntrusiveEdge", name: "target", type: "class java.lang.Object") + - object (class "org.jgrapht.graph.IntrusiveEdge", org.jgrapht.graph.IntrusiveEdge@6ed95e60) + - custom writeObject data (class "java.util.HashMap") + - object (class "java.util.LinkedHashMap", {[{?}:UNKNOWN] +[{?}:UNKNOWN]=org.jgrapht.graph.IntrusiveEdge@6ce4ece3, [{2}:0:1] +``` + +Typically, if you start reading from the bottom of these entries upward, the first familiar class you see will be the object that's being unexpectedly serialized and causing you issues. In this case, the error was with Scalding's =MapsideReduce= class. + +Once you know which object is causing the problem, try one of the following remedies: + +1. Put the object in a lazy val + +2. Move it into a companion object, which will not be serialized. + +3. 
If the item is only needed at submission, but not on the Mappers/Reducers, make it `@transient`. + +If you see a common case we overlooked, let us know. Some common issues are inner classes to the Job (don't do that), Logger objects (don't put those in the job, put them in a companion), and some mutable Guava objects have given us trouble (we'd love to see this ticket closed: https://github.com/twitter/chill/issues/66 ) + +# Issues with Testing + +### How do I get my tests working with Spec2? + +from [Alex Dean, @alexatkeplar](https://twitter.com/alexatkeplar) + +The problem was in how I was defining my tests. For Scalding, your Specs2 tests must look like this: +```scala +"A job which trys to do blah" should { + <> + "successfully do blah" in { + expected.blah must_== actual.blah + } +} +``` + +My problem was that my tests looked like this: + +```scala +"A job which trys to do blah" should { + "successfully do blah" in { + <> + expected.blah must_== actual.blah + } +} +``` +In other words, running the job was inside the `in {}`. For some reason, this was leading to multiple jobs running at the same time and conflicting with each others' output. + +If anyone is interested, the diff which fixed my tests is here: https://github.com/snowplow/snowplow/commit/792ed2f9082b871ecedcf36956427a2f0935588c + +### How can I work with HBase with scalding? + +See the [Scalding and HBase](cookbook/hbase.html) page in the cookbook. + +# Issues with SBT + +Q) What version of SBT do I need? (It'd be great to capture the actual error that happens when you use the wrong version) + +A) Get SBT 0.12.2. If you're having an older version of SBT, you can update it by typing in command line: + +brew update; +brew unlink sbt; +brew install sbt + +Q) What happens if I get OutOfMemoryErrors when running "sbt assembly"? 
+ +A) Create ~/.sbtconfig with these options: + +``` +SBT_OPTS="-XX:+UseConcMarkSweepGC -XX:+CMSClassUnloadingEnabled -XX:PermSize=256M -XX:MaxPermSize=512M" +``` + +Q) What should I do if I get "value compare is not a member of object Integer" when running "./sbt compile"? + +A) You're probably using Java 6 instead of Java 7. You can specify which version of Java SBT should use by passing it the `-java-home` option. For example, on a Mac you're SBT command might look something like: + +``` +./sbt -java-home /Library/Java/JavaVirtualMachines//Contents/Home/ +``` + +# Contributing code + +### Do you accept pull requests? + +Yes! By requesting a pull, you are agreeing to license your code under the same license as Scalding. + +### To which branch do I make my pull request? + +[develop](https://github.com/twitter/scalding/tree/develop) diff --git a/docs/src/main/tut/index.md b/docs/src/main/tut/index.md new file mode 100644 index 0000000000..58e05b38cf --- /dev/null +++ b/docs/src/main/tut/index.md @@ -0,0 +1,80 @@ +--- +layout: home +title: "Home" +section: "home" +--- + +Scalding is a Scala library that makes it easy to specify Hadoop MapReduce jobs. Scalding is built on top of [Cascading](http://www.cascading.org/), a Java library that abstracts away low-level Hadoop details. Scalding is comparable to [Pig](http://pig.apache.org/), but offers tight integration with Scala, bringing advantages of Scala to your MapReduce jobs. + +![Scalding Logo](https://raw.github.com/twitter/scalding/develop/logo/scalding.png) + +### Word Count + +Hadoop is a distributed system for counting words. Here is how it's done in Scalding. 
+ +```scala +package com.twitter.scalding.examples + +import com.twitter.scalding._ +import com.twitter.scalding.source.TypedText + +class WordCountJob(args: Args) extends Job(args) { + TypedPipe.from(TextLine(args("input"))) + .flatMap { line => tokenize(line) } + .groupBy { word => word } // use each word for a key + .size // in each group, get the size + .write(TypedText.tsv[(String, Long)](args("output"))) + + // Split a piece of text into individual words. + def tokenize(text: String): Array[String] = { + // Lowercase each word and remove punctuation. + text.toLowerCase.replaceAll("[^a-zA-Z0-9\\s]", "").split("\\s+") + } +} +``` + +Notice that the `tokenize` function, which is standard Scala, integrates naturally with the rest of the MapReduce job. This is a very powerful feature of Scalding. (Compare it to the use of UDFs in Pig.) + +You can find more example code under [examples/](https://github.com/twitter/scalding/tree/master/scalding-commons/src/main/scala/com/twitter/scalding/examples). If you're interested in comparing Scalding to other languages, see our [Rosetta Code page](https://github.com/twitter/scalding/wiki/Rosetta-Code), which has several MapReduce tasks in Scalding and other frameworks (e.g., Pig and Hadoop Streaming). + +## Documentation and Getting Started + +The latest API docs are hosted at Scalding's [ScalaDoc index](api/). + +* [**Getting Started**](https://github.com/twitter/scalding/wiki/Getting-Started) page on the [Scalding Wiki](https://github.com/twitter/scalding/wiki) +* [Scalding Scaladocs](http://twitter.github.com/scalding) provide details beyond the API References. Prefer using this as it's always up to date. +* [**REPL in Wonderland**](tutorial/WONDERLAND.md) a hands-on tour of the scalding REPL requiring only git and java installed. +* [**Runnable tutorials**](https://github.com/twitter/scalding/tree/master/tutorial) in the source. 
+* The API Reference, including many example Scalding snippets: + * [Type-safe API Reference](https://github.com/twitter/scalding/wiki/Type-safe-api-reference) + * [Fields-based API Reference](https://github.com/twitter/scalding/wiki/Fields-based-API-Reference) +* The Matrix Library provides a way of working with key-attribute-value scalding pipes: + * The [Introduction to Matrix Library](https://github.com/twitter/scalding/wiki/Introduction-to-Matrix-Library) contains an overview and a "getting started" example + * The [Matrix API Reference](https://github.com/twitter/scalding/wiki/Matrix-API-Reference) contains the Matrix Library API reference with examples +* [**Introduction to Scalding Execution**](https://github.com/twitter/scalding/wiki/Calling-Scalding-from-inside-your-application) contains general rules and examples of calling Scalding from inside another application. + +Please feel free to use the beautiful [Scalding logo](https://drive.google.com/folderview?id=0B3i3pDi3yVgNbm9pMUdDcHFKVEk&usp=sharing) artwork anywhere. + +## Get Involved + Code of Conduct + +Pull requests and bug reports are always welcome! + +Discussion occurs primarily on the Gitter channel: [![Chat](https://badges.gitter.im/twitter/scalding.svg)](https://gitter.im/twitter/scalding?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +Issues should be reported on the [GitHub issue tracker](https://github.com/twitter/scalding/issues). +Follow [@Scalding](http://twitter.com/scalding) on Twitter for updates. + +We use a lightweight form of project governance inspired by the one used by Apache projects. + +Please see [Contributing and Committership](https://github.com/twitter/analytics-infra-governance#contributing-and-committership) for our code of conduct and our pull request review process. 
+ +The TL;DR is send us a pull request, iterate on the feedback + discussion, and get a +1 from a [Committer](https://github.com/twitter/scalding/blob/develop/COMMITTERS.md) in order to get your PR accepted. + +The current list of active committers (who can +1 a pull request) can be found here: [Committers](https://github.com/twitter/scalding/blob/develop/COMMITTERS.md) + +A list of contributors to the project can be found here: [Contributors](https://github.com/twitter/scalding/graphs/contributors) + +## License + +Copyright 2016 Twitter, Inc. + +Licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0). diff --git a/docs/src/main/tut/powered_by.md b/docs/src/main/tut/powered_by.md new file mode 100644 index 0000000000..5152314950 --- /dev/null +++ b/docs/src/main/tut/powered_by.md @@ -0,0 +1,40 @@ +--- +layout: page +title: "Powered By" +section: "poweredby" +position: 5 +--- + +# Powered By Scalding + +Want to be added to this page? Send a tweet to [@scalding](https://twitter.com/scalding) or open an issue. + +| **Company** | **Scalding Use Case** | **Code** | +|:------------|:----------------------|:---------| +| Twitter | We use Scalding often, for everything from custom ad targeting algorithms, market insight, click prediction, traffic quality to PageRank on the Twitter graph. We hope you will use it too! | - | +| Spotify | We use Scalding for almost everything including music recommendation features like [Discover Weekly](https://www.spotify.com/us/discoverweekly/) & Release Radar, key business metrics, analytics and content catalogue. | - | +| Etsy | We're starting to use Scalding alongside the JRuby Cascading stack described here. More to come as we use it further. | - | +| eBay | We use Scalding in our Search organization for ad-hoc data analysis jobs as well as more mature data pipelines that feed our production systems. 
| - | +| Snowplow Analytics | Our data validation & enrichment process for event analytics is built on top of Scalding. | [GitHub](https://github.com/snowplow/snowplow/tree/master/3-enrich/hadoop-etl) | +| PredictionIO | Machine-learning algorithms build on top of Scalding. | [GitHub](https://github.com/PredictionIO/PredictionIO/tree/master/process/engines) | +| Gatling | We've just rebuilt our reports generation module on top of Scalding. Handy API on top of an efficient engine. | [GitHub](https://github.com/gcoutant/gatling-scalding/tree/master/src/main/scala/com/excilys/ebi/gatling/scalding) +| SoundCloud | We use Scalding in our search and recommendations production pipelines to pre and post-process data for various machine learning and graph-based learning algorithms. We also use Scalding for ad-hoc and regular jobs run over production logs for things like click tracking and quality evaluation on search results and recommendations. | - | +| Sonar | Our platform is built on Hadoop, Scalding, Cassandra and Storm. See Sonar's job listings. | - | +| BSkyB | Sky is using Scalding on Hadoop and utilizing HBase through the SpyGlass library for statistical analysis , content related jobs and reporting. | - | +| [LivePerson](http://www.liveperson.com/) | LivePerson's data science group is using Scalding on Hadoop, to develop machine learning algorithms and big data analysis. | - | +| Sharethrough | Sharethrough uses Scalding throughout our production data infrastructure. We use it for everything from advertiser reporting and ML feature engineering, to ad targeting and click forecasting. | - | +| LinkedIn | Scalding is being used at LinkedIn both at the Product Data Science team and the Email Experience team. | - | +| Stripe | Stripe uses Scalding for ETL and machine learning to support our analytics and fraud prevention teams. | - | +| Move | Move uses Scalding on Hadoop for advanced analytics and personalization for Realtor.com and its mobile real estate apps. 
| - | +| Tapad | Tapad uses scalding to manage productized analytics and reporting, internal ad-hoc data mining, and to support our data science team's research and development efforts. | - | +| CrowdStrike | CrowdStrike employs Scalding in our data science and data mining pipelines as part of our big data security platforms in research, development, product and customer endpoints. We have plans to open source our Scalding API (AWS, EMR) on github. | - | +| Tumblr | Tumblr uses scalding as a sort of MVC framework for Hadoop. Applications include recommendations/discovery, spam detection, and general ETL. | - | +| Elance | Elance uses scalding for constructing data sets for search ranking, recommendation systems, other modeling problems. | - | +| Commonwealth Bank Of Australia | Commbank uses scalding as a key component within its big data infrastructure. Both on the ETL side, and for the implementation of data science pipelines for building various predictive models | Github | +| Sabre Labs | Sabre Labs uses Scalding for ETL and ad hoc data analysis of trip information. | | +| gutefrage.net | gutefrage.net uses Scalding for its Data Products and general ETL flows. | | +| MediaMath | MediaMath uses Scalding to power its Data Platform, the centralized data store that powers our ad hoc analytics, client log delivery and new optimization/insight-based products. | | +| The Search Party | The Search Party is using Scalding to build production machine learning libraries for clustering, recommendation and text analysis of recruitment related data. Scalding is a breath of fresh air! | | +| Opower | Opower uses Scalding and [KijiExpress](https://github.com/kijiproject/kiji-express) to analyze the world's energy data and extract machine learning-based insights that power behavior change. | | +| Barclays | Barclays uses Scalding for Data Warehousing, ETL and data transformation into columnar (query optimized) data formats. 
| | +| Devsisters | Devsisters uses Scalding for game log analysis (1264) | | diff --git a/docs/src/main/tut/resources_for_learners.md b/docs/src/main/tut/resources_for_learners.md new file mode 100644 index 0000000000..b91c781050 --- /dev/null +++ b/docs/src/main/tut/resources_for_learners.md @@ -0,0 +1,44 @@ +--- +layout: page +title: "Resources for Learners" +section: "resources_for_learners" +position: 2 +--- + +## Videos + +- [Scalding: Powerful & Concise MapReduce Programming](http://www.youtube.com/watch?v=LaAEhPoIm_A) +- [Scalding lecture for UC Berkeley's Analyzing Big Data with Twitter class](http://blogs.ischool.berkeley.edu/i290-abdt-s12/2012/11/03/video-lecture-intro-to-scalding-by-posco-and-argyris/) +- [Scalding REPL with Eclipse Scala Worksheets](http://youtu.be/Forl4hpg7kA) + +## How-tos + +- Scalding with CDH3U2 in a Maven project (TODO: Convert wiki page and link) +- [Running your Scalding jobs in Eclipse](http://hokiesuns.blogspot.com/2012/07/running-your-scalding-jobs-in-eclipse.html) +- [Running your Scalding jobs in IDEA intellij](http://willwhim.wpengine.com/2013/02/28/using-intellij-with-twitters-scalding/) +- [Running Scalding jobs on EMR](https://github.com/snowplow/scalding-example-project) +- [Running Scalding with HBase support](cookbook/hbase.html) +- [Using the distributed cache](https://github.com/twitter/scalding/wiki/Using-the-distributed-cache) +- [Unit Testing Scalding Jobs](http://www.agileatwork.com/unit-testing-scalding-jobs/) +- [TDD for Scalding](http://scalding.io/2014/07/applying-tdd-to-scalding-development/) +- [Using counters](https://github.com/tomer-ben-david-examples/scalding-counters-example) + +## Tutorials + +- [Scalding for the impatient](http://sujitpal.blogspot.com/2012/08/scalding-for-impatient.html) +- [Movie Recommendations and more in MapReduce and Scalding](http://blog.echen.me/2012/02/09/movie-recommendations-and-more-via-mapreduce-and-scalding/) +- [Generating Recommendations with MapReduce and 
Scalding](https://blog.twitter.com/2012/generating-recommendations-with-mapreduce-and-scalding) +- [Poker collusion detection with Mahout and Scalding](http://www.javacodegeeks.com/2012/08/mahout-and-scalding-for-poker-collusion.html) +- [Portfolio Management in Scalding](http://www.jasq.org/2/post/2012/12/portfolio-mgmt-in-scalding.html) +- [Find the Fastest Growing County in US, 1969-2011, using Scalding](https://gist.github.com/4696053) +- [Mod-4 matrix arithmetic with Scalding and Algebird](https://github.com/wibiclint/mod4matrix-arithmetic/blob/master/Mod4.scala) +- [Dean Wampler's Scalding Workshop](https://github.com/deanwampler/scalding-workshop) +- [Typesafe's Activator for Scalding](http://typesafe.com/activator/template/activator-scalding) + +## Articles + +- [Hive, Pig, Scalding, Scoobi, Scrunch and Spark: A Comparison of Hadoop Frameworks](http://blog.samibadawi.com/2012/03/hive-pig-scalding-scoobi-scrunch-and.html) +- [Why Hadoop MapReduce needs Scala](http://speakerdeck.com/u/agemooij/p/why-hadoop-mapreduce-needs-scala) +- [How Twitter is doing its part to democratize big data](http://gigaom.com/cloud/how-twitter-is-doing-its-part-to-democratize-big-data/) +- [Meet the combo powering Hadoop at Etsy, Airbnb and Climate Corp.](http://gigaom.com/data/meet-the-combo-behind-etsy-airbnb-and-climate-corp-hadoop-jobs/) +- [Scalding wins a Bossie award from InfoWorld](http://www.infoworld.com/slideshow/65089/bossie-awards-2012-the-best-open-source-databases-202354#slide3) diff --git a/maple/src/main/java/com/twitter/maple/hbase/HBaseScheme.java b/maple/src/main/java/com/twitter/maple/hbase/HBaseScheme.java index b90b6efc81..0f830ede86 100644 --- a/maple/src/main/java/com/twitter/maple/hbase/HBaseScheme.java +++ b/maple/src/main/java/com/twitter/maple/hbase/HBaseScheme.java @@ -31,7 +31,6 @@ import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.RecordReader; -import org.mortbay.log.Log; import 
org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,7 +39,7 @@ import java.util.HashSet; /** - * The HBaseScheme class is a {@link Scheme} subclass. It is used in conjunction with the {@HBaseTap} to + * The HBaseScheme class is a {@link Scheme} subclass. It is used in conjunction with the {@link HBaseTap} to * allow for the reading and writing of data to and from a HBase cluster. * * @see HBaseTap diff --git a/maple/src/main/java/com/twitter/maple/hbase/HBaseTap.java b/maple/src/main/java/com/twitter/maple/hbase/HBaseTap.java index 2cbd1a237d..bd5ffa8095 100644 --- a/maple/src/main/java/com/twitter/maple/hbase/HBaseTap.java +++ b/maple/src/main/java/com/twitter/maple/hbase/HBaseTap.java @@ -17,7 +17,6 @@ import cascading.flow.FlowProcess; import cascading.tap.SinkMode; import cascading.tap.Tap; -import cascading.tap.hadoop.io.HadoopTupleEntrySchemeCollector; import cascading.tap.hadoop.io.HadoopTupleEntrySchemeIterator; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntryIterator; @@ -25,6 +24,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.*; +import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; import org.apache.hadoop.mapred.FileInputFormat; @@ -33,15 +33,13 @@ import org.apache.hadoop.mapred.RecordReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import java.io.IOException; -import java.util.Map.Entry; import java.util.UUID; /** * The HBaseTap class is a {@link Tap} subclass. It is used in conjunction with - * the {@HBaseFullScheme} to allow for the reading and writing + * the {@link HBaseScheme} to allow for the reading and writing * of data to and from a HBase cluster. 
*/ public class HBaseTap extends Tap { @@ -92,6 +90,8 @@ public HBaseTap(String tableName, HBaseScheme HBaseFullScheme, SinkMode sinkMode /** * Constructor HBaseTap creates a new HBaseTap instance. * + * @param quorumNames + * of type String * @param tableName * of type String * @param HBaseFullScheme @@ -106,6 +106,8 @@ public HBaseTap(String quorumNames, String tableName, HBaseScheme HBaseFullSchem /** * Constructor HBaseTap creates a new HBaseTap instance. * + * @param quorumNames + * of type String * @param tableName * of type String * @param HBaseFullScheme @@ -132,7 +134,7 @@ public Path getPath() { return new Path(SCHEME + ":/" + tableName.replaceAll(":", "_")); } - protected HBaseAdmin getHBaseAdmin(JobConf conf) throws MasterNotRunningException, ZooKeeperConnectionException { + protected HBaseAdmin getHBaseAdmin(JobConf conf) throws MasterNotRunningException, ZooKeeperConnectionException, IOException { if (hBaseAdmin == null) { Configuration hbaseConf = HBaseConfiguration.create(conf); hBaseAdmin = new HBaseAdmin(hbaseConf); @@ -143,8 +145,12 @@ protected HBaseAdmin getHBaseAdmin(JobConf conf) throws MasterNotRunningExceptio @Override public void sinkConfInit(FlowProcess process, JobConf conf) { - if(quorumNames != null) { + if (quorumNames != null) { conf.set("hbase.zookeeper.quorum", quorumNames); + } else { + Configuration hbaseConfig = HBaseConfiguration.create(conf); + conf.set(HConstants.ZOOKEEPER_QUORUM, hbaseConfig.get(HConstants.ZOOKEEPER_QUORUM)); + conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, hbaseConfig.get(HConstants.ZOOKEEPER_ZNODE_PARENT)); } LOG.debug("sinking to table: {}", tableName); diff --git a/maple/src/main/java/com/twitter/maple/hbase/HBaseTapCollector.java b/maple/src/main/java/com/twitter/maple/hbase/HBaseTapCollector.java index 65f6ab9deb..f5ad1ed2dd 100644 --- a/maple/src/main/java/com/twitter/maple/hbase/HBaseTapCollector.java +++ b/maple/src/main/java/com/twitter/maple/hbase/HBaseTapCollector.java @@ -34,7 +34,7 @@ * {@link 
cascading.tuple.TupleEntrySchemeCollector} that writes tuples to the * resource managed by a particular {@link HBaseTap} instance. */ -public class HBaseTapCollector extends TupleEntrySchemeCollector implements OutputCollector { +public class HBaseTapCollector extends TupleEntrySchemeCollector implements OutputCollector { /** Field LOG */ private static final Logger LOG = LoggerFactory.getLogger(HBaseTapCollector.class); /** Field conf */ @@ -50,8 +50,9 @@ public class HBaseTapCollector extends TupleEntrySchemeCollector implements Outp /** * Constructor TapCollector creates a new TapCollector instance. - * + * * @param flowProcess + * of type FlowProcess * @param tap * of type Tap * @throws IOException @@ -100,7 +101,7 @@ public void close() { /** * Method collect writes the given values to the {@link Tap} this instance * encapsulates. - * + * * @param writableComparable * of type WritableComparable * @param writable diff --git a/maple/src/main/java/com/twitter/maple/tap/MemorySourceTap.java b/maple/src/main/java/com/twitter/maple/tap/MemorySourceTap.java index 1d07de3a23..d3ef58e624 100644 --- a/maple/src/main/java/com/twitter/maple/tap/MemorySourceTap.java +++ b/maple/src/main/java/com/twitter/maple/tap/MemorySourceTap.java @@ -48,7 +48,6 @@ public List getTuples() { @Override public void sourceConfInit(FlowProcess flowProcess, Tap, Void> tap, JobConf conf) { - FileInputFormat.setInputPaths(conf, this.id); conf.setInputFormat(TupleMemoryInputFormat.class); TupleMemoryInputFormat.storeTuples(conf, TupleMemoryInputFormat.TUPLES_PROPERTY, this.tuples); } diff --git a/maple/src/main/java/com/twitter/maple/tap/TupleMemoryInputFormat.java b/maple/src/main/java/com/twitter/maple/tap/TupleMemoryInputFormat.java index fc48eb8f28..f8826ffa80 100644 --- a/maple/src/main/java/com/twitter/maple/tap/TupleMemoryInputFormat.java +++ b/maple/src/main/java/com/twitter/maple/tap/TupleMemoryInputFormat.java @@ -140,12 +140,12 @@ public static List retrieveTuples(JobConf conf, String 
key) { String s = conf.get(key); if (s == null) return null; - + String[] pieces = s.split(":"); int size = Integer.valueOf(pieces[0]); - + byte[] val; - + if (pieces.length > 1){ val = decodeBytes(pieces[1]); }else{ diff --git a/project/Build.scala b/project/Build.scala deleted file mode 100644 index 1749be3dd2..0000000000 --- a/project/Build.scala +++ /dev/null @@ -1,318 +0,0 @@ -package scalding - -import sbt._ -import Keys._ -import sbtassembly.Plugin._ -import AssemblyKeys._ -import com.typesafe.tools.mima.plugin.MimaPlugin.mimaDefaultSettings -import com.typesafe.tools.mima.plugin.MimaKeys._ - -import scala.collection.JavaConverters._ - -object ScaldingBuild extends Build { - val sharedSettings = Project.defaultSettings ++ assemblySettings ++ Seq( - organization := "com.twitter", - - //TODO: Change to 2.10.* when Twitter moves to Scala 2.10 internally - scalaVersion := "2.9.3", - - crossScalaVersions := Seq("2.9.3", "2.10.3"), - - javacOptions ++= Seq("-source", "1.6", "-target", "1.6"), - - javacOptions in doc := Seq("-source", "1.6"), - - - libraryDependencies ++= Seq( - "org.scalacheck" %% "scalacheck" % "1.10.0" % "test", - "org.scala-tools.testing" %% "specs" % "1.6.9" % "test", - "org.mockito" % "mockito-all" % "1.8.5" % "test" - ), - - resolvers ++= Seq( - "snapshots" at "http://oss.sonatype.org/content/repositories/snapshots", - "releases" at "http://oss.sonatype.org/content/repositories/releases", - "Concurrent Maven Repo" at "http://conjars.org/repo", - "Clojars Repository" at "http://clojars.org/repo", - "Twitter Maven" at "http://maven.twttr.com" - ), - - parallelExecution in Test := false, - - scalacOptions ++= Seq("-unchecked", "-deprecation"), - - // Uncomment if you don't want to run all the tests before building assembly - // test in assembly := {}, - - // Publishing options: - - publishMavenStyle := true, - - publishArtifact in Test := false, - - pomIncludeRepository := { - x => false - }, - - publishTo <<= version { v => - Some( - if 
(v.trim.endsWith("SNAPSHOT")) - Opts.resolver.sonatypeSnapshots - else - Opts.resolver.sonatypeStaging - //"twttr" at "http://artifactory.local.twitter.com/libs-releases-local" - ) - }, - - // Janino includes a broken signature, and is not needed: - excludedJars in assembly <<= (fullClasspath in assembly) map { - cp => - val excludes = Set("jsp-api-2.1-6.1.14.jar", "jsp-2.1-6.1.14.jar", - "jasper-compiler-5.5.12.jar", "janino-2.5.16.jar") - cp filter { - jar => excludes(jar.data.getName) - } - }, - // Some of these files have duplicates, let's ignore: - mergeStrategy in assembly <<= (mergeStrategy in assembly) { - (old) => { - case s if s.endsWith(".class") => MergeStrategy.last - case s if s.endsWith("project.clj") => MergeStrategy.concat - case s if s.endsWith(".html") => MergeStrategy.last - case s if s.endsWith(".dtd") => MergeStrategy.last - case s if s.endsWith(".xsd") => MergeStrategy.last - case s if s.endsWith(".jnilib") => MergeStrategy.rename - case s if s.endsWith("jansi.dll") => MergeStrategy.rename - case x => old(x) - } - }, - - pomExtra := ( - https://github.com/twitter/scalding - - - Apache 2 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - A business-friendly OSS license - - - - git@github.com:twitter/scalding.git - scm:git:git@github.com:twitter/scalding.git - - - - posco - Oscar Boykin - http://twitter.com/posco - - - avibryant - Avi Bryant - http://twitter.com/avibryant - - - argyris - Argyris Zymnis - http://twitter.com/argyris - - ) - ) ++ mimaDefaultSettings - - lazy val scalding = Project( - id = "scalding", - base = file("."), - settings = sharedSettings ++ DocGen.publishSettings - ).settings( - test := {}, - publish := {}, // skip publishing for this root project. 
- publishLocal := {} - ).aggregate( - scaldingArgs, - scaldingDate, - scaldingCore, - scaldingCommons, - scaldingAvro, - scaldingParquet, - scaldingRepl, - scaldingJson, - scaldingJdbc, - maple - ) - - /** - * This returns the youngest jar we released that is compatible with - * the current. - */ - val unreleasedModules = Set[String]() - - def youngestForwardCompatible(subProj: String) = - Some(subProj) - .filterNot(unreleasedModules.contains(_)) - .map { - s => "com.twitter" % ("scalding-" + s + "_2.9.2") % "0.8.5" - } - - def module(name: String) = { - val id = "scalding-%s".format(name) - Project(id = id, base = file(id), settings = sharedSettings ++ Seq( - Keys.name := id, - previousArtifact := youngestForwardCompatible(name)) - ) - } - - lazy val scaldingArgs = module("args") - - lazy val scaldingDate = module("date") - - lazy val cascadingVersion = - System.getenv.asScala.getOrElse("SCALDING_CASCADING_VERSION", "2.5.2") - - lazy val cascadingJDBCVersion = - System.getenv.asScala.getOrElse("SCALDING_CASCADING_JDBC_VERSION", "2.5.1") - - val hadoopVersion = "1.1.2" - val algebirdVersion = "0.5.0" - val bijectionVersion = "0.6.2" - val chillVersion = "0.3.6" - val slf4jVersion = "1.6.6" - - lazy val scaldingCore = module("core").settings( - libraryDependencies ++= Seq( - "cascading" % "cascading-core" % cascadingVersion, - "cascading" % "cascading-local" % cascadingVersion, - "cascading" % "cascading-hadoop" % cascadingVersion, - "com.twitter" %% "chill" % chillVersion, - "com.twitter" % "chill-hadoop" % chillVersion, - "com.twitter" % "chill-java" % chillVersion, - "com.twitter" %% "bijection-core" % bijectionVersion, - "com.twitter" %% "algebird-core" % algebirdVersion, - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided", - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided" - ) - ).dependsOn(scaldingArgs, scaldingDate, maple) - - lazy val scaldingCommons = Project( - id = "scalding-commons", - 
base = file("scalding-commons"), - settings = sharedSettings - ).settings( - name := "scalding-commons", - previousArtifact := Some("com.twitter" % "scalding-commons_2.9.2" % "0.2.0"), - libraryDependencies ++= Seq( - "com.backtype" % "dfs-datastores-cascading" % "1.3.4", - "com.backtype" % "dfs-datastores" % "1.3.4", - // TODO: split into scalding-protobuf - "com.google.protobuf" % "protobuf-java" % "2.4.1", - "com.twitter" %% "bijection-core" % bijectionVersion, - "com.twitter" %% "algebird-core" % algebirdVersion, - "com.twitter" %% "chill" % chillVersion, - "com.twitter.elephantbird" % "elephant-bird-cascading2" % "3.0.6", - "com.hadoop.gplcompression" % "hadoop-lzo" % "0.4.16", - // TODO: split this out into scalding-thrift - "org.apache.thrift" % "libthrift" % "0.5.0", - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "provided", - "org.scalacheck" %% "scalacheck" % "1.10.0" % "test", - "org.scala-tools.testing" %% "specs" % "1.6.9" % "test" - ) - ).dependsOn(scaldingArgs, scaldingDate, scaldingCore) - - lazy val scaldingAvro = Project( - id = "scalding-avro", - base = file("scalding-avro"), - settings = sharedSettings - ).settings( - name := "scalding-avro", - previousArtifact := Some("com.twitter" % "scalding-avro_2.9.2" % "0.1.0"), - libraryDependencies ++= Seq( - "cascading.avro" % "avro-scheme" % "2.1.2", - "org.apache.avro" % "avro" % "1.7.4", - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided", - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "test", - "org.scalacheck" %% "scalacheck" % "1.10.0" % "test", - "org.scala-tools.testing" %% "specs" % "1.6.9" % "test" - ) - ).dependsOn(scaldingCore) - - lazy val scaldingParquet = Project( - id = "scalding-parquet", - base = file("scalding-parquet"), - settings = sharedSettings - ).settings( - name := "scalding-parquet", - //previousArtifact := Some("com.twitter" % "scalding-parquet_2.9.2" % "0.1.0"), - 
previousArtifact := None, - libraryDependencies ++= Seq( - "com.twitter" % "parquet-cascading" % "1.3.2", - "org.slf4j" % "slf4j-api" % slf4jVersion, - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided", - "org.slf4j" % "slf4j-log4j12" % slf4jVersion % "test", - "org.scalacheck" %% "scalacheck" % "1.10.0" % "test", - "org.scala-tools.testing" %% "specs" % "1.6.9" % "test" - ) - ).dependsOn(scaldingCore) - - lazy val scaldingRepl = Project( - id = "scalding-repl", - base = file("scalding-repl"), - settings = sharedSettings - ).settings( - name := "scalding-repl", - previousArtifact := None, - libraryDependencies <++= (scalaVersion) { scalaVersion => Seq( - "org.scala-lang" % "jline" % scalaVersion, - "org.scala-lang" % "scala-compiler" % scalaVersion, - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided" - ) - } - ).dependsOn(scaldingCore) - - lazy val scaldingJson = Project( - id = "scalding-json", - base = file("scalding-json"), - settings = sharedSettings - ).settings( - name := "scalding-json", - previousArtifact := None, - libraryDependencies <++= (scalaVersion) { scalaVersion => Seq( - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided", - "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.2.3" - ) - } - ).dependsOn(scaldingCore) - - lazy val scaldingJdbc = Project( - id = "scalding-jdbc", - base = file("scalding-jdbc"), - settings = sharedSettings - ).settings( - name := "scalding-jdbc", - previousArtifact := None, - libraryDependencies <++= (scalaVersion) { scalaVersion => Seq( - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided", - "cascading" % "cascading-jdbc-core" % cascadingJDBCVersion - ) - } - ).dependsOn(scaldingCore) - - lazy val maple = Project( - id = "maple", - base = file("maple"), - settings = sharedSettings - ).settings( - name := "maple", - previousArtifact := None, - crossPaths := false, - autoScalaLibrary := false, - libraryDependencies <++= (scalaVersion) { scalaVersion => 
Seq( - "org.apache.hadoop" % "hadoop-core" % hadoopVersion % "provided", - "org.apache.hbase" % "hbase" % "0.94.5" % "provided", - "cascading" % "cascading-hadoop" % cascadingVersion - ) - } - ) -} diff --git a/project/DocGen.scala b/project/DocGen.scala deleted file mode 100644 index d5762536b0..0000000000 --- a/project/DocGen.scala +++ /dev/null @@ -1,44 +0,0 @@ -package scalding - -import sbt._ -import Keys._ - -import com.typesafe.sbt.git.GitRunner -import com.typesafe.sbt.SbtGit.GitKeys -import com.typesafe.sbt.SbtSite.{ site, SiteKeys } -import com.typesafe.sbt.SbtGhPages.{ ghpages, GhPagesKeys => ghkeys } -import com.typesafe.sbt.SbtGit.GitKeys.gitRemoteRepo - -object DocGen { - val docDirectory = "target/site" - val aggregateName = "scalding" - - def syncLocal = (ghkeys.updatedRepository, GitKeys.gitRunner, streams) map { (repo, git, s) => - cleanSite(repo, git, s) // First, remove 'stale' files. - val rootPath = file(docDirectory) // Now copy files. - IO.copyDirectory(rootPath, repo) - IO.touch(repo / ".nojekyll") - repo - } - - private def cleanSite(dir: File, git: GitRunner, s: TaskStreams): Unit = { - val toClean = IO.listFiles(dir).filterNot(_.getName == ".git").map(_.getAbsolutePath).toList - if(!toClean.isEmpty) - git(("rm" :: "-r" :: "-f" :: "--ignore-unmatch" :: toClean) :_*)(dir, s.log) - () - } - - lazy val unidocSettings: Seq[sbt.Setting[_]] = - site.includeScaladoc(docDirectory) ++ Seq( - scalacOptions in doc <++= (version, baseDirectory in LocalProject(aggregateName)).map { (v, rootBase) => - val tagOrBranch = if (v.endsWith("-SNAPSHOT")) "develop" else v - val docSourceUrl = "https://github.com/twitter/" + aggregateName + "/tree/" + tagOrBranch + "€{FILE_PATH}.scala" - Seq("-sourcepath", rootBase.getAbsolutePath, "-doc-source-url", docSourceUrl) - }, - Unidoc.unidocDirectory := file(docDirectory), - gitRemoteRepo := "git@github.com:twitter/" + aggregateName + ".git", - ghkeys.synchLocal <<= syncLocal - ) - - lazy val publishSettings = 
site.settings ++ Unidoc.settings ++ ghpages.settings ++ unidocSettings -} diff --git a/project/Unidoc.scala b/project/Unidoc.scala deleted file mode 100644 index 10b534ca47..0000000000 --- a/project/Unidoc.scala +++ /dev/null @@ -1,54 +0,0 @@ -package scalding - -import sbt._ -import sbt.Keys._ -import sbt.Project.Initialize - -/** Borrowed from https://github.com/akka/akka/blob/master/project/Unidoc.scala */ -object Unidoc { - val unidocDirectory = SettingKey[File]("unidoc-directory") - val unidocExclude = SettingKey[Seq[String]]("unidoc-exclude") - val unidocAllSources = TaskKey[Seq[Seq[File]]]("unidoc-all-sources") - val unidocSources = TaskKey[Seq[File]]("unidoc-sources") - val unidocAllClasspaths = TaskKey[Seq[Classpath]]("unidoc-all-classpaths") - val unidocClasspath = TaskKey[Seq[File]]("unidoc-classpath") - val unidoc = TaskKey[File]("unidoc", "Create unified scaladoc for all aggregates") - - lazy val settings = Seq( - unidocDirectory <<= crossTarget / "unidoc", - unidocExclude := Seq.empty, - unidocAllSources <<= (thisProjectRef, buildStructure, unidocExclude) flatMap allSources, - unidocSources <<= unidocAllSources map { _.flatten }, - unidocAllClasspaths <<= (thisProjectRef, buildStructure, unidocExclude) flatMap allClasspaths, - unidocClasspath <<= unidocAllClasspaths map { _.flatten.map(_.data).distinct }, - unidoc <<= unidocTask - ) - - def allSources(projectRef: ProjectRef, structure: Load.BuildStructure, exclude: Seq[String]): Task[Seq[Seq[File]]] = { - val projects = aggregated(projectRef, structure, exclude) - projects flatMap { sources in Compile in LocalProject(_) get structure.data } join - } - - def allClasspaths(projectRef: ProjectRef, structure: Load.BuildStructure, exclude: Seq[String]): Task[Seq[Classpath]] = { - val projects = aggregated(projectRef, structure, exclude) - projects flatMap { dependencyClasspath in Compile in LocalProject(_) get structure.data } join - } - - def aggregated(projectRef: ProjectRef, structure: 
Load.BuildStructure, exclude: Seq[String]): Seq[String] = { - val aggregate = Project.getProject(projectRef, structure).toSeq.flatMap(_.aggregate) - aggregate flatMap { ref => - if (exclude contains ref.project) Seq.empty - else ref.project +: aggregated(ref, structure, exclude) - } - } - - def unidocTask: Initialize[Task[File]] = { - (compilers, cacheDirectory, unidocSources, unidocClasspath, unidocDirectory, scalacOptions in doc, streams) map { - (compilers, cache, sources, classpath, target, options, s) => { - val scaladoc = new Scaladoc(100, compilers.scalac) - scaladoc.cached(cache / "unidoc", "main", sources, classpath, target, options, s.log) - target - } - } - } -} diff --git a/project/build.properties b/project/build.properties index 0974fce44d..9edb75b77c 100644 --- a/project/build.properties +++ b/project/build.properties @@ -1 +1 @@ -sbt.version=0.13.0 +sbt.version=1.5.4 diff --git a/project/plugins.sbt b/project/plugins.sbt index 1f806a2bf6..bb01c0a2f4 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,14 +1,22 @@ -resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) - resolvers ++= Seq( - "jgit-repo" at "http://download.eclipse.org/jgit/maven", - "sonatype-releases" at "http://oss.sonatype.org/content/repositories/releases" + "jgit-repo".at("https://download.eclipse.org/jgit/maven"), + "sonatype-releases".at("https://oss.sonatype.org/content/repositories/releases"), + "Twitter Maven".at("https://maven.twttr.com") ) -addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.6.2") - -addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.6") - -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.10.2") - -addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.5.1") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") +addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.11.0") +addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3") +addSbtPlugin("com.47deg" % 
"sbt-microsites" % "1.3.4") +addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.5.10") +addSbtPlugin("com.dwijnand" % "sbt-dynver" % "4.1.1") +addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2") +addSbtPlugin("com.twitter" %% "scrooge-sbt-plugin" % "18.9.0") +addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.14") +addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3") +addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "1.0.0") +addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6") +addSbtPlugin("com.github.sbt" % "sbt-jacoco" % "3.4.0") +addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.7") +addSbtPlugin("org.wartremover" % "sbt-wartremover" % "2.4.16") +addSbtPlugin("org.scalameta" % "sbt-mdoc" % "2.2.22") diff --git a/project/scalding-dagon.scala b/project/scalding-dagon.scala new file mode 100644 index 0000000000..a530a7aee2 --- /dev/null +++ b/project/scalding-dagon.scala @@ -0,0 +1,30 @@ +import sbt.CrossVersion + +import java.io.File +import java.nio.file.Paths + +object scaldingDagonSettings { + + // load either scala-2.12- or scala-2.12+ dagon src depending on scala version + def scalaVersionSpecificFolders(srcName: String, srcBaseDir: File, scalaVersion: String) = { + + def extraDirs(suffix: String) = { + val scalaCompat = Paths + .get(srcBaseDir.toString) + .resolve("src") + .resolve(srcName) + .resolve("scala" + suffix) + .toFile + Seq(scalaCompat) + } + + CrossVersion.partialVersion(scalaVersion) match { + case Some((2, y)) if y <= 12 => + extraDirs("-2.12-") + case Some((2, y)) if y >= 13 => + extraDirs("-2.13+") + case _ => Nil + } + } + +} diff --git a/project/travis-log4j.properties b/project/travis-log4j.properties index c125798f40..e45c815eb0 100644 --- a/project/travis-log4j.properties +++ b/project/travis-log4j.properties @@ -1,9 +1,12 @@ -log4j.debug=true -log4j.rootCategory=WARN, console -log4j.threshhold=ALL +log4j.rootCategory=DEBUG, console +log4j.threshold=ALL log4j.category.cascading=WARN log4j.category.com.twitter=INFO 
+log4j.logger.org.apache.hadoop=ERROR +log4j.logger.cascading.flow=WARN +log4j.logger.cascading.tap=WARN + log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.layout=org.apache.log4j.PatternLayout diff --git a/sbt b/sbt index 040ce8012e..af23d61b0e 100755 --- a/sbt +++ b/sbt @@ -1,81 +1,58 @@ #!/usr/bin/env bash # # A more capable sbt runner, coincidentally also called sbt. -# Author: Paul Phillips - -# todo - make this dynamic -declare -r sbt_release_version=0.13.0 - -declare sbt_jar sbt_dir sbt_create sbt_launch_dir -declare scala_version java_home sbt_explicit_version -declare verbose debug quiet noshare batch trace_level log_level -declare sbt_saved_stty - -echoerr () { [[ -z $quiet ]] && echo "$@" >&2; } -vlog () { [[ -n "$verbose$debug" ]] && echoerr "$@"; } -dlog () { [[ -n $debug ]] && echoerr "$@"; } - -# we'd like these set before we get around to properly processing arguments -for arg in "$@"; do - case $arg in - -q|-quiet) quiet=true ;; - -d|-debug) debug=true ;; - -v|-verbose) verbose=true ;; - *) ;; - esac -done +# Author: Paul Phillips -build_props_sbt () { - if [[ -r project/build.properties ]]; then - versionLine=$(grep ^sbt.version project/build.properties | tr -d ' \r') - versionString=${versionLine##sbt.version=} - echo "$versionString" - fi -} +set -o pipefail -update_build_props_sbt () { - local ver="$1" - local old=$(build_props_sbt) - - if [[ $ver == $old ]]; then - return - elif [[ -r project/build.properties ]]; then - perl -pi -e "s/^sbt\.version[ ]*=.*\$/sbt.version=${ver}/" project/build.properties - grep -q '^sbt.version[ ]*=' project/build.properties || printf "\nsbt.version=${ver}\n" >> project/build.properties - - echoerr !!! - echoerr !!! Updated file project/build.properties setting sbt.version to: $ver - echoerr !!! Previous value was: $old - echoerr !!! 
- fi -} +declare -r sbt_release_version="0.13.15" +declare -r sbt_unreleased_version="0.13.15" -sbt_version () { - if [[ -n $sbt_explicit_version ]]; then - echo $sbt_explicit_version - else - local v=$(build_props_sbt) - if [[ -n $v ]]; then - echo $v - else - echo $sbt_release_version - fi - fi -} +declare -r latest_212="2.12.1" +declare -r latest_211="2.11.8" + +declare -r buildProps="project/build.properties" + +declare -r sbt_launch_ivy_release_repo="https://repo.typesafe.com/typesafe/ivy-releases" +declare -r sbt_launch_ivy_snapshot_repo="https://repo.scala-sbt.org/scalasbt/ivy-snapshots" +declare -r sbt_launch_mvn_release_repo="https://repo.scala-sbt.org/scalasbt/maven-releases" +declare -r sbt_launch_mvn_snapshot_repo="https://repo.scala-sbt.org/scalasbt/maven-snapshots" + +declare -r default_jvm_opts_common="-Xms512m -Xmx1536m -Xss2m" +declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" + +declare sbt_jar sbt_dir sbt_create sbt_version sbt_script sbt_new +declare sbt_explicit_version +declare verbose noshare batch trace_level +declare sbt_saved_stty debugUs + +declare java_cmd="java" +declare sbt_launch_dir="$HOME/.sbt/launchers" +declare sbt_launch_repo + +# pull -J and -D options to give to java. +declare -a java_args scalac_args sbt_commands residual_args + +# args to jvm/sbt via files or environment variables +declare -a extra_jvm_opts extra_sbt_opts + +echoerr () { echo >&2 "$@"; } +vlog () { [[ -n "$verbose" ]] && echoerr "$@"; } +die () { echo "Aborting: $@" ; exit 1; } # restore stty settings (echo in particular) onSbtRunnerExit() { - [[ -n $sbt_saved_stty ]] || return - dlog "" - dlog "restoring stty: $sbt_saved_stty" - stty $sbt_saved_stty + [[ -n "$sbt_saved_stty" ]] || return + vlog "" + vlog "restoring stty: $sbt_saved_stty" + stty "$sbt_saved_stty" unset sbt_saved_stty } -# save stty and trap exit, to ensure echo is reenabled if we are interrupted. 
+# save stty and trap exit, to ensure echo is re-enabled if we are interrupted. trap onSbtRunnerExit EXIT -sbt_saved_stty=$(stty -g 2>/dev/null) -dlog "Saved stty: $sbt_saved_stty" +sbt_saved_stty="$(stty -g 2>/dev/null)" +vlog "Saved stty: $sbt_saved_stty" # this seems to cover the bases on OSX, and someone will # have to tell me about the others. @@ -83,156 +60,225 @@ get_script_path () { local path="$1" [[ -L "$path" ]] || { echo "$path" ; return; } - local target=$(readlink "$path") + local target="$(readlink "$path")" if [[ "${target:0:1}" == "/" ]]; then echo "$target" else - echo "$(dirname $path)/$target" + echo "${path%/*}/$target" fi } -die() { - echo "Aborting: $@" - exit 1 +declare -r script_path="$(get_script_path "$BASH_SOURCE")" +declare -r script_name="${script_path##*/}" + +init_default_option_file () { + local overriding_var="${!1}" + local default_file="$2" + if [[ ! -r "$default_file" && "$overriding_var" =~ ^@(.*)$ ]]; then + local envvar_file="${BASH_REMATCH[1]}" + if [[ -r "$envvar_file" ]]; then + default_file="$envvar_file" + fi + fi + echo "$default_file" } -make_url () { - version="$1" - - echo "$sbt_launch_repo/org.scala-sbt/sbt-launch/$version/sbt-launch.jar" +declare sbt_opts_file="$(init_default_option_file SBT_OPTS .sbtopts)" +declare jvm_opts_file="$(init_default_option_file JVM_OPTS .jvmopts)" + +build_props_sbt () { + [[ -r "$buildProps" ]] && \ + grep '^sbt\.version' "$buildProps" | tr '=\r' ' ' | awk '{ print $2; }' } -readarr () { - while read ; do - eval "$1+=(\"$REPLY\")" - done +update_build_props_sbt () { + local ver="$1" + local old="$(build_props_sbt)" + + [[ -r "$buildProps" ]] && [[ "$ver" != "$old" ]] && { + perl -pi -e "s/^sbt\.version\b.*\$/sbt.version=${ver}/" "$buildProps" + grep -q '^sbt.version[ =]' "$buildProps" || printf "\nsbt.version=%s\n" "$ver" >> "$buildProps" + + vlog "!!!" + vlog "!!! Updated file $buildProps setting sbt.version to: $ver" + vlog "!!! Previous value was: $old" + vlog "!!!" 
+ } } -init_default_option_file () { - local overriding_var=${!1} - local default_file=$2 - if [[ ! -r "$default_file" && $overriding_var =~ ^@(.*)$ ]]; then - local envvar_file=${BASH_REMATCH[1]} - if [[ -r $envvar_file ]]; then - default_file=$envvar_file - fi - fi - echo $default_file +set_sbt_version () { + sbt_version="${sbt_explicit_version:-$(build_props_sbt)}" + [[ -n "$sbt_version" ]] || sbt_version=$sbt_release_version + export sbt_version } -declare -r cms_opts="-XX:+CMSClassUnloadingEnabled -XX:+UseConcMarkSweepGC" -declare -r jit_opts="-XX:ReservedCodeCacheSize=256m -XX:+TieredCompilation" -declare -r default_jvm_opts="-Dfile.encoding=UTF8 -XX:MaxPermSize=384m -Xms512m -Xmx1536m -Xss2m $jit_opts $cms_opts" -declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" -declare -r latest_28="2.8.2" -declare -r latest_29="2.9.3" -declare -r latest_210="2.10.3" -declare -r latest_211="2.11.0-M5" +url_base () { + local version="$1" + + case "$version" in + 0.7.*) echo "https://simple-build-tool.googlecode.com" ;; + 0.10.* ) echo "$sbt_launch_ivy_release_repo" ;; + 0.11.[12]) echo "$sbt_launch_ivy_release_repo" ;; + 0.*-[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9]) # ie "*-yyyymmdd-hhMMss" + echo "$sbt_launch_ivy_snapshot_repo" ;; + 0.*) echo "$sbt_launch_ivy_release_repo" ;; + *-[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9]) # ie "*-yyyymmdd-hhMMss" + echo "$sbt_launch_mvn_snapshot_repo" ;; + *) echo "$sbt_launch_mvn_release_repo" ;; + esac +} -declare -r script_path=$(get_script_path "$BASH_SOURCE") -declare -r script_dir="$(dirname $script_path)" -declare -r script_name="$(basename $script_path)" +make_url () { + local version="$1" -# some non-read-onlies set with defaults -declare java_cmd=java -declare sbt_opts_file=$(init_default_option_file SBT_OPTS .sbtopts) -declare jvm_opts_file=$(init_default_option_file JVM_OPTS .jvmopts) -declare 
sbt_launch_repo="http://typesafe.artifactoryonline.com/typesafe/ivy-releases" + local base="${sbt_launch_repo:-$(url_base "$version")}" -# pull -J and -D options to give to java. -declare -a residual_args -declare -a java_args -declare -a scalac_args -declare -a sbt_commands + case "$version" in + 0.7.*) echo "$base/files/sbt-launch-0.7.7.jar" ;; + 0.10.* ) echo "$base/org.scala-tools.sbt/sbt-launch/$version/sbt-launch.jar" ;; + 0.11.[12]) echo "$base/org.scala-tools.sbt/sbt-launch/$version/sbt-launch.jar" ;; + 0.*) echo "$base/org.scala-sbt/sbt-launch/$version/sbt-launch.jar" ;; + *) echo "$base/org/scala-sbt/sbt-launch/$version/sbt-launch-$version.jar" ;; + esac +} -# args to jvm/sbt via files or environment variables -declare -a extra_jvm_opts extra_sbt_opts +addJava () { vlog "[addJava] arg = '$1'" ; java_args+=("$1"); } +addSbt () { vlog "[addSbt] arg = '$1'" ; sbt_commands+=("$1"); } +addScalac () { vlog "[addScalac] arg = '$1'" ; scalac_args+=("$1"); } +addResidual () { vlog "[residual] arg = '$1'" ; residual_args+=("$1"); } + +addResolver () { addSbt "set resolvers += $1"; } +addDebugger () { addJava "-Xdebug" ; addJava "-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1"; } +setThisBuild () { + vlog "[addBuild] args = '$@'" + local key="$1" && shift + addSbt "set $key in ThisBuild := $@" +} +setScalaVersion () { + [[ "$1" == *"-SNAPSHOT" ]] && addResolver 'Resolver.sonatypeRepo("snapshots")' + addSbt "++ $1" +} +setJavaHome () { + java_cmd="$1/bin/java" + setThisBuild javaHome "_root_.scala.Some(file(\"$1\"))" + export JAVA_HOME="$1" + export JDK_HOME="$1" + export PATH="$JAVA_HOME/bin:$PATH" +} -# if set, use JAVA_HOME over java found in path -[[ -e "$JAVA_HOME/bin/java" ]] && java_cmd="$JAVA_HOME/bin/java" +getJavaVersion() { "$1" -version 2>&1 | grep -E -e '(java|openjdk) version' | awk '{ print $3 }' | tr -d \"; } -# directory to store sbt launchers -declare sbt_launch_dir="$HOME/.sbt/launchers" -[[ -d "$sbt_launch_dir" ]] || mkdir -p 
"$sbt_launch_dir" -[[ -w "$sbt_launch_dir" ]] || sbt_launch_dir="$(mktemp -d -t sbt_extras_launchers)" +checkJava() { + # Warn if there is a Java version mismatch between PATH and JAVA_HOME/JDK_HOME + + [[ -n "$JAVA_HOME" && -e "$JAVA_HOME/bin/java" ]] && java="$JAVA_HOME/bin/java" + [[ -n "$JDK_HOME" && -e "$JDK_HOME/lib/tools.jar" ]] && java="$JDK_HOME/bin/java" + + if [[ -n "$java" ]]; then + pathJavaVersion=$(getJavaVersion java) + homeJavaVersion=$(getJavaVersion "$java") + if [[ "$pathJavaVersion" != "$homeJavaVersion" ]]; then + echoerr "Warning: Java version mismatch between PATH and JAVA_HOME/JDK_HOME, sbt will use the one in PATH" + echoerr " Either: fix your PATH, remove JAVA_HOME/JDK_HOME or use -java-home" + echoerr " java version from PATH: $pathJavaVersion" + echoerr " java version from JAVA_HOME/JDK_HOME: $homeJavaVersion" + fi + fi +} + +default_jvm_opts () { + echo "$default_jvm_opts_common" +} build_props_scala () { - if [[ -r project/build.properties ]]; then - versionLine=$(grep ^build.scala.versions project/build.properties) - versionString=${versionLine##build.scala.versions=} - echo ${versionString%% .*} + if [[ -r "$buildProps" ]]; then + versionLine="$(grep '^build.scala.versions' "$buildProps")" + versionString="${versionLine##build.scala.versions=}" + echo "${versionString%% .*}" fi } execRunner () { # print the arguments one to a line, quoting any containing spaces - [[ $verbose || $debug ]] && echo "# Executing command line:" && { + vlog "# Executing command line:" && { for arg; do if [[ -n "$arg" ]]; then if printf "%s\n" "$arg" | grep -q ' '; then - printf "\"%s\"\n" "$arg" + printf >&2 "\"%s\"\n" "$arg" else - printf "%s\n" "$arg" + printf >&2 "%s\n" "$arg" fi fi done - echo "" + vlog "" } - if [[ -n $batch ]]; then - exec /dev/null; then - curl --fail --silent "$url" --output "$jar" + curl --fail --silent --location "$url" --output "$jar" elif which wget >/dev/null; then - wget --quiet -O "$jar" "$url" + wget -q -O "$jar" "$url" fi 
} && [[ -r "$jar" ]] } acquire_sbt_jar () { - for_sbt_version="$(sbt_version)" - sbt_url="$(jar_url $for_sbt_version)" - sbt_jar="$(jar_file $for_sbt_version)" - - [[ -r "$sbt_jar" ]] || download_url "$sbt_url" "$sbt_jar" + { + sbt_jar="$(jar_file "$sbt_version")" + [[ -r "$sbt_jar" ]] + } || { + sbt_jar="$HOME/.ivy2/local/org.scala-sbt/sbt-launch/$sbt_version/jars/sbt-launch.jar" + [[ -r "$sbt_jar" ]] + } || { + sbt_jar="$(jar_file "$sbt_version")" + download_url "$(make_url "$sbt_version")" "$sbt_jar" + } } usage () { + set_sbt_version cat < display stack traces with a max of frames (default: -1, traces suppressed) + -debug-inc enable debugging log for the incremental compiler -no-colors disable ANSI color codes -sbt-create start sbt even if current directory contains no sbt project -sbt-dir path to global settings/plugins directory (default: ~/.sbt/) @@ -243,20 +289,19 @@ Usage: $script_name [options] -jvm-debug Turn on JVM debugging, open at the given port. -batch Disable interactive mode -prompt Set the sbt prompt; in expr, 's' is the State and 'e' is Extracted + -script Run the specified file as a scala script - # sbt version (default: from project/build.properties if present, else latest release) - !!! The only way to accomplish this pre-0.12.0 if there is a build.properties file which - !!! contains an sbt.version property is to update the file on disk. That's what this does. 
+ # sbt version (default: sbt.version from $buildProps if present, otherwise $sbt_release_version) + -sbt-force-latest force the use of the latest release of sbt: $sbt_release_version -sbt-version use the specified version of sbt (default: $sbt_release_version) + -sbt-dev use the latest pre-release version of sbt: $sbt_unreleased_version -sbt-jar use the specified jar as the sbt launcher -sbt-launch-dir directory to hold sbt launchers (default: $sbt_launch_dir) - -sbt-launch-repo repo url for downloading sbt launcher jar (default: $sbt_launch_repo) + -sbt-launch-repo repo url for downloading sbt launcher jar (default: $(url_base "$sbt_version")) # scala version (default: as chosen by sbt) - -28 use $latest_28 - -29 use $latest_29 - -210 use $latest_210 -211 use $latest_211 + -212 use $latest_212 -scala-home use the scala build at the specified directory -scala-version use the specified version of scala -binary-version use the specified scala version when searching for dependencies @@ -266,7 +311,7 @@ Usage: $script_name [options] # passing options to the jvm - note it does NOT use JAVA_OPTS due to pollution # The default set is used if JVM_OPTS is unset and no -jvm-opts file is found - $default_jvm_opts + $(default_jvm_opts) JVM_OPTS environment variable holding either the jvm args directly, or the reference to a file containing jvm args if given path is prepended by '@' (e.g. '@/etc/jvmopts') Note: "@"-file is overridden by local '.jvmopts' or '-jvm-opts' argument. 
@@ -283,36 +328,7 @@ Usage: $script_name [options] EOM } -addJava () { - dlog "[addJava] arg = '$1'" - java_args=( "${java_args[@]}" "$1" ) -} -addSbt () { - dlog "[addSbt] arg = '$1'" - sbt_commands=( "${sbt_commands[@]}" "$1" ) -} -addScalac () { - dlog "[addScalac] arg = '$1'" - scalac_args=( "${scalac_args[@]}" "$1" ) -} -addResidual () { - dlog "[residual] arg = '$1'" - residual_args=( "${residual_args[@]}" "$1" ) -} -addResolver () { - addSbt "set resolvers += $1" -} -addDebugger () { - addJava "-Xdebug" - addJava "-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" -} -setScalaVersion () { - [[ "$1" == *-SNAPSHOT ]] && addResolver 'Resolver.sonatypeRepo("snapshots")' - addSbt "++ $1" -} - -process_args () -{ +process_args () { require_arg () { local type="$1" local opt="$2" @@ -324,44 +340,46 @@ process_args () } while [[ $# -gt 0 ]]; do case "$1" in - -h|-help) usage; exit 1 ;; - -v|-verbose) verbose=true && log_level=Info && shift ;; - -d|-debug) debug=true && log_level=Debug && shift ;; - -q|-quiet) quiet=true && log_level=Error && shift ;; - - -trace) require_arg integer "$1" "$2" && trace_level=$2 && shift 2 ;; - -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; - -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; - -no-share) noshare=true && shift ;; - -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; - -sbt-dir) require_arg path "$1" "$2" && sbt_dir="$2" && shift 2 ;; - -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; - -offline) addSbt "set offline := true" && shift ;; - -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; - -batch) batch=true && shift ;; - -prompt) require_arg "expr" "$1" "$2" && addSbt "set shellPrompt in ThisBuild := (s => { val e = Project.extract(s) ; $2 })" && shift 2 ;; - - -sbt-create) sbt_create=true && shift ;; - -sbt-jar) require_arg path "$1" "$2" && sbt_jar="$2" && shift 2 ;; - -sbt-version) require_arg version 
"$1" "$2" && sbt_explicit_version="$2" && shift 2 ;; --sbt-launch-dir) require_arg path "$1" "$2" && sbt_launch_dir="$2" && shift 2 ;; --sbt-launch-repo) require_arg path "$1" "$2" && sbt_launch_repo="$2" && shift 2 ;; - -scala-version) require_arg version "$1" "$2" && setScalaVersion "$2" && shift 2 ;; --binary-version) require_arg version "$1" "$2" && addSbt "set scalaBinaryVersion in ThisBuild := \"$2\"" && shift 2 ;; - -scala-home) require_arg path "$1" "$2" && addSbt "set every scalaHome := Some(file(\"$2\"))" && shift 2 ;; - -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && shift 2 ;; - -sbt-opts) require_arg path "$1" "$2" && sbt_opts_file="$2" && shift 2 ;; - -jvm-opts) require_arg path "$1" "$2" && jvm_opts_file="$2" && shift 2 ;; - - -D*) addJava "$1" && shift ;; - -J*) addJava "${1:2}" && shift ;; - -S*) addScalac "${1:2}" && shift ;; - -28) setScalaVersion $latest_28 && shift ;; - -29) setScalaVersion $latest_29 && shift ;; - -210) setScalaVersion $latest_210 && shift ;; - -211) setScalaVersion $latest_211 && shift ;; - - *) addResidual "$1" && shift ;; + -h|-help) usage; exit 0 ;; + -v) verbose=true && shift ;; + -d) addSbt "--debug" && shift ;; + -w) addSbt "--warn" && shift ;; + -q) addSbt "--error" && shift ;; + -x) debugUs=true && shift ;; + -trace) require_arg integer "$1" "$2" && trace_level="$2" && shift 2 ;; + -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; + -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; + -no-share) noshare=true && shift ;; + -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; + -sbt-dir) require_arg path "$1" "$2" && sbt_dir="$2" && shift 2 ;; + -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; + -offline) addSbt "set offline in Global := true" && shift ;; + -jvm-debug) require_arg port "$1" "$2" && addDebugger "$2" && shift 2 ;; + -batch) batch=true && shift ;; + -prompt) require_arg "expr" "$1" "$2" && setThisBuild 
shellPrompt "(s => { val e = Project.extract(s) ; $2 })" && shift 2 ;; + -script) require_arg file "$1" "$2" && sbt_script="$2" && addJava "-Dsbt.main.class=sbt.ScriptMain" && shift 2 ;; + + -sbt-create) sbt_create=true && shift ;; + -sbt-jar) require_arg path "$1" "$2" && sbt_jar="$2" && shift 2 ;; + -sbt-version) require_arg version "$1" "$2" && sbt_explicit_version="$2" && shift 2 ;; + -sbt-force-latest) sbt_explicit_version="$sbt_release_version" && shift ;; + -sbt-dev) sbt_explicit_version="$sbt_unreleased_version" && shift ;; + -sbt-launch-dir) require_arg path "$1" "$2" && sbt_launch_dir="$2" && shift 2 ;; + -sbt-launch-repo) require_arg path "$1" "$2" && sbt_launch_repo="$2" && shift 2 ;; + -scala-version) require_arg version "$1" "$2" && setScalaVersion "$2" && shift 2 ;; + -binary-version) require_arg version "$1" "$2" && setThisBuild scalaBinaryVersion "\"$2\"" && shift 2 ;; + -scala-home) require_arg path "$1" "$2" && setThisBuild scalaHome "_root_.scala.Some(file(\"$2\"))" && shift 2 ;; + -java-home) require_arg path "$1" "$2" && setJavaHome "$2" && shift 2 ;; + -sbt-opts) require_arg path "$1" "$2" && sbt_opts_file="$2" && shift 2 ;; + -jvm-opts) require_arg path "$1" "$2" && jvm_opts_file="$2" && shift 2 ;; + + -D*) addJava "$1" && shift ;; + -J*) addJava "${1:2}" && shift ;; + -S*) addScalac "${1:2}" && shift ;; + -211) setScalaVersion "$latest_211" && shift ;; + -212) setScalaVersion "$latest_212" && shift ;; + new) sbt_new=true && : ${sbt_explicit_version:=$sbt_release_version} && addResidual "$1" && shift ;; + *) addResidual "$1" && shift ;; esac done } @@ -369,34 +387,43 @@ process_args () # process the direct command line arguments process_args "$@" -# skip #-styled comments +# skip #-styled comments and blank lines readConfigFile() { - while read line; do echo ${line/\#*/} | grep -vE '^\s*$'; done < $1 + local end=false + until $end; do + read || end=true + [[ $REPLY =~ ^# ]] || [[ -z $REPLY ]] || echo "$REPLY" + done < "$1" } # if there are 
file/environment sbt_opts, process again so we # can supply args to this runner if [[ -r "$sbt_opts_file" ]]; then vlog "Using sbt options defined in file $sbt_opts_file" - readarr extra_sbt_opts < <(readConfigFile "$sbt_opts_file") -elif [[ -n "$SBT_OPTS" && !($SBT_OPTS =~ ^@.*) ]]; then + while read opt; do extra_sbt_opts+=("$opt"); done < <(readConfigFile "$sbt_opts_file") +elif [[ -n "$SBT_OPTS" && ! ("$SBT_OPTS" =~ ^@.*) ]]; then vlog "Using sbt options defined in variable \$SBT_OPTS" extra_sbt_opts=( $SBT_OPTS ) else vlog "No extra sbt options have been defined" fi -[[ -n $extra_sbt_opts ]] && process_args "${extra_sbt_opts[@]}" +[[ -n "${extra_sbt_opts[*]}" ]] && process_args "${extra_sbt_opts[@]}" # reset "$@" to the residual args set -- "${residual_args[@]}" argumentCount=$# +# set sbt version +set_sbt_version + +checkJava + # only exists in 0.12+ setTraceLevel() { - case $(sbt_version) in - 0.{7,10,11}.*) echoerr "Cannot set trace level in sbt version $(sbt_version)" ;; - *) addSbt "set every traceLevel := $trace_level" ;; + case "$sbt_version" in + "0.7."* | "0.10."* | "0.11."* ) echoerr "Cannot set trace level in sbt version $sbt_version" ;; + *) setThisBuild traceLevel $trace_level ;; esac } @@ -404,19 +431,21 @@ setTraceLevel() { [[ ${#scalac_args[@]} -eq 0 ]] || addSbt "set scalacOptions in ThisBuild += \"${scalac_args[@]}\"" # Update build.properties on disk to set explicit version - sbt gives us no choice -[[ -n "$sbt_explicit_version" ]] && update_build_props_sbt "$sbt_explicit_version" -vlog "Detected sbt version $(sbt_version)" +[[ -n "$sbt_explicit_version" && -z "$sbt_new" ]] && update_build_props_sbt "$sbt_explicit_version" +vlog "Detected sbt version $sbt_version" -[[ -n "$scala_version" ]] && echoerr "Overriding scala version to $scala_version" - -# no args - alert them there's stuff in here -(( $argumentCount > 0 )) || { - vlog "Starting $script_name: invoke with -help for other options" - residual_args=( shell ) -} +if [[ -n "$sbt_script" 
]]; then + residual_args=( $sbt_script ${residual_args[@]} ) +else + # no args - alert them there's stuff in here + (( argumentCount > 0 )) || { + vlog "Starting $script_name: invoke with -help for other options" + residual_args=( shell ) + } +fi -# verify this is an sbt dir or -create was given -[[ -r ./build.sbt || -d ./project || -n "$sbt_create" ]] || { +# verify this is an sbt dir, -create was given or user attempts to run a scala script +[[ -r ./build.sbt || -d ./project || -n "$sbt_create" || -n "$sbt_script" || -n "$sbt_new" ]] || { cat < List[values]) + // Fold into a list of (arg -> List[values]) args - .filter{ a => !a.matches("\\s*") } + .filter(a => !a.matches("\\s*")) .foldLeft(List("" -> List[String]())) { (acc, arg) => - val noDashes = arg.dropWhile{ _ == '-'} - if(arg == noDashes || isNumber(arg)) + val noDashes = arg.dropWhile(_ == '-') + if (arg == noDashes || isNumber(arg)) (acc.head._1 -> (arg :: acc.head._2)) :: acc.tail else (noDashes -> List()) :: acc } - //Now reverse the values to keep the same order - .map {case (key, value) => key -> value.reverse}.toMap + // Now reverse the values to keep the same order + .map { case (key, value) => key -> value.reverse } + .toMap ) } - def isNumber(arg : String) : Boolean = { + def isNumber(arg: String): Boolean = try { arg.toDouble true + } catch { + case e: NumberFormatException => false } - catch { - case e : NumberFormatException => false - } - } + + /** + * By default, scalding will use reflection to try and identify classes to tokenize. 
Set to false to disable + */ + val jobClassReflection = "scalding.job.classreflection" } -class Args(val m : Map[String,List[String]]) extends java.io.Serializable { +class Args(val m: Map[String, List[String]]) extends java.io.Serializable { - //Replace or add a given key+args pair: - def +(keyvals : (String,Iterable[String])) : Args = new Args(m + (keyvals._1 -> keyvals._2.toList)) + // Replace or add a given key+args pair: + def +(keyvals: (String, Iterable[String])): Args = new Args(m + (keyvals._1 -> keyvals._2.toList)) /** - * Does this Args contain a given key? - */ - def boolean(key : String) : Boolean = m.contains(key) + * Does this Args contain a given key? + */ + def boolean(key: String): Boolean = m.contains(key) /** - * Get the list of values associated with a given key. - * if the key is absent, return the empty list. NOTE: empty - * does not mean the key is absent, it could be a key without - * a value. Use boolean() to check existence. - */ - def list(key : String) : List[String] = m.get(key).getOrElse(List()) + * Get the list of values associated with a given key. if the key is absent, return the empty list. NOTE: + * empty does not mean the key is absent, it could be a key without a value. Use boolean() to check + * existence. + */ + def list(key: String): List[String] = m.get(key).getOrElse(List()) /** - * This is a synonym for required - */ - def apply(key : String) : String = required(key) + * This is a synonym for required + */ + def apply(key: String): String = required(key) /** * Gets the list of positional arguments */ - def positional : List[String] = list("") + def positional: List[String] = list("") /** - * return required positional value. - */ - def required(position: Int) : String = positional match { + * return required positional value. 
+ */ + def required(position: Int): String = positional match { case l if l.size > position => l(position) - case _ => sys.error("Please provide " + (position + 1) + " positional arguments") + case _ => throw ArgsException("Please provide " + (position + 1) + " positional arguments") } /** - * This is a synonym for required - */ - def apply(position : Int) : String = required(position) + * This is a synonym for required + */ + def apply(position: Int): String = required(position) - override def equals(other : Any) : Boolean = { - if( other.isInstanceOf[Args] ) { + override def equals(other: Any): Boolean = + if (other.isInstanceOf[Args]) { other.asInstanceOf[Args].m.equals(m) - } - else { + } else { false } - } + + override def hashCode(): Int = m.hashCode() /** - * Equivalent to .optional(key).getOrElse(default) - */ - def getOrElse(key : String, default : String) : String = optional(key).getOrElse(default) + * Equivalent to .optional(key).getOrElse(default) + */ + def getOrElse(key: String, default: String): String = optional(key).getOrElse(default) /** - * return exactly one value for a given key. - * If there is more than one value, you get an exception - */ - def required(key : String) : String = list(key) match { - case List() => sys.error("Please provide a value for --" + key) + * return exactly one value for a given key. 
If there is more than one value, you get an exception + */ + def required(key: String): String = list(key) match { + case List() => throw ArgsException("Please provide a value for --" + key) case List(a) => a - case _ => sys.error("Please only provide a single value for --" + key) + case _ => throw ArgsException("Please only provide a single value for --" + key) } - def toList : List[String] = { + def toList: List[String] = m.foldLeft(List[String]()) { (args, kvlist) => val k = kvlist._1 val values = kvlist._2 - if( k != "") { - //Make sure positional args are first - args ++ ((("--" + k) :: values)) - } - else { + if (k != "") { + // Make sure positional args are first + args ++ (("--" + k) :: values) + } else { // These are positional args (no key), put them first: values ++ args } } - } /** - * Asserts whether all the args belong to the given set of accepted arguments. - * If an arg does not belong to the given set, you get an error. - */ - def restrictTo(acceptedArgs: Set[String]) : Unit = { - val invalidArgs = m.keySet.filter(!_.startsWith("scalding.")) -- (acceptedArgs + "" + "tool.graph" + "hdfs" + "local") - if (!invalidArgs.isEmpty) sys.error("Invalid args: " + invalidArgs.map("--" + _).mkString(", ")) + * Asserts whether all the args belong to the given set of accepted arguments. If an arg does not belong to + * the given set, you get an error. + */ + def restrictTo(acceptedArgs: Set[String]): Unit = { + val invalidArgs = + m.keySet.filter(!_.startsWith("scalding.")) -- (acceptedArgs + "" + "tool.graph" + "hdfs" + "local") + if (!invalidArgs.isEmpty) throw ArgsException("Invalid args: " + invalidArgs.map("--" + _).mkString(", ")) } // TODO: if there are spaces in the keys or values, this will not round-trip - override def toString : String = toList.mkString(" ") + override def toString: String = toList.mkString(" ") /** - * If there is zero or one element, return it as an Option. 
- * If there is a list of more than one item, you get an error - */ - def optional(key : String) : Option[String] = list(key) match { - case List() => None + * If there is zero or one element, return it as an Option. If there is a list of more than one item, you + * get an error + */ + def optional(key: String): Option[String] = list(key) match { + case List() => None case List(a) => Some(a) - case _ => sys.error("Please provide at most one value for --" + key) + case _ => throw ArgsException("Please provide at most one value for --" + key) + } + + def int(key: String, default: Int): Int = + optional(key) + .map(value => + try value.toInt + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) + + def int(key: String): Int = { + val value = required(key) + try value.toInt + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + } + + def long(key: String, default: Long): Long = + optional(key) + .map(value => + try value.toLong + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) + + def long(key: String): Long = { + val value = required(key) + try value.toLong + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + } + + def float(key: String, default: Float): Float = + optional(key) + .map(value => + try value.toFloat + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) + + def float(key: String): Float = { + val value = required(key) + try value.toFloat + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + } + + def double(key: String, default: Double): Double = + optional(key) + .map(value => + try value.toDouble + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } + ) + .getOrElse(default) + + def double(key: 
String): Double = { + val value = required(key) + try value.toDouble + catch { + case NonFatal(_) => throw ArgsException(s"Invalid value $value for -- $key") + } } } diff --git a/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala b/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala index b36addb715..319c9d6f09 100644 --- a/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala +++ b/scalding-args/src/main/scala/com/twitter/scalding/RangedArgs.scala @@ -12,46 +12,44 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding object RangedArgs { - implicit def rangedFromArgs(args: Args) = new RangedArgs(args) + implicit def rangedFromArgs(args: Args): RangedArgs = new RangedArgs(args) } case class Range[T](lower: T, upper: T)(implicit ord: Ordering[T]) { assert(ord.lteq(lower, upper), "Bad range: " + lower + " > " + upper) - def assertLowerBound(min: T) { + def assertLowerBound(min: T): Unit = assert(ord.lteq(min, lower), "Range out of bounds: " + lower + " < " + min) - } - def assertUpperBound(max: T) { + def assertUpperBound(max: T): Unit = assert(ord.gteq(max, upper), "Range out of bounds: " + upper + " > " + max) - } - def assertBounds(min: T, max: T) { + def assertBounds(min: T, max: T): Unit = { assertLowerBound(min) assertUpperBound(max) } - def mkString(sep: String) = { + def mkString(sep: String) = if (ord.equiv(lower, upper)) { lower.toString } else { lower.toString + sep + upper.toString } - } } class RangedArgs(args: Args) { - def range[T](argName: String)(cnv: String => T)(implicit ord: Ordering[T]): Range[T] = args.list(argName) match { - case List(v) => - Range(cnv(v), cnv(v)) - case List(v1, v2) => - Range(cnv(v1), cnv(v2)) - case _ => - throw new IllegalArgumentException(argName + " must have 
either 1 or 2 values specified") - } + def range[T](argName: String)(cnv: String => T)(implicit ord: Ordering[T]): Range[T] = + args.list(argName) match { + case List(v) => + Range(cnv(v), cnv(v)) + case List(v1, v2) => + Range(cnv(v1), cnv(v2)) + case _ => + throw new IllegalArgumentException(argName + " must have either 1 or 2 values specified") + } } diff --git a/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala b/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala index c91546fc92..15ff4c8dae 100644 --- a/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala +++ b/scalding-args/src/test/scala/com/twitter/scalding/ArgTest.scala @@ -12,117 +12,149 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.WordSpec -class ArgTest extends Specification { +class ArgTest extends WordSpec { "Tool.parseArgs" should { "handle the empty list" in { val map = Args(Array[String]()) - map.list("") must be_==(List()) + assert(map.list("").isEmpty) } "accept any number of dashed args" in { val map = Args(Array("--one", "1", "--two", "2", "--three", "3")) - map.list("") must be_==(List()) - map.optional("") must be_==(None) - - map.list("absent") must be_==(List()) - map.optional("absent") must be_==(None) - - map("one") must be_==("1") - map.list("one") must be_==(List("1")) - map.required("one") must be_==("1") - map.optional("one") must be_==(Some("1")) - - map("two") must be_==("2") - map.list("two") must be_==(List("2")) - map.required("two") must be_==("2") - map.optional("two") must be_==(Some("2")) - - map("three") must be_==("3") - map.list("three") must be_==(List("3")) - map.required("three") must be_==("3") - map.optional("three") must be_==(Some("3")) + 
assert(map.list("").isEmpty) + assert(map.optional("").isEmpty) + + assert(map.list("absent").isEmpty) + assert(map.optional("absent").isEmpty) + + assert(map("one") === "1") + assert(map.list("one") === List("1")) + assert(map.required("one") === "1") + assert(map.optional("one") === Some("1")) + + assert(map("two") === "2") + assert(map.list("two") === List("2")) + assert(map.required("two") === "2") + assert(map.optional("two") === Some("2")) + + assert(map("three") === "3") + assert(map.list("three") === List("3")) + assert(map.required("three") === "3") + assert(map.optional("three") === Some("3")) } "remove empty args in lists" in { val map = Args(Array("", "hello", "--one", "1", "", "\t", "--two", "2", "", "3")) - map("") must be_==("hello") - map.list("") must be_==(List("hello")) - map("one") must be_==("1") - map.list("one") must be_==(List("1")) - map.list("two") must be_==(List("2", "3")) + assert(map("") === "hello") + assert(map.list("") === List("hello")) + assert(map("one") === "1") + assert(map.list("one") === List("1")) + assert(map.list("two") === List("2", "3")) } "put initial args into the empty key" in { - val map =Args(List("hello", "--one", "1")) - map("") must be_==("hello") - map.list("") must be_==(List("hello")) - map.required("") must be_==("hello") - map.optional("") must be_==(Some("hello")) - - map("one") must be_==("1") - map.list("one") must be_==(List("1")) + val map = Args(List("hello", "--one", "1")) + assert(map("") === "hello") + assert(map.list("") === List("hello")) + assert(map.required("") === "hello") + assert(map.optional("") === Some("hello")) + + assert(map("one") === "1") + assert(map.list("one") === List("1")) } "allow any number of args per key" in { val map = Args(Array("--one", "1", "--two", "2", "deux", "--zero")) - map("one") must be_==("1") - map.list("two") must be_==(List("2","deux")) - map.boolean("zero") must be_==(true) + assert(map("one") === "1") + assert(map.list("two") === List("2", "deux")) + 
assert(map.boolean("zero")) } "allow any number of dashes" in { val map = Args(Array("-one", "1", "--two", "2", "---three", "3")) - map("three") must be_==("3") - map("two") must be_==("2") - map("one") must be_==("1") + assert(map("three") === "3") + assert(map("two") === "2") + assert(map("one") === "1") } "round trip to/from string" in { val a = Args("--you all every --body 1 2") - a must be_==(Args(a.toString)) - a must be_==(Args(a.toList)) + assert(a === Args(a.toString)) + assert(a === Args(a.toList)) } "handle positional arguments" in { val a = Args("p0 p1 p2 --f 1 2") - a.positional must be_==(List("p0", "p1", "p2")) - Args(a.toString) must be_==(a) - Args(a.toList) must be_==(a) + assert(a.positional === List("p0", "p1", "p2")) + assert(Args(a.toString) === a) + assert(Args(a.toList) === a) } "handle negative numbers in args" in { val a = Args("--a 1 -2.1 --b 1 -3 4 --c -5") - a.list("a") must_== List("1", "-2.1") - a.list("b") must_== List("1", "-3", "4") - a("c").toInt must_== -5 + assert(a.list("a") === List("1", "-2.1")) + assert(a.list("b") === List("1", "-3", "4")) + assert(a("c").toInt === -5) } "handle strange characters in the args" in { val a = Args("p-p --a-a 1-1 -b=b c=d e/f -5,2 5,3") - a.positional must be_==(List("p-p")) - a.list("a-a") must be_==(List("1-1")) - a.list("b=b") must be_==(List("c=d", "e/f")) - a.list("5,2") must be_==(List("5,3")) + assert(a.positional === List("p-p")) + assert(a.list("a-a") === List("1-1")) + assert(a.list("b=b") === List("c=d", "e/f")) + assert(a.list("5,2") === List("5,3")) } "access positional arguments using apply" in { val a = Args("a b c --d e") - a(0) must be_==("a") - a(1) must be_==("b") - a(2) must be_==("c") - a("d") must be_==("e") + assert(a(0) === "a") + assert(a(1) === "b") + assert(a(2) === "c") + assert(a("d") === "e") } "verify that args belong to an accepted key set" in { val a = Args("a --one --two b --three c d --scalding.tool.mode") a.restrictTo(Set("one", "two", "three", "four")) - 
a.restrictTo(Set("one", "two")) must throwA[java.lang.RuntimeException] + intercept[RuntimeException](a.restrictTo(Set("one", "two"))) } + "correctly parse numeric args" in { + val map = Args( + Array("--anInt", "-1", "--aLong", "21474836470", "--aDecimal", "3.141592654", "--aString", "foo") + ) + assert(map.int("anInt") == "-1".toInt) + assert(map.int("anInt", 2) == "-1".toInt) + assert(map.int("nothing", 2) == 2) + intercept[RuntimeException](map.int("nothing")) + intercept[RuntimeException](map.int("aString")) + intercept[RuntimeException](map.int("aString", 2)) + + assert(map.long("aLong") == "21474836470".toLong) + assert(map.long("anInt", 2L) == "-1".toLong) + assert(map.long("nothing", 2L) == 2L) + intercept[RuntimeException](map.long("nothing")) + intercept[RuntimeException](map.long("aString")) + intercept[RuntimeException](map.long("aString", 2L)) + + assert(map.float("aDecimal") == "3.141592654".toFloat) + assert(map.float("aDecimal", 2.71828f) == "3.141592654".toFloat) + assert(map.float("nothing", 2.71828f) == 2.71828f) + intercept[RuntimeException](map.float("nothing")) + intercept[RuntimeException](map.float("aString")) + intercept[RuntimeException](map.float("aString", 2.71828f)) + + assert(map.double("aDecimal") == "3.141592654".toDouble) + assert(map.double("aDecimal", 2.71828d) == "3.141592654".toDouble) + assert(map.double("nothing", 2.71828d) == 2.71828d) + intercept[RuntimeException](map.double("nothing")) + intercept[RuntimeException](map.double("aString")) + intercept[RuntimeException](map.double("aString", 2.71828d)) + } } } diff --git a/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala b/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala index f0b5daea75..e2b05305d2 100644 --- a/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala +++ b/scalding-args/src/test/scala/com/twitter/scalding/RangedArgsSpec.scala @@ -12,45 +12,45 @@ distributed under the License is distributed on an "AS 
IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.WordSpec -class RangeSpecs extends Specification { +class RangeSpecs extends WordSpec { "A Range" should { val testRange = Range(4, 5) "contain its endpoints" in { - testRange.lower must_== 4 - testRange.upper must_== 5 + assert(testRange.lower === 4) + assert(testRange.upper === 5) } "throw errors for misordered ranges" in { Range(4, 4) - Range(5, 4) must throwAn[AssertionError] + intercept[AssertionError](Range(5, 4)) } "assert lower bounds" in { testRange.assertLowerBound(3) testRange.assertLowerBound(4) - testRange.assertLowerBound(5) must throwAn[AssertionError] + intercept[AssertionError](testRange.assertLowerBound(5)) } "assert upper bounds" in { testRange.assertUpperBound(6) testRange.assertUpperBound(5) - testRange.assertUpperBound(4) must throwAn[AssertionError] + intercept[AssertionError](testRange.assertUpperBound(4)) } - "print nicely with mkString" in { + "print nicely with mkString" should { "for trivial ranges" in { - Range(4, 4).mkString("_") must beEqualTo("4") + assert(Range(4, 4).mkString("_") === "4") } "for proper ranges" in { - testRange.mkString("_") must beEqualTo("4_5") - testRange.mkString("-") must beEqualTo("4-5") + assert(testRange.mkString("_") === "4_5") + assert(testRange.mkString("-") === "4-5") } } } diff --git a/scalding-avro/README.md b/scalding-avro/README.md index 8a80f8381c..eacffbe7a1 100644 --- a/scalding-avro/README.md +++ b/scalding-avro/README.md @@ -4,7 +4,8 @@ https://github.com/ScaleUnlimited/cascading.avro . In some case Kryo (the default serializer used by Scalding) doesn't work well with Avro objects. 
If you run in to serialization errors, or if you want to preempt and trouble, you should add the following to your Job class: ```scala -override def ioSerializations = super.ioSerializations :+ "cascading.avro.serialization.AvroSpecificRecordSerialization" +override def ioSerializations = + super.ioSerializations :+ classOf[cascading.avro.serialization.AvroSpecificRecordSerialization[_]] ``` This will use cascading.avro's Avro SpecificRecord serialization for Avro objects in place of the Kryo serialization. diff --git a/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala b/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala index 6a4b8ccb99..708436e54f 100644 --- a/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala +++ b/scalding-avro/src/main/scala/com/twitter/scalding/avro/AvroSource.scala @@ -26,16 +26,17 @@ import java.io.OutputStream import java.util.Properties import cascading.tuple.Fields import collection.JavaConverters._ -import org.apache.hadoop.mapred.{OutputCollector, RecordReader, JobConf} - +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} trait UnpackedAvroFileScheme extends FileSource { def schema: Option[Schema] // HadoopSchemeInstance gives compile errors in 2.10 for some reason - override def hdfsScheme = (new AvroScheme(schema.getOrElse(null))).asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] + override def hdfsScheme = (new AvroScheme(schema.getOrElse(null))) + .asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] - override def localScheme = (new LAvroScheme(schema.getOrElse(null))).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] + override def localScheme = (new LAvroScheme(schema.getOrElse(null))) + .asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] } @@ -43,9 +44,11 @@ trait PackedAvroFileScheme[T] extends FileSource { def schema: Schema // HadoopSchemeInstance gives 
compile errors for this in 2.10 for some reason - override def hdfsScheme = (new PackedAvroScheme[T](schema)).asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] + override def hdfsScheme = (new PackedAvroScheme[T](schema)) + .asInstanceOf[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] - override def localScheme = (new LPackedAvroScheme[T](schema)).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] + override def localScheme = + (new LPackedAvroScheme[T](schema)).asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] } object UnpackedAvroSource { @@ -66,17 +69,20 @@ object UnpackedAvroSource { new UnpackedAvroSource[T](Seq(path), schema) } -case class UnpackedAvroSource[T](paths: Seq[String], schema: Option[Schema]) - (implicit val conv: TupleConverter[T], tset: TupleSetter[T]) - - extends FixedPathSource(paths: _*) - with UnpackedAvroFileScheme with Mappable[T] with TypedSink[T] { +case class UnpackedAvroSource[T](paths: Seq[String], schema: Option[Schema])(implicit + val conv: TupleConverter[T], + tset: TupleSetter[T] +) extends FixedPathSource(paths: _*) + with UnpackedAvroFileScheme + with Mappable[T] + with TypedSink[T] { override def sinkFields: Fields = { - val outFields = schema.map { - schema => - val schemaFields = schema.getFields - schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => cFields.append(new Fields(sField.name()))) + val outFields = schema.map { schema => + val schemaFields = schema.getFields + schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => + cFields.append(new Fields(sField.name())) + ) } outFields.getOrElse(Dsl.intFields(0 until setter.arity)) } @@ -85,18 +91,21 @@ case class UnpackedAvroSource[T](paths: Seq[String], schema: Option[Schema]) override def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset) - } - object PackedAvroSource { - def apply[T: AvroSchemaType : Manifest : TupleConverter](path: String) - = new 
PackedAvroSource[T](Seq(path)) + def apply[T: AvroSchemaType: Manifest: TupleConverter](path: String) = new PackedAvroSource[T](Seq(path)) } -case class PackedAvroSource[T](paths: Seq[String]) - (implicit val mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T], avroType: AvroSchemaType[T]) - extends FixedPathSource(paths: _*) with PackedAvroFileScheme[T] with Mappable[T] with TypedSink[T] { +case class PackedAvroSource[T](paths: Seq[String])(implicit + val mf: Manifest[T], + conv: TupleConverter[T], + tset: TupleSetter[T], + avroType: AvroSchemaType[T] +) extends FixedPathSource(paths: _*) + with PackedAvroFileScheme[T] + with Mappable[T] + with TypedSink[T] { override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) override def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset) diff --git a/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala b/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala index 0005a12163..b401f84a15 100644 --- a/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala +++ b/scalding-avro/src/main/scala/com/twitter/scalding/avro/SchemaType.scala @@ -20,7 +20,6 @@ import org.apache.avro.specific.SpecificRecord import java.nio.ByteBuffer - trait AvroSchemaType[T] extends Serializable { def schema: Schema } @@ -29,51 +28,58 @@ object AvroSchemaType { // primitive types - implicit def BooleanSchema = new AvroSchemaType[Boolean] { + implicit def BooleanSchema: AvroSchemaType[Boolean] = new AvroSchemaType[Boolean] { def schema = Schema.create(Schema.Type.BOOLEAN) } - implicit def ByteBufferSchema = new AvroSchemaType[ByteBuffer] { + implicit def ByteBufferSchema: AvroSchemaType[ByteBuffer] = new AvroSchemaType[ByteBuffer] { def schema = Schema.create(Schema.Type.BYTES) } - implicit def DoubleSchema = new AvroSchemaType[Double] { + implicit def DoubleSchema: AvroSchemaType[Double] = new AvroSchemaType[Double] { def schema = Schema.create(Schema.Type.DOUBLE) } - 
implicit def FloatSchema = new AvroSchemaType[Float] { + implicit def FloatSchema: AvroSchemaType[Float] = new AvroSchemaType[Float] { def schema = Schema.create(Schema.Type.FLOAT) } - implicit def IntSchema = new AvroSchemaType[Int] { + implicit def IntSchema: AvroSchemaType[Int] = new AvroSchemaType[Int] { def schema = Schema.create(Schema.Type.INT) } - implicit def LongSchema = new AvroSchemaType[Long] { + implicit def LongSchema: AvroSchemaType[Long] = new AvroSchemaType[Long] { def schema = Schema.create(Schema.Type.LONG) } - implicit def StringSchema = new AvroSchemaType[String] { + implicit def StringSchema: AvroSchemaType[String] = new AvroSchemaType[String] { def schema = Schema.create(Schema.Type.STRING) } // collections - implicit def CollectionSchema[CC[x] <: Iterable[x], T](implicit sch: AvroSchemaType[T]) = new AvroSchemaType[CC[T]] { + implicit def CollectionSchema[CC[x] <: Iterable[x], T](implicit + sch: AvroSchemaType[T] + ): AvroSchemaType[CC[T]] = new AvroSchemaType[CC[T]] { def schema = Schema.createArray(sch.schema) } - implicit def ArraySchema[CC[x] <: Array[x], T](implicit sch: AvroSchemaType[T]) = new AvroSchemaType[CC[T]] { + implicit def ArraySchema[CC[x] <: Array[x], T](implicit + sch: AvroSchemaType[T] + ): AvroSchemaType[CC[T]] { val schema: Schema } = new AvroSchemaType[CC[T]] { val schema = Schema.createArray(sch.schema) } - //maps - implicit def MapSchema[CC[String, x] <: Map[String, x], T](implicit sch: AvroSchemaType[T]) = new AvroSchemaType[CC[String, T]] { + // maps + implicit def MapSchema[CC[String, x] <: Map[String, x], T](implicit + sch: AvroSchemaType[T] + ): AvroSchemaType[CC[String, T]] = new AvroSchemaType[CC[String, T]] { def schema = Schema.createMap(sch.schema) } // Avro SpecificRecord - implicit def SpecificRecordSchema[T <: SpecificRecord](implicit mf: Manifest[T]) = new AvroSchemaType[T] { - def schema = mf.erasure.newInstance.asInstanceOf[SpecificRecord].getSchema - } + implicit def SpecificRecordSchema[T <: 
SpecificRecord](implicit mf: Manifest[T]): AvroSchemaType[T] = + new AvroSchemaType[T] { + def schema = mf.runtimeClass.newInstance.asInstanceOf[SpecificRecord].getSchema + } -} \ No newline at end of file +} diff --git a/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala b/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala index b7d2a74694..f9677938b0 100644 --- a/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala +++ b/scalding-avro/src/main/scala/com/twitter/scalding/avro/package.scala @@ -20,30 +20,36 @@ import org.apache.avro.Schema import collection.JavaConverters._ import cascading.tuple.Fields +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + package object avro { - def writePackedAvro[T](pipe: TypedPipe[T], path: String) - (implicit mf: Manifest[T], - st: AvroSchemaType[T], - conv: TupleConverter[T], - set: TupleSetter[T], - flow: FlowDef, - mode: Mode): Unit = { + def writePackedAvro[T](pipe: TypedPipe[T], path: String)(implicit + mf: Manifest[T], + st: AvroSchemaType[T], + conv: TupleConverter[T], + set: TupleSetter[T], + flow: FlowDef, + mode: Mode + ): Unit = { val sink = PackedAvroSource[T](path) pipe.write(sink) } - def writeUnpackedAvro[T <: Product](pipe: TypedPipe[T], path: String, schema: Schema) - (implicit mf: Manifest[T], - conv: TupleConverter[T], - set: TupleSetter[T], - flow: FlowDef, - mode: Mode): Unit = { + def writeUnpackedAvro[T <: Product](pipe: TypedPipe[T], path: String, schema: Schema)(implicit + mf: Manifest[T], + conv: TupleConverter[T], + set: TupleSetter[T], + flow: FlowDef, + mode: Mode + ): Unit = { import Dsl._ val sink = UnpackedAvroSource[T](path, Some(schema)) val outFields = { val schemaFields = schema.getFields - schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => cFields.append(new Fields(sField.name()))) + schemaFields.asScala.foldLeft(new Fields())((cFields, sField) => + cFields.append(new Fields(sField.name())) + ) } 
pipe.toPipe(outFields).write(sink) } -} \ No newline at end of file +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/CFuture.scala b/scalding-base/src/main/scala/com/twitter/scalding/CFuture.scala new file mode 100644 index 0000000000..1c1461ec2c --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/CFuture.scala @@ -0,0 +1,51 @@ +package com.twitter.scalding + +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} + +/** + * Represents a cancellable future. + */ +case class CFuture[+T](future: Future[T], cancellationHandler: CancellationHandler) { + def map[S](fn: T => S)(implicit cec: ConcurrentExecutionContext): CFuture[S] = { + val mapped = future.map(fn) + CFuture(mapped, cancellationHandler) + } + + def mapFuture[S](fn: Future[T] => Future[S]): CFuture[S] = { + val transformed = fn(future) + CFuture(transformed, cancellationHandler) + } + + def zip[U](other: CFuture[U])(implicit cec: ConcurrentExecutionContext): CFuture[(T, U)] = { + val zippedFut: Future[(T, U)] = Execution.failFastZip(future, other.future) + val cancelHandler = cancellationHandler.compose(other.cancellationHandler) + + CFuture(zippedFut, cancelHandler) + } +} + +object CFuture { + def successful[T](result: T): CFuture[T] = + CFuture(Future.successful(result), CancellationHandler.empty) + + def failed(t: Throwable): CFuture[Nothing] = { + val f = Future.failed(t) + CFuture(f, CancellationHandler.empty) + } + + def uncancellable[T](fut: Future[T]): CFuture[T] = + CFuture(fut, CancellationHandler.empty) + + def fromFuture[T](fut: Future[CFuture[T]])(implicit cec: ConcurrentExecutionContext): CFuture[T] = + CFuture(fut.flatMap(_.future), CancellationHandler.fromFuture(fut.map(_.cancellationHandler))) + + /** + * Use our internal faster failing zip function rather than the standard one due to waiting + */ + def failFastSequence[T]( + t: Iterable[CFuture[T]] + )(implicit cec: ConcurrentExecutionContext): CFuture[List[T]] = + 
t.foldLeft(CFuture.successful(Nil: List[T])) { (f, i) => + f.zip(i).map { case (tail, h) => h :: tail } + }.map(_.reverse) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/CPromise.scala b/scalding-base/src/main/scala/com/twitter/scalding/CPromise.scala new file mode 100644 index 0000000000..d06f6b04dc --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/CPromise.scala @@ -0,0 +1,25 @@ +package com.twitter.scalding + +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} + +/** + * Represents a cancellable promise. + */ +case class CPromise[T](promise: Promise[T], cancellationHandler: Promise[CancellationHandler]) { + + /** + * Creates a CFuture using the given promises. + */ + def cfuture: CFuture[T] = + CFuture(promise.future, CancellationHandler.fromFuture(cancellationHandler.future)) + + def completeWith(other: CFuture[T]): this.type = { + // fulfill the main and cancellation handler promises + promise.completeWith(other.future) + cancellationHandler.completeWith(Future.successful(other.cancellationHandler)) + this + } +} +object CPromise { + def apply[T](): CPromise[T] = CPromise(Promise[T](), Promise[CancellationHandler]()) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/CancellationHandler.scala b/scalding-base/src/main/scala/com/twitter/scalding/CancellationHandler.scala new file mode 100644 index 0000000000..f5967f6217 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/CancellationHandler.scala @@ -0,0 +1,26 @@ +package com.twitter.scalding + +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} + +sealed trait CancellationHandler { outer => + def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] + def compose(other: CancellationHandler): CancellationHandler = new CancellationHandler { + override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = + other.stop().zip(outer.stop()).map(_ => ()) + } 
+} + +object CancellationHandler { + val empty: CancellationHandler = new CancellationHandler { + def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = Future.successful(()) + } + + def fromFn(fn: ConcurrentExecutionContext => Future[Unit]): CancellationHandler = new CancellationHandler { + override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = fn(ec) + } + + def fromFuture(f: Future[CancellationHandler]): CancellationHandler = new CancellationHandler { + override def stop()(implicit ec: ConcurrentExecutionContext): Future[Unit] = + f.flatMap(_.stop()) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/Config.scala b/scalding-base/src/main/scala/com/twitter/scalding/Config.scala new file mode 100644 index 0000000000..104afe3a58 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/Config.scala @@ -0,0 +1,530 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import com.twitter.scalding.serialization.{RequireOrderedSerializationMode, Serialization} +import com.twitter.scalding.serialization.macros.impl.BinaryOrdering.{ordSer => serializer} +import java.util.Base64 +import java.security.MessageDigest +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} +import scala.util.{Failure, Success, Try} + +/** + * This is a wrapper class on top of Map[String, String] + */ +abstract class Config extends Serializable { + + import Config._ // get the constants + def toMap: Map[String, String] + + def get(key: String): Option[String] = toMap.get(key) + def +(kv: (String, String)): Config = Config(toMap + kv) + def ++(that: Config): Config = Config(toMap ++ that.toMap) + def -(k: String): Config = Config(toMap - k) + def update[R](k: String)(fn: Option[String] => (Option[String], R)): (R, Config) = + fn(get(k)) match { + case (Some(v), r) => (r, this + (k -> v)) + case (None, r) => (r, this - k) + } + + def getBoolean(key: String, orElse: => Boolean): Boolean = + get(key).map(_.toBoolean).getOrElse(orElse) + + /** + * This is a name that if present is passed to flow.setName, which should appear in the job tracker. + */ + def getCascadingAppName: Option[String] = get(CascadingAppName) + def setCascadingAppName(name: String): Config = + this + (CascadingAppName -> name) + + def setCascadingAppId(id: String): Config = + this + (CascadingAppId -> id) + + /** + * Non-fat-jar use cases require this, BUT using it with fat jars can cause problems. 
It is not set by + * default, but if you have problems you might need to set the Job class here Consider also setting this + * same class here: setScaldingFlowClass + */ + def setCascadingAppJar(clazz: Class[_]): Config = + this + (CascadingAppAppJarClass -> clazz.getName) + + /** + * Returns None if not set, otherwise reflection is used to create the Class.forName + */ + def getCascadingAppJar: Option[Try[Class[_]]] = + getClassForKey(CascadingAppAppJarClass) + + def getClassForKey(k: String): Option[Try[Class[_]]] = + get(k).map { str => + try { + Success( + // Make sure we are using the class-loader for the current thread + Class.forName(str, true, Thread.currentThread().getContextClassLoader) + ) + } catch { case err: Throwable => Failure(err) } + } + + @deprecated("Use setRequireOrderedSerializationMode", "12/14/17") + def setRequireOrderedSerialization(b: Boolean): Config = + this + (ScaldingRequireOrderedSerialization -> (b.toString)) + + @deprecated("Use getRequireOrderedSerializationMode", "12/14/17") + def getRequireOrderedSerialization: Boolean = + getRequireOrderedSerializationMode == Some(RequireOrderedSerializationMode.Fail) + + /** + * Set this configuration option to require all grouping/cogrouping to use OrderedSerialization + */ + def setRequireOrderedSerializationMode(r: Option[RequireOrderedSerializationMode]): Config = + r.map { v => + this + (ScaldingRequireOrderedSerialization -> (v.toString)) + }.getOrElse(this) + + def getRequireOrderedSerializationMode: Option[RequireOrderedSerializationMode] = + get(ScaldingRequireOrderedSerialization) + .map(_.toLowerCase()) + .collect { + case "true" => RequireOrderedSerializationMode.Fail // backwards compatibility + case "fail" => RequireOrderedSerializationMode.Fail + case "log" => RequireOrderedSerializationMode.Log + } + + def getArgs: Args = get(Config.ScaldingJobArgsSerialized) match { + case None => new Args(Map.empty) + case Some(str) => + val bytes = Base64.getDecoder.decode(str) + val bais = 
new ByteArrayInputStream(bytes) + new Args(argMapSerializer.read(bais).get) + } + + def setArgs(args: Args): Config = { + val mapSer: Serialization[Map[String, List[String]]] = serializer[Map[String, List[String]]] + val baos = new ByteArrayOutputStream() + argMapSerializer.write(baos, args.m).get + val bytes = baos.toByteArray + val str = Base64.getEncoder.encodeToString(bytes) + + this + .+(Config.ScaldingJobArgs -> args.toString) + .+(Config.ScaldingJobArgsSerialized -> str) + } + + def getOptimizationPhases: Option[Try[typed.OptimizationPhases]] = + getClassForKey(Config.OptimizationPhases).map { tryClass => + tryClass.flatMap { clazz => + Try(clazz.newInstance().asInstanceOf[typed.OptimizationPhases]) + } + } + + def setOptimizationPhases(clazz: Class[_ <: typed.OptimizationPhases]): Config = + setOptimizationPhasesFromName(clazz.getName) + + def setOptimizationPhasesFromName(className: String): Config = + this + (Config.OptimizationPhases -> className) + + def getUniqueIds: Set[UniqueID] = + get(UniqueID.UNIQUE_JOB_ID) + .map(str => str.split(",").toSet[String].map(UniqueID(_))) + .getOrElse(Set.empty) + + /* + * This is *required* if you are using counters. You must use + * the same UniqueID as you used when defining your jobs. 
+ */ + def addUniqueId(u: UniqueID): Config = + update(UniqueID.UNIQUE_JOB_ID) { + case None => (Some(u.get), ()) + case Some(str) => (Some((StringUtility.fastSplit(str, ",").toSet + u.get).mkString(",")), ()) + }._2 + + /** + * Allocate a new UniqueID if there is not one present + */ + def ensureUniqueId: (UniqueID, Config) = + update(UniqueID.UNIQUE_JOB_ID) { + case None => + val uid = UniqueID.getRandom + (Some(uid.get), uid) + case s @ Some(str) => + (s, UniqueID(StringUtility.fastSplit(str, ",").head)) + } + + /** + * Set an ID to be shared across this usage of run for Execution + */ + def setScaldingExecutionId(id: String): Config = + this.+(ScaldingExecutionId -> id) + + def getScaldingExecutionId: Option[String] = + get(ScaldingExecutionId) + + /* + * Add this class name and the md5 hash of it into the config + */ + def setScaldingFlowClass(clazz: Class[_]): Config = + this + .+(ScaldingFlowClassName -> clazz.getName) + .+(ScaldingFlowClassSignature -> Config.md5Identifier(clazz)) + + def setScaldingFlowCounterValue(value: Long): Config = + this + (ScaldingFlowCounterValue -> value.toString) + + def getScaldingFlowCounterValue: Option[Long] = + get(ScaldingFlowCounterValue).map(_.toLong) + + /** + * Prepend an estimator so it will be tried first. If it returns None, the previously-set estimators will be + * tried in order. + */ + def addReducerEstimator[T](cls: Class[T]): Config = + addReducerEstimator(cls.getName) + + /** + * Prepend an estimator so it will be tried first. If it returns None, the previously-set estimators will be + * tried in order. 
+ */ + def addReducerEstimator(clsName: String): Config = + update(Config.ReducerEstimators) { + case None => (Some(clsName), ()) + case Some(lst) => (Some(s"$clsName,$lst"), ()) + }._2 + + /** Set the entire list of reducer estimators (overriding the existing list) */ + def setReducerEstimators(clsList: String): Config = + this + (Config.ReducerEstimators -> clsList) + + /** Get the number of reducers (this is the parameter Hadoop will use) */ + def getNumReducers: Option[Int] = get(Config.HadoopNumReducers).map(_.toInt) + def setNumReducers(n: Int): Config = this + (Config.HadoopNumReducers -> n.toString) + + /** Set username from System.used for querying hRaven. */ + def setHRavenHistoryUserName: Config = + this + (Config.HRavenHistoryUserName -> System.getProperty("user.name")) + + def setHashJoinAutoForceRight(b: Boolean): Config = + this + (HashJoinAutoForceRight -> (b.toString)) + + def getHashJoinAutoForceRight: Boolean = + getBoolean(HashJoinAutoForceRight, false) + + /** + * Set to true to enable very verbose logging during FileSource's validation and planning. This can help + * record what files were present / missing at runtime. Should only be enabled for debugging. + */ + def setVerboseFileSourceLogging(b: Boolean): Config = + this + (VerboseFileSourceLoggingKey -> b.toString) + + def getSkipNullCounters: Boolean = + getBoolean(SkipNullCounters, false) + + /** + * If this is true, on hadoop, when we get a null Counter for a given name, we just ignore the counter + * instead of NPE + */ + def setSkipNullCounters(boolean: Boolean): Config = + this + (SkipNullCounters -> boolean.toString) + + /** + * When this value is true, all temporary output is removed when the outer-most execution completes, not on + * JVM shutdown. + * + * When you do .forceToDiskExecution or .toIterableExecution we need to materialize the data somewhere. 
We + * can't be sure that when the outer most execution is complete that all reads have been done, since they + * could escape the value of the Execution. If you know no such reference escapes, it is safe to set to + * true. + * + * Note, this is *always* safe for Execution[Unit], a common value. + */ + def setExecutionCleanupOnFinish(boolean: Boolean): Config = + this + (ScaldingExecutionCleanupOnFinish -> boolean.toString) + + /** + * should we cleanup temporary files when the outer most Execution is run. + * + * Not safe if the outer-most execution returns a TypedPipe or Iterable derived from a forceToDiskExecution + * or a toIterableExecution + */ + def getExecutionCleanupOnFinish: Boolean = + getBoolean(ScaldingExecutionCleanupOnFinish, false) + + /** + * Enable/Disable optimization of `Exception` graph. + */ + def setExecutionOptimization(boolean: Boolean): Config = + this + (ScaldingExecutionOptimizationEnabled -> boolean.toString) + + /** + * Should we optimize of `Execution` graph. 
+ */ + def getExecutionOptimization: Boolean = + getBoolean(ScaldingExecutionOptimizationEnabled, true) + + // we use Config as a key in Execution caches so we + // want to avoid recomputing it repeatedly + override lazy val hashCode = toMap.hashCode + override def equals(that: Any) = that match { + case thatConf: Config => + if (this eq thatConf) true + else if (hashCode != thatConf.hashCode) false + else toMap == thatConf.toMap + case _ => false + } + + /** + * Enable/Disable check of taps that we use ScaldingHfs before openForRead + */ + def setCheckHfsTaps(boolean: Boolean): Config = + this + (Config.ScaldingCheckHfsTaps -> boolean.toString) + + /** + * Should we check taps that we use ScaldingHfs before openForRead + */ + def getCheckHfsTaps: Boolean = + getBoolean(Config.ScaldingCheckHfsTaps, false) + + /* + * Used in joins to determine how much of the "right hand side" of + * the join to keep in memory + */ + def setListSpillThreshold(count: Int): Config = + this + (CascadingSpillablePropListThreshold -> count.toString) + + /* + * Used in hashJoin/joinWithTiny to determine how big the map + * can be before spilling to disk. Generally, as big as you can + * allow here without OOM will help performance. + */ + def setMapSpillThreshold(count: Int): Config = + this + (CascadingSpillablePropMapThreshold -> count.toString) + + /* + * Used in map-side aggregation of associative operations (Semigroup/Monoid) + * This controls how many keys are in an in-memory cache. 
If a significant + * probability mass of the key-space is far bigger than this value, it + * does not help much (and may hurt, so experiment with disabling to get + * the best results + */ + def setMapSideAggregationThreshold(count: Int): Config = + this + (CascadingAggregateByThreshold -> count.toString) + + def getMapSideAggregationThreshold: Option[Int] = + get(CascadingAggregateByThreshold).map(_.toInt) + + def getScaldingVersion: Option[String] = get(ScaldingVersion) + def setScaldingVersion: Config = + (this + .+(ScaldingVersion -> BuildInfo.version)) + .+( + // This is setting a property for cascading/driven + (CascadingAppFrameworks -> ("scalding:" + BuildInfo.version)) + ) +} + +object Config { + val CascadingAppName: String = "cascading.app.name" + val CascadingAppId: String = "cascading.app.id" + val CascadingAppFrameworks: String = "cascading.app.frameworks" + val CascadingAppAppJarClass: String = "cascading.app.appjar.class" + val CascadingAggregateByThreshold: String = "cascading.aggregateby.threshold" + val CascadingSerializationTokens = "cascading.serialization.tokens" + val CascadingSpillablePropListThreshold: String = "cascading.spill.list.threshold" + val CascadingSpillablePropMapThreshold: String = "cascading.spill.map.threshold" + + val IoSerializationsKey: String = "io.serializations" + val ScaldingFlowClassName: String = "scalding.flow.class.name" + val ScaldingFlowClassSignature: String = "scalding.flow.class.signature" + + /** + * This is incremented every time a cascading flow is run as an Execution + */ + val ScaldingFlowCounterValue: String = "scalding.flow.counter.value" + val ScaldingFlowSubmittedTimestamp: String = "scalding.flow.submitted.timestamp" + val ScaldingExecutionId: String = "scalding.execution.uuid" + val ScaldingExecutionCleanupOnFinish: String = "scalding.execution.cleanup.onfinish" + val ScaldingExecutionOptimizationEnabled: String = "scalding.execution.optimization.enabled" + val ScaldingJobArgs: String = 
"scalding.job.args" + val ScaldingJobArgsSerialized: String = "scalding.job.argsserialized" + val ScaldingVersion: String = "scalding.version" + val ScaldingCheckHfsTaps: String = "scalding.taps.check.hfs" + val SkipNullCounters: String = "scalding.counters.skipnull" + val HRavenHistoryUserName: String = "hraven.history.user.name" + val ScaldingRequireOrderedSerialization: String = "scalding.require.orderedserialization" + val FlowListeners: String = "scalding.observability.flowlisteners" + val FlowStepListeners: String = "scalding.observability.flowsteplisteners" + val FlowStepStrategies: String = "scalding.strategies.flowstepstrategies" + val VerboseFileSourceLoggingKey: String = "scalding.filesource.verbose.logging" + val OptimizationPhases: String = "scalding.optimization.phases" + val RuntimeFrameworkKey = "mapreduce.framework.name" + val RuntimeFrameworkValueLocal = "local" + + /** + * Parameter that actually controls the number of reduce tasks. Be sure to set this in the JobConf for the + * *step* not the flow. + */ + val HadoopNumReducers = "mapred.reduce.tasks" + + /** Name of parameter to specify which class to use as the default estimator. */ + val ReducerEstimators = "scalding.reducer.estimator.classes" + + /** Whether estimator should override manually-specified reducers. */ + val ReducerEstimatorOverride = "scalding.reducer.estimator.override" + + /** Whether the number of reducers has been set explicitly using a `withReducers` */ + val WithReducersSetExplicitly = "scalding.with.reducers.set.explicitly" + + /** Name of parameter to specify which class to use as the default estimator. 
*/ + val MemoryEstimators = "scalding.memory.estimator.classes" + + /** Hadoop map memory */ + val MapMemory = "mapreduce.map.memory.mb" + + /** Hadoop map java opts */ + val MapJavaOpts = "mapreduce.map.java.opts" + + /** Hadoop reduce java opts */ + val ReduceJavaOpts = "mapreduce.reduce.java.opts" + + /** Hadoop reduce memory */ + val ReduceMemory = "mapreduce.reduce.memory.mb" + + /** Manual description for use in .dot and MR step names set using a `withDescription`. */ + val PipeDescriptions = "scalding.pipe.descriptions" + val StepDescriptions = "scalding.step.descriptions" + + /** + * Parameter that can be used to determine behavior on the rhs of a hashJoin. If true, we try to guess when + * to auto force to disk before a hashJoin else (the default) we don't try to infer this and the behavior + * can be dictated by the user manually calling forceToDisk on the rhs or not as they wish. + */ + val HashJoinAutoForceRight: String = "scalding.hashjoin.autoforceright" + + val empty: Config = Config(Map.empty) + + /* + * Here is a config that will work, but perhaps is not optimally tuned for + * your cluster + */ + val default: Config = + Config.empty + .setListSpillThreshold(100 * 1000) + .setMapSpillThreshold(100 * 1000) + .setMapSideAggregationThreshold(100 * 1000) + .setScaldingVersion + + /* + * Extensions to the Default Config to tune it for unit tests + */ + def unitTestDefault: Config = + Config.default ++ Config.from( + Map( + ("cascading.update.skip" -> "true"), + (Config.RuntimeFrameworkKey -> Config.RuntimeFrameworkValueLocal) + ) + ) + + def apply(m: Map[String, String]): Config = new Config { def toMap = m } + /* + * Implicits cannot collide in name, so making apply impliict is a bad idea + */ + implicit def from(m: Map[String, String]): Config = apply(m) + + /** + * Merge Config.default with Hadoop config from the mode (if in Hadoop mode) + */ + def defaultFrom(mode: Mode): Config = + Config.from(mode.defaultConfig) + + /** + * Returns all the 
non-string keys on the left, the string keys/values on the right + */ + def stringsFrom[K >: String, V >: String](m: Map[K, V]): (Map[K, V], Map[String, String]) = + m.foldLeft((Map.empty[K, V], Map.empty[String, String])) { case ((kvs, conf), kv) => + kv match { + case (ks: String, vs: String) => (kvs, conf + (ks -> vs)) + case _ => (kvs + kv, conf) + } + } + + /** + * Either union these two, or return the keys that overlap + */ + def disjointUnion[K >: String, V >: String](m: Map[K, V], conf: Config): Either[Set[String], Map[K, V]] = { + val asMap = conf.toMap.toMap[K, V] // linter:disable:TypeToType // we are upcasting K, V + val duplicateKeys = m.keySet & asMap.keySet + if (duplicateKeys.isEmpty) Right(m ++ asMap) + else Left(conf.toMap.keySet.filter(duplicateKeys(_))) // make sure to return Set[String], and not cast + } + + /** + * This overwrites any keys in m that exist in config. + */ + def overwrite[K >: String, V >: String](m: Map[K, V], conf: Config): Map[K, V] = + m ++ (conf.toMap.toMap[K, V]) // linter:disable:TypeToType // we are upcasting K, V + + /* + * This can help with versioning Class files into configurations if they are + * logged. This allows you to detect changes in the job logic that may correlate + * with changes in performance + */ + def md5Identifier(clazz: Class[_]): String = { + def fromInputStream(s: java.io.InputStream): Array[Byte] = + Stream.continually(s.read).takeWhile(-1 !=).map(_.toByte).toArray + + def toHexString(bytes: Array[Byte]): String = bytes.map("%02X".format(_)).mkString + + def md5Hex(bytes: Array[Byte]): String = { + val md = MessageDigest.getInstance("MD5") + md.update(bytes) + toHexString(md.digest) + } + + val classAsPath = clazz.getName.replace(".", "/") + ".class" + val is = clazz.getClassLoader.getResourceAsStream(classAsPath) + val bytes = fromInputStream(is) + is.close() + md5Hex(bytes) + } + + /* + * Legacy code that uses Map[AnyRef, AnyRef] can call this + * function to get a Config. 
+ * If there are unrecognized non-string values, this may fail. + */ + def tryFrom(maybeConf: Map[AnyRef, AnyRef]): Try[Config] = { + val (nonStrings, strings) = Config.stringsFrom(maybeConf) + val initConf = Config.from(strings) + + (nonStrings + .get(CascadingAppAppJarClass) match { + case Some(clazz) => + // Again, the _ causes problem with Try + try { + val cls = classOf[Class[_]].cast(clazz) + Success((nonStrings - CascadingAppAppJarClass, initConf.setCascadingAppJar(cls))) + } catch { + case err: Throwable => Failure(err) + } + case None => Success((nonStrings, initConf)) + }) + .flatMap { case (unhandled, withJar) => + if (unhandled.isEmpty) Success(withJar) + else Failure(new Exception("unhandled configurations: " + unhandled.toString)) + } + } + + private def argMapSerializer: Serialization[Map[String, List[String]]] = + serializer[Map[String, List[String]]] +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/CumulativeSum.scala b/scalding-base/src/main/scala/com/twitter/scalding/CumulativeSum.scala new file mode 100644 index 0000000000..d12074b74a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/CumulativeSum.scala @@ -0,0 +1,97 @@ +package com.twitter.scalding.typed + +import com.twitter.algebird._ + +/** + * Extension for TypedPipe to add a cumulativeSum method. Given a TypedPipe with T = (GroupField, (SortField, + * SummableField)) cumulaitiveSum will return a SortedGrouped with the SummableField accumulated according to + * the sort field. 
eg: ('San Francisco', (100, 100)), ('San Francisco', (101, 50)), ('San Francisco', (200, + * 200)), ('Vancouver', (100, 50)), ('Vancouver', (101, 300)), ('Vancouver', (200, 100)) becomes ('San + * Francisco', (100, 100)), ('San Francisco', (101, 150)), ('San Francisco', (200, 300)), ('Vancouver', (100, + * 50)), ('Vancouver', (101, 350)), ('Vancouver', (200, 450)) + * + * If you provide cumulativeSum a partition function you get the same result but you allow for more than one + * reducer per group. This is useful for when you have a single group that has a very large number of entries. + * For example in the previous example if you gave a partition function of the form { _ / 100 } then you would + * never have any one reducer deal with more than 2 entries. + */ +object CumulativeSum { + implicit def toCumulativeSum[K, U, V](pipe: TypedPipe[(K, (U, V))]): CumulativeSumExtension[K, U, V] = + new CumulativeSumExtension(pipe) + + class CumulativeSumExtension[K, U, V](val pipe: TypedPipe[(K, (U, V))]) { + + /** Takes a sortable field and a monoid and returns the cumulative sum of that monoid * */ + def cumulativeSum(implicit + sg: Semigroup[V], + ordU: Ordering[U], + ordK: Ordering[K] + ): SortedGrouped[K, (U, V)] = + pipe.group + .sortBy { case (u, _) => u } + .scanLeft(Nil: List[(U, V)]) { case (acc, (u, v)) => + acc match { + case List((previousU, previousSum)) => List((u, sg.plus(previousSum, v))) + case _ => List((u, v)) + } + } + .flattenValues + + /** + * An optimization of cumulativeSum for cases when a particular key has many entries. Requires a sortable + * partitioning of U. Accomplishes the optimization by not requiring all the entries for a single key to + * go through a single scan. Instead requires the sums of the partitions for a single key to go through a + * single scan. 
+ */ + def cumulativeSum[S](partition: U => S)(implicit + ordS: Ordering[S], + sg: Semigroup[V], + ordU: Ordering[U], + ordK: Ordering[K] + ): TypedPipe[(K, (U, V))] = { + + val sumPerS = pipe + .map { case (k, (u, v)) => (k, partition(u)) -> v } + .sumByKey + .map { case ((k, s), v) => (k, (s, v)) } + .group + .sortBy { case (s, v) => s } + .scanLeft(None: Option[(Option[V], V, S)]) { case (acc, (s, v)) => + acc match { + case Some((previousPreviousSum, previousSum, previousS)) => { + Some((Some(previousSum), sg.plus(v, previousSum), s)) + } + case _ => Some((None, v, s)) + } + } + .flatMap { case (k, maybeAcc) => + for { + acc <- maybeAcc + previousSum <- acc._1 + } yield { (k, acc._3) -> (None, previousSum) } + } + + val summands = pipe + .map { case (k, (u, v)) => + (k, partition(u)) -> (Some(u), v) + } ++ sumPerS + + summands.group + .sortBy { case (u, _) => u } + .scanLeft(None: Option[(Option[U], V)]) { case (acc, (maybeU, v)) => + acc match { + case Some((_, previousSum)) => Some((maybeU, sg.plus(v, previousSum))) + case _ => Some((maybeU, v)) + } + } + .flatMap { case ((k, s), acc) => + for { + uv <- acc + u <- uv._1 + } yield { + (k, (u, uv._2)) + } + } + } + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/Execution.scala b/scalding-base/src/main/scala/com/twitter/scalding/Execution.scala new file mode 100644 index 0000000000..d7fedf1571 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/Execution.scala @@ -0,0 +1,1152 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import com.twitter.scalding.typed.{Output, TypedPipe} +import com.twitter.scalding.dagon.{Dag, Id, Rule} +import com.twitter.algebird.monad.Trampoline +import com.twitter.algebird.{Monad, Monoid, Semigroup} +import com.twitter.scalding.typed.functions.{ConsList, ReverseList} +import com.twitter.scalding.dagon.{Memoize, RefPair} +import java.io.Serializable +import java.util.UUID +import scala.collection.mutable +import scala.concurrent.duration.SECONDS +import scala.concurrent.{ + blocking, + duration, + Await, + ExecutionContext => ConcurrentExecutionContext, + Future, + Promise +} +import scala.util.{Failure, Success, Try} +import scala.util.control.NonFatal +import scala.util.hashing.MurmurHash3 + +/** + * Execution[T] represents and computation that can be run and will produce a value T and keep track of + * counters incremented inside of TypedPipes using a Stat. + * + * Execution[T] is the recommended way to compose multistep computations that involve branching (if/then), + * intermediate calls to remote services, file operations, or looping (e.g. testing for convergence). + * + * Library functions are encouraged to implement functions from TypedPipes or ValuePipes to Execution[R] for + * some result R. Refrain from calling run in library code. Let the caller of your library call run. + * + * Note this is a Monad, meaning flatMap composes in series as you expect. It is also an applicative functor, + * which means zip (called join in some libraries) composes two Executions is parallel. Prefer zip to flatMap + * if you want to run two Executions in parallel. 
+ */ +sealed trait Execution[+T] extends Serializable { self: Product => + import Execution.{ + EvalCache, + FlatMapped, + GetCounters, + Mapped, + OnComplete, + RecoverWith, + ResetCounters, + Zipped + } + + /** + * Lift an Execution into a Try + * + * When this function is called the Execution should never be failed instead only the Try. + */ + def liftToTry: Execution[Try[T]] = + map(e => Success(e)).recoverWith { case throwable => Execution.from(Failure(throwable)) } + + /** + * Scala uses the filter method in for syntax for pattern matches that can fail. If this filter is false, + * the result of run will be an exception in the future + */ + def filter(pred: T => Boolean): Execution[T] = + flatMap { + case good if pred(good) => Execution.from(good) + case failed => Execution.from(sys.error("Filter failed on: " + failed.toString)) + } + + /** + * First run this Execution, then move to the result of the function + */ + def flatMap[U](fn: T => Execution[U]): Execution[U] = + FlatMapped(this, fn) + + /** + * This is the same as flatMap(identity) + */ + def flatten[U](implicit ev: T <:< Execution[U]): Execution[U] = + flatMap(ev) + + /** + * Apply a pure function to the result. This may not be called if subsequently the result is discarded with + * .unit For side effects see onComplete. + */ + def map[U](fn: T => U): Execution[U] = + Mapped(this, fn) + + /** + * Reads the counters into the value, but does not reset them. You may want .getAndResetCounters. + */ + def getCounters: Execution[(T, ExecutionCounters)] = + GetCounters(this) + + /** + * Reads the counters and resets them to zero. Probably what you want in a loop that is using counters to + * check for convergence. + */ + def getAndResetCounters: Execution[(T, ExecutionCounters)] = + getCounters.resetCounters + + /** + * This function is called when the current run is completed. This is only a side effect (see unit return). + * + * ALSO You must .run the result. 
If you throw away the result of this call, your fn will never be called. + * When you run the result, the Future you get will not be complete unless fn has completed running. If fn + * throws, it will be handled be the scala.concurrent.ExecutionContext.reportFailure NOT by returning a + * Failure in the Future. + */ + def onComplete(fn: Try[T] => Unit): Execution[T] = OnComplete(this, fn) + + /** + * This allows you to handle a failure by giving a replacement execution in some cases. This execution may + * be a retry if you know that your execution can have spurious errors, or it could be a constant or an + * alternate way to compute. Be very careful creating looping retries that could hammer your cluster when + * the data is missing or when when there is some real problem with your job logic. + */ + def recoverWith[U >: T](rec: PartialFunction[Throwable, Execution[U]]): Execution[U] = + RecoverWith(this, rec) + + /** + * Resets the counters back to zero. This is useful if you want to reset before a zip or a call to flatMap + */ + def resetCounters: Execution[T] = + ResetCounters(this) + + /** + * This causes the Execution to occur. The result is not cached, so each call to run will result in the + * computation being re-run. Avoid calling this until the last possible moment by using flatMap, zip and + * recoverWith. + * + * Seriously: pro-style is for this to be called only once in a program. 
+ */ + final def run(conf: Config, mode: Mode)(implicit cec: ConcurrentExecutionContext): Future[T] = { + val writer: Execution.Writer = mode.newWriter() + val ec = new EvalCache(writer) + val confWithId = conf.setScaldingExecutionId(UUID.randomUUID.toString) + + val exec = Execution.optimize(conf, this) + // get on Trampoline + val CFuture(fut, cancelHandler) = exec.runStats(confWithId, mode, ec)(cec).get + // When the final future in complete we stop the submit thread + val result = fut.map(_._1).andThen { case t => + if (t.isFailure) { + blocking { + // cancel running executions if this was a failure + Await.ready(cancelHandler.stop(), duration.Duration(30, SECONDS)) + } + } + writer.finished() + } + // wait till the end to start the thread in case the above throws + writer.start() + result + } + + /** + * This is the internal method that must be implemented Given a config, mode, and cache of evaluations for + * this config and mode, return the new cache with as much evaluation as possible before the future + * completes, and a future of the result, counters and cache after the future is complete + */ + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ): Trampoline[CFuture[(T, Map[Long, ExecutionCounters])]] + + /** + * This is convenience for when we don't care about the result. like .map(_ => ()) + */ + def unit: Execution[Unit] = map(_ => ()) + + /** + * This waits synchronously on run, using the global execution context Avoid calling this if possible, + * prefering run or just Execution composition. Every time someone calls this, be very suspect. It is always + * code smell. Very seldom should you need to wait on a future. + */ + def waitFor(conf: Config, mode: Mode): Try[T] = + Try(Await.result(run(conf, mode)(ConcurrentExecutionContext.global), duration.Duration.Inf)) + + /** + * This is here to silence warnings in for comprehensions, but is identical to .filter. 
+ * + * Users should never directly call this method, call .filter + */ + def withFilter(p: T => Boolean): Execution[T] = filter(p) + /* + * run this and that in parallel, without any dependency. This will + * be done in a single cascading flow if possible. + */ + def zip[U](that: Execution[U]): Execution[(T, U)] = + Zipped(this, that) + + override val hashCode: Int = MurmurHash3.productHash(self) + + /** + * since executions, particularly Zips can cause two executions to merge we can have exponential cost to + * computing equals if we are not careful + */ + override def equals(other: Any): Boolean = + other match { + case otherEx: Execution[_] => + if (otherEx eq this) true + else if (otherEx.hashCode != hashCode) false + else { + // If we get here, we have two executions that either + // collide in hashcode, or they are truely equal. Since + // collisions are rare, most of these will be true equality + // so we will fully walk the graph. If we don't remember + // the branches we go down, Zipped will be very expensize + import Execution._ + val fn = Memoize.function[RefPair[Execution[Any], Execution[Any]], Boolean] { + case (RefPair(a, b), _) if a eq b => true + case (RefPair(BackendExecution(fn0), BackendExecution(fn1)), rec) => + fn0 == fn1 + case (RefPair(FlatMapped(ex0, fn0), FlatMapped(ex1, fn1)), rec) => + (fn0 == fn1) && rec(RefPair(ex0, ex1)) + case (RefPair(FutureConst(fn0), FutureConst(fn1)), rec) => + fn0 == fn1 + case (RefPair(GetCounters(ex0), GetCounters(ex1)), rec) => + rec(RefPair(ex0, ex1)) + case (RefPair(Mapped(ex0, fn0), Mapped(ex1, fn1)), rec) => + (fn0 == fn1) && rec(RefPair(ex0, ex1)) + case (RefPair(OnComplete(ex0, fn0), OnComplete(ex1, fn1)), rec) => + (fn0 == fn1) && rec(RefPair(ex0, ex1)) + case (RefPair(ReaderExecution, ReaderExecution), _) => true + case (RefPair(RecoverWith(ex0, fn0), RecoverWith(ex1, fn1)), rec) => + (fn0 == fn1) && rec(RefPair(ex0, ex1)) + case (RefPair(ResetCounters(ex0), ResetCounters(ex1)), rec) => + 
rec(RefPair(ex0, ex1)) + case (RefPair(TransformedConfig(ex0, fn0), TransformedConfig(ex1, fn1)), rec) => + (fn0 == fn1) && rec(RefPair(ex0, ex1)) + case (RefPair(UniqueIdExecution(fn0), UniqueIdExecution(fn1)), _) => + fn0 == fn1 + case (RefPair(WithNewCache(ex0), WithNewCache(ex1)), rec) => + rec(RefPair(ex0, ex1)) + case (RefPair(WriteExecution(h0, t0, f0), WriteExecution(h1, t1, f1)), _) => + (f0 == f1) && ((h0 :: t0) == (h1 :: t1)) + case (RefPair(Zipped(a0, b0), Zipped(a1, b1)), rec) => + rec(RefPair(a0, a1)) && rec(RefPair(b0, b1)) + case (rp, _) => + require(rp._1.getClass != rp._2.getClass) + false // the executions are not of the same type + } + fn(RefPair(this, otherEx)) + } + case _ => false + } +} + +/** + * Execution has many methods for creating Execution[T] instances, which are the preferred way to compose + * computations in scalding libraries. + */ +object Execution { + private[Execution] class AsyncSemaphore(initialPermits: Int = 0) { + private[this] val waiters = new mutable.Queue[() => Unit] + private[this] var availablePermits = initialPermits + + private[Execution] class SemaphorePermit { + def release() = + AsyncSemaphore.this.synchronized { + availablePermits += 1 + if (availablePermits > 0 && waiters.nonEmpty) { + availablePermits -= 1 + waiters.dequeue()() + } + } + } + + def acquire(): Future[SemaphorePermit] = { + val promise = Promise[SemaphorePermit]() + + def setAcquired(): Unit = + promise.success(new SemaphorePermit) + + synchronized { + if (availablePermits > 0) { + availablePermits -= 1 + setAcquired() + } else { + waiters.enqueue(setAcquired) + } + } + + promise.future + } + } + + private def optimize[A](conf: Config, ex: Execution[A]): Execution[A] = + if (conf.getExecutionOptimization) { + ExecutionOptimizationRules.stdOptimizations(ex) + } else { + ex + } + + /** + * This is an instance of Monad for execution so it can be used in functions that apply to all Monads + */ + implicit object ExecutionMonad extends Monad[Execution] 
{ + override def apply[T](t: T): Execution[T] = Execution.from(t) + override def map[T, U](e: Execution[T])(fn: T => U): Execution[U] = e.map(fn) + override def flatMap[T, U](e: Execution[T])(fn: T => Execution[U]): Execution[U] = e.flatMap(fn) + override def join[T, U](t: Execution[T], u: Execution[U]): Execution[(T, U)] = t.zip(u) + } + + def withConfig[T](ex: Execution[T])(c: Config => Config): Execution[T] = + TransformedConfig(ex, c) + + /** + * This function allows running the passed execution with its own cache. This will mean anything inside + * won't benefit from Execution's global attempts to avoid repeated executions. + * + * The main use case here is when generating a lot of Execution results which are large. Executions caching + * in this case can lead to out of memory errors as the cache keeps references to many heap objects. + * + * Ex. (0 until 1000).map { _ => Execution.withNewCache(myLargeObjectProducingExecution)} + */ + def withNewCache[T](ex: Execution[T]): Execution[T] = + WithNewCache(ex) + + /** + * This is the standard semigroup on an Applicative (zip, then inside the Execution do plus) + */ + implicit def semigroup[T: Semigroup]: Semigroup[Execution[T]] = Semigroup.from[Execution[T]] { (a, b) => + a.zip(b).map { case (ta, tb) => Semigroup.plus(ta, tb) } + } + + /** + * This is the standard monoid on an Applicative (zip, then inside the Execution do plus) useful to combine + * unit Executions: Monoid.sum(ex1, ex2, ex3, ex4): Execution[Unit] where each are exi are Execution[Unit] + */ + implicit def monoid[T: Monoid]: Monoid[Execution[T]] = Monoid.from(Execution.from(Monoid.zero[T])) { + (a, b) => + a.zip(b).map { case (ta, tb) => Monoid.plus(ta, tb) } + } + + /** + * This is a mutable state that is kept internal to an execution as it is evaluating. 
+ */ + private[scalding] class EvalCache(val writer: Execution.Writer) { + + type Counters = Map[Long, ExecutionCounters] + private[this] val cache = + new FutureCacheGeneric[(Config, Execution[Any]), (Any, Counters), CPromise, CFuture] + private[this] val toWriteCache = new FutureCacheGeneric[(Config, ToWrite[_]), Counters, CPromise, CFuture] + + // This method with return a 'clean' cache, that shares + // the underlying thread and message queue of the parent evalCache + def cleanCache: EvalCache = new EvalCache(writer) + + def getOrLock(cfg: Config, write: ToWrite[_]): Either[CPromise[Counters], CFuture[Counters]] = + toWriteCache.getOrPromise((cfg, write)) + + def getOrElseInsertWithFeedback[T]( + cfg: Config, + ex: Execution[T], + res: => CFuture[(T, Counters)] + ): (Boolean, CFuture[(T, Counters)]) = + // This cast is safe because we always insert with match T types + cache + .getOrElseUpdateIsNew((cfg, ex), res) + .asInstanceOf[(Boolean, CFuture[(T, Counters)])] + + def getOrElseInsert[T]( + cfg: Config, + ex: Execution[T], + res: => CFuture[(T, Counters)] + ): CFuture[(T, Counters)] = + getOrElseInsertWithFeedback(cfg, ex, res)._2 + } + + private[scalding] final case class FutureConst[T](get: ConcurrentExecutionContext => Future[T]) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline { + lazy val fut = for { + futt <- toFuture(Try(get(cec))) + t <- futt + } yield (t, Map.empty[Long, ExecutionCounters]) + + lazy val cfut = CFuture.uncancellable(fut) + + cache.getOrElseInsert(conf, this, cfut) + } + + // Note that unit is not optimized away, since Futures are often used with side-effects, so, + // we ensure that get is always called in contrast to Mapped, which assumes that fn is pure. 
+ } + private[scalding] final case class FlatMapped[S, T](prev: Execution[S], fn: S => Execution[T]) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline.call(prev.runStats(conf, mode, cache)).map { case CFuture(fut1, cancelHandler1) => + lazy val uncachedCFut = for { + (s, st1) <- fut1 + next0 = fn(s) + // next0 has not been optimized yet, we need to try + next = optimize(conf, next0) + } yield { + Trampoline.call(next.runStats(conf, mode, cache)).get.map { case (t, st2) => + (t, st1 ++ st2) + } + } + + val futCancel = cache.getOrElseInsert(conf, this, CFuture.fromFuture(uncachedCFut)) + + CFuture(futCancel.future, cancelHandler1.compose(futCancel.cancellationHandler)) + } + } + + private[scalding] final case class Mapped[S, T](prev: Execution[S], fn: S => T) extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => + cache.getOrElseInsert(conf, this, cfuture.map { case (s, stats) => (fn(s), stats) }) + } + } + + private[scalding] final case class GetCounters[T](prev: Execution[T]) + extends Execution[(T, ExecutionCounters)] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => + cache.getOrElseInsert( + conf, + this, + cfuture.map { case (t, c) => + val totalCount = Monoid.sum(c.map(_._2)) + ((t, totalCount), c) + } + ) + } + } + private[scalding] final case class ResetCounters[T](prev: Execution[T]) extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => + cache + .getOrElseInsert(conf, this, cfuture.map { case (t, _) => (t, 
Map.empty[Long, ExecutionCounters]) }) + } + } + + private[scalding] final case class TransformedConfig[T](prev: Execution[T], fn: Config => Config) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = { + val mutatedConfig = fn(conf) + Trampoline.call(prev.runStats(mutatedConfig, mode, cache)) + } + } + + /** + * This class allows running the passed execution with its own cache. This will mean anything inside won't + * benefit from Execution's global attempts to avoid repeated executions. + * + * The main use case here is when generating a lot of Execution results which are large. Executions caching + * in this case can lead to out of memory errors as the cache keeps references to many heap objects. + * + * We operate here by getting a copy of the super EvalCache, without its cache's. This is so we can share + * the singleton thread for scheduling jobs against Cascading. + */ + private[scalding] final case class WithNewCache[T](prev: Execution[T]) extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = { + val ec = cache.cleanCache + Trampoline.call(prev.runStats(conf, mode, ec)) + } + } + + private[scalding] final case class OnComplete[T](prev: Execution[T], fn: Try[T] => Unit) + extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline.call(prev.runStats(conf, mode, cache)).map { cfuture => + cache.getOrElseInsert( + conf, + this, + cfuture.mapFuture { fut => + /** + * The result we give is only completed AFTER fn is run so callers can wait on the result of this + * OnComplete + */ + val finished = Promise[(T, Map[Long, ExecutionCounters])]() + fut.onComplete { tryT => + try { + fn(tryT.map(_._1)) + } finally { + // Do our best to signal when we are done + finished.complete(tryT) + } + } + 
finished.future + } + ) + } + } + + private[scalding] final case class RecoverWith[T]( + prev: Execution[T], + fn: PartialFunction[Throwable, Execution[T]] + ) extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline.call(prev.runStats(conf, mode, cache)).map { case CFuture(fut, cancelHandler) => + lazy val uncachedFut = + fut + .map(v => (v, CancellationHandler.empty)) // map this to the right shape + .recoverWith { + val flowStop: PartialFunction[Throwable, Future[Nothing]] = { + case t: FatalExecutionError => // do not recover when the flow was stopped + Future.failed(t) + } + + flowStop.orElse(fn.andThen { ex0 => + // we haven't optimized ex0 yet + val ex = optimize(conf, ex0) + val CFuture(f, c) = ex.runStats(conf, mode, cache).get + f.map(v => (v, c)) + }) + } + + val recoveredFut = cache.getOrElseInsert( + conf, + this, + CFuture(uncachedFut.map(_._1), CancellationHandler.fromFuture(uncachedFut.map(_._2))) + ) + CFuture(recoveredFut.future, cancelHandler.compose(recoveredFut.cancellationHandler)) + } + } + + /** + * Standard scala zip waits forever on the left side, even if the right side fails + */ + def failFastZip[T, U](ft: Future[T], fu: Future[U])(implicit + cec: ConcurrentExecutionContext + ): Future[(T, U)] = { + type State = Either[(T, Promise[U]), (U, Promise[T])] + val middleState = Promise[State]() + + ft.onComplete { + case f @ Failure(err) => + if (!middleState.tryFailure(err)) { + // the right has already succeeded + middleState.future.foreach { + case Right((_, pt)) => pt.complete(f) + case Left((t1, _)) => // This should never happen + sys.error(s"Logic error: tried to set Failure($err) but Left($t1) already set") + } + } + case Success(t) => + // Create the next promise: + val pu = Promise[U]() + if (!middleState.trySuccess(Left((t, pu)))) { + // we can't set, so the other promise beat us here. 
+ middleState.future.foreach { + case Right((_, pt)) => pt.success(t) + case Left((t1, _)) => // This should never happen + sys.error(s"Logic error: tried to set Left($t) but Left($t1) already set") + } + } + } + fu.onComplete { + case f @ Failure(err) => + if (!middleState.tryFailure(err)) { + // we can't set, so the other promise beat us here. + middleState.future.foreach { + case Left((_, pu)) => pu.complete(f) + case Right((u1, _)) => // This should never happen + sys.error(s"Logic error: tried to set Failure($err) but Right($u1) already set") + } + } + case Success(u) => + // Create the next promise: + val pt = Promise[T]() + if (!middleState.trySuccess(Right((u, pt)))) { + // we can't set, so the other promise beat us here. + middleState.future.foreach { + case Left((_, pu)) => pu.success(u) + case Right((u1, _)) => // This should never happen + sys.error(s"Logic error: tried to set Right($u) but Right($u1) already set") + } + } + } + + middleState.future.flatMap { + case Left((t, pu)) => pu.future.map((t, _)) + case Right((u, pt)) => pt.future.map((_, u)) + } + } + + private[scalding] final case class Zipped[S, T](one: Execution[S], two: Execution[T]) + extends Execution[(S, T)] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + for { + futCancel1 <- Trampoline.call(one.runStats(conf, mode, cache)) + futCancel2 <- Trampoline.call(two.runStats(conf, mode, cache)) + } yield { + cache.getOrElseInsert( + conf, + this, + futCancel1.zip(futCancel2).map { case ((s, ss), (t, st)) => ((s, t), ss ++ st) } + ) + } + } + private[scalding] final case class UniqueIdExecution[T](fn: UniqueID => Execution[T]) extends Execution[T] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline( + cache.getOrElseInsert( + conf, + this, { + val (uid, nextConf) = conf.ensureUniqueId + val next0 = fn(uid) + // next0 has not been optimized 
yet, we need to try + val next = optimize(conf, next0) + next.runStats(nextConf, mode, cache).get + } + ) + ) + } + + /* + * This allows you to run platform specific executions + */ + private[scalding] final case class BackendExecution[A]( + result: (Config, Mode, Writer, ConcurrentExecutionContext) => CFuture[(Long, ExecutionCounters, A)] + ) extends Execution[A] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline( + cache.getOrElseInsert( + conf, + this, + try result(conf, mode, cache.writer, cec).map { case (id, c, a) => (a, Map(id -> c)) } + catch { + case NonFatal(e) => CFuture.failed(e) + } + ) + ) + } + + /* + * This is here so we can call without knowing the type T + * but with proof that pipe matches sink + * + * We capture these objects in calls of TypedPipe.toIterableExecution, + * but can safely ignore serializing planning objects for the same reasons mentioned in KryoHadoop.scala + */ + + sealed trait ToWrite[T] extends Serializable { + def pipe: TypedPipe[T] + def replacePipe(p: TypedPipe[T]): ToWrite[T] = + this match { + case ToWrite.Force(_) => ToWrite.Force(p) + case ToWrite.ToIterable(_) => ToWrite.ToIterable(p) + case ToWrite.SimpleWrite(_, sink) => ToWrite.SimpleWrite(p, sink) + } + } + object ToWrite extends Serializable { + final case class Force[T](@transient pipe: TypedPipe[T]) extends ToWrite[T] + final case class ToIterable[T](@transient pipe: TypedPipe[T]) extends ToWrite[T] + final case class SimpleWrite[T](@transient pipe: TypedPipe[T], @transient sink: Output[T]) + extends ToWrite[T] + + final case class OptimizedWrite[F[_], T](@transient original: F[T], toWrite: ToWrite[T]) + + /** + * Optimize these writes into new writes and provide a mapping from the original TypedPipe to the new + * TypedPipe + */ + def optimizeWriteBatch( + writes: List[ToWrite[_]], + rules: Seq[Rule[TypedPipe]] + ): List[OptimizedWrite[TypedPipe, _]] = { + val dag = 
Dag.empty(typed.OptimizationRules.toLiteral) + val (d1, ws) = writes.foldLeft((dag, List.empty[OptimizedWrite[Id, _]])) { case ((dag, ws), toWrite) => + val (d1, id) = dag.addRoot(toWrite.pipe) + (d1, OptimizedWrite(id, toWrite) :: ws) + } + // now we optimize the graph + val d2 = d1.applySeq(rules) + // convert back to TypedPipe: + ws.foldLeft(List.empty[OptimizedWrite[TypedPipe, _]]) { case (tail, optWrite) => + def go[A](optWriteId: OptimizedWrite[Id, A]): OptimizedWrite[TypedPipe, A] = { + val idA = optWriteId.original + val origPipe = d1.evaluate(idA) + val optPipe = d2.evaluate(idA) + OptimizedWrite(original = origPipe, toWrite = optWriteId.toWrite.replacePipe(optPipe)) + } + go(optWrite) :: tail + } + } + } + + /** + * Something that can handle a batch of writes that may be optimized before running. Return a unique Long + * for each run and Counters + */ + trait Writer { + + /** + * This is called by an Execution to begin processing + */ + def start(): Unit + + /** + * This is called by an Execution to end processing + */ + def finished(): Unit + + /** + * do a batch of writes, possibly optimizing, and return a new unique Long. + * + * empty writes are legitimate and should still return a Long + */ + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ConcurrentExecutionContext + ): CFuture[(Long, ExecutionCounters)] + + /** + * This should only be called after a call to execute + */ + private[Execution] def getForced[T]( + conf: Config, + initial: TypedPipe[T] + )(implicit cec: ConcurrentExecutionContext): Future[TypedPipe[T]] + + /** + * This should only be called after a call to execute + */ + private[Execution] def getIterable[T]( + conf: Config, + initial: TypedPipe[T] + )(implicit cec: ConcurrentExecutionContext): Future[Iterable[T]] + } + + /** + * This is the fundamental execution that actually happens in TypedPipes, all the rest are based on on this + * one. 
By keeping the Pipe and the Sink, can inspect the Execution DAG and optimize it later (a goal, but + * not done yet). + */ + private[scalding] final case class WriteExecution[T]( + head: ToWrite[_], + tail: List[ToWrite[_]], + result: ((Config, Mode, Writer, ConcurrentExecutionContext)) => Future[T] + ) extends Execution[T] { + + /** + * We override this here to enable inlining the zip optimization below. + * + * This is such an important optimization, that we apply it locally. It is a bit ugly to have it here and + * in ExecutionOptimizationRules but since this is so important, we do so anyway. + * + * Note Execution optimizations are not always applied, they are something users can disable, which they + * may since in some cases giant Execution graphs have seen stack overflows. It doesn't hurt to apply this + * optimization here, but it doesn't cover all cases since it only combines adjacent writes. + */ + override def map[U](mapFn: T => U): Execution[U] = + WriteExecution(head, tail, ExecutionOptimizationRules.MapWrite.ComposeMap(result, mapFn)) + + private def unwrapListEither[A, B, C](it: List[(A, Either[B, C])]): (List[(A, B)], List[(A, C)]) = + it match { + case (a, Left(b)) :: tail => + val (l, r) = unwrapListEither(tail) + ((a, b) :: l, r) + case (a, Right(c)) :: tail => + val (l, r) = unwrapListEither(tail) + (l, (a, c) :: r) + case Nil => (Nil, Nil) + } + + // We look up to see if any of our ToWrite elements have already been ran + // if so we remove them from the cache. 
+ // Anything not already ran we run as part of a single flow def, using their combined counters for the others + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = { + lazy val uncachedFutureCancel = { + val cacheLookup: List[ + ( + ToWrite[_], + (Either[CPromise[Map[Long, ExecutionCounters]], CFuture[Map[Long, ExecutionCounters]]]) + ) + ] = + (head :: tail).map(tw => (tw, cache.getOrLock(conf, tw))) + val (weDoOperation, someoneElseDoesOperation) = unwrapListEither(cacheLookup) + + val otherResult = CFuture.failFastSequence(someoneElseDoesOperation.map(_._2)) + + otherResult.future.value match { + case Some(Failure(e)) => CFuture.failed(e) + case _ => // Either successful or not completed yet + val localFlowDefCountersFuture: CFuture[Map[Long, ExecutionCounters]] = + weDoOperation match { + case all @ (h :: tail) => + val CFuture(fut, cancelHandler) = cache.writer + .execute(conf, all.map(_._1)) + + val futCounters: Future[Map[Long, ExecutionCounters]] = + fut.map(Map(_)) + + // Complete all of the promises we put into the cache + // with this future counters set + all.foreach { case (toWrite, cpromise) => + cpromise.completeWith(CFuture(futCounters, cancelHandler)) + } + CFuture(futCounters, cancelHandler) + case Nil => + // No work to do, provide a fulled set of 0 counters to operate on + CFuture(Future.successful(Map.empty), CancellationHandler.empty) + } + val bothFutures = otherResult.zip(localFlowDefCountersFuture) + + val fut = for { + (lCounters, fdCounters) <- bothFutures.future + t <- result( + (conf, mode, cache.writer, cec) + ) // TODO do i need to do something here to make this cancellable? 
+ summedCounters = (fdCounters :: lCounters).reduce(_ ++ _) + } yield (t, summedCounters) + + CFuture(fut, bothFutures.cancellationHandler) + } + } + + Trampoline(cache.getOrElseInsert(conf, this, uncachedFutureCancel)) + } + + /** + * This is such an important optimization, that we apply it locally. It is a bit ugly to have it here and + * in ExecutionOptimizationRules but since this is so important, we do so anyway. + * + * Note Execution optimizations are not always applied, they are something users can disable, which they + * may since in some cases giant Execution graphs have seen stack overflows. It doesn't hurt to apply this + * optimization here, but it doesn't cover all cases since it only combines adjacent writes. + * + * Note, each Write is individually cached so it won't happen twice, but it is usually better to compose + * into the biggest set of writes so the planner can optimize the largest graph possible. + * + * run this and that in parallel, without any dependency. This will be done in a single cascading flow if + * possible. 
+ * + * If both sides are write executions then merge them + */ + override def zip[U](that: Execution[U]): Execution[(T, U)] = + that match { + case w1 @ WriteExecution(_, _, _) => + ExecutionOptimizationRules.ZipWrite.mergeWrite(this, w1) + case that => Zipped(this, that) + } + } + + /** + * This is called Reader, because it just returns its input to run as the output + */ + private[scalding] case object ReaderExecution extends Execution[(Config, Mode)] { + protected def runStats(conf: Config, mode: Mode, cache: EvalCache)(implicit + cec: ConcurrentExecutionContext + ) = + Trampoline(CFuture.successful(((conf, mode), Map.empty))) + + override def equals(that: Any): Boolean = + that match { + // this has to be here or we get an infinite loop in the default equals + case _: ReaderExecution.type => true + case _ => false + } + } + + private def toFuture[R](t: Try[R]): Future[R] = + t match { + case Success(s) => Future.successful(s) + case Failure(err) => Future.failed(err) + } + + /** + * This creates a definitely failed Execution. + */ + def failed(t: Throwable): Execution[Nothing] = fromTry(Failure(t)) + + /** + * This makes a constant execution that runs no job. Note this is a lazy parameter that is evaluated every + * time run is called and does so in the ExecutionContext given to run + */ + def from[T](t: => T): Execution[T] = fromFuture(implicit ec => Future(t)) + + /** + * This evaluates the argument every time run is called, and does so in the ExecutionContext given to run + */ + def fromTry[T](t: => Try[T]): Execution[T] = fromFuture { implicit ec => + Future(t).flatMap(toFuture) + } + + /** + * The call to fn will happen when the run method on the result is called. The ConcurrentExecutionContext + * will be the same one used on run. This is intended for cases where you need to make asynchronous calls in + * the middle or end of execution. 
Presumably this is used with flatMap either before or after + */ + def fromFuture[T](fn: ConcurrentExecutionContext => Future[T]): Execution[T] = FutureConst(fn) + + /** Returns a constant Execution[Unit] */ + val unit: Execution[Unit] = from(()) + + /** + * This should be avoided if at all possible. It is here to allow backend authors to implement custom + * executions which should very rarely be needed. + * + * The CFuture returned should have three elements: + * 1. unique ID (Long) for the scope of the Writer 2. Counter values created by this Execution 3. the + * final result of the Execution (maybe Unit) + */ + def backendSpecific[A]( + fn: (Config, Mode, Writer, ConcurrentExecutionContext) => CFuture[(Long, ExecutionCounters, A)] + ): Execution[A] = + BackendExecution(fn) + + def forceToDisk[T](t: TypedPipe[T]): Execution[TypedPipe[T]] = + WriteExecution(ToWrite.Force(t), Nil, { case (conf, _, w, cec) => w.getForced(conf, t)(cec) }) + + def toIterable[T](t: TypedPipe[T]): Execution[Iterable[T]] = + WriteExecution(ToWrite.ToIterable(t), Nil, { case (conf, _, w, cec) => w.getIterable(conf, t)(cec) }) + + /** + * The simplest form, just sink the typed pipe into the sink and get a unit execution back + */ + private[scalding] def write[T](pipe: TypedPipe[T], sink: Output[T]): Execution[Unit] = + write(pipe, sink, ()) + + private[scalding] def write[T, U](pipe: TypedPipe[T], sink: Output[T], presentType: => U): Execution[U] = + WriteExecution(ToWrite.SimpleWrite(pipe, sink), Nil, tup => Future(presentType)(tup._4)) + + /** + * Convenience method to get the Args + */ + def getArgs: Execution[Args] = ReaderExecution.map(_._1.getArgs) + + /** + * Use this to read the configuration, which may contain Args or options which describe input on which to + * run + */ + def getConfig: Execution[Config] = ReaderExecution.map(_._1) + + /** Use this to get the mode, which may contain the job conf */ + def getMode: Execution[Mode] = ReaderExecution.map(_._2) + + /** Use this to 
get the config and mode. */ + def getConfigMode: Execution[(Config, Mode)] = ReaderExecution + + /** + * This is convenience method only here to make it slightly cleaner to get Args, which are in the Config + */ + def withArgs[T](fn: Args => Execution[T]): Execution[T] = + getConfig.flatMap(conf => fn(conf.getArgs)) + + /** + * Use this to use counters/stats with Execution. You do this: Execution.withId { implicit uid => val myStat + * \= Stat("myStat") // uid is implicitly pulled in pipe.map { t => if(someCase(t)) myStat.inc fn(t) } + * .writeExecution(mySink) } + */ + def withId[T](fn: UniqueID => Execution[T]): Execution[T] = UniqueIdExecution(fn) + + /** + * combine several executions and run them in parallel when .run is called + */ + def zip[A, B](ax: Execution[A], bx: Execution[B]): Execution[(A, B)] = + ax.zip(bx) + + /** + * combine several executions and run them in parallel when .run is called + */ + def zip[A, B, C](ax: Execution[A], bx: Execution[B], cx: Execution[C]): Execution[(A, B, C)] = + ax.zip(bx).zip(cx).map { case ((a, b), c) => (a, b, c) } + + /** + * combine several executions and run them in parallel when .run is called + */ + def zip[A, B, C, D]( + ax: Execution[A], + bx: Execution[B], + cx: Execution[C], + dx: Execution[D] + ): Execution[(A, B, C, D)] = + ax.zip(bx).zip(cx).zip(dx).map { case (((a, b), c), d) => (a, b, c, d) } + + /** + * combine several executions and run them in parallel when .run is called + */ + def zip[A, B, C, D, E]( + ax: Execution[A], + bx: Execution[B], + cx: Execution[C], + dx: Execution[D], + ex: Execution[E] + ): Execution[(A, B, C, D, E)] = + ax.zip(bx).zip(cx).zip(dx).zip(ex).map { case ((((a, b), c), d), e) => (a, b, c, d, e) } + + // Avoid recreating the empty Execution each time + private val nil = from(Nil) + + /* + * If you have many Executions, it is better to combine them with + * zip than flatMap (which is sequential). sequence just calls + * zip on each item in the input sequence. 
+ * + * Note, despite the name, which is taken from the standard scala Future API, + * these executions are executed in parallel: run is called on all at the + * same time, not one after the other. + */ + def sequence[T](exs: Seq[Execution[T]]): Execution[Seq[T]] = { + @annotation.tailrec + def go(xs: List[Execution[T]], acc: Execution[List[T]]): Execution[List[T]] = xs match { + case Nil => acc + case h :: tail => go(tail, h.zip(acc).map(ConsList())) + } + // This pushes all of them onto a list, and then reverse to keep order + go(exs.toList, nil).map(ReverseList()) + } + + /** + * Run a sequence of executions but only permitting parallelism amount to run at the same time. + * + * @param executions + * List of executions to run + * @param parallelism + * Number to run in parallel + * @return + * Execution Seq + */ + def withParallelism[T](executions: Seq[Execution[T]], parallelism: Int): Execution[Seq[T]] = { + require(parallelism > 0, s"Parallelism must be > 0: $parallelism") + + val sem = new AsyncSemaphore(parallelism) + + def waitRun(e: Execution[T]): Execution[T] = + Execution + .fromFuture(_ => sem.acquire()) + .flatMap(p => e.liftToTry.map((_, p))) + .onComplete { + case Success((_, p)) => p.release() + case Failure(ex) => throw ex // should never happen or there is a logic bug + } + .flatMap { case (ex, _) => fromTry(ex) } + + Execution.sequence(executions.map(waitRun)) + } +} + +/** + * Any exception extending this is never recovered + */ +abstract class FatalExecutionError(msg: String) extends Exception(msg) + +/** + * This represents the counters portion of the JobStats that are returned. Counters are just a vector of longs + * with counter name, group keys. + */ +trait ExecutionCounters { + + /** + * immutable set of the keys. + */ + def keys: Set[StatKey] + + /** + * Same as get(key).getOrElse(0L) Note if a counter is never incremented, get returns None. But you can't + * tell 0L that comes from None vs. 
a counter that was incremented then decremented. + */ + def apply(key: StatKey): Long = get(key).getOrElse(0L) + + /** + * If the counter is present, return it. + */ + def get(key: StatKey): Option[Long] + def toMap: Map[StatKey, Long] = keys.map(k => (k, get(k).getOrElse(0L))).toMap +} + +/** + * The companion gives several ways to create ExecutionCounters from other CascadingStats, JobStats, or Maps + */ +object ExecutionCounters { + + /** + * This is the zero of the ExecutionCounter Monoid + */ + def empty: ExecutionCounters = new ExecutionCounters { + def keys = Set.empty + def get(key: StatKey) = None + override def toMap = Map.empty + } + + /** + * Gets just the counters from the JobStats + */ + def fromJobStats(js: JobStats): ExecutionCounters = { + val counters = js.counters + new ExecutionCounters { + def keys = for { + group <- counters.keySet + counter <- counters(group).keys + } yield StatKey(counter, group) + + def get(k: StatKey) = counters.get(k.group).flatMap(_.get(k.counter)) + } + } + + /** + * A Simple wrapper over a Map[StatKey, Long] + */ + def fromMap(allValues: Map[StatKey, Long]): ExecutionCounters = + new ExecutionCounters { + def keys = allValues.keySet + def get(k: StatKey) = allValues.get(k) + override def toMap = allValues + } + + /** + * This allows us to merge the results of two computations. It just does pointwise addition. 
+ */ + implicit def monoid: Monoid[ExecutionCounters] = new Monoid[ExecutionCounters] { + override def isNonZero(that: ExecutionCounters) = that.keys.nonEmpty + def zero = ExecutionCounters.empty + def plus(left: ExecutionCounters, right: ExecutionCounters) = + fromMap((left.keys ++ right.keys).map(k => (k, left(k) + right(k))).toMap) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala b/scalding-base/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala new file mode 100644 index 0000000000..a70ea9f17d --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/ExecutionOptimizationRules.scala @@ -0,0 +1,388 @@ +package com.twitter.scalding + +import com.twitter.scalding.dagon.{Dag, FunctionK, Literal, Memoize, PartialRule, Rule} +import com.twitter.scalding.ExecutionOptimizationRules.ZipMap.{MapLeft, MapRight} +import com.twitter.scalding.typed.functions.ComposedFunctions.ComposedMapFn +import com.twitter.scalding.typed.functions.{ComposedFunctions, Identity, Swap} +import scala.annotation.tailrec +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} + +object ExecutionOptimizationRules { + type LiteralExecution[T] = Literal[Execution, T] + + /** + * Since our Execution is covariant, but the Literal is not this is actually safe in this context, but not + * in general + */ + def widen[T](l: LiteralExecution[_ <: T]): LiteralExecution[T] = + // to prove this is safe, see that if you have + // LiteralExecution[_ <: T] we can call .evaluate to get + // Execution[_ <: T] which due to covariance is + // Execution[T], and then using toLiteral we can get + // LiteralExecution[T] + // + // that would be wasteful to apply since the final + // result is identity. 
+ l.asInstanceOf[LiteralExecution[T]] + + def toLiteral: FunctionK[Execution, LiteralExecution] = + Memoize.functionK[Execution, LiteralExecution]( + new Memoize.RecursiveK[Execution, LiteralExecution] { + override def toFunction[A] = { + case (e @ Execution.ReaderExecution, _) => + Literal.Const(e) + case (e: Execution.FutureConst[a], _) => + Literal.Const(e) + case (e: Execution.UniqueIdExecution[a], _) => + Literal.Const(e) + case (e: Execution.BackendExecution[a], _) => + Literal.Const(e) + case (e: Execution.WriteExecution[a], _) => + Literal.Const(e) + case (e: Execution.GetCounters[a], f) => + widen( + Literal.Unary[Execution, a, (a, ExecutionCounters)]( + f(e.prev), + Execution.GetCounters(_: Execution[a]) + ) + ) + case (e: Execution.ResetCounters[a], f) => + Literal.Unary(f(e.prev), Execution.ResetCounters(_: Execution[a])) + case (e: Execution.WithNewCache[a], f) => + Literal.Unary(f(e.prev), Execution.WithNewCache(_: Execution[a])) + case (e: Execution.TransformedConfig[a], f) => + Literal.Unary(f(e.prev), Execution.TransformedConfig(_: Execution[a], e.fn)) + case (e: Execution.OnComplete[a], f) => + Literal.Unary(f(e.prev), Execution.OnComplete(_: Execution[a], e.fn)) + case (e: Execution.RecoverWith[a], f) => + Literal.Unary(f(e.prev), Execution.RecoverWith(_: Execution[a], e.fn)) + case (e: Execution.Mapped[a, b], f) => + Literal.Unary(f(e.prev), Execution.Mapped(_: Execution[a], e.fn)) + case (e: Execution.FlatMapped[a, b], f) => + Literal.Unary(f(e.prev), Execution.FlatMapped(_: Execution[a], e.fn)) + case (e: Execution.Zipped[a, b], f) => + Literal.Binary(f(e.one), f(e.two), Execution.Zipped(_: Execution[a], _: Execution[b])) + } + } + ) + + /** + * If `Execution` is `WriteExecution`, we are considering those executions as slow, since they will schedule + * some expensive work, like Hadoop or Spark Job. 
+ * + * If `Execution` is `FlatMapped` or `UniqueIdExecution`, we are considering those executions as slow, since + * we don't know which execution they can produce. + * + * Everything else we are considering as fast execution compare to `WriteExecution`. + */ + def isFastExecution[A](e: Execution[A]): Boolean = + areFastExecution(e :: Nil) + + /** + * If `Execution` is `FlowDefExecution` or `WriteExecution`, we are considering those executions as slow, + * since they will schedule some expensive work, like Hadoop or Spark Job. + * + * If `Execution` is `FlatMapped` or `UniqueIdExecution`, we are considering those executions as slow, since + * we don't know which execution they can produce. + * + * Everything else we are considering as fast execution compare to `FlowDefExecution` and `WriteExecution`. + */ + @tailrec + def areFastExecution(es: List[Execution[Any]]): Boolean = + es match { + case Nil => true + case h :: tail => + h match { + case Execution.UniqueIdExecution(_) => false + case Execution.WriteExecution(_, _, _) => false + case Execution.FlatMapped(_, _) => false + case Execution.BackendExecution(_) => false + + case Execution.ReaderExecution => areFastExecution(tail) + case Execution.FutureConst(_) => areFastExecution(tail) + case Execution.GetCounters(e) => areFastExecution(e :: tail) + case Execution.ResetCounters(e) => areFastExecution(e :: tail) + case Execution.WithNewCache(e) => areFastExecution(e :: tail) + case Execution.TransformedConfig(e, _) => areFastExecution(e :: tail) + case Execution.OnComplete(e, _) => areFastExecution(e :: tail) + case Execution.RecoverWith(e, _) => areFastExecution(e :: tail) + case Execution.Mapped(e, _) => areFastExecution(e :: tail) + case Execution.Zipped(one, two) => areFastExecution(one :: two :: tail) + } + } + + /** + * This is a rather complex optimization rule, but also very important. After this runs, there will only be + * 1 WriteExecution in a graph, other than within recoverWith/flatMap/uniqueId nodes. 
+ * + * This is the best we can do without running those functions. The motivation for this is to allow the user + * to write Executions as is convenient in code, but still have full access to a TypedPipe graph when + * planning a stage. Without this, we can wind up recomputing work that we don't need to do. + */ + case object ZipWrite extends Rule[Execution] { + import Execution._ + + /* + * First we define some case class functions to make sure + * the rule is reproducible and doesn't break equality + */ + case class Twist[A, B, C]() extends Function1[((A, B), C), (A, (B, C))] { + def apply(in: ((A, B), C)) = + (in._1._1, (in._1._2, in._2)) + } + case class UnTwist[A, B, C]() extends Function1[(A, (B, C)), ((A, B), C)] { + def apply(in: (A, (B, C))) = + ((in._1, in._2._1), in._2._2) + } + case class TwistSwap[A, B, C]() extends Function1[(A, (B, C)), (B, (A, C))] { + def apply(in: (A, (B, C))) = + (in._2._1, (in._1, in._2._2)) + } + case class ComposedFn[A1, A2, A, B1, B2, B]( + fn1: Function1[(A1, A2), A], + fn2: Function1[(B1, B2), B] + ) extends Function1[((A1, B1), (A2, B2)), (A, B)] { + override def apply(v1: ((A1, B1), (A2, B2))): (A, B) = + (fn1(v1._1._1, v1._2._1), fn2(v1._1._2, v1._2._2)) + } + + case class ComposeWriteFn[A, B, C, D, E]( + fn1: ((A, B, C, ConcurrentExecutionContext)) => Future[D], + fn2: ((A, B, C, ConcurrentExecutionContext)) => Future[E] + ) extends Function1[(A, B, C, ConcurrentExecutionContext), Future[(D, E)]] { + + def apply(tup: (A, B, C, ConcurrentExecutionContext)): Future[(D, E)] = + Execution.failFastZip(fn1(tup), fn2(tup))(tup._4) + } + + def mergeWrite[A, B](w1: WriteExecution[A], w2: WriteExecution[B]): WriteExecution[(A, B)] = { + val newFn = ComposeWriteFn(w1.result, w2.result) + WriteExecution(w1.head, w1.tail ::: (w2.head :: w2.tail), newFn) + } + + /** + * This is the fundamental type we use to optimize zips, basically we expand graphs of WriteExecution, + * Zipped, Mapped. 
Our goal to optimize any `Execution`'s DAG to have at most one write. + * + * This is achieved by optimizing any `Execution` to either: + * - `NonWrite` execution + * - `Write` execution + * - composed execution which has both write and non write. + */ + private sealed trait FlattenedZip[+A] + + private object FlattenedZip { + final case class NonWrite[T](nonWrite: Execution[T]) extends FlattenedZip[T] + final case class Write[T](write: WriteExecution[T]) extends FlattenedZip[T] + final case class Composed[T1, T2, T]( + write: WriteExecution[T1], + nonWrite: Execution[T2], + compose: Function1[(T1, T2), T] + ) extends FlattenedZip[T] + + def toExecution[A](ex: FlattenedZip[A]): Execution[A] = ex match { + case NonWrite(nonWrite) => nonWrite + case Write(write) => write + case c @ Composed(_, _, _) => c.write.zip(c.nonWrite).map(c.compose) + } + + def map[A, B](ex: FlattenedZip[A], fn: A => B): FlattenedZip[B] = ex match { + case NonWrite(nonWrite) => + NonWrite(nonWrite.map(fn)) + case Write(write) => + Write(WriteExecution(write.head, write.tail, MapWrite.ComposeMap(write.result, fn))) + case Composed(write, nonWrite, compose) => + Composed(write, nonWrite, ComposedMapFn(compose, fn)) + } + + def zip[A, B](left: FlattenedZip[A], right: FlattenedZip[B]): FlattenedZip[(A, B)] = + (left, right) match { + case (left @ NonWrite(_), right @ NonWrite(_)) => + NonWrite(left.nonWrite.zip(right.nonWrite)) + case (left @ NonWrite(_), right @ Write(_)) => + Composed(right.write, left.nonWrite, Swap[B, A]()) + case (left @ NonWrite(_), right @ Composed(_, _, _)) => + zipNonWriteComposed(left, right) + + case (left @ Write(_), right @ NonWrite(_)) => + Composed(left.write, right.nonWrite, Identity[(A, B)]()) + case (left @ Write(_), right @ Write(_)) => + Write(mergeWrite(left.write, right.write)) + case (left @ Write(_), right @ Composed(_, _, _)) => + zipWriteComposed(left, right) + + case (left @ Composed(_, _, _), right @ NonWrite(_)) => + map(zipNonWriteComposed(right, 
left), Swap[B, A]()) + case (left @ Composed(_, _, _), right @ Write(_)) => + map(zipWriteComposed(right, left), Swap[B, A]()) + case (left @ Composed(_, _, _), right @ Composed(_, _, _)) => + Composed( + mergeWrite(left.write, right.write), + left.nonWrite.zip(right.nonWrite), + ComposedFn(left.compose, right.compose) + ) + } + + private def zipNonWriteComposed[A, B1, B2, B]( + left: NonWrite[A], + right: Composed[B1, B2, B] + ): Composed[B1, (B2, A), (A, B)] = + Composed( + right.write, + right.nonWrite.zip(left.nonWrite), + ComposedMapFn(ComposedMapFn(UnTwist(), MapLeft[(B1, B2), A, B](right.compose)), Swap[B, A]()) + ) + + private def zipWriteComposed[A, B1, B2, B]( + left: Write[A], + right: Composed[B1, B2, B] + ): Composed[(A, B1), B2, (A, B)] = + Composed( + mergeWrite(left.write, right.write), + right.nonWrite, + ComposedMapFn(Twist(), MapRight[A, (B1, B2), B](right.compose)) + ) + + /** + * Convert an Execution to the Flattened (tuple-ized) representation + */ + def apply[A](ex: Execution[A]): FlattenedZip[A] = + ex match { + case Zipped(left, right) => zip(apply(left), apply(right)) + case Mapped(that, fn) => map(apply(that), fn) + case write @ WriteExecution(_, _, _) => FlattenedZip.Write(write) + case notZipMap => FlattenedZip.NonWrite(notZipMap) + } + } + + /** + * Apply the optimization of merging all zipped/mapped WriteExecution into a single value. 
If ex is + * already optimal (0 or 1 write) return None + */ + def optimize[A](ex: Execution[A]): Option[Execution[A]] = { + def writes(execution: Execution[_]): Int = { + @tailrec + def loop(executions: List[Execution[_]], acc: Int): Int = executions match { + case Nil => acc + case head :: tail => + head match { + case Zipped(left, right) => loop(left :: right :: tail, acc) + case Mapped(that, _) => loop(that :: tail, acc) + case WriteExecution(_, _, _) => loop(tail, acc + 1) + case _ => loop(tail, acc) + } + } + loop(execution :: Nil, 0) + } + // only optimize if there are 2 or more writes, otherwise we create an infinite loop + if (writes(ex) > 1) + Some(FlattenedZip.toExecution(FlattenedZip(ex))) + else + None + } + + def apply[A](on: Dag[Execution]) = { + case z @ Zipped(_, _) => optimize(z) + case _ => + // since this optimization only applies to zips, there + // is no need to check on nodes that aren't zips. + None + } + } + + object ZipMap extends PartialRule[Execution] { + case class MapLeft[S, T, B](fn: S => B) extends (((S, T)) => (B, T)) { + override def apply(st: (S, T)): (B, T) = (fn(st._1), st._2) + } + + case class MapRight[S, T, B](fn: T => B) extends (((S, T)) => (S, B)) { + override def apply(st: (S, T)): (S, B) = (st._1, fn(st._2)) + } + + override def applyWhere[T](on: Dag[Execution]) = { + case Execution.Zipped(Execution.Mapped(left, fn), right) => + Execution.Zipped(left, right).map(MapLeft(fn)) + case Execution.Zipped(left, Execution.Mapped(right, fn)) => + Execution.Zipped(left, right).map(MapRight(fn)) + } + } + + object ZipFlatMap extends PartialRule[Execution] { + case class LeftZipRight[S, T, B](left: Execution[B], fn: S => Execution[T]) + extends (S => Execution[(B, T)]) { + private val fun = fn.andThen(left.zip) + + override def apply(s: S): Execution[(B, T)] = fun(s) + } + + case class RightZipLeft[S, T, B](right: Execution[B], fn: S => Execution[T]) + extends (S => Execution[(T, B)]) { + private val fun = fn.andThen(_.zip(right)) + 
+ override def apply(s: S): Execution[(T, B)] = fun(s) + } + + case class NestedZip[S, T, B, A](right: Execution[B], lfn: S => Execution[T], rfn: B => Execution[A]) + extends (S => Execution[(T, A)]) { + private val fun = lfn.andThen { lr => + Execution.FlatMapped(right, rfn.andThen(lr.zip)) + } + + override def apply(s: S): Execution[(T, A)] = fun(s) + } + + override def applyWhere[T](on: Dag[Execution]) = { + case Execution.Zipped(Execution.FlatMapped(left, lfn), Execution.FlatMapped(right, rfn)) + if isFastExecution(left) && isFastExecution(right) => + Execution.FlatMapped(left, NestedZip(right, lfn, rfn)) + case Execution.Zipped(Execution.FlatMapped(left, fn), right) if isFastExecution(left) => + Execution.FlatMapped(left, RightZipLeft(right, fn)) + case Execution.Zipped(left, Execution.FlatMapped(right, fn)) if isFastExecution(right) => + Execution.FlatMapped(right, LeftZipRight(left, fn)) + } + } + + object MapWrite extends PartialRule[Execution] { + case class ComposeMap[A, B, C, D, E]( + fn1: ((A, B, C, ConcurrentExecutionContext)) => Future[D], + fn2: D => E + ) extends Function1[(A, B, C, ConcurrentExecutionContext), Future[E]] { + + def apply(tup: (A, B, C, ConcurrentExecutionContext)): Future[E] = + fn1(tup).map(fn2)(tup._4) + } + + override def applyWhere[T](on: Dag[Execution]) = { + case Execution.Mapped(Execution.WriteExecution(h, t, f1), f2) => + Execution.WriteExecution(h, t, ComposeMap(f1, f2)) + } + } + + case object FuseMaps extends PartialRule[Execution] { + import Execution._ + def applyWhere[A](on: Dag[Execution]) = { case Mapped(Mapped(ex, fn0), fn1) => + Mapped(ex, ComposedFunctions.ComposedMapFn(fn0, fn1)) + } + } + + val std: Rule[Execution] = + Rule.orElse( + List( + ZipWrite, + MapWrite, + ZipMap, + ZipFlatMap, + FuseMaps + ) + ) + + def apply[A](e: Execution[A], r: Rule[Execution]): Execution[A] = + try { + Dag.applyRule(e, toLiteral, r) + } catch { + case _: StackOverflowError => e + } + + def stdOptimizations[A](e: Execution[A]): 
Execution[A] = + apply(e, std) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/FutureCache.scala b/scalding-base/src/main/scala/com/twitter/scalding/FutureCache.scala new file mode 100644 index 0000000000..09d5ba8b9e --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/FutureCache.scala @@ -0,0 +1,67 @@ +package com.twitter.scalding + +import java.util.concurrent.ConcurrentHashMap +import scala.concurrent.{Future, Promise} + +trait PromiseLike[P[_], F[_]] { + def apply[T](): P[T] + def future[T](p: P[T]): F[T] + def completeWith[T](p: P[T], other: F[T]): P[T] +} + +object PromiseLike { + implicit object PromiseLikeCPromise extends PromiseLike[CPromise, CFuture] { + def apply[T](): CPromise[T] = CPromise[T]() + def future[T](p: CPromise[T]): CFuture[T] = p.cfuture + def completeWith[T](p: CPromise[T], other: CFuture[T]): CPromise[T] = p.completeWith(other) + } + + implicit object PromiseLikePromise extends PromiseLike[Promise, Future] { + def apply[T](): Promise[T] = Promise[T]() + def future[T](p: Promise[T]): Future[T] = p.future + def completeWith[T](p: Promise[T], other: Future[T]): Promise[T] = p.completeWith(other) + } +} + +/** + * This is a map for values that are produced in futures as is common in Execution + */ +class FutureCacheGeneric[-K, V, P[_], F[_]](implicit pl: PromiseLike[P, F]) { + private[this] val cache = new ConcurrentHashMap[K, F[V]]() + + def get(k: K): Option[F[V]] = Option(cache.get(k)) + + def getOrElseUpdate(k: K, res: => F[V]): F[V] = + getOrElseUpdateIsNew(k, res)._2 + + /** + * Tells you if this was the first lookup of this key or not + */ + def getOrElseUpdateIsNew(k: K, res: => F[V]): (Boolean, F[V]) = + getOrPromise(k) match { + case Left(cpromise) => + // be careful to not evaluate res twice + pl.completeWith(cpromise, res) + (true, pl.future(cpromise)) + case Right(cfut) => (false, cfut) + } + + /** + * If you get a Left value as a result you MUST complete that Promise or you may deadlock other 
callers + */ + def getOrPromise(k: K): Either[P[V], F[V]] = { + /* + * Since we don't want to evaluate res twice, we make a promise + * which we will use if it has not already been evaluated + */ + val cpromise = pl.apply[V]() + val cancelFut = pl.future(cpromise) + + cache.putIfAbsent(k, cancelFut) match { + case null => Left(cpromise) + case existsFut => Right(existsFut) + } + } +} + +class FutureCache[-K, V] extends FutureCacheGeneric[K, V, Promise, Future]()(PromiseLike.PromiseLikePromise) diff --git a/scalding-base/src/main/scala/com/twitter/scalding/JobStats.scala b/scalding-base/src/main/scala/com/twitter/scalding/JobStats.scala new file mode 100644 index 0000000000..cb4f449b85 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/JobStats.scala @@ -0,0 +1,75 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import scala.util.{Failure, Try} + +object JobStats { + def empty: JobStats = new JobStats(Map("counters" -> Map.empty)) + + /** + * Returns the counters with Group String -> Counter String -> Long + */ + def toCounters(cMap: Any): Try[Map[String, Map[String, Long]]] = + // This really sucks, but this is what happens when you let Map[String, Any] into your code + cMap match { + case m: Map[_, _] => + Try { + m.foldLeft(Map.empty[String, Map[String, Long]]) { + case (acc, (k: String, v: Any)) => + v match { + case m: Map[_, _] => + acc + (k -> m.foldLeft(Map.empty[String, Long]) { + case (acc2, (k: String, v: Long)) => acc2 + (k -> v) + case (_, kv) => sys.error("inner k, v not (String, Long):" + kv) + }) + case _ => sys.error("inner values are not Maps: " + v) + } + case kv => sys.error("Map does not contain string keys: " + kv) + } + } + case _ => Failure(new Exception("%s not a Map[String, Any]".format(cMap))) + } + + def toJsonValue(a: Any): String = + if (a == null) "null" + else { + Try(a.toString.toInt) + .recoverWith { case t: Throwable => Try(a.toString.toDouble) } + .recover { case t: Throwable => + val s = a.toString + "\"%s\"".format(s) + } + .get + .toString + } +} + +// Simple wrapper for a Map that contains the useful info from the job flow's stats +// If you want to write this, call toMap and use json, etc... 
to write it +case class JobStats(toMap: Map[String, Any]) { + def counters: Map[String, Map[String, Long]] = + toMap + .get("counters") + .map(JobStats.toCounters(_)) + .getOrElse(sys.error("counters missing from: " + toMap)) + .get + + def toJson: String = + toMap + .map { case (k, v) => "\"%s\" : %s".format(k, JobStats.toJsonValue(v)) } + .mkString("{", ",", "}") +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/LineNumber.scala b/scalding-base/src/main/scala/com/twitter/scalding/LineNumber.scala new file mode 100644 index 0000000000..aaf4ec650a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/LineNumber.scala @@ -0,0 +1,99 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import org.slf4j.{Logger, LoggerFactory} + +object LineNumber { + + /** + * depth 0 means the StackTraceElement for the caller of this method (skipping getCurrent and the + * Thread.currentThread + */ + def getCurrent(depth: Int): StackTraceElement = + getCurrent(depth, Thread.currentThread().getStackTrace) + + private[this] def getCurrent(depth: Int, stack: Seq[StackTraceElement]): StackTraceElement = + stack(depth + 2) + + def ignorePath(classPrefix: String): Option[StackTraceElement] = ignorePath(Set(classPrefix)) + def ignorePath(classPrefixes: Set[String]): Option[StackTraceElement] = + ignorePaths(classPrefixes, Thread.currentThread().getStackTrace) + + private val LOG: Logger = LoggerFactory.getLogger(LineNumber.getClass) + + private[this] def ignorePaths( + classPrefixes: Set[String], + stack: Seq[StackTraceElement] + ): Option[StackTraceElement] = + stack + .drop(2) + .dropWhile { ste => + classPrefixes.exists { prefix => + ste.getClassName.startsWith(prefix) + } + } + .headOption + + /* + * If you use this method, it will try to give you the non-scalding + * caller of the current method. It does this by ignoring all callers + * in com.twitter.scalding.* unless the caller is a Job (to make testing + * easier). Otherwise it just gets the most direct + * caller for methods that have all the callers in the scalding package + */ + def tryNonScaldingCaller: Option[StackTraceElement] = + tryNonScaldingCaller(Thread.currentThread().getStackTrace) + + def tryNonScaldingCaller(stack: Array[StackTraceElement]): Option[StackTraceElement] = { + /* depth = 1: + * depth 0 => tryNonScaldingCaller + * depth 1 => caller of this method + */ + + // user code is never in our package, or in scala, but + // since internal methods often recurse we ignore these + // in our attempt to get a good line number for the user. + val scaldingPrefix = "com.twitter.scalding." 
+ val ignoredPrefixes = Set(scaldingPrefix, "scala.") + val nonScalding = ignorePaths(ignoredPrefixes, stack) + val jobClass = "com.twitter.scalding.Job" + + // there is no .headOption on Iterator. WTF? + def headOption[T](it: Iterator[T]): Option[T] = + if (it.hasNext) Some(it.next) + else None + + val scaldingJobCaller = headOption( + stack.iterator + .filter(se => se.getClassName.startsWith(scaldingPrefix)) + .filter { se => + try { + val cls = Class.forName(se.getClassName) + Class.forName(jobClass).isAssignableFrom(cls) + } catch { + // skip classes that we don't find. We seem to run into this for some lambdas on Scala 2.12 in travis + case cnf: ClassNotFoundException => + LOG.warn(s"Skipping $se.getClassName as we can't find the class") + false + } + } + ) + + scaldingJobCaller + .orElse(nonScalding) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/Mode.scala b/scalding-base/src/main/scala/com/twitter/scalding/Mode.scala new file mode 100644 index 0000000000..a62a3140e2 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/Mode.scala @@ -0,0 +1,39 @@ +package com.twitter.scalding + +trait Mode extends java.io.Serializable { + + /** + * Make the Execution.Writer for this platform + */ + def newWriter(): Execution.Writer + + /** + * Config.defaultForMode converts this map into a Config (we don't use Config here to avoid a circular + * dependency) + */ + def defaultConfig: Map[String, String] = Map.empty +} + +object Mode { + + /** + * This is a Args and a Mode together. It is used purely as a work-around for the fact that Job only accepts + * an Args object, but needs a Mode inside. 
+ */ + private class ArgsWithMode(argsMap: Map[String, List[String]], val mode: Mode) extends Args(argsMap) { + override def +(keyvals: (String, Iterable[String])): Args = + new ArgsWithMode(super.+(keyvals).m, mode) + } + + /** Attach a mode to these Args and return the new Args */ + def putMode(mode: Mode, args: Args): Args = new ArgsWithMode(args.m, mode) + + /** Get a Mode if this Args was the result of a putMode */ + def getMode(args: Args): Option[Mode] = args match { + case withMode: ArgsWithMode => Some(withMode.mode) + case _ => None + } +} + +case class ModeException(message: String) extends RuntimeException(message) +case class ModeLoadException(message: String, origin: ClassNotFoundException) extends RuntimeException(origin) diff --git a/scalding-base/src/main/scala/com/twitter/scalding/StatKey.scala b/scalding-base/src/main/scala/com/twitter/scalding/StatKey.scala new file mode 100644 index 0000000000..fcbe5bb6e4 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/StatKey.scala @@ -0,0 +1,15 @@ +package com.twitter.scalding + +case class StatKey(counter: String, group: String) extends java.io.Serializable + +object StatKey { + // This is implicit to allow Stat("c", "g") to work. 
+ implicit def fromCounterGroup(counterGroup: (String, String)): StatKey = counterGroup match { + case (c, g) => StatKey(c, g) + } + // Create a Stat in the ScaldingGroup + implicit def fromCounterDefaultGroup(counter: String): StatKey = + StatKey(counter, ScaldingGroup) + + val ScaldingGroup = "Scalding Custom" +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/StringUtility.scala b/scalding-base/src/main/scala/com/twitter/scalding/StringUtility.scala new file mode 100644 index 0000000000..5c24b3d524 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/StringUtility.scala @@ -0,0 +1,34 @@ +package com.twitter.scalding + +object StringUtility { + private[this] val emptyCons = "" :: Nil + + private def fastSplitHelper( + text: String, + key: String, + from: Int, + textLength: Int, + keyLength: Int + ): List[String] = { + val firstIndex = text.indexOf(key, from) + if (firstIndex == -1) { + if (from < textLength) { + text.substring(from) :: Nil + } else { + emptyCons + } + } else { + // the text till the separator should be kept in any case + text.substring(from, firstIndex) :: fastSplitHelper( + text, + key, + firstIndex + keyLength, + textLength, + keyLength + ) + } + } + + def fastSplit(text: String, key: String): List[String] = + fastSplitHelper(text, key, 0, text.length, key.length) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/UniqueID.scala b/scalding-base/src/main/scala/com/twitter/scalding/UniqueID.scala new file mode 100644 index 0000000000..ee5d21e2c1 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/UniqueID.scala @@ -0,0 +1,30 @@ +package com.twitter.scalding + +/** + * Used to inject a typed unique identifier to uniquely name each scalding flow. This is here mostly to deal + * with the case of testing where there are many concurrent threads running Flows. 
Users should never have to + * worry about these + */ +case class UniqueID(get: String) { + require(get.indexOf(',') == -1, s"UniqueID cannot contain ,: $get") +} + +object UniqueID { + val UNIQUE_JOB_ID = "scalding.job.uniqueId" + private val id = new java.util.concurrent.atomic.AtomicInteger(0) + + def getRandom: UniqueID = { + // This number is unique as long as we don't create more than 10^6 per milli + // across separate jobs. which seems very unlikely. + val unique = (System.currentTimeMillis << 20) ^ (id.getAndIncrement.toLong) + UniqueID(unique.toString) + } + + /** + * This is only safe if you use something known to have a single instance in the relevant scope. + * + * In cascading, the FlowDef has been used here + */ + def fromSystemHashCode(ar: AnyRef): UniqueID = + UniqueID(System.identityHashCode(ar).toString) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala b/scalding-base/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala similarity index 59% rename from scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala rename to scalding-base/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala index cfbee2e830..1f6ff4f4f7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala +++ b/scalding-base/src/main/scala/com/twitter/scalding/mathematics/Matrix2.scala @@ -12,34 +12,29 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics -import cascading.pipe.Pipe -import cascading.tuple.Fields -import com.twitter.scalding.TDsl._ -import com.twitter.scalding._ -import com.twitter.scalding.typed.{ValuePipe, EmptyValue, LiteralValue, ComputedValue} -import com.twitter.algebird.{ Semigroup, Monoid, Ring, Group, Field } +import com.twitter.scalding.serialization.OrderedSerialization2 +import com.twitter.scalding.typed.{ComputedValue, EmptyValue, Input, LiteralValue, TypedPipe, ValuePipe} +import com.twitter.algebird.{Field, Group, Monoid, Ring, Semigroup} import scala.collection.mutable.Map import scala.collection.mutable.HashMap -import cascading.flow.FlowDef import java.io.Serializable /** * This is the future Matrix API. The old one will be removed in scalding 0.10.0 (or 1.0.0). * - * Create Matrix2 instances with methods in the Matrix2 object. - * Note that this code optimizes the order in which it evaluates matrices, and replaces equivalent - * terms to avoid recomputation. Also, this code puts the parenthesis in the optimal place in - * terms of size according to the sizeHints. For instance: - * (A*B)*C == A*(B*C) but if B is a 10 x 10^6 matrix, and C is 10^6 x 100, - * it is better to do the B*C product first in order to avoid storing as much intermediate output. + * Create Matrix2 instances with methods in the Matrix2 object. Note that this code optimizes the order in + * which it evaluates matrices, and replaces equivalent terms to avoid recomputation. Also, this code puts the + * parenthesis in the optimal place in terms of size according to the sizeHints. For instance: (A*B)*C == + * A*(B*C) but if B is a 10 x 10^6 matrix, and C is 10^6 x 100, it is better to do the B*C product first in + * order to avoid storing as much intermediate output. * - * NOTE THIS REQUIREMENT: for each formula, you can only have one Ring[V] in scope. 
If you - * evaluate part of the formula with one Ring, and another part with another, you must go through - * a TypedPipe (call toTypedPipe) or the result may not be correct. + * NOTE THIS REQUIREMENT: for each formula, you can only have one Ring[V] in scope. If you evaluate part of + * the formula with one Ring, and another part with another, you must go through a TypedPipe (call + * toTypedPipe) or the result may not be correct. */ sealed trait Matrix2[R, C, V] extends Serializable { implicit def rowOrd: Ordering[R] @@ -49,25 +44,30 @@ sealed trait Matrix2[R, C, V] extends Serializable { def -(that: Matrix2[R, C, V])(implicit g: Group[V]): Matrix2[R, C, V] = Sum(this, that.negate, g) def unary_-(implicit g: Group[V]): Matrix2[R, C, V] = negate def negate(implicit g: Group[V]): Matrix2[R, C, V] - /** Represents the pointwise, or Hadamard, product of two matrices. + + /** + * Represents the pointwise, or Hadamard, product of two matrices. */ - def #*#(that: Matrix2[R, C, V])(implicit ring: Ring[V]): Matrix2[R, C, V] = HadamardProduct(this, that, ring) + def #*#(that: Matrix2[R, C, V])(implicit ring: Ring[V]): Matrix2[R, C, V] = + HadamardProduct(this, that, ring) // Matrix product def *[C2](that: Matrix2[C, C2, V])(implicit ring: Ring[V], mj: MatrixJoiner2): Matrix2[R, C2, V] = Product(this, that, ring) - def *(that: Scalar2[V])(implicit ring: Ring[V], mode: Mode, flowDef: FlowDef, mj: MatrixJoiner2): Matrix2[R, C, V] = that * this + def *(that: Scalar2[V])(implicit ring: Ring[V], mj: MatrixJoiner2): Matrix2[R, C, V] = that * this - def /(that: Scalar2[V])(implicit field: Field[V], mode: Mode, flowDef: FlowDef): Matrix2[R, C, V] = - that divMatrix this - /** Convert the current Matrix to a TypedPipe + def /(that: Scalar2[V])(implicit field: Field[V]): Matrix2[R, C, V] = + that.divMatrix(this) + + /** + * Convert the current Matrix to a TypedPipe */ def toTypedPipe: TypedPipe[(R, C, V)] def transpose: Matrix2[C, R, V] + /** - * Users should never need this. 
This is the current Matrix2, but in most optimized - * form. Usually, you will just do matrix operations until you eventually call write - * or toTypedPipe + * Users should never need this. This is the current Matrix2, but in most optimized form. Usually, you will + * just do matrix operations until you eventually call write or toTypedPipe */ def optimizedSelf: Matrix2[R, C, V] = Matrix2.optimize(this.asInstanceOf[Matrix2[Any, Any, V]])._2.asInstanceOf[Matrix2[R, C, V]] @@ -90,50 +90,71 @@ sealed trait Matrix2[R, C, V] extends Serializable { Product(this, OneC()(colOrd), ring) /** - * the result is the same as considering everything on the this to be like a 1 value - * so we just sum, using only a monoid on VecV, where this Matrix has the value true. - * This is useful for graph propagation of monoids, such as sketchs like HyperLogLog, - * BloomFilters or CountMinSketch. - * TODO This is a special kind of product that could be optimized like Product is + * the result is the same as considering everything on the this to be like a 1 value so we just sum, using + * only a monoid on VecV, where this Matrix has the value true. This is useful for graph propagation of + * monoids, such as sketchs like HyperLogLog, BloomFilters or CountMinSketch. 
TODO This is a special kind of + * product that could be optimized like Product is */ - def propagate[C2, VecV](vec: Matrix2[C, C2, VecV])(implicit ev: =:=[V, Boolean], - mon: Monoid[VecV], - mj: MatrixJoiner2): Matrix2[R, C2, VecV] = { + def propagate[C2, VecV]( + vec: Matrix2[C, C2, VecV] + )(implicit ev: =:=[V, Boolean], mon: Monoid[VecV], mj: MatrixJoiner2): Matrix2[R, C2, VecV] = { - //This cast will always succeed: + // This cast will always succeed: lazy val joinedBool = mj.join(this.asInstanceOf[Matrix2[R, C, Boolean]], vec) implicit val ord2: Ordering[C2] = vec.colOrd - lazy val resultPipe = joinedBool.flatMap { case (key, ((row, bool), (col2, v))) => + lazy val resultPipe = joinedBool + .flatMap { case (key, ((row, bool), (col2, v))) => if (bool) Some((row, col2), v) else None // filter early } .group // TODO we could be lazy with this group and combine with a sum .sum - .filter { kv => mon.isNonZero(kv._2) } + .filter(kv => mon.isNonZero(kv._2)) .map { case ((r, c2), v) => (r, c2, v) } MatrixLiteral(resultPipe, this.sizeHint) } - def propagateRow[C2](mat: Matrix2[C, C2, Boolean])(implicit ev: =:=[R, Unit], mon: Monoid[V], mj: MatrixJoiner2): Matrix2[Unit, C2, V] = + def propagateRow[C2]( + mat: Matrix2[C, C2, Boolean] + )(implicit ev: =:=[R, Unit], mon: Monoid[V], mj: MatrixJoiner2): Matrix2[Unit, C2, V] = mat.transpose.propagate(this.transpose.asInstanceOf[Matrix2[C, Unit, V]]).transpose // Binarize values, all x != 0 become 1 def binarizeAs[NewValT](implicit mon: Monoid[V], ring: Ring[NewValT]): Matrix2[R, C, NewValT] = { - lazy val newPipe = toTypedPipe.map { case (r, c, x) => - (r, c, if (mon.isNonZero(x)) { ring.one } else { ring.zero }) + lazy val newPipe = toTypedPipe + .map { case (r, c, x) => + ( + r, + c, + if (mon.isNonZero(x)) { ring.one } + else { ring.zero } + ) } - .filter { kv => ring.isNonZero(kv._3) } + .filter(kv => ring.isNonZero(kv._3)) MatrixLiteral(newPipe, this.sizeHint) } - /** Row L2 normalization (can only be called for Double) 
- * After this operation, the sum(|x|^2) along each row will be 1. + /** + * Row L2 normalization After this operation, the sum(|x|^2) along each row will be 1. + */ + def rowL2Normalize(implicit num: Numeric[V], mj: MatrixJoiner2): Matrix2[R, C, Double] = { + val matD = + MatrixLiteral(this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x)) }, this.sizeHint) + lazy val result = MatrixLiteral( + this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x) * num.toDouble(x)) }, + this.sizeHint + ).sumColVectors.toTypedPipe + .map { case (r, c, x) => (r, r, 1 / scala.math.sqrt(x)) } // diagonal + inverse + MatrixLiteral(result, SizeHint.asDiagonal(this.sizeHint.setRowsToCols)) * matD + } + + /** + * Row L1 normalization After this operation, the sum(|x|) alone each row will be 1. */ - def rowL2Normalize(implicit ev: =:=[V, Double], mj: MatrixJoiner2): Matrix2[R, C, Double] = { - val matD = this.asInstanceOf[Matrix2[R, C, Double]] - lazy val result = MatrixLiteral(matD.toTypedPipe.map { case (r, c, x) => (r, c, x * x) }, this.sizeHint) - .sumColVectors - .toTypedPipe - .map { case (r, c, x) => (r, r, 1 / scala.math.sqrt(x)) } // diagonal + inverse + def rowL1Normalize(implicit num: Numeric[V], mj: MatrixJoiner2): Matrix2[R, C, Double] = { + val matD = + MatrixLiteral(this.toTypedPipe.map { case (r, c, x) => (r, c, num.toDouble(x).abs) }, this.sizeHint) + lazy val result = matD.sumColVectors.toTypedPipe + .map { case (r, c, x) => (r, r, 1 / x) } // diagonal + inverse MatrixLiteral(result, SizeHint.asDiagonal(this.sizeHint.setRowsToCols)) * matD } @@ -141,18 +162,21 @@ sealed trait Matrix2[R, C, V] extends Serializable { MatrixLiteral( toTypedPipe .filter { case (r, c, v) => Ordering[R].equiv(r, index) } - .map { case (r, c, v) => ((), c, v) }, this.sizeHint.setRows(1L) - ) + .map { case (r, c, v) => ((), c, v) }, + this.sizeHint.setRows(1L) + ) def getColumn(index: C): Matrix2[R, Unit, V] = MatrixLiteral( toTypedPipe .filter { case (r, c, v) => 
Ordering[C].equiv(c, index) } - .map { case (r, c, v) => (r, (), v) }, this.sizeHint.setCols(1L) - ) + .map { case (r, c, v) => (r, (), v) }, + this.sizeHint.setCols(1L) + ) - /** Consider this Matrix as the r2 row of a matrix. The current matrix must be a row, - * which is to say, its row type must be Unit. + /** + * Consider this Matrix as the r2 row of a matrix. The current matrix must be a row, which is to say, its + * row type must be Unit. */ def asRow[R2](r2: R2)(implicit ev: R =:= Unit, rowOrd: Ordering[R2]): Matrix2[R2, C, V] = MatrixLiteral(toTypedPipe.map { case (r, c, v) => (r2, c, v) }, this.sizeHint) @@ -162,24 +186,26 @@ sealed trait Matrix2[R, C, V] extends Serializable { // Compute the sum of the main diagonal. Only makes sense cases where the row and col type are // equal - def trace(implicit mon: Monoid[V], ev: =:=[R,C]): Scalar2[V] = - Scalar2(toTypedPipe.asInstanceOf[TypedPipe[(R, R, V)]] - .filter{case (r1, r2, _) => Ordering[R].equiv(r1, r2)} - .map{case (_,_,x) => x} - .sum(mon) + def trace(implicit mon: Monoid[V], ev: =:=[R, C]): Scalar2[V] = + Scalar2( + toTypedPipe + .asInstanceOf[TypedPipe[(R, R, V)]] + .filter { case (r1, r2, _) => Ordering[R].equiv(r1, r2) } + .map { case (_, _, x) => x } + .sum(mon) ) - - def write(sink: TypedSink[(R, C, V)])(implicit fd: FlowDef, m: Mode): Matrix2[R, C, V] = - MatrixLiteral(toTypedPipe.write(sink), sizeHint) } -/** This trait allows users to plug in join algoritms - * where they are needed to improve products and propagations. - * The default works well in most cases, but highly skewed matrices may need some - * special handling +/** + * This trait allows users to plug in join algorithms where they are needed to improve products and + * propagations. 
The default works well in most cases, but highly skewed matrices may need some special + * handling */ trait MatrixJoiner2 extends java.io.Serializable { - def join[R, C, V, C2, V2](left: Matrix2[R, C, V], right: Matrix2[C, C2, V2]): TypedPipe[(C, ((R, V), (C2, V2)))] + def join[R, C, V, C2, V2]( + left: Matrix2[R, C, V], + right: Matrix2[C, C2, V2] + ): TypedPipe[(C, ((R, V), (C2, V2)))] } object MatrixJoiner2 { @@ -187,24 +213,30 @@ object MatrixJoiner2 { // comment this out to verify we are not hiding the user's suppled values implicit def default: MatrixJoiner2 = new DefaultMatrixJoiner(10000L) - def join[R, C, V, C2, V2](left: Matrix2[R, C, V], - right: Matrix2[C, C2, V2])(implicit mj: MatrixJoiner2): TypedPipe[(C, ((R, V), (C2, V2)))] = - mj.join(left, right) + def join[R, C, V, C2, V2](left: Matrix2[R, C, V], right: Matrix2[C, C2, V2])(implicit + mj: MatrixJoiner2 + ): TypedPipe[(C, ((R, V), (C2, V2)))] = + mj.join(left, right) } -/** This uses standard join if the matrices are comparable size and large, - * otherwise, if one is much smaller than the other, we use a hash join +/** + * This uses standard join if the matrices are comparable size and large, otherwise, if one is much smaller + * than the other, we use a hash join */ class DefaultMatrixJoiner(sizeRatioThreshold: Long) extends MatrixJoiner2 { - def join[R, C, V, C2, V2](left: Matrix2[R, C, V], - right: Matrix2[C, C2, V2]): TypedPipe[(C, ((R, V), (C2, V2)))] = { + def join[R, C, V, C2, V2]( + left: Matrix2[R, C, V], + right: Matrix2[C, C2, V2] + ): TypedPipe[(C, ((R, V), (C2, V2)))] = { implicit val cOrd: Ordering[C] = left.colOrd val one = left.toTypedPipe.map { case (r, c, v) => (c, (r, v)) }.group val two = right.toTypedPipe.map { case (c, c2, v2) => (c, (c2, v2)) }.group val sizeOne = left.sizeHint.total.getOrElse(BigInt(1L)) val sizeTwo = right.sizeHint.total.getOrElse(BigInt(1L)) - def swapInner[M,N](t: TypedPipe[(C, (M, N))]): TypedPipe[(C, (N, M))] = t.mapValues { t: (M,N) => t.swap } + def 
swapInner[M, N](t: TypedPipe[(C, (M, N))]): TypedPipe[(C, (N, M))] = t.mapValues { t: (M, N) => + t.swap + } // TODO: // use block join on tall skinny times skinny tall (or skewed): the result really big, // but the direct approach can't get much parallelism. @@ -224,45 +256,54 @@ class DefaultMatrixJoiner(sizeRatioThreshold: Long) extends MatrixJoiner2 { /** * Infinite column vector - only for intermediate computations */ -case class OneC[R, V](implicit override val rowOrd: Ordering[R]) extends Matrix2[R, Unit, V] { +final case class OneC[R, V](implicit override val rowOrd: Ordering[R]) extends Matrix2[R, Unit, V] { override val sizeHint: SizeHint = FiniteHint(Long.MaxValue, 1) override def colOrd = Ordering[Unit] def transpose = OneR() - override def negate(implicit g: Group[V]) = sys.error("Only used in intermediate computations, try (-1 * OneC)") + override def negate(implicit g: Group[V]) = + sys.error("Only used in intermediate computations, try (-1 * OneC)") def toTypedPipe = sys.error("Only used in intermediate computations") } /** * Infinite row vector - only for intermediate computations */ -case class OneR[C, V](implicit override val colOrd: Ordering[C]) extends Matrix2[Unit, C, V] { +final case class OneR[C, V](implicit override val colOrd: Ordering[C]) extends Matrix2[Unit, C, V] { override val sizeHint: SizeHint = FiniteHint(1, Long.MaxValue) override def rowOrd = Ordering[Unit] def transpose = OneC() - override def negate(implicit g: Group[V]) = sys.error("Only used in intermediate computations, try (-1 * OneR)") + override def negate(implicit g: Group[V]) = + sys.error("Only used in intermediate computations, try (-1 * OneR)") def toTypedPipe = sys.error("Only used in intermediate computations") } /** * Class representing a matrix product * - * @param left multiplicand - * @param right multiplier + * @param left + * multiplicand + * @param right + * multiplier * @param ring - * @param expressions a HashMap of common subtrees; None if possibly not 
optimal (did not go through optimize), Some(...) with a HashMap that was created in optimize + * @param expressions + * a HashMap of common subtrees; None if possibly not optimal (did not go through optimize), Some(...) with + * a HashMap that was created in optimize */ -case class Product[R, C, C2, V](left: Matrix2[R, C, V], - right: Matrix2[C, C2, V], - ring: Ring[V], - expressions: Option[Map[Matrix2[R, C2, V], TypedPipe[(R, C2, V)]]] = None)(implicit val joiner: MatrixJoiner2) extends Matrix2[R, C2, V] { +final case class Product[R, C, C2, V]( + left: Matrix2[R, C, V], + right: Matrix2[C, C2, V], + ring: Ring[V], + expressions: Option[Map[Matrix2[R, C2, V], TypedPipe[(R, C2, V)]]] = None +)(implicit val joiner: MatrixJoiner2) + extends Matrix2[R, C2, V] { /** - * Structural, NOT mathematical equality (e.g. (A*B) * C != A * (B*C)) - * Used for the Matrix2OptimizationTest (so that it doesn't care about expressions) + * Structural, NOT mathematical equality (e.g. (A*B) * C != A * (B*C)) Used for the Matrix2OptimizationTest + * (so that it doesn't care about expressions) */ override def equals(obj: Any): Boolean = obj match { case Product(tl, tr, _, _) => left.equals(tl) && right.equals(tr) - case _ => false + case _ => false } override def hashCode(): Int = left.hashCode ^ right.hashCode @@ -276,14 +317,15 @@ case class Product[R, C, C2, V](left: Matrix2[R, C, V], val localRing = ring val joined = (if (leftMatrix) { - val ord: Ordering[R] = left.rowOrd - left.toTypedPipe.groupBy(x => x._1)(ord) - } else { - val ord: Ordering[C] = right.rowOrd - right.toTypedPipe.groupBy(x => x._1)(ord) - }).mapValues { _._3 } + val ord: Ordering[R] = left.rowOrd + left.toTypedPipe.groupBy(x => x._1)(ord) + } else { + val ord: Ordering[C] = right.rowOrd + right.toTypedPipe.groupBy(x => x._1)(ord) + }) + .mapValues(_._3) .sum(localRing) - .filter { kv => localRing.isNonZero(kv._2) } + .filter(kv => localRing.isNonZero(kv._2)) if (leftMatrix) { joined.map { case (r, v) => (r, (), v) 
}.asInstanceOf[TypedPipe[(R, C2, V)]] // we know C2 is Unit @@ -293,14 +335,15 @@ case class Product[R, C, C2, V](left: Matrix2[R, C, V], } // represents `\sum_{i j} M_{i j}` where `M_{i j}` is the Matrix with exactly one element at `row=i, col = j`. - lazy val toOuterSum: TypedPipe[(R, C2, V)] = { + lazy val toOuterSum: TypedPipe[(R, C2, V)] = if (optimal) { if (isSpecialCase) { specialCase } else { implicit val ord: Ordering[C] = right.rowOrd val localRing = ring - joiner.join(left, right) + joiner + .join(left, right) .map { case (key, ((l1, lv), (r2, rv))) => (l1, r2, localRing.times(lv, rv)) } } } else { @@ -308,82 +351,81 @@ case class Product[R, C, C2, V](left: Matrix2[R, C, V], // Maybe it is Product[R, _, C2, V] optimizedSelf.asInstanceOf[Product[R, _, C2, V]].toOuterSum } - } - private def computePipe(joined: TypedPipe[(R, C2, V)] = toOuterSum): TypedPipe[(R, C2, V)] = { + private def computePipe(joined: TypedPipe[(R, C2, V)] = toOuterSum): TypedPipe[(R, C2, V)] = if (isSpecialCase) { joined } else { - val ord2: Ordering[(R, C2)] = Ordering.Tuple2(rowOrd, colOrd) val localRing = ring - joined.groupBy(w => (w._1, w._2))(ord2).mapValues { _._3 } + joined + .groupBy(w => (w._1, w._2)) + .mapValues(_._3) .sum(localRing) - .filter { kv => localRing.isNonZero(kv._2) } + .filter(kv => localRing.isNonZero(kv._2)) .map { case ((r, c), v) => (r, c, v) } } - } - override lazy val toTypedPipe: TypedPipe[(R, C2, V)] = { + override lazy val toTypedPipe: TypedPipe[(R, C2, V)] = expressions match { - case Some(m) => m.get(this) match { - case Some(pipe) => pipe - case None => { + case Some(m) => + m.get(this).getOrElse { val result = computePipe() m.put(this, result) result } - } case None => optimizedSelf.toTypedPipe } - } override val sizeHint = left.sizeHint * right.sizeHint implicit override val rowOrd: Ordering[R] = left.rowOrd implicit override val colOrd: Ordering[C2] = right.colOrd + implicit def withOrderedSerialization: Ordering[(R, C2)] = + 
OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) + override lazy val transpose: Product[C2, C, R, V] = Product(right.transpose, left.transpose, ring) - override def negate(implicit g: Group[V]): Product[R, C, C2, V] = { + override def negate(implicit g: Group[V]): Product[R, C, C2, V] = if (left.sizeHint.total.getOrElse(BigInt(0L)) > right.sizeHint.total.getOrElse(BigInt(0L))) { Product(left, right.negate, ring, expressions) } else { Product(left.negate, right, ring, expressions) } - } - /** Trace(A B) = Trace(B A) so we optimize to choose the lowest cost item + /** + * Trace(A B) = Trace(B A) so we optimize to choose the lowest cost item */ - override def trace(implicit mon: Monoid[V], ev1: =:=[R,C2]): Scalar2[V] = { - val (cost1, plan1) = Matrix2.optimize(this.asInstanceOf[Matrix2[Any, Any, V]]) - val (cost2, plan2) = Matrix2.optimize( - Product(right.asInstanceOf[Matrix2[C,R,V]], left.asInstanceOf[Matrix2[R,C,V]], ring, None) + override def trace(implicit mon: Monoid[V], ev1: =:=[R, C2]): Scalar2[V] = { + val (cost1, plan1) = Matrix2.optimize(this.asInstanceOf[Matrix2[Any, Any, V]]) // linter:ignore + val (cost2, plan2) = Matrix2.optimize( // linter:ignore + Product(right.asInstanceOf[Matrix2[C, R, V]], left.asInstanceOf[Matrix2[R, C, V]], ring, None) .asInstanceOf[Matrix2[Any, Any, V]] - ) + ) if (cost1 > cost2) { val product2 = plan2.asInstanceOf[Product[C, R, C, V]] val ord = left.colOrd - val filtered = product2.toOuterSum.filter{case (c1, c2, _) => ord.equiv(c1, c2)} - Scalar2(product2.computePipe(filtered).map{case (_, _, x) => x}.sum(mon)) + val filtered = product2.toOuterSum.filter { case (c1, c2, _) => ord.equiv(c1, c2) } + Scalar2(product2.computePipe(filtered).map { case (_, _, x) => x }.sum(mon)) } else { val product1 = plan1.asInstanceOf[Product[R, C, R, V]] val ord = left.rowOrd - val filtered = product1.toOuterSum.filter{case (r1, r2, _) => ord.equiv(r1, r2)} - Scalar2(product1.computePipe(filtered).map{case (_, _, x) => 
x}.sum(mon)) + val filtered = product1.toOuterSum.filter { case (r1, r2, _) => ord.equiv(r1, r2) } + Scalar2(product1.computePipe(filtered).map { case (_, _, x) => x }.sum(mon)) } } } -case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], mon: Monoid[V]) extends Matrix2[R, C, V] { +final case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], mon: Monoid[V]) + extends Matrix2[R, C, V] { def collectAddends(sum: Sum[R, C, V]): List[TypedPipe[(R, C, V)]] = { - def getLiteral(mat: Matrix2[R, C, V]): TypedPipe[(R, C, V)] = { + def getLiteral(mat: Matrix2[R, C, V]): TypedPipe[(R, C, V)] = mat match { - case x @ Product(_, _, _, _) => x.toOuterSum - case x @ MatrixLiteral(_, _) => x.toTypedPipe + case x @ Product(_, _, _, _) => x.toOuterSum + case x @ MatrixLiteral(_, _) => x.toTypedPipe case x @ HadamardProduct(_, _, _) => x.optimizedSelf.toTypedPipe - case _ => sys.error("Invalid addend") + case _ => sys.error("Invalid addend") } - } sum match { case Sum(l @ Sum(_, _, _), r @ Sum(_, _, _), _) => { @@ -401,59 +443,65 @@ case class Sum[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], mon: Mo } } - override lazy val toTypedPipe: TypedPipe[(R, C, V)] = { + override lazy val toTypedPipe: TypedPipe[(R, C, V)] = if (left.equals(right)) { left.optimizedSelf.toTypedPipe.map(v => (v._1, v._2, mon.plus(v._3, v._3))) } else { - val ord: Ordering[(R, C)] = Ordering.Tuple2(left.rowOrd, left.colOrd) collectAddends(this) .reduce((x, y) => x ++ y) - .groupBy(x => (x._1, x._2))(ord).mapValues { _._3 } + .groupBy(x => (x._1, x._2)) + .mapValues(_._3) .sum(mon) - .filter { kv => mon.isNonZero(kv._2) } + .filter(kv => mon.isNonZero(kv._2)) .map { case ((r, c), v) => (r, c, v) } } - } override val sizeHint = left.sizeHint + right.sizeHint implicit override val rowOrd: Ordering[R] = left.rowOrd implicit override val colOrd: Ordering[C] = left.colOrd + implicit def withOrderedSerialization: Ordering[(R, C)] = + 
OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) + override lazy val transpose: Sum[C, R, V] = Sum(left.transpose, right.transpose, mon) override def negate(implicit g: Group[V]): Sum[R, C, V] = Sum(left.negate, right.negate, mon) override def sumColVectors(implicit ring: Ring[V], mj: MatrixJoiner2): Matrix2[R, Unit, V] = Sum(left.sumColVectors, right.sumColVectors, mon) - override def trace(implicit mon: Monoid[V], ev: =:=[R,C]): Scalar2[V] = - Scalar2(collectAddends(this).map { pipe => - pipe.asInstanceOf[TypedPipe[(R, R, V)]] - .filter { case (r, c, v) => Ordering[R].equiv(r, c) } - .map { _._3 } - }.reduce(_ ++ _).sum) + override def trace(implicit mon: Monoid[V], ev: =:=[R, C]): Scalar2[V] = + Scalar2( + collectAddends(this) + .map { pipe => + pipe + .asInstanceOf[TypedPipe[(R, R, V)]] + .filter { case (r, c, v) => Ordering[R].equiv(r, c) } + .map(_._3) + } + .reduce(_ ++ _) + .sum + ) } -case class HadamardProduct[R, C, V](left: Matrix2[R, C, V], - right: Matrix2[R, C, V], - ring: Ring[V]) extends Matrix2[R, C, V] { +final case class HadamardProduct[R, C, V](left: Matrix2[R, C, V], right: Matrix2[R, C, V], ring: Ring[V]) + extends Matrix2[R, C, V] { // TODO: optimize / combine with Sums: https://github.com/tomtau/scalding/issues/14#issuecomment-22971582 - override lazy val toTypedPipe: TypedPipe[(R, C, V)] = { + override lazy val toTypedPipe: TypedPipe[(R, C, V)] = if (left.equals(right)) { left.optimizedSelf.toTypedPipe.map(v => (v._1, v._2, ring.times(v._3, v._3))) } else { - val ord: Ordering[(R, C)] = Ordering.Tuple2(left.rowOrd, left.colOrd) // tracking values which were reduced (multiplied by non-zero) or non-reduced (multiplied by zero) with a boolean (left.optimizedSelf.toTypedPipe.map { case (r, c, v) => (r, c, (v, false)) } ++ right.optimizedSelf.toTypedPipe.map { case (r, c, v) => (r, c, (v, false)) }) - .groupBy(x => (x._1, x._2))(ord) - .mapValues { _._3 } + .groupBy(x => (x._1, x._2)) + .mapValues(_._3) .reduce((x, y) => 
(ring.times(x._1, y._1), true)) - .filter { kv => kv._2._2 && ring.isNonZero(kv._2._1) } + .filter(kv => kv._2._2 && ring.isNonZero(kv._2._1)) .map { case ((r, c), v) => (r, c, v._1) } } - } - override lazy val transpose: MatrixLiteral[C, R, V] = MatrixLiteral(toTypedPipe.map(x => (x._2, x._1, x._3)), sizeHint.transpose)(colOrd, rowOrd) + override lazy val transpose: MatrixLiteral[C, R, V] = + MatrixLiteral(toTypedPipe.map(x => (x._2, x._1, x._3)), sizeHint.transpose)(colOrd, rowOrd) override val sizeHint = left.sizeHint #*# right.sizeHint override def negate(implicit g: Group[V]): HadamardProduct[R, C, V] = if (left.sizeHint.total.getOrElse(BigInt(0L)) > right.sizeHint.total.getOrElse(BigInt(0L))) @@ -463,11 +511,15 @@ case class HadamardProduct[R, C, V](left: Matrix2[R, C, V], implicit override val rowOrd: Ordering[R] = left.rowOrd implicit override val colOrd: Ordering[C] = left.colOrd + implicit def withOrderedSerialization: Ordering[(R, C)] = + OrderedSerialization2.maybeOrderedSerialization2(rowOrd, colOrd) } -case class MatrixLiteral[R, C, V](override val toTypedPipe: TypedPipe[(R, C, V)], - override val sizeHint: SizeHint)(implicit override val rowOrd: Ordering[R], override val colOrd: Ordering[C]) - extends Matrix2[R, C, V] { +final case class MatrixLiteral[R, C, V]( + override val toTypedPipe: TypedPipe[(R, C, V)], + override val sizeHint: SizeHint +)(implicit override val rowOrd: Ordering[R], override val colOrd: Ordering[C]) + extends Matrix2[R, C, V] { override lazy val transpose: MatrixLiteral[C, R, V] = MatrixLiteral(toTypedPipe.map(x => (x._2, x._1, x._3)), sizeHint.transpose)(colOrd, rowOrd) @@ -476,22 +528,22 @@ case class MatrixLiteral[R, C, V](override val toTypedPipe: TypedPipe[(R, C, V)] MatrixLiteral(toTypedPipe.map(x => (x._1, x._2, g.negate(x._3))), sizeHint) } -/** A representation of a scalar value that can be used with Matrices +/** + * A representation of a scalar value that can be used with Matrices */ trait Scalar2[V] extends 
Serializable { def value: ValuePipe[V] - def +(that: Scalar2[V])(implicit sg: Semigroup[V]): Scalar2[V] = { + def +(that: Scalar2[V])(implicit sg: Semigroup[V]): Scalar2[V] = (value, that.value) match { - case (EmptyValue(), _) => that + case (EmptyValue, _) => that case (LiteralValue(v1), _) => that.map(sg.plus(v1, _)) - case (_, EmptyValue()) => this + case (_, EmptyValue) => this case (_, LiteralValue(v2)) => map(sg.plus(_, v2)) // TODO: optimize sums of scalars like sums of matrices: // only one M/R pass for the whole Sum. case (_, ComputedValue(v2)) => Scalar2((value ++ v2).sum(sg)) } - } def -(that: Scalar2[V])(implicit g: Group[V]): Scalar2[V] = this + that.map(x => g.negate(x)) def *(that: Scalar2[V])(implicit ring: Ring[V]): Scalar2[V] = Scalar2(ValuePipe.fold(value, that.value)(ring.times _)) @@ -501,7 +553,7 @@ trait Scalar2[V] extends Serializable { def *[R, C](that: Matrix2[R, C, V])(implicit ring: Ring[V], mj: MatrixJoiner2): Matrix2[R, C, V] = that match { - case p@Product(left, right, _, expressions) => + case p @ Product(left, right, _, expressions) => if (left.sizeHint.total.getOrElse(BigInt(0L)) > right.sizeHint.total.getOrElse(BigInt(0L))) Product(left, (this * right), ring, expressions)(p.joiner) else @@ -512,7 +564,7 @@ trait Scalar2[V] extends Serializable { else HadamardProduct(this * left, right, ring) case s @ Sum(left, right, mon) => Sum(this * left, this * right, mon) - case m @ MatrixLiteral(_, _) => timesLiteral(m) // handle literals here + case m @ MatrixLiteral(_, _) => timesLiteral(m) // handle literals here case x @ OneC() => Product(OneC[Unit, V](), toMatrix, ring) .asInstanceOf[Matrix2[R, C, V]] @@ -545,7 +597,7 @@ trait Scalar2[V] extends Serializable { // TODO: FunctionMatrix[R,C,V](fn: (R,C) => V) and a Literal scalar is just: FuctionMatrix[Unit, Unit, V]({ (_, _) => v }) } -case class ValuePipeScalar[V](override val value: ValuePipe[V]) extends Scalar2[V] +final case class ValuePipeScalar[V](override val value: ValuePipe[V]) 
extends Scalar2[V] object Scalar2 { // implicits cannot share names @@ -553,33 +605,37 @@ object Scalar2 { def apply[V](v: ValuePipe[V]): Scalar2[V] = ValuePipeScalar(v) // implicits can't share names, but we want the implicit - implicit def const[V](v: V)(implicit fd: FlowDef, m: Mode): Scalar2[V] = + implicit def const[V](v: V): Scalar2[V] = from(LiteralValue(v)) - def apply[V](v: V)(implicit fd: FlowDef, m: Mode): Scalar2[V] = + def apply[V](v: V): Scalar2[V] = from(LiteralValue(v)) } object Matrix2 { - def apply[R:Ordering, C: Ordering, V](t: TypedPipe[(R, C, V)], hint: SizeHint): Matrix2[R, C, V] = + def apply[R: Ordering, C: Ordering, V](t: TypedPipe[(R, C, V)], hint: SizeHint): Matrix2[R, C, V] = MatrixLiteral(t, hint) - def read[R, C, V](t: TypedSource[(R, C, V)], - hint: SizeHint)(implicit ordr: Ordering[R], - ordc: Ordering[C], fd: FlowDef, m: Mode): Matrix2[R, C, V] = + def read[R, C, V](t: Input[(R, C, V)], hint: SizeHint)(implicit + ordr: Ordering[R], + ordc: Ordering[C] + ): Matrix2[R, C, V] = MatrixLiteral(TypedPipe.from(t), hint) def J[R, C, V](implicit ordR: Ordering[R], ordC: Ordering[C], ring: Ring[V], mj: MatrixJoiner2) = Product(OneC[R, V]()(ordR), OneR[C, V]()(ordC), ring) /** - * The original prototype that employs the standard O(n^3) dynamic programming - * procedure to optimize a matrix chain factorization. + * The original prototype that employs the standard O(n^3) dynamic programming procedure to optimize a + * matrix chain factorization. * - * Now, it also "prefers" more spread out / bushy / less deep factorization - * which reflects more the Map/Reduce nature. + * Now, it also "prefers" more spread out / bushy / less deep factorization which reflects more the + * Map/Reduce nature. 
*/ - def optimizeProductChain[V](p: IndexedSeq[Matrix2[Any, Any, V]], product: Option[(Ring[V], MatrixJoiner2)]): (BigInt, Matrix2[Any, Any, V]) = { + def optimizeProductChain[V]( + p: IndexedSeq[Matrix2[Any, Any, V]], + product: Option[(Ring[V], MatrixJoiner2)] + ): (BigInt, Matrix2[Any, Any, V]) = { val subchainCosts = HashMap.empty[(Int, Int), BigInt] @@ -606,18 +662,20 @@ object Matrix2 { val sharedMap = HashMap.empty[Matrix2[Any, Any, V], TypedPipe[(Any, Any, V)]] - def generatePlan(i: Int, j: Int): Matrix2[Any, Any, V] = { + /* The only case where `product` will be `None` is if the result is an + * intermediate matrix (like `OneC`). This is not yet forbidden in the types. + */ + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def generatePlan(i: Int, j: Int): Matrix2[Any, Any, V] = if (i == j) p(i) else { val k = splitMarkers((i, j)) - val left = generatePlan(i, k) - val right = generatePlan(k + 1, j) + val left = generatePlan(i, k) // linter:ignore + val right = generatePlan(k + 1, j) // linter:ignore val (ring, joiner) = product.get Product(left, right, ring, Some(sharedMap))(joiner) } - } - val best = computeCosts(p, 0, p.length - 1) (best, generatePlan(0, p.length - 1)) @@ -625,27 +683,30 @@ object Matrix2 { /** * This function walks the input tree, finds basic blocks to optimize, - * i.e. matrix product chains that are not interrupted by summations. - * One example: - * A*B*C*(D+E)*(F*G) => "basic blocks" are ABC, D, E, and FG + * i.e. matrix product chains that are not interrupted by summations. One example: A*B*C*(D+E)*(F*G) => + * "basic blocks" are ABC, D, E, and FG * - * + it now does "global" optimization - i.e. over optimize over basic blocks. - * In the above example, we'd treat (D+E) as a temporary matrix T and optimize the whole chain ABCTFG + * + it now does "global" optimization - i.e. over optimize over basic blocks. 
In the above example, we'd + * treat (D+E) as a temporary matrix T and optimize the whole chain ABCTFG * - * Not sure if making use of distributivity to generate more variants would be good. - * In the above example, we could also generate ABCDFG + ABCEFG and have basic blocks: ABCDFG, and ABCEFG. - * But this would be almost twice as much work with the current cost estimation. + * Not sure if making use of distributivity to generate more variants would be good. In the above example, + * we could also generate ABCDFG + ABCEFG and have basic blocks: ABCDFG, and ABCEFG. But this would be + * almost twice as much work with the current cost estimation. */ def optimize[V](mf: Matrix2[Any, Any, V]): (BigInt, Matrix2[Any, Any, V]) = { - def pair[X,Y](x: Option[X], y: Option[Y]): Option[(X,Y)] = - for { xi <- x; yi <- y } yield (xi, yi) + def pair[X, Y](x: Option[X], y: Option[Y]): Option[(X, Y)] = + for { + xi <- x + yi <- y + } yield (xi, yi) /** * Recursive function - returns a flatten product chain and optimizes product chains under sums */ - def optimizeBasicBlocks(mf: Matrix2[Any, Any, V]): (List[Matrix2[Any, Any, V]], BigInt, Option[Ring[V]], Option[MatrixJoiner2]) = { - + def optimizeBasicBlocks( + mf: Matrix2[Any, Any, V] + ): (List[Matrix2[Any, Any, V]], BigInt, Option[Ring[V]], Option[MatrixJoiner2]) = mf match { // basic block of one matrix case element @ MatrixLiteral(_, _) => (List(element), 0, None, None) @@ -653,25 +714,33 @@ object Matrix2 { case Sum(left, right, mon) => { val (lastLChain, lastCost1, ringL, joinerL) = optimizeBasicBlocks(left) val (lastRChain, lastCost2, ringR, joinerR) = optimizeBasicBlocks(right) - val (cost1, newLeft) = optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) - val (cost2, newRight) = optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) - (List(Sum(newLeft, newRight, mon)), + val (cost1, newLeft) = + optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) // linter:ignore + val 
(cost2, newRight) = + optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) // linter:ignore + ( + List(Sum(newLeft, newRight, mon)), lastCost1 + lastCost2 + cost1 + cost2, ringL.orElse(ringR), - joinerL.orElse(joinerR)) + joinerL.orElse(joinerR) + ) } case HadamardProduct(left, right, ring) => { val (lastLChain, lastCost1, ringL, joinerL) = optimizeBasicBlocks(left) val (lastRChain, lastCost2, ringR, joinerR) = optimizeBasicBlocks(right) - val (cost1, newLeft) = optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) - val (cost2, newRight) = optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) - (List(HadamardProduct(newLeft, newRight, ring)), + val (cost1, newLeft) = + optimizeProductChain(lastLChain.toIndexedSeq, pair(ringL, joinerL)) // linter:ignore + val (cost2, newRight) = + optimizeProductChain(lastRChain.toIndexedSeq, pair(ringR, joinerR)) // linter:ignore + ( + List(HadamardProduct(newLeft, newRight, ring)), lastCost1 + lastCost2 + cost1 + cost2, ringL.orElse(ringR), - joinerL.orElse(joinerR)) + joinerL.orElse(joinerR) + ) } // chain (...something...)*(...something...) 
- case p@Product(left, right, ring, _) => { + case p @ Product(left, right, ring, _) => { val (lastLChain, lastCost1, ringL, joinerL) = optimizeBasicBlocks(left) val (lastRChain, lastCost2, ringR, joinerR) = optimizeBasicBlocks(right) (lastLChain ++ lastRChain, lastCost1 + lastCost2, Some(ring), Some(p.joiner)) @@ -679,9 +748,9 @@ object Matrix2 { // OneC, OneR and potentially other intermediate matrices case el => (List(el), 0, None, None) } - } val (lastChain, lastCost, ring, joiner) = optimizeBasicBlocks(mf) - val (potentialCost, finalResult) = optimizeProductChain(lastChain.toIndexedSeq, pair(ring, joiner)) + val (potentialCost, finalResult) = + optimizeProductChain(lastChain.toIndexedSeq, pair(ring, joiner)) // linter:ignore (lastCost + potentialCost, finalResult) } } diff --git a/scalding-base/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala b/scalding-base/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala new file mode 100644 index 0000000000..d8408358ed --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala @@ -0,0 +1,155 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.mathematics + +object SizeHint { + implicit val ordering: Ordering[SizeHint] = SizeHintOrdering + // Return a sparsity assuming all the diagonal is present, but nothing else + def asDiagonal(h: SizeHint): SizeHint = { + def make(r: BigInt, c: BigInt) = + h.total + .map { tot => + val maxElements = r.min(c) + val sparsity = 1.0 / maxElements.doubleValue + SparseHint(sparsity, maxElements, maxElements) + } + .getOrElse(NoClue) + h match { + case NoClue => NoClue + case FiniteHint(r, c) => make(r, c) + case SparseHint(sp, r, c) => make(r, c) + } + } +} + +sealed abstract class SizeHint { + def *(other: SizeHint): SizeHint + def +(other: SizeHint): SizeHint + // for estimating the Hadamard product + def #*#(other: SizeHint): SizeHint + def total: Option[BigInt] + def setCols(cols: Long): SizeHint + def setRows(rows: Long): SizeHint + def setColsToRows: SizeHint + def setRowsToCols: SizeHint + def transpose: SizeHint +} + +// If we have no idea, we still don't have any idea, this is like NaN +case object NoClue extends SizeHint { + def *(other: SizeHint) = NoClue + def +(other: SizeHint) = NoClue + def #*#(other: SizeHint) = NoClue + def total = None + def setCols(cols: Long) = FiniteHint(-1L, cols) + def setRows(rows: Long) = FiniteHint(rows, -1L) + def setColsToRows = NoClue + def setRowsToCols = NoClue + def transpose = NoClue +} + +final case class FiniteHint(rows: BigInt = -1L, cols: BigInt = -1L) extends SizeHint { + def *(other: SizeHint) = + other match { + case NoClue => NoClue + case FiniteHint(orows, ocols) => FiniteHint(rows, ocols) + case sp @ SparseHint(_, _, _) => (SparseHint(1.0, rows, cols) * sp) + } + def +(other: SizeHint) = + other match { + case NoClue => NoClue + // In this case, a hint on one side, will overwrite lack of knowledge (-1L) + case FiniteHint(orows, ocols) => FiniteHint(rows.max(orows), cols.max(ocols)) + case sp @ SparseHint(_, _, _) => (sp + this) + } + def #*#(other: SizeHint) = + other match { + 
case NoClue => NoClue + // In this case, a hint on one side, will overwrite lack of knowledge (-1L) + case FiniteHint(orows, ocols) => FiniteHint(rows.min(orows), cols.min(ocols)) + case sp @ SparseHint(_, _, _) => (sp #*# this) + } + def total = if (rows >= 0 && cols >= 0) { Some(rows * cols) } + else None + def setCols(ncols: Long) = FiniteHint(rows, ncols) + def setRows(nrows: Long) = FiniteHint(nrows, cols) + def setColsToRows = FiniteHint(rows, rows) + def setRowsToCols = FiniteHint(cols, cols) + def transpose = FiniteHint(cols, rows) +} + +// sparsity is the fraction of the rows and columns that are expected to be present +final case class SparseHint(sparsity: Double, rows: BigInt, cols: BigInt) extends SizeHint { + def *(other: SizeHint): SizeHint = + other match { + case NoClue => NoClue + case FiniteHint(r, c) => (this * SparseHint(1.0, r, c)) + case SparseHint(sp, r, c) => { + // if I occupy a bin with probability p, and you q, then both: pq + // There are cols samples of the, above, so the probability one is present: + // 1-(1-pq)^cols ~ (cols * p * q) min 1.0 + val newSp = BigDecimal(cols) * sp * sparsity + if (newSp >= 1.0) { + FiniteHint(rows, c) + } else { + SparseHint(newSp.toDouble, rows, c) + } + } + } + def +(other: SizeHint): SizeHint = + other match { + case NoClue => NoClue + case FiniteHint(r, c) => (this + SparseHint(1.0, r, c)) + case SparseHint(sp, r, c) => { + // if I occupy a bin with probability p, and you q, then either: p + q - pq + if ((sparsity == 1.0) || (sp == 1.0)) { + FiniteHint(rows.max(r), cols.max(c)) + } else { + val newSp = sparsity + sp - sp * sparsity + SparseHint(newSp, rows.max(r), cols.max(c)) + } + } + } + def #*#(other: SizeHint): SizeHint = + other match { + case NoClue => NoClue + case FiniteHint(r, c) => (this #*# SparseHint(1.0, r, c)) + case SparseHint(sp, r, c) => { + val newSp = sp.min(sparsity) + SparseHint(newSp, rows.min(r), cols.min(c)) + } + } + def total: Option[BigInt] = + if ((rows >= 0) && (cols >= 
0)) { + Some((BigDecimal(rows) * BigDecimal(cols) * sparsity).toBigInt) + } else + None + def setCols(c: Long): SizeHint = copy(cols = c) + def setRows(r: Long): SizeHint = copy(rows = r) + def setColsToRows: SizeHint = copy(cols = rows) + def setRowsToCols: SizeHint = copy(rows = cols) + def transpose: SizeHint = copy(cols = rows, rows = cols) +} + +/** + * Allows us to sort matrices by approximate type + */ +object SizeHintOrdering extends Ordering[SizeHint] with java.io.Serializable { + def compare(left: SizeHint, right: SizeHint): Int = + left.total + .getOrElse(BigInt(-1L)) + .compare(right.total.getOrElse(BigInt(-1L))) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala new file mode 100644 index 0000000000..ea52d42fc4 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala @@ -0,0 +1,1792 @@ +// following were autogenerated by ./codegen/flatten_group_generator.rb at Mon Dec 01 19:29:52 -0800 2014 do not edit +package com.twitter.scalding.typed + +/** + * Autogenerated methods for flattening the nested value tuples that result after joining many pipes together. + * These methods can be used directly, or via the joins available in MultiJoin. 
+ */ +object FlattenGroup { + val pairOfNones = (None, None) + + // methods for flattening results of join / leftJoin + + def flattenNestedTuple[A, B, C](nested: ((A, B), C)): (A, B, C) = { + val ((a, b), c) = nested + (a, b, c) + } + + class FlattenLeftJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, ((A, B), C)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } + } + + implicit def toFlattenLeftJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, ((A, B), C)] + ): FlattenGroup.FlattenLeftJoin3[KEY, KLL, A, B, C] = new FlattenLeftJoin3(nested) + + def flattenNestedTuple[A, B, C, D](nested: (((A, B), C), D)): (A, B, C, D) = { + val (((a, b), c), d) = nested + (a, b, c, d) + } + + class FlattenLeftJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (((A, B), C), D)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C, D)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } + } + + implicit def toFlattenLeftJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (((A, B), C), D)] + ): FlattenGroup.FlattenLeftJoin4[KEY, KLL, A, B, C, D] = new FlattenLeftJoin4(nested) + + def flattenNestedTuple[A, B, C, D, E](nested: ((((A, B), C), D), E)): (A, B, C, D, E) = { + val ((((a, b), c), d), e) = nested + (a, b, c, d, e) + } + + class FlattenLeftJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E]( + nested: KLL[KEY, ((((A, B), C), D), E)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C, D, E)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } + } + + implicit def toFlattenLeftJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E]( + nested: KLL[KEY, ((((A, B), C), D), E)] + ): FlattenGroup.FlattenLeftJoin5[KEY, KLL, A, B, C, D, E] = 
new FlattenLeftJoin5(nested) + + def flattenNestedTuple[A, B, C, D, E, F](nested: (((((A, B), C), D), E), F)): (A, B, C, D, E, F) = { + val (((((a, b), c), d), e), f) = nested + (a, b, c, d, e, f) + } + + class FlattenLeftJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F]( + nested: KLL[KEY, (((((A, B), C), D), E), F)] + ) { + def flattenValueTuple: KLL[KEY, (A, B, C, D, E, F)] = nested.mapValues { tup => + FlattenGroup.flattenNestedTuple(tup) + } + } + + implicit def toFlattenLeftJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[ + KLL_K, + KLL_V, + KLL + ], A, B, C, D, E, F]( + nested: KLL[KEY, (((((A, B), C), D), E), F)] + ): FlattenGroup.FlattenLeftJoin6[KEY, KLL, A, B, C, D, E, F] = new FlattenLeftJoin6(nested) + + def flattenNestedTuple[A, B, C, D, E, F, G]( + nested: ((((((A, B), C), D), E), F), G) + ): (A, B, C, D, E, F, G) = { + val ((((((a, b), c), d), e), f), g) = nested + (a, b, c, d, e, f, g) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H]( + nested: (((((((A, B), C), D), E), F), G), H) + ): (A, B, C, D, E, F, G, H) = { + val (((((((a, b), c), d), e), f), g), h) = nested + (a, b, c, d, e, f, g, h) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I]( + nested: ((((((((A, B), C), D), E), F), G), H), I) + ): (A, B, C, D, E, F, G, H, I) = { + val ((((((((a, b), c), d), e), f), g), h), i) = nested + (a, b, c, d, e, f, g, h, i) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J]( + nested: (((((((((A, B), C), D), E), F), G), H), I), J) + ): (A, B, C, D, E, F, G, H, I, J) = { + val (((((((((a, b), c), d), e), f), g), h), i), j) = nested + (a, b, c, d, e, f, g, h, i, j) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K]( + nested: ((((((((((A, B), C), D), E), F), G), H), I), J), K) + ): (A, B, C, D, E, F, G, H, I, J, K) = { + val ((((((((((a, b), c), d), e), f), g), h), i), j), k) = nested + (a, b, c, d, e, f, g, h, i, j, k) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L]( + 
nested: (((((((((((A, B), C), D), E), F), G), H), I), J), K), L) + ): (A, B, C, D, E, F, G, H, I, J, K, L) = { + val (((((((((((a, b), c), d), e), f), g), h), i), j), k), l) = nested + (a, b, c, d, e, f, g, h, i, j, k, l) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M]( + nested: ((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M) = { + val ((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + nested: (((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N) = { + val (((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + nested: ((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) = { + val ((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + nested: (((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) = { + val (((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + nested: ((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q) = { + val ((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q) + } + + def 
flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + nested: (((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R) = { + val (((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + nested: ((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S) = { + val ((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + nested: (((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T) = { + val (((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t) = + nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + nested: ( + (((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), T), + U + ) + ): (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U) = { + val ( + (((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t), + u + ) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u) + } + + def flattenNestedTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + nested: ( + ( + ( + ((((((((((((((((((A, B), C), D), E), F), G), H), I), J), K), L), M), N), O), P), Q), R), S), + T + ), + U + ), + V + ) + ): (A, B, C, 
D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V) = { + val ( + ((((((((((((((((((((a, b), c), d), e), f), g), h), i), j), k), l), m), n), o), p), q), r), s), t), u), + v + ) = nested + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v) + } + + // methods for flattening results of outerJoin + + def flattenNestedOptionTuple[A, B, C]( + nested: (Option[(Option[A], Option[B])], Option[C]) + ): (Option[A], Option[B], Option[C]) = { + val (rest1, c) = nested + val (a, b) = rest1.getOrElse(pairOfNones) + (a, b, c) + } + + class FlattenOuterJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, (Option[(Option[A], Option[B])], Option[C])] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C])] = nested.mapValues { tup => + FlattenGroup.flattenNestedOptionTuple(tup) + } + } + + implicit def toFlattenOuterJoin3[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C]( + nested: KLL[KEY, (Option[(Option[A], Option[B])], Option[C])] + ): FlattenGroup.FlattenOuterJoin3[KEY, KLL, A, B, C] = new FlattenOuterJoin3(nested) + + def flattenNestedOptionTuple[A, B, C, D]( + nested: (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D]) + ): (Option[A], Option[B], Option[C], Option[D]) = { + val (rest1, d) = nested + val (rest2, c) = rest1.getOrElse(pairOfNones) + val (a, b) = rest2.getOrElse(pairOfNones) + (a, b, c, d) + } + + class FlattenOuterJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D])] = nested.mapValues { tup => + FlattenGroup.flattenNestedOptionTuple(tup) + } + } + + implicit def toFlattenOuterJoin4[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D]( + nested: KLL[KEY, (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])] + ): 
FlattenGroup.FlattenOuterJoin4[KEY, KLL, A, B, C, D] = new FlattenOuterJoin4(nested) + + def flattenNestedOptionTuple[A, B, C, D, E]( + nested: (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E]) + ): (Option[A], Option[B], Option[C], Option[D], Option[E]) = { + val (rest1, e) = nested + val (rest2, d) = rest1.getOrElse(pairOfNones) + val (rest3, c) = rest2.getOrElse(pairOfNones) + val (a, b) = rest3.getOrElse(pairOfNones) + (a, b, c, d, e) + } + + class FlattenOuterJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E]( + nested: KLL[KEY, (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E])] = + nested.mapValues(tup => FlattenGroup.flattenNestedOptionTuple(tup)) + } + + implicit def toFlattenOuterJoin5[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[ + KLL_K, + KLL_V, + KLL + ], A, B, C, D, E]( + nested: KLL[KEY, (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])] + ): FlattenGroup.FlattenOuterJoin5[KEY, KLL, A, B, C, D, E] = new FlattenOuterJoin5(nested) + + def flattenNestedOptionTuple[A, B, C, D, E, F]( + nested: ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F]) = { + val (rest1, f) = nested + val (rest2, e) = rest1.getOrElse(pairOfNones) + val (rest3, d) = rest2.getOrElse(pairOfNones) + val (rest4, c) = rest3.getOrElse(pairOfNones) + val (a, b) = rest4.getOrElse(pairOfNones) + (a, b, c, d, e, f) + } + + class FlattenOuterJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], A, B, C, D, E, F]( + nested: KLL[ + KEY, + ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ] + ) { + def flattenValueTuple: KLL[KEY, (Option[A], Option[B], 
Option[C], Option[D], Option[E], Option[F])] = + nested.mapValues(tup => FlattenGroup.flattenNestedOptionTuple(tup)) + } + + implicit def toFlattenOuterJoin6[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[ + KLL_K, + KLL_V, + KLL + ], A, B, C, D, E, F]( + nested: KLL[ + KEY, + ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ] + ): FlattenGroup.FlattenOuterJoin6[KEY, KLL, A, B, C, D, E, F] = new FlattenOuterJoin6(nested) + + def flattenNestedOptionTuple[A, B, C, D, E, F, G]( + nested: ( + Option[ + ( + Option[(Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E])], + Option[F] + ) + ], + Option[G] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G]) = { + val (rest1, g) = nested + val (rest2, f) = rest1.getOrElse(pairOfNones) + val (rest3, e) = rest2.getOrElse(pairOfNones) + val (rest4, d) = rest3.getOrElse(pairOfNones) + val (rest5, c) = rest4.getOrElse(pairOfNones) + val (a, b) = rest5.getOrElse(pairOfNones) + (a, b, c, d, e, f, g) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + (Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], Option[E]) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H]) = { + val (rest1, h) = nested + val (rest2, g) = rest1.getOrElse(pairOfNones) + val (rest3, f) = rest2.getOrElse(pairOfNones) + val (rest4, e) = rest3.getOrElse(pairOfNones) + val (rest5, d) = rest4.getOrElse(pairOfNones) + val (rest6, c) = rest5.getOrElse(pairOfNones) + val (a, b) = rest6.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[(Option[A], Option[B])], Option[C])], Option[D])], + Option[E] + ) + 
], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ): (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) = { + val (rest1, i) = nested + val (rest2, h) = rest1.getOrElse(pairOfNones) + val (rest3, g) = rest2.getOrElse(pairOfNones) + val (rest4, f) = rest3.getOrElse(pairOfNones) + val (rest5, e) = rest4.getOrElse(pairOfNones) + val (rest6, d) = rest5.getOrElse(pairOfNones) + val (rest7, c) = rest6.getOrElse(pairOfNones) + val (a, b) = rest7.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + (Option[(Option[(Option[A], Option[B])], Option[C])], Option[D]) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J] + ) = { + val (rest1, j) = nested + val (rest2, i) = rest1.getOrElse(pairOfNones) + val (rest3, h) = rest2.getOrElse(pairOfNones) + val (rest4, g) = rest3.getOrElse(pairOfNones) + val (rest5, f) = rest4.getOrElse(pairOfNones) + val (rest6, e) = rest5.getOrElse(pairOfNones) + val (rest7, d) = rest6.getOrElse(pairOfNones) + val (rest8, c) = rest7.getOrElse(pairOfNones) + val (a, b) = rest8.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], 
+ Option[G], + Option[H], + Option[I], + Option[J], + Option[K] + ) = { + val (rest1, k) = nested + val (rest2, j) = rest1.getOrElse(pairOfNones) + val (rest3, i) = rest2.getOrElse(pairOfNones) + val (rest4, h) = rest3.getOrElse(pairOfNones) + val (rest5, g) = rest4.getOrElse(pairOfNones) + val (rest6, f) = rest5.getOrElse(pairOfNones) + val (rest7, e) = rest6.getOrElse(pairOfNones) + val (rest8, d) = rest7.getOrElse(pairOfNones) + val (rest9, c) = rest8.getOrElse(pairOfNones) + val (a, b) = rest9.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L] + ) = { + val (rest1, l) = nested + val (rest2, k) = rest1.getOrElse(pairOfNones) + val (rest3, j) = rest2.getOrElse(pairOfNones) + val (rest4, i) = rest3.getOrElse(pairOfNones) + val (rest5, h) = rest4.getOrElse(pairOfNones) + val (rest6, g) = rest5.getOrElse(pairOfNones) + val (rest7, f) = rest6.getOrElse(pairOfNones) + val (rest8, e) = rest7.getOrElse(pairOfNones) + val (rest9, d) = rest8.getOrElse(pairOfNones) + val (rest10, c) = rest9.getOrElse(pairOfNones) + val (a, b) = rest10.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + (Option[(Option[A], Option[B])], Option[C]) + ], + Option[D] 
+ ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M] + ) = { + val (rest1, m) = nested + val (rest2, l) = rest1.getOrElse(pairOfNones) + val (rest3, k) = rest2.getOrElse(pairOfNones) + val (rest4, j) = rest3.getOrElse(pairOfNones) + val (rest5, i) = rest4.getOrElse(pairOfNones) + val (rest6, h) = rest5.getOrElse(pairOfNones) + val (rest7, g) = rest6.getOrElse(pairOfNones) + val (rest8, f) = rest7.getOrElse(pairOfNones) + val (rest9, e) = rest8.getOrElse(pairOfNones) + val (rest10, d) = rest9.getOrElse(pairOfNones) + val (rest11, c) = rest10.getOrElse(pairOfNones) + val (a, b) = rest11.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[A], Option[B])], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N] + ) = { + val (rest1, n) = nested + val (rest2, m) = rest1.getOrElse(pairOfNones) + val (rest3, l) = rest2.getOrElse(pairOfNones) + val (rest4, k) = rest3.getOrElse(pairOfNones) + val (rest5, j) = rest4.getOrElse(pairOfNones) + val (rest6, i) = rest5.getOrElse(pairOfNones) + val (rest7, h) = 
rest6.getOrElse(pairOfNones) + val (rest8, g) = rest7.getOrElse(pairOfNones) + val (rest9, f) = rest8.getOrElse(pairOfNones) + val (rest10, e) = rest9.getOrElse(pairOfNones) + val (rest11, d) = rest10.getOrElse(pairOfNones) + val (rest12, c) = rest11.getOrElse(pairOfNones) + val (a, b) = rest12.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + (Option[A], Option[B]) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O] + ) = { + val (rest1, o) = nested + val (rest2, n) = rest1.getOrElse(pairOfNones) + val (rest3, m) = rest2.getOrElse(pairOfNones) + val (rest4, l) = rest3.getOrElse(pairOfNones) + val (rest5, k) = rest4.getOrElse(pairOfNones) + val (rest6, j) = rest5.getOrElse(pairOfNones) + val (rest7, i) = rest6.getOrElse(pairOfNones) + val (rest8, h) = rest7.getOrElse(pairOfNones) + val (rest9, g) = rest8.getOrElse(pairOfNones) + val (rest10, f) = rest9.getOrElse(pairOfNones) + val (rest11, e) = rest10.getOrElse(pairOfNones) + val (rest12, d) = rest11.getOrElse(pairOfNones) + val (rest13, c) = rest12.getOrElse(pairOfNones) + val (a, b) = rest13.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( 
+ Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[A], + Option[B] + ) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P] + ) = { + val (rest1, p) = nested + val (rest2, o) = rest1.getOrElse(pairOfNones) + val (rest3, n) = rest2.getOrElse(pairOfNones) + val (rest4, m) = rest3.getOrElse(pairOfNones) + val (rest5, l) = rest4.getOrElse(pairOfNones) + val (rest6, k) = rest5.getOrElse(pairOfNones) + val (rest7, j) = rest6.getOrElse(pairOfNones) + val (rest8, i) = rest7.getOrElse(pairOfNones) + val (rest9, h) = rest8.getOrElse(pairOfNones) + val (rest10, g) = rest9.getOrElse(pairOfNones) + val (rest11, f) = rest10.getOrElse(pairOfNones) + val (rest12, e) = rest11.getOrElse(pairOfNones) + val (rest13, d) = rest12.getOrElse(pairOfNones) + val (rest14, c) = rest13.getOrElse(pairOfNones) + val (a, b) = rest14.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[A], + Option[B] + ) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + 
Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q] + ) = { + val (rest1, q) = nested + val (rest2, p) = rest1.getOrElse(pairOfNones) + val (rest3, o) = rest2.getOrElse(pairOfNones) + val (rest4, n) = rest3.getOrElse(pairOfNones) + val (rest5, m) = rest4.getOrElse(pairOfNones) + val (rest6, l) = rest5.getOrElse(pairOfNones) + val (rest7, k) = rest6.getOrElse(pairOfNones) + val (rest8, j) = rest7.getOrElse(pairOfNones) + val (rest9, i) = rest8.getOrElse(pairOfNones) + val (rest10, h) = rest9.getOrElse(pairOfNones) + val (rest11, g) = rest10.getOrElse(pairOfNones) + val (rest12, f) = rest11.getOrElse(pairOfNones) + val (rest13, e) = rest12.getOrElse(pairOfNones) + val (rest14, d) = rest13.getOrElse(pairOfNones) + val (rest15, c) = rest14.getOrElse(pairOfNones) + val (a, b) = rest15.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + A + ], + Option[ + B + ] + ) + ], + Option[C] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + 
Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R] + ) = { + val (rest1, r) = nested + val (rest2, q) = rest1.getOrElse(pairOfNones) + val (rest3, p) = rest2.getOrElse(pairOfNones) + val (rest4, o) = rest3.getOrElse(pairOfNones) + val (rest5, n) = rest4.getOrElse(pairOfNones) + val (rest6, m) = rest5.getOrElse(pairOfNones) + val (rest7, l) = rest6.getOrElse(pairOfNones) + val (rest8, k) = rest7.getOrElse(pairOfNones) + val (rest9, j) = rest8.getOrElse(pairOfNones) + val (rest10, i) = rest9.getOrElse(pairOfNones) + val (rest11, h) = rest10.getOrElse(pairOfNones) + val (rest12, g) = rest11.getOrElse(pairOfNones) + val (rest13, f) = rest12.getOrElse(pairOfNones) + val (rest14, e) = rest13.getOrElse(pairOfNones) + val (rest15, d) = rest14.getOrElse(pairOfNones) + val (rest16, c) = rest15.getOrElse(pairOfNones) + val (a, b) = rest16.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[A], + Option[B] + ) + ], + Option[ + C + ] + ) + ], + Option[D] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S] + ) = { + val (rest1, s) = nested + val (rest2, r) = 
rest1.getOrElse(pairOfNones) + val (rest3, q) = rest2.getOrElse(pairOfNones) + val (rest4, p) = rest3.getOrElse(pairOfNones) + val (rest5, o) = rest4.getOrElse(pairOfNones) + val (rest6, n) = rest5.getOrElse(pairOfNones) + val (rest7, m) = rest6.getOrElse(pairOfNones) + val (rest8, l) = rest7.getOrElse(pairOfNones) + val (rest9, k) = rest8.getOrElse(pairOfNones) + val (rest10, j) = rest9.getOrElse(pairOfNones) + val (rest11, i) = rest10.getOrElse(pairOfNones) + val (rest12, h) = rest11.getOrElse(pairOfNones) + val (rest13, g) = rest12.getOrElse(pairOfNones) + val (rest14, f) = rest13.getOrElse(pairOfNones) + val (rest15, e) = rest14.getOrElse(pairOfNones) + val (rest16, d) = rest15.getOrElse(pairOfNones) + val (rest17, c) = rest16.getOrElse(pairOfNones) + val (a, b) = rest17.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[A], Option[B])], + Option[C] + ) + ], + Option[ + D + ] + ) + ], + Option[E] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ], + Option[T] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T] + ) = { + val (rest1, t) = nested + val (rest2, s) = rest1.getOrElse(pairOfNones) + val (rest3, r) = 
rest2.getOrElse(pairOfNones) + val (rest4, q) = rest3.getOrElse(pairOfNones) + val (rest5, p) = rest4.getOrElse(pairOfNones) + val (rest6, o) = rest5.getOrElse(pairOfNones) + val (rest7, n) = rest6.getOrElse(pairOfNones) + val (rest8, m) = rest7.getOrElse(pairOfNones) + val (rest9, l) = rest8.getOrElse(pairOfNones) + val (rest10, k) = rest9.getOrElse(pairOfNones) + val (rest11, j) = rest10.getOrElse(pairOfNones) + val (rest12, i) = rest11.getOrElse(pairOfNones) + val (rest13, h) = rest12.getOrElse(pairOfNones) + val (rest14, g) = rest13.getOrElse(pairOfNones) + val (rest15, f) = rest14.getOrElse(pairOfNones) + val (rest16, e) = rest15.getOrElse(pairOfNones) + val (rest17, d) = rest16.getOrElse(pairOfNones) + val (rest18, c) = rest17.getOrElse(pairOfNones) + val (a, b) = rest18.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[ + E + ] + ) + ], + Option[F] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ], + Option[T] + ) + ], + Option[U] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U] + ) = { + val (rest1, u) = nested + val (rest2, t) = 
rest1.getOrElse(pairOfNones) + val (rest3, s) = rest2.getOrElse(pairOfNones) + val (rest4, r) = rest3.getOrElse(pairOfNones) + val (rest5, q) = rest4.getOrElse(pairOfNones) + val (rest6, p) = rest5.getOrElse(pairOfNones) + val (rest7, o) = rest6.getOrElse(pairOfNones) + val (rest8, n) = rest7.getOrElse(pairOfNones) + val (rest9, m) = rest8.getOrElse(pairOfNones) + val (rest10, l) = rest9.getOrElse(pairOfNones) + val (rest11, k) = rest10.getOrElse(pairOfNones) + val (rest12, j) = rest11.getOrElse(pairOfNones) + val (rest13, i) = rest12.getOrElse(pairOfNones) + val (rest14, h) = rest13.getOrElse(pairOfNones) + val (rest15, g) = rest14.getOrElse(pairOfNones) + val (rest16, f) = rest15.getOrElse(pairOfNones) + val (rest17, e) = rest16.getOrElse(pairOfNones) + val (rest18, d) = rest17.getOrElse(pairOfNones) + val (rest19, c) = rest18.getOrElse(pairOfNones) + val (a, b) = rest19.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u) + } + + def flattenNestedOptionTuple[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + nested: ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[ + ( + Option[(Option[(Option[A], Option[B])], Option[C])], + Option[D] + ) + ], + Option[E] + ) + ], + Option[ + F + ] + ) + ], + Option[G] + ) + ], + Option[H] + ) + ], + Option[I] + ) + ], + Option[J] + ) + ], + Option[K] + ) + ], + Option[L] + ) + ], + Option[M] + ) + ], + Option[N] + ) + ], + Option[O] + ) + ], + Option[P] + ) + ], + Option[Q] + ) + ], + Option[R] + ) + ], + Option[S] + ) + ], + Option[T] + ) + ], + Option[U] + ) + ], + Option[V] + ) + ): ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], 
+ Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U], + Option[V] + ) = { + val (rest1, v) = nested + val (rest2, u) = rest1.getOrElse(pairOfNones) + val (rest3, t) = rest2.getOrElse(pairOfNones) + val (rest4, s) = rest3.getOrElse(pairOfNones) + val (rest5, r) = rest4.getOrElse(pairOfNones) + val (rest6, q) = rest5.getOrElse(pairOfNones) + val (rest7, p) = rest6.getOrElse(pairOfNones) + val (rest8, o) = rest7.getOrElse(pairOfNones) + val (rest9, n) = rest8.getOrElse(pairOfNones) + val (rest10, m) = rest9.getOrElse(pairOfNones) + val (rest11, l) = rest10.getOrElse(pairOfNones) + val (rest12, k) = rest11.getOrElse(pairOfNones) + val (rest13, j) = rest12.getOrElse(pairOfNones) + val (rest14, i) = rest13.getOrElse(pairOfNones) + val (rest15, h) = rest14.getOrElse(pairOfNones) + val (rest16, g) = rest15.getOrElse(pairOfNones) + val (rest17, f) = rest16.getOrElse(pairOfNones) + val (rest18, e) = rest17.getOrElse(pairOfNones) + val (rest19, d) = rest18.getOrElse(pairOfNones) + val (rest20, c) = rest19.getOrElse(pairOfNones) + val (a, b) = rest20.getOrElse(pairOfNones) + (a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v) + } + +} +// end of autogenerated diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/Grouped.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/Grouped.scala new file mode 100644 index 0000000000..c8ba9ee24c --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/Grouped.scala @@ -0,0 +1,908 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +import com.twitter.algebird.Semigroup +import com.twitter.scalding.typed.functions._ +import com.twitter.scalding.typed.functions.ComposedFunctions.ComposedMapGroup +import scala.collection.JavaConverters._ +import scala.util.hashing.MurmurHash3 +import java.io.Serializable + +object CoGroupable extends Serializable { + + /** + * Return true if there is a sum occurring at the end the mapGroup transformations If we know this is + * finally summed, we can make some different optimization choices + * + * If this is true, we know we have at most one value for each key + */ + final def atMostOneValue[A, B](cg: CoGroupable[A, B]): Boolean = { + import CoGrouped._ + cg match { + case Pair(left, right, joinf) => + atMostOneValue(left) && atMostOneValue(right) && (joinf match { + case Joiner.InnerJoin() => true + case Joiner.OuterJoin() => true + case Joiner.LeftJoin() => true + case Joiner.RightJoin() => true + case _ => false + }) + case WithReducers(on, _) => atMostOneValue(on) + case WithDescription(on, _) => atMostOneValue(on) + case FilterKeys(on, _) => atMostOneValue(on) + case MapGroup(on, fn) => + atMostOneFn(fn) || (atMostOneValue(on) && atMostInputSizeFn(fn)) + case IdentityReduce(_, _, _, _, _) => false + case UnsortedIdentityReduce(_, _, _, _, _) => false + case IteratorMappedReduce(_, _, fn, _, _) => atMostOneFn(fn) + } + } + + /** + * Returns true if the group mapping function definitely returns 0 or 1 element. 
+ * + * in 2.12 this can be tailrec, but the types change on recursion, so 2.11 forbids + */ + final def atMostOneFn[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): Boolean = + fn match { + case ComposedMapGroup(_, fn) if atMostOneFn(fn) => true + case ComposedMapGroup(first, second) => atMostOneFn(first) && atMostInputSizeFn(second) + case MapValueStream(SumAll(_)) => true + case MapValueStream(ToList()) => true + case MapValueStream(FoldIterator(_)) => true + case MapValueStream(FoldLeftIterator(_, _)) => true + case FoldWithKeyIterator(_) => true + case EmptyGuard(fn) => atMostOneFn(fn) + case _ => false + } + + /** + * Returns true if the group mapping function does not increase the number of items in the Iterator + */ + final def atMostInputSizeFn[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): Boolean = + fn match { + case MapGroupMapValues(_) => true + case MapValueStream(Drop(_)) => true + case MapValueStream(DropWhile(_)) => true + case MapValueStream(Take(_)) => true + case MapValueStream(TakeWhile(_)) => true + case FilterGroup(_) => true + case EmptyGuard(fn) if atMostOneFn(fn) => + true // since 0 always goes to 0 due to empty guard, and 1 -> 0 or 1 since atMostOne + case EmptyGuard(fn) => atMostInputSizeFn(fn) + case ComposedMapGroup(first, second) => atMostInputSizeFn(first) && atMostInputSizeFn(second) + case _ => false + } +} + +/** + * Represents something than can be CoGrouped with another CoGroupable + */ +sealed trait CoGroupable[K, +R] extends HasReducers with HasDescription with Serializable { + + /** + * This is the list of mapped pipes, just before the (reducing) joinFunction is applied + */ + def inputs: List[TypedPipe[(K, Any)]] + + def keyOrdering: Ordering[K] + + /** + * This function is not type-safe for others to call, but it should never have an error. By construction, we + * never call it with incorrect types. 
It would be preferable to have stronger type safety here, but unclear + * how to achieve, and since it is an internal function, not clear it would actually help anyone for it to + * be type-safe + */ + def joinFunction: MultiJoinFunction[K, R] + + /** + * Smaller is about average values/key not total size (that does not matter, but is clearly related). + * + * Note that from the type signature we see that the right side is iterated (or may be) over and over, but + * the left side is not. That means that you want the side with fewer values per key on the right. If both + * sides are similar, no need to worry. If one side is a one-to-one mapping, that should be the "smaller" + * side. + */ + def cogroup[R1, R2](smaller: CoGroupable[K, R1])( + fn: (K, Iterator[R], Iterable[R1]) => Iterator[R2] + ): CoGrouped[K, R2] = + CoGrouped.Pair(this, smaller, fn) + + def join[W](smaller: CoGroupable[K, W]) = + cogroup[W, (R, W)](smaller)(Joiner.inner2) + def leftJoin[W](smaller: CoGroupable[K, W]) = + cogroup[W, (R, Option[W])](smaller)(Joiner.left2) + def rightJoin[W](smaller: CoGroupable[K, W]) = + cogroup[W, (Option[R], W)](smaller)(Joiner.right2) + def outerJoin[W](smaller: CoGroupable[K, W]) = + cogroup[W, (Option[R], Option[W])](smaller)(Joiner.outer2) + // TODO: implement blockJoin +} + +object CoGrouped extends Serializable { + // distinct by mapped, but don't reorder if the list is unique + final def distinctBy[T, U](list: List[T])(fn: T => U): List[T] = { + @annotation.tailrec + def go(l: List[T], seen: Set[U] = Set[U](), acc: List[T] = Nil): List[T] = l match { + case Nil => acc.reverse // done + case h :: tail => + val uh = fn(h) + if (seen(uh)) + go(tail, seen, acc) + else + go(tail, seen + uh, h :: acc) + } + go(list) + } + + def maybeCompose[A, B, C](cg: CoGrouped[A, B], rs: ReduceStep[A, B, C]): Option[CoGrouped[A, C]] = { + val reds = com.twitter.scalding.typed.WithReducers.maybeCombine(cg.reducers, rs.reducers) + + val optCg = rs match { + case step @ 
IdentityReduce(_, _, _, _, _) => + type Res[T] = CoGrouped[A, T] + Some(step.evidence.subst[Res](cg)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + type Res[T] = CoGrouped[A, T] + Some(step.evidence.subst[Res](cg)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + Some(CoGrouped.MapGroup(cg, step.reduceFn)) + case IdentityValueSortedReduce(_, _, _, _, _, _) => + // We can't sort after a join + None + case ValueSortedReduce(_, _, _, _, _, _) => + // We can't sort after a join + None + } + + optCg.map { cg1 => + reds match { + case Some(r) if cg1.reducers != reds => CoGrouped.WithReducers(cg1, r) + case _ => cg1 + } + } + } + + final case class Pair[K, A, B, C]( + larger: CoGroupable[K, A], + smaller: CoGroupable[K, B], + fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ) extends CoGrouped[K, C] { + + // case classes that merge more than one TypedPipe need to memoize the result or + // it can be exponential in complexity + override val hashCode = MurmurHash3.productHash(this) + override def equals(that: Any) = + that match { + case thatRef: AnyRef if this eq thatRef => true + case Pair(l, s, f) => (fn == f) && (l == larger) && (s == smaller) + case _ => false + } + + def inputs = larger.inputs ++ smaller.inputs + def reducers = com.twitter.scalding.typed.WithReducers.maybeCombine(larger.reducers, smaller.reducers) + def descriptions: Seq[String] = larger.descriptions ++ smaller.descriptions + def keyOrdering = smaller.keyOrdering + + /** + * Avoid capturing anything below as it will need to be serialized and sent to all the reducers. 
+ */ + def joinFunction = { + + /** + * if there is at most one value on the smaller side definitely cache the result to avoid repeatedly + * computing it + */ + val smallerIsAtMostOne = CoGroupable.atMostOneValue(smaller) + if (smallerIsAtMostOne) MultiJoinFunction.PairCachedRight(larger.joinFunction, smaller.joinFunction, fn) + else MultiJoinFunction.Pair(larger.joinFunction, smaller.joinFunction, fn) + } + } + + final case class WithReducers[K, V](on: CoGrouped[K, V], reds: Int) extends CoGrouped[K, V] { + def inputs = on.inputs + def reducers = Some(reds) + def keyOrdering = on.keyOrdering + def joinFunction = on.joinFunction + def descriptions: Seq[String] = on.descriptions + } + + final case class WithDescription[K, V](on: CoGrouped[K, V], description: String) extends CoGrouped[K, V] { + + def inputs = on.inputs + def reducers = on.reducers + def keyOrdering = on.keyOrdering + def joinFunction = on.joinFunction + def descriptions: Seq[String] = on.descriptions :+ description + } + + final case class FilterKeys[K, V](on: CoGrouped[K, V], fn: K => Boolean) extends CoGrouped[K, V] { + val inputs = on.inputs.map(TypedPipe.FilterKeys(_, fn)) + def reducers = on.reducers + def keyOrdering = on.keyOrdering + def joinFunction = on.joinFunction + def descriptions: Seq[String] = on.descriptions + } + + final case class MapGroup[K, V1, V2](on: CoGrouped[K, V1], fn: (K, Iterator[V1]) => Iterator[V2]) + extends CoGrouped[K, V2] { + def inputs = on.inputs + def reducers = on.reducers + def descriptions: Seq[String] = on.descriptions + def keyOrdering = on.keyOrdering + def joinFunction = + MultiJoinFunction.MapGroup(on.joinFunction, fn) + } +} + +sealed trait CoGrouped[K, +R] + extends KeyedListLike[K, R, CoGrouped] + with CoGroupable[K, R] + with WithReducers[CoGrouped[K, R]] + with WithDescription[CoGrouped[K, R]] + with Serializable { + + override def withReducers(reds: Int): CoGrouped[K, R] = + CoGrouped.WithReducers(this, reds) + + override def 
withDescription(description: String): CoGrouped[K, R] = + CoGrouped.WithDescription(this, description) + + /** + * It seems complex to push a take up to the mappers before a general join. For some cases (inner join), we + * could take at most n from each TypedPipe, but it is not clear how to generalize that for general + * cogrouping functions. For now, just do a normal take. + */ + override def bufferedTake(n: Int): CoGrouped[K, R] = + take(n) + + // Filter the keys before doing the join + override def filterKeys(fn: K => Boolean): CoGrouped[K, R] = + CoGrouped.FilterKeys(this, fn) + + override def mapGroup[R1](fn: (K, Iterator[R]) => Iterator[R1]): CoGrouped[K, R1] = + /* + * After the join, if the key has no values, don't present it to the mapGroup + * function. Doing so would break the invariant: + * + * a.join(b).toTypedPipe.group.mapGroup(fn) == a.join(b).mapGroup(fn) + */ + CoGrouped.MapGroup(this, Grouped.addEmptyGuard(fn)) + + override def toTypedPipe: TypedPipe[(K, R)] = + TypedPipe.CoGroupedPipe(this) +} + +/** + * If we can HashJoin, then we can CoGroup, but not vice-versa + * i.e., HashJoinable is a strict subset of CoGroupable (CoGrouped, for instance is CoGroupable, but not + * HashJoinable). 
+ */ +sealed trait HashJoinable[K, +V] extends CoGroupable[K, V] with KeyedPipe[K] { + + /** A HashJoinable has a single input into to the cogroup */ + override def inputs = List(mapped) +} + +object HashJoinable extends Serializable { + def toReduceStep[A, B](hj: HashJoinable[A, B]): ReduceStep[A, _, _ <: B] = + hj match { + case step @ IdentityReduce(_, _, _, _, _) => step + case step @ UnsortedIdentityReduce(_, _, _, _, _) => step + case step @ IteratorMappedReduce(_, _, _, _, _) => step + } + + def filterKeys[A, B](hj: HashJoinable[A, B], fn: A => Boolean): HashJoinable[A, B] = + hj match { + case step @ IdentityReduce(_, _, _, _, _) => + step.copy(mapped = TypedPipe.FilterKeys(step.mapped, fn)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + step.copy(mapped = TypedPipe.FilterKeys(step.mapped, fn)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + step.copy(mapped = TypedPipe.FilterKeys(step.mapped, fn)) + } +} + +/** + * This encodes the rules that 1) sorting is only possible before doing any reduce, 2) reversing is only + * possible after sorting. 3) unsorted Groups can be CoGrouped or HashJoined + * + * This may appear a complex type, but it makes sure that code won't compile if it breaks the rule + */ +sealed trait Grouped[K, +V] + extends KeyedListLike[K, V, UnsortedGrouped] + with HashJoinable[K, V] + with Sortable[V, ({ type t[+x] = SortedGrouped[K, x] with Reversable[SortedGrouped[K, x]] })#t] + with WithReducers[Grouped[K, V]] + with WithDescription[Grouped[K, V]] + +/** + * After sorting, we are no longer CoGroupable, and we can only call reverse in the initial SortedGrouped + * created from the Sortable: .sortBy(_._2).reverse for instance + * + * Once we have sorted, we cannot do a HashJoin or a CoGrouping + */ +sealed trait SortedGrouped[K, +V] + extends KeyedListLike[K, V, SortedGrouped] + with WithReducers[SortedGrouped[K, V]] + with WithDescription[SortedGrouped[K, V]] + +/** + * This is the state after we have done some reducing. 
It is not possible to sort at this phase, but it is + * possible to do a CoGrouping or a HashJoin. + */ +sealed trait UnsortedGrouped[K, +V] + extends KeyedListLike[K, V, UnsortedGrouped] + with HashJoinable[K, V] + with WithReducers[UnsortedGrouped[K, V]] + with WithDescription[UnsortedGrouped[K, V]] + +object Grouped extends Serializable { + def apply[K, V](pipe: TypedPipe[(K, V)])(implicit ordering: Ordering[K]): Grouped[K, V] = + IdentityReduce[K, V, V](ordering, pipe, None, Nil, implicitly) + + def addEmptyGuard[K, V1, V2](fn: (K, Iterator[V1]) => Iterator[V2]): (K, Iterator[V1]) => Iterator[V2] = + fn match { + case alreadyGuarded @ EmptyGuard(_) => alreadyGuarded + case ami if CoGroupable.atMostInputSizeFn(ami) => ami // already safe + case needGuard => EmptyGuard(needGuard) + } +} + +/** + * All sorting methods defined here trigger Hadoop secondary sort on key + value. Hadoop secondary sort is + * external sorting. i.e. it won't materialize all values of each key in memory on the reducer. + */ +sealed trait Sortable[+T, +Sorted[+_]] { + def withSortOrdering[U >: T](so: Ordering[U]): Sorted[U] + + def sortBy[B: Ordering](fn: (T) => B): Sorted[T] = + withSortOrdering(Ordering.by(fn)) + + // Sorts the values for each key + def sorted[B >: T](implicit ord: Ordering[B]): Sorted[B] = + withSortOrdering(ord) + + def sortWith(lt: (T, T) => Boolean): Sorted[T] = + withSortOrdering(Ordering.fromLessThan(lt)) +} + +// Represents something that when we call reverse changes type to R +sealed trait Reversable[+R] { + def reverse: R +} + +/** + * This is a class that models the logical portion of the reduce step. details like where this occurs, the + * number of reducers, etc... 
are left in the Grouped class + */ +sealed trait ReduceStep[K, V1, V2] extends KeyedPipe[K] with HasReducers { + + /** + * Note, this satisfies KeyedPipe.mapped: TypedPipe[(K, Any)] + */ + def mapped: TypedPipe[(K, V1)] + + def toTypedPipe: TypedPipe[(K, V2)] = TypedPipe.ReduceStepPipe(this) +} + +object ReduceStep extends Serializable { + + /** + * assuming coherent Orderings on the A, in some cases ReduceSteps can be combined. Note: we have always + * assumed coherant orderings in scalding with joins where both sides have their own Ordering, so we argue + * this is not different. + * + * If a user has incoherant Orderings, which are already dangerous, they can use .forceToDisk between reduce + * steps, however, a better strategy is to use different key types. + * + * The only case where they can't is when there are two different value sorts going on. + */ + def maybeCompose[A, B, C, D]( + rs1: ReduceStep[A, B, C], + rs2: ReduceStep[A, C, D] + ): Option[ReduceStep[A, B, D]] = { + val reds = WithReducers.maybeCombine(rs1.reducers, rs2.reducers) + val optRs = (rs1, rs2) match { + case (step @ IdentityReduce(_, _, _, _, _), step2) => + type Res[T] = ReduceStep[A, T, D] + Some(step.evidence.reverse.subst[Res](step2)) + case (step @ UnsortedIdentityReduce(_, _, _, _, _), step2) => + type Res[T] = ReduceStep[A, T, D] + Some(step.evidence.reverse.subst[Res](step2)) + case (step, step2 @ IdentityReduce(_, _, _, _, _)) => + type Res[T] = ReduceStep[A, B, T] + Some(step2.evidence.subst[Res](step)) + case (step, step2 @ UnsortedIdentityReduce(_, _, _, _, _)) => + type Res[T] = ReduceStep[A, B, T] + Some(step2.evidence.subst[Res](step)) + case (step, step2 @ IteratorMappedReduce(_, _, _, _, _)) => + Some(mapGroup(step)(step2.reduceFn)) + /* + * All the rest have either two sorts, or a sort after a reduce + */ + case (IdentityValueSortedReduce(_, _, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None + case (IdentityValueSortedReduce(_, _, _, _, _, _), 
ValueSortedReduce(_, _, _, _, _, _)) => None + case (IteratorMappedReduce(_, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None + case (IteratorMappedReduce(_, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None + case (ValueSortedReduce(_, _, _, _, _, _), IdentityValueSortedReduce(_, _, _, _, _, _)) => None + case (ValueSortedReduce(_, _, _, _, _, _), ValueSortedReduce(_, _, _, _, _, _)) => None + } + + optRs.map { composed => + reds.fold(composed)(withReducers(composed, _)) + } + } + + def setInput[A, B, C](rs: ReduceStep[A, B, C], input: TypedPipe[(A, B)]): ReduceStep[A, B, C] = { + type Res[V] = ReduceStep[A, V, C] + type In[V] = TypedPipe[(A, V)] + + rs match { + case step0 @ IdentityReduce(_, _, _, _, _) => + type IR[V] = IdentityReduce[A, V, C] + val step = step0.evidence.subst[IR](step0) + val revEv = step0.evidence.reverse + val res = + IdentityReduce[A, C, C]( + step.keyOrdering, + step0.evidence.subst[In](input), + step.reducers, + step.descriptions, + implicitly + ) + // Put the type back to what scala expects ReduceStep[A, B, C] + revEv.subst[Res](res) + case step0 @ UnsortedIdentityReduce(_, _, _, _, _) => + type IR[V] = UnsortedIdentityReduce[A, V, C] + val step = step0.evidence.subst[IR](step0) + val revEv = step0.evidence.reverse + val res = + UnsortedIdentityReduce[A, C, C]( + step.keyOrdering, + step0.evidence.subst[In](input), + step.reducers, + step.descriptions, + implicitly + ) + // Put the type back to what scala expects ReduceStep[A, B, C] + revEv.subst[Res](res) + case step0 @ IdentityValueSortedReduce(_, _, _, _, _, _) => + type IVSR[V] = IdentityValueSortedReduce[A, V, C] + val step = step0.evidence.subst[IVSR](step0) + val revEv = step0.evidence.reverse + val res = + IdentityValueSortedReduce[A, C, C]( + step.keyOrdering, + step0.evidence.subst[In](input), + step.valueSort, + step.reducers, + step.descriptions, + implicitly + ) + // Put the type back to what scala expects ReduceStep[A, B, C] + revEv.subst[Res](res) 
+ case step @ ValueSortedReduce(_, _, _, _, _, _) => + ValueSortedReduce[A, B, C]( + step.keyOrdering, + input, + step.valueSort, + step.reduceFn, + step.reducers, + step.descriptions + ) + case step @ IteratorMappedReduce(_, _, _, _, _) => + def go(imr: IteratorMappedReduce[A, B, C]): IteratorMappedReduce[A, B, C] = + imr.copy(mapped = input) + go(step) + } + } + + def mapGroup[A, B, C, D]( + rs: ReduceStep[A, B, C] + )(fn: (A, Iterator[C]) => Iterator[D]): ReduceStep[A, B, D] = + rs match { + case step @ IdentityReduce(_, _, _, _, _) => + type Res[T] = ReduceStep[A, T, D] + step.evidence.reverse.subst[Res](step.mapGroup(fn)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + type Res[T] = ReduceStep[A, T, D] + step.evidence.reverse.subst[Res](step.mapGroup(fn)) + case step @ IdentityValueSortedReduce(_, _, _, _, _, _) => + type Res[T] = ReduceStep[A, T, D] + step.evidence.reverse.subst[Res](step.mapGroup(fn)) + case step @ ValueSortedReduce(_, _, _, _, _, _) => + step.mapGroup(fn) + case step @ IteratorMappedReduce(_, _, _, _, _) => + step.mapGroup(fn) + } + + def toHashJoinable[A, B, C](rs: ReduceStep[A, B, C]): Option[HashJoinable[A, C]] = + rs match { + case step @ IdentityReduce(_, _, _, _, _) => + Some(step) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + Some(step) + case step @ IteratorMappedReduce(_, _, _, _, _) => + Some(step) + case step @ IdentityValueSortedReduce(_, _, _, _, _, _) => + None + case step @ ValueSortedReduce(_, _, _, _, _, _) => + None + } + + def withReducers[A, B, C](rs: ReduceStep[A, B, C], reds: Int): ReduceStep[A, B, C] = + rs match { + case step @ IdentityReduce(_, _, _, _, _) => + step.withReducers(reds) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + step.withReducers(reds) + case step @ IdentityValueSortedReduce(_, _, _, _, _, _) => + step.withReducers(reds) + case step @ ValueSortedReduce(_, _, _, _, _, _) => + step.withReducers(reds) + case step @ IteratorMappedReduce(_, _, _, _, _) => + 
step.withReducers(reds) + } + + def withDescription[A, B, C](rs: ReduceStep[A, B, C], descr: String): ReduceStep[A, B, C] = + rs match { + case step @ IdentityReduce(_, _, _, _, _) => + step.withDescription(descr) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + step.withDescription(descr) + case step @ IdentityValueSortedReduce(_, _, _, _, _, _) => + step.withDescription(descr) + case step @ ValueSortedReduce(_, _, _, _, _, _) => + step.withDescription(descr) + case step @ IteratorMappedReduce(_, _, _, _, _) => + step.withDescription(descr) + } +} + +final case class IdentityReduce[K, V1, V2]( + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + override val reducers: Option[Int], + override val descriptions: Seq[String], + evidence: EqTypes[V1, V2] +) extends ReduceStep[K, V1, V2] + with Grouped[K, V2] { + + /* + * Because after mapValues, take, filter, we can no-longer sort, + * we commonly convert to UnsortedIdentityReduce first, then + * call the method there to reduce code duplication + */ + private def toUIR = UnsortedIdentityReduce[K, V1, V2](keyOrdering, mapped, reducers, descriptions, evidence) + + private[this] def mappedV2: TypedPipe[(K, V2)] = { + type TK[V] = TypedPipe[(K, V)] + evidence.subst[TK](mapped) + } + + /** + * This does the partial heap sort followed by take in memory on the mappers before sending to the mappers. + * This is a big help if there are relatively few keys and n is relatively small. 
+ */ + override def bufferedTake(n: Int) = + toUIR.bufferedTake(n) + + override def withSortOrdering[U >: V2](so: Ordering[U]): IdentityValueSortedReduce[K, U, U] = + IdentityValueSortedReduce[K, U, U](keyOrdering, mappedV2, so, reducers, descriptions, implicitly) + + override def withReducers(red: Int): IdentityReduce[K, V1, V2] = + copy(reducers = Some(red)) + + override def withDescription(description: String): IdentityReduce[K, V1, V2] = + copy(descriptions = descriptions :+ description) + + override def filterKeys(fn: K => Boolean) = + toUIR.filterKeys(fn) + + override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = + // Only pass non-Empty iterators to subsequent functions + IteratorMappedReduce(keyOrdering, mappedV2, Grouped.addEmptyGuard(fn), reducers, descriptions) + + // It would be nice to return IdentityReduce here, but + // the type constraints prevent it currently + override def mapValues[V3](fn: V2 => V3) = + toUIR.mapValues(fn) + + // This is not correct in the type-system, but would be nice to encode + // override def mapValues[V3](fn: V1 => V3) = IdentityReduce(keyOrdering, mapped.mapValues(fn), reducers) + + override def sum[U >: V2](implicit sg: Semigroup[U]) = { + // there is no sort, mapValueStream or force to reducers: + val upipe: TypedPipe[(K, U)] = mappedV2 // use covariance to set the type + UnsortedIdentityReduce[K, U, U]( + keyOrdering, + upipe.sumByLocalKeys, + reducers, + descriptions, + implicitly + ).sumLeft + } + + /** This is just an identity that casts the result to V2 */ + override def joinFunction = MultiJoinFunction.Casting[K, V2] +} + +final case class UnsortedIdentityReduce[K, V1, V2]( + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + override val reducers: Option[Int], + override val descriptions: Seq[String], + evidence: EqTypes[V1, V2] +) extends ReduceStep[K, V1, V2] + with UnsortedGrouped[K, V2] { + + /** + * This does the partial heap sort followed by take in memory on the 
mappers before sending to the reducers. + * This is a big help if there are relatively few keys and n is relatively small. + */ + override def bufferedTake(n: Int) = + if (n < 1) { + // This means don't take anything, which is legal, but strange + filterKeys(Constant(false)) + } else if (n == 1) { + head + } else { + // By default, there is no ordering. This method is overridden + // in IdentityValueSortedReduce + // Note, this is going to bias toward low hashcode items. + // If you care which items you take, you should sort by a random number + // or the value itself. + val fakeOrdering: Ordering[V1] = Ordering.by { v: V1 => v.hashCode } + implicit val mon: ScaldingPriorityQueueMonoid[V1] = new ScaldingPriorityQueueMonoid[V1](n)(fakeOrdering) + // Do the heap-sort on the mappers: + val pretake: TypedPipe[(K, V1)] = mapped + .mapValues { v: V1 => mon.build(v) } + .sumByLocalKeys + .flatMap { case (k, vs) => vs.iterator.asScala.map((k, _)) } + // We have removed the priority queues, so serialization is not greater + // Now finish on the reducers + UnsortedIdentityReduce[K, V1, V2]( + keyOrdering, + pretake, + reducers, + descriptions, + evidence + ).forceToReducers // jump to ValueSortedReduce + .take(n) + } + + override def withReducers(red: Int): UnsortedIdentityReduce[K, V1, V2] = + copy(reducers = Some(red)) + + override def withDescription(description: String): UnsortedIdentityReduce[K, V1, V2] = + copy(descriptions = descriptions :+ description) + + override def filterKeys(fn: K => Boolean) = + UnsortedIdentityReduce[K, V1, V2](keyOrdering, mapped.filterKeys(fn), reducers, descriptions, evidence) + + private[this] def mappedV2 = { + type TK[V] = TypedPipe[(K, V)] + evidence.subst[TK](mapped) + } + + override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = + // Only pass non-Empty iterators to subsequent functions + IteratorMappedReduce[K, V2, V3](keyOrdering, mappedV2, Grouped.addEmptyGuard(fn), reducers, descriptions) + + // It would be nice to 
return IdentityReduce here, but + // the type constraints prevent it currently + override def mapValues[V3](fn: V2 => V3) = + UnsortedIdentityReduce[K, V3, V3](keyOrdering, mappedV2.mapValues(fn), reducers, descriptions, implicitly) + + override def sum[U >: V2](implicit sg: Semigroup[U]) = { + // there is no sort, mapValueStream or force to reducers: + val upipe: TypedPipe[(K, U)] = mappedV2 // use covariance to set the type + UnsortedIdentityReduce[K, U, U]( + keyOrdering, + upipe.sumByLocalKeys, + reducers, + descriptions, + implicitly + ).sumLeft + } + + /** This is just an identity that casts the result to V2 */ + override def joinFunction = MultiJoinFunction.Casting[K, V2] +} + +final case class IdentityValueSortedReduce[K, V1, V2]( + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + valueSort: Ordering[V1], + override val reducers: Option[Int], + override val descriptions: Seq[String], + evidence: EqTypes[V1, V2] +) extends ReduceStep[K, V1, V2] + with SortedGrouped[K, V2] + with Reversable[IdentityValueSortedReduce[K, V1, V2]] { + + override def reverse: IdentityValueSortedReduce[K, V1, V2] = + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort.reverse, + reducers, + descriptions, + evidence + ) + + override def withReducers(red: Int): IdentityValueSortedReduce[K, V1, V2] = + // copy fails to get the types right, :/ + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort, + reducers = Some(red), + descriptions, + evidence + ) + + override def withDescription(description: String): IdentityValueSortedReduce[K, V1, V2] = + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort, + reducers, + descriptions = descriptions :+ description, + evidence + ) + + override def filterKeys(fn: K => Boolean) = + // copy fails to get the types right, :/ + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped.filterKeys(fn), + valueSort, + reducers, + descriptions, + 
evidence + ) + + override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { + // Only pass non-Empty iterators to subsequent functions + val gfn = Grouped.addEmptyGuard(fn) + type TK[V] = TypedPipe[(K, V)] + ValueSortedReduce[K, V2, V3]( + keyOrdering, + evidence.subst[TK](mapped), + evidence.subst[Ordering](valueSort), + gfn, + reducers, + descriptions + ) + } + + /** + * This does the partial heap sort followed by take in memory on the mappers before sending to the reducers. + * This is a big help if there are relatively few keys and n is relatively small. + */ + override def bufferedTake(n: Int): SortedGrouped[K, V2] = + if (n <= 0) { + // This means don't take anything, which is legal, but strange + filterKeys(Constant(false)) + } else { + implicit val mon: ScaldingPriorityQueueMonoid[V1] = new ScaldingPriorityQueueMonoid[V1](n)(valueSort) + // Do the heap-sort on the mappers: + val pretake: TypedPipe[(K, V1)] = mapped + .mapValues { v: V1 => mon.build(v) } + .sumByLocalKeys + .flatMap { case (k, vs) => vs.iterator.asScala.map((k, _)) } + // Now finish on the reducers + IdentityValueSortedReduce[K, V1, V2]( + keyOrdering, + pretake, + valueSort, + reducers, + descriptions, + evidence + ).forceToReducers // jump to ValueSortedReduce + .take(n) + } + + /** + * We are sorting then taking. Optimized for small take values If we take <= 1, we use an in-memory-based + * method. 
To force a memory-based take, use bufferedTake Otherwise, we send all the values to the reducers + */ + override def take(n: Int) = + if (n <= 1) bufferedTake(n) + else mapValueStream(_.take(n)) +} + +final case class ValueSortedReduce[K, V1, V2]( + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + valueSort: Ordering[V1], + reduceFn: (K, Iterator[V1]) => Iterator[V2], + override val reducers: Option[Int], + override val descriptions: Seq[String] +) extends ReduceStep[K, V1, V2] + with SortedGrouped[K, V2] { + + /** + * After sorting, then reducing, there is no chance to operate in the mappers. Just call take. + */ + override def bufferedTake(n: Int) = take(n) + + override def withReducers(red: Int) = + // copy infers loose types. :( + ValueSortedReduce[K, V1, V2](keyOrdering, mapped, valueSort, reduceFn, Some(red), descriptions) + + override def withDescription(description: String) = + ValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped, + valueSort, + reduceFn, + reducers, + descriptions :+ description + ) + + override def filterKeys(fn: K => Boolean) = + // copy fails to get the types right, :/ + ValueSortedReduce[K, V1, V2]( + keyOrdering, + mapped.filterKeys(fn), + valueSort, + reduceFn, + reducers, + descriptions + ) + + override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { + // we don't need the empty guard here because ComposedMapGroup already does it + val newReduce = ComposedMapGroup(reduceFn, fn) + ValueSortedReduce[K, V1, V3](keyOrdering, mapped, valueSort, newReduce, reducers, descriptions) + } +} + +final case class IteratorMappedReduce[K, V1, V2]( + override val keyOrdering: Ordering[K], + override val mapped: TypedPipe[(K, V1)], + reduceFn: (K, Iterator[V1]) => Iterator[V2], + override val reducers: Option[Int], + override val descriptions: Seq[String] +) extends ReduceStep[K, V1, V2] + with UnsortedGrouped[K, V2] { + + /** + * After reducing, we are always operating in memory. Just call take. 
+ */ + override def bufferedTake(n: Int) = take(n) + + override def withReducers(red: Int): IteratorMappedReduce[K, V1, V2] = + copy(reducers = Some(red)) + + override def withDescription(description: String): IteratorMappedReduce[K, V1, V2] = + copy(descriptions = descriptions :+ description) + + override def filterKeys(fn: K => Boolean) = + copy(mapped = mapped.filterKeys(fn)) + + override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { + // we don't need the empty guard here because ComposedMapGroup already does it + val newReduce = ComposedMapGroup(reduceFn, fn) + copy(reduceFn = newReduce) + } + + override def joinFunction = MultiJoinFunction.MapCast(reduceFn) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala new file mode 100644 index 0000000000..5c9f30cfc6 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/HashEqualsArrayWrapper.scala @@ -0,0 +1,350 @@ +package com.twitter.scalding.typed + +import java.util + +import reflect.ClassTag + +sealed trait HashEqualsArrayWrapper[T] { + def wrapped: Array[T] +} + +object HashEqualsArrayWrapper { + + /** + * Wraps an Array in an object with a valid equals() and hashCode() Uses specialized wrappers for arrays of + * primitive values. + */ + def wrap[T](a: Array[T]): HashEqualsArrayWrapper[T] = + wrapByClassFn[T](a.getClass.asInstanceOf[Class[Array[T]]])(a) + + /** + * Creates a function that can be used to wrap Arrays into objects with valid equals() and hashCode() + * methods. + * + * Using this method and applying it to many arrays should be faster than using wrap above on each array, + * because this method uses reflection once, and wrap above uses reflection on each individual array. 
+ */ + def wrapByClassFn[T](clazz: Class[Array[T]]): Array[T] => HashEqualsArrayWrapper[T] = { + + val fn = clazz match { + case c if classOf[Array[Long]].equals(c) => a: Array[Long] => new HashEqualsLongArrayWrapper(a) + case c if classOf[Array[Int]].equals(c) => a: Array[Int] => new HashEqualsIntArrayWrapper(a) + case c if classOf[Array[Short]].equals(c) => a: Array[Short] => new HashEqualsShortArrayWrapper(a) + case c if classOf[Array[Char]].equals(c) => a: Array[Char] => new HashEqualsCharArrayWrapper(a) + case c if classOf[Array[Byte]].equals(c) => a: Array[Byte] => new HashEqualsByteArrayWrapper(a) + case c if classOf[Array[Boolean]].equals(c) => a: Array[Boolean] => new HashEqualsBooleanArrayWrapper(a) + case c if classOf[Array[Float]].equals(c) => a: Array[Float] => new HashEqualsFloatArrayWrapper(a) + case c if classOf[Array[Double]].equals(c) => a: Array[Double] => new HashEqualsDoubleArrayWrapper(a) + case c => a: Array[T] => new HashEqualsObjectArrayWrapper(a) + } + + fn.asInstanceOf[(Array[T] => HashEqualsArrayWrapper[T])] + } + + /** + * ct.runtimeClass returns Class[_] so here we cast + */ + private[typed] def classForTag[T](ct: ClassTag[T]): Class[T] = ct.runtimeClass.asInstanceOf[Class[T]] + + def wrapByClassTagFn[T: ClassTag]: Array[T] => HashEqualsArrayWrapper[T] = + wrapByClassFn(classForTag(implicitly[ClassTag[T]].wrap)) + + implicit val longArrayOrd: Ordering[Array[Long]] = new Ordering[Array[Long]] { + override def compare(x: Array[Long], y: Array[Long]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Long.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Long.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val intArrayOrd: Ordering[Array[Int]] = new Ordering[Array[Int]] { + override def compare(x: Array[Int], y: Array[Int]): Int = { + val lenCmp = 
java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Integer.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Integer.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val shortArrayOrd: Ordering[Array[Short]] = new Ordering[Array[Short]] { + override def compare(x: Array[Short], y: Array[Short]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Short.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Short.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val charArrayOrd: Ordering[Array[Char]] = new Ordering[Array[Char]] { + override def compare(x: Array[Char], y: Array[Char]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Character.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Character.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val byteArrayOrd: Ordering[Array[Byte]] = new Ordering[Array[Byte]] { + override def compare(x: Array[Byte], y: Array[Byte]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Byte.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Byte.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val booleanArrayOrd: Ordering[Array[Boolean]] = new Ordering[Array[Boolean]] { + override def compare(x: Array[Boolean], y: Array[Boolean]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 
0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Boolean.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Boolean.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val floatArrayOrd: Ordering[Array[Float]] = new Ordering[Array[Float]] { + override def compare(x: Array[Float], y: Array[Float]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Float.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Float.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val doubleArrayOrd: Ordering[Array[Double]] = new Ordering[Array[Double]] { + override def compare(x: Array[Double], y: Array[Double]): Int = { + val lenCmp = java.lang.Integer.compare(x.length, y.length) + + if (lenCmp != 0) { + lenCmp + } else if (x.length == 0) { + 0 + } else { + val len = x.length + var i = 1 + var cmp = java.lang.Double.compare(x(0), y(0)) + while (i < len && cmp == 0) { + cmp = java.lang.Double.compare(x(i), y(i)) + i = i + 1 + } + cmp + } + } + } + + implicit val hashEqualsLongOrdering: Ordering[HashEqualsArrayWrapper[Long]] = + new Ordering[HashEqualsArrayWrapper[Long]] { + override def compare(x: HashEqualsArrayWrapper[Long], y: HashEqualsArrayWrapper[Long]): Int = + longArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsIntOrdering: Ordering[HashEqualsArrayWrapper[Int]] = + new Ordering[HashEqualsArrayWrapper[Int]] { + override def compare(x: HashEqualsArrayWrapper[Int], y: HashEqualsArrayWrapper[Int]): Int = + intArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsShortOrdering: Ordering[HashEqualsArrayWrapper[Short]] = + new Ordering[HashEqualsArrayWrapper[Short]] { + override def compare(x: HashEqualsArrayWrapper[Short], y: HashEqualsArrayWrapper[Short]): Int = + 
shortArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsCharOrdering: Ordering[HashEqualsArrayWrapper[Char]] = + new Ordering[HashEqualsArrayWrapper[Char]] { + override def compare(x: HashEqualsArrayWrapper[Char], y: HashEqualsArrayWrapper[Char]): Int = + charArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsByteOrdering: Ordering[HashEqualsArrayWrapper[Byte]] = + new Ordering[HashEqualsArrayWrapper[Byte]] { + override def compare(x: HashEqualsArrayWrapper[Byte], y: HashEqualsArrayWrapper[Byte]): Int = + byteArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsBooleanOrdering: Ordering[HashEqualsArrayWrapper[Boolean]] = + new Ordering[HashEqualsArrayWrapper[Boolean]] { + override def compare(x: HashEqualsArrayWrapper[Boolean], y: HashEqualsArrayWrapper[Boolean]): Int = + booleanArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsFloatOrdering: Ordering[HashEqualsArrayWrapper[Float]] = + new Ordering[HashEqualsArrayWrapper[Float]] { + override def compare(x: HashEqualsArrayWrapper[Float], y: HashEqualsArrayWrapper[Float]): Int = + floatArrayOrd.compare(x.wrapped, y.wrapped) + } + + implicit val hashEqualsDoubleOrdering: Ordering[HashEqualsArrayWrapper[Double]] = + new Ordering[HashEqualsArrayWrapper[Double]] { + override def compare(x: HashEqualsArrayWrapper[Double], y: HashEqualsArrayWrapper[Double]): Int = + doubleArrayOrd.compare(x.wrapped, y.wrapped) + } + +} + +final class HashEqualsLongArrayWrapper(override val wrapped: Array[Long]) + extends HashEqualsArrayWrapper[Long] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsLongArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsIntArrayWrapper(override val wrapped: Array[Int]) extends HashEqualsArrayWrapper[Int] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override 
def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsIntArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsShortArrayWrapper(override val wrapped: Array[Short]) + extends HashEqualsArrayWrapper[Short] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsShortArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsCharArrayWrapper(override val wrapped: Array[Char]) + extends HashEqualsArrayWrapper[Char] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsCharArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsByteArrayWrapper(override val wrapped: Array[Byte]) + extends HashEqualsArrayWrapper[Byte] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsByteArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsBooleanArrayWrapper(override val wrapped: Array[Boolean]) + extends HashEqualsArrayWrapper[Boolean] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsBooleanArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsFloatArrayWrapper(override val wrapped: Array[Float]) + extends HashEqualsArrayWrapper[Float] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsFloatArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class 
HashEqualsDoubleArrayWrapper(override val wrapped: Array[Double]) + extends HashEqualsArrayWrapper[Double] { + override def hashCode(): Int = util.Arrays.hashCode(wrapped) + + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsDoubleArrayWrapper => util.Arrays.equals(wrapped, other.wrapped) + case _ => false + } +} + +final class HashEqualsObjectArrayWrapper[T](override val wrapped: Array[T]) + extends HashEqualsArrayWrapper[T] { + private val wrappedInternal = wrapped.toSeq + override def hashCode(): Int = wrappedInternal.hashCode() + override def equals(obj: scala.Any): Boolean = obj match { + case other: HashEqualsObjectArrayWrapper[T] => wrappedInternal.equals(other.wrappedInternal) + case _ => false + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/Joiner.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/Joiner.scala new file mode 100644 index 0000000000..a9c66be55e --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/Joiner.scala @@ -0,0 +1,199 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +object Joiner extends java.io.Serializable { + + type JoinFn[K, V, U, R] = (K, Iterator[V], Iterable[U]) => Iterator[R] + type HashJoinFn[K, V, U, R] = (K, V, Iterable[U]) => Iterator[R] + + def toCogroupJoiner2[K, V, U, R](hashJoiner: (K, V, Iterable[U]) => Iterator[R]): JoinFn[K, V, U, R] = + JoinFromHashJoin(hashJoiner) + + def hashInner2[K, V, U]: HashJoinFn[K, V, U, (V, U)] = + HashInner() + + def hashLeft2[K, V, U]: HashJoinFn[K, V, U, (V, Option[U])] = + HashLeft() + + def inner2[K, V, U]: JoinFn[K, V, U, (V, U)] = + InnerJoin() + + def asOuter[U](it: Iterator[U]): Iterator[Option[U]] = + if (it.isEmpty) Iterator.single(None) + else it.map(Some(_)) + + def outer2[K, V, U]: JoinFn[K, V, U, (Option[V], Option[U])] = + OuterJoin() + + def left2[K, V, U]: JoinFn[K, V, U, (V, Option[U])] = + LeftJoin() + + def right2[K, V, U]: JoinFn[K, V, U, (Option[V], U)] = + RightJoin() + + /** + * Optimizers want to match on the kinds of joins we are doing. 
This gives them that ability + */ + sealed abstract class HashJoinFunction[K, V, U, R] extends Function3[K, V, Iterable[U], Iterator[R]] + + final case class HashInner[K, V, U]() extends HashJoinFunction[K, V, U, (V, U)] { + def apply(k: K, v: V, u: Iterable[U]) = u.iterator.map((v, _)) + } + final case class HashLeft[K, V, U]() extends HashJoinFunction[K, V, U, (V, Option[U])] { + def apply(k: K, v: V, u: Iterable[U]) = asOuter(u.iterator).map((v, _)) + } + final case class FilteredHashJoin[K, V1, V2, R](jf: HashJoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) + extends HashJoinFunction[K, V1, V2, R] { + def apply(k: K, left: V1, right: Iterable[V2]) = + jf.apply(k, left, right).filter(r => fn((k, r))) + } + final case class MappedHashJoin[K, V1, V2, R, R1](jf: HashJoinFunction[K, V1, V2, R], fn: R => R1) + extends HashJoinFunction[K, V1, V2, R1] { + def apply(k: K, left: V1, right: Iterable[V2]) = + jf.apply(k, left, right).map(fn) + } + final case class FlatMappedHashJoin[K, V1, V2, R, R1]( + jf: HashJoinFunction[K, V1, V2, R], + fn: R => TraversableOnce[R1] + ) extends HashJoinFunction[K, V1, V2, R1] { + def apply(k: K, left: V1, right: Iterable[V2]) = + jf.apply(k, left, right).flatMap(fn) + } + + sealed abstract class JoinFunction[K, V1, V2, R] + extends Function3[K, Iterator[V1], Iterable[V2], Iterator[R]] + + final case class InnerJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (V1, V2)] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(V1, V2)] = + left.flatMap(v1 => right.iterator.map((v1, _))) + } + final case class LeftJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (V1, Option[V2])] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(V1, Option[V2])] = + left.flatMap(v1 => asOuter(right.iterator).map((v1, _))) + } + final case class RightJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (Option[V1], V2)] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(Option[V1], V2)] = + 
asOuter(left).flatMap(v1 => right.iterator.map((v1, _))) + } + final case class OuterJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (Option[V1], Option[V2])] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]): Iterator[(Option[V1], Option[V2])] = + if (left.isEmpty && right.isEmpty) Iterator.empty + else asOuter(left).flatMap(v1 => asOuter(right.iterator).map((v1, _))) + } + final case class FilteredJoin[K, V1, V2, R](jf: JoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) + extends JoinFunction[K, V1, V2, R] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = + jf.apply(k, left, right).filter(r => fn((k, r))) + } + final case class MappedJoin[K, V1, V2, R, R1](jf: JoinFunction[K, V1, V2, R], fn: R => R1) + extends JoinFunction[K, V1, V2, R1] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = + jf.apply(k, left, right).map(fn) + } + final case class FlatMappedJoin[K, V1, V2, R, R1]( + jf: JoinFunction[K, V1, V2, R], + fn: R => TraversableOnce[R1] + ) extends JoinFunction[K, V1, V2, R1] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = + jf.apply(k, left, right).flatMap(fn) + } + final case class MappedGroupJoin[K, V1, V2, R, R1]( + jf: JoinFunction[K, V1, V2, R], + fn: (K, Iterator[R]) => Iterator[R1] + ) extends JoinFunction[K, V1, V2, R1] { + def apply(k: K, left: Iterator[V1], right: Iterable[V2]) = { + val iterr = jf.apply(k, left, right) + if (iterr.isEmpty) Iterator.empty // mapGroup operates on non-empty groups + else fn(k, iterr) + } + } + final case class JoinFromHashJoin[K, V1, V2, R](hj: (K, V1, Iterable[V2]) => Iterator[R]) + extends JoinFunction[K, V1, V2, R] { + def apply(k: K, itv: Iterator[V1], itu: Iterable[V2]) = + itv.flatMap(hj(k, _, itu)) + } + + /** + * an inner-like join function is empty definitely if either side is empty + */ + final def isInnerJoinLike[K, V1, V2, R]( + jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R] + ): Option[Boolean] = + jf match { + case InnerJoin() => Some(true) + 
case LeftJoin() => Some(false) + case RightJoin() => Some(false) + case OuterJoin() => Some(false) + case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) + case FilteredJoin(jf, _) => isInnerJoinLike(jf) + case MappedJoin(jf, _) => isInnerJoinLike(jf) + case FlatMappedJoin(jf, _) => isInnerJoinLike(jf) + case MappedGroupJoin(jf, _) => isInnerJoinLike(jf) + case _ => None + } + + /** + * a left-like join function is empty definitely if the left side is empty + */ + final def isLeftJoinLike[K, V1, V2, R]( + jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R] + ): Option[Boolean] = + jf match { + case InnerJoin() => Some(true) + case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) + case LeftJoin() => Some(true) + case RightJoin() => Some(false) + case OuterJoin() => Some(false) + case FilteredJoin(jf, _) => isLeftJoinLike(jf) + case MappedJoin(jf, _) => isLeftJoinLike(jf) + case FlatMappedJoin(jf, _) => isLeftJoinLike(jf) + case MappedGroupJoin(jf, _) => isLeftJoinLike(jf) + case _ => None + } + + /** + * a right-like join function is empty definitely if the right side is empty + */ + final def isRightJoinLike[K, V1, V2, R]( + jf: (K, Iterator[V1], Iterable[V2]) => Iterator[R] + ): Option[Boolean] = + jf match { + case InnerJoin() => Some(true) + case JoinFromHashJoin(hj) => isInnerHashJoinLike(hj) + case LeftJoin() => Some(false) + case RightJoin() => Some(true) + case OuterJoin() => Some(false) + case FilteredJoin(jf, _) => isRightJoinLike(jf) + case MappedJoin(jf, _) => isRightJoinLike(jf) + case FlatMappedJoin(jf, _) => isRightJoinLike(jf) + case MappedGroupJoin(jf, _) => isRightJoinLike(jf) + case _ => None + } + + /** + * a inner-like hash-join function is empty definitely if either side is empty + */ + final def isInnerHashJoinLike[K, V1, V2, R](jf: (K, V1, Iterable[V2]) => Iterator[R]): Option[Boolean] = + jf match { + case HashInner() => Some(true) + case HashLeft() => Some(false) + case FilteredHashJoin(jf, _) => isInnerHashJoinLike(jf) + case 
MappedHashJoin(jf, _) => isInnerHashJoinLike(jf) + case FlatMappedHashJoin(jf, _) => isInnerHashJoinLike(jf) + case _ => None + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/KeyedList.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/KeyedList.scala new file mode 100644 index 0000000000..1d5bb3ee38 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/KeyedList.scala @@ -0,0 +1,353 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +import java.io.Serializable +import scala.collection.JavaConverters._ + +import com.twitter.algebird.{Aggregator, Fold, Ring, Semigroup} + +import com.twitter.scalding.typed.functions._ + +object KeyedListLike { + + /** KeyedListLike items are implicitly convertable to TypedPipe */ + implicit def toTypedPipe[K, V, S[K, +V] <: KeyedListLike[K, V, S]]( + keyed: KeyedListLike[K, V, S] + ): TypedPipe[(K, V)] = keyed.toTypedPipe + + implicit def toTypedPipeKeyed[K, V, S[K, +V] <: KeyedListLike[K, V, S]]( + keyed: KeyedListLike[K, V, S] + ): TypedPipe.Keyed[K, V] = + new TypedPipe.Keyed(keyed.toTypedPipe) +} + +/** + * This is for the case where you don't want to expose any structure but the ability to operate on an iterator + * of the values + */ +trait KeyedList[K, +T] extends KeyedListLike[K, T, KeyedList] + +/** + * Represents sharded lists of items of type T There are exactly two fundamental operations: toTypedPipe: + * marks the end of the grouped-on-key operations. mapValueStream: further transforms all values, in order, + * one at a time, with a function from Iterator to another Iterator + */ +trait KeyedListLike[K, +T, +This[K, +T] <: KeyedListLike[K, T, This]] extends Serializable { + + /** + * End of the operations on values. From this point on the keyed structure is lost and another shuffle is + * generally required to reconstruct it + */ + def toTypedPipe: TypedPipe[(K, T)] + + /** + * This is like take except that the items are kept in memory and we attempt to partially execute on the + * mappers if possible For very large values of n, this could create memory pressure. (as you may aggregate + * n items in a memory heap for each key) If you get OOM issues, try to resolve using the method `take` + * instead. + */ + def bufferedTake(n: Int): This[K, T] + /* + Here is an example implementation, but since each subclass of + KeyedListLike has its own constraints, this is always to be + overriden. 
+ + {@code + + if (n < 1) { + // This means don't take anything, which is legal, but strange + filterKeys(Constant(false)) + } else if (n == 1) { + head + } else { + // By default, there is no ordering. This method is overridden + // in IdentityValueSortedReduce + // Note, this is going to bias toward low hashcode items. + // If you care which items you take, you should sort by a random number + // or the value itself. + val fakeOrdering: Ordering[T] = Ordering.by { v: T => v.hashCode } + implicit val mon = new ScaldingPriorityQueueMonoid(n)(fakeOrdering) + mapValues(mon.build(_)) + // Do the heap-sort on the mappers: + .sum + .mapValues { vs => vs.iterator.asScala } + .flattenValues + } + + } + */ + + /** + * filter keys on a predicate. More efficient than filter if you are only looking at keys + */ + def filterKeys(fn: K => Boolean): This[K, T] + /* an inefficient implementation is below, but + * since this can always be pushed mapside, we should avoid + * using this implementation, lest we accidentally forget to + * implement the smart thing + * {@code + * mapGroup { (k: K, items: Iterator[T]) => if (fn(k)) items else Iterator.empty } + * } + */ + + /** + * Operate on an Iterator[T] of all the values for each key at one time. Prefer this to toList, when you can + * avoid accumulating the whole list in memory. Prefer sum, which is partially executed map-side by default. + * Use mapValueStream when you don't care about the key for the group. + * + * Iterator is always Non-empty. 
Note, any key that has all values removed will not appear in subsequent + * .mapGroup/mapValueStream + */ + def mapGroup[V](smfn: (K, Iterator[T]) => Iterator[V]): This[K, V] + + /////////// + /// The below are all implemented in terms of the above: + /////////// + + /** + * Use Algebird Aggregator to do the reduction + */ + def aggregate[B, C](agg: Aggregator[T, B, C]): This[K, C] = + mapValues[B](AggPrepare(agg)) + .sum[B](agg.semigroup) + .mapValues[C](AggPresent(agg)) + + /** + * .filter(fn).toTypedPipe == .toTypedPipe.filter(fn) It is generally better to avoid going back to a + * TypedPipe as long as possible: this minimizes the times we go in and out of cascading/hadoop types. + */ + def filter(fn: ((K, T)) => Boolean): This[K, T] = + mapGroup(FilterGroup(fn)) + + /** + * flatten the values Useful after sortedTake, for instance + */ + def flattenValues[U](implicit ev: T <:< TraversableOnce[U]): This[K, U] = + flatMapValues(Widen(SubTypes.fromEv(ev))) + + /** + * This is just short hand for mapValueStream(identity), it makes sure the planner sees that you want to + * force a shuffle. For expert tuning + */ + def forceToReducers: This[K, T] = + mapValueStream(Identity()) + + /** + * Use this to get the first value encountered. prefer this to take(1). + */ + def head: This[K, T] = sum(HeadSemigroup[T]()) + + /** + * This is a special case of mapValueStream, but can be optimized because it doesn't need all the values for + * a given key at once. An unoptimized implementation is: mapValueStream { _.map { fn } } but for Grouped we + * can avoid resorting to mapValueStream + */ + def mapValues[V](fn: T => V): This[K, V] = + mapGroup(MapGroupMapValues(fn)) + + /** + * Similar to mapValues, but works like flatMap, returning a collection of outputs for each value input. 
   */
  def flatMapValues[V](fn: T => TraversableOnce[V]): This[K, V] =
    mapGroup(MapGroupFlatMapValues(fn))

  /**
   * Use this when you don't care about the key for the group, otherwise use mapGroup
   */
  def mapValueStream[V](smfn: Iterator[T] => Iterator[V]): This[K, V] =
    mapGroup(MapValueStream(smfn))

  /**
   * Add all items according to the implicit Semigroup. If there is no sorting, we default to assuming the
   * Semigroup is commutative. If you don't want that, define an ordering on the Values, sort or
   * .forceToReducers.
   *
   * Semigroups MAY have a faster implementation of sum for iterators, so prefer using sum/sumLeft to reduce
   */
  def sum[U >: T](implicit sg: Semigroup[U]): This[K, U] = sumLeft[U]

  /**
   * reduce with fn which must be associative and commutative. Like the above this can be optimized in some
   * Grouped cases. If you don't have a commutative operator, use reduceLeft
   */
  def reduce[U >: T](fn: (U, U) => U): This[K, U] =
    sum(SemigroupFromFn(fn))

  /**
   * Take the largest k things according to the implicit ordering. Useful for top-k without having to call
   * ord.reverse
   */
  def sortedReverseTake[U >: T](k: Int)(implicit ord: Ordering[U]): This[K, Seq[U]] =
    sortedTake[U](k)(ord.reverse)

  /**
   * This implements bottom-k (smallest k items) on each mapper for each key, then sends those to reducers to
   * get the result. This is faster than using .take if k * (number of Keys) is small enough to fit in memory.
   */
  def sortedTake[U >: T](k: Int)(implicit ord: Ordering[U]): This[K, Seq[U]] = {
    val mon = new ScaldingPriorityQueueMonoid[U](k)(ord)
    mapValues(mon.build(_))
      .sum(mon) // results in a PriorityQueue
      // scala can't infer the type, possibly due to the view bound on TypedPipe
      .mapValues(_.iterator.asScala.toList.sorted(ord))
  }

  /** Like the above, but with a less than operation for the ordering */
  def sortWithTake[U >: T](k: Int)(lessThan: (U, U) => Boolean): This[K, Seq[T]] =
    sortedTake(k)(Ordering.fromLessThan(lessThan))

  /** For each key, Return the product of all the values */
  def product[U >: T](implicit ring: Ring[U]): This[K, U] =
    sum(SemigroupFromProduct(ring))

  /** For each key, count the number of values that satisfy a predicate */
  def count(fn: T => Boolean): This[K, Long] =
    mapValues(Count(fn)).sum

  /** For each key, check to see if a predicate is true for all Values */
  def forall(fn: T => Boolean): This[K, Boolean] =
    mapValues(fn).product

  /**
   * For each key, selects all elements except first n ones.
   */
  def drop(n: Int): This[K, T] =
    mapValueStream(Drop(n))

  /**
   * For each key, Drops longest prefix of elements that satisfy the given predicate.
   */
  def dropWhile(p: T => Boolean): This[K, T] =
    mapValueStream(DropWhile(p))

  /**
   * For each key, Selects first n elements. Don't use this if n == 1, head is faster in that case.
   */
  def take(n: Int): This[K, T] =
    if (n < 1) filterKeys(Constant(false)) // just don't keep anything
    else if (n == 1) head
    else mapValueStream(Take(n))

  /**
   * For each key, Takes longest prefix of elements that satisfy the given predicate.
   */
  def takeWhile(p: T => Boolean): This[K, T] =
    mapValueStream(TakeWhile(p))

  /**
   * Folds are composable aggregations that make one pass over the data. If you need to do several custom
   * folds over the same data, use Fold.join and this method
   */
  def fold[V](f: Fold[T, V]): This[K, V] =
    mapValueStream(FoldIterator(f))

  /**
   * If the fold depends on the key, use this method to construct the fold for each key
   */
  def foldWithKey[V](fn: K => Fold[T, V]): This[K, V] =
    mapGroup(FoldWithKeyIterator(fn))

  /** For each key, fold the values. see scala.collection.Iterable.foldLeft */
  def foldLeft[B](z: B)(fn: (B, T) => B): This[K, B] =
    mapValueStream(FoldLeftIterator(z, fn))

  /** For each key, scanLeft the values. see scala.collection.Iterable.scanLeft */
  def scanLeft[B](z: B)(fn: (B, T) => B): This[K, B] =
    mapValueStream(ScanLeftIterator(z, fn))

  /**
   * Similar to reduce but always on the reduce-side (never optimized to mapside), and named for the scala
   * function. fn need not be associative and/or commutative. Makes sense when you want to reduce, but in a
   * particular sorted order. the old value comes in on the left.
   */
  def reduceLeft[U >: T](fn: (U, U) => U): This[K, U] =
    sumLeft[U](SemigroupFromFn(fn))

  /**
   * Semigroups MAY have a faster implementation of sum for iterators, so prefer using sum/sumLeft to
   * reduce/reduceLeft
   */
  def sumLeft[U >: T](implicit sg: Semigroup[U]): This[K, U] =
    mapValueStream[U](SumAll(sg))

  /** For each key, give the number of values */
  def size: This[K, Long] = mapValues(Constant(1L)).sum

  /**
   * For each key, give the number of unique values. WARNING: May OOM. This assumes the values for each key
   * can fit in memory.
   */
  def distinctSize: This[K, Long] =
    toSet[T].mapValues(SizeOfSet())

  /**
   * For each key, remove duplicate values. WARNING: May OOM. This assumes the values for each key can fit in
   * memory.
   */
  def distinctValues: This[K, T] = toSet[T].flattenValues

  /**
   * AVOID THIS IF POSSIBLE For each key, accumulate all the values into a List. WARNING: May OOM Only use
   * this method if you are sure all the values will fit in memory. You really should try to ask why you need
   * all the values, and if you want to do some custom reduction, do it in mapGroup or mapValueStream
   *
   * This does no map-side aggregation even though it is a Monoid because toList does not decrease the size of
   * the data at all, so in practice it only wastes effort to try to cache.
   */
  def toList: This[K, List[T]] = mapValueStream(ToList[T]())

  /**
   * AVOID THIS IF POSSIBLE Same risks apply here as to toList: you may OOM. See toList. Note that toSet needs
   * to be parameterized even though toList does not. This is because List is covariant in its type parameter
   * in the scala API, but Set is invariant. See:
   * http://stackoverflow.com/questions/676615/why-is-scalas-immutable-set-not-covariant-in-its-type
   */
  def toSet[U >: T]: This[K, Set[U]] = mapValues(ToSet[U]()).sum

  /** For each key, give the maximum value */
  def max[B >: T](implicit cmp: Ordering[B]): This[K, T] =
    reduce(MaxOrd[T, B](cmp))

  /** For each key, give the maximum value by some function */
  def maxBy[B](fn: T => B)(implicit cmp: Ordering[B]): This[K, T] =
    reduce(MaxOrdBy(fn, cmp))

  /** For each key, give the minimum value */
  def min[B >: T](implicit cmp: Ordering[B]): This[K, T] =
    reduce(MinOrd[T, B](cmp))

  /** For each key, give the minimum value by some function */
  def minBy[B](fn: T => B)(implicit cmp: Ordering[B]): This[K, T] =
    reduce(MinOrdBy(fn, cmp))

  /**
   * Use this to error if there is more than 1 value per key. Using this makes it easier to detect when data
   * does not have the shape you expect and to communicate to scalding that certain optimizations are safe to
   * do
   *
   * Note, this has no effect and is a waste to call after sum because it is true by construction at that
   * point
   */
  def requireSingleValuePerKey: This[K, T] =
    mapValueStream(SumAll(RequireSingleSemigroup()))

  /** Convert to a TypedPipe and only keep the keys */
  def keys: TypedPipe[K] = toTypedPipe.keys

  /** Convert to a TypedPipe and only keep the values */
  def values: TypedPipe[T] = toTypedPipe.values
}
/*
Copyright 2014 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.scalding.typed

/**
 * Represents anything that starts as a TypedPipe of Key Value, where the value type has been erased. Acts as
 * proof that the K in the tuple has an Ordering
 */
trait KeyedPipe[K] {
  // Evidence that key type K is ordered (required to group/sort by key)
  def keyOrdering: Ordering[K]
  // The underlying pipe with the value type erased to Any
  def mapped: TypedPipe[(K, Any)]
}
/*
 Copyright 2013 Twitter, Inc.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package com.twitter.scalding.typed

import java.io.Serializable

import com.twitter.algebird.Semigroup

/**
 * lookupJoin simulates the behavior of a realtime system attempting to leftJoin (K, V) pairs against some
 * other value type (JoinedV) by performing realtime lookups on a key-value Store.
 *
 * An example would join (K, V) pairs of (URL, Username) against a service of (URL, ImpressionCount). The
 * result of this join would be a pipe of (URL, (Username, Option[ImpressionCount])).
 *
 * To simulate this behavior, lookupJoin accepts pipes of key-value pairs with an explicit time value T
 * attached. T must have some sensible ordering. The semantics are, if one were to hit the right pipe's
 * simulated realtime service at any time between T(tuple) and T(tuple + 1), one would receive Some((K,
 * JoinedV)(tuple)).
 *
 * The entries in the left pipe's tuples have the following meaning:
 *
 * T: The time at which the (K, W) lookup occurred. K: the join key. W: the current value for the join key.
 *
 * The right pipe's entries have the following meaning:
 *
 * T: The time at which the "service" was fed an update K: the join K. V: value of the key at time T
 *
 * Before the time T in the right pipe's very first entry, the simulated "service" will return None. After
 * this time T, the right side will return None only if the key is absent, else, the service will return
 * Some(joinedV).
 */

object LookupJoin extends Serializable {

  /**
   * This is the "infinite history" join and always joins regardless of how much time is between the left and
   * the right
   */

  def apply[T: Ordering, K: Ordering, V, JoinedV](
      left: TypedPipe[(T, (K, V))],
      right: TypedPipe[(T, (K, JoinedV))],
      reducers: Option[Int] = None
  ): TypedPipe[(T, (K, (V, Option[JoinedV])))] =
    withWindow(left, right, reducers)((_, _) => true)

  /**
   * In this case, the right pipe is fed through a scanLeft doing a Semigroup.plus before joined to the left
   */
  def rightSumming[T: Ordering, K: Ordering, V, JoinedV: Semigroup](
      left: TypedPipe[(T, (K, V))],
      right: TypedPipe[(T, (K, JoinedV))],
      reducers: Option[Int] = None
  ): TypedPipe[(T, (K, (V, Option[JoinedV])))] =
    withWindowRightSumming(left, right, reducers)((_, _) => true)

  /**
   * This ensures that gate(Tleft, Tright) == true, else the None is emitted as the joined value. Useful for
   * bounding the time of the join to a recent window
   */
  def withWindow[T: Ordering, K: Ordering, V, JoinedV](
      left: TypedPipe[(T, (K, V))],
      right: TypedPipe[(T, (K, JoinedV))],
      reducers: Option[Int] = None
  )(gate: (T, T) => Boolean): TypedPipe[(T, (K, (V, Option[JoinedV])))] = {

    // "last write wins": summing two service values just keeps the newer one
    implicit val keepNew: Semigroup[JoinedV] = Semigroup.from((older, newer) => newer)
    withWindowRightSumming(left, right, reducers)(gate)
  }

  /**
   * This ensures that gate(Tleft, Tright) == true, else the None is emitted as the joined value, and sums are
   * only done as long as they come within the gate interval as well
   */
  def withWindowRightSumming[T: Ordering, K: Ordering, V, JoinedV: Semigroup](
      left: TypedPipe[(T, (K, V))],
      right: TypedPipe[(T, (K, JoinedV))],
      reducers: Option[Int] = None
  )(gate: (T, T) => Boolean): TypedPipe[(T, (K, (V, Option[JoinedV])))] = {

    /**
     * Implicit ordering on an either that doesn't care about the actual container values, puts the lookups
     * before the service writes. Since we assume it takes non-zero time to do a lookup.
     */
    implicit def eitherOrd[T, U]: Ordering[Either[T, U]] =
      new Ordering[Either[T, U]] {
        def compare(l: Either[T, U], r: Either[T, U]) =
          (l, r) match {
            case (Left(_), Right(_)) => -1
            case (Right(_), Left(_)) => 1
            case (Left(_), Left(_)) => 0
            case (Right(_), Right(_)) => 0
          }
      }

    val joined: TypedPipe[(K, (Option[(T, JoinedV)], Option[(T, V, Option[JoinedV])]))] =
      left
        .map { case (t, (k, v)) => (k, (t, Left(v): Either[V, JoinedV])) }
        .++(right.map { case (t, (k, joinedV)) =>
          (k, (t, Right(joinedV): Either[V, JoinedV]))
        })
        .group
        .withReducers(reducers.getOrElse(-1)) // -1 means default in scalding
        .sorted
        /**
         * Grouping by K leaves values of (T, Either[V, JoinedV]). Sort by time and scanLeft. The iterator
         * will now represent pairs of T and either new values to join against or updates to the simulated
         * "realtime store" described above.
         */
        .scanLeft(
          /**
           * In the simulated realtime store described above, this None is the value in the store at the
           * current time. Because we sort by time and scan forward, this value will be updated with a new
           * value every time a Right(delta) shows up in the iterator.
           *
           * The second entry in the pair will be None when the JoinedV is updated and Some(newValue) when a
           * (K, V) shows up and a new join occurs.
           */
          (Option.empty[(T, JoinedV)], Option.empty[(T, V, Option[JoinedV])])
        ) {
          case ((None, result), (time, Left(v))) => {
            // There was no value previously
            (None, Some((time, v, None)))
          }

          case ((prev @ Some((oldt, jv)), result), (time, Left(v))) => {
            // Left(v) means that we have a new value from the left
            // pipe that we need to join against the current
            // "lastJoined" value sitting in scanLeft's state. This
            // is equivalent to a lookup on the data in the right
            // pipe at time "thisTime".
            val filteredJoined = if (gate(time, oldt)) Some(jv) else None
            (prev, Some((time, v, filteredJoined)))
          }

          case ((None, result), (time, Right(joined))) => {
            // There was no value before, so we just update to joined
            (Some((time, joined)), None)
          }

          case ((Some((oldt, oldJ)), result), (time, Right(joined))) => {
            // Right(joinedV) means that we've received a new value
            // to use in the simulated realtime service
            // described in the comments above
            // did it fall out of cache?
            val nextJoined = if (gate(time, oldt)) Semigroup.plus(oldJ, joined) else joined
            (Some((time, nextJoined)), None)
          }
        }
        .toTypedPipe

    // Now, get rid of residual state from the scanLeft above:
    joined.flatMap { case (k, (_, optV)) =>
      // filter out every event that produced a Right(delta) above,
      // leaving only the leftJoin events that occurred above:
      optV.map { case (t, v, optJoined) =>
        (t, (k, (v, optJoined)))
      }
    }
  }
}
// following were autogenerated by ./codegen/multi_join_generator.rb at Mon Dec 01 19:28:47 -0800 2014 do not edit
package com.twitter.scalding.typed

/**
 * This is an autogenerated object which gives you easy access to doing N-way joins so the types are cleaner.
+ * However, it just calls the underlying methods on CoGroupable and flattens the resulting tuple + */ +object MultiJoin extends java.io.Serializable { + import com.twitter.scalding.typed.FlattenGroup._ + + def apply[KEY, A, B](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B]): CoGrouped[KEY, (A, B)] = + a.join(b) + + def apply[KEY, A, B, C]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C] + ): CoGrouped[KEY, (A, B, C)] = + a.join(b) + .join(c) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D] + ): CoGrouped[KEY, (A, B, C, D)] = + a.join(b) + .join(c) + .join(d) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E] + ): CoGrouped[KEY, (A, B, C, D, E)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F] + ): CoGrouped[KEY, (A, B, C, D, E, F)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G] + ): CoGrouped[KEY, (A, B, C, D, E, F, G)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: 
CoGroupable[KEY, G], + h: CoGroupable[KEY, H] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: 
CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: 
CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .join(q) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, 
E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .join(q) + .join(r) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .join(q) + .join(r) + .join(s) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, 
H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .join(q) + .join(r) + .join(s) + .join(t) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .join(q) + .join(r) + .join(s) + .join(t) + .join(u) + .mapValues(tup => flattenNestedTuple(tup)) + + def apply[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: 
CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U], + v: CoGroupable[KEY, V] + ): CoGrouped[KEY, (A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V)] = + a.join(b) + .join(c) + .join(d) + .join(e) + .join(f) + .join(g) + .join(h) + .join(i) + .join(j) + .join(k) + .join(l) + .join(m) + .join(n) + .join(o) + .join(p) + .join(q) + .join(r) + .join(s) + .join(t) + .join(u) + .join(v) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B](a: CoGroupable[KEY, A], b: CoGroupable[KEY, B]): CoGrouped[KEY, (A, Option[B])] = + a.leftJoin(b) + + def left[KEY, A, B, C]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C] + ): CoGrouped[KEY, (A, Option[B], Option[C])] = + a.leftJoin(b) + .leftJoin(c) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D])] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E])] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F])] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) 
+ .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G])] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H] + ): CoGrouped[KEY, (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H])] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I] + ): CoGrouped[ + KEY, + (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J] + ): CoGrouped[ + KEY, + (A, Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I], 
Option[J]) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: 
CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + 
Option[M], + Option[N], + Option[O] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + 
Option[N], + Option[O], + Option[P], + Option[Q] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .leftJoin(q) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .leftJoin(q) + .leftJoin(r) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: 
CoGroupable[KEY, S] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .leftJoin(q) + .leftJoin(r) + .leftJoin(s) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .leftJoin(q) + .leftJoin(r) + .leftJoin(s) + .leftJoin(t) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: 
CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .leftJoin(q) + .leftJoin(r) + .leftJoin(s) + .leftJoin(t) + .leftJoin(u) + .mapValues(tup => flattenNestedTuple(tup)) + + def left[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U], + v: CoGroupable[KEY, V] + ): CoGrouped[ + KEY, + ( + A, + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + 
Option[Q], + Option[R], + Option[S], + Option[T], + Option[U], + Option[V] + ) + ] = + a.leftJoin(b) + .leftJoin(c) + .leftJoin(d) + .leftJoin(e) + .leftJoin(f) + .leftJoin(g) + .leftJoin(h) + .leftJoin(i) + .leftJoin(j) + .leftJoin(k) + .leftJoin(l) + .leftJoin(m) + .leftJoin(n) + .leftJoin(o) + .leftJoin(p) + .leftJoin(q) + .leftJoin(r) + .leftJoin(s) + .leftJoin(t) + .leftJoin(u) + .leftJoin(v) + .mapValues(tup => flattenNestedTuple(tup)) + + def outer[KEY, A, B]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B] + ): CoGrouped[KEY, (Option[A], Option[B])] = + a.outerJoin(b) + + def outer[KEY, A, B, C]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C])] = + a.outerJoin(b) + .outerJoin(c) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D])] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E])] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F])] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, 
B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G] + ): CoGrouped[KEY, (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G])] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H] + ): CoGrouped[ + KEY, + (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H]) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I] + ): CoGrouped[ + KEY, + (Option[A], Option[B], Option[C], Option[D], Option[E], Option[F], Option[G], Option[H], Option[I]) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J] + ) + ] = + a.outerJoin(b) 
+ .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: 
CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + 
Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + 
Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + .outerJoin(q) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + .outerJoin(q) + .outerJoin(r) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: 
CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + .outerJoin(q) + .outerJoin(r) + .outerJoin(s) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + 
.outerJoin(q) + .outerJoin(r) + .outerJoin(s) + .outerJoin(t) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + .outerJoin(q) + .outerJoin(r) + .outerJoin(s) + .outerJoin(t) + .outerJoin(u) + .mapValues(tup => flattenNestedOptionTuple(tup)) + + def outer[KEY, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + a: CoGroupable[KEY, A], + b: CoGroupable[KEY, B], + c: CoGroupable[KEY, C], + d: CoGroupable[KEY, D], + e: CoGroupable[KEY, E], + f: CoGroupable[KEY, F], + g: CoGroupable[KEY, G], + h: CoGroupable[KEY, H], + i: CoGroupable[KEY, I], + j: CoGroupable[KEY, J], + k: CoGroupable[KEY, K], + l: CoGroupable[KEY, L], + m: CoGroupable[KEY, M], + n: CoGroupable[KEY, N], + o: CoGroupable[KEY, O], + p: CoGroupable[KEY, P], + q: CoGroupable[KEY, Q], + r: CoGroupable[KEY, R], + s: CoGroupable[KEY, 
S], + t: CoGroupable[KEY, T], + u: CoGroupable[KEY, U], + v: CoGroupable[KEY, V] + ): CoGrouped[ + KEY, + ( + Option[A], + Option[B], + Option[C], + Option[D], + Option[E], + Option[F], + Option[G], + Option[H], + Option[I], + Option[J], + Option[K], + Option[L], + Option[M], + Option[N], + Option[O], + Option[P], + Option[Q], + Option[R], + Option[S], + Option[T], + Option[U], + Option[V] + ) + ] = + a.outerJoin(b) + .outerJoin(c) + .outerJoin(d) + .outerJoin(e) + .outerJoin(f) + .outerJoin(g) + .outerJoin(h) + .outerJoin(i) + .outerJoin(j) + .outerJoin(k) + .outerJoin(l) + .outerJoin(m) + .outerJoin(n) + .outerJoin(o) + .outerJoin(p) + .outerJoin(q) + .outerJoin(r) + .outerJoin(s) + .outerJoin(t) + .outerJoin(u) + .outerJoin(v) + .mapValues(tup => flattenNestedOptionTuple(tup)) + +} +// end of autogenerated diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala new file mode 100644 index 0000000000..be15efd2c0 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/MultiJoinFunction.scala @@ -0,0 +1,157 @@ +package com.twitter.scalding.typed + +import java.io.Serializable + +/** + * This is a weakly typed multi-way join function. By construction, it should be kept in sync with the types + * in a Seq[TypedPipe[(K, Any)]] + * + * a more sophisticated typing could use an HList of TypedPipe and another more advanced coding here to prove + * the types line up. However, this is somewhat easy to test and only exposed to those writing backends, so we + * are currently satisfied with the weak typing in this case + * + * We use Externalizer internally to independently serialize each function in the composition. This, in + * principle, should allow Externalizer to work better since different functions may be serializable with Kryo + * or Java, but currently Externalizer has to use java or kryo for the entire object. 
+ */ +sealed abstract class MultiJoinFunction[A, +B] extends Serializable { + def inputSize: Int + def apply(key: A, leftMost: Iterator[Any], rightStreams: Seq[Iterable[Any]]): Iterator[B] +} + +object MultiJoinFunction extends Serializable { + final case class Casting[A, B]() extends MultiJoinFunction[A, B] { + def inputSize = 1 + def apply(k: A, iter: Iterator[Any], empties: Seq[Iterable[Any]]) = { + require(empties.isEmpty, "this join function should never be called with non-empty right-most") + iter.asInstanceOf[Iterator[B]] + } + } + + final case class PairCachedRight[K, A, B, C]( + left: MultiJoinFunction[K, A], + right: MultiJoinFunction[K, B], + fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ) extends MultiJoinFunction[K, C] { + + val inputSize = left.inputSize + right.inputSize + private[this] val leftSeqCount = left.inputSize - 1 + + def apply(key: K, leftMost: Iterator[Any], rightStreams: Seq[Iterable[Any]]): Iterator[C] = { + /* + * This require is just an extra check (which should never possibly fail unless we have a programming bug) + * that the number of streams we are joining matches the total joining operation we have. + * + * Since we have one stream in leftMost, the others should be in rightStreams. 
+ * + * This check is cheap compared with the whole join, so we put this here to aid in checking + * correctness due to the weak types that MultiJoinFunction has (non-static size of Seq and + * the use of Any) + */ + require( + rightStreams.size == inputSize - 1, + s"expected $inputSize inputSize, found ${rightStreams.size + 1}" + ) + val (leftSeq, rightSeq) = rightStreams.splitAt(leftSeqCount) + val joinedLeft = left(key, leftMost, leftSeq) + + // we should materialize the final right one time: + val joinedRight = right(key, rightSeq.head.iterator, rightSeq.tail).toList + fn(key, joinedLeft, joinedRight) + } + } + + final case class Pair[K, A, B, C]( + left: MultiJoinFunction[K, A], + right: MultiJoinFunction[K, B], + fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ) extends MultiJoinFunction[K, C] { + + val inputSize = left.inputSize + right.inputSize + private[this] val leftSeqCount = left.inputSize - 1 + + def apply(key: K, leftMost: Iterator[Any], rightStreams: Seq[Iterable[Any]]): Iterator[C] = { + /* + * This require is just an extra check (which should never possibly fail unless we have a programming bug) + * that the number of streams we are joining matches the total joining operation we have. + * + * Since we have one stream in leftMost, the others should be in rightStreams. 
+ * + * This check is cheap compared with the whole join, so we put this here to aid in checking + * correctness due to the weak types that MultiJoinFunction has (non-static size of Seq and + * the use of Any) + */ + require( + rightStreams.size == inputSize - 1, + s"expected $inputSize inputSize, found ${rightStreams.size + 1}" + ) + val (leftSeq, rightSeq) = rightStreams.splitAt(leftSeqCount) + val joinedLeft = left(key, leftMost, leftSeq) + + // Only do this once, for all calls to iterator below + val smallerHead = rightSeq.head // linter:disable:UndesirableTypeInference + val smallerTail = rightSeq.tail + + // TODO: it might make sense to cache this in memory as an IndexedSeq and not + // recompute it on every value for the left if the smallerJf is non-trivial + // we could see how long it is, and possible switch to a cached version the + // second time through if it is small enough + val joinedRight = new Iterable[B] { + def iterator = right(key, smallerHead.iterator, smallerTail) + } + + fn(key, joinedLeft, joinedRight) + } + } + + /** + * This is used to implement mapGroup on already joined streams + */ + final case class MapGroup[K, A, B]( + input: MultiJoinFunction[K, A], + mapGroupFn: (K, Iterator[A]) => Iterator[B] + ) extends MultiJoinFunction[K, B] { + + def inputSize = input.inputSize + + def apply(key: K, leftMost: Iterator[Any], rightStreams: Seq[Iterable[Any]]): Iterator[B] = { + val joined = input(key, leftMost, rightStreams) + mapGroupFn(key, joined) + } + } + + /** + * This is used to join IteratorMappedReduce with others. We could compose Casting[A] with MapGroup[K, A, B] + * but since it is common enough we give it its own case. 
+ */ + final case class MapCast[K, A, B](mapGroupFn: (K, Iterator[A]) => Iterator[B]) + extends MultiJoinFunction[K, B] { + + def inputSize = 1 + + def apply(key: K, leftMost: Iterator[Any], rightStreams: Seq[Iterable[Any]]): Iterator[B] = { + require(rightStreams.isEmpty, "this join function should never be called with non-empty right-most") + mapGroupFn(key, leftMost.asInstanceOf[Iterator[A]]) + } + } + + abstract class Transformer extends Serializable { + def transformJoin[A, B, C, D]( + fn: (A, Iterator[B], Iterable[C]) => Iterator[D] + ): (A, Iterator[B], Iterable[C]) => Iterator[D] + def transformMap[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): (A, Iterator[B]) => Iterator[C] + + def apply[A, B](mjf: MultiJoinFunction[A, B]): MultiJoinFunction[A, B] = + mjf match { + case c @ Casting() => c + case PairCachedRight(l, r, fn) => + PairCachedRight(apply(l), apply(r), transformJoin(fn)) + case Pair(l, r, fn) => + Pair(apply(l), apply(r), transformJoin(fn)) + case MapGroup(prev, fn) => + MapGroup(apply(prev), transformMap(fn)) + case mc: MapCast[A, x, B] => + MapCast[A, x, B](transformMap(mc.mapGroupFn)) + } + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala new file mode 100644 index 0000000000..a7569f34bb --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/NoStackAndThen.scala @@ -0,0 +1,106 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +/** + * This type is used to implement .andThen on a function in a way that will never blow up the stack. This is + * done to prevent deep scalding TypedPipe pipelines from blowing the stack + * + * This may be slow, but is used in scalding at planning time + */ +sealed trait NoStackAndThen[-A, +B] extends java.io.Serializable { + def apply(a: A): B + def andThen[C](fn: B => C): NoStackAndThen[A, C] = NoStackAndThen.NoStackMore(this, fn) + def andThen[C](that: NoStackAndThen[B, C]): NoStackAndThen[A, C] = { + import NoStackAndThen._ + @annotation.tailrec + def push( + front: NoStackAndThen[A, Any], + next: NoStackAndThen[Any, Any], + toAndThen: ReversedStack[Any, C] + ): NoStackAndThen[A, C] = + (next, toAndThen) match { + case (NoStackWrap(fn), EmptyStack(fn2)) => NoStackMore(front, fn).andThen(fn2) + case (NoStackWrap(fn), NonEmpty(h, tail)) => + push(NoStackMore(front, fn), NoStackAndThen.NoStackWrap(h), tail) + case (NoStackMore(first, tail), _) => push(front, first, NonEmpty(tail, toAndThen)) + case (WithStackTrace(_, _), _) => sys.error("should be unreachable") + } + that match { + case NoStackWrap(fn) => andThen(fn) + case NoStackMore(head, tail) => + // casts needed for the tailrec, they can't cause runtime errors + push(this, head.asInstanceOf[NoStackAndThen[Any, Any]], EmptyStack(tail)) + case WithStackTrace(inner, stack) => WithStackTrace(andThen(inner), stack) + } + } +} + +object NoStackAndThen { + private[typed] def buildStackEntry: Array[StackTraceElement] = Thread.currentThread().getStackTrace + + def apply[A, B](fn: A => B): NoStackAndThen[A, B] = WithStackTrace(NoStackWrap(fn), buildStackEntry) + + private sealed trait ReversedStack[-A, +B] + private final case class EmptyStack[-A, +B](fn: A => B) extends ReversedStack[A, B] + private final case class NonEmpty[-A, B, +C](head: A => B, rest: 
ReversedStack[B, C]) + extends ReversedStack[A, C] + + private[scalding] final case class WithStackTrace[A, B]( + inner: NoStackAndThen[A, B], + stackEntry: Array[StackTraceElement] + ) extends NoStackAndThen[A, B] { + override def apply(a: A): B = inner(a) + + override def andThen[C](fn: B => C): NoStackAndThen[A, C] = + WithStackTrace[A, C](inner.andThen(fn), stackEntry ++ buildStackEntry) + + override def andThen[C](that: NoStackAndThen[B, C]): NoStackAndThen[A, C] = + WithStackTrace[A, C](inner.andThen(that), stackEntry ++ buildStackEntry) + } + + // Just wraps a function + private final case class NoStackWrap[A, B](fn: A => B) extends NoStackAndThen[A, B] { + def apply(a: A) = fn(a) + } + // This is the defunctionalized andThen + private final case class NoStackMore[A, B, C](first: NoStackAndThen[A, B], andThenFn: (B) => C) + extends NoStackAndThen[A, C] { + /* + * scala cannot optimize tail calls if the types change. + * Any call that changes types, we replace that type with Any. These casts + * can never fail, due to the structure above. 
+ */ + @annotation.tailrec + private def reversed(toPush: NoStackAndThen[A, Any], rest: ReversedStack[Any, C]): ReversedStack[A, C] = + toPush match { + case NoStackWrap(fn) => NonEmpty(fn, rest) + case NoStackMore(more, fn) => reversed(more, NonEmpty(fn, rest)) + case WithStackTrace(_, _) => sys.error("should be unreachable") + } + @annotation.tailrec + private def call(arg: Any, revstack: ReversedStack[Any, C]): C = revstack match { + case EmptyStack(last) => last(arg) + case NonEmpty(head, rest) => call(head(arg), rest) + } + private lazy val revStack: ReversedStack[Any, C] = + // casts needed for the tailrec, they can't cause runtime errors + reversed(first, EmptyStack(andThenFn.asInstanceOf[(Any) => C])) + .asInstanceOf[ReversedStack[Any, C]] + + def apply(a: A): C = call(a, revStack) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala new file mode 100644 index 0000000000..3a0a535591 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationPhases.scala @@ -0,0 +1,14 @@ +package com.twitter.scalding.typed + +import com.twitter.scalding.dagon.Rule + +/** + * This is a class to allow customization of how we plan typed pipes + */ +abstract class OptimizationPhases { + def phases: Seq[Rule[TypedPipe]] +} + +final class EmptyOptimizationPhases extends OptimizationPhases { + def phases = Nil +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala new file mode 100644 index 0000000000..31d43cf54a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/OptimizationRules.scala @@ -0,0 +1,1235 @@ +package com.twitter.scalding.typed + +import com.twitter.algebird.Monoid +import com.twitter.scalding.dagon.{Dag, FunctionK, Literal, Memoize, PartialRule, Rule} +import 
com.twitter.scalding.typed.functions.{ + Fill, + FilterGroup, + FilterKeysToFilter, + FlatMapValuesToFlatMap, + FlatMappedFn, + FlatMapping, + MapGroupFlatMapValues, + MapGroupMapValues, + MapValueStream, + MapValuesToMap, + MergeFlatMaps, + SumAll +} +import com.twitter.scalding.typed.functions.ComposedFunctions.{ + ComposedFilterFn, + ComposedMapFn, + ComposedOnComplete +} + +object OptimizationRules { + type LiteralPipe[T] = Literal[TypedPipe, T] + + import Literal.{Binary, Unary} + import TypedPipe._ + + /** + * Since our TypedPipe is covariant, but the Literal is not this is actually safe in this context, but not + * in general + */ + def widen[T](l: LiteralPipe[_ <: T]): LiteralPipe[T] = + // to prove this is safe, see that if you have + // LiteralPipe[_ <: T] we can call .evaluate to get + // TypedPipe[_ <: T] which due to covariance is + // TypedPipe[T], and then using toLiteral we can get + // LiteralPipe[T] + // + // that would be wasteful to apply since the final + // result is identity. 
+ l.asInstanceOf[LiteralPipe[T]] + + /** + * Convert a TypedPipe[T] to a Literal[TypedPipe, T] for use with Dagon + */ + def toLiteral: FunctionK[TypedPipe, LiteralPipe] = + Memoize.functionK[TypedPipe, LiteralPipe](new Memoize.RecursiveK[TypedPipe, LiteralPipe] { + + def toFunction[A] = { + case (cp: CounterPipe[a], f) => + Unary(f(cp.pipe), CounterPipe(_: TypedPipe[(a, Iterable[((String, String), Long)])])) + case (c: CrossPipe[a, b], f) => + Binary(f(c.left), f(c.right), CrossPipe(_: TypedPipe[a], _: TypedPipe[b])) + case (cv @ CrossValue(_, _), f) => + def go[A, B](cv: CrossValue[A, B]): LiteralPipe[(A, B)] = + cv match { + case CrossValue(a, ComputedValue(v)) => + Binary( + f(a), + f(v), + (a: TypedPipe[A], b: TypedPipe[B]) => CrossValue(a, ComputedValue(b)) + ) + case CrossValue(a, v) => + Unary(f(a), CrossValue(_: TypedPipe[A], v)) + } + widen(go(cv)) + case (p: DebugPipe[a], f) => + Unary(f(p.input), DebugPipe(_: TypedPipe[a])) + case (p: FilterKeys[a, b], f) => + widen(Unary(f(p.input), FilterKeys(_: TypedPipe[(a, b)], p.fn))) + case (p: Filter[a], f) => + Unary(f(p.input), Filter(_: TypedPipe[a], p.fn)) + case (p: Fork[a], f) => + Unary(f(p.input), Fork(_: TypedPipe[a])) + case (p: FlatMapValues[a, b, c], f) => + widen(Unary(f(p.input), FlatMapValues(_: TypedPipe[(a, b)], p.fn))) + case (p: FlatMapped[a, b], f) => + Unary(f(p.input), FlatMapped(_: TypedPipe[a], p.fn)) + case (p: ForceToDisk[a], f) => + Unary(f(p.input), ForceToDisk(_: TypedPipe[a])) + case (it @ IterablePipe(_), _) => + Literal.Const(it) + case (p: MapValues[a, b, c], f) => + widen(Unary(f(p.input), MapValues(_: TypedPipe[(a, b)], p.fn))) + case (p: Mapped[a, b], f) => + Unary(f(p.input), Mapped(_: TypedPipe[a], p.fn)) + case (p: MergedTypedPipe[a], f) => + Binary(f(p.left), f(p.right), MergedTypedPipe(_: TypedPipe[a], _: TypedPipe[a])) + case (src @ SourcePipe(_), _) => + Literal.Const(src) + case (p: SumByLocalKeys[a, b], f) => + widen(Unary(f(p.input), SumByLocalKeys(_: TypedPipe[(a, 
b)], p.semigroup))) + case (p: TrappedPipe[a], f) => + Unary(f(p.input), TrappedPipe[a](_: TypedPipe[a], p.sink)) + case (p: WithDescriptionTypedPipe[a], f) => + Unary(f(p.input), WithDescriptionTypedPipe(_: TypedPipe[a], p.descriptions)) + case (p: WithOnComplete[a], f) => + Unary(f(p.input), WithOnComplete(_: TypedPipe[a], p.fn)) + case (EmptyTypedPipe, _) => + Literal.Const(EmptyTypedPipe) + case (hg: HashCoGroup[a, b, c, d], f) => + widen(handleHashCoGroup(hg, f)) + case (CoGroupedPipe(cg), f) => + widen(handleCoGrouped(cg, f)) + case (ReduceStepPipe(rs), f) => + widen(handleReduceStep(rs, f)) + } + }) + + private def handleReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + recurse: FunctionK[TypedPipe, LiteralPipe] + ): LiteralPipe[(K, V2)] = { + // zero out the input so we can potentially GC it + val emptyRs = ReduceStep.setInput[K, V1, V2](rs, TypedPipe.empty) + + Unary( + widen[(K, V1)](recurse(rs.mapped)), + (tp: TypedPipe[(K, V1)]) => ReduceStepPipe(ReduceStep.setInput[K, V1, V2](emptyRs, tp)) + ) + } + + private def handleCoGrouped[K, V]( + cg: CoGroupable[K, V], + recurse: FunctionK[TypedPipe, LiteralPipe] + ): LiteralPipe[(K, V)] = { + import CoGrouped._ + + def pipeToCG[V1](t: TypedPipe[(K, V1)]): (CoGroupable[K, V1], List[(String, Boolean)]) = + t match { + case ReduceStepPipe(cg: CoGroupable[K @unchecked, V1 @unchecked]) => + // we are relying on the fact that we use Ordering[K] + // as a contravariant type, despite it not being defined + // that way. + (cg, Nil) + case CoGroupedPipe(cg) => + // we are relying on the fact that we use Ordering[K] + // as a contravariant type, despite it not being defined + // that way. 
+ (cg.asInstanceOf[CoGroupable[K, V1]], Nil) + case WithDescriptionTypedPipe(pipe, descs) => + val (cg, d1) = pipeToCG(pipe) + (cg, ComposeDescriptions.combine(d1, descs)) + case kvPipe => + (IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly), Nil) + } + + cg match { + case p @ Pair(_, _, _) => + def go[A, B, C](pair: Pair[K, A, B, C]): LiteralPipe[(K, C)] = { + val llit = handleCoGrouped(pair.larger, recurse) + val rlit = handleCoGrouped(pair.smaller, recurse) + val fn = pair.fn + Binary( + llit, + rlit, + { (l: TypedPipe[(K, A)], r: TypedPipe[(K, B)]) => + val (left, d1) = pipeToCG(l) + val (right, d2) = pipeToCG(r) + val d3 = ComposeDescriptions.combine(d1, d2) + val pair = Pair(left, right, fn) + val withD = d3.foldLeft(pair: CoGrouped[K, C]) { case (p, (d, _)) => + p.withDescription(d) + } + CoGroupedPipe(withD) + } + ) + } + widen(go(p)) + case wr @ WithReducers(_, _) => + def go[V1 <: V](wr: WithReducers[K, V1]): LiteralPipe[(K, V)] = { + val reds = wr.reds + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(wr.on, recurse), + (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + ReduceStepPipe(ReduceStep.withReducers(rs, reds)) + case CoGroupedPipe(cg) => + CoGroupedPipe(WithReducers(cg, reds)) + case kvPipe => + ReduceStepPipe( + IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + .withReducers(reds) + ) + } + ) + } + go(wr) + case wd @ WithDescription(_, _) => + def go[V1 <: V](wd: WithDescription[K, V1]): LiteralPipe[(K, V)] = { + val desc = wd.description + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(wd.on, recurse), + (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + ReduceStepPipe(ReduceStep.withDescription(rs, desc)) + case CoGroupedPipe(cg) => + CoGroupedPipe(WithDescription(cg, desc)) + case kvPipe => + kvPipe.withDescription(desc) + } + ) + } + go(wd) + case fk @ FilterKeys(_, _) => + def go[V1 <: V](fk: FilterKeys[K, V1]): LiteralPipe[(K, V)] = { + 
val fn = fk.fn + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(fk.on, recurse), + (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + val mapped = rs.mapped + val mappedF = TypedPipe.FilterKeys(mapped, fn) + ReduceStepPipe(ReduceStep.setInput(rs, mappedF)) + case CoGroupedPipe(cg) => + CoGroupedPipe(FilterKeys(cg, fn)) + case kvPipe => + TypedPipe.FilterKeys(kvPipe, fn) + } + ) + } + go(fk) + case mg @ MapGroup(_, _) => + def go[V1, V2 <: V](mg: MapGroup[K, V1, V2]): LiteralPipe[(K, V)] = { + val fn = mg.fn + Unary[TypedPipe, (K, V1), (K, V)]( + handleCoGrouped(mg.on, recurse), + (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + ReduceStepPipe(ReduceStep.mapGroup(rs)(fn)) + case CoGroupedPipe(cg) => + CoGroupedPipe(MapGroup(cg, fn)) + case kvPipe => + ReduceStepPipe( + IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + .mapGroup(fn) + ) + } + ) + } + go(mg) + case step @ IdentityReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + } + } + + private def handleHashCoGroup[K, V, V2, R]( + hj: HashCoGroup[K, V, V2, R], + recurse: FunctionK[TypedPipe, LiteralPipe] + ): LiteralPipe[(K, R)] = { + val rightLit: LiteralPipe[(K, V2)] = { + val rs = HashJoinable.toReduceStep(hj.right) + def go[A, B, C](rs: ReduceStep[A, B, C]): LiteralPipe[(A, C)] = + Unary(recurse(rs.mapped), { tp: TypedPipe[(A, B)] => ReduceStepPipe(ReduceStep.setInput(rs, tp)) }) + widen(go(rs)) + } + + val ordK: Ordering[K] = hj.right.keyOrdering + val joiner = hj.joiner + + Binary( + recurse(hj.left), + rightLit, + (ltp: TypedPipe[(K, V)], rtp: TypedPipe[(K, V2)]) => + rtp match { + case ReduceStepPipe(hg: HashJoinable[K @unchecked, V2 @unchecked]) => + HashCoGroup(ltp, hg, joiner) + case otherwise => + HashCoGroup(ltp, 
IdentityReduce[K, V2, V2](ordK, otherwise, None, Nil, implicitly), joiner) + } + ) + } + + /** + * Unroll a set of merges up to the first non-merge node, dropping an EmptyTypedPipe from the list + */ + def unrollMerge[A](t: TypedPipe[A]): List[TypedPipe[A]] = { + @annotation.tailrec + def loop(first: TypedPipe[A], todo: List[TypedPipe[A]], acc: List[TypedPipe[A]]): List[TypedPipe[A]] = + first match { + case MergedTypedPipe(l, r) => loop(l, r :: todo, acc) + case EmptyTypedPipe => + todo match { + case Nil => acc.reverse + case h :: tail => loop(h, tail, acc) + } + case IterablePipe(as) if as.isEmpty => + todo match { + case Nil => acc.reverse + case h :: tail => loop(h, tail, acc) + } + case notMerge => + val acc1 = notMerge :: acc + todo match { + case Nil => acc1.reverse + case h :: tail => loop(h, tail, acc1) + } + } + + loop(t, Nil, Nil) + } + + /** + * Make sure each returned item is unique. Any duplicates are merged using flatMap(Iterator.fill(size)(_)) + * + * TODO: this could be more precise by combining more complex mapping operations into one large flatMap + */ + def dedupMerge[A](as: List[TypedPipe[A]]): List[TypedPipe[A]] = + as.groupBy(tp => tp) + .iterator + .map { + case (p, Nil) => sys.error(s"unreachable: $p has no values") + case (p, _ :: Nil) => p // just once + case (p, repeated) => + val rsize = repeated.size + FlatMapped(p, Fill[A](rsize)) + } + .toList + + ///////////////////////////// + // + // Here are some actual rules for simplifying TypedPipes + // + ///////////////////////////// + + /** + * It is easier for planning if all fanouts are made explicit. This rule adds a Fork node every time there + * is a fanout + * + * This rule applied first makes it easier to match in subsequent rules without constantly checking for + * fanout nodes. 
+ * + * This can increase the number of map-reduce steps compared to simply recomputing on both sides of a fork + */ + object AddExplicitForks extends Rule[TypedPipe] { + + def maybeFork[A](on: Dag[TypedPipe], t: TypedPipe[A]): Option[TypedPipe[A]] = + t match { + case ForceToDisk(_) => None + case Fork(t) if on.contains(ForceToDisk(t)) => Some(ForceToDisk(t)) + case Fork(_) => None + case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) => None + case other if !on.hasSingleDependent(other) => + Some { + // if we are already forcing this, use it + if (on.contains(ForceToDisk(other))) ForceToDisk(other) + else Fork(other) + } + case _ => None + } + + def needsFork[A](on: Dag[TypedPipe], t: TypedPipe[A]): Boolean = + maybeFork(on, t).isDefined + + private def forkCoGroup[K, V](on: Dag[TypedPipe], cg: CoGrouped[K, V]): Option[CoGrouped[K, V]] = { + import CoGrouped._ + + cg match { + case Pair(left: HashJoinable[K, v], right, jf) if forkHashJoinable(on, left).isDefined => + forkHashJoinable(on, left).map { + Pair(_, right, jf) + } + case Pair(left: CoGrouped[K, v], right, jf) if forkCoGroup(on, left).isDefined => + forkCoGroup(on, left).map { + Pair(_, right, jf) + } + case Pair(left, right: HashJoinable[K, v], jf) if forkHashJoinable(on, right).isDefined => + forkHashJoinable(on, right).map { + Pair(left, _, jf) + } + case Pair(left, right: CoGrouped[K, v], jf) if forkCoGroup(on, right).isDefined => + forkCoGroup(on, right).map { + Pair(left, _, jf) + } + case Pair(_, _, _) => None // neither side needs a fork + case WithDescription(cg, d) => forkCoGroup(on, cg).map(WithDescription(_, d)) + case WithReducers(cg, r) => forkCoGroup(on, cg).map(WithReducers(_, r)) + case MapGroup(cg, fn) => forkCoGroup(on, cg).map(MapGroup(_, fn)) + case FilterKeys(cg, fn) => forkCoGroup(on, cg).map(FilterKeys(_, fn)) + } + } + + /** + * The casts in here are safe, but scala loses track of the types in these kinds of pattern matches. 
We + * can fix it by changing the types on the identity reduces to use EqTypes[V1, V2] in case class and + * leaving the V2 parameter. + */ + private def forkReduceStep[A, B, C]( + on: Dag[TypedPipe], + rs: ReduceStep[A, B, C] + ): Option[ReduceStep[A, B, C]] = + maybeFork(on, rs.mapped).map(ReduceStep.setInput(rs, _)) + + private def forkHashJoinable[K, V]( + on: Dag[TypedPipe], + hj: HashJoinable[K, V] + ): Option[HashJoinable[K, V]] = + hj match { + case step @ IdentityReduce(_, _, _, _, _) => + maybeFork(on, step.mapped).map(p => step.copy(mapped = p)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + maybeFork(on, step.mapped).map(p => step.copy(mapped = p)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + maybeFork(on, step.mapped).map(p => step.copy(mapped = p)) + } + + def apply[T](on: Dag[TypedPipe]) = { + case CounterPipe(a) if needsFork(on, a) => maybeFork(on, a).map(CounterPipe(_)) + case CrossPipe(a, b) if needsFork(on, a) => maybeFork(on, a).map(CrossPipe(_, b)) + case CrossPipe(a, b) if needsFork(on, b) => maybeFork(on, b).map(CrossPipe(a, _)) + case CrossValue(a, b) if needsFork(on, a) => maybeFork(on, a).map(CrossValue(_, b)) + case CrossValue(a, ComputedValue(b)) if needsFork(on, b) => + maybeFork(on, b).map(fb => CrossValue(a, ComputedValue(fb))) + case DebugPipe(p) => maybeFork(on, p).map(DebugPipe(_)) + case FilterKeys(p, fn) => maybeFork(on, p).map(FilterKeys(_, fn)) + case f @ Filter(_, _) => + def go[A](f: Filter[A]): Option[TypedPipe[A]] = { + val Filter(p, fn) = f + maybeFork(on, p).map(Filter(_, fn)) + } + go(f) + case FlatMapValues(p, fn) => maybeFork(on, p).map(FlatMapValues(_, fn)) + case FlatMapped(p, fn) => maybeFork(on, p).map(FlatMapped(_, fn)) + case ForceToDisk(_) | Fork(_) => None // already has a barrier + case HashCoGroup(left, right, jf) if needsFork(on, left) => + maybeFork(on, left).map(HashCoGroup(_, right, jf)) + case HashCoGroup(left, right, jf) => forkHashJoinable(on, right).map(HashCoGroup(left, _, jf)) + 
case MapValues(p, fn) => maybeFork(on, p).map(MapValues(_, fn)) + case Mapped(p, fn) => maybeFork(on, p).map(Mapped(_, fn)) + case MergedTypedPipe(a, b) if needsFork(on, a) => maybeFork(on, a).map(MergedTypedPipe(_, b)) + case MergedTypedPipe(a, b) if needsFork(on, b) => maybeFork(on, b).map(MergedTypedPipe(a, _)) + case ReduceStepPipe(rs) => forkReduceStep(on, rs).map(ReduceStepPipe(_)) + case SumByLocalKeys(p, sg) => maybeFork(on, p).map(SumByLocalKeys(_, sg)) + case t @ TrappedPipe(_, _) => + def go[A](t: TrappedPipe[A]): Option[TypedPipe[A]] = { + val TrappedPipe(p, sink) = t + maybeFork(on, p).map(TrappedPipe(_, sink)) + } + go(t) + case CoGroupedPipe(cgp) => forkCoGroup(on, cgp).map(CoGroupedPipe(_)) + case WithOnComplete(p, fn) => maybeFork(on, p).map(WithOnComplete(_, fn)) + case WithDescriptionTypedPipe(p, ds) => maybeFork(on, p).map(WithDescriptionTypedPipe(_, ds)) + case _ => None + } + } + + /** + * a.flatMap(f).flatMap(g) == a.flatMap { x => f(x).flatMap(g) } + */ + object ComposeFlatMap extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case FlatMapped(FlatMapped(in, fn0), fn1) => + FlatMapped(in, FlatMappedFn(fn1).runAfter(FlatMapping.FlatM(fn0))) + case FlatMapValues(FlatMapValues(in, fn0), fn1) => + FlatMapValues(in, FlatMappedFn(fn1).runAfter(FlatMapping.FlatM(fn0))) + } + } + + /** + * a.map(f).map(g) == a.map { x => f(x).map(g) } + */ + object ComposeMap extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case Mapped(Mapped(in, fn0), fn1) => + Mapped(in, ComposedMapFn(fn0, fn1)) + case MapValues(MapValues(in, fn0), fn1) => + MapValues(in, ComposedMapFn(fn0, fn1)) + } + } + + /** + * a.filter(f).filter(g) == a.filter { x => f(x) && g(x) } + * + * also if a filterKeys follows a filter, we might as well compose because we can't push the filterKeys up + * higher + */ + object ComposeFilter extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + // scala can't type check this, so we 
hold its hand: + // case Filter(Filter(in, fn0), fn1) => + // Some(Filter(in, ComposedFilterFn(fn0, fn1))) + case f @ Filter(_, _) => + def go[A](f: Filter[A]): Option[TypedPipe[A]] = + f.input match { + case f1: Filter[a] => + // We have to be really careful here because f.fn and f1.fn + // have the same type. Type checking won't save you here + // we do have a test that exercises this, however + Some(Filter[a](f1.input, ComposedFilterFn(f1.fn, f.fn))) + case _ => None + } + go(f) + case FilterKeys(FilterKeys(in, fn0), fn1) => + Some(FilterKeys(in, ComposedFilterFn(fn0, fn1))) + case FilterKeys(Filter(in, fn0), fn1) => + Some(Filter(in, ComposedFilterFn(fn0, FilterKeysToFilter(fn1)))) + case _ => None + } + } + + /** + * If we assume that Orderings are coherent, which we do generally in scalding in joins for instance, we can + * compose two reduce steps + */ + object ComposeReduceSteps extends Rule[TypedPipe] { + def apply[A](on: Dag[TypedPipe]) = { + case ReduceStepPipe(rs2) => + rs2.mapped match { + case ReduceStepPipe(rs1) => + ReduceStep.maybeCompose(rs1, rs2).map(ReduceStepPipe(_)) + case WithDescriptionTypedPipe(ReduceStepPipe(rs1), descs) => + ReduceStep.maybeCompose(rs1, rs2).map { rs3 => + WithDescriptionTypedPipe(ReduceStepPipe(rs3), descs) + } + case CoGroupedPipe(cg1) => + CoGrouped.maybeCompose(cg1, rs2).map(CoGroupedPipe(_)) + case WithDescriptionTypedPipe(CoGroupedPipe(cg1), descs) => + CoGrouped.maybeCompose(cg1, rs2).map { cg2 => + WithDescriptionTypedPipe(CoGroupedPipe(cg2), descs) + } + case _ => None + } + case _ => None + } + } + + /** + * a.onComplete(f).onComplete(g) == a.onComplete { () => f(); g() } + */ + object ComposeWithOnComplete extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { case WithOnComplete(WithOnComplete(pipe, fn0), fn1) => + WithOnComplete(pipe, ComposedOnComplete(fn0, fn1)) + } + } + + /** + * a.map(f).flatMap(g) == a.flatMap { x => g(f(x)) } + * a.flatMap(f).map(g) == a.flatMap { x => f(x).map(g) 
} + * + * This is a rule you may want to apply after having composed all the maps first + */ + object ComposeMapFlatMap extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case FlatMapped(Mapped(in, f), g) => + FlatMapped(in, FlatMappedFn(g).runAfter(FlatMapping.Map(f))) + case FlatMapValues(MapValues(in, f), g) => + FlatMapValues(in, FlatMappedFn(g).runAfter(FlatMapping.Map(f))) + case Mapped(FlatMapped(in, f), g) => + FlatMapped(in, FlatMappedFn(f).combine(FlatMappedFn.fromMap(g))) + case MapValues(FlatMapValues(in, f), g) => + FlatMapValues(in, FlatMappedFn(f).combine(FlatMappedFn.fromMap(g))) + } + } + + /** + * a.filter(f).flatMap(g) == a.flatMap { x => if (f(x)) g(x) else Iterator.empty } + * a.flatMap(f).filter(g) == a.flatMap { x => f(x).filter(g) } + * + * This is a rule you may want to apply after having composed all the filters first + */ + object ComposeFilterFlatMap extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + case FlatMapped(Filter(in, f), g) => + Some(FlatMapped(in, FlatMappedFn(g).runAfter(FlatMapping.filter(f)))) + case filter: Filter[b] => + filter.input match { + case fm: FlatMapped[a, b] => + Some(FlatMapped[a, b](fm.input, FlatMappedFn(fm.fn).combine(FlatMappedFn.fromFilter(filter.fn)))) + case _ => None + } + case _ => + None + } + } + + /** + * a.filter(f).map(g) == a.flatMap { x => if (f(x)) Iterator.single(g(x)) else Iterator.empty } + * a.map(f).filter(g) == a.flatMap { x => val y = f(x); if (g(y)) Iterator.single(y) else Iterator.empty } + * + * This is a rule you may want to apply after having composed all the filters first + * + * This may be a deoptimization on some platforms that have native filters since you could avoid the + * Iterator boxing in that case. 
+ */ + object ComposeFilterMap extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + case Mapped(Filter(in, f), g) => + Some(FlatMapped(in, FlatMappedFn.fromFilter(f).combine(FlatMappedFn.fromMap(g)))) + case filter: Filter[b] => + filter.input match { + case fm: Mapped[a, b] => + Some( + FlatMapped[a, b]( + fm.input, + FlatMappedFn.fromMap(fm.fn).combine(FlatMappedFn.fromFilter(filter.fn)) + ) + ) + case _ => None + } + case _ => + None + } + } + + /** + * This rule is important in that it allows us to reduce the number of nodes in the graph, which is helpful + * to speed up rule application + */ + object ComposeDescriptions extends PartialRule[TypedPipe] { + def combine(descs1: List[(String, Boolean)], descs2: List[(String, Boolean)]): List[(String, Boolean)] = { + val combined = descs1 ::: descs2 + + combined + .foldLeft((Set.empty[String], List.empty[(String, Boolean)])) { + case (state @ (s, acc), item @ (m, true)) => + if (s(m)) state + else (s + m, item :: acc) + case ((s, acc), item) => + (s, item :: acc) + } + ._2 + .reverse + } + + def applyWhere[T](on: Dag[TypedPipe]) = { + case WithDescriptionTypedPipe(WithDescriptionTypedPipe(input, descs1), descs2) => + WithDescriptionTypedPipe(input, combine(descs1, descs2)) + } + } + + /** + * In scalding 0.17 and earlier, descriptions were automatically pushed down below merges and + * flatMaps/map/etc.. 
+ */ + object DescribeLater extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case Mapped(WithDescriptionTypedPipe(in, descs), fn) => + WithDescriptionTypedPipe(Mapped(in, fn), descs) + case MapValues(WithDescriptionTypedPipe(in, descs), fn) => + WithDescriptionTypedPipe(MapValues(in, fn), descs) + case FlatMapped(WithDescriptionTypedPipe(in, descs), fn) => + WithDescriptionTypedPipe(FlatMapped(in, fn), descs) + case FlatMapValues(WithDescriptionTypedPipe(in, descs), fn) => + WithDescriptionTypedPipe(FlatMapValues(in, fn), descs) + case f @ Filter(WithDescriptionTypedPipe(_, _), _) => + def go[A](f: Filter[A]): TypedPipe[A] = + f match { + case Filter(WithDescriptionTypedPipe(in, descs), fn) => + WithDescriptionTypedPipe(Filter(in, fn), descs) + case unreachable => unreachable + } + go(f) + case FilterKeys(WithDescriptionTypedPipe(in, descs), fn) => + WithDescriptionTypedPipe(FilterKeys(in, fn), descs) + case MergedTypedPipe(WithDescriptionTypedPipe(left, descs), right) => + WithDescriptionTypedPipe(MergedTypedPipe(left, right), descs) + case MergedTypedPipe(left, WithDescriptionTypedPipe(right, descs)) => + WithDescriptionTypedPipe(MergedTypedPipe(left, right), descs) + case SumByLocalKeys(WithDescriptionTypedPipe(input, descs), sg) => + WithDescriptionTypedPipe(SumByLocalKeys(input, sg), descs) + case WithDescriptionTypedPipe(WithDescriptionTypedPipe(input, descs1), descs2) => + // This rule is important in that it allows us to reduce + // the number of nodes in the graph, which is helpful to speed up rule application + WithDescriptionTypedPipe(input, ComposeDescriptions.combine(descs1, descs2)) + } + } + + /** + * (a ++ a) == a.flatMap { t => List(t, t) } This is a very simple rule that is subsumed by DeDiamondMappers + * below + */ + object DiamondToFlatMap extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + case m @ MergedTypedPipe(_, _) => + val pipes = unrollMerge(m) + val flatMapped = dedupMerge(pipes) + + if 
(pipes.size == flatMapped.size) None // we didn't reduce the number of merges + else { + Some(flatMapped match { + case Nil => EmptyTypedPipe + case h :: tail => + tail.foldLeft(h)(MergedTypedPipe(_, _)) + }) + } + case _ => None + } + } + + /** + * This is a more expensive, but more general version of the previous rule: we can merge trailing mapping + * operations that originate at a common node. + * + * After this rule, the only diamonds that exist have at least one non-mapping operation on the path. + */ + object DeDiamondMappers extends Rule[TypedPipe] { + sealed abstract class Mapper[+A] { + type Init + val input: TypedPipe[Init] + val fn: FlatMappedFn[Init, A] + val descriptions: List[(String, Boolean)] + + def combine[B](fn2: FlatMappedFn[A, B]): Mapper.Aux[Init, B] = + Mapper(input, fn.combine(fn2), descriptions) + + def withDescriptions(desc: List[(String, Boolean)]): Mapper.Aux[Init, A] = + Mapper(input, fn, ComposeDescriptions.combine(descriptions, desc)) + + def toTypedPipe: TypedPipe[A] = { + val pipe = FlatMappedFn.asId(fn) match { + case Some(ev) => + ev.subst[TypedPipe](input) + case None => + FlatMappedFn.asFilter(fn) match { + case Some((f, ev)) => + ev.subst[TypedPipe](Filter(input, f)) + case None => + FlatMapped(input, fn) + } + } + Mapper.maybeDescribe(pipe, descriptions) + } + } + + object Mapper { + type Aux[A, +B] = Mapper[B] { type Init = A } + + def maybeDescribe[A](tp: TypedPipe[A], descs: List[(String, Boolean)]): TypedPipe[A] = + descs match { + case Nil => tp + case ne => + tp match { + case WithDescriptionTypedPipe(t0, d0) => + WithDescriptionTypedPipe(t0, ComposeDescriptions.combine(d0, ne)) + case notD => + // here we use combine to make sure we follow the rules of removing uniqued + // descriptions + WithDescriptionTypedPipe(notD, ComposeDescriptions.combine(Nil, ne)) + } + } + + def apply[A, B]( + p: TypedPipe[A], + fn0: FlatMappedFn[A, B], + desc: List[(String, Boolean)] + ): Mapper[B] { type Init = A } = + new Mapper[B] { + type 
Init = A + val input = p + val fn = fn0 + val descriptions = desc + } + + def unmapped[A](p: TypedPipe[A]): Aux[A, A] = + apply(p, FlatMappedFn.identity[A], Nil) + } + + def toMappers[A](tp: TypedPipe[A]): List[Mapper[A]] = + tp match { + // First, these are non-mapped pipes. + case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) | ReduceStepPipe(_) | CoGroupedPipe(_) | + CrossPipe(_, _) | CounterPipe(_) | CrossValue(_, _) | DebugPipe(_) | ForceToDisk(_) | Fork(_) | + HashCoGroup(_, _, _) | SumByLocalKeys(_, _) | TrappedPipe(_, _) | WithOnComplete(_, _) => + Mapper.unmapped(tp) :: Nil + case FilterKeys(p, fn) => + toMappers(p).map(_.combine(FlatMappedFn.fromFilter(FilterKeysToFilter(fn)))) + case f @ Filter(_, _) => + // type inference needs hand holding on this one + def go[A1 <: A](p: TypedPipe[A1], fn: A1 => Boolean): List[Mapper[A]] = { + val fn1: FlatMappedFn[A1, A] = + FlatMappedFn.fromFilter[A1](fn) + toMappers(p).map(_.combine(fn1)) + } + go(f.input, f.fn) + case FlatMapValues(p, fn) => + toMappers(p).map(_.combine(FlatMappedFn(FlatMapValuesToFlatMap(fn)))) + case FlatMapped(p, fn) => + toMappers(p).map(_.combine(FlatMappedFn(fn))) + case MapValues(p, fn) => + toMappers(p).map(_.combine(FlatMappedFn.fromMap(MapValuesToMap(fn)))) + case Mapped(p, fn) => + toMappers(p).map(_.combine(FlatMappedFn.fromMap(fn))) + case MergedTypedPipe(a, b) => + toMappers(a) ::: toMappers(b) + case WithDescriptionTypedPipe(p, ds) => + toMappers(p).map(_.withDescriptions(ds)) + } + + // This is unsafe as written, since we require callers to ensure + // that the init is the same + private def merge[A](ms: Iterable[Mapper[A]]): TypedPipe[A] = + ms.toList match { + case Nil => EmptyTypedPipe + case h :: Nil => + // there is only one Mapper, just convert back to a TypedPipe + h.toTypedPipe + case all @ (h :: t) => + // we have several merged back from a common point + // we don't know what that previous type was, but + // we cast it to Any, we know the values in there + // must be 
compatible with all the Mappers, since they + // all consume, so this cast is safe after this check: + require(t.forall(_.input == h.input), s"mismatched inputs: ${all.map(_.input)}") + val msCast = all.asInstanceOf[List[Mapper.Aux[Any, A]]] + val fn: Any => TraversableOnce[A] = MergeFlatMaps(msCast.map(_.fn)) + val fmpipe = FlatMapped(h.input, fn) + Mapper.maybeDescribe(fmpipe, all.flatMap(_.descriptions)) + } + + def apply[A](on: Dag[TypedPipe]) = { + // here are the trailing mappers of this pipe + case fork @ Fork(inner) if on.hasSingleDependent(fork) => + // due to a previous application of this rule, + // this fork may have been reduced to only one + // downstream, in that case, we can remove the fork + Some(inner) + case m @ MergedTypedPipe(_, _) => + // this rule only applies to merged pipes + // let's see if we have any duplicated inputs: + val mapperGroups = toMappers(m).groupBy(_.input) + val hasDiamond = mapperGroups.exists { case (_, ps) => ps.lengthCompare(1) > 0 } + if (hasDiamond) Some { + val groups = mapperGroups.map { case (_, ms) => merge(ms) } + Monoid.sum(groups) + } + else None + case _ => + // Not a merge or a fork + None + } + } + + /** + * After a forceToDisk there is no need to immediately fork. Calling forceToDisk twice in a row is the same + * as once. Calling fork twice in a row is the same as once. + */ + object RemoveDuplicateForceFork extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case ForceToDisk(ForceToDisk(t)) => ForceToDisk(t) + case ForceToDisk(WithDescriptionTypedPipe(ForceToDisk(t), desc)) => + // we might as well only do one force to disk in this case + WithDescriptionTypedPipe(ForceToDisk(t), desc) + case ForceToDisk(Fork(t)) => ForceToDisk(t) + case Fork(Fork(t)) => Fork(t) + case Fork(ForceToDisk(t)) => ForceToDisk(t) + case Fork(t) if on.contains(ForceToDisk(t)) => ForceToDisk(t) + } + } + + /** + * If a fork has no fan-out when planned, it serves no purpose and is safe to remove. 
Likewise, there is no + * reason to put a forceToDisk immediately after a source + */ + object RemoveUselessFork extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case fork @ Fork(t) if on.hasSingleDependent(fork) => t + case Fork(src @ SourcePipe(_)) => src + case Fork(iter @ IterablePipe(_)) => iter + case ForceToDisk(src @ SourcePipe(_)) => src + case ForceToDisk(iter @ IterablePipe(_)) => iter + } + } + + /** + * This allows you to replace the sources according to a given Resolver + */ + case class ReplaceSources(resolver: Resolver[Input, Input]) extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + case SourcePipe(src) => + resolver(src).map(SourcePipe(_)) + case _ => None + } + } + + /** + * We ignore .group if there is no setting of reducers + * + * This is arguably not a great idea, but scalding has always done it to minimize accidental map-reduce + * steps + */ + object IgnoreNoOpGroup extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case ReduceStepPipe(IdentityReduce(_, input, None, _, _)) => + input + case ReduceStepPipe(UnsortedIdentityReduce(_, input, None, _, _)) => + input + } + } + + /** + * In map-reduce settings, Merge is almost free in two contexts: + * 1. the final write 2. at the point we are doing a shuffle anyway. 
+ * + * By deferring merge as long as possible, we hope to find more such cases + */ + object DeferMerge extends PartialRule[TypedPipe] { + private def handleFilter[A]: PartialFunction[Filter[A], TypedPipe[A]] = { + case Filter(MergedTypedPipe(a, b), fn) => MergedTypedPipe(Filter(a, fn), Filter(b, fn)) + } + + def applyWhere[T](on: Dag[TypedPipe]) = { + case Mapped(MergedTypedPipe(a, b), fn) => + MergedTypedPipe(Mapped(a, fn), Mapped(b, fn)) + case FlatMapped(MergedTypedPipe(a, b), fn) => + MergedTypedPipe(FlatMapped(a, fn), FlatMapped(b, fn)) + case MapValues(MergedTypedPipe(a, b), fn) => + MergedTypedPipe(MapValues(a, fn), MapValues(b, fn)) + case FlatMapValues(MergedTypedPipe(a, b), fn) => + MergedTypedPipe(FlatMapValues(a, fn), FlatMapValues(b, fn)) + case f @ Filter(_, _) if handleFilter.isDefinedAt(f) => handleFilter(f) + case FilterKeys(MergedTypedPipe(a, b), fn) => + MergedTypedPipe(FilterKeys(a, fn), FilterKeys(b, fn)) + } + } + + /** + * Push filterKeys up as early as possible. This can happen before a shuffle, which can be a major win. This + * allows you to write generic methods that return all the data, but if downstream someone only wants + * certain keys they don't pay to compute everything. 
+ * + * This is an optimization we didn't do in scalding 0.17 and earlier because .toTypedPipe on the group + * totally hid the structure from us + */ + object FilterKeysEarly extends Rule[TypedPipe] { + private def filterReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + fn: K => Boolean + ): ReduceStep[K, V1, V2] = + ReduceStep.setInput(rs, FilterKeys(rs.mapped, fn)) + + private def filterCoGroupable[K, V](rs: CoGroupable[K, V], fn: K => Boolean): CoGroupable[K, V] = + rs match { + case rs: ReduceStep[K @unchecked, v1, V @unchecked] => + ReduceStep + .toHashJoinable(filterReduceStep(rs, fn)) + .getOrElse { + sys.error( + "unreachable: filterReduceStep returns the same type, and this input type was CoGroupable" + ) + } + case cg: CoGrouped[K @unchecked, V @unchecked] => filterCoGroup(cg, fn) + } + + private def filterCoGroup[K, V](cg: CoGrouped[K, V], fn: K => Boolean): CoGrouped[K, V] = + cg match { + case CoGrouped.Pair(a, b, jf) => + CoGrouped.Pair(filterCoGroupable(a, fn), filterCoGroupable(b, fn), jf) + case CoGrouped.FilterKeys(cg, g) => + filterCoGroup(cg, ComposedFilterFn(g, fn)) + case CoGrouped.MapGroup(cg, g) => + CoGrouped.MapGroup(filterCoGroup(cg, fn), g) + case CoGrouped.WithDescription(cg, d) => + CoGrouped.WithDescription(filterCoGroup(cg, fn), d) + case CoGrouped.WithReducers(cg, r) => + CoGrouped.WithReducers(filterCoGroup(cg, fn), r) + } + + def apply[T](on: Dag[TypedPipe]) = { + case FilterKeys(ReduceStepPipe(rsp), fn) => + Some(ReduceStepPipe(filterReduceStep(rsp, fn))) + case FilterKeys(CoGroupedPipe(cg), fn) => + Some(CoGroupedPipe(filterCoGroup(cg, fn))) + case FilterKeys(HashCoGroup(left, right, joiner), fn) => + val newRight = HashJoinable.filterKeys(right, fn) + Some(HashCoGroup(FilterKeys(left, fn), newRight, joiner)) + case FilterKeys(MapValues(pipe, mapFn), filterFn) => + Some(MapValues(FilterKeys(pipe, filterFn), mapFn)) + case FilterKeys(FlatMapValues(pipe, fmFn), filterFn) => + Some(FlatMapValues(FilterKeys(pipe, filterFn), 
fmFn)) + case _ => None + } + } + + /** + * EmptyTypedPipe is kind of zero of most of these operations We go ahead and simplify as much as possible + * if we see an EmptyTypedPipe + */ + object EmptyIsOftenNoOp extends PartialRule[TypedPipe] { + + private def emptyCogroup[K, V](cg: CoGrouped[K, V]): Boolean = { + import CoGrouped._ + + def empty(t: TypedPipe[Any]): Boolean = t match { + case EmptyTypedPipe => true + case _ => false + } + cg match { + case Pair(left, _, jf) if left.inputs.forall(empty) && (Joiner.isLeftJoinLike(jf) == Some(true)) => + true + case Pair(_, right, jf) if right.inputs.forall(empty) && (Joiner.isRightJoinLike(jf) == Some(true)) => + true + case Pair(left, right, _) if left.inputs.forall(empty) && right.inputs.forall(empty) => true + case Pair(_, _, _) => false + case WithDescription(cg, _) => emptyCogroup(cg) + case WithReducers(cg, _) => emptyCogroup(cg) + case MapGroup(cg, _) => emptyCogroup(cg) + case FilterKeys(cg, _) => emptyCogroup(cg) + } + } + + private def emptyHashJoinable[K, V](hj: HashJoinable[K, V]): Boolean = + HashJoinable.toReduceStep(hj).mapped == EmptyTypedPipe + + def applyWhere[T](on: Dag[TypedPipe]) = { + case CrossPipe(EmptyTypedPipe, _) => EmptyTypedPipe + case CrossPipe(_, EmptyTypedPipe) => EmptyTypedPipe + case CrossValue(EmptyTypedPipe, _) => EmptyTypedPipe + case CrossValue(_, ComputedValue(EmptyTypedPipe)) => EmptyTypedPipe + case CrossValue(_, EmptyValue) => EmptyTypedPipe + case DebugPipe(EmptyTypedPipe) => EmptyTypedPipe + case FilterKeys(EmptyTypedPipe, _) => EmptyTypedPipe + case Filter(EmptyTypedPipe, _) => EmptyTypedPipe + case FlatMapValues(EmptyTypedPipe, _) => EmptyTypedPipe + case FlatMapped(EmptyTypedPipe, _) => EmptyTypedPipe + case ForceToDisk(EmptyTypedPipe) => EmptyTypedPipe + case HashCoGroup(EmptyTypedPipe, _, _) => EmptyTypedPipe + case HashCoGroup(_, right, hjf) + if emptyHashJoinable(right) && Joiner.isInnerHashJoinLike(hjf) == Some(true) => + EmptyTypedPipe + case 
MapValues(EmptyTypedPipe, _) => EmptyTypedPipe + case Mapped(EmptyTypedPipe, _) => EmptyTypedPipe + case MergedTypedPipe(EmptyTypedPipe, a) => a + case MergedTypedPipe(a, EmptyTypedPipe) => a + case ReduceStepPipe(rs: ReduceStep[_, _, _]) if rs.mapped == EmptyTypedPipe => EmptyTypedPipe + case SumByLocalKeys(EmptyTypedPipe, _) => EmptyTypedPipe + case TrappedPipe(EmptyTypedPipe, _) => EmptyTypedPipe + case CoGroupedPipe(cgp) if emptyCogroup(cgp) => EmptyTypedPipe + case WithOnComplete(EmptyTypedPipe, _) => + EmptyTypedPipe // there is nothing to do, so we never have workers complete + case WithDescriptionTypedPipe(EmptyTypedPipe, _) => + EmptyTypedPipe // descriptions apply to tasks, but empty has no tasks + + // This rule is tempting, but dangerous since if used in combination + // with AddExplicitForks it would create an infinite loop + // case Fork(EmptyTypedPipe) => EmptyTypedPipe + } + } + + /** + * If an Iterable is empty, it is the same as EmptyTypedPipe + */ + object EmptyIterableIsEmpty extends PartialRule[TypedPipe] { + def applyWhere[T](on: Dag[TypedPipe]) = { + case IterablePipe(it) if it.isEmpty => EmptyTypedPipe + } + } + + /** + * This is useful on map-reduce like systems to avoid serializing data into the system that you are going to + * then filter + */ + object FilterLocally extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + case f @ Filter(_, _) => + def go[T1 <: T](f: Filter[T1]): Option[TypedPipe[T]] = + f match { + case Filter(IterablePipe(iter), fn) => + Some(IterablePipe(iter.filter(fn))) + case _ => None + } + go(f) + case f @ FilterKeys(_, _) => + def go[K, V, T >: (K, V)](f: FilterKeys[K, V]): Option[TypedPipe[T]] = + f match { + case FilterKeys(IterablePipe(iter), fn) => + Some(IterablePipe(iter.filter { case (k, _) => fn(k) })) + case _ => None + } + go(f) + case _ => None + } + } + + /** + * ForceToDisk before hashJoin, this makes sure any filters have been applied + */ + object ForceToDiskBeforeHashJoin extends 
Rule[TypedPipe] { + // A set of operations naturally have barriers after them, + // there is no need to add an explicit force after a reduce + // step or after a source, since both will already have been + // checkpointed + final def maybeForce[T](t: TypedPipe[T]): TypedPipe[T] = + t match { + case ReduceStepPipe(IdentityReduce(_, input, None, _, _)) => + // this is a no-op reduce that will be removed, so we may need to add a force + maybeForce(input) + case ReduceStepPipe(UnsortedIdentityReduce(_, input, None, _, _)) => + // this is a no-op reduce that will be removed, so we may need to add a force + maybeForce(input) + case SourcePipe(_) | IterablePipe(_) | CoGroupedPipe(_) | ReduceStepPipe(_) | ForceToDisk(_) => t + case WithOnComplete( + pipe, + fn + ) => // TODO it is not clear this is safe in cascading 3, since oncomplete is an each + WithOnComplete(maybeForce(pipe), fn) + case WithDescriptionTypedPipe(pipe, descs) => + WithDescriptionTypedPipe(maybeForce(pipe), descs) + case pipe => ForceToDisk(pipe) + } + + def apply[T](on: Dag[TypedPipe]) = { + case HashCoGroup(left, right: HashJoinable[a, b], joiner) => + val newRight: HashJoinable[a, b] = right match { + case step @ IdentityReduce(_, _, _, _, _) => + step.copy(mapped = maybeForce(step.mapped)) + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + step.copy(mapped = maybeForce(step.mapped)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + step.copy(mapped = maybeForce(step.mapped)) + } + if (newRight != right) Some(HashCoGroup(left, newRight, joiner)) + else None + case (cp @ CrossPipe(_, _)) => Some(cp.viaHashJoin) + case (cv @ CrossValue(_, _)) => Some(cv.viaHashJoin) + case _ => None + } + } + + /** + * Convert all HashCoGroup to CoGroupedPipe + */ + object HashToShuffleCoGroup extends Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { + case HashCoGroup(left, right: HashJoinable[a, b], joiner) => + val leftg = Grouped(left)(right.keyOrdering) + val joiner2 = 
Joiner.toCogroupJoiner2(joiner) + Some(CoGroupedPipe(CoGrouped.Pair(leftg, right, joiner2))) + case (cp @ CrossPipe(_, _)) => Some(cp.viaHashJoin) + case (cv @ CrossValue(_, _)) => Some(cv.viaHashJoin) + case _ => None + } + } + + /** + * Prefer to do mapValues/flatMapValues in a Reduce/Join so we can avoid some boxing in-and-out of cascading + */ + object MapValuesInReducers extends PartialRule[TypedPipe] { + + def handleFilter[A](f: Filter[A]): Option[TypedPipe[A]] = + f.input match { + case ReduceStepPipe(rs) => + Some(ReduceStepPipe(ReduceStep.mapGroup(rs)(FilterGroup(f.fn)))) + case CoGroupedPipe(cg) => + Some(CoGroupedPipe(CoGrouped.MapGroup(cg, FilterGroup(f.fn)))) + case _ => None + } + + def applyWhere[T](on: Dag[TypedPipe]) = { + case MapValues(ReduceStepPipe(rs), fn) => + ReduceStepPipe(ReduceStep.mapGroup(rs)(MapGroupMapValues(fn))) + case FlatMapValues(ReduceStepPipe(rs), fn) => + ReduceStepPipe(ReduceStep.mapGroup(rs)(MapGroupFlatMapValues(fn))) + case MapValues(CoGroupedPipe(cg), fn) => + CoGroupedPipe(CoGrouped.MapGroup(cg, MapGroupMapValues(fn))) + case FlatMapValues(CoGroupedPipe(cg), fn) => + CoGroupedPipe(CoGrouped.MapGroup(cg, MapGroupFlatMapValues(fn))) + case f @ Filter(_, _) if handleFilter(f).isDefined => + handleFilter(f).getOrElse(sys.error("unreachable: already checked isDefined")) + case SumByLocalKeys(ReduceStepPipe(rs), sg) => + ReduceStepPipe(ReduceStep.mapGroup(rs)(MapValueStream(SumAll(sg)))) + case SumByLocalKeys(CoGroupedPipe(cg), sg) => + CoGroupedPipe(CoGrouped.MapGroup(cg, MapValueStream(SumAll(sg)))) + } + } + + /////// + // These are composed rules that are related + ////// + + /** + * Like kinds can be composed .map(f).map(g), filter(f).filter(g) etc... 
+ */ + val composeSame: Rule[TypedPipe] = + Rule.orElse(List(ComposeMap, ComposeFilter, ComposeFlatMap, ComposeWithOnComplete)) + + /** + * If you are going to do a flatMap, following it or preceding it with map/filter you might as well compose + * into the flatMap + */ + val composeIntoFlatMap: Rule[TypedPipe] = + Rule.orElse(List(ComposeMapFlatMap, ComposeFilterFlatMap, ComposeFlatMap)) + + val simplifyEmpty: Rule[TypedPipe] = + EmptyIsOftenNoOp.orElse(EmptyIterableIsEmpty) + + /** + * These are a list of rules to be applied in order (Dag.applySeq) that should generally always improve + * things on Map/Reduce-like platforms. + * + * These are rules we should apply to any TypedPipe before handing to cascading. These should be a bit + * conservative in that they should be highly likely to improve the graph. + */ + val standardMapReduceRules: List[Rule[TypedPipe]] = + List( + // phase 0, add explicit forks to not duplicate pipes on fanout below + AddExplicitForks, + RemoveUselessFork, + // phase 1, compose flatMap/map, move descriptions down, defer merge, filter pushup etc... + IgnoreNoOpGroup + .orElse(composeSame) + .orElse(DescribeLater) + .orElse(DeferMerge), + // phase 2, combine different kinds of mapping operations into flatMaps, including redundant merges + composeIntoFlatMap + .orElse(simplifyEmpty) + .orElse(ComposeDescriptions) + .orElse(DescribeLater) + .orElse(DeferMerge) + .orElse(DeDiamondMappers), // better to put expensive rules last in an orElse + // phase 3, after we can do any de-diamonding, we finally pull mapValues into reducers + // if we do this before de-diamonding, it can hide diamonds as an artifact + // of making reduce operations look different + MapValuesInReducers + .orElse(FilterKeysEarly), + // phase 4, remove duplicates forces/forks (e.g. .fork.fork or .forceToDisk.fork, ....) 
+ RemoveDuplicateForceFork + ) + + /** + * a Convenience function to avoid needing to pass toLiteral + */ + def apply[A](t: TypedPipe[A], r: Rule[TypedPipe]): TypedPipe[A] = + Dag.applyRule(t, toLiteral, r) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/Resolver.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/Resolver.scala new file mode 100644 index 0000000000..f9ead5ad3c --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/Resolver.scala @@ -0,0 +1,92 @@ +package com.twitter.scalding.typed + +import com.twitter.scalding.dagon.HMap +import java.io.Serializable +import scala.util.hashing.MurmurHash3 + +/** + * This class is an like a higher kinded PartialFunction which we use to look up sources and sinks in a safe + * way + */ +abstract class Resolver[I[_], O[_]] extends Serializable { + def apply[A](i: I[A]): Option[O[A]] + + def orElse(that: Resolver[I, O]): Resolver[I, O] = + Resolver.orElse(this, that) + + def andThen[O2[_]](that: Resolver[O, O2]): Resolver[I, O2] = + Resolver.AndThen(this, that) +} + +object Resolver extends Serializable { + private case class HMapResolver[I[_], O[_]](toHMap: HMap[I, O]) extends Resolver[I, O] { + override val hashCode = toHMap.hashCode + + def apply[A](i: I[A]): Option[O[A]] = toHMap.get(i) + } + + private case class OrElse[I[_], O[_]](first: Resolver[I, O], second: Resolver[I, O]) + extends Resolver[I, O] { + override val hashCode: Int = MurmurHash3.productHash(this) + + def apply[A](i: I[A]): Option[O[A]] = { + @annotation.tailrec + def lookup(from: Resolver[I, O], rest: List[Resolver[I, O]]): Option[O[A]] = + from match { + case OrElse(first, second) => + lookup(first, second :: rest) + case notOrElse => + notOrElse(i) match { + case some @ Some(_) => some + case None => + rest match { + case Nil => None + case h :: tail => + lookup(h, tail) + } + } + } + + lookup(first, second :: Nil) + } + } + + private case class AndThen[X[_], Y[_], Z[_]](first: Resolver[X, 
Y], second: Resolver[Y, Z]) + extends Resolver[X, Z] { + override val hashCode: Int = MurmurHash3.productHash(this) + + def apply[A](i: X[A]): Option[Z[A]] = + first(i).flatMap(second(_)) + } + + def empty[I[_], O[_]]: Resolver[I, O] = + HMapResolver(HMap.empty[I, O]) + + def pair[I[_], O[_], A](input: I[A], output: O[A]): Resolver[I, O] = + HMapResolver[I, O](HMap.empty[I, O] + (input -> output)) + + def fromHMap[I[_], O[_]](hmap: HMap[I, O]): Resolver[I, O] = + HMapResolver(hmap) + + def orElse[I[_], O[_]](first: Resolver[I, O], second: Resolver[I, O]): Resolver[I, O] = + first match { + case same if same == second => same + case hmp @ HMapResolver(fhm) => + second match { + case HMapResolver(shm) => + // dagon does not have a ++ :( + val merged = fhm.keySet.foldLeft(shm) { (hmap, k) => + def addKey[A](k: I[A]): HMap[I, O] = + hmap + (k -> fhm(k)) + addKey(k) + } + HMapResolver(merged) + case notHMap => + OrElse(hmp, notHMap) + } + case OrElse(a, b) => + // Make sure we are right associated + orElse(a, orElse(b, second)) + case notOrElse => OrElse(notOrElse, second) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/Sketched.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/Sketched.scala new file mode 100644 index 0000000000..242889e543 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/Sketched.scala @@ -0,0 +1,136 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +import com.twitter.algebird.{Batched, Bytes, CMS} +import com.twitter.scalding.serialization.macros.impl.BinaryOrdering._ +import com.twitter.scalding.serialization.{OrderedSerialization, OrderedSerialization2} +import com.twitter.algebird.CMSMonoid + +// This was a bad design choice, we should have just put these in the CMSHasher object + +/** + * This class is generally only created by users with the TypedPipe.sketch method + */ +case class Sketched[K, V](pipe: TypedPipe[(K, V)], numReducers: Int, delta: Double, eps: Double, seed: Int)( + implicit + val serialization: K => Array[Byte], + ordering: Ordering[K] +) extends MustHaveReducers { + + def reducers = Some(numReducers) + + lazy val sketch: TypedPipe[CMS[Bytes]] = { + // don't close over Sketched + val localSer = serialization + val (leps, ldelta, lseed) = (eps, delta, seed) + lazy implicit val cms: CMSMonoid[Bytes] = CMS.monoid[Bytes](leps, ldelta, lseed) + + // every 10k items, compact into a CMS to prevent very slow mappers + lazy implicit val batchedSG: com.twitter.algebird.Semigroup[Batched[CMS[Bytes]]] = + Batched.compactingSemigroup[CMS[Bytes]](10000) + + pipe + .map { case (k, _) => ((), Batched(cms.create(Bytes(localSer(k))))) } + .sumByLocalKeys + .map { case (_, batched) => + batched.sum + } // remove the Batched before going to the reducers + .groupAll + .sum + .values + .forceToDisk // make sure we materialize when we have 1 item + } + + /** + * Like a hashJoin, this joiner does not see all the values V at one time, only one at a time. 
This is + * sufficient to implement join and leftJoin + */ + def cogroup[V2, R](right: TypedPipe[(K, V2)])( + joiner: (K, V, Iterable[V2]) => Iterator[R] + ): SketchJoined[K, V, V2, R] = + new SketchJoined(this, right, numReducers)(joiner) + + /** + * Does a logical inner join but replicates the heavy keys of the left hand side across the reducers + */ + def join[V2](right: TypedPipe[(K, V2)]): SketchJoined[K, V, V2, (V, V2)] = + cogroup(right)(Joiner.hashInner2) + + /** + * Does a logical left join but replicates the heavy keys of the left hand side across the reducers + */ + def leftJoin[V2](right: TypedPipe[(K, V2)]): SketchJoined[K, V, V2, (V, Option[V2])] = + cogroup(right)(Joiner.hashLeft2) +} + +case class SketchJoined[K: Ordering, V, V2, R]( + left: Sketched[K, V], + right: TypedPipe[(K, V2)], + numReducers: Int +)(joiner: (K, V, Iterable[V2]) => Iterator[R]) + extends MustHaveReducers { + + def reducers = Some(numReducers) + + // the most of any one reducer we want to try to take up with a single key + private val maxReducerFraction = 0.1 + + private def flatMapWithReplicas[W](pipe: TypedPipe[(K, W)])(fn: Int => Iterable[Int]) = { + // don't close over Sketched + val localSer = left.serialization + val localNumReducers = numReducers + val localMaxReducerFraction = maxReducerFraction + + pipe.cross(left.sketch).flatMap { case ((k, w), cms) => + val maxPerReducer = ((cms.totalCount * localMaxReducerFraction) / localNumReducers) + 1 + val maxReplicas = cms.frequency(Bytes(localSer(k))).estimate.toDouble / maxPerReducer + // if the frequency is 0, maxReplicas.ceil will be 0 so we will filter out this key entirely + // if it's < maxPerReducer, the ceil will round maxReplicas up to 1 to ensure we still see it + val replicas = fn(maxReplicas.ceil.toInt.min(localNumReducers)) + replicas.map(i => (i, k) -> w) + } + } + + val toTypedPipe: TypedPipe[(K, R)] = { + lazy val rand = new scala.util.Random(left.seed) + val lhs = flatMapWithReplicas(left.pipe)(n => 
(rand.nextInt(n) + 1) :: Nil) + val rhs = flatMapWithReplicas(right)(n => 1.to(n)) + + lhs.group + .cogroup(rhs.group)((k, itv, itu) => itv.flatMap(v => joiner(k._2, v, itu))) + .withReducers(numReducers) + .map { case ((r, k), v) => (k, v) } + } + + private implicit def intKeyOrd: Ordering[(Int, K)] = { + val kord = implicitly[Ordering[K]] + + kord match { + case kos: OrderedSerialization[_] => + new OrderedSerialization2(ordSer[Int], kos.asInstanceOf[OrderedSerialization[K]]) + case _ => Ordering.Tuple2[Int, K] + } + } + +} + +object SketchJoined { + implicit def toTypedPipe[K, V, V2, R](joined: SketchJoined[K, V, V2, R]): TypedPipe[(K, R)] = + joined.toTypedPipe + implicit def toTypedPipeKeyed[K, V, V2, R](joined: SketchJoined[K, V, V2, R]): TypedPipe.Keyed[K, R] = + new TypedPipe.Keyed(joined.toTypedPipe) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala new file mode 100644 index 0000000000..1ece95fe6b --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala @@ -0,0 +1,903 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +import java.io.{InputStream, OutputStream, Serializable} + +import com.twitter.algebird.{Aggregator, Batched, Monoid, Semigroup} +import com.twitter.scalding._ +import com.twitter.scalding.typed.functions.{ + AsLeft, + AsRight, + Constant, + ConstantKey, + DropValue1, + GetKey, + GetValue, + Identity, + MakeKey, + PartialFunctionToFilter, + RandomFilter, + RandomNextInt, + SubTypes, + Swap, + TuplizeFunction, + WithConstant +} +import com.twitter.scalding.serialization.{EquivSerialization, OrderedSerialization, UnitOrderedSerialization} +import com.twitter.scalding.serialization.OrderedSerialization.Result +import com.twitter.scalding.serialization.macros.impl.BinaryOrdering +import com.twitter.scalding.serialization.macros.impl.BinaryOrdering._ +import com.twitter.scalding.dagon.{Memoize, RefPair} + +import scala.util.Try +import scala.util.hashing.MurmurHash3 + +/** + * This is an identifier which should have a good equals and hashCode and has a Type. Different backends are + * responsible for reading/writing + */ +trait Input[+A] extends Serializable + +/** + * This is an identifier which should have a good equals and hashCode and has a Type. Different backends are + * responsible for reading/writing + */ +trait Output[-A] extends Serializable + +/** + * factory methods for TypedPipe, which is the typed representation of distributed lists in scalding. This + * object is here rather than in the typed package because a lot of code was written using the functions in + * the object, which we do not see how to hide with package object tricks. + */ +object TypedPipe extends Serializable { + + /** + * Create a TypedPipe from a Input. This is the preferred way to make a TypedPipe + */ + def from[T](source: Input[T]): TypedPipe[T] = + SourcePipe(source) + + /** + * Create a TypedPipe from an Iterable in memory. 
+ */ + def from[T](iter: Iterable[T]): TypedPipe[T] = + if (iter.isEmpty) empty else IterablePipe[T](iter) + + /** + * Create an empty TypedPipe. This is sometimes useful when a method must return a TypedPipe, but sometimes + * at runtime we can check a condition and see that it should be empty. This is the zero of the + * Monoid[TypedPipe] + */ + def empty: TypedPipe[Nothing] = EmptyTypedPipe + + /** + * This enables pipe.hashJoin(that) or pipe.join(that) syntax This is a safe enrichment because hashJoinable + * and CoGroupable are only used in the argument position or to give cogroup, join, leftJoin, rightJoin, + * outerJoin methods. Since those methods are unlikely to be used on TypedPipe in the future, this + * enrichment seems safe. + * + * This method is the Vitaly-was-right method. + */ + implicit def toHashJoinable[K, V](pipe: TypedPipe[(K, V)])(implicit ord: Ordering[K]): HashJoinable[K, V] = + /* + * Note, it would not be safe to make the return type of this Grouped[K, V] since that has some + * different semantics than TypedPipe, however, it is not unclear when we only go to + * HashJoinable + */ + pipe.group + + /** + * TypedPipe instances are monoids. They are isomorphic to multisets. 
+ */ + implicit def typedPipeMonoid[T]: Monoid[TypedPipe[T]] = new Monoid[TypedPipe[T]] { + def zero = TypedPipe.empty + def plus(left: TypedPipe[T], right: TypedPipe[T]): TypedPipe[T] = + left ++ right + override def sumOption(pipes: TraversableOnce[TypedPipe[T]]): Option[TypedPipe[T]] = + if (pipes.isEmpty) None + else { + // we can't combine these but want to avoid a linear graph which can be slow + // to optimize + def combine(ps: Vector[TypedPipe[T]]): TypedPipe[T] = { + val sz = ps.size + if (sz == 0) TypedPipe.empty + else if (sz == 1) ps(0) + else { + val left = combine(ps.take(sz / 2)) + val right = combine(ps.drop(sz / 2)) + left ++ right + } + } + Some(combine(pipes.toVector)) + } + } + + private case object IdentityOrdering extends OrderedSerialization[Int] with EquivSerialization[Int] { + val delegate = BinaryOrdering.ordSer[Int] + + override def compareBinary(a: InputStream, b: InputStream): Result = delegate.compareBinary(a, b) + override def compare(x: Int, y: Int): Int = delegate.compare(x, y) + override def dynamicSize(t: Int): Option[Int] = delegate.dynamicSize(t) + override def write(out: OutputStream, t: Int): Try[Unit] = delegate.write(out, t) + override def read(in: InputStream): Try[Int] = delegate.read(in) + override def staticSize: Option[Int] = delegate.staticSize + override def hash(x: Int): Int = x + } + + final case class CoGroupedPipe[K, V](@transient cogrouped: CoGrouped[K, V]) extends TypedPipe[(K, V)] + final case class CounterPipe[A](pipe: TypedPipe[(A, Iterable[((String, String), Long)])]) + extends TypedPipe[A] + final case class CrossPipe[T, U](left: TypedPipe[T], right: TypedPipe[U]) extends TypedPipe[(T, U)] { + def viaHashJoin: TypedPipe[(T, U)] = + left.withKey(()).hashJoin(right.withKey(())).values + } + final case class CrossValue[T, U](left: TypedPipe[T], right: ValuePipe[U]) extends TypedPipe[(T, U)] { + def viaHashJoin: TypedPipe[(T, U)] = + right match { + case EmptyValue => + EmptyTypedPipe + case LiteralValue(v) => 
+ left.map(WithConstant(v)) + case ComputedValue(pipe) => + CrossPipe(left, pipe) + } + } + final case class DebugPipe[T](input: TypedPipe[T]) extends TypedPipe[T] + final case class FilterKeys[K, V](input: TypedPipe[(K, V)], @transient fn: K => Boolean) + extends TypedPipe[(K, V)] + final case class Filter[T](input: TypedPipe[T], @transient fn: T => Boolean) extends TypedPipe[T] + final case class FlatMapValues[K, V, U](input: TypedPipe[(K, V)], @transient fn: V => TraversableOnce[U]) + extends TypedPipe[(K, U)] + final case class FlatMapped[T, U](input: TypedPipe[T], @transient fn: T => TraversableOnce[U]) + extends TypedPipe[U] + final case class ForceToDisk[T](input: TypedPipe[T]) extends TypedPipe[T] + final case class Fork[T](input: TypedPipe[T]) extends TypedPipe[T] + final case class HashCoGroup[K, V, W, R]( + left: TypedPipe[(K, V)], + @transient right: HashJoinable[K, W], + @transient joiner: (K, V, Iterable[W]) => Iterator[R] + ) extends TypedPipe[(K, R)] + final case class IterablePipe[T](iterable: Iterable[T]) extends TypedPipe[T] + final case class MapValues[K, V, U](input: TypedPipe[(K, V)], @transient fn: V => U) + extends TypedPipe[(K, U)] + final case class Mapped[T, U](input: TypedPipe[T], @transient fn: T => U) extends TypedPipe[U] + final case class MergedTypedPipe[T](left: TypedPipe[T], right: TypedPipe[T]) extends TypedPipe[T] + final case class ReduceStepPipe[K, V1, V2](@transient reduce: ReduceStep[K, V1, V2]) + extends TypedPipe[(K, V2)] + final case class SourcePipe[T](@transient source: Input[T]) extends TypedPipe[T] + final case class SumByLocalKeys[K, V](input: TypedPipe[(K, V)], @transient semigroup: Semigroup[V]) + extends TypedPipe[(K, V)] + + final case class TrappedPipe[T]( + input: TypedPipe[T], + @transient sink: Output[T] + ) extends TypedPipe[T] + + /** + * descriptions carry a boolean that is true if we should deduplicate the message. 
This is used for line + * numbers which are otherwise often duplicated + */ + final case class WithDescriptionTypedPipe[T](input: TypedPipe[T], descriptions: List[(String, Boolean)]) + extends TypedPipe[T] + final case class WithOnComplete[T](input: TypedPipe[T], @transient fn: () => Unit) extends TypedPipe[T] + + case object EmptyTypedPipe extends TypedPipe[Nothing] { + // we can't let the default TypedPipe == go here, it will stack overflow on a pattern match + override def equals(that: Any): Boolean = + that match { + case e: EmptyTypedPipe.type => true + case _ => false + } + } + + implicit class InvariantTypedPipe[T](val pipe: TypedPipe[T]) extends AnyVal { + + /** + * Returns the set of distinct elements in the TypedPipe This is the same as: .map((_, ())).group.sum.keys + * If you want a distinct while joining, consider: instead of: {@code + * a.join(b.distinct.asKeys) } manually do the distinct: {@code + * a.join(b.asKeys.sum) } The latter creates 1 map/reduce phase rather than 2 + */ + @annotation.implicitNotFound( + msg = "For distinct method to work, the type in TypedPipe must have an Ordering." + ) + def distinct(implicit ord: Ordering[T]): TypedPipe[T] = + pipe.asKeys.sum.keys + + /** + * If any errors happen below this line, but before a groupBy, write to a TypedSink + */ + @deprecated( + "semantics of addTrap are hard to follow, prefer to use Either and manually write out error branchs", + "0.18.0" + ) + def addTrap(trapSink: Output[T]): TypedPipe[T] = + TypedPipe.TrappedPipe[T](pipe, trapSink).withLine + } + + /** + * This is where all the methods that require TypedPipe[(K, V)] live. + * + * previously, these were directly on TypedPipe with the use of T <:< (K, V) however that complicates type + * inference on many functions. + */ + implicit class Keyed[K, V](val kvpipe: TypedPipe[(K, V)]) extends AnyVal { + + /** + * Sometimes useful for implementing custom joins with groupBy + mapValueStream when you know that the + * value/key can fit in memory. 
Beware. + */ + def eitherValues[R](that: TypedPipe[(K, R)]): TypedPipe[(K, Either[V, R])] = + mapValues(AsLeft[V, R]()) ++ (that.mapValues(AsRight[V, R]())) + + /** + * If T is a (K, V) for some V, then we can use this function to filter. Prefer to use this if your filter + * only touches the key. + * + * This is here to match the function in KeyedListLike, where it is optimized + */ + def filterKeys(fn: K => Boolean): TypedPipe[(K, V)] = + TypedPipe.FilterKeys(kvpipe, fn).withLine + + /** Similar to mapValues, but allows to return a collection of outputs for each input value */ + def flatMapValues[U](f: V => TraversableOnce[U]): TypedPipe[(K, U)] = + TypedPipe.FlatMapValues(kvpipe, f).withLine + + /** + * flatten just the values This is more useful on KeyedListLike, but added here to reduce assymmetry in + * the APIs + */ + def flattenValues[U](implicit ev: V <:< TraversableOnce[U]): TypedPipe[(K, U)] = { + val st = SubTypes.tuple2_2[K, V, TraversableOnce[U]](SubTypes.fromEv(ev)) + kvpipe + .widen(st.toEv) + .flatMapValues[U](Identity[TraversableOnce[U]]()) + } + + /** + * This is the default means of grouping all pairs with the same key. Generally this triggers 1 Map/Reduce + * transition + */ + def group(implicit ord: Ordering[K]): Grouped[K, V] = + Grouped(kvpipe.withLine) + + /** Group using an explicit Ordering on the key. */ + def groupWith(ord: Ordering[K]): Grouped[K, V] = group(ord) + + /** + * These operations look like joins, but they do not force any communication of the current TypedPipe. + * They are mapping operations where this pipe is streamed through one item at a time. + * + * WARNING These behave semantically very differently than cogroup. This is because we handle (K,V) tuples + * on the left as we see them. The iterable on the right is over all elements with a matching key K, and + * it may be empty if there are no values for this key K. 
+ */ + def hashCogroup[K1 >: K, W, R](smaller: HashJoinable[K1, W])( + joiner: (K1, V, Iterable[W]) => Iterator[R] + ): TypedPipe[(K1, R)] = + TypedPipe.HashCoGroup(kvpipe.widen[(K1, V)], smaller, joiner).withLine + + /** Do an inner-join without shuffling this TypedPipe, but replicating argument to all tasks */ + def hashJoin[K1 >: K, W](smaller: HashJoinable[K1, W]): TypedPipe[(K1, (V, W))] = + hashCogroup[K1, W, (V, W)](smaller)(Joiner.hashInner2) + + /** Do an leftjoin without shuffling this TypedPipe, but replicating argument to all tasks */ + def hashLeftJoin[K1 >: K, W](smaller: HashJoinable[K1, W]): TypedPipe[(K1, (V, Option[W]))] = + hashCogroup[K1, W, (V, Option[W])](smaller)(Joiner.hashLeft2) + + /** Just keep the keys, or ._1 (if this type is a Tuple2) */ + def keys: TypedPipe[K] = + kvpipe.map(GetKey()) + + /** Transform only the values (sometimes requires giving the types due to scala type inference) */ + def mapValues[U](f: V => U): TypedPipe[(K, U)] = + TypedPipe.MapValues(kvpipe, f).withLine + + /** + * Enables joining when this TypedPipe has some keys with many many values and but many with very few + * values. For instance, a graph where some nodes have millions of neighbors, but most have only a few. + * + * We build a (count-min) sketch of each key's frequency, and we use that to shard the heavy keys across + * many reducers. This increases communication cost in order to reduce the maximum time needed to complete + * the join. + * + * {@code pipe.sketch(100).join(thatPipe) } will add an extra map/reduce job over a standard join to + * create the count-min-sketch. This will generally only be beneficial if you have really heavy skew, + * where without this you have 1 or 2 reducers taking hours longer than the rest. 
+ */ + def sketch( + reducers: Int, + eps: Double = 1.0e-5, // 272k width = 1MB per row + delta: Double = 0.01, // 5 rows (= 5 hashes) + seed: Int = 12345 + )(implicit serialization: K => Array[Byte], ordering: Ordering[K]): Sketched[K, V] = + Sketched(kvpipe, reducers, delta, eps, seed) + + /** + * Reasonably common shortcut for cases of associative/commutative reduction by Key + */ + def sumByKey(implicit ord: Ordering[K], plus: Semigroup[V]): UnsortedGrouped[K, V] = + group.sum[V] + + /** + * This does a sum of values WITHOUT triggering a shuffle. the contract is, if followed by a group.sum the + * result is the same with or without this present, and it never increases the number of items. BUT due to + * the cost of caching, it might not be faster if there is poor key locality. + * + * It is only useful for expert tuning, and best avoided unless you are struggling with performance + * problems. If you are not sure you need this, you probably don't. + * + * The main use case is to reduce the values down before a key expansion such as is often done in a data + * cube. 
+ */ + def sumByLocalKeys(implicit sg: Semigroup[V]): TypedPipe[(K, V)] = + TypedPipe.SumByLocalKeys(kvpipe, sg).withLine + + /** swap the keys with the values */ + def swap: TypedPipe[(V, K)] = + kvpipe.map(Swap()) + + /** Just keep the values, or ._2 (if this type is a Tuple2) */ + def values: TypedPipe[V] = + kvpipe.map(GetValue()) + } + + private case class TallyByFn[A](group: String, fn: A => String) + extends Function1[A, (A, Iterable[((String, String), Long)])] { + def apply(a: A) = (a, (((group, fn(a)), 1L)) :: Nil) + } + private case class TallyFn[A](group: String, counter: String) + extends Function1[A, (A, Iterable[((String, String), Long)])] { + private[this] val inc = ((group, counter), 1L) :: Nil + def apply(a: A) = (a, inc) + } + private case class TallyLeft[A, B](group: String, fn: A => Either[String, B]) + extends Function1[A, (List[B], Iterable[((String, String), Long)])] { + def apply(a: A) = fn(a) match { + case Right(b) => (b :: Nil, Nil) + case Left(cnt) => (Nil, ((group, cnt), 1L) :: Nil) + } + } + + implicit class TallyEnrichment[A, B <: Iterable[((String, String), Long)]](val pipe: TypedPipe[(A, B)]) + extends AnyVal { + + /** + * Increment hadoop counters with a (group, counter) by the amount in the second part of the tuple, and + * remove that second part + */ + def tally: TypedPipe[A] = + CounterPipe(pipe) + } + + /** + * This is a def because it allocates a new memo on each call. 
This is important to avoid growing a memo + * indefinitely + */ + private def eqFn: RefPair[TypedPipe[Any], TypedPipe[Any]] => Boolean = { + + def eqCoGroupable( + left: CoGroupable[_, _], + right: CoGroupable[_, _], + rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean + ): Boolean = { + import CoGrouped._ + (left, right) match { + case (Pair(la, lb, lfn), Pair(ra, rb, rfn)) => + (lfn == rfn) && eqCoGroupable(la, ra, rec) && eqCoGroupable(lb, rb, rec) + case (WithReducers(left, leftRed), WithReducers(right, rightRed)) => + (leftRed == rightRed) && eqCoGroupable(left, right, rec) + case (WithDescription(left, leftDesc), WithDescription(right, rightDesc)) => + (leftDesc == rightDesc) && eqCoGroupable(left, right, rec) + case (CoGrouped.FilterKeys(left, lfn), CoGrouped.FilterKeys(right, rfn)) => + (lfn == rfn) && eqCoGroupable(left, right, rec) + case (MapGroup(left, lfn), MapGroup(right, rfn)) => + (lfn == rfn) && eqCoGroupable(left, right, rec) + case (left: ReduceStep[_, _, _], right: ReduceStep[_, _, _]) => + eqReduceStep(left, right, rec) + case (_, _) => false + } + } + + def eqHashJoinable( + left: HashJoinable[_, _], + right: HashJoinable[_, _], + rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean + ): Boolean = + (left, right) match { + case (lrs: ReduceStep[_, _, _], rrs: ReduceStep[_, _, _]) => + eqReduceStep(lrs, rrs, rec) + } + + def eqReduceStep( + left: ReduceStep[_, _, _], + right: ReduceStep[_, _, _], + rec: RefPair[TypedPipe[_], TypedPipe[_]] => Boolean + ): Boolean = { + val zeroLeft = ReduceStep.setInput(left, EmptyTypedPipe) + val zeroRight = ReduceStep.setInput(right, EmptyTypedPipe) + + (zeroLeft == zeroRight) && rec(RefPair(left.mapped, right.mapped)) + } + + Memoize.function[RefPair[TypedPipe[Any], TypedPipe[Any]], Boolean] { + case (pair, _) if pair.itemsEq => true + case (RefPair(CoGroupedPipe(left), CoGroupedPipe(right)), rec) => + eqCoGroupable(left, right, rec) + case (RefPair(CounterPipe(left), CounterPipe(right)), rec) => + 
rec(RefPair(left, right)) + case (RefPair(CrossPipe(leftA, rightA), CrossPipe(leftB, rightB)), rec) => + rec(RefPair(leftA, leftB)) && rec(RefPair(rightA, rightB)) + case (RefPair(CrossValue(pipeA, valueA), CrossValue(pipeB, valueB)), rec) => + // have to deconstruct values + val valEq = (valueA, valueB) match { + case (ComputedValue(pA), ComputedValue(pB)) => rec(RefPair(pA, pB)) + case (l, r) => l == r + } + valEq && rec(RefPair(pipeA, pipeB)) + case (RefPair(DebugPipe(left), DebugPipe(right)), rec) => + rec(RefPair(left, right)) + case (RefPair(FilterKeys(leftIn, leftF), FilterKeys(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(Filter(leftIn, leftF), Filter(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(FlatMapValues(leftIn, leftF), FlatMapValues(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(FlatMapped(leftIn, leftF), FlatMapped(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(ForceToDisk(left), ForceToDisk(right)), rec) => + rec(RefPair(left, right)) + case (RefPair(Fork(left), Fork(right)), rec) => + rec(RefPair(left, right)) + case (RefPair(HashCoGroup(leftA, rightA, fnA), HashCoGroup(leftB, rightB, fnB)), rec) => + (fnA == fnB) && rec(RefPair(leftA, leftB)) && eqHashJoinable(rightA, rightB, rec) + case (RefPair(IterablePipe(itA), IterablePipe(itB)), _) => itA == itB + case (RefPair(MapValues(leftIn, leftF), MapValues(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(Mapped(leftIn, leftF), Mapped(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(MergedTypedPipe(leftA, rightA), 
MergedTypedPipe(leftB, rightB)), rec) => + rec(RefPair(leftA, leftB)) && rec(RefPair(rightA, rightB)) + case (RefPair(ReduceStepPipe(left), ReduceStepPipe(right)), rec) => + eqReduceStep(left, right, rec) + case (RefPair(SourcePipe(srcA), SourcePipe(srcB)), _) => srcA == srcB + case (RefPair(SumByLocalKeys(leftIn, leftSg), SumByLocalKeys(rightIn, rightSg)), rec) => + (leftSg == rightSg) && rec(RefPair(leftIn, rightIn)) + case (RefPair(TrappedPipe(inA, sinkA), TrappedPipe(inB, sinkB)), rec) => + (sinkA == sinkB) && rec(RefPair(inA, inB)) + case ( + RefPair(WithDescriptionTypedPipe(leftIn, leftDesc), WithDescriptionTypedPipe(rightIn, rightDesc)), + rec + ) => + // check the non-pipes first: + (leftDesc == rightDesc) && rec(RefPair(leftIn, rightIn)) + case (RefPair(WithOnComplete(leftIn, leftF), WithOnComplete(rightIn, rightF)), rec) => + // check the non-pipes first: + (leftF == rightF) && rec(RefPair(leftIn, rightIn)) + case (RefPair(EmptyTypedPipe, EmptyTypedPipe), _) => true + case _ => false // we don't match on which subtype we are + } + } +} + +/** + * Think of a TypedPipe as a distributed unordered list that may or may not yet have been materialized in + * memory or disk. + * + * Represents a phase in a distributed computation on an input data source Wraps a cascading Pipe object, and + * holds the transformation done up until that point + */ +sealed abstract class TypedPipe[+T] extends Serializable with Product { + + override val hashCode: Int = MurmurHash3.productHash(this) + override def equals(that: Any): Boolean = that match { + case thatTP: TypedPipe[_] => + if (thatTP eq this) true + else if (thatTP.hashCode != hashCode) false // since we have a cached hashCode, use it + else { + // we only check this in the case of true equality without reference + // equality or rarely due to hash collisions. So we can expect to + // walk the entire graph in most cases where we get here. 
+ // Without the memoization below, that graph walking can + // be exponentially slow. With the memoization, it becomes O(N) + // where N is the size of the reachable graph distinct by reference + // equality + val fn = TypedPipe.eqFn + fn(RefPair(this, thatTP)) + } + case _ => false + } + + private[scalding] def withLine: TypedPipe[T] = + LineNumber.tryNonScaldingCaller.map(_.toString) match { + case None => + this + case Some(desc) => + TypedPipe.WithDescriptionTypedPipe(this, (desc, true) :: Nil) // deduplicate line numbers + } + + /** + * Increment diagnostic counters by 1 for each item in the pipe. The counter group will be the same for each + * item, the counter name is determined by the result of the `fn` passed in. + */ + def tallyBy(group: String)(fn: T => String): TypedPipe[T] = + map(TypedPipe.TallyByFn(group, fn)).tally + + /** + * Increment a specific diagnostic counter by 1 for each item in the pipe. + * + * this is the same as tallyBy(group)(_ => counter) + */ + def tallyAll(group: String, counter: String): TypedPipe[T] = + map(TypedPipe.TallyFn(group, counter)).tally + + /** + * Increment a diagnostic counter for each failure. This is like map, where the `fn` should return a + * `Right[U]` for each successful transformation and a `Left[String]` for each failure, with the String + * describing the failure. Each failure will be counted, and the result is just the successes. + */ + def tallyLeft[B](group: String)(fn: T => Either[String, B]): TypedPipe[B] = + map(TypedPipe.TallyLeft(group, fn)).tally.flatten + + /** + * Implements a cross product. The right side should be tiny This gives the same results as {code for { l <- + * list1; l2 <- list2 } yield (l, l2) } + */ + def cross[U](tiny: TypedPipe[U]): TypedPipe[(T, U)] = + TypedPipe.CrossPipe(this, tiny).withLine + + /** + * This is the fundamental mapper operation. 
It behaves in a way similar to List.flatMap, which means that + * each item is fed to the input function, which can return 0, 1, or many outputs (as a TraversableOnce) per + * input. The returned results will be iterated through once and then flattened into a single TypedPipe + * which is passed to the next step in the pipeline. + * + * This behavior makes it a powerful operator -- it can be used to filter records (by returning 0 items for + * a given input), it can be used the way map is used (by returning 1 item per input), it can be used to + * explode 1 input into many outputs, or even a combination of all of the above at once. + */ + def flatMap[U](f: T => TraversableOnce[U]): TypedPipe[U] = + TypedPipe.FlatMapped(this, f).withLine + + /** + * Merge two TypedPipes (no order is guaranteed) This is only realized when a group (or join) is performed. + */ + def ++[U >: T](other: TypedPipe[U]): TypedPipe[U] = + TypedPipe.MergedTypedPipe(this, other).withLine + + /** + * Aggregate all items in this pipe into a single ValuePipe + * + * Aggregators are composable reductions that allow you to glue together several reductions and process them + * in one pass. + * + * Same as groupAll.aggregate.values + */ + def aggregate[B, C](agg: Aggregator[T, B, C]): ValuePipe[C] = + ComputedValue(groupAll.aggregate(agg).values) + + /** + * Put the items in this into the keys, and unit as the value in a Group in some sense, this is the dual of + * groupAll + */ + @annotation.implicitNotFound( + msg = "For asKeys method to work, the type in TypedPipe must have an Ordering." + ) + def asKeys[U >: T](implicit ord: Ordering[U]): Grouped[U, Unit] = + widen[U] + .withValue(()) + .group + + /** + * Set a key to to the given value. + */ + def withKey[K](key: K): TypedPipe[(K, T)] = + map(ConstantKey(key)) + + /** + * Set a key to to the given value. 
+ */ + def withValue[V](value: V): TypedPipe[(T, V)] = + map(WithConstant(value)) + + /** + * If T <:< U, then this is safe to treat as TypedPipe[U] due to covariance + */ + def widen[U](implicit ev: T <:< U): TypedPipe[U] = + SubTypes.fromEv(ev).liftCo[TypedPipe](this) + + /** + * Filter and map. See scala.collection.List.collect. {@code collect { case Some(x) => fn(x) } } + */ + def collect[U](fn: PartialFunction[T, U]): TypedPipe[U] = + filter(PartialFunctionToFilter(fn)).map(fn) + + /** + * Attach a ValuePipe to each element this TypedPipe + */ + def cross[V](p: ValuePipe[V]): TypedPipe[(T, V)] = + TypedPipe.CrossValue(this, p).withLine + + /** prints the current pipe to stdout */ + def debug: TypedPipe[T] = + TypedPipe.DebugPipe(this).withLine + + /** adds a description to the pipe */ + def withDescription(description: String): TypedPipe[T] = + TypedPipe.WithDescriptionTypedPipe[T](this, (description, false) :: Nil) + + /** + * Returns the set of distinct elements identified by a given lambda extractor in the TypedPipe + */ + @annotation.implicitNotFound( + msg = "For distinctBy method to work, the type to distinct on in the TypedPipe must have an Ordering." + ) + def distinctBy[U](fn: T => U, numReducers: Option[Int] = None)(implicit ord: Ordering[U]): TypedPipe[T] = { + val op = groupBy(fn).head + val reduced = numReducers match { + case Some(red) => op.withReducers(red) + case None => op + } + reduced.map(GetValue()) + } + + /** Merge two TypedPipes of different types by using Either */ + def either[R](that: TypedPipe[R]): TypedPipe[Either[T, R]] = + map(AsLeft()) ++ (that.map(AsRight())) + + /** + * If you are going to create two branches or forks, it may be more efficient to call this method first + * which will create a node in the cascading graph. Without this, both full branches of the fork will be put + * into separate cascading pipes, which can, in some cases, be slower. 
+ * + * Ideally the planner would see this + */ + def fork: TypedPipe[T] = TypedPipe.Fork(this).withLine + + /** + * limit the output to at most count items, if at least count items exist. + */ + def limit(count: Int): TypedPipe[T] = + groupAll.bufferedTake(count).values + + /** Transform each element via the function f */ + def map[U](f: T => U): TypedPipe[U] = + TypedPipe.Mapped(this, f).withLine + + /** + * Keep only items that satisfy this predicate + */ + def filter(f: T => Boolean): TypedPipe[T] = + TypedPipe.Filter(this, f).withLine + + // This is just to appease for comprehension + def withFilter(f: T => Boolean): TypedPipe[T] = filter(f) + + /** + * Keep only items that don't satisfy the predicate. `filterNot` is the same as `filter` with a negated + * predicate. + */ + def filterNot(f: T => Boolean): TypedPipe[T] = + filter(!f(_)) + + /** flatten an Iterable */ + def flatten[U](implicit ev: T <:< TraversableOnce[U]): TypedPipe[U] = + widen[TraversableOnce[U]].flatMap(Identity[TraversableOnce[U]]()) + + /** + * Force a materialization of this pipe prior to the next operation. This is useful if you filter almost + * everything before a hashJoin, for instance. This is useful for experts who see some heuristic of the + * planner causing slower performance. + */ + def forceToDisk: TypedPipe[T] = + TypedPipe.ForceToDisk(this).withLine + + /** Send all items to a single reducer */ + def groupAll: Grouped[Unit, T] = + groupBy(Constant(()))(UnitOrderedSerialization).withReducers(1) + + /** Given a key function, add the key, then call .group */ + def groupBy[K](g: T => K)(implicit ord: Ordering[K]): Grouped[K, T] = + map(MakeKey(g)).group + + /** + * Forces a shuffle by randomly assigning each item into one of the partitions. + * + * This is for the case where you mappers take a long time, and it is faster to shuffle them to more + * reducers and then operate. + * + * You probably want shard if you are just forcing a shuffle. 
+ */ + def groupRandomly(partitions: Int): Grouped[Int, T] = + groupBy(RandomNextInt(123, partitions))(TypedPipe.IdentityOrdering) + .withReducers(partitions) + + /** + * Partitions this into two pipes according to a predicate. + * + * Sometimes what you really want is a groupBy in these cases. + */ + def partition(p: T => Boolean): (TypedPipe[T], TypedPipe[T]) = { + val forked = fork + (forked.filter(p), forked.filterNot(p)) + } + + private[this] def defaultSeed: Long = System.identityHashCode(this) * 2654435761L ^ System.currentTimeMillis + + /** + * Sample a fraction (between 0 and 1) uniformly independently at random each element of the pipe does not + * require a reduce step. This method makes sure to fix the seed, otherwise restarts cause subtle errors. + */ + def sample(fraction: Double): TypedPipe[T] = sample(fraction, defaultSeed) + + /** + * Sample a fraction (between 0 and 1) uniformly independently at random each element of the pipe with a + * given seed. Does not require a reduce step. + */ + def sample(fraction: Double, seed: Long): TypedPipe[T] = { + require(0.0 <= fraction && fraction <= 1.0, s"got $fraction which is an invalid fraction") + filter(RandomFilter(seed, fraction)) + } + + /** + * Used to force a shuffle into a given size of nodes. Only use this if your mappers are taking far longer + * than the time to shuffle. + */ + def shard(partitions: Int): TypedPipe[T] = groupRandomly(partitions).forceToReducers.values + + /** + * Reasonably common shortcut for cases of total associative/commutative reduction returns a ValuePipe with + * only one element if there is any input, otherwise EmptyValue. + */ + def sum[U >: T](implicit plus: Semigroup[U]): ValuePipe[U] = { + // every 1000 items, compact. 
+ lazy implicit val batchedSG: Semigroup[Batched[U]] = Batched.compactingSemigroup[U](1000) + // TODO: literals like this defeat caching in the planner + ComputedValue( + map(t => ((), Batched[U](t))).sumByLocalKeys + // remove the Batched before going to the reducers + // TODO: literals like this defeat caching in the planner + .map { case (_, batched) => batched.sum } + .groupAll + .forceToReducers + .sum + .values + ) + } + + /** + * This is used when you are working with Execution[T] to create loops. You might do this to checkpoint and + * then flatMap Execution to continue from there. Probably only useful if you need to flatMap it twice to + * fan out the data into two children jobs. + * + * This writes the current TypedPipe into a temporary file and then opens it after complete so that you can + * continue from that point + */ + def forceToDiskExecution: Execution[TypedPipe[T]] = + Execution.forceToDisk(this) + + /** + * This gives an Execution that when run evaluates the TypedPipe, writes it to disk, and then gives you an + * Iterable that reads from disk on the submit node each time .iterator is called. Because of how scala + * Iterables work, mapping/flatMapping/filtering the Iterable forces a read of the entire thing. If you need + * it to be lazy, call .iterator and use the Iterator inside instead. + */ + def toIterableExecution: Execution[Iterable[T]] = + Execution.toIterable(this) + + /** + * This attaches a function that is called at the end of the map phase on EACH of the tasks that are + * executing. This is for expert use only. You probably won't ever need it. Try hard to avoid it. Execution + * also has onComplete that can run when an Execution has completed. + */ + def onComplete(fn: () => Unit): TypedPipe[T] = + TypedPipe.WithOnComplete[T](this, fn).withLine + + /** + * This is the functionally pure approach to building jobs. 
Note, that you have to call run on the result or + * flatMap/zip it into an Execution that is run for anything to happen here. + */ + def writeExecution(dest: Output[T]): Execution[Unit] = + Execution.write(this, dest) + + /** + * If you want to write to a specific location, and then read from that location going forward, use this. + */ + def writeThrough[U >: T](dest: Output[T] with Input[U]): Execution[TypedPipe[U]] = + Execution.write(this, dest, TypedPipe.from(dest)) + + /** + * ValuePipe may be empty, so, this attaches it as an Option cross is the same as leftCross(p).collect { + * case (t, Some(v)) => (t, v) } + */ + def leftCross[V](p: ValuePipe[V]): TypedPipe[(T, Option[V])] = + p match { + case EmptyValue => map(WithConstant(None)) + case LiteralValue(v) => map(WithConstant(Some(v))) + case ComputedValue(pipe) => leftCross(pipe) + } + + /** uses hashJoin but attaches None if thatPipe is empty */ + def leftCross[V](thatPipe: TypedPipe[V]): TypedPipe[(T, Option[V])] = + withKey(()).hashLeftJoin(thatPipe.withKey(())).values + + /** + * common pattern of attaching a value and then map recommended style: {@code mapWithValue(vpu) { case (t, + * Some(u)) => op(t, u) case (t, None) => // if you never expect this: sys.error("unexpected empty value + * pipe") } } + */ + def mapWithValue[U, V](value: ValuePipe[U])(f: (T, Option[U]) => V): TypedPipe[V] = + leftCross(value).map(TuplizeFunction(f)) + + /** + * common pattern of attaching a value and then flatMap recommended style: {@code flatMapWithValue(vpu) { + * case (t, Some(u)) => op(t, u) case (t, None) => // if you never expect this: sys.error("unexpected empty + * value pipe") } } + */ + def flatMapWithValue[U, V](value: ValuePipe[U])(f: (T, Option[U]) => TraversableOnce[V]): TypedPipe[V] = + leftCross(value).flatMap(TuplizeFunction(f)) + + /** + * common pattern of attaching a value and then filter recommended style: {@code filterWithValue(vpu) { case + * (t, Some(u)) => op(t, u) case (t, None) => // if you 
never expect this: sys.error("unexpected empty value + * pipe") } } + */ + def filterWithValue[U](value: ValuePipe[U])(f: (T, Option[U]) => Boolean): TypedPipe[T] = + leftCross(value).filter(TuplizeFunction(f)).map(GetKey()) + + /** + * For each element, do a map-side (hash) left join to look up a value + */ + def hashLookup[K >: T, V](grouped: HashJoinable[K, V]): TypedPipe[(K, Option[V])] = + map(WithConstant(())) + .widen[(K, Unit)] + .hashLeftJoin(grouped) + .map(DropValue1()) + +} + +/** + * This class is for the syntax enrichment enabling .joinBy on TypedPipes. To access this, do import + * Syntax.joinOnMappablePipe + */ +class MappablePipeJoinEnrichment[T](pipe: TypedPipe[T]) { + def joinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (T, U)] = pipe.groupBy(g).withReducers(reducers).join(smaller.groupBy(h)) + def leftJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (T, Option[U])] = pipe.groupBy(g).withReducers(reducers).leftJoin(smaller.groupBy(h)) + def rightJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (Option[T], U)] = pipe.groupBy(g).withReducers(reducers).rightJoin(smaller.groupBy(h)) + def outerJoinBy[K, U](smaller: TypedPipe[U])(g: (T => K), h: (U => K), reducers: Int = -1)(implicit + ord: Ordering[K] + ): CoGrouped[K, (Option[T], Option[U])] = + pipe.groupBy(g).withReducers(reducers).outerJoin(smaller.groupBy(h)) +} + +/** + * These are named syntax extensions that users can optionally import. 
Avoid import Syntax._ + */ +object Syntax { + implicit def joinOnMappablePipe[T](p: TypedPipe[T]): MappablePipeJoinEnrichment[T] = + new MappablePipeJoinEnrichment(p) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala new file mode 100644 index 0000000000..727479276e --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/TypedPipeDiff.scala @@ -0,0 +1,131 @@ +package com.twitter.scalding.typed + +import scala.reflect.ClassTag + +/** + * Some methods for comparing two typed pipes and finding out the difference between them. + * + * Has support for the normal case where the typed pipes are pipes of objects usable as keys in scalding (have + * an ordering, proper equals and hashCode), as well as some special cases for dealing with Arrays and thrift + * objects. + * + * See diffByHashCode for comparing typed pipes of objects that have no ordering but a stable hash code (such + * as Scrooge thrift). + * + * See diffByGroup for comparing typed pipes of objects that have no ordering *and* an unstable hash code. + */ +object TypedPipeDiff { + + /** + * Returns a mapping from T to a count of the occurrences of T in the left and right pipes, only for cases + * where the counts are not equal. + * + * Requires that T have an ordering and a hashCode and equals that is stable across JVMs (not reference + * based). See diffArrayPipes for diffing pipes of arrays, since arrays do not meet these requirements by + * default. 
+ */ + def diff[T: Ordering]( + left: TypedPipe[T], + right: TypedPipe[T], + reducers: Option[Int] = None + ): UnsortedGrouped[T, (Long, Long)] = { + val lefts = left.map(x => (x, (1L, 0L))) + val rights = right.map(x => (x, (0L, 1L))) + val counts = (lefts ++ rights).sumByKey + val diff = counts.filter { case (key, (lCount, rCount)) => lCount != rCount } + reducers.map(diff.withReducers).getOrElse(diff) + } + + /** + * Same as diffByHashCode, but takes care to wrap the Array[T] in a wrapper, which has the correct hashCode + * and equals needed. This does not involve copying the arrays, just wrapping them, and is specialized for + * primitive arrays. + */ + def diffArrayPipes[T: ClassTag]( + left: TypedPipe[Array[T]], + right: TypedPipe[Array[T]], + reducers: Option[Int] = None + ): TypedPipe[(Array[T], (Long, Long))] = { + + // cache this instead of reflecting on every single array + val wrapFn = HashEqualsArrayWrapper.wrapByClassTagFn[T] + + diffByHashCode(left.map(wrapFn), right.map(wrapFn), reducers) + .map { case (k, counts) => (k.wrapped, counts) } + } + + /** + * NOTE: Prefer diff over this method if you can find or construct an Ordering[T]. + * + * Returns a mapping from T to a count of the occurrences of T in the left and right pipes, only for cases + * where the counts are not equal. + * + * This implementation does not require an ordering on T, but does require a function (groupByFn) that + * extracts a value of type K (which has an ordering) from a record of type T. + * + * The groupByFn should be something that partitions records as evenly as possible, because all unique + * records that result in the same groupByFn value will be materialized into an in memory map. + * + * groupByFn must be a pure function, such that: x == y implies that groupByFn(x) == groupByFn(y) + * + * T must have a hash code suitable for use in a hash map on a single JVM (doesn't have to be stable cross + * JVM) K must have a hash code this *is* stable across JVMs. 
K must have an ordering. + * + * Example groupByFns would be x => x.hashCode, assuming x's hashCode is stable across jvms, or maybe x => + * x.timestamp, if x's hashCode is not stable, assuming there's shouldn't be too many records with the same + * timestamp. + */ + def diffByGroup[T, K: Ordering](left: TypedPipe[T], right: TypedPipe[T], reducers: Option[Int] = None)( + groupByFn: T => K + ): TypedPipe[(T, (Long, Long))] = { + + val lefts = left.map(t => (groupByFn(t), Map(t -> (1L, 0L)))) + val rights = right.map(t => (groupByFn(t), Map(t -> (0L, 1L)))) + + val diff = (lefts ++ rights).sumByKey.flattenValues + .filter { case (k, (t, (lCount, rCount))) => lCount != rCount } + + reducers.map(diff.withReducers).getOrElse(diff).values + } + + /** + * NOTE: Prefer diff over this method if you can find or construct an Ordering[T]. + * + * Same as diffByGroup but uses T.hashCode as the groupByFn + * + * This method does an exact diff, it does not use the hashCode as a proxy for equality. + */ + def diffByHashCode[T]( + left: TypedPipe[T], + right: TypedPipe[T], + reducers: Option[Int] = None + ): TypedPipe[(T, (Long, Long))] = diffByGroup(left, right, reducers)(_.hashCode) + + object Enrichments { + + implicit class Diff[T](val left: TypedPipe[T]) extends AnyVal { + + def diff(right: TypedPipe[T], reducers: Option[Int] = None)(implicit + ev: Ordering[T] + ): UnsortedGrouped[T, (Long, Long)] = + TypedPipeDiff.diff(left, right, reducers) + + def diffByGroup[K: Ordering](right: TypedPipe[T], reducers: Option[Int] = None)( + groupByFn: T => K + ): TypedPipe[(T, (Long, Long))] = + TypedPipeDiff.diffByGroup(left, right, reducers)(groupByFn) + + def diffByHashCode(right: TypedPipe[T], reducers: Option[Int] = None): TypedPipe[(T, (Long, Long))] = + TypedPipeDiff.diffByHashCode(left, right, reducers) + } + + implicit class DiffArray[T](val left: TypedPipe[Array[T]]) extends AnyVal { + + def diffArrayPipes(right: TypedPipe[Array[T]], reducers: Option[Int] = None)(implicit + ev: 
ClassTag[T] + ): TypedPipe[(Array[T], (Long, Long))] = + TypedPipeDiff.diffArrayPipes(left, right, reducers) + } + + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala new file mode 100644 index 0000000000..70b78b9d8e --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala @@ -0,0 +1,111 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +import com.twitter.scalding.Execution + +object ValuePipe extends java.io.Serializable { + implicit def toTypedPipe[V](v: ValuePipe[V]): TypedPipe[V] = v.toTypedPipe + + def fold[T, U, V](l: ValuePipe[T], r: ValuePipe[U])(f: (T, U) => V): ValuePipe[V] = + l.leftCross(r).collect { case (t, Some(u)) => f(t, u) } + + def apply[T](t: T): ValuePipe[T] = LiteralValue(t) + def empty: ValuePipe[Nothing] = EmptyValue +} + +/** + * ValuePipe is special case of a TypedPipe of just a optional single element. It is like a distribute Option + * type It allows to perform scalar based operations on pipes like normalization. + */ +sealed trait ValuePipe[+T] extends java.io.Serializable { + def leftCross[U](that: ValuePipe[U]): ValuePipe[(T, Option[U])] = that match { + case EmptyValue => map((_, None)) + case LiteralValue(v2) => map((_, Some(v2))) + // We don't know if a computed value is empty or not. 
We need to run the MR job: + case _ => ComputedValue(toTypedPipe.leftCross(that)) + } + def collect[U](fn: PartialFunction[T, U]): ValuePipe[U] = + filter(fn.isDefinedAt(_)).map(fn(_)) + + def map[U](fn: T => U): ValuePipe[U] + def filter(fn: T => Boolean): ValuePipe[T] + + /** + * Identical to toOptionExecution.map(_.get) The result will be an exception if there is no value. The name + * here follows the convention of adding Execution to the name so in the repl in is removed + */ + def getExecution: Execution[T] = toOptionExecution.flatMap { + case Some(t) => Execution.from(t) + // same exception as scala.None.get + // https://github.com/scala/scala/blob/2.12.x/src/library/scala/Option.scala#L347 + case None => Execution.failed(new java.util.NoSuchElementException("None.get")) + } + + /** + * Like the above, but with a lazy parameter that is evaluated if the value pipe is empty The name here + * follows the convention of adding Execution to the name so in the repl in is removed + */ + def getOrElseExecution[U >: T](t: => U): Execution[U] = toOptionExecution.map(_.getOrElse(t)) + def toTypedPipe: TypedPipe[T] + + /** + * Convert this value to an Option. It is an error if somehow this is not either empty or has one value. 
The + * name here follows the convention of adding Execution to the name so in the repl in is removed + */ + def toOptionExecution: Execution[Option[T]] = + toTypedPipe.toIterableExecution.map { it => + it.iterator.take(2).toList match { + case Nil => None + case h :: Nil => Some(h) + case items => sys.error("More than 1 item in an ValuePipe: " + items.toString) + } + } + + def debug: ValuePipe[T] +} +case object EmptyValue extends ValuePipe[Nothing] { + override def leftCross[U](that: ValuePipe[U]) = this + override def map[U](fn: Nothing => U): ValuePipe[U] = this + override def filter(fn: Nothing => Boolean) = this + override def toTypedPipe: TypedPipe[Nothing] = TypedPipe.empty + override def toOptionExecution = Execution.from(None) + + def debug: ValuePipe[Nothing] = { + println("EmptyValue") + this + } +} +final case class LiteralValue[T](value: T) extends ValuePipe[T] { + override def map[U](fn: T => U) = LiteralValue(fn(value)) + override def filter(fn: T => Boolean) = if (fn(value)) this else EmptyValue + override def toTypedPipe = TypedPipe.from(Iterable(value)) + override def toOptionExecution = Execution.from(Some(value)) + + def debug: ValuePipe[T] = map { v => + println("LiteralValue(" + v.toString + ")") + v + } +} +final case class ComputedValue[T](override val toTypedPipe: TypedPipe[T]) extends ValuePipe[T] { + override def map[U](fn: T => U) = ComputedValue(toTypedPipe.map(fn)) + override def filter(fn: T => Boolean) = ComputedValue(toTypedPipe.filter(fn)) + + def debug: ValuePipe[T] = map { value => + println("ComputedValue(" + value.toString + ")") + value + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/WithDescription.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/WithDescription.scala new file mode 100644 index 0000000000..e8d4ee8f4a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/WithDescription.scala @@ -0,0 +1,38 @@ +/* +Copyright 2015 Twitter, Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +/** + * Used for objects that may have a description set to be used in .dot and MR step names. + */ +trait HasDescription { + def descriptions: Seq[String] +} + +/** + * Used for objects that may _set_ a description to be used in .dot and MR step names. + */ +trait WithDescription[+This <: WithDescription[This]] extends HasDescription { self: This => + + /** never mutates this, instead returns a new item. */ + def withDescription(description: String): This + + def withDescription(descriptionOpt: Option[String]): This = + descriptionOpt match { + case Some(description) => withDescription(description) + case None => self + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/WithReducers.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/WithReducers.scala new file mode 100644 index 0000000000..ebff2c078f --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/WithReducers.scala @@ -0,0 +1,66 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed +import java.io.Serializable + +/** + * used for types that may know how many reducers they need e.g. CoGrouped, Grouped, SortedGrouped, + * UnsortedGrouped + */ +trait HasReducers { + def reducers: Option[Int] +} + +/** + * used for types that must know how many reducers they need e.g. Sketched + */ +trait MustHaveReducers extends HasReducers { + def reducers: Some[Int] +} + +/** + * used for objects that may _set_ how many reducers they need e.g. CoGrouped, Grouped, SortedGrouped, + * UnsortedGrouped + */ +trait WithReducers[+This <: WithReducers[This]] extends HasReducers { + + /** never mutates this, instead returns a new item. 
*/ + def withReducers(reds: Int): This +} + +object WithReducers extends Serializable { + implicit class Enrichment[W <: WithReducers[W]](val w: W) extends AnyVal { + def maybeWithReducers(optReducers: Option[Int]): W = + WithReducers.maybeWithReducers(w, optReducers) + } + + def maybeWithReducers[W <: WithReducers[W]](w: W, reds: Option[Int]): W = + reds match { + case None => w + case Some(r) => w.withReducers(r) + } + + /** + * Return the max of the two number of reducers + */ + def maybeCombine(optR1: Option[Int], optR2: Option[Int]): Option[Int] = + (optR1, optR2) match { + case (None, other) => other + case (other, None) => other + case (Some(r1), Some(r2)) => Some(r1.max(r2)) + } + +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala new file mode 100644 index 0000000000..e662fb6e13 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/WritePartitioner.scala @@ -0,0 +1,423 @@ +package com.twitter.scalding.typed + +import com.twitter.scalding.dagon.{Dag, FunctionK, Id, Memoize, Rule} +import com.twitter.scalding.Execution +import com.twitter.scalding.typed.functions.EqTypes +import org.slf4j.LoggerFactory +import scala.language.higherKinds + +object WritePartitioner { + private[this] val logger = LoggerFactory.getLogger(getClass) + + type PairK[F[_], +G[_], T] = (F[T], G[T]) + + /** + * This breaks a job at all the places it explicitly fans out, (and currently after each reduce/join). + */ + def breakAtForks[M[+_]](ws: List[PairK[TypedPipe, Output, _]])(implicit M: Materializer[M]): M[Unit] = { + val rules = List(OptimizationRules.AddExplicitForks, OptimizationRules.RemoveDuplicateForceFork) + materialize[M](rules, ws) + } + + /** + * Partition a single TypedPipe. + * + * This is really only useful for jobs with single final outputs since you want to partition the entire job, + * not a portion of it. 
+ */ + def partitionSingle[A](phases: Seq[Rule[TypedPipe]], pipe: TypedPipe[A]): Execution[TypedPipe[A]] = { + type Const[B] = EqTypes[B, A] + + val writes = materialize1[Execution, Const](phases, List((pipe, EqTypes.reflexive[A]))) + require(writes.size == 1) + + def fix[F[_], B](t: WritePartitioner.PairK[F, Const, B]): F[A] = + t._2.subst[F](t._1) + + // We don't want any further optimization on this job + fix(writes.head) + } + + /** + * This enables us to write the partitioning in terms of this applicative type that is equipped with two + * extra operations: materialized and write, but not a general flatMap + * + * so the only sequencing power we have is to materialize + * + * This allows us to test the properties we want without having to deal with Execution, which is a black box + * concerned with actually running jobs + */ + trait Materializer[M[+_]] { + type TP[+A] = M[TypedPipe[A]] + + def pure[A](a: A): M[A] + def map[A, B](ma: M[A])(fn: A => B): M[B] + def zip[A, B](ma: M[A], mb: M[B]): M[(A, B)] + def materialize[A](t: M[TypedPipe[A]]): M[TypedPipe[A]] + def write[A](tp: M[TypedPipe[A]], sink: Output[A]): M[Unit] + def sequence_[A](as: Seq[M[A]]): M[Unit] + } + + object Materializer { + implicit val executionMaterializer: Materializer[Execution] = + new Materializer[Execution] { + def pure[A](a: A) = Execution.from(a) + def map[A, B](ma: Execution[A])(fn: A => B) = ma.map(fn) + def zip[A, B](ma: Execution[A], mb: Execution[B]): Execution[(A, B)] = ma.zip(mb) + def materialize[A](t: Execution[TypedPipe[A]]): Execution[TypedPipe[A]] = + t.flatMap(_.forceToDiskExecution) + def write[A](tp: Execution[TypedPipe[A]], sink: Output[A]): Execution[Unit] = + tp.flatMap(_.writeExecution(sink)) + def sequence_[A](as: Seq[Execution[A]]): Execution[Unit] = Execution.sequence(as).unit + } + } + + def materialize[M[+_]](phases: Seq[Rule[TypedPipe]], ws: List[PairK[TypedPipe, Output, _]])(implicit + mat: Materializer[M] + ): M[Unit] = { + val writes = materialize1[M, 
Output](phases, ws)(mat) + val toSeq = writes.map { case (mt, sink) => mat.write(mt, sink) } + mat.sequence_(toSeq) + } + + def materialize1[M[+_], S[_]](phases: Seq[Rule[TypedPipe]], ws: List[PairK[TypedPipe, S, _]])(implicit + mat: Materializer[M] + ): List[PairK[mat.TP, S, _]] = { + val e = Dag.empty(OptimizationRules.toLiteral) + + logger.info(s"converting ${ws.size} writes into several parts") + val (finalDag, writeIds) = ws.foldLeft((e, List.empty[PairK[Id, S, _]])) { case ((dag, writes), pair) => + val (dag1, id) = dag.addRoot(pair._1) + (dag1, (id, pair._2) :: writes) + } + // Now apply the rules: + logger.info(s"applying rules to graph of size: ${finalDag.allNodes.size}") + val optDag = finalDag.applySeq(phases :+ OptimizationRules.RemoveUselessFork) + logger.info(s"optimized graph hash size: ${optDag.allNodes.size}") + + import TypedPipe.{ReduceStepPipe, HashCoGroup} + + def handleHashCoGroup[K, V, V2, R]( + hj: HashCoGroup[K, V, V2, R], + recurse: FunctionK[TypedPipe, mat.TP] + ): mat.TP[(K, R)] = { + import TypedPipe._ + val exright: M[HashJoinable[K, V2]] = hj.right match { + case step @ IdentityReduce(_, _, _, _, _) => + type TK[+Z] = TypedPipe[(K, Z)] + val mappedV2 = step.evidence.subst[TK](step.mapped) + mat.map(recurse(mappedV2)) { (tp: TypedPipe[(K, V2)]) => + IdentityReduce[K, V2, V2](step.keyOrdering, tp, step.reducers, step.descriptions, implicitly) + } + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + type TK[+Z] = TypedPipe[(K, Z)] + val mappedV2 = step.evidence.subst[TK](step.mapped) + mat.map(recurse(mappedV2)) { (tp: TypedPipe[(K, V2)]) => + UnsortedIdentityReduce[K, V2, V2]( + step.keyOrdering, + tp, + step.reducers, + step.descriptions, + implicitly + ) + } + case step @ IteratorMappedReduce(_, _, _, _, _) => + def go[A, B, C](imr: IteratorMappedReduce[A, B, C]) = + mat.map(recurse(imr.mapped))((tp: TypedPipe[(A, B)]) => imr.copy(mapped = tp)) + + go(step) + } + + val zipped = mat.zip(recurse(hj.left), exright) + mat.map(zipped) 
{ case (left, right) => + HashCoGroup(left, right, hj.joiner) + } + } + + def widen[A, B <: A](exb: M[B]): M[A] = exb + + def handleReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + recurse: FunctionK[TypedPipe, mat.TP] + ): mat.TP[(K, V2)] = + mat.map(recurse(rs.mapped)) { pipe => + TypedPipe.ReduceStepPipe(ReduceStep.setInput[K, V1, V2](rs, pipe)) + } + + def handleCoGrouped[K, V]( + cg: CoGroupable[K, V], + recurse: FunctionK[TypedPipe, mat.TP] + ): mat.TP[(K, V)] = { + import CoGrouped._ + import TypedPipe._ + + def pipeToCG[V1](t: TypedPipe[(K, V1)]): CoGroupable[K, V1] = + t match { + case ReduceStepPipe(cg: CoGroupable[K @unchecked, V1 @unchecked]) => + // we are relying on the fact that we use Ordering[K] + // as a contravariant type, despite it not being defined + // that way. + cg + case CoGroupedPipe(cg) => + // we are relying on the fact that we use Ordering[K] + // as a contravariant type, despite it not being defined + // that way. + cg.asInstanceOf[CoGroupable[K, V1]] + case kvPipe => IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + } + + cg match { + case p @ Pair(_, _, _) => + def go[A, B, C](pair: Pair[K, A, B, C]): mat.TP[(K, C)] = { + val mleft = handleCoGrouped(pair.larger, recurse) + val mright = handleCoGrouped(pair.smaller, recurse) + val both = mat.zip(mleft, mright) + mat.map(both) { case (l, r) => + CoGroupedPipe(Pair(pipeToCG(l), pipeToCG(r), pair.fn)) + } + } + widen(go(p)) + case wr @ WithReducers(_, _) => + def go[V1 <: V](wr: WithReducers[K, V1]): mat.TP[(K, V)] = { + val reds = wr.reds + mat.map(handleCoGrouped(wr.on, recurse)) { (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + ReduceStepPipe(ReduceStep.withReducers(rs, reds)) + case CoGroupedPipe(cg) => + CoGroupedPipe(WithReducers(cg, reds)) + case kvPipe => + ReduceStepPipe( + IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + .withReducers(reds) + ) + } + } + } + go(wr) + case wd @ WithDescription(_, _) 
=> + def go[V1 <: V](wd: WithDescription[K, V1]): mat.TP[(K, V)] = { + val desc = wd.description + mat.map(handleCoGrouped(wd.on, recurse)) { (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + ReduceStepPipe(ReduceStep.withDescription(rs, desc)) + case CoGroupedPipe(cg) => + CoGroupedPipe(WithDescription(cg, desc)) + case kvPipe => + kvPipe.withDescription(desc) + } + } + } + go(wd) + case fk @ CoGrouped.FilterKeys(_, _) => + def go[V1 <: V](fk: CoGrouped.FilterKeys[K, V1]): mat.TP[(K, V)] = { + val fn = fk.fn + mat.map(handleCoGrouped(fk.on, recurse)) { (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + val mapped = rs.mapped + val mappedF = TypedPipe.FilterKeys(mapped, fn) + ReduceStepPipe(ReduceStep.setInput(rs, mappedF)) + case CoGroupedPipe(cg) => + CoGroupedPipe(CoGrouped.FilterKeys(cg, fn)) + case kvPipe => + TypedPipe.FilterKeys(kvPipe, fn) + } + } + } + go(fk) + case mg @ MapGroup(_, _) => + def go[V1, V2 <: V](mg: MapGroup[K, V1, V2]): mat.TP[(K, V)] = { + val fn = mg.fn + mat.map(handleCoGrouped(mg.on, recurse)) { (tp: TypedPipe[(K, V1)]) => + tp match { + case ReduceStepPipe(rs) => + ReduceStepPipe(ReduceStep.mapGroup(rs)(fn)) + case CoGroupedPipe(cg) => + CoGroupedPipe(MapGroup(cg, fn)) + case kvPipe => + val rs = IdentityReduce[K, V1, V1](cg.keyOrdering, kvPipe, None, Nil, implicitly) + ReduceStepPipe(ReduceStep.mapGroup(rs)(fn)) + } + } + } + go(mg) + case step @ IdentityReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) // the widen trick sidesteps GADT bugs + case step @ UnsortedIdentityReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + case step @ IteratorMappedReduce(_, _, _, _, _) => + widen(handleReduceStep(step, recurse)) + } + } + + /** + * If cascading would consider the current pipe as a Logical reduce we can avoid some forces below. 
This + * method returns true if the pipe is ending on a reduce (not potentially a map-only job) + */ + @annotation.tailrec + def isLogicalReduce(tp: TypedPipe[Any]): Boolean = { + import TypedPipe._ + tp match { + case EmptyTypedPipe | IterablePipe(_) | SourcePipe(_) => false + case CounterPipe(a) => isLogicalReduce(a) + case cp @ CrossPipe(_, _) => isLogicalReduce(cp.viaHashJoin) + case cp @ CrossValue(_, _) => isLogicalReduce(cp.viaHashJoin) + case DebugPipe(p) => isLogicalReduce(p) + case FilterKeys(p, _) => isLogicalReduce(p) + case Filter(p, _) => isLogicalReduce(p) + case FlatMapValues(p, _) => isLogicalReduce(p) + case FlatMapped(p, _) => isLogicalReduce(p) + case ForceToDisk(_) => false // not reducers for sure, could be a map-only job + case Fork(_) => false // TODO, not super clear + case HashCoGroup(left, _, _) => isLogicalReduce(left) + case MapValues(p, _) => isLogicalReduce(p) + case Mapped(p, _) => isLogicalReduce(p) + case MergedTypedPipe(_, _) => false + case ReduceStepPipe(_) => true + case SumByLocalKeys(p, _) => isLogicalReduce(p) + case TrappedPipe(p, _) => isLogicalReduce(p) + case CoGroupedPipe(_) => true + case WithOnComplete(p, _) => isLogicalReduce(p) + case WithDescriptionTypedPipe(p, _) => isLogicalReduce(p) + } + } + + /** + * We use this state to track where we are as we recurse up the graph. Since we know at the very end we + * will write, we can avoid, for instance forcing a reduce operation that is followed only by a map and a + * write. + * + * Coupled with the isLogicalReduce above, we can emulate the behavior of the cascading planner as we + * recurse up. 
+ */ + sealed abstract class BelowState { + def |(that: BelowState): BelowState = + (this, that) match { + case (BelowState.Write, later) => later + case (BelowState.OnlyMapping, BelowState.Write) => BelowState.OnlyMapping + case (BelowState.OnlyMapping, mapOrMater) => mapOrMater + case (BelowState.Materialized, _) => BelowState.Materialized + } + } + object BelowState { + case object Write extends BelowState + case object OnlyMapping extends BelowState + case object Materialized extends BelowState + } + type P[a] = (TypedPipe[a], BelowState) + + /** + * Given a pipe, and the state below it, return the materialized version of that pipe. This should cause + * no more materializations than cascading would do, and indeed we test for this property + */ + val fn = Memoize.functionK[P, mat.TP](new Memoize.RecursiveK[P, mat.TP] { + import TypedPipe._ + import BelowState._ + + def toFunction[A] = { + case ((cp: CounterPipe[a], bs), rec) => + mat.map(rec((cp.pipe, bs)))(CounterPipe(_: TypedPipe[(a, Iterable[((String, String), Long)])])) + case ((c: CrossPipe[a, b], bs), rec) => + rec((c.viaHashJoin, bs)) + case ((cv @ CrossValue(_, _), bs), rec) => + rec((cv.viaHashJoin, bs)) + case ((p: DebugPipe[a], bs), rec) => + mat.map(rec((p.input, bs)))(DebugPipe(_: TypedPipe[a])) + case ((p: FilterKeys[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(FilterKeys(_: TypedPipe[(a, b)], p.fn)) + case ((p: Filter[a], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(Filter(_: TypedPipe[a], p.fn)) + case ((Fork(of), bs), rec) => + // Treat forks as forceToDisk after + // optimizations (which should have removed unneeded forks + rec((ForceToDisk(of), bs)) + case ((p: FlatMapValues[a, b, c], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(FlatMapValues(_: TypedPipe[(a, b)], p.fn)) + case ((p: FlatMapped[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(FlatMapped(_: TypedPipe[a], p.fn)) + case ((ForceToDisk(src @ IterablePipe(_)), bs), rec) => + 
// no need to put a checkpoint here: + rec((src, bs)) + case ((ForceToDisk(src @ SourcePipe(_)), bs), rec) => + // no need to put a checkpoint here: + rec((src, bs)) + case ((p: ForceToDisk[a], bs), rec) => + val newBs = + if (isLogicalReduce(p.input)) OnlyMapping + else Materialized + val matP = rec((p.input, newBs)) + bs match { + case Write => + // there is no need force to disk immediately before a write + matP + case _ => mat.materialize(matP) + } + case ((it @ IterablePipe(_), _), _) => + mat.pure(it) + case ((p: MapValues[a, b, c], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(MapValues(_: TypedPipe[(a, b)], p.fn)) + case ((p: Mapped[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(Mapped(_: TypedPipe[a], p.fn)) + case ((p: MergedTypedPipe[a], bs), rec) => + val mleft = rec((p.left, bs)) + val mright = rec((p.right, bs)) + val both = mat.zip(mleft, mright) + mat.map(both) { case (l, r) => MergedTypedPipe(l, r) } + case ((src @ SourcePipe(_), _), _) => + mat.pure(src) + case ((p: SumByLocalKeys[a, b], bs), rec) => + mat.map(rec((p.input, bs | OnlyMapping)))(SumByLocalKeys(_: TypedPipe[(a, b)], p.semigroup)) + case ((p: TrappedPipe[a], bs), rec) => + // TODO: it is a bit unclear if a trap is allowed on the back of a reduce? 
+ mat.map(rec((p.input, bs)))(TrappedPipe[a](_: TypedPipe[a], p.sink)) + case ((p: WithDescriptionTypedPipe[a], bs), rec) => + mat.map(rec((p.input, bs)))(WithDescriptionTypedPipe(_: TypedPipe[a], p.descriptions)) + case ((p: WithOnComplete[a], bs), rec) => + mat.map(rec((p.input, bs)))(WithOnComplete(_: TypedPipe[a], p.fn)) + case ((EmptyTypedPipe, _), _) => + mat.pure(EmptyTypedPipe) + case ((hg: HashCoGroup[a, b, c, d], bs), rec) => + val withBs = new FunctionK[TypedPipe, P] { + def toFunction[A] = { tp => (tp, bs | OnlyMapping) } + } + // TODO: hashJoins may not be allowed in a reduce step in cascading, + // not clear + val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) + handleHashCoGroup(hg, recHG) + case ((CoGroupedPipe(cg), bs), rec) => + val withBs = new FunctionK[TypedPipe, P] { + def toFunction[A] = { tp => (tp, bs | Materialized) } + } + // TODO: hashJoins may not be allowed in a reduce step in cascading, + // not clear + val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) + val hcg = handleCoGrouped(cg, recHG) + bs match { + case BelowState.Materialized => mat.materialize(hcg) + case _ => hcg + } + case ((ReduceStepPipe(rs), bs), rec) => + val withBs = new FunctionK[TypedPipe, P] { + def toFunction[A] = { tp => (tp, bs | BelowState.Materialized) } + } + // TODO: hashJoins may not be allowed in a reduce step in cascading, + // not clear + val recHG = FunctionK.andThen[TypedPipe, P, mat.TP](withBs, rec) + val hrs = handleReduceStep(rs, recHG) + bs match { + case BelowState.Materialized => mat.materialize(hrs) + case _ => hrs + } + } + }) + + def write[A](p: PairK[Id, S, A]): (M[TypedPipe[A]], S[A]) = { + val materialized: M[TypedPipe[A]] = fn((optDag.evaluate(p._1), BelowState.Write)) + (materialized, p._2) + } + + writeIds.map(write(_)) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala new file 
mode 100644 index 0000000000..c4350294ce --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/EqTypes.scala @@ -0,0 +1,35 @@ +package com.twitter.scalding.typed.functions + +/** + * This is a more powerful version of =:= that can allow us to remove casts and also not have any runtime cost + * for our function calls in some cases of trivial functions + */ +sealed abstract class EqTypes[A, B] extends java.io.Serializable { + def apply(a: A): B + def subst[F[_]](f: F[A]): F[B] + + final def reverse: EqTypes[B, A] = { + val aa = EqTypes.reflexive[A] + type F[T] = EqTypes[T, A] + subst[F](aa) + } + + def toEv: A =:= B = { + val aa = implicitly[A =:= A] + type F[T] = A =:= T + subst[F](aa) + } +} + +object EqTypes extends java.io.Serializable { + private[this] final case class ReflexiveEquality[A]() extends EqTypes[A, A] { + def apply(a: A): A = a + def subst[F[_]](f: F[A]): F[A] = f + } + + implicit def reflexive[A]: EqTypes[A, A] = ReflexiveEquality() + + def fromEv[A, B](ev: A =:= B): EqTypes[A, B] = // linter:disable:UnusedParameter + // in scala 2.13, this won't need a cast, but the cast is safe + reflexive[A].asInstanceOf[EqTypes[A, B]] +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala new file mode 100644 index 0000000000..de87cb2fe6 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMappedFn.scala @@ -0,0 +1,128 @@ +/* +Copyright 2013 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed.functions + +import java.io.Serializable + +/** + * This is a composition of one or more FlatMappings + * + * For some reason, this fails in scala 2.12 if this is an abstract class + */ +sealed trait FlatMappedFn[-A, +B] extends (A => TraversableOnce[B]) with Serializable { + import FlatMappedFn._ + + final def runAfter[Z](fn: FlatMapping[Z, A]): FlatMappedFn[Z, B] = this match { + case Single(FlatMapping.Identity(ev)) => + type F[T] = FlatMapping[Z, T] + Single(ev.subst[F](fn)) + case notId => + fn match { + case FlatMapping.Identity(ev) => + type F[T] = FlatMappedFn[T, B] + ev.reverse.subst[F](this) + case notIdFn => Series(notIdFn, notId) // only make a Series without either side being identity + } + } + + final def combine[C](next: FlatMappedFn[B, C]): FlatMappedFn[A, C] = { + /* + * We have to reassociate so the front of the series has the + * first flatmap, so we can bail out early when there are no more + * items in any flatMap result. 
+ */ + def loop[X, Y, Z](fn0: FlatMappedFn[X, Y], fn1: FlatMappedFn[Y, Z]): FlatMappedFn[X, Z] = + fn0 match { + case Single(FlatMapping.Identity(ev)) => + type F[T] = FlatMappedFn[T, Z] + ev.reverse.subst[F](fn1) + case Single(f0) => + Series(f0, fn1) + case Series(f0f, f1f) => + Series(f0f, loop(f1f, fn1)) + } + loop(this, next) + } + + /** + * We interpret this composition once to minimize pattern matching when we execute + */ + private[this] val toFn: A => TraversableOnce[B] = { + import FlatMapping._ + + def loop[A1, B1](fn: FlatMappedFn[A1, B1]): A1 => TraversableOnce[B1] = fn match { + case Single(Identity(ev)) => + val const: A1 => TraversableOnce[A1] = FlatMapFunctions.FromIdentity[A1]() + type F[T] = A1 => TraversableOnce[T] + ev.subst[F](const) + case Single(Filter(f, ev)) => + val filter: A1 => TraversableOnce[A1] = FlatMapFunctions.FromFilter(f) + type F[T] = A1 => TraversableOnce[T] + ev.subst[F](filter) + case Single(Map(f)) => FlatMapFunctions.FromMap(f) + case Single(FlatM(f)) => f + case Series(Identity(ev), rest) => + type F[T] = T => TraversableOnce[B1] + ev.subst[F](loop(rest)) + case Series(Filter(f, ev), rest) => + type F[T] = T => TraversableOnce[B1] + val next = ev.subst[F](loop(rest)) // linter:disable:UndesirableTypeInference + + FlatMapFunctions.FromFilterCompose(f, next) + case Series(Map(f), rest) => + val next = loop(rest) // linter:disable:UndesirableTypeInference + FlatMapFunctions.FromMapCompose(f, next) + case Series(FlatM(f), rest) => + val next = loop(rest) // linter:disable:UndesirableTypeInference + FlatMapFunctions.FromFlatMapCompose(f, next) + } + + loop(this) + } + + def apply(a: A): TraversableOnce[B] = toFn(a) +} + +object FlatMappedFn extends Serializable { + + def asId[A, B](f: FlatMappedFn[A, B]): Option[EqTypes[_ >: A, _ <: B]] = f match { + case Single(FlatMapping.Identity(ev)) => Some(ev) + case _ => None + } + + def asFilter[A, B](f: FlatMappedFn[A, B]): Option[(A => Boolean, EqTypes[(_ >: A), (_ <: B)])] = f match 
{ + case Single(filter @ FlatMapping.Filter(_, _)) => Some((filter.fn, filter.ev)) + case _ => None + } + + def apply[A, B](fn: A => TraversableOnce[B]): FlatMappedFn[A, B] = + fn match { + case fmf: FlatMappedFn[A, B] => fmf + case rawfn => Single(FlatMapping.FlatM(rawfn)) + } + + def identity[T]: FlatMappedFn[T, T] = Single(FlatMapping.Identity[T, T](EqTypes.reflexive[T])) + + def fromFilter[A](fn: A => Boolean): FlatMappedFn[A, A] = + Single(FlatMapping.Filter[A, A](fn, EqTypes.reflexive)) + + def fromMap[A, B](fn: A => B): FlatMappedFn[A, B] = + Single(FlatMapping.Map(fn)) + + final case class Single[A, B](fn: FlatMapping[A, B]) extends FlatMappedFn[A, B] + final case class Series[A, B, C](first: FlatMapping[A, B], next: FlatMappedFn[B, C]) + extends FlatMappedFn[A, C] +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala new file mode 100644 index 0000000000..8e11763268 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/FlatMapping.scala @@ -0,0 +1,21 @@ +package com.twitter.scalding.typed.functions + +import java.io.Serializable + +/** + * This is one of 4 core, non composed operations: identity filter map flatMap + */ +sealed abstract class FlatMapping[-A, +B] extends Serializable + +object FlatMapping extends Serializable { + def filter[A](fn: A => Boolean): FlatMapping[A, A] = + Filter[A, A](fn, implicitly) + + def filterKeys[K, V](fn: K => Boolean): FlatMapping[(K, V), (K, V)] = + filter[(K, V)](FilterKeysToFilter(fn)) + + final case class Identity[A, B](ev: EqTypes[A, B]) extends FlatMapping[A, B] + final case class Filter[A, B](fn: A => Boolean, ev: EqTypes[A, B]) extends FlatMapping[A, B] + final case class Map[A, B](fn: A => B) extends FlatMapping[A, B] + final case class FlatM[A, B](fn: A => TraversableOnce[B]) extends FlatMapping[A, B] +} diff --git 
a/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala new file mode 100644 index 0000000000..06d9df895e --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/Functions.scala @@ -0,0 +1,327 @@ +package com.twitter.scalding.typed.functions + +import com.twitter.algebird.{Aggregator, Fold, Ring, Semigroup} +import java.util.Random +import java.io.Serializable + +case class Constant[T](result: T) extends Function1[Any, T] { + def apply(a: Any) = result +} + +case class ConstantKey[K, V](key: K) extends Function1[V, (K, V)] { + def apply(v: V) = (key, v) +} + +case class DebugFn[A]() extends Function1[A, A] { + def apply(a: A) = { + println(a) + a + } +} + +case class WithConstant[A, B](constant: B) extends Function1[A, (A, B)] { + def apply(a: A) = (a, constant) +} + +case class MakeKey[K, V](fn: V => K) extends Function1[V, (K, V)] { + def apply(v: V) = (fn(v), v) +} + +case class MapOptionToFlatMap[A, B](fn: A => Option[B]) extends Function1[A, List[B]] { + def apply(a: A) = fn(a) match { + case None => Nil + case Some(a) => a :: Nil + } +} + +case class PartialFunctionToFilter[A, B](fn: PartialFunction[A, B]) extends Function1[A, Boolean] { + def apply(a: A) = fn.isDefinedAt(a) +} + +case class MapValueStream[A, B](fn: Iterator[A] => Iterator[B]) + extends Function2[Any, Iterator[A], Iterator[B]] { + def apply(k: Any, vs: Iterator[A]) = fn(vs) +} + +case class Drop[A](count: Int) extends Function1[Iterator[A], Iterator[A]] { + def apply(as: Iterator[A]) = as.drop(count) +} +case class DropWhile[A](fn: A => Boolean) extends Function1[Iterator[A], Iterator[A]] { + def apply(as: Iterator[A]) = as.dropWhile(fn) +} + +case class Take[A](count: Int) extends Function1[Iterator[A], Iterator[A]] { + def apply(as: Iterator[A]) = as.take(count) +} + +case class TakeWhile[A](fn: A => Boolean) extends Function1[Iterator[A], 
Iterator[A]] { + def apply(as: Iterator[A]) = as.takeWhile(fn) +} + +case class Identity[A, B](eqTypes: EqTypes[A, B]) extends Function1[A, B] { + def apply(a: A) = eqTypes(a) +} + +object Identity extends Serializable { + def apply[A](): Identity[A, A] = Identity[A, A](EqTypes.reflexive[A]) +} + +case class Widen[A, B](subTypes: SubTypes[A, B]) extends Function1[A, B] { + def apply(a: A) = subTypes(a) +} + +case class GetKey[K]() extends Function1[(K, Any), K] { + def apply(kv: (K, Any)) = kv._1 +} + +case class GetValue[V]() extends Function1[(Any, V), V] { + def apply(kv: (Any, V)) = kv._2 +} + +case class Swap[A, B]() extends Function1[(A, B), (B, A)] { + def apply(ab: (A, B)) = (ab._2, ab._1) +} + +case class SumAll[T](sg: Semigroup[T]) extends Function1[TraversableOnce[T], Iterator[T]] { + def apply(ts: TraversableOnce[T]) = sg.sumOption(ts).iterator +} + +case class Fill[A](size: Int) extends Function1[A, Iterator[A]] { + def apply(a: A) = Iterator.fill(size)(a) +} + +case class AggPrepare[A, B, C](agg: Aggregator[A, B, C]) extends Function1[A, B] { + def apply(a: A) = agg.prepare(a) +} + +case class AggPresent[A, B, C](agg: Aggregator[A, B, C]) extends Function1[B, C] { + def apply(a: B) = agg.present(a) +} + +case class FoldLeftIterator[A, B](init: B, fold: (B, A) => B) extends Function1[Iterator[A], Iterator[B]] { + def apply(as: Iterator[A]) = Iterator.single(as.foldLeft(init)(fold)) +} + +case class ScanLeftIterator[A, B](init: B, fold: (B, A) => B) extends Function1[Iterator[A], Iterator[B]] { + def apply(as: Iterator[A]) = as.scanLeft(init)(fold) +} + +case class FoldIterator[A, B](fold: Fold[A, B]) extends Function1[Iterator[A], Iterator[B]] { + def apply(as: Iterator[A]) = Iterator.single(fold.overTraversable(as)) +} + +case class FoldWithKeyIterator[K, A, B](foldfn: K => Fold[A, B]) + extends Function2[K, Iterator[A], Iterator[B]] { + def apply(k: K, as: Iterator[A]) = Iterator.single(foldfn(k).overTraversable(as)) +} + +case class AsRight[A, B]() 
extends Function1[B, Either[A, B]] { + def apply(b: B) = Right(b) +} + +case class AsLeft[A, B]() extends Function1[A, Either[A, B]] { + def apply(b: A) = Left(b) +} + +case class TuplizeFunction[A, B, C](fn: (A, B) => C) extends Function1[(A, B), C] { + def apply(ab: (A, B)) = fn(ab._1, ab._2) +} + +case class DropValue1[A, B, C]() extends Function1[(A, (B, C)), (A, C)] { + def apply(abc: (A, (B, C))) = (abc._1, abc._2._2) +} + +case class RandomNextInt(seed: Long, modulus: Int) extends Function1[Any, Int] { + private[this] lazy val rng = new Random(seed) + def apply(a: Any) = { + val raw = rng.nextInt(modulus) + a.hashCode() + val mod = raw % modulus + if (mod >= 0) mod else mod + modulus + } +} + +case class RandomFilter(seed: Long, fraction: Double) extends Function1[Any, Boolean] { + private[this] lazy val rng = new Random(seed) + def apply(a: Any) = rng.nextDouble < fraction +} + +case class Count[T](fn: T => Boolean) extends Function1[T, Long] { + def apply(t: T) = if (fn(t)) 1L else 0L +} + +case class SizeOfSet[T]() extends Function1[Set[T], Long] { + def apply(s: Set[T]) = s.size.toLong +} + +case class HeadSemigroup[T]() extends Semigroup[T] { + def plus(a: T, b: T) = a + // Don't enumerate every item, just take the first + override def sumOption(to: TraversableOnce[T]): Option[T] = + if (to.isEmpty) None + else Some(to.toIterator.next) +} + +case class SemigroupFromFn[T](fn: (T, T) => T) extends Semigroup[T] { + def plus(a: T, b: T) = fn(a, b) +} + +case class SemigroupFromProduct[T](ring: Ring[T]) extends Semigroup[T] { + def plus(a: T, b: T) = ring.times(a, b) +} + +/** + * This is a semigroup that throws IllegalArgumentException if there is more than one item. This is used to + * trigger optimizations where the user knows there is at most one value per key. 
+ */ +case class RequireSingleSemigroup[T]() extends Semigroup[T] { + def plus(a: T, b: T) = throw new IllegalArgumentException(s"expected only one item, calling plus($a, $b)") +} + +case class ConsList[T]() extends Function1[(T, List[T]), List[T]] { + def apply(results: (T, List[T])) = results._1 :: results._2 +} + +case class ReverseList[T]() extends Function1[List[T], List[T]] { + def apply(results: List[T]) = results.reverse +} + +case class ToList[A]() extends Function1[Iterator[A], Iterator[List[A]]] { + def apply(as: Iterator[A]) = + // This should never really happen, but we are being defensive + if (as.isEmpty) Iterator.empty + else Iterator.single(as.toList) +} + +case class ToSet[A]() extends Function1[A, Set[A]] { + // this allows us to access Set1 without boxing into varargs + private[this] val empty = Set.empty[A] + def apply(a: A) = empty + a +} + +case class MaxOrd[A, B >: A](ord: Ordering[B]) extends Function2[A, A, A] { + def apply(a1: A, a2: A) = + if (ord.lt(a1, a2)) a2 else a1 +} + +case class MaxOrdBy[A, B](fn: A => B, ord: Ordering[B]) extends Function2[A, A, A] { + def apply(a1: A, a2: A) = + if (ord.lt(fn(a1), fn(a2))) a2 else a1 +} + +case class MinOrd[A, B >: A](ord: Ordering[B]) extends Function2[A, A, A] { + def apply(a1: A, a2: A) = + if (ord.lt(a1, a2)) a1 else a2 +} + +case class MinOrdBy[A, B](fn: A => B, ord: Ordering[B]) extends Function2[A, A, A] { + def apply(a1: A, a2: A) = + if (ord.lt(fn(a1), fn(a2))) a1 else a2 +} + +case class FilterKeysToFilter[K](fn: K => Boolean) extends Function1[(K, Any), Boolean] { + def apply(kv: (K, Any)) = fn(kv._1) +} + +case class FlatMapValuesToFlatMap[K, A, B](fn: A => TraversableOnce[B]) + extends Function1[(K, A), TraversableOnce[(K, B)]] { + def apply(ka: (K, A)) = { + val k = ka._1 + fn(ka._2).map((k, _)) + } +} + +case class MergeFlatMaps[A, B](fns: Iterable[A => TraversableOnce[B]]) + extends Function1[A, TraversableOnce[B]] { + def apply(a: A) = fns.iterator.flatMap(fn => fn(a)) +} + 
+case class MapValuesToMap[K, A, B](fn: A => B) extends Function1[(K, A), (K, B)] { + def apply(ka: (K, A)) = (ka._1, fn(ka._2)) +} + +case class EmptyGuard[K, A, B](fn: (K, Iterator[A]) => Iterator[B]) + extends Function2[K, Iterator[A], Iterator[B]] { + def apply(k: K, as: Iterator[A]) = + if (as.nonEmpty) fn(k, as) else Iterator.empty +} + +case class FilterGroup[A, B](fn: ((A, B)) => Boolean) extends Function2[A, Iterator[B], Iterator[B]] { + def apply(a: A, bs: Iterator[B]) = bs.filter(fn(a, _)) +} + +case class MapGroupMapValues[A, B, C](fn: B => C) extends Function2[A, Iterator[B], Iterator[C]] { + def apply(a: A, bs: Iterator[B]) = bs.map(fn) +} + +case class MapGroupFlatMapValues[A, B, C](fn: B => TraversableOnce[C]) + extends Function2[A, Iterator[B], Iterator[C]] { + def apply(a: A, bs: Iterator[B]) = bs.flatMap(fn) +} + +object FlatMapFunctions extends Serializable { + case class FromIdentity[A]() extends Function1[A, Iterator[A]] { + def apply(a: A) = Iterator.single(a) + } + case class FromFilter[A](fn: A => Boolean) extends Function1[A, Iterator[A]] { + def apply(a: A) = if (fn(a)) Iterator.single(a) else Iterator.empty + } + case class FromMap[A, B](fn: A => B) extends Function1[A, Iterator[B]] { + def apply(a: A) = Iterator.single(fn(a)) + } + case class FromFilterCompose[A, B](fn: A => Boolean, next: A => TraversableOnce[B]) + extends Function1[A, TraversableOnce[B]] { + def apply(a: A) = if (fn(a)) next(a) else Iterator.empty + } + case class FromMapCompose[A, B, C](fn: A => B, next: B => TraversableOnce[C]) + extends Function1[A, TraversableOnce[C]] { + def apply(a: A) = next(fn(a)) + } + case class FromFlatMapCompose[A, B, C](fn: A => TraversableOnce[B], next: B => TraversableOnce[C]) + extends Function1[A, TraversableOnce[C]] { + def apply(a: A) = fn(a).flatMap(next) + } +} + +object ComposedFunctions extends Serializable { + + case class ComposedMapFn[A, B, C](fn0: A => B, fn1: B => C) extends Function1[A, C] { + def apply(a: A) = fn1(fn0(a)) 
+ } + case class ComposedFilterFn[-A](fn0: A => Boolean, fn1: A => Boolean) extends Function1[A, Boolean] { + def apply(a: A) = fn0(a) && fn1(a) + } + + /** + * This is only called at the end of a task, so might as well make it stack safe since a little extra + * runtime cost won't matter + */ + case class ComposedOnComplete(fn0: () => Unit, fn1: () => Unit) extends Function0[Unit] { + def apply(): Unit = { + @annotation.tailrec + def loop(fn: () => Unit, stack: List[() => Unit]): Unit = + fn match { + case ComposedOnComplete(left, right) => loop(left, right :: stack) + case notComposed => + notComposed() + stack match { + case h :: tail => loop(h, tail) + case Nil => () + } + } + + loop(fn0, List(fn1)) + } + } + + case class ComposedMapGroup[A, B, C, D]( + f: (A, Iterator[B]) => Iterator[C], + g: (A, Iterator[C]) => Iterator[D] + ) extends Function2[A, Iterator[B], Iterator[D]] { + + def apply(a: A, bs: Iterator[B]) = { + val cs = f(a, bs) + if (cs.nonEmpty) g(a, cs) + else Iterator.empty + } + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/ScaldingPriorityQueueMonoid.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/ScaldingPriorityQueueMonoid.scala new file mode 100644 index 0000000000..0894ce8991 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/ScaldingPriorityQueueMonoid.scala @@ -0,0 +1,8 @@ +package com.twitter.scalding.typed.functions + +import com.twitter.algebird.mutable.PriorityQueueMonoid + +class ScaldingPriorityQueueMonoid[K]( + val count: Int +)(implicit val ordering: Ordering[K]) + extends PriorityQueueMonoid[K](count)(ordering) diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala new file mode 100644 index 0000000000..a3dd42471a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/functions/SubTypes.scala @@ 
-0,0 +1,63 @@ +package com.twitter.scalding.typed.functions + +/** + * This is a more powerful version of <:< that can allow us to remove casts and also not have any runtime cost + * for our function calls in some cases of trivial functions + */ +sealed abstract class SubTypes[-A, +B] extends java.io.Serializable { + def apply(a: A): B + def subst[F[-_]](f: F[B]): F[A] + + def toEv: A <:< B = { + val aa = implicitly[B <:< B] + type F[-T] = T <:< B + subst[F](aa) + } + + def liftCo[F[+_]]: SubTypes[F[A], F[B]] = { + type G[-T] = SubTypes[F[T], F[B]] + subst[G](SubTypes.fromSubType[F[B], F[B]]) + } + + /** + * create a new evidence for a contravariant type F[_] + */ + def liftContra[F[-_]]: SubTypes[F[B], F[A]] = { + type G[-T] = SubTypes[F[B], F[T]] + subst[G](SubTypes.fromSubType[F[B], F[B]]) + } +} + +object SubTypes extends java.io.Serializable { + private[this] final case class ReflexiveSubTypes[A]() extends SubTypes[A, A] { + def apply(a: A): A = a + def subst[F[-_]](f: F[A]): F[A] = f + } + + implicit def fromSubType[A, B >: A]: SubTypes[A, B] = ReflexiveSubTypes[A]() + + def fromEv[A, B](ev: A <:< B): SubTypes[A, B] = // linter:disable:UnusedParameter + // in scala 2.13, this won't need a cast, but the cast is safe + fromSubType[A, A].asInstanceOf[SubTypes[A, B]] + + def tuple2_1[A, B, C](implicit ev: SubTypes[A, B]): SubTypes[(A, C), (B, C)] = { + // This is a bit complex, but it is a proof that this + // is safe that does not use casting + type Pair[-T] = SubTypes[(T, C), (B, C)] + val idPair: Pair[B] = SubTypes.fromSubType[(B, C), (B, C)] + ev.subst[Pair](idPair) + } + + def tuple2_2[A, B, C](implicit ev: SubTypes[B, C]): SubTypes[(A, B), (A, C)] = { + // This is a bit complex, but it is a proof that this + // is safe that does not use casting + type Pair[-T] = SubTypes[(A, T), (A, C)] + val idPair: Pair[C] = SubTypes.fromSubType[(A, C), (A, C)] + ev.subst[Pair](idPair) + } + + def compose[A, B, C](sub0: SubTypes[A, B], sub1: SubTypes[B, C]): SubTypes[A, 
C] = { + type SubC[-X] = SubTypes[X, C] + sub0.subst[SubC](sub1) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala new file mode 100644 index 0000000000..80c1a605a0 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/AtomicBox.scala @@ -0,0 +1,34 @@ +package com.twitter.scalding.typed.memory_backend + +import java.util.concurrent.atomic.AtomicReference + +class AtomicBox[T <: AnyRef](init: T) { + private[this] val ref = new AtomicReference[T](init) + + def lazySet(t: T): Unit = + ref.lazySet(t) + + def set(t: T): Unit = + ref.set(t) + + def swap(t: T): T = + ref.getAndSet(t) + + /** + * use a pure function to update the state. fn may be called more than once + */ + def update[R](fn: T => (T, R)): R = { + + @annotation.tailrec + def loop(): R = { + val init = ref.get + val (next, res) = fn(init) + if (ref.compareAndSet(init, next)) res + else loop() + } + + loop() + } + + def get(): T = ref.get +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala new file mode 100644 index 0000000000..cc40f78943 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryBackend.scala @@ -0,0 +1,91 @@ +package com.twitter.scalding.typed.memory_backend + +import com.twitter.scalding.typed._ +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} +import scala.util.{Failure, Success} + +trait MemorySource[A] { + def read()(implicit ec: ConcurrentExecutionContext): Future[Iterator[A]] +} + +object MemorySource { + case class FromIterable[A](iter: Iterable[A]) extends MemorySource[A] { + def read()(implicit ec: ConcurrentExecutionContext) = Future.successful(iter.iterator) + } + case class 
Fn[A](toFn: ConcurrentExecutionContext => Future[Iterator[A]]) extends MemorySource[A] { + def read()(implicit ec: ConcurrentExecutionContext) = toFn(ec) + } + + def readOption[T](optSrc: Option[MemorySource[T]], name: String)(implicit + ec: ConcurrentExecutionContext + ): Future[Iterator[T]] = + optSrc match { + case Some(src) => src.read() + case None => + Future.failed( + new Exception(s"Source: $name not wired. Please provide an input with MemoryMode.addSource") + ) + } + +} + +trait MemorySink[A] { + def write(data: Iterable[A])(implicit ec: ConcurrentExecutionContext): Future[Unit] +} + +object MemorySink { + + /** + * This is a sink that writes into local memory which you can read out by a future + * + * this needs to be reset between each write (so it only works for a single write per Execution) + */ + class LocalVar[A] extends MemorySink[A] { + private[this] val box: AtomicBox[Promise[Iterable[A]]] = new AtomicBox(Promise[Iterable[A]]()) + + /** + * This is a future that completes when a write comes. If no write happens before a reset, the future + * fails + */ + def read(): Future[Iterable[A]] = box.get().future + + /** + * This takes the current future and resets the promise making it safe for another write. 
+ */ + def reset(): Option[Iterable[A]] = { + val current = box.swap(Promise[Iterable[A]]()) + // if the promise is not set, it never will be, so + // go ahead and poll now + // + // also note we never set this future to failed + current.future.value match { + case Some(Success(res)) => + Some(res) + case Some(Failure(err)) => + throw new IllegalStateException( + "We should never reach this because, we only complete with failure below", + err + ) + case None => + // make sure we complete the original future so readers don't block forever + current.failure(new Exception(s"sink never written to before reset() called $this")) + None + } + } + + def write(data: Iterable[A])(implicit ec: ConcurrentExecutionContext): Future[Unit] = + Future { + box.update(p => (p.success(data), ())) + } + } +} + +/** + * These are just used as type markers which are connected to inputs via the MemoryMode + */ +case class SourceT[T](ident: String) extends Input[T] + +/** + * These are just used as type markers which are connected to outputs via the MemoryMode + */ +case class SinkT[T](indent: String) extends Output[T] diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala new file mode 100644 index 0000000000..7bef91863a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryMode.scala @@ -0,0 +1,58 @@ +package com.twitter.scalding.typed.memory_backend + +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} +import com.twitter.scalding.{Execution, Mode} +import com.twitter.scalding.typed._ +import Execution.Writer + +final case class MemoryMode(srcs: Resolver[Input, MemorySource], sinks: Resolver[Output, MemorySink]) + extends Mode { + + def newWriter(): Writer = + new MemoryWriter(this) + + /** + * Add a new source resolver whose sources take precedence over any currently registered 
sources + */ + def addSourceResolver(res: Resolver[Input, MemorySource]): MemoryMode = + MemoryMode(res.orElse(srcs), sinks) + + def addSource[T](src: Input[T], ts: MemorySource[T]): MemoryMode = + addSourceResolver(Resolver.pair(src, ts)) + + def addSourceFn[T](src: Input[T])(fn: ConcurrentExecutionContext => Future[Iterator[T]]): MemoryMode = + addSource(src, MemorySource.Fn(fn)) + + def addSourceIterable[T](src: Input[T], iter: Iterable[T]): MemoryMode = + addSource(src, MemorySource.FromIterable(iter)) + + /** + * Add a new sink resolver whose sinks take precedence over any currently registered sinks + */ + def addSinkResolver(res: Resolver[Output, MemorySink]): MemoryMode = + MemoryMode(srcs, res.orElse(sinks)) + + def addSink[T](sink: Output[T], msink: MemorySink[T]): MemoryMode = + addSinkResolver(Resolver.pair(sink, msink)) + + /** + * This has a side effect of mutating the corresponding MemorySink + */ + def writeSink[T](t: Output[T], iter: Iterable[T])(implicit + ec: ConcurrentExecutionContext + ): Future[Unit] = + sinks(t) match { + case Some(sink) => sink.write(iter) + case None => + Future.failed( + new Exception( + s"missing sink for $t, with first 10 values to write: ${iter.take(10).toList.toString}..." 
+ ) + ) + } +} + +object MemoryMode { + def empty: MemoryMode = + apply(Resolver.empty[Input, MemorySource], Resolver.empty[Output, MemorySink]) +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala new file mode 100644 index 0000000000..b1c2201bc5 --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryPlanner.scala @@ -0,0 +1,182 @@ +package com.twitter.scalding.typed.memory_backend + +import scala.collection.mutable.ArrayBuffer +import com.twitter.scalding.dagon.{FunctionK, Memoize} +import com.twitter.scalding.typed._ +import com.twitter.scalding.Config + +object MemoryPlanner { + + /** + * This builds an new memoizing planner that reads from the given MemoryMode. + * + * Note, this assumes all forks are made explicit in the graph, so it is up to any caller to make sure that + * optimization rule has first been applied + */ + def planner(conf: Config, srcs: Resolver[Input, MemorySource]): FunctionK[TypedPipe, Op] = + Memoize.functionK(new Memoize.RecursiveK[TypedPipe, Op] { + import TypedPipe._ + + def toFunction[T] = { + case (CounterPipe(pipe), rec) => + // TODO: counters not yet supported, but can be with an concurrent hashmap + rec(pipe.map(_._1)) + case (cp @ CrossPipe(_, _), rec) => + rec(cp.viaHashJoin) + case (CrossValue(left, EmptyValue), _) => Op.empty + case (CrossValue(left, LiteralValue(v)), rec) => + val op = rec(left) // linter:disable:UndesirableTypeInference + op.map((_, v)) + case (CrossValue(left, ComputedValue(right)), rec) => + rec(CrossPipe(left, right)) + case (DebugPipe(p), rec) => + // There is really little that can be done here but println + rec(p.map { t => println(t); t }) + + case (EmptyTypedPipe, _) => + // just use an empty iterable pipe. 
+ Op.empty[T] + + case (fk @ FilterKeys(_, _), rec) => + def go[K, V](node: FilterKeys[K, V]): Op[(K, V)] = { + val FilterKeys(pipe, fn) = node + rec(pipe).concatMap { case (k, v) => + if (fn(k)) { (k, v) :: Nil } + else Nil + } + } + go(fk) + + case (f @ Filter(_, _), rec) => + def go[T](f: Filter[T]): Op[T] = { + val Filter(p, fn) = f + rec(p).filter(fn) + } + go(f) + + case (f @ FlatMapValues(_, _), rec) => + def go[K, V, U](node: FlatMapValues[K, V, U]) = { + val fn = node.fn + rec(node.input).concatMap { case (k, v) => fn(v).map((k, _)) } + } + + go(f) + + case (FlatMapped(prev, fn), rec) => + rec(prev).concatMap(fn) // linter:disable:UndesirableTypeInference + + case (ForceToDisk(pipe), rec) => + rec(pipe).materialize + + case (Fork(pipe), rec) => + rec(pipe).materialize + + case (IterablePipe(iterable), _) => + Op.source(iterable) + + case (f @ MapValues(_, _), rec) => + def go[K, V, U](node: MapValues[K, V, U]) = { + val mvfn = node.fn + rec(node.input).map { case (k, v) => (k, mvfn(v)) } + } + + go(f) + + case (Mapped(input, fn), rec) => + rec(input).map(fn) // linter:disable:UndesirableTypeInference + + case (MergedTypedPipe(left, right), rec) => + Op.Concat(rec(left), rec(right)) + + case (SourcePipe(src), _) => + val optsrc = srcs(src) + Op.Source(cec => MemorySource.readOption(optsrc, src.toString)(cec)) + + case (slk @ SumByLocalKeys(_, _), rec) => + def sum[K, V](sblk: SumByLocalKeys[K, V]) = { + val SumByLocalKeys(p, sg) = sblk + + rec(p).transform[(K, V), (K, V)] { kvs => + val map = collection.mutable.Map.empty[K, V] + val iter = kvs.iterator + while (iter.hasNext) { + val (k, v) = iter.next + map(k) = map.get(k) match { + case None => v + case Some(v1) => sg.plus(v1, v) + } + } + val res = new ArrayBuffer[(K, V)](map.size) + map.foreach(res += _) + res + } + } + sum(slk) + case (TrappedPipe(input, _), rec) => + // this can be interpretted as catching any exception + // on the map-phase until the next partition, so it can + // be made to work by 
changing Op to return all + // the values that fail on error + rec(input) + + case (WithDescriptionTypedPipe(pipe, descriptions), rec) => + // TODO we could optionally print out the descriptions + // after the future completes + rec(pipe) + + case (WithOnComplete(pipe, fn), rec) => + Op.OnComplete(rec(pipe), fn) + + case (hcg @ HashCoGroup(_, _, _), rec) => + def go[K, V1, V2, R](hcg: HashCoGroup[K, V1, V2, R]) = { + val leftOp = rec(hcg.left) + val rightOp = rec(ReduceStepPipe(HashJoinable.toReduceStep(hcg.right))) + Op.Join[(K, V1), (K, V2), (K, R)]( + leftOp, + rightOp, + { (v1s, v2s) => + val kv2 = v2s.groupBy(_._1) + val result = new ArrayBuffer[(K, R)]() + v1s.foreach { case (k, v1) => + val v2 = kv2.getOrElse(k, Nil).map(_._2) + result ++= hcg.joiner(k, v1, v2).map((k, _)) + } + result + } + ) + } + go(hcg) + + case (CoGroupedPipe(cg), rec) => + def go[K, V](cg: CoGrouped[K, V]) = + Op.BulkJoin(cg.inputs.map(rec(_)), cg.joinFunction) + go(cg) + + case (ReduceStepPipe(ir @ IdentityReduce(_, _, _, descriptions, _)), rec) => + def go[K, V1, V2](ir: IdentityReduce[K, V1, V2]): Op[(K, V2)] = { + type OpT[V] = Op[(K, V)] + val op = rec(ir.mapped) + ir.evidence.subst[OpT](op) + } + go(ir) + case (ReduceStepPipe(uir @ UnsortedIdentityReduce(_, _, _, descriptions, _)), rec) => + def go[K, V1, V2](uir: UnsortedIdentityReduce[K, V1, V2]): Op[(K, V2)] = { + type OpT[V] = Op[(K, V)] + val op = rec(uir.mapped) + uir.evidence.subst[OpT](op) + } + go(uir) + case (ReduceStepPipe(IdentityValueSortedReduce(_, pipe, ord, _, _, _)), rec) => + def go[K, V](p: TypedPipe[(K, V)], ord: Ordering[V]) = { + val op = rec(p) + Op.Reduce[K, V, V](op, (k, vs) => vs, Some(ord)) + } + go(pipe, ord) + case (ReduceStepPipe(ValueSortedReduce(_, pipe, ord, fn, _, _)), rec) => + Op.Reduce(rec(pipe), fn, Some(ord)) + case (ReduceStepPipe(IteratorMappedReduce(_, pipe, fn, _, _)), rec) => + Op.Reduce(rec(pipe), fn, None) + } + }) + +} diff --git 
a/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala new file mode 100644 index 0000000000..8b1ad1229b --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/MemoryWriter.scala @@ -0,0 +1,143 @@ +package com.twitter.scalding.typed.memory_backend + +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} +import com.twitter.scalding.dagon.{HMap, Rule} +import com.twitter.scalding.typed._ +import com.twitter.scalding.{CFuture, Config, Execution, ExecutionCounters} +import Execution.{ToWrite, Writer} + +/** + * This is the state of a single outer Execution execution running in memory mode + */ +class MemoryWriter(mem: MemoryMode) extends Writer { + + def start(): Unit = () + + /** + * This is called by an Execution to end processing + */ + def finished(): Unit = () + + private[this] case class State(id: Long, forced: HMap[TypedPipe, ({ type F[T] = Future[Iterable[T]] })#F]) { + + def simplifiedForce[A](t: TypedPipe[A], it: Future[Iterable[A]]): State = + copy(forced = forced.updated(t, it)) + } + + private[this] val state = + new AtomicBox[State](State(0, HMap.empty[TypedPipe, ({ type F[T] = Future[Iterable[T]] })#F])) + + /** + * do a batch of writes, possibly optimizing, and return a new unique Long. 
+ * + * empty writes are legitimate and should still return a Long + */ + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ConcurrentExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { + + val planner = MemoryPlanner.planner(conf, mem.srcs) + + type Action = () => Future[Unit] + import Execution.ToWrite._ + + val phases: Seq[Rule[TypedPipe]] = + OptimizationRules.standardMapReduceRules // probably want to tweak this + + val optimizedWrites = ToWrite.optimizeWriteBatch(writes, phases) + + def force[T](p: TypedPipe[T], keyPipe: TypedPipe[T], oldState: State): (State, Action) = { + val pipePromise = Promise[Iterable[T]]() + val action = () => { + val op = planner(p) + val arrayBufferF = op.result + pipePromise.completeWith(arrayBufferF) + + arrayBufferF.map(_ => ()) + } + (oldState.copy(forced = oldState.forced.updated(keyPipe, pipePromise.future)), action) + } + + /** + * TODO If we have a typed pipe rooted twice, it is not clear it has fanout. If it does not we will not + * materialize it, so both branches can't own it. 
Since we only emit Iterable out, this may be okay + * because no external readers can modify, but worth thinking of + */ + val idActs: (Long, List[Action]) = state.update { s => + val (nextState, acts) = optimizedWrites.foldLeft((s, List.empty[Action])) { + case (old @ (state, acts), write) => + write match { + case OptimizedWrite(pipe, Force(opt)) => + if (state.forced.contains(opt)) old + else { + val (st, a) = force(opt, pipe, state) + (st, a :: acts) + } + case OptimizedWrite(pipe, ToIterable(opt)) => + opt match { + case TypedPipe.EmptyTypedPipe => + (state.simplifiedForce(pipe, Future.successful(Nil)), acts) + case TypedPipe.IterablePipe(i) => + (state.simplifiedForce(pipe, Future.successful(i)), acts) + case TypedPipe.SourcePipe(src) => + val fut = getSource(src) + (state.simplifiedForce(pipe, fut), acts) + case other if state.forced.contains(opt) => old + case other => + val (st, a) = force(opt, pipe, state) + (st, a :: acts) + } + case OptimizedWrite(pipe, ToWrite.SimpleWrite(opt, sink)) => + state.forced.get(opt) match { + case Some(iterf) => + val action = () => { + iterf.flatMap(mem.writeSink(sink, _)) + } + (state, action :: acts) + case None => + val op = planner(opt) // linter:disable:UndesirableTypeInference + val action = () => { + val arrayBufferF = op.result + arrayBufferF.flatMap(mem.writeSink(sink, _)) + } + (state, action :: acts) + } + } + } + (nextState.copy(id = nextState.id + 1), (nextState.id, acts)) + } + val (id, acts) = idActs + // now we run the actions: + val fut = Future.traverse(acts)(fn => fn()).map(_ => (id, ExecutionCounters.empty)) + // wrap the future in a CFuture -- this is uncancellable in memory mode + CFuture.uncancellable(fut) + } + + /** + * This should only be called after a call to execute + */ + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[TypedPipe[T]] = + state.get.forced.get(initial) match { + case None => Future.failed(new Exception(s"$initial not 
forced")) + case Some(f) => f.map(TypedPipe.from(_)) + } + + private def getSource[A](src: Input[A])(implicit + cec: ConcurrentExecutionContext + ): Future[Iterable[A]] = + MemorySource.readOption(mem.srcs(src), src.toString).map(_.toList) + + /** + * This should only be called after a call to execute + */ + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[Iterable[T]] = initial match { + case TypedPipe.EmptyTypedPipe => Future.successful(Nil) + case TypedPipe.IterablePipe(iter) => Future.successful(iter) + case TypedPipe.SourcePipe(src) => getSource(src) + case other => getForced(conf, other).flatMap(getIterable(conf, _)) + } +} diff --git a/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala new file mode 100644 index 0000000000..f6a645508a --- /dev/null +++ b/scalding-base/src/main/scala/com/twitter/scalding/typed/memory_backend/Op.scala @@ -0,0 +1,200 @@ +package com.twitter.scalding.typed.memory_backend + +import com.twitter.scalding.typed._ +import java.util.{ArrayList, Collections} +import scala.collection.JavaConverters._ +import scala.collection.mutable.{ArrayBuffer, Map => MMap} +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} + +sealed trait Op[+O] { + def result(implicit cec: ConcurrentExecutionContext): Future[ArrayBuffer[_ <: O]] + + def concatMap[O1](fn: O => TraversableOnce[O1]): Op[O1] = + transform { in: IndexedSeq[O] => + val res = ArrayBuffer[O1]() + val it = in.iterator + while (it.hasNext) { + val i = it.next + fn(i).foreach(res += _) + } + res + } + + def map[O1](fn: O => O1): Op[O1] = + Op.MapOp(this, fn) + + def filter(fn: O => Boolean): Op[O] = + Op.Filter(this, fn) + + def transform[O1 >: O, O2](fn: IndexedSeq[O1] => ArrayBuffer[O2]): Op[O2] = + Op.Transform[O1, O2](this, fn) + + def materialize: Op[O] = + Op.Materialize(this) 
+} +object Op { + def source[I](i: Iterable[I]): Op[I] = Source(_ => Future.successful(i.iterator)) + def empty[I]: Op[I] = source(Nil) + + final case class Source[I](input: ConcurrentExecutionContext => Future[Iterator[I]]) extends Op[I] { + + def result(implicit cec: ConcurrentExecutionContext): Future[ArrayBuffer[I]] = + input(cec).map(ArrayBuffer.empty[I] ++= _) + } + + // Here we need to make a copy on each result + final case class Materialize[O](op: Op[O]) extends Op[O] { + private[this] val promiseBox: AtomicBox[Option[Promise[ArrayBuffer[_ <: O]]]] = new AtomicBox(None) + + def result(implicit cec: ConcurrentExecutionContext) = { + val either = promiseBox.update { + case None => + val promise = Promise[ArrayBuffer[_ <: O]]() + (Some(promise), Right(promise)) + case s @ Some(promise) => + (s, Left(promise)) + } + + val fut = either match { + case Right(promise) => + // This is the one case where we call the op + promise.completeWith(op.result) + promise.future + case Left(promise) => + // we already started the previous work + promise.future + } + fut.map(ArrayBuffer.concat(_)) + } + } + + final case class Concat[O](left: Op[O], right: Op[O]) extends Op[O] { + def result(implicit cec: ConcurrentExecutionContext) = { + val f1 = left.result + val f2 = right.result + f1.zip(f2).map { case (l, r) => + if (l.size > r.size) l.asInstanceOf[ArrayBuffer[O]] ++= r + else r.asInstanceOf[ArrayBuffer[O]] ++= l + } + } + } + + // We reuse the input on map + final case class MapOp[I, O](input: Op[I], fn: I => O) extends Op[O] { + def result(implicit cec: ConcurrentExecutionContext): Future[ArrayBuffer[O]] = + input.result.map { array => + val res: ArrayBuffer[O] = array.asInstanceOf[ArrayBuffer[O]] + var pos = 0 + while (pos < array.length) { + res.update(pos, fn(array(pos))) + pos = pos + 1 + } + res + } + } + // We reuse the input on filter + final case class Filter[I](input: Op[I], fn: I => Boolean) extends Op[I] { + def result(implicit cec: 
ConcurrentExecutionContext): Future[ArrayBuffer[I]] = + input.result.map { array0 => + val array = array0.asInstanceOf[ArrayBuffer[I]] + var pos = 0 + var writePos = 0 + while (pos < array.length) { + val item = array(pos) + if (fn(item)) { + array(writePos) = item + writePos = writePos + 1 + } + pos = pos + 1 + } + // trim the tail off + array.remove(writePos, array.length - writePos) + array + } + } + + final case class OnComplete[O](of: Op[O], fn: () => Unit) extends Op[O] { + def result(implicit cec: ConcurrentExecutionContext) = { + val res = of.result + res.onComplete(_ => fn()) + res + } + } + + final case class Transform[I, O](input: Op[I], fn: IndexedSeq[I] => ArrayBuffer[O]) extends Op[O] { + def result(implicit cec: ConcurrentExecutionContext) = + input.result.map(fn) + } + + final case class Reduce[K, V1, V2]( + input: Op[(K, V1)], + fn: (K, Iterator[V1]) => Iterator[V2], + ord: Option[Ordering[V1]] + ) extends Op[(K, V2)] { + + def result(implicit cec: ConcurrentExecutionContext): Future[ArrayBuffer[(K, V2)]] = + input.result.map { kvs => + val valuesByKey = MMap[K, ArrayList[V1]]() + def add(kv: (K, V1)): Unit = { + val vs = valuesByKey.getOrElseUpdate(kv._1, new ArrayList[V1]()) + vs.add(kv._2) + } + kvs.foreach(add) + + /* + * This portion could be parallelized for each key, or we could split + * the keys into as many groups as there are CPUs and process that way + */ + val res = ArrayBuffer[(K, V2)]() + valuesByKey.foreach { case (k, vs) => + ord.foreach(Collections.sort[V1](vs, _)) + val v2iter = fn(k, vs.iterator.asScala) + while (v2iter.hasNext) { + res += ((k, v2iter.next)) + } + } + res + } + } + + final case class Join[A, B, C](opA: Op[A], opB: Op[B], fn: (IndexedSeq[A], IndexedSeq[B]) => ArrayBuffer[C]) + extends Op[C] { + + def result(implicit cec: ConcurrentExecutionContext) = { + // start both futures in parallel + val f1 = opA.result + val f2 = opB.result + f1.zip(f2).map { case (a, b) => fn(a, b) } + } + } + + final case class 
BulkJoin[K, A](ops: List[Op[(K, Any)]], joinF: MultiJoinFunction[K, A]) + extends Op[(K, A)] { + def result(implicit cec: ConcurrentExecutionContext) = + Future + .traverse(ops)(_.result) + .map { items => + // TODO this is not by any means optimal. + // we could copy into arrays then sort by key and iterate + // each through in K sorted order + val maps: List[Map[K, Iterable[(K, Any)]]] = items.map { kvs => + val kvMap: Map[K, Iterable[(K, Any)]] = kvs.groupBy(_._1) + kvMap + } + + val allKeys = maps.iterator.flatMap(_.keys.iterator).toSet + val result = ArrayBuffer[(K, A)]() + allKeys.foreach { k => + maps.map(_.getOrElse(k, Nil)) match { + case h :: tail => + joinF(k, h.iterator.map(_._2), tail.map(_.map(_._2))).foreach { a => + result += ((k, a)) + } + case other => sys.error(s"unreachable: $other, $k") + } + } + + result + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/DistinctByTest.scala similarity index 85% rename from scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala rename to scalding-base/src/test/scala/com/twitter/scalding/DistinctByTest.scala index 154266ddd0..71e4bb4084 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/DistinctByTest.scala +++ b/scalding-base/src/test/scala/com/twitter/scalding/DistinctByTest.scala @@ -12,16 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import com.twitter.scalding.typed.CoGrouped.distinctBy -import org.scalacheck.Arbitrary -import org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Properties import org.scalacheck.Prop.forAll -import org.scalacheck.Gen._ object DistinctByProps extends Properties("CoGrouped.DistinctBy") { @@ -33,21 +30,22 @@ object DistinctByProps extends Properties("CoGrouped.DistinctBy") { } property("distinctBy to unit gives size 0 or 1") = forAll { (l: List[Int], fn: Int => Unit) => val dsize = distinctBy(l)(fn).size - ((dsize == 0) && (l.size == 0)) || dsize == 1 + ((dsize == 0) && l.isEmpty) || dsize == 1 } property("distinctBy to different values never changes the list") = forAll { (l: List[Int]) => var idx = 0 val fn = { (i: Int) => idx += 1; idx } distinctBy(l)(fn) == l } - property("distinctBy works like groupBy(fn).map(_._2.head).toSet") = forAll { (l: List[Int], fn: Int => Byte) => - distinctBy(l)(fn).toSet == l.groupBy(fn).map(_._2.head).toSet + property("distinctBy works like groupBy(fn).map(_._2.head).toSet") = forAll { + (l: List[Int], fn: Int => Byte) => + distinctBy(l)(fn).toSet == l.groupBy(fn).map(_._2.head).toSet } property("distinctBy matches a mutable implementation") = forAll { (l: List[Int], fn: Int => Byte) => val dlist = distinctBy(l)(fn) var seen = Set[Byte]() l.flatMap { it => - if(seen(fn(it))) Nil + if (seen(fn(it))) Nil else { seen += fn(it) List(it) diff --git a/scalding-base/src/test/scala/com/twitter/scalding/StringUtilityTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/StringUtilityTest.scala new file mode 100644 index 0000000000..fa02d1fbcc --- /dev/null +++ b/scalding-base/src/test/scala/com/twitter/scalding/StringUtilityTest.scala @@ -0,0 +1,63 @@ +package com.twitter.scalding + +import org.scalatest.{Matchers, PropSpec, WordSpec} +import org.scalacheck.Prop.forAll +import org.scalatest.prop.Checkers +import org.scalacheck.Gen + +class StringUtilityTest extends WordSpec with Matchers { + 
"fastSplitTest" should { + "be able to split white space" in { + val text1 = "this is good time" + val res1 = StringUtility.fastSplit(text1, " ") // split single white space + res1 should be { + Seq("this", "is", "good", "time") + } + } + } + "be able to split other separators" in { + val text2 = "a:b:c:d:" + val res2 = StringUtility.fastSplit(text2, ":") + res2 should be { + Seq("a", "b", "c", "d", "") + } + } + "be able to split only one separators" in { + val text2 = "a@" + val res2 = StringUtility.fastSplit(text2, "@") + res2 should be { + Seq("a", "") + } + } + "be able to split when separator doesn't show up" in { + val text2 = "a" + val res2 = StringUtility.fastSplit(text2, "@") + res2 should be { + Seq("a") + } + } +} + +class StringUtilityPropertyTest extends PropSpec with Checkers { + val randomStringGen = for { + s <- Gen.pick(5, List.fill(100)(List("k", "l", "m", "x", "//.", "@")).flatten) + + } yield s + + // test for one separator and two + val randomSeparator = for { + s <- Gen.oneOf("@@", "@", "x", "//.") + } yield s + + property("fastSplit(s, sep) should match s.split(sep, -1) for non-regex sep") { + check { + forAll(randomStringGen, randomSeparator) { (str, separator) => + val t = str.mkString("") + val r1 = t.split(separator, -1).toList + val r2 = StringUtility.fastSplit(t, separator) + r1 == r2 + } + } + } + +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala similarity index 50% rename from scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala rename to scalding-base/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala index ef06d454fc..34d80be15a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala +++ b/scalding-base/src/test/scala/com/twitter/scalding/mathematics/SizeHintTest.scala @@ -12,90 +12,97 @@ distributed under the License is 
distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics -import com.twitter.scalding._ -import org.specs._ - import org.scalacheck.Arbitrary -import org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Properties import org.scalacheck.Prop.forAll import org.scalacheck.Gen._ object SizeHintProps extends Properties("SizeHint") { - val noClueGen = value(NoClue) + val noClueGen = const(NoClue) - val finiteHintGen = for ( rows <- choose(-1L, 1000000L); - cols <- choose(-1L, 1000000L)) - yield FiniteHint(rows, cols) + val finiteHintGen = for { + rows <- choose(-1L, 1000000L) + cols <- choose(-1L, 1000000L) + } yield FiniteHint(rows, cols) - val sparseHintGen = for ( rows <- choose(-1L, 1000000L); - cols <- choose(-1L, 1000000L); - sparsity <- choose(0.0, 1.0)) - yield SparseHint(sparsity, rows, cols) + val sparseHintGen = for { + rows <- choose(-1L, 1000000L) + cols <- choose(-1L, 1000000L) + sparsity <- choose(0.0, 1.0) + } yield SparseHint(sparsity, rows, cols) - implicit val finiteArb : Arbitrary[FiniteHint] = Arbitrary { finiteHintGen } - implicit val sparseArb : Arbitrary[SparseHint] = Arbitrary { sparseHintGen } - implicit val genHint : Arbitrary[SizeHint] = Arbitrary { oneOf(noClueGen, finiteHintGen, sparseHintGen) } + implicit val finiteArb: Arbitrary[FiniteHint] = Arbitrary(finiteHintGen) + implicit val sparseArb: Arbitrary[SparseHint] = Arbitrary(sparseHintGen) + implicit val genHint: Arbitrary[SizeHint] = Arbitrary(oneOf(noClueGen, finiteHintGen, sparseHintGen)) - property("a+b is at least as big as a") = forAll { (a : SizeHint, b : SizeHint) => - val addT = for( ta <- a.total; tsum <- (a+b).total) yield (tsum >= ta) + property("a+b is at least as big as a") = forAll { (a: SizeHint, b: SizeHint) => + val addT = for { + ta <- a.total + tsum <- (a + b).total + } 
yield (tsum >= ta) addT.getOrElse(true) } - property("a#*#b is at most as big as a") = forAll { (a : SizeHint, b : SizeHint) => - val addT = for( ta <- a.total; tsum <- (a#*#b).total) yield (tsum <= ta) + property("a#*#b is at most as big as a") = forAll { (a: SizeHint, b: SizeHint) => + val addT = for { + ta <- a.total + tsum <- (a #*# b).total + } yield (tsum <= ta) addT.getOrElse(true) - } - - property("ordering makes sense") = forAll { (a : SizeHint, b : SizeHint) => - (List(a,b).max.total.getOrElse(BigInt(-1L)) >= a.total.getOrElse(BigInt(-1L))) } - property("addition increases sparsity fraction") = forAll { (a : SparseHint, b : SparseHint) => + property("ordering makes sense") = forAll { (a: SizeHint, b: SizeHint) => + (List(a, b).max.total.getOrElse(BigInt(-1L)) >= a.total.getOrElse(BigInt(-1L))) + } + + property("addition increases sparsity fraction") = forAll { (a: SparseHint, b: SparseHint) => (a + b).asInstanceOf[SparseHint].sparsity >= a.sparsity } - property("Hadamard product does not increase sparsity fraction") = forAll { (a : SparseHint, b : SparseHint) => - (a #*# b).asInstanceOf[SparseHint].sparsity == (a.sparsity min b.sparsity) - } - - property("transpose preserves size") = forAll { (a : SizeHint) => + property("Hadamard product does not increase sparsity fraction") = forAll { + (a: SparseHint, b: SparseHint) => + (a #*# b).asInstanceOf[SparseHint].sparsity == (a.sparsity.min(b.sparsity)) + } + + property("transpose preserves size") = forAll { (a: SizeHint) => a.transpose.total == a.total } - property("squaring a finite hint preserves size") = forAll { (a : FiniteHint) => + property("squaring a finite hint preserves size") = forAll { (a: FiniteHint) => val sq = a.setRowsToCols val sq2 = a.setColsToRows (sq.total == (sq * sq).total) && (sq2.total == (sq2 * sq2).total) } - property("adding a finite hint to itself preserves size") = forAll { (a : FiniteHint) => + property("adding a finite hint to itself preserves size") = forAll { (a: FiniteHint) 
=> (a + a).total == a.total } - property("hadamard product of a finite hint to itself preserves size") = forAll { (a : FiniteHint) => + property("hadamard product of a finite hint to itself preserves size") = forAll { (a: FiniteHint) => (a #*# a).total == a.total - } - - property("adding a sparse matrix to itself doesn't decrease size") = forAll { (a : SparseHint) => - (for ( doubleSize <- (a + a).total; - asize <- a.total ) yield(doubleSize >= asize)).getOrElse(true) } - property("diagonals are smaller") = forAll { (a : FiniteHint) => + property("adding a sparse matrix to itself doesn't decrease size") = forAll { (a: SparseHint) => + (for { + doubleSize <- (a + a).total + asize <- a.total + } yield (doubleSize >= asize)).getOrElse(true) + } + + property("diagonals are smaller") = forAll { (a: FiniteHint) => SizeHint.asDiagonal(a).total.getOrElse(BigInt(-2L)) < a.total.getOrElse(-1L) } - property("diagonals are about as big as the min(rows,cols)") = forAll { (a : FiniteHint) => - SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) <= (a.rows min a.cols) - SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) >= ((a.rows min a.cols) - 1L) + property("diagonals are about as big as the min(rows,cols)") = forAll { (a: FiniteHint) => + SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) <= (a.rows.min(a.cols)) + SizeHint.asDiagonal(a).total.getOrElse(BigInt(-1L)) >= ((a.rows.min(a.cols)) - 1L) } - property("transpose law is obeyed in total") = forAll { (a : SizeHint, b : SizeHint) => + property("transpose law is obeyed in total") = forAll { (a: SizeHint, b: SizeHint) => // (A B)^T = B^T A^T (a * b).transpose.total == ((b.transpose) * (a.transpose)).total } diff --git a/scalding-base/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala new file mode 100644 index 0000000000..a029cc1899 --- /dev/null +++ b/scalding-base/src/test/scala/com/twitter/scalding/typed/CoGroupableTest.scala @@ 
-0,0 +1,36 @@ +package com.twitter.scalding.typed + +import org.scalatest.FunSuite + +class CoGroupableTest extends FunSuite { + test("CoGroupable.atMostOneValue is consistent") { + val init = TypedPipe.from(List((1, 2))) + + assert(CoGroupable.atMostOneValue(init.sumByKey)) + assert(CoGroupable.atMostOneValue(init.group.sum)) + assert(CoGroupable.atMostOneValue(init.group.mapValues(_ + 100).sum)) + assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum)) + assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.mapValues(_ - 100))) + assert(CoGroupable.atMostOneValue(init.group.forceToReducers.mapValues(_ + 100).sum.filter { + case (k, v) => k > v + })) + assert(CoGroupable.atMostOneValue(init.group.mapValues(_ * 2).sum.join(init.group.sum))) + + assert(!CoGroupable.atMostOneValue(init.group)) + assert(!CoGroupable.atMostOneValue(init.group.scanLeft(0)(_ + _))) + assert(!CoGroupable.atMostOneValue(init.join(init.group.mapValues(_ * 2)))) + assert(!CoGroupable.atMostOneValue(init.group.sum.flatMapValues(List(_)))) + + val sum1 = init.sumByKey + + assert(CoGroupable.atMostOneValue(sum1.join(sum1.join(sum1)))) + assert(CoGroupable.atMostOneValue(sum1.join(sum1).join(sum1))) + + assert(!CoGroupable.atMostOneValue(init.join(sum1.join(sum1)))) + assert(!CoGroupable.atMostOneValue(init.join(sum1).join(sum1))) + assert(!CoGroupable.atMostOneValue(sum1.join(init.join(sum1)))) + assert(!CoGroupable.atMostOneValue(sum1.join(init).join(sum1))) + assert(!CoGroupable.atMostOneValue(sum1.join(sum1.join(init)))) + assert(!CoGroupable.atMostOneValue(sum1.join(sum1).join(init))) + } +} diff --git a/scalding-base/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala new file mode 100644 index 0000000000..b3649241a8 --- /dev/null +++ 
b/scalding-base/src/test/scala/com/twitter/scalding/typed/HashEqualsArrayWrapperTest.scala @@ -0,0 +1,99 @@ +package com.twitter.scalding.typed + +import org.scalacheck.{Arbitrary, Prop} +import org.scalatest.{FunSuite, PropSpec} +import org.scalatest.prop.{Checkers, PropertyChecks} +import scala.reflect.ClassTag + +object HashArrayEqualsWrapperLaws { + + def check2[T]( + ordToTest: Ordering[HashEqualsArrayWrapper[T]] + )(implicit ord: Ordering[T], arb: Arbitrary[Array[T]]): Prop = + Prop.forAll { (left: Array[T], right: Array[T]) => + val leftWrapped = HashEqualsArrayWrapper.wrap(left) + val rightWrapped = HashEqualsArrayWrapper.wrap(right) + + import scala.Ordering.Implicits.seqDerivedOrdering + + val slowOrd: Ordering[Seq[T]] = seqDerivedOrdering[Seq, T](ord) + + val cmp = ordToTest.compare(leftWrapped, rightWrapped) + + val lenCmp = java.lang.Integer.compare(leftWrapped.wrapped.length, rightWrapped.wrapped.length) + if (lenCmp != 0) { + cmp.signum == lenCmp.signum + } else { + cmp.signum == slowOrd.compare(leftWrapped.wrapped.toSeq, rightWrapped.wrapped.toSeq).signum + } + } + + def check[T](ordToTest: Ordering[Array[T]])(implicit ord: Ordering[T], arb: Arbitrary[Array[T]]): Prop = + Prop.forAll { (left: Array[T], right: Array[T]) => + import scala.Ordering.Implicits.seqDerivedOrdering + + val slowOrd: Ordering[Seq[T]] = seqDerivedOrdering[Seq, T](ord) + + val cmp = ordToTest.compare(left, right) + + val lenCmp = java.lang.Integer.compare(left.length, right.length) + if (lenCmp != 0) { + cmp.signum == lenCmp.signum + } else { + cmp.signum == slowOrd.compare(left.toSeq, right.toSeq).signum + } + } +} + +class HashArrayEqualsWrapperProps extends PropSpec with PropertyChecks with Checkers { + + property("Specialized orderings obey all laws for Arrays") { + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.longArrayOrd)) + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.intArrayOrd)) + 
check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.shortArrayOrd)) + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.charArrayOrd)) + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.byteArrayOrd)) + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.booleanArrayOrd)) + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.floatArrayOrd)) + check(HashArrayEqualsWrapperLaws.check(HashEqualsArrayWrapper.doubleArrayOrd)) + } + + property("Specialized orderings obey all laws for wrapped Arrays") { + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsLongOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsIntOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsShortOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsCharOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsByteOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsBooleanOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsFloatOrdering)) + check(HashArrayEqualsWrapperLaws.check2(HashEqualsArrayWrapper.hashEqualsDoubleOrdering)) + } + +} + +class HashArrayEqualsWrapperTest extends FunSuite { + + def testWrap[T: ClassTag](arr: Array[T], expected: Class[_]): Unit = { + val fn = HashEqualsArrayWrapper.wrapByClassTagFn[T] + val wrapped = fn(arr) + assert(wrapped.getClass === expected) + } + + test("wrap function returns correct wrapper") { + testWrap[Long](Array[Long](1), classOf[HashEqualsLongArrayWrapper]) + testWrap[Int](Array[Int](1), classOf[HashEqualsIntArrayWrapper]) + testWrap[Short](Array[Short](1), classOf[HashEqualsShortArrayWrapper]) + testWrap[Char](Array[Char]('a'), classOf[HashEqualsCharArrayWrapper]) + testWrap[Byte](Array[Byte](1), classOf[HashEqualsByteArrayWrapper]) + testWrap[Boolean](Array[Boolean](true), 
classOf[HashEqualsBooleanArrayWrapper]) + testWrap[Float](Array[Float](1), classOf[HashEqualsFloatArrayWrapper]) + testWrap[Double](Array[Double](1), classOf[HashEqualsDoubleArrayWrapper]) + + testWrap[String](Array[String]("hi"), classOf[HashEqualsObjectArrayWrapper[String]]) + } + + test("classForTag works correctly") { + assert(HashEqualsArrayWrapper.classForTag(implicitly[ClassTag[String]]) === classOf[String]) + assert(HashEqualsArrayWrapper.classForTag(implicitly[ClassTag[Array[Byte]]]) === classOf[Array[Byte]]) + } +} diff --git a/scalding-base/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala new file mode 100644 index 0000000000..0977c16f8e --- /dev/null +++ b/scalding-base/src/test/scala/com/twitter/scalding/typed/ResolverTest.scala @@ -0,0 +1,65 @@ +package com.twitter.scalding.typed + +import org.scalatest.FunSuite + +import com.twitter.scalding.typed.functions.EqTypes + +class ResolverTest extends FunSuite { + + class Key[A] + class Value[A] + + val k1 = new Key[Int] + val k2 = new Key[Int] + val k3 = new Key[Int] + val v1 = new Value[Int] + val v2 = new Value[Int] + val v3 = new Value[Int] + + // if they are eq, they have the same type + def keq[A, B](ka: Key[A], kb: Key[B]): Option[EqTypes[A, B]] = + if (ka == null || kb == null) None + else if (ka eq kb) Some(EqTypes.reflexive[A].asInstanceOf[EqTypes[A, B]]) + else None + + val custom = new Resolver[Key, Value] { + def apply[A](k: Key[A]) = + keq(k1, k).map { eqtypes => + eqtypes.subst[Value](v3) + } + } + + import Resolver.pair + + test("orElse order is correct") { + + assert((pair(k1, v1).orElse(pair(k1, v2)))(k1) == Some(v1)) + assert((pair(k1, v2).orElse(pair(k1, v1)))(k1) == Some(v2)) + assert((pair(k2, v1).orElse(pair(k1, v2)))(k1) == Some(v2)) + assert((pair(k2, v2).orElse(pair(k1, v1)))(k1) == Some(v1)) + + assert(((pair(k1, v1).orElse(pair(k1, v2))).orElse(pair(k1, v3)))(k1) == Some(v1)) + 
assert(((pair(k1, v2).orElse(pair(k1, v1))).orElse(pair(k1, v3)))(k1) == Some(v2)) + assert(((pair(k1, v1).orElse(pair(k1, v2))).orElse(pair(k2, v3)))(k2) == Some(v3)) + + assert(custom(k1) == Some(v3)) + assert(custom(k2) == None) + + assert((custom.orElse(pair(k1, v2)))(k1) == Some(v3)) + assert((custom.orElse(pair(k2, v2)))(k2) == Some(v2)) + assert((pair(k1, v2).orElse(custom))(k1) == Some(v2)) + assert((pair(k2, v2).orElse(custom))(k1) == Some(v3)) + assert((pair(k2, v2).orElse(custom))(k2) == Some(v2)) + } + + test("test remapping with andThen") { + val remap = Resolver.pair(k1, k2).orElse(Resolver.pair(k2, k3)).orElse(Resolver.pair(k3, k1)) + + assert((remap.andThen(custom.orElse(pair(k1, v2))))(k1) == None) + assert((remap.andThen(custom.orElse(pair(k2, v2))))(k2) == None) + assert((remap.andThen(pair(k1, v2).orElse(custom)))(k3) == Some(v2)) + assert((remap.andThen(pair(k2, v2).orElse(custom)))(k3) == Some(v3)) + assert((remap.andThen(pair(k2, v2).orElse(custom)))(k1) == Some(v2)) + + } +} diff --git a/scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala new file mode 100644 index 0000000000..b9e0880d89 --- /dev/null +++ b/scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeDiffTest.scala @@ -0,0 +1,220 @@ +package com.twitter.scalding.typed + +import com.twitter.algebird.MapAlgebra +import com.twitter.scalding.Config +import org.scalacheck.{Arbitrary, Prop} +import org.scalatest.prop.{Checkers, PropertyChecks} +import org.scalatest.{FunSuite, PropSpec} +import scala.reflect.ClassTag + +object TypedPipeDiffExtensions { + implicit class InMemoryToListEnrichment[A](tp: TypedPipe[A]) { + def inMemoryToList: List[A] = + tp.toIterableExecution.map(_.toList).waitFor(Config.empty, memory_backend.MemoryMode.empty).get + } +} + +import TypedPipeDiffExtensions._ + +class NoOrdering(val x: String) { + + override def equals(other: Any): Boolean = 
other match { + case that: NoOrdering => x.equals(that.x) + case _ => false + } + + override def hashCode(): Int = x.hashCode +} + +class NoOrderingHashCollisions(val x: String) { + + override def equals(other: Any): Boolean = other match { + case that: NoOrderingHashCollisions => x.equals(that.x) + case _ => false + } + + override def hashCode(): Int = 0 +} + +class TypedPipeDiffTest extends FunSuite { + + val left = List("hi", "hi", "bye", "foo", "bar") + val right = List("hi", "bye", "foo", "baz") + val expectedSortedDiff = List(("bar", (1, 0)), ("baz", (0, 1)), ("hi", (2, 1))).sorted + + val leftArr = List(Array[Byte](3, 3, 5, 3, 2), Array[Byte](2, 2, 2), Array[Byte](0, 1, 0)) + + val rightArr = + List(Array[Byte](2, 2, 2), Array[Byte](2, 2, 2), Array[Byte](3, 3, 5, 3, 2), Array[Byte](0, 1, 1)) + + val expectedSortedArrDiff = List( + (Array[Byte](0, 1, 0).toSeq, (1, 0)), + (Array[Byte](0, 1, 1).toSeq, (0, 1)), + (Array[Byte](2, 2, 2).toSeq, (1, 2)) + ) + + test("diff works for objects with ordering and good hashcodes") { + val pipe1 = TypedPipe.from(left) + val pipe2 = TypedPipe.from(right) + val diff = TypedPipeDiff.diff(pipe1, pipe2) + + assert(expectedSortedDiff === diff.toTypedPipe.inMemoryToList.sorted) + } + + // this lets us sort the results, + // without bringing an ordering into scope + private def sort(x: List[(Seq[Byte], (Long, Long))]): List[(Seq[Byte], (Long, Long))] = { + import scala.Ordering.Implicits.seqDerivedOrdering + x.sorted + } + + test("diffArrayPipes works for arrays") { + val pipe1 = TypedPipe.from(leftArr) + val pipe2 = TypedPipe.from(rightArr) + + val diff = TypedPipeDiff.diffArrayPipes(pipe1, pipe2).map { case (arr, counts) => (arr.toSeq, counts) } + + assert(expectedSortedArrDiff === sort(diff.inMemoryToList)) + } + + test("diffWithoutOrdering works for objects with ordering and good hashcodes") { + val pipe1 = TypedPipe.from(left) + val pipe2 = TypedPipe.from(right) + val diff = TypedPipeDiff.diffByHashCode(pipe1, pipe2) + + 
assert(expectedSortedDiff === diff.inMemoryToList.sorted) + } + + test("diffWithoutOrdering does not require ordering") { + val pipe1 = TypedPipe.from(left.map(new NoOrdering(_))) + val pipe2 = TypedPipe.from(right.map(new NoOrdering(_))) + val diff = TypedPipeDiff.diffByHashCode(pipe1, pipe2) + + assert(expectedSortedDiff === diff.inMemoryToList.map { case (nord, counts) => (nord.x, counts) }.sorted) + } + + test("diffWithoutOrdering works even with hash collisions") { + val pipe1 = TypedPipe.from(left.map(new NoOrderingHashCollisions(_))) + val pipe2 = TypedPipe.from(right.map(new NoOrderingHashCollisions(_))) + val diff = TypedPipeDiff.diffByHashCode(pipe1, pipe2) + assert(expectedSortedDiff === diff.inMemoryToList.map { case (nord, counts) => (nord.x, counts) }.sorted) + } + + test("diffArrayPipesWithoutOrdering works for arrays of objects with no ordering") { + val pipe1 = TypedPipe.from(leftArr.map(arr => arr.map(b => new NoOrdering(b.toString)))) + val pipe2 = TypedPipe.from(rightArr.map(arr => arr.map(b => new NoOrdering(b.toString)))) + val diff = TypedPipeDiff.diffArrayPipes(pipe1, pipe2) + + assert(expectedSortedArrDiff === sort(diff.inMemoryToList.map { case (arr, counts) => + (arr.map(_.x.toByte).toSeq, counts) + })) + } + +} + +object TypedPipeDiffLaws { + import com.twitter.scalding.typed.TypedPipeDiff.Enrichments._ + + def checkDiff[T](left: List[T], right: List[T], diff: List[(T, (Long, Long))]): Boolean = { + val noDuplicates = diff.size == diff.map(_._1).toSet.size + val expected = MapAlgebra + .sumByKey(left.map((_, (1L, 0L))).iterator ++ right.map((_, (0L, 1L))).iterator) + .filter { case (t, (rCount, lCount)) => rCount != lCount } + + noDuplicates && expected == diff.toMap + } + + def checkArrayDiff[T]( + left: List[Array[T]], + right: List[Array[T]], + diff: List[(Seq[T], (Long, Long))] + ): Boolean = + checkDiff(left.map(_.toSeq), right.map(_.toSeq), diff) + + def diffLaw[T: Ordering: Arbitrary]: Prop = Prop.forAll { (left: List[T], right: 
List[T]) => + val diff = TypedPipe.from(left).diff(TypedPipe.from(right)).toTypedPipe.inMemoryToList + checkDiff(left, right, diff) + } + + def diffArrayLaw[T](implicit arb: Arbitrary[List[Array[T]]], ct: ClassTag[T]): Prop = Prop.forAll { + (left: List[Array[T]], right: List[Array[T]]) => + val diff = TypedPipe + .from(left) + .diffArrayPipes(TypedPipe.from(right)) + .inMemoryToList + .map { case (arr, counts) => (arr.toSeq, counts) } + checkArrayDiff(left, right, diff) + } + + def diffByGroupLaw[T: Arbitrary]: Prop = Prop.forAll { (left: List[T], right: List[T]) => + val diff = TypedPipe.from(left).diffByHashCode(TypedPipe.from(right)).inMemoryToList + checkDiff(left, right, diff) + } + +} + +class TypedPipeDiffLaws extends PropSpec with PropertyChecks with Checkers { + override implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 5) + + property("diffLaws") { + check(TypedPipeDiffLaws.diffLaw[Int]) + check(TypedPipeDiffLaws.diffLaw[String]) + } + + property("diffArrayLaws") { + + implicit val arbNoOrdering: Arbitrary[Array[NoOrdering]] = Arbitrary { + for { + strs <- Arbitrary.arbitrary[Array[String]] + } yield { + strs.map(new NoOrdering(_)) + } + } + + implicit val arbNoOrderingHashCollision: Arbitrary[Array[NoOrderingHashCollisions]] = Arbitrary { + for { + strs <- Arbitrary.arbitrary[Array[String]] + } yield { + strs.map(new NoOrderingHashCollisions(_)) + } + } + + check(TypedPipeDiffLaws.diffArrayLaw[Long]) + check(TypedPipeDiffLaws.diffArrayLaw[Int]) + check(TypedPipeDiffLaws.diffArrayLaw[Short]) + check(TypedPipeDiffLaws.diffArrayLaw[Char]) + check(TypedPipeDiffLaws.diffArrayLaw[Byte]) + check(TypedPipeDiffLaws.diffArrayLaw[Boolean]) + check(TypedPipeDiffLaws.diffArrayLaw[Float]) + check(TypedPipeDiffLaws.diffArrayLaw[Double]) + check(TypedPipeDiffLaws.diffArrayLaw[String]) + check(TypedPipeDiffLaws.diffArrayLaw[NoOrdering]) + check(TypedPipeDiffLaws.diffArrayLaw[NoOrderingHashCollisions]) + } + + 
property("diffByGroupLaws") { + + implicit val arbNoOrdering: Arbitrary[NoOrdering] = Arbitrary { + for { + name <- Arbitrary.arbitrary[String] + } yield { + new NoOrdering(name) + } + } + + implicit val arbNoOrderingHashCollision: Arbitrary[NoOrderingHashCollisions] = Arbitrary { + for { + name <- Arbitrary.arbitrary[String] + } yield { + new NoOrderingHashCollisions(name) + } + } + + check(TypedPipeDiffLaws.diffByGroupLaw[Int]) + check(TypedPipeDiffLaws.diffByGroupLaw[String]) + check(TypedPipeDiffLaws.diffByGroupLaw[NoOrdering]) + check(TypedPipeDiffLaws.diffByGroupLaw[NoOrderingHashCollisions]) + } + +} diff --git a/scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala b/scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala new file mode 100644 index 0000000000..5797f2223f --- /dev/null +++ b/scalding-base/src/test/scala/com/twitter/scalding/typed/TypedPipeMonoidTest.scala @@ -0,0 +1,48 @@ +package com.twitter.scalding +package typed + +import memory_backend.MemoryMode +import com.twitter.algebird.Monoid.{plus, sum, zero} +import org.scalatest.FunSuite +import org.scalatest.prop.PropertyChecks + +class TypedPipeMonoidTest extends FunSuite with PropertyChecks { + + def run[A](t: TypedPipe[A]): List[A] = + t.toIterableExecution.map(_.toList).waitFor(Config.empty, MemoryMode.empty).get + + def sortedEq[A: Ordering](a: List[A], b: List[A]): Boolean = + a.sorted == b.sorted + + def eqvPipe[A: Ordering](a: TypedPipe[A], b: TypedPipe[A]): Boolean = + sortedEq(run(a), run(b)) + + test("typedPipeMonoid.zero should be equal to TypePipe.empty") { + assert(zero[TypedPipe[Int]] == TypedPipe.empty) + } + + test("monoid is associative") { + forAll { (a: List[Int], b: List[Int], c: List[Int]) => + val left = plus(plus(TypedPipe.from(a), TypedPipe.from(b)), TypedPipe.from(c)) + val right = plus(TypedPipe.from(a), plus(TypedPipe.from(b), TypedPipe.from(c))) + assert(eqvPipe(left, right)) + } + } + + test("monoid is 
commutative") { + forAll { (a: List[Int], b: List[Int]) => + val left = plus(TypedPipe.from(a), TypedPipe.from(b)) + val right = plus(TypedPipe.from(b), TypedPipe.from(a)) + assert(eqvPipe(left, right)) + } + } + + test("monoid sum is equivalent to a union") { + forAll { (as: List[List[Int]]) => + val pipes = as.map(TypedPipe.from(_)) + val bigPipe = TypedPipe.from(as.flatten) + assert(eqvPipe(sum(pipes), bigPipe)) + } + } + +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala new file mode 100644 index 0000000000..f71a80a6a7 --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamBackend.scala @@ -0,0 +1,157 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.scalding.dagon.{FunctionK, Memoize, Rule} +import com.twitter.chill.KryoInstantiator +import com.twitter.chill.config.ScalaMapConfig +import com.twitter.scalding.Config +import com.twitter.scalding.beam_backend.BeamOp.{CoGroupedOp, MergedBeamOp} +import com.twitter.scalding.serialization.KryoHadoop +import com.twitter.scalding.typed._ +import com.twitter.scalding.typed.functions.{ + FilterKeysToFilter, + FlatMapValuesToFlatMap, + MapValuesToMap, + ScaldingPriorityQueueMonoid +} + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCascadingExtensions + +object BeamPlanner { + def plan( + config: Config, + srcs: Resolver[Input, BeamSource] + ): FunctionK[TypedPipe, BeamOp] = { + implicit val kryoCoder: KryoCoder = new KryoCoder(defaultKryoCoderConfiguration(config)) + Memoize.functionK(f = new Memoize.RecursiveK[TypedPipe, BeamOp] { + + import TypedPipe._ + + def toFunction[A] = { + case (f @ Filter(_, _), rec) => + def go[T](f: Filter[T]): BeamOp[T] = { + val Filter(p, fn) = f + rec[T](p).filter(fn) + } + go(f) + case (fk @ FilterKeys(_, _), rec) => + def go[K, V](node: FilterKeys[K, V]): BeamOp[(K, V)] = { + 
val FilterKeys(pipe, fn) = node + rec(pipe).filter(FilterKeysToFilter(fn)) + } + go(fk) + case (Mapped(input, fn), rec) => + val op = rec(input) + op.map(fn) + case (FlatMapped(input, fn), rec) => + val op = rec(input) + op.flatMap(fn) + case (f @ MapValues(_, _), rec) => + def go[K, V, U](node: MapValues[K, V, U]): BeamOp[(K, U)] = { + val MapValues(pipe, fn) = node + rec(pipe).map(MapValuesToMap[K, V, U](fn)) + } + go(f) + case (f @ FlatMapValues(_, _), rec) => + def go[K, V, U](node: FlatMapValues[K, V, U]): BeamOp[(K, U)] = { + val FlatMapValues(pipe, fn) = node + rec(pipe).flatMap(FlatMapValuesToFlatMap[K, V, U](fn)) + } + go(f) + case (SourcePipe(src), _) => + BeamOp.Source(config, src, srcs(src)) + case (IterablePipe(iterable), _) => + BeamOp.FromIterable(iterable, kryoCoder) + case (wd: WithDescriptionTypedPipe[a], rec) => + rec[a](wd.input) + case (SumByLocalKeys(pipe, sg), rec) => + val op = rec(pipe) + config.getMapSideAggregationThreshold match { + case None => op + case Some(count) => + // Semigroup is invariant on T. 
We cannot pattern match as it is a Semigroup[PriorityQueue[T]] + if (sg.isInstanceOf[ScaldingPriorityQueueMonoid[_]]) { + op + } else { + op.mapSideAggregator(count, sg) + } + } + case (ReduceStepPipe(ir @ IdentityReduce(_, _, _, _, _)), rec) => + def go[K, V1, V2](ir: IdentityReduce[K, V1, V2]): BeamOp[(K, V2)] = { + type BeamOpT[V] = BeamOp[(K, V)] + val op = rec(ir.mapped) + ir.evidence.subst[BeamOpT](op) + } + go(ir) + case (ReduceStepPipe(uir @ UnsortedIdentityReduce(_, _, _, _, _)), rec) => + def go[K, V1, V2](uir: UnsortedIdentityReduce[K, V1, V2]): BeamOp[(K, V2)] = { + type BeamOpT[V] = BeamOp[(K, V)] + val op = rec(uir.mapped) + uir.evidence.subst[BeamOpT](op) + } + go(uir) + case (ReduceStepPipe(ivsr @ IdentityValueSortedReduce(_, _, _, _, _, _)), rec) => + def go[K, V1, V2](uir: IdentityValueSortedReduce[K, V1, V2]): BeamOp[(K, V2)] = { + type BeamOpT[V] = BeamOp[(K, V)] + val op = rec(uir.mapped) + val sortedOp = op.sorted(uir.keyOrdering, uir.valueSort, kryoCoder) + uir.evidence.subst[BeamOpT](sortedOp) + } + go(ivsr) + case (ReduceStepPipe(ValueSortedReduce(keyOrdering, pipe, valueSort, reduceFn, _, _)), rec) => + val op = rec(pipe) + op.sortedMapGroup(reduceFn)(keyOrdering, valueSort, kryoCoder) + case (ReduceStepPipe(IteratorMappedReduce(keyOrdering, pipe, reduceFn, _, _)), rec) => + val op = rec(pipe) + op.mapGroup(reduceFn)(keyOrdering, kryoCoder) + case (hcg @ HashCoGroup(_, _, _), rec) => + def go[K, V1, V2, W](hcg: HashCoGroup[K, V1, V2, W]): BeamOp[(K, W)] = { + val leftOp = rec(hcg.left) + implicit val orderingK: Ordering[K] = hcg.right.keyOrdering + val rightOp = rec(ReduceStepPipe(HashJoinable.toReduceStep(hcg.right))) + leftOp.hashJoin(rightOp, hcg.joiner) + } + go(hcg) + case (CoGroupedPipe(cg), rec) => + def go[K, V](cg: CoGrouped[K, V]): BeamOp[(K, V)] = { + val ops: Seq[BeamOp[(K, Any)]] = cg.inputs.map(tp => rec(tp)) + CoGroupedOp(cg, ops) + } + go(cg) + case (Fork(input), rec) => + rec(input) + case (m @ MergedTypedPipe(_, _), rec) 
=> + OptimizationRules.unrollMerge(m) match { + case Nil => rec(EmptyTypedPipe) + case single :: Nil => rec(single) + case first :: second :: tail => MergedBeamOp(rec(first), rec(second), tail.map(rec(_))) + } + } + }) + } + + def defaultKryoCoderConfiguration(config: Config): KryoInstantiator = + config.getKryo match { + case Some(kryoInstantiator) => kryoInstantiator + case None => new KryoHadoop(new ScalaMapConfig(Map.empty)) + } + + def defaultOptimizationRules(config: Config): Seq[Rule[TypedPipe]] = { + def std(forceHash: Rule[TypedPipe]) = + OptimizationRules.standardMapReduceRules ::: + List( + OptimizationRules.FilterLocally, // after filtering, we may have filtered to nothing, lets see + OptimizationRules.simplifyEmpty, + // add any explicit forces to the optimized graph + Rule.orElse(List(forceHash, OptimizationRules.RemoveDuplicateForceFork)) + ) + + config.getOptimizationPhases match { + case Some(tryPhases) => tryPhases.get.phases + case None => + val force = + if (config.getHashJoinAutoForceRight) OptimizationRules.ForceToDiskBeforeHashJoin + else Rule.empty[TypedPipe] + std(force) + } + } +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala new file mode 100644 index 0000000000..76f06fecc7 --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamFunctions.scala @@ -0,0 +1,95 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.algebird.{Semigroup, SummingCache} +import org.apache.beam.sdk.transforms.DoFn.{FinishBundle, ProcessElement, StartBundle} +import org.apache.beam.sdk.transforms.windowing.{BoundedWindow, GlobalWindow} +import org.apache.beam.sdk.transforms.{DoFn, ProcessFunction} +import org.apache.beam.sdk.values.{PCollection, PCollectionView} +import scala.collection.JavaConverters._ + +object BeamFunctions { + case class ProcessPredicate[A](f: A => Boolean) extends 
ProcessFunction[A, java.lang.Boolean] { + @throws[Exception] + override def apply(input: A): java.lang.Boolean = java.lang.Boolean.valueOf(f(input)) + } + + case class FlatMapFn[A, B](f: A => TraversableOnce[B]) extends DoFn[A, B] { + @ProcessElement + def processElement(c: DoFn[A, B]#ProcessContext): Unit = { + val it = f(c.element()).toIterator + while (it.hasNext) c.output(it.next()) + } + } + + case class MapFn[A, B](f: A => B) extends DoFn[A, B] { + @ProcessElement + def processElement(c: DoFn[A, B]#ProcessContext): Unit = + c.output(f(c.element())) + } + + case class MapSideAggregator[K, V]( + size: Int, + semigroup: Semigroup[V] + ) extends DoFn[(K, V), (K, V)] { + var cache: SummingCache[K, V] = _ + @StartBundle + def startBundle(): Unit = + cache = new SummingCache[K, V](size)(semigroup) + + @ProcessElement + def processElement(c: DoFn[(K, V), (K, V)]#ProcessContext): Unit = { + val evicted = cache.put(Map(c.element())) + evicted match { + case Some(m) => + val mit = m.iterator + while (mit.hasNext) { + c.output(mit.next()) + } + case None => () + } + } + + @FinishBundle + def finishBundle(c: DoFn[(K, V), (K, V)]#FinishBundleContext): Unit = { + val evicted = cache.flush + evicted match { + case Some(m) => + val mit = m.iterator + while (mit.hasNext) { + c.output(mit.next(), BoundedWindow.TIMESTAMP_MIN_VALUE, GlobalWindow.INSTANCE) + } + case None => () + } + } + } + + case class HashJoinFn[K, V, U, W]( + joiner: (K, V, Iterable[U]) => Iterator[W], + sideInput: PCollectionView[java.util.Map[K, java.lang.Iterable[U]]] + ) extends DoFn[(K, V), (K, W)] { + private[this] var mapRight: java.util.Map[K, java.lang.Iterable[U]] = null + private[this] val emptyUs: Iterable[U] = Seq.empty[U] + + @ProcessElement + def processElement(c: DoFn[(K, V), (K, W)]#ProcessContext): Unit = { + if (mapRight == null) { + mapRight = c.sideInput(sideInput) + } + val key = c.element()._1 + val value = c.element()._2 + val it = mapRight.get(key) match { + case null => joiner(key, 
value, emptyUs) + case notEmpty => joiner(key, value, notEmpty.asScala) + } + while (it.hasNext) { + c.output((key, it.next())) + } + } + + @FinishBundle + def finishBundle(c: DoFn[(K, V), (K, W)]#FinishBundleContext): Unit = + mapRight = null + } + + def widenPCollection[A, B >: A](p: PCollection[_ <: A]): PCollection[B] = p.asInstanceOf[PCollection[B]] +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamJoiner.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamJoiner.scala new file mode 100644 index 0000000000..903c2ad4ae --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamJoiner.scala @@ -0,0 +1,283 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.scalding.serialization.Externalizer +import com.twitter.scalding.typed.Joiner +import com.twitter.scalding.typed +import java.io.Serializable + +object BeamJoiner { + def beamMapGroupJoin[K, R, R1]( + fn: (K, Iterator[R]) => Iterator[R1] + ): (K, Iterable[R]) => Iterable[R1] = { (k: K, iter: Iterable[R]) => + new Iterable[R1] { + def iterator: Iterator[R1] = fn(k, iter.iterator) + } + } + + private def beamHashJoin[K, V1, V2, R]( + hj: (K, V1, Iterable[V2]) => Iterator[R] + ): HashJoinFunction[K, V1, V2, R] = + hj match { + case _: Joiner.HashInner[K, v, u] => HashInner() + case _: Joiner.HashLeft[K, v, u] => HashLeft() + case f: Joiner.FilteredHashJoin[K, v1, v2, r] => FilteredHashJoin(beamHashJoin(f.jf), f.fn) + case f: Joiner.MappedHashJoin[K, v1, v2, r, r1] => MappedHashJoin(beamHashJoin(f.jf), f.fn) + case f: Joiner.FlatMappedHashJoin[K, v1, v2, r, r1] => FlatMappedHashJoin(beamHashJoin(f.jf), f.fn) + case f => ArbitraryHashJoin(f) + } + + private def beamJoin[K, A, B, C]( + fn: (K, Iterator[A], Iterable[B]) => Iterator[C] + ): JoinFunction[K, A, B, C] = + fn match { + case _: Joiner.LeftJoin[K, v1, v2] => LeftJoin() + case _: Joiner.RightJoin[K, v1, v2] => RightJoin() + case _: Joiner.OuterJoin[K, v1, v2] 
=> OuterJoin() + case _: Joiner.InnerJoin[K, v1, v2] => InnerJoin() + case join: Joiner.FilteredJoin[K, v1, v2, r] => FilteredJoin(beamJoin(join.jf), join.fn) + case join: Joiner.MappedJoin[K, v1, v2, r, r1] => MappedJoin(beamJoin(join.jf), join.fn) + case join: Joiner.FlatMappedJoin[K, v1, v2, r, r1] => FlatMappedJoin(beamJoin(join.jf), join.fn) + case join: Joiner.MappedGroupJoin[K, v1, v2, r, r1] => + MappedGroupJoin(beamJoin(join.jf), beamMapGroupJoin(join.fn)) + case join: Joiner.JoinFromHashJoin[K, v1, v2, r] => JoinFromHashJoin(beamHashJoin(join.hj)) + case join => ArbitraryJoin(join) + } + + def beamMultiJoin[A, B](m: typed.MultiJoinFunction[A, B]): MultiJoinFunction[A, B] = + m match { + case _: typed.MultiJoinFunction.Casting[A, B] => MultiJoinFunction.Casting[A, B]() + case join: typed.MultiJoinFunction.PairCachedRight[A, x, y, B] => + MultiJoinFunction + .PairCachedRight[A, x, y, B](beamMultiJoin(join.left), beamMultiJoin(join.right), beamJoin(join.fn)) + case join: typed.MultiJoinFunction.Pair[A, x, y, B] => + MultiJoinFunction + .Pair[A, x, y, B](beamMultiJoin(join.left), beamMultiJoin(join.right), beamJoin(join.fn)) + case join: typed.MultiJoinFunction.MapGroup[A, x, B] => + MultiJoinFunction.MapGroup[A, x, B](beamMultiJoin(join.input), beamMapGroupJoin(join.mapGroupFn)) + case join: typed.MultiJoinFunction.MapCast[A, x, B] => + MultiJoinFunction.MapCast[A, x, B](beamMapGroupJoin(join.mapGroupFn)) + } + + sealed abstract class MultiJoinFunction[A, +B] extends Serializable { + def inputSize: Int + def apply(key: A, streams: Seq[Iterable[Any]]): Iterable[B] + } + + object MultiJoinFunction extends Serializable { + final case class Casting[A, B]() extends MultiJoinFunction[A, B] { + override def inputSize: Int = 1 + override def apply(key: A, streams: Seq[Iterable[Any]]): Iterable[B] = { + require(streams.size == 1, "this join function should never be called with multiple streams") + streams.head.asInstanceOf[Iterable[B]] + } + } + + final case class 
PairCachedRight[K, A, B, C]( + left: MultiJoinFunction[K, A], + right: MultiJoinFunction[K, B], + @transient fn: (K, Iterable[A], Iterable[B]) => Iterable[C] + ) extends MultiJoinFunction[K, C] { + + private[this] val fnEx = Externalizer(fn) + + override val inputSize: Int = left.inputSize + right.inputSize + + def apply(key: K, streams: Seq[Iterable[Any]]): Iterable[C] = { + /* + * This require is just an extra check (which should never possibly fail unless we have a programming bug) + * that the number of streams we are joining matches the total joining operation we have. + * + * Since we have one stream in leftMost, the others should be in rightStreams. + * + * This check is cheap compared with the whole join, so we put this here to aid in checking + * correctness due to the weak types that MultiJoinFunction has (non-static size of Seq and + * the use of Any) + */ + require(streams.size == inputSize, s"expected $inputSize inputSize, found ${streams.size}") + val (leftSeq, rightSeq) = streams.splitAt(left.inputSize) + val joinedLeft = left(key, leftSeq) + + // we should materialize the final right one time: + val joinedRight = right(key, rightSeq).toList + fnEx.get(key, joinedLeft, joinedRight) + } + } + + final case class Pair[K, A, B, C]( + left: MultiJoinFunction[K, A], + right: MultiJoinFunction[K, B], + @transient fn: (K, Iterable[A], Iterable[B]) => Iterable[C] + ) extends MultiJoinFunction[K, C] { + + private[this] val fnEx = Externalizer(fn) + + override val inputSize: Int = left.inputSize + right.inputSize + + def apply(key: K, streams: Seq[Iterable[Any]]): Iterable[C] = { + /* + * This require is just an extra check (which should never possibly fail unless we have a programming bug) + * that the number of streams we are joining matches the total joining operation we have. + * + * Since we have one stream in leftMost, the others should be in rightStreams. 
+ * + * This check is cheap compared with the whole join, so we put this here to aid in checking + * correctness due to the weak types that MultiJoinFunction has (non-static size of Seq and + * the use of Any) + */ + require(streams.size == inputSize, s"expected $inputSize inputSize, found ${streams.size}") + val (leftSeq, rightSeq) = streams.splitAt(left.inputSize) + val joinedLeft = left(key, leftSeq) + + // TODO: it might make sense to cache this in memory as an IndexedSeq and not + // recompute it on every value for the left if the smallerJf is non-trivial + // we could see how long it is, and possible switch to a cached version the + // second time through if it is small enough + val joinedRight = right(key, rightSeq) + + fnEx.get(key, joinedLeft, joinedRight) + } + } + + /** + * This is used to implement mapGroup on already joined streams + */ + final case class MapGroup[K, A, B]( + input: MultiJoinFunction[K, A], + @transient mapGroupFn: (K, Iterable[A]) => Iterable[B] + ) extends MultiJoinFunction[K, B] { + + private[this] val fnEx = Externalizer(mapGroupFn) + + def inputSize = input.inputSize + + def apply(key: K, streams: Seq[Iterable[Any]]): Iterable[B] = { + val joined = input(key, streams) + fnEx.get(key, joined) + } + } + + /** + * This is used to join IteratorMappedReduce with others. We could compose Casting[A] with MapGroup[K, A, + * B] but since it is common enough we give it its own case. 
+ */ + final case class MapCast[K, A, B](@transient mapGroupFn: (K, Iterable[A]) => Iterable[B]) + extends MultiJoinFunction[K, B] { + + private[this] val fnEx = Externalizer(mapGroupFn) + + def inputSize = 1 + + def apply(key: K, streams: Seq[Iterable[Any]]): Iterable[B] = { + require(streams.size == 1, "this join function should never be called with multiple streams") + fnEx.get(key, streams.head.asInstanceOf[Iterable[A]]) + } + } + } + + def asOuter[U](it: Iterable[U]): Iterable[Option[U]] = + if (it.isEmpty) Iterable(None) + else it.map(Some(_)) + + /** + * Optimizers want to match on the kinds of joins we are doing. This gives them that ability + */ + sealed abstract class HashJoinFunction[-K, -V, -U, +R] extends Function3[K, V, Iterable[U], Iterable[R]] + + final case class HashInner[K, V, U]() extends HashJoinFunction[K, V, U, (V, U)] { + def apply(k: K, v: V, u: Iterable[U]) = u.map((v, _)) + } + final case class HashLeft[K, V, U]() extends HashJoinFunction[K, V, U, (V, Option[U])] { + def apply(k: K, v: V, u: Iterable[U]) = asOuter(u).map((v, _)) + } + final case class FilteredHashJoin[K, V1, V2, R](jf: HashJoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) + extends HashJoinFunction[K, V1, V2, R] { + def apply(k: K, left: V1, right: Iterable[V2]) = + jf.apply(k, left, right).filter(r => fn((k, r))) + } + final case class MappedHashJoin[K, V1, V2, R, R1](jf: HashJoinFunction[K, V1, V2, R], fn: R => R1) + extends HashJoinFunction[K, V1, V2, R1] { + def apply(k: K, left: V1, right: Iterable[V2]) = + jf.apply(k, left, right).map(fn) + } + final case class FlatMappedHashJoin[K, V1, V2, R, R1]( + jf: HashJoinFunction[K, V1, V2, R], + fn: R => TraversableOnce[R1] + ) extends HashJoinFunction[K, V1, V2, R1] { + def apply(k: K, left: V1, right: Iterable[V2]) = + jf.apply(k, left, right).flatMap(fn) + } + final case class ArbitraryHashJoin[K, V1, V2, R]( + hj: (K, V1, Iterable[V2]) => Iterator[R] + ) extends HashJoinFunction[K, V1, V2, R] { + def apply(k: K, 
left: V1, right: Iterable[V2]): Iterable[R] = + new Iterable[R] { + def iterator: Iterator[R] = hj.apply(k, left, right) + } + } + + /** + * As opposed to Scalding's JoinFunction, in Beam we make 'right' be the one iterated once and 'left' many + * times It replaces all uses of Iterator with Iterable since Beam can always provide Iterables. + */ + sealed abstract class JoinFunction[-K, -V1, -V2, +R] + extends ((K, Iterable[V1], Iterable[V2]) => Iterable[R]) + + final case class InnerJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (V1, V2)] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[(V1, V2)] = + right.flatMap(v2 => left.map((_, v2))) + } + final case class LeftJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (V1, Option[V2])] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[(V1, Option[V2])] = + asOuter(right).flatMap(v2 => left.map((_, v2))) + } + final case class RightJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (Option[V1], V2)] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[(Option[V1], V2)] = + right.flatMap(v2 => asOuter(left).map((_, v2))) + } + final case class OuterJoin[K, V1, V2]() extends JoinFunction[K, V1, V2, (Option[V1], Option[V2])] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[(Option[V1], Option[V2])] = + if (left.isEmpty && right.isEmpty) Iterable.empty + else asOuter(right).flatMap(v2 => asOuter(left).map((_, v2))) + } + final case class FilteredJoin[K, V1, V2, R](jf: JoinFunction[K, V1, V2, R], fn: ((K, R)) => Boolean) + extends JoinFunction[K, V1, V2, R] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[R] = + jf.apply(k, left, right).filter(r => fn((k, r))) + } + final case class MappedJoin[K, V1, V2, R, R1](jf: JoinFunction[K, V1, V2, R], fn: R => R1) + extends JoinFunction[K, V1, V2, R1] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[R1] = + jf.apply(k, left, right).map(fn) + } + final case 
class FlatMappedJoin[K, V1, V2, R, R1]( + jf: JoinFunction[K, V1, V2, R], + fn: R => TraversableOnce[R1] + ) extends JoinFunction[K, V1, V2, R1] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[R1] = + jf.apply(k, left, right).flatMap(fn) + } + final case class MappedGroupJoin[K, V1, V2, R, R1]( + jf: JoinFunction[K, V1, V2, R], + fn: (K, Iterable[R]) => Iterable[R1] + ) extends JoinFunction[K, V1, V2, R1] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[R1] = { + val iterr = jf.apply(k, left, right) + if (iterr.isEmpty) Iterable.empty // mapGroup operates on non-empty groups + else fn(k, iterr) + } + } + final case class JoinFromHashJoin[K, V1, V2, R](hj: (K, V1, Iterable[V2]) => Iterable[R]) + extends JoinFunction[K, V1, V2, R] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[R] = + left.flatMap(hj(k, _, right)) + } + + final case class ArbitraryJoin[K, V1, V2, R](fn: (K, Iterator[V1], Iterable[V2]) => Iterator[R]) + extends JoinFunction[K, V1, V2, R] { + def apply(k: K, left: Iterable[V1], right: Iterable[V2]): Iterable[R] = + new Iterable[R] { + def iterator: Iterator[R] = fn(k, left.iterator, right) + } + } + +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala new file mode 100644 index 0000000000..126575ea5a --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamMode.scala @@ -0,0 +1,169 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.scalding.Execution.Writer +import com.twitter.scalding.typed.{Input, Output, Resolver} +import com.twitter.scalding.{Config, Mode, TextLine} +import java.io.{EOFException, InputStream} +import java.nio.channels.{Channels, WritableByteChannel} +import org.apache.beam.sdk.Pipeline +import org.apache.beam.sdk.coders.Coder +import org.apache.beam.sdk.io.fs.MatchResult +import 
org.apache.beam.sdk.io.{FileIO, FileSystems, TextIO} +import org.apache.beam.sdk.options.PipelineOptions +import org.apache.beam.sdk.transforms.DoFn.ProcessElement +import org.apache.beam.sdk.transforms.{Create, DoFn, ParDo} +import org.apache.beam.sdk.values.PCollection + +case class BeamMode( + pipelineOptions: PipelineOptions, + sources: Resolver[Input, BeamSource], + sink: Resolver[Output, BeamSink] +) extends Mode { + def newWriter(): Writer = new BeamWriter(this) +} + +object BeamMode { + def empty(pipelineOptions: PipelineOptions): BeamMode = + BeamMode(pipelineOptions, Resolver.empty, Resolver.empty) + def default(pipelineOptions: PipelineOptions): BeamMode = + BeamMode(pipelineOptions, BeamSource.Default, BeamSink.Default) +} + +trait BeamSource[+A] extends Serializable { + def read(pipeline: Pipeline, config: Config): PCollection[_ <: A] +} + +object BeamSource extends Serializable { + val Default: Resolver[Input, BeamSource] = + new Resolver[Input, BeamSource] { + def apply[A](source: Input[A]): Option[BeamSource[A]] = + source match { + case tl: TextLine => + tl.localPaths match { + case path :: Nil => Some(textLine(path)) + case _ => throw new Exception("Can not accept multiple paths to BeamSource") + } + case TempSource(path, coder) => Some(new BeamTempFileSource(coder, path)) + case _ => None + } + } + + def textLine(path: String): BeamSource[String] = + new BeamSource[String] { + override def read(pipeline: Pipeline, config: Config): PCollection[_ <: String] = + pipeline.apply(TextIO.read().from(path)) + } +} + +trait BeamSink[-A] extends Serializable { + def write(pc: PCollection[_ <: A], config: Config): Unit +} + +object BeamSink extends Serializable { + val Default: Resolver[Output, BeamSink] = + new Resolver[Output, BeamSink] { + def apply[A](sink: Output[A]): Option[BeamSink[A]] = + sink match { + case tl: TextLine => + tl.localPaths match { + case path :: Nil => Some(textLine(path).asInstanceOf[BeamSink[A]]) + case _ => throw new 
Exception("Can not accept multiple paths to BeamSink") + } + case _ => None + } + } + + def textLine(path: String): BeamSink[String] = + new BeamSink[String] { + override def write(pc: PCollection[_ <: String], config: Config): Unit = { + val stringPCollection: PCollection[String] = BeamFunctions.widenPCollection(pc) + stringPCollection.apply(TextIO.write().to(path)) + } + } +} + +class BeamTempFileSink[T](output: String) extends BeamSink[T] { + override def write( + pc: PCollection[_ <: T], + config: Config + ): Unit = { + val pColT: PCollection[T] = BeamFunctions.widenPCollection(pc) + + pColT.apply( + FileIO + .write() + .via(new CoderFileSink(pColT.getCoder)) + .to(output) + ) + } +} + +class BeamTempFileSource[T](coder: Coder[T], output: String) extends BeamSource[T] { + override def read( + pipeline: Pipeline, + config: Config + ): PCollection[_ <: T] = + pipeline + .apply(Create.of(s"$output/*")) + .apply(FileIO.matchAll()) + .apply(ParDo.of(new TempSourceDoFn[T](coder))) + .setCoder(coder) +} + +case class TempSourceDoFn[T](coder: Coder[T]) extends DoFn[MatchResult.Metadata, T] { + @ProcessElement + def processElement(c: DoFn[MatchResult.Metadata, T]#ProcessContext): Unit = { + // We do not split the files produced in the previous stage and use a single thread per file + val stream = Channels.newInputStream(FileSystems.open(c.element().resourceId())) + val it = InputStreamIterator.closingIterator(stream, coder) + while (it.hasNext) c.output(it.next()) + } +} + +class CoderFileSink[T](coder: Coder[T]) extends FileIO.Sink[T] { + private var outputStream: java.io.OutputStream = _ + + override def open(channel: WritableByteChannel): Unit = + outputStream = Channels.newOutputStream(channel) + + override def write(element: T): Unit = coder.encode(element, outputStream) + override def flush(): Unit = outputStream.flush() +} + +class InputStreamIterator[T](stream: InputStream, coder: Coder[T]) extends Iterator[T] { + var hasNextRecord: Boolean = _ + var nextRecord: 
T = _ + + fetchNext() + override def hasNext: Boolean = hasNextRecord + + override def next(): T = { + val recordToReturn = nextRecord + fetchNext() + recordToReturn + } + + private def fetchNext(): Unit = + try { + nextRecord = coder.decode(stream) + hasNextRecord = true + } catch { + case _: EOFException => + hasNextRecord = false + } +} + +object InputStreamIterator { + // an empty Iterator that closes an InputStream when it is iterated + def closingIterator[T](stream: InputStream, coder: Coder[T]): Iterator[T] = { + def closeIt(is: InputStream): Iterator[T] = + new Iterator[T] { + def hasNext: Boolean = { + is.close() + false + } + def next = Iterator.empty.next + } + new InputStreamIterator[T](stream, coder) ++ closeIt(stream) + } +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala new file mode 100644 index 0000000000..32cc6fad3a --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamOp.scala @@ -0,0 +1,436 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.scalding.dagon.Memoize +import com.twitter.algebird.Semigroup +import com.twitter.scalding.Config +import com.twitter.scalding.beam_backend.BeamFunctions._ +import com.twitter.scalding.beam_backend.BeamJoiner.MultiJoinFunction +import com.twitter.scalding.serialization.Externalizer +import com.twitter.scalding.typed.functions.ComposedFunctions.ComposedMapGroup +import com.twitter.scalding.typed.functions.{EmptyGuard, MapValueStream, ScaldingPriorityQueueMonoid, SumAll} +import com.twitter.scalding.typed.{CoGrouped, Input} +import java.util.{Comparator, PriorityQueue} +import org.apache.beam.sdk.Pipeline +import org.apache.beam.sdk.coders.{Coder, IterableCoder, KvCoder} +import org.apache.beam.sdk.transforms.DoFn.ProcessElement +import org.apache.beam.sdk.transforms.Top.TopCombineFn +import org.apache.beam.sdk.transforms._ +import 
org.apache.beam.sdk.transforms.join.{CoGbkResult, CoGroupByKey, KeyedPCollectionTuple} +import org.apache.beam.sdk.values.PCollectionList +import org.apache.beam.sdk.values.PCollectionTuple +import org.apache.beam.sdk.values.{KV, PCollection, TupleTag} +import scala.collection.JavaConverters._ +import scala.reflect.ClassTag + +sealed abstract class BeamOp[+A] { + + import BeamOp.TransformBeamOp + + protected lazy val cachedRun = Memoize.function[Pipeline, PCollection[_ <: A]] { case (pipeline, _) => + runNoCache(pipeline) + } + + final def run(pipeline: Pipeline): PCollection[_ <: A] = cachedRun(pipeline) + + protected def runNoCache(p: Pipeline): PCollection[_ <: A] + + def map[B](f: A => B)(implicit kryoCoder: KryoCoder): BeamOp[B] = + parDo(MapFn(f), "map") + + def parDo[C >: A, B](f: DoFn[C, B], name: String)(implicit kryoCoder: KryoCoder): BeamOp[B] = { + val pTransform = new PTransform[PCollection[C], PCollection[B]]() { + override def expand(input: PCollection[C]): PCollection[B] = input.apply(ParDo.of(f)) + } + applyPTransform(pTransform, name) + } + + def filter(f: A => Boolean)(implicit kryoCoder: KryoCoder): BeamOp[A] = + applyPTransform(Filter.by[A, ProcessFunction[A, java.lang.Boolean]](ProcessPredicate(f)), "filter") + + def applyPTransform[C >: A, B]( + f: PTransform[PCollection[C], PCollection[B]], + name: String + )(implicit kryoCoder: KryoCoder): BeamOp[B] = + TransformBeamOp(this, f, kryoCoder, name) + + def flatMap[B](f: A => TraversableOnce[B])(implicit kryoCoder: KryoCoder): BeamOp[B] = + parDo(FlatMapFn(f), "flatMap") +} + +private final case class SerializableComparator[T](comp: Comparator[T]) extends Comparator[T] { + private[this] val extCmp = Externalizer(comp) + override def compare(o1: T, o2: T): Int = extCmp.get.compare(o1, o2) +} + +object BeamOp extends Serializable { + implicit private def fakeClassTag[A]: ClassTag[A] = ClassTag(classOf[AnyRef]).asInstanceOf[ClassTag[A]] + + def planMapGroup[K, V, U]( + pcoll: PCollection[KV[K, 
java.lang.Iterable[V]]], + reduceFn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], kryoCoder: KryoCoder): PCollection[KV[K, java.lang.Iterable[U]]] = + reduceFn match { + case ComposedMapGroup(f, g) => planMapGroup(planMapGroup(pcoll, f), g) + case EmptyGuard(MapValueStream(SumAll(pqm: ScaldingPriorityQueueMonoid[v]))) => + val vCollection = pcoll.asInstanceOf[PCollection[KV[K, java.lang.Iterable[PriorityQueue[v]]]]] + + vCollection + .apply( + MapElements.via( + new SimpleFunction[ + KV[K, java.lang.Iterable[PriorityQueue[v]]], + KV[K, java.lang.Iterable[U]] + ]() { + private final val topCombineFn = new TopCombineFn[v, SerializableComparator[v]]( + pqm.count, + SerializableComparator[v](pqm.ordering.reverse) + ) + + override def apply( + input: KV[K, java.lang.Iterable[PriorityQueue[v]]] + ): KV[K, java.lang.Iterable[U]] = { + @inline def flattenedValues: Stream[v] = + input.getValue.asScala.toStream.flatMap(_.asScala.toStream) + + val outputs: java.util.List[v] = topCombineFn.apply(flattenedValues.asJava) + // We are building the PriorityQueue back as output type U is PriorityQueue[v] + val pqs = pqm.build(outputs.asScala) + KV.of(input.getKey, Iterable(pqs.asInstanceOf[U]).asJava) + } + } + ) + ) + .setCoder(KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder))) + case EmptyGuard(MapValueStream(sa: SumAll[V])) => + pcoll + .apply(Combine.groupedValues(new SerializableBiFunction[V, V, V] { + override def apply(t: V, u: V): V = sa.sg.plus(t, u) + })) + .setCoder(KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), kryoCoder)) + .apply(MapElements.via(new SimpleFunction[KV[K, V], KV[K, java.lang.Iterable[U]]]() { + override def apply(input: KV[K, V]): KV[K, java.lang.Iterable[U]] = + KV.of(input.getKey, Seq(input.getValue.asInstanceOf[U]).toIterable.asJava) + })) + .setCoder( + KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder)) + ) + case notComposedOrSum => + val fn = 
BeamJoiner.beamMapGroupJoin(notComposedOrSum) + pcoll + .apply(ParDo.of(MapFn[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[U]]] { elem => + KV.of( + elem.getKey, + fn(elem.getKey, elem.getValue.asScala).asJava + ) + })) + .setCoder(KvCoder.of(OrderedSerializationCoder(ordK, kryoCoder), IterableCoder.of(kryoCoder))) + } + + final case class Source[A](conf: Config, original: Input[A], input: Option[BeamSource[A]]) + extends BeamOp[A] { + override def runNoCache(pipeline: Pipeline): PCollection[_ <: A] = + input match { + case None => + throw new IllegalArgumentException( + s"source $original was not connected to a beam source" + ) + case Some(src) => src.read(pipeline, conf) + } + } + + final case class FromIterable[A](iterable: Iterable[A], kryoCoder: KryoCoder) extends BeamOp[A] { + override def runNoCache(pipeline: Pipeline): PCollection[_ <: A] = + pipeline.apply(Create.of(iterable.asJava).withCoder(kryoCoder)) + } + + final case class TransformBeamOp[A, B]( + source: BeamOp[A], + f: PTransform[PCollection[A], PCollection[B]], + kryoCoder: KryoCoder, + name: String + ) extends BeamOp[B] { + override def runNoCache(pipeline: Pipeline): PCollection[B] = { + val pCollection: PCollection[A] = widenPCollection(source.run(pipeline)) + pCollection.apply(name, f).setCoder(kryoCoder) + } + } + + final case class HashJoinTransform[K, V, U, W]( + keyCoder: Coder[K], + joiner: (K, V, Iterable[U]) => Iterator[W] + )(implicit kryoCoder: KryoCoder) + extends PTransform[PCollectionTuple, PCollection[_ <: (K, W)]]("HashJoin") { + + override def expand(input: PCollectionTuple): PCollection[_ <: (K, W)] = { + val leftPCollection = input.get("left").asInstanceOf[PCollection[(K, V)]] + val rightPCollection = input.get("right").asInstanceOf[PCollection[(K, U)]] + + val rightPCollectionView = rightPCollection + .apply(TupleToKV[K, U](keyCoder, kryoCoder)) + .apply(GroupByKey.create[K, U]()) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) + .apply(View.asMap[K, 
java.lang.Iterable[U]]()) + + leftPCollection + .apply( + ParDo + .of(HashJoinFn[K, V, U, W](joiner, rightPCollectionView)) + .withSideInputs(rightPCollectionView) + ) + .setCoder(TupleCoder(keyCoder, kryoCoder)) + } + } + + final case class HashJoinOp[K, V, U, W]( + left: BeamOp[(K, V)], + right: BeamOp[(K, U)], + joiner: (K, V, Iterable[U]) => Iterator[W] + )(implicit kryoCoder: KryoCoder, ordK: Ordering[K]) + extends BeamOp[(K, W)] { + override def runNoCache(pipeline: Pipeline): PCollection[_ <: (K, W)] = { + val leftPCollection = left.run(pipeline) + val keyCoder: Coder[K] = OrderedSerializationCoder.apply(ordK, kryoCoder) + val rightPCollection: PCollection[(K, U)] = widenPCollection(right.run(pipeline)) + + val tuple: PCollectionTuple = PCollectionTuple.of[(K, _)]( + "left", + widenPCollection(leftPCollection): PCollection[(K, _)], + "right", + widenPCollection(rightPCollection): PCollection[(K, _)] + ) + + tuple.apply(HashJoinTransform(keyCoder, joiner)) + } + } + + final case class MergedBeamOp[A](first: BeamOp[A], second: BeamOp[A], tail: Seq[BeamOp[A]]) + extends BeamOp[A] { + override def runNoCache(pipeline: Pipeline): PCollection[_ <: A] = { + val collections = PCollectionList + .of(widenPCollection(first.run(pipeline)): PCollection[A]) + .and(widenPCollection(second.run(pipeline)): PCollection[A]) + .and(tail.map(op => widenPCollection(op.run(pipeline)): PCollection[A]).asJava) + + collections.apply(Flatten.pCollections[A]()) + } + } + + final case class CoGroupedTransform[K, V]( + joinFunction: MultiJoinFunction[K, V], + tupleTags: Seq[TupleTag[Any]], + keyCoder: Coder[K] + )(implicit kryoCoder: KryoCoder) + extends PTransform[PCollectionList[(K, Any)], PCollection[_ <: (K, V)]]("CoGrouped") { + + override def expand(collections: PCollectionList[(K, Any)]): PCollection[_ <: (K, V)] = { + val pcols = collections.getAll.asScala.map(_.apply(TupleToKV[K, Any](keyCoder, kryoCoder))) + + val keyedPCollectionTuple: KeyedPCollectionTuple[K] = pcols + 
.zip(tupleTags) + .foldLeft( + KeyedPCollectionTuple.empty[K](collections.getPipeline) + )((keyed, colWithTag) => keyed.and[Any](colWithTag._2, colWithTag._1)) + + keyedPCollectionTuple + .apply(CoGroupByKey.create()) + .apply(ParDo.of(new CoGroupDoFn[K, V](joinFunction, tupleTags))) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) + .apply(KVToTuple[K, V](keyCoder, kryoCoder)) + } + } + + final case class CoGroupedOp[K, V]( + cg: CoGrouped[K, V], + inputOps: Seq[BeamOp[(K, Any)]] + )(implicit kryoCoder: KryoCoder) + extends BeamOp[(K, V)] { + override def runNoCache(pipeline: Pipeline): PCollection[_ <: (K, V)] = { + val keyCoder: Coder[K] = OrderedSerializationCoder.apply(cg.keyOrdering, kryoCoder) + + val pcols = inputOps.map { inputOp => + widenPCollection(inputOp.op.run(pipeline)): PCollection[(K, Any)] + } + + val tupleTags = (1 to inputOps.size).map(idx => new TupleTag[Any](idx.toString)) + val joinFunction = BeamJoiner.beamMultiJoin(cg.joinFunction) + + PCollectionList + .of(pcols.asJava) + .apply(CoGroupedTransform(joinFunction, tupleTags, keyCoder)) + } + } + + final case class CoGroupDoFn[K, V]( + joinFunction: MultiJoinFunction[K, V], + tags: Seq[TupleTag[Any]] + ) extends DoFn[KV[K, CoGbkResult], KV[K, V]] { + @ProcessElement + def processElement(c: DoFn[KV[K, CoGbkResult], KV[K, V]]#ProcessContext): Unit = { + val key = c.element().getKey + val value = c.element().getValue + + val outputIter = joinFunction(key, tags.map(t => value.getAll(t).asScala)).iterator + + while (outputIter.hasNext) { + c.output(KV.of(key, outputIter.next())) + } + } + } + + implicit final class KVOp[K, V](val op: BeamOp[(K, V)]) extends AnyVal { + def mapGroup[U]( + reduceFn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], kryoCoder: KryoCoder): BeamOp[(K, U)] = + TransformBeamOp[(K, V), (K, U)]( + op, + new PTransform[PCollection[(K, V)], PCollection[(K, U)]]() { + override def expand(input: PCollection[(K, V)]): PCollection[(K, U)] = { + val keyCoder: Coder[K] = 
OrderedSerializationCoder(ordK, kryoCoder) + + val groupedValues = input + .apply(TupleToKV[K, V](keyCoder, kryoCoder)) + .apply(GroupByKey.create[K, V]()) + .setCoder(KvCoder.of(keyCoder, IterableCoder.of(kryoCoder))) + + planMapGroup[K, V, U](groupedValues, reduceFn) + .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[U]], KV[K, U]] { elem => + elem.getValue.asScala.map(KV.of(elem.getKey, _)) + })) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) + .apply(KVToTuple[K, U](keyCoder, kryoCoder)) + } + }, + kryoCoder, + "mapGroup" + ) + + def sortedMapGroup[U]( + reduceFn: (K, Iterator[V]) => Iterator[U] + )(implicit ordK: Ordering[K], ordV: Ordering[V], kryoCoder: KryoCoder): BeamOp[(K, U)] = + TransformBeamOp[(K, V), (K, U)]( + op, + new PTransform[PCollection[(K, V)], PCollection[(K, U)]]() { + override def expand(input: PCollection[(K, V)]): PCollection[(K, U)] = { + val keyCoder: Coder[K] = OrderedSerializationCoder(ordK, kryoCoder) + val valueCoder: Coder[V] = OrderedSerializationCoder(ordV, kryoCoder) + + val groupedSortedValues = input + .apply(TupleToKV[K, V](keyCoder, valueCoder)) + .apply(GroupByKey.create[K, V]()) + .setCoder(KvCoder.of(keyCoder, IterableCoder.of(valueCoder))) + .apply(SortGroupedValues[K, V]) + + planMapGroup[K, V, U](groupedSortedValues, reduceFn) + .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[U]], KV[K, U]] { elem => + elem.getValue.asScala.map(KV.of(elem.getKey, _)) + })) + .setCoder(KvCoder.of(keyCoder, kryoCoder)) + .apply(KVToTuple[K, U](keyCoder, kryoCoder)) + } + }, + kryoCoder, + "sortedMapGroup" + ) + + def sorted(implicit + ordK: Ordering[K], + ordV: Ordering[V], + kryoCoder: KryoCoder + ): BeamOp[(K, V)] = + TransformBeamOp[(K, V), (K, V)]( + op, + new PTransform[PCollection[(K, V)], PCollection[(K, V)]]() { + override def expand(input: PCollection[(K, V)]): PCollection[(K, V)] = { + val keyCoder: Coder[K] = OrderedSerializationCoder(ordK, kryoCoder) + val valueCoder: Coder[V] = OrderedSerializationCoder(ordV, kryoCoder) + 
input + .apply(TupleToKV[K, V](keyCoder, valueCoder)) + .apply(GroupByKey.create[K, V]()) + .setCoder(KvCoder.of(keyCoder, IterableCoder.of(valueCoder))) + .apply(SortGroupedValues[K, V]) + .apply(ParDo.of(FlatMapFn[KV[K, java.lang.Iterable[V]], KV[K, V]] { elem => + elem.getValue.asScala.map(x => KV.of(elem.getKey, x)) + })) + .setCoder(KvCoder.of(keyCoder, valueCoder)) + .apply(KVToTuple[K, V](keyCoder, valueCoder)) + } + }, + kryoCoder, + "sorted" + ) + + def mapSideAggregator( + size: Int, + semigroup: Semigroup[V] + )(implicit kryoCoder: KryoCoder): BeamOp[(K, V)] = + TransformBeamOp[(K, V), (K, V)]( + op, + new PTransform[PCollection[(K, V)], PCollection[(K, V)]]() { + override def expand(input: PCollection[(K, V)]): PCollection[(K, V)] = + input.apply(ParDo.of(MapSideAggregator[K, V](size, semigroup))).setCoder(kryoCoder) + }, + kryoCoder, + "mapSideAggregator" + ) + + def hashJoin[U, W]( + right: BeamOp[(K, U)], + fn: (K, V, Iterable[U]) => Iterator[W] + )(implicit kryoCoder: KryoCoder, ord: Ordering[K]): BeamOp[(K, W)] = + HashJoinOp(op, right, fn) + } + + /** + * @todo + * this needs to be changed to some external sorter, current Beam external sorter implementation does not + * provide an option to sort with custom Ordering + * @see + * [[org.apache.beam.sdk.extensions.sorter.ExternalSorter]] + */ + final case class SortGroupedValues[K, V](implicit + ordK: Ordering[K], + ordV: Ordering[V], + kryoCoder: KryoCoder + ) extends PTransform[PCollection[KV[K, java.lang.Iterable[V]]], PCollection[KV[K, java.lang.Iterable[V]]]]( + "SortGroupedValues" + ) { + override def expand( + input: PCollection[KV[K, java.lang.Iterable[V]]] + ): PCollection[KV[K, java.lang.Iterable[V]]] = + input + .apply(ParDo.of(MapFn[KV[K, java.lang.Iterable[V]], KV[K, java.lang.Iterable[V]]] { elem => + KV.of(elem.getKey, elem.getValue.asScala.toArray.sorted.toIterable.asJava) + })) + .setCoder( + KvCoder.of( + OrderedSerializationCoder(ordK, kryoCoder), + 
IterableCoder.of(OrderedSerializationCoder(ordV, kryoCoder)) + ) + ) + } + + final case class TupleToKV[K, V]( + kCoder: Coder[K], + vCoder: Coder[V] + ) extends PTransform[PCollection[(K, V)], PCollection[KV[K, V]]]("TupleToKV") { + override def expand(input: PCollection[(K, V)]): PCollection[KV[K, V]] = + input + .apply(MapElements.via[(K, V), KV[K, V]](new SimpleFunction[(K, V), KV[K, V]]() { + override def apply(input: (K, V)): KV[K, V] = KV.of(input._1, input._2) + })) + .setCoder(KvCoder.of(kCoder, vCoder)) + } + + final case class KVToTuple[K, V]( + coderK: Coder[K], + coderV: Coder[V] + ) extends PTransform[PCollection[KV[K, V]], PCollection[(K, V)]]("KVToTuple") { + override def expand(input: PCollection[KV[K, V]]): PCollection[(K, V)] = + input + .apply(MapElements.via[KV[K, V], (K, V)](new SimpleFunction[KV[K, V], (K, V)]() { + override def apply(input: KV[K, V]): (K, V) = (input.getKey, input.getValue) + })) + .setCoder(TupleCoder(coderK, coderV)) + } + +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala new file mode 100644 index 0000000000..20adbe0407 --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/BeamWriter.scala @@ -0,0 +1,135 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.scalding.dagon.Rule +import com.twitter.scalding.Execution.{ToWrite, Writer} +import com.twitter.scalding.typed._ +import com.twitter.scalding.{CFuture, CancellationHandler, Config, Execution, ExecutionCounters} +import java.nio.channels.Channels +import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.atomic.AtomicLong +import org.apache.beam.sdk.Pipeline +import org.apache.beam.sdk.coders.Coder +import org.apache.beam.sdk.io.FileSystems +import scala.annotation.tailrec +import scala.collection.convert.decorateAsScala._ +import scala.concurrent.{ExecutionContext, Future} +import 
scala.concurrent.blocking +import scala.collection.JavaConversions._ + +case class TempSource[A](path: String, coder: Coder[A]) extends Input[A] + +class BeamWriter(val beamMode: BeamMode) extends Writer { + private val state = new AtomicLong() + + private val sourceCounter: AtomicLong = new AtomicLong(0L) + + val tempSources: scala.collection.concurrent.Map[TypedPipe[_], TempSource[_]] = + new ConcurrentHashMap[TypedPipe[_], TempSource[_]]().asScala + + override def start(): Unit = () + + override def finished(): Unit = { + // `FileSystems.delete` fails to delete dir as it contains files, hence we delete the files in the dir + // There is a temp subdir with name starting with "." which is not matched by `match`. + // So currently a single empty dir is left behind. + val resources = tempSources.values + .map(ts => s"${ts.path}*") + .flatMap(path => FileSystems.`match`(path).metadata().map(_.resourceId())) + FileSystems.delete(resources.toList) + } + + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ExecutionContext + ): Future[TypedPipe[T]] = + tempSources.get(initial) match { + case Some(source) => Future.successful(TypedPipe.from(source).asInstanceOf[TypedPipe[T]]) + case None => Future.failed(new IllegalStateException(s"TypedPipe = $initial, has not yet been forced")) + } + + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ExecutionContext + ): Future[Iterable[T]] = + tempSources.get(initial) match { + case Some(TempSource(path, coder)) => + val c: Coder[T] = coder.asInstanceOf[Coder[T]] + Future { + blocking { + // Single dir by default just matches the dir, we need to match files inside + val matchedResources = FileSystems.`match`(s"$path*").metadata().asScala + val records = matchedResources.iterator.flatMap { resource => + val is = Channels.newInputStream(FileSystems.open(resource.resourceId())) + InputStreamIterator.closingIterator(is, c) + }.toList + + new Iterable[T] { + override def iterator: Iterator[T] = 
records.toIterator + } + } + } + case _ => Future.failed(new IllegalStateException(s"TypedPipe = $initial has no existing Iterable")) + } + + override def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { + import Execution.ToWrite._ + val planner = BeamPlanner.plan(conf, beamMode.sources) + val phases: Seq[Rule[TypedPipe]] = BeamPlanner.defaultOptimizationRules(conf) + val optimizedWrites = ToWrite.optimizeWriteBatch(writes, phases) + val pipeline = Pipeline.create(beamMode.pipelineOptions) + + @tailrec + def rec(optimizedWrites: List[OptimizedWrite[TypedPipe, _]]): Unit = + optimizedWrites match { + case Nil => () + case x :: xs => + x match { + case OptimizedWrite(pipe, ToWrite.SimpleWrite(opt, sink)) => { + val pcoll = planner(opt).run(pipeline) + beamMode.sink(sink) match { + case Some(ssink) => + ssink.write(pcoll, conf) + case _ => throw new Exception(s"unknown sink: $sink when writing $pipe") + } + rec(xs) + } + case OptimizedWrite(pipe, toWrite @ (ToIterable(_) | Force(_))) if !tempSources.contains(pipe) => + val opt = toWrite match { + case ToIterable(o) => o + case Force(o) => o + } + val pcoll = planner(opt).run(pipeline) + val tempLocation = pcoll.getPipeline.getOptions.getTempLocation + require(tempLocation != null, "Temp location cannot be null when using toIterableExecution") + + val outputPath = BeamWriter.addPaths(tempLocation, sourceCounter.getAndIncrement().toString) + // Here we add a sink transformation on the PCollection. + // This does not run till the final `pipeline.run` step + new BeamTempFileSink(outputPath).write(pcoll, conf) + tempSources += ((pipe, TempSource(outputPath, pcoll.getCoder))) + + // we know that tempSources.contains(pipe) on this branch, which means it was already computed. 
+ case OptimizedWrite(_, ToIterable(_) | Force(_)) => () + } + } + rec(optimizedWrites) + val result = pipeline.run + val runId = state.getAndIncrement() + CFuture( + Future { + result.waitUntilFinish() + (runId, ExecutionCounters.empty) + }, + CancellationHandler.fromFn { ec => + Future { result.cancel(); () }(ec) + } + ) + } +} + +object BeamWriter { + // This is manually done because java.nio.File.Paths & java.io.File convert "gs://" to "gs:/" + def addPaths(basePath: String, dir: String): String = + if (basePath.endsWith("/")) s"$basePath$dir/" + else s"$basePath/$dir/" +} diff --git a/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala new file mode 100644 index 0000000000..527376d3b4 --- /dev/null +++ b/scalding-beam/src/main/scala/com/twitter/scalding/beam_backend/KryoCoder.scala @@ -0,0 +1,55 @@ +package com.twitter.scalding.beam_backend + +import com.esotericsoftware.kryo.io.Input +import com.twitter.chill.{KryoInstantiator, KryoPool} +import com.twitter.scalding.serialization.JavaStreamEnrichments.{RichInputStream, RichOutputStream} +import com.twitter.scalding.serialization.OrderedSerialization +import java.io.{InputStream, OutputStream} +import org.apache.beam.sdk.coders.{AtomicCoder, Coder} +import scala.language.implicitConversions + +final class KryoCoder(kryoInstantiator: KryoInstantiator) extends AtomicCoder[Any] { + @transient private[this] lazy val kryoPool: KryoPool = + KryoPool.withByteArrayOutputStream(Runtime.getRuntime.availableProcessors, kryoInstantiator) + + override def encode(value: Any, os: OutputStream): Unit = { + val bytes = kryoPool.toBytesWithClass(value) + os.writePosVarInt(bytes.length) + os.write(bytes) + os.flush() + } + + override def decode(is: InputStream): Any = { + val size = is.readPosVarInt + val input = new Input(is, size) + kryoPool.fromBytes(input.readBytes(size)) + } +} + +object KryoCoder { + implicit def 
castType[T](kryoCoder: KryoCoder): AtomicCoder[T] = + kryoCoder.asInstanceOf[AtomicCoder[T]] +} + +case class OrderedSerializationCoder[T](ordSer: OrderedSerialization[T]) extends AtomicCoder[T] { + override def encode(value: T, outStream: OutputStream): Unit = ordSer.write(outStream, value) + override def decode(inStream: InputStream): T = ordSer.read(inStream).get +} + +object OrderedSerializationCoder { + def apply[T](ord: Ordering[T], fallback: Coder[T]): Coder[T] = + ord match { + case ordSer: OrderedSerialization[T] @unchecked => OrderedSerializationCoder(ordSer) + case _ => fallback + } +} + +case class TupleCoder[K, V](coderK: Coder[K], coderV: Coder[V]) extends AtomicCoder[(K, V)] { + override def encode(value: (K, V), outStream: OutputStream): Unit = { + coderK.encode(value._1, outStream) + coderV.encode(value._2, outStream) + } + + override def decode(inStream: InputStream): (K, V) = + (coderK.decode(inStream), coderV.decode(inStream)) +} diff --git a/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala b/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala new file mode 100644 index 0000000000..c7df081042 --- /dev/null +++ b/scalding-beam/src/test/scala/com/twitter/scalding/beam_backend/BeamBackendTests.scala @@ -0,0 +1,491 @@ +package com.twitter.scalding.beam_backend + +import com.twitter.algebird.{AveragedValue, Semigroup} +import com.twitter.scalding.beam_backend.BeamOp.{CoGroupedOp, FromIterable, HashJoinOp, MergedBeamOp} +import com.twitter.scalding.{Config, Execution, TextLine, TypedPipe} +import java.io.File +import java.nio.file.Paths +import org.apache.beam.sdk.Pipeline +import org.apache.beam.sdk.options.{PipelineOptions, PipelineOptionsFactory} +import org.scalatest.{BeforeAndAfter, FunSuite} +import scala.io.Source + +class BeamBackendTests extends FunSuite with BeforeAndAfter { + + private var pipelineOptions: PipelineOptions = _ + private var testPath: String = _ + + def 
beamMatchesSeq[A](t: TypedPipe[A], expectedResult: Seq[A], config: Config = Config.empty) = { + val bmode = BeamMode.default(pipelineOptions) + val outRoute = tmpPath("out") + t.map(_.toString).writeExecution(TextLine(outRoute)).waitFor(config, bmode).get + val result = getContents(testPath, outRoute).sorted + assert(result == expectedResult.map(_.toString).sorted) + } + + def beamUnoptimizedPlan[A](t: TypedPipe[A], config: Config = Config.empty): (Pipeline, BeamOp[A]) = { + val bmode = BeamMode.default(pipelineOptions) + val planner = BeamPlanner.plan(config, bmode.sources) + val pipeline = Pipeline.create(bmode.pipelineOptions) + (pipeline, planner(t)) + } + + before { + testPath = Paths.get(System.getProperty("java.io.tmpdir"), "scalding", "beam_backend").toString + pipelineOptions = PipelineOptionsFactory.create() + pipelineOptions.setTempLocation(testPath) + } + + after { + removeDir(testPath) + } + + def tmpPath(suffix: String): String = + Paths.get(testPath, suffix).toString + + test("BeamOp caching: FromIterable") { + val a = TypedPipe.from(0 to 5) + + val (pipeline, op) = beamUnoptimizedPlan(a) + + assert(op.isInstanceOf[FromIterable[Int]]) + assert(op.run(pipeline) eq op.run(pipeline)) + } + + test("BeamOp caching: CoGroupedOp") { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + + val (pipeline, op) = beamUnoptimizedPlan(leftPipe.join(rightPipe)) + + assert(op.isInstanceOf[CoGroupedOp[Int, Int]]) + assert(op.run(pipeline) eq op.run(pipeline)) + } + + test("BeamOp caching: HashJoinOp") { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + + val (pipeline, op) = beamUnoptimizedPlan(leftPipe.hashJoin(rightPipe)) + + assert(op.isInstanceOf[HashJoinOp[Int, Int, Int, Int]]) + 
assert(op.run(pipeline) eq op.run(pipeline)) + } + + test("BeamOp caching: MergedBeamOp") { + val a = TypedPipe.from(0 to 5) + val b = TypedPipe.from(6 to 10) + + val (pipeline, op) = beamUnoptimizedPlan(a ++ b) + + assert(op.isInstanceOf[MergedBeamOp[Int]]) + assert(op.run(pipeline) eq op.run(pipeline)) + } + + test("BeamOp caching: Source") { + val source = TypedPipe.from(TextLine("/")) + + val (pipeline, op) = beamUnoptimizedPlan(source) + + assert(op.isInstanceOf[BeamOp.Source[String]]) + assert(op.run(pipeline) eq op.run(pipeline)) + } + + test("BeamOp caching: TransformBeamOp") { + val pipe = TypedPipe.from(0 to 5).filter(_ % 2 == 0) + + val (pipeline, op) = beamUnoptimizedPlan(pipe) + + assert(op.isInstanceOf[BeamOp.TransformBeamOp[Int, Int]]) + assert(op.run(pipeline) eq op.run(pipeline)) + } + + test("map") { + beamMatchesSeq( + TypedPipe.from(0 to 5).map(_ * 2), + Seq(0, 2, 4, 6, 8, 10) + ) + } + + test("flatMap") { + beamMatchesSeq( + TypedPipe.from(0 to 3).flatMap(x => 0 to x), + Seq(0, 0, 1, 0, 1, 2, 0, 1, 2, 3) + ) + } + + test("mapValues") { + beamMatchesSeq( + TypedPipe.from(0 to 3).map(x => (x, x)).mapValues(_ * 2), + Seq((0, 0), (1, 2), (2, 4), (3, 6)) + ) + } + + test("flatMapValues") { + beamMatchesSeq( + TypedPipe.from(0 to 2).map(x => (x, x)).flatMapValues(x => 0 to x), + Seq((0, 0), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2)) + ) + } + + test("filter") { + beamMatchesSeq( + TypedPipe.from(0 to 10).filter(x => x % 2 == 0), + Seq(0, 2, 4, 6, 8, 10) + ) + } + + test("filterKeys") { + beamMatchesSeq( + TypedPipe.from(0 to 10).map(x => (x, x)).filterKeys(x => x % 2 == 1), + Seq((1, 1), (3, 3), (5, 5), (7, 7), (9, 9)) + ) + } + + test("mapGroup") { + beamMatchesSeq( + TypedPipe + .from(Seq(5, 3, 2, 0, 1, 4)) + .map(x => x.toDouble) + .groupAll + .aggregate(AveragedValue.aggregator), + Seq(((), 2.5)) + ) + } + + test("sortedMapGroup") { + beamMatchesSeq( + TypedPipe + .from(Seq(5, 3, 2, 6, 1, 4)) + .groupBy(_ % 2) + .sorted(Ordering[Int].reverse) + 
.foldLeft(0)((a, b) => a * 10 + b), + Seq((0, 642), (1, 531)) + ) + } + + test("sortedTake") { + beamMatchesSeq( + TypedPipe + .from(Seq(5, 3, 2, 0, 1, 4)) + .map(x => x.toDouble) + .groupAll + .sortedReverseTake(3) + .flatMap(_._2), + Seq(5.0, 4.0, 3.0) + ) + } + + test("bufferedTake") { + beamMatchesSeq( + TypedPipe + .from(1 to 50) + .groupAll + .bufferedTake(100) + .map(_._2), + 1 to 50, + Config(Map("cascading.aggregateby.threshold" -> "100")) + ) + } + + test("SumByLocalKeys") { + beamMatchesSeq( + TypedPipe + .from(0 to 5) + .map(x => (x, x)) + .flatMapValues(x => 0 to x) + .sumByLocalKeys(new Semigroup[Int] { + override def plus(x: Int, y: Int): Int = x + y + }), + Seq((0, 0), (1, 1), (2, 3), (3, 6), (4, 10), (5, 15)), + Config.empty.setMapSideAggregationThreshold(5) + ) + } + + test("HashJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.hashJoin(rightPipe) + }, + Seq((0, (0, 0)), (0, (0, 3)), (0, (1, 0)), (0, (1, 3))) + ) + } + + test("HashLeftJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.hashLeftJoin(rightPipe) + }, + Seq( + (0, (0, Some(0))), + (0, (0, Some(3))), + (0, (1, Some(0))), + (0, (1, Some(3))), + (1, (1, None)), + (3, (3, None)) + ) + ) + } + + test("InnerJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.join(rightPipe) + }, + Seq((0, (0, 0)), (0, (0, 3)), (0, (1, 0)), (0, (1, 3))) + ) + } + + test("LeftJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 
3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.leftJoin(rightPipe) + }, + Seq( + (0, (0, Some(0))), + (0, (0, Some(3))), + (0, (1, Some(0))), + (0, (1, Some(3))), + (1, (1, None)), + (3, (3, None)) + ) + ) + } + + test("Multiple LeftJoins") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + + val thirdPipe: TypedPipe[(Int, String)] = TypedPipe.from(Seq((0, "a"), (1, "b"))) + + leftPipe.join(rightPipe).leftJoin(thirdPipe) + }, + Seq( + (0, ((0, 0), Some("a"))), + (0, ((0, 3), Some("a"))), + (0, ((1, 0), Some("a"))), + (0, ((1, 3), Some("a"))) + ) + ) + } + + test("Multiple Joins") { + beamMatchesSeq( + { + val firstPipe: TypedPipe[(Int, Float)] = TypedPipe.from(Seq((0, 0f), (0, 1.5f), (1, 1.5f), (3, 3.5f))) + val secondPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + val thirdPipe: TypedPipe[(Int, String)] = TypedPipe.from(Seq((0, "a"), (1, "b"))) + + firstPipe.leftJoin(secondPipe).leftJoin(thirdPipe) + }, + Seq( + (0, ((0f, Some(0)), Some("a"))), + (0, ((0f, Some(3)), Some("a"))), + (0, ((1.5f, Some(0)), Some("a"))), + (0, ((1.5f, Some(3)), Some("a"))), + (1, ((1.5f, None), Some("b"))), + (3, ((3.5f, None), None)) + ) + ) + } + + test("RightJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.rightJoin(rightPipe) + }, + Seq( + (0, (Some(0), 0)), + (0, (Some(0), 3)), + (0, (Some(1), 0)), + (0, (Some(1), 3)), + (2, (None, 2)), + (2, (None, 3)) + ) + ) + } + + test("OuterJoin") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = 
TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.outerJoin(rightPipe) + }, + Seq( + (0, (Some(0), Some(0))), + (0, (Some(0), Some(3))), + (0, (Some(1), Some(0))), + (0, (Some(1), Some(3))), + (1, (Some(1), None)), + (3, (Some(3), None)), + (2, (None, Some(2))), + (2, (None, Some(3))) + ) + ) + } + + test("CoGroup") { + beamMatchesSeq( + { + val leftPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 1), (1, 1), (3, 3))) + val rightPipe: TypedPipe[(Int, Int)] = TypedPipe.from(Seq((0, 0), (0, 3), (2, 2), (2, 3))) + leftPipe.cogroup(rightPipe)((_, iter1, iter2) => Seq((iter1 ++ iter2).toSeq.sum).toIterator) + }, + Seq( + (0, 4), + (1, 1), + (2, 5), + (3, 3) + ) + ) + } + + test("Merge (++) two pipes") { + // 5 in both typed pipes + // duplicate element in both typed pipes + val a = TypedPipe.from(Seq(5, 3, 2, 6, 1, 4, 1)) + val b = TypedPipe.from(Seq(15, 13, 12, 16, 11, 14, 5, 11)) + + beamMatchesSeq( + a ++ b, + Seq(1, 1, 2, 3, 4, 5, 5, 6, 11, 11, 12, 13, 14, 15, 16) + ) + } + + test("Merge (++) many pipes") { + // 5 in all typed pipes + // duplicate element in all typed pipes + val a = TypedPipe.from(Seq(5, 3, 2, 6, 1, 4, 1)) + val b = TypedPipe.from(Seq(15, 13, 12, 16, 11, 14, 5, 11)) + val c = TypedPipe.from(Seq(25, 23, 22, 26, 21, 24, 5, 21)) + val d = TypedPipe.from(Seq(35, 33, 32, 36, 31, 34, 5, 31)) + + beamMatchesSeq( + a ++ b ++ c ++ d, + Seq(1, 1, 2, 3, 4, 5, 5, 5, 5, 6, 11, 11, 12, 13, 14, 15, 16, 21, 21, 22, 23, 24, 25, 26, 31, 31, 32, + 33, 34, 35, 36) + ) + } + + test("Merge (++) same pipe") { + val a = TypedPipe.from(Seq(5, 3, 2, 6, 1, 4)) + + beamMatchesSeq( + a ++ a ++ a ++ a, + Seq(1, 2, 3, 4, 5, 6).flatMap(x => Seq(x, x, x, x)) + ) + } + + test("Testing without force to disk") { + val bmode = BeamMode.default(pipelineOptions) + val tmpPath1 = tmpPath("tp1") + val tmpPath2 = tmpPath("tp2") + + val tmpDir = new File(tmpPath("forced")) + tmpDir.mkdirs() + + val forcedExecution = Execution.from { + TypedPipe + .from(Seq(1, 2)) + 
.map { e => + // This is called twice and hence second execution will have false values + new File(tmpDir, e.toString).createNewFile() + } + } + + val tp1 = forcedExecution.flatMap(f => f.map(_.toString).writeExecution(TextLine(tmpPath1))) + val tp2 = forcedExecution.flatMap(f => f.map(_.toString).writeExecution(TextLine(tmpPath2))) + tp1.flatMap(_ => tp2).waitFor(Config.empty, bmode) + + val result1 = getContents(testPath, tmpPath1).sorted + val result2 = getContents(testPath, tmpPath2).sorted + + assert(result1 == Seq("true", "true") && result2 == Seq("false", "false")) + } + + test("Force to Disk execution") { + val bmode = BeamMode.default(pipelineOptions) + val tmpPath1 = tmpPath("tp1") + val tmpPath2 = tmpPath("tp2") + + val tmpDir = new File(tmpPath("forced")) + tmpDir.mkdirs() + + val forcedExecution = + TypedPipe + .from(Seq(1, 2)) + .map { e => + // Since this is forced it is called only once. Hence output is always true + new File(tmpDir, e.toString).createNewFile() + } + .forceToDiskExecution + + val tp1 = forcedExecution.flatMap(f => f.map(_.toString).writeExecution(TextLine(tmpPath1))) + val tp2 = forcedExecution.flatMap(f => f.map(_.toString).writeExecution(TextLine(tmpPath2))) + tp1.flatMap(_ => tp2).waitFor(Config.empty, bmode) + + val result1 = getContents(testPath, tmpPath1).sorted + val result2 = getContents(testPath, tmpPath2).sorted + + // verify that temp dir contains no files + assert(new File(testPath, "0").listFiles.filter(_.isFile).isEmpty) + assert(result1 == Seq("true", "true") && result2 == Seq("true", "true")) + } + + test("toIterableExecutionTest1") { + val input = Seq(5, 3, 2, 6, 1, 4) + val bmode = BeamMode.default(pipelineOptions) + + val out = TypedPipe + .from(input) + .toIterableExecution + .waitFor(Config.empty, bmode) + .get + + assert(out.toSet == input.toSet) + } + + test("toIterableExecutionWithJoin") { + val bmode = BeamMode.default(pipelineOptions) + val tp1 = TypedPipe.from(1 to 10).map(x => (x, 1)) + val tp2 = 
TypedPipe.from(1 to 10).map(x => (x, 2)) + val output = tp1 + .join(tp2) + .mapValues { case (left, right) => left + right } + .filter(_._1 % 5 == 0) + .toIterableExecution + .waitFor(Config.empty, bmode) + .get + + assert(output.toSet == Seq((5, 3), (10, 3)).toSet) + } + + private def getContents(path: String, prefix: String): List[String] = + new File(path).listFiles.flatMap { file => + if (file.getPath.startsWith(prefix)) { + Source.fromFile(file).getLines().flatMap(line => line.split("\\s+").toList) + } else List.empty[String] + }.toList + + private def removeDir(path: String): Unit = { + def deleteRecursively(file: File): Unit = { + if (file.isDirectory) file.listFiles.foreach(deleteRecursively) + if (file.exists && !file.delete) + sys.error(s"Unable to delete ${file.getAbsolutePath}") + } + deleteRecursively(new File(path)) + } +} diff --git a/scalding-benchmarks/src/test/scala/com/twitter/scalding/Serialization.scala b/scalding-benchmarks/src/test/scala/com/twitter/scalding/Serialization.scala new file mode 100644 index 0000000000..de4ef75323 --- /dev/null +++ b/scalding-benchmarks/src/test/scala/com/twitter/scalding/Serialization.scala @@ -0,0 +1,306 @@ +package com.twitter.scalding.benchmarks + +import com.twitter.chill.KryoPool +import com.twitter.scalding.serialization._ +import java.io.ByteArrayInputStream +import org.scalacheck.{ Gen => scGen, Arbitrary } // We use scalacheck Gens to generate random scalameter gens. 
+import org.scalameter.api._ +import scala.collection.generic.CanBuildFrom +import scala.language.experimental.macros + +trait LowerPriorityImplicit { + implicit def ordBuf[T]: OrderedSerialization[T] = macro com.twitter.scalding.macros.impl.OrderedSerializationProviderImpl[T] +} + +object SerializationBenchmark extends PerformanceTest.Quickbenchmark with LowerPriorityImplicit { + import JavaStreamEnrichments._ + + val sizes = Gen.range("size")(300000, 1500000, 300000) + val smallSizes = Gen.range("size")(30000, 150000, 30000) + + /** + * This tends to create ascii strings + */ + def asciiStringGen: scGen[String] = scGen.parameterized { p => + val thisSize = p.rng.nextInt(p.size + 1) + scGen.const(new String(Array.fill(thisSize)(p.rng.nextInt(128).toByte))) + } + def charStringGen: scGen[String] = + scGen.listOf(scGen.choose(0.toChar, Char.MaxValue)).map(_.mkString) + + // Biases to ascii 80% of the time + def stringGen: scGen[String] = scGen.frequency((4, asciiStringGen), (1, charStringGen)) + + implicit def stringArb: Arbitrary[String] = Arbitrary(stringGen) + + def collection[T, C[_]](size: Gen[Int])(implicit arbT: Arbitrary[T], cbf: CanBuildFrom[Nothing, T, C[T]]): Gen[C[T]] = + collection[T, C](size, arbT.arbitrary)(cbf) + + def collection[T, C[_]](size: Gen[Int], item: scGen[T])(implicit cbf: CanBuildFrom[Nothing, T, C[T]]): Gen[C[T]] = + size.map { s => + val builder = cbf() + builder.sizeHint(s) + // Initialize a fixed random number generator + val rng = new scala.util.Random("scalding".hashCode) + val p = scGen.Parameters.default.withRng(rng) + + def get(attempt: Int): T = + if (attempt > 1000) sys.error("Failed to generate after 100 tries") + else { + item(p) match { + case None => get(attempt + 1) + case Some(t) => t + } + } + + (0 until s).foreach { _ => + builder += get(0) + } + builder.result() + } + + def roundTrip[T: Serialization](ts: Iterator[T]): Unit = + ts.map { t => + Serialization.fromBytes(Serialization.toBytes(t)).get + }.foreach(_ => ()) + 
+ def kryoRoundTrip[T](k: KryoPool, ts: Iterator[T]): Unit = + ts.map { t => k.fromBytes(k.toBytesWithClass(t)) } + .foreach(_ => ()) + + def toArrayOrd[T](t: OrderedSerialization[T]): Ordering[Array[Byte]] = new Ordering[Array[Byte]] { + def compare(a: Array[Byte], b: Array[Byte]) = { + t.compareBinary(new ByteArrayInputStream(a), new ByteArrayInputStream(b)).unsafeToInt + } + } + def toArrayOrd[T](k: KryoPool, ord: Ordering[T]): Ordering[Array[Byte]] = new Ordering[Array[Byte]] { + def compare(a: Array[Byte], b: Array[Byte]) = + ord.compare(k.fromBytes(a).asInstanceOf[T], + k.fromBytes(b).asInstanceOf[T]) + } + + val longArrayByte: Gen[Array[Byte]] = + collection[Byte, Array](sizes.map(s => (s / 8) * 8)) + + // This is here to make sure the compiler cannot optimize away reads + var effectInt: Int = 0 + var effectLong: Long = 0L + + performance of "Serialization" in { + measure method "JavaStreamEnrichments.readInt" in { + using(longArrayByte) in { a => + val length = a.length + val is = new ByteArrayInputStream(a) + var ints = length / 4 + while (ints > 0) { + effectInt ^= is.readInt + ints -= 1 + } + } + } + measure method "JavaStreamEnrichments.readLong" in { + using(longArrayByte) in { a => + val length = a.length + val is = new ByteArrayInputStream(a) + var longs = length / 8 + while (longs > 0) { + effectLong ^= is.readLong + longs -= 1 + } + } + } + measure method "UnsignedComparisons.unsignedLongCompare" in { + using(collection[Long, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= UnsignedComparisons.unsignedLongCompare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "normal long compare" in { + using(collection[Long, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= java.lang.Long.compare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "UnsignedComparisons.unsignedInt" in { + using(collection[Int, Array](sizes)) in { a => + val 
max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= UnsignedComparisons.unsignedIntCompare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "normal int compare" in { + using(collection[Int, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= java.lang.Integer.compare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "UnsignedComparisons.unsignedShort" in { + using(collection[Short, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= UnsignedComparisons.unsignedShortCompare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "normal short compare" in { + using(collection[Short, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= java.lang.Short.compare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "UnsignedComparisons.unsignedByte" in { + using(collection[Byte, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= UnsignedComparisons.unsignedByteCompare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "normal byte compare" in { + using(collection[Byte, Array](sizes)) in { a => + val max = a.length - 1 + var pos = 0 + while (pos < max) { + effectInt ^= java.lang.Byte.compare(a(pos), a(pos + 1)) + pos += 2 + } + } + } + measure method "typeclass: Int" in { + using(collection[Int, List](sizes)) in { l => roundTrip(l.iterator) } + } + measure method "kryo: Int" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + com.twitter.scalding.Config.default.getKryo.get) + + using(collection[Int, List](sizes)) in { l => kryoRoundTrip(kryo, l.iterator) } + } + measure method "typeclass: String" in { + using(collection[String, List](smallSizes)) in { l => roundTrip(l.iterator) } + } + measure method "kryo: String" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + 
com.twitter.scalding.Config.default.getKryo.get) + + using(collection[String, List](smallSizes)) in { l => kryoRoundTrip(kryo, l.iterator) } + } + measure method "typeclass: (Int, (Long, String))" in { + using(collection[(Int, (Long, String)), List](smallSizes)) in { l => roundTrip(l.iterator) } + } + measure method "kryo: (Int, (Long, String))" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + com.twitter.scalding.Config.default.getKryo.get) + + using(collection[(Int, (Long, String)), List](smallSizes)) in { l => kryoRoundTrip(kryo, l.iterator) } + } + measure method "typeclass: (Int, Long, Short)" in { + using(collection[(Int, Long, Short), List](smallSizes)) in { l => roundTrip(l.iterator) } + } + measure method "kryo: (Int, Long, Short)" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + com.twitter.scalding.Config.default.getKryo.get) + + using(collection[(Int, Long, Short), List](smallSizes)) in { l => kryoRoundTrip(kryo, l.iterator) } + } + measure method "sort typeclass: Int" in { + val ordSer = implicitly[OrderedSerialization[Int]] + using(collection[Int, List](smallSizes) + .map { items => + items.map { Serialization.toBytes(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(ordSer)) } + } + measure method "sort kryo: Int" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + com.twitter.scalding.Config.default.getKryo.get) + + val ord = implicitly[Ordering[Int]] + using(collection[Int, List](smallSizes) + .map { items => + items.map { kryo.toBytesWithClass(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(kryo, ord)) } + } + measure method "sort typeclass: Long" in { + val ordSer = implicitly[OrderedSerialization[Long]] + using(collection[Long, List](smallSizes) + .map { items => + items.map { Serialization.toBytes(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(ordSer)) } + } + measure method "sort kryo: Long" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + 
com.twitter.scalding.Config.default.getKryo.get) + + val ord = implicitly[Ordering[Long]] + using(collection[Long, List](smallSizes) + .map { items => + items.map { kryo.toBytesWithClass(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(kryo, ord)) } + } + measure method "sort typeclass: String" in { + val ordSer = implicitly[OrderedSerialization[String]] + using(collection[String, List](smallSizes) + .map { items => + items.map { Serialization.toBytes(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(ordSer)) } + } + measure method "sort kryo: String" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + com.twitter.scalding.Config.default.getKryo.get) + + val ord = implicitly[Ordering[String]] + using(collection[String, List](smallSizes) + .map { items => + items.map { kryo.toBytesWithClass(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(kryo, ord)) } + } + + measure method "sort typeclass: (Int, (Long, String))" in { + val ordSer = implicitly[OrderedSerialization[(Int, (Long, String))]] + using(collection[(Int, (Long, String)), List](smallSizes) + .map { items => + items.map { Serialization.toBytes(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(ordSer)) } + } + measure method "sort kryo: (Int, (Long, String))" in { + val kryo = KryoPool.withByteArrayOutputStream(1, + com.twitter.scalding.Config.default.getKryo.get) + + val ord = implicitly[Ordering[(Int, (Long, String))]] + using(collection[(Int, (Long, String)), List](smallSizes) + .map { items => + items.map { kryo.toBytesWithClass(_) }.toArray + }) in { ary => java.util.Arrays.sort(ary, toArrayOrd(kryo, ord)) } + } + + /** + * TODO: + * 1) simple case class + * 2) case class with some nesting and collections + * 3) sorting of an Array[Array[Byte]] using OrderedSerialization vs Array[T] + * 4) fastest binary sorting for strings (byte-by-byte, longs, etc...) 
+ */ + } +} diff --git a/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala b/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala new file mode 100644 index 0000000000..3c1149ab84 --- /dev/null +++ b/scalding-cats/src/main/scala/com/twitter/scalding/hellcats/HellCats.scala @@ -0,0 +1,177 @@ +package com.twitter.scalding.hellcats + +import cats.{Functor, FunctorFilter, MonoidK, Semigroupal, StackSafeMonad} +import cats.effect.{Async, Effect, ExitCase, IO, SyncIO} +import com.twitter.scalding.{Config, Execution, Mode, TypedPipe} +import com.twitter.scalding.typed.CoGroupable +import com.twitter.scalding.typed.functions.{Identity, MapOptionToFlatMap} +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} + +/** + * Instances for cats types when working with Scalding + */ +object HellCats { + implicit val instancesTypedPipe: Functor[TypedPipe] with MonoidK[TypedPipe] = + new Functor[TypedPipe] with MonoidK[TypedPipe] { + def empty[A] = TypedPipe.empty + def map[A, B](ta: TypedPipe[A])(fn: A => B) = ta.map(fn) + def combineK[A](left: TypedPipe[A], right: TypedPipe[A]) = left ++ right + // we could implement Applicative[TypedPipe], but cross is very dangerous + // on map-reduce, so I hesitate to add it at this point + } + + implicit val functorFilterTypedPipe: FunctorFilter[TypedPipe] = + new FunctorFilter[TypedPipe] { + def functor = instancesTypedPipe + def mapFilter[A, B](ta: TypedPipe[A])(fn: A => Option[B]): TypedPipe[B] = + ta.flatMap(MapOptionToFlatMap(fn)) + + override def flattenOption[A](ta: TypedPipe[Option[A]]): TypedPipe[A] = + mapFilter(ta)(Identity()) + + override def collect[A, B](ta: TypedPipe[A])(fn: PartialFunction[A, B]): TypedPipe[B] = + ta.collect(fn) + + override def filter[A](ta: TypedPipe[A])(fn: A => Boolean) = ta.filter(fn) + } + + implicit def semigroupalCoGroupable[K]: Semigroupal[({ type F[V] = CoGroupable[K, V] })#F] = + new Semigroupal[({ type F[V] = 
CoGroupable[K, V] })#F] { + def product[A, B](ca: CoGroupable[K, A], cb: CoGroupable[K, B]) = ca.join(cb) + } + + /** + * Async[Execution] includes MonadError[Throwable, Execution] and Defer[Execution] which together are the + * most commonly used typeclasses + */ + implicit val asyncExecution: Async[Execution] with StackSafeMonad[Execution] = + new AsyncExecution + + /** + * To use Execution as an Effect, which is to say, we can run it, we need the Config, Mode and + * ExecutionContext to use + */ + def executionEffect(c: Config, m: Mode)(implicit cec: ConcurrentExecutionContext): Effect[Execution] = + new ExecutionEffect(c, m) + + class AsyncExecution extends Async[Execution] with StackSafeMonad[Execution] { + private[this] val neverNothing: Execution[Nothing] = + Execution.fromFuture { _ => + val p = Promise[Nothing]() + p.future + } + + override def ap[A, B](ef: Execution[A => B])(ea: Execution[A]): Execution[B] = + ef.zip(ea).map { case (f, a) => f(a) } + + def async[A](k: (Either[Throwable, A] => Unit) => Unit): Execution[A] = + Execution.withNewCache(Execution.fromFuture { implicit cec: ConcurrentExecutionContext => + val p = Promise[A]() + Future { + k { + case Right(a) => + p.success(a) + () + case Left(err) => + p.failure(err) + () + } + } + p.future + }) + + def asyncF[A](k: (Either[Throwable, A] => Unit) => Execution[Unit]): Execution[A] = + delay(Promise[A]()).flatMap { p => + val asyncEx = Execution + .withNewCache(Execution.fromFuture { implicit cec: ConcurrentExecutionContext => + Future { + k { + case Right(a) => + p.success(a) + () + case Left(err) => + p.failure(err) + () + } + } + }) + .flatten + + val result = Execution.fromFuture(_ => p.future) + + // this is not quite what is meant by async. We should actually + // allow the result to complete before the Execution that k returns + // completes. This is a bit weird for a distributed compute Effect like + // Execution. 
We can still pass the laws by blocking on Execution, + // so we do that here. + // + // Note, we liftToTry here because the contract of asyncF is that + // it should be running independent of the result A. Failures + // are signaled by calling k with Left(err), not by failing the + // Execution. + asyncEx.liftToTry + .zip(result) + .map(_._2) + } + + // Members declared in cats.effect.Bracket + def bracketCase[A, B]( + acquire: Execution[A] + )(use: A => Execution[B])(release: (A, ExitCase[Throwable]) => Execution[Unit]): Execution[B] = + acquire.flatMap { a => + attempt(use(a)).flatMap { + case Right(b) => + release(a, ExitCase.Completed) + .map(_ => b) + case Left(t) => + release(a, ExitCase.Error(t)) + .flatMap(_ => Execution.failed(t)) + } + } + + override def delay[A](a: => A): Execution[A] = + // we can't lawfully cache this + Execution.withNewCache(Execution.from(a)) + + def handleErrorWith[A](ea: Execution[A])(fn: Throwable => Execution[A]): Execution[A] = + ea.recoverWith { case t => fn(t) } + + def pure[A](a: A): Execution[A] = Execution.from(a) + + def flatMap[A, B](ea: Execution[A])(fn: A => Execution[B]): Execution[B] = + ea.flatMap(fn) + + override def map[A, B](ea: Execution[A])(fn: A => B): Execution[B] = + ea.map(fn) + + override def never[A]: Execution[A] = neverNothing + + override def product[A, B](ea: Execution[A], eb: Execution[B]): Execution[(A, B)] = + ea.zip(eb) + + def raiseError[A](t: Throwable): Execution[A] = Execution.failed(t) + + override def recoverWith[A](ea: Execution[A])( + fn: PartialFunction[Throwable, Execution[A]] + ): Execution[A] = + ea.recoverWith(fn) + + def suspend[A](ea: => Execution[A]): Execution[A] = + delay(ea).flatten + } + + class ExecutionEffect(c: Config, m: Mode)(implicit cec: ConcurrentExecutionContext) + extends AsyncExecution + with Effect[Execution] { + def runAsync[A](ea: Execution[A])(cb: Either[Throwable, A] => IO[Unit]): SyncIO[Unit] = + SyncIO { + val funit = ea + .run(c, m) + .map(a => Right(a)) + 
.recover { case t => Left(t) } + .map(e => cb(e).unsafeRunSync) + // we can discard this future, since we have started the work + () + } + } +} diff --git a/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala b/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala new file mode 100644 index 0000000000..ac2bf2f861 --- /dev/null +++ b/scalding-cats/src/test/scala/com/twitter/scalding/hellcats/HellCatsTests.scala @@ -0,0 +1,93 @@ +package com.twitter.scalding.hellcats + +import cats.{Eq, MonadError} +import cats.laws.discipline.SemigroupalTests.Isomorphisms +import cats.effect.{Effect, IO} +import cats.effect.laws.discipline.EffectTests +import com.twitter.scalding.typed.memory_backend.MemoryMode +import com.twitter.scalding.{Config, Execution} +import org.scalatest.FunSuite +import org.scalacheck.{Arbitrary, Gen} +import org.typelevel.discipline.scalatest.Discipline +import scala.concurrent.{Await, ExecutionContext} +import scala.concurrent.duration._ +import scala.util.{Failure, Success, Try} + +import HellCats._ +import cats.implicits._ + +object ExecutionGen { + def genMonadError[F[_], A](depth: Int, g: Gen[A])(implicit me: MonadError[F, Throwable]): Gen[F[A]] = { + val recurse = Gen.lzy(genMonadError[F, A](depth - 1, g)) + val g0 = Gen.frequency((5, g.map(me.pure(_))), (1, Gen.const(me.raiseError[A](new Exception("failed"))))) + if (depth <= 0) g0 + else { + implicit val arbEx: Arbitrary[F[A]] = Arbitrary(recurse) + val genFn = Arbitrary.arbitrary[Int => F[A]] + val genIntEx = Gen.lzy(genMonadError[F, Int](depth - 1, Arbitrary.arbitrary[Int])) + val genFlatMap = for { + ei <- genIntEx + fn <- genFn + } yield ei.flatMap(fn) + val zip = for { + a <- recurse + b <- recurse + aOrB <- Gen.oneOf(a, b) + } yield aOrB + + Gen.frequency((4, g0), (4, genFlatMap), (1, zip)) // use zip less because it branches + } + } + def genExecution[A](depth: Int, g: Gen[A]): Gen[Execution[A]] = + genMonadError[Execution, A](depth, g) + 
+ implicit def arbEx[A](implicit arb: Arbitrary[A]): Arbitrary[Execution[A]] = + Arbitrary(genExecution(5, arb.arbitrary)) + + implicit def arbIO[A](implicit arb: Arbitrary[A]): Arbitrary[IO[A]] = + Arbitrary(genMonadError[IO, A](5, arb.arbitrary)) + + implicit def eqEx[A: Eq](implicit ec: ExecutionContext): Eq[Execution[A]] = + new Eq[Execution[A]] { + def get[A](ex: Execution[A]): Try[A] = + Try(Await.result(ex.run(Config.empty, MemoryMode.empty), Duration(10, SECONDS))) + + def eqv(l: Execution[A], r: Execution[A]) = + (get(l), get(r)) match { + case (Success(a), Success(b)) => Eq[A].eqv(a, b) + case (Failure(_), Failure(_)) => true + case _ => false + } + } + + implicit def eqIO[A: Eq]: Eq[IO[A]] = + new Eq[IO[A]] { + def eqv(l: IO[A], r: IO[A]) = + (Try(l.unsafeRunTimed(Duration(10, SECONDS))), Try(r.unsafeRunTimed(Duration(10, SECONDS)))) match { + case (Success(a), Success(b)) => Eq[Option[A]].eqv(a, b) + case (Failure(_), Failure(_)) => true + case _ => false + } + } + + // We consider all failures the same, we don't care about failure order + // in Execution because we want to fail fast + implicit val allEqThrowable: Eq[Throwable] = + Eq.by { t: Throwable => () } + + implicit val isos: Isomorphisms[Execution] = Isomorphisms.invariant[Execution] + // Need non-fatal Throwables for Future recoverWith/handleError + implicit val nonFatalArbitrary: Arbitrary[Throwable] = + Arbitrary(Arbitrary.arbitrary[Exception].map(identity)) +} + +class HellCatsTests extends FunSuite with Discipline { + import ExecutionGen._ + + implicit val ec: ExecutionContext = ExecutionContext.global + + { + implicit val exeEff: Effect[Execution] = executionEffect(Config.empty, MemoryMode.empty) + checkAll("Execution", EffectTests[Execution].effect[Int, Int, Int]) + } +} diff --git a/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/Utils.java b/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/Utils.java new file mode 100644 index 
0000000000..b3dfe3418f --- /dev/null +++ b/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/Utils.java @@ -0,0 +1,32 @@ +package com.twitter.scalding.commons.datastores; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +public class Utils { + + public static FileSystem getFS(String path) throws IOException { + return getFS(path, new Configuration()); + } + + public static FileSystem getFS(String path, Configuration conf) throws IOException { + return new Path(path).getFileSystem(conf); + } + + /** + * Return true or false if the input is a long + * @param input + * @return boolean + */ + public static boolean isLong(String input) { + try { + Long.parseLong(input); + return true; + } catch (Exception e) { + return false; + } + } +} diff --git a/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/VersionedStore.java b/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/VersionedStore.java new file mode 100644 index 0000000000..0962909e3b --- /dev/null +++ b/scalding-commons/src/main/java/com/twitter/scalding/commons/datastores/VersionedStore.java @@ -0,0 +1,271 @@ +package com.twitter.scalding.commons.datastores; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.security.AccessControlException; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class VersionedStore { + public static final String FINISHED_VERSION_SUFFIX = ".version"; + public static final String HADOOP_SUCCESS_FLAG = "_SUCCESS"; + + private String root; + private FileSystem fs; + + public VersionedStore(String 
path) throws IOException { + this(Utils.getFS(path), path); + } + + public VersionedStore(FileSystem fs, String path) throws IOException { + this.fs = fs; + root = path; + mkdirs(root); + } + + public VersionedStore(Path path, Configuration conf) throws IOException { + this.fs = path.getFileSystem(conf); + this.root = path.toString(); + } + + public FileSystem getFileSystem() { + return fs; + } + + public String getRoot() { + return root; + } + + public String versionPath(long version) { + return new Path(getRoot(), "" + version).toString(); + } + + public String mostRecentVersionPath() throws IOException { + Long v = mostRecentVersion(); + return (v == null) ? null : versionPath(v); + } + + public String mostRecentVersionPath(long maxVersion) throws IOException { + Long v = mostRecentVersion(maxVersion); + return (v == null) ? null : versionPath(v); + } + + public Long mostRecentVersion() throws IOException { + return mostRecentVersion(false, null); + } + + public Long mostRecentVersion(boolean skipVersionSuffix) throws IOException { + return mostRecentVersion(skipVersionSuffix, null); + } + + public Long mostRecentVersion(long maxVersion) throws IOException { + return mostRecentVersion(false, maxVersion); + } + + public Long mostRecentVersion(boolean skipVersionSuffix, Long maxVersion) throws IOException { + List all = getAllVersions(skipVersionSuffix); + if (maxVersion == null) { + return (all.size() == 0) ? 
null : all.get(0); + } else { + for(Long v: all) { + if(v <= maxVersion) + return v; + } + return null; + } + } + + public long newVersion() { + return System.currentTimeMillis(); + } + + public String createVersion() throws IOException { + return createVersion(newVersion()); + } + + public String createVersion(long version) throws IOException { + String ret = versionPath(version); + if(getAllVersions().contains(version)) + throw new RuntimeException("Version already exists or data already exists"); + else { + //in case there's an incomplete version there, delete it + fs.delete(new Path(versionPath(version)), true); + return ret; + } + } + + public void failVersion(String path) throws IOException { + deleteVersion(validateAndGetVersion(path)); + } + + public void deleteVersion(long version) throws IOException { + // Be sure to delete success indicators before data + fs.delete(new Path(tokenPath(version)), false); + fs.delete(new Path(successFlagPath(version)), false); + fs.delete(new Path(versionPath(version)), true); + } + + public void succeedVersion(String path) throws IOException { + succeedVersion(validateAndGetVersion(path)); + } + + public void succeedVersion(long version) throws IOException { + createNewFile(tokenPath(version)); + } + + public void cleanup() throws IOException { + // Default behavior is to clean up NOTHING + cleanup(-1); + } + + public void cleanup(int versionsToKeep) throws IOException { + if (versionsToKeep < 0) return; + final List versions = getAllVersions(); + int numExisting = versions.size(); + if (numExisting <= versionsToKeep) return; + for (Long v : versions.subList(versionsToKeep, numExisting)) { + deleteVersion(v); + } + } + + /** + * Sorted from most recent to oldest + */ + public List getAllVersions() throws IOException { + return getAllVersions(false); + } + + public List getAllVersions(boolean skipVersionSuffix) throws IOException { + + Path rootPath = new Path(getRoot()); + if (getFileSystem().exists(rootPath)) { + // we 
use a set so we can automatically de-dupe folders that + // have both version suffix and success flag below + Set ret = new HashSet(); + for(Path p: listDir(getRoot())) { + if (skipVersionSuffix) { + // backwards compatible if version suffix does not exist + if(Utils.isLong(p.getName())) { + ret.add(Long.valueOf(p.getName())); + } + } else { + if (!p.getName().startsWith("_")) { + try { + if (p.getName().endsWith(FINISHED_VERSION_SUFFIX)) { + ret.add(validateAndGetVersion(p.toString())); + } else { + final FileStatus status = getFileSystem().getFileStatus(p); + if (status != null && status.isDir() && getFileSystem().exists(new Path(p, HADOOP_SUCCESS_FLAG))) { + // FORCE the _SUCCESS flag into the versioned store directory. + ret.add(validateAndGetVersion(p.toString() + FINISHED_VERSION_SUFFIX)); + } + } + } catch (RuntimeException e) { + // Skip this version + continue; + } + } + } + } + List retList = new ArrayList(ret); + // now sort the versions most recent first per the api contract + Collections.sort(retList); + Collections.reverse(retList); + return retList; + } else { + return Collections.emptyList(); + } + } + + public boolean hasVersion(long version) throws IOException { + return getAllVersions().contains(version); + } + + private String tokenPath(long version) { + return new Path(root, "" + version + FINISHED_VERSION_SUFFIX).toString(); + } + + /** The path to the hadoop-created success flag file which may or may not exist */ + private String successFlagPath(long version) { + return new Path(versionPath(version), HADOOP_SUCCESS_FLAG).toString(); + } + + private Path normalizePath(String p) { + return new Path(p).makeQualified(fs); + } + + private long validateAndGetVersion(String path) { + Path parent = new Path(path).getParent(); + if(!normalizePath(path).getParent().equals(normalizePath(root))) { + throw new RuntimeException(path + " " + parent + " is not part of the versioned store located at " + root); + } + Long v = parseVersion(path); + if (v==null) 
throw new RuntimeException(path + " is not a valid version"); + + // Check that versioned folder exists + Path versionPath = new Path(parent, v.toString()); + try { + FileStatus status = getFileSystem().getFileStatus(versionPath); + if (status == null || !status.isDir()) throw new RuntimeException(versionPath + " is not a valid version subfolder"); + } catch (IOException e) { + throw new RuntimeException("could not stat path: " + versionPath); + } + + return v; + } + + public Long parseVersion(String path) { + String name = new Path(path).getName(); + if(name.endsWith(FINISHED_VERSION_SUFFIX)) { + name = name.substring(0, name.length()-FINISHED_VERSION_SUFFIX.length()); + } + try { + return Long.parseLong(name); + } catch(NumberFormatException e) { + return null; + } + } + + private void createNewFile(String path) throws IOException { + if(fs instanceof LocalFileSystem) + new File(path).createNewFile(); + else + fs.createNewFile(new Path(path)); + } + + private void mkdirs(String path) throws IOException { + if(fs instanceof LocalFileSystem) + new File(path).mkdirs(); + else { + try { + fs.mkdirs(new Path(path)); + } catch (AccessControlException e) { + throw new RuntimeException("Root directory doesn't exist, and user doesn't have the permissions " + + "to create" + path + ".", e); + } + } + } + + private List listDir(String dir) throws IOException { + List ret = new ArrayList(); + if(fs instanceof LocalFileSystem) { + for(File f: new File(dir).listFiles()) { + ret.add(new Path(f.getAbsolutePath())); + } + } else { + for(FileStatus status: fs.listStatus(new Path(dir))) { + ret.add(status.getPath()); + } + } + return ret; + } +} diff --git a/scalding-commons/src/main/java/com/twitter/scalding/commons/scheme/KeyValueByteScheme.java b/scalding-commons/src/main/java/com/twitter/scalding/commons/scheme/KeyValueByteScheme.java new file mode 100644 index 0000000000..a436c75e6d --- /dev/null +++ 
package com.twitter.scalding.commons.scheme;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import cascading.flow.FlowProcess;
import cascading.scheme.SinkCall;
import cascading.scheme.SourceCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;

import com.twitter.elephantbird.cascading2.scheme.CombinedWritableSequenceFile;

/**
 * A sequence-file scheme whose key and value are both raw byte arrays, backed
 * by {@link BytesWritable} on the Hadoop side. Sourcing emits a two-entry
 * tuple (key bytes, value bytes); sinking expects the same shape.
 *
 * NOTE(review): the source/sink signatures use raw SourceCall/SinkCall as in
 * the original — confirm against the Cascading API whether type parameters
 * were intended here.
 */
public class KeyValueByteScheme extends CombinedWritableSequenceFile {
  public KeyValueByteScheme(Fields fields) {
    super(fields, BytesWritable.class, BytesWritable.class);
  }

  /**
   * Copies out exactly the valid bytes: a BytesWritable's backing buffer may
   * be longer than its logical length, so a plain getBytes() is not enough.
   */
  public static byte[] getBytes(BytesWritable key) {
    return Arrays.copyOfRange(key.getBytes(), 0, key.getLength());
  }

  @Override
  public boolean source(FlowProcess flowProcess,
                        SourceCall sourceCall) throws IOException {
    // The context holds the two reusable writables set up by the parent scheme.
    BytesWritable keyWritable = (BytesWritable) sourceCall.getContext()[0];
    BytesWritable valueWritable = (BytesWritable) sourceCall.getContext()[1];

    if (!sourceCall.getInput().next(keyWritable, valueWritable)) {
      // No more records.
      return false;
    }

    Tuple outTuple = sourceCall.getIncomingEntry().getTuple();
    outTuple.clear();
    outTuple.add(getBytes(keyWritable));
    outTuple.add(getBytes(valueWritable));
    return true;
  }

  @Override
  public void sink(FlowProcess flowProcess, SinkCall sinkCall)
      throws IOException {
    TupleEntry entry = sinkCall.getOutgoingEntry();

    byte[] keyBytes = (byte[]) entry.getObject(0);
    byte[] valueBytes = (byte[]) entry.getObject(1);

    sinkCall.getOutput().collect(new BytesWritable(keyBytes), new BytesWritable(valueBytes));
  }
}
package com.twitter.scalding.commons.tap;

import java.io.IOException;

import com.twitter.scalding.commons.datastores.VersionedStore;
import com.twitter.scalding.tap.GlobHfs;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;

import cascading.flow.FlowProcess;
import cascading.scheme.Scheme;

/**
 * A Cascading tap over a {@link VersionedStore}: as a SOURCE it reads either a
 * pinned version or the most recent one; as a SINK it writes a fresh version,
 * marks it successful on commit, and prunes old versions.
 */
public class VersionedTap extends GlobHfs {
  public static enum TapMode {SOURCE, SINK}

  // Version to read (SOURCE) or to create (SINK); null means latest/new.
  public Long version = null;

  // a sane default for the number of versions of your data to keep around
  private int versionsToKeep = 3;

  // source-specific
  public TapMode mode;

  // sink-specific
  private String newVersionPath;
  private String writtenPath;

  public VersionedTap(String dir, Scheme scheme, TapMode mode)
      throws IOException {
    super(scheme, dir);
    this.mode = mode;
  }

  /** Pins this tap to an explicit version instead of latest/new. */
  public VersionedTap setVersion(long version) {
    this.version = version;
    return this;
  }

  /**
   * Sets the number of versions of your data to keep. Unneeded versions are cleaned up on creation
   * of a new one. Pass a negative number to keep all versions.
   */
  public VersionedTap setVersionsToKeep(int versionsToKeep) {
    this.versionsToKeep = versionsToKeep;
    return this;
  }

  public int getVersionsToKeep() {
    return this.versionsToKeep;
  }

  public String getOutputDirectory() {
    return getPath().toString();
  }

  public VersionedStore getStore(JobConf conf) throws IOException {
    return new VersionedStore(getPath().getFileSystem(conf), getOutputDirectory());
  }

  /** Resolves the concrete directory to read: the pinned version, else the most recent. */
  public String getSourcePath(JobConf conf) {
    try {
      VersionedStore store = getStore(conf);
      String sourcePath = (version != null) ? store.versionPath(version) : store.mostRecentVersionPath();
      if (sourcePath == null) {
        throw new RuntimeException("Could not find valid source path for VersionTap with root: " + store.getRoot());
      }
      return sourcePath;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Reserves the directory a new version will be written into. */
  public String getSinkPath(JobConf conf) {
    try {
      VersionedStore store = getStore(conf);
      String sinkPath = (version == null) ? store.createVersion() : store.createVersion(version);
      if (sinkPath == null) {
        throw new RuntimeException("Could not find valid sink path for VersionTap with root: " + store.getRoot());
      }
      return sinkPath;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public void sourceConfInit(FlowProcess process, JobConf conf) {
    super.sourceConfInit(process, conf);
    FileInputFormat.setInputPaths(conf, getSourcePath(conf));
  }

  @Override
  public void sinkConfInit(FlowProcess process, JobConf conf) {
    super.sinkConfInit(process, conf);

    // Only reserve a new version directory once per flow.
    if (newVersionPath == null)
      newVersionPath = getSinkPath(conf);

    FileOutputFormat.setOutputPath(conf, new Path(newVersionPath));
  }

  @Override
  public long getSize(JobConf conf) throws IOException {
    return getSize(new Path(getSourcePath(conf)), conf);
  }

  @Override
  public boolean resourceExists(JobConf jc) throws IOException {
    return getStore(jc).mostRecentVersion() != null;
  }

  @Override
  public boolean createResource(JobConf jc) throws IOException {
    throw new UnsupportedOperationException("Not supported yet.");
  }

  @Override
  public boolean deleteResource(JobConf jc) throws IOException {
    throw new UnsupportedOperationException("Not supported yet.");
  }

  @Override
  public String getIdentifier() {
    String outDir = getOutputDirectory();
    String versionString = (version == null) ? "LATEST" : version.toString();
    return outDir + Path.SEPARATOR
        + ((mode == TapMode.SINK) ? "sink" : "source")
        + Path.SEPARATOR + versionString;
  }

  @Override
  public long getModifiedTime(JobConf conf) throws IOException {
    if (mode == TapMode.SINK) {
      return 0;
    }
    // BUGFIX: mostRecentVersion() returns a nullable Long (null for an empty
    // store); the original unconditional auto-unboxing threw
    // NullPointerException in that case. Report 0 instead.
    Long mostRecent = getStore(conf).mostRecentVersion();
    return (mostRecent == null) ? 0 : mostRecent;
  }

  @Override
  public boolean commitResource(JobConf conf) throws IOException {
    VersionedStore store = getStore(conf);

    if (newVersionPath != null) {
      // Order matters: mark success, flag the dir, then prune old versions.
      store.succeedVersion(newVersionPath);
      markSuccessfulOutputDir(new Path(newVersionPath), conf);
      writtenPath = newVersionPath;
      newVersionPath = null;
      store.cleanup(getVersionsToKeep());
    }

    return true;
  }

  /** The path of the last committed version, or null if nothing was committed. */
  public String getWrittenPath() {
    return writtenPath;
  }

  private static void markSuccessfulOutputDir(Path path, JobConf conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    // create a file in the folder to mark it
    if (fs.exists(path)) {
      Path filePath = new Path(path, VersionedStore.HADOOP_SUCCESS_FLAG);
      fs.create(filePath).close();
    }
  }

  @Override
  public boolean rollbackResource(JobConf conf) throws IOException {
    if (newVersionPath != null) {
      getStore(conf).failVersion(newVersionPath);
      newVersionPath = null;
    }

    return true;
  }
}
-*/ + */ package com.twitter.scalding.commons.extensions import com.twitter.scalding._ import com.twitter.scalding.Dsl._ -import java.io.File import cascading.flow.FlowDef import cascading.pipe.Pipe -import cascading.tuple.{ Fields, TupleEntry } -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{ FileSystem, Path } +import cascading.tuple.Fields import org.slf4j.{Logger, LoggerFactory => LogManager} +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + /** - * Checkpoint provides a simple mechanism to read and write intermediate results - * from a Scalding flow to HDFS. + * Checkpoint provides a simple mechanism to read and write intermediate results from a Scalding flow to HDFS. * - * Checkpoints are useful for debugging one part of a long flow, when you would - * otherwise have to run many steps to get to the one you care about. To enable - * checkpoints, sprinkle calls to Checkpoint() throughout your flow, ideally - * after expensive steps. + * Checkpoints are useful for debugging one part of a long flow, when you would otherwise have to run many + * steps to get to the one you care about. To enable checkpoints, sprinkle calls to Checkpoint() throughout + * your flow, ideally after expensive steps. * - * When checkpoints are enabled, each Checkpoint() looks for a checkpoint file - * on HDFS. If it exists we read results from the file; otherwise we execute - * the flow and write the results to the file. When checkpoints are disabled, - * the flow is always executed and the results are never stored. + * When checkpoints are enabled, each Checkpoint() looks for a checkpoint file on HDFS. If it exists we read + * results from the file; otherwise we execute the flow and write the results to the file. When checkpoints + * are disabled, the flow is always executed and the results are never stored. * - * Each call to Checkpoint() takes the checkpoint name, as well as the types and - * names of the expected fields. 
A sample invocation might look like this: - * val pipe = Checkpoint[(Long, String, Long)]( - * "clicks", ('tweetId, 'clickUrl, 'clickCount)) { ... } - * where { ... } contains a flow which computes the result. + * Each call to Checkpoint() takes the checkpoint name, as well as the types and names of the expected fields. + * A sample invocation might look like this: val pipe = Checkpoint[(Long, String, Long)]( "clicks", ('tweetId, + * 'clickUrl, 'clickCount)) { ... } where { ... } contains a flow which computes the result. * * Most checkpoint parameters are specified via command-line flags: - * --checkpoint.clobber: if true, recompute and overwrite any existing - * checkpoint files. + * --checkpoint.clobber: if true, recompute and overwrite any existing checkpoint files. * --checkpoint.clobber.: override clobber for the given checkpoint. - * --checkpoint.file: specifies a filename prefix to use for checkpoint files. - * If blank, checkpoints are disabled; otherwise the file for checkpoint - * is _. - * --checkpoint.file.: override --checkpoint.file for the given - * checkpoint; specifies the whole filename, not the prefix. - * --checkpoint.format: specifies a file format, either sequencefile or tsv. - * Default is sequencefile for HDFS, tsv for local. + * --checkpoint.file: specifies a filename prefix to use for checkpoint files. If blank, checkpoints are + * disabled; otherwise the file for checkpoint is _. + * --checkpoint.file.: override --checkpoint.file for the given checkpoint; specifies the whole + * filename, not the prefix. + * --checkpoint.format: specifies a file format, either sequencefile or tsv. Default is sequencefile for HDFS, + * tsv for local. * --checkpoint.format.: specifies file format for the given checkpoint. 
* - * @author Mike Jahr + * @author + * Mike Jahr */ object Checkpoint { private val LOG: Logger = LogManager.getLogger(this.getClass) /** - * Type parameters: - * A: tuple of result types + * Type parameters: A: tuple of result types * - * Parameters: - * checkpointName: name of the checkpoint - * resultFields: tuple of result field names - * flow: a function to run a flow to compute the result + * Parameters: checkpointName: name of the checkpoint resultFields: tuple of result field names flow: a + * function to run a flow to compute the result * - * Implicit parameters: - * args: provided by com.twitter.pluck.job.TwitterJob - * mode: provided by com.twitter.scalding.Job - * flowDef: provided by com.twitter.scalding.Job - * conv: provided by com.twitter.scalding.TupleConversions - * setter: provided by com.twitter.scalding.TupleConversions + * Implicit parameters: args: provided by com.twitter.pluck.job.TwitterJob mode: provided by + * com.twitter.scalding.Job flowDef: provided by com.twitter.scalding.Job conv: provided by + * com.twitter.scalding.TupleConversions setter: provided by com.twitter.scalding.TupleConversions */ - def apply[A](checkpointName: String, resultFields: Fields)(flow: => Pipe)(implicit args: Args, mode: Mode, flowDef: FlowDef, - conv: TupleConverter[A], setter: TupleSetter[A]): Pipe = { + def apply[A](checkpointName: String, resultFields: Fields)(flow: => Pipe)(implicit + args: Args, + mode: Mode, + flowDef: FlowDef, + conv: TupleConverter[A], + setter: TupleSetter[A] + ): Pipe = { conv.assertArityMatches(resultFields) setter.assertArityMatches(resultFields) @@ -92,36 +84,37 @@ object Checkpoint { val filename: Option[String] = getFilename(checkpointName) val format: String = getFormat(checkpointName) - if (filename.isDefined && hasInput(checkpointName, filename.get)) { - // We have checkpoint input; read the file instead of executing the flow. 
- LOG.info("Checkpoint \"" + checkpointName + "\": reading " + format + - " input from \"" + filename.get + "\"") - getSource(format, filename.get) - .read - .mapTo(List.range(0, resultFields.size) -> resultFields)((x: A) => x)(conv, setter) - } else { + filename match { + case Some(name) if hasInput(checkpointName, name) => + // We have checkpoint input; read the file instead of executing the flow. + LOG.info(s"""Checkpoint "$checkpointName": reading $format input from "$name"""") + getSource(format, name).read + .mapTo(List.range(0, resultFields.size) -> resultFields)((x: A) => x)(conv, setter) // We don't have checkpoint input; execute the flow and project to the // requested fields. - val pipe = flow.project(resultFields) - - // If requested, write the checkpoint output. - if (filename.isDefined) { - LOG.info("Checkpoint \"" + checkpointName + "\": writing " + format + - " output to \"" + filename.get + "\"") - pipe.write(getSource(format, filename.get)) - } else { - pipe - } + case Some(name) => + val pipe = flow.project(resultFields) + + // Write the checkpoint output. 
+ LOG.info(s"""Checkpoint "$checkpointName": writing $format output to "$name"""") + pipe.write(getSource(format, name)) + case None => + flow.project(resultFields) } } - // Wrapper for Checkpoint when using a TypedPipe - def apply[A](checkpointName: String)(flow: => TypedPipe[A])(implicit args: Args, mode: Mode, flowDef: FlowDef, - conv: TupleConverter[A], setter: TupleSetter[A]): TypedPipe[A] = { + // Wrapper for Checkpoint when using a TypedPipe + def apply[A](checkpointName: String)(flow: => TypedPipe[A])(implicit + args: Args, + mode: Mode, + flowDef: FlowDef, + conv: TupleConverter[A], + setter: TupleSetter[A] + ): TypedPipe[A] = { val rPipe = apply(checkpointName, Dsl.intFields(0 until conv.arity)) { flow.toPipe(Dsl.intFields(0 until conv.arity)) } - TypedPipe.from[A](rPipe,Dsl.intFields(0 until conv.arity)) + TypedPipe.fromPipe[A](rPipe, Dsl.intFields(0 until conv.arity)) } // Helper class for looking up checkpoint arguments, either the base value from @@ -139,7 +132,7 @@ object Checkpoint { } else { baseValue } - def isTrue: Boolean = value.isDefined && value.get.toLowerCase != "false" + def isTrue: Boolean = value.exists(_.toLowerCase != "false") } // Returns the filename to use for the given checkpoint, or None if this @@ -150,13 +143,12 @@ object Checkpoint { // The flag "--checkpoint.file.=" is present; use its // value as the filename. fileArg.overrideValue - } else if (fileArg.baseValue.isDefined) { - // The flag "--checkpoint.file="; use "_" as the - // filename. - Some(fileArg.baseValue.get + "_" + checkpointName) } else { - // Neither flag is present; the checkpoint is disabled. - None + fileArg.baseValue.map { value => + // The flag "--checkpoint.file="; use "_" as the + // filename. 
+ value + "_" + checkpointName + } } } @@ -165,22 +157,20 @@ object Checkpoint { private def getFormat(checkpointName: String)(implicit args: Args, mode: Mode): String = { val defaultFormat = mode match { case Hdfs(_, _) | HadoopTest(_, _) => "sequencefile" - case _ => "tsv" + case _ => "tsv" } CheckpointArg(checkpointName, "format").value.getOrElse(defaultFormat).toLowerCase } // Returns a source for the checkpoint in the given format. - private def getSource(format: String, filename: String)(implicit mode: Mode): Source = { + private def getSource(format: String, filename: String)(implicit mode: Mode): Source = format match { case "sequencefile" => SequenceFile(filename) - case "tsv" => Tsv(filename) - case _ => sys.error("Invalid value for --checkpoint.format: " + format) + case "tsv" => Tsv(filename) + case _ => sys.error("Invalid value for --checkpoint.format: " + format) } - } // Returns true if the given checkpoint file exists and should be read. - private def hasInput(checkpointName: String, filename: String)(implicit args: Args, mode: Mode): Boolean = { - !CheckpointArg(checkpointName, "clobber").isTrue && mode.fileExists(filename) - } + private def hasInput(checkpointName: String, filename: String)(implicit args: Args, mode: Mode): Boolean = + !CheckpointArg(checkpointName, "clobber").isTrue && CascadingMode.cast(mode).fileExists(filename) } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala new file mode 100644 index 0000000000..6036b561d6 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/scheme/CombinedSequenceFileScheme.scala @@ -0,0 +1,20 @@ +package com.twitter.scalding.commons.scheme + +import cascading.scheme.Scheme +import com.twitter.elephantbird.cascading2.scheme.{CombinedSequenceFile, CombinedWritableSequenceFile} +import 
com.twitter.scalding.{HadoopSchemeInstance, SequenceFileScheme, WritableSequenceFileScheme} + +trait CombinedSequenceFileScheme extends SequenceFileScheme { + // TODO Cascading doesn't support local mode yet + override def hdfsScheme = HadoopSchemeInstance( + new CombinedSequenceFile(fields).asInstanceOf[Scheme[_, _, _, _, _]] + ) +} + +trait CombinedWritableSequenceFileScheme extends WritableSequenceFileScheme { + // TODO Cascading doesn't support local mode yet + override def hdfsScheme = + HadoopSchemeInstance( + new CombinedWritableSequenceFile(fields, keyType, valueType).asInstanceOf[Scheme[_, _, _, _, _]] + ) +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala new file mode 100644 index 0000000000..a8b308c9cd --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/BinaryConverters.scala @@ -0,0 +1,57 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.scalding.commons.source + +import com.twitter.elephantbird.mapreduce.io.BinaryConverter +import com.twitter.scrooge.{BinaryThriftStructSerializer, ThriftStruct, ThriftStructCodec} +import scala.reflect.ClassTag +import scala.util.Try + +/* + * Common BinaryConverters to be used with GenericSource / GenericScheme. 
+ */ + +case object IdentityBinaryConverter extends BinaryConverter[Array[Byte]] { + override def fromBytes(messageBuffer: Array[Byte]) = messageBuffer + override def toBytes(message: Array[Byte]) = message +} + +object ScroogeBinaryConverter { + + // codec code borrowed from chill's ScroogeThriftStructSerializer class + private[this] def codecForNormal[T <: ThriftStruct]( + thriftStructClass: Class[T] + ): Try[ThriftStructCodec[T]] = + Try(Class.forName(thriftStructClass.getName + "$").getField("MODULE$").get(null)) + .map(_.asInstanceOf[ThriftStructCodec[T]]) + + private[this] def codecForUnion[T <: ThriftStruct](maybeUnion: Class[T]): Try[ThriftStructCodec[T]] = + Try(Class.forName(maybeUnion.getName.reverse.dropWhile(_ != '$').reverse).getField("MODULE$").get(null)) + .map(_.asInstanceOf[ThriftStructCodec[T]]) + + def apply[T <: ThriftStruct: ClassTag]: BinaryConverter[T] = { + val ct = implicitly[ClassTag[T]] + new BinaryConverter[T] { + val serializer = BinaryThriftStructSerializer[T] { + val clazz = ct.runtimeClass.asInstanceOf[Class[T]] + codecForNormal[T](clazz).orElse(codecForUnion[T](clazz)).get + } + override def toBytes(struct: T) = serializer.toBytes(struct) + override def fromBytes(bytes: Array[Byte]): T = serializer.fromBytes(bytes) + } + } +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala index 77ed525f2b..5f7109f125 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/DailySources.scala @@ -12,15 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.commons.source import com.google.protobuf.Message import com.twitter.bijection.Injection import com.twitter.chill.Externalizer -import com.twitter.elephantbird.cascading2.scheme._ -import com.twitter.elephantbird.util.{ ThriftUtils, TypeRef } import com.twitter.scalding._ import com.twitter.scalding.source._ @@ -29,56 +27,95 @@ import java.io.Serializable import org.apache.thrift.TBase // Retrieve implicits -import Dsl._ -abstract class DailySuffixLzoCodec[T](prefix: String, dateRange: DateRange) -(implicit @transient suppliedInjection: Injection[T,Array[Byte]]) - extends DailySuffixSource(prefix, dateRange) with LzoCodec[T] { +abstract class DailySuffixLzoCodec[T](prefix: String, dateRange: DateRange)(implicit + @transient suppliedInjection: Injection[T, Array[Byte]] +) extends DailySuffixSource(prefix, dateRange) + with LzoCodec[T] { val boxed = Externalizer(suppliedInjection) override lazy val injection = boxed.get } abstract class DailySuffixLzoProtobuf[T <: Message: Manifest](prefix: String, dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with LzoProtobuf[T] { - override def column = manifest[T].erasure + extends DailySuffixSource(prefix, dateRange) + with LzoProtobuf[T] { + override def column = manifest[T].runtimeClass +} + +abstract class DailySuffixMostRecentLzoProtobuf[T <: Message: Manifest](prefix: String, dateRange: DateRange) + extends DailySuffixMostRecentSource(prefix, dateRange) + with LzoProtobuf[T] { + override def column = manifest[T].runtimeClass } abstract class DailySuffixLzoThrift[T <: TBase[_, _]: Manifest](prefix: String, dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with LzoThrift[T] { - override def column = manifest[T].erasure + extends DailySuffixSource(prefix, dateRange) + with LzoThrift[T] { + override def column = manifest[T].runtimeClass } -abstract class DailyPrefixSuffixLzoThrift[T <: TBase[_,_] : Manifest](prefix : String, suffix : String, 
dateRange : DateRange) - extends DailyPrefixSuffixSource(prefix, suffix, dateRange) with LzoThrift[T] { - override def column = manifest[T].erasure +abstract class DailyPrefixSuffixLzoThrift[T <: TBase[_, _]: Manifest]( + prefix: String, + suffix: String, + dateRange: DateRange +) extends DailyPrefixSuffixSource(prefix, suffix, dateRange) + with LzoThrift[T] { + override def column = manifest[T].runtimeClass } -abstract class TimePathedLongThriftSequenceFile[V <: TBase[_, _]: Manifest](f: Fields, prefix: String, dateFormat: String, dateRange: DateRange) - extends TimePathedSource(prefix + dateFormat + "/*", dateRange, DateOps.UTC) - with WritableSequenceFileScheme - with Serializable - with Mappable[(Long, V)] - with LongThriftTransformer[V] { +abstract class TimePathedLongThriftSequenceFile[V <: TBase[_, _]: Manifest]( + f: Fields, + prefix: String, + dateFormat: String, + dateRange: DateRange +) extends TimePathedSource(prefix + dateFormat + "/*", dateRange, DateOps.UTC) + with WritableSequenceFileScheme + with Serializable + with Mappable[(Long, V)] + with TypedSink[(Long, V)] + with LongThriftTransformer[V] { override val fields = f + override def sinkFields = f override val mt = implicitly[Manifest[V]] - override def converter[U >: (Long, V)] = TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) + override def converter[U >: (Long, V)] = + TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) + override def setter[U <: (Long, V)] = TupleSetter.asSubSetter[(Long, V), U](TupleSetter.of[(Long, V)]) } -abstract class MostRecentGoodLongThriftSequenceFile[V <: TBase[_, _]: Manifest](f: Fields, pattern: String, dateRange: DateRange) - extends MostRecentGoodSource(pattern, dateRange, DateOps.UTC) +abstract class MostRecentGoodLongThriftSequenceFile[V <: TBase[_, _]: Manifest]( + f: Fields, + pattern: String, + dateRange: DateRange +) extends MostRecentGoodSource(pattern, dateRange, DateOps.UTC) with 
WritableSequenceFileScheme - with Serializable - with Mappable[(Long, V)] - with LongThriftTransformer[V] { + with Serializable + with Mappable[(Long, V)] + with TypedSink[(Long, V)] + with LongThriftTransformer[V] { override val fields = f + override def sinkFields = f override val mt = implicitly[Manifest[V]] - override def converter[U >: (Long, V)] = TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) + override def converter[U >: (Long, V)] = + TupleConverter.asSuperConverter[(Long, V), U](TupleConverter.of[(Long, V)]) + override def setter[U <: (Long, V)] = TupleSetter.asSubSetter[(Long, V), U](TupleSetter.of[(Long, V)]) } -abstract class DailySuffixLongThriftSequenceFile[V <: TBase[_, _]: Manifest](f: Fields, prefix: String, dateRange: DateRange) - extends TimePathedLongThriftSequenceFile[V](f, prefix, TimePathedSource.YEAR_MONTH_DAY, dateRange) +abstract class DailySuffixLongThriftSequenceFile[V <: TBase[_, _]: Manifest]( + f: Fields, + prefix: String, + dateRange: DateRange +) extends TimePathedLongThriftSequenceFile[V](f, prefix, TimePathedSource.YEAR_MONTH_DAY, dateRange) + +case class DailySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange +) extends DailySuffixSource(prefix, dateRange) + with LzoTsv { + override val fields = fs +} -case class DailySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixSource(prefix, dateRange) with LzoTsv { +case class DailyPrefixSuffixLzoTsv(prefix: String, suffix: String, fs: Fields = Fields.ALL)(implicit + override val dateRange: DateRange +) extends DailyPrefixSuffixSource(prefix, suffix, dateRange) + with LzoTsv { override val fields = fs } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala index 82281e96fa..6c032930fc 100644 --- 
a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/FixedPathSources.scala @@ -12,22 +12,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source import com.google.protobuf.Message import com.twitter.scalding._ -import com.twitter.scalding.Dsl._ -import java.io.Serializable import org.apache.thrift.TBase abstract class FixedPathLzoThrift[T <: TBase[_, _]: Manifest](path: String*) - extends FixedPathSource(path: _*) with LzoThrift[T] { - def column = manifest[T].erasure + extends FixedPathSource(path: _*) + with LzoThrift[T] { + def column = manifest[T].runtimeClass } abstract class FixedPathLzoProtobuf[T <: Message: Manifest](path: String) - extends FixedPathSource(path) with LzoProtobuf[T] { - def column = manifest[T].erasure + extends FixedPathSource(path) + with LzoProtobuf[T] { + def column = manifest[T].runtimeClass } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala index 64af121bd7..4f1a55c5dd 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/GeneratedLzoTypedTsv.scala @@ -5,46 +5,80 @@ import com.twitter.scalding._ trait LzoTypedTsv1[A] extends LzoTypedTsv[Tuple1[A]] with Mappable1[A] -trait LzoTypedTsv2[A,B] extends LzoTypedTsv[Tuple2[A,B]] with Mappable2[A,B] +trait LzoTypedTsv2[A, B] extends LzoTypedTsv[Tuple2[A, B]] with Mappable2[A, B] -trait LzoTypedTsv3[A,B,C] extends LzoTypedTsv[Tuple3[A,B,C]] with Mappable3[A,B,C] 
+trait LzoTypedTsv3[A, B, C] extends LzoTypedTsv[Tuple3[A, B, C]] with Mappable3[A, B, C] -trait LzoTypedTsv4[A,B,C,D] extends LzoTypedTsv[Tuple4[A,B,C,D]] with Mappable4[A,B,C,D] +trait LzoTypedTsv4[A, B, C, D] extends LzoTypedTsv[Tuple4[A, B, C, D]] with Mappable4[A, B, C, D] -trait LzoTypedTsv5[A,B,C,D,E] extends LzoTypedTsv[Tuple5[A,B,C,D,E]] with Mappable5[A,B,C,D,E] +trait LzoTypedTsv5[A, B, C, D, E] extends LzoTypedTsv[Tuple5[A, B, C, D, E]] with Mappable5[A, B, C, D, E] -trait LzoTypedTsv6[A,B,C,D,E,F] extends LzoTypedTsv[Tuple6[A,B,C,D,E,F]] with Mappable6[A,B,C,D,E,F] +trait LzoTypedTsv6[A, B, C, D, E, F] + extends LzoTypedTsv[Tuple6[A, B, C, D, E, F]] + with Mappable6[A, B, C, D, E, F] -trait LzoTypedTsv7[A,B,C,D,E,F,G] extends LzoTypedTsv[Tuple7[A,B,C,D,E,F,G]] with Mappable7[A,B,C,D,E,F,G] +trait LzoTypedTsv7[A, B, C, D, E, F, G] + extends LzoTypedTsv[Tuple7[A, B, C, D, E, F, G]] + with Mappable7[A, B, C, D, E, F, G] -trait LzoTypedTsv8[A,B,C,D,E,F,G,H] extends LzoTypedTsv[Tuple8[A,B,C,D,E,F,G,H]] with Mappable8[A,B,C,D,E,F,G,H] +trait LzoTypedTsv8[A, B, C, D, E, F, G, H] + extends LzoTypedTsv[Tuple8[A, B, C, D, E, F, G, H]] + with Mappable8[A, B, C, D, E, F, G, H] -trait LzoTypedTsv9[A,B,C,D,E,F,G,H,I] extends LzoTypedTsv[Tuple9[A,B,C,D,E,F,G,H,I]] with Mappable9[A,B,C,D,E,F,G,H,I] +trait LzoTypedTsv9[A, B, C, D, E, F, G, H, I] + extends LzoTypedTsv[Tuple9[A, B, C, D, E, F, G, H, I]] + with Mappable9[A, B, C, D, E, F, G, H, I] -trait LzoTypedTsv10[A,B,C,D,E,F,G,H,I,J] extends LzoTypedTsv[Tuple10[A,B,C,D,E,F,G,H,I,J]] with Mappable10[A,B,C,D,E,F,G,H,I,J] +trait LzoTypedTsv10[A, B, C, D, E, F, G, H, I, J] + extends LzoTypedTsv[Tuple10[A, B, C, D, E, F, G, H, I, J]] + with Mappable10[A, B, C, D, E, F, G, H, I, J] -trait LzoTypedTsv11[A,B,C,D,E,F,G,H,I,J,K] extends LzoTypedTsv[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] with Mappable11[A,B,C,D,E,F,G,H,I,J,K] +trait LzoTypedTsv11[A, B, C, D, E, F, G, H, I, J, K] + extends LzoTypedTsv[Tuple11[A, B, C, D, E, F, G, H, I, 
J, K]] + with Mappable11[A, B, C, D, E, F, G, H, I, J, K] -trait LzoTypedTsv12[A,B,C,D,E,F,G,H,I,J,K,L] extends LzoTypedTsv[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] with Mappable12[A,B,C,D,E,F,G,H,I,J,K,L] +trait LzoTypedTsv12[A, B, C, D, E, F, G, H, I, J, K, L] + extends LzoTypedTsv[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] + with Mappable12[A, B, C, D, E, F, G, H, I, J, K, L] -trait LzoTypedTsv13[A,B,C,D,E,F,G,H,I,J,K,L,M] extends LzoTypedTsv[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] with Mappable13[A,B,C,D,E,F,G,H,I,J,K,L,M] +trait LzoTypedTsv13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends LzoTypedTsv[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] + with Mappable13[A, B, C, D, E, F, G, H, I, J, K, L, M] -trait LzoTypedTsv14[A,B,C,D,E,F,G,H,I,J,K,L,M,N] extends LzoTypedTsv[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] with Mappable14[A,B,C,D,E,F,G,H,I,J,K,L,M,N] +trait LzoTypedTsv14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends LzoTypedTsv[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] + with Mappable14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] -trait LzoTypedTsv15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O] extends LzoTypedTsv[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] with Mappable15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O] +trait LzoTypedTsv15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends LzoTypedTsv[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] + with Mappable15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] -trait LzoTypedTsv16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P] extends LzoTypedTsv[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] with Mappable16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P] +trait LzoTypedTsv16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends LzoTypedTsv[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] + with Mappable16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] -trait LzoTypedTsv17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q] extends LzoTypedTsv[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] with Mappable17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q] +trait LzoTypedTsv17[A, 
B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends LzoTypedTsv[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] + with Mappable17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] -trait LzoTypedTsv18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R] extends LzoTypedTsv[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] with Mappable18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R] +trait LzoTypedTsv18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends LzoTypedTsv[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] + with Mappable18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] -trait LzoTypedTsv19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S] extends LzoTypedTsv[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] with Mappable19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S] +trait LzoTypedTsv19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends LzoTypedTsv[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] + with Mappable19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] -trait LzoTypedTsv20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T] extends LzoTypedTsv[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] with Mappable20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T] +trait LzoTypedTsv20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends LzoTypedTsv[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + with Mappable20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] -trait LzoTypedTsv21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U] extends LzoTypedTsv[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] with Mappable21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U] +trait LzoTypedTsv21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends LzoTypedTsv[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + with Mappable21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] -trait LzoTypedTsv22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V] extends 
LzoTypedTsv[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] with Mappable22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V] +trait LzoTypedTsv22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends LzoTypedTsv[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + with Mappable22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] // end of autogenerated diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala index 60c0030c75..55d51d049f 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/HourlySources.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.commons.source @@ -21,32 +21,36 @@ import com.google.protobuf.Message import com.twitter.bijection.Injection import com.twitter.chill.Externalizer import com.twitter.scalding._ -import com.twitter.scalding.Dsl._ import com.twitter.scalding.source._ -import java.io.Serializable import org.apache.thrift.TBase -abstract class HourlySuffixLzoCodec[T](prefix: String, dateRange: DateRange) -(implicit @transient suppliedInjection: Injection[T,Array[Byte]]) - extends HourlySuffixSource(prefix, dateRange) with LzoCodec[T] { +abstract class HourlySuffixLzoCodec[T](prefix: String, dateRange: DateRange)(implicit + @transient suppliedInjection: Injection[T, Array[Byte]] +) extends HourlySuffixSource(prefix, dateRange) + with LzoCodec[T] { val boxed = Externalizer(suppliedInjection) override lazy val injection = boxed.get } -case class HourlySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoTsv { +case class HourlySuffixLzoTsv(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange +) extends HourlySuffixSource(prefix, dateRange) + with LzoTsv { override val fields = fs } abstract class HourlySuffixLzoThrift[T <: TBase[_, _]: Manifest](prefix: String, dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoThrift[T] { - override def column = manifest[T].erasure + extends HourlySuffixSource(prefix, dateRange) + with LzoThrift[T] { + override def column = manifest[T].runtimeClass } abstract class HourlySuffixLzoProtobuf[T <: Message: Manifest](prefix: String, dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with LzoProtobuf[T] { - override def column = manifest[T].erasure + extends HourlySuffixSource(prefix, dateRange) + with LzoProtobuf[T] { + override def column = manifest[T].runtimeClass } abstract class HourlySuffixLzoText(prefix: String, dateRange: DateRange) - 
extends HourlySuffixSource(prefix, dateRange) with LzoText + extends HourlySuffixSource(prefix, dateRange) + with LzoText diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala index d3f6ea9e75..91725e3897 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LongThriftTransformer.scala @@ -12,17 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source import cascading.pipe.Pipe import cascading.tuple.Fields import com.twitter.elephantbird.mapreduce.io.ThriftWritable -import com.twitter.elephantbird.util.{ ThriftUtils, TypeRef } +import com.twitter.elephantbird.util.{ThriftUtils, TypeRef} import com.twitter.scalding._ -import com.twitter.scalding.Dsl._ -import org.apache.hadoop.io.{ LongWritable, Writable } +import org.apache.hadoop.io.{LongWritable, Writable} import org.apache.thrift.TBase trait LongThriftTransformer[V <: TBase[_, _]] extends Source { @@ -33,18 +32,16 @@ trait LongThriftTransformer[V <: TBase[_, _]] extends Source { // meant to override fields within WritableSequenceFileScheme. 
val keyType = classOf[LongWritable] val valueType = classOf[ThriftWritable[V]].asInstanceOf[Class[Writable]] - override protected def transformForRead(pipe: Pipe): Pipe = { + override protected def transformForRead(pipe: Pipe): Pipe = new RichPipe(pipe).mapTo(fields -> fields) { v: (LongWritable, ThriftWritable[V]) => - v._2.setConverter(mt.erasure.asInstanceOf[Class[V]]) + v._2.setConverter(mt.runtimeClass.asInstanceOf[Class[V]]) (v._1.get, v._2.get) } - } - override protected def transformForWrite(pipe: Pipe) = { + override protected def transformForWrite(pipe: Pipe) = new RichPipe(pipe).mapTo(fields -> fields) { v: (Long, V) => val key = new LongWritable(v._1) val value = new ThriftWritable(v._2, typeRef) (key, value) } - } - lazy val typeRef = ThriftUtils.getTypeRef(mt.erasure).asInstanceOf[TypeRef[TBase[_, _]]] + lazy val typeRef = ThriftUtils.getTypeRef(mt.runtimeClass).asInstanceOf[TypeRef[TBase[_, _]]] } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala index c97a612644..bb3c40e617 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoCodecSource.scala @@ -12,24 +12,22 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source import com.twitter.chill.Externalizer -import com.twitter.scalding._ import com.twitter.bijection.Injection /** - * Source used to write some type T into an LZO-compressed SequenceFile using a - * codec on T for serialization. + * Source used to write some type T into an LZO-compressed SequenceFile using a codec on T for serialization. 
*/ object LzoCodecSource { def apply[T](paths: String*)(implicit passedInjection: Injection[T, Array[Byte]]) = new LzoCodec[T] { val hdfsPaths = paths - val localPath = { assert(paths.size == 1, "Cannot use multiple input files on local mode"); paths(0) } + val localPaths = paths val boxed = Externalizer(passedInjection) override def injection = boxed.get } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala new file mode 100644 index 0000000000..4e9764ece6 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericScheme.scala @@ -0,0 +1,157 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding.commons.source + +import scala.reflect.ClassTag + +import com.twitter.bijection._ +import com.twitter.chill.Externalizer +import com.twitter.elephantbird.cascading2.scheme.LzoBinaryScheme +import com.twitter.elephantbird.mapreduce.input.combine.DelegateCombineFileInputFormat +import com.twitter.elephantbird.mapreduce.io.{BinaryConverter, GenericWritable} +import com.twitter.elephantbird.mapreduce.input.{BinaryConverterProvider, MultiInputFormat} +import com.twitter.elephantbird.mapreduce.output.LzoGenericBlockOutputFormat +import com.twitter.elephantbird.mapred.output.DeprecatedOutputFormatWrapper + +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} +import org.apache.hadoop.conf.Configuration + +import cascading.tap.Tap +import cascading.flow.FlowProcess + +/** + * Serializes BinaryConverters to JobConf. + */ +private[source] object ExternalizerSerializer { + def inj[T]: Injection[Externalizer[T], String] = { + import com.twitter.bijection.Inversion.attemptWhen + import com.twitter.bijection.codec.Base64 + + implicit val baseInj: Injection[Externalizer[T], Array[Byte]] = + JavaSerializationInjection[Externalizer[T]] + + implicit val unwrap: Injection[GZippedBase64String, String] = + // this does not catch cases where it's Base64 but not compressed + // but the decompression injection will, so it's safe to do this + new AbstractInjection[GZippedBase64String, String] { + override def apply(gzbs: GZippedBase64String) = gzbs.str + override def invert(str: String) = attemptWhen(str)(Base64.isBase64)(GZippedBase64String(_)) + } + + Injection.connect[Externalizer[T], Array[Byte], GZippedBase64String, String] + } +} + +private[source] object SourceConfigBinaryConverterProvider { + val ProviderConfKey = "com.twitter.scalding.lzo.converter.provider.source" +} +private[source] class SourceConfigBinaryConverterProvider[M] + extends ConfigBinaryConverterProvider[M](SourceConfigBinaryConverterProvider.ProviderConfKey) 
+ +private[source] object SinkConfigBinaryConverterProvider { + val ProviderConfKey = "com.twitter.scalding.lzo.converter.provider.sink" +} +private[source] class SinkConfigBinaryConverterProvider[M] + extends ConfigBinaryConverterProvider[M](SinkConfigBinaryConverterProvider.ProviderConfKey) + +/** + * Provides BinaryConverter serialized in JobConf. + */ +private[source] class ConfigBinaryConverterProvider[M](private[this] val confKey: String) + extends BinaryConverterProvider[M] { + private[this] var cached: Option[(String, BinaryConverter[M])] = None + + override def getConverter(conf: Configuration): BinaryConverter[M] = { + val data = conf.get(confKey) + require(data != null, s"$confKey is not set in configuration") + cached match { + case Some((d, conv)) if d == data => conv + case _ => + val extern = ExternalizerSerializer.inj.invert(data).get + val conv = extern.get.asInstanceOf[BinaryConverter[M]] + cached = Some((data, conv)) + conv + } + } +} + +object LzoGenericScheme { + def apply[M: ClassTag](conv: BinaryConverter[M]): LzoGenericScheme[M] = + new LzoGenericScheme(conv, implicitly[ClassTag[M]].runtimeClass.asInstanceOf[Class[M]]) + + def apply[M](conv: BinaryConverter[M], clazz: Class[M]): LzoGenericScheme[M] = + new LzoGenericScheme(conv, clazz) + + /** + * From a Binary Converter passed in configure in the JobConf using of that by ElephantBird + */ + def setConverter[M]( + conv: BinaryConverter[M], + conf: JobConf, + confKey: String, + overrideConf: Boolean = false + ): Unit = + if ((conf.get(confKey) == null) || overrideConf) { + val extern = Externalizer(conv) + try { + ExternalizerSerializer.inj.invert(ExternalizerSerializer.inj(extern)).get + } catch { + case e: Exception => + throw new RuntimeException("Unable to roundtrip the BinaryConverter in the Externalizer.", e) + } + conf.set(confKey, ExternalizerSerializer.inj(extern)) + } + +} + +/** + * Generic scheme for data stored as lzo-compressed protobuf messages. 
Serialization is performed using the + * supplied BinaryConverter. + */ +class LzoGenericScheme[M](@transient conv: BinaryConverter[M], clazz: Class[M]) + extends LzoBinaryScheme[M, GenericWritable[M]] { + + override protected def prepareBinaryWritable(): GenericWritable[M] = + new GenericWritable(conv) + + override def sourceConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + conf: JobConf + ): Unit = { + + LzoGenericScheme.setConverter(conv, conf, SourceConfigBinaryConverterProvider.ProviderConfKey) + MultiInputFormat.setClassConf(clazz, conf) + MultiInputFormat.setGenericConverterClassConf(classOf[SourceConfigBinaryConverterProvider[_]], conf) + + DelegateCombineFileInputFormat.setDelegateInputFormat(conf, classOf[MultiInputFormat[_]]) + } + + override def sinkConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + conf: JobConf + ): Unit = { + LzoGenericScheme.setConverter(conv, conf, SinkConfigBinaryConverterProvider.ProviderConfKey) + LzoGenericBlockOutputFormat.setClassConf(clazz, conf) + LzoGenericBlockOutputFormat.setGenericConverterClassConf( + classOf[SinkConfigBinaryConverterProvider[_]], + conf + ) + DeprecatedOutputFormatWrapper.setOutputFormat(classOf[LzoGenericBlockOutputFormat[_]], conf) + } +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala new file mode 100644 index 0000000000..72c305fcaa --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoGenericSource.scala @@ -0,0 +1,48 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.scalding.commons.source + +import com.twitter.elephantbird.mapreduce.io.BinaryConverter +import com.twitter.scalding._ + +import cascading.scheme.Scheme + +/** + * Generic source with an underlying GenericScheme that uses the supplied BinaryConverter. + */ +abstract class LzoGenericSource[T] + extends FileSource + with SingleMappable[T] + with TypedSink[T] + with LocalTapSource { + def clazz: Class[T] + def conv: BinaryConverter[T] + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + override def hdfsScheme = HadoopSchemeInstance( + LzoGenericScheme[T](conv, clazz).asInstanceOf[Scheme[_, _, _, _, _]] + ) +} + +object LzoGenericSource { + def apply[T](passedConv: BinaryConverter[T], passedClass: Class[T], paths: String*) = + new LzoGenericSource[T] { + override val conv: BinaryConverter[T] = passedConv + override val clazz = passedClass + override val hdfsPaths = paths + override val localPaths = paths + } +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala index 4dc52366b0..d32dab1747 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTraits.scala @@ -12,14 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source -import collection.mutable.ListBuffer - import cascading.pipe.Pipe -import cascading.scheme.local.{ TextDelimited => CLTextDelimited, TextLine => CLTextLine } import cascading.scheme.Scheme import org.apache.thrift.TBase @@ -28,25 +25,38 @@ import com.twitter.bijection.Injection import com.twitter.elephantbird.cascading2.scheme._ import com.twitter.scalding._ import com.twitter.scalding.Dsl._ -import com.twitter.scalding.source.{ CheckedInversion, MaxFailuresCheck } +import com.twitter.scalding.source.{CheckedInversion, MaxFailuresCheck} import com.twitter.scalding.typed.TypedSink +import scala.collection.JavaConverters._ trait LzoCodec[T] extends FileSource with SingleMappable[T] with TypedSink[T] with LocalTapSource { - def injection: Injection[T,Array[Byte]] - override def setter[U <:T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + def injection: Injection[T, Array[Byte]] + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) override def hdfsScheme = HadoopSchemeInstance((new LzoByteArrayScheme).asInstanceOf[Scheme[_, _, _, _, _]]) override def transformForRead(pipe: Pipe) = - pipe.map(0 -> 0) { injection.invert(_: Array[Byte]).get } + pipe.flatMap(0 -> 0)(fromBytes(_: Array[Byte])) override def transformForWrite(pipe: Pipe) = - pipe.mapTo(0 -> 0) { injection.apply(_: T) } + pipe.mapTo(0 -> 0)(injection.apply(_: T)) + + protected def fromBytes(b: Array[Byte]): Option[T] = Some(injection.invert(b).get) + + override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { + val tap = createTap(Read)(mode) + CascadingMode + .cast(mode) + .openForRead(config, tap) + .asScala + .flatMap { te => + fromBytes(te.selectTuple(sourceFields).getObject(0).asInstanceOf[Array[Byte]]) + } + } } trait ErrorHandlingLzoCodec[T] extends LzoCodec[T] { def 
checkedInversion: CheckedInversion[T, Array[Byte]] - override def transformForRead(pipe: Pipe) = - pipe.flatMap(0 -> 0) { (b: Array[Byte]) => checkedInversion(b) } + override def fromBytes(b: Array[Byte]) = checkedInversion(b) } // Common case of setting a maximum number of errors @@ -55,40 +65,50 @@ trait ErrorThresholdLzoCodec[T] extends ErrorHandlingLzoCodec[T] { lazy val checkedInversion: CheckedInversion[T, Array[Byte]] = new MaxFailuresCheck(maxErrors)(injection) } -trait LzoProtobuf[T <: Message] extends FileSource with SingleMappable[T] with TypedSink[T] with LocalTapSource { +trait LzoProtobuf[T <: Message] extends LocalTapSource with SingleMappable[T] with TypedSink[T] { def column: Class[_] - override def setter[U <:T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance((new LzoProtobufScheme[T](column)).asInstanceOf[Scheme[_,_,_,_,_]]) + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoProtobufScheme[T](column)).asInstanceOf[Scheme[_, _, _, _, _]] + ) } -trait LzoThrift[T <: TBase[_, _]] extends FileSource with SingleMappable[T] with TypedSink[T] with LocalTapSource { +trait LzoThrift[T <: TBase[_, _]] extends LocalTapSource with SingleMappable[T] with TypedSink[T] { def column: Class[_] - override def setter[U <:T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance((new LzoThriftScheme[T](column)).asInstanceOf[Scheme[_,_,_,_,_]]) + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoThriftScheme[T](column)).asInstanceOf[Scheme[_, _, _, _, _]] + ) } -trait LzoText extends FileSource with SingleMappable[String] with TypedSink[String] with LocalTapSource { +trait LzoText extends LocalTapSource with SingleMappable[String] with TypedSink[String] { 
override def setter[U <: String] = TupleSetter.asSubSetter[String, U](TupleSetter.singleSetter[String]) override def hdfsScheme = HadoopSchemeInstance(new LzoTextLine()) + override def sourceFields = Dsl.intFields(Seq(1)) } trait LzoTsv extends DelimitedScheme with LocalTapSource { - override def hdfsScheme = HadoopSchemeInstance(new LzoTextDelimited(fields, separator, types)) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe)) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) } trait LzoTypedTsv[T] extends DelimitedScheme with Mappable[T] with TypedSink[T] with LocalTapSource { - override def setter[U <:T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) - override def hdfsScheme = HadoopSchemeInstance(new LzoTextDelimited(fields, separator, types)) + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + override def hdfsScheme = HadoopSchemeInstance( + (new LzoTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe)) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) def mf: Manifest[T] - override val types: Array[Class[_]] = { - if (classOf[scala.Product].isAssignableFrom(mf.erasure)) { - //Assume this is a Tuple: - mf.typeArguments.map { _.erasure }.toArray + override val types: Array[Class[_]] = + if (classOf[scala.Product].isAssignableFrom(mf.runtimeClass)) { + // Assume this is a Tuple: + mf.typeArguments.map(_.runtimeClass).toArray } else { - //Assume there is only a single item - Array(mf.erasure) + // Assume there is only a single item + Array(mf.runtimeClass) } - } } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala new file mode 100644 index 0000000000..7f32be0259 --- /dev/null +++ 
b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/LzoTypedText.scala @@ -0,0 +1,110 @@ +package com.twitter.scalding.commons.source + +import cascading.scheme.Scheme +import com.twitter.elephantbird.cascading2.scheme.LzoTextDelimited +import com.twitter.scalding._ +import com.twitter.scalding.source.TypedTextDelimited +import com.twitter.scalding.source.TypedSep + +object LzoTypedText { + + val TAB = TypedSep("\t") + val ONE = TypedSep("\u0001") + val COMMA = TypedSep(",") + + /* + * To use these, you will generally want to + * import com.twitter.scalding.commons.source.typedtext._ + * to get the implicit TypedDescriptor. + * Then use TypedText.lzoTzv[MyCaseClass]("path") + */ + def lzoTsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = + new FixedLzoTypedText[T](TAB, path: _*) + def lzoOsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = + new FixedLzoTypedText[T](ONE, path: _*) + def lzoCsv[T: TypeDescriptor](path: String*): TypedTextDelimited[T] = + new FixedLzoTypedText[T](COMMA, path: _*) + + def hourlyLzoTsv[T]( + prefix: String + )(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathLzoTypedText[T](TAB, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") + } + + def hourlyLzoOsv[T]( + prefix: String + )(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathLzoTypedText[T](ONE, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") + } + + def hourlyLzoCsv[T]( + prefix: String + )(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathLzoTypedText[T](COMMA, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") + } + + def dailyLzoTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): 
TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathLzoTypedText[T](TAB, prefix + TimePathedSource.YEAR_MONTH_DAY + "/*") + } + + def dailyLzoOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathLzoTypedText[T](ONE, prefix + TimePathedSource.YEAR_MONTH_DAY + "/*") + } + + def dailyLzoCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathLzoTypedText[T](COMMA, prefix + TimePathedSource.YEAR_MONTH_DAY + "/*") + } + + def dailyPrefixSuffixLzoOsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + require(suffix.head == '/', "suffix should include a preceding /") + new TimePathLzoTypedText[T](ONE, prefix + TimePathedSource.YEAR_MONTH_DAY + suffix + "/*") + } + +} + +trait LzoTypedTextDelimited[T] extends TypedTextDelimited[T] with LocalTapSource { + override def hdfsScheme = + HadoopSchemeInstance( + new LzoTextDelimited( + typeDescriptor.fields, + false, + false, + separator.str, + strict, + null /* quote */, + typeDescriptor.fields.getTypesClasses, + safe + ).asInstanceOf[Scheme[_, _, _, _, _]] + ) +} + +class TimePathLzoTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) + extends TimePathedSource(path, dr, DateOps.UTC) + with LzoTypedTextDelimited[T] { + override def typeDescriptor = td + protected override def separator = sep +} + +class MostRecentLzoTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) + extends MostRecentGoodSource(path, dr, DateOps.UTC) + with LzoTypedTextDelimited[T] { + override def typeDescriptor = td + protected override def 
separator = sep +} + +class FixedLzoTypedText[T](sep: TypedSep, path: String*)(implicit td: TypeDescriptor[T]) + extends FixedPathSource(path: _*) + with LzoTypedTextDelimited[T] { + override def typeDescriptor = td + protected override def separator = sep +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/PailSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/PailSource.scala deleted file mode 100644 index 554ad381a9..0000000000 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/PailSource.scala +++ /dev/null @@ -1,216 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package com.twitter.scalding.commons.source - -import com.backtype.cascading.tap.PailTap -import com.backtype.hadoop.pail.{Pail, PailStructure} -import cascading.pipe.Pipe -import cascading.scheme.Scheme -import cascading.tap.Tap -import com.twitter.bijection.Injection -import com.twitter.chill.Externalizer -import com.twitter.scalding._ -import java.util.{ List => JList } -import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader } -import scala.collection.JavaConverters._ - -/** - * The PailSource enables scalding integration with the Pail class in the - * dfs-datastores library. PailSource allows scalding to sink 1-tuples - * to subdirectories of a root folder by applying a routing function to - * each tuple. 
- * - * SEE EXAMPLE : https://gist.github.com/krishnanraman/5224937 - */ - -object PailSource { - - /** - * the simplest version of sink - THE MOST COMMON USE CASE - * specify exactly 2 parameters - * rootPath - the location ie. Where do you want your Pail to reside ? - * targetFn - the partition function ie. How do we create Pail subdirectories out of your input space ? - * - * SEE EXAMPLE : https://gist.github.com/krishnanraman/5224937 - */ - def sink[T]( rootPath: String, - targetFn: (T) => List[String] ) - (implicit cmf: ClassManifest[T], - injection: Injection[T, Array[Byte]]):PailSource[T] = { - - val validator = ((x:List[String])=> true) - val cps = new CodecPailStructure[T]() - cps.setParams( targetFn, validator, cmf.erasure.asInstanceOf[Class[T]], injection) - sink(rootPath, cps) - } - - /** - * the simplest version of source - THE MOST COMMON USE CASE - * specify exactly 2 parameters - * rootPath - the location ie. Where does your Pail reside - its root directory ? - * subPath - the location ie. Where does your Pail reside - its subdirectories ? - * eg. Say your data resides in foo/bar, foo/obj, foo/ghj - * If you care about obj & ghj, the rootPath = "foo", subPaths = Array(List("obj"), List("ghj")) - * Notice that subPaths != Array(List("obj", "ghj")) - this would fail. - * Every subdirectory goes in its own list. - * - * SEE EXAMPLE : https://gist.github.com/krishnanraman/5224937 - */ - def source[T](rootPath: String, - subPaths: Array[List[String]]) - (implicit cmf: ClassManifest[T], - injection: Injection[T, Array[Byte]]):PailSource[T] = { - - val validator = ((x:List[String])=> true) - val cps = new CodecPailStructure[T]() - cps.setParams( null, validator, cmf.erasure.asInstanceOf[Class[T]], injection) - source( rootPath, cps, subPaths) - } - - /** Generic version of Pail sink accepts a PailStructure. 
- */ - def sink[T](rootPath: String, structure: PailStructure[T]):PailSource[T] = - new PailSource(rootPath, structure) - - /** A Pail sink can also build its structure on the fly from a - * couple of functions. - */ - def sink[T]( rootPath: String, - targetFn: (T) => List[String], - validator: (List[String]) => Boolean, - mytype:java.lang.Class[T], - injection: Injection[T, Array[Byte]]):PailSource[T] = { - - val cps = new CodecPailStructure[T]() - cps.setParams( targetFn, validator, mytype, injection) - sink( rootPath, cps) - } - - /** Alternate sink construction - * Using implicit injections & classmanifest for the type - */ - def sink[T]( rootPath: String, - targetFn: (T) => List[String], - validator: (List[String]) => Boolean) - (implicit cmf: ClassManifest[T], - injection: Injection[T, Array[Byte]]):PailSource[T] = { - val cps = new CodecPailStructure[T]() - cps.setParams( targetFn, validator, cmf.erasure.asInstanceOf[Class[T]], injection) - sink(rootPath, cps) - } - - /** Generic version of Pail source accepts a PailStructure. 
- */ - def source[T](rootPath: String, structure: PailStructure[T], subPaths: Array[List[String]]):PailSource[T] = { - assert( subPaths != null && subPaths.size > 0) - new PailSource(rootPath, structure, subPaths) - } - - /** The most explicit method to construct a Pail source - specify all 5 params - */ - def source[T](rootPath: String, - validator: (List[String]) => Boolean, - mytype:java.lang.Class[T], - injection: Injection[T, Array[Byte]] , - subPaths: Array[List[String]]):PailSource[T] = { - val cps = new CodecPailStructure[T]() - cps.setParams( null, validator, mytype, injection) - source( rootPath, cps, subPaths) - } - - /** Alternate Pail source construction - specify 3 params, rest implicit - */ - def source[T](rootPath: String, - validator: (List[String]) => Boolean, - subPaths: Array[List[String]]) - (implicit cmf: ClassManifest[T], - injection: Injection[T, Array[Byte]]):PailSource[T] = { - val cps = new CodecPailStructure[T]() - cps.setParams( null, validator, cmf.erasure.asInstanceOf[Class[T]], injection) - source( rootPath, cps, subPaths) - } -} - -class PailSource[T] private (rootPath: String, structure: PailStructure[T], subPaths: Array[List[String]] = null) - (implicit conv: TupleConverter[T]) -extends Source with Mappable[T] { - import Dsl._ - - override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) - val fieldName = "pailItem" - - lazy val getTap = { - val spec = PailTap.makeSpec(null, structure) - val javaSubPath = if ((subPaths == null) || (subPaths.size == 0)) null else subPaths map { _.asJava } - val opts = new PailTap.PailTapOptions(spec, fieldName, javaSubPath , null) - new PailTap(rootPath, opts) - } - - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { - val tap = com.twitter.scalding.CastHfsTap(getTap) - - mode match { - case Hdfs(strict, config) => - readOrWrite match { - case Read => tap - case Write => tap - } - case _ => - TestTapFactory(this, 
tap.getScheme).createTap(readOrWrite)(mode) - } - } - -} - -/** - * It is quite unlikely for client code to make a CodecPailStructure - * CodecPailStructure is constructed by PailSource's factory methods. - * - * targetFn takes an instance of T and returns a list - *"path components". Pail joins these components with - * File.separator and sinks the instance of T into the pail at that location. - * - * Usual implementations of "validator" will check that the length of - * the supplied list is >= the length f the list returned by targetFn. - * - * CodecPailStructure has a default constructor because it is instantiated via reflection - * This unfortunately means params must be set via setParams to make it usefuls -*/ - -class CodecPailStructure[T] extends PailStructure[T] { - - private var targetFn: T => List[String] = null - private var validator :List[String] => Boolean = ((x:List[String])=> true) - private var mytype: java.lang.Class[T] = null - private var injection: Injection[T, Array[Byte]] = null - - private[source] def setParams( targetFn: T => List[String], - validator: List[String] => Boolean, - mytype:java.lang.Class[T], - injection: Injection[T, Array[Byte]]) = { - - this.targetFn = targetFn - this.validator = validator - this.mytype = mytype - this.injection = injection - } - override def isValidTarget(paths: String*): Boolean = validator(paths.toList) - override def getTarget(obj: T): JList[String] = targetFn(obj).toList.asJava - override def serialize(obj: T): Array[Byte] = injection.apply(obj) - override def deserialize(bytes: Array[Byte]): T = injection.invert(bytes).get - override val getType = mytype -} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala index 46dd53744a..5aefae7f62 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala +++ 
b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/TsvWithHeader.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source @@ -22,10 +22,8 @@ import cascading.tuple.Fields import com.google.common.base.Charsets import com.google.common.io.Files import com.twitter.scalding._ -import com.twitter.scalding.Dsl._ -import java.io.{ BufferedWriter, File, FileOutputStream, IOException, OutputStreamWriter, Serializable } -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{ FileSystem, Path } +import java.io.{BufferedWriter, File, FileOutputStream, IOException, OutputStreamWriter} +import org.apache.hadoop.fs.Path /** * A tsv source with the column name header info. @@ -33,9 +31,9 @@ import org.apache.hadoop.fs.{ FileSystem, Path } * Header file format: tab separated column names. */ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) - extends FixedPathSource(p) - with DelimitedScheme - with FieldConversions { + extends FixedPathSource(p) + with DelimitedScheme + with FieldConversions { val headerPath = p.replaceAll("/+$", "") + ".HEADER" // make it lazy so as to only do once @@ -53,12 +51,12 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) } // TODO: move this method to make it a util function. 
- def readFromFile(filename: String)(implicit mode: Mode) = { + def readFromFile(filename: String)(implicit mode: Mode) = mode match { case Hdfs(_, conf) => { try { val pt = new Path(filename) - val fs = FileSystem.get(conf) + val fs = pt.getFileSystem(conf) fs.open(pt).readUTF } catch { case e: IOException => { @@ -77,15 +75,14 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) } } } - } // TODO: move this method to make it a util function. - def writeToFile(filename: String, text: String)(implicit mode: Mode) { + def writeToFile(filename: String, text: String)(implicit mode: Mode): Unit = mode match { case Hdfs(_, conf) => { try { val pt = new Path(filename) - val fs = FileSystem.get(conf) + val fs = pt.getFileSystem(conf) val br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true))) br.write(text) @@ -99,8 +96,7 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) // Local mode case _ => { try { - val br = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream(filename), "utf-8")) + val br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(filename), "utf-8")) br.write(text) br.close() @@ -111,7 +107,6 @@ class TsvWithHeader(p: String, f: Fields = Fields.UNKNOWN)(implicit mode: Mode) } } } - } override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode) = { val ret = super.writeFrom(pipe)(flowDef, mode) diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala index 118e51c6af..389a4f592a 100644 --- a/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/commons/source/VersionedKeyValSource.scala @@ -12,55 +12,67 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source -import com.backtype.cascading.scheme.KeyValueByteScheme -import com.backtype.cascading.tap.VersionedTap -import com.backtype.cascading.tap.VersionedTap.TapMode import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.scheme.Scheme -import cascading.scheme.local.TextDelimited import cascading.tap.Tap import cascading.tuple.Fields import com.twitter.algebird.Monoid import com.twitter.bijection.Injection import com.twitter.chill.Externalizer +import com.twitter.scalding.TDsl._ +import com.twitter.scalding.Dsl._ import com.twitter.scalding._ +import com.twitter.scalding.commons.scheme.KeyValueByteScheme +import com.twitter.scalding.commons.tap.VersionedTap +import com.twitter.scalding.commons.tap.VersionedTap.TapMode +import com.twitter.scalding.source.{CheckedInversion, MaxFailuresCheck} import com.twitter.scalding.typed.KeyedListLike import com.twitter.scalding.typed.TypedSink -import com.twitter.scalding.source.{ CheckedInversion, MaxFailuresCheck } -import org.apache.hadoop.mapred.{ JobConf, OutputCollector, RecordReader } - -// Get the tuple adding syntax: -import com.twitter.scalding.TDsl._ +import org.apache.hadoop.mapred.JobConf +import scala.collection.JavaConverters._ /** - * Source used to write key-value pairs as byte arrays into a versioned store. - * Supports incremental updates via the monoid on V. + * Source used to write key-value pairs as byte arrays into a versioned store. Supports incremental updates + * via the monoid on V. */ object VersionedKeyValSource { val defaultVersionsToKeep = 3 // TODO: have two apply methods here for binary compatibility purpose. Need to clean it up in next release. 
- def apply[K,V](path: String, sourceVersion: Option[Long] = None, sinkVersion: Option[Long] = None, maxFailures: Int = 0) - (implicit codec: Injection[(K,V),(Array[Byte],Array[Byte])]) = { - new VersionedKeyValSource[K,V](path, sourceVersion, sinkVersion, maxFailures, defaultVersionsToKeep) - } - - def apply[K,V](path: String, sourceVersion: Option[Long], sinkVersion: Option[Long], maxFailures: Int, versionsToKeep: Int) - (implicit codec: Injection[(K,V),(Array[Byte],Array[Byte])]) = - new VersionedKeyValSource[K,V](path, sourceVersion, sinkVersion, maxFailures, versionsToKeep) + def apply[K, V]( + path: String, + sourceVersion: Option[Long] = None, + sinkVersion: Option[Long] = None, + maxFailures: Int = 0 + )(implicit codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = + new VersionedKeyValSource[K, V](path, sourceVersion, sinkVersion, maxFailures, defaultVersionsToKeep) + + def apply[K, V]( + path: String, + sourceVersion: Option[Long], + sinkVersion: Option[Long], + maxFailures: Int, + versionsToKeep: Int + )(implicit codec: Injection[(K, V), (Array[Byte], Array[Byte])]) = + new VersionedKeyValSource[K, V](path, sourceVersion, sinkVersion, maxFailures, versionsToKeep) } -class VersionedKeyValSource[K,V](val path: String, val sourceVersion: Option[Long], val sinkVersion: Option[Long], - val maxFailures: Int, val versionsToKeep: Int)( - implicit @transient codec: Injection[(K,V),(Array[Byte],Array[Byte])]) extends Source with Mappable[(K,V)] with TypedSink[(K,V)] { - - import Dsl._ +class VersionedKeyValSource[K, V]( + val path: String, + val sourceVersion: Option[Long], + val sinkVersion: Option[Long], + val maxFailures: Int, + val versionsToKeep: Int +)(implicit @transient codec: Injection[(K, V), (Array[Byte], Array[Byte])]) + extends Source + with Mappable[(K, V)] + with TypedSink[(K, V)] { val keyField = "key" val valField = "value" @@ -69,36 +81,62 @@ class VersionedKeyValSource[K,V](val path: String, val sourceVersion: Option[Lon override def 
converter[U >: (K, V)] = TupleConverter.asSuperConverter[(K, V), U](TupleConverter.of[(K, V)]) - override def setter[U <: (K, V)] = TupleSetter.asSubSetter[(K, V), U](TupleSetter.of[(K,V)]) + override def setter[U <: (K, V)] = TupleSetter.asSubSetter[(K, V), U](TupleSetter.of[(K, V)]) def hdfsScheme = HadoopSchemeInstance(new KeyValueByteScheme(fields).asInstanceOf[Scheme[_, _, _, _, _]]) @deprecated("This method is deprecated", "0.1.6") - def this(path: String, sourceVersion: Option[Long], sinkVersion: Option[Long], maxFailures: Int) - (implicit @transient codec: Injection[(K,V),(Array[Byte],Array[Byte])]) = + def this(path: String, sourceVersion: Option[Long], sinkVersion: Option[Long], maxFailures: Int)(implicit + @transient codec: Injection[(K, V), (Array[Byte], Array[Byte])] + ) = this(path, sourceVersion, sinkVersion, maxFailures, VersionedKeyValSource.defaultVersionsToKeep)(codec) def getTap(mode: TapMode) = { val tap = new VersionedTap(path, hdfsScheme, mode).setVersionsToKeep(versionsToKeep) - if (mode == TapMode.SOURCE && sourceVersion.isDefined) - tap.setVersion(sourceVersion.get) - else if (mode == TapMode.SINK && sinkVersion.isDefined) - tap.setVersion(sinkVersion.get) - else - tap + (sourceVersion, sinkVersion) match { + case (Some(v), _) if mode == TapMode.SOURCE => + tap.setVersion(v) + case (_, Some(v)) if mode == TapMode.SINK => + tap.setVersion(v) + case _ => + tap + } } val source = getTap(TapMode.SOURCE) val sink = getTap(TapMode.SINK) - def resourceExists(mode: Mode) = + override def validateTaps(mode: Mode): Unit = + // if a version is explicitly supplied, ensure that it exists + sourceVersion.foreach { version => + mode match { + case hadoopMode: HadoopMode => { + val store = source.getStore(new JobConf(hadoopMode.jobConf)) + + if (!store.hasVersion(version)) { + throw new InvalidSourceException( + "Version %s does not exist. 
Currently available versions are: %s" + .format(version, store.getAllVersions) + ) + } + } + + case _ => + throw new IllegalArgumentException( + "VersionedKeyValSource does not support mode %s. Only HadoopMode is supported" + .format(mode) + ) + } + } + + def resourceExists(mode: Mode): Boolean = mode match { case Test(buffers) => { - buffers(this) map { !_.isEmpty } getOrElse false + buffers(this).map(!_.isEmpty).getOrElse(false) } case HadoopTest(conf, buffers) => { - buffers(this) map { !_.isEmpty } getOrElse false + buffers(this).map(!_.isEmpty).getOrElse(false) } case _ => { val conf = new JobConf(mode.asInstanceOf[HadoopMode].jobConf) @@ -106,7 +144,24 @@ class VersionedKeyValSource[K,V](val path: String, val sourceVersion: Option[Lon } } - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_,_,_] = { + def sinkExists(mode: Mode): Boolean = + sinkVersion.exists { version => + mode match { + case Test(buffers) => + buffers(this).map(!_.isEmpty).getOrElse(false) + + case HadoopTest(conf, buffers) => + buffers(this).map(!_.isEmpty).getOrElse(false) + + case m: HadoopMode => + val conf = new JobConf(m.jobConf) + val store = sink.getStore(conf) + store.hasVersion(version) + case _ => sys.error(s"Unknown mode $mode") + } + } + + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { import com.twitter.scalding.CastHfsTap mode match { case Hdfs(_strict, _config) => @@ -120,21 +175,40 @@ class VersionedKeyValSource[K,V](val path: String, val sourceVersion: Option[Lon } // Override this for more control on failure on decode - protected lazy val checkedInversion: CheckedInversion[(K,V), (Array[Byte],Array[Byte])] = + protected lazy val checkedInversion: CheckedInversion[(K, V), (Array[Byte], Array[Byte])] = new MaxFailuresCheck(maxFailures)(codecBox.get) - override def sinkFields = fields + override def sinkFields: Fields = fields - override def transformForRead(pipe: Pipe) = { - pipe.flatMap((keyField, valField) 
-> (keyField, valField)) { pair: (Array[Byte],Array[Byte]) => + override def transformForRead(pipe: Pipe): Pipe = + pipe.flatMap((keyField, valField) -> (keyField, valField)) { pair: (Array[Byte], Array[Byte]) => checkedInversion(pair) } - } - override def transformForWrite(pipe: Pipe) = { - pipe.mapTo((0,1) -> (keyField, valField)) { pair: (K,V) => + override def transformForWrite(pipe: Pipe): Pipe = + pipe.mapTo((0, 1) -> (keyField, valField)) { pair: (K, V) => codecBox.get.apply(pair) } + + override def toIterator(implicit config: Config, mode: Mode): Iterator[(K, V)] = { + val tap = createTap(Read)(mode) + CascadingMode + .cast(mode) + .openForRead(config, tap) + .asScala + .flatMap { te => + val item = te.selectTuple(fields) + mode match { + case _: TestMode => + val key = item.getObject(0).asInstanceOf[K] + val value = item.getObject(1).asInstanceOf[V] + Some((key, value)) + case _ => + val key = item.getObject(0).asInstanceOf[Array[Byte]] + val value = item.getObject(1).asInstanceOf[Array[Byte]] + checkedInversion((key, value)) + } + } } override def toString = @@ -153,38 +227,38 @@ class VersionedKeyValSource[K,V](val path: String, val sourceVersion: Option[Lon object RichPipeEx extends java.io.Serializable { implicit def pipeToRichPipeEx(pipe: Pipe): RichPipeEx = new RichPipeEx(pipe) - implicit def typedPipeToRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K,V)]) = + implicit def typedPipeToRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K, V)]): TypedRichPipeEx[K, V] = new TypedRichPipeEx(pipe) implicit def keyedListLikeToRichPipeEx[K: Ordering, V: Monoid, T[K, +V] <: KeyedListLike[K, V, T]]( - kll: KeyedListLike[K, V, T]) = typedPipeToRichPipeEx(kll.toTypedPipe) + kll: KeyedListLike[K, V, T] + ): TypedRichPipeEx[K, V] = typedPipeToRichPipeEx(kll.toTypedPipe) } -class TypedRichPipeEx[K: Ordering, V: Monoid](pipe: TypedPipe[(K,V)]) extends java.io.Serializable { - import Dsl._ - import TDsl._ - +class TypedRichPipeEx[K: Ordering, V: Monoid](pipe: 
TypedPipe[(K, V)]) extends java.io.Serializable { // Tap reads existing data from the `sourceVersion` (or latest // version) of data specified in `src`, merges the K,V pairs from // the pipe in using an implicit `Monoid[V]` and sinks all results // into the `sinkVersion` of data (or a new version) specified by // `src`. - def writeIncremental(src: VersionedKeyValSource[K,V], reducers: Int = 1) - (implicit flowDef: FlowDef, mode: Mode): TypedPipe[(K, V)] = { + def writeIncremental(src: VersionedKeyValSource[K, V], reducers: Int = 1)(implicit + flowDef: FlowDef, + mode: Mode + ): TypedPipe[(K, V)] = { val outPipe = if (!src.resourceExists(mode)) pipe else { val oldPairs = TypedPipe - .from[(K,V)](src.read, (0,1)) - .map { case (k, v) => (k, v ,0) } + .fromPipe[(K, V)](src.read, (0, 1)) + .map { case (k, v) => (k, v, 0) } val newPairs = pipe.sumByLocalKeys.map { case (k, v) => (k, v, 1) } (oldPairs ++ newPairs) - .groupBy { _._1 } + .groupBy(_._1) .withReducers(reducers) - .sortBy { _._3 } - .mapValues { _._2 } + .sortBy(_._3) + .mapValues(_._2) .sum .toTypedPipe } @@ -198,12 +272,13 @@ class RichPipeEx(pipe: Pipe) extends java.io.Serializable { // VersionedKeyValSource always merges with the most recent complete // version - def writeIncremental[K,V](src: VersionedKeyValSource[K,V], fields: Fields, reducers: Int = 1) - (implicit monoid: Monoid[V], - flowDef: FlowDef, - mode: Mode) = { + def writeIncremental[K, V](src: VersionedKeyValSource[K, V], fields: Fields, reducers: Int = 1)(implicit + monoid: Monoid[V], + flowDef: FlowDef, + mode: Mode + ) = { def appendToken(pipe: Pipe, token: Int) = - pipe.mapTo((0,1) -> ('key,'value,'isNew)) { pair: (K,V) => pair :+ token } + pipe.mapTo((0, 1) -> ('key, 'value, 'isNew)) { pair: (K, V) => pair :+ token } val outPipe = if (!src.resourceExists(mode)) @@ -213,8 +288,8 @@ class RichPipeEx(pipe: Pipe) extends java.io.Serializable { val newPairs = appendToken(pipe, 1) (oldPairs ++ newPairs) - .groupBy('key) { 
_.reducers(reducers).sortBy('isNew).sum[V]('value) } - .project(('key,'value)) + .groupBy('key)(_.reducers(reducers).sortBy('isNew).sum[V]('value)) + .project(('key, 'value)) .rename(('key, 'value) -> fields) } diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala new file mode 100644 index 0000000000..21ba4c8d78 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/KMeans.scala @@ -0,0 +1,158 @@ +package com.twitter.scalding.examples + +import com.twitter.scalding._ +import com.twitter.scalding.typed.ComputedValue + +object KMeans { + + /** + * This is the euclidean norm between two vectors + */ + private def distance(v1: Vector[Double], v2: Vector[Double]): Double = + math.sqrt( + v1.iterator + .zip(v2.iterator) + .map { case (l, r) => (l - r) * (l - r) } + .sum + ) + + // Just normal vector addition + private def add(v1: Vector[Double], v2: Vector[Double]): Vector[Double] = + v1.zip(v2).map { case (l, r) => l + r } + + // normal scalar multiplication + private def scale(s: Double, v: Vector[Double]): Vector[Double] = + v.map(x => s * x) + + // Here we return the centroid of some vectors + private def centroidOf(vecs: TraversableOnce[Vector[Double]]): Vector[Double] = { + val (vec, count) = vecs + // add a 1 to each value to count the number of vectors in one pass: + .map(v => (v, 1)) + // Here we add both the count and the vectors: + .reduce { (ll, rr) => + val (l, lc) = ll + val (r, rc) = rr + (add(l, r), lc + rc) + } + // Now scale to get the pointwise average + scale(1.0 / count, vec) + } + + private def closest[Id]( + from: Vector[Double], + centroids: TraversableOnce[(Id, Vector[Double])] + ): (Id, Vector[Double]) = + centroids + // compute the distance to each center + .map { case (id, cent) => (distance(from, cent), (id, cent)) } + // take the minimum by the distance, ignoring the id and the centroid + .minBy { case 
(dist, _) => dist } + // Just keep the id and the centroid + ._2 + + type LabeledVector = (Int, Vector[Double]) + + /** + * This runs one step in a kmeans algorithm It returns the number of vectors that changed clusters, the new + * clusters and the new list of labeled vectors + */ + def kmeansStep( + k: Int, + s: Stat, + clusters: ValuePipe[List[LabeledVector]], + points: TypedPipe[LabeledVector] + ): Execution[(ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { + + // Do a cross product to produce all point, cluster pairs + // in scalding, the smaller pipe should go on the right. + val next = points + .leftCross(clusters) + // now compute the closest cluster for each vector + .map { + case ((oldId, vector), Some(centroids)) => + val (id, newcentroid) = closest(vector, centroids) + if (id != oldId) s.inc() + (id, vector) + case (_, None) => sys.error("Missing clusters, this should never happen") + } + .forceToDiskExecution + + // Now update the clusters: + next.map { pipe => + ( + ComputedValue( + pipe.group + // There is no need to use more than k reducers + .withReducers(k) + .mapValueStream(vectors => Iterator(centroidOf(vectors))) + // Now collect them all into one big + .groupAll + .toList + // discard the "all" key used to group them together + .values + ), + pipe + ) + } + } + + def initializeClusters( + k: Int, + points: TypedPipe[Vector[Double]] + ): (ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector]) = { + val rng = new java.util.Random(123) + // take a random k vectors: + val clusters = points + .map(v => (rng.nextDouble, v)) + .groupAll + .sortedTake(k)(Ordering.by(_._1)) + .mapValues { randk => + randk.iterator.zipWithIndex.map { case ((_, v), id) => (id, v) }.toList + } + .values + + // attach a random cluster to each vector + val labeled = points.map(v => (rng.nextInt(k), v)) + + (ComputedValue(clusters), labeled) + } + + /* + * Run the full k-means algorithm by flatMapping the above function into itself + * while the number of 
vectors that changed is not zero + */ + def kmeans( + k: Int, + clusters: ValuePipe[List[LabeledVector]], + points: TypedPipe[LabeledVector] + ): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { + + val key = StatKey("changed", "scalding.kmeans") + + def go( + s: Stat, + c: ValuePipe[List[LabeledVector]], + p: TypedPipe[LabeledVector], + step: Int + ): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = + kmeansStep(k, s, c, p).getAndResetCounters + .flatMap { case ((nextC, nextP), counters) => + val changed = counters(key) + if (changed == 0L) Execution.from((step, nextC, nextP)) + else go(s, nextC, nextP, step + 1) + } + + Execution.withId { implicit uid => + go(Stat(key), clusters, points, 0) + } + } + + def apply( + k: Int, + points: TypedPipe[Vector[Double]] + ): Execution[(Int, ValuePipe[List[LabeledVector]], TypedPipe[LabeledVector])] = { + val (clusters, labeled) = initializeClusters(k, points) + kmeans(k, clusters, labeled) + } +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala new file mode 100644 index 0000000000..9ec42dc186 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/MergeTest.scala @@ -0,0 +1,49 @@ +package com.twitter.scalding.examples + +import scala.annotation.tailrec + +import com.twitter.scalding._ + +/** + * This example job does not yet work. 
It is a test for Kyro serialization + */ +class MergeTest(args: Args) extends Job(args) { + TextLine(args("input")) + .flatMapTo('word)(_.split("""\s+""")) + .groupBy('word)(_.size) + // Now, let's get the top 10 words: + .groupAll { + _.mapReduceMap(('word, 'size) -> 'list) /* map1 */ { tup: (String, Long) => List(tup) } /* reduce */ { + (l1: List[(String, Long)], l2: List[(String, Long)]) => + mergeSort2(l1, l2, 10, cmpTup) + } /* map2 */ { lout: List[(String, Long)] => + lout + } + } + // Now expand out the list. + .flatMap('list -> ('word, 'cnt)) { list: List[(String, Long)] => list } + .project('word, 'cnt) + .write(Tsv(args("output"))) + + // Reverse sort to get the top items + def cmpTup(t1: (String, Long), t2: (String, Long)) = t2._2.compareTo(t1._2) + + def mergeSort2[T](v1: List[T], v2: List[T], k: Int, cmp: Function2[T, T, Int]) = { + @tailrec + def mergeSortR(acc: List[T], list1: List[T], list2: List[T], k: Int): List[T] = + (list1, list2, k) match { + case (_, _, 0) => acc + case (x1 :: t1, x2 :: t2, _) => { + if (cmp(x1, x2) < 0) { + mergeSortR(x1 :: acc, t1, list2, k - 1) + } else { + mergeSortR(x2 :: acc, list1, t2, k - 1) + } + } + case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, k - 1) + case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, k - 1) + case (Nil, Nil, _) => acc + } + mergeSortR(Nil, v1, v2, k).reverse + } +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala new file mode 100644 index 0000000000..42dd4d0c51 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/PageRank.scala @@ -0,0 +1,173 @@ +package com.twitter.scalding.examples + +import scala.annotation.tailrec +import com.twitter.scalding._ + +/** + * Options: + * --input: the three column TSV with node, comma-sep-out-neighbors, initial pagerank (set to 1.0 first) + * --output: the name for the TSV you want to write to, 
same as above. optional arguments: + * --errorOut: name of where to write the L1 error between the input page-rank and the output if this is + * omitted, we don't compute the error + * --iterations: how many iterations to run inside this job. Default is 1, 10 is about as much as cascading + * can handle. + * --jumpprob: probability of a random jump, default is 0.15 + * --convergence: if this is set, after every "--iterations" steps, we check the error and see if we should + * continue. Since the error check is expensive (involving a join), you should avoid doing this too + * frequently. 10 iterations is probably a good number to set. + * --temp: this is the name where we will store a temporary output so we can compare to the previous for + * convergence checking. If convergence is set, this MUST be. + */ +class PageRank(args: Args) extends Job(args) { + + // How many steps + val STEPS = args.getOrElse("iterations", "1").toInt + // Probability of taking a random jump in the network. + val ALPHA = args.getOrElse("jumpprob", "0.15").toDouble + // How many times have we checked for convergence: + val JOB_COUNT = args.getOrElse("jobCount", "0").toInt + // These are constants used by the algorithm: + val NODESET = 0 + val EDGE = 1 + + // Read the input, this can be subclassed, but should produce a pipe with three + // columns: source node, comma separated (no spaces) destination nodes as a string, and + // initial rank (default to 1.0 if you are starting from nothing) + initialize('src, 'dst, 'rank) + /* + * This algorithm works by having two types of rows that have the same column structure. + * the node -> list(neighbors), and node -> individual neighbor. + * We distinguish these two types with an id which nodes if this is a NODESET or an EDGE. + * The first step is to append that value. We also need to have a column for the degree. 
+ * It doesn't matter what the initial degree is, we recompute below + */ + .map(() -> ('rowtype, 'd_src))((u: Unit) => (NODESET, -1)) + .thenDo(doPageRank(STEPS) _) + .thenDo(computeError _) + .thenDo(output _) + + /** + * Here is where we check for convergence and then run the next job if we're not converged + */ + override def next: Option[Job] = + args + .optional("convergence") + .flatMap { convErr => + /* + * It's easy for this to seem broken, so think about it twice: + * We are swapping between two writing files: temp and output, with the ultimate + * goal to land up at output. So, each next input is this output, but the temp + * and output should be swapping. + */ + val nextArgs = args + ("input", Some(args("output"))) + + ("temp", Some(args("output"))) + + ("output", Some(args("temp"))) + + ("jobCount", Some((JOB_COUNT + 1).toString)) + // Actually read the error: + val error = TypedTsv[Double](args("errorOut")).toIterator.next; + // The last job should be even numbered so output is not in temp. + // TODO: if we had a way to do HDFS operations easily (like rm, mv, tempname) + // this code would be cleaner and more efficient. As is, we may go a whole extra + // set of operations past the point of convergence. + if (error > convErr.toDouble || (JOB_COUNT % 2 == 1)) { + // try again to get under the error + Some(clone(nextArgs)) + } else { + None + } + } + + /** + * override this function to change how you generate a pipe of (Long, String, Double) where the first entry + * is the nodeid, the second is the list of neighbors, as a comma (no spaces) separated string + * representation of the numeric nodeids, the third is the initial page rank (if not starting from a + * previous run, this should be 1.0 + * + * NOTE: if you want to run until convergence, the initialize method must read the same EXACT format as the + * output method writes. This is your job! 
+ */ + def initialize(nodeCol: Symbol, neighCol: Symbol, pageRank: Symbol) = + Tsv(args("input")).read + // Just to name the columns: + .mapTo((0, 1, 2) -> (nodeCol, neighCol, pageRank)) { input: (Long, String, Double) => + input + } + + /** + * The basic idea is to groupBy the dst key with BOTH the nodeset and the edge rows. the nodeset rows have + * the old page-rank, the edge rows are reversed, so we can get the incoming page-rank from the nodes that + * point to each destination. + */ + + @tailrec + final def doPageRank(steps: Int)(pagerank: RichPipe): RichPipe = + if (steps <= 0) { pagerank } + else { + val nodeRows = pagerank + // remove any EDGE rows from the previous loop + .filter('rowtype)((rowtype: Int) => rowtype == NODESET) + // compute the incremental rank due to the random jump: + val randomJump = nodeRows.map('rank -> 'rank)((rank: Double) => ALPHA) + // expand the neighbor list inte an edge list and out-degree of the src + val edges = nodeRows + .flatMap(('dst, 'd_src) -> ('dst, 'd_src)) { args: (String, Long) => + if (args._1.length > 0) { + val dsts = args._1.split(",") + // Ignore the old degree: + val deg = dsts.size + dsts.map(str => (str.toLong, deg)) + } else { + // Here is a node that points to no other nodes (dangling) + Nil + } + } + // Here we make a false row that we use to tell dst how much incoming + // Page rank it needs to add to itself: + .map(('src, 'd_src, 'dst, 'rank, 'rowtype) -> ('src, 'd_src, 'dst, 'rank, 'rowtype)) { + intup: (Long, Long, Long, Double, Int) => + val (src: Long, d_src: Long, dst: Long, rank: Double, row: Int) = intup + // The d_src, and dst are ignored in the merge below + // We swap destination into the source position + (dst, -1L, "", rank * (1.0 - ALPHA) / d_src, EDGE) + } + + /** + * Here we do the meat of the algorithm: if N = number of nodes, pr(N_i) prob of walking to node i, + * then: N pr(N_i) = (\sum_{j points to i} N pr(N_j) * (1-ALPHA)/d_j) + ALPHA N pr(N_i) is the page rank + * of node i. 
+ */ + val nextPr = (edges ++ randomJump).groupBy('src) { + /* + * Note that NODESET < EDGE, so if we take the min(rowtype, ...) + * using dictionary ordering, we only keep NODESET rows UNLESS + * there are rows that had no outdegrees, so they had no NODESET row + * to begin with. To fix the later case, we have to additionally + * filter the result to keep only NODESET rows. + */ + _.min('rowtype, 'dst, 'd_src) + .sum[Double]('rank) // Sum the page-rank from both the nodeset and edge rows + } + // Must call ourselves in the tail position: + doPageRank(steps - 1)(nextPr) + } + + // This outputs in the same format as the input, so you can run the job + // iteratively, subclass to change the final behavior + def output(pipe: RichPipe) = + pipe.project('src, 'dst, 'rank).write(Tsv(args("output"))) + + // Optionally compute the average error: + def computeError(pr: RichPipe): RichPipe = { + args.optional("errorOut").map { errOut => + Tsv(args("input")).read + .mapTo((0, 1, 2) -> ('src0, 'dst0, 'rank0)) { tup: (Long, String, Double) => tup } + .joinWithSmaller('src0 -> 'src, pr) + .mapTo(('rank0, 'rank) -> 'err) { ranks: (Double, Double) => + scala.math.abs(ranks._1 - ranks._2) + } + .groupAll(_.average('err)) + .write(TypedTsv[Double](errOut)) + } + pr + } +} diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala new file mode 100644 index 0000000000..7c08d5a3d0 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala @@ -0,0 +1,200 @@ +package com.twitter.scalding.examples + +import com.twitter.scalding._ + +/** + * weighted page rank for the given graph, start from the given pagerank, perform one iteartion, test for + * convergence, if not yet, clone itself and start the next page rank job with updated pagerank as input. 
+ * + * This class is very similar to the PageRank class, main differences are: + * 1. supported weighted pagerank 2. the reset pagerank is pregenerated, possibly through a previous job 3. + * dead pagerank is evenly distributed + * + * Options: + * --pwd: working directory, will read/generate the following files there numnodes: total number of nodes + * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> pagerank: the page rank file eg pagerank_0, + * pagerank_1 etc totaldiff: the current max pagerank delta Optional arguments: + * --weighted: do weighted pagerank, default false + * --curiteration: what is the current iteration, default 0 + * --maxiterations: how many iterations to run. Default is 20 + * --jumpprob: probability of a random jump, default is 0.1 + * --threshold: total difference before finishing early, default 0.001 + */ +class WeightedPageRank(args: Args) extends Job(args) { + val ROW_TYPE_1 = 1 + val ROW_TYPE_2 = 2 + + val PWD = args("pwd") + val ALPHA = args.getOrElse("jumpprob", "0.1").toDouble + val WEIGHTED = args.getOrElse("weighted", "false").toBoolean + val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble + val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt + val CURITERATION = args.getOrElse("curiteration", "0").toInt + + // 'size + val numNodes = getNumNodes(PWD + "/numnodes") + + // 'src_id, 'dst_ids, 'weights, 'mass_prior + val nodes = getNodes(PWD + "/nodes") + + // 'src_id_input, 'mass_input + val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION) + + // one iteration of pagerank + val outputPagerank = doPageRank(nodes, inputPagerank) + val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1) + outputPagerank + .project('src_id, 'mass_n) + .write(Tsv(outputFileName)) + + // detect convergence + val totalDiff = outputPagerank + .mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) => + scala.math.abs(args._1 - args._2) + } + .groupAll(_.sum[Double]('mass_diff)) + 
.write(TypedTsv[Double](PWD + "/totaldiff")) + + /** + * test convergence, if not yet, kick off the next iteration + */ + override def next = { + // the max diff generated above + val totalDiff = TypedTsv[Double](PWD + "/totaldiff").toIterator.next + + if (CURITERATION < MAXITERATIONS - 1 && totalDiff > THRESHOLD) { + val newArgs = args + ("curiteration", Some((CURITERATION + 1).toString)) + Some(clone(newArgs)) + } else { + None + } + } + + def getInputPagerank(fileName: String) = + Tsv(fileName).read + .mapTo((0, 1) -> ('src_id_input, 'mass_input)) { input: (Int, Double) => + input + } + + /** + * read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> + */ + def getNodes(fileName: String) = + mode match { + case Hdfs(_, conf) => { + SequenceFile(fileName).read + .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { + input: (Int, Array[Int], Array[Float], Double) => input + } + } + case _ => { + Tsv(fileName).read + .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { + input: (Int, String, String, Double) => + ( + input._1, + // convert string to int array + if (input._2 != null && input._2.length > 0) { + input._2.split(",").map(_.toInt) + } else { + Array[Int]() + }, + // convert string to float array + if (input._3 != null && input._3.length > 0) { + input._3.split(",").map(_.toFloat) + } else { + Array[Float]() + }, + input._4 + ) + } + } + } + + /** + * the total number of nodes, single line file + */ + def getNumNodes(fileName: String) = + Tsv(fileName).read + .mapTo(0 -> 'size) { input: Int => input } + + /** + * one iteration of pagerank inputPagerank: <'src_id_input, 'mass_input> return <'src_id, 'mass_n, + * 'mass_input> + * + * Here is a highlevel view of the unweighted algorithm: let N: number of nodes inputPagerank(N_i): prob of + * walking to node i, d(N_j): N_j's out degree then pagerankNext(N_i) = (\sum_{j points to i} + * inputPagerank(N_j) / d_j) deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) 
/ N randomPagerank(N_i) = + * userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA) pagerankOutput(N_i) = randomPagerank(N_i) + + * pagerankNext(N_i) * (1-ALPHA) + * + * For weighted algorithm: let w(N_j, N_i): weight from N_j to N_i tw(N_j): N_j's total out weights then + * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j)) + */ + def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = { + // 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input + val nodeJoined = nodeRows + .joinWithSmaller('src_id -> 'src_id_input, inputPagerank) + .discard('src_id_input) + + // 'src_id, 'mass_n + val pagerankNext = nodeJoined + .flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) { + args: (Array[Int], Array[Float], Double) => + if (args._1.length > 0) { + if (WEIGHTED) { + // weighted distribution + val total: Double = args._2.sum + args._1.zip(args._2).map { idWeight: (Int, Float) => + (idWeight._1, args._3 * idWeight._2 / total) + } + } else { + // equal distribution + val dist: Double = args._3 / args._1.length + args._1.map { id: Int => (id, dist) } + } + } else { + // Here is a node that points to no other nodes (dangling) + Nil + } + } + .groupBy('src_id) { + _.sum[Double]('mass_n) + } + + // 'sum_mass + val sumPagerankNext = pagerankNext.groupAll(_.sum[Double]('mass_n -> 'sum_mass)) + + // 'deadMass + // single row jobs + // the dead page rank equally distributed to every node + val deadPagerank = sumPagerankNext + .crossWithTiny(numNodes) + .map(('sum_mass, 'size) -> 'deadMass) { input: (Double, Int) => + (1.0 - input._1) / input._2 + } + .discard('size, 'sum_mass) + + // 'src_id_r, 'mass_n_r + // random jump probability plus dead page rank + val randomPagerank = nodeJoined + .crossWithTiny(deadPagerank) + .mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) { + ranks: (Int, Double, Double, Double) => + (ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4) + } + 
+ // 'src_id, 'mass_n + // scale next page rank to 1-ALPHA + val pagerankNextScaled = pagerankNext + .map('mass_n -> ('mass_n, 'mass_input)) { m: Double => ((1 - ALPHA) * m, 0.0) } + + // 'src_id, 'mass_n, 'mass_input + // random probability + next probability + (randomPagerank ++ pagerankNextScaled) + .groupBy('src_id) { + _.sum[Double]('mass_input) // keep the input pagerank + .sum[Double]('mass_n) // take the sum + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala similarity index 59% rename from scalding-core/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala rename to scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala index aa1ac611d2..d2ef97ae50 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrix.scala @@ -1,41 +1,28 @@ package com.twitter.scalding.examples -import scala.collection._ - import com.twitter.scalding._ -import com.twitter.scalding.mathematics.{Matrix, ColVector} +import com.twitter.scalding.mathematics.{ColVector, Matrix} import com.twitter.scalding.mathematics.Matrix._ /** - * A weighted PageRank implementation using the Scalding Matrix API. This - * assumes that all rows and columns are of type {@link Int} and values or egde - * weights are {@link Double}. If you want an unweighted PageRank, simply set - * the weights on the edges to 1. + * A weighted PageRank implementation using the Scalding Matrix API. This assumes that all rows and columns + * are of type {@link Int} and values or egde weights are {@link Double}. If you want an unweighted PageRank, + * simply set the weights on the edges to 1. 
* * Input arguments: * - * d -- damping factor - * n -- number of nodes in the graph - * currentIteration -- start with 0 probably - * maxIterations -- stop after n iterations - * convergenceThreshold -- using the sum of the absolute difference between - * iteration solutions, iterating stops once we reach - * this threshold - * rootDir -- the root directory holding all starting, intermediate and final - * data/output + * d -- damping factor n -- number of nodes in the graph currentIteration -- start with 0 probably + * maxIterations -- stop after n iterations convergenceThreshold -- using the sum of the absolute difference + * between iteration solutions, iterating stops once we reach this threshold rootDir -- the root directory + * holding all starting, intermediate and final data/output * * The expected structure of the rootDir is: * - * rootDir - * |- iterations - * | |- 0 <-- a TSV of (row, value) of size n, value can be 1/n (generate this) - * | |- n <-- holds future iterations/solutions - * |- edges <-- a TSV of (row, column, value) for edges in the graph - * |- onesVector <-- a TSV of (row, 1) of size n (generate this) - * |- diff <-- a single line representing the difference between the last iterations - * |- constants <-- built at iteration 0, these are constant for any given matrix/graph - * |- M_hat - * |- priorVector + * rootDir \|- iterations \| |- 0 <-- a TSV of (row, value) of size n, value can be 1/n (generate this) \| |- + * n <-- holds future iterations/solutions \|- edges <-- a TSV of (row, column, value) for edges in the graph + * \|- onesVector <-- a TSV of (row, 1) of size n (generate this) \|- diff <-- a single line representing the + * difference between the last iterations \|- constants <-- built at iteration 0, these are constant for any + * given matrix/graph \|- M_hat \|- priorVector * * Don't forget to set the number of reducers for this job: * -D mapred.reduce.tasks=n @@ -70,8 +57,7 @@ class WeightedPageRankFromMatrix(args: Args) 
extends Job(args) { measureConvergenceAndStore() /** - * Recurse and iterate again iff we are under the max number of iterations and - * vector has not converged. + * Recurse and iterate again iff we are under the max number of iterations and vector has not converged. */ override def next = { val diff = TypedTsv[Double](diffLoc).toIterator.next @@ -85,22 +71,19 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { } /** - * Measure convergence by calculating the total of the absolute difference - * between the previous and next vectors. This stores the result after - * calculation. + * Measure convergence by calculating the total of the absolute difference between the previous and next + * vectors. This stores the result after calculation. */ - def measureConvergenceAndStore() { - (previousVector - nextVector). - mapWithIndex { case (value, index) => math.abs(value) }. - sum. - write(TypedTsv[Double](diffLoc)) - } + def measureConvergenceAndStore(): Unit = + (previousVector - nextVector) + .mapWithIndex { case (value, index) => math.abs(value) } + .sum + .write(TypedTsv[Double](diffLoc)) /** * Load or generate on first iteration the matrix M^ given A. */ - def M_hat: Matrix[Int, Int, Double] = { - + def M_hat: Matrix[Int, Int, Double] = if (currentIteration == 0) { val A = matrixFromTsv(edgesLoc) val M = A.rowL1Normalize.transpose @@ -110,13 +93,11 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { } else { matrixFromTsv(rootDir + "/constants/M_hat") } - } /** * Load or generate on first iteration the prior vector given d and n. 
*/ - def priorVector: ColVector[Int, Double] = { - + def priorVector: ColVector[Int, Double] = if (currentIteration == 0) { val onesVector = colVectorFromTsv(onesVectorLoc) val priorVector = ((1 - d) / n) * onesVector.toMatrix(0) @@ -125,7 +106,6 @@ class WeightedPageRankFromMatrix(args: Args) extends Job(args) { } else { colVectorFromTsv(rootDir + "/constants/priorVector") } - } def matrixFromTsv(input: String): Matrix[Int, Int, Double] = TypedTsv[(Int, Int, Double)](input).toMatrix diff --git a/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala new file mode 100644 index 0000000000..64298cd306 --- /dev/null +++ b/scalding-commons/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala @@ -0,0 +1,13 @@ +package com.twitter.scalding.examples + +import com.twitter.scalding._ + +class WordCountJob(args: Args) extends Job(args) { + TypedPipe + .from(TextLine(args("input"))) + .flatMap(line => line.split("\\s+")) + .map(word => (word, 1L)) + .sumByKey + // The compiler will enforce the type coming out of the sumByKey is the same as the type we have for our sink + .write(TypedTsv[(String, Long)](args("output"))) +} diff --git a/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/FSTestCase.java b/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/FSTestCase.java new file mode 100644 index 0000000000..31aceab77b --- /dev/null +++ b/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/FSTestCase.java @@ -0,0 +1,20 @@ +package com.twitter.scalding.commons.datastores; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; + +public class FSTestCase { + public FileSystem local; + public FileSystem fs; + + public FSTestCase() { + try { + local = FileSystem.getLocal(new Configuration()); + fs = FileSystem.get(new Configuration()); + } 
catch(IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/TestUtils.java b/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/TestUtils.java new file mode 100644 index 0000000000..9a6becccca --- /dev/null +++ b/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/TestUtils.java @@ -0,0 +1,20 @@ +package com.twitter.scalding.commons.datastores; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; + +public class TestUtils { + + private static final String TMP_ROOT = "/tmp/unittests"; + + public static String getTmpPath(FileSystem fs, String name) throws IOException { + fs.mkdirs(new Path(TMP_ROOT)); + String full = TMP_ROOT + "/" + name; + if (fs.exists(new Path(full))) { + fs.delete(new Path(full), true); + } + return full; + } +} diff --git a/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/VersionedStoreTest.java b/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/VersionedStoreTest.java new file mode 100644 index 0000000000..fd96614e9b --- /dev/null +++ b/scalding-commons/src/test/java/com/twitter/scalding/commons/datastores/VersionedStoreTest.java @@ -0,0 +1,111 @@ +package com.twitter.scalding.commons.datastores; + +import junit.framework.Assert; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.junit.Test; + +import java.io.File; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class VersionedStoreTest extends FSTestCase { + + @Test + public void testCleanup() throws Exception { + String tmp1 = TestUtils.getTmpPath(fs, "versions_test1"); + VersionedStore vs = new VersionedStore(tmp1); + for (int i = 1; i <= 4; i ++) { + String version = vs.createVersion(i); + fs.mkdirs(new Path(version)); + vs.succeedVersion(i); + } + FileStatus[] files = fs.listStatus(new 
Path(tmp1)); + Assert.assertEquals(files.length, 8); + vs.cleanup(2); + files = fs.listStatus(new Path(tmp1)); + Assert.assertEquals(files.length, 4); + for (FileStatus f : files) { + String path = f.getPath().toString(); + Assert.assertTrue(path.endsWith("3") || + path.endsWith("4") || + path.endsWith("3.version") || + path.endsWith(("4.version"))); + } + } + + // verify cleanup works correctly when datasets have success files only + @Test + public void testCleanupWithSuccessFiles() throws Exception { + String tmp1 = TestUtils.getTmpPath(fs, "versions_test2"); + VersionedStore vs = new VersionedStore(tmp1); + for (int i = 1; i <= 4; i ++) { + String version = vs.createVersion(i); + fs.mkdirs(new Path(version)); + fs.createNewFile(new Path(version, VersionedStore.HADOOP_SUCCESS_FLAG)); + } + FileStatus[] files = fs.listStatus(new Path(tmp1)); + Assert.assertEquals(files.length, 4); // one success file per version + vs.cleanup(2); + files = fs.listStatus(new Path(tmp1)); + Assert.assertEquals(files.length, 2); // one success file per version after cleanup + for (FileStatus f : files) { + String path = f.getPath().toString(); + Assert.assertTrue(path.endsWith("3") || + path.endsWith("4")); + } + } + + // verify cleanup works correctly when datasets have both version suffix files and success files + @Test + public void testCleanupWithMix() throws Exception { + String tmp1 = TestUtils.getTmpPath(fs, "versions_test3"); + VersionedStore vs = new VersionedStore(tmp1); + for (int i = 1; i <= 4; i ++) { + String version = vs.createVersion(i); + fs.mkdirs(new Path(version)); + fs.createNewFile(new Path(version, VersionedStore.HADOOP_SUCCESS_FLAG)); + vs.succeedVersion(i); // adds .version file + } + FileStatus[] files = fs.listStatus(new Path(tmp1)); + Assert.assertEquals(files.length, 8); // one success file + version suffix per version + vs.cleanup(2); + files = fs.listStatus(new Path(tmp1)); + Assert.assertEquals(files.length, 4); // after cleanup + for (FileStatus f : 
files) { + String path = f.getPath().toString(); + Assert.assertTrue(path.endsWith("3") || + path.endsWith("4") || + path.endsWith("3.version") || + path.endsWith(("4.version"))); + } + } + + @Test + public void testMultipleVersions() throws Exception { + String tmp1 = TestUtils.getTmpPath(fs, "versions_checker"); + VersionedStore vs = new VersionedStore(tmp1); + for (int i = 1; i <= 4; i ++) { + String version = vs.createVersion(i); + fs.mkdirs(new Path(version)); + vs.succeedVersion(i); + } + new File(new Path(tmp1, "5" + VersionedStore.FINISHED_VERSION_SUFFIX).toString()).createNewFile(); + Path invalidPath = new Path(tmp1, "_test"); + fs.mkdirs(invalidPath); + new File(new Path(invalidPath, VersionedStore.HADOOP_SUCCESS_FLAG).toString()).createNewFile(); + + List allVersions = vs.getAllVersions(); + Set output = new HashSet(); + output.addAll(allVersions); + Set expected = new HashSet(); + for (int i = 1; i <= 4; i ++) { + expected.add(Long.valueOf(i)); + } + + Assert.assertEquals(output, expected); + } + +} + diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/_SUCCESS b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000 b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000 new file mode 100644 index 0000000000..8f298c95cd Binary files /dev/null and b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00000 differ diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001 b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001 new file mode 100644 index 0000000000..1653a36e45 Binary files /dev/null and 
b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/09/part-00001 differ diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/_SUCCESS b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000 b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000 new file mode 100644 index 0000000000..8f298c95cd Binary files /dev/null and b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00000 differ diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001 b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001 new file mode 100644 index 0000000000..1653a36e45 Binary files /dev/null and b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00001 differ diff --git a/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002 b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002 new file mode 100644 index 0000000000..8f298c95cd Binary files /dev/null and b/scalding-commons/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/10/part-00002 differ diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala new file mode 100644 index 0000000000..2a820c7f87 --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/ExecutionKMeansTest.scala @@ -0,0 +1,60 @@ +/* +Copyright 2014 Twitter, Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +import org.scalatest.{Matchers, WordSpec} + +import com.twitter.scalding._ + +import com.twitter.scalding.examples.KMeans + +class ExecutionKMeansTest extends WordSpec with Matchers { + + "Execution K-means" should { + "find the correct clusters for trivial cases" in { + val dim = 20 + val k = 20 + val rng = new java.util.Random + // if you are in cluster i, then position i == 100, else all the first k are 0. + // Then all the tail are random, but very small enough to never bridge the gap + def randVect(cluster: Int): Vector[Double] = + Vector.fill(k)(0.0).updated(cluster, 100.0) ++ Vector.fill(dim - k)(rng.nextDouble / (1e6 * dim)) + + // To have the seeds stay sane for kmeans k == vectorCount + val vectorCount = k + val vectors = TypedPipe.from((0 until vectorCount).map(i => randVect(i % k))) + + val labels = KMeans(k, vectors) + .flatMap { case (_, _, labeledPipe) => + labeledPipe.toIterableExecution + } + .waitFor(Config.default, Local(false)) + .get + .toList + + def clusterOf(v: Vector[Double]): Int = v.indexWhere(_ > 0.0) + + val byCluster = labels.groupBy { case (id, v) => clusterOf(v) } + + // The rule is this: if two vectors share the same prefix, + // the should be in the same cluster + byCluster.foreach { case (clusterId, vs) => + val id = vs.head._1 + vs.foreach { case (thisId, _) => id shouldBe thisId } + } + } + } +} diff --git 
a/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala new file mode 100644 index 0000000000..f9998f1b65 --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/PageRankTest.scala @@ -0,0 +1,54 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import org.scalatest.{Matchers, WordSpec} + +class PageRankTest extends WordSpec with Matchers { + "A PageRank job" should { + JobTest(new com.twitter.scalding.examples.PageRank(_)) + .arg("input", "inputFile") + .arg("output", "outputFile") + .arg("errorOut", "error") + .arg("temp", "tempBuffer") + // How many iterations to do each time: + .arg("iterations", "6") + .arg("convergence", "0.05") + .source(Tsv("inputFile"), List((1L, "2", 1.0), (2L, "1,3", 1.0), (3L, "2", 1.0))) + // Don't check the tempBuffer: + .sink[(Long, String, Double)](Tsv("tempBuffer"))(ob => ()) + .sink[Double](TypedTsv[Double]("error")) { ob => + "have low error" in { + ob.head should be <= 0.05 + } + } + .sink[(Long, String, Double)](Tsv("outputFile")) { outputBuffer => + val pageRank = outputBuffer.map(res => (res._1, res._3)).toMap + "correctly compute pagerank" in { + val d = 0.85 + val twoPR = (1.0 + 2 * d) / (1.0 + d) + val otherPR = (1.0 + d / 2.0) / (1.0 + d) + println(pageRank) + (pageRank(1L) + pageRank(2L) + pageRank(3L)) shouldBe 3.0 +- 0.1 + pageRank(1L) shouldBe otherPR +- 0.1 + 
pageRank(2L) shouldBe twoPR +- 0.1 + pageRank(3L) shouldBe otherPR +- 0.1 + } + } + .run + .finish() + } +} diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala new file mode 100644 index 0000000000..9dad6b2950 --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankFromMatrixTest.scala @@ -0,0 +1,139 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.examples + +import scala.collection._ + +import org.scalatest.{Matchers, WordSpec} + +import com.twitter.scalding._ + +import WeightedPageRankFromMatrixSpec._ + +class WeightedPageRankFromMatrixSpec extends WordSpec with Matchers { + + "Weighted PageRank from Matrix job" should { + + // 0.0 0.0 0.0 0.0 1.0 + // 0.5 0.0 0.0 0.0 0.0 + // 0.5 0.0 0.0 0.0 0.0 + // 0.0 1.0 0.5 0.0 0.0 + // 0.0 0.0 0.5 1.0 0.0 + val edges = + List((0, 4, 1.0), (1, 0, 0.5), (2, 0, 0.5), (3, 1, 1.0), (3, 2, 0.5), (4, 2, 0.5), (4, 3, 1.0)) + + val d = 0.4d // damping factor + val n = 5 // number of nodes + val onesVector = filledColumnVector(1d, n) + val iterationZeroVector = filledColumnVector(1d / n, n) + + val expectedSolution = Array(0.28, 0.173333, 0.173333, 0.173333, 0.2) + + JobTest(new WeightedPageRankFromMatrix(_)) + .arg("d", d.toString) + .arg("n", n.toString) + .arg("convergenceThreshold", "0.0001") + .arg("maxIterations", "1") + .arg("currentIteration", "0") + .arg("rootDir", "root") + .source(TypedTsv[(Int, Int, Double)]("root/edges"), edges) + .source(TypedTsv[(Int, Double)]("root/onesVector"), onesVector) + .source(TypedTsv[(Int, Double)]("root/iterations/0"), iterationZeroVector) + .sink[(Int, Int, Double)](Tsv("root/constants/M_hat")) { outputBuffer => + outputBuffer should have size 7 + val outputMap = toSparseMap(outputBuffer) + outputMap((0 -> 1)) shouldBe 0.4 + outputMap((0 -> 2)) shouldBe 0.4 + outputMap((1 -> 3)) shouldBe 0.26666 +- 0.00001 + outputMap((2 -> 3)) shouldBe 0.13333 +- 0.00001 + outputMap((2 -> 4)) shouldBe 0.13333 +- 0.00001 + outputMap((3 -> 4)) shouldBe 0.26666 +- 0.00001 + outputMap((4 -> 0)) shouldBe 0.4 + } + .sink[(Int, Double)](Tsv("root/constants/priorVector")) { outputBuffer => + outputBuffer should have size 5 + val expectedValue = ((1 - d) / 2) * d + assertVectorsEqual(new Array[Double](5).map(v => expectedValue), outputBuffer.map(_._2).toArray) + } + .sink[(Int, Double)](Tsv("root/iterations/1")) { 
outputBuffer => + outputBuffer should have size 5 + assertVectorsEqual(expectedSolution, outputBuffer.map(_._2).toArray, 0.00001) + } + .typedSink(TypedTsv[Double]("root/diff")) { outputBuffer => + outputBuffer should have size 1 + + val expectedDiff = + expectedSolution.zip(iterationZeroVector.map(_._2)).map { case (a, b) => math.abs(a - b) }.sum + outputBuffer.head shouldBe expectedDiff +- 0.00001 + } + .run + .finish() + } + + private def assertVectorsEqual(expected: Array[Double], actual: Array[Double], variance: Double): Unit = + actual.zipWithIndex.foreach { case (value, i) => + value shouldBe (expected(i)) +- variance + } + + private def assertVectorsEqual(expected: Array[Double], actual: Array[Double]): Unit = + actual.zipWithIndex.foreach { case (value, i) => + value shouldBe (expected(i)) + } +} + +object WeightedPageRankFromMatrixSpec { + + def toSparseMap[Row, Col, V](iterable: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = + iterable.map(entry => ((entry._1, entry._2), entry._3)).toMap + + def filledColumnVector(value: Double, size: Int): List[(Int, Double)] = { + val vector = mutable.ListBuffer[(Int, Double)]() + (0 until size).foreach { row => + vector += new Tuple2(row, value) + } + + vector.toList + } +} + +/** + * Octave/Matlab implementations to provide the expected ranks. 
This comes from the Wikipedia page on + * PageRank: http://en.wikipedia.org/wiki/PageRank#Computation + * + * function [v] = iterate(A, sv, d) + * + * N = size(A, 2) M = (spdiags(1 ./ sum(A, 2), 0, N, N) * A)'; v = (d * M * sv) + (((1 - d) / N) .* ones(N, + * 1)); + * + * endfunction + * + * iterate([0 0 0 0 1; 0.5 0 0 0 0; 0.5 0 0 0 0; 0 1 0.5 0 0; 0 0 0.5 1 0], [0.2; 0.2; 0.2; 0.2; 0.2], 0.4) + * + * % Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j' + * sum(i, M_i,j) = 1 % Parameter d damping factor % Parameter v_quadratic_error quadratic error for v % Return + * v, a vector of ranks such that v_i is the i-th rank from [0, 1] + * + * function [v] = rank(M, d, v_quadratic_error) + * + * N = size(M, 2); % N is equal to half the size of M v = rand(N, 1); v = v ./ norm(v, 2); last_v = ones(N, 1) + * * inf; M_hat = (d .* M) + (((1 - d) / N) .* ones(N, N)); + * + * while(norm(v - last_v, 2) > v_quadratic_error) last_v = v; v = M_hat * v; v = v ./ norm(v, 2); end + * + * endfunction + * + * M = [0 0 0 0 1 ; 0.5 0 0 0 0 ; 0.5 0 0 0 0 ; 0 1 0.5 0 0 ; 0 0 0.5 1 0]; rank(M, 0.4, 0.001) + */ diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala new file mode 100644 index 0000000000..b55e30ab5f --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala @@ -0,0 +1,57 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import org.scalatest.{Matchers, WordSpec} + +class WeightedPageRankSpec extends WordSpec with Matchers { + "Weighted PageRank job" should { + var idx = 0 + JobTest(new com.twitter.scalding.examples.WeightedPageRank(_)) + .arg("pwd", ".") + .arg("weighted", "true") + .arg("maxiterations", "1") + .arg("jumpprob", "0.1") + .source(Tsv("./nodes"), List((1, "2,3", "1,2", 0.26), (2, "3", "1", 0.54), (3, "", "", 0.2))) + .source(Tsv("./numnodes"), List(3)) + .source(Tsv("./pagerank_0"), List((1, 0.086), (2, 0.192), (3, 0.722))) + .typedSink(TypedTsv[Double]("./totaldiff")) { ob => + (idx + ": have low error") in { + ob.head shouldBe (0.722 - 0.461 + 0.2964 - 0.192 + 0.2426 - 0.086) +- 0.001 + } + idx += 1 + } + .sink[(Int, Double)](Tsv("./pagerank_1")) { outputBuffer => + val pageRank = outputBuffer.map(res => (res._1, res._2)).toMap + (idx + ": correctly compute pagerank") in { + val deadMass = 0.722 / 3 * 0.9 + val userMass = List(0.26, 0.54, 0.2).map(_ * 0.1) + val massNext = List(0, 0.086 / 3, (0.086 * 2 / 3 + 0.192)).map(_ * 0.9) + val expected = (userMass.zip(massNext)).map { a: (Double, Double) => a._1 + a._2 + deadMass } + + println(pageRank) + (pageRank(1) + pageRank(2) + pageRank(3)) shouldBe 1.0 +- 0.001 + pageRank(1) shouldBe (expected(0)) +- 0.001 + pageRank(2) shouldBe (expected(1)) +- 0.001 + pageRank(3) shouldBe (expected(2)) +- 0.001 + } + idx += 1 + } + .runWithoutNext(useHadoop = false) + .runWithoutNext(useHadoop = true) + .finish() + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/WordCountTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala similarity index 58% rename from scalding-core/src/test/scala/com/twitter/scalding/WordCountTest.scala rename to scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala index 98240cada5..b25b232cd9 100644 --- 
a/scalding-core/src/test/scala/com/twitter/scalding/WordCountTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/WordCountTest.scala @@ -12,25 +12,25 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.{Matchers, WordSpec} -class WordCountTest extends Specification { +class WordCountTest extends WordSpec with Matchers { "A WordCount job" should { - JobTest("com.twitter.scalding.examples.WordCountJob"). - arg("input", "inputFile"). - arg("output", "outputFile"). - source(TextLine("inputFile"), List((0, "hack hack hack and hack"))). - sink[(String,Int)](Tsv("outputFile")){ outputBuffer => + JobTest(new com.twitter.scalding.examples.WordCountJob(_)) + .arg("input", "inputFile") + .arg("output", "outputFile") + .source(TextLine("inputFile"), List((0, "hack hack hack and hack"))) + .sink[(String, Int)](TypedTsv[(String, Long)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "count words correctly" in { - outMap("hack") must be_==(4) - outMap("and") must be_==(1) + outMap("hack") shouldBe 4 + outMap("and") shouldBe 1 } - }. - run. - finish + } + .run + .finish() } } diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala index ea5be878a6..dd232c069f 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/VersionedKeyValSourceTest.scala @@ -12,27 +12,28 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.source -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ +import com.twitter.scalding.commons.datastores.VersionedStore import com.twitter.bijection.Injection - +import com.google.common.io.Files +import org.apache.hadoop.mapred.JobConf +import java.io.{File, FileWriter} // Use the scalacheck generators -import org.scalacheck.Gen import scala.collection.mutable.Buffer -import TDsl._ - class TypedWriteIncrementalJob(args: Args) extends Job(args) { import RichPipeEx._ val pipe = TypedPipe.from(TypedTsv[Int]("input")) - implicit val inj = Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] + implicit val inj: Injection[(Int, Int), (Array[Byte], Array[Byte])] = + Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] pipe - .map{k => (k, k)} + .map(k => (k, k)) .writeIncremental(VersionedKeyValSource[Int, Int]("output")) } @@ -40,46 +41,146 @@ class MoreComplexTypedWriteIncrementalJob(args: Args) extends Job(args) { import RichPipeEx._ val pipe = TypedPipe.from(TypedTsv[Int]("input")) - implicit val inj = Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] + implicit val inj: Injection[(Int, Int), (Array[Byte], Array[Byte])] = + Injection.connect[(Int, Int), (Array[Byte], Array[Byte])] pipe - .map{k => (k, k)} + .map(k => (k, k)) .group .sum .writeIncremental(VersionedKeyValSource[Int, Int]("output")) } -class TypedWriteIncrementalTest extends Specification { - import Dsl._ - noDetailedDiffs() +class ToIteratorJob(args: Args) extends Job(args) { + import RichPipeEx._ + val source = VersionedKeyValSource[Int, Int]("input") + + val iteratorCopy = source.toIterator.toList + val iteratorPipe = TypedPipe.from(iteratorCopy) + + val duplicatedPipe = TypedPipe.from(source) ++ iteratorPipe + + duplicatedPipe.group.sum + .writeIncremental(VersionedKeyValSource[Int, 
Int]("output")) +} +class VersionedKeyValSourceTest extends WordSpec with Matchers { val input = (1 to 100).toList "A TypedWriteIncrementalJob" should { JobTest(new TypedWriteIncrementalJob(_)) .source(TypedTsv[Int]("input"), input) - .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { outputBuffer: Buffer[(Int, Int)] => - "Outputs must be as expected" in { - outputBuffer.size must_== input.size - val singleInj = implicitly[Injection[Int, Array[Byte]]] - input.map{k => (k, k)}.sortBy(_._1).toString must be_==(outputBuffer.sortBy(_._1).toList.toString) - } + .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { + outputBuffer: Buffer[(Int, Int)] => + "Outputs must be as expected" in { + assert(outputBuffer.size === input.size) + val singleInj = implicitly[Injection[Int, Array[Byte]]] + assert( + input.map(k => (k, k)).sortBy(_._1).toString === outputBuffer.sortBy(_._1).toList.toString + ) + } } .run - .finish + .finish() } "A MoreComplexTypedWriteIncrementalJob" should { JobTest(new MoreComplexTypedWriteIncrementalJob(_)) .source(TypedTsv[Int]("input"), input) - .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { outputBuffer: Buffer[(Int, Int)] => - "Outputs must be as expected" in { - outputBuffer.size must_== input.size - val singleInj = implicitly[Injection[Int, Array[Byte]]] - input.map{k => (k, k)}.sortBy(_._1).toString must be_==(outputBuffer.sortBy(_._1).toList.toString) - } + .sink[(Int, Int)](VersionedKeyValSource[Array[Byte], Array[Byte]]("output")) { + outputBuffer: Buffer[(Int, Int)] => + "Outputs must be as expected" in { + assert(outputBuffer.size === input.size) + val singleInj = implicitly[Injection[Int, Array[Byte]]] + assert( + input.map(k => (k, k)).sortBy(_._1).toString === outputBuffer.sortBy(_._1).toList.toString + ) + } } .run - .finish + .finish() + } + + "A ToIteratorJob" should { + "return the values via toIterator" in { + JobTest(new ToIteratorJob(_)) + 
.source(VersionedKeyValSource[Int, Int]("input"), input.zip(input)) + .sink(VersionedKeyValSource[Int, Int]("output")) { outputBuffer: Buffer[(Int, Int)] => + val (keys, vals) = outputBuffer.unzip + assert(keys.map(_ * 2) === vals) + } + .run + .finish() + } + } + + "A VersionedKeyValSource" should { + "Validate that explicitly provided versions exist" in { + val path = setupLocalVersionStore(100L to 102L) + + val thrown = the[InvalidSourceException] thrownBy { validateVersion(path, Some(103)) } + assert( + thrown.getMessage === "Version 103 does not exist. " + + "Currently available versions are: [102, 101, 100]" + ) + + // should not throw + validateVersion(path, Some(101)) + + // should not throw + validateVersion(path) + } + + "calculate right size of source" in { + val oldContent = "size of old content should be ignored" + val content = "Hello World" + val contentSize = content.getBytes.length + val path = setupLocalVersionStore( + 100L to 102L, + { + case 102L => Some(content) + case _ => Some(oldContent) + } + ) + + val keyValueSize = VersionedKeyValSource(path).source + .getSize(new JobConf()) + + contentSize should be(keyValueSize) + } } + + /** + * Creates a temp dir and then creates the provided versions within it. + */ + private def setupLocalVersionStore( + versions: Seq[Long], + contentFn: Long => Option[String] = _ => None + ): String = { + val root = Files.createTempDir() + root.deleteOnExit() + val store = new VersionedStore(root.getAbsolutePath) + versions.foreach { v => + val p = store.createVersion(v) + new File(p).mkdirs() + + contentFn(v) + .foreach { text => + val content = new FileWriter(new File(p + "/test")) + content.write(text) + content.close() + } + + store.succeedVersion(p) + } + + root.getAbsolutePath + } + + /** + * Creates a VersionedKeyValSource using the provided version and then validates it. 
+ */ + private def validateVersion(path: String, version: Option[Long] = None) = + VersionedKeyValSource(path = path, sourceVersion = version) + .validateTaps(Hdfs(false, new JobConf())) } diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala index 08a26398c0..a960c26dd1 100644 --- a/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/extensions/CheckpointSpec.scala @@ -12,16 +12,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.commons.extensions import com.twitter.scalding._ -import org.specs._ +import org.scalatest.WordSpec import scala.collection.mutable.Buffer /** - * @author Mike Jahr + * @author + * Mike Jahr */ class CheckpointJob(args: Args) extends Job(args) { @@ -37,7 +38,7 @@ class CheckpointJob(args: Args) extends Job(args) { in0 .joinWithSmaller('y0 -> 'y1, in1) .map(('s0, 's1) -> 'score) { v: (Int, Int) => v._1 * v._2 } - .groupBy('x0, 'x1) { _.sum[Double]('score) } + .groupBy('x0, 'x1)(_.sum[Double]('score)) } out.write(Tsv("output")) @@ -48,50 +49,48 @@ class TypedCheckpointJob(args: Args) extends Job(args) { implicit val implicitArgs: Args = args def in0 = Checkpoint[(Int, Int, Int)]("c0") { - TypedTsv[(Int, Int, Int)]("input0").map( x => x ) + TypedTsv[(Int, Int, Int)]("input0").map(x => x) } - def in1 = Checkpoint[(Int, Int, Int)]("c1"){ - TypedTsv[(Int,Int,Int)]("input1").map( x => x ) + def in1 = Checkpoint[(Int, Int, Int)]("c1") { + TypedTsv[(Int, Int, Int)]("input1").map(x => x) } def out = Checkpoint[(Int, Int, Double)]("c2") { - in0.groupBy(_._2) + in0 + 
.groupBy(_._2) .join(in1.groupBy(_._2)) - .mapValues{ case (l,r) => ((l._1, r._1),(l._3 * r._3).toDouble) } + .mapValues { case (l, r) => ((l._1, r._1), (l._3 * r._3).toDouble) } .values .group .sum - .map{ tup => (tup._1._1, tup._1._2, tup._2)} // super ugly, don't do this in a real job + .map(tup => (tup._1._1, tup._1._2, tup._2)) // super ugly, don't do this in a real job } - out.write(TypedTsv[(Int, Int,Double)]("output")) + out.write(TypedTsv[(Int, Int, Double)]("output")) } -class CheckpointSpec extends Specification { +class CheckpointSpec extends WordSpec { "A CheckpointJob" should { val in0 = Set((0, 0, 1), (0, 1, 1), (1, 0, 2), (2, 0, 4)) val in1 = Set((0, 1, 1), (1, 0, 2), (2, 4, 5)) val out = Set((0, 1, 2.0), (0, 0, 1.0), (1, 1, 4.0), (2, 1, 8.0)) // Verifies output when passed as a callback to JobTest.sink(). - def verifyOutput[A](expectedOutput: Set[A], actualOutput: Buffer[A]): Unit = { - val unordered = actualOutput.toSet - unordered must_== expectedOutput - } + def verifyOutput[A](expectedOutput: Set[A], actualOutput: Buffer[A]): Unit = + assert(actualOutput.toSet === expectedOutput) // Runs a test in both local test and hadoop test mode, verifies the final // output, and clears the local file set. - def runTest(test: JobTest) = { + def runTest(test: JobTest) = // runHadoop seems to have trouble with sequencefile format; use TSV. test .arg("checkpoint.format", "tsv") .sink[(Int, Int, Double)](Tsv("output"))(verifyOutput(out, _)) .run .runHadoop - .finish - } + .finish() "run without checkpoints" in runTest { - JobTest("com.twitter.scalding.commons.extensions.CheckpointJob") + JobTest(new CheckpointJob(_)) .source(Tsv("input0"), in0) .source(Tsv("input1"), in1) } @@ -99,7 +98,7 @@ class CheckpointSpec extends Specification { "read c0, write c1 and c2" in runTest { // Adding filenames to Checkpoint.testFileSet makes Checkpoint think that // they exist. 
- JobTest("com.twitter.scalding.commons.extensions.CheckpointJob") + JobTest(new CheckpointJob(_)) .arg("checkpoint.file", "test") .registerFile("test_c0") .source(Tsv("test_c0"), in0) @@ -109,14 +108,14 @@ class CheckpointSpec extends Specification { } "read c2, skipping c0 and c1" in runTest { - JobTest("com.twitter.scalding.commons.extensions.CheckpointJob") + JobTest(new CheckpointJob(_)) .arg("checkpoint.file", "test") .registerFile("test_c2") .source(Tsv("test_c2"), out) } "clobber c0" in runTest { - JobTest("com.twitter.scalding.commons.extensions.CheckpointJob") + JobTest(new CheckpointJob(_)) .arg("checkpoint.file.c0", "test_c0") .arg("checkpoint.clobber", "") .registerFile("test_c0") @@ -126,7 +125,7 @@ class CheckpointSpec extends Specification { } "read c0 and clobber c1" in runTest { - JobTest("com.twitter.scalding.commons.extensions.CheckpointJob") + JobTest(new CheckpointJob(_)) .arg("checkpoint.file", "test") .arg("checkpoint.clobber.c1", "") .registerFile("test_c0") @@ -139,73 +138,70 @@ class CheckpointSpec extends Specification { } } -class TypedCheckpointSpec extends Specification { +class TypedCheckpointSpec extends WordSpec { "A TypedCheckpointJob" should { val in0 = Set((0, 0, 1), (0, 1, 1), (1, 0, 2), (2, 0, 4)) val in1 = Set((0, 1, 1), (1, 0, 2), (2, 4, 5)) val out = Set((0, 1, 2.0), (0, 0, 1.0), (1, 1, 4.0), (2, 1, 8.0)) // Verifies output when passed as a callback to JobTest.sink(). - def verifyOutput[A](expectedOutput: Set[A], actualOutput: Buffer[A]): Unit = { - val unordered = actualOutput.toSet - unordered must_== expectedOutput - } + def verifyOutput[A](expectedOutput: Set[A], actualOutput: Buffer[A]): Unit = + assert(actualOutput.toSet === expectedOutput) // Runs a test in both local test and hadoop test mode, verifies the final // output, and clears the local file set. - def runTest(test: JobTest) = { + def runTest(test: JobTest) = // runHadoop seems to have trouble with sequencefile format; use TSV. 
test .arg("checkpoint.format", "tsv") .sink[(Int, Int, Double)](TypedTsv[(Int, Int, Double)]("output"))(verifyOutput(out, _)) .run .runHadoop - .finish - } + .finish() "run without checkpoints" in runTest { - JobTest("com.twitter.scalding.commons.extensions.TypedCheckpointJob") - .source(TypedTsv[(Int,Int,Int)]("input0"), in0) - .source(TypedTsv[(Int,Int,Int)]("input1"), in1) + JobTest(new TypedCheckpointJob(_)) + .source(TypedTsv[(Int, Int, Int)]("input0"), in0) + .source(TypedTsv[(Int, Int, Int)]("input1"), in1) } "read c0, write c1 and c2" in runTest { // Adding filenames to Checkpoint.testFileSet makes Checkpoint think that // they exist. - JobTest("com.twitter.scalding.commons.extensions.TypedCheckpointJob") + JobTest(new TypedCheckpointJob(_)) .arg("checkpoint.file", "test") .registerFile("test_c0") .source(Tsv("test_c0"), in0) - .source(TypedTsv[(Int,Int,Int)]("input1"), in1) + .source(TypedTsv[(Int, Int, Int)]("input1"), in1) .sink[(Int, Int, Int)](Tsv("test_c1"))(verifyOutput(in1, _)) .sink[(Int, Int, Double)](Tsv("test_c2"))(verifyOutput(out, _)) } "read c2, skipping c0 and c1" in runTest { - JobTest("com.twitter.scalding.commons.extensions.TypedCheckpointJob") + JobTest(new TypedCheckpointJob(_)) .arg("checkpoint.file", "test") .registerFile("test_c2") .source(Tsv("test_c2"), out) } "clobber c0" in runTest { - JobTest("com.twitter.scalding.commons.extensions.TypedCheckpointJob") + JobTest(new TypedCheckpointJob(_)) .arg("checkpoint.file.c0", "test_c0") .arg("checkpoint.clobber", "") .registerFile("test_c0") - .source(TypedTsv[(Int,Int,Int)]("input0"), in0) - .source(TypedTsv[(Int,Int,Int)]("input1"), in1) + .source(TypedTsv[(Int, Int, Int)]("input0"), in0) + .source(TypedTsv[(Int, Int, Int)]("input1"), in1) .sink[(Int, Int, Int)](Tsv("test_c0"))(verifyOutput(in0, _)) } "read c0 and clobber c1" in runTest { - JobTest("com.twitter.scalding.commons.extensions.TypedCheckpointJob") + JobTest(new TypedCheckpointJob(_)) .arg("checkpoint.file", "test") 
.arg("checkpoint.clobber.c1", "") .registerFile("test_c0") .registerFile("test_c1") .source(Tsv("test_c0"), in0) - .source(TypedTsv[(Int,Int,Int)]("input1"), in1) + .source(TypedTsv[(Int, Int, Int)]("input1"), in1) .sink[(Int, Int, Int)](Tsv("test_c1"))(verifyOutput(in1, _)) .sink[(Int, Int, Double)](Tsv("test_c2"))(verifyOutput(out, _)) } diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala new file mode 100644 index 0000000000..157deb1ea9 --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/scheme/ExecutionTest.scala @@ -0,0 +1,78 @@ +package com.twitter.scalding.commons.scheme + +import com.twitter.scalding.source.TypedSequenceFile +import com.twitter.scalding.{Config, Execution, Hdfs, Local, TypedPipe} +import org.apache.hadoop.conf.Configuration +import org.scalatest.{Matchers, WordSpec} +import scala.util.{Failure, Success} + +class ExecutionTest extends WordSpec with Matchers { + object TestPath { + def getCurrentDirectory = new java.io.File(".").getCanonicalPath + def prefix = getCurrentDirectory.split("/").last match { + case "scalding-commons" => getCurrentDirectory + case _ => getCurrentDirectory + "/scalding-commons" + } + val testfsPathRoot = prefix + "/src/test/resources/com/twitter/scalding/test_filesystem/" + } + + implicit class ExecutionTestHelper[T](ex: Execution[T]) { + def shouldSucceed(): T = { + val r = ex.waitFor(Config.default, Local(true)) + r match { + case Success(s) => s + case Failure(e) => fail(s"Failed running execution, exception:\n$e") + } + } + + def shouldSucceedHadoop(): T = { + val mode = Hdfs(true, new Configuration) + val r = ex.waitFor(Config.defaultFrom(mode), mode) + r match { + case Success(s) => s + case Failure(e) => fail(s"Failed running execution, exception:\n$e") + } + } + + def shouldFail(): Unit = { + val r = ex.waitFor(Config.default, Local(true)) + 
assert(r.isFailure) + } + + def shouldFailWith(message: String): Unit = { + val r = ex.waitFor(Config.default, Local(true)) + assert(r.isFailure) + r.failed.get.getMessage shouldBe message + } + } + + "Execution" should { + class TypedSequenceFileSource[T](override val path: String) + extends TypedSequenceFile[T](path) + with CombinedSequenceFileScheme + + "toIterableExecution works correctly on partly empty input (empty part, part with value)" in { + val exec = + TypedPipe + .from(new TypedSequenceFileSource[(Long, Long)](TestPath.testfsPathRoot + "test_data/2013/09")) + .toIterableExecution + .map(_.toSet) + + val res = exec.shouldSucceedHadoop() + + assert(res == Set((1L, 1L))) + } + + "toIterableExecution works correctly on partly empty input (empty part, part with value, empty part)" in { + val exec = + TypedPipe + .from(new TypedSequenceFileSource[(Long, Long)](TestPath.testfsPathRoot + "test_data/2013/10")) + .toIterableExecution + .map(_.toSet) + + val res = exec.shouldSucceedHadoop() + + assert(res == Set((1L, 1L))) + } + } +} diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala new file mode 100644 index 0000000000..955d9f3e2b --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/LzoGenericSourceSpec.scala @@ -0,0 +1,30 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.commons.source + +import com.twitter.bijection.JavaSerializationInjection +import org.scalatest.{Matchers, WordSpec} +import scala.util.Success + +class LzoGenericSourceSpec extends WordSpec with Matchers { + "LzoGenericScheme" should { + "be serializable" in { + val scheme = LzoGenericScheme[Array[Byte]](IdentityBinaryConverter) + val inj = JavaSerializationInjection[LzoGenericScheme[Array[Byte]]] + inj.invert(inj.apply(scheme)) shouldBe Success(scheme) + } + } +} diff --git a/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala new file mode 100644 index 0000000000..60c1c513a2 --- /dev/null +++ b/scalding-commons/src/test/scala/com/twitter/scalding/commons/source/typedtext/TypedTextTest.scala @@ -0,0 +1,16 @@ +package com.twitter.scalding.commons.source + +import com.twitter.scalding._ + +import org.scalatest.FunSuite + +case class Test1(a: Int, b: Long, c: Option[Double]) +case class Test2(one: Test1, d: String) + +class TypedTextTest extends FunSuite { + test("Test with a nested tuple: Daily") { + val source = + LzoTypedText.dailyLzoTsv[Test2]("myPath")(DateRange(RichDate.now, RichDate.now + Hours(1)), implicitly) + assert(source.sourceFields.size == 4) + } +} diff --git a/scalding-core/codegen/flatten_group_generator.rb b/scalding-core/codegen/flatten_group_generator.rb new file mode 100755 index 0000000000..f989c851aa --- /dev/null +++ b/scalding-core/codegen/flatten_group_generator.rb @@ -0,0 +1,121 @@ +#!/usr/bin/env ruby + +# Run it like this: +# +# ./codegen/flatten_group_generator.rb > src/main/scala/com/twitter/scalding/typed/GeneratedFlattenGroup.scala + +$indent = " " + +TYPES = ('A'..'Z').to_a + +# generating too many implicit enrichments really slows down compile time, so 
we only generate +# a limited number of enrichments +MAX_IMPLICIT_ENRICHMENT_ARITY = 6 + +def make_nested_type(arity) + if arity < 2 + raise "arity < 2 doesn't make sense here" + end + + if arity == 2 + return "(#{TYPES[0]}, #{TYPES[1]})" + else + prev = make_nested_type(arity - 1) + return "(#{prev}, #{TYPES[arity - 1]})" + end +end + +def make_flatten_left_join(arity) + nested_type = make_nested_type(arity) + flat_type = TYPES[0..(arity - 1)].join(", ") + + puts "#{$indent}def flattenNestedTuple[#{flat_type}](nested: #{nested_type}): (#{flat_type}) = {" + puts "#{$indent*2}val #{nested_type.downcase} = nested" + puts "#{$indent*2}(#{flat_type.downcase})" + puts "#{$indent}}" + puts + + if arity <= MAX_IMPLICIT_ENRICHMENT_ARITY + puts "#{$indent}class FlattenLeftJoin#{arity}[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], #{flat_type}](nested: KLL[KEY, #{nested_type}]) {" + puts "#{$indent*2}def flattenValueTuple: KLL[KEY, (#{flat_type})] = nested.mapValues { tup => FlattenGroup.flattenNestedTuple(tup) }" + puts "#{$indent}}" + puts + puts "#{$indent}implicit def toFlattenLeftJoin#{arity}[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], #{flat_type}](nested: KLL[KEY, #{nested_type}]) = new FlattenLeftJoin#{arity}(nested)" + end + +end + +def make_alternating_nested_type(arity) + if arity < 2 + raise "arity < 2 doesn't make sense here" + end + + if arity == 2 + return "(Option[#{TYPES[0]}], Option[#{TYPES[1]}])" + else + prev = make_alternating_nested_type(arity - 1) + return "(Option[#{prev}], Option[#{TYPES[arity - 1]}])" + end +end + +def make_flatten_outer_join(arity) + nested_type = make_alternating_nested_type(arity) + types = TYPES[0..(arity - 1)] + flat_type = types.join(", ") + flat_type_options = types.map {|x| "Option[#{x}]"}.join(", ") + + puts "#{$indent}def flattenNestedOptionTuple[#{flat_type}](nested: #{nested_type}): (#{flat_type_options}) = {" + puts "#{$indent*2}val (rest1, #{TYPES[arity-1].downcase}) = nested" + + 
(1..(arity-3)).each do |n| + puts "#{$indent*2}val (rest#{n+1}, #{TYPES[arity-1-n].downcase}) = rest#{n}.getOrElse(pairOfNones)" + end + + puts "#{$indent*2}val (#{TYPES[0].downcase}, #{TYPES[1].downcase}) = rest#{arity-2}.getOrElse(pairOfNones)" + + puts "#{$indent*2}(#{flat_type.downcase})" + + puts "#{$indent}}" + puts + + if arity <= MAX_IMPLICIT_ENRICHMENT_ARITY + puts "#{$indent}class FlattenOuterJoin#{arity}[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], #{flat_type}](nested: KLL[KEY, #{nested_type}]) {" + puts "#{$indent*2}def flattenValueTuple: KLL[KEY, (#{flat_type_options})] = nested.mapValues { tup => FlattenGroup.flattenNestedOptionTuple(tup) }" + puts "#{$indent}}" + puts + puts "#{$indent}implicit def toFlattenOuterJoin#{arity}[KEY, KLL[KLL_K, +KLL_V] <: KeyedListLike[KLL_K, KLL_V, KLL], #{flat_type}](nested: KLL[KEY, #{nested_type}]) = new FlattenOuterJoin#{arity}(nested)" + end + +end + +puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" +puts %q|package com.twitter.scalding.typed + +/** + * Autogenerated methods for flattening the nested value tuples that result after + * joining many pipes together. These methods can be used directly, or via the + * the joins available in MultiJoin. 
+ */ +object FlattenGroup { + val pairOfNones = (None, None) + +| + +puts "#{$indent}// methods for flattening results of join / leftJoin" +puts +(3..22).each { |a| + make_flatten_left_join(a) + puts +} + +puts "#{$indent}// methods for flattening results of outerJoin" +puts +(3..22).each { |a| + make_flatten_outer_join(a) + puts +} + + +puts "}" + +puts "// end of autogenerated" diff --git a/scalding-core/codegen/multi_join_generator.rb b/scalding-core/codegen/multi_join_generator.rb new file mode 100755 index 0000000000..51c7a66c9b --- /dev/null +++ b/scalding-core/codegen/multi_join_generator.rb @@ -0,0 +1,80 @@ +#!/usr/bin/env ruby + +# Run it like this: +# +# ./codegen/multi_join_generator.rb > src/main/scala/com/twitter/scalding/typed/MultiJoin.scala + +$indent = " " + +TYPES = ('B'..'Z').to_a + +def make_multi_joins(joinType, arity) + meth_name = if joinType == "join" then "apply" + elsif joinType == "leftJoin" then "left" + elsif joinType == "outerJoin" then "outer" + else raise "unknown join " + joinType + end + + flatten_meth_name = if joinType == "join" then "flattenNestedTuple" + elsif joinType == "leftJoin" then "flattenNestedTuple" + elsif joinType == "outerJoin" then "flattenNestedOptionTuple" + else raise "unknown join " + joinType + end + + types = TYPES[0..(arity - 1)] + flat_type = types.join(", ") + flat_type_options = types.map {|x| "Option[#{x}]"}.join(", ") + out_type = if joinType == "join" then flat_type else flat_type_options end + + method_decl = "#{$indent}def #{meth_name}[KEY, A, #{flat_type}](a: CoGroupable[KEY, A], " + inputs = types.map { |t| + "#{t.downcase}: CoGroupable[KEY, #{t}]" + }.join(", ") + + value_type = if joinType == "outerJoin" then "Option[A]" else "A" end + + puts method_decl + inputs + "): CoGrouped[KEY, (#{value_type}, #{out_type})] =" + + puts "#{$indent*2}a.#{joinType}(b)" + + types[1..-1].each { |t| + puts "#{$indent*3}.#{joinType}(#{t.downcase})" + } + + if arity > 1 then + puts "#{$indent*3}.mapValues { tup => 
#{flatten_meth_name}(tup) }" + end +end + +puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" +puts %q|package com.twitter.scalding.typed + +/** + * This is an autogenerated object which gives you easy access to + * doing N-way joins so the types are cleaner. However, it just calls + * the underlying methods on CoGroupable and flattens the resulting tuple + */ +object MultiJoin extends java.io.Serializable { + import com.twitter.scalding.typed.FlattenGroup._ + +| + +(1..21).each { |a| + make_multi_joins("join", a) + puts +} + +(1..21).each { |a| + make_multi_joins("leftJoin", a) + puts +} + +(1..21).each { |a| + make_multi_joins("outerJoin", a) + puts +} + + +puts "}" + +puts "// end of autogenerated" diff --git a/scalding-core/codegen/scalding_gen.rb b/scalding-core/codegen/scalding_gen.rb index 286209f2ac..f5be9e51b9 100755 --- a/scalding-core/codegen/scalding_gen.rb +++ b/scalding-core/codegen/scalding_gen.rb @@ -7,20 +7,22 @@ def make_tuple_conv(cnt) indices = (0...cnt).to_a comma_tn = type_names.join(",") getters = type_names.map { |n| - #" g#{n} : TupleGetter[#{n}]" - " g#{n} : TupleGetter[#{n}]" + "g#{n}: TupleGetter[#{n}]" }.join(",\n#{$indent}") + gvalues = type_names.map { |n| "g#{n}" }.join(", ") typed_args = type_names.zip(indices).map { |n,ni| "g#{n}.get(tup, #{ni})" }.join(",\n#{$indent} ") - %Q|\n#{$indent}implicit def tuple#{cnt}Converter[#{comma_tn}](implicit -#{$indent}#{getters}): TupleConverter[Tuple#{cnt}[#{comma_tn}]] = new TupleConverter[Tuple#{cnt}[#{comma_tn}]]{ + + %Q|\n#{$indent}case class TupleConverter#{cnt}[#{comma_tn}](#{getters}) extends TupleConverter[Tuple#{cnt}[#{comma_tn}]] { #{$indent} def apply(te : TupleEntry) = { #{$indent} val tup = te.getTuple #{$indent} Tuple#{cnt}(#{typed_args}) #{$indent} } #{$indent} def arity = #{cnt} #{$indent}} +#{$indent}implicit def tuple#{cnt}Converter[#{comma_tn}](implicit +#{$indent}#{getters}): TupleConverter[Tuple#{cnt}[#{comma_tn}]] = 
TupleConverter#{cnt}(#{gvalues}) | end @@ -28,7 +30,7 @@ def make_setter(cnt) underscores = (["_"]*cnt).join(",") type_names = ('A'..'Y').to_a[0...cnt] comma_tn = type_names.join(",") - head = %Q|\n#{$indent}implicit def tup#{cnt}Setter[Z <: Tuple#{cnt}[#{underscores}]]: TupleSetter[Z] = new TupleSetter[Z] { + head = %Q|\n#{$indent}case class TupleSetter#{cnt}[Z <: Tuple#{cnt}[#{underscores}]]() extends TupleSetter[Z] { #{$indent} override def apply(arg: Z) = { #{$indent} val tup = Tuple.size(#{cnt}) #{$indent} | @@ -36,12 +38,36 @@ def make_setter(cnt) tail = %Q| #{$indent} tup #{$indent} } - #{$indent} override def arity = #{cnt} -#{$indent}}| +#{$indent}} +#{$indent}implicit def tup#{cnt}Setter[Z <: Tuple#{cnt}[#{underscores}]]: TupleSetter[Z] = TupleSetter#{cnt}[Z]()| head + middle + tail end +# case TupleSetterN() => Some(obj.TupleConverterN(TupleGetter.Casting(), ...)) +def make_converter_from_setter_case(cnt, obj) + i3 = $indent * 3 + i4 = $indent * 4 + casting = "TupleGetter.Casting()" + casting_args = (1..cnt).map { |c| casting }.join(",\n#{i4}") + + "#{i3}case TupleSetter#{cnt}() => Some(#{obj}.TupleConverter#{cnt}(#{casting_args}))" +end + +# def converterFromSetter[A](ts: TupleSetter[A], gtc: GeneratedTupleConverter): Option[TupleConverter[A]] =# +# (ts match { +# ... 
+# case _ => None +# }).asInstanceOf[Option[TupleConverter[A]]] + +def make_converter_from_setter + ["#{$indent}def converterFromSetter[A](ts: TupleSetter[A], gtc: GeneratedTupleConverters): Option[TupleConverter[A]] =", + "#{$indent}#{$indent}(ts match {", + (1..22).map { |c| make_converter_from_setter_case(c, "gtc") }.to_a, + "#{$indent}#{$indent}#{$indent}case _ => None", + "#{$indent}}).asInstanceOf[Option[TupleConverter[A]]]"].flatten.join("\n") +end + puts "// following were autogenerated by #{__FILE__} at #{Time.now} do not edit" puts %q|package com.twitter.scalding import cascading.tuple.Tuple @@ -55,5 +81,9 @@ def make_setter(cnt) puts "trait GeneratedTupleSetters extends LowPriorityTupleSetters {" (1..22).each { |c| puts make_setter(c) } +puts "" +puts make_converter_from_setter +puts "" + puts "}" puts "// end of autogenerated" diff --git a/scalding-core/src/main/java/com/twitter/scalding/cascading_interop/FlowListenerPromise.java b/scalding-core/src/main/java/com/twitter/scalding/cascading_interop/FlowListenerPromise.java new file mode 100644 index 0000000000..bf403cc113 --- /dev/null +++ b/scalding-core/src/main/java/com/twitter/scalding/cascading_interop/FlowListenerPromise.java @@ -0,0 +1,93 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package com.twitter.scalding.cascading_interop; + +import cascading.flow.FlowListener; +import cascading.flow.FlowException; +import cascading.flow.FlowStepListener; +import cascading.flow.FlowStep; +import cascading.flow.Flow; +import cascading.stats.CascadingStats; + +import scala.concurrent.Promise$; +import scala.concurrent.Promise; +import scala.concurrent.Future; + +import com.twitter.scalding.FatalExecutionError; + +/* + * The cascading API uses a raw type here which is difficult to + * deal with in scala + */ +public class FlowListenerPromise { + public static class FlowStopException extends FatalExecutionError { + public FlowStopException(String message) { + super(message); + } + } + + /* + * This starts the flow and applies a mapping function fn in + * the same thread that completion happens + */ + public static Future start(Flow flow, final scala.Function1, T> fn) { + final Promise result = Promise$.MODULE$.apply(); + flow.addListener(new FlowListener() { + public void onStarting(Flow f) { } // ignore + public void onStopping(Flow f) { // in case of runtime exception cascading call onStopping + result.tryFailure(new FlowStopException("Flow was stopped")); + } + public void onCompleted(Flow f) { + // This is always called, but onThrowable is called first + if(!result.isCompleted()) { + if (f.getFlowStats().isSuccessful()) { + // we use the above rather than trySuccess to avoid calling fn twice + try { + T toPut = (T) fn.apply(f); + result.success(toPut); + } catch (Throwable t) { + result.tryFailure(t); + } + } else { + result.tryFailure(new Exception("Flow was not successfully finished")); + } + } + } + public boolean onThrowable(Flow f, Throwable t) { + result.tryFailure(t); + // The exception is handled by the owner of the promise and should not be rethrown + return true; + } + }); + flow.addStepListener(new FlowStepListener() { + public void onStepStarting(FlowStep flowStep) { } // ignore + public void onStepRunning(FlowStep flowStep) { } // 
ignore + public void onStepCompleted(FlowStep flowStep) { } // ignore + public void onStepStopping(FlowStep f) { result.tryFailure(new FlowStopException("Flow step was stopped")); } + public boolean onStepThrowable(FlowStep f, Throwable t) { + if (t != null) { + result.tryFailure(t); + } else { + result.tryFailure(new FlowException("Flow step failed: " + f.getName())); + } + // The exception is handled by the owner of the promise and should not be rethrown + return true; + } + }); + flow.start(); + return result.future(); + } +} diff --git a/scalding-core/src/main/java/com/twitter/scalding/tap/GlobHfs.java b/scalding-core/src/main/java/com/twitter/scalding/tap/GlobHfs.java new file mode 100644 index 0000000000..8a0dd41e82 --- /dev/null +++ b/scalding-core/src/main/java/com/twitter/scalding/tap/GlobHfs.java @@ -0,0 +1,51 @@ +package com.twitter.scalding.tap; + +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.RecordReader; + +import cascading.scheme.Scheme; + +/** + * Default implementation of getSize in {@link cascading.tap.hadoop.Hfs} don't respect to paths with glob patterns, + * that will throw IOException where we actually can calculate size of source. + */ +public class GlobHfs extends ScaldingHfs { + public GlobHfs(Scheme scheme) { + super(scheme); + } + + public GlobHfs(Scheme scheme, String stringPath) { + super(scheme, stringPath); + } + + @Override + public long getSize(JobConf conf) throws IOException { + return getSize(getPath(), conf); + } + + /** + * Get the total size of the file(s) specified by the Hfs, which may contain a glob + * pattern in its path, so we must be ready to handle that case. 
+ */ + public static long getSize(Path path, JobConf conf) throws IOException { + FileSystem fs = path.getFileSystem(conf); + FileStatus[] statuses = fs.globStatus(path); + + if (statuses == null) { + throw new FileNotFoundException(String.format("File not found: %s", path)); + } + + long size = 0; + for (FileStatus status : statuses) { + size += fs.getContentSummary(status.getPath()).getLength(); + } + return size; + } +} diff --git a/scalding-core/src/main/java/com/twitter/scalding/tap/ScaldingHfs.java b/scalding-core/src/main/java/com/twitter/scalding/tap/ScaldingHfs.java new file mode 100644 index 0000000000..8af0eb6edf --- /dev/null +++ b/scalding-core/src/main/java/com/twitter/scalding/tap/ScaldingHfs.java @@ -0,0 +1,57 @@ +package com.twitter.scalding.tap; + +import java.io.IOException; + +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.RecordReader; + +import com.twitter.scalding.tuple.HadoopTupleEntrySchemeIterator; + +import cascading.flow.FlowProcess; +import cascading.scheme.Scheme; +import cascading.tap.SinkMode; +import cascading.tuple.Fields; +import cascading.tuple.TupleEntryIterator; + +public class ScaldingHfs extends cascading.tap.hadoop.Hfs { + protected ScaldingHfs() { + } + + protected ScaldingHfs(Scheme scheme) { + super(scheme); + } + + @Deprecated + public ScaldingHfs(Fields fields, String stringPath) { + super(fields, stringPath); + } + + @Deprecated + public ScaldingHfs(Fields fields, String stringPath, boolean replace) { + super(fields, stringPath, replace); + } + + @Deprecated + public ScaldingHfs(Fields fields, String stringPath, SinkMode sinkMode) { + super(fields, stringPath, sinkMode); + } + + public ScaldingHfs(Scheme scheme, String stringPath) { + super(scheme, stringPath); + } + + @Deprecated + public ScaldingHfs(Scheme scheme, String stringPath, boolean replace) { + super(scheme, stringPath, replace); + } + + public ScaldingHfs(Scheme scheme, String 
stringPath, SinkMode sinkMode) { + super(scheme, stringPath, sinkMode); + } + + @Override + public TupleEntryIterator openForRead(FlowProcess flowProcess, RecordReader input) throws IOException { + return new HadoopTupleEntrySchemeIterator(flowProcess, this, input); + } +} diff --git a/scalding-core/src/main/java/com/twitter/scalding/tuple/HadoopTupleEntrySchemeIterator.java b/scalding-core/src/main/java/com/twitter/scalding/tuple/HadoopTupleEntrySchemeIterator.java new file mode 100644 index 0000000000..236b8f2598 --- /dev/null +++ b/scalding-core/src/main/java/com/twitter/scalding/tuple/HadoopTupleEntrySchemeIterator.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. + * + * Project and contact information: http://www.cascading.org/ + * + * This file is part of the Cascading project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.twitter.scalding.tuple; + +import java.io.IOException; + +import cascading.flow.FlowProcess; +import cascading.flow.SliceCounters; +import cascading.scheme.Scheme; +import cascading.tap.Tap; +import cascading.tap.hadoop.io.MultiInputSplit; +import cascading.tap.hadoop.io.MultiRecordReaderIterator; +import cascading.tap.hadoop.io.RecordReaderIterator; +import cascading.tap.hadoop.util.MeasuredRecordReader; +import cascading.util.CloseableIterator; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; + +/** + * + */ +public class HadoopTupleEntrySchemeIterator extends TupleEntrySchemeIterator +{ + private MeasuredRecordReader measuredRecordReader; + + public HadoopTupleEntrySchemeIterator( FlowProcess flowProcess, Tap parentTap, RecordReader recordReader ) throws IOException + { + this( flowProcess, parentTap.getScheme(), makeIterator( flowProcess, parentTap, recordReader ) ); + } + + public HadoopTupleEntrySchemeIterator( FlowProcess flowProcess, Scheme scheme, CloseableIterator closeableIterator ) + { + super( flowProcess, scheme, closeableIterator, flowProcess.getStringProperty( MultiInputSplit.CASCADING_SOURCE_PATH ) ); + } + + private static CloseableIterator makeIterator( FlowProcess flowProcess, Tap parentTap, RecordReader recordReader ) throws IOException + { + if( recordReader != null ) + return new RecordReaderIterator( recordReader ); + + return new MultiRecordReaderIterator( flowProcess, parentTap ); + } + + @Override + protected RecordReader wrapInput( RecordReader recordReader ) + { + if( measuredRecordReader == null ) + measuredRecordReader = new MeasuredRecordReader( getFlowProcess(), SliceCounters.Read_Duration ); + + measuredRecordReader.setRecordReader( recordReader ); + + return measuredRecordReader; + } +} diff --git a/scalding-core/src/main/java/com/twitter/scalding/tuple/TupleEntrySchemeIterator.java b/scalding-core/src/main/java/com/twitter/scalding/tuple/TupleEntrySchemeIterator.java new 
file mode 100644 index 0000000000..7b9bc1f567 --- /dev/null +++ b/scalding-core/src/main/java/com/twitter/scalding/tuple/TupleEntrySchemeIterator.java @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2007-2014 Concurrent, Inc. All Rights Reserved. + * + * Project and contact information: http://www.cascading.org/ + * + * This file is part of the Cascading project. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.twitter.scalding.tuple; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.Set; + +import cascading.flow.FlowProcess; +import cascading.scheme.ConcreteCall; +import cascading.scheme.Scheme; +import cascading.tuple.TupleEntry; +import cascading.tuple.TupleEntryIterator; +import cascading.tuple.TupleEntrySchemeIteratorProps; +import cascading.tuple.TupleException; +import cascading.tuple.Tuples; +import cascading.util.CloseableIterator; +import cascading.util.SingleCloseableInputIterator; +import cascading.util.Util; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Class TupleEntrySchemeIterator is a helper class for wrapping a {@link Scheme} instance, calling + * {@link Scheme#source(cascading.flow.FlowProcess, cascading.scheme.SourceCall)} on every call to + * {@link #next()}. The behavior can be controlled via properties defined in {@link TupleEntrySchemeIteratorProps}. + *

+ * Use this class inside a custom {@link cascading.tap.Tap} when overriding the + * {@link cascading.tap.Tap#openForRead(cascading.flow.FlowProcess)} method. + */ +public class TupleEntrySchemeIterator extends TupleEntryIterator +{ + /** Field LOG */ + private static final Logger LOG = LoggerFactory.getLogger( cascading.tuple.TupleEntrySchemeIterator.class ); + + private final FlowProcess flowProcess; + private final Scheme scheme; + private final CloseableIterator inputIterator; + private final Set> permittedExceptions; + private ConcreteCall sourceCall; + + private String identifier; + private boolean isComplete = false; + private boolean hasWaiting = false; + private TupleException currentException; + + public TupleEntrySchemeIterator( FlowProcess flowProcess, Scheme scheme, Input input ) + { + this( flowProcess, scheme, input, null ); + } + + public TupleEntrySchemeIterator( FlowProcess flowProcess, Scheme scheme, Input input, String identifier ) + { + this( flowProcess, scheme, (CloseableIterator) new SingleCloseableInputIterator( (Closeable) input ), identifier ); + } + + public TupleEntrySchemeIterator( FlowProcess flowProcess, Scheme scheme, CloseableIterator inputIterator ) + { + this( flowProcess, scheme, inputIterator, null ); + } + + public TupleEntrySchemeIterator( FlowProcess flowProcess, Scheme scheme, CloseableIterator inputIterator, String identifier ) + { + super( scheme.getSourceFields() ); + this.flowProcess = flowProcess; + this.scheme = scheme; + this.inputIterator = inputIterator; + this.identifier = identifier; + + Object permittedExceptions = flowProcess.getProperty( TupleEntrySchemeIteratorProps.PERMITTED_EXCEPTIONS ); + + if( permittedExceptions != null ) + this.permittedExceptions = Util.asClasses( permittedExceptions.toString(), "unable to load permitted exception class" ); + else + this.permittedExceptions = Collections.emptySet(); + + if( this.identifier == null || this.identifier.isEmpty() ) + this.identifier = "'unknown'"; + + if( 
!inputIterator.hasNext() ) + { + isComplete = true; + return; + } + + sourceCall = new ConcreteCall(); + + sourceCall.setIncomingEntry( getTupleEntry() ); + sourceCall.setInput( wrapInput( inputIterator.next() ) ); + + try + { + this.scheme.sourcePrepare( flowProcess, sourceCall ); + } + catch( IOException exception ) + { + throw new TupleException( "unable to prepare source for input identifier: " + this.identifier, exception ); + } + } + + protected FlowProcess getFlowProcess() + { + return flowProcess; + } + + protected Input wrapInput( Input input ) + { + return input; + } + + @Override + public boolean hasNext() + { + if( isComplete ) + return false; + + if( hasWaiting ) + return true; + + try + { + getNext(); + } + catch( Exception exception ) + { + if( identifier == null || identifier.isEmpty() ) + identifier = "'unknown'"; + + if( permittedExceptions.contains( exception.getClass() ) ) + { + LOG.warn( "Caught permitted exception while reading {}", identifier, exception ); + return false; + } + + currentException = new TupleException( "unable to read from input identifier: " + identifier, exception ); + + return true; + } + + if( !hasWaiting ) + isComplete = true; + + return !isComplete; + } + + private TupleEntry getNext() throws IOException + { + Tuples.asModifiable( sourceCall.getIncomingEntry().getTuple() ); + hasWaiting = scheme.source( flowProcess, sourceCall ); + + if( !hasWaiting && inputIterator.hasNext() ) + { + sourceCall.setInput( wrapInput( inputIterator.next() ) ); + scheme.sourcePrepare(flowProcess, sourceCall); + + return getNext(); + } + + return getTupleEntry(); + } + + @Override + public TupleEntry next() + { + try + { + if( currentException != null ) + throw currentException; + } + finally + { + currentException = null; // data may be trapped + } + + if( isComplete ) + throw new IllegalStateException( "no next element" ); + + try + { + if( hasWaiting ) + return getTupleEntry(); + + return getNext(); + } + catch( Exception exception ) + { + 
throw new TupleException( "unable to source from input identifier: " + identifier, exception ); + } + finally + { + hasWaiting = false; + } + } + + @Override + public void remove() + { + throw new UnsupportedOperationException( "may not remove elements from this iterator" ); + } + + @Override + public void close() throws IOException + { + try + { + if( sourceCall != null ) + scheme.sourceCleanup( flowProcess, sourceCall ); + } + finally + { + inputIterator.close(); + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/package.scala b/scalding-core/src/main/scala/com/twitter/package.scala index ffdfc14f9b..5a62e5fb0b 100644 --- a/scalding-core/src/main/scala/com/twitter/package.scala +++ b/scalding-core/src/main/scala/com/twitter/package.scala @@ -12,64 +12,60 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter import org.apache.hadoop.fs.{Path, PathFilter} package object scalding { + /** - * The objects for the Typed-API live in the scalding.typed package - * but are aliased here. + * The objects for the Typed-API live in the scalding.typed package but are aliased here. 
*/ val TDsl = com.twitter.scalding.typed.TDsl val TypedPipe = com.twitter.scalding.typed.TypedPipe type TypedPipe[+T] = com.twitter.scalding.typed.TypedPipe[T] type TypedSink[-T] = com.twitter.scalding.typed.TypedSink[T] type TypedSource[+T] = com.twitter.scalding.typed.TypedSource[T] - type KeyedList[K,+V] = com.twitter.scalding.typed.KeyedList[K,V] + type KeyedList[K, +V] = com.twitter.scalding.typed.KeyedList[K, V] type ValuePipe[+T] = com.twitter.scalding.typed.ValuePipe[T] type Grouped[K, +V] = com.twitter.scalding.typed.Grouped[K, V] + /** - * Make sure this is in sync with version.sbt + * scaldingVersion is logged sometimes to inform a user of what scalding version they are using The value is + * obtained through code gen with https://github.com/sbt/sbt-buildinfo */ - val scaldingVersion: String = "0.9.0rc4" + val scaldingVersion: String = BuildInfo.version object RichPathFilter { - implicit def toRichPathFilter(f: PathFilter) = new RichPathFilter(f) + implicit def toRichPathFilter(f: PathFilter): RichPathFilter = new RichPathFilter(f) } class RichPathFilter(f: PathFilter) { - def and(filters: PathFilter*): PathFilter = { + def and(filters: PathFilter*): PathFilter = new AndPathFilter(Seq(f) ++ filters) - } - def or(filters: PathFilter*): PathFilter = { + def or(filters: PathFilter*): PathFilter = new OrPathFilter(Seq(f) ++ filters) - } - def not: PathFilter = { + def not: PathFilter = new NotPathFilter(f) - } } private[this] class AndPathFilter(filters: Seq[PathFilter]) extends PathFilter { - override def accept(p: Path): Boolean = { + override def accept(p: Path): Boolean = filters.forall(_.accept(p)) - } } private[this] class OrPathFilter(filters: Seq[PathFilter]) extends PathFilter { - override def accept(p: Path): Boolean = { + override def accept(p: Path): Boolean = filters.exists(_.accept(p)) - } } private[this] class NotPathFilter(filter: PathFilter) extends PathFilter { - override def accept(p: Path): Boolean = { + override def accept(p: Path): 
Boolean = !filter.accept(p) - } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala b/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala new file mode 100644 index 0000000000..6470ff2550 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/ArgHelp.scala @@ -0,0 +1,131 @@ +package com.twitter.scalding + +sealed trait DescribedArg { + def key: String + def description: String +} + +final case class RequiredArg(key: String, description: String) extends DescribedArg +final case class OptionalArg(key: String, description: String) extends DescribedArg +final case class ListArg(key: String, description: String) extends DescribedArg +final case class BooleanArg(key: String, description: String) extends DescribedArg + +class HelpException extends RuntimeException("User asked for help") +class DescriptionValidationException(msg: String) extends RuntimeException(msg) + +trait ArgHelper { + + /** + * Similar to describe but validate all args are described + * + * @param describedArgs + * List of Argument Descriptions + * @param ex + * Input Execution + * @return + * Output Execution + */ + def validatedDescribe[T](describedArgs: Seq[DescribedArg], ex: Execution[T]): Execution[T] = + Execution.getArgs.flatMap { args => + validatedDescribe(describedArgs, args) + ex + } + + /** + * Describe a set of Args given Descriptions and validate all Args are described + * @param describedArgs + * List of Argument Descriptions + * @param args + * Job Arguments + */ + def validatedDescribe(describedArgs: Seq[DescribedArg], args: Args): Unit = { + describe(describedArgs, args) + + val describedKeys = describedArgs.map(_.key).toSet + val missingKeys = args.m.keySet.filter(_.nonEmpty).diff(describedKeys) + + if (missingKeys.nonEmpty) { + val msg = missingKeys.mkString(", ") + throw new DescriptionValidationException(s"Must describe missing keys : $msg") + } + } + + /** + * Describe the Arguments of this Execution. 
By running --help the args will output and the execution will + * end + * + * @param describedArgs + * List of Argument Descriptions + * @param ex + * Input Execution + * @return + * Output Execution + */ + def describe[T](describedArgs: Seq[DescribedArg], ex: Execution[T]): Execution[T] = + Execution.getArgs.flatMap { args => + describe(describedArgs, args) + ex + } + + /** + * Describe a set of Args given Descriptions + * + * @param describedArgs + * List of Argument Descriptions + * @param args + * Job Arguments + */ + def describe(describedArgs: Seq[DescribedArg], args: Args): Unit = + if (args.boolean("help")) helpRequest(describedArgs) + else () + + def helpRequest(describedArgs: Seq[DescribedArg]): Nothing = { + val top = "\n###########################################################################\n\n" + val usage = s"Command Line Args :: ${argString(describedArgs)}\n\n\n" + val bottom = "\n\n###########################################################################\n" + + println(top + usage + help(describedArgs) + bottom) + + throw new HelpException() + } + + /** + * Command line arg string given the Described Args + * + * @param describedArgs + * List of Argument Descriptions + * @return + * Command Line Parameters + */ + private[this] def argString(describedArgs: Seq[DescribedArg]): String = + describedArgs.foldLeft("") { case (str, describedArg) => + val msg = describedArg match { + case RequiredArg(key, _) => s"--$key VALUE " + case OptionalArg(key, _) => s"[--$key VALUE] " + case ListArg(key, _) => s"[--$key VALUE VALUE2] " + case BooleanArg(key, _) => s"[--$key] " + } + str + msg + } + "[--help]" + + /** + * More detailed help command for these described arguments + * + * @param describedArgs + * List of Argument Descriptions + * @return + * Detailed Help for the Args + */ + private[this] def help(describedArgs: Seq[DescribedArg]): String = + describedArgs.foldLeft("") { case (str, describedArg) => + val msg = describedArg match { + case 
RequiredArg(key, description) => s"--$key(Required) :: $description \n" + case OptionalArg(key, description) => s"--$key(Optional) :: $description \n" + case ListArg(key, description) => s"--$key(List) :: $description \n" + case BooleanArg(key, description) => s"--$key(Boolean) :: $description \n" + } + str + msg + } + "--help :: Show this help message." +} + +object ArgHelp extends ArgHelper diff --git a/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala b/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala new file mode 100644 index 0000000000..befebc1553 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/BijectedOrderedSerialization.scala @@ -0,0 +1,33 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import com.twitter.scalding.serialization.OrderedSerialization +import com.twitter.bijection.{ImplicitBijection, Injection} + +object BijectedOrderedSerialization { + implicit def fromBijection[T, U](implicit + bij: ImplicitBijection[T, U], + ordSer: OrderedSerialization[U] + ): OrderedSerialization[T] = + OrderedSerialization.viaTransform[T, U](bij.apply(_), bij.invert(_)) + + implicit def fromInjection[T, U](implicit + bij: Injection[T, U], + ordSer: OrderedSerialization[U] + ): OrderedSerialization[T] = + OrderedSerialization.viaTryTransform[T, U](bij.apply(_), bij.invert(_)) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala b/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala index a73e1b96ae..9a1a8f4d7e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CascadeJob.scala @@ -8,7 +8,7 @@ abstract class CascadeJob(args: Args) extends Job(args) { def jobs: Seq[Job] override def run = { - val flows = jobs.map { _.buildFlow } + val flows = jobs.map(_.buildFlow) val cascade = new CascadeConnector().connect(flows: _*) preProcessCascade(cascade) cascade.complete() @@ -19,18 +19,17 @@ abstract class CascadeJob(args: Args) extends Job(args) { statsData.isSuccessful } - override def validate { - jobs.foreach { _.validate } - } + override def validate(): Unit = + jobs.foreach(_.validate()) /* * Good for printing a dot file, setting the flow skip strategy, etc */ - def preProcessCascade(cascade: Cascade) = { } + def preProcessCascade(cascade: Cascade) = {} // linter:ignore /* * Good for checking the cascade stats */ - def postProcessCascade(cascade: Cascade) = { } + def postProcessCascade(cascade: Cascade) = {} // linter:ignore } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala b/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala new file mode 100644 
index 0000000000..7e7226db19 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/CascadingMode.scala @@ -0,0 +1,252 @@ +package com.twitter.scalding + +import cascading.flow.local.{LocalFlowConnector, LocalFlowProcess} +import cascading.flow.{FlowConnector, FlowProcess} +import cascading.property.AppProps +import cascading.tap.{CompositeTap, Tap} +import cascading.tap.hadoop.Hfs +import cascading.tuple.{Tuple, TupleEntryIterator} +import com.twitter.scalding.tap.ScaldingHfs +import com.twitter.scalding.typed.cascading_backend.AsyncFlowDefRunner +import java.io.File +import java.util.{Properties, UUID} +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory +import scala.annotation.tailrec +import scala.collection.JavaConverters._ +import scala.collection.mutable.{Buffer, Map => MMap, Set => MSet} +import scala.util.{Failure, Success} + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +/** + * Any Mode running on cascading extends CascadingMode + */ +trait CascadingMode extends Mode { + def newWriter(): Execution.Writer = + new AsyncFlowDefRunner(this) + + override def defaultConfig: Map[String, String] = + (this match { + case m: HadoopMode => Config.hadoopWithDefaults(m.jobConf) + case _ => Config.unitTestDefault + }).toMap + + /* + * Using a new FlowProcess, which is only suitable for reading outside + * of a map/reduce job, open a given tap and return the TupleEntryIterator + */ + def openForRead(config: Config, tap: Tap[_, _, _]): TupleEntryIterator + + @deprecated("A Config is needed, especially if any kryo serialization has been used", "0.12.0") + final def openForRead(tap: Tap[_, _, _]): TupleEntryIterator = + openForRead(Config.defaultFrom(this), tap) + + // Returns true if the file exists on the current filesystem. 
+ def fileExists(filename: String): Boolean + + /** Create a new FlowConnector for this cascading planner */ + def newFlowConnector(props: Config): FlowConnector + + /** + * Make sure we are using our `ScaldingHfs` for `Hfs` taps. + */ + protected def checkTap(tap: Tap[_, _, _], config: Config): Unit = + if (config.getCheckHfsTaps) { + tap match { + case hfs: Hfs => + assert( + hfs.getClass.isAssignableFrom(classOf[ScaldingHfs]), + """You are using instance of tap inherited from cascading.tap.hadoop.Hfs in toIterator method, + |which is broken in cascading 2.6.1, instead you need to use com.twitter.scalding.tap.ScaldingHfs. + """.stripMargin + ) + case composite: CompositeTap[t] => + composite.getChildTaps.asScala + .map(_.asInstanceOf[Tap[_, _, _]]) + .foreach(checkTap(_, config)) + case _ => + } + } +} + +object CascadingMode { + def cast(m: Mode): CascadingMode = + m match { + case cm: CascadingMode => cm + case other => throw new ModeException(s"mode: $other is not a CascadingMode") + } +} + +trait HadoopMode extends CascadingMode { + def jobConf: Configuration + + override def newFlowConnector(conf: Config) = { + val asMap = conf.toMap.toMap[AnyRef, AnyRef] // linter:ignore + val jarKey = AppProps.APP_JAR_CLASS + + val finalMap = conf.getCascadingAppJar match { + case Some(Success(cls)) => asMap + (jarKey -> cls) + case Some(Failure(err)) => + // This may or may not cause the job to fail at submission, let's punt till then + LoggerFactory + .getLogger(getClass) + .error( + "Could not create class from: %s in config key: %s, Job may fail." 
+ .format(conf.get(jarKey), AppProps.APP_JAR_CLASS), + err + ) + // Just delete the key and see if it fails when cascading tries to submit + asMap - jarKey + case None => asMap + } + + val flowConnectorClass = jobConf.get(Mode.CascadingFlowConnectorClassKey, Mode.DefaultHadoopFlowConnector) + + try { + val clazz = Class.forName(flowConnectorClass) + val ctor = clazz.getConstructor(classOf[java.util.Map[_, _]]) + ctor.newInstance(finalMap.asJava).asInstanceOf[FlowConnector] + } catch { + case ncd: ClassNotFoundException => { + throw new ModeLoadException( + "Failed to load Cascading flow connector class " + flowConnectorClass, + ncd + ) + } + } + } + + // TODO unlike newFlowConnector, this does not look at the Job.config + override def openForRead(config: Config, tap: Tap[_, _, _]) = { + checkTap(tap, config) + val htap = tap.asInstanceOf[Tap[JobConf, _, _]] + val conf = new JobConf(true) // initialize the default config + // copy over Config + config.toMap.foreach { case (k, v) => conf.set(k, v) } + + val flowProcessClass = jobConf.get(Mode.CascadingFlowProcessClassKey, Mode.DefaultHadoopFlowProcess) + + val fp = + try { + val clazz = Class.forName(flowProcessClass) + val ctor = clazz.getConstructor(classOf[JobConf]) + ctor.newInstance(conf).asInstanceOf[FlowProcess[JobConf]] + } catch { + case ncd: ClassNotFoundException => { + throw new ModeLoadException("Failed to load Cascading flow process class " + flowProcessClass, ncd) + } + } + + htap.retrieveSourceFields(fp) + htap.sourceConfInit(fp, conf) + htap.openForRead(fp) + } +} + +trait CascadingLocal extends CascadingMode { + override def newFlowConnector(conf: Config) = + new LocalFlowConnector(conf.toMap.toMap[AnyRef, AnyRef].asJava) // linter:ignore + + override def openForRead(config: Config, tap: Tap[_, _, _]) = { + checkTap(tap, config) + val ltap = tap.asInstanceOf[Tap[Properties, _, _]] + val props = new java.util.Properties + config.toMap.foreach { case (k, v) => props.setProperty(k, v) } + val fp = new 
LocalFlowProcess(props) + ltap.retrieveSourceFields(fp) + ltap.sourceConfInit(fp, props) + ltap.openForRead(fp) + } +} + +// Mix-in trait for test modes; overrides fileExists to allow the registration +// of mock filenames for testing. +trait TestMode extends CascadingMode { + private var fileSet = Set[String]() + def registerTestFiles(files: Set[String]) = fileSet = files + override def fileExists(filename: String): Boolean = fileSet.contains(filename) +} + +case class Hdfs(strict: Boolean, @transient conf: Configuration) extends HadoopMode { + override def jobConf = conf + override def fileExists(filename: String): Boolean = { + val path = new Path(filename) + path.getFileSystem(jobConf).exists(path) + } +} + +object Hdfs { + + /** + * Make an Hdfs instance in strict mode with new Configuration + */ + def default: Hdfs = Hdfs(true, new Configuration) +} + +case class HadoopTest(@transient conf: Configuration, @transient buffers: Source => Option[Buffer[Tuple]]) + extends HadoopMode + with TestMode { + + // This is a map from source.toString to disk path + private val writePaths = MMap[Source, String]() + private val allPaths = MSet[String]() + + override def jobConf = conf + + @tailrec + private def allocateNewPath(prefix: String, idx: Int): String = { + val candidate = prefix + idx.toString + if (allPaths(candidate)) { + // Already taken, try again: + allocateNewPath(prefix, idx + 1) + } else { + // Update all paths: + allPaths += candidate + candidate + } + } + + private val thisTestID = UUID.randomUUID + private val basePath = "/tmp/scalding/%s/".format(thisTestID) + // Looks up a local path to write the given source to + def getWritePathFor(src: Source): String = { + val rndIdx = new java.util.Random().nextInt(1 << 30) + writePaths.getOrElseUpdate(src, allocateNewPath(basePath + src.getClass.getName, rndIdx)) + } + + def finalize(src: Source): Unit = { + /* The following `_.get` is only safe if `src` belongs to the source map. 
+ * This invariant is preserved by the `JobTest.sink` and `JobTest.runJob` + * functions, and those functions have been documented accordingly to + * warn about this invariant. + */ + @SuppressWarnings( + Array("org.wartremover.warts.OptionPartial") + ) // Get the buffer for the given source, and empty it: + val buf = buffers(src).get + buf.clear() + // Now fill up this buffer with the content of the file + val path = getWritePathFor(src) + // We read the write tap in order to add its contents in the test buffers + val it = openForRead(Config.defaultFrom(this), src.createTap(Write)(this)) + while (it != null && it.hasNext) { + buf += new Tuple(it.next.getTuple) + } + it.close() + // Clean up this data off the disk + new File(path).delete() + writePaths -= src + } +} + +case class Local(strictSources: Boolean) extends CascadingLocal { + override def fileExists(filename: String): Boolean = new File(filename).exists +} + +/** + * Memory only testing for unit tests + */ +case class Test(buffers: (Source) => Option[Buffer[Tuple]]) extends TestMode with CascadingLocal diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala b/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala new file mode 100644 index 0000000000..dcde617a07 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/CascadingTokenUpdater.scala @@ -0,0 +1,88 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import cascading.tuple.hadoop.SerializationToken + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCascadingExtensions + +object CascadingTokenUpdater { + private final val lowestAllowed = 128 // cascading rules + + // Take a cascading string of tokens and turns it into a map + // from token index to class + def parseTokens(tokClass: String): Map[Int, String] = + if (tokClass == null || tokClass.isEmpty) + Map[Int, String]() + else + tokClass + .split(",") + .toIterator + .map(_.trim) + .filter(_.nonEmpty) + .map(_.split("=")) + .filter(_.length == 2) + .map(ary => (ary(0).toInt, ary(1))) + .toMap + + // does the inverse of the previous function, given a Map of index to class + // return the cascading token format for it + private def toksToString(m: Map[Int, String]): String = + m.map { case (tok, clazz) => s"$tok=$clazz" }.mkString(",") + + // Given the map of already assigned tokens, what is the next available one + private def firstAvailableToken(m: Map[Int, String]): Int = + if (m.isEmpty) lowestAllowed + else scala.math.max(m.keys.max + 1, lowestAllowed) + + // Given the first free token spot + // assign each of the class names given to al the subsequent + // positions + private def assignTokens(first: Int, names: Iterable[String]): Map[Int, String] = + names + .foldLeft((first, Map[Int, String]())) { (idMap, clz) => + val (id, m) = idMap + (id + 1, m + (id -> clz)) + } + ._2 + + def update(config: Config, clazzes: Set[Class[_]]): Config = { + val toks = config.getCascadingSerializationTokens + + val serializations = config.get(Config.IoSerializationsKey).getOrElse("") + val fromSerializations: Seq[String] = + if (serializations.isEmpty) + Seq.empty + else + for { + serialization <- serializations.split(",") + clazz = Class.forName(serialization) + tokenAnnotation = clazz.getAnnotation(classOf[SerializationToken]) + if tokenAnnotation != null + className <- tokenAnnotation.classNames() + } 
yield { + className + } + + // We don't want to assign tokens to classes already in the map + val newClasses: Iterable[String] = clazzes.map(_.getName) -- fromSerializations -- toks.values + + config + (Config.CascadingSerializationTokens -> toksToString( + toks ++ assignTokens(firstAvailableToken(toks), newClasses) + )) + } + +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala b/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala index 750f095dc6..1d454916cc 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/CoGroupBuilder.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.pipe.{CoGroup, Every, Pipe} @@ -20,38 +20,37 @@ import cascading.pipe.joiner.MixedJoin import cascading.tuple.Fields /** - * Builder classes used internally to implement coGroups (joins). - * Can also be used for more generalized joins, e.g., star joins. - * + * Builder classes used internally to implement coGroups (joins). Can also be used for more generalized joins, + * e.g., star joins. */ -class CoGroupBuilder(groupFields : Fields, joinMode : JoinMode) extends GroupBuilder(groupFields) { - protected var coGroups : List[(Fields, Pipe, JoinMode)] = Nil +class CoGroupBuilder(groupFields: Fields, joinMode: JoinMode) extends GroupBuilder(groupFields) { + protected var coGroups: List[(Fields, Pipe, JoinMode)] = Nil // Joins (cogroups) with pipe p on fields f. // Make sure that pipe p is smaller than the left side pipe, otherwise this // might take a while. 
- def coGroup(f : Fields, p : Pipe, j : JoinMode = InnerJoinMode) = { + def coGroup(f: Fields, p: Pipe, j: JoinMode = InnerJoinMode) = { coGroups ::= (f, RichPipe.assignName(p), j) this } // TODO: move the automatic renaming of fields here // and remove it from joinWithSmaller/joinWithTiny - override def schedule(name : String, pipe : Pipe) : Pipe = { + override def schedule(name: String, pipe: Pipe): Pipe = { assert(!sorting.isDefined, "cannot use a sortBy when doing a coGroup") assert(!coGroups.isEmpty, "coGroupBy requires at least one other pipe to .coGroup") - val fields = (groupFields :: coGroups.map{ _._1 }).toArray - val pipes = (pipe :: coGroups.map{ _._2 }).map{ RichPipe.assignName(_) }.toArray - val joinModes = (joinMode :: coGroups.map{ _._3 }).map{ _.booleanValue }.toArray + val fields = (groupFields :: coGroups.map(_._1)).toArray + val pipes = (pipe :: coGroups.map(_._2)).map(RichPipe.assignName(_)).toArray + val joinModes = (joinMode :: coGroups.map(_._3)).map(_.booleanValue).toArray val mixedJoiner = new MixedJoin(joinModes) - val cg : Pipe = new CoGroup(pipes, fields, null, mixedJoiner) + val cg: Pipe = new CoGroup(pipes, fields, null, WrappedJoiner(mixedJoiner)) overrideReducers(cg) - evs.foldRight(cg)( (op : Pipe => Every, p) => op(p) ) + evs.foldRight(cg)((op: Pipe => Every, p) => op(p)) } } sealed abstract class JoinMode { - def booleanValue : Boolean + def booleanValue: Boolean } case object InnerJoinMode extends JoinMode { override def booleanValue = true diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala index 2998c36d89..59846773fd 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Dsl.scala @@ -12,21 +12,26 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.pipe.Pipe +import cascading.flow.FlowDef /** - * This object has all the implicit functions and values that are used - * to make the scalding DSL, which includes the functions for automatically - * creating cascading.tuple.Fields objects from scala tuples of Strings, Symbols - * or Ints, as well as the cascading.pipe.Pipe enrichment to RichPipe which - * adds the scala.collections-like API to Pipe. + * This object has all the implicit functions and values that are used to make the scalding DSL, which + * includes the functions for automatically creating cascading.tuple.Fields objects from scala tuples of + * Strings, Symbols or Ints, as well as the cascading.pipe.Pipe enrichment to RichPipe which adds the + * scala.collections-like API to Pipe. * - * It's useful to import Dsl._ when you are writing scalding code outside - * of a Job. + * It's useful to import Dsl._ when you are writing scalding code outside of a Job. */ object Dsl extends FieldConversions with java.io.Serializable { - implicit def pipeToRichPipe(pipe : Pipe) : RichPipe = new RichPipe(pipe) + implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) + + /** + * Enrichment on FlowDef + */ + implicit def flowDefToRichFlowDef(fd: FlowDef): RichFlowDef = new RichFlowDef(fd) + } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala new file mode 100644 index 0000000000..81805632dc --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionApp.scala @@ -0,0 +1,124 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +/* + * We will explicitly import any non-hadoop names + */ +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.util.GenericOptionsParser + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +case class HadoopArgs(toArray: Array[String]) + +case class NonHadoopArgs(toArray: Array[String]) + +/* + * Make an object that extend this trait, and you can run + * it as a normal java application. + */ +object ExecutionApp { + /* + * Anything that looks like hadoop args, use + * the hadoop arg parser, otherwise, scalding. + */ + + private[this] val dArgPattern = "-D([^=]+)=([^\\s]+)".r + + private[this] val hadoopReservedArgs = List("-fs", "-jt", "-files", "-libjars", "-archives") + + def extractUserHadoopArgs(args: Array[String]): (HadoopArgs, NonHadoopArgs) = { + + val argsWithLibJars = ExpandLibJarsGlobs(args) + + // This adds a look back mechanism to match on other hadoop args we need to support + // currently thats just libjars + val (hadoopArgs, tmpNonHadoop, finalLast) = + argsWithLibJars.foldLeft(Array[String](), Array[String](), Option.empty[String]) { + // Current is a -D, so store the last in non hadoop, and add current to hadoop args + case ((hadoopArgs, nonHadoop, Some(l)), current) if dArgPattern.findFirstIn(current).isDefined => + (hadoopArgs :+ current, nonHadoop :+ l, None) + // Current is a -D, but no last to concern with, and add current to hadoop args + case ((hadoopArgs, nonHadoop, None), current) if dArgPattern.findFirstIn(current).isDefined => + (hadoopArgs :+ current, 
nonHadoop, None) + // Current is ignored, but last was hadoop reserved arg so store them both in the hadoop args + case ((hadoopArgs, nonHadoop, Some(x)), current) if hadoopReservedArgs.contains(x) => + (hadoopArgs ++ Array(x, current), nonHadoop, None) + // Have a last but nothing matches current. So store last in non-hadoop and current in the last holder + case ((hadoopArgs, nonHadoop, Some(l)), current) => + (hadoopArgs, nonHadoop :+ l, Some(current)) + // Have no last, and nothing matches. So just store current in the last spot + case ((hadoopArgs, nonHadoop, None), current) => + (hadoopArgs, nonHadoop, Some(current)) + } + // We can have something left in the last bucket, so extract it. + val nonHadoop = finalLast match { + case Some(x) => tmpNonHadoop :+ x + case None => tmpNonHadoop + } + + // Throwaway hadoop config + // see which of our hadoop config args are not ones + val unparsed = (new GenericOptionsParser(new Configuration, hadoopArgs)).getRemainingArgs + + (HadoopArgs(hadoopArgs.filter(!unparsed.contains(_))), NonHadoopArgs(nonHadoop ++ unparsed)) + } +} + +trait ExecutionApp extends java.io.Serializable { + def job: Execution[Unit] + + /** + * The first argument should be the mode name (hdfs or local) + * + * The default for this is to parse all hadoop arguments and put them into the config. Any unparsed hadoop + * arguments are put into the Args. + */ + def config(inputArgs: Array[String]): (Config, Mode) = { + /* + * Anything that looks like hadoop args, use + * the hadoop arg parser, otherwise, scalding. 
+ */ + val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(inputArgs) + val hconf = new Configuration + // This has the side-effect of mutating the hconf + new GenericOptionsParser(hconf, hadoopArgs.toArray) + val args = Args(nonHadoop.toArray) + val mode = Mode(args, hconf) + val config = + Config + .hadoopWithDefaults(hconf) + .setArgs(args) + .setExecutionCleanupOnFinish( + true + ) // since ExecutionApp returns Execution[Unit], temp paths can't escape + /* + * Make sure the hadoop config is set in sync with the config + * which should not matter for execution, but especially legacy + * code that accesses the jobConf is the Hdfs class, we keep + * it in sync. + */ + config.toMap.foreach { case (k, v) => hconf.set(k, v) } + + (config, mode) + } + + def main(args: Array[String]): Unit = + config(args) match { + case (conf, mode) => job.waitFor(conf, mode).get + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala new file mode 100644 index 0000000000..d19a5bc95f --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionContext.scala @@ -0,0 +1,222 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import cascading.flow.hadoop.HadoopFlow +import cascading.flow.planner.BaseFlowStep +import cascading.flow.{Flow, FlowDef, FlowStepStrategy} +import cascading.pipe.Pipe +import com.twitter.scalding.estimation.memory.MemoryEstimatorStepStrategy +import com.twitter.scalding.reducer_estimation.ReducerEstimatorStepStrategy +import com.twitter.scalding.serialization.CascadingBinaryComparator +import com.twitter.scalding.typed.cascading_backend.{CascadingBackend, CascadingExtensions} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.{Logger, LoggerFactory} +import scala.collection.JavaConverters._ +import scala.concurrent.Future +import scala.util.{Failure, Success, Try} + +import CascadingExtensions._ + +/* + * This has all the state needed to build a single flow + * This is used with the implicit-arg-as-dependency-injection + * style and with the Reader-as-dependency-injection + */ +trait ExecutionContext { + def config: Config + def flowDef: FlowDef + def mode: CascadingMode + + private def getIdentifierOpt(descriptions: Seq[String]): Option[String] = + if (descriptions.nonEmpty) Some(descriptions.distinct.mkString(", ")) else None + + private def updateStepConfigWithDescriptions(step: BaseFlowStep[JobConf]): Unit = { + val conf = step.getConfig + getIdentifierOpt(ExecutionContext.getDesc(step)).foreach { descriptionString => + conf.set(Config.StepDescriptions, descriptionString) + } + } + + /** + * @return + * Success(Some(flow)) -- when everything is right and we can build a flow from flowDef Success(None) -- + * when flowDef doesn't have sinks, even after we applied pending writes Failure(exception) -- when it’s + * impossible to build a flow + */ + final def buildFlow: Try[Option[Flow[_]]] = + // For some horrible reason, using Try( ) instead of the below gets me stuck: + // [error] + // /Users/oscar/workspace/scalding/scalding-core/src/main/scala/com/twitter/scalding/Execution.scala:92: + // type mismatch; + // 
[error] found : cascading.flow.Flow[_] + // [error] required: cascading.flow.Flow[?0(in method buildFlow)] where type ?0(in method + // buildFlow) + // [error] Note: Any >: ?0, but Java-defined trait Flow is invariant in type Config. + // [error] You may wish to investigate a wildcard type such as `_ >: ?0`. (SLS 3.2.10) + // [error] (resultT, Try(mode.newFlowConnector(finalConf).connect(newFlowDef))) + try { + // Set the name: + def withCounterSuffix(name: String): String = + config.getScaldingFlowCounterValue match { + case None => name + case Some(counter) => + s"$name (execution-step $counter)" + } + + val name: Option[String] = Option(flowDef.getName) + .orElse(config.getCascadingAppName) + .orElse(config.getScaldingExecutionId) + .map(withCounterSuffix(_)) + + name.foreach(flowDef.setName) + + // Do the optimization of the typed pipes, and register them + CascadingBackend.planTypedWrites(flowDef, mode) + + // We can have empty flowDef even after applying pending writers + if (flowDef.getSinks.isEmpty) { + Success(None) + } else { + // identify the flowDef + val configWithId = config.addUniqueId(UniqueID.getIDFor(flowDef)) + val flow = mode.newFlowConnector(configWithId).connect(flowDef) + + config.getRequireOrderedSerializationMode.map { mode => + // This will throw, but be caught by the outer try if + // we have groupby/cogroupby not using OrderedSerializations + CascadingBinaryComparator.checkForOrderedSerialization(flow, mode).get + } + + flow match { + case hadoopFlow: HadoopFlow => + val flowSteps = hadoopFlow.getFlowSteps.asScala + flowSteps.foreach { case baseFlowStep: BaseFlowStep[JobConf] => + updateStepConfigWithDescriptions(baseFlowStep) + } + case _ => // descriptions not yet supported in other modes + } + + // if any reducer estimators have been set, register the step strategy + // which instantiates and runs them + mode match { + case _: HadoopMode => + val reducerEstimatorStrategy: Seq[FlowStepStrategy[JobConf]] = config + 
.get(Config.ReducerEstimators) + .toList + .map(_ => ReducerEstimatorStepStrategy) + val memoryEstimatorStrategy: Seq[FlowStepStrategy[JobConf]] = config + .get(Config.MemoryEstimators) + .toList + .map(_ => MemoryEstimatorStepStrategy) + + val otherStrategies: Seq[FlowStepStrategy[JobConf]] = config.getFlowStepStrategies.map { + case Success(fn) => fn(mode, configWithId) + case Failure(e) => + throw new Exception("Failed to decode flow step strategy when submitting job", e) + } + + val optionalFinalStrategy = FlowStepStrategies() + .sumOption(reducerEstimatorStrategy ++ memoryEstimatorStrategy ++ otherStrategies) + + optionalFinalStrategy.foreach { strategy => + flow.setFlowStepStrategy(strategy) + } + + config.getFlowListeners.foreach { + case Success(fn) => flow.addListener(fn(mode, configWithId)) + case Failure(e) => throw new Exception("Failed to decode flow listener", e) + } + + config.getFlowStepListeners.foreach { + case Success(fn) => flow.addStepListener(fn(mode, configWithId)) + case Failure(e) => new Exception("Failed to decode flow step listener when submitting job", e) + } + case _: CascadingLocal => + config.getFlowStepStrategies.foreach { + case Success(fn) => flow.setFlowStepStrategy(fn(mode, configWithId)) + case Failure(e) => + throw new Exception("Failed to decode flow step strategy when submitting job", e) + } + + config.getFlowListeners.foreach { + case Success(fn) => flow.addListener(fn(mode, configWithId)) + case Failure(e) => throw new Exception("Failed to decode flow listener", e) + } + + config.getFlowStepListeners.foreach { + case Success(fn) => flow.addStepListener(fn(mode, configWithId)) + case Failure(e) => new Exception("Failed to decode flow step listener when submitting job", e) + } + + case _ => () + } + Success(Option(flow)) + } + } catch { + case err: Throwable => Failure(err) + } + + /** + * Asynchronously execute the plan currently contained in the FlowDef + */ + final def run: Future[JobStats] = + buildFlow match { + case 
Success(Some(flow)) => Execution.run(flow) + case Success(None) => Future.successful(JobStats.empty) + case Failure(err) => Future.failed(err) + } + + /** + * Synchronously execute the plan in the FlowDef + */ + final def waitFor: Try[JobStats] = + buildFlow.flatMap { + case Some(flow) => Execution.waitFor(flow) + case None => Success(JobStats.empty) + } +} + +/* + * import ExecutionContext._ + * is generally needed to use the ExecutionContext as the single + * dependency injected. For instance, TypedPipe needs FlowDef and Mode + * in many cases, so if you have an implicit ExecutionContext, you need + * modeFromImplicit, etc... below. + */ +object ExecutionContext { + private val LOG: Logger = LoggerFactory.getLogger(ExecutionContext.getClass) + + private[scalding] def getDesc[T](baseFlowStep: BaseFlowStep[T]): Seq[String] = + baseFlowStep.getGraph.vertexSet.asScala.flatMap { + case pipe: Pipe => RichPipe.getPipeDescriptions(pipe) + case _ => List() // no descriptions + }(collection.breakOut) + /* + * implicit val ec = ExecutionContext.newContext(config) + * can be used inside of a Job to get an ExecutionContext if you want + * to call a function that requires an implicit ExecutionContext + */ + def newContext(conf: Config)(implicit fd: FlowDef, m: Mode): ExecutionContext = + new ExecutionContext { + def config = conf + def flowDef = fd + def mode = CascadingMode.cast(m) + } + + implicit def modeFromContext(implicit ec: ExecutionContext): Mode = ec.mode + implicit def flowDefFromContext(implicit ec: ExecutionContext): FlowDef = ec.flowDef +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala new file mode 100644 index 0000000000..4f79353175 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/ExecutionUtil.scala @@ -0,0 +1,78 @@ +package com.twitter.scalding + +import com.twitter.algebird.Semigroup + +object ExecutionUtil { + + /** + * Generate a 
list of executions from a date range + * + * @param duration + * Duration to split daterange + * @param fn + * Function to run a execution given a date range + * @return + * Sequence of Executions per Day + */ + def executionsFromDates[T](duration: Duration)(fn: DateRange => Execution[T])(implicit + dr: DateRange + ): Seq[Execution[T]] = + dr.each(duration).map(fn).toSeq + + /** + * Split a DateRange and allow for max parallel running of executions + * + * @param duration + * Duration to split daterange + * @param parallelism + * How many jobs to run in parallel + * @param fn + * Function to run a execution given a date range + * @return + * Seq of Dates split by Duration with corresponding execution result + */ + def runDatesWithParallelism[T](duration: Duration, parallelism: Int = 1)( + fn: DateRange => Execution[T] + )(implicit dr: DateRange): Execution[Seq[(DateRange, T)]] = { + + val dates = dr.each(duration).toSeq + Execution.withParallelism(dates.map(fn), parallelism).map(e => dates.zip(e)) + } + + /** + * Split a DateRange and allow for max parallel running of executions + * + * @param duration + * Duration to split daterange + * @param parallelism + * How many jobs to run in parallel + * @param fn + * Function to run a execution given a date range + * @return + * Execution of Sequences + */ + def runDateRangeWithParallelism[T](duration: Duration, parallelism: Int = 1)(fn: DateRange => Execution[T])( + implicit dr: DateRange + ): Execution[Seq[T]] = + runDatesWithParallelism(duration, parallelism)(fn).map(_.map { case (_, t) => t }) + + /** + * Same as runDateRangeWithParallelism, but sums the sequence of values after running. This is useful when + * you want to do a calculation in parallel over many durations and join the results together. + * + * For example, a common use case is when T is a TypedPipe[U] and you want to independently compute the + * pipes on each day and union them into a single TypedPipe at the end. 
+ * + * Another possible use case would be if the executions were created by summing intermediate monoids (e.g. T + * was a Map[String,HLL] since algebird supports monoids for maps and hll) and you wanted to do a final + * aggregation of the Monoids computed for each duration. + */ + def runDateRangeWithParallelismSum[T](duration: Duration, parallelism: Int = 1)( + fn: DateRange => Execution[T] + )(implicit dr: DateRange, semigroup: Semigroup[T]): Execution[T] = { + require(dr.each(duration).nonEmpty, "Date Range can not be empty") + + runDateRangeWithParallelism(duration, parallelism)(fn)(dr) + .map(_.reduceLeft[T] { case (l, r) => Semigroup.plus(l, r) }) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala index a772afa169..0e25e732c8 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FieldConversions.scala @@ -12,32 +12,29 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.Fields - -import scala.collection.JavaConversions._ - import cascading.pipe.Pipe -import scala.annotation.tailrec -import java.util.Comparator - import com.esotericsoftware.kryo.DefaultSerializer +import java.util.Comparator +import scala.annotation.tailrec +import scala.collection.JavaConverters._ + trait LowPriorityFieldConversions { protected def anyToFieldArg(f: Any): Comparable[_] = f match { - case x: Symbol => x.name - case y: String => y + case x: Symbol => x.name + case y: String => y case z: java.lang.Integer => z case v: Enumeration#Value => v.toString - case fld: Field[_] => fld.id + case fld: Field[_] => fld.id case flds: Fields => { if (flds.size == 1) { flds.get(0) - } - else { + } else { throw new Exception("Cannot convert Fields(" + flds.toString + ") to a single fields arg") } } @@ -45,19 +42,16 @@ trait LowPriorityFieldConversions { } /** - * Handles treating any TupleN as a Fields object. - * This is low priority because List is also a Product, but this method - * will not work for List (because List is Product2(head, tail) and so - * productIterator won't work as expected. - * Lists are handled by an implicit in FieldConversions, which have - * higher priority. - */ - implicit def productToFields( f : Product ) = { - val fields = new Fields(f.productIterator.map { anyToFieldArg }.toSeq :_* ) - f.productIterator.foreach { _ match { + * Handles treating any TupleN as a Fields object. This is low priority because List is also a Product, but + * this method will not work for List (because List is Product2(head, tail) and so productIterator won't + * work as expected. Lists are handled by an implicit in FieldConversions, which have higher priority. 
+ */ + implicit def productToFields(f: Product): Fields = { + val fields = new Fields(f.productIterator.map(anyToFieldArg).toSeq: _*) + f.productIterator.foreach { case field: Field[_] => fields.setComparator(field.id, field.ord) - case _ => - }} + case _ => + } fields } } @@ -65,148 +59,140 @@ trait LowPriorityFieldConversions { trait FieldConversions extends LowPriorityFieldConversions { // Cascading Fields are either java.lang.String or java.lang.Integer, both are comparable. - def asList(f : Fields) : List[Comparable[_]] = { - f.iterator.toList.asInstanceOf[List[Comparable[_]]] - } + def asList(f: Fields): List[Comparable[_]] = + f.iterator.asScala.toList.asInstanceOf[List[Comparable[_]]] // Cascading Fields are either java.lang.String or java.lang.Integer, both are comparable. - def asSet(f : Fields) : Set[Comparable[_]] = asList(f).toSet + def asSet(f: Fields): Set[Comparable[_]] = asList(f).toSet // TODO get the comparator also - def getField(f : Fields, idx : Int) : Fields = { new Fields(f.get(idx)) } + def getField(f: Fields, idx: Int): Fields = new Fields(f.get(idx)) - def hasInts(f : Fields): Boolean = f.iterator.exists { _.isInstanceOf[java.lang.Integer] } + def hasInts(f: Fields): Boolean = f.iterator.asScala.exists(_.isInstanceOf[java.lang.Integer]) /** - * Rather than give the full power of cascading's selectors, we have - * a simpler set of rules encoded below: - * 1) if the input is non-definite (ALL, GROUP, ARGS, etc...) ALL is the output. - * Perhaps only fromFields=ALL will make sense - * 2) If one of from or to is a strict super set of the other, SWAP is used. - * 3) If they are equal, REPLACE is used. - * 4) Otherwise, ALL is used. 
- */ - def defaultMode(fromFields : Fields, toFields : Fields) : Fields = { - if(toFields.isArguments) { - //In this case we replace the input with the output + * Rather than give the full power of cascading's selectors, we have a simpler set of rules encoded below: + * 1) if the input is non-definite (ALL, GROUP, ARGS, etc...) ALL is the output. Perhaps only fromFields=ALL + * will make sense 2) If one of from or to is a strict super set of the other, SWAP is used. 3) If they are + * equal, REPLACE is used. 4) Otherwise, ALL is used. + */ + def defaultMode(fromFields: Fields, toFields: Fields): Fields = + if (toFields.isArguments || (fromFields.isAll && toFields.isAll)) { + // 1. In this case we replace the input with the output or: + // 2. if you go from all to all, you must mean replace (ALL would fail at the cascading layer) Fields.REPLACE - } - else if( fromFields.size == 0 ) { - //This is all the UNKNOWN, ALL, etc... + } else if (fromFields.size == 0) { + // This is all the UNKNOWN, ALL, etc... Fields.ALL - } - else { + } else { val fromSet = asSet(fromFields) val toSet = asSet(toFields) (fromSet.subsetOf(toSet), toSet.subsetOf(fromSet)) match { - case (true, true) => Fields.REPLACE //equal - case (true, false) => Fields.SWAP //output super set, replaces input - case (false, true) => Fields.SWAP //throw away some input + case (true, true) => Fields.REPLACE // equal + case (true, false) => Fields.SWAP // output super set, replaces input + case (false, true) => Fields.SWAP // throw away some input /* - * the next case is that they are disjoint or have some nontrivial intersection - * if disjoint, everything is fine. - * if they intersect, it is ill-defined and cascading is going to throw an error BEFORE - * starting the flow. - */ + * the next case is that they are disjoint or have some nontrivial intersection + * if disjoint, everything is fine. + * if they intersect, it is ill-defined and cascading is going to throw an error BEFORE + * starting the flow. 
+ */ case (false, false) => Fields.ALL } } - } - //Single entry fields: - implicit def unitToFields(u : Unit) = Fields.NONE - implicit def intToFields(x : Int) = new Fields(new java.lang.Integer(x)) - implicit def integerToFields(x : java.lang.Integer) = new Fields(x) - implicit def stringToFields(x : String) = new Fields(x) - implicit def enumValueToFields(x : Enumeration#Value) = new Fields(x.toString) + // Single entry fields: + implicit def unitToFields(u: Unit): Fields = Fields.NONE // linter:ignore + implicit def intToFields(x: Int): Fields = new Fields(new java.lang.Integer(x)) + implicit def integerToFields(x: java.lang.Integer): Fields = new Fields(x) + implicit def stringToFields(x: String): Fields = new Fields(x) + implicit def enumValueToFields(x: Enumeration#Value): Fields = new Fields(x.toString) + /** - * '* means Fields.ALL, otherwise we take the .name - */ - implicit def symbolToFields(x : Symbol) = { - if(x == '*) { + * '* means Fields.ALL, otherwise we take the .name + */ + implicit def symbolToFields(x: Symbol): Fields = + if (x == '*) { Fields.ALL - } - else { + } else { new Fields(x.name) } - } - implicit def fieldToFields(f : Field[_]) = RichFields(f) + implicit def fieldToFields(f: Field[_]): RichFields = RichFields(f) @tailrec - final def newSymbol(avoid : Set[Symbol], guess : Symbol, trial : Int = 0) : Symbol = { + final def newSymbol(avoid: Set[Symbol], guess: Symbol, trial: Int = 0): Symbol = if (!avoid(guess)) { - //We are good: + // We are good: guess - } - else if (0 == trial) { + } else if (trial == 0) { newSymbol(avoid, guess, 1) - } - else { + } else { val newGuess = Symbol(guess.name + trial.toString) if (!avoid(newGuess)) { newGuess - } - else { + } else { newSymbol(avoid, guess, trial + 1) } } - } - final def ensureUniqueFields(left : Fields, right : Fields, rightPipe : Pipe) : (Fields, Pipe) = { + final def ensureUniqueFields(left: Fields, right: Fields, rightPipe: Pipe): (Fields, Pipe) = { val leftSet = asSet(left) val 
collisions = asSet(left) & asSet(right) if (collisions.isEmpty) { (right, rightPipe) - } - else { + } else { // Rename the collisions with random integer names: - val leftSetSyms = leftSet.map { f => Symbol(f.toString) } - val (_,reversedRename) = asList(right).map { f => Symbol(f.toString) } + val leftSetSyms = leftSet.map(f => Symbol(f.toString)) + val (_, reversedRename) = asList(right) + .map(f => Symbol(f.toString)) .foldLeft((leftSetSyms, List[Symbol]())) { (takenRename, name) => - val (taken, renames) = takenRename - val newName = newSymbol(taken, name) - (taken + newName, newName :: renames) - } + val (taken, renames) = takenRename + val newName = newSymbol(taken, name) + (taken + newName, newName :: renames) + } val newRight = fields(reversedRename.reverse) // We pushed in as a stack, so we need to reverse - (newRight, RichPipe(rightPipe).rename( right -> newRight )) + (newRight, RichPipe(rightPipe).rename(right -> newRight)) } } /** - * Multi-entry fields. This are higher priority than Product conversions so - * that List will not conflict with Product. + * Multi-entry fields. This are higher priority than Product conversions so that List will not conflict with + * Product. 
*/ implicit def fromEnum[T <: Enumeration](enumeration: T): Fields = - new Fields(enumeration.values.toList.map { _.toString } : _* ) + new Fields(enumeration.values.toList.map(_.toString): _*) + + implicit def fields[T <: TraversableOnce[Symbol]](f: T): Fields = new Fields(f.toSeq.map(_.name): _*) + implicit def strFields[T <: TraversableOnce[String]](f: T): Fields = new Fields(f.toSeq: _*) + implicit def intFields[T <: TraversableOnce[Int]](f: T): Fields = + new Fields(f.toSeq.map(new java.lang.Integer(_)): _*) + implicit def fieldFields[T <: TraversableOnce[Field[_]]](f: T): RichFields = RichFields(f.toSeq) - implicit def fields[T <: TraversableOnce[Symbol]](f : T) = new Fields(f.toSeq.map(_.name) : _*) - implicit def strFields[T <: TraversableOnce[String]](f : T) = new Fields(f.toSeq : _*) - implicit def intFields[T <: TraversableOnce[Int]](f : T) = { - new Fields(f.toSeq.map { new java.lang.Integer(_) } : _*) - } - implicit def fieldFields[T <: TraversableOnce[Field[_]]](f : T) = RichFields(f.toSeq) /** - * Useful to convert f : Any* to Fields. This handles mixed cases ("hey", 'you). - * Not sure we should be this flexible, but given that Cascading will throw an - * exception before scheduling the job, I guess this is okay. - */ - implicit def parseAnySeqToFields[T <: TraversableOnce[Any]](anyf : T) = { - val fields = new Fields(anyf.toSeq.map { anyToFieldArg } : _* ) - anyf.foreach { _ match { + * Useful to convert f : Any* to Fields. This handles mixed cases ("hey", 'you). Not sure we should be this + * flexible, but given that Cascading will throw an exception before scheduling the job, I guess this is + * okay. 
+ */ + implicit def parseAnySeqToFields[T <: TraversableOnce[Any]](anyf: T): Fields = { + val fields = new Fields(anyf.toSeq.map(anyToFieldArg): _*) + anyf.foreach { case field: Field[_] => fields.setComparator(field.id, field.ord) - case _ => - }} + case _ => + } fields } - //Handle a pair generally: - implicit def tuple2ToFieldsPair[T,U]( pair : (T,U) ) - (implicit tf : T => Fields, uf : U => Fields) : (Fields,Fields) = { + // Handle a pair generally: + implicit def tuple2ToFieldsPair[T, U]( + pair: (T, U) + )(implicit tf: T => Fields, uf: U => Fields): (Fields, Fields) = { val f1 = tf(pair._1) val f2 = uf(pair._2) (f1, f2) } - /** We can't set the field Manifests because cascading doesn't (yet) expose field type information - * in the Fields API. + + /** + * We can't set the field Manifests because cascading doesn't (yet) expose field type information in the + * Fields API. */ implicit def fieldsToRichFields(fields: Fields): RichFields = { if (!fields.isDefined) { @@ -226,13 +212,15 @@ trait FieldConversions extends LowPriorityFieldConversions { // "one at a time" by querying for a specific index, while the Comparators are only // available "all at once" by calling getComparators.) 
- new RichFields(asList(fields).zip(fields.getComparators).map { - case (id: Comparable[_], comparator: Comparator[_]) => id match { - case x: java.lang.Integer => IntField(x)(Ordering.comparatorToOrdering(comparator), None) - case y: String => StringField(y)(Ordering.comparatorToOrdering(comparator), None) - case z => sys.error("not expecting object of type " + z.getClass + " as field name") + new RichFields( + asList(fields).zip(fields.getComparators).map { case (id: Comparable[_], comparator: Comparator[_]) => + id match { + case x: java.lang.Integer => IntField(x)(Ordering.comparatorToOrdering(comparator), None) + case y: String => StringField(y)(Ordering.comparatorToOrdering(comparator), None) + case z => sys.error("not expecting object of type " + z.getClass + " as field name") + } } - }) + ) } } @@ -243,7 +231,7 @@ trait FieldConversions extends LowPriorityFieldConversions { // val myFields: Fields = ... // myFields.toFieldList -case class RichFields(val toFieldList : List[Field[_]]) extends Fields(toFieldList.map { _.id } : _*) { +case class RichFields(val toFieldList: List[Field[_]]) extends Fields(toFieldList.map(_.id): _*) { toFieldList.foreach { field: Field[_] => setComparator(field.id, field.ord) } } @@ -254,19 +242,32 @@ object RichFields { } sealed trait Field[T] extends java.io.Serializable { - def id : Comparable[_] - def ord : Ordering[T] - def mf : Option[Manifest[T]] + def id: Comparable[_] + def ord: Ordering[T] + def mf: Option[Manifest[T]] } @DefaultSerializer(classOf[serialization.IntFieldSerializer]) -case class IntField[T](override val id: java.lang.Integer)(implicit override val ord : Ordering[T], override val mf : Option[Manifest[T]]) extends Field[T] +final case class IntField[T](override val id: java.lang.Integer)(implicit + override val ord: Ordering[T], + override val mf: Option[Manifest[T]] +) extends Field[T] @DefaultSerializer(classOf[serialization.StringFieldSerializer]) -case class StringField[T](override val id: 
String)(implicit override val ord : Ordering[T], override val mf : Option[Manifest[T]]) extends Field[T] +final case class StringField[T](override val id: String)(implicit + override val ord: Ordering[T], + override val mf: Option[Manifest[T]] +) extends Field[T] object Field { - def apply[T](index: Int)(implicit ord : Ordering[T], mf : Manifest[T]) = IntField[T](index)(ord, Some(mf)) - def apply[T](name: String)(implicit ord : Ordering[T], mf : Manifest[T]) = StringField[T](name)(ord, Some(mf)) - def apply[T](symbol: Symbol)(implicit ord : Ordering[T], mf : Manifest[T]) = StringField[T](symbol.name)(ord, Some(mf)) + def apply[T](index: Int)(implicit ord: Ordering[T], mf: Manifest[T]) = IntField[T](index)(ord, Some(mf)) + def apply[T](name: String)(implicit ord: Ordering[T], mf: Manifest[T]) = StringField[T](name)(ord, Some(mf)) + def apply[T](symbol: Symbol)(implicit ord: Ordering[T], mf: Manifest[T]) = + StringField[T](symbol.name)(ord, Some(mf)) + + def singleOrdered[T](name: String)(implicit ord: Ordering[T]): Fields = { + val f = new Fields(name) + f.setComparator(name, ord) + f + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala index 3da6cd9092..e321a6801c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FileSource.scala @@ -12,35 +12,30 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import java.io.{InputStream, OutputStream} -import java.util.{UUID, Properties} - +import java.util.{Properties, UUID} import cascading.scheme.Scheme -import cascading.scheme.local.{TextLine => CLTextLine, TextDelimited => CLTextDelimited} import cascading.scheme.hadoop.{ - TextLine => CHTextLine, + SequenceFile => CHSequenceFile, TextDelimited => CHTextDelimited, - SequenceFile => CHSequenceFile + TextLine => CHTextLine } +import cascading.scheme.local.{TextDelimited => CLTextDelimited, TextLine => CLTextLine} import cascading.tap.hadoop.Hfs -import cascading.tap.MultiSourceTap -import cascading.tap.SinkMode -import cascading.tap.Tap +import cascading.tap.{MultiSourceTap, SinkMode, Tap} import cascading.tap.local.FileTap import cascading.tuple.Fields - import com.etsy.cascading.tap.local.LocalTap - +import com.twitter.algebird.{MapAlgebra, OrVal} +import com.twitter.scalding.tap.ScaldingHfs import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileStatus, PathFilter, Path} -import org.apache.hadoop.mapred.JobConf -import org.apache.hadoop.mapred.OutputCollector -import org.apache.hadoop.mapred.RecordReader - -import scala.util.control.Exception.allCatch +import org.apache.hadoop.fs.{FileStatus, Path, PathFilter} +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} +import org.slf4j.LoggerFactory +import scala.util.{Failure, Success, Try} /** * A base class for sources that take a scheme trait. @@ -49,30 +44,61 @@ abstract class SchemedSource extends Source { /** The scheme to use if the source is local. */ def localScheme: Scheme[Properties, InputStream, OutputStream, _, _] = - sys.error("Cascading local mode not supported for: " + toString) + throw ModeException("Cascading local mode not supported for: " + toString) /** The scheme to use if the source is on hdfs. 
*/ - def hdfsScheme: Scheme[JobConf,RecordReader[_,_],OutputCollector[_,_],_,_] = - sys.error("Cascading Hadoop mode not supported for: " + toString) + def hdfsScheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _] = + throw ModeException("Cascading Hadoop mode not supported for: " + toString) // The mode to use for output taps determining how conflicts with existing output are handled. val sinkMode: SinkMode = SinkMode.REPLACE } +trait HfsTapProvider { + def createHfsTap( + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], + path: String, + sinkMode: SinkMode + ): Hfs = + new ScaldingHfs(scheme, path, sinkMode) +} + +private[scalding] object CastFileTap { + // The scala compiler has problems with the generics in Cascading + def apply(tap: FileTap): Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]] = + tap.asInstanceOf[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] +} + /** * A trait which provides a method to create a local tap. */ trait LocalSourceOverride extends SchemedSource { + /** A path to use for the local tap. */ - def localPath: String + def localPaths: Iterable[String] + + // By default, we write to the last path for local paths + def localWritePath: String = localPaths.last /** * Creates a local tap. * - * @param sinkMode The mode for handling output conflicts. - * @returns A tap. + * @param sinkMode + * The mode for handling output conflicts. + * @return + * A tap. 
*/ - def createLocalTap(sinkMode : SinkMode) : Tap[_,_,_] = new FileTap(localScheme, localPath, sinkMode) + def createLocalTap(sinkMode: SinkMode): Tap[JobConf, _, _] = { + val taps = localPaths.map { p: String => + CastFileTap(new FileTap(localScheme, p, sinkMode)) + }.toList + + taps match { + case Nil => throw new InvalidSourceException("LocalPaths is empty") + case oneTap :: Nil => oneTap + case many => new ScaldingMultiSourceTap(many) + } + } } object HiddenFileFilter extends PathFilter { @@ -83,7 +109,7 @@ object HiddenFileFilter extends PathFilter { } object SuccessFileFilter extends PathFilter { - def accept(p: Path) = { p.getName == "_SUCCESS" } + def accept(p: Path) = p.getName == "_SUCCESS" } object AcceptAllPathFilter extends PathFilter { @@ -91,89 +117,189 @@ object AcceptAllPathFilter extends PathFilter { } object FileSource { + val LOG = LoggerFactory.getLogger(this.getClass) + + private[this] def verboseLogEnabled(conf: Configuration): Boolean = + conf.getBoolean(Config.VerboseFileSourceLoggingKey, false) + + private[this] def ifVerboseLog(conf: Configuration)(msgFn: => String): Unit = + if (verboseLogEnabled(conf)) { + val stack = Thread.currentThread.getStackTrace.iterator + .drop(2) // skip getStackTrace and ifVerboseLog + .mkString("\n") + + // evaluate call by name param once + val msg = msgFn + + LOG.info(s""" + |***FileSource Verbose Log*** + |$stack + | + |$msg + """.stripMargin) + } - def glob(glob: String, conf: Configuration, filter: PathFilter = AcceptAllPathFilter): Iterable[FileStatus] = { + def glob( + glob: String, + conf: Configuration, + filter: PathFilter = AcceptAllPathFilter + ): Iterable[FileStatus] = { val path = new Path(glob) - Option(path.getFileSystem(conf).globStatus(path, filter)).map { - _.toIterable // convert java Array to scala Iterable - } getOrElse { - Iterable.empty - } + Option(path.getFileSystem(conf).globStatus(path, filter)) + .map { + _.toIterable // convert java Array to scala Iterable + } + .getOrElse { + 
Iterable.empty + } } /** - * @return whether globPath contains non hidden files + * @return + * whether globPath contains non hidden files */ - def globHasNonHiddenPaths(globPath : String, conf : Configuration): Boolean = { - !glob(globPath, conf, HiddenFileFilter).isEmpty + def globHasNonHiddenPaths(globPath: String, conf: Configuration): Boolean = { + val res = glob(globPath, conf, HiddenFileFilter) + + ifVerboseLog(conf) { + val allFiles = glob(globPath, conf, AcceptAllPathFilter).mkString("\n") + val matched = res.mkString("\n") + s""" + |globHasNonHiddenPaths: + |globPath: $globPath + |all files matching globPath, using HiddenFileFilter: + |$matched + |all files matching globPath, w/o filtering: + |$allFiles + """.stripMargin + } + + res.nonEmpty } /** - * @return whether globPath contains a _SUCCESS file + * @return + * whether globPath contains a _SUCCESS file */ - def globHasSuccessFile(globPath : String, conf : Configuration): Boolean = { - !glob(globPath, conf, SuccessFileFilter).isEmpty - } + def globHasSuccessFile(globPath: String, conf: Configuration): Boolean = + allGlobFilesWithSuccess(globPath, conf, hiddenFilter = false) + /** + * Determines whether each file in the glob has a _SUCCESS sibling file in the same directory + * @param globPath + * path to check + * @param conf + * Hadoop Configuration to create FileSystem + * @param hiddenFilter + * true, if only non-hidden files are checked + * @return + * true if the directory has files after filters are applied + */ + def allGlobFilesWithSuccess(globPath: String, conf: Configuration, hiddenFilter: Boolean): Boolean = { + // Produce tuples (dirName, hasSuccess, hasNonHidden) keyed by dir + // + val usedDirs = glob(globPath, conf, AcceptAllPathFilter) + .map { fileStatus: FileStatus => + // stringify Path for Semigroup + val dir = + if (fileStatus.isDirectory) + fileStatus.getPath.toString + else + fileStatus.getPath.getParent.toString + + // HiddenFileFilter should better be called non-hidden but it 
borrows its name from the + // private field name in hadoop FileInputFormat + // + dir -> ( + OrVal(SuccessFileFilter.accept(fileStatus.getPath) && fileStatus.isFile), + OrVal(HiddenFileFilter.accept(fileStatus.getPath)) + ) + } + + // OR by key + val uniqueUsedDirs = MapAlgebra + .sumByKey(usedDirs) + .filter { case (_, (_, hasNonHidden)) => (!hiddenFilter || hasNonHidden.get) } + + // there is at least one valid path, and all paths have success + // + uniqueUsedDirs.nonEmpty && uniqueUsedDirs.forall { case (_, (hasSuccess, _)) => + hasSuccess.get + } + } } /** -* This is a base class for File-based sources -*/ -abstract class FileSource extends SchemedSource with LocalSourceOverride { + * This is a base class for File-based sources + */ +abstract class FileSource extends SchemedSource with LocalSourceOverride with HfsTapProvider { /** - * Determines if a path is 'valid' for this source. In strict mode all paths must be valid. - * In non-strict mode, all invalid paths will be filtered out. + * Determines if a path is 'valid' for this source. In strict mode all paths must be valid. In non-strict + * mode, all invalid paths will be filtered out. * * Subclasses can override this to validate paths. * - * The default implementation is a quick sanity check to look for missing or empty directories. - * It is necessary but not sufficient -- there are cases where this will return true but there is - * in fact missing data. + * The default implementation is a quick sanity check to look for missing or empty directories. It is + * necessary but not sufficient -- there are cases where this will return true but there is in fact missing + * data. * * TODO: consider writing a more in-depth version of this method in [[TimePathedSource]] that looks for * TODO: missing days / hours etc. 
*/ - protected def pathIsGood(p : String, conf : Configuration) = FileSource.globHasNonHiddenPaths(p, conf) + protected def pathIsGood(globPattern: String, conf: Configuration) = + if (conf.getBoolean("scalding.require_success_file", false)) { + FileSource.allGlobFilesWithSuccess(globPattern, conf, true) + } else { + FileSource.globHasNonHiddenPaths(globPattern, conf) + } - def hdfsPaths : Iterable[String] + def hdfsPaths: Iterable[String] // By default, we write to the LAST path returned by hdfsPaths - def hdfsWritePath = hdfsPaths.last + def hdfsWritePath: String = hdfsPaths.last - override def createTap(readOrWrite : AccessMode)(implicit mode : Mode) : Tap[_,_,_] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = mode match { // TODO support strict in Local case Local(_) => { - createLocalTap(sinkMode) - } - case hdfsMode @ Hdfs(_, _) => readOrWrite match { - case Read => createHdfsReadTap(hdfsMode) - case Write => CastHfsTap(new Hfs(hdfsScheme, hdfsWritePath, sinkMode)) + readOrWrite match { + case Read => createLocalTap(sinkMode) + case Write => new FileTap(localScheme, localWritePath, sinkMode) + } } - case _ => { - allCatch.opt( - TestTapFactory(this, hdfsScheme, sinkMode) - ).map { - _.createTap(readOrWrite) // these java types are invariant, so we cast here - .asInstanceOf[Tap[Any, Any, Any]] + case hdfsMode @ Hdfs(_, _) => + readOrWrite match { + case Read => createHdfsReadTap(hdfsMode) + case Write => CastHfsTap(createHfsTap(hdfsScheme, hdfsWritePath, sinkMode)) } - .orElse { - allCatch.opt( - TestTapFactory(this, localScheme.getSourceFields, sinkMode) - ).map { + case _ => { + val tryTtp = Try(TestTapFactory(this, hdfsScheme, sinkMode)) + .map { + // these java types are invariant, so we cast here _.createTap(readOrWrite) - .asInstanceOf[Tap[Any, Any, Any]] + .asInstanceOf[Tap[Any, Any, Any]] } - }.getOrElse(sys.error("Failed to create a tap for: " + toString)) + .orElse { + Try(TestTapFactory(this, 
localScheme.getSourceFields, sinkMode)).map { + _.createTap(readOrWrite) + .asInstanceOf[Tap[Any, Any, Any]] + } + } + + tryTtp match { + case Success(s) => s + case Failure(e) => + throw new java.lang.IllegalArgumentException( + s"Failed to create tap for: $toString, with error: ${e.getMessage}", + e + ) + } } } - } // This is only called when Mode.sourceStrictness is true - protected def hdfsReadPathsAreGood(conf : Configuration) = { - hdfsPaths.forall { pathIsGood(_, conf) } - } + protected def hdfsReadPathsAreGood(conf: Configuration) = + hdfsPaths.forall(pathIsGood(_, conf)) /* * This throws InvalidSourceException if: @@ -181,46 +307,53 @@ abstract class FileSource extends SchemedSource with LocalSourceOverride { * 2) we are not in the above, but some source has no input whatsoever * TODO this only does something for HDFS now. Maybe we should do the same for LocalMode */ - override def validateTaps(mode : Mode) : Unit = { + override def validateTaps(mode: Mode): Unit = mode match { case Hdfs(strict, conf) => { if (strict && (!hdfsReadPathsAreGood(conf))) { throw new InvalidSourceException( "[" + this.toString + "] Data is missing from one or more paths in: " + - hdfsPaths.toString) + hdfsPaths.toString + ) + } else if (!hdfsPaths.exists(pathIsGood(_, conf))) { + // Check that there is at least one good path: + throw new InvalidSourceException("[" + this.toString + "] No good paths in: " + hdfsPaths.toString) } - else if (!hdfsPaths.exists { pathIsGood(_, conf) }) { - //Check that there is at least one good path: - throw new InvalidSourceException( - "[" + this.toString + "] No good paths in: " + hdfsPaths.toString) + } + + case Local(strict) => { + val files = localPaths.map(p => new java.io.File(p)) + if (strict && !files.forall(_.exists)) { + throw new InvalidSourceException("[" + this.toString + s"] Data is missing from: ${localPaths + .filterNot(p => new java.io.File(p).exists)}") + } else if (!files.exists(_.exists)) { + throw new 
InvalidSourceException("[" + this.toString + "] No good paths in: " + hdfsPaths.toString) } } case _ => () } - } /* * Get all the set of valid paths based on source strictness. */ - protected def goodHdfsPaths(hdfsMode : Hdfs) = { + protected def goodHdfsPaths(hdfsMode: Hdfs): Iterable[String] = hdfsMode match { - //we check later that all the paths are good + // we check later that all the paths are good case Hdfs(true, _) => hdfsPaths // If there are no matching paths, this is still an error, we need at least something: - case Hdfs(false, conf) => hdfsPaths.filter{ pathIsGood(_, conf) } + case Hdfs(false, conf) => hdfsPaths.filter(pathIsGood(_, conf)) } - } - protected def createHdfsReadTap(hdfsMode : Hdfs) : Tap[JobConf, _, _] = { - val taps : List[Tap[JobConf, RecordReader[_,_], OutputCollector[_,_]]] = - goodHdfsPaths(hdfsMode) - .toList.map { path => CastHfsTap(new Hfs(hdfsScheme, path, sinkMode)) } + protected def createHdfsReadTap(hdfsMode: Hdfs): Tap[JobConf, _, _] = { + val taps: List[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] = + goodHdfsPaths(hdfsMode).toList.map(path => CastHfsTap(createHfsTap(hdfsScheme, path, sinkMode))) taps.size match { case 0 => { // This case is going to result in an error, but we don't want to throw until - // validateTaps, so we just put a dummy path to return something so the - // Job constructor does not fail. - CastHfsTap(new Hfs(hdfsScheme, hdfsPaths.head, sinkMode)) + // validateTaps. Return an InvalidSource here so the Job constructor does not fail. + // In the worst case if the flow plan is misconfigured, + // openForRead on mappers should fail when using this tap. 
+ new InvalidSourceTap(hdfsPaths) } case 1 => taps.head case _ => new ScaldingMultiSourceTap(taps) @@ -228,39 +361,48 @@ abstract class FileSource extends SchemedSource with LocalSourceOverride { } } -class ScaldingMultiSourceTap(taps : Seq[Tap[JobConf, RecordReader[_,_], OutputCollector[_,_]]]) - extends MultiSourceTap[Tap[JobConf, RecordReader[_,_], OutputCollector[_,_]], JobConf, RecordReader[_,_]](taps : _*) { +class ScaldingMultiSourceTap(taps: Seq[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]]) + extends MultiSourceTap[ + Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + JobConf, + RecordReader[_, _] + ](taps: _*) { private final val randomId = UUID.randomUUID.toString override def getIdentifier() = randomId + override def hashCode: Int = randomId.hashCode } /** -* The fields here are ('offset, 'line) -*/ -trait TextLineScheme extends SchemedSource with Mappable[String] { - override def converter[U >: String] = TupleConverter.asSuperConverter[String, U](TupleConverter.of[String]) - override def localScheme = new CLTextLine(new Fields("offset","line"), Fields.ALL, textEncoding) - override def hdfsScheme = HadoopSchemeInstance(new CHTextLine(CHTextLine.DEFAULT_SOURCE_FIELDS, textEncoding)) - //In textline, 0 is the byte position, the actual text string is in column 1 - override def sourceFields = Dsl.intFields(Seq(1)) + * The fields here are ('offset, 'line) + */ +trait TextSourceScheme extends SchemedSource { // The text-encoding to use when writing out the lines (default is UTF-8). 
val textEncoding: String = CHTextLine.DEFAULT_CHARSET + + override def localScheme = new CLTextLine(new Fields("offset", "line"), Fields.ALL, textEncoding) + override def hdfsScheme = HadoopSchemeInstance( + new CHTextLine(CHTextLine.DEFAULT_SOURCE_FIELDS, textEncoding) + ) +} + +trait TextLineScheme extends TextSourceScheme with SingleMappable[String] { + // In textline, 0 is the byte position, the actual text string is in column 1 + override def sourceFields = Dsl.intFields(Seq(1)) } /** -* Mix this in for delimited schemes such as TSV or one-separated values -* By default, TSV is given -*/ + * Mix this in for delimited schemes such as TSV or one-separated values By default, TSV is given + */ trait DelimitedScheme extends SchemedSource { - //override these as needed: + // override these as needed: val fields = Fields.ALL - //This is passed directly to cascading where null is interpretted as string - val types : Array[Class[_]] = null + // This is passed directly to cascading where null is interpretted as string + val types: Array[Class[_]] = null val separator = "\t" val skipHeader = false val writeHeader = false - val quote : String = null + val quote: String = null // Whether to throw an exception or not if the number of fields does not match an expected number. // If set to false, missing fields will be set to null. @@ -270,91 +412,137 @@ trait DelimitedScheme extends SchemedSource { // If set to false, then fields that cannot be coerced will be set to null. 
val safe = true - //These should not be changed: - override def localScheme = new CLTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe) + // These should not be changed: + override def localScheme = + new CLTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe) override def hdfsScheme = { - HadoopSchemeInstance(new CHTextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe)) + assert( + types == null || fields.size == types.size, + "Fields [" + fields + "] of different size than types array [" + types.mkString(",") + "]" + ) + HadoopSchemeInstance( + new CHTextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe) + ) } } trait SequenceFileScheme extends SchemedSource { - //override these as needed: + // override these as needed: val fields = Fields.ALL // TODO Cascading doesn't support local mode yet - override def hdfsScheme = { - HadoopSchemeInstance(new CHSequenceFile(fields)) - } + override def hdfsScheme = HadoopSchemeInstance(new CHSequenceFile(fields)) } /** - * Ensures that a _SUCCESS file is present in the Source path, which must be a glob, - * as well as the requirements of [[FileSource.pathIsGood]] + * Ensures that a _SUCCESS file is present in every directory included by a glob, as well as the requirements + * of [[FileSource.pathIsGood]]. The set of directories to check for _SUCCESS is determined by examining the + * list of all paths returned by globPaths and adding parent directories of the non-hidden files encountered. + * pathIsGood should still be considered just a best-effort test. As an illustration the following layout with + * an in-flight job is accepted for the glob dir*/*:

 dir1/_temporary dir2/file1 dir2/_SUCCESS 
+ * + * Similarly if dir1 is physically empty pathIsGood is still true for dir*/* above + * + * On the other hand it will reject an empty output directory of a finished job:
 dir1/_SUCCESS 
*/ trait SuccessFileSource extends FileSource { - override protected def pathIsGood(p: String, conf: Configuration) = { - FileSource.globHasNonHiddenPaths(p, conf) && FileSource.globHasSuccessFile(p, conf) - } + override protected def pathIsGood(p: String, conf: Configuration) = + FileSource.allGlobFilesWithSuccess(p, conf, true) } /** - * Use this class to add support for Cascading local mode via the Hadoop tap. - * Put another way, this runs a Hadoop tap outside of Hadoop in the Cascading local mode + * Use this class to add support for Cascading local mode via the Hadoop tap. Put another way, this runs a + * Hadoop tap outside of Hadoop in the Cascading local mode */ trait LocalTapSource extends LocalSourceOverride { - override def createLocalTap(sinkMode : SinkMode) = new LocalTap(localPath, hdfsScheme, sinkMode).asInstanceOf[Tap[_, _, _]] + override def createLocalTap(sinkMode: SinkMode): Tap[JobConf, _, _] = { + val taps = localPaths.map { p => + new LocalTap(p, hdfsScheme, sinkMode) + .asInstanceOf[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] + }.toSeq + + taps match { + case Nil => throw new InvalidSourceException("LocalPaths is empty") + case oneTap :: Nil => oneTap + case many => new ScaldingMultiSourceTap(many) + } + } } -abstract class FixedPathSource(path : String*) extends FileSource { - def localPath = { assert(path.size == 1, "Cannot use multiple input files on local mode"); path(0) } - def hdfsPaths = path.toList - override def toString = getClass.getName + path +abstract class FixedPathSource(path: String*) extends FileSource { + override def localPaths: Iterable[String] = path.toList + override def hdfsPaths: Iterable[String] = path.toList + + // `toString` is used by equals in JobTest, which causes + // problems due to unstable collection type of `path` + override def toString = getClass.getName + path.mkString("(", ",", ")") + override def hdfsWritePath: String = stripTrailing(super.hdfsWritePath) + override def hashCode = 
toString.hashCode override def equals(that: Any): Boolean = (that != null) && (that.toString == toString) + + /** + * Similar in behavior to {@link TimePathedSource.writePathFor}. Strip out the trailing slash star. + */ + protected def stripTrailing(path: String): String = { + assert(path != "*", "Path must not be *") + assert(path != "/*", "Path must not be /*") + if (path.takeRight(2) == "/*") { + path.dropRight(2) + } else { + path + } + } } /** -* Tab separated value source -*/ + * Tab separated value source + */ -case class Tsv(p : String, override val fields : Fields = Fields.ALL, - override val skipHeader : Boolean = false, override val writeHeader: Boolean = false, - override val sinkMode: SinkMode = SinkMode.REPLACE) extends FixedPathSource(p) with DelimitedScheme +case class Tsv( + p: String, + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false, + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends FixedPathSource(p) + with DelimitedScheme /** - * Allows the use of multiple Tsv input paths. The Tsv files will - * be process through your flow as if they are a single pipe. Tsv - * files must have the same schema. - * For more details on how multiple files are handled check the - * cascading docs. + * Allows the use of multiple Tsv input paths. The Tsv files will be process through your flow as if they are + * a single pipe. Tsv files must have the same schema. For more details on how multiple files are handled + * check the cascading docs. 
*/ -case class MultipleTsvFiles(p : Seq[String], override val fields : Fields = Fields.ALL, - override val skipHeader : Boolean = false, override val writeHeader: Boolean = false) extends FixedPathSource(p:_*) - with DelimitedScheme +case class MultipleTsvFiles( + p: Seq[String], + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false +) extends FixedPathSource(p: _*) + with DelimitedScheme /** -* Csv value source -* separated by commas and quotes wrapping all fields -*/ -case class Csv(p : String, - override val separator : String = ",", - override val fields : Fields = Fields.ALL, - override val skipHeader : Boolean = false, - override val writeHeader : Boolean = false, - override val quote : String ="\"", - override val sinkMode: SinkMode = SinkMode.REPLACE) extends FixedPathSource(p) with DelimitedScheme - - - + * Csv value source separated by commas and quotes wrapping all fields + */ +case class Csv( + p: String, + override val separator: String = ",", + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false, + override val quote: String = "\"", + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends FixedPathSource(p) + with DelimitedScheme /** -* One separated value (commonly used by Pig) -*/ -case class Osv(p : String, f : Fields = Fields.ALL, - override val sinkMode: SinkMode = SinkMode.REPLACE) extends FixedPathSource(p) - with DelimitedScheme { - override val fields = f - override val separator = "\1" + * One separated value (commonly used by Pig) + */ +case class Osv(p: String, f: Fields = Fields.ALL, override val sinkMode: SinkMode = SinkMode.REPLACE) + extends FixedPathSource(p) + with DelimitedScheme { + override val fields = f + override val separator = "\u0001" } object TextLine { @@ -366,29 +554,70 @@ object TextLine { new TextLine(p, sm, textEncoding) } -class TextLine(p : String, override 
val sinkMode: SinkMode, override val textEncoding: String) extends FixedPathSource(p) with TextLineScheme { +class TextLine(p: String, override val sinkMode: SinkMode, override val textEncoding: String) + extends FixedPathSource(p) + with TextLineScheme + with TypedSink[String] { // For some Java interop + def this(p: String) = this(p, TextLine.defaultSinkMode, TextLine.defaultTextEncoding) + + override def setter[U <: String] = TupleSetter.asSubSetter[String, U](TupleSetter.of[String]) +} + +/** + * Alternate typed TextLine source that keeps both 'offset and 'line fields. + */ +class OffsetTextLine(filepath: String, override val sinkMode: SinkMode, override val textEncoding: String) + extends FixedPathSource(filepath) + with Mappable[(Long, String)] + with TextSourceScheme { + + override def converter[U >: (Long, String)] = + TupleConverter.asSuperConverter[(Long, String), U](TupleConverter.of[(Long, String)]) } -case class SequenceFile(p : String, f : Fields = Fields.ALL, override val sinkMode: SinkMode = SinkMode.REPLACE) - extends FixedPathSource(p) with SequenceFileScheme with LocalTapSource { +/** + * Alternate typed TextLine source that keeps both 'offset and 'line fields. 
+ */ +object OffsetTextLine { + // Default encoding is UTF-8 + val defaultTextEncoding: String = CHTextLine.DEFAULT_CHARSET + val defaultSinkMode: SinkMode = SinkMode.REPLACE + + def apply( + p: String, + sm: SinkMode = defaultSinkMode, + textEncoding: String = defaultTextEncoding + ): OffsetTextLine = + new OffsetTextLine(p, sm, textEncoding) +} + +case class SequenceFile(p: String, f: Fields = Fields.ALL, override val sinkMode: SinkMode = SinkMode.REPLACE) + extends FixedPathSource(p) + with SequenceFileScheme + with LocalTapSource { override val fields = f } -case class MultipleSequenceFiles(p : String*) extends FixedPathSource(p:_*) with SequenceFileScheme with LocalTapSource +case class MultipleSequenceFiles(p: String*) + extends FixedPathSource(p: _*) + with SequenceFileScheme + with LocalTapSource -case class MultipleTextLineFiles(p : String*) extends FixedPathSource(p:_*) with TextLineScheme +case class MultipleTextLineFiles(p: String*) extends FixedPathSource(p: _*) with TextLineScheme /** -* Delimited files source -* allowing to override separator and quotation characters and header configuration -*/ -case class MultipleDelimitedFiles (f: Fields, - override val separator : String, - override val quote : String, - override val skipHeader : Boolean, - override val writeHeader : Boolean, - p : String*) extends FixedPathSource(p:_*) with DelimitedScheme { - override val fields = f + * Delimited files source allowing to override separator and quotation characters and header configuration + */ +case class MultipleDelimitedFiles( + f: Fields, + override val separator: String, + override val quote: String, + override val skipHeader: Boolean, + override val writeHeader: Boolean, + p: String* +) extends FixedPathSource(p: _*) + with DelimitedScheme { + override val fields = f } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala b/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala index abebadbd40..d787eed9f3 100644 --- 
a/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FlowState.scala @@ -12,85 +12,127 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.pipe.Pipe + import cascading.flow.FlowDef -import java.util.{Map => JMap, WeakHashMap} -import scala.collection.JavaConverters._ +import com.twitter.algebird.Monoid +import java.util.WeakHashMap + /** * Immutable state that we attach to the Flow using the FlowStateMap + * + * There are three kinds of things we want to attach to FlowDefs: + * + * 1) which scalding Sources are being read (sourceMap), so we can call validateTaps on each of them before we + * run (see validateSources) + * + * 2) the configuration updates that need to be applied to the Pipe instances in the Typed API (this could be + * removed by better plumbing in CascadingBackend) + * + * 3) The list of TypedPipe writes that have not yet been planned. We want to defer planning as long as + * possible so the optimizer can see as much as possible of the graph to make the best decisions. */ -case class FlowState(sourceMap: Map[String, (Source, Pipe)] = Map.empty) { - /** - * Cascading can't handle multiple head pipes with the same - * name. This handles them by caching the source and only - * having a single head pipe to represent each head. - */ - def getReadPipe(s: Source, p: => Pipe) : (FlowState, Pipe) = - sourceMap.get(s.toString) match { - case Some((src, pipe)) => - if (src.toString == s.toString && (src != s)) { - // We have seen errors with case class equals, and names so we are paranoid here: - throw new Exception( - "Duplicate Source.toString are equal, but values are not. 
May result in invalid data: " + - s.toString) - } - (this, pipe) - case None => - val newPipe = p // evaluate the call by name - (FlowState(sourceMap + (s.toString -> (s, newPipe))), newPipe) - } +case class FlowState( + sourceMap: Map[String, Source], + flowConfigUpdates: Set[(String, String)], + pendingTypedWrites: List[FlowStateMap.TypedWrite[_]] +) { + + def getSourceNamed(name: String): Option[Source] = + sourceMap.get(name) - def getSourceNamed(name : String) : Option[Source] = - sourceMap.get(name).map { _._1 } - - def validateSources(flowDef: FlowDef, mode: Mode): Unit = { - flowDef.getSources - .asInstanceOf[JMap[String,AnyRef]] - .asScala - // this is a map of (name, Tap) - .foreach { nameTap => - // Each named source must be present: - getSourceNamed(nameTap._1) - .get - // This can throw a InvalidSourceException - .validateTaps(mode) - } - } + def validateSources(mode: Mode): Unit = + // This can throw a InvalidSourceException + sourceMap.values.toSet[Source].foreach(_.validateTaps(mode)) + + def merge(that: FlowState): FlowState = + FlowState( + sourceMap = sourceMap ++ that.sourceMap, + flowConfigUpdates = flowConfigUpdates ++ that.flowConfigUpdates, + pendingTypedWrites = pendingTypedWrites ::: that.pendingTypedWrites + ) } -/** This is a mutable threadsafe store for attaching scalding - * information to the mutable flowDef +object FlowState { + val empty: FlowState = FlowState(Map.empty, Set.empty, Nil) + + def withSource(id: String, s: Source): FlowState = + FlowState(Map(id -> s), Set.empty, Nil) + + def withConfigSetting(k: String, v: String): FlowState = + FlowState(Map.empty, Set((k, v)), Nil) + + def withTypedWrite[A](p: TypedPipe[A], s: TypedSink[A], m: Mode): FlowState = + FlowState(Map.empty, Set.empty, FlowStateMap.TypedWrite(p, s, m) :: Nil) + + implicit val monoid: Monoid[FlowState] = + Monoid.from(empty)(_.merge(_)) +} + +/** + * This is a mutable threadsafe store for attaching scalding information to the mutable flowDef * - * NOTE: 
there is a subtle bug in scala regarding case classes - * with multiple sets of arguments, and their equality. - * For this reason, we use Source.toString as the key in this map + * NOTE: there is a subtle bug in scala regarding case classes with multiple sets of arguments, and their + * equality. For this reason, we use Source.sourceId as the key in this map */ object FlowStateMap { // Make sure we don't hold FlowState after the FlowDef is gone @transient private val flowMap = new WeakHashMap[FlowDef, FlowState]() - /** Function to update a state. + case class TypedWrite[T](pipe: TypedPipe[T], sink: TypedSink[T], mode: Mode) + + /** + * Function to update a state. + * + * note if fn mutates the FlowStateMap, this can easily be incorrect (you can lose a write), any mutation + * that itself mutates the FlowState is responsible for returning the correct value from fn. */ - def mutate[T](fd: FlowDef)(fn: FlowState => (FlowState, T)): T = { + private def mutate[T](fd: FlowDef)(fn: FlowState => (FlowState, T)): T = flowMap.synchronized { - val oldState = Option(flowMap.get(fd)).getOrElse(FlowState()) - val (newState, t) = fn(oldState) + val (newState, t) = fn(apply(fd)) flowMap.put(fd, newState) t } - } + + /** + * Get the FlowState or return FlowState.empty + */ + def apply(fd: FlowDef): FlowState = + get(fd).getOrElse(FlowState.empty) + def get(fd: FlowDef): Option[FlowState] = - flowMap.synchronized { Option(flowMap.get(fd)) } + flowMap.synchronized(Option(flowMap.get(fd))) def clear(fd: FlowDef): Unit = - flowMap.synchronized { flowMap.remove(fd) } + flowMap.synchronized(flowMap.remove(fd)) + + /** + * Merge a FlowState into the current one for this FlowDef and return the value before the merge + */ + def merge(fd: FlowDef, state: FlowState): FlowState = + mutate(fd) { fs => + val newFs = fs.merge(state) + (newFs, fs) + } + + /** + * Remove a set of writes (called by the cascading planner) + * + * returns the original + */ + def removeWrites(fd: FlowDef): FlowState 
= + mutate(fd)(fs => (fs.copy(pendingTypedWrites = Nil), fs)) def validateSources(flowDef: FlowDef, mode: Mode): Unit = - get(flowDef) - .getOrElse(sys.error("Could not find a flowState for flowDef: %s".format(flowDef))) - .validateSources(flowDef, mode) + /* + * We don't need to validate if there are no sources, this comes up for + * cases of no-op jobs + */ + if (!flowDef.getSources.isEmpty) { + get(flowDef) + .getOrElse(sys.error("Could not find a flowState for flowDef: %s".format(flowDef))) + .validateSources(mode) + } else () } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala b/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala index b6edfeacb4..d8a89a183e 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FoldOperations.scala @@ -12,17 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields -/** Implements reductions on top of a simple abstraction for the Fields-API - * We use the f-bounded polymorphism trick to return the type called Self - * in each operation. +/** + * Implements reductions on top of a simple abstraction for the Fields-API We use the f-bounded polymorphism + * trick to return the type called Self in each operation. */ -trait FoldOperations[+Self <: FoldOperations[Self]] extends ReduceOperations[Self] - with Sortable[Self] { +trait FoldOperations[+Self <: FoldOperations[Self]] extends ReduceOperations[Self] with Sortable[Self] { /* * prefer reduce or mapReduceMap. foldLeft will force all work to be * done on the reducers. 
If your function is not associative and @@ -31,19 +30,19 @@ trait FoldOperations[+Self <: FoldOperations[Self]] extends ReduceOperations[Sel * NOTE: init needs to be serializable with Kryo (because we copy it for each * grouping to avoid possible errors using a mutable init object). */ - def foldLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X) - (implicit setter : TupleSetter[X], conv : TupleConverter[T]) : Self + def foldLeft[X, T](fieldDef: (Fields, Fields))(init: X)( + fn: (X, T) => X + )(implicit setter: TupleSetter[X], conv: TupleConverter[T]): Self - //If there is an ordering, we need to reverse the list - override def mapList[T,R](fieldDef : (Fields, Fields))(fn : (List[T]) => R) - (implicit conv : TupleConverter[T], setter : TupleSetter[R]) : Self = { - if(sorting.isDefined) { - //the list is built in reverse order so we need to reverse it here - super.mapList[T,R](fieldDef) { l => fn(l.reverse) }(conv,setter) - } - else { + // If there is an ordering, we need to reverse the list + override def mapList[T, R]( + fieldDef: (Fields, Fields) + )(fn: (List[T]) => R)(implicit conv: TupleConverter[T], setter: TupleSetter[R]): Self = + if (sorting.isDefined) { + // the list is built in reverse order so we need to reverse it here + super.mapList[T, R](fieldDef)(l => fn(l.reverse))(conv, setter) + } else { // Ordering doesn't matter, so skip the reversal - super.mapList[T,R](fieldDef)(fn)(conv,setter) + super.mapList[T, R](fieldDef)(fn)(conv, setter) } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala b/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala index feaf7377c2..0d7b2415ef 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/FunctionImplicits.scala @@ -3,47 +3,332 @@ package com.twitter.scalding object FunctionImplicits { - implicit def function2ToTupledFunction1[T1, T2, R](f: 
Function2[T1, T2, R]): Function1[(T1, T2), R] = f.tupled + implicit def function2ToTupledFunction1[T1, T2, R](f: Function2[T1, T2, R]): Function1[(T1, T2), R] = + f.tupled - implicit def function3ToTupledFunction1[T1, T2, T3, R](f: Function3[T1, T2, T3, R]): Function1[(T1, T2, T3), R] = f.tupled + implicit def function3ToTupledFunction1[T1, T2, T3, R]( + f: Function3[T1, T2, T3, R] + ): Function1[(T1, T2, T3), R] = f.tupled - implicit def function4ToTupledFunction1[T1, T2, T3, T4, R](f: Function4[T1, T2, T3, T4, R]): Function1[(T1, T2, T3, T4), R] = f.tupled + implicit def function4ToTupledFunction1[T1, T2, T3, T4, R]( + f: Function4[T1, T2, T3, T4, R] + ): Function1[(T1, T2, T3, T4), R] = f.tupled - implicit def function5ToTupledFunction1[T1, T2, T3, T4, T5, R](f: Function5[T1, T2, T3, T4, T5, R]): Function1[(T1, T2, T3, T4, T5), R] = f.tupled + implicit def function5ToTupledFunction1[T1, T2, T3, T4, T5, R]( + f: Function5[T1, T2, T3, T4, T5, R] + ): Function1[(T1, T2, T3, T4, T5), R] = f.tupled - implicit def function6ToTupledFunction1[T1, T2, T3, T4, T5, T6, R](f: Function6[T1, T2, T3, T4, T5, T6, R]): Function1[(T1, T2, T3, T4, T5, T6), R] = f.tupled + implicit def function6ToTupledFunction1[T1, T2, T3, T4, T5, T6, R]( + f: Function6[T1, T2, T3, T4, T5, T6, R] + ): Function1[(T1, T2, T3, T4, T5, T6), R] = f.tupled - implicit def function7ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, R](f: Function7[T1, T2, T3, T4, T5, T6, T7, R]): Function1[(T1, T2, T3, T4, T5, T6, T7), R] = f.tupled + implicit def function7ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, R]( + f: Function7[T1, T2, T3, T4, T5, T6, T7, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7), R] = f.tupled - implicit def function8ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, R](f: Function8[T1, T2, T3, T4, T5, T6, T7, T8, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8), R] = f.tupled + implicit def function8ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, R]( + f: Function8[T1, T2, T3, T4, T5, T6, T7, 
T8, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8), R] = f.tupled - implicit def function9ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, R](f: Function9[T1, T2, T3, T4, T5, T6, T7, T8, T9, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9), R] = f.tupled + implicit def function9ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, R]( + f: Function9[T1, T2, T3, T4, T5, T6, T7, T8, T9, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9), R] = f.tupled - implicit def function10ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R](f: Function10[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10), R] = f.tupled + implicit def function10ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R]( + f: Function10[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10), R] = f.tupled - implicit def function11ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R](f: Function11[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11), R] = f.tupled + implicit def function11ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R]( + f: Function11[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11), R] = f.tupled - implicit def function12ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R](f: Function12[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12), R] = f.tupled + implicit def function12ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R]( + f: Function12[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12), R] = f.tupled - implicit def function13ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R](f: Function13[T1, T2, T3, T4, T5, 
T6, T7, T8, T9, T10, T11, T12, T13, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13), R] = f.tupled + implicit def function13ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R]( + f: Function13[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13), R] = f.tupled - implicit def function14ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R](f: Function14[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14), R] = f.tupled + implicit def function14ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R]( + f: Function14[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14), R] = f.tupled - implicit def function15ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, R](f: Function15[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15), R] = f.tupled + implicit def function15ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + R + ]( + f: Function15[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15), R] = f.tupled - implicit def function16ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, R](f: Function16[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16), R] = f.tupled + implicit def function16ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, 
+ T15, + T16, + R + ]( + f: Function16[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16), R] = f.tupled - implicit def function17ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R](f: Function17[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17), R] = f.tupled + implicit def function17ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + R + ]( + f: Function17[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17), R] = f.tupled - implicit def function18ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R](f: Function18[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18), R] = f.tupled + implicit def function18ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + R + ]( + f: Function18[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18), R] = + f.tupled - implicit def function19ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R](f: Function19[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19), R] = f.tupled + implicit def 
function19ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + R + ]( + f: Function19[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, R] + ): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19), R] = + f.tupled - implicit def function20ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, R](f: Function20[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20), R] = f.tupled + implicit def function20ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + R + ]( + f: Function20[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + R + ] + ): Function1[ + (T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20), + R + ] = f.tupled - implicit def function21ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, R](f: Function21[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21), R] = f.tupled + implicit def function21ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + R + ]( + f: Function21[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + R + ] + ): Function1[ + (T1, T2, T3, T4, T5, T6, T7, 
T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21), + R + ] = f.tupled - implicit def function22ToTupledFunction1[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, R](f: Function22[T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22, R]): Function1[(T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22), R] = f.tupled + implicit def function22ToTupledFunction1[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + R + ]( + f: Function22[ + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + R + ] + ): Function1[ + (T1, T2, T3, T4, T5, T6, T7, T8, T9, T10, T11, T12, T13, T14, T15, T16, T17, T18, T19, T20, T21, T22), + R + ] = f.tupled } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala index 7fb21d55df..7791367474 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedConversions.scala @@ -1,694 +1,1036 @@ -// following were autogenerated by ./scalding_gen.rb at Thu May 23 10:22:26 -0700 2013 do not edit +// following were autogenerated by ./scalding-core/codegen/scalding_gen.rb at 2022-03-26 10:07:03 -1000 do not edit package com.twitter.scalding import cascading.tuple.Tuple import cascading.tuple.TupleEntry trait GeneratedTupleConverters extends LowPriorityTupleConverters { - implicit def tuple1Converter[A](implicit - gA : TupleGetter[A]): TupleConverter[Tuple1[A]] = new TupleConverter[Tuple1[A]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple1(gA.get(tup, 0)) - } - 
def arity = 1 + case class TupleConverter1[A](gA: TupleGetter[A]) extends TupleConverter[Tuple1[A]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple1(gA.get(tup, 0)) + } + def arity = 1 } - - implicit def tuple2Converter[A,B](implicit - gA : TupleGetter[A], - gB : TupleGetter[B]): TupleConverter[Tuple2[A,B]] = new TupleConverter[Tuple2[A,B]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple2(gA.get(tup, 0), - gB.get(tup, 1)) - } - def arity = 2 + implicit def tuple1Converter[A](implicit gA: TupleGetter[A]): TupleConverter[Tuple1[A]] = TupleConverter1( + gA + ) + + case class TupleConverter2[A, B](gA: TupleGetter[A], gB: TupleGetter[B]) + extends TupleConverter[Tuple2[A, B]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple2(gA.get(tup, 0), gB.get(tup, 1)) + } + def arity = 2 } - - implicit def tuple3Converter[A,B,C](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C]): TupleConverter[Tuple3[A,B,C]] = new TupleConverter[Tuple3[A,B,C]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple3(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2)) - } - def arity = 3 + implicit def tuple2Converter[A, B](implicit + gA: TupleGetter[A], + gB: TupleGetter[B] + ): TupleConverter[Tuple2[A, B]] = TupleConverter2(gA, gB) + + case class TupleConverter3[A, B, C](gA: TupleGetter[A], gB: TupleGetter[B], gC: TupleGetter[C]) + extends TupleConverter[Tuple3[A, B, C]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple3(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2)) + } + def arity = 3 } - - implicit def tuple4Converter[A,B,C,D](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D]): TupleConverter[Tuple4[A,B,C,D]] = new TupleConverter[Tuple4[A,B,C,D]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple4(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3)) - } - def arity = 4 + implicit def tuple3Converter[A, B, 
C](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C] + ): TupleConverter[Tuple3[A, B, C]] = TupleConverter3(gA, gB, gC) + + case class TupleConverter4[A, B, C, D]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D] + ) extends TupleConverter[Tuple4[A, B, C, D]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple4(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2), gD.get(tup, 3)) + } + def arity = 4 } - - implicit def tuple5Converter[A,B,C,D,E](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E]): TupleConverter[Tuple5[A,B,C,D,E]] = new TupleConverter[Tuple5[A,B,C,D,E]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple5(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4)) - } - def arity = 5 + implicit def tuple4Converter[A, B, C, D](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D] + ): TupleConverter[Tuple4[A, B, C, D]] = TupleConverter4(gA, gB, gC, gD) + + case class TupleConverter5[A, B, C, D, E]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E] + ) extends TupleConverter[Tuple5[A, B, C, D, E]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple5(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2), gD.get(tup, 3), gE.get(tup, 4)) + } + def arity = 5 } - - implicit def tuple6Converter[A,B,C,D,E,F](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F]): TupleConverter[Tuple6[A,B,C,D,E,F]] = new TupleConverter[Tuple6[A,B,C,D,E,F]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple6(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5)) - } - def arity = 6 + implicit def tuple5Converter[A, B, C, D, 
E](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E] + ): TupleConverter[Tuple5[A, B, C, D, E]] = TupleConverter5(gA, gB, gC, gD, gE) + + case class TupleConverter6[A, B, C, D, E, F]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F] + ) extends TupleConverter[Tuple6[A, B, C, D, E, F]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple6(gA.get(tup, 0), gB.get(tup, 1), gC.get(tup, 2), gD.get(tup, 3), gE.get(tup, 4), gF.get(tup, 5)) + } + def arity = 6 } - - implicit def tuple7Converter[A,B,C,D,E,F,G](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G]): TupleConverter[Tuple7[A,B,C,D,E,F,G]] = new TupleConverter[Tuple7[A,B,C,D,E,F,G]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple7(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6)) - } - def arity = 7 + implicit def tuple6Converter[A, B, C, D, E, F](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F] + ): TupleConverter[Tuple6[A, B, C, D, E, F]] = TupleConverter6(gA, gB, gC, gD, gE, gF) + + case class TupleConverter7[A, B, C, D, E, F, G]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G] + ) extends TupleConverter[Tuple7[A, B, C, D, E, F, G]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple7( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6) + ) + } + def arity = 7 } - - implicit def tuple8Converter[A,B,C,D,E,F,G,H](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - 
gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H]): TupleConverter[Tuple8[A,B,C,D,E,F,G,H]] = new TupleConverter[Tuple8[A,B,C,D,E,F,G,H]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple8(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7)) - } - def arity = 8 + implicit def tuple7Converter[A, B, C, D, E, F, G](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G] + ): TupleConverter[Tuple7[A, B, C, D, E, F, G]] = TupleConverter7(gA, gB, gC, gD, gE, gF, gG) + + case class TupleConverter8[A, B, C, D, E, F, G, H]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H] + ) extends TupleConverter[Tuple8[A, B, C, D, E, F, G, H]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple8( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7) + ) + } + def arity = 8 } - - implicit def tuple9Converter[A,B,C,D,E,F,G,H,I](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I]): TupleConverter[Tuple9[A,B,C,D,E,F,G,H,I]] = new TupleConverter[Tuple9[A,B,C,D,E,F,G,H,I]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple9(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8)) - } - def arity = 9 + implicit def tuple8Converter[A, B, C, D, E, F, G, H](implicit + gA: TupleGetter[A], + gB: 
TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H] + ): TupleConverter[Tuple8[A, B, C, D, E, F, G, H]] = TupleConverter8(gA, gB, gC, gD, gE, gF, gG, gH) + + case class TupleConverter9[A, B, C, D, E, F, G, H, I]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I] + ) extends TupleConverter[Tuple9[A, B, C, D, E, F, G, H, I]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple9( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8) + ) + } + def arity = 9 } - - implicit def tuple10Converter[A,B,C,D,E,F,G,H,I,J](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J]): TupleConverter[Tuple10[A,B,C,D,E,F,G,H,I,J]] = new TupleConverter[Tuple10[A,B,C,D,E,F,G,H,I,J]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple10(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9)) - } - def arity = 10 + implicit def tuple9Converter[A, B, C, D, E, F, G, H, I](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I] + ): TupleConverter[Tuple9[A, B, C, D, E, F, G, H, I]] = TupleConverter9(gA, gB, gC, gD, gE, gF, gG, gH, gI) + + case class TupleConverter10[A, B, C, D, E, F, G, H, I, J]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: 
TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J] + ) extends TupleConverter[Tuple10[A, B, C, D, E, F, G, H, I, J]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple10( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9) + ) + } + def arity = 10 } - - implicit def tuple11Converter[A,B,C,D,E,F,G,H,I,J,K](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K]): TupleConverter[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] = new TupleConverter[Tuple11[A,B,C,D,E,F,G,H,I,J,K]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple11(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10)) - } - def arity = 11 + implicit def tuple10Converter[A, B, C, D, E, F, G, H, I, J](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J] + ): TupleConverter[Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleConverter10(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ) + + case class TupleConverter11[A, B, C, D, E, F, G, H, I, J, K]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K] + ) extends TupleConverter[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + def apply(te: 
TupleEntry) = { + val tup = te.getTuple + Tuple11( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10) + ) + } + def arity = 11 } - - implicit def tuple12Converter[A,B,C,D,E,F,G,H,I,J,K,L](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L]): TupleConverter[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] = new TupleConverter[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple12(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11)) - } - def arity = 12 + implicit def tuple11Converter[A, B, C, D, E, F, G, H, I, J, K](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K] + ): TupleConverter[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleConverter11(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK) + + case class TupleConverter12[A, B, C, D, E, F, G, H, I, J, K, L]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L] + ) extends TupleConverter[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple12( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), 
+ gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11) + ) + } + def arity = 12 } - - implicit def tuple13Converter[A,B,C,D,E,F,G,H,I,J,K,L,M](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M]): TupleConverter[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] = new TupleConverter[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple13(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12)) - } - def arity = 13 + implicit def tuple12Converter[A, B, C, D, E, F, G, H, I, J, K, L](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L] + ): TupleConverter[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleConverter12(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL) + + case class TupleConverter13[A, B, C, D, E, F, G, H, I, J, K, L, M]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M] + ) extends TupleConverter[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple13( + gA.get(tup, 0), + gB.get(tup, 
1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12) + ) + } + def arity = 13 } - - implicit def tuple14Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N]): TupleConverter[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] = new TupleConverter[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple14(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13)) - } - def arity = 14 + implicit def tuple13Converter[A, B, C, D, E, F, G, H, I, J, K, L, M](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M] + ): TupleConverter[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleConverter13(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM) + + case class TupleConverter14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N] + ) extends 
TupleConverter[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple14( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13) + ) + } + def arity = 14 } - - implicit def tuple15Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O]): TupleConverter[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] = new TupleConverter[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple15(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14)) - } - def arity = 15 + implicit def tuple14Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N] + ): TupleConverter[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleConverter14(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN) + + case class TupleConverter15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + 
gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O] + ) extends TupleConverter[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple15( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14) + ) + } + def arity = 15 } - - implicit def tuple16Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P]): TupleConverter[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] = new TupleConverter[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple16(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15)) - } - def arity = 16 + implicit def tuple15Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: 
TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O] + ): TupleConverter[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleConverter15(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO) + + case class TupleConverter16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P] + ) extends TupleConverter[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple16( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15) + ) + } + def arity = 16 } - - implicit def tuple17Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q]): TupleConverter[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] = new TupleConverter[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple17(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), 
- gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16)) - } - def arity = 17 + implicit def tuple16Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P] + ): TupleConverter[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleConverter16(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP) + + case class TupleConverter17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q] + ) extends TupleConverter[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple17( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16) + ) + } + def arity = 17 } - - implicit def tuple18Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], 
- gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R]): TupleConverter[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] = new TupleConverter[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple18(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17)) - } - def arity = 18 + implicit def tuple17Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q] + ): TupleConverter[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleConverter17(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ) + + case class TupleConverter18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R] + ) extends TupleConverter[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + 
Tuple18( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17) + ) + } + def arity = 18 } - - implicit def tuple19Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S]): TupleConverter[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] = new TupleConverter[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple19(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18)) - } - def arity = 19 + implicit def tuple18Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R] + ): TupleConverter[Tuple18[A, B, C, D, E, F, G, H, 
I, J, K, L, M, N, O, P, Q, R]] = + TupleConverter18(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR) + + case class TupleConverter19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S] + ) extends TupleConverter[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple19( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18) + ) + } + def arity = 19 } - - implicit def tuple20Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T]): TupleConverter[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] = new TupleConverter[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple20(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - 
gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18), - gT.get(tup, 19)) - } - def arity = 20 + implicit def tuple19Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S] + ): TupleConverter[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleConverter19(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS) + + case class TupleConverter20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T] + ) extends TupleConverter[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple20( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + 
gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18), + gT.get(tup, 19) + ) + } + def arity = 20 } - - implicit def tuple21Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T], - gU : TupleGetter[U]): TupleConverter[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] = new TupleConverter[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple21(gA.get(tup, 0), - gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18), - gT.get(tup, 19), - gU.get(tup, 20)) - } - def arity = 21 + implicit def tuple20Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T] + ): TupleConverter[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleConverter20(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, 
gL, gM, gN, gO, gP, gQ, gR, gS, gT) + + case class TupleConverter21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U] + ) extends TupleConverter[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple21( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18), + gT.get(tup, 19), + gU.get(tup, 20) + ) + } + def arity = 21 } - - implicit def tuple22Converter[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](implicit - gA : TupleGetter[A], - gB : TupleGetter[B], - gC : TupleGetter[C], - gD : TupleGetter[D], - gE : TupleGetter[E], - gF : TupleGetter[F], - gG : TupleGetter[G], - gH : TupleGetter[H], - gI : TupleGetter[I], - gJ : TupleGetter[J], - gK : TupleGetter[K], - gL : TupleGetter[L], - gM : TupleGetter[M], - gN : TupleGetter[N], - gO : TupleGetter[O], - gP : TupleGetter[P], - gQ : TupleGetter[Q], - gR : TupleGetter[R], - gS : TupleGetter[S], - gT : TupleGetter[T], - gU : TupleGetter[U], - gV : TupleGetter[V]): TupleConverter[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] = new TupleConverter[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]]{ - def apply(te : TupleEntry) = { - val tup = te.getTuple - Tuple22(gA.get(tup, 0), 
- gB.get(tup, 1), - gC.get(tup, 2), - gD.get(tup, 3), - gE.get(tup, 4), - gF.get(tup, 5), - gG.get(tup, 6), - gH.get(tup, 7), - gI.get(tup, 8), - gJ.get(tup, 9), - gK.get(tup, 10), - gL.get(tup, 11), - gM.get(tup, 12), - gN.get(tup, 13), - gO.get(tup, 14), - gP.get(tup, 15), - gQ.get(tup, 16), - gR.get(tup, 17), - gS.get(tup, 18), - gT.get(tup, 19), - gU.get(tup, 20), - gV.get(tup, 21)) - } - def arity = 22 + implicit def tuple21Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U] + ): TupleConverter[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleConverter21(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT, gU) + + case class TupleConverter22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U], + gV: TupleGetter[V] + ) extends TupleConverter[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + def apply(te: TupleEntry) = { + val tup = te.getTuple + Tuple22( + gA.get(tup, 0), + gB.get(tup, 1), + gC.get(tup, 2), + gD.get(tup, 3), + 
gE.get(tup, 4), + gF.get(tup, 5), + gG.get(tup, 6), + gH.get(tup, 7), + gI.get(tup, 8), + gJ.get(tup, 9), + gK.get(tup, 10), + gL.get(tup, 11), + gM.get(tup, 12), + gN.get(tup, 13), + gO.get(tup, 14), + gP.get(tup, 15), + gQ.get(tup, 16), + gR.get(tup, 17), + gS.get(tup, 18), + gT.get(tup, 19), + gU.get(tup, 20), + gV.get(tup, 21) + ) + } + def arity = 22 } + implicit def tuple22Converter[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V](implicit + gA: TupleGetter[A], + gB: TupleGetter[B], + gC: TupleGetter[C], + gD: TupleGetter[D], + gE: TupleGetter[E], + gF: TupleGetter[F], + gG: TupleGetter[G], + gH: TupleGetter[H], + gI: TupleGetter[I], + gJ: TupleGetter[J], + gK: TupleGetter[K], + gL: TupleGetter[L], + gM: TupleGetter[M], + gN: TupleGetter[N], + gO: TupleGetter[O], + gP: TupleGetter[P], + gQ: TupleGetter[Q], + gR: TupleGetter[R], + gS: TupleGetter[S], + gT: TupleGetter[T], + gU: TupleGetter[U], + gV: TupleGetter[V] + ): TupleConverter[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleConverter22(gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO, gP, gQ, gR, gS, gT, gU, gV) } trait GeneratedTupleSetters extends LowPriorityTupleSetters { - implicit def tup1Setter[Z <: Tuple1[_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter1[Z <: Tuple1[_]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(1) tup.set(0, arg._1) tup } - override def arity = 1 } + implicit def tup1Setter[Z <: Tuple1[_]]: TupleSetter[Z] = TupleSetter1[Z]() - implicit def tup2Setter[Z <: Tuple2[_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter2[Z <: Tuple2[_, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(2) tup.set(0, arg._1) tup.set(1, arg._2) tup } - override def arity = 2 } + implicit def tup2Setter[Z <: Tuple2[_, _]]: TupleSetter[Z] = TupleSetter2[Z]() - implicit def tup3Setter[Z <: Tuple3[_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { 
+ case class TupleSetter3[Z <: Tuple3[_, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(3) tup.set(0, arg._1) @@ -696,11 +1038,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(2, arg._3) tup } - override def arity = 3 } + implicit def tup3Setter[Z <: Tuple3[_, _, _]]: TupleSetter[Z] = TupleSetter3[Z]() - implicit def tup4Setter[Z <: Tuple4[_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter4[Z <: Tuple4[_, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(4) tup.set(0, arg._1) @@ -709,11 +1051,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(3, arg._4) tup } - override def arity = 4 } + implicit def tup4Setter[Z <: Tuple4[_, _, _, _]]: TupleSetter[Z] = TupleSetter4[Z]() - implicit def tup5Setter[Z <: Tuple5[_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter5[Z <: Tuple5[_, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(5) tup.set(0, arg._1) @@ -723,11 +1065,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(4, arg._5) tup } - override def arity = 5 } + implicit def tup5Setter[Z <: Tuple5[_, _, _, _, _]]: TupleSetter[Z] = TupleSetter5[Z]() - implicit def tup6Setter[Z <: Tuple6[_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter6[Z <: Tuple6[_, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(6) tup.set(0, arg._1) @@ -738,11 +1080,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(5, arg._6) tup } - override def arity = 6 } + implicit def tup6Setter[Z <: Tuple6[_, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter6[Z]() - implicit def tup7Setter[Z <: Tuple7[_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter7[Z <: Tuple7[_, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val 
tup = Tuple.size(7) tup.set(0, arg._1) @@ -754,11 +1096,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(6, arg._7) tup } - override def arity = 7 } + implicit def tup7Setter[Z <: Tuple7[_, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter7[Z]() - implicit def tup8Setter[Z <: Tuple8[_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter8[Z <: Tuple8[_, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(8) tup.set(0, arg._1) @@ -771,11 +1113,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(7, arg._8) tup } - override def arity = 8 } + implicit def tup8Setter[Z <: Tuple8[_, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter8[Z]() - implicit def tup9Setter[Z <: Tuple9[_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter9[Z <: Tuple9[_, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(9) tup.set(0, arg._1) @@ -789,11 +1131,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(8, arg._9) tup } - override def arity = 9 } + implicit def tup9Setter[Z <: Tuple9[_, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter9[Z]() - implicit def tup10Setter[Z <: Tuple10[_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter10[Z <: Tuple10[_, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(10) tup.set(0, arg._1) @@ -808,11 +1150,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(9, arg._10) tup } - override def arity = 10 } + implicit def tup10Setter[Z <: Tuple10[_, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter10[Z]() - implicit def tup11Setter[Z <: Tuple11[_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter11[Z <: Tuple11[_, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { 
override def apply(arg: Z) = { val tup = Tuple.size(11) tup.set(0, arg._1) @@ -828,11 +1170,11 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(10, arg._11) tup } - override def arity = 11 } + implicit def tup11Setter[Z <: Tuple11[_, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = TupleSetter11[Z]() - implicit def tup12Setter[Z <: Tuple12[_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter12[Z <: Tuple12[_, _, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(12) tup.set(0, arg._1) @@ -849,11 +1191,12 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(11, arg._12) tup } - override def arity = 12 } + implicit def tup12Setter[Z <: Tuple12[_, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter12[Z]() - implicit def tup13Setter[Z <: Tuple13[_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter13[Z <: Tuple13[_, _, _, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(13) tup.set(0, arg._1) @@ -871,11 +1214,12 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(12, arg._13) tup } - override def arity = 13 } + implicit def tup13Setter[Z <: Tuple13[_, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter13[Z]() - implicit def tup14Setter[Z <: Tuple14[_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter14[Z <: Tuple14[_, _, _, _, _, _, _, _, _, _, _, _, _, _]]() extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(14) tup.set(0, arg._1) @@ -894,11 +1238,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(13, arg._14) tup } - override def arity = 14 } + implicit def tup14Setter[Z <: Tuple14[_, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter14[Z]() - implicit def tup15Setter[Z 
<: Tuple15[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter15[Z <: Tuple15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(15) tup.set(0, arg._1) @@ -918,11 +1264,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(14, arg._15) tup } - override def arity = 15 } + implicit def tup15Setter[Z <: Tuple15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter15[Z]() - implicit def tup16Setter[Z <: Tuple16[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter16[Z <: Tuple16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(16) tup.set(0, arg._1) @@ -943,11 +1291,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(15, arg._16) tup } - override def arity = 16 } + implicit def tup16Setter[Z <: Tuple16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter16[Z]() - implicit def tup17Setter[Z <: Tuple17[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter17[Z <: Tuple17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(17) tup.set(0, arg._1) @@ -969,11 +1319,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(16, arg._17) tup } - override def arity = 17 } + implicit def tup17Setter[Z <: Tuple17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]: TupleSetter[Z] = + TupleSetter17[Z]() - implicit def tup18Setter[Z <: Tuple18[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter18[Z <: Tuple18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(18) 
tup.set(0, arg._1) @@ -996,11 +1348,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(17, arg._18) tup } - override def arity = 18 } + implicit def tup18Setter[Z <: Tuple18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter18[Z]() - implicit def tup19Setter[Z <: Tuple19[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter19[Z <: Tuple19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(19) tup.set(0, arg._1) @@ -1024,11 +1378,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(18, arg._19) tup } - override def arity = 19 } + implicit def tup19Setter[Z <: Tuple19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter19[Z]() - implicit def tup20Setter[Z <: Tuple20[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter20[Z <: Tuple20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(20) tup.set(0, arg._1) @@ -1053,11 +1409,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(19, arg._20) tup } - override def arity = 20 } + implicit def tup20Setter[Z <: Tuple20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter20[Z]() - implicit def tup21Setter[Z <: Tuple21[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter21[Z <: Tuple21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(21) tup.set(0, arg._1) @@ -1083,11 +1441,13 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(20, arg._21) tup } - override def arity = 21 } + implicit def 
tup21Setter[Z <: Tuple21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter21[Z]() - implicit def tup22Setter[Z <: Tuple22[_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_]]: TupleSetter[Z] = new TupleSetter[Z] { + case class TupleSetter22[Z <: Tuple22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]]() + extends TupleSetter[Z] { override def apply(arg: Z) = { val tup = Tuple.size(22) tup.set(0, arg._1) @@ -1114,8 +1474,361 @@ trait GeneratedTupleSetters extends LowPriorityTupleSetters { tup.set(21, arg._22) tup } - override def arity = 22 } + implicit def tup22Setter[Z <: Tuple22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]] + : TupleSetter[Z] = TupleSetter22[Z]() + + def converterFromSetter[A](ts: TupleSetter[A], gtc: GeneratedTupleConverters): Option[TupleConverter[A]] = + (ts match { + case TupleSetter1() => Some(gtc.TupleConverter1(TupleGetter.Casting())) + case TupleSetter2() => Some(gtc.TupleConverter2(TupleGetter.Casting(), TupleGetter.Casting())) + case TupleSetter3() => + Some(gtc.TupleConverter3(TupleGetter.Casting(), TupleGetter.Casting(), TupleGetter.Casting())) + case TupleSetter4() => + Some( + gtc.TupleConverter4( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter5() => + Some( + gtc.TupleConverter5( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter6() => + Some( + gtc.TupleConverter6( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter7() => + Some( + gtc.TupleConverter7( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter8() => + 
Some( + gtc.TupleConverter8( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter9() => + Some( + gtc.TupleConverter9( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter10() => + Some( + gtc.TupleConverter10( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter11() => + Some( + gtc.TupleConverter11( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter12() => + Some( + gtc.TupleConverter12( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter13() => + Some( + gtc.TupleConverter13( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter14() => + Some( + gtc.TupleConverter14( + 
TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter15() => + Some( + gtc.TupleConverter15( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter16() => + Some( + gtc.TupleConverter16( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter17() => + Some( + gtc.TupleConverter17( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter18() => + Some( + gtc.TupleConverter18( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + 
TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter19() => + Some( + gtc.TupleConverter19( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter20() => + Some( + gtc.TupleConverter20( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter21() => + Some( + gtc.TupleConverter21( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case TupleSetter22() 
=> + Some( + gtc.TupleConverter22( + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting(), + TupleGetter.Casting() + ) + ) + case _ => None + }).asInstanceOf[Option[TupleConverter[A]]] + } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala index 6840300165..76308c2d6f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedMappable.scala @@ -5,88 +5,132 @@ trait Mappable1[A] extends Mappable[Tuple1[A]] { def converter[Z >: Tuple1[A]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple1[A]]) } -trait Mappable2[A,B] extends Mappable[Tuple2[A,B]] { - def converter[Z >: Tuple2[A,B]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple2[A,B]]) +trait Mappable2[A, B] extends Mappable[Tuple2[A, B]] { + def converter[Z >: Tuple2[A, B]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple2[A, B]]) } -trait Mappable3[A,B,C] extends Mappable[Tuple3[A,B,C]] { - def converter[Z >: Tuple3[A,B,C]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple3[A,B,C]]) +trait Mappable3[A, B, C] extends Mappable[Tuple3[A, B, C]] { + def converter[Z >: Tuple3[A, B, C]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple3[A, B, C]]) } -trait Mappable4[A,B,C,D] extends Mappable[Tuple4[A,B,C,D]] { - def converter[Z >: Tuple4[A,B,C,D]] = 
TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A,B,C,D]]) +trait Mappable4[A, B, C, D] extends Mappable[Tuple4[A, B, C, D]] { + def converter[Z >: Tuple4[A, B, C, D]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A, B, C, D]]) } -trait Mappable5[A,B,C,D,E] extends Mappable[Tuple5[A,B,C,D,E]] { - def converter[Z >: Tuple5[A,B,C,D,E]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A,B,C,D,E]]) +trait Mappable5[A, B, C, D, E] extends Mappable[Tuple5[A, B, C, D, E]] { + def converter[Z >: Tuple5[A, B, C, D, E]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A, B, C, D, E]]) } -trait Mappable6[A,B,C,D,E,F] extends Mappable[Tuple6[A,B,C,D,E,F]] { - def converter[Z >: Tuple6[A,B,C,D,E,F]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A,B,C,D,E,F]]) +trait Mappable6[A, B, C, D, E, F] extends Mappable[Tuple6[A, B, C, D, E, F]] { + def converter[Z >: Tuple6[A, B, C, D, E, F]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A, B, C, D, E, F]]) } -trait Mappable7[A,B,C,D,E,F,G] extends Mappable[Tuple7[A,B,C,D,E,F,G]] { - def converter[Z >: Tuple7[A,B,C,D,E,F,G]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A,B,C,D,E,F,G]]) +trait Mappable7[A, B, C, D, E, F, G] extends Mappable[Tuple7[A, B, C, D, E, F, G]] { + def converter[Z >: Tuple7[A, B, C, D, E, F, G]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A, B, C, D, E, F, G]]) } -trait Mappable8[A,B,C,D,E,F,G,H] extends Mappable[Tuple8[A,B,C,D,E,F,G,H]] { - def converter[Z >: Tuple8[A,B,C,D,E,F,G,H]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A,B,C,D,E,F,G,H]]) +trait Mappable8[A, B, C, D, E, F, G, H] extends Mappable[Tuple8[A, B, C, D, E, F, G, H]] { + def converter[Z >: Tuple8[A, B, C, D, E, F, G, H]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A, B, C, D, E, F, G, H]]) } -trait Mappable9[A,B,C,D,E,F,G,H,I] extends Mappable[Tuple9[A,B,C,D,E,F,G,H,I]] { - def converter[Z >: 
Tuple9[A,B,C,D,E,F,G,H,I]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A,B,C,D,E,F,G,H,I]]) +trait Mappable9[A, B, C, D, E, F, G, H, I] extends Mappable[Tuple9[A, B, C, D, E, F, G, H, I]] { + def converter[Z >: Tuple9[A, B, C, D, E, F, G, H, I]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) } -trait Mappable10[A,B,C,D,E,F,G,H,I,J] extends Mappable[Tuple10[A,B,C,D,E,F,G,H,I,J]] { - def converter[Z >: Tuple10[A,B,C,D,E,F,G,H,I,J]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A,B,C,D,E,F,G,H,I,J]]) +trait Mappable10[A, B, C, D, E, F, G, H, I, J] extends Mappable[Tuple10[A, B, C, D, E, F, G, H, I, J]] { + def converter[Z >: Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) } -trait Mappable11[A,B,C,D,E,F,G,H,I,J,K] extends Mappable[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] { - def converter[Z >: Tuple11[A,B,C,D,E,F,G,H,I,J,K]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A,B,C,D,E,F,G,H,I,J,K]]) +trait Mappable11[A, B, C, D, E, F, G, H, I, J, K] extends Mappable[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + def converter[Z >: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) } -trait Mappable12[A,B,C,D,E,F,G,H,I,J,K,L] extends Mappable[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] { - def converter[Z >: Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]]) +trait Mappable12[A, B, C, D, E, F, G, H, I, J, K, L] + extends Mappable[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + def converter[Z >: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) } -trait Mappable13[A,B,C,D,E,F,G,H,I,J,K,L,M] extends Mappable[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] { - def converter[Z >: 
Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]]) +trait Mappable13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends Mappable[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + def converter[Z >: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) } -trait Mappable14[A,B,C,D,E,F,G,H,I,J,K,L,M,N] extends Mappable[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] { - def converter[Z >: Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]]) +trait Mappable14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends Mappable[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + def converter[Z >: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) } -trait Mappable15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O] extends Mappable[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] { - def converter[Z >: Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]]) +trait Mappable15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends Mappable[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + def converter[Z >: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) } -trait Mappable16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P] extends Mappable[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] { - def converter[Z >: Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]]) +trait Mappable16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends Mappable[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + def 
converter[Z >: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] + ) } -trait Mappable17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q] extends Mappable[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] { - def converter[Z >: Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]]) +trait Mappable17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends Mappable[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + def converter[Z >: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] + ) } -trait Mappable18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R] extends Mappable[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] { - def converter[Z >: Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]]) +trait Mappable18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends Mappable[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + def converter[Z >: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] + ) } -trait Mappable19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S] extends Mappable[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] { - def converter[Z >: Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]]) +trait Mappable19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends Mappable[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + def converter[Z >: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, 
N, O, P, Q, R, S]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] + ) } -trait Mappable20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T] extends Mappable[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] { - def converter[Z >: Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]]) +trait Mappable20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends Mappable[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + def converter[Z >: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + ) } -trait Mappable21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U] extends Mappable[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] { - def converter[Z >: Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]]) +trait Mappable21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends Mappable[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + def converter[Z >: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + ) } -trait Mappable22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V] extends Mappable[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] { - def converter[Z >: Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]]) +trait Mappable22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends Mappable[Tuple22[A, B, C, D, E, F, 
G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + def converter[Z >: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + ) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala index b86694ae74..8c9a866e46 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GeneratedTupleAdders.scala @@ -2,1160 +2,4631 @@ package com.twitter.scalding trait GeneratedTupleAdders { - class Tuple1Adder[A](tup : Tuple1[A]) { - def :+[B](other : B) = { - (tup._1,other) - } - def +:[B](other : B) = { - (other,tup._1) - } - - def ++[B](other : Tuple1[B]) = { - (tup._1,other._1) - } - - def ++[B,C](other : Tuple2[B,C]) = { - (tup._1,other._1,other._2) - } - - def ++[B,C,D](other : Tuple3[B,C,D]) = { - (tup._1,other._1,other._2,other._3) - } - - def ++[B,C,D,E](other : Tuple4[B,C,D,E]) = { - (tup._1,other._1,other._2,other._3,other._4) - } - - def ++[B,C,D,E,F](other : Tuple5[B,C,D,E,F]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5) - } - - def ++[B,C,D,E,F,G](other : Tuple6[B,C,D,E,F,G]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[B,C,D,E,F,G,H](other : Tuple7[B,C,D,E,F,G,H]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[B,C,D,E,F,G,H,I](other : Tuple8[B,C,D,E,F,G,H,I]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[B,C,D,E,F,G,H,I,J](other : Tuple9[B,C,D,E,F,G,H,I,J]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[B,C,D,E,F,G,H,I,J,K](other : Tuple10[B,C,D,E,F,G,H,I,J,K]) = { - 
(tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L](other : Tuple11[B,C,D,E,F,G,H,I,J,K,L]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M](other : Tuple12[B,C,D,E,F,G,H,I,J,K,L,M]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N](other : Tuple13[B,C,D,E,F,G,H,I,J,K,L,M,N]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O](other : Tuple14[B,C,D,E,F,G,H,I,J,K,L,M,N,O]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P](other : Tuple15[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q](other : Tuple16[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](other : Tuple17[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple18[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) = { - 
(tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple19[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18,other._19) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple20[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18,other._19,other._20) - } - - def ++[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple21[B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18,other._19,other._20,other._21) - } + class Tuple1Adder[A](tup: Tuple1[A]) { + def :+[B](other: B) = + (tup._1, other) + def +:[B](other: B) = + (other, tup._1) + + def ++[B](other: Tuple1[B]) = + (tup._1, other._1) + + def ++[B, C](other: Tuple2[B, C]) = + (tup._1, other._1, other._2) + + def ++[B, C, D](other: Tuple3[B, C, D]) = + (tup._1, other._1, other._2, other._3) + + def ++[B, C, D, E](other: Tuple4[B, C, D, E]) = + (tup._1, other._1, other._2, other._3, other._4) + + def ++[B, C, D, E, F](other: Tuple5[B, C, D, E, F]) = + (tup._1, other._1, other._2, other._3, other._4, other._5) + + def ++[B, C, D, E, F, G](other: Tuple6[B, C, D, E, F, G]) = + (tup._1, other._1, other._2, other._3, other._4, other._5, other._6) + + def ++[B, C, D, E, F, G, H](other: Tuple7[B, C, D, E, F, G, H]) = + (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7) + + def 
++[B, C, D, E, F, G, H, I](other: Tuple8[B, C, D, E, F, G, H, I]) = + (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) + + def ++[B, C, D, E, F, G, H, I, J](other: Tuple9[B, C, D, E, F, G, H, I, J]) = + (tup._1, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8, other._9) + + def ++[B, C, D, E, F, G, H, I, J, K](other: Tuple10[B, C, D, E, F, G, H, I, J, K]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L](other: Tuple11[B, C, D, E, F, G, H, I, J, K, L]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M](other: Tuple12[B, C, D, E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N](other: Tuple13[B, C, D, E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + other: Tuple14[B, C, D, E, F, G, H, I, J, K, L, M, N, O] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + other: Tuple15[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + 
) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + other: Tuple16[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple17[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple18[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple19[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple20[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19, + other._20 + ) + + def ++[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, 
V]( + other: Tuple21[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19, + other._20, + other._21 + ) } - implicit def tup1ToAdder[A](tup : Tuple1[A]) = new Tuple1Adder(tup) - - class Tuple2Adder[A,B](tup : Tuple2[A,B]) { - def :+[C](other : C) = { - (tup._1,tup._2,other) - } - def +:[C](other : C) = { - (other,tup._1,tup._2) - } - - def ++[C](other : Tuple1[C]) = { - (tup._1,tup._2,other._1) - } - - def ++[C,D](other : Tuple2[C,D]) = { - (tup._1,tup._2,other._1,other._2) - } - - def ++[C,D,E](other : Tuple3[C,D,E]) = { - (tup._1,tup._2,other._1,other._2,other._3) - } - - def ++[C,D,E,F](other : Tuple4[C,D,E,F]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4) - } - - def ++[C,D,E,F,G](other : Tuple5[C,D,E,F,G]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5) - } - - def ++[C,D,E,F,G,H](other : Tuple6[C,D,E,F,G,H]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[C,D,E,F,G,H,I](other : Tuple7[C,D,E,F,G,H,I]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[C,D,E,F,G,H,I,J](other : Tuple8[C,D,E,F,G,H,I,J]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[C,D,E,F,G,H,I,J,K](other : Tuple9[C,D,E,F,G,H,I,J,K]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[C,D,E,F,G,H,I,J,K,L](other : Tuple10[C,D,E,F,G,H,I,J,K,L]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M](other : Tuple11[C,D,E,F,G,H,I,J,K,L,M]) = { - 
(tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N](other : Tuple12[C,D,E,F,G,H,I,J,K,L,M,N]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O](other : Tuple13[C,D,E,F,G,H,I,J,K,L,M,N,O]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P](other : Tuple14[C,D,E,F,G,H,I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q](other : Tuple15[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](other : Tuple16[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple17[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple18[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : 
Tuple19[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18,other._19) - } - - def ++[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple20[C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18,other._19,other._20) - } + implicit def tup1ToAdder[A](tup: Tuple1[A]): Tuple1Adder[A] = new Tuple1Adder(tup) + + class Tuple2Adder[A, B](tup: Tuple2[A, B]) { + def :+[C](other: C) = + (tup._1, tup._2, other) + def +:[C](other: C) = + (other, tup._1, tup._2) + + def ++[C](other: Tuple1[C]) = + (tup._1, tup._2, other._1) + + def ++[C, D](other: Tuple2[C, D]) = + (tup._1, tup._2, other._1, other._2) + + def ++[C, D, E](other: Tuple3[C, D, E]) = + (tup._1, tup._2, other._1, other._2, other._3) + + def ++[C, D, E, F](other: Tuple4[C, D, E, F]) = + (tup._1, tup._2, other._1, other._2, other._3, other._4) + + def ++[C, D, E, F, G](other: Tuple5[C, D, E, F, G]) = + (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5) + + def ++[C, D, E, F, G, H](other: Tuple6[C, D, E, F, G, H]) = + (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6) + + def ++[C, D, E, F, G, H, I](other: Tuple7[C, D, E, F, G, H, I]) = + (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7) + + def ++[C, D, E, F, G, H, I, J](other: Tuple8[C, D, E, F, G, H, I, J]) = + (tup._1, tup._2, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) + + def ++[C, D, E, F, G, H, I, J, K](other: Tuple9[C, D, E, F, G, H, I, J, K]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[C, 
D, E, F, G, H, I, J, K, L](other: Tuple10[C, D, E, F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M](other: Tuple11[C, D, E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N](other: Tuple12[C, D, E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple13[C, D, E, F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + other: Tuple14[C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + other: Tuple15[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple16[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + 
other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple17[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple18[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple19[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19 + ) + + def ++[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple20[C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19, + other._20 + ) } - implicit def tup2ToAdder[A,B](tup : Tuple2[A,B]) = new Tuple2Adder(tup) - - class Tuple3Adder[A,B,C](tup : Tuple3[A,B,C]) { - def :+[D](other : D) = { - (tup._1,tup._2,tup._3,other) - } - def +:[D](other : D) = { - (other,tup._1,tup._2,tup._3) - } - - def ++[D](other : Tuple1[D]) = { - 
(tup._1,tup._2,tup._3,other._1) - } - - def ++[D,E](other : Tuple2[D,E]) = { - (tup._1,tup._2,tup._3,other._1,other._2) - } - - def ++[D,E,F](other : Tuple3[D,E,F]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3) - } - - def ++[D,E,F,G](other : Tuple4[D,E,F,G]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4) - } - - def ++[D,E,F,G,H](other : Tuple5[D,E,F,G,H]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5) - } - - def ++[D,E,F,G,H,I](other : Tuple6[D,E,F,G,H,I]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[D,E,F,G,H,I,J](other : Tuple7[D,E,F,G,H,I,J]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[D,E,F,G,H,I,J,K](other : Tuple8[D,E,F,G,H,I,J,K]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[D,E,F,G,H,I,J,K,L](other : Tuple9[D,E,F,G,H,I,J,K,L]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[D,E,F,G,H,I,J,K,L,M](other : Tuple10[D,E,F,G,H,I,J,K,L,M]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N](other : Tuple11[D,E,F,G,H,I,J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O](other : Tuple12[D,E,F,G,H,I,J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O,P](other : Tuple13[D,E,F,G,H,I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def 
++[D,E,F,G,H,I,J,K,L,M,N,O,P,Q](other : Tuple14[D,E,F,G,H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](other : Tuple15[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple16[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple17[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple18[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18) - } - - def ++[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple19[D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18,other._19) - } + implicit def tup2ToAdder[A, B](tup: Tuple2[A, B]): Tuple2Adder[A, B] = new Tuple2Adder(tup) + + class Tuple3Adder[A, B, C](tup: Tuple3[A, B, C]) { + def :+[D](other: D) = + (tup._1, tup._2, tup._3, other) + def +:[D](other: D) = + (other, tup._1, tup._2, tup._3) + + def ++[D](other: Tuple1[D]) 
= + (tup._1, tup._2, tup._3, other._1) + + def ++[D, E](other: Tuple2[D, E]) = + (tup._1, tup._2, tup._3, other._1, other._2) + + def ++[D, E, F](other: Tuple3[D, E, F]) = + (tup._1, tup._2, tup._3, other._1, other._2, other._3) + + def ++[D, E, F, G](other: Tuple4[D, E, F, G]) = + (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4) + + def ++[D, E, F, G, H](other: Tuple5[D, E, F, G, H]) = + (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5) + + def ++[D, E, F, G, H, I](other: Tuple6[D, E, F, G, H, I]) = + (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6) + + def ++[D, E, F, G, H, I, J](other: Tuple7[D, E, F, G, H, I, J]) = + (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7) + + def ++[D, E, F, G, H, I, J, K](other: Tuple8[D, E, F, G, H, I, J, K]) = + (tup._1, tup._2, tup._3, other._1, other._2, other._3, other._4, other._5, other._6, other._7, other._8) + + def ++[D, E, F, G, H, I, J, K, L](other: Tuple9[D, E, F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[D, E, F, G, H, I, J, K, L, M](other: Tuple10[D, E, F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N](other: Tuple11[D, E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O](other: Tuple12[D, E, F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + 
other._12 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple13[D, E, F, G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + other: Tuple14[D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple15[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple16[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple17[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple18[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + 
other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) + + def ++[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple19[D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18, + other._19 + ) } - implicit def tup3ToAdder[A,B,C](tup : Tuple3[A,B,C]) = new Tuple3Adder(tup) - - class Tuple4Adder[A,B,C,D](tup : Tuple4[A,B,C,D]) { - def :+[E](other : E) = { - (tup._1,tup._2,tup._3,tup._4,other) - } - def +:[E](other : E) = { - (other,tup._1,tup._2,tup._3,tup._4) - } - - def ++[E](other : Tuple1[E]) = { - (tup._1,tup._2,tup._3,tup._4,other._1) - } - - def ++[E,F](other : Tuple2[E,F]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2) - } - - def ++[E,F,G](other : Tuple3[E,F,G]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3) - } - - def ++[E,F,G,H](other : Tuple4[E,F,G,H]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4) - } - - def ++[E,F,G,H,I](other : Tuple5[E,F,G,H,I]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5) - } - - def ++[E,F,G,H,I,J](other : Tuple6[E,F,G,H,I,J]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[E,F,G,H,I,J,K](other : Tuple7[E,F,G,H,I,J,K]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[E,F,G,H,I,J,K,L](other : Tuple8[E,F,G,H,I,J,K,L]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[E,F,G,H,I,J,K,L,M](other : Tuple9[E,F,G,H,I,J,K,L,M]) = { - 
(tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[E,F,G,H,I,J,K,L,M,N](other : Tuple10[E,F,G,H,I,J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O](other : Tuple11[E,F,G,H,I,J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P](other : Tuple12[E,F,G,H,I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P,Q](other : Tuple13[E,F,G,H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P,Q,R](other : Tuple14[E,F,G,H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple15[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple16[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple17[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - 
(tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17) - } - - def ++[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple18[E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17,other._18) - } + implicit def tup3ToAdder[A, B, C](tup: Tuple3[A, B, C]): Tuple3Adder[A, B, C] = new Tuple3Adder(tup) + + class Tuple4Adder[A, B, C, D](tup: Tuple4[A, B, C, D]) { + def :+[E](other: E) = + (tup._1, tup._2, tup._3, tup._4, other) + def +:[E](other: E) = + (other, tup._1, tup._2, tup._3, tup._4) + + def ++[E](other: Tuple1[E]) = + (tup._1, tup._2, tup._3, tup._4, other._1) + + def ++[E, F](other: Tuple2[E, F]) = + (tup._1, tup._2, tup._3, tup._4, other._1, other._2) + + def ++[E, F, G](other: Tuple3[E, F, G]) = + (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3) + + def ++[E, F, G, H](other: Tuple4[E, F, G, H]) = + (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4) + + def ++[E, F, G, H, I](other: Tuple5[E, F, G, H, I]) = + (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5) + + def ++[E, F, G, H, I, J](other: Tuple6[E, F, G, H, I, J]) = + (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6) + + def ++[E, F, G, H, I, J, K](other: Tuple7[E, F, G, H, I, J, K]) = + (tup._1, tup._2, tup._3, tup._4, other._1, other._2, other._3, other._4, other._5, other._6, other._7) + + def ++[E, F, G, H, I, J, K, L](other: Tuple8[E, F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[E, F, G, H, I, J, K, L, M](other: Tuple9[E, F, G, H, I, J, K, L, M]) = + ( + 
tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[E, F, G, H, I, J, K, L, M, N](other: Tuple10[E, F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O](other: Tuple11[E, F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P](other: Tuple12[E, F, G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple13[E, F, G, H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + other: Tuple14[E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple15[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def 
++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple16[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple17[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) + + def ++[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple18[E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17, + other._18 + ) } - implicit def tup4ToAdder[A,B,C,D](tup : Tuple4[A,B,C,D]) = new Tuple4Adder(tup) - - class Tuple5Adder[A,B,C,D,E](tup : Tuple5[A,B,C,D,E]) { - def :+[F](other : F) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other) - } - def +:[F](other : F) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5) - } - - def ++[F](other : Tuple1[F]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1) - } - - def ++[F,G](other : Tuple2[F,G]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2) - } - - def ++[F,G,H](other : Tuple3[F,G,H]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3) - } - - def ++[F,G,H,I](other : Tuple4[F,G,H,I]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4) - } - - def ++[F,G,H,I,J](other : Tuple5[F,G,H,I,J]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5) - } - - def ++[F,G,H,I,J,K](other : Tuple6[F,G,H,I,J,K]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[F,G,H,I,J,K,L](other : Tuple7[F,G,H,I,J,K,L]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[F,G,H,I,J,K,L,M](other : Tuple8[F,G,H,I,J,K,L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[F,G,H,I,J,K,L,M,N](other : Tuple9[F,G,H,I,J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[F,G,H,I,J,K,L,M,N,O](other : Tuple10[F,G,H,I,J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[F,G,H,I,J,K,L,M,N,O,P](other : Tuple11[F,G,H,I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[F,G,H,I,J,K,L,M,N,O,P,Q](other : Tuple12[F,G,H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[F,G,H,I,J,K,L,M,N,O,P,Q,R](other : Tuple13[F,G,H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[F,G,H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple14[F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def 
++[F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple15[F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } - - def ++[F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple16[F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16) - } - - def ++[F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple17[F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16,other._17) - } + implicit def tup4ToAdder[A, B, C, D](tup: Tuple4[A, B, C, D]): Tuple4Adder[A, B, C, D] = new Tuple4Adder( + tup + ) + + class Tuple5Adder[A, B, C, D, E](tup: Tuple5[A, B, C, D, E]) { + def :+[F](other: F) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other) + def +:[F](other: F) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5) + + def ++[F](other: Tuple1[F]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other._1) + + def ++[F, G](other: Tuple2[F, G]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2) + + def ++[F, G, H](other: Tuple3[F, G, H]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3) + + def ++[F, G, H, I](other: Tuple4[F, G, H, I]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4) + + def ++[F, G, H, I, J](other: Tuple5[F, G, H, I, J]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5) + + def ++[F, G, H, I, J, K](other: Tuple6[F, G, H, I, J, K]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, other._1, other._2, other._3, other._4, other._5, other._6) + + def ++[F, G, H, I, J, K, 
L](other: Tuple7[F, G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[F, G, H, I, J, K, L, M](other: Tuple8[F, G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[F, G, H, I, J, K, L, M, N](other: Tuple9[F, G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[F, G, H, I, J, K, L, M, N, O](other: Tuple10[F, G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P](other: Tuple11[F, G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q](other: Tuple12[F, G, H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple13[F, G, H, I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + other: Tuple14[F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ) = + ( + tup._1, + tup._2, + tup._3, + 
tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + other: Tuple15[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple16[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) + + def ++[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple17[F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16, + other._17 + ) } - implicit def tup5ToAdder[A,B,C,D,E](tup : Tuple5[A,B,C,D,E]) = new Tuple5Adder(tup) - - class Tuple6Adder[A,B,C,D,E,F](tup : Tuple6[A,B,C,D,E,F]) { - def :+[G](other : G) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other) - } - def +:[G](other : G) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6) - } - - def ++[G](other : Tuple1[G]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1) - } - - def ++[G,H](other : Tuple2[G,H]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2) - } - - def ++[G,H,I](other : Tuple3[G,H,I]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3) - } 
- - def ++[G,H,I,J](other : Tuple4[G,H,I,J]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4) - } - - def ++[G,H,I,J,K](other : Tuple5[G,H,I,J,K]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5) - } - - def ++[G,H,I,J,K,L](other : Tuple6[G,H,I,J,K,L]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[G,H,I,J,K,L,M](other : Tuple7[G,H,I,J,K,L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[G,H,I,J,K,L,M,N](other : Tuple8[G,H,I,J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[G,H,I,J,K,L,M,N,O](other : Tuple9[G,H,I,J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[G,H,I,J,K,L,M,N,O,P](other : Tuple10[G,H,I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[G,H,I,J,K,L,M,N,O,P,Q](other : Tuple11[G,H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[G,H,I,J,K,L,M,N,O,P,Q,R](other : Tuple12[G,H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[G,H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple13[G,H,I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[G,H,I,J,K,L,M,N,O,P,Q,R,S,T](other : 
Tuple14[G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def ++[G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple15[G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } - - def ++[G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple16[G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15,other._16) - } + implicit def tup5ToAdder[A, B, C, D, E](tup: Tuple5[A, B, C, D, E]): Tuple5Adder[A, B, C, D, E] = + new Tuple5Adder(tup) + + class Tuple6Adder[A, B, C, D, E, F](tup: Tuple6[A, B, C, D, E, F]) { + def :+[G](other: G) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other) + def +:[G](other: G) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6) + + def ++[G](other: Tuple1[G]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1) + + def ++[G, H](other: Tuple2[G, H]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2) + + def ++[G, H, I](other: Tuple3[G, H, I]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3) + + def ++[G, H, I, J](other: Tuple4[G, H, I, J]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4) + + def ++[G, H, I, J, K](other: Tuple5[G, H, I, J, K]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, other._1, other._2, other._3, other._4, other._5) + + def ++[G, H, I, J, K, L](other: Tuple6[G, H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + 
other._6 + ) + + def ++[G, H, I, J, K, L, M](other: Tuple7[G, H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[G, H, I, J, K, L, M, N](other: Tuple8[G, H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[G, H, I, J, K, L, M, N, O](other: Tuple9[G, H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[G, H, I, J, K, L, M, N, O, P](other: Tuple10[G, H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q](other: Tuple11[G, H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R](other: Tuple12[G, H, I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple13[G, H, I, J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, 
S, T]( + other: Tuple14[G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple15[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) + + def ++[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple16[G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15, + other._16 + ) } - implicit def tup6ToAdder[A,B,C,D,E,F](tup : Tuple6[A,B,C,D,E,F]) = new Tuple6Adder(tup) - - class Tuple7Adder[A,B,C,D,E,F,G](tup : Tuple7[A,B,C,D,E,F,G]) { - def :+[H](other : H) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other) - } - def +:[H](other : H) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7) - } - - def ++[H](other : Tuple1[H]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1) - } - - def ++[H,I](other : Tuple2[H,I]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2) - } - - def ++[H,I,J](other : Tuple3[H,I,J]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3) - } - - def ++[H,I,J,K](other : Tuple4[H,I,J,K]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4) - } - - def ++[H,I,J,K,L](other : Tuple5[H,I,J,K,L]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5) - } - - def ++[H,I,J,K,L,M](other : Tuple6[H,I,J,K,L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[H,I,J,K,L,M,N](other : Tuple7[H,I,J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[H,I,J,K,L,M,N,O](other : Tuple8[H,I,J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[H,I,J,K,L,M,N,O,P](other : Tuple9[H,I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[H,I,J,K,L,M,N,O,P,Q](other : Tuple10[H,I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[H,I,J,K,L,M,N,O,P,Q,R](other : Tuple11[H,I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[H,I,J,K,L,M,N,O,P,Q,R,S](other : Tuple12[H,I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[H,I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple13[H,I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[H,I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple14[H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } - - def ++[H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple15[H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14,other._15) - } + implicit def tup6ToAdder[A, B, C, D, E, F](tup: Tuple6[A, B, C, D, E, F]): Tuple6Adder[A, B, C, D, E, F] = + new Tuple6Adder(tup) + + class Tuple7Adder[A, B, C, D, E, F, G](tup: Tuple7[A, B, C, D, E, F, G]) { + def :+[H](other: H) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other) + def +:[H](other: H) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7) + + def ++[H](other: Tuple1[H]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1) + + def ++[H, I](other: Tuple2[H, I]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2) + + def ++[H, I, J](other: Tuple3[H, I, J]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3) + + def ++[H, I, J, K](other: Tuple4[H, I, J, K]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, other._1, other._2, other._3, other._4) + + def ++[H, I, J, K, L](other: Tuple5[H, I, J, K, L]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[H, I, J, K, L, M](other: Tuple6[H, I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[H, I, J, K, L, M, N](other: Tuple7[H, I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + 
other._6, + other._7 + ) + + def ++[H, I, J, K, L, M, N, O](other: Tuple8[H, I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[H, I, J, K, L, M, N, O, P](other: Tuple9[H, I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q](other: Tuple10[H, I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R](other: Tuple11[H, I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S](other: Tuple12[H, I, J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple13[H, I, J, K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + other: Tuple14[H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + 
other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) + + def ++[H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple15[H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14, + other._15 + ) } - implicit def tup7ToAdder[A,B,C,D,E,F,G](tup : Tuple7[A,B,C,D,E,F,G]) = new Tuple7Adder(tup) - - class Tuple8Adder[A,B,C,D,E,F,G,H](tup : Tuple8[A,B,C,D,E,F,G,H]) { - def :+[I](other : I) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other) - } - def +:[I](other : I) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8) - } - - def ++[I](other : Tuple1[I]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1) - } - - def ++[I,J](other : Tuple2[I,J]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2) - } - - def ++[I,J,K](other : Tuple3[I,J,K]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3) - } - - def ++[I,J,K,L](other : Tuple4[I,J,K,L]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4) - } - - def ++[I,J,K,L,M](other : Tuple5[I,J,K,L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5) - } - - def ++[I,J,K,L,M,N](other : Tuple6[I,J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[I,J,K,L,M,N,O](other : Tuple7[I,J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def 
++[I,J,K,L,M,N,O,P](other : Tuple8[I,J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[I,J,K,L,M,N,O,P,Q](other : Tuple9[I,J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[I,J,K,L,M,N,O,P,Q,R](other : Tuple10[I,J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[I,J,K,L,M,N,O,P,Q,R,S](other : Tuple11[I,J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[I,J,K,L,M,N,O,P,Q,R,S,T](other : Tuple12[I,J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[I,J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple13[I,J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } - - def ++[I,J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple14[I,J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13,other._14) - } + implicit def tup7ToAdder[A, B, C, D, E, F, G]( + tup: Tuple7[A, B, C, D, E, F, G] + ): Tuple7Adder[A, B, C, D, E, F, G] = new Tuple7Adder(tup) + + class Tuple8Adder[A, B, C, D, E, F, G, H](tup: Tuple8[A, B, C, D, E, F, G, H]) { + def :+[I](other: I) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, 
tup._7, tup._8, other) + def +:[I](other: I) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8) + + def ++[I](other: Tuple1[I]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1) + + def ++[I, J](other: Tuple2[I, J]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2) + + def ++[I, J, K](other: Tuple3[I, J, K]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3) + + def ++[I, J, K, L](other: Tuple4[I, J, K, L]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, other._1, other._2, other._3, other._4) + + def ++[I, J, K, L, M](other: Tuple5[I, J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[I, J, K, L, M, N](other: Tuple6[I, J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[I, J, K, L, M, N, O](other: Tuple7[I, J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[I, J, K, L, M, N, O, P](other: Tuple8[I, J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[I, J, K, L, M, N, O, P, Q](other: Tuple9[I, J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R](other: Tuple10[I, J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, 
+ tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S](other: Tuple11[I, J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S, T](other: Tuple12[I, J, K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple13[I, J, K, L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) + + def ++[I, J, K, L, M, N, O, P, Q, R, S, T, U, V]( + other: Tuple14[I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + ) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13, + other._14 + ) } - implicit def tup8ToAdder[A,B,C,D,E,F,G,H](tup : Tuple8[A,B,C,D,E,F,G,H]) = new Tuple8Adder(tup) - - class Tuple9Adder[A,B,C,D,E,F,G,H,I](tup : Tuple9[A,B,C,D,E,F,G,H,I]) { - def :+[J](other : J) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other) - } - def +:[J](other : J) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9) - } - - def ++[J](other : Tuple1[J]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1) - } - - def ++[J,K](other : Tuple2[J,K]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2) - } - - def ++[J,K,L](other : Tuple3[J,K,L]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3) - } - - def ++[J,K,L,M](other : Tuple4[J,K,L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4) - } - - def ++[J,K,L,M,N](other : Tuple5[J,K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5) - } - - def ++[J,K,L,M,N,O](other : Tuple6[J,K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[J,K,L,M,N,O,P](other : Tuple7[J,K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[J,K,L,M,N,O,P,Q](other : Tuple8[J,K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[J,K,L,M,N,O,P,Q,R](other : Tuple9[J,K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[J,K,L,M,N,O,P,Q,R,S](other : Tuple10[J,K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[J,K,L,M,N,O,P,Q,R,S,T](other : Tuple11[J,K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def 
++[J,K,L,M,N,O,P,Q,R,S,T,U](other : Tuple12[J,K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } - - def ++[J,K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple13[J,K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12,other._13) - } + implicit def tup8ToAdder[A, B, C, D, E, F, G, H]( + tup: Tuple8[A, B, C, D, E, F, G, H] + ): Tuple8Adder[A, B, C, D, E, F, G, H] = new Tuple8Adder(tup) + + class Tuple9Adder[A, B, C, D, E, F, G, H, I](tup: Tuple9[A, B, C, D, E, F, G, H, I]) { + def :+[J](other: J) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other) + def +:[J](other: J) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9) + + def ++[J](other: Tuple1[J]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1) + + def ++[J, K](other: Tuple2[J, K]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2) + + def ++[J, K, L](other: Tuple3[J, K, L]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, other._1, other._2, other._3) + + def ++[J, K, L, M](other: Tuple4[J, K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[J, K, L, M, N](other: Tuple5[J, K, L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[J, K, L, M, N, O](other: Tuple6[J, K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + 
other._3, + other._4, + other._5, + other._6 + ) + + def ++[J, K, L, M, N, O, P](other: Tuple7[J, K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[J, K, L, M, N, O, P, Q](other: Tuple8[J, K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[J, K, L, M, N, O, P, Q, R](other: Tuple9[J, K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S](other: Tuple10[J, K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S, T](other: Tuple11[J, K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S, T, U](other: Tuple12[J, K, L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) + + def ++[J, K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple13[J, K, L, M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + 
tup._6, + tup._7, + tup._8, + tup._9, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12, + other._13 + ) } - implicit def tup9ToAdder[A,B,C,D,E,F,G,H,I](tup : Tuple9[A,B,C,D,E,F,G,H,I]) = new Tuple9Adder(tup) - - class Tuple10Adder[A,B,C,D,E,F,G,H,I,J](tup : Tuple10[A,B,C,D,E,F,G,H,I,J]) { - def :+[K](other : K) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other) - } - def +:[K](other : K) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10) - } - - def ++[K](other : Tuple1[K]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1) - } - - def ++[K,L](other : Tuple2[K,L]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2) - } - - def ++[K,L,M](other : Tuple3[K,L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3) - } - - def ++[K,L,M,N](other : Tuple4[K,L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4) - } - - def ++[K,L,M,N,O](other : Tuple5[K,L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5) - } - - def ++[K,L,M,N,O,P](other : Tuple6[K,L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[K,L,M,N,O,P,Q](other : Tuple7[K,L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[K,L,M,N,O,P,Q,R](other : Tuple8[K,L,M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def 
++[K,L,M,N,O,P,Q,R,S](other : Tuple9[K,L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[K,L,M,N,O,P,Q,R,S,T](other : Tuple10[K,L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[K,L,M,N,O,P,Q,R,S,T,U](other : Tuple11[K,L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } - - def ++[K,L,M,N,O,P,Q,R,S,T,U,V](other : Tuple12[K,L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11,other._12) - } + implicit def tup9ToAdder[A, B, C, D, E, F, G, H, I]( + tup: Tuple9[A, B, C, D, E, F, G, H, I] + ): Tuple9Adder[A, B, C, D, E, F, G, H, I] = new Tuple9Adder(tup) + + class Tuple10Adder[A, B, C, D, E, F, G, H, I, J](tup: Tuple10[A, B, C, D, E, F, G, H, I, J]) { + def :+[K](other: K) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other) + def +:[K](other: K) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10) + + def ++[K](other: Tuple1[K]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1) + + def ++[K, L](other: Tuple2[K, L]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, other._1, other._2) + + def ++[K, L, M](other: Tuple3[K, L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3 + ) + + def ++[K, L, M, N](other: Tuple4[K, L, M, N]) 
= + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[K, L, M, N, O](other: Tuple5[K, L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[K, L, M, N, O, P](other: Tuple6[K, L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[K, L, M, N, O, P, Q](other: Tuple7[K, L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[K, L, M, N, O, P, Q, R](other: Tuple8[K, L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[K, L, M, N, O, P, Q, R, S](other: Tuple9[K, L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[K, L, M, N, O, P, Q, R, S, T](other: Tuple10[K, L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[K, L, M, N, O, P, Q, R, S, T, U](other: Tuple11[K, L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + 
other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) + + def ++[K, L, M, N, O, P, Q, R, S, T, U, V](other: Tuple12[K, L, M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11, + other._12 + ) } - implicit def tup10ToAdder[A,B,C,D,E,F,G,H,I,J](tup : Tuple10[A,B,C,D,E,F,G,H,I,J]) = new Tuple10Adder(tup) - - class Tuple11Adder[A,B,C,D,E,F,G,H,I,J,K](tup : Tuple11[A,B,C,D,E,F,G,H,I,J,K]) { - def :+[L](other : L) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other) - } - def +:[L](other : L) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11) - } - - def ++[L](other : Tuple1[L]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1) - } - - def ++[L,M](other : Tuple2[L,M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2) - } - - def ++[L,M,N](other : Tuple3[L,M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3) - } - - def ++[L,M,N,O](other : Tuple4[L,M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4) - } - - def ++[L,M,N,O,P](other : Tuple5[L,M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5) - } - - def ++[L,M,N,O,P,Q](other : Tuple6[L,M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[L,M,N,O,P,Q,R](other : Tuple7[L,M,N,O,P,Q,R]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[L,M,N,O,P,Q,R,S](other : Tuple8[L,M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[L,M,N,O,P,Q,R,S,T](other : Tuple9[L,M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[L,M,N,O,P,Q,R,S,T,U](other : Tuple10[L,M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } - - def ++[L,M,N,O,P,Q,R,S,T,U,V](other : Tuple11[L,M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10,other._11) - } + implicit def tup10ToAdder[A, B, C, D, E, F, G, H, I, J]( + tup: Tuple10[A, B, C, D, E, F, G, H, I, J] + ): Tuple10Adder[A, B, C, D, E, F, G, H, I, J] = new Tuple10Adder(tup) + + class Tuple11Adder[A, B, C, D, E, F, G, H, I, J, K](tup: Tuple11[A, B, C, D, E, F, G, H, I, J, K]) { + def :+[L](other: L) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other) + def +:[L](other: L) = + (other, tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11) + + def ++[L](other: Tuple1[L]) = + (tup._1, tup._2, tup._3, tup._4, tup._5, tup._6, tup._7, tup._8, tup._9, tup._10, tup._11, other._1) + + def ++[L, M](other: Tuple2[L, M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2 + ) + + def ++[L, M, 
N](other: Tuple3[L, M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3 + ) + + def ++[L, M, N, O](other: Tuple4[L, M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[L, M, N, O, P](other: Tuple5[L, M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[L, M, N, O, P, Q](other: Tuple6[L, M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[L, M, N, O, P, Q, R](other: Tuple7[L, M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[L, M, N, O, P, Q, R, S](other: Tuple8[L, M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[L, M, N, O, P, Q, R, S, T](other: Tuple9[L, M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[L, M, N, O, P, Q, R, S, T, U](other: Tuple10[L, M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + 
other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) + + def ++[L, M, N, O, P, Q, R, S, T, U, V](other: Tuple11[L, M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10, + other._11 + ) } - implicit def tup11ToAdder[A,B,C,D,E,F,G,H,I,J,K](tup : Tuple11[A,B,C,D,E,F,G,H,I,J,K]) = new Tuple11Adder(tup) - - class Tuple12Adder[A,B,C,D,E,F,G,H,I,J,K,L](tup : Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]) { - def :+[M](other : M) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other) - } - def +:[M](other : M) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12) - } - - def ++[M](other : Tuple1[M]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1) - } - - def ++[M,N](other : Tuple2[M,N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2) - } - - def ++[M,N,O](other : Tuple3[M,N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3) - } - - def ++[M,N,O,P](other : Tuple4[M,N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4) - } - - def ++[M,N,O,P,Q](other : Tuple5[M,N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4,other._5) - } - - def ++[M,N,O,P,Q,R](other : Tuple6[M,N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[M,N,O,P,Q,R,S](other : 
Tuple7[M,N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[M,N,O,P,Q,R,S,T](other : Tuple8[M,N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[M,N,O,P,Q,R,S,T,U](other : Tuple9[M,N,O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } - - def ++[M,N,O,P,Q,R,S,T,U,V](other : Tuple10[M,N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9,other._10) - } + implicit def tup11ToAdder[A, B, C, D, E, F, G, H, I, J, K]( + tup: Tuple11[A, B, C, D, E, F, G, H, I, J, K] + ): Tuple11Adder[A, B, C, D, E, F, G, H, I, J, K] = new Tuple11Adder(tup) + + class Tuple12Adder[A, B, C, D, E, F, G, H, I, J, K, L](tup: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]) { + def :+[M](other: M) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other + ) + def +:[M](other: M) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12 + ) + + def ++[M](other: Tuple1[M]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1 + ) + + def ++[M, N](other: Tuple2[M, N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2 + ) + + def ++[M, N, O](other: Tuple3[M, N, O]) = + ( + tup._1, + tup._2, + tup._3, + 
tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3 + ) + + def ++[M, N, O, P](other: Tuple4[M, N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[M, N, O, P, Q](other: Tuple5[M, N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[M, N, O, P, Q, R](other: Tuple6[M, N, O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[M, N, O, P, Q, R, S](other: Tuple7[M, N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[M, N, O, P, Q, R, S, T](other: Tuple8[M, N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[M, N, O, P, Q, R, S, T, U](other: Tuple9[M, N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) + + def ++[M, N, O, P, Q, R, S, T, U, V](other: Tuple10[M, N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + 
other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9, + other._10 + ) } - implicit def tup12ToAdder[A,B,C,D,E,F,G,H,I,J,K,L](tup : Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]) = new Tuple12Adder(tup) - - class Tuple13Adder[A,B,C,D,E,F,G,H,I,J,K,L,M](tup : Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]) { - def :+[N](other : N) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other) - } - def +:[N](other : N) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13) - } - - def ++[N](other : Tuple1[N]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1) - } - - def ++[N,O](other : Tuple2[N,O]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2) - } - - def ++[N,O,P](other : Tuple3[N,O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3) - } - - def ++[N,O,P,Q](other : Tuple4[N,O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3,other._4) - } - - def ++[N,O,P,Q,R](other : Tuple5[N,O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3,other._4,other._5) - } - - def ++[N,O,P,Q,R,S](other : Tuple6[N,O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[N,O,P,Q,R,S,T](other : Tuple7[N,O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[N,O,P,Q,R,S,T,U](other : Tuple8[N,O,P,Q,R,S,T,U]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } - - def ++[N,O,P,Q,R,S,T,U,V](other : Tuple9[N,O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8,other._9) - } + implicit def tup12ToAdder[A, B, C, D, E, F, G, H, I, J, K, L]( + tup: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L] + ): Tuple12Adder[A, B, C, D, E, F, G, H, I, J, K, L] = new Tuple12Adder(tup) + + class Tuple13Adder[A, B, C, D, E, F, G, H, I, J, K, L, M]( + tup: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M] + ) { + def :+[N](other: N) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other + ) + def +:[N](other: N) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13 + ) + + def ++[N](other: Tuple1[N]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1 + ) + + def ++[N, O](other: Tuple2[N, O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2 + ) + + def ++[N, O, P](other: Tuple3[N, O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3 + ) + + def ++[N, O, P, Q](other: Tuple4[N, O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[N, O, P, Q, R](other: Tuple5[N, O, P, Q, 
R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[N, O, P, Q, R, S](other: Tuple6[N, O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[N, O, P, Q, R, S, T](other: Tuple7[N, O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[N, O, P, Q, R, S, T, U](other: Tuple8[N, O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) + + def ++[N, O, P, Q, R, S, T, U, V](other: Tuple9[N, O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8, + other._9 + ) } - implicit def tup13ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M](tup : Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]) = new Tuple13Adder(tup) - - class Tuple14Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N](tup : Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]) { - def :+[O](other : O) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other) - } - def +:[O](other : O) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14) - } - - def ++[O](other : Tuple1[O]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1) - } - - def ++[O,P](other : Tuple2[O,P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2) - } - - def ++[O,P,Q](other : Tuple3[O,P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2,other._3) - } - - def ++[O,P,Q,R](other : Tuple4[O,P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2,other._3,other._4) - } - - def ++[O,P,Q,R,S](other : Tuple5[O,P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2,other._3,other._4,other._5) - } - - def ++[O,P,Q,R,S,T](other : Tuple6[O,P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[O,P,Q,R,S,T,U](other : Tuple7[O,P,Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } - - def ++[O,P,Q,R,S,T,U,V](other : Tuple8[O,P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,other._1,other._2,other._3,other._4,other._5,other._6,other._7,other._8) - } + implicit def tup13ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M]( + tup: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M] + ): Tuple13Adder[A, B, C, D, E, F, G, H, I, J, K, L, M] = new Tuple13Adder(tup) + + class Tuple14Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + tup: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + ) { + def :+[O](other: O) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + 
tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other + ) + def +:[O](other: O) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14 + ) + + def ++[O](other: Tuple1[O]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1 + ) + + def ++[O, P](other: Tuple2[O, P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2 + ) + + def ++[O, P, Q](other: Tuple3[O, P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3 + ) + + def ++[O, P, Q, R](other: Tuple4[O, P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[O, P, Q, R, S](other: Tuple5[O, P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[O, P, Q, R, S, T](other: Tuple6[O, P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[O, P, Q, R, S, T, U](other: Tuple7[O, P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + 
other._3, + other._4, + other._5, + other._6, + other._7 + ) + + def ++[O, P, Q, R, S, T, U, V](other: Tuple8[O, P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7, + other._8 + ) } - implicit def tup14ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N](tup : Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]) = new Tuple14Adder(tup) - - class Tuple15Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O](tup : Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]) { - def :+[P](other : P) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other) - } - def +:[P](other : P) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15) - } - - def ++[P](other : Tuple1[P]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1) - } - - def ++[P,Q](other : Tuple2[P,Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1,other._2) - } - - def ++[P,Q,R](other : Tuple3[P,Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1,other._2,other._3) - } - - def ++[P,Q,R,S](other : Tuple4[P,Q,R,S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1,other._2,other._3,other._4) - } - - def ++[P,Q,R,S,T](other : Tuple5[P,Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1,other._2,other._3,other._4,other._5) - } - - def ++[P,Q,R,S,T,U](other : Tuple6[P,Q,R,S,T,U]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1,other._2,other._3,other._4,other._5,other._6) - } - - def ++[P,Q,R,S,T,U,V](other : Tuple7[P,Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,other._1,other._2,other._3,other._4,other._5,other._6,other._7) - } + implicit def tup14ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N]( + tup: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + ): Tuple14Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N] = new Tuple14Adder(tup) + + class Tuple15Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + tup: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + ) { + def :+[P](other: P) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other + ) + def +:[P](other: P) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15 + ) + + def ++[P](other: Tuple1[P]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1 + ) + + def ++[P, Q](other: Tuple2[P, Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2 + ) + + def ++[P, Q, R](other: Tuple3[P, Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3 + ) + + def ++[P, Q, R, S](other: Tuple4[P, Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + 
tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[P, Q, R, S, T](other: Tuple5[P, Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[P, Q, R, S, T, U](other: Tuple6[P, Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) + + def ++[P, Q, R, S, T, U, V](other: Tuple7[P, Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6, + other._7 + ) } - implicit def tup15ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O](tup : Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]) = new Tuple15Adder(tup) - - class Tuple16Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P](tup : Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]) { - def :+[Q](other : Q) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other) - } - def +:[Q](other : Q) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16) - } - - def ++[Q](other : Tuple1[Q]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other._1) - } - - def ++[Q,R](other : Tuple2[Q,R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other._1,other._2) - } - - def ++[Q,R,S](other : Tuple3[Q,R,S]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other._1,other._2,other._3) - } - - def ++[Q,R,S,T](other : Tuple4[Q,R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other._1,other._2,other._3,other._4) - } - - def ++[Q,R,S,T,U](other : Tuple5[Q,R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other._1,other._2,other._3,other._4,other._5) - } - - def ++[Q,R,S,T,U,V](other : Tuple6[Q,R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,other._1,other._2,other._3,other._4,other._5,other._6) - } + implicit def tup15ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]( + tup: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + ): Tuple15Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] = new Tuple15Adder(tup) + + class Tuple16Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + tup: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ) { + def :+[Q](other: Q) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other + ) + def +:[Q](other: Q) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16 + ) + + def ++[Q](other: Tuple1[Q]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1 + ) + + def ++[Q, R](other: Tuple2[Q, R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + 
tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2 + ) + + def ++[Q, R, S](other: Tuple3[Q, R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3 + ) + + def ++[Q, R, S, T](other: Tuple4[Q, R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[Q, R, S, T, U](other: Tuple5[Q, R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3, + other._4, + other._5 + ) + + def ++[Q, R, S, T, U, V](other: Tuple6[Q, R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + other._1, + other._2, + other._3, + other._4, + other._5, + other._6 + ) } - implicit def tup16ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P](tup : Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]) = new Tuple16Adder(tup) - - class Tuple17Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q](tup : Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]) { - def :+[R](other : R) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,other) - } - def +:[R](other : R) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17) - } - - def ++[R](other : Tuple1[R]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,other._1) - } - - def ++[R,S](other : Tuple2[R,S]) 
= { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,other._1,other._2) - } - - def ++[R,S,T](other : Tuple3[R,S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,other._1,other._2,other._3) - } - - def ++[R,S,T,U](other : Tuple4[R,S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,other._1,other._2,other._3,other._4) - } - - def ++[R,S,T,U,V](other : Tuple5[R,S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,other._1,other._2,other._3,other._4,other._5) - } + implicit def tup16ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]( + tup: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + ): Tuple16Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] = new Tuple16Adder(tup) + + class Tuple17Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + tup: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ) { + def :+[R](other: R) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other + ) + def +:[R](other: R) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17 + ) + + def ++[R](other: Tuple1[R]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1 + ) + + def ++[R, S](other: Tuple2[R, S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + 
tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2 + ) + + def ++[R, S, T](other: Tuple3[R, S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2, + other._3 + ) + + def ++[R, S, T, U](other: Tuple4[R, S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2, + other._3, + other._4 + ) + + def ++[R, S, T, U, V](other: Tuple5[R, S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + other._1, + other._2, + other._3, + other._4, + other._5 + ) } - implicit def tup17ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q](tup : Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]) = new Tuple17Adder(tup) - - class Tuple18Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](tup : Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]) { - def :+[S](other : S) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,other) - } - def +:[S](other : S) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18) - } - - def ++[S](other : Tuple1[S]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,other._1) - } - - def ++[S,T](other : Tuple2[S,T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,other._1,other._2) - } - - def ++[S,T,U](other : 
Tuple3[S,T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,other._1,other._2,other._3) - } - - def ++[S,T,U,V](other : Tuple4[S,T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,other._1,other._2,other._3,other._4) - } + implicit def tup17ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]( + tup: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + ): Tuple17Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] = new Tuple17Adder(tup) + + class Tuple18Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + tup: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ) { + def :+[S](other: S) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other + ) + def +:[S](other: S) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18 + ) + + def ++[S](other: Tuple1[S]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1 + ) + + def ++[S, T](other: Tuple2[S, T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1, + other._2 + ) + + def ++[S, T, U](other: Tuple3[S, T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + 
tup._18, + other._1, + other._2, + other._3 + ) + + def ++[S, T, U, V](other: Tuple4[S, T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + other._1, + other._2, + other._3, + other._4 + ) } - implicit def tup18ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R](tup : Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]) = new Tuple18Adder(tup) - - class Tuple19Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](tup : Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) { - def :+[T](other : T) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,other) - } - def +:[T](other : T) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19) - } - - def ++[T](other : Tuple1[T]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,other._1) - } - - def ++[T,U](other : Tuple2[T,U]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,other._1,other._2) - } - - def ++[T,U,V](other : Tuple3[T,U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,other._1,other._2,other._3) - } + implicit def tup18ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]( + tup: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + ): Tuple18Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] = new Tuple18Adder(tup) + + class Tuple19Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + tup: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, 
R, S] + ) { + def :+[T](other: T) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other + ) + def +:[T](other: T) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19 + ) + + def ++[T](other: Tuple1[T]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other._1 + ) + + def ++[T, U](other: Tuple2[T, U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other._1, + other._2 + ) + + def ++[T, U, V](other: Tuple3[T, U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + other._1, + other._2, + other._3 + ) } - implicit def tup19ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S](tup : Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]) = new Tuple19Adder(tup) - - class Tuple20Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](tup : Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) { - def :+[U](other : U) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20,other) - } - def +:[U](other : U) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20) - } - - def ++[U](other : Tuple1[U]) = { - 
(tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20,other._1) - } - - def ++[U,V](other : Tuple2[U,V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20,other._1,other._2) - } + implicit def tup19ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]( + tup: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + ): Tuple19Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] = new Tuple19Adder(tup) + + class Tuple20Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + tup: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ) { + def :+[U](other: U) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + other + ) + def +:[U](other: U) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20 + ) + + def ++[U](other: Tuple1[U]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + other._1 + ) + + def ++[U, V](other: Tuple2[U, V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + other._1, + other._2 + ) } - implicit def tup20ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T](tup : Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]) = new 
Tuple20Adder(tup) - - class Tuple21Adder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](tup : Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) { - def :+[V](other : V) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20,tup._21,other) - } - def +:[V](other : V) = { - (other,tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20,tup._21) - } - - def ++[V](other : Tuple1[V]) = { - (tup._1,tup._2,tup._3,tup._4,tup._5,tup._6,tup._7,tup._8,tup._9,tup._10,tup._11,tup._12,tup._13,tup._14,tup._15,tup._16,tup._17,tup._18,tup._19,tup._20,tup._21,other._1) - } + implicit def tup20ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]( + tup: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + ): Tuple20Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] = new Tuple20Adder(tup) + + class Tuple21Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + tup: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ) { + def :+[V](other: V) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + tup._21, + other + ) + def +:[V](other: V) = + ( + other, + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + tup._21 + ) + + def ++[V](other: Tuple1[V]) = + ( + tup._1, + tup._2, + tup._3, + tup._4, + tup._5, + tup._6, + tup._7, + tup._8, + tup._9, + tup._10, + tup._11, + tup._12, + tup._13, + tup._14, + tup._15, + tup._16, + tup._17, + tup._18, + tup._19, + tup._20, + tup._21, + other._1 + ) } - 
implicit def tup21ToAdder[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U](tup : Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]) = new Tuple21Adder(tup) + implicit def tup21ToAdder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]( + tup: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + ): Tuple21Adder[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] = new Tuple21Adder(tup) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala b/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala index 8273837229..01fd041f72 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/GroupBuilder.scala @@ -11,75 +11,73 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.pipe._ import cascading.pipe.assembly._ import cascading.operation._ -import cascading.operation.aggregator._ -import cascading.operation.filter._ import cascading.tuple.Fields -import cascading.tuple.{Tuple => CTuple, TupleEntry} +import cascading.tuple.TupleEntry -import scala.collection.JavaConverters._ -import scala.annotation.tailrec -import scala.math.Ordering -import scala.{ Range => ScalaRange } +import scala.{Range => ScalaRange} /** - * This controls the sequence of reductions that happen inside a - * particular grouping operation. Not all elements can be combined, - * for instance, a scanLeft/foldLeft generally requires a sorting - * but such sorts are (at least for now) incompatible with doing a combine - * which includes some map-side reductions. + * This controls the sequence of reductions that happen inside a particular grouping operation. 
Not all + * elements can be combined, for instance, a scanLeft/foldLeft generally requires a sorting but such sorts are + * (at least for now) incompatible with doing a combine which includes some map-side reductions. */ -class GroupBuilder(val groupFields : Fields) extends - FoldOperations[GroupBuilder] with - StreamOperations[GroupBuilder] { +class GroupBuilder(val groupFields: Fields) + extends FoldOperations[GroupBuilder] + with StreamOperations[GroupBuilder] { // We need the implicit conversions from symbols to Fields import Dsl._ /** - * Holds the "reducers/combiners", the things that we can do paritially map-side. - */ - private var reds : Option[List[AggregateBy]] = Some(Nil) + * Holds the "reducers/combiners", the things that we can do paritially map-side. + */ + private var reds: Option[List[AggregateBy]] = Some(Nil) /** * This is the description of this Grouping in terms of a sequence of Every operations */ - protected var evs : List[Pipe => Every] = Nil - protected var isReversed : Boolean = false + protected var evs: List[Pipe => Every] = Nil + protected var isReversed: Boolean = false - protected var sortF : Option[Fields] = None + protected var sortF: Option[Fields] = None def sorting = sortF /* - * maxMF is the maximum index of a "middle field" allocated for mapReduceMap operations - */ - private var maxMF : Int = 0 + * maxMF is the maximum index of a "middle field" allocated for mapReduceMap operations + */ + private var maxMF: Int = 0 - private def getNextMiddlefield : String = { + private def getNextMiddlefield: String = { val out = "__middlefield__" + maxMF.toString maxMF += 1 - return out + out } - private def tryAggregateBy(ab : AggregateBy, ev : Pipe => Every) : Boolean = { + private def tryAggregateBy(ab: AggregateBy, ev: Pipe => Every): Boolean = { // Concat if there if not none - reds = reds.map(rl => ab::rl) + reds = reds.map(rl => ab :: rl) evs = ev :: evs - return !reds.isEmpty + reds.nonEmpty } /** - * Holds the number of reducers to 
use in the reduce stage of the groupBy/aggregateBy. - * By default uses whatever value is set in the jobConf. + * Holds the number of reducers to use in the reduce stage of the groupBy/aggregateBy. By default uses + * whatever value is set in the jobConf. */ - private var numReducers : Option[Int] = None + private var numReducers: Option[Int] = None /** - * Limit of number of keys held in SpillableTupleMap on an AggregateBy - */ + * Holds an optional user-specified description to be used in .dot and MR step names. + */ + private var descriptions: Seq[String] = Nil + + /** + * Limit of number of keys held in SpillableTupleMap on an AggregateBy + */ private var spillThreshold: Option[Int] = None /** @@ -90,42 +88,49 @@ class GroupBuilder(val groupFields : Fields) extends /** * Override the number of reducers used in the groupBy. */ - def reducers(r : Int) = { - if(r > 0) { + def reducers(r: Int) = { + if (r > 0) { numReducers = Some(r) } this } + /** + * Override the description to be used in .dot and MR step names. + */ + def setDescriptions(newDescriptions: Seq[String]) = { + descriptions = newDescriptions + this + } + /** * Override the spill threshold on AggregateBy */ - def spillThreshold(t : Int) : GroupBuilder = { + def spillThreshold(t: Int): GroupBuilder = { spillThreshold = Some(t) this } /** - * This cancels map side aggregation - * and forces everything to the reducers + * This cancels map side aggregation and forces everything to the reducers */ def forceToReducers = { reds = None this } - protected def overrideReducers(p : Pipe) : Pipe = { - numReducers.map { r => RichPipe.setReducers(p, r) }.getOrElse(p) - } + protected def overrideReducers(p: Pipe): Pipe = + numReducers.map(r => RichPipe.setReducers(p, r)).getOrElse(p) + + protected def overrideDescription(p: Pipe): Pipe = + RichPipe.setPipeDescriptions(p, descriptions) /** - * == Warning == - * This may significantly reduce performance of your job. - * It kills the ability to do map-side aggregation. 
+ * ==Warning== + * This may significantly reduce performance of your job. It kills the ability to do map-side aggregation. */ - def buffer(args : Fields)(b : Buffer[_]) : GroupBuilder = { + def buffer(args: Fields)(b: Buffer[_]): GroupBuilder = every(pipe => new Every(pipe, args, b)) - } /** * Prefer aggregateBy operations! @@ -138,100 +143,109 @@ class GroupBuilder(val groupFields : Fields) extends } /** - * Prefer reduce or mapReduceMap. foldLeft will force all work to be - * done on the reducers. If your function is not associative and - * commutative, foldLeft may be required. + * Prefer reduce or mapReduceMap. foldLeft will force all work to be done on the reducers. If your function + * is not associative and commutative, foldLeft may be required. * - * == Best Practice == + * ==Best Practice== * Make sure init is an immutable object. * - * == Note == - * Init needs to be serializable with Kryo (because we copy it for each - * grouping to avoid possible errors using a mutable init object). + * ==Note== + * Init needs to be serializable with Kryo (because we copy it for each grouping to avoid possible errors + * using a mutable init object). 
*/ - def foldLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X) - (implicit setter : TupleSetter[X], conv : TupleConverter[T]) : GroupBuilder = { - val (inFields, outFields) = fieldDef - conv.assertArityMatches(inFields) - setter.assertArityMatches(outFields) - val ag = new FoldAggregator[T,X](fn, init, outFields, conv, setter) - val beforePF = projectFields - every(pipe => new Every(pipe, inFields, ag)) - // Update projectFields, which makes sense in a fold, but invalidated on every - projectFields = beforePF.map { Fields.merge(_, inFields) } - this + def foldLeft[X, T]( + fieldDef: (Fields, Fields) + )(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): GroupBuilder = { + val (inFields, outFields) = fieldDef + conv.assertArityMatches(inFields) + setter.assertArityMatches(outFields) + val ag = new FoldAggregator[T, X](fn, init, outFields, conv, setter) + val beforePF = projectFields + every(pipe => new Every(pipe, inFields, ag)) + // Update projectFields, which makes sense in a fold, but invalidated on every + projectFields = beforePF.map(Fields.merge(_, inFields)) + this } - /** * Type `T` is the type of the input field `(input to map, T => X)` * - * Type `X` is the intermediate type, which your reduce function operates on - * `(reduce is (X,X) => X)` + * Type `X` is the intermediate type, which your reduce function operates on `(reduce is (X,X) => X)` * * Type `U` is the final result type, `(final map is: X => U)` * - * The previous output goes into the reduce function on the left, like foldLeft, - * so if your operation is faster for the accumulator to be on one side, be aware. + * The previous output goes into the reduce function on the left, like foldLeft, so if your operation is + * faster for the accumulator to be on one side, be aware. 
*/ - def mapReduceMap[T,X,U](fieldDef : (Fields, Fields))(mapfn : T => X )(redfn : (X, X) => X) - (mapfn2 : X => U)(implicit startConv : TupleConverter[T], - middleSetter : TupleSetter[X], - middleConv : TupleConverter[X], - endSetter : TupleSetter[U]) : GroupBuilder = { + def mapReduceMap[T, X, U]( + fieldDef: (Fields, Fields) + )(mapfn: T => X)(redfn: (X, X) => X)(mapfn2: X => U)(implicit + startConv: TupleConverter[T], + middleSetter: TupleSetter[X], + middleConv: TupleConverter[X], + endSetter: TupleSetter[U] + ): GroupBuilder = { val (maybeSortedFromFields, maybeSortedToFields) = fieldDef - //Check for arity safety: + // Check for arity safety: // To fields CANNOT have a sorting, or cascading gets unhappy: // TODO this may be fixed in cascading later - val toFields = new Fields(asList(maybeSortedToFields) :_*) - val fromFields = new Fields(asList(maybeSortedFromFields) :_*) + val toFields = new Fields(asList(maybeSortedToFields): _*) + val fromFields = new Fields(asList(maybeSortedFromFields): _*) startConv.assertArityMatches(fromFields) endSetter.assertArityMatches(toFields) // Update projectFields - projectFields = projectFields.map { Fields.merge(_, fromFields) } - val ag = new MRMAggregator[T,X,U](mapfn, redfn, mapfn2, toFields, startConv, endSetter) - val ev = (pipe => new Every(pipe, fromFields, ag)) : Pipe => Every - assert(middleSetter.arity > 0, - "The middle arity must have definite size, try wrapping in scala.Tuple1 if you need a hack") + projectFields = projectFields.map(Fields.merge(_, fromFields)) + val ag = new MRMAggregator[T, X, U](mapfn, redfn, mapfn2, toFields, startConv, endSetter) + val ev = (pipe => new Every(pipe, fromFields, ag)): Pipe => Every + assert( + middleSetter.arity > 0, + "The middle arity must have definite size, try wrapping in scala.Tuple1 if you need a hack" + ) // Create the required number of middlefields based on the arity of middleSetter - val middleFields = strFields( ScalaRange(0, middleSetter.arity).map {i => 
getNextMiddlefield } ) - val mrmBy = new MRMBy[T,X,U](fromFields, middleFields, toFields, - mapfn, redfn, mapfn2, startConv, middleSetter, middleConv, endSetter) + val middleFields = strFields(ScalaRange(0, middleSetter.arity).map(i => getNextMiddlefield)) + val mrmBy = new MRMBy[T, X, U]( + fromFields, + middleFields, + toFields, + mapfn, + redfn, + mapfn2, + startConv, + middleSetter, + middleConv, + endSetter + ) tryAggregateBy(mrmBy, ev) this } /** - * Corresponds to a Cascading Buffer - * which allows you to stream through the data, keeping some, dropping, scanning, etc... - * The iterator you are passed is lazy, and mapping will not trigger the - * entire evaluation. If you convert to a list (i.e. to reverse), you need to be aware - * that memory constraints may become an issue. + * Corresponds to a Cascading Buffer which allows you to stream through the data, keeping some, dropping, + * scanning, etc... The iterator you are passed is lazy, and mapping will not trigger the entire evaluation. + * If you convert to a list (i.e. to reverse), you need to be aware that memory constraints may become an + * issue. * - * == Warning == - * Any fields not referenced by the input fields will be aligned to the first output, - * and the final hadoop stream will have a length of the maximum of the output of this, and - * the input stream. So, if you change the length of your inputs, the other fields won't - * be aligned. YOU NEED TO INCLUDE ALL THE FIELDS YOU WANT TO KEEP ALIGNED IN THIS MAPPING! - * POB: This appears to be a Cascading design decision. + * ==Warning== + * Any fields not referenced by the input fields will be aligned to the first output, and the final hadoop + * stream will have a length of the maximum of the output of this, and the input stream. So, if you change + * the length of your inputs, the other fields won't be aligned. YOU NEED TO INCLUDE ALL THE FIELDS YOU WANT + * TO KEEP ALIGNED IN THIS MAPPING! 
POB: This appears to be a Cascading design decision. * - * == Warning == - * mapfn needs to be stateless. Multiple calls needs to be safe (no mutable - * state captured) + * ==Warning== + * mapfn needs to be stateless. Multiple calls needs to be safe (no mutable state captured) */ - def mapStream[T,X](fieldDef : (Fields,Fields))(mapfn : (Iterator[T]) => TraversableOnce[X]) - (implicit conv : TupleConverter[T], setter : TupleSetter[X]) = { + def mapStream[T, X]( + fieldDef: (Fields, Fields) + )(mapfn: (Iterator[T]) => TraversableOnce[X])(implicit conv: TupleConverter[T], setter: TupleSetter[X]) = { val (inFields, outFields) = fieldDef - //Check arity + // Check arity conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) - val b = new BufferOp[Unit,T,X]((), - (u : Unit, it: Iterator[T]) => mapfn(it), outFields, conv, setter) + val b = new BufferOp[Unit, T, X]((), (u: Unit, it: Iterator[T]) => mapfn(it), outFields, conv, setter) every(pipe => new Every(pipe, inFields, b, defaultMode(inFields, outFields))) } - - def reverse : GroupBuilder = { + def reverse: GroupBuilder = { assert(reds.isEmpty, "Cannot sort when reducing") assert(!isReversed, "Reverse called a second time! Only one allowed") isReversed = true @@ -239,75 +253,82 @@ class GroupBuilder(val groupFields : Fields) extends } /** - * Analog of standard scanLeft (@see scala.collection.Iterable.scanLeft ) - * This invalidates map-side aggregation, forces all data to be transferred - * to reducers. Use only if you REALLY have to. + * Analog of standard scanLeft (@see scala.collection.Iterable.scanLeft ) This invalidates map-side + * aggregation, forces all data to be transferred to reducers. Use only if you REALLY have to. * - * == Best Practice == + * ==Best Practice== * Make sure init is an immutable object. * - * == Note == - * init needs to be serializable with Kryo (because we copy it for each - * grouping to avoid possible errors using a mutable init object). 
- * We override the default implementation here to use Kryo to serialize - * the initial value, for immutable serializable inits, this is not needed + * ==Note== + * init needs to be serializable with Kryo (because we copy it for each grouping to avoid possible errors + * using a mutable init object). We override the default implementation here to use Kryo to serialize the + * initial value, for immutable serializable inits, this is not needed */ - override def scanLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X) - (implicit setter : TupleSetter[X], conv : TupleConverter[T]) : GroupBuilder = { + override def scanLeft[X, T]( + fieldDef: (Fields, Fields) + )(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): GroupBuilder = { val (inFields, outFields) = fieldDef - //Check arity + // Check arity conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) - val b = new BufferOp[X,T,X](init, + val b = new BufferOp[X, T, X]( + init, // On scala 2.8, there is no scanLeft // On scala 2.9, their implementation creates an off-by-one bug with the unused fields - (i : X, it: Iterator[T]) => new ScanLeftIterator(it, i, fn), - outFields, conv, setter) + (i: X, it: Iterator[T]) => new ScanLeftIterator(it, i, fn), + outFields, + conv, + setter + ) every(pipe => new Every(pipe, inFields, b, defaultMode(inFields, outFields))) } - def groupMode : GroupMode = + def groupMode: GroupMode = (reds, evs, sortF) match { - case (None, Nil, Some(_)) => IdentityMode // no reducers or everys, just a sort - case (Some(Nil), Nil, _) => IdentityMode // no sort, just identity. used to shuffle data - case (None, _, _) => GroupByMode + case (None, Nil, Some(_)) => IdentityMode // no reducers or everys, just a sort + case (Some(Nil), Nil, _) => IdentityMode // no sort, just identity. 
used to shuffle data + case (None, _, _) => GroupByMode case (Some(redList), _, None) => AggregateByMode // use map-side aggregation case _ => sys.error("Invalid GroupBuilder state: %s, %s, %s".format(reds, evs, sortF)) } - protected def groupedPipeOf(name: String, in: Pipe): GroupBy = { - val gb : GroupBy = sortF match { - case None => new GroupBy(name, in, groupFields) + val gb: GroupBy = sortF match { + case None => new GroupBy(name, in, groupFields) case Some(sf) => new GroupBy(name, in, groupFields, sf, isReversed) } overrideReducers(gb) + overrideDescription(gb) gb } - def schedule(name : String, pipe : Pipe) : Pipe = { - val maybeProjectedPipe = projectFields.map { pipe.project(_) }.getOrElse(pipe) + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def schedule(name: String, pipe: Pipe): Pipe = { + val maybeProjectedPipe = projectFields.map(pipe.project(_)).getOrElse(pipe) groupMode match { case GroupByMode => - //In this case we cannot aggregate, so group: + // In this case we cannot aggregate, so group: val start: Pipe = groupedPipeOf(name, maybeProjectedPipe) // Time to schedule the Every operations - evs.foldRight(start) { (op : (Pipe => Every), p) => op(p) } + evs.foldRight(start)((op: (Pipe => Every), p) => op(p)) case IdentityMode => - //This is the case where the group function is identity: { g => g } + // This is the case where the group function is identity: { g => g } groupedPipeOf(name, pipe) case AggregateByMode => - //There is some non-empty AggregateBy to do: + // There is some non-empty AggregateBy to do: val redlist = reds.get - val ag = new AggregateBy(name, - maybeProjectedPipe, - groupFields, - spillThreshold.getOrElse(0), // cascading considers 0 to be the default - redlist.reverse.toArray : _*) + val ag = new AggregateBy( + name, + maybeProjectedPipe, + groupFields, + spillThreshold.getOrElse(0), // cascading considers 0 to be the default + redlist.reverse.toArray: _* + ) overrideReducers(ag.getGroupBy()) + 
overrideDescription(ag.getGroupBy()) ag } } @@ -315,53 +336,57 @@ class GroupBuilder(val groupFields : Fields) extends /** * This invalidates aggregateBy! */ - def sortBy(f : Fields) : GroupBuilder = { + def sortBy(f: Fields): GroupBuilder = { reds = None - sortF = sortF match { - case None => Some(f) + val sort = sortF match { + case None => f case Some(sf) => { sf.append(f) - Some(sf) + sf } } + sortF = Some(sort) // Update projectFields - projectFields = projectFields.map { Fields.merge(_, sortF.get) } + projectFields = projectFields.map(Fields.merge(_, sort)) this } /** - * This is convenience method to allow plugging in blocks - * of group operations similar to `RichPipe.thenDo` + * This is convenience method to allow plugging in blocks of group operations similar to `RichPipe.thenDo` */ - def thenDo(fn : (GroupBuilder) => GroupBuilder) = fn(this) + def thenDo(fn: (GroupBuilder) => GroupBuilder) = fn(this) /** - * An identity function that keeps all the tuples. A hack to implement - * groupAll and groupRandomly. + * An identity function that keeps all the tuples. A hack to implement groupAll and groupRandomly. */ - def pass : GroupBuilder = takeWhile(0) { (t: TupleEntry) => true } + def pass: GroupBuilder = takeWhile(0)((t: TupleEntry) => true) - /** - * begining of block with access to expensive nonserializable state. The state object should - * contain a function release() for resource management purpose. + /** + * beginning of block with access to expensive nonserializable state. The state object should contain a + * function release() for resource management purpose. */ - def using[C <: { def release() }](bf: => C) = new { + def using[C <: { def release(): Unit }](bf: => C) = new { /** * mapStream with state. 
*/ - def mapStream[T,X](fieldDef : (Fields,Fields))(mapfn : (C, Iterator[T]) => TraversableOnce[X]) - (implicit conv : TupleConverter[T], setter : TupleSetter[X]) = { + def mapStream[T, X](fieldDef: (Fields, Fields))( + mapfn: (C, Iterator[T]) => TraversableOnce[X] + )(implicit conv: TupleConverter[T], setter: TupleSetter[X]) = { val (inFields, outFields) = fieldDef - //Check arity + // Check arity conv.assertArityMatches(inFields) setter.assertArityMatches(outFields) - val b = new SideEffectBufferOp[Unit,T,C,X]( - (), bf, - (u : Unit, c : C, it: Iterator[T]) => mapfn(c, it), - new Function1[C, Unit] with java.io.Serializable { def apply(c: C) { c.release() }}, - outFields, conv, setter) + val b = new SideEffectBufferOp[Unit, T, C, X]( + (), + bf, + (u: Unit, c: C, it: Iterator[T]) => mapfn(c, it), + new Function1[C, Unit] with java.io.Serializable { def apply(c: C): Unit = c.release() }, + outFields, + conv, + setter + ) every(pipe => new Every(pipe, inFields, b, defaultMode(inFields, outFields))) } } @@ -369,15 +394,20 @@ class GroupBuilder(val groupFields : Fields) extends } /** - * Scala 2.8 Iterators don't support scanLeft so we have to reimplement - * The Scala 2.9 implementation creates an off-by-one bug with the unused fields in the Fields API + * Scala 2.8 Iterators don't support scanLeft so we have to reimplement The Scala 2.9 implementation creates + * an off-by-one bug with the unused fields in the Fields API */ -class ScanLeftIterator[T,U](it : Iterator[T], init : U, fn : (U,T) => U) extends Iterator[U] with java.io.Serializable { - protected var prev : Option[U] = None - def hasNext : Boolean = { prev.isEmpty || it.hasNext } +class ScanLeftIterator[T, U](it: Iterator[T], init: U, fn: (U, T) => U) + extends Iterator[U] + with java.io.Serializable { + protected var prev: Option[U] = None + def hasNext: Boolean = prev.isEmpty || it.hasNext + // Don't use pattern matching in a performance-critical section + 
@SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) def next = { - prev = prev.map { fn(_, it.next) } - .orElse(Some(init)) + prev = prev + .map(fn(_, it.next)) + .orElse(Some(init)) prev.get } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala new file mode 100644 index 0000000000..98882187b6 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/HfsConfPropertySetter.scala @@ -0,0 +1,79 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import cascading.tap.SinkMode +import org.apache.hadoop.mapred.JobConf +import cascading.flow.FlowProcess +import org.apache.hadoop.mapred.RecordReader +import org.apache.hadoop.mapred.OutputCollector +import cascading.scheme.Scheme +import cascading.tap.hadoop.Hfs +import com.twitter.scalding.tap.ScaldingHfs + +private[scalding] class ConfPropertiesHfsTap( + sourceConfig: Config, + sinkConfig: Config, + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], + stringPath: String, + sinkMode: SinkMode +) extends ScaldingHfs(scheme, stringPath, sinkMode) { + override def sourceConfInit(process: FlowProcess[JobConf], conf: JobConf): Unit = { + sourceConfig.toMap.foreach { case (k, v) => + conf.set(k, v) + } + super.sourceConfInit(process, conf) + } + + override def sinkConfInit(process: FlowProcess[JobConf], conf: JobConf): Unit = { + sinkConfig.toMap.foreach { case (k, v) => + conf.set(k, v) + } + super.sinkConfInit(process, conf) + } +} + +/* + * The HfsConfPropertySetter can be added to sources to allow close in changes + * to the Hadoop configuration properties for a source/sink in the flow. + * Operations like changing the split sizes can be done here. + * + * Changes here however will not show up in the hadoop UI + */ +trait HfsConfPropertySetter extends HfsTapProvider { + @deprecated( + "Tap config is deprecated, use sourceConfig or sinkConfig directly. 
In cascading configs applied to sinks can leak to sources in the step writing to the sink.", + "0.17.0" + ) + def tapConfig: Config = Config.empty + + def sourceConfig: Config = Config.empty + def sinkConfig: Config = Config.empty + + override def createHfsTap( + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _], + path: String, + sinkMode: SinkMode + ): Hfs = { + // Deprecation handling + val (srcCfg, sinkCfg) = if (sourceConfig == Config.empty && sinkConfig == Config.empty) { + (tapConfig, tapConfig) + } else { + (sourceConfig, sinkConfig) + } + new ConfPropertiesHfsTap(srcCfg, sinkCfg, scheme, path, sinkMode) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala b/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala index 739d65f2c3..78d506d4a7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/IntegralComparator.scala @@ -12,35 +12,36 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ -package com.twitter.scalding; +package com.twitter.scalding -import cascading.tuple.Hasher; +import cascading.tuple.Hasher -import java.io.Serializable; -import java.util.Comparator; +import java.io.Serializable +import java.util.Comparator /* * Handles numerical hashing properly */ class IntegralComparator extends Comparator[AnyRef] with Hasher[AnyRef] with Serializable { - val integralTypes : Set[Class[_]] = Set(classOf[java.lang.Long], - classOf[java.lang.Integer], - classOf[java.lang.Short], - classOf[java.lang.Byte]) + val integralTypes: Set[Class[_]] = Set( + classOf[java.lang.Long], + classOf[java.lang.Integer], + classOf[java.lang.Short], + classOf[java.lang.Byte] + ) - def isIntegral(boxed : AnyRef) = integralTypes(boxed.getClass) + def isIntegral(boxed: AnyRef) = integralTypes(boxed.getClass) - override def compare(a1: AnyRef, a2: AnyRef) : Int = { - val a1IsNull = if (null == a1) 1 else 0 - val a2IsNull = if (null == a2) 1 else 0 + override def compare(a1: AnyRef, a2: AnyRef): Int = { + val a1IsNull = if (a1 == null) 1 else 0 + val a2IsNull = if (a2 == null) 1 else 0 if (a1IsNull + a2IsNull > 0) { - //if a2IsNull, but a1IsNot, a2 is less: + // if a2IsNull, but a1IsNot, a2 is less: a2IsNull - a1IsNull - } - else if (isIntegral(a1) && isIntegral(a2)) { + } else if (isIntegral(a1) && isIntegral(a2)) { val long1 = a1.asInstanceOf[Number].longValue val long2 = a2.asInstanceOf[Number].longValue if (long1 < long2) @@ -49,23 +50,17 @@ class IntegralComparator extends Comparator[AnyRef] with Hasher[AnyRef] with Ser 1 else 0 - } - else + } else a1.asInstanceOf[Comparable[AnyRef]].compareTo(a2) } - override def hashCode(obj : AnyRef) : Int = { - if (null == obj) { + override def hashCode(obj: AnyRef): Int = + if (obj == null) { 0 - } - else if (isIntegral(obj)) { - obj.asInstanceOf[Number] - .longValue - .hashCode - } - else { - //Use the default: + } else if (isIntegral(obj)) { + obj.asInstanceOf[Number].longValue.hashCode + } else { + // Use the 
default: obj.hashCode } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala index 5a3eeca11b..39b5142a96 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/IterableSource.scala @@ -12,63 +12,63 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import com.twitter.maple.tap.MemorySourceTap -import cascading.flow.FlowProcess -import cascading.scheme.local.{TextDelimited => CLTextDelimited} -import cascading.scheme.Scheme import cascading.tap.Tap import cascading.tuple.Tuple import cascading.tuple.Fields import cascading.scheme.NullScheme -import java.io.{InputStream,OutputStream} - -import org.apache.hadoop.mapred.JobConf -import org.apache.hadoop.mapred.OutputCollector -import org.apache.hadoop.mapred.RecordReader +import java.io.{InputStream, OutputStream} import scala.collection.mutable.Buffer import scala.collection.JavaConverters._ /** - * Allows working with an iterable object defined in the job (on the submitter) - * to be used within a Job as you would a Pipe/RichPipe + * Allows working with an iterable object defined in the job (on the submitter) to be used within a Job as you + * would a Pipe/RichPipe * - * These lists should probably be very tiny by Hadoop standards. If they are - * getting large, you should probably dump them to HDFS and use the normal - * mechanisms to address the data (a FileSource). + * These lists should probably be very tiny by Hadoop standards. If they are getting large, you should + * probably dump them to HDFS and use the normal mechanisms to address the data (a FileSource). 
*/ -case class IterableSource[+T](@transient iter: Iterable[T], inFields : Fields = Fields.NONE) - (implicit set: TupleSetter[T], conv: TupleConverter[T]) extends Source with Mappable[T] { +case class IterableSource[+T](@transient iter: Iterable[T], inFields: Fields = Fields.NONE)(implicit + set: TupleSetter[T], + conv: TupleConverter[T] +) extends Source + with Mappable[T] { - def fields = { + def fields = if (inFields.isNone && set.arity > 0) { Dsl.intFields(0 until set.arity) - } - else inFields - } + } else inFields - override def converter[U>:T] = TupleConverter.asSuperConverter[T, U](conv) + override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) @transient - private val asBuffer : Buffer[Tuple] = iter.map { set(_) }.toBuffer + private val asBuffer: Buffer[Tuple] = iter.map(set(_)).toBuffer - private lazy val hdfsTap : Tap[_,_,_] = new MemorySourceTap(asBuffer.asJava, fields) + private lazy val hdfsTap: Tap[_, _, _] = new MemorySourceTap(asBuffer.asJava, fields) - override def createTap(readOrWrite : AccessMode)(implicit mode : Mode) : Tap[_,_,_] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { if (readOrWrite == Write) { sys.error("IterableSource is a Read-only Source") } mode match { - case Local(_) => new MemoryTap[InputStream,OutputStream](new NullScheme(fields, fields), asBuffer) - case Test(_) => new MemoryTap[InputStream,OutputStream](new NullScheme(fields, fields), asBuffer) + case Local(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), asBuffer) + case Test(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), asBuffer) case Hdfs(_, _) => hdfsTap - case HadoopTest(_,_) => hdfsTap - case _ => sys.error("Unsupported mode for IterableSource: " + mode.toString) + case HadoopTest(_, _) => hdfsTap + case _ => throw ModeException("Unsupported mode for IterableSource: " + mode.toString) } } + + /** + * Don't use the whole string of the 
iterable, which can be huge. We take the first 10 items + the + * identityHashCode of the iter. + */ + override val sourceId: String = + "IterableSource(%s)-%d".format(iter.take(10).toString, System.identityHashCode(iter)) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Job.scala b/scalding-core/src/main/scala/com/twitter/scalding/Job.scala index b886038485..44cecd2de8 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Job.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Job.scala @@ -12,55 +12,148 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import com.twitter.chill.config.{ScalaAnyRefMapConfig, ConfiguredInstantiator} - -import cascading.pipe.assembly.AggregateBy -import cascading.flow.{Flow, FlowDef, FlowProps, FlowListener, FlowSkipStrategy, FlowStepStrategy} +import cascading.flow.{ + Flow, + FlowDef, + FlowListener, + FlowSkipStrategy, + FlowStep, + FlowStepListener, + FlowStepStrategy +} import cascading.pipe.Pipe import cascading.property.AppProps -import cascading.tuple.collect.SpillableProps import cascading.stats.CascadingStats +import com.twitter.algebird.Semigroup +import com.twitter.scalding.typed.cascading_backend.{CascadingBackend, CascadingExtensions} + import org.apache.hadoop.io.serializer.{Serialization => HSerialization} -//For java -> scala implicits on collections -import scala.collection.JavaConversions._ +import scala.concurrent.{Future, Promise} +import scala.util.{Failure, Success, Try} -import java.io.{ BufferedWriter, File, FileOutputStream, OutputStreamWriter } -import java.util.{Calendar, UUID} +import java.io.{BufferedWriter, FileOutputStream, OutputStreamWriter} +import java.util.{List => JList} -import java.util.concurrent.{Executors, TimeUnit, ThreadFactory, 
Callable, TimeoutException} +import java.util.concurrent.{Callable, Executors, ThreadFactory, TimeUnit, TimeoutException} import java.util.concurrent.atomic.AtomicInteger -import java.security.MessageDigest object Job { - val UNIQUE_JOB_ID = "scalding.job.uniqueId" + /** - * Use reflection to create the job by name. We use the thread's - * context classloader so that classes in the submitted jar and any - * jars included via -libjar can be found. + * Use reflection to create the job by name. We use the thread's context classloader so that classes in the + * submitted jar and any jars included via -libjar can be found. */ - def apply(jobName : String, args : Args) : Job = { - Class.forName(jobName, true, Thread.currentThread().getContextClassLoader) + def apply(jobName: String, args: Args): Job = + Class + .forName(jobName, true, Thread.currentThread().getContextClassLoader) .getConstructor(classOf[Args]) .newInstance(args) .asInstanceOf[Job] - } + + /** + * Make a job reflectively from the given class and the Args contained in the Config. + */ + def makeJob[J <: Job](cls: Class[J]): Execution[J] = + Execution.getConfigMode.flatMap { case (conf, mode) => + // Now we need to allocate the job + Execution.from { + val argsWithMode = Mode.putMode(mode, conf.getArgs) + cls + .getConstructor(classOf[Args]) + .newInstance(argsWithMode) + } + } + + /** + * Create a job reflectively from a class, which handles threading through the Args and Mode correctly in + * the way Job subclasses expect + */ + def toExecutionFromClass[J <: Job](cls: Class[J], onEmpty: Execution[Unit]): Execution[Unit] = + makeJob(cls).flatMap(toExecution(_, onEmpty)) + + /** + * Convert Jobs that only use the TypedPipe API to an Execution + * + * This can fail for some exotic jobs, but for standard subclasses of Job (that don't override existing + * methods in Job except config) it should work + * + * onEmpty is the execution to run if you have an empty job. 
Common values might be Execution.unit or + * Execution.failed(new Exeception("unexpected empty execution")) + */ + def toExecution(job: Job, onEmpty: Execution[Unit]): Execution[Unit] = + job match { + case (exJob: ExecutionJob[_]) => exJob.execution.unit + case _ => + val ex = CascadingBackend.flowDefToExecution(job.flowDef, None).getOrElse(onEmpty) + + // next may have a side effect so we + // evaluate this *after* the current Execution + val nextJobEx: Execution[Unit] = + Execution.from(job.next).flatMap { // putting inside Execution.from memoizes this call + case None => Execution.unit + case Some(nextJob) => toExecution(nextJob, onEmpty) + } + + for { + conf <- Execution.fromTry(Config.tryFrom(job.config)) + // since we are doing an Execution[Unit], it is always safe to cleanup temp on finish + _ <- Execution.withConfig(ex)(_ => conf.setExecutionCleanupOnFinish(true)) + _ <- nextJobEx + } yield () + } +} + +trait UntypedPipeExtensions0 extends UntypedPipeExtensions1 { + + /** + * you should never call this directly, it is here to make the DSL work. Just know, you can treat a Pipe as + * a RichPipe within a Job + */ + implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe) + + // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields + implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe = + IterableSource[T](iter)(set, conv).read(flowDef, mode) + + implicit def iterableToRichPipe[T]( + iter: Iterable[T] + )(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe = + RichPipe(toPipe(iter)(set, conv)) } -/** Job is a convenience class to make using Scalding easier. 
- * Subclasses of Job automatically have a number of nice implicits to enable more concise - * syntax, including: - * conversion from Pipe, Source or Iterable to RichPipe - * conversion from Source or Iterable to Pipe - * conversion to collections or Tuple[1-22] to cascading.tuple.Fields +trait UntypedPipeExtensions1 { + def mode: Mode + protected def flowDef: FlowDef + + // We do put things here to lower the priority below typed extensions we + // mix in at the same level as UntypedCascadingExtensions0 + // this is using subclassing to control priority + + /** + * This implicit is to enable RichPipe methods directly on Source objects, such as map/flatMap, etc... + * + * Note that Mappable is a subclass of Source, and Mappable already has mapTo and flatMapTo BUT WITHOUT + * incoming fields used (see the Mappable trait). This creates some confusion when using these methods (this + * is an unfortunate mistake in our design that was not noticed until later). To remove ambiguity, + * explicitly call .read on any Source that you begin operating with a mapTo/flatMapTo. + */ + implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read(flowDef, mode)) +} + +/** + * Job is a convenience class to make using Scalding easier. Subclasses of Job automatically have a number of + * nice implicits to enable more concise syntax, including: conversion from Pipe, Source or Iterable to + * RichPipe conversion from Source or Iterable to Pipe conversion to collections or Tuple[1-22] to + * cascading.tuple.Fields * - * Additionally, the job provides an implicit Mode and FlowDef so that functions that - * register starts or ends of a flow graph, specifically anything that reads or writes data - * on Hadoop, has the needed implicits available. + * Additionally, the job provides an implicit Mode and FlowDef so that functions that register starts or ends + * of a flow graph, specifically anything that reads or writes data on Hadoop, has the needed implicits + * available. 
* * If you want to write code outside of a Job, you will want to either: * @@ -68,195 +161,200 @@ object Job { * * OR: * - * write code that rather than returning values, it returns a (FlowDef, Mode) => T, - * these functions can be combined Monadically using algebird.monad.Reader. + * write code that rather than returning values, it returns a (FlowDef, Mode) => T, these functions can be + * combined Monadically using algebird.monad.Reader. */ -class Job(val args : Args) extends FieldConversions with java.io.Serializable { +class Job(val args: Args) + extends FieldConversions + with CascadingExtensions + with UntypedPipeExtensions0 + with java.io.Serializable { + Tracing.init() + // Set specific Mode implicit def mode: Mode = Mode.getMode(args).getOrElse(sys.error("No Mode defined")) - // This allows us to register this job in a global space when processing on the cluster - // and find it again. - // E.g. stats can all locate the same job back again to find the right flowProcess - final implicit val uniqueId = UniqueID(UUID.randomUUID.toString) - - // Use this if a map or reduce phase takes a while before emitting tuples. - def keepAlive { - val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueId.get) - flowProcess.keepAlive - } - /** - * you should never call this directly, it is here to make - * the DSL work. Just know, you can treat a Pipe as a RichPipe - * within a Job - */ - implicit def pipeToRichPipe(pipe : Pipe): RichPipe = new RichPipe(pipe) - /** - * This implicit is to enable RichPipe methods directly on Source - * objects, such as map/flatMap, etc... - * - * Note that Mappable is a subclass of Source, and Mappable already - * has mapTo and flatMapTo BUT WITHOUT incoming fields used (see - * the Mappable trait). This creates some confusion when using these methods - * (this is an unfortuate mistake in our design that was not noticed until later). 
- * To remove ambiguity, explicitly call .read on any Source that you begin - * operating with a mapTo/flatMapTo. + * Use this if a map or reduce phase takes a while before emitting tuples. */ - implicit def sourceToRichPipe(src : Source): RichPipe = new RichPipe(src.read) - - // This converts an Iterable into a Pipe or RichPipe with index (int-based) fields - implicit def toPipe[T](iter : Iterable[T])(implicit set: TupleSetter[T], conv : TupleConverter[T]): Pipe = - IterableSource[T](iter)(set, conv).read + def keepAlive(): Unit = { + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueId) + flowProcess.keepAlive() + } - implicit def iterableToRichPipe[T](iter : Iterable[T]) - (implicit set: TupleSetter[T], conv : TupleConverter[T]): RichPipe = - RichPipe(toPipe(iter)(set, conv)) + // Provide args as an implicit val for extensions such as the Checkpoint extension. + implicit protected def _implicitJobArgs: Args = args - // Override this if you want change how the mapred.job.name is written in Hadoop - def name : String = getClass.getName + // Override this if you want to change how the mapred.job.name is written in Hadoop + def name: String = Config.defaultFrom(mode).toMap.getOrElse("mapred.job.name", getClass.getName) - //This is the FlowDef used by all Sources this job creates + // This is the FlowDef used by all Sources this job creates @transient - implicit protected val flowDef = { + implicit protected val flowDef: FlowDef = { val fd = new FlowDef fd.setName(name) fd } - /** Copy this job - * By default, this uses reflection and the single argument Args constructor + // Do this before the job is submitted, because the flowDef is transient + protected implicit val uniqueId: UniqueID = UniqueID.fromSystemHashCode(flowDef) + + /** + * Copy this job By default, this uses reflection and the single argument Args constructor */ def clone(nextargs: Args): Job = this.getClass - .getConstructor(classOf[Args]) - .newInstance(Mode.putMode(mode, nextargs)) - 
.asInstanceOf[Job] + .getConstructor(classOf[Args]) + .newInstance(Mode.putMode(mode, nextargs)) + .asInstanceOf[Job] /** - * Implement this method if you want some other jobs to run after the current - * job. These will not execute until the current job has run successfully. - */ - def next : Option[Job] = None + * Implement this method if you want some other jobs to run after the current job. These will not execute + * until the current job has run successfully. + */ + def next: Option[Job] = None - /** Keep 100k tuples in memory by default before spilling - * Turn this up as high as you can without getting OOM. + /** + * Keep 100k tuples in memory by default before spilling Turn this up as high as you can without getting + * OOM. * - * This is ignored if there is a value set in the incoming mode.config + * This is ignored if there is a value set in the incoming jobConf on Hadoop */ def defaultSpillThreshold: Int = 100 * 1000 /** Override this to control how dates are parsed */ implicit def dateParser: DateParser = DateParser.default - def fromInputStream(s: java.io.InputStream): Array[Byte] = - Stream.continually(s.read).takeWhile(-1 !=).map(_.toByte).toArray + // Generated the MD5 hex of the bytes in the job classfile + def classIdentifier: String = Config.md5Identifier(getClass) - def toHexString(bytes: Array[Byte]): String = - bytes.map("%02X".format(_)).mkString + /** + * This is the exact config that is passed to the Cascading FlowConnector. 
By default: if there are no spill + * thresholds in mode.config, we replace with defaultSpillThreshold we overwrite io.serializations with + * ioSerializations we overwrite cascading.tuple.element.comparator.default to defaultComparator we add some + * scalding keys for debugging/logging + * + * Tip: override this method, call super, and ++ your additional map to add or overwrite more options + * + * This returns Map[AnyRef, AnyRef] for compatibility with older code + */ + def config: Map[AnyRef, AnyRef] = { + val base = Config.empty + .setListSpillThreshold(defaultSpillThreshold) + .setMapSpillThreshold(defaultSpillThreshold) + .setMapSideAggregationThreshold(defaultSpillThreshold) + + // This is setting a property for cascading/driven + AppProps.addApplicationFramework(null, String.format("scalding:%s", scaldingVersion)) + + val modeConf = mode match { + case h: HadoopMode => Config.fromHadoop(h.jobConf) + case _: CascadingLocal => Config.unitTestDefault + case _ => Config.empty + } - def md5Hex(bytes: Array[Byte]): String = { - val md = MessageDigest.getInstance("MD5") - md.update(bytes) - toHexString(md.digest) + val init = base ++ modeConf + + defaultComparator + .map(init.setDefaultComparator) + .getOrElse(init) + .setSerialization(Right(classOf[serialization.KryoHadoop]), ioSerializations) + .addCascadingClassSerializationTokens(reflectedClasses) + .setScaldingVersion + .setCascadingAppName(name) + .setCascadingAppId(name) + .setScaldingFlowClass(getClass) + .setArgs(args) + .maybeSetSubmittedTimestamp() + ._2 + .toMap + .toMap[AnyRef, AnyRef] // linter:disable:TypeToType // the second one is to lift from String -> AnyRef } - // Generated the MD5 hex of the the bytes in the job classfile - lazy val classIdentifier : String = { - val classAsPath = getClass.getName.replace(".", "/") + ".class" - val is = getClass.getClassLoader.getResourceAsStream(classAsPath) - val bytes = fromInputStream(is) - is.close() - md5Hex(bytes) - } + private def reflectedClasses: 
Set[Class[_]] = + if (args.optional(Args.jobClassReflection).map(_.toBoolean).getOrElse(true)) { + ReferencedClassFinder.findReferencedClasses(getClass) + } else Set.empty - /** This is the exact config that is passed to the Cascading FlowConnector. - * By default: - * if there are no spill thresholds in mode.config, we replace with defaultSpillThreshold - * we overwrite io.serializations with ioSerializations - * we overwrite cascading.tuple.element.comparator.default to defaultComparator - * we add some scalding keys for debugging/logging - * - * Tip: override this method, call super, and ++ your additional - * map to add or overwrite more options + /** + * This is here so that Mappable.toIterator can find an implicit config */ - def config: Map[AnyRef,AnyRef] = { - // These are ignored if set in mode.config - val lowPriorityDefaults = - Map(SpillableProps.LIST_THRESHOLD -> defaultSpillThreshold.toString, - SpillableProps.MAP_THRESHOLD -> defaultSpillThreshold.toString, - AggregateBy.AGGREGATE_BY_THRESHOLD -> defaultSpillThreshold.toString - ) - // Set up the keys for chill - val chillConf = ScalaAnyRefMapConfig(lowPriorityDefaults) - ConfiguredInstantiator.setReflect(chillConf, classOf[serialization.KryoHadoop]) - - System.setProperty(AppProps.APP_FRAMEWORKS, - String.format("scalding:%s", scaldingVersion)) - - chillConf.toMap ++ - mode.config ++ - // Optionally set a default Comparator - (defaultComparator match { - case Some(defcomp) => Map(FlowProps.DEFAULT_ELEMENT_COMPARATOR -> defcomp.getName) - case None => Map.empty[AnyRef, AnyRef] - }) ++ - Map( - "io.serializations" -> ioSerializations.map { _.getName }.mkString(","), - "scalding.version" -> scaldingVersion, - "cascading.app.name" -> name, - "cascading.app.id" -> name, - "scalding.flow.class.name" -> getClass.getName, - "scalding.flow.class.signature" -> classIdentifier, - "scalding.job.args" -> args.toString, - Job.UNIQUE_JOB_ID -> uniqueId.get, - "scalding.flow.submitted.timestamp" -> - 
Calendar.getInstance().getTimeInMillis().toString - ) - } + implicit protected def scaldingConfig: Config = Config.tryFrom(config).get def skipStrategy: Option[FlowSkipStrategy] = None + /** + * Specify a callback to run before the start of each flow step. + * + * Defaults to what Config.getReducerEstimator specifies. + * @see + * ExecutionContext.buildFlow + */ def stepStrategy: Option[FlowStepStrategy[_]] = None + private def executionContext: Try[ExecutionContext] = + Config.tryFrom(config).map { conf => + ExecutionContext.newContext(conf)(flowDef, mode) + } + /** * combine the config, flowDef and the Mode to produce a flow */ - def buildFlow: Flow[_] = { - val flow = mode.newFlowConnector(config).connect(flowDef) - listeners.foreach { flow.addListener(_) } - skipStrategy.foreach { flow.setFlowSkipStrategy(_) } - stepStrategy.foreach { flow.setFlowStepStrategy(_) } - flow - } + def buildFlow: Flow[_] = + executionContext + .flatMap(_.buildFlow) + .flatMap[Flow[_]] { + case None => + Failure(new IllegalStateException("sink taps are required")) + case Some(flow) => + listeners.foreach(flow.addListener(_)) + stepListeners.foreach(flow.addStepListener(_)) + skipStrategy.foreach(flow.setFlowSkipStrategy(_)) + stepStrategy.foreach { strategy => + val existing = flow.getFlowStepStrategy + val composed = + if (existing == null) + strategy + else + FlowStepStrategies[Any].plus( + existing.asInstanceOf[FlowStepStrategy[Any]], + strategy.asInstanceOf[FlowStepStrategy[Any]] + ) + flow.setFlowStepStrategy(composed) + } + Success(flow) + } + .get // called before run // only override if you do not use flowDef - def validate { + def validate(): Unit = { + CascadingBackend.planTypedWrites(flowDef, mode) FlowStateMap.validateSources(flowDef, mode) } - // called after successfull run + // called after successful run // only override if you do not use flowDef - def clear { + def clear(): Unit = FlowStateMap.clear(flowDef) - } - protected def handleStats(statsData: CascadingStats) 
{ + protected def handleStats(statsData: CascadingStats): Unit = { scaldingCascadingStats = Some(statsData) // TODO: Why the two ways to do stats? Answer: jank-den. - if(args.boolean("scalding.flowstats")) { + if (args.boolean("scalding.flowstats")) { val statsFilename = args.getOrElse("scalding.flowstats", name + "._flowstats.json") val br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(statsFilename), "utf-8")) br.write(JobStats(statsData).toJson) - br.close + br.close() } - // Print custom counters unless --scalding.nocounters is used + // Print custom counters unless --scalding.nocounters is used or there are no custom stats if (!args.boolean("scalding.nocounters")) { - implicit val statProvider = statsData - println("Dumping custom counters:") - Stats.getAllCustomCounters.foreach { case (counter, value) => - println("%s\t%s".format(counter, value)) + val jobStats = Stats.getAllCustomCounters()(statsData) + if (!jobStats.isEmpty) { + println("Dumping custom counters:") + jobStats.foreach { case (counter, value) => + println("%s\t%s".format(counter, value)) + } } } } @@ -267,45 +365,53 @@ class Job(val args : Args) extends FieldConversions with java.io.Serializable { @transient private[scalding] var scaldingCascadingStats: Option[CascadingStats] = None - //Override this if you need to do some extra processing other than complete the flow - def run: Boolean = { + /** + * Save the Flow object after a run to allow clients to inspect the job. 
+ * @see + * HadoopPlatformJobTest + */ + @transient + private[scalding] var completedFlow: Option[Flow[_]] = None + + // Override this if you need to do some extra processing other than complete the flow + def run(): Boolean = { val flow = buildFlow - flow.complete + flow.complete() val statsData = flow.getFlowStats - handleStats(statsData) + completedFlow = Some(flow) statsData.isSuccessful } - //override this to add any listeners you need - def listeners : List[FlowListener] = Nil + // override these to add any listeners you need + def listeners: List[FlowListener] = Nil + def stepListeners: List[FlowStepListener] = Nil + + /** + * These are user-defined serializations IN-ADDITION to (but deduped) with the required serializations + */ + def ioSerializations: List[Class[_ <: HSerialization[_]]] = Nil - /** The exact list of Hadoop serializations passed into the config - * These replace the config serializations - * Cascading tuple serialization should be in this list, and probably - * before any custom code + /** + * Override this if you want to customize comparisons/hashing for your job the config method overwrites + * using this before sending to cascading The one we use by default is needed used to make Joins in the + * Fields-API more robust to Long vs Int differences. If you only use the Typed-API, consider changing this + * to return None */ - def ioSerializations: List[Class[_ <: HSerialization[_]]] = List( - classOf[org.apache.hadoop.io.serializer.WritableSerialization], - classOf[cascading.tuple.hadoop.TupleSerialization], - classOf[com.twitter.chill.hadoop.KryoSerialization] - ) - /** Override this if you want to customize comparisons/hashing for your job - * the config method overwrites using this before sending to cascading - */ def defaultComparator: Option[Class[_ <: java.util.Comparator[_]]] = Some(classOf[IntegralComparator]) /** - * This is implicit so that a Source can be used as the argument - * to a join or other method that accepts Pipe. 
+ * This is implicit so that a Source can be used as the argument to a join or other method that accepts + * Pipe. */ - implicit def read(src : Source) : Pipe = src.read - /** This is only here for Java jobs which cannot automatically - * access the implicit Pipe => RichPipe which makes: pipe.write( ) - * convenient + implicit def read(src: Source): Pipe = src.read + + /** + * This is only here for Java jobs which cannot automatically access the implicit Pipe => RichPipe which + * makes: pipe.write( ) convenient */ - def write(pipe : Pipe, src : Source) {src.writeFrom(pipe)} + def write(pipe: Pipe, src: Source): Unit = src.writeFrom(pipe) /* * Need to be lazy to be used within pipes. @@ -319,10 +425,10 @@ class Job(val args : Args) extends FieldConversions with java.io.Serializable { * TODO: once we have a mechanism to access FlowProcess from user functions, we can use this * function to allow long running jobs by notifying Cascading of progress. */ - def timeout[T](timeout: AbsoluteDuration)(t: =>T): Option[T] = { + def timeout[T](timeout: AbsoluteDuration)(t: => T): Option[T] = { val f = timeoutExecutor.submit(new Callable[Option[T]] { def call(): Option[T] = Some(t) - }); + }) try { f.get(timeout.toMillisecs, TimeUnit.MILLISECONDS) } catch { @@ -352,24 +458,21 @@ class NamedPoolThreadFactory(name: String, makeDaemons: Boolean) extends ThreadF } } - /** -* Sets up an implicit dateRange to use in your sources and an implicit -* timezone. -* Example args: --date 2011-10-02 2011-10-04 --tz UTC -* If no timezone is given, Pacific is assumed. -*/ + * Sets up an implicit dateRange to use in your sources and an implicit timezone. Example args: --date + * 2011-10-02 2011-10-04 --tz UTC If no timezone is given, Pacific is assumed. + */ trait DefaultDateRangeJob extends Job { - //Get date implicits and PACIFIC and UTC vals. + // Get date implicits and PACIFIC and UTC vals. import DateOps._ // Optionally take --tz argument, or use Pacific time. 
Derived classes may // override defaultTimeZone to change the default. def defaultTimeZone = PACIFIC - implicit lazy val tz = args.optional("tz") match { - case Some(tzn) => java.util.TimeZone.getTimeZone(tzn) - case None => defaultTimeZone - } + implicit lazy val tz: java.util.TimeZone = args.optional("tz") match { + case Some(tzn) => java.util.TimeZone.getTimeZone(tzn) + case None => defaultTimeZone + } // Optionally take a --period, which determines how many days each job runs over (rather // than over the whole date range) @@ -387,17 +490,19 @@ trait DefaultDateRangeJob extends Job { (s, e) } - implicit lazy val dateRange = DateRange(startDate, if (period > 0) startDate + Days(period) - Millisecs(1) else endDate) + implicit lazy val dateRange: DateRange = + DateRange(startDate, if (period > 0) startDate + Days(period) - Millisecs(1) else endDate) - override def next : Option[Job] = + override def next: Option[Job] = if (period > 0) { val nextStartDate = startDate + Days(period) if (nextStartDate + Days(period - 1) > endDate) - None // we're done - else // return a new job with the new startDate - Some(clone(args + ("date" -> List(nextStartDate.toString("yyyy-MM-dd"), endDate.toString("yyyy-MM-dd"))))) - } - else + None // we're done + else // return a new job with the new startDate + Some( + clone(args + ("date" -> List(nextStartDate.toString("yyyy-MM-dd"), endDate.toString("yyyy-MM-dd")))) + ) + } else None } @@ -406,8 +511,52 @@ trait UtcDateRangeJob extends DefaultDateRangeJob { override def defaultTimeZone = DateOps.UTC } -// Used to inject a typed unique identifier into the Job class -case class UniqueID(get: String) +/** + * This is a simple job that allows you to launch Execution[T] instances using scalding.Tool and scald.rb. You + * cannot print the graph. 
+ */ +abstract class ExecutionJob[+T](args: Args) extends Job(args) { + import scala.concurrent.{Await, ExecutionContext => scEC} + + /** + * To avoid serialization issues, this should not be a val, but a def, and prefer to keep as much as + * possible inside the method. + */ + def execution: Execution[T] + + /* + * Override this to control the execution context used + * to execute futures + */ + protected def concurrentExecutionContext: scEC = scEC.global + + @transient private[this] val resultPromise: Promise[T] = Promise[T]() + def result: Future[T] = resultPromise.future + + override def buildFlow: Flow[_] = + sys.error( + "ExecutionJobs do not have a single accessible flow. " + + "You cannot print the graph as it may be dynamically built or recurrent" + ) + + final override def run = { + val r = Config + .tryFrom(config) + .map { conf => + Await.result( + execution.run(conf, mode)(concurrentExecutionContext), + scala.concurrent.duration.Duration.Inf + ) + } + if (!resultPromise.tryComplete(r)) { + // The test framework can call this more than once. + println("Warning: run called more than once, should not happen in production") + } + // Force an exception if the run failed + r.get + true + } +} /* * Run a list of shell commands through bash in the given order. Return success @@ -415,23 +564,57 @@ case class UniqueID(get: String) * failing command is printed to stdout. 
*/ class ScriptJob(cmds: Iterable[String]) extends Job(Args("")) { - override def run = { + override def run = try { - cmds.dropWhile { - cmd: String => { - new java.lang.ProcessBuilder("bash", "-c", cmd).start().waitFor() match { - case x if x != 0 => - println(cmd + " failed, exitStatus: " + x) - false - case 0 => true - } + cmds.dropWhile { cmd: String => + new java.lang.ProcessBuilder("bash", "-c", cmd).start().waitFor() match { + case x if x != 0 => + println(cmd + " failed, exitStatus: " + x) + false + case 0 => true } }.isEmpty } catch { - case e : Exception => { + case e: Exception => { e.printStackTrace false } } - } +} + +/** + * Allows custom counter verification logic when the job completes. + */ +trait CounterVerification extends Job { + + /** + * Verify counter values. The job will fail if this returns false or throws an exception. + */ + def verifyCounters(counters: Map[StatKey, Long]): Try[Unit] + + /** + * Override this to false to skip counter verification in tests. + */ + def verifyCountersInTest: Boolean = true + + override def listeners: List[FlowListener] = + if (this.mode.isInstanceOf[TestMode] && !this.verifyCountersInTest) { + super.listeners + } else { + super.listeners :+ new StatsFlowListener(this.verifyCounters) + } +} + +private[scalding] case class FlowStepStrategies[A]() extends Semigroup[FlowStepStrategy[A]] { + + /** + * Returns a new FlowStepStrategy that runs both strategies in sequence. 
+ */ + def plus(l: FlowStepStrategy[A], r: FlowStepStrategy[A]): FlowStepStrategy[A] = + new FlowStepStrategy[A] { + override def apply(flow: Flow[A], predecessorSteps: JList[FlowStep[A]], flowStep: FlowStep[A]): Unit = { + l.apply(flow, predecessorSteps, flowStep) + r.apply(flow, predecessorSteps, flowStep) + } + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala b/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala deleted file mode 100644 index 2e1a6ec2d2..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/JobStats.scala +++ /dev/null @@ -1,77 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding - -import java.io.{ File, OutputStream } -import scala.collection.JavaConverters._ -import cascading.flow.Flow -import cascading.stats.{CascadeStats, CascadingStats, FlowStats} - -import scala.util.Try - -object JobStats { - def apply(stats: CascadingStats): JobStats = { - val m = statsMap(stats) - new JobStats( - stats match { - case cs: CascadeStats => m - case fs: FlowStats => m + ("flow_step_stats" -> fs.getFlowStepStats.asScala.map(statsMap)) - } - ) - } - - private def counterMap(stats: CascadingStats): Map[String, Any] = - stats.getCounterGroups.asScala.map { group => - (group, stats.getCountersFor(group).asScala.map { counter => - (counter, stats.getCounterValue(group, counter)) - }.toMap) - }.toMap - - private def statsMap(stats: CascadingStats): Map[String, Any] = - Map( - "counters" -> counterMap(stats), - "duration" -> stats.getDuration, - "finished_time" -> stats.getFinishedTime, - "id" -> stats.getID, - "name" -> stats.getName, - "run_time" -> stats.getRunTime, - "start_time" -> stats.getStartTime, - "submit_time" -> stats.getSubmitTime, - "failed" -> stats.isFailed, - "skipped" -> stats.isSkipped, - "stopped" -> stats.isStopped, - "successful" -> stats.isSuccessful - ) - - def toJsonValue(a: Any): String = { - Try(a.toString.toInt) - .recoverWith { case t: Throwable => Try(a.toString.toDouble) } - .recover { case t: Throwable => - val s = a.toString - "\"%s\"".format(s) - } - .get - .toString - } -} - -// Simple wrapper for a Map that contains the useful info from the job flow's stats -// If you want to write this, call toMap and use json, etc... 
to write it -case class JobStats(toMap: Map[String, Any]) { - def toJson: String = - toMap.map { case (k, v) => "\"%s\" : %s".format(k, JobStats.toJsonValue(v))} - .mkString("{",",","}") -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala b/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala index 4e88cef5ff..ef2535e368 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/JobTest.scala @@ -1,104 +1,140 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ package com.twitter.scalding -import scala.collection.mutable.{Buffer, ListBuffer} +import scala.collection.mutable +import scala.collection.JavaConverters._ import scala.annotation.tailrec import cascading.tuple.Tuple import cascading.tuple.TupleEntry import cascading.stats.CascadingStats +import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.JobConf -import scala.util.Try - object JobTest { - def apply(jobName : String) = { - new JobTest((args : Args) => Job(jobName,args)) - } - def apply(cons : (Args) => Job) = { + + @deprecated(message = "Use the non-reflection based JobTest apply methods", since = "0.16.1") + def apply(jobName: String) = + new JobTest((args: Args) => Job(jobName, args)) + + def apply(cons: (Args) => Job) = new JobTest(cons) - } - def apply[T <: Job : Manifest] = { - val cons = { (args : Args) => manifest[T].erasure - .getConstructor(classOf[Args]) - .newInstance(args) - .asInstanceOf[Job] } + + def apply[T <: Job: Manifest] = { + val cons = { (args: Args) => + manifest[T].runtimeClass + .getConstructor(classOf[Args]) + .newInstance(args) + .asInstanceOf[Job] + } new JobTest(cons) } + + // We have to memoize to return the same buffer each time. + private case class MemoizedSourceFn[T]( + fn: Source => Option[Iterable[T]], + setter: TupleSetter[T] + ) extends (Source => Option[mutable.Buffer[Tuple]]) { + private val memo = mutable.Map[Source, Option[mutable.Buffer[Tuple]]]() + private val lock = new Object() + + def apply(src: Source): Option[mutable.Buffer[Tuple]] = lock.synchronized { + memo.getOrElseUpdate(src, fn(src).map(elements => elements.map(t => setter(t)).toBuffer)) + } + } } object CascadeTest { - def apply(jobName : String) = { - new CascadeTest((args : Args) => Job(jobName,args)) - } + def apply(jobName: String) = + new CascadeTest((args: Args) => Job(jobName, args)) } /** - * This class is used to construct unit tests for scalding jobs. - * You should not use it unless you are writing tests. 
- * For examples of how to do that, see the tests included in the - * main scalding repository: - * https://github.com/twitter/scalding/tree/master/src/test/scala/com/twitter/scalding + * This class is used to construct unit tests for scalding jobs. You should not use it unless you are writing + * tests. For examples of how to do that, see the tests included in the main scalding repository: + * https://github.com/twitter/scalding/tree/master/scalding-core/src/test/scala/com/twitter/scalding */ -class JobTest(cons : (Args) => Job) { +class JobTest(cons: (Args) => Job) { private var argsMap = Map[String, List[String]]() - private val callbacks = Buffer[() => Unit]() - private val statsCallbacks = Buffer[(CascadingStats) => Unit]() + private val callbacks = mutable.Buffer[() => Unit]() + private val statsCallbacks = mutable.Buffer[(CascadingStats) => Unit]() // TODO: Switch the following maps and sets from Source to String keys // to guard for scala equality bugs - private var sourceMap: (Source) => Option[Buffer[Tuple]] = { _ => None } + private var sourceMap: (Source) => Option[mutable.Buffer[Tuple]] = { _ => None } private var sinkSet = Set[Source]() private var fileSet = Set[String]() + private var validateJob = false - def arg(inArg : String, value : List[String]) = { + def arg(inArg: String, value: List[String]) = { argsMap += inArg -> value this } - def arg(inArg : String, value : String) = { + def arg(inArg: String, value: String) = { argsMap += inArg -> List(value) this } - private def sourceBuffer[T:TupleSetter](s: Source, tups: Iterable[T]): JobTest = { - source {src => if(src == s) Some(tups) else None } + private def sourceBuffer[T: TupleSetter](s: Source, tups: Iterable[T]): JobTest = { + source(src => if (src == s) Some(tups) else None) this } /** Add a function to produce a mock when a certain source is requested */ def source[T](fn: Source => Option[Iterable[T]])(implicit setter: TupleSetter[T]): JobTest = { + val memoized = 
JobTest.MemoizedSourceFn(fn, setter) val oldSm = sourceMap - val bufferTupFn = fn.andThen { optItT => optItT.map { _.map(t => setter(t)).toBuffer } } - // We have to memoize to return the same buffer each time - val memo = scala.collection.mutable.Map[Source, Option[Buffer[Tuple]]]() - sourceMap = { (src: Source) => memo.getOrElseUpdate(src, bufferTupFn(src)).orElse(oldSm(src)) } + sourceMap = { src: Source => + memoized(src).orElse(oldSm(src)) + } this } /** - * Enables syntax like: - * .ifSource { case Tsv("in") => List(1, 2, 3) } - * We need a different function name from source to help the compiler + * Enables syntax like: .ifSource { case Tsv("in") => List(1, 2, 3) } We need a different function name from + * source to help the compiler */ def ifSource[T](fn: PartialFunction[Source, Iterable[T]])(implicit setter: TupleSetter[T]): JobTest = source(fn.lift) - def source(s : Source, iTuple : Iterable[Product]): JobTest = - source[Product](s, iTuple)(TupleSetter.ProductSetter) - - def source[T](s : Source, iTuple : Iterable[T])(implicit setter: TupleSetter[T]): JobTest = + def source[T](s: Source, iTuple: Iterable[T])(implicit setter: TupleSetter[T]): JobTest = sourceBuffer(s, iTuple) - def sink[A](s : Source)(op : Buffer[A] => Unit ) - (implicit conv : TupleConverter[A]) = { + // This use of `_.get` is probably safe, but difficult to prove correct + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def sink[A](s: Source)(op: mutable.Buffer[A] => Unit)(implicit conv: TupleConverter[A]) = { if (sourceMap(s).isEmpty) { // if s is also used as a source, we shouldn't reset its buffer - source(s, new ListBuffer[Tuple]) + source(s, new mutable.ListBuffer[Tuple]) } val buffer = sourceMap(s).get + /* NOTE: `HadoopTest.finalize` depends on `sinkSet` matching the set of + * "keys" in the `sourceMap`. Do not change the following line unless + * you also modify the `finalize` function accordingly. 
+ */ sinkSet += s - callbacks += (() => op(buffer.map { tup => conv(new TupleEntry(tup)) })) + callbacks += (() => op(buffer.map(tup => conv(new TupleEntry(tup))))) this } + def typedSink[A](s: Source with TypedSink[A])(op: mutable.Buffer[A] => Unit)(implicit + conv: TupleConverter[A] + ) = + sink[A](s)(op) + // Used to pass an assertion about a counter defined by the given group and name. // If this test is checking for multiple jobs chained by next, this only checks // for the counters in the final job's FlowStat. @@ -113,10 +149,10 @@ class JobTest(cons : (Args) => Job) { this } - // Simulates the existance of a file so that mode.fileExists returns true. We + // Simulates the existence of a file so that mode.fileExists returns true. We // do not simulate the file contents; that should be done through mock // sources. - def registerFile(filename : String) = { + def registerFile(filename: String) = { fileSet += filename this } @@ -126,7 +162,7 @@ class JobTest(cons : (Args) => Job) { this } - def runWithoutNext(useHadoop : Boolean = false) = { + def runWithoutNext(useHadoop: Boolean = false) = { runJob(initJob(useHadoop), false) this } @@ -136,23 +172,34 @@ class JobTest(cons : (Args) => Job) { this } - def runHadoopWithConf(conf : JobConf) = { + def runHadoopWithConf(conf: Configuration) = { runJob(initJob(true, Some(conf)), true) this } - // This SITS is unfortunately needed to get around Specs - def finish : Unit = { () } + // This is just syntax to end the "builder" pattern to satify some test frameworks + def finish(): Unit = () - // Registers test files, initializes the global mode, and creates a job. 
- private def initJob(useHadoop : Boolean, job: Option[JobConf] = None) : Job = { + def validate(v: Boolean) = { + validateJob = v + this + } + + def getArgs: Args = + new Args(argsMap) + + /** + * This method does not mutate the JobTest instance + */ + def getTestMode(useHadoop: Boolean, optConfig: Option[Configuration] = None): TestMode = { // Create a global mode to use for testing. - val testMode : TestMode = + val testMode: TestMode = if (useHadoop) { - val conf = job.getOrElse(new JobConf) + val conf = optConfig.getOrElse(new JobConf) // Set the polling to a lower value to speed up tests: conf.set("jobclient.completion.poll.interval", "100") conf.set("cascading.flow.job.pollinginterval", "5") + conf.set("mapreduce.framework.name", "local") // Work around for local hadoop race conf.set("mapred.local.dir", "/tmp/hadoop/%s/mapred/local".format(java.util.UUID.randomUUID)) HadoopTest(conf, sourceMap) @@ -160,36 +207,70 @@ class JobTest(cons : (Args) => Job) { Test(sourceMap) } testMode.registerTestFiles(fileSet) - val args = new Args(argsMap) + testMode + } + /** + * Run the clean ups and checks after a job has executed + */ + def postRunChecks(mode: Mode): Unit = { + mode match { + case hadoopTest @ HadoopTest(_, _) => { + /* NOTE: `HadoopTest.finalize` depends on `sinkSet` matching the set of + * "keys" in the `sourceMap`. Do not change the following line unless + * you also modify the `finalize` function accordingly. + */ + // The sinks are written to disk, we need to clean them up: + sinkSet.foreach(hadoopTest.finalize(_)) + } + case _ => () + } + // Now it is time to check the test conditions: + callbacks.foreach(cb => cb()) + } + + // Registers test files, initializes the global mode, and creates a job. + private[scalding] def initJob(useHadoop: Boolean, job: Option[Configuration] = None): Job = { + val testMode = getTestMode(useHadoop, job) // Construct a job. 
- cons(Mode.putMode(testMode, args)) + cons(Mode.putMode(testMode, getArgs)) } @tailrec - private final def runJob(job : Job, runNext : Boolean) : Unit = { + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + private final def runJob(job: Job, runNext: Boolean): Unit = { + // Disable automatic cascading update + System.setProperty("cascading.update.skip", "true") - job.run + // create cascading 3.0 planner trace files during tests + if (System.getenv.asScala.getOrElse("SCALDING_CASCADING3_DEBUG", "0") == "1") { + System.setProperty("cascading.planner.plan.path", "target/test/cascading/traceplan/" + job.name) + System.setProperty( + "cascading.planner.plan.transforms.path", + "target/test/cascading/traceplan/" + job.name + "/transform" + ) + System.setProperty( + "cascading.planner.stats.path", + "target/test/cascading/traceplan/" + job.name + "/stats" + ) + } + + if (validateJob) { + job.validate() + } + job.run() // Make sure to clean the state: - job.clear + job.clear() - val next : Option[Job] = if (runNext) { job.next } else { None } + val next: Option[Job] = if (runNext) { job.next } + else { None } next match { case Some(nextjob) => runJob(nextjob, runNext) - case None => { - job.mode match { - case hadoopTest @ HadoopTest(_,_) => { - // The sinks are written to disk, we need to clean them up: - sinkSet.foreach{ hadoopTest.finalize(_) } - } - case _ => () - } - // Now it is time to check the test conditions: - callbacks.foreach { cb => cb() } - statsCallbacks.foreach { cb => cb(job.scaldingCascadingStats.get) } - } + case None => + postRunChecks(job.mode) + statsCallbacks.foreach(cb => cb(job.scaldingCascadingStats.get)) } } } -class CascadeTest(cons : (Args) => Job) extends JobTest(cons) { } +class CascadeTest(cons: (Args) => Job) extends JobTest(cons) {} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala b/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala index 79f32a2945..70f56cb549 
100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/JoinAlgorithms.scala @@ -12,25 +12,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tap._ -import cascading.scheme._ import cascading.pipe._ -import cascading.pipe.assembly._ import cascading.pipe.joiner._ -import cascading.flow._ -import cascading.operation._ -import cascading.operation.aggregator._ -import cascading.operation.filter._ import cascading.tuple._ -import cascading.cascade._ -import scala.util.Random +import java.util.{Iterator => JIterator} +import java.util.Random // this one is serializable, scala.util.Random is not import scala.collection.JavaConverters._ - object JoinAlgorithms extends java.io.Serializable { // seed is ascii codes for "scalding" combined as affine maps. val Seed: Long = (((115L * 99 + 97) * 108 + 100) * 105 + 110) * 103 @@ -45,76 +37,77 @@ trait JoinAlgorithms { import RichPipe.assignName import JoinAlgorithms.Seed - def pipe : Pipe + def pipe: Pipe /** - * This method is used internally to implement all joins. - * You can use this directly if you want to implement something like a star join, - * e.g., when joining a single pipe to multiple other pipes. Make sure that you call this method - * on the larger pipe to make the grouping as efficient as possible. - * - * If you are only joining two pipes, then you are better off - * using joinWithSmaller/joinWithLarger/joinWithTiny/leftJoinWithTiny. + * This method is used internally to implement all joins. You can use this directly if you want to implement + * something like a star join, e.g., when joining a single pipe to multiple other pipes. 
Make sure that you + * call this method on the larger pipe to make the grouping as efficient as possible. * + * If you are only joining two pipes, then you are better off using + * joinWithSmaller/joinWithLarger/joinWithTiny/leftJoinWithTiny. */ - def coGroupBy(f : Fields, j : JoinMode = InnerJoinMode)(builder : CoGroupBuilder => GroupBuilder) : Pipe = { + def coGroupBy(f: Fields, j: JoinMode = InnerJoinMode)(builder: CoGroupBuilder => GroupBuilder): Pipe = builder(new CoGroupBuilder(f, j)).schedule(pipe.getName, pipe) - } /** - * == WARNING == - * Doing a cross product with even a moderate sized pipe can - * create ENORMOUS output. The use-case here is attaching a constant (e.g. - * a number or a dictionary or set) to each row in another pipe. - * A common use-case comes from a groupAll and reduction to one row, - * then you want to send the results back out to every element in a pipe + * ==WARNING== + * Doing a cross product with even a moderate sized pipe can create ENORMOUS output. The use-case here is + * attaching a constant (e.g. a number or a dictionary or set) to each row in another pipe. A common + * use-case comes from a groupAll and reduction to one row, then you want to send the results back out to + * every element in a pipe * - * This uses joinWithTiny, so tiny pipe is replicated to all Mappers. If it - * is large, this will blow up. Get it: be foolish here and LOSE IT ALL! + * This uses joinWithTiny, so tiny pipe is replicated to all Mappers. If it is large, this will blow up. Get + * it: be foolish here and LOSE IT ALL! * * Use at your own risk. 
*/ - def crossWithTiny(tiny : Pipe) = { - val tinyJoin = tiny.map(() -> '__joinTiny__) { (u:Unit) => 1 } - pipe.map(() -> '__joinBig__) { (u:Unit) => 1 } + def crossWithTiny(tiny: Pipe) = { + val tinyJoin = tiny.map(() -> '__joinTiny__)((u: Unit) => 1) + pipe + .map(() -> '__joinBig__)((u: Unit) => 1) .joinWithTiny('__joinBig__ -> '__joinTiny__, tinyJoin) .discard('__joinBig__, '__joinTiny__) } + /** - * Does a cross-product by doing a blockJoin. - * Useful when doing a large cross, if your cluster can take it. + * Does a cross-product by doing a blockJoin. Useful when doing a large cross, if your cluster can take it. * Prefer crossWithTiny */ - def crossWithSmaller(p : Pipe, replication : Int = 20) = { - val smallJoin = p.map(() -> '__joinSmall__) { (u:Unit) => 1 } - pipe.map(() -> '__joinBig__) { (u:Unit) => 1 } + def crossWithSmaller(p: Pipe, replication: Int = 20) = { + val smallJoin = p.map(() -> '__joinSmall__)((u: Unit) => 1) + pipe + .map(() -> '__joinBig__)((u: Unit) => 1) .blockJoinWithSmaller('__joinBig__ -> '__joinSmall__, smallJoin, rightReplication = replication) .discard('__joinBig__, '__joinSmall__) } /** - * Rename the collisions and return the pipe and the new names, - * and the fields to discard + * Rename the collisions and return the pipe and the new names, and the fields to discard */ - private def renameCollidingFields(p : Pipe, fields : Fields, - collisions: Set[Comparable[_]]) : (Pipe, Fields, Fields) = { + private def renameCollidingFields( + p: Pipe, + fields: Fields, + collisions: Set[Comparable[_]] + ): (Pipe, Fields, Fields) = { // Here is how we rename colliding fields - def rename(f : Comparable[_]) : String = "__temp_join_" + f.toString + def rename(f: Comparable[_]): String = "__temp_join_" + f.toString // convert to list, so we are explicit that ordering is fixed below: val renaming = collisions.toList - val orig = new Fields(renaming : _*) - val temp = new Fields(renaming.map { rename } : _*) + val orig = new Fields(renaming: _*) 
+ val temp = new Fields(renaming.map(rename): _*) // Now construct the new join keys, where we check for a rename // otherwise use the original key: - val newJoinKeys = new Fields( asList(fields) - .map { fname => - // If we renamed, get the rename, else just use the field - if (collisions(fname)) { - rename(fname) - } - else fname - } : _*) + val newJoinKeys = new Fields( + asList(fields) + .map { fname => + // If we renamed, get the rename, else just use the field + if (collisions(fname)) { + rename(fname) + } else fname + }: _* + ) val renamedPipe = p.rename(orig -> temp) (renamedPipe, newJoinKeys, temp) } @@ -122,36 +115,37 @@ trait JoinAlgorithms { /** * Flip between LeftJoin to RightJoin */ - private def flipJoiner(j : Joiner) = { + private def flipJoiner(j: Joiner): Joiner = j match { - case outer : OuterJoin => outer + case outer: OuterJoin => outer case inner: InnerJoin => inner - case left : LeftJoin => new RightJoin - case right : RightJoin => new LeftJoin - case other => throw new InvalidJoinModeException("cannot use joiner " + other + - " since it cannot be flipped safely") + case left: LeftJoin => new RightJoin + case right: RightJoin => new LeftJoin + case other => + throw new InvalidJoinModeException( + "cannot use joiner " + other + + " since it cannot be flipped safely" + ) } - } - def joinerToJoinModes(j : Joiner) = { + def joinerToJoinModes(j: Joiner) = j match { - case i : InnerJoin => (InnerJoinMode, InnerJoinMode) - case l : LeftJoin => (InnerJoinMode, OuterJoinMode) - case r : RightJoin => (OuterJoinMode, InnerJoinMode) - case o : OuterJoin => (OuterJoinMode, OuterJoinMode) - case _ => throw new InvalidJoinModeException("cannot convert joiner to joiner modes") + case i: InnerJoin => (InnerJoinMode, InnerJoinMode) + case l: LeftJoin => (InnerJoinMode, OuterJoinMode) + case r: RightJoin => (OuterJoinMode, InnerJoinMode) + case o: OuterJoin => (OuterJoinMode, OuterJoinMode) + case _ => throw new InvalidJoinModeException("cannot convert joiner 
to joiner modes") } - } /** - * Joins the first set of keys in the first pipe to the second set of keys in the second pipe. - * All keys must be unique UNLESS it is an inner join, then duplicated join keys are allowed, but - * the second copy is deleted (as cascading does not allow duplicated field names). + * Joins the first set of keys in the first pipe to the second set of keys in the second pipe. All keys must + * be unique UNLESS it is an inner join, then duplicated join keys are allowed, but the second copy is + * deleted (as cascading does not allow duplicated field names). * * Smaller here means that the values/key is smaller than the left. * - * Avoid going crazy adding more explicit join modes. Instead do for some other join - * mode with a larger pipe: + * Avoid going crazy adding more explicit join modes. Instead do for some other join mode with a larger + * pipe: * * {{{ * .then { pipe => other. @@ -159,119 +153,130 @@ trait JoinAlgorithms { * } * }}} */ - def joinWithSmaller(fs :(Fields,Fields), that : Pipe, joiner : Joiner = new InnerJoin, reducers : Int = -1) = { + def joinWithSmaller( + fs: (Fields, Fields), + that: Pipe, + joiner: Joiner = new InnerJoin, + reducers: Int = -1 + ) = { // If we are not doing an inner join, the join fields must be disjoint: val joiners = joinerToJoinModes(joiner) val intersection = asSet(fs._1).intersect(asSet(fs._2)) - if (intersection.size == 0) { + if (intersection.isEmpty) { // Common case: no intersection in names: just CoGroup, which duplicates the grouping fields: pipe.coGroupBy(fs._1, joiners._1) { _.coGroup(fs._2, that, joiners._2) .reducers(reducers) } - } - else if (joiners._1 == InnerJoinMode && joiners._2 == InnerJoinMode) { + } else if (joiners._1 == InnerJoinMode && joiners._2 == InnerJoinMode) { /* * Since it is an inner join, we only output if the key is present an equal in both sides. * For this (common) case, it doesn't matter if we drop one of the matching grouping fields. 
* So, we rename the right hand side to temporary names, then discard them after the operation */ val (renamedThat, newJoinFields, temp) = renameCollidingFields(that, fs._2, intersection) - pipe.coGroupBy(fs._1, joiners._1) { - _.coGroup(newJoinFields, renamedThat, joiners._2) - .reducers(reducers) - }.discard(temp) - } - else { - throw new IllegalArgumentException("join keys must be disjoint unless you are doing an InnerJoin. Found: " + - fs.toString + ", which overlap with: " + intersection.toString) + pipe + .coGroupBy(fs._1, joiners._1) { + _.coGroup(newJoinFields, renamedThat, joiners._2) + .reducers(reducers) + } + .discard(temp) + } else { + throw new IllegalArgumentException( + "join keys must be disjoint unless you are doing an InnerJoin. Found: " + + fs.toString + ", which overlap with: " + intersection.toString + ) } } /** * same as reversing the order on joinWithSmaller */ - def joinWithLarger(fs : (Fields, Fields), that : Pipe, joiner : Joiner = new InnerJoin, reducers : Int = -1) = { + def joinWithLarger(fs: (Fields, Fields), that: Pipe, joiner: Joiner = new InnerJoin, reducers: Int = -1) = that.joinWithSmaller((fs._2, fs._1), pipe, flipJoiner(joiner), reducers) - } /** - * This is joinWithSmaller with joiner parameter fixed to LeftJoin. If the item is absent on the right put null for the keys and values + * This is joinWithSmaller with joiner parameter fixed to LeftJoin. If the item is absent on the right put + * null for the keys and values */ - def leftJoinWithSmaller(fs :(Fields,Fields), that : Pipe, reducers : Int = -1) = { + def leftJoinWithSmaller(fs: (Fields, Fields), that: Pipe, reducers: Int = -1) = joinWithSmaller(fs, that, new LeftJoin, reducers) - } /** - * This is joinWithLarger with joiner parameter fixed to LeftJoin. If the item is absent on the right put null for the keys and values + * This is joinWithLarger with joiner parameter fixed to LeftJoin. 
If the item is absent on the right put + * null for the keys and values */ - def leftJoinWithLarger(fs :(Fields,Fields), that : Pipe, reducers : Int = -1) = { + def leftJoinWithLarger(fs: (Fields, Fields), that: Pipe, reducers: Int = -1) = joinWithLarger(fs, that, new LeftJoin, reducers) - } /** - * This does an assymmetric join, using cascading's "HashJoin". This only runs through - * this pipe once, and keeps the right hand side pipe in memory (but is spillable). - * - * Choose this when Left > max(mappers,reducers) * Right, or when the left side is three - * orders of magnitude larger. + * This does an assymmetric join, using cascading's "HashJoin". This only runs through this pipe once, and + * keeps the right hand side pipe in memory (but is spillable). * - * joins the first set of keys in the first pipe to the second set of keys in the second pipe. - * Duplicated join keys are allowed, but - * the second copy is deleted (as cascading does not allow duplicated field names). + * Choose this when Left > max(mappers,reducers) * Right, or when the left side is three orders of magnitude + * larger. * + * joins the first set of keys in the first pipe to the second set of keys in the second pipe. Duplicated + * join keys are allowed, but the second copy is deleted (as cascading does not allow duplicated field + * names). * - * == Warning == - * This does not work with outer joins, or right joins, only inner and - * left join versions are given. + * ==Warning== + * This does not work with outer joins, or right joins, only inner and left join versions are given. 
*/ - def joinWithTiny(fs :(Fields,Fields), that : Pipe) = { + def joinWithTiny(fs: (Fields, Fields), that: Pipe) = { val intersection = asSet(fs._1).intersect(asSet(fs._2)) - if (intersection.size == 0) { - new HashJoin(assignName(pipe), fs._1, assignName(that), fs._2, new InnerJoin) - } - else { + if (intersection.isEmpty) { + new HashJoin(assignName(pipe), fs._1, assignName(that), fs._2, WrappedJoiner(new InnerJoin)) + } else { val (renamedThat, newJoinFields, temp) = renameCollidingFields(that, fs._2, intersection) - (new HashJoin(assignName(pipe), fs._1, assignName(renamedThat), newJoinFields, new InnerJoin)) + (new HashJoin( + assignName(pipe), + fs._1, + assignName(renamedThat), + newJoinFields, + WrappedJoiner(new InnerJoin) + )) .discard(temp) } } - def leftJoinWithTiny(fs :(Fields,Fields), that : Pipe) = { - //Rename these pipes to avoid cascading name conflicts - new HashJoin(assignName(pipe), fs._1, assignName(that), fs._2, new LeftJoin) - } + def leftJoinWithTiny(fs: (Fields, Fields), that: Pipe) = + // Rename these pipes to avoid cascading name conflicts + new HashJoin(assignName(pipe), fs._1, assignName(that), fs._2, WrappedJoiner(new LeftJoin)) /** - * Performs a block join, otherwise known as a replicate fragment join (RF join). - * The input params leftReplication and rightReplication control the replication of the left and right - * pipes respectively. + * Performs a block join, otherwise known as a replicate fragment join (RF join). The input params + * leftReplication and rightReplication control the replication of the left and right pipes respectively. * - * This is useful in cases where the data has extreme skew. A symptom of this is that we may see a job stuck for - * a very long time on a small number of reducers. + * This is useful in cases where the data has extreme skew. A symptom of this is that we may see a job stuck + * for a very long time on a small number of reducers. 
* - * A block join is way to get around this: we add a random integer field and a replica field - * to every tuple in the left and right pipes. We then join on the original keys and - * on these new dummy fields. These dummy fields make it less likely that the skewed keys will - * be hashed to the same reducer. + * A block join is way to get around this: we add a random integer field and a replica field to every tuple + * in the left and right pipes. We then join on the original keys and on these new dummy fields. These dummy + * fields make it less likely that the skewed keys will be hashed to the same reducer. * - * The final data size is right * rightReplication + left * leftReplication - * but because of the fragmentation, we are guaranteed the same number of hits as the original join. + * The final data size is right * rightReplication + left * leftReplication but because of the + * fragmentation, we are guaranteed the same number of hits as the original join. * - * If the right pipe is really small then you are probably better off with a joinWithTiny. If however - * the right pipe is medium sized, then you are better off with a blockJoinWithSmaller, and a good rule - * of thumb is to set rightReplication = left.size / right.size and leftReplication = 1 + * If the right pipe is really small then you are probably better off with a joinWithTiny. If however the + * right pipe is medium sized, then you are better off with a blockJoinWithSmaller, and a good rule of thumb + * is to set rightReplication = left.size / right.size and leftReplication = 1 * - * Finally, if both pipes are of similar size, e.g. in case of a self join with a high data skew, - * then it makes sense to set leftReplication and rightReplication to be approximately equal. + * Finally, if both pipes are of similar size, e.g. in case of a self join with a high data skew, then it + * makes sense to set leftReplication and rightReplication to be approximately equal. 
* - * == Note == - * You can only use an InnerJoin or a LeftJoin with a leftReplication of 1 - * (or a RightJoin with a rightReplication of 1) when doing a blockJoin. + * ==Note== + * You can only use an InnerJoin or a LeftJoin with a leftReplication of 1 (or a RightJoin with a + * rightReplication of 1) when doing a blockJoin. */ - def blockJoinWithSmaller(fs : (Fields, Fields), - otherPipe : Pipe, rightReplication : Int = 1, leftReplication : Int = 1, - joiner : Joiner = new InnerJoin, reducers : Int = -1) : Pipe = { + def blockJoinWithSmaller( + fs: (Fields, Fields), + otherPipe: Pipe, + rightReplication: Int = 1, + leftReplication: Int = 1, + joiner: Joiner = new InnerJoin, + reducers: Int = -1 + ): Pipe = { assert(rightReplication > 0, "Must specify a positive number for the right replication in block join") assert(leftReplication > 0, "Must specify a positive number for the left replication in block join") @@ -283,7 +288,8 @@ trait JoinAlgorithms { // Add the new dummy replication fields val newLeft = addReplicationFields(pipe, leftFields, leftReplication, rightReplication) - val newRight = addReplicationFields(otherPipe, rightFields, rightReplication, leftReplication, swap = true) + val newRight = + addReplicationFields(otherPipe, rightFields, rightReplication, leftReplication, swap = true) val leftJoinFields = Fields.join(fs._1, leftFields) val rightJoinFields = Fields.join(fs._2, rightFields) @@ -297,85 +303,95 @@ trait JoinAlgorithms { /** * Adds one random field and one replica field. */ - private def addReplicationFields(p : Pipe, f : Fields, - replication : Int, otherReplication : Int, swap : Boolean = false) : Pipe = { + private def addReplicationFields( + p: Pipe, + f: Fields, + replication: Int, + otherReplication: Int, + swap: Boolean = false + ): Pipe = /** - * We need to seed exactly once and capture that seed. If we let - * each task create a seed, a restart will change the computation, - * and this could result in subtle bugs. 
+ * We need to seed exactly once and capture that seed. If we let each task create a seed, a restart will + * change the computation, and this could result in subtle bugs. */ - p.using(new Random(Seed) with Stateful).flatMap(() -> f) { (rand : Random, _ : Unit) => + p.using(new Random(Seed) with Stateful).flatMap(() -> f) { (rand: Random, _: Unit) => val rfs = getReplicationFields(rand, replication, otherReplication) - if (swap) rfs.map { case(i, j) => (j, i) } else rfs + if (swap) rfs.map { case (i, j) => (j, i) } + else rfs } - } /** * Returns a list of the dummy replication fields used to replicate groups in skewed joins. * - * For example, suppose you have two pipes P1 and P2. While performing a skewed join for a particular - * key K, you want to replicate every row in P1 with this key 3 times, and every row in P2 with this - * key 5 times. + * For example, suppose you have two pipes P1 and P2. While performing a skewed join for a particular key K, + * you want to replicate every row in P1 with this key 3 times, and every row in P2 with this key 5 times. * * Then: * - * - For the P1 replication, the first element of each tuple is the same random integer between 0 and 4, - * and the second element of each tuple is the index of the replication (between 0 and 2). This first - * random element guarantees that we will match exactly one random row in P2 with the same key. - * - Similarly for the P2 replication. + * - For the P1 replication, the first element of each tuple is the same random integer between 0 and 4, + * and the second element of each tuple is the index of the replication (between 0 and 2). This first + * random element guarantees that we will match exactly one random row in P2 with the same key. + * - Similarly for the P2 replication. 
* * Examples: * - * getReplicationFields(3, 5) - * => List( (1, 0), (1, 1), (1, 2) ) + * getReplicationFields(3, 5) \=> List( (1, 0), (1, 1), (1, 2) ) * - * getReplicationFields(5, 3) - * => List( (2, 0), (2, 1), (2, 2), (2, 3), (2, 4) ) + * getReplicationFields(5, 3) \=> List( (2, 0), (2, 1), (2, 2), (2, 3), (2, 4) ) */ - private def getReplicationFields(r : Random, replication : Int, otherReplication : Int) : IndexedSeq[(Int, Int)] = { + private def getReplicationFields( + r: Random, + replication: Int, + otherReplication: Int + ): IndexedSeq[(Int, Int)] = { assert(replication >= 1 && otherReplication >= 1, "Replication counts must be >= 1") val rand = r.nextInt(otherReplication) - (0 until replication).map { rep => (rand, rep) } + (0 until replication).map(rep => (rand, rep)) } - private def assertValidJoinMode(joiner : Joiner, left : Int, right : Int): Unit = { + private def assertValidJoinMode(joiner: Joiner, left: Int, right: Int): Unit = (joiner, left, right) match { - case (i : InnerJoin, _, _) => () - case (k : LeftJoin, 1, _) => () - case (m : RightJoin, _, 1) => () + case (i: InnerJoin, _, _) => () + case (k: LeftJoin, 1, _) => () + case (m: RightJoin, _, 1) => () case (j, l, r) => throw new InvalidJoinModeException( "you cannot use joiner " + j + " with left replication " + l + " and right replication " + r ) } - } /** * Performs a skewed join, which is useful when the data has extreme skew. * - * For example, imagine joining a pipe of Twitter's follow graph against a pipe of user genders, - * in order to find the gender distribution of the accounts every Twitter user follows. Since celebrities - * (e.g., Justin Bieber and Lady Gaga) have a much larger follower base than other users, and (under - * a standard join algorithm) all their followers get sent to the same reducer, the job will likely be - * stuck on a few reducers for a long time. A skewed join attempts to alleviate this problem. 
+ * For example, imagine joining a pipe of Twitter's follow graph against a pipe of user genders, in order to + * find the gender distribution of the accounts every Twitter user follows. Since celebrities (e.g., Justin + * Bieber and Lady Gaga) have a much larger follower base than other users, and (under a standard join + * algorithm) all their followers get sent to the same reducer, the job will likely be stuck on a few + * reducers for a long time. A skewed join attempts to alleviate this problem. * * This works as follows: * - * 1. First, we sample from the left and right pipes with some small probability, in order to determine - * approximately how often each join key appears in each pipe. - * 2. We use these estimated counts to replicate the join keys, according to the given replication strategy. - * 3. Finally, we join the replicated pipes together. + * 1. First, we sample from the left and right pipes with some small probability, in order to determine + * approximately how often each join key appears in each pipe. 2. We use these estimated counts to + * replicate the join keys, according to the given replication strategy. 3. Finally, we join the + * replicated pipes together. * - * @param sampleRate This controls how often we sample from the left and right pipes when estimating key counts. - * @param replicator Algorithm for determining how much to replicate a join key in the left and right pipes. + * @param sampleRate + * This controls how often we sample from the left and right pipes when estimating key counts. + * @param replicator + * Algorithm for determining how much to replicate a join key in the left and right pipes. * * Note: since we do not set the replication counts, only inner joins are allowed. (Otherwise, replicated * rows would stay replicated when there is no counterpart in the other pipe.) 
*/ - def skewJoinWithSmaller(fs : (Fields, Fields), otherPipe : Pipe, - sampleRate : Double = 0.001, reducers : Int = -1, - replicator : SkewReplication = SkewReplicationA()) : Pipe = { + def skewJoinWithSmaller( + fs: (Fields, Fields), + otherPipe: Pipe, + sampleRate: Double = 0.001, + reducers: Int = -1, + replicator: SkewReplication = SkewReplicationA() + ): Pipe = { assert(sampleRate > 0 && sampleRate < 1, "Sampling rate for skew joins must lie strictly between 0 and 1") @@ -383,10 +399,11 @@ trait JoinAlgorithms { // Resolve colliding fields val (rightPipe, rightResolvedJoinFields, dupeFields) = - if (intersection == 0) + if (intersection.isEmpty) (otherPipe, fs._2, Fields.NONE) else // For now, we are assuming an inner join. renameCollidingFields(otherPipe, fs._2, intersection) + val mergedJoinKeys = Fields.join(fs._1, rightResolvedJoinFields) // 1. First, get an approximate count of the left join keys and the right join keys, so that we // know how much to replicate. @@ -396,26 +413,65 @@ trait JoinAlgorithms { val sampledCountFields = new Fields(leftSampledCountField, rightSampledCountField) /** - * We need to seed exactly once and capture that seed. If we let - * each task create a seed, a restart will change the computation, - * and this could result in subtle bugs. + * We need to seed exactly once and capture that seed. If we let each task create a seed, a restart will + * change the computation, and this could result in subtle bugs. 
*/ - val sampledLeft = pipe.sample(sampleRate, Seed) - .groupBy(fs._1) { _.size(leftSampledCountField) } - val sampledRight = rightPipe.sample(sampleRate, Seed) - .groupBy(rightResolvedJoinFields) { _.size(rightSampledCountField) } - val sampledCounts = sampledLeft.joinWithSmaller(fs._1 -> rightResolvedJoinFields, sampledRight, joiner = new OuterJoin) - .project(Fields.join(fs._1, rightResolvedJoinFields, sampledCountFields)) + val sampledLeft = pipe + .sample(sampleRate, Seed) + .groupBy(fs._1)(_.size(leftSampledCountField)) + val sampledRight = rightPipe + .sample(sampleRate, Seed) + .groupBy(rightResolvedJoinFields)(_.size(rightSampledCountField)) + val sampledCounts = sampledLeft + .joinWithSmaller(fs._1 -> rightResolvedJoinFields, sampledRight, joiner = new OuterJoin) + .project(Fields.join(mergedJoinKeys, sampledCountFields)) + .map(mergedJoinKeys -> mergedJoinKeys) { t: cascading.tuple.Tuple => + // Make the outer join look like an inner join so that we can join + // either the left or right fields for every entry. + // Accomplished by replacing any null field with the corresponding + // field from the other half. E.g., + // (1, 2, "foo", null, null, null) -> (1, 2, "foo", 1, 2, "foo") + val keysSize = t.size / 2 + val result = new cascading.tuple.Tuple(t) + + for (index <- 0 until keysSize) { + val leftValue = result.getObject(index) + val rightValue = result.getObject(index + keysSize) + + if (leftValue == null) { + result.set(index, rightValue) + } else if (rightValue == null) { + result.set(index + keysSize, leftValue) + } + } + + result + } // 2. Now replicate each group of join keys in the left and right pipes, according to the sampled counts // from the previous step. 
val leftReplicationFields = new Fields("__LEFT_RAND__", "__LEFT_REP__") val rightReplicationFields = new Fields("__RIGHT_REP__", "__RIGHT_RAND__") - val replicatedLeft = skewReplicate(pipe, sampledCounts, fs._1, sampledCountFields, leftReplicationFields, - replicator, reducers) - val replicatedRight = skewReplicate(rightPipe, sampledCounts, rightResolvedJoinFields, sampledCountFields, rightReplicationFields, - replicator, reducers, true) + val replicatedLeft = skewReplicate( + pipe, + sampledCounts, + fs._1, + sampledCountFields, + leftReplicationFields, + replicator, + reducers + ) + val replicatedRight = skewReplicate( + rightPipe, + sampledCounts, + rightResolvedJoinFields, + sampledCountFields, + rightReplicationFields, + replicator, + reducers, + true + ) // 3. Finally, join the replicated pipes together. val leftJoinFields = Fields.join(fs._1, leftReplicationFields) @@ -427,47 +483,60 @@ trait JoinAlgorithms { .discard(leftReplicationFields) .discard(rightReplicationFields) - if (intersection == 0) joinedPipe + if (intersection.isEmpty) joinedPipe else joinedPipe.discard(dupeFields) } - def skewJoinWithLarger(fs : (Fields, Fields), otherPipe : Pipe, - sampleRate : Double = 0.001, reducers : Int = -1, - replicator : SkewReplication = SkewReplicationA()) : Pipe = { + def skewJoinWithLarger( + fs: (Fields, Fields), + otherPipe: Pipe, + sampleRate: Double = 0.001, + reducers: Int = -1, + replicator: SkewReplication = SkewReplicationA() + ): Pipe = otherPipe.skewJoinWithSmaller((fs._2, fs._1), pipe, sampleRate, reducers, replicator) - } /** - * Helper method for performing skewed joins. This replicates the rows in {pipe} according - * to the estimated counts in {sampledCounts}. + * Helper method for performing skewed joins. This replicates the rows in {pipe} according to the estimated + * counts in {sampledCounts}. * - * @param pipe The pipe to be replicated. 
- * @param sampledCounts A pipe containing, for each key, the estimated counts of how often - * this key appeared in the samples of the original left and right pipes. - * @param replicator Strategy for how the pipe is replicated. - * @param isPipeOnRight Set to true when replicating the right pipe. + * @param pipe + * The pipe to be replicated. + * @param sampledCounts + * A pipe containing, for each key, the estimated counts of how often this key appeared in the samples of + * the original left and right pipes. + * @param replicator + * Strategy for how the pipe is replicated. + * @param isPipeOnRight + * Set to true when replicating the right pipe. */ - private def skewReplicate(pipe : Pipe, sampledCounts : Pipe, joinFields : Fields, - countFields : Fields, replicationFields : Fields, - replicator : SkewReplication, - numReducers : Int = -1, isPipeOnRight : Boolean = false) = { + private def skewReplicate( + pipe: Pipe, + sampledCounts: Pipe, + joinFields: Fields, + countFields: Fields, + replicationFields: Fields, + replicator: SkewReplication, + numReducers: Int = -1, + isPipeOnRight: Boolean = false + ) = { // Rename the fields to prepare for the leftJoin below. - val renamedFields = joinFields.iterator.asScala.toList.map { field => "__RENAMED_" + field + "__" } - val renamedSampledCounts = sampledCounts.rename(joinFields -> renamedFields) - .project(Fields.join(renamedFields, countFields)) + val renamedFields = joinFields.iterator.asScala.toList.map(field => "__RENAMED_" + field + "__") + val renamedSampledCounts = sampledCounts + .rename(joinFields -> renamedFields) + .project(Fields.join(renamedFields, countFields)) /** - * We need to seed exactly once and capture that seed. If we let - * each task create a seed, a restart will change the computation, - * and this could result in subtle bugs. + * We need to seed exactly once and capture that seed. 
If we let each task create a seed, a restart will + * change the computation, and this could result in subtle bugs. */ pipe // Join the pipe against the sampled counts, so that we know approximately how often each // join key appears. .leftJoinWithTiny(joinFields -> renamedFields, renamedSampledCounts) .using(new Random(Seed) with Stateful) - .flatMap(countFields -> replicationFields) { (rand : Random, counts : (Int, Int)) => + .flatMap(countFields -> replicationFields) { (rand: Random, counts: (Int, Int)) => val (leftRep, rightRep) = replicator.getReplications(counts._1, counts._2, numReducers) val (rep, otherRep) = if (isPipeOnRight) (rightRep, leftRep) else (leftRep, rightRep) @@ -479,4 +548,35 @@ trait JoinAlgorithms { } } -class InvalidJoinModeException(args : String) extends Exception(args) +class InvalidJoinModeException(args: String) extends Exception(args) + +/** + * Wraps a Joiner instance so that the active FlowProcess may be noted. This allows features of Scalding that + * need access to a FlowProcess (e.g., counters) to function properly inside a Joiner. + */ +private[scalding] class WrappedJoiner(val joiner: Joiner) extends Joiner { + override def getIterator(joinerClosure: JoinerClosure): JIterator[Tuple] = { + RuntimeStats.addFlowProcess(joinerClosure.getFlowProcess) + joiner.getIterator(joinerClosure) + } + + override def numJoins(): Int = joiner.numJoins() + + override def hashCode(): Int = joiner.hashCode() + + override def toString: String = joiner.toString + + override def equals(other: Any): Boolean = joiner.equals(other) +} + +private[scalding] object WrappedJoiner { + + /** + * Wrap the given Joiner in a WrappedJoiner instance if it is not already wrapped. 
+ */ + def apply(joiner: Joiner): WrappedJoiner = + joiner match { + case wrapped: WrappedJoiner => wrapped + case _ => new WrappedJoiner(joiner) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala b/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala new file mode 100644 index 0000000000..6c72308614 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/LibJarsExpansion.scala @@ -0,0 +1,67 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import java.io.File +import java.nio.file.Path + +object ExpandLibJarsGlobs { + def apply(inputArgs: Array[String]): Array[String] = { + // First we are going to expand out the libjars if we find it + val libJarsIdx = inputArgs.indexOf("-libjars") + 1 + if (libJarsIdx > 0 && libJarsIdx < inputArgs.length) { // 0 would mean we never found -libjars + val newArgs = new Array[String](inputArgs.length) + System.arraycopy(inputArgs, 0, newArgs, 0, inputArgs.length) + + val existing = newArgs(libJarsIdx) + val replacement = existing + .split(",") + .flatMap { element => + fromGlob(element).map(_.toString) + } + .mkString(",") + + newArgs(libJarsIdx) = replacement + newArgs + } else inputArgs + } + + // tree from Duncan McGregor @ http://stackoverflow.com/questions/2637643/how-do-i-list-all-files-in-a-subdirectory-in-scala + private[this] def tree(root: File, skipHidden: Boolean = false): Stream[File] = + if (!root.exists || (skipHidden && root.isHidden)) Stream.empty + else + root #:: (root.listFiles match { + case null => Stream.empty + case files => files.toStream.flatMap(tree(_, skipHidden)) + }) + + def fromGlob(glob: String, filesOnly: Boolean = true): Stream[Path] = { + import java.nio.file._ + val fs = FileSystems.getDefault() + val expandedSlash = if (glob.endsWith("/")) s"$glob/*" else glob + val absoluteGlob = fs.getPath(expandedSlash).toAbsolutePath + val matcher: PathMatcher = fs.getPathMatcher(s"glob:$absoluteGlob") + + val parentPath = + if (absoluteGlob.getFileName.toString.contains("*")) absoluteGlob.getParent else absoluteGlob + + val pathStream = tree(parentPath.toFile).map(_.toPath) + + val globMatchingPaths = pathStream.filter(matcher.matches) + + if (filesOnly) globMatchingPaths.filter(_.toFile.isFile) else globMatchingPaths + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala b/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala index 950632db7a..c722a6c0ad 100644 
--- a/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/MemoryTap.scala @@ -12,56 +12,56 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tap.Tap import java.util.Properties import cascading.tuple._ -import scala.collection.JavaConversions._ import cascading.scheme.Scheme import cascading.flow.FlowProcess -import collection.mutable.{Buffer, MutableList} -class MemoryTap[In,Out](val scheme : Scheme[Properties,In,Out,_,_], val tupleBuffer : Buffer[Tuple]) - extends Tap[Properties, In, Out](scheme) { +import scala.collection.mutable.Buffer +import scala.collection.JavaConverters._ + +class MemoryTap[In, Out](val scheme: Scheme[Properties, In, Out, _, _], val tupleBuffer: Buffer[Tuple]) + extends Tap[Properties, In, Out](scheme) { private var modifiedTime: Long = 1L - def updateModifiedTime: Unit = { + def updateModifiedTime(): Unit = modifiedTime = System.currentTimeMillis - } - override def createResource(conf : Properties) = { - updateModifiedTime + override def createResource(conf: Properties) = { + updateModifiedTime() true } - override def deleteResource(conf : Properties) = { - tupleBuffer.clear + override def deleteResource(conf: Properties) = { + tupleBuffer.clear() true } - override def resourceExists(conf : Properties) = tupleBuffer.size > 0 - override def getModifiedTime(conf : Properties) = if(resourceExists(conf)) modifiedTime else 0L + override def resourceExists(conf: Properties) = tupleBuffer.size > 0 + override def getModifiedTime(conf: Properties) = if (resourceExists(conf)) modifiedTime else 0L override lazy val getIdentifier: String = scala.math.random.toString - override def openForRead(flowProcess : FlowProcess[Properties], input : 
In) = { - new TupleEntryChainIterator(scheme.getSourceFields, tupleBuffer.toIterator) - } + override def openForRead(flowProcess: FlowProcess[Properties], input: In) = + new TupleEntryChainIterator(scheme.getSourceFields, tupleBuffer.toIterator.asJava) - override def openForWrite(flowProcess : FlowProcess[Properties], output : Out) : TupleEntryCollector = { - tupleBuffer.clear + override def openForWrite(flowProcess: FlowProcess[Properties], output: Out): TupleEntryCollector = { + tupleBuffer.clear() new MemoryTupleEntryCollector(tupleBuffer, this) } - override def equals(other : Any) = this.eq(other.asInstanceOf[AnyRef]) + override def equals(other: Any) = this.eq(other.asInstanceOf[AnyRef]) override def hashCode() = System.identityHashCode(this) } -class MemoryTupleEntryCollector(val tupleBuffer : Buffer[Tuple], mt: MemoryTap[_,_]) extends TupleEntryCollector { +class MemoryTupleEntryCollector(val tupleBuffer: Buffer[Tuple], mt: MemoryTap[_, _]) + extends TupleEntryCollector { - override def collect(tupleEntry : TupleEntry) { - mt.updateModifiedTime + override def collect(tupleEntry: TupleEntry): Unit = { + mt.updateModifiedTime() tupleBuffer += tupleEntry.getTupleCopy } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala b/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala deleted file mode 100644 index 45adca0eef..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/Mode.scala +++ /dev/null @@ -1,202 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding - -import java.io.File -import java.util.{Map => JMap, UUID, Properties} - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.mapred.JobConf - -import cascading.flow.{FlowConnector, FlowDef, Flow} -import cascading.flow.hadoop.HadoopFlowProcess -import cascading.flow.hadoop.HadoopFlowConnector -import cascading.flow.local.LocalFlowConnector -import cascading.flow.local.LocalFlowProcess -import cascading.pipe.Pipe -import cascading.tap.Tap -import cascading.tuple.Tuple -import cascading.tuple.TupleEntryIterator - -import scala.annotation.tailrec -import scala.collection.JavaConverters._ -import scala.collection.mutable.Buffer -import scala.collection.mutable.{Map => MMap} -import scala.collection.mutable.{Set => MSet} -import scala.collection.mutable.{Iterable => MIterable} - -object Mode { - /** This is a Args and a Mode together. It is used purely as - * a work-around for the fact that Job only accepts an Args object, - * but needs a Mode inside. 
- */ - private class ArgsWithMode(argsMap: Map[String, List[String]], val mode: Mode) extends Args(argsMap) { - override def +(keyvals: (String, Iterable[String])): Args = - new ArgsWithMode(super.+(keyvals).m, mode) - } - - /** Attach a mode to these Args and return the new Args */ - def putMode(mode: Mode, args: Args): Args = new ArgsWithMode(args.m, mode) - - /** Get a Mode if this Args was the result of a putMode */ - def getMode(args: Args): Option[Mode] = args match { - case withMode: ArgsWithMode => Some(withMode.mode) - case _ => None - } - - // This should be passed ALL the args supplied after the job name - def apply(args : Args, config : Configuration) : Mode = { - val strictSources = args.boolean("tool.partialok") == false - if (!strictSources) { - // TODO we should do smarter logging here - println("[Scalding:INFO] using --tool.partialok. Missing log data won't cause errors.") - } - - if (args.boolean("local")) - Local(strictSources) - else if (args.boolean("hdfs")) - Hdfs(strictSources, config) - else - sys.error("[ERROR] Mode must be one of --local or --hdfs, you provided neither") - } -} - -trait Mode extends java.io.Serializable { - /** - * This is the input config of arguments passed in from Hadoop/Java - * this map is transformed by Job.config before running - */ - def config: Map[AnyRef, AnyRef] - /* - * Using a new FlowProcess, which is only suitable for reading outside - * of a map/reduce job, open a given tap and return the TupleEntryIterator - */ - def openForRead(tap : Tap[_,_,_]) : TupleEntryIterator - // Returns true if the file exists on the current filesystem. 
- def fileExists(filename : String) : Boolean - /** Create a new FlowConnector for this cascading planner */ - def newFlowConnector(props : Map[AnyRef,AnyRef]): FlowConnector -} - - -trait HadoopMode extends Mode { - def jobConf : Configuration - - override def config = - jobConf.asScala.foldLeft(Map[AnyRef, AnyRef]()) { - (acc, kv) => acc + ((kv.getKey, kv.getValue)) - } - - override def newFlowConnector(props : Map[AnyRef,AnyRef]) = - new HadoopFlowConnector(props.asJava) - - // TODO unlike newFlowConnector, this does not look at the Job.config - override def openForRead(tap : Tap[_,_,_]) = { - val htap = tap.asInstanceOf[Tap[JobConf,_,_]] - val conf = new JobConf(jobConf) - val fp = new HadoopFlowProcess(conf) - htap.retrieveSourceFields(fp) - htap.sourceConfInit(fp, conf) - htap.openForRead(fp) - } -} - -trait CascadingLocal extends Mode { - override def config = Map[AnyRef, AnyRef]() - - override def newFlowConnector(props : Map[AnyRef,AnyRef]) = - new LocalFlowConnector(props.asJava) - - override def openForRead(tap : Tap[_,_,_]) = { - val ltap = tap.asInstanceOf[Tap[Properties,_,_]] - val fp = new LocalFlowProcess - ltap.retrieveSourceFields(fp) - ltap.openForRead(fp) - } -} - -// Mix-in trait for test modes; overrides fileExists to allow the registration -// of mock filenames for testing. 
-trait TestMode extends Mode { - private var fileSet = Set[String]() - def registerTestFiles(files : Set[String]) = fileSet = files - override def fileExists(filename : String) : Boolean = fileSet.contains(filename) -} - -case class Hdfs(strict : Boolean, @transient conf : Configuration) extends HadoopMode { - override def jobConf = conf - override def fileExists(filename : String) : Boolean = - FileSystem.get(jobConf).exists(new Path(filename)) -} - -case class HadoopTest(@transient conf: Configuration, - @transient buffers: Source => Option[Buffer[Tuple]]) - extends HadoopMode with TestMode { - - // This is a map from source.toString to disk path - private val writePaths = MMap[Source, String]() - private val allPaths = MSet[String]() - - override def jobConf = conf - - @tailrec - private def allocateNewPath(prefix : String, idx : Int) : String = { - val candidate = prefix + idx.toString - if (allPaths(candidate)) { - //Already taken, try again: - allocateNewPath(prefix, idx + 1) - } - else { - // Update all paths: - allPaths += candidate - candidate - } - } - - private val thisTestID = UUID.randomUUID - private val basePath = "/tmp/scalding/%s/".format(thisTestID) - // Looks up a local path to write the given source to - def getWritePathFor(src : Source) : String = { - val rndIdx = new java.util.Random().nextInt(1 << 30) - writePaths.getOrElseUpdate(src, allocateNewPath(basePath + src.getClass.getName, rndIdx)) - } - - def finalize(src : Source) { - // Get the buffer for the given source, and empty it: - val buf = buffers(src).get - buf.clear() - // Now fill up this buffer with the content of the file - val path = getWritePathFor(src) - // We read the write tap in order to add its contents in the test buffers - val it = openForRead(src.createTap(Write)(this)) - while(it != null && it.hasNext) { - buf += new Tuple(it.next.getTuple) - } - //Clean up this data off the disk - new File(path).delete() - writePaths -= src - } -} - -case class Local(strictSources: 
Boolean) extends CascadingLocal { - override def fileExists(filename : String) : Boolean = new File(filename).exists -} - -/** -* Memory only testing for unit tests -*/ -case class Test(buffers : (Source) => Option[Buffer[Tuple]]) extends TestMode with CascadingLocal diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala b/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala index e52e7b53c8..3220fb79b5 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Operations.scala @@ -12,61 +12,98 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding { -import cascading.operation._ -import cascading.tuple._ -import cascading.flow._ -import cascading.pipe.assembly.AggregateBy -import cascading.pipe._ -import com.twitter.chill.MeatLocker -import scala.collection.JavaConverters._ + import cascading.operation._ + import cascading.tuple._ + import cascading.flow._ + import cascading.pipe.assembly.AggregateBy + import com.twitter.chill.MeatLocker + import scala.collection.JavaConverters._ -import org.apache.hadoop.conf.Configuration - -import com.esotericsoftware.kryo.Kryo; - -import com.twitter.algebird.{Semigroup, SummingCache} -import com.twitter.scalding.mathematics.Poisson -import serialization.Externalizer + import com.twitter.algebird.{AdaptiveCache, Semigroup, SummingWithHitsCache} + import com.twitter.scalding.mathematics.Poisson + import serialization.Externalizer + import scala.util.Try trait ScaldingPrepare[C] extends Operation[C] { - abstract override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[C]) { + abstract override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = 
{ RuntimeStats.addFlowProcess(flowProcess) super.prepare(flowProcess, operationCall) } } - class FlatMapFunction[S,T](@transient fn : S => TraversableOnce[T], fields : Fields, - conv : TupleConverter[S], set : TupleSetter[T]) - extends BaseOperation[Any](fields) with Function[Any] with ScaldingPrepare[Any] { + class FlatMapFunction[S, T]( + @transient fn: S => TraversableOnce[T], + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends BaseOperation[Any](fields) + with Function[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) - def operate(flowProcess : FlowProcess[_], functionCall : FunctionCall[Any]) { - lockedFn.get(conv(functionCall.getArguments)).foreach { arg : T => + + /** + * Private helper to get at the function that this FlatMapFunction wraps + */ + private[scalding] def getFunction = fn + + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = + lockedFn.get(conv(functionCall.getArguments)).foreach { arg: T => val this_tup = set(arg) functionCall.getOutputCollector.add(this_tup) } - } } - class MapFunction[S,T](@transient fn : S => T, fields : Fields, - conv : TupleConverter[S], set : TupleSetter[T]) - extends BaseOperation[Any](fields) with Function[Any] with ScaldingPrepare[Any] { + class MapFunction[S, T](@transient fn: S => T, fields: Fields, conv: TupleConverter[S], set: TupleSetter[T]) + extends BaseOperation[Any](fields) + with Function[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) - def operate(flowProcess : FlowProcess[_], functionCall : FunctionCall[Any]) { + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { val res = lockedFn.get(conv(functionCall.getArguments)) functionCall.getOutputCollector.add(set(res)) } } - class CollectFunction[S,T](@transient fn : PartialFunction[S, T], fields : Fields, - conv : TupleConverter[S], set : TupleSetter[T]) - extends BaseOperation[Any](fields) with Function[Any] with 
ScaldingPrepare[Any] { + /* + The IdentityFunction puts empty nodes in the cascading graph. We use these to nudge the cascading planner + in some edge cases. + */ + object IdentityFunction + extends BaseOperation[Any](Fields.ALL) + with Function[Any] + with ScaldingPrepare[Any] { + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = + functionCall.getOutputCollector.add(functionCall.getArguments) + } + + class CleanupIdentityFunction(@transient fn: () => Unit) + extends BaseOperation[Any](Fields.ALL) + with Filter[Any] + with ScaldingPrepare[Any] { + + val lockedEf = Externalizer(fn) + + def isRemove(flowProcess: FlowProcess[_], filterCall: FilterCall[Any]) = false + + override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[Any]): Unit = + Try(lockedEf.get).foreach(_()) + } + + class CollectFunction[S, T]( + @transient fn: PartialFunction[S, T], + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends BaseOperation[Any](fields) + with Function[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) - def operate(flowProcess : FlowProcess[_], functionCall : FunctionCall[Any]) { + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { val partialfn = lockedFn.get val args = conv(functionCall.getArguments) @@ -74,134 +111,349 @@ import serialization.Externalizer functionCall.getOutputCollector.add(set(partialfn(args))) } } -} + } - /** An implementation of map-side combining which is appropriate for associative and commutative functions - * If a cacheSize is given, it is used, else we query - * the config for cascading.aggregateby.threshold (standard cascading param for an equivalent case) - * else we use a default value of 100,000 + /** + * An implementation of map-side combining which is appropriate for associative and commutative functions If + * a cacheSize is given, it is used, else we query the config for cascading.aggregateby.threshold (standard + 
* cascading param for an equivalent case) else we use a default value of 100,000 * - * This keeps a cache of keys up to the cache-size, summing values as keys collide - * On eviction, or completion of this Operation, the key-value pairs are put into outputCollector. + * This keeps a cache of keys up to the cache-size, summing values as keys collide On eviction, or + * completion of this Operation, the key-value pairs are put into outputCollector. * - * This NEVER spills to disk and generally never be a performance penalty. If you have - * poor locality in the keys, you just don't get any benefit but little added cost. + * This NEVER spills to disk and generally never be a performance penalty. If you have poor locality in the + * keys, you just don't get any benefit but little added cost. * - * Note this means that you may still have repeated keys in the output even on a single mapper - * since the key space may be so large that you can't fit all of them in the cache at the same - * time. + * Note this means that you may still have repeated keys in the output even on a single mapper since the key + * space may be so large that you can't fit all of them in the cache at the same time. * * You can use this with the Fields-API by doing: * {{{ - * val msr = new MapsideReduce(Semigroup.from(fn), 'key, 'value, None) - * // MUST map onto the same key,value space (may be multiple fields) - * val mapSideReduced = pipe.eachTo(('key, 'value) -> ('key, 'value)) { _ => msr } + * val msr = new MapsideReduce(Semigroup.from(fn), 'key, 'value, None) + * // MUST map onto the same key,value space (may be multiple fields) + * val mapSideReduced = pipe.eachTo(('key, 'value) -> ('key, 'value)) { _ => msr } * }}} - * That said, this is equivalent to AggregateBy, and the only value is that it is much simpler than AggregateBy. - * AggregateBy assumes several parallel reductions are happening, and thus has many loops, and array lookups - * to deal with that. 
Since this does many fewer allocations, and has a smaller code-path it may be faster for - * the typed-API. + * That said, this is equivalent to AggregateBy, and the only value is that it is much simpler than + * AggregateBy. AggregateBy assumes several parallel reductions are happening, and thus has many loops, and + * array lookups to deal with that. Since this does many fewer allocations, and has a smaller code-path it + * may be faster for the typed-API. */ + object MapsideReduce { + val COUNTER_GROUP = "MapsideReduce" + } + class MapsideReduce[V]( - @transient commutativeSemigroup: Semigroup[V], - keyFields: Fields, valueFields: Fields, - cacheSize: Option[Int])(implicit conv: TupleConverter[V], set: TupleSetter[V]) - extends BaseOperation[SummingCache[Tuple,V]](Fields.join(keyFields, valueFields)) - with Function[SummingCache[Tuple,V]] - with ScaldingPrepare[SummingCache[Tuple,V]] { + @transient commutativeSemigroup: Semigroup[V], + keyFields: Fields, + valueFields: Fields, + cacheSize: Option[Int] + )(implicit conv: TupleConverter[V], set: TupleSetter[V]) + extends BaseOperation[MapsideCache[Tuple, V]](Fields.join(keyFields, valueFields)) + with Function[MapsideCache[Tuple, V]] + with ScaldingPrepare[MapsideCache[Tuple, V]] { val boxedSemigroup = Externalizer(commutativeSemigroup) - val DEFAULT_CACHE_SIZE = 100000 - val SIZE_CONFIG_KEY = AggregateBy.AGGREGATE_BY_THRESHOLD - - def cacheSize(fp: FlowProcess[_]): Int = - cacheSize.orElse { - Option(fp.getStringProperty(SIZE_CONFIG_KEY)) - .filterNot { _.isEmpty } - .map { _.toInt } - } - .getOrElse( DEFAULT_CACHE_SIZE ) - - override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[SummingCache[Tuple,V]]) { - //Set up the context: + override def prepare( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[Tuple, V]] + ): Unit = { + // Set up the context: implicit val sg: Semigroup[V] = boxedSemigroup.get - val cache = SummingCache[Tuple,V](cacheSize(flowProcess)) + val 
cache = MapsideCache[Tuple, V](cacheSize, flowProcess) operationCall.setContext(cache) } @inline - private def add(evicted: Option[Map[Tuple,V]], functionCall: FunctionCall[SummingCache[Tuple,V]]) { + private def add( + evicted: Option[Map[Tuple, V]], + functionCall: FunctionCall[MapsideCache[Tuple, V]] + ): Unit = // Use iterator and while for optimal performance (avoid closures/fn calls) - if(evicted.isDefined) { + if (evicted.isDefined) { + // Don't use pattern matching in performance-critical code + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) val it = evicted.get.iterator val tecol = functionCall.getOutputCollector - while(it.hasNext) { + while (it.hasNext) { val (key, value) = it.next // Safe to mutate this key as it is evicted from the map key.addAll(set(value)) tecol.add(key) } } - } - override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[SummingCache[Tuple,V]]) { + override def operate( + flowProcess: FlowProcess[_], + functionCall: FunctionCall[MapsideCache[Tuple, V]] + ): Unit = { val cache = functionCall.getContext val keyValueTE = functionCall.getArguments // Have to keep a copy of the key tuple because cascading will modify it val key = keyValueTE.selectEntry(keyFields).getTupleCopy val value = conv(keyValueTE.selectEntry(valueFields)) - add(cache.put(Map(key -> value)), functionCall) + val evicted = cache.put(key, value) + add(evicted, functionCall) + } + + override def flush( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[Tuple, V]] + ): Unit = { + // Docs say it is safe to do this cast: + // http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/Operation.html#flush(cascading.flow.FlowProcess, cascading.operation.OperationCall) + val functionCall = operationCall.asInstanceOf[FunctionCall[MapsideCache[Tuple, V]]] + val cache = functionCall.getContext + add(cache.flush, functionCall) + } + + override def cleanup( + flowProcess: FlowProcess[_], + operationCall: 
OperationCall[MapsideCache[Tuple, V]] + ): Unit = + // The cache may be large, but super sure we drop any reference to it ASAP + // probably overly defensive, but it's super cheap. + operationCall.setContext(null) + } + + class TypedMapsideReduce[K, V]( + @transient fn: TupleEntry => TraversableOnce[(K, V)], + @transient commutativeSemigroup: Semigroup[V], + sourceFields: Fields, + keyFields: Fields, + valueFields: Fields, + cacheSize: Option[Int] + )(implicit setKV: TupleSetter[(K, V)]) + extends BaseOperation[MapsideCache[K, V]](Fields.join(keyFields, valueFields)) + with Function[MapsideCache[K, V]] + with ScaldingPrepare[MapsideCache[K, V]] { + + val boxedSemigroup = Externalizer(commutativeSemigroup) + val lockedFn = Externalizer(fn) + + override def prepare( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[K, V]] + ): Unit = { + // Set up the context: + implicit val sg: Semigroup[V] = boxedSemigroup.get + val cache = MapsideCache[K, V](cacheSize, flowProcess) + operationCall.setContext(cache) + } + + // Don't use pattern matching in a performance-critical section + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + @inline + private def add(evicted: Option[Map[K, V]], functionCall: FunctionCall[MapsideCache[K, V]]): Unit = + // Use iterator and while for optimal performance (avoid closures/fn calls) + if (evicted.isDefined) { + val it = evicted.get.iterator + val tecol = functionCall.getOutputCollector + while (it.hasNext) { + val (key, value) = it.next + // Safe to mutate this key as it is evicted from the map + tecol.add(setKV(key, value)) + } + } + + import scala.collection.mutable.{Map => MMap} + + private[this] class CollectionBackedMap[K, V](val backingMap: MMap[K, V]) + extends Map[K, V] + with java.io.Serializable { + def get(key: K) = backingMap.get(key) + + def iterator = backingMap.iterator + + def +[B1 >: V](kv: (K, B1)) = backingMap.toMap + kv + + def -(key: K) = backingMap.toMap - key + } + + // Don't use 
pattern matching in a performance-critical section + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + private[this] def mergeTraversableOnce[K, V: Semigroup](items: TraversableOnce[(K, V)]): Map[K, V] = { + val mutable = + scala.collection.mutable + .OpenHashMap[K, V]() // Scala's OpenHashMap seems faster than Java and Scala's HashMap Impl's + val innerIter = items.toIterator + while (innerIter.hasNext) { + val (k, v) = innerIter.next + val oldVOpt: Option[V] = mutable.get(k) + // sorry for the micro optimization here: avoiding a closure + val newV: V = if (oldVOpt.isEmpty) v else Semigroup.plus(oldVOpt.get, v) + mutable.update(k, newV) + } + new CollectionBackedMap(mutable) } - override def flush(flowProcess: FlowProcess[_], operationCall: OperationCall[SummingCache[Tuple,V]]) { + override def operate( + flowProcess: FlowProcess[_], + functionCall: FunctionCall[MapsideCache[K, V]] + ): Unit = { + val cache = functionCall.getContext + implicit val sg: Semigroup[V] = boxedSemigroup.get + val res: Map[K, V] = mergeTraversableOnce(lockedFn.get(functionCall.getArguments)) + val evicted = cache.putAll(res) + add(evicted, functionCall) + } + + override def flush( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[K, V]] + ): Unit = { // Docs say it is safe to do this cast: // http://docs.cascading.org/cascading/2.1/javadoc/cascading/operation/Operation.html#flush(cascading.flow.FlowProcess, cascading.operation.OperationCall) - val functionCall = operationCall.asInstanceOf[FunctionCall[SummingCache[Tuple,V]]] + val functionCall = operationCall.asInstanceOf[FunctionCall[MapsideCache[K, V]]] val cache = functionCall.getContext add(cache.flush, functionCall) } - override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[SummingCache[Tuple,V]]) { + override def cleanup( + flowProcess: FlowProcess[_], + operationCall: OperationCall[MapsideCache[K, V]] + ): Unit = // The cache may be large, but super sure we drop any 
reference to it ASAP // probably overly defensive, but it's super cheap. operationCall.setContext(null) + } + + sealed trait MapsideCache[K, V] { + def flush: Option[Map[K, V]] + def put(key: K, value: V): Option[Map[K, V]] + + def putAll(key: Map[K, V]): Option[Map[K, V]] + } + + object MapsideCache { + val DEFAULT_CACHE_SIZE = 100000 + val SIZE_CONFIG_KEY = AggregateBy.AGGREGATE_BY_THRESHOLD + val ADAPTIVE_CACHE_KEY = "scalding.mapsidecache.adaptive" + + private def getCacheSize(fp: FlowProcess[_]): Int = + Option(fp.getStringProperty(SIZE_CONFIG_KEY)) + .filterNot(_.isEmpty) + .map(_.toInt) + .getOrElse(DEFAULT_CACHE_SIZE) + + def apply[K, V: Semigroup](cacheSize: Option[Int], flowProcess: FlowProcess[_]): MapsideCache[K, V] = { + val size = cacheSize.getOrElse(getCacheSize(flowProcess)) + val adaptive = Option(flowProcess.getStringProperty(ADAPTIVE_CACHE_KEY)).isDefined + if (adaptive) + new AdaptiveMapsideCache(flowProcess, new AdaptiveCache(size)) + else + new SummingMapsideCache(flowProcess, new SummingWithHitsCache(size)) + } + } + + final class SummingMapsideCache[K, V](flowProcess: FlowProcess[_], summingCache: SummingWithHitsCache[K, V]) + extends MapsideCache[K, V] { + private[this] val misses = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "misses")) + private[this] val hits = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "hits")) + private[this] val evictions = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "evictions")) + + def flush = summingCache.flush + + // Don't use pattern matching in performance-critical code + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def put(key: K, value: V): Option[Map[K, V]] = { + val (curHits, evicted) = summingCache.putWithHits(Map(key -> value)) + misses.increment(1 - curHits) + hits.increment(curHits) + + if (evicted.isDefined) + evictions.increment(evicted.get.size) + evicted + } + + // Don't use pattern matching in a performance-critical section 
+ @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def putAll(kvs: Map[K, V]): Option[Map[K, V]] = { + val (curHits, evicted) = summingCache.putWithHits(kvs) + misses.increment(kvs.size - curHits) + hits.increment(curHits) + + if (evicted.isDefined) + evictions.increment(evicted.get.size) + evicted + } + } + + final class AdaptiveMapsideCache[K, V](flowProcess: FlowProcess[_], adaptiveCache: AdaptiveCache[K, V]) + extends MapsideCache[K, V] { + private[this] val misses = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "misses")) + private[this] val hits = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "hits")) + private[this] val capacity = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "capacity")) + private[this] val sentinel = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "sentinel")) + private[this] val evictions = CounterImpl(flowProcess, StatKey(MapsideReduce.COUNTER_GROUP, "evictions")) + + def flush = adaptiveCache.flush + + // Don't use pattern matching in performance-critical code + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def put(key: K, value: V) = { + val (stats, evicted) = adaptiveCache.putWithStats(Map(key -> value)) + misses.increment(1 - stats.hits) + hits.increment(stats.hits) + capacity.increment(stats.cacheGrowth) + sentinel.increment(stats.sentinelGrowth) + + if (evicted.isDefined) + evictions.increment(evicted.get.size) + + evicted + + } + + // Don't use pattern matching in a performance-critical section + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def putAll(kvs: Map[K, V]): Option[Map[K, V]] = { + val (stats, evicted) = adaptiveCache.putWithStats(kvs) + misses.increment(kvs.size - stats.hits) + hits.increment(stats.hits) + capacity.increment(stats.cacheGrowth) + sentinel.increment(stats.sentinelGrowth) + + if (evicted.isDefined) + evictions.increment(evicted.get.size) + + evicted } } /* * BaseOperation with support for 
context */ - abstract class SideEffectBaseOperation[C] ( - @transient bf: => C, // begin function returns a context - @transient ef: C => Unit, // end function to clean up context object - fields: Fields - ) extends BaseOperation[C](fields) with ScaldingPrepare[C] { + abstract class SideEffectBaseOperation[C]( + @transient bf: => C, // begin function returns a context + @transient ef: C => Unit, // end function to clean up context object + fields: Fields + ) extends BaseOperation[C](fields) + with ScaldingPrepare[C] { val lockedBf = Externalizer(() => bf) val lockedEf = Externalizer(ef) - override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[C]) { + override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = operationCall.setContext(lockedBf.get.apply) - } - override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[C]) { + override def cleanup(flowProcess: FlowProcess[_], operationCall: OperationCall[C]): Unit = lockedEf.get(operationCall.getContext) - } - } + } /* * A map function that allows state object to be set up and tear down. 
*/ - class SideEffectMapFunction[S, C, T] ( - bf: => C, // begin function returns a context - @transient fn: (C, S) => T, // function that takes a context and a tuple and generate a new tuple - ef: C => Unit, // end function to clean up context object - fields: Fields, - conv: TupleConverter[S], - set: TupleSetter[T] - ) extends SideEffectBaseOperation[C](bf, ef, fields) with Function[C] { + class SideEffectMapFunction[S, C, T]( + bf: => C, // begin function returns a context + @transient fn: (C, S) => T, // function that takes a context and a tuple and generate a new tuple + ef: C => Unit, // end function to clean up context object + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends SideEffectBaseOperation[C](bf, ef, fields) + with Function[C] { val lockedFn = Externalizer(fn) - override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[C]) { + override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[C]): Unit = { val context = functionCall.getContext val s = conv(functionCall.getArguments) val res = lockedFn.get(context, s) @@ -212,91 +464,98 @@ import serialization.Externalizer /* * A flatmap function that allows state object to be set up and tear down. 
*/ - class SideEffectFlatMapFunction[S, C, T] ( - bf: => C, // begin function returns a context - @transient fn: (C, S) => TraversableOnce[T], // function that takes a context and a tuple, returns TraversableOnce of T - ef: C => Unit, // end function to clean up context object - fields: Fields, - conv: TupleConverter[S], - set: TupleSetter[T] - ) extends SideEffectBaseOperation[C](bf, ef, fields) with Function[C] { + class SideEffectFlatMapFunction[S, C, T]( + bf: => C, // begin function returns a context + @transient fn: ( + C, + S + ) => TraversableOnce[T], // function that takes a context and a tuple, returns TraversableOnce of T + ef: C => Unit, // end function to clean up context object + fields: Fields, + conv: TupleConverter[S], + set: TupleSetter[T] + ) extends SideEffectBaseOperation[C](bf, ef, fields) + with Function[C] { val lockedFn = Externalizer(fn) - override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[C]) { + override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[C]): Unit = { val context = functionCall.getContext val s = conv(functionCall.getArguments) - lockedFn.get(context, s) foreach { t => functionCall.getOutputCollector.add(set(t)) } + lockedFn.get(context, s).foreach(t => functionCall.getOutputCollector.add(set(t))) } } - class FilterFunction[T](@transient fn : T => Boolean, conv : TupleConverter[T]) - extends BaseOperation[Any] with Filter[Any] with ScaldingPrepare[Any] { + class FilterFunction[T](@transient fn: T => Boolean, conv: TupleConverter[T]) + extends BaseOperation[Any] + with Filter[Any] + with ScaldingPrepare[Any] { val lockedFn = Externalizer(fn) - def isRemove(flowProcess : FlowProcess[_], filterCall : FilterCall[Any]) = { + def isRemove(flowProcess: FlowProcess[_], filterCall: FilterCall[Any]) = !lockedFn.get(conv(filterCall.getArguments)) - } } // All the following are operations for use in GroupBuilder - class FoldAggregator[T,X](@transient fn : (X,T) => X, @transient init : X, 
fields : Fields, - conv : TupleConverter[T], set : TupleSetter[X]) - extends BaseOperation[X](fields) with Aggregator[X] with ScaldingPrepare[X] { + class FoldAggregator[T, X]( + @transient fn: (X, T) => X, + @transient init: X, + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends BaseOperation[X](fields) + with Aggregator[X] + with ScaldingPrepare[X] { val lockedFn = Externalizer(fn) private val lockedInit = MeatLocker(init) def initCopy = lockedInit.copy - def start(flowProcess : FlowProcess[_], call : AggregatorCall[X]) { + def start(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = call.setContext(initCopy) - } - def aggregate(flowProcess : FlowProcess[_], call : AggregatorCall[X]) { + def aggregate(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = { val left = call.getContext val right = conv(call.getArguments) call.setContext(lockedFn.get(left, right)) } - def complete(flowProcess : FlowProcess[_], call : AggregatorCall[X]) { - emit(flowProcess, call) - } - - def emit(flowProcess : FlowProcess[_], call : AggregatorCall[X]) { + def complete(flowProcess: FlowProcess[_], call: AggregatorCall[X]): Unit = call.getOutputCollector.add(set(call.getContext)) - } } /* * fields are the declared fields of this aggregator */ - class MRMAggregator[T,X,U]( - @transient inputFsmf : T => X, - @transient inputRfn : (X,X) => X, - @transient inputMrfn : X => U, - fields : Fields, conv : TupleConverter[T], set : TupleSetter[U]) - extends BaseOperation[Tuple](fields) with Aggregator[Tuple] with ScaldingPrepare[Tuple] { + class MRMAggregator[T, X, U]( + @transient inputFsmf: T => X, + @transient inputRfn: (X, X) => X, + @transient inputMrfn: X => U, + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[U] + ) extends BaseOperation[Tuple](fields) + with Aggregator[Tuple] + with ScaldingPrepare[Tuple] { val fsmf = Externalizer(inputFsmf) val rfn = Externalizer(inputRfn) val mrfn = Externalizer(inputMrfn) // The context is a 
singleton Tuple, which is mutable so // we don't have to allocate at every step of the loop: - def start(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) { - call.setContext(null) - } + def start(flowProcess: FlowProcess[_], call: AggregatorCall[Tuple]): Unit = + call.setContext(null) - def extractArgument(call : AggregatorCall[Tuple]) : X = fsmf.get(conv(call.getArguments)) + def extractArgument(call: AggregatorCall[Tuple]): X = fsmf.get(conv(call.getArguments)) - def aggregate(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) { + def aggregate(flowProcess: FlowProcess[_], call: AggregatorCall[Tuple]): Unit = { val arg = extractArgument(call) val ctx = call.getContext - if (null == ctx) { + if (ctx == null) { // Initialize the context, this is the only allocation done by this loop. val newCtx = Tuple.size(1) newCtx.set(0, arg.asInstanceOf[AnyRef]) call.setContext(newCtx) - } - else { + } else { // Mutate the context: val oldValue = ctx.getObject(0).asInstanceOf[X] val newValue = rfn.get(oldValue, arg) @@ -304,31 +563,29 @@ import serialization.Externalizer } } - def complete(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) { + def complete(flowProcess: FlowProcess[_], call: AggregatorCall[Tuple]): Unit = { val ctx = call.getContext if (null != ctx) { val lastValue = ctx.getObject(0).asInstanceOf[X] // Make sure to drop the reference to the lastValue as soon as possible (it may be big) call.setContext(null) call.getOutputCollector.add(set(mrfn.get(lastValue))) - } - else { + } else { throw new Exception("MRMAggregator completed without any args") } } } /** - * This handles the mapReduceMap work on the map-side of the operation. The code below - * attempts to be optimal with respect to memory allocations and performance, not functional - * style purity. + * This handles the mapReduceMap work on the map-side of the operation. 
The code below attempts to be + * optimal with respect to memory allocations and performance, not functional style purity. */ - abstract class FoldFunctor[X](fields : Fields) extends AggregateBy.Functor { + abstract class FoldFunctor[X](fields: Fields) extends AggregateBy.Functor { // Extend these three methods: - def first(args : TupleEntry) : X - def subsequent(oldValue : X, newArgs : TupleEntry) : X - def finish(lastValue : X) : Tuple + def first(args: TupleEntry): X + def subsequent(oldValue: X, newArgs: TupleEntry): X + def finish(lastValue: X): Tuple override final def getDeclaredFields = fields @@ -337,153 +594,183 @@ import serialization.Externalizer * reuse these objects, so any per instance state might give unexpected * results. */ - override final def aggregate(flowProcess : FlowProcess[_], args : TupleEntry, context : Tuple) = { - var nextContext : Tuple = null - val newContextObj = if (null == context) { + override final def aggregate(flowProcess: FlowProcess[_], args: TupleEntry, context: Tuple) = { + var nextContext: Tuple = null + val newContextObj = if (context == null) { // First call, make a new mutable tuple to reduce allocations: nextContext = Tuple.size(1) first(args) - } - else { - //We are updating + } else { + // We are updating val oldValue = context.getObject(0).asInstanceOf[X] nextContext = context subsequent(oldValue, args) } nextContext.set(0, newContextObj.asInstanceOf[AnyRef]) - //Return context for reuse next time: + // Return context for reuse next time: nextContext } - override final def complete(flowProcess : FlowProcess[_], context : Tuple) = { - if (null == context) { + override final def complete(flowProcess: FlowProcess[_], context: Tuple) = + if (context == null) { throw new Exception("FoldFunctor completed with any aggregate calls") - } - else { + } else { val res = context.getObject(0).asInstanceOf[X] // Make sure we remove the ref to the context ASAP: context.set(0, null) finish(res) } - } } /** - * This handles the 
mapReduceMap work on the map-side of the operation. The code below - * attempts to be optimal with respect to memory allocations and performance, not functional - * style purity. + * This handles the mapReduceMap work on the map-side of the operation. The code below attempts to be + * optimal with respect to memory allocations and performance, not functional style purity. */ - class MRMFunctor[T,X]( - @transient inputMrfn : T => X, - @transient inputRfn : (X, X) => X, - fields : Fields, - conv : TupleConverter[T], set : TupleSetter[X]) - extends FoldFunctor[X](fields) { + class MRMFunctor[T, X]( + @transient inputMrfn: T => X, + @transient inputRfn: (X, X) => X, + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends FoldFunctor[X](fields) { val mrfn = Externalizer(inputMrfn) val rfn = Externalizer(inputRfn) - override def first(args : TupleEntry) : X = mrfn.get(conv(args)) - override def subsequent(oldValue : X, newArgs : TupleEntry) = { + override def first(args: TupleEntry): X = mrfn.get(conv(args)) + override def subsequent(oldValue: X, newArgs: TupleEntry) = { val right = mrfn.get(conv(newArgs)) rfn.get(oldValue, right) } - override def finish(lastValue : X) = set(lastValue) + override def finish(lastValue: X) = set(lastValue) } /** * MapReduceMapBy Class */ - class MRMBy[T,X,U](arguments : Fields, - middleFields : Fields, - declaredFields : Fields, - mfn : T => X, - rfn : (X,X) => X, - mfn2 : X => U, - startConv : TupleConverter[T], - midSet : TupleSetter[X], - midConv : TupleConverter[X], - endSet : TupleSetter[U]) extends AggregateBy( + class MRMBy[T, X, U]( + arguments: Fields, + middleFields: Fields, + declaredFields: Fields, + mfn: T => X, + rfn: (X, X) => X, + mfn2: X => U, + startConv: TupleConverter[T], + midSet: TupleSetter[X], + midConv: TupleConverter[X], + endSet: TupleSetter[U] + ) extends AggregateBy( arguments, - new MRMFunctor[T,X](mfn, rfn, middleFields, startConv, midSet), - new MRMAggregator[X,X,U](args => args, rfn, 
mfn2, declaredFields, midConv, endSet)) - - class BufferOp[I,T,X]( - @transient init : I, - @transient inputIterfn : (I, Iterator[T]) => TraversableOnce[X], - fields : Fields, conv : TupleConverter[T], set : TupleSetter[X]) - extends BaseOperation[Any](fields) with Buffer[Any] with ScaldingPrepare[Any] { + new MRMFunctor[T, X](mfn, rfn, middleFields, startConv, midSet), + new MRMAggregator[X, X, U](args => args, rfn, mfn2, declaredFields, midConv, endSet) + ) + + class BufferOp[I, T, X]( + @transient init: I, + @transient inputIterfn: (I, Iterator[T]) => TraversableOnce[X], + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends BaseOperation[Any](fields) + with Buffer[Any] + with ScaldingPrepare[Any] { val iterfn = Externalizer(inputIterfn) private val lockedInit = MeatLocker(init) def initCopy = lockedInit.copy - def operate(flowProcess : FlowProcess[_], call : BufferCall[Any]) { + def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]): Unit = { val oc = call.getOutputCollector - val in = call.getArgumentsIterator.asScala.map { entry => conv(entry) } - iterfn.get(initCopy, in).foreach { x => oc.add(set(x)) } + val in = call.getArgumentsIterator.asScala.map(entry => conv(entry)) + iterfn.get(initCopy, in).foreach(x => oc.add(set(x))) } } /* * A buffer that allows state object to be set up and tear down. 
*/ - class SideEffectBufferOp[I,T,C,X]( - @transient init : I, - bf: => C, // begin function returns a context - @transient inputIterfn: (I, C, Iterator[T]) => TraversableOnce[X], - ef: C => Unit, // end function to clean up context object - fields: Fields, - conv: TupleConverter[T], - set: TupleSetter[X] - ) extends SideEffectBaseOperation[C](bf, ef, fields) with Buffer[C] { + class SideEffectBufferOp[I, T, C, X]( + @transient init: I, + bf: => C, // begin function returns a context + @transient inputIterfn: (I, C, Iterator[T]) => TraversableOnce[X], + ef: C => Unit, // end function to clean up context object + fields: Fields, + conv: TupleConverter[T], + set: TupleSetter[X] + ) extends SideEffectBaseOperation[C](bf, ef, fields) + with Buffer[C] { val iterfn = Externalizer(inputIterfn) private val lockedInit = MeatLocker(init) def initCopy = lockedInit.copy - def operate(flowProcess : FlowProcess[_], call : BufferCall[C]) { + def operate(flowProcess: FlowProcess[_], call: BufferCall[C]): Unit = { val context = call.getContext val oc = call.getOutputCollector - val in = call.getArgumentsIterator.asScala.map { entry => conv(entry) } - iterfn.get(initCopy, context, in).foreach { x => oc.add(set(x)) } + val in = call.getArgumentsIterator.asScala.map(entry => conv(entry)) + iterfn.get(initCopy, context, in).foreach(x => oc.add(set(x))) } } - class SampleWithReplacement(frac : Double, val seed : Int = new scala.util.Random().nextInt) extends BaseOperation[Poisson]() - with Function[Poisson] with ScaldingPrepare[Poisson] { - override def prepare(flowProcess : FlowProcess[_], operationCall : OperationCall[Poisson]) { + class SampleWithReplacement(frac: Double, val seed: Int = new java.util.Random().nextInt) + extends BaseOperation[Poisson]() + with Function[Poisson] + with ScaldingPrepare[Poisson] { + override def prepare(flowProcess: FlowProcess[_], operationCall: OperationCall[Poisson]): Unit = { super.prepare(flowProcess, operationCall) val p = new Poisson(frac, seed) 
- operationCall.setContext( p ); + operationCall.setContext(p) } - def operate(flowProcess : FlowProcess[_], functionCall : FunctionCall[Poisson]) { + def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Poisson]): Unit = { val r = functionCall.getContext.nextInt for (i <- 0 until r) - functionCall.getOutputCollector().add( Tuple.NULL ) + functionCall.getOutputCollector().add(Tuple.NULL) } } /** In the typed API every reduce operation is handled by this Buffer */ - class TypedBufferOp[K,V,U]( - @transient reduceFn: (K, Iterator[V]) => Iterator[U], - valueField: Fields) - extends BaseOperation[Any](valueField) with Buffer[Any] with ScaldingPrepare[Any] { + class TypedBufferOp[K, V, U]( + conv: TupleConverter[K], + convV: TupleConverter[V], + @transient reduceFn: (K, Iterator[V]) => Iterator[U], + valueField: Fields + ) extends BaseOperation[Any](valueField) + with Buffer[Any] + with ScaldingPrepare[Any] { val reduceFnSer = Externalizer(reduceFn) - def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]) { + def operate(flowProcess: FlowProcess[_], call: BufferCall[Any]): Unit = { val oc = call.getOutputCollector - val key = call.getGroup.getObject(0).asInstanceOf[K] - val values = call.getArgumentsIterator - .asScala - .map(_.getObject(0).asInstanceOf[V]) + val key = conv(call.getGroup) + val values = call.getArgumentsIterator.asScala + .map(convV(_)) // Avoiding a lambda here val resIter = reduceFnSer.get(key, values) - while(resIter.hasNext) { + while (resIter.hasNext) { val tup = Tuple.size(1) tup.set(0, resIter.next) oc.add(tup) } } } + + /** + * This gets a pair out of a tuple, incruments the counters with the left, and passes the value on + */ + class IncrementCounters[A](pass: Fields, conv: TupleConverter[(A, Iterable[((String, String), Long)])]) + extends BaseOperation[Any](pass) + with Function[Any] { + + override def operate(flowProcess: FlowProcess[_], functionCall: FunctionCall[Any]): Unit = { + val (a, inc) = 
conv(functionCall.getArguments) + val iter = inc.iterator + while (iter.hasNext) { + val ((k1, k2), amt) = iter.next + flowProcess.increment(k1, k2, amt) + } + val tup = Tuple.size(1) + tup.set(0, a) + functionCall.getOutputCollector.add(tup) + } + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala new file mode 100644 index 0000000000..aa4125326c --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/OptionalSource.scala @@ -0,0 +1,33 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import scala.util.{Failure, Success, Try} +import cascading.tap.Tap + +case class OptionalSource[T](src: Mappable[T]) extends Source with Mappable[T] { + override def converter[U >: T] = TupleConverter.asSuperConverter(src.converter) + + def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = + Try(src.validateTaps(mode)) match { + case Success(_) => + src.createTap(readOrWrite) + case Failure(_) => + IterableSource[T](Nil)(TupleSetter.singleSetter[T], src.converter) + .createTap(readOrWrite) + .asInstanceOf[Tap[_, _, _]] + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala new file mode 100644 index 0000000000..782d948996 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/PartitionSource.scala @@ -0,0 +1,208 @@ +/* +Copyright 2014 Snowplow Analytics Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import cascading.tap.hadoop.Hfs +import cascading.tap.hadoop.{PartitionTap => HPartitionTap} +import cascading.tap.local.FileTap +import cascading.tap.local.{PartitionTap => LPartitionTap} +import cascading.tap.partition.DelimitedPartition +import cascading.tap.partition.Partition +import cascading.tap.SinkMode +import cascading.tap.Tap +import cascading.tuple.Fields + +/** + * This is a base class for partition-based output sources + */ +abstract class PartitionSource(val openWritesThreshold: Option[Int] = None) + extends SchemedSource + with HfsTapProvider { + + // The root path of the partitioned output. + def basePath: String + // The partition. + def partition: Partition = new DelimitedPartition(Fields.ALL, "/") + + /** + * Creates the partition tap. + * + * @param readOrWrite + * Describes if this source is being read from or written to. + * @param mode + * The mode of the job. (implicit) + * + * @return + * A cascading PartitionTap. + */ + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = + readOrWrite match { + case Read => throw new InvalidSourceException("Using PartitionSource for input not yet implemented") + case Write => { + mode match { + case Local(_) => { + val localTap = new FileTap(localScheme, basePath, sinkMode) + openWritesThreshold match { + case Some(threshold) => new LPartitionTap(localTap, partition, threshold) + case None => new LPartitionTap(localTap, partition) + } + } + case hdfsMode @ Hdfs(_, _) => { + val hfsTap = createHfsTap(hdfsScheme, basePath, sinkMode) + getHPartitionTap(hfsTap) + } + case hdfsTest @ HadoopTest(_, _) => { + val hfsTap = createHfsTap(hdfsScheme, hdfsTest.getWritePathFor(this), sinkMode) + getHPartitionTap(hfsTap) + } + case _ => TestTapFactory(this, hdfsScheme).createTap(readOrWrite) + } + } + } + + /** + * Validates the taps, makes sure there are no nulls in the path. + * + * @param mode + * The mode of the job. 
+ */ + override def validateTaps(mode: Mode): Unit = + if (basePath == null) { + throw new InvalidSourceException("basePath cannot be null for PartitionTap") + } + + private[this] def getHPartitionTap(hfsTap: Hfs): HPartitionTap = + openWritesThreshold match { + case Some(threshold) => new HPartitionTap(hfsTap, partition, threshold) + case None => new HPartitionTap(hfsTap, partition) + } +} + +/** + * An implementation of TSV output, split over a partition tap. + * + * Similar to TemplateSource, but with addition of tsvFields, to let users explicitly specify which fields + * they want to see in the TSV (allows user to discard path fields). + * + * apply assumes user wants a DelimitedPartition (the only strategy bundled with Cascading). + * + * @param basePath + * The root path for the output. + * @param delimiter + * The path delimiter, defaults to / to create sub-directory bins. + * @param pathFields + * The set of fields to apply to the path. + * @param writeHeader + * Flag to indicate that the header should be written to the file. + * @param tsvFields + * The set of fields to include in the TSV output. + * @param sinkMode + * How to handle conflicts with existing output. + */ +object PartitionedTsv { + def apply( + basePath: String, + delimiter: String = "/", + pathFields: Fields = Fields.ALL, + writeHeader: Boolean = false, + tsvFields: Fields = Fields.ALL, + sinkMode: SinkMode = SinkMode.REPLACE + ) = new PartitionedTsv( + basePath, + new DelimitedPartition(pathFields, delimiter), + writeHeader, + tsvFields, + sinkMode + ) +} + +/** + * An implementation of TSV output, split over a partition tap. + * + * @param basePath + * The root path for the output. + * @param partition + * The partitioning strategy to use. + * @param writeHeader + * Flag to indicate that the header should be written to the file. + * @param sinkMode + * How to handle conflicts with existing output. 
+ */ +case class PartitionedTsv( + override val basePath: String, + override val partition: Partition, + override val writeHeader: Boolean, + val tsvFields: Fields, + override val sinkMode: SinkMode +) extends PartitionSource + with DelimitedScheme { + + override val fields = tsvFields +} + +/** + * An implementation of SequenceFile output, split over a partition tap. + * + * apply assumes user wants a DelimitedPartition (the only strategy bundled with Cascading). + * + * @param basePath + * The root path for the output. + * @param delimiter + * The path delimiter, defaults to / to create sub-directory bins. + * @param pathFields + * The set of fields to apply to the path. + * @param sequenceFields + * The set of fields to use for the sequence file. + * @param sinkMode + * How to handle conflicts with existing output. + */ +object PartitionedSequenceFile { + def apply( + basePath: String, + delimiter: String = "/", + pathFields: Fields = Fields.ALL, + sequenceFields: Fields = Fields.ALL, + sinkMode: SinkMode = SinkMode.REPLACE + ) = new PartitionedSequenceFile( + basePath, + new DelimitedPartition(pathFields, delimiter), + sequenceFields, + sinkMode + ) +} + +/** + * An implementation of SequenceFile output, split over a partition tap. + * + * @param basePath + * The root path for the output. + * @param partition + * The partitioning strategy to use. + * @param sequenceFields + * The set of fields to use for the sequence file. + * @param sinkMode + * How to handle conflicts with existing output. 
+ */ +case class PartitionedSequenceFile( + override val basePath: String, + override val partition: Partition, + val sequenceFields: Fields, + override val sinkMode: SinkMode +) extends PartitionSource + with SequenceFileScheme { + + override val fields = sequenceFields +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala b/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala index a867e8ada4..dddc108c3a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/PipeDebug.scala @@ -12,23 +12,25 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.pipe.{Pipe, Each} +import cascading.pipe.{Each, Pipe} import cascading.operation.Debug import cascading.operation.Debug.Output -/** This is a builder for Cascading's Debug object. - * The default instance is the same default as cascading's new Debug() +/** + * This is a builder for Cascading's Debug object. 
The default instance is the same default as cascading's new + * Debug() * https://github.com/cwensel/cascading/blob/wip-2.5/cascading-core/src/main/java/cascading/operation/Debug.java#L46 - * This is based on work by: https://github.com/granthenke - * https://github.com/twitter/scalding/pull/559 + * This is based on work by: https://github.com/granthenke https://github.com/twitter/scalding/pull/559 */ -case class PipeDebug(output: Output = Output.STDERR, - prefix: String = null, - printFieldsEvery: Option[Int] = None, - printTuplesEvery: Int = 1) { +case class PipeDebug( + output: Output = Output.STDERR, + prefix: String = null, + printFieldsEvery: Option[Int] = None, + printTuplesEvery: Int = 1 +) { def toStdOut: PipeDebug = copy(output = Output.STDOUT) def toStdErr: PipeDebug = copy(output = Output.STDERR) @@ -39,8 +41,8 @@ case class PipeDebug(output: Output = Output.STDERR, def toDebug: Debug = { val debug = new Debug(output, prefix, printFieldsEvery.isDefined) - if(printFieldsEvery.isDefined) { - debug.setPrintFieldsEvery(printFieldsEvery.get) + printFieldsEvery.foreach { x => + debug.setPrintFieldsEvery(x) } debug.setPrintTupleEvery(printTuplesEvery) debug @@ -48,4 +50,3 @@ case class PipeDebug(output: Output = Output.STDERR, def apply(p: Pipe): Pipe = new Each(p, toDebug) } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala b/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala index 173420e2b7..9eed5e9905 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/ReduceOperations.scala @@ -12,24 +12,15 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.Fields -import cascading.tuple.{Tuple => CTuple, TupleEntry} - -import com.twitter.algebird.{ - Monoid, - Semigroup, - Ring, - AveragedValue, - Moments, - HyperLogLogMonoid, - HLL, - Aggregator -} +import cascading.tuple.{Tuple => CTuple} + +import com.twitter.algebird.{Aggregator, AveragedValue, HLL, HyperLogLogMonoid, Moments, Ring, Semigroup} -import com.twitter.algebird.mutable.PriorityQueueMonoid +import com.twitter.scalding.typed.functions.ScaldingPriorityQueueMonoid import java.util.PriorityQueue @@ -37,58 +28,60 @@ import scala.collection.JavaConverters._ import Dsl._ //Get the conversion implicits -/** Implements reductions on top of a simple abstraction for the Fields-API - * This is for associative and commutive operations (particularly Monoids and Semigroups play a big role here) +/** + * Implements reductions on top of a simple abstraction for the Fields-API This is for associative and + * commutive operations (particularly Monoids and Semigroups play a big role here) * - * We use the f-bounded polymorphism trick to return the type called Self - * in each operation. + * We use the f-bounded polymorphism trick to return the type called Self in each operation. */ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializable { - /** - * Type T is the type of the input field (input to map, T => X) - * Type X is the intermediate type, which your reduce function operates on - * (reduce is (X,X) => X) - * Type U is the final result type, (final map is: X => U) - * - * The previous output goes into the reduce function on the left, like foldLeft, - * so if your operation is faster for the accumulator to be on one side, be aware. - * - * Assumed to be a commutative operation. 
If you don't want that, use .forceToReducers - */ - def mapReduceMap[T,X,U](fieldDef : (Fields, Fields))(mapfn : T => X )(redfn : (X, X) => X) - (mapfn2 : X => U)(implicit startConv : TupleConverter[T], - middleSetter : TupleSetter[X], - middleConv : TupleConverter[X], - endSetter : TupleSetter[U]) : Self + + /** + * Type T is the type of the input field (input to map, T => X) Type X is the intermediate type, which your + * reduce function operates on (reduce is (X,X) => X) Type U is the final result type, (final map is: X => + * U) + * + * The previous output goes into the reduce function on the left, like foldLeft, so if your operation is + * faster for the accumulator to be on one side, be aware. + * + * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + */ + def mapReduceMap[T, X, U]( + fieldDef: (Fields, Fields) + )(mapfn: T => X)(redfn: (X, X) => X)(mapfn2: X => U)(implicit + startConv: TupleConverter[T], + middleSetter: TupleSetter[X], + middleConv: TupleConverter[X], + endSetter: TupleSetter[U] + ): Self ///////////////////////////////////////// // All the below functions are implemented in terms of the above ///////////////////////////////////////// /** Pretty much a synonym for mapReduceMap with the methods collected into a trait. 
*/ - def aggregate[A,B,C](fieldDef : (Fields, Fields))(ag: Aggregator[A,B,C]) - (implicit startConv : TupleConverter[A], - middleSetter : TupleSetter[B], - middleConv : TupleConverter[B], - endSetter : TupleSetter[C]): Self = - mapReduceMap[A,B,C](fieldDef)(ag.prepare _)(ag.reduce _)(ag.present _) + def aggregate[A, B, C](fieldDef: (Fields, Fields))(ag: Aggregator[A, B, C])(implicit + startConv: TupleConverter[A], + middleSetter: TupleSetter[B], + middleConv: TupleConverter[B], + endSetter: TupleSetter[C] + ): Self = + mapReduceMap[A, B, C](fieldDef)(ag.prepare _)(ag.reduce _)(ag.present _) /** - * uses a more stable online algorithm which should - * be suitable for large numbers of records + * uses a more stable online algorithm which should be suitable for large numbers of records * - * == Similar To == - * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + * ==Similar To== + * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm */ - def average(f : (Fields, Fields)) = mapPlusMap(f) { (x : Double) => AveragedValue(1L, x) } { _.value } - def average(f : Symbol) : Self = average(f->f) + def average(f: (Fields, Fields)) = mapPlusMap(f)((x: Double) => AveragedValue(1L, x))(_.value) + def average(f: Symbol): Self = average(f -> f) /** - * Approximate number of unique values - * We use about m = (104/errPercent)^2 bytes of memory per key - * Uses `.toString.getBytes` to serialize the data so you MUST - * ensure that .toString is an equivalance on your counted fields - * (i.e. `x.toString == y.toString` if and only if `x == y`) + * Approximate number of unique values We use about m = (104/errPercent)^2 bytes of memory per key Uses + * `.toString.getBytes` to serialize the data so you MUST ensure that .toString is an equivalance on your + * counted fields (i.e. 
`x.toString == y.toString` if and only if `x == y`) * * For each key: * {{{ @@ -100,69 +93,58 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * 0.25% error ~ 256kB * }}} */ - def approximateUniqueCount[T <% Array[Byte] : TupleConverter] - (f : (Fields, Fields), errPercent : Double = 1.0) = { - hyperLogLogMap[T,Double](f, errPercent) { _.estimatedSize } - } - - def hyperLogLog[T <% Array[Byte] : TupleConverter] - (f : (Fields, Fields), errPercent : Double = 1.0) = { - hyperLogLogMap[T,HLL](f, errPercent) { hll => hll } - } - - @deprecated("use of approximateUniqueCount is preferred.", "0.8.3") - def approxUniques(f : (Fields, Fields), errPercent : Double = 1.0) = { - // Legacy (pre-bijection) approximate unique count that uses in.toString.getBytes to - // obtain a long hash code. We specify the kludgy CTuple => Array[Byte] bijection - // explicitly. - implicit def kludgeHasher(in: CTuple) = in.toString.getBytes("UTF-8") - hyperLogLogMap[CTuple,Double](f, errPercent) { _.estimatedSize } - } - - private[this] def hyperLogLogMap[T <% Array[Byte] : TupleConverter, U : TupleSetter] - (f : (Fields, Fields), errPercent : Double = 1.0)(fn : HLL => U) = { - //bits = log(m) == 2 *log(104/errPercent) = 2log(104) - 2*log(errPercent) - def log2(x : Double) = scala.math.log(x)/scala.math.log(2.0) + def approximateUniqueCount[T <% Array[Byte]: TupleConverter]( + f: (Fields, Fields), + errPercent: Double = 1.0 + ) = + hyperLogLogMap[T, Double](f, errPercent)(_.estimatedSize) + + def hyperLogLog[T <% Array[Byte]: TupleConverter](f: (Fields, Fields), errPercent: Double = 1.0) = + hyperLogLogMap[T, HLL](f, errPercent)(hll => hll) + + private[this] def hyperLogLogMap[T <% Array[Byte]: TupleConverter, U: TupleSetter]( + f: (Fields, Fields), + errPercent: Double = 1.0 + )(fn: HLL => U) = { + // bits = log(m) == 2 *log(104/errPercent) = 2log(104) - 2*log(errPercent) + def log2(x: Double) = scala.math.log(x) / scala.math.log(2.0) val bits = 2 * 
scala.math.ceil(log2(104) - log2(errPercent)).toInt - implicit val hmm = new HyperLogLogMonoid(bits) - mapPlusMap(f) { (t : T) => hmm(t) } (fn) + implicit val hmm: HyperLogLogMonoid = new HyperLogLogMonoid(bits) + mapPlusMap(f)((t: T) => hmm.create(t))(fn) } /** - * This is count with a predicate: only counts the tuples for which - * `fn(tuple)` is true + * This is count with a predicate: only counts the tuples for which `fn(tuple)` is true */ - def count[T:TupleConverter](fieldDef : (Fields, Fields))(fn : T => Boolean) : Self = { - mapPlusMap(fieldDef){(arg : T) => if(fn(arg)) 1L else 0L} { s => s } - } + def count[T: TupleConverter](fieldDef: (Fields, Fields))(fn: T => Boolean): Self = + mapPlusMap(fieldDef)((arg: T) => if (fn(arg)) 1L else 0L)(s => s) /** - * Opposite of RichPipe.unpivot. See SQL/Excel for more on this function - * converts a row-wise representation into a column-wise one. + * Opposite of RichPipe.unpivot. See SQL/Excel for more on this function converts a row-wise representation + * into a column-wise one. * - * == Example == + * ==Example== * {{{ * pivot(('feature, 'value) -> ('clicks, 'impressions, 'requests)) * }}} * - * it will find the feature named "clicks", and put the value in the column with the field named - * clicks. + * it will find the feature named "clicks", and put the value in the column with the field named clicks. * * Absent fields result in null unless a default value is provided. Unnamed output fields are ignored. * - * == Note == + * ==Note== * Duplicated fields will result in an error. * - * == Hint == + * ==Hint== * if you want more precision, first do a * * {{{ * map('value -> value) { x : AnyRef => Option(x) } * }}} * - * and you will have non-nulls for all present values, and Nones for values that were present - * but previously null. All nulls in the final output will be those truly missing. 
- * Similarly, if you want to check if there are any items present that shouldn't be: + * and you will have non-nulls for all present values, and Nones for values that were present but previously + * null. All nulls in the final output will be those truly missing. Similarly, if you want to check if there + * are any items present that shouldn't be: * * {{{ * map('feature -> 'feature) { fname : String => @@ -171,128 +153,123 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * } * }}} */ - def pivot(fieldDef : (Fields, Fields), defaultVal : Any = null) : Self = { + def pivot(fieldDef: (Fields, Fields), defaultVal: Any = null): Self = // Make sure the fields are strings: - mapList[(String,AnyRef),CTuple](fieldDef) { outputList => + mapList[(String, AnyRef), CTuple](fieldDef) { outputList => val asMap = outputList.toMap assert(asMap.size == outputList.size, "Repeated pivot key fields: " + outputList.toString) - val values = fieldDef._2 - .iterator.asScala + val values = fieldDef._2.iterator.asScala // Look up this key: - .map { fname => asMap.getOrElse(fname.asInstanceOf[String], defaultVal.asInstanceOf[AnyRef]) } + .map(fname => asMap.getOrElse(fname.asInstanceOf[String], defaultVal.asInstanceOf[AnyRef])) // Create the cascading tuple - new CTuple(values.toSeq : _*) + new CTuple(values.toSeq: _*) } - } /** - * Compute the count, ave and standard deviation in one pass - * example: g.sizeAveStdev('x -> ('cntx, 'avex, 'stdevx)) + * Compute the count, ave and standard deviation in one pass example: g.sizeAveStdev('x -> ('cntx, 'avex, + * 'stdevx)) */ - def sizeAveStdev(fieldDef : (Fields,Fields)) = { - mapPlusMap(fieldDef) { (x : Double) => Moments(x) } - { (mom : Moments) => (mom.count, mom.mean, mom.stddev) } - } + def sizeAveStdev(fieldDef: (Fields, Fields)) = + mapPlusMap(fieldDef)((x: Double) => Moments(x))((mom: Moments) => (mom.count, mom.mean, mom.stddev)) /* * check if a predicate is satisfied for all in the values for this key 
*/ - def forall[T:TupleConverter](fieldDef : (Fields,Fields))(fn : (T) => Boolean) : Self = { - mapReduceMap(fieldDef)(fn)({(x : Boolean, y : Boolean) => x && y})({ x => x }) - } + def forall[T: TupleConverter](fieldDef: (Fields, Fields))(fn: (T) => Boolean): Self = + mapReduceMap(fieldDef)(fn)((x: Boolean, y: Boolean) => x && y)(x => x) /** * Return the first, useful probably only for sorted case. */ - def head(fd : (Fields,Fields)) : Self = { - //CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: - mapReduceMap(fd) { ctuple : CTuple => Tuple1(ctuple) } - { (oldVal, newVal) => oldVal } - { result => result._1 } - } - def head(f : Symbol*) : Self = head(f -> f) + def head(fd: (Fields, Fields)): Self = + // CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: + mapReduceMap(fd) { ctuple: CTuple => Tuple1(ctuple) }((oldVal, newVal) => oldVal) { result => + result._1 + } + def head(f: Symbol*): Self = head(f -> f) - def last(fd : (Fields,Fields)) = { - //CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: - mapReduceMap(fd) { ctuple : CTuple => Tuple1(ctuple) } - { (oldVal, newVal) => newVal } - { result => result._1 } - } - def last(f : Symbol*) : Self = last(f -> f) + def last(fd: (Fields, Fields)) = + // CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: + mapReduceMap(fd) { ctuple: CTuple => Tuple1(ctuple) }((oldVal, newVal) => newVal) { result => + result._1 + } + def last(f: Symbol*): Self = last(f -> f) /** - * Collect all the values into a List[T] and then operate on that - * list. This fundamentally uses as much memory as it takes to store the list. - * This gives you the list in the reverse order it was encounted (it is built - * as a stack for efficiency reasons). If you care about order, call .reverse in your fn + * Collect all the values into a List[T] and then operate on that list. 
This fundamentally uses as much + * memory as it takes to store the list. This gives you the list in the reverse order it was encounted (it + * is built as a stack for efficiency reasons). If you care about order, call .reverse in your fn * * STRONGLY PREFER TO AVOID THIS. Try reduce or plus and an O(1) memory algorithm. */ - def mapList[T,R](fieldDef : (Fields, Fields))(fn : (List[T]) => R) - (implicit conv : TupleConverter[T], setter : TupleSetter[R]) : Self = { + def mapList[T, R]( + fieldDef: (Fields, Fields) + )(fn: (List[T]) => R)(implicit conv: TupleConverter[T], setter: TupleSetter[R]): Self = { val midset = implicitly[TupleSetter[List[T]]] val midconv = implicitly[TupleConverter[List[T]]] - mapReduceMap[T, List[T], R](fieldDef) { //Map + mapReduceMap[T, List[T], R](fieldDef) { // Map x => List(x) - } { //Reduce, note the bigger list is likely on the left, so concat into it: + } { // Reduce, note the bigger list is likely on the left, so concat into it: (prev, current) => current ++ prev - } { fn(_) }(conv, midset, midconv, setter) + }(fn(_))(conv, midset, midconv, setter) } - def mapPlusMap[T,X,U](fieldDef : (Fields, Fields))(mapfn : T => X)(mapfn2 : X => U) - (implicit startConv : TupleConverter[T], - middleSetter : TupleSetter[X], - middleConv : TupleConverter[X], - endSetter : TupleSetter[U], - sgX : Semigroup[X]) : Self = { - mapReduceMap[T,X,U](fieldDef) (mapfn)((x,y) => sgX.plus(x,y))(mapfn2) (startConv, middleSetter, middleConv, endSetter) - } + def mapPlusMap[T, X, U](fieldDef: (Fields, Fields))(mapfn: T => X)(mapfn2: X => U)(implicit + startConv: TupleConverter[T], + middleSetter: TupleSetter[X], + middleConv: TupleConverter[X], + endSetter: TupleSetter[U], + sgX: Semigroup[X] + ): Self = + mapReduceMap[T, X, U](fieldDef)(mapfn)((x, y) => sgX.plus(x, y))(mapfn2)( + startConv, + middleSetter, + middleConv, + endSetter + ) + + private def extremum(max: Boolean, fieldDef: (Fields, Fields)): Self = { + // CTuple's have unknown arity so we have to put 
them into a Tuple1 in the middle phase: + val select = if (max) { (a: CTuple, b: CTuple) => + (a.compareTo(b) >= 0) + } else { (a: CTuple, b: CTuple) => + (a.compareTo(b) <= 0) + } - private def extremum(max : Boolean, fieldDef : (Fields,Fields)) : Self = { - //CTuple's have unknown arity so we have to put them into a Tuple1 in the middle phase: - val select = if(max) { - { (a : CTuple, b : CTuple) => (a.compareTo(b) >= 0) } - } - else { - { (a : CTuple, b : CTuple) => (a.compareTo(b) <= 0) } - } - - mapReduceMap(fieldDef) { ctuple : CTuple => Tuple1(ctuple) } - { (oldVal, newVal) => if (select(oldVal._1, newVal._1)) oldVal else newVal } - { result => result._1 } + mapReduceMap(fieldDef) { ctuple: CTuple => Tuple1(ctuple) } { (oldVal, newVal) => + if (select(oldVal._1, newVal._1)) oldVal else newVal + }(result => result._1) } - def max(fieldDef : (Fields, Fields)) = extremum(true, fieldDef) - def max(f : Symbol*) = extremum(true, (f -> f)) - def min(fieldDef : (Fields, Fields)) = extremum(false, fieldDef) - def min(f : Symbol*) = extremum(false, (f -> f)) + def max(fieldDef: (Fields, Fields)) = extremum(true, fieldDef) + def max(f: Symbol*) = extremum(true, (f -> f)) + def min(fieldDef: (Fields, Fields)) = extremum(false, fieldDef) + def min(f: Symbol*) = extremum(false, (f -> f)) /** - * Similar to the scala.collection.Iterable.mkString - * takes the source and destination fieldname, which should be a single - * field. The result will be start, each item.toString separated by sep, - * followed by end for convenience there several common variants below + * Similar to the scala.collection.Iterable.mkString takes the source and destination fieldname, which + * should be a single field. 
The result will be start, each item.toString separated by sep, followed by end + * for convenience there several common variants below */ - def mkString(fieldDef : (Fields,Fields), start : String, sep : String, end : String) : Self = { - mapList[String,String](fieldDef) { _.mkString(start, sep, end) } - } - def mkString(fieldDef : (Fields,Fields), sep : String) : Self = mkString(fieldDef,"",sep,"") - def mkString(fieldDef : (Fields,Fields)) : Self = mkString(fieldDef,"","","") + def mkString(fieldDef: (Fields, Fields), start: String, sep: String, end: String): Self = + mapList[String, String](fieldDef)(_.mkString(start, sep, end)) + def mkString(fieldDef: (Fields, Fields), sep: String): Self = mkString(fieldDef, "", sep, "") + def mkString(fieldDef: (Fields, Fields)): Self = mkString(fieldDef, "", "", "") + /** - * these will only be called if a tuple is not passed, meaning just one - * column - */ - def mkString(fieldDef : Symbol, start : String, sep : String, end : String) : Self = { - val f : Fields = fieldDef - mkString((f,f),start,sep,end) + * these will only be called if a tuple is not passed, meaning just one column + */ + def mkString(fieldDef: Symbol, start: String, sep: String, end: String): Self = { + val f: Fields = fieldDef + mkString((f, f), start, sep, end) } - def mkString(fieldDef : Symbol, sep : String) : Self = mkString(fieldDef,"",sep,"") - def mkString(fieldDef : Symbol) : Self = mkString(fieldDef,"","","") + def mkString(fieldDef: Symbol, sep: String): Self = mkString(fieldDef, "", sep, "") + def mkString(fieldDef: Symbol): Self = mkString(fieldDef, "", "", "") - /** + /** * Apply an associative/commutative operation on the left field. 
* - * == Example == + * ==Example== * {{{ * reduce(('mass,'allids)->('totalMass, 'idset)) { (left:(Double,Set[Long]),right:(Double,Set[Long])) => * (left._1 + right._1, left._2 ++ right._2) @@ -301,115 +278,99 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * * Equivalent to a mapReduceMap with trivial (identity) map functions. * - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * Assumed to be a commutative operation. If you don't want that, use .forceToReducers * - * The previous output goes into the reduce function on the left, like foldLeft, - * so if your operation is faster for the accumulator to be on one side, be aware. + * The previous output goes into the reduce function on the left, like foldLeft, so if your operation is + * faster for the accumulator to be on one side, be aware. */ - def reduce[T](fieldDef : (Fields, Fields))(fn : (T,T)=>T) - (implicit setter : TupleSetter[T], conv : TupleConverter[T]) : Self = { - mapReduceMap[T,T,T](fieldDef)({ t => t })(fn)({t => t})(conv,setter,conv,setter) - } - //Same as reduce(f->f) - def reduce[T](fieldDef : Symbol*)(fn : (T,T)=>T)(implicit setter : TupleSetter[T], - conv : TupleConverter[T]) : Self = { - reduce(fieldDef -> fieldDef)(fn)(setter,conv) - } + def reduce[T](fieldDef: (Fields, Fields))( + fn: (T, T) => T + )(implicit setter: TupleSetter[T], conv: TupleConverter[T]): Self = + mapReduceMap[T, T, T](fieldDef)(t => t)(fn)(t => t)(conv, setter, conv, setter) + // Same as reduce(f->f) + def reduce[T](fieldDef: Symbol*)( + fn: (T, T) => T + )(implicit setter: TupleSetter[T], conv: TupleConverter[T]): Self = + reduce(fieldDef -> fieldDef)(fn)(setter, conv) // Abstract algebra reductions (sum, times, dot): - /** - * Use `Semigroup.plus` to compute a sum. 
Not called sum to avoid conflicting with standard sum - * Your `Semigroup[T]` should be associated and commutative, else this doesn't make sense + /** + * Use `Semigroup.plus` to compute a sum. Not called sum to avoid conflicting with standard sum Your + * `Semigroup[T]` should be associated and commutative, else this doesn't make sense * - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * Assumed to be a commutative operation. If you don't want that, use .forceToReducers */ - def sum[T](fd : (Fields,Fields)) - (implicit sg: Semigroup[T], tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = { + def sum[T]( + fd: (Fields, Fields) + )(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = // We reverse the order because the left is the old value in reduce, and for list concat // we are much better off concatenating into the bigger list - reduce[T](fd)({ (left, right) => sg.plus(right, left) })(tset, tconv) - } - /** - * The same as `sum(fs -> fs)` - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers - */ - def sum[T](fs : Symbol*) - (implicit sg: Semigroup[T], tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = - sum[T](fs -> fs)(sg,tconv,tset) - - @deprecated("Use sum", "0.9.0") - def plus[T](fd : (Fields,Fields)) - (implicit sg: Semigroup[T], tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = - sum[T](fd)(sg, tconv, tset) + reduce[T](fd)((left, right) => sg.plus(right, left))(tset, tconv) + /** - * The same as `plus(fs -> fs)` - * Assumed to be a commutative operation. If you don't want that, use .forceToReducers + * The same as `sum(fs -> fs)` Assumed to be a commutative operation. 
If you don't want that, use + * .forceToReducers */ - @deprecated("Use sum", "0.9.0") - def plus[T](fs : Symbol*) - (implicit sg: Semigroup[T], tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = - sum[T](fs -> fs)(sg,tconv,tset) + def sum[T](fs: Symbol*)(implicit sg: Semigroup[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = + sum[T](fs -> fs)(sg, tconv, tset) /** * Returns the product of all the items in this grouping */ - def times[T](fd : (Fields,Fields)) - (implicit ring : Ring[T], tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = { + def times[T]( + fd: (Fields, Fields) + )(implicit ring: Ring[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = // We reverse the order because the left is the old value in reduce, and for list concat // we are much better off concatenating into the bigger list - reduce[T](fd)({ (left, right) => ring.times(right, left) })(tset, tconv) - } + reduce[T](fd)((left, right) => ring.times(right, left))(tset, tconv) /** * The same as `times(fs -> fs)` */ - def times[T](fs : Symbol*) - (implicit ring : Ring[T], tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = { - times[T](fs -> fs)(ring,tconv,tset) - } + def times[T](fs: Symbol*)(implicit ring: Ring[T], tconv: TupleConverter[T], tset: TupleSetter[T]): Self = + times[T](fs -> fs)(ring, tconv, tset) /** * Convert a subset of fields into a list of Tuples. Need to provide the types of the tuple fields. */ - def toList[T](fieldDef : (Fields, Fields))(implicit conv : TupleConverter[T]) : Self = { + def toList[T](fieldDef: (Fields, Fields))(implicit conv: TupleConverter[T]): Self = // TODO(POB) this is jank in my opinion. Nulls should be filter by the user if they want - mapList[T,List[T]](fieldDef) { _.filter { t => t != null } } - } - + mapList[T, List[T]](fieldDef)(_.filter(t => t != null)) /** * First do "times" on each pair, then "plus" them all together. 
* - * == Example == + * ==Example== * {{{ * groupBy('x) { _.dot('y,'z, 'ydotz) } * }}} */ - def dot[T](left : Fields, right : Fields, result : Fields) - (implicit ttconv : TupleConverter[Tuple2[T,T]], ring : Ring[T], - tconv : TupleConverter[T], tset : TupleSetter[T]) : Self = { - mapReduceMap[(T,T),T,T](Fields.merge(left, right) -> result) { init : (T,T) => + def dot[T](left: Fields, right: Fields, result: Fields)(implicit + ttconv: TupleConverter[Tuple2[T, T]], + ring: Ring[T], + tconv: TupleConverter[T], + tset: TupleSetter[T] + ): Self = + mapReduceMap[(T, T), T, T](Fields.merge(left, right) -> result) { init: (T, T) => ring.times(init._1, init._2) - } { (left : T, right: T) => + } { (left: T, right: T) => ring.plus(left, right) - } { result => result } - } + }(result => result) /** * How many values are there for this key */ - def size : Self = size('size) - def size(thisF : Fields) : Self = { - mapPlusMap(() -> thisF) { (u : Unit) => 1L } { s => s } - } + def size: Self = size('size) + def size(thisF: Fields): Self = + mapPlusMap(() -> thisF)((u: Unit) => 1L)(s => s) /** - * Equivalent to sorting by a comparison function - * then take-ing k items. This is MUCH more efficient than doing a total sort followed by a take, - * since these bounded sorts are done on the mapper, so only a sort of size k is needed. + * Equivalent to sorting by a comparison function then take-ing k items. This is MUCH more efficient than + * doing a total sort followed by a take, since these bounded sorts are done on the mapper, so only a sort + * of size k is needed. 
* - * == Example == + * ==Example== * {{{ * sortWithTake( ('clicks, 'tweet) -> 'topClicks, 5) { * fn : (t0 :(Long,Long), t1:(Long,Long) => t0._1 < t1._1 } @@ -417,35 +378,34 @@ trait ReduceOperations[+Self <: ReduceOperations[Self]] extends java.io.Serializ * * topClicks will be a List[(Long,Long)] */ - def sortWithTake[T:TupleConverter](f : (Fields, Fields), k : Int)(lt : (T,T) => Boolean) : Self = { - val ord = Ordering.fromLessThan(lt); + def sortWithTake[T: TupleConverter](f: (Fields, Fields), k: Int)(lt: (T, T) => Boolean): Self = { + val ord = Ordering.fromLessThan(lt) sortedTake(f, k)(implicitly[TupleConverter[T]], ord) } /** * Reverse of above when the implicit ordering makes sense. */ - def sortedReverseTake[T](f : (Fields, Fields), k : Int) - (implicit conv : TupleConverter[T], ord : Ordering[T]) : Self = { + def sortedReverseTake[T](f: (Fields, Fields), k: Int)(implicit + conv: TupleConverter[T], + ord: Ordering[T] + ): Self = sortedTake[T](f, k)(conv, ord.reverse) - } /** * Same as above but useful when the implicit ordering makes sense. 
*/ - def sortedTake[T](f : (Fields, Fields), k : Int) - (implicit conv : TupleConverter[T], ord : Ordering[T]) : Self = { + def sortedTake[T](f: (Fields, Fields), k: Int)(implicit conv: TupleConverter[T], ord: Ordering[T]): Self = { assert(f._2.size == 1, "output field size must be 1") - implicit val mon = new PriorityQueueMonoid[T](k) - mapPlusMap(f) { (tup : T) => mon.build(tup) } { - (lout : PriorityQueue[T]) => lout.iterator.asScala.toList.sorted + implicit val mon: ScaldingPriorityQueueMonoid[T] = new ScaldingPriorityQueueMonoid[T](k) + mapPlusMap(f)((tup: T) => mon.build(tup)) { (lout: PriorityQueue[T]) => + lout.iterator.asScala.toList.sorted } } - def histogram(f : (Fields, Fields), binWidth : Double = 1.0) = { - mapPlusMap(f) - {x : Double => Map((math.floor(x / binWidth) * binWidth) -> 1L)} - {map => new mathematics.Histogram(map, binWidth)} - } + def histogram(f: (Fields, Fields), binWidth: Double = 1.0) = + mapPlusMap(f) { x: Double => Map((math.floor(x / binWidth) * binWidth) -> 1L) } { map => + new mathematics.Histogram(map, binWidth) + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala b/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala new file mode 100644 index 0000000000..9bb48c70b0 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/ReferencedClassFinder.scala @@ -0,0 +1,125 @@ +package com.twitter.scalding + +import com.twitter.scalding.typed.CoGroupable +import org.slf4j.LoggerFactory +import scala.reflect.runtime.universe +import scala.reflect.runtime.universe.{NullaryMethodType, RuntimeMirror, Symbol, Type, TypeRef} +import java.lang.{reflect => jReflect} + +object ReferencedClassFinder { + + private val LOG = LoggerFactory.getLogger(this.getClass) + + private val baseContainers = List( + classOf[Execution[_]], + classOf[TypedPipe[_]], + classOf[TypedSink[_]], + classOf[TypedSource[_]], + classOf[CoGroupable[_, _]], + classOf[KeyedList[_, _]] + ) + + /** 
+ * Add the given type, as well as all referenced types to the cascading tokens list. note, for maximal + * efficiency, you should also register those types with the kryo instantiator being used. + */ + def addCascadingTokensFrom(c: Class[_], config: Config): Config = + CascadingTokenUpdater.update(config, findReferencedClasses(c) + c) + + /** + * Reflect over a scalding job to try and identify types it uses so they can be tokenized by cascading. + * Since scala reflection is broken with the Hadoop InterfaceAudiance annotation (see + * https://issues.scala-lang.org/browse/SI-10129), we can't iterate over scalaType.members, so we instead + * use java reflection to iterate over fields to find the ones we care about, and then look those up in + * scala reflection to find the full un-erased type signatures, and try to find types from those. + * + * Note: this not guaranteed to find every used type. Eg, it can't find types used in a step that isn't + * referred to in a field + */ + def findReferencedClasses(outerClass: Class[_]): Set[Class[_]] = { + val scalaPackage = Package.getPackage("scala") + val mirror = universe.runtimeMirror(outerClass.getClassLoader) + getClassType(outerClass, mirror) match { + case Some(scalaType) => + (for { + field <- outerClass.getDeclaredFields + if baseContainers.exists(_.isAssignableFrom(field.getType)) + scalaSignature <- getFieldType(outerClass, scalaType, field).toSeq + clazz <- getClassesForType(mirror, scalaSignature) + /* The scala root package contains a lot of shady stuff, eg compile-time wrappers (scala.Int/Array etc), + * which reflection will present as type parameters. Skip the whole package - chill-hadoop already ensures most + * of the ones we care about (eg tuples) get tokenized in cascading. 
+ */ + if !(clazz.isPrimitive || clazz.isArray || clazz.getPackage.equals(scalaPackage)) + } yield { + clazz + }).toSet + case _ => Set() + } + } + + private def getFieldType( + outerClass: Class[_], + scalaType: universe.Type, + field: jReflect.Field + ): Option[universe.Type] = + safeScalaReflectionCall(outerClass) { + scalaType.member(universe.stringToTermName(field.getName)).typeSignature + } + + private def getClassType(outerClass: Class[_], mirror: universe.Mirror): Option[universe.Type] = + safeScalaReflectionCall(outerClass) { + mirror.classSymbol(outerClass).toType + } + + private def safeScalaReflectionCall[T](outerClass: Class[_])(call: => T): Option[T] = + try { + Some(call) + } catch { + // In some cases we fail to find references classes, it shouldn't be fatal. + case r: RuntimeException if r.getMessage.contains("error reading Scala signature") => + LOG.warn( + s"Unable to find referenced classes for: $outerClass. This is potentially due to missing dependencies", + r + ) + None + case t: Throwable if t.getMessage.contains("illegal cyclic reference") => + // Related to: https://issues.scala-lang.org/browse/SI-10129 + LOG.warn( + s"Unable to find referenced classes for: $outerClass. Related to Scala language issue: SI-10129", + t + ) + None + case ae: AssertionError if ae.getMessage.contains("no symbol could be loaded from interface") => + // Related to: https://issues.scala-lang.org/browse/SI-10129 + LOG.warn( + s"Unable to find referenced classes for: $outerClass. 
Related to Scala language issue: SI-10129", + ae + ) + None + case t: Throwable => throw t + } + + private def getClassesForType(mirror: RuntimeMirror, typeSignature: Type): Seq[Class[_]] = + typeSignature match { + case TypeRef(_, _, args) => + args.flatMap { generic => + // If the wrapped type is a Tuple, recurse into its types + if (generic.typeSymbol.fullName.startsWith("scala.Tuple")) { + getClassesForType(mirror, generic) + } else { + getClassOpt(mirror, generic.typeSymbol) + } + } + // .member returns the accessor method for the variable unless the field is private[this], so inspect the return type + case NullaryMethodType(resultType) => getClassesForType(mirror, resultType) + case _ => Nil + } + + private def getClassOpt(mirror: RuntimeMirror, typeSymbol: Symbol): Option[Class[_]] = + try { + Some(mirror.runtimeClass(typeSymbol.asClass)) + } catch { + case _: ClassNotFoundException | ScalaReflectionException(_) => None + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala b/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala new file mode 100644 index 0000000000..49fbea4b0b --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/RichFlowDef.scala @@ -0,0 +1,181 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import cascading.flow.FlowDef +import cascading.pipe.Pipe +import com.twitter.algebird.Monoid +import java.util.{List => JList, Map => JMap} + +/** + * This is an enrichment-pattern class for cascading.flow.FlowDef. The rule is to never use this class + * directly in input or return types, but only to add methods to FlowDef. + */ +class RichFlowDef(val fd: FlowDef) { + // allow .asScala conversions + import collection.JavaConverters._ + + // RichPipe and RichFlowDef implicits + import Dsl._ + + def copy: FlowDef = { + val newFd = new FlowDef + newFd.mergeFrom(fd) + newFd + } + + /** + * Merge state from FlowDef excluding Sources/Sinks/Tails (sometimes we don't want both) + */ + private[scalding] def mergeMiscFrom(o: FlowDef): Unit = { + // See the cascading code that this string is a "," separated set. + StringUtility.fastSplit(o.getTags, ",").foreach(fd.addTag) + + mergeLeft(fd.getTraps, o.getTraps) + mergeLeft(fd.getCheckpoints, o.getCheckpoints) + + appendLeft(fd.getClassPath, o.getClassPath) + + fd.setAssertionLevel(preferLeft(fd.getAssertionLevel, o.getAssertionLevel)) + fd.setName(preferLeft(fd.getName, o.getName)) + } + + private[this] def preferLeft[T](left: T, right: T): T = + Option(left).getOrElse(right) + + private[this] def mergeLeft[K, V](left: JMap[K, V], right: JMap[K, V]): Unit = + right.asScala.foreach { case (k, v) => + if (!left.containsKey(k)) left.put(k, v) + } + private[this] def appendLeft[T](left: JList[T], right: JList[T]): Unit = { + val existing = left.asScala.toSet + right.asScala + .filterNot(existing) + .foreach(left.add) + } + + def isEmpty: Boolean = + fd.getTraps.isEmpty && + fd.getCheckpoints.isEmpty && + fd.getSources.isEmpty && + fd.getSinks.isEmpty && + fd.getTails.isEmpty + + /** + * Mutate current flow def to add all sources/sinks/etc from given FlowDef + */ + def mergeFrom(o: FlowDef): Unit = { + mergeLeft(fd.getSources, o.getSources) + mergeLeft(fd.getSinks, o.getSinks) + 
appendLeft(fd.getTails, o.getTails) + + fd.mergeMiscFrom(o) + // Merge the FlowState + val oFS = FlowStateMap(o) + FlowStateMap.merge(fd, oFS) + } + + /** + * find all heads reachable from the tails (as a set of names) + */ + def heads: Set[Pipe] = fd.getTails.asScala.flatMap(_.getHeads).toSet + + /** + * New flow def with only sources upstream from tails. + */ + def withoutUnusedSources: FlowDef = { + + // add taps associated with heads to localFlow + val filteredSources = fd.getSources.asScala.filterKeys(heads.map(p => p.getName)).asJava + + val newFd = fd.copy + newFd.getSources.clear() + newFd.addSources(filteredSources) + + newFd + } + + /** + * FlowDef that only includes things upstream from the given Pipe + */ + def onlyUpstreamFrom(pipe: Pipe): FlowDef = { + val newFd = new FlowDef + // don't copy any sources/sinks + newFd.mergeMiscFrom(fd) + + val sourceTaps = fd.getSources + val newSrcs = newFd.getSources + + val upipes = pipe.upstreamPipes + val headNames: Set[String] = upipes + .filter(_.getPrevious.length == 0) // implies _ is a head + .map(_.getName) + + headNames + .foreach { head => + // TODO: make sure we handle checkpoints correctly + if (!newSrcs.containsKey(head)) { + newFd.addSource(head, sourceTaps.get(head)) + } + } + + val sinks = fd.getSinks + if (sinks.containsKey(pipe.getName)) { + newFd.addTailSink(pipe, sinks.get(pipe.getName)) + } + + // Update the FlowState: + FlowStateMap + .get(fd) + .foreach { thisFS => + /** + * these are all the sources that are upstream of the pipe in question + */ + val subFlowState = + Monoid.sum( + thisFS.sourceMap + .collect { + case (name, source) if headNames(name) => + FlowState.withSource(name, source) + } + ) + /* + * We assume all the old config updates need to be + * done, but this may an over approximation and not + * be 100% correct. We have been doing it for a while + * however + * + * Note, this method is only used to convert a pipe + * to a TypedPipe. 
So, we assume there should be + * no pending typed writes upstream of this pipe + * that are relevant to this pipe when brought + * into the TypedAPI + */ + val withConfig = thisFS.copy(sourceMap = Map.empty, pendingTypedWrites = Nil) + + /* + * Note that newFd was just allocated, so it has no + * FlowState at all, we verify that here to be defensive + * since this is not performance critical code + */ + require(FlowStateMap(newFd) == FlowState.empty, s"FlowState is not empty: ${FlowStateMap(newFd)}") + + FlowStateMap.merge(newFd, Monoid.plus(subFlowState, withConfig)) + } + + newFd + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala index ca5759d556..50bb8b2b6f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/RichPipe.scala @@ -12,25 +12,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import cascading.tap._ -import cascading.scheme._ +import cascading.property.ConfigDef.Getter import cascading.pipe._ -import cascading.pipe.assembly._ -import cascading.pipe.joiner._ import cascading.flow._ import cascading.operation._ -import cascading.operation.aggregator._ import cascading.operation.filter._ import cascading.tuple._ -import cascading.cascade._ -import cascading.operation.Debug.Output import scala.util.Random import java.util.concurrent.atomic.AtomicInteger +import scala.collection.immutable.Queue object RichPipe extends java.io.Serializable { private val nextPipe = new AtomicInteger(-1) @@ -41,29 +36,129 @@ object RichPipe extends java.io.Serializable { def getNextName: String = "_pipe_" + nextPipe.incrementAndGet.toString - def assignName(p : Pipe) = new Pipe(getNextName, p) + private[scalding] val FormerNameBitLength = 12 + private[scalding] val FormerAssignedPipeNamePattern = "^_pipe_([0-9]+).*$".r + private[scalding] val FromUuidPattern = + "^.*[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-([0-9a-f]{12}).*$".r + + // grab some bit of the previous pipe name to help walk up the graph across name assignments + private def getFormerNameBit(p: Pipe): String = p.getName match { + case FormerAssignedPipeNamePattern(pipeNumber) => pipeNumber + case FromUuidPattern(lastGroup) => lastGroup /* 12 characters */ + case s if s.length > FormerNameBitLength => s.substring(s.length - FormerNameBitLength, s.length) + case s => s + } + + /** + * Assign a new, guaranteed-unique name to the pipe. + * @param p + * a pipe, whose name should be changed + * @return + * a pipe with a new name which is guaranteed to be new and never re-assigned by this function + * + * Note: the assigned name includes a few characters from the former name to assist in debugging. 
+ */ + def assignName(p: Pipe): Pipe = new Pipe(getNextName + "-" + getFormerNameBit(p), p) private val REDUCER_KEY = "mapred.reduce.tasks" + /** - * Gets the underlying config for this pipe and sets the number of reducers - * useful for cascading GroupBy/CoGroup pipes. + * Gets the underlying config for this pipe and sets the number of reducers useful for cascading + * GroupBy/CoGroup pipes. */ - def setReducers(p : Pipe, reducers : Int) : Pipe = { - if(reducers > 0) { + def setReducers(p: Pipe, reducers: Int): Pipe = { + if (reducers > 0) { p.getStepConfigDef() .setProperty(REDUCER_KEY, reducers.toString) - } else if(reducers != -1) { - throw new IllegalArgumentException("Number of reducers must be non-negative") + p.getStepConfigDef() + .setProperty(Config.WithReducersSetExplicitly, "true") + } else if (reducers != -1) { + throw new IllegalArgumentException(s"Number of reducers must be non-negative. Got: $reducers") } p } + + // A pipe can have more than one description when merged together, so we store them delimited with 255.toChar. + // Cannot use 1.toChar as we get an error if it is not a printable character. + private def encodePipeDescriptions(descriptions: Seq[String]): String = + descriptions.map(_.replace(255.toChar, ' ')).filter(_.nonEmpty).mkString(255.toChar.toString) + + private def decodePipeDescriptions(encoding: String): Seq[String] = + encoding.split(255.toChar).toSeq + + def getPipeDescriptions(p: Pipe): Seq[String] = + if (p.getStepConfigDef.isEmpty) + Nil + else { + // We use empty getter so we can get latest config value of Config.PipeDescriptions in the step ConfigDef. + val encodedResult = p.getStepConfigDef.apply( + Config.PipeDescriptions, + new Getter { + override def update(s: String, s1: String): String = ??? 
+ override def get(s: String): String = null + } + ) + Option(encodedResult) + .filterNot(_.isEmpty) + .map(decodePipeDescriptions) + .getOrElse(Nil) + } + + def setPipeDescriptions(p: Pipe, descriptions: Seq[String]): Pipe = { + p.getStepConfigDef() + .setProperty(Config.PipeDescriptions, encodePipeDescriptions(getPipeDescriptions(p) ++ descriptions)) + p + } + + def setPipeDescriptionFrom(p: Pipe, ste: Option[StackTraceElement]): Pipe = { + ste.foreach { ste => + setPipeDescriptions(p, List(ste.toString)) + } + p + } + + /** + * If there is exactly one previous Pipe, get it, otherwise None + */ + def getSinglePreviousPipe(p: Pipe): Option[Pipe] = + if (p.getPrevious != null && p.getPrevious.length == 1) p.getPrevious.headOption + else None + + /** + * Is the given Pipe a source (it has no previous and is not a splice + */ + def isSourcePipe(pipe: Pipe): Boolean = + pipe.getParent == null && + (pipe.getPrevious == null || pipe.getPrevious.isEmpty) && + (!pipe.isInstanceOf[Splice]) + + /** + * This is true if a pipe passes through all input fields without explicitly remapping + */ + @annotation.tailrec + final def isPassthrough(pipe: Pipe): Boolean = { + def element(p: Pipe): Boolean = + p match { + case e: Each if e.isFilter => true + case cp: Checkpoint => true + case _ => false + } + + isSourcePipe(pipe) || { + element(pipe) && + (getSinglePreviousPipe(pipe) match { + case Some(prev) => isPassthrough(prev) + case None => false + }) + } + } } -/** This is an enrichment-pattern class for cascading.pipe.Pipe. - * The rule is to never use this class directly in input or return types, but - * only to add methods to Pipe. +/** + * This is an enrichment-pattern class for cascading.pipe.Pipe. The rule is to never use this class directly + * in input or return types, but only to add methods to Pipe. 
*/ -class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms { +class RichPipe(val pipe: Pipe) extends java.io.Serializable with JoinAlgorithms { // We need this for the implicits import Dsl._ import RichPipe.assignName @@ -74,22 +169,32 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms def name(s: String): Pipe = new Pipe(s, pipe) /** - * Beginning of block with access to expensive nonserializable state. The state object should - * contain a function release() for resource management purpose. + * Beginning of block with access to expensive nonserializable state. The state object should contain a + * function release() for resource management purpose. */ - def using[C <: { def release() }](bf: => C) = new { + def using[C <: { def release(): Unit }](bf: => C) = new { /** * For pure side effect. */ - def foreach[A](f: Fields)(fn: (C, A) => Unit) - (implicit conv: TupleConverter[A], set: TupleSetter[Unit], flowDef: FlowDef, mode: Mode) = { + def foreach[A](f: Fields)( + fn: (C, A) => Unit + )(implicit conv: TupleConverter[A], set: TupleSetter[Unit], flowDef: FlowDef, mode: Mode) = { conv.assertArityMatches(f) - val newPipe = new Each(pipe, f, new SideEffectMapFunction(bf, fn, - new Function1[C, Unit] with java.io.Serializable { - def apply(c: C) { c.release() } - }, - Fields.NONE, conv, set)) + val newPipe = new Each( + pipe, + f, + new SideEffectMapFunction( + bf, + fn, + new Function1[C, Unit] with java.io.Serializable { + def apply(c: C): Unit = c.release() + }, + Fields.NONE, + conv, + set + ) + ) NullSource.writeFrom(newPipe)(flowDef, mode) newPipe } @@ -97,61 +202,70 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms /** * map with state */ - def map[A,T](fs: (Fields,Fields))(fn: (C, A) => T) - (implicit conv: TupleConverter[A], set: TupleSetter[T]) = { + def map[A, T]( + fs: (Fields, Fields) + )(fn: (C, A) => T)(implicit conv: TupleConverter[A], set: TupleSetter[T]) = { 
conv.assertArityMatches(fs._1) set.assertArityMatches(fs._2) - val mf = new SideEffectMapFunction(bf, fn, + val mf = new SideEffectMapFunction( + bf, + fn, new Function1[C, Unit] with java.io.Serializable { - def apply(c: C) { c.release() } + def apply(c: C): Unit = c.release() }, - fs._2, conv, set) + fs._2, + conv, + set + ) new Each(pipe, fs._1, mf, defaultMode(fs._1, fs._2)) } /** * flatMap with state */ - def flatMap[A,T](fs: (Fields,Fields))(fn: (C, A) => TraversableOnce[T]) - (implicit conv: TupleConverter[A], set: TupleSetter[T]) = { + def flatMap[A, T]( + fs: (Fields, Fields) + )(fn: (C, A) => TraversableOnce[T])(implicit conv: TupleConverter[A], set: TupleSetter[T]) = { conv.assertArityMatches(fs._1) set.assertArityMatches(fs._2) - val mf = new SideEffectFlatMapFunction(bf, fn, + val mf = new SideEffectFlatMapFunction( + bf, + fn, new Function1[C, Unit] with java.io.Serializable { - def apply(c: C) { c.release() } + def apply(c: C): Unit = c.release() }, - fs._2, conv, set) + fs._2, + conv, + set + ) new Each(pipe, fs._1, mf, defaultMode(fs._1, fs._2)) } } /** - * Keep only the given fields, and discard the rest. - * takes any number of parameters as long as we can convert - * them to a fields object + * Keep only the given fields, and discard the rest. takes any number of parameters as long as we can + * convert them to a fields object */ - def project(fields : Fields): Pipe = + def project(fields: Fields): Pipe = new Each(pipe, fields, new Identity(fields)) /** - * Discard the given fields, and keep the rest. - * Kind of the opposite of project method. + * Discard the given fields, and keep the rest. Kind of the opposite of project method. 
*/ - def discard(f : Fields): Pipe = + def discard(f: Fields): Pipe = new Each(pipe, f, new NoOp, Fields.SWAP) /** * Insert a function into the pipeline: */ - def thenDo[T,U](pfn : (T) => U)(implicit in : (RichPipe)=>T): U = pfn(in(this)) + def thenDo[T, U](pfn: (T) => U)(implicit in: (RichPipe) => T): U = pfn(in(this)) /** * group the Pipe based on fields * - * builder is typically a block that modifies the given GroupBuilder - * the final OUTPUT of the block is used to schedule the new pipe - * each method in GroupBuilder returns this, so it is recommended - * to chain them and use the default input: + * builder is typically a block that modifies the given GroupBuilder the final OUTPUT of the block is used + * to schedule the new pipe each method in GroupBuilder returns this, so it is recommended to chain them and + * use the default input: * * {{{ * _.size.max('f1) etc... @@ -164,205 +278,185 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms * Returns the set of distinct tuples containing the specified fields */ def distinct(f: Fields): Pipe = - groupBy(f) { _.size('__uniquecount__) }.project(f) + groupBy(f)(_.size('__uniquecount__)).project(f) /** * Returns the set of unique tuples containing the specified fields. Same as distinct */ - def unique(f : Fields) : Pipe = distinct(f) + def unique(f: Fields): Pipe = distinct(f) /** * Merge or Concatenate several pipes together with this one: */ - def ++(that : Pipe): Pipe = { - if(this.pipe == that) { + def ++(that: Pipe): Pipe = + if (this.pipe == that) { // Cascading fails on self merge: // solution by Jack Guo new Merge(assignName(this.pipe), assignName(new Each(that, new Identity))) - } - else { + } else { new Merge(assignName(this.pipe), assignName(that)) } - } /** - * Group all tuples down to one reducer. - * (due to cascading limitation). - * This is probably only useful just before setting a tail such as Database - * tail, so that only one reducer talks to the DB. 
Kind of a hack. + * Group all tuples down to one reducer. (due to cascading limitation). This is probably only useful just + * before setting a tail such as Database tail, so that only one reducer talks to the DB. Kind of a hack. */ - def groupAll: Pipe = groupAll { _.pass } + def groupAll: Pipe = groupAll(_.pass) /** - * == Warning == + * ==Warning== * This kills parallelism. All the work is sent to one reducer. * - * Only use this in the case that you truly need all the data on one - * reducer. + * Only use this in the case that you truly need all the data on one reducer. * - * Just about the only reasonable case of this method is to reduce all values of a column - * or count all the rows. + * Just about the only reasonable case of this method is to reduce all values of a column or count all the + * rows. */ - def groupAll(gs : GroupBuilder => GroupBuilder) = - map(()->'__groupAll__) { (u:Unit) => 1 } - .groupBy('__groupAll__) { gs(_).reducers(1) } - .discard('__groupAll__) + def groupAll(gs: GroupBuilder => GroupBuilder) = + map(() -> '__groupAll__)((u: Unit) => 1) + .groupBy('__groupAll__)(gs(_).reducers(1)) + .discard('__groupAll__) /** * Force a random shuffle of all the data to exactly n reducers */ - def shard(n: Int): Pipe = groupRandomly(n) { _.pass } + def shard(n: Int): Pipe = groupRandomly(n)(_.pass) + /** - * Force a random shuffle of all the data to exactly n reducers, - * with a given seed if you need repeatability. + * Force a random shuffle of all the data to exactly n reducers, with a given seed if you need + * repeatability. */ - def shard(n : Int, seed : Int) : Pipe = groupRandomly(n, seed) { _.pass } + def shard(n: Int, seed: Int): Pipe = groupRandomly(n, seed)(_.pass) /** * Like groupAll, but randomly groups data into n reducers. 
* - * you can provide a seed for the random number generator - * to get reproducible results + * you can provide a seed for the random number generator to get reproducible results */ - def groupRandomly(n : Int)(gs : GroupBuilder => GroupBuilder) : Pipe = + def groupRandomly(n: Int)(gs: GroupBuilder => GroupBuilder): Pipe = groupRandomlyAux(n, None)(gs) /** * like groupRandomly(n : Int) with a given seed in the randomization */ - def groupRandomly(n : Int, seed : Long)(gs : GroupBuilder => GroupBuilder) : Pipe = + def groupRandomly(n: Int, seed: Long)(gs: GroupBuilder => GroupBuilder): Pipe = groupRandomlyAux(n, Some(seed))(gs) // achieves the behavior that reducer i gets i_th shard // by relying on cascading to use java's hashCode, which hash ints // to themselves - protected def groupRandomlyAux(n : Int, optSeed : Option[Long])(gs : GroupBuilder => GroupBuilder) : Pipe = { + protected def groupRandomlyAux(n: Int, optSeed: Option[Long])(gs: GroupBuilder => GroupBuilder): Pipe = using(statefulRandom(optSeed)) - .map(()->'__shard__) { (r:Random, _:Unit) => r.nextInt(n) } - .groupBy('__shard__) { gs(_).reducers(n) } + .map(() -> '__shard__)((r: Random, _: Unit) => r.nextInt(n)) + .groupBy('__shard__)(gs(_).reducers(n)) .discard('__shard__) - } - private def statefulRandom(optSeed : Option[Long]) : Random with Stateful = { + private def statefulRandom(optSeed: Option[Long]): Random with Stateful = { val random = new Random with Stateful - if (optSeed.isDefined) { random.setSeed(optSeed.get) } + optSeed.foreach(x => random.setSeed(x)) random } /** * Put all rows in random order * - * you can provide a seed for the random number generator - * to get reproducible results + * you can provide a seed for the random number generator to get reproducible results */ - def shuffle(shards : Int) : Pipe = groupAndShuffleRandomly(shards) { _.pass } - def shuffle(shards : Int, seed : Long) : Pipe = groupAndShuffleRandomly(shards, seed) { _.pass } + def shuffle(shards: Int): Pipe = 
groupAndShuffleRandomly(shards)(_.pass) + def shuffle(shards: Int, seed: Long): Pipe = groupAndShuffleRandomly(shards, seed)(_.pass) /** * Like shard, except do some operation im the reducers */ - def groupAndShuffleRandomly(reducers : Int)(gs : GroupBuilder => GroupBuilder) : Pipe = + def groupAndShuffleRandomly(reducers: Int)(gs: GroupBuilder => GroupBuilder): Pipe = groupAndShuffleRandomlyAux(reducers, None)(gs) /** * Like groupAndShuffleRandomly(reducers : Int) but with a fixed seed. */ - def groupAndShuffleRandomly(reducers : Int, seed : Long) - (gs : GroupBuilder => GroupBuilder) : Pipe = + def groupAndShuffleRandomly(reducers: Int, seed: Long)(gs: GroupBuilder => GroupBuilder): Pipe = groupAndShuffleRandomlyAux(reducers, Some(seed))(gs) - private def groupAndShuffleRandomlyAux(reducers : Int, optSeed : Option[Long]) - (gs : GroupBuilder => GroupBuilder) : Pipe = { + private def groupAndShuffleRandomlyAux(reducers: Int, optSeed: Option[Long])( + gs: GroupBuilder => GroupBuilder + ): Pipe = using(statefulRandom(optSeed)) - .map(()->('__shuffle__)) { (r:Random, _:Unit) => r.nextDouble() } - .groupRandomlyAux(reducers, optSeed){ g : GroupBuilder => + .map(() -> '__shuffle__)((r: Random, _: Unit) => r.nextDouble()) + .groupRandomlyAux(reducers, optSeed) { g: GroupBuilder => gs(g.sortBy('__shuffle__)) } .discard('__shuffle__) - } /** * Adds a field with a constant value. 
* - * == Usage == + * ==Usage== * {{{ * insert('a, 1) * }}} */ def insert[A](fs: Fields, value: A)(implicit setter: TupleSetter[A]): Pipe = - map[Unit,A](() -> fs) { _:Unit => value }(implicitly[TupleConverter[Unit]], setter) - + map[Unit, A](() -> fs) { _: Unit => value }(implicitly[TupleConverter[Unit]], setter) /** * Rename some set of N fields as another set of N fields * - * == Usage == + * ==Usage== * {{{ * rename('x -> 'z) - * rename(('x,'y) -> ('X,'Y)) + * rename(('x,'y) -> ('X,'Y)) * }}} * - * == Warning == - * `rename('x,'y)` is interpreted by scala as `rename(Tuple2('x,'y))` - * which then does `rename('x -> 'y)`. This is probably not what is intended - * but the compiler doesn't resolve the ambiguity. YOU MUST CALL THIS WITH - * A TUPLE2! If you don't, expect the unexpected. + * ==Warning== + * `rename('x,'y)` is interpreted by scala as `rename(Tuple2('x,'y))` which then does `rename('x -> 'y)`. + * This is probably not what is intended but the compiler doesn't resolve the ambiguity. YOU MUST CALL THIS + * WITH A TUPLE2! If you don't, expect the unexpected. */ - def rename(fields : (Fields,Fields)) : Pipe = { + def rename(fields: (Fields, Fields)): Pipe = { val (fromFields, toFields) = fields val in_arity = fromFields.size val out_arity = toFields.size assert(in_arity == out_arity, "Number of field names must match for rename") - new Each(pipe, fromFields, new Identity( toFields ), Fields.SWAP) + new Each(pipe, fromFields, new Identity(toFields), Fields.SWAP) } /** * Keep only items that satisfy this predicate. */ - def filter[A](f : Fields)(fn : (A) => Boolean) - (implicit conv : TupleConverter[A]) : Pipe = { + def filter[A](f: Fields)(fn: (A) => Boolean)(implicit conv: TupleConverter[A]): Pipe = { conv.assertArityMatches(f) new Each(pipe, f, new FilterFunction(fn, conv)) } /** - * Keep only items that don't satisfy this predicate. - * `filterNot` is equal to negating a `filter` operation. + * Keep only items that don't satisfy this predicate. 
`filterNot` is equal to negating a `filter` operation. * - * {{{ filterNot('name) { name: String => name contains "a" } }}} + * {{{filterNot('name) { name: String => name contains "a"}}} } * * is the same as: * - * {{{ filter('name) { name: String => !(name contains "a") } }}} + * {{{filter('name) { name: String => !(name contains "a")}}} } */ - def filterNot[A](f : Fields)(fn : (A) => Boolean) - (implicit conv : TupleConverter[A]) : Pipe = + def filterNot[A](f: Fields)(fn: (A) => Boolean)(implicit conv: TupleConverter[A]): Pipe = filter[A](f)(!fn(_)) /** - * Text files can have corrupted data. If you use this function and a - * cascading trap you can filter out corrupted data from your pipe. + * Text files can have corrupted data. If you use this function and a cascading trap you can filter out + * corrupted data from your pipe. */ - def verifyTypes[A](f: Fields)(implicit conv: TupleConverter[A]): Pipe = { - pipe.filter(f) { (a: A) => true } - } + def verifyTypes[A](f: Fields)(implicit conv: TupleConverter[A]): Pipe = + pipe.filter(f)((a: A) => true) /** - * Given a function, partitions the pipe into several groups based on the - * output of the function. Then applies a GroupBuilder function on each of the - * groups. + * Given a function, partitions the pipe into several groups based on the output of the function. Then + * applies a GroupBuilder function on each of the groups. * - * Example: - pipe - .mapTo(()->('age, 'weight) { ... } - .partition('age -> 'isAdult) { _ > 18 } { _.average('weight) } - * pipe now contains the average weights of adults and minors. + * Example: pipe .mapTo(()->('age, 'weight) { ... } .partition('age -> 'isAdult) { _ > 18 } { + * _.average('weight) } pipe now contains the average weights of adults and minors. 
*/ - def partition[A,R](fs: (Fields, Fields))(fn: (A) => R)( - builder: GroupBuilder => GroupBuilder)( - implicit conv: TupleConverter[A], - ord: Ordering[R], - rset: TupleSetter[R]): Pipe = { + def partition[A, R](fs: (Fields, Fields))(fn: (A) => R)( + builder: GroupBuilder => GroupBuilder + )(implicit conv: TupleConverter[A], ord: Ordering[R], rset: TupleSetter[R]): Pipe = { val (fromFields, toFields) = fs conv.assertArityMatches(fromFields) rset.assertArityMatches(toFields) @@ -372,79 +466,80 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms map(fromFields -> tmpFields)(fn)(conv, TupleSetter.singleSetter[R]) .groupBy(tmpFields)(builder) - .map[R,R](tmpFields -> toFields){ (r:R) => r }(TupleConverter.singleConverter[R], rset) + .map[R, R](tmpFields -> toFields)((r: R) => r)(TupleConverter.singleConverter[R], rset) .discard(tmpFields) } /** - * If you use a map function that does not accept TupleEntry args, - * which is the common case, an implicit conversion in GeneratedConversions - * will convert your function into a `(TupleEntry => T)`. The result type - * T is converted to a cascading Tuple by an implicit `TupleSetter[T]`. - * acceptable T types are primitive types, cascading Tuples of those types, - * or `scala.Tuple(1-22)` of those types. + * If you use a map function that does not accept TupleEntry args, which is the common case, an implicit + * conversion in GeneratedConversions will convert your function into a `(TupleEntry => T)`. The result type + * T is converted to a cascading Tuple by an implicit `TupleSetter[T]`. acceptable T types are primitive + * types, cascading Tuples of those types, or `scala.Tuple(1-22)` of those types. * - * After the map, the input arguments will be set to the output of the map, - * so following with filter or map is fine without a new using statement if - * you mean to operate on the output. 
+ * After the map, the input arguments will be set to the output of the map, so following with filter or map + * is fine without a new using statement if you mean to operate on the output. * * {{{ * map('data -> 'stuff) * }}} * - * * if output equals input, REPLACE is used. - * * if output or input is a subset of the other SWAP is used. - * * otherwise we append the new fields (cascading Fields.ALL is used) + * * if output equals input, REPLACE is used. * if output or input is a subset of the other SWAP is used. * + * otherwise we append the new fields (cascading Fields.ALL is used) * * {{{ * mapTo('data -> 'stuff) * }}} * - * Only the results (stuff) are kept (cascading Fields.RESULTS) + * Only the results (stuff) are kept (cascading Fields.RESULTS) * - * == Note == - * Using mapTo is the same as using map followed by a project for - * selecting just the ouput fields + * ==Note== + * Using mapTo is the same as using map followed by a project for selecting just the output fields */ - def map[A,T](fs : (Fields,Fields))(fn : A => T) - (implicit conv : TupleConverter[A], setter : TupleSetter[T]) : Pipe = { - conv.assertArityMatches(fs._1) - setter.assertArityMatches(fs._2) - each(fs)(new MapFunction[A,T](fn, _, conv, setter)) + def map[A, T]( + fs: (Fields, Fields) + )(fn: A => T)(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + conv.assertArityMatches(fs._1) + setter.assertArityMatches(fs._2) + each(fs)(new MapFunction[A, T](fn, _, conv, setter)) } - def mapTo[A,T](fs : (Fields,Fields))(fn : A => T) - (implicit conv : TupleConverter[A], setter : TupleSetter[T]) : Pipe = { - conv.assertArityMatches(fs._1) - setter.assertArityMatches(fs._2) - eachTo(fs)(new MapFunction[A,T](fn, _, conv, setter)) + def mapTo[A, T]( + fs: (Fields, Fields) + )(fn: A => T)(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + conv.assertArityMatches(fs._1) + setter.assertArityMatches(fs._2) + eachTo(fs)(new MapFunction[A, T](fn, _, conv, setter)) } - 
def flatMap[A,T](fs : (Fields,Fields))(fn : A => TraversableOnce[T]) - (implicit conv : TupleConverter[A], setter : TupleSetter[T]) : Pipe = { - conv.assertArityMatches(fs._1) - setter.assertArityMatches(fs._2) - each(fs)(new FlatMapFunction[A,T](fn, _, conv, setter)) + def flatMap[A, T]( + fs: (Fields, Fields) + )(fn: A => TraversableOnce[T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + conv.assertArityMatches(fs._1) + setter.assertArityMatches(fs._2) + each(fs)(new FlatMapFunction[A, T](fn, _, conv, setter)) } - def flatMapTo[A,T](fs : (Fields,Fields))(fn : A => TraversableOnce[T]) - (implicit conv : TupleConverter[A], setter : TupleSetter[T]) : Pipe = { - conv.assertArityMatches(fs._1) - setter.assertArityMatches(fs._2) - eachTo(fs)(new FlatMapFunction[A,T](fn, _, conv, setter)) + def flatMapTo[A, T]( + fs: (Fields, Fields) + )(fn: A => TraversableOnce[T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + conv.assertArityMatches(fs._1) + setter.assertArityMatches(fs._2) + eachTo(fs)(new FlatMapFunction[A, T](fn, _, conv, setter)) } /** * Filters all data that is defined for this partial function and then applies that function */ - def collect[A,T](fs : (Fields,Fields))(fn : PartialFunction[A,T]) - (implicit conv : TupleConverter[A], setter : TupleSetter[T]) : Pipe = { - conv.assertArityMatches(fs._1) - setter.assertArityMatches(fs._2) - pipe.each(fs)(new CollectFunction[A,T](fn, _, conv, setter)) + def collect[A, T]( + fs: (Fields, Fields) + )(fn: PartialFunction[A, T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + conv.assertArityMatches(fs._1) + setter.assertArityMatches(fs._2) + pipe.each(fs)(new CollectFunction[A, T](fn, _, conv, setter)) } - def collectTo[A,T](fs : (Fields,Fields))(fn : PartialFunction[A,T]) - (implicit conv : TupleConverter[A], setter : TupleSetter[T]) : Pipe = { - conv.assertArityMatches(fs._1) - setter.assertArityMatches(fs._2) - pipe.eachTo(fs)(new 
CollectFunction[A,T](fn, _, conv, setter)) + def collectTo[A, T]( + fs: (Fields, Fields) + )(fn: PartialFunction[A, T])(implicit conv: TupleConverter[A], setter: TupleSetter[T]): Pipe = { + conv.assertArityMatches(fs._1) + setter.assertArityMatches(fs._2) + pipe.eachTo(fs)(new CollectFunction[A, T](fn, _, conv, setter)) } /** @@ -456,9 +551,10 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms * * Common enough to be useful. */ - def flatten[T](fs: (Fields, Fields)) - (implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = - flatMap[TraversableOnce[T],T](fs)({ it : TraversableOnce[T] => it })(conv, setter) + def flatten[T]( + fs: (Fields, Fields) + )(implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = + flatMap[TraversableOnce[T], T](fs) { it: TraversableOnce[T] => it }(conv, setter) /** * the same as @@ -469,37 +565,36 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms * * Common enough to be useful. */ - def flattenTo[T](fs : (Fields, Fields)) - (implicit conv : TupleConverter[TraversableOnce[T]], setter : TupleSetter[T]): Pipe = - flatMapTo[TraversableOnce[T],T](fs)({ it : TraversableOnce[T] => it })(conv, setter) + def flattenTo[T]( + fs: (Fields, Fields) + )(implicit conv: TupleConverter[TraversableOnce[T]], setter: TupleSetter[T]): Pipe = + flatMapTo[TraversableOnce[T], T](fs) { it: TraversableOnce[T] => it }(conv, setter) /** - * Force a materialization to disk in the flow. - * This is useful before crossWithTiny if you filter just before. Ideally scalding/cascading would - * see this (and may in future versions), but for now it is here to aid in hand-tuning jobs + * Force a materialization to disk in the flow. This is useful before crossWithTiny if you filter just + * before. 
Ideally scalding/cascading would see this (and may in future versions), but for now it is here to + * aid in hand-tuning jobs */ lazy val forceToDisk: Pipe = new Checkpoint(pipe) /** * Convenience method for integrating with existing cascading Functions */ - def each(fs : (Fields,Fields))(fn : Fields => Function[_]) = { + def each(fs: (Fields, Fields))(fn: Fields => Function[_]) = new Each(pipe, fs._1, fn(fs._2), defaultMode(fs._1, fs._2)) - } /** * Same as above, but only keep the results field. */ - def eachTo(fs : (Fields,Fields))(fn : Fields => Function[_]) = { + def eachTo(fs: (Fields, Fields))(fn: Fields => Function[_]) = new Each(pipe, fs._1, fn(fs._2), Fields.RESULTS) - } /** - * This is an analog of the SQL/Excel unpivot function which converts columns of data - * into rows of data. Only the columns given as input fields are expanded in this way. - * For this operation to be reversible, you need to keep some unique key on each row. - * See GroupBuilder.pivot to reverse this operation assuming you leave behind a grouping key - * == Example == + * This is an analog of the SQL/Excel unpivot function which converts columns of data into rows of data. + * Only the columns given as input fields are expanded in this way. For this operation to be reversible, you + * need to keep some unique key on each row. See GroupBuilder.pivot to reverse this operation assuming you + * leave behind a grouping key + * ==Example== * {{{ * pipe.unpivot(('w,'x,'y,'z) -> ('feature, 'value)) * }}} @@ -519,35 +614,35 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms * }}} * etc... 
*/ - def unpivot(fieldDef : (Fields,Fields)) : Pipe = { + def unpivot(fieldDef: (Fields, Fields)): Pipe = { assert(fieldDef._2.size == 2, "Must specify exactly two Field names for the results") // toKeyValueList comes from TupleConversions - pipe.flatMap(fieldDef) { te: TupleEntry => TupleConverter.KeyValueList(te) } + pipe + .flatMap(fieldDef) { te: TupleEntry => TupleConverter.KeyValueList(te) } .discard(fieldDef._1) } /** - * Keep at most n elements. This is implemented by keeping - * approximately n/k elements on each of the k mappers or reducers (whichever we wind - * up being scheduled on). + * Keep at most n elements. This is implemented by keeping approximately n/k elements on each of the k + * mappers or reducers (whichever we wind up being scheduled on). */ - def limit(n : Long) : Pipe = new Each(pipe, new Limit(n)) + def limit(n: Long): Pipe = new Each(pipe, new Limit(n)) - /** - * Sample percent of elements. percent should be between 0.00 (0%) and 1.00 (100%) - * you can provide a seed to get reproducible results - * + /** + * Sample a fraction of elements. fraction should be between 0.00 (0%) and 1.00 (100%) you can provide a + * seed to get reproducible results */ - def sample(percent : Double) : Pipe = new Each(pipe, new Sample(percent)) - def sample(percent : Double, seed : Long) : Pipe = new Each(pipe, new Sample(seed, percent)) + def sample(fraction: Double): Pipe = new Each(pipe, new Sample(fraction)) + def sample(fraction: Double, seed: Long): Pipe = new Each(pipe, new Sample(seed, fraction)) /** - * Sample percent of elements with return. percent should be between 0.00 (0%) and 1.00 (100%) - * you can provide a seed to get reproducible results - * + * Sample fraction of elements with return. 
fraction should be between 0.00 (0%) and 1.00 (100%) you can + * provide a seed to get reproducible results */ - def sampleWithReplacement(percent : Double) : Pipe = new Each(pipe, new SampleWithReplacement(percent), Fields.ALL) - def sampleWithReplacement(percent : Double, seed : Int) : Pipe = new Each(pipe, new SampleWithReplacement(percent, seed), Fields.ALL) + def sampleWithReplacement(fraction: Double): Pipe = + new Each(pipe, new SampleWithReplacement(fraction), Fields.ALL) + def sampleWithReplacement(fraction: Double, seed: Int): Pipe = + new Each(pipe, new SampleWithReplacement(fraction, seed), Fields.ALL) /** * Print all the tuples that pass to stderr @@ -555,32 +650,38 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms def debug: Pipe = debug(PipeDebug()) /** - * Print the tuples that pass with the options configured in debugger - * For instance: - * {{{ debug(PipeDebug().toStdOut.printTuplesEvery(100)) }}} + * Print the tuples that pass with the options configured in debugger For instance: + * {{{debug(PipeDebug().toStdOut.printTuplesEvery(100))}}} */ def debug(dbg: PipeDebug): Pipe = dbg(pipe) /** * Write all the tuples to the given source and return this Pipe */ - def write(outsource : Source)(implicit flowDef : FlowDef, mode : Mode) = { - outsource.writeFrom(pipe)(flowDef, mode) + def write(outsource: Source)(implicit flowDef: FlowDef, mode: Mode) = { + /* This code is to hack around a known Cascading bug that they have decided not to fix. In a graph: + A -> FlatMap -> write(tsv) -> FlatMap + in the second flatmap cascading will read from the written tsv for running it. However TSV's use toString and so is not a bijection. + here we stick in an identity function before the tsv write to keep to force cascading to do any fork/split beforehand. 
+ */ + val writePipe: Pipe = outsource match { + case t: Tsv => new Each(pipe, Fields.ALL, IdentityFunction, Fields.REPLACE) + case _ => pipe + } + outsource.writeFrom(writePipe)(flowDef, mode) pipe } /** - * Adds a trap to the current pipe, - * which will capture all exceptions that occur in this pipe - * and save them to the trapsource given + * Adds a trap to the current pipe, which will capture all exceptions that occur in this pipe and save them + * to the trapsource given * - * Traps do not include the original fields in a tuple, - * only the fields seen in an operation. - * Traps also do not include any exception information. + * Traps do not include the original fields in a tuple, only the fields seen in an operation. Traps also do + * not include any exception information. * * There can only be at most one trap for each pipe. - **/ - def addTrap(trapsource : Source)(implicit flowDef : FlowDef, mode : Mode) = { + */ + def addTrap(trapsource: Source)(implicit flowDef: FlowDef, mode: Mode) = { flowDef.addTrap(pipe, trapsource.createTap(Write)(mode)) pipe } @@ -589,52 +690,52 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms * Divides sum of values for this variable by their sum; assumes without checking that division is supported * on this type and that sum is not zero * - * If those assumptions do not hold, will throw an exception -- consider checking sum sepsarately and/or using addTrap + * If those assumptions do not hold, will throw an exception -- consider checking sum sepsarately and/or + * using addTrap * * in some cases, crossWithTiny has been broken, the implementation supports a work-around */ - def normalize(f : Fields, useTiny : Boolean = true) : Pipe = { - val total = groupAll { _.sum[Double](f -> '__total_for_normalize__) } - (if(useTiny) { - crossWithTiny(total) - } else { - crossWithSmaller(total) - }) - .map(Fields.merge(f, '__total_for_normalize__) -> f) { args : (Double, Double) => - args._1 / args._2 - } + 
def normalize(f: Fields, useTiny: Boolean = true): Pipe = { + val total = groupAll(_.sum[Double](f -> '__total_for_normalize__)) + (if (useTiny) { + crossWithTiny(total) + } else { + crossWithSmaller(total) + }) + .map(Fields.merge(f, '__total_for_normalize__) -> f) { args: (Double, Double) => + args._1 / args._2 + } } - /** Maps the input fields into an output field of type T. For example: + /** + * Maps the input fields into an output field of type T. For example: * * {{{ * pipe.pack[(Int, Int)] (('field1, 'field2) -> 'field3) * }}} * - * will pack fields 'field1 and 'field2 to field 'field3, as long as 'field1 and 'field2 - * can be cast into integers. The output field 'field3 will be of tupel `(Int, Int)` - * + * will pack fields 'field1 and 'field2 to field 'field3, as long as 'field1 and 'field2 can be cast into + * integers. The output field 'field3 will be of tupel `(Int, Int)` */ - def pack[T](fs : (Fields, Fields))(implicit packer : TuplePacker[T], setter : TupleSetter[T]) : Pipe = { + def pack[T](fs: (Fields, Fields))(implicit packer: TuplePacker[T], setter: TupleSetter[T]): Pipe = { val (fromFields, toFields) = fs assert(toFields.size == 1, "Can only output 1 field in pack") val conv = packer.newConverter(fromFields) - pipe.map(fs) { input : T => input } (conv, setter) + pipe.map(fs) { input: T => input }(conv, setter) } /** * Same as pack but only the to fields are preserved. */ - def packTo[T](fs : (Fields, Fields))(implicit packer : TuplePacker[T], setter : TupleSetter[T]) : Pipe = { + def packTo[T](fs: (Fields, Fields))(implicit packer: TuplePacker[T], setter: TupleSetter[T]): Pipe = { val (fromFields, toFields) = fs assert(toFields.size == 1, "Can only output 1 field in pack") val conv = packer.newConverter(fromFields) - pipe.mapTo(fs) { input : T => input } (conv, setter) + pipe.mapTo(fs) { input: T => input }(conv, setter) } /** - * The opposite of pack. Unpacks the input field of type `T` into - * the output fields. 
For example: + * The opposite of pack. Unpacks the input field of type `T` into the output fields. For example: * * {{{ * pipe.unpack[(Int, Int)] ('field1 -> ('field2, 'field3)) @@ -642,29 +743,85 @@ class RichPipe(val pipe : Pipe) extends java.io.Serializable with JoinAlgorithms * * will unpack 'field1 into 'field2 and 'field3 */ - def unpack[T](fs : (Fields, Fields))(implicit unpacker : TupleUnpacker[T], conv : TupleConverter[T]) : Pipe = { + def unpack[T](fs: (Fields, Fields))(implicit unpacker: TupleUnpacker[T], conv: TupleConverter[T]): Pipe = { val (fromFields, toFields) = fs assert(fromFields.size == 1, "Can only take 1 input field in unpack") val fields = (fromFields, unpacker.getResultFields(toFields)) val setter = unpacker.newSetter(toFields) - pipe.map(fields) { input : T => input } (conv, setter) + pipe.map(fields) { input: T => input }(conv, setter) } /** * Same as unpack but only the to fields are preserved. */ - def unpackTo[T](fs : (Fields, Fields))(implicit unpacker : TupleUnpacker[T], conv : TupleConverter[T]) : Pipe = { + def unpackTo[T]( + fs: (Fields, Fields) + )(implicit unpacker: TupleUnpacker[T], conv: TupleConverter[T]): Pipe = { val (fromFields, toFields) = fs assert(fromFields.size == 1, "Can only take 1 input field in unpack") val fields = (fromFields, unpacker.getResultFields(toFields)) val setter = unpacker.newSetter(toFields) - pipe.mapTo(fields) { input : T => input } (conv, setter) + pipe.mapTo(fields) { input: T => input }(conv, setter) + } + + /** + * Set of pipes reachable from this pipe (transitive closure of 'Pipe.getPrevious') + */ + def upstreamPipes: Set[Pipe] = + Iterator + .iterate(Seq(pipe))(pipes => + for { + p <- pipes + prev <- p.getPrevious + } yield prev + ) + .takeWhile(_.length > 0) + .flatten + .toSet + + /** + * This finds all the boxed serializations stored in the flow state map for this flowdef. We then find all + * the pipes back in the DAG from this pipe and apply those serializations. 
+ */ + private[scalding] def applyFlowConfigProperties(flowDef: FlowDef): Pipe = { + case class ToVisit[T](queue: Queue[T], inQueue: Set[T]) { + def maybeAdd(t: T): ToVisit[T] = if (inQueue(t)) this + else { + ToVisit(queue :+ t, inQueue + t) + } + def next: Option[(T, ToVisit[T])] = + if (inQueue.isEmpty) None + else Some((queue.head, ToVisit(queue.tail, inQueue - queue.head))) + } + + @annotation.tailrec + def go(p: Pipe, visited: Set[Pipe], toVisit: ToVisit[Pipe]): Set[Pipe] = { + val notSeen: Set[Pipe] = p.getPrevious.filter(i => !visited.contains(i)).toSet + val nextVisited: Set[Pipe] = visited + p + val nextToVisit = notSeen.foldLeft(toVisit) { case (prev, n) => prev.maybeAdd(n) } + + nextToVisit.next match { + case Some((h, innerNextToVisit)) => go(h, nextVisited, innerNextToVisit) + case _ => nextVisited + } + } + val allPipes = go(pipe, Set[Pipe](), ToVisit[Pipe](Queue.empty, Set.empty)) + + FlowStateMap.get(flowDef).foreach { fstm => + fstm.flowConfigUpdates.foreach { case (k, v) => + allPipes.foreach { p => + p.getStepConfigDef().setProperty(k, v) + } + } + } + pipe } + } /** * A simple trait for releasable resource. Provides noop implementation. */ trait Stateful { - def release() {} + def release(): Unit = () } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala b/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala index 3dd80f120c..31507ea4ec 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/SkewReplication.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding /** @@ -22,22 +22,21 @@ sealed abstract class SkewReplication { val DEFAULT_NUM_REDUCERS = 100 /** - * Given the estimated frequencies of a join key in two pipes that we want to skew-join together, - * this returns the key's replication amount in each pipe. + * Given the estimated frequencies of a join key in two pipes that we want to skew-join together, this + * returns the key's replication amount in each pipe. * - * Note: if we switch to a Count-Min sketch, we'll need to change the meaning of these counts - * from "sampled counts" to "estimates of full counts", and also change how we deal with counts of - * zero. + * Note: if we switch to a Count-Min sketch, we'll need to change the meaning of these counts from "sampled + * counts" to "estimates of full counts", and also change how we deal with counts of zero. */ - def getReplications(leftCount : Int, rightCount : Int, reducers : Int) : (Int, Int) + def getReplications(leftCount: Int, rightCount: Int, reducers: Int): (Int, Int) } /** * See https://github.com/twitter/scalding/pull/229#issuecomment-10773810 */ -case class SkewReplicationA(replicationFactor : Int = 1) extends SkewReplication { +final case class SkewReplicationA(replicationFactor: Int = 1) extends SkewReplication { - override def getReplications(leftCount : Int, rightCount : Int, reducers : Int) = { + override def getReplications(leftCount: Int, rightCount: Int, reducers: Int) = { val numReducers = if (reducers <= 0) DEFAULT_NUM_REDUCERS else reducers val left = scala.math.min(rightCount * replicationFactor, numReducers) @@ -52,10 +51,10 @@ case class SkewReplicationA(replicationFactor : Int = 1) extends SkewReplication /** * See https://github.com/twitter/scalding/pull/229#issuecomment-10792296 */ -case class SkewReplicationB(maxKeysInMemory : Int = 1E6.toInt, maxReducerOutput : Int = 1E7.toInt) - extends SkewReplication { +final case class SkewReplicationB(maxKeysInMemory: Int = 1e6.toInt, maxReducerOutput: 
Int = 1e7.toInt) + extends SkewReplication { - override def getReplications(leftCount : Int, rightCount : Int, reducers : Int) = { + override def getReplications(leftCount: Int, rightCount: Int, reducers: Int) = { val numReducers = if (reducers <= 0) DEFAULT_NUM_REDUCERS else reducers val left = scala.math.max(1, rightCount / maxKeysInMemory) @@ -64,4 +63,4 @@ case class SkewReplicationB(maxKeysInMemory : Int = 1E6.toInt, maxReducerOutput (left, if (right == 0) 1 else right) } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala b/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala index 4da02de24a..bcda603eda 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Sortable.scala @@ -12,13 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields trait Sortable[+Self] { // Perform an inner secondary sort - def sortBy(innerSort : Fields) : Self - def sorting : Option[Fields] + def sortBy(innerSort: Fields): Self + def sorting: Option[Fields] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Source.scala b/scalding-core/src/main/scala/com/twitter/scalding/Source.scala index ad9b4726e5..2f5e157e3f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Source.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Source.scala @@ -12,41 +12,94 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import java.io.{File, InputStream, OutputStream} -import java.util.{TimeZone, Calendar, Map => JMap, Properties} +import java.io.{InputStream, OutputStream} +import java.util.{Map => JMap, Properties, UUID} import cascading.flow.FlowDef import cascading.flow.FlowProcess -import cascading.flow.hadoop.HadoopFlowProcess -import cascading.flow.local.LocalFlowProcess import cascading.scheme.{NullScheme, Scheme} -import cascading.scheme.local.{TextLine => CLTextLine, TextDelimited => CLTextDelimited} -import cascading.scheme.hadoop.{TextLine => CHTextLine, TextDelimited => CHTextDelimited, SequenceFile => CHSequenceFile} import cascading.tap.hadoop.Hfs -import cascading.tap.{MultiSourceTap, SinkMode} -import cascading.tap.{Tap, SinkTap} -import cascading.tap.local.FileTap -import cascading.tuple.{Fields, Tuple => CTuple, TupleEntry, TupleEntryCollector} +import cascading.tap.SinkMode +import cascading.tap.{SinkTap, SourceTap, Tap} +import cascading.tuple.{Fields, Tuple => CTuple, TupleEntry, TupleEntryCollector, TupleEntryIterator} import cascading.pipe.Pipe -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapred.InputFormat +import org.apache.hadoop.mapred.InputSplit import org.apache.hadoop.mapred.JobConf -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.OutputCollector +import org.apache.hadoop.mapred.RecordReader -import collection.mutable.{Buffer, MutableList} import scala.collection.JavaConverters._ - /** * thrown when validateTaps fails */ -class InvalidSourceException(message : String) extends RuntimeException(message) +class InvalidSourceException(message: String, cause: Throwable) extends RuntimeException(message, cause) { + def this(message: String) = this(message, null) +} + +/** + * InvalidSourceTap used in createTap method when we want to defer the failures to validateTaps method. 
+ * + * This is used because for Job classes, createTap method on sources is called when the class is initialized. + * In most cases though, we want any exceptions to be thrown by validateTaps method, which is called + * subsequently during flow planning. + * + * hdfsPaths represents user-supplied list that was detected as not containing any valid paths. + */ +class InvalidSourceTap(val e: Throwable) extends SourceTap[JobConf, RecordReader[_, _]] { + + def this(hdfsPaths: Iterable[String]) = + this(new InvalidSourceException(s"No good paths in $hdfsPaths")) + + private final val randomId = UUID.randomUUID.toString + + override def getIdentifier: String = s"InvalidSourceTap-$randomId" + + override def hashCode: Int = randomId.hashCode + + override def getModifiedTime(conf: JobConf): Long = 0L + + override def openForRead(flow: FlowProcess[JobConf], input: RecordReader[_, _]): TupleEntryIterator = + throw new InvalidSourceException("Encountered InvalidSourceTap!", e) + + override def resourceExists(conf: JobConf): Boolean = false + + override def getScheme = new NullScheme() + + // We set a dummy input format here so that mapred.input.format.class key is present, + // which is a requirement for casading's MultiInputFormat at flow plan time. + // So the order of operations here will be: + // 1. source.createTap + // 2. tap.sourceConfInit + // 3. scheme.sourceConfInit + // 4. source.validateTaps (throws InvalidSourceException) + // In the worst case if the flow plan is misconfigured, + // openForRead on mappers should fail when using this tap. + override def sourceConfInit(flow: FlowProcess[JobConf], conf: JobConf): Unit = { + conf.setInputFormat(classOf[InvalidInputFormat]) + super.sourceConfInit(flow, conf) + } +} + +/** + * Better error messaging for the occasion where an InvalidSourceTap does not fail in validation. 
+ */ +private[scalding] class InvalidInputFormat extends InputFormat[Nothing, Nothing] { + override def getSplits(conf: JobConf, numSplits: Int): Nothing = + throw new InvalidSourceException("getSplits called on InvalidInputFormat") + override def getRecordReader( + split: InputSplit, + conf: JobConf, + reporter: org.apache.hadoop.mapred.Reporter + ): Nothing = + throw new InvalidSourceException("getRecordReader called on InvalidInputFormat") +} /* * Denotes the access mode for a Source @@ -66,153 +119,187 @@ object HadoopSchemeInstance { object CastHfsTap { // The scala compiler has problems with the generics in Cascading - def apply(tap : Hfs) : Tap[JobConf, RecordReader[_,_], OutputCollector[_,_]] = - tap.asInstanceOf[Tap[JobConf, RecordReader[_,_], OutputCollector[_,_]]] + def apply(tap: Hfs): Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]] = + tap.asInstanceOf[Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]]] } /** -* Every source must have a correct toString method. If you use -* case classes for instances of sources, you will get this for free. -* This is one of the several reasons we recommend using cases classes -* -* java.io.Serializable is needed if the Source is going to have any -* methods attached that run on mappers or reducers, which will happen -* if you implement transformForRead or transformForWrite. -*/ + * Every source must have a correct toString method. If you use case classes for instances of sources, you + * will get this for free. This is one of the several reasons we recommend using cases classes + * + * java.io.Serializable is needed if the Source is going to have any methods attached that run on mappers or + * reducers, which will happen if you implement transformForRead or transformForWrite. + */ abstract class Source extends java.io.Serializable { /** - * The mock passed in to scalding.JobTest may be considered - * as a mock of the Tap or the Source. By default, as of 0.9.0, - * it is considered as a Mock of the Source. 
If you set this - * to true, the mock in TestMode will be considered to be a - * mock of the Tap (which must be transformed) and not the Source. + * The mock passed in to scalding.JobTest may be considered as a mock of the Tap or the Source. By default, + * as of 0.9.0, it is considered as a Mock of the Source. If you set this to true, the mock in TestMode will + * be considered to be a mock of the Tap (which must be transformed) and not the Source. */ def transformInTest: Boolean = false - def read(implicit flowDef : FlowDef, mode : Mode): Pipe = { - checkFlowDefNotNull - - //workaround for a type erasure problem, this is a map of String -> Tap[_,_,_] - val sources = flowDef.getSources().asInstanceOf[JMap[String,Any]] - val srcName = this.toString - if (!sources.containsKey(srcName)) { - sources.put(srcName, createTap(Read)(mode)) - } - FlowStateMap.mutate(flowDef) { st => - val newPipe = (mode, transformInTest) match { - case (test: TestMode, false) => new Pipe(srcName) - case _ => transformForRead(new Pipe(srcName)) - } - st.getReadPipe(this, newPipe) + /** + * This is a name the refers to this exact instance of the source (put another way, if s1.sourceId == + * s2.sourceId, the job should work the same if one is replaced with the other + */ + def sourceId: String = toString + + def read(implicit flowDef: FlowDef, mode: Mode): Pipe = { + checkFlowDefNotNull() + + // workaround for a type erasure problem, this is a map of String -> Tap[_,_,_] + val sources = flowDef.getSources().asInstanceOf[JMap[String, Any]] + /* + * Starting in scalding 0.12, we assign a unique name for each head + * pipe so that we can always merge two FlowDefs + */ + val uuid = java.util.UUID.randomUUID + val srcName = sourceId + uuid.toString + assert(!sources.containsKey(srcName), "Source %s had collision in uuid: %s".format(this, uuid)) + sources.put(srcName, createTap(Read)(mode)) + + FlowStateMap.merge(flowDef, FlowState.withSource(srcName, this)) + + (mode, transformInTest) match { + 
case (test: TestMode, false) => new Pipe(srcName) + case _ => transformForRead(new Pipe(srcName)) } } /** - * write the pipe and return the input so it can be chained into - * the next operation - */ - def writeFrom(pipe : Pipe)(implicit flowDef : FlowDef, mode : Mode) = { - checkFlowDefNotNull - - //insane workaround for scala compiler bug - val sinks = flowDef.getSinks().asInstanceOf[JMap[String,Any]] - val sinkName = this.toString + * write the pipe but return the input so it can be chained into the next operation + */ + def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = { + checkFlowDefNotNull() + + // insane workaround for scala compiler bug + val sinks = flowDef.getSinks.asInstanceOf[JMap[String, Any]] + val sinkName = sourceId if (!sinks.containsKey(sinkName)) { sinks.put(sinkName, createTap(Write)(mode)) } val newPipe = (mode, transformInTest) match { case (test: TestMode, false) => pipe - case _ => transformForWrite(pipe) + case _ => transformForWrite(pipe) } - flowDef.addTail(new Pipe(sinkName, newPipe)) + val outPipe = new Pipe(sinkName, newPipe) + flowDef.addTail(outPipe) pipe } - protected def checkFlowDefNotNull(implicit flowDef : FlowDef, mode : Mode) { + protected def checkFlowDefNotNull()(implicit flowDef: FlowDef, mode: Mode): Unit = assert(flowDef != null, "Trying to access null FlowDef while in mode: %s".format(mode)) - } - protected def transformForWrite(pipe : Pipe) = pipe - protected def transformForRead(pipe : Pipe) = pipe + protected def transformForWrite(pipe: Pipe) = pipe + protected def transformForRead(pipe: Pipe) = pipe /** - * Subclasses of Source MUST override this method. They may call out to TestTapFactory for - * making Taps suitable for testing. - */ - def createTap(readOrWrite : AccessMode)(implicit mode : Mode) : Tap[_,_,_] + * Subclasses of Source MUST override this method. They may call out to TestTapFactory for making Taps + * suitable for testing. 
+ */ + def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] /* * This throws InvalidSourceException if this source is invalid. */ - def validateTaps(mode : Mode) : Unit = { } + def validateTaps(mode: Mode): Unit = {} // linter:ignore @deprecated("replace with Mappable.toIterator", "0.9.0") - def readAtSubmitter[T](implicit mode : Mode, conv : TupleConverter[T]) : Stream[T] = { + def readAtSubmitter[T](implicit mode: Mode, conv: TupleConverter[T]): Stream[T] = { + validateTaps(mode) val tap = createTap(Read)(mode) - mode.openForRead(tap).asScala.map { conv(_) }.toStream + + CascadingMode + .cast(mode) + .openForRead(Config.defaultFrom(mode), tap) + .asScala + .map(conv(_)) + .toStream } } /** -* Usually as soon as we open a source, we read and do some mapping -* operation on a single column or set of columns. -* T is the type of the single column. If doing multiple columns -* T will be a TupleN representing the types, e.g. (Int,Long,String) -* -* Prefer to use TypedSource unless you are working with the fields API -* -* NOTE: If we don't make this extend Source, established implicits are ambiguous -* when TDsl is in scope. -*/ + * Usually as soon as we open a source, we read and do some mapping operation on a single column or set of + * columns. T is the type of the single column. If doing multiple columns T will be a TupleN representing the + * types, e.g. (Int,Long,String) + * + * Prefer to use TypedSource unless you are working with the fields API + * + * NOTE: If we don't make this extend Source, established implicits are ambiguous when TDsl is in scope. 
+ */ trait Mappable[+T] extends Source with TypedSource[T] { - final def mapTo[U](out : Fields)(mf : (T) => U) - (implicit flowDef : FlowDef, mode : Mode, setter : TupleSetter[U]): Pipe = { - RichPipe(read(flowDef, mode)).mapTo[T,U](sourceFields -> out)(mf)(converter, setter) - } + final def mapTo[U](out: Fields)( + mf: (T) => U + )(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = + RichPipe(read(flowDef, mode)).mapTo[T, U](sourceFields -> out)(mf)(converter, setter) + /** - * If you want to filter, you should use this and output a 0 or 1 length Iterable. - * Filter does not change column names, and we generally expect to change columns here - */ - final def flatMapTo[U](out : Fields)(mf : (T) => TraversableOnce[U]) - (implicit flowDef : FlowDef, mode : Mode, setter : TupleSetter[U]): Pipe = { - RichPipe(read(flowDef, mode)).flatMapTo[T,U](sourceFields -> out)(mf)(converter, setter) - } + * If you want to filter, you should use this and output a 0 or 1 length Iterable. Filter does not change + * column names, and we generally expect to change columns here + */ + final def flatMapTo[U](out: Fields)( + mf: (T) => TraversableOnce[U] + )(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = + RichPipe(read(flowDef, mode)).flatMapTo[T, U](sourceFields -> out)(mf)(converter, setter) /** - * Allows you to read a Tap on the submit node NOT FOR USE IN THE MAPPERS OR REDUCERS. - * Typical use might be to read in Job.next to determine if another job is needed - */ - def toIterator(implicit mode: Mode): Iterator[T] = { + * Allows you to read a Tap on the submit node NOT FOR USE IN THE MAPPERS OR REDUCERS. 
Typical use might be + * to read in Job.next to determine if another job is needed + */ + def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { + validateTaps(mode) val tap = createTap(Read)(mode) val conv = converter - mode.openForRead(tap).asScala.map { conv(_) } + CascadingMode + .cast(mode) + .openForRead(config, tap) + .asScala + .map(te => conv(te.selectEntry(sourceFields))) + } + + /** + * Transform this Mappable into another by mapping after. We don't call this map because of conflicts with + * Mappable, unfortunately + */ + override def andThen[U](fn: T => U): Mappable[U] = { + val self = this // compiler generated self can cause problems with serialization + new Mappable[U] { + override def sourceFields = self.sourceFields + def converter[V >: U]: TupleConverter[V] = self.converter.andThen(fn) + override def read(implicit fd: FlowDef, mode: Mode): Pipe = self.read + override def andThen[U1](fn2: U => U1) = self.andThen(fn.andThen(fn2)) + def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = + self.createTap(readOrWrite)(mode) + override def validateTaps(mode: Mode): Unit = self.validateTaps(mode) + } } + } /** - * Mappable extension that defines the proper converter - * implementation for a Mappable with a single item. - */ + * Mappable extension that defines the proper converter implementation for a Mappable with a single item. + */ trait SingleMappable[T] extends Mappable[T] { override def converter[U >: T] = TupleConverter.asSuperConverter(TupleConverter.singleConverter[T]) } /** - * A tap that output nothing. It is used to drive execution of a task for side effect only. This - * can be used to drive a pipe without actually writing to HDFS. + * A tap that output nothing. It is used to drive execution of a task for side effect only. This can be used + * to drive a pipe without actually writing to HDFS. 
*/ class NullTap[Config, Input, Output, SourceContext, SinkContext] - extends SinkTap[Config, Output] ( - new NullScheme[Config, Input, Output, SourceContext, SinkContext](Fields.NONE, Fields.ALL), - SinkMode.UPDATE) { + extends SinkTap[Config, Output]( + new NullScheme[Config, Input, Output, SourceContext, SinkContext](Fields.NONE, Fields.ALL), + SinkMode.UPDATE + ) { def getIdentifier = "nullTap" def openForWrite(flowProcess: FlowProcess[Config], output: Output) = new TupleEntryCollector { - override def add(te: TupleEntry) {} - override def add(t: CTuple) {} - protected def collect(te: TupleEntry) {} + override def add(te: TupleEntry): Unit = () + override def add(t: CTuple): Unit = () + protected def collect(te: TupleEntry): Unit = () } def createResource(conf: Config) = true @@ -221,18 +308,20 @@ class NullTap[Config, Input, Output, SourceContext, SinkContext] def getModifiedTime(conf: Config) = 0 } -/** - * A source outputs nothing. It is used to drive execution of a task for side effect only. - */ -object NullSource extends Source { - override def createTap(readOrWrite : AccessMode)(implicit mode : Mode) : Tap[_,_,_] = { +trait BaseNullSource extends Source { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = readOrWrite match { case Read => throw new Exception("not supported, reading from null") - case Write => mode match { - case Hdfs(_, _) => new NullTap[JobConf, RecordReader[_,_], OutputCollector[_,_], Any, Any] - case Local(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] - case Test(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] - } + case Write => + mode match { + case Hdfs(_, _) => new NullTap[JobConf, RecordReader[_, _], OutputCollector[_, _], Any, Any] + case Local(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] + case Test(_) => new NullTap[Properties, InputStream, OutputStream, Any, Any] + } } - } } + +/** + * A source outputs nothing. 
It is used to drive execution of a task for side effect only. + */ +object NullSource extends BaseNullSource diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala b/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala index c41543ac99..9c90b4058a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Stats.scala @@ -1,69 +1,194 @@ package com.twitter.scalding -import cascading.stats.{ CascadeStats, CascadingStats } -import cascading.flow.FlowProcess -import cascading.stats.FlowStats - +import cascading.flow.{Flow, FlowDef, FlowListener, FlowProcess} +import cascading.flow.hadoop.HadoopFlowProcess +import cascading.stats.CascadingStats +import java.util.concurrent.ConcurrentHashMap +import org.apache.hadoop.mapreduce.Counter +import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.ref.WeakReference +import scala.util.Try -import org.slf4j.{Logger, LoggerFactory} +/* + * This can be a bit tricky to use, but it is important that incBy and inc + * are called INSIDE any map or reduce functions. + * Like: + * val stat = Stat("test") + * .map { x => + * stat.inc + * 2 * x + * } + * NOT: map( { stat.inc; { x => 2*x } } ) + * which increments on the submitter before creating the function. See the difference? 
+ */ +trait Stat extends java.io.Serializable { -import java.util.WeakHashMap + /** + * increment by the given amount + */ + def incBy(amount: Long): Unit -case class Stat(name: String, group: String = Stats.ScaldingGroup)(@transient implicit val uniqueIdCont: UniqueID) { - @transient private lazy val logger: Logger = LoggerFactory.getLogger(this.getClass) - val uniqueId = uniqueIdCont.get - lazy val flowProcess: FlowProcess[_] = RuntimeStats.getFlowProcessForUniqueId(uniqueId) + /** increment by 1L */ + def inc(): Unit = incBy(1L) - def incBy(amount: Long) = flowProcess.increment(group, name, amount) + /** increment by -1L (decrement) */ + def dec(): Unit = incBy(-1L) + def key: StatKey +} - def inc = incBy(1L) +private[scalding] object CounterImpl { + def apply(fp: FlowProcess[_], statKey: StatKey): CounterImpl = + fp match { + case hFP: HadoopFlowProcess => HadoopFlowPCounterImpl(hFP, statKey) + case _ => GenericFlowPCounterImpl(fp, statKey) + } } + +sealed private[scalding] trait CounterImpl { + def increment(amount: Long): Unit +} + +private[scalding] final case class GenericFlowPCounterImpl(fp: FlowProcess[_], statKey: StatKey) + extends CounterImpl { + override def increment(amount: Long): Unit = fp.increment(statKey.group, statKey.counter, amount) +} + +private[scalding] final case class HadoopFlowPCounterImpl(fp: HadoopFlowProcess, statKey: StatKey) + extends CounterImpl { + // we use a nullable type here for efficiency + private[this] val counter: Counter = (for { + r <- Option(fp.getReporter) + c <- Option(r.getCounter(statKey.group, statKey.counter)) + } yield c).orNull + + def skipNull: Boolean = + fp.getProperty(Config.SkipNullCounters) match { + case null => false // by default don't skip + case isset => isset.toString.toBoolean + } + + require( + (counter != null) || skipNull, + s"counter for $statKey is null and ${Config.SkipNullCounters} is not set to true" + ) + + override def increment(amount: Long): Unit = + if (counter != null) 
counter.increment(amount) else () +} + +object Stat { + + def apply(k: StatKey)(implicit uid: UniqueID): Stat = new Stat { + // This is materialized on the mappers, and will throw an exception if users incBy before then + private[this] lazy val cntr = CounterImpl(RuntimeStats.getFlowProcessForUniqueId(uid), k) + + def incBy(amount: Long): Unit = cntr.increment(amount) + def key: StatKey = k + } + + implicit def toStatKey(stat: Stat): StatKey = stat.key +} + +object Stats { + // This is the group that we assign all custom counters to + val ScaldingGroup = StatKey.ScaldingGroup + + // When getting a counter value, cascadeStats takes precedence (if set) and + // flowStats is used after that. Returns None if neither is defined. + def getCounterValue(key: StatKey)(implicit cascadingStats: CascadingStats): Long = + cascadingStats.getCounterValue(key.group, key.counter) + + // Returns a map of all custom counter names and their counts. + def getAllCustomCounters()(implicit cascadingStats: CascadingStats): Map[String, Long] = + cascadingStats + .getCountersFor(ScaldingGroup) + .asScala + .map { counter => + val value = getCounterValue(counter) + (counter, value) + } + .toMap +} + /** * Wrapper around a FlowProcess useful, for e.g. incrementing counters. 
*/ object RuntimeStats extends java.io.Serializable { @transient private lazy val logger: Logger = LoggerFactory.getLogger(this.getClass) - private val flowMappingStore = new WeakHashMap[String, FlowProcess[_]] - + private val flowMappingStore: mutable.Map[String, WeakReference[FlowProcess[_]]] = + (new ConcurrentHashMap[String, WeakReference[FlowProcess[_]]]).asScala - def getFlowProcessForUniqueId(uniqueId: String): FlowProcess[_] = { - val ret = flowMappingStore.synchronized { - flowMappingStore.get(uniqueId) + def getFlowProcessForUniqueId(uniqueId: UniqueID): FlowProcess[_] = + (for { + weakFlowProcess <- flowMappingStore.get(uniqueId.get) + flowProcess <- weakFlowProcess.get + } yield { + flowProcess + }).getOrElse { + logger.debug( + s"The FlowProcess for unique id $uniqueId isn't available. Returning a NullFlowProcess instead." + ) + FlowProcess.NULL } - if (ret == null) { - sys.error("Error in job deployment, the FlowProcess for unique id %s isn't available".format(uniqueId)) + + private[this] var prevFP: FlowProcess[_] = null + def addFlowProcess(fp: FlowProcess[_]): Unit = + if (!(prevFP eq fp)) { + val uniqueJobIdObj = fp.getProperty(UniqueID.UNIQUE_JOB_ID) + if (uniqueJobIdObj != null) { + // for speed concern, use a while loop instead of foreach here + var splitted = StringUtility.fastSplit(uniqueJobIdObj.asInstanceOf[String], ",") + while (!splitted.isEmpty) { + val uniqueId = splitted.head + splitted = splitted.tail + logger.debug("Adding flow process id: " + uniqueId) + flowMappingStore.put(uniqueId, new WeakReference(fp)) + } + } + prevFP = fp } - ret - } - def addFlowProcess(fp: FlowProcess[_]) { - val uniqueJobIdObj = fp.getProperty(Job.UNIQUE_JOB_ID) - if(uniqueJobIdObj != null) { - val uniqueId = uniqueJobIdObj.asInstanceOf[String] - logger.debug("Adding flow process id: " + uniqueId) - flowMappingStore.synchronized {flowMappingStore.put(uniqueId, fp)} + /** + * For serialization, you may need to do: val keepAlive = 
RuntimeStats.getKeepAliveFunction outside of a + * closure passed to map/etc..., and then call: keepAlive() inside of your closure (mapping, reducing + * function) + */ + def getKeepAliveFunction(implicit flowDef: FlowDef): () => Unit = { + // Don't capture the flowDef, just the id + val id = UniqueID.fromSystemHashCode(flowDef) + () => { + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(id) + flowProcess.keepAlive() } } } -object Stats { - // This is the group that we assign all custom counters to - val ScaldingGroup = "Scalding Custom" +/** + * FlowListener that checks counter values against a function. + */ +class StatsFlowListener(f: Map[StatKey, Long] => Try[Unit]) extends FlowListener { - // When getting a counter value, cascadeStats takes precedence (if set) and - // flowStats is used after that. Returns None if neither is defined. - def getCounterValue(counter: String, group: String = ScaldingGroup) - (implicit cascadingStats: CascadingStats): Long = - cascadingStats.getCounterValue(ScaldingGroup, counter) + private var success = true - // Returns a map of all custom counter names and their counts. 
- def getAllCustomCounters()(implicit cascadingStats: CascadingStats): Map[String, Long] = { - val counts = for { - counter <- cascadingStats.getCountersFor(ScaldingGroup).asScala - value = getCounterValue(counter) - } yield (counter, value) - counts.toMap + override def onCompleted(flow: Flow[_]): Unit = + if (success) { + val stats = flow.getFlowStats + val keys = + stats.getCounterGroups.asScala.flatMap(g => stats.getCountersFor(g).asScala.map(c => StatKey(c, g))) + val values = keys.map(k => (k, stats.getCounterValue(k.group, k.counter))).toMap + f(values).get + } + + override def onThrowable(flow: Flow[_], throwable: Throwable): Boolean = { + success = false + false } -} \ No newline at end of file + + override def onStarting(flow: Flow[_]): Unit = {} + + override def onStopping(flow: Flow[_]): Unit = {} + +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala b/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala index 00ff2146d6..ddceef9948 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/StreamOperations.scala @@ -12,38 +12,34 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields import cascading.tuple.{Tuple => CTuple, TupleEntry} -import scala.collection.JavaConverters._ - -import Dsl._ //Get the conversion implicits - -/** Implements reductions on top of a simple abstraction for the Fields-API - * We use the f-bounded polymorphism trick to return the type called Self - * in each operation. 
+/** + * Implements reductions on top of a simple abstraction for the Fields-API We use the f-bounded polymorphism + * trick to return the type called Self in each operation. */ trait StreamOperations[+Self <: StreamOperations[Self]] extends Sortable[Self] with java.io.Serializable { - /** Corresponds to a Cascading Buffer - * which allows you to stream through the data, keeping some, dropping, scanning, etc... - * The iterator you are passed is lazy, and mapping will not trigger the - * entire evaluation. If you convert to a list (i.e. to reverse), you need to be aware - * that memory constraints may become an issue. + + /** + * Corresponds to a Cascading Buffer which allows you to stream through the data, keeping some, dropping, + * scanning, etc... The iterator you are passed is lazy, and mapping will not trigger the entire evaluation. + * If you convert to a list (i.e. to reverse), you need to be aware that memory constraints may become an + * issue. * - * WARNING: Any fields not referenced by the input fields will be aligned to the first output, - * and the final hadoop stream will have a length of the maximum of the output of this, and - * the input stream. So, if you change the length of your inputs, the other fields won't - * be aligned. YOU NEED TO INCLUDE ALL THE FIELDS YOU WANT TO KEEP ALIGNED IN THIS MAPPING! - * POB: This appears to be a Cascading design decision. + * WARNING: Any fields not referenced by the input fields will be aligned to the first output, and the final + * hadoop stream will have a length of the maximum of the output of this, and the input stream. So, if you + * change the length of your inputs, the other fields won't be aligned. YOU NEED TO INCLUDE ALL THE FIELDS + * YOU WANT TO KEEP ALIGNED IN THIS MAPPING! POB: This appears to be a Cascading design decision. * - * WARNING: mapfn needs to be stateless. Multiple calls needs to be safe (no mutable - * state captured) + * WARNING: mapfn needs to be stateless. 
Multiple calls needs to be safe (no mutable state captured) */ - def mapStream[T,X](fieldDef : (Fields,Fields))(mapfn : (Iterator[T]) => TraversableOnce[X]) - (implicit conv : TupleConverter[T], setter : TupleSetter[X]) : Self + def mapStream[T, X](fieldDef: (Fields, Fields))( + mapfn: (Iterator[T]) => TraversableOnce[X] + )(implicit conv: TupleConverter[T], setter: TupleSetter[X]): Self ///////////////////////////////////////// // All the below functions are implemented in terms of the above @@ -52,44 +48,39 @@ trait StreamOperations[+Self <: StreamOperations[Self]] extends Sortable[Self] w /** * Remove the first cnt elements */ - def drop(cnt : Int) : Self = { - mapStream[CTuple,CTuple](Fields.VALUES -> Fields.ARGS){ s => + def drop(cnt: Int): Self = + mapStream[CTuple, CTuple](Fields.VALUES -> Fields.ARGS) { s => s.drop(cnt) }(TupleConverter.CTupleConverter, TupleSetter.CTupleSetter) - } /** * Drop while the predicate is true, starting at the first false, output all */ - def dropWhile[T](f : Fields)(fn : (T) => Boolean)(implicit conv : TupleConverter[T]) : Self = { - mapStream[TupleEntry,CTuple](f -> Fields.ARGS){ s => - s.dropWhile(te => fn(conv(te))).map { _.getTuple } + def dropWhile[T](f: Fields)(fn: (T) => Boolean)(implicit conv: TupleConverter[T]): Self = + mapStream[TupleEntry, CTuple](f -> Fields.ARGS) { s => + s.dropWhile(te => fn(conv(te))).map(_.getTuple) }(TupleConverter.TupleEntryConverter, TupleSetter.CTupleSetter) - } - def scanLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X) - (implicit setter : TupleSetter[X], conv : TupleConverter[T]) : Self = { - mapStream[T,X](fieldDef){ s => + def scanLeft[X, T]( + fieldDef: (Fields, Fields) + )(init: X)(fn: (X, T) => X)(implicit setter: TupleSetter[X], conv: TupleConverter[T]): Self = + mapStream[T, X](fieldDef) { s => // scala's default is not consistent in 2.8 and 2.9, this standardizes the behavior new ScanLeftIterator(s, init, fn) - }(conv,setter) - } + }(conv, setter) /** * Only keep 
the first cnt elements */ - def take(cnt : Int) : Self = { - mapStream[CTuple,CTuple](Fields.VALUES -> Fields.ARGS){ s => + def take(cnt: Int): Self = + mapStream[CTuple, CTuple](Fields.VALUES -> Fields.ARGS) { s => s.take(cnt) }(TupleConverter.CTupleConverter, TupleSetter.CTupleSetter) - } /** - * Take while the predicate is true, stopping at the - * first false. Output all taken elements. + * Take while the predicate is true, stopping at the first false. Output all taken elements. */ - def takeWhile[T](f : Fields)(fn : (T) => Boolean)(implicit conv : TupleConverter[T]) : Self = { - mapStream[TupleEntry,CTuple](f -> Fields.ARGS){ s => - s.takeWhile(te => fn(conv(te))).map { _.getTuple } + def takeWhile[T](f: Fields)(fn: (T) => Boolean)(implicit conv: TupleConverter[T]): Self = + mapStream[TupleEntry, CTuple](f -> Fields.ARGS) { s => + s.takeWhile(te => fn(conv(te))).map(_.getTuple) }(TupleConverter.TupleEntryConverter, TupleSetter.CTupleSetter) - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala index 32f2450ceb..75075ada1f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TemplateSource.scala @@ -12,28 +12,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.apache.hadoop.mapred.JobConf -import org.apache.hadoop.mapred.RecordReader -import org.apache.hadoop.mapred.OutputCollector - -import cascading.scheme.hadoop.{ TextDelimited => CHTextDelimited } -import cascading.scheme.hadoop.TextLine.Compress -import cascading.scheme.Scheme -import cascading.tap.hadoop.Hfs -import cascading.tap.hadoop.{ TemplateTap => HTemplateTap } +import cascading.tap.hadoop.{TemplateTap => HTemplateTap} import cascading.tap.local.FileTap -import cascading.tap.local.{ TemplateTap => LTemplateTap } +import cascading.tap.local.{TemplateTap => LTemplateTap} import cascading.tap.SinkMode import cascading.tap.Tap import cascading.tuple.Fields /** -* This is a base class for template based output sources -*/ -abstract class TemplateSource extends SchemedSource { + * This is a base class for template based output sources + */ +abstract class TemplateSource extends SchemedSource with HfsTapProvider { // The root path of the templated output. def basePath: String @@ -45,12 +37,15 @@ abstract class TemplateSource extends SchemedSource { /** * Creates the template tap. * - * @param readOrWrite Describes if this source is being read from or written to. - * @param mode The mode of the job. (implicit) + * @param readOrWrite + * Describes if this source is being read from or written to. + * @param mode + * The mode of the job. (implicit) * - * @returns A cascading TemplateTap. + * @return + * A cascading TemplateTap. 
*/ - override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = readOrWrite match { case Read => throw new InvalidSourceException("Cannot use TemplateSource for input") case Write => { @@ -60,67 +55,80 @@ abstract class TemplateSource extends SchemedSource { new LTemplateTap(localTap, template, pathFields) } case hdfsMode @ Hdfs(_, _) => { - val hfsTap = new Hfs(hdfsScheme, basePath, sinkMode) + val hfsTap = createHfsTap(hdfsScheme, basePath, sinkMode) new HTemplateTap(hfsTap, template, pathFields) } case hdfsTest @ HadoopTest(_, _) => { - val hfsTap = new Hfs(hdfsScheme, hdfsTest.getWritePathFor(this), sinkMode) + val hfsTap = createHfsTap(hdfsScheme, hdfsTest.getWritePathFor(this), sinkMode) new HTemplateTap(hfsTap, template, pathFields) } case _ => TestTapFactory(this, hdfsScheme).createTap(readOrWrite) } } } - } /** * Validates the taps, makes sure there are no nulls as the path or template. * - * @param mode The mode of the job. + * @param mode + * The mode of the job. */ - override def validateTaps(mode: Mode): Unit = { + override def validateTaps(mode: Mode): Unit = if (basePath == null) { throw new InvalidSourceException("basePath cannot be null for TemplateTap") } else if (template == null) { throw new InvalidSourceException("template cannot be null for TemplateTap") } - } } /** * An implementation of TSV output, split over a template tap. * - * @param basePath The root path for the output. - * @param template The java formatter style string to use as the template. e.g. %s/%s. - * @param pathFields The set of fields to apply to the path. - * @param writeHeader Flag to indicate that the header should be written to the file. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param template + * The java formatter style string to use as the template. e.g. %s/%s. 
+ * @param pathFields + * The set of fields to apply to the path. + * @param writeHeader + * Flag to indicate that the header should be written to the file. + * @param sinkMode + * How to handle conflicts with existing output. + * @param fields + * The set of fields to apply to the output. */ case class TemplatedTsv( - override val basePath: String, - override val template: String, - override val pathFields: Fields = Fields.ALL, - override val writeHeader: Boolean = false, - override val sinkMode: SinkMode = SinkMode.REPLACE) - extends TemplateSource with DelimitedScheme + override val basePath: String, + override val template: String, + override val pathFields: Fields = Fields.ALL, + override val writeHeader: Boolean = false, + override val sinkMode: SinkMode = SinkMode.REPLACE, + override val fields: Fields = Fields.ALL +) extends TemplateSource + with DelimitedScheme /** * An implementation of SequenceFile output, split over a template tap. * - * @param basePath The root path for the output. - * @param template The java formatter style string to use as the template. e.g. %s/%s. - * @param sequenceFields The set of fields to use for the sequence file. - * @param pathFields The set of fields to apply to the path. - * @param sinkMode How to handle conflicts with existing output. + * @param basePath + * The root path for the output. + * @param template + * The java formatter style string to use as the template. e.g. %s/%s. + * @param sequenceFields + * The set of fields to use for the sequence file. + * @param pathFields + * The set of fields to apply to the path. + * @param sinkMode + * How to handle conflicts with existing output. 
*/ case class TemplatedSequenceFile( - override val basePath: String, - override val template: String, - val sequenceFields: Fields = Fields.ALL, - override val pathFields: Fields = Fields.ALL, - override val sinkMode: SinkMode = SinkMode.REPLACE) - extends TemplateSource with SequenceFileScheme { + override val basePath: String, + override val template: String, + val sequenceFields: Fields = Fields.ALL, + override val pathFields: Fields = Fields.ALL, + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends TemplateSource + with SequenceFileScheme { override val fields = sequenceFields } - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala b/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala index 34685fabd3..f578d303d2 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TestTapFactory.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import com.twitter.maple.tap.MemorySourceTap @@ -20,90 +20,98 @@ import cascading.scheme.Scheme import cascading.tuple.Fields import cascading.tap.SinkMode import cascading.tap.Tap -import cascading.tap.hadoop.Hfs import cascading.scheme.NullScheme - -import java.io.{Serializable, InputStream, OutputStream} - +import com.twitter.scalding.tap.ScaldingHfs +import java.io.{InputStream, OutputStream, Serializable} import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapred.OutputCollector import org.apache.hadoop.mapred.RecordReader - import scala.collection.JavaConverters._ -/** Use this to create Taps for testing. +/** + * Use this to create Taps for testing. 
*/ -object TestTapFactory extends Serializable { +object TestTapFactory extends Serializable { val sourceNotFoundError: String = "Source %s does not appear in your test sources. Make sure " + "each source in your job has a corresponding source in the test sources that is EXACTLY " + - "equal. Call the '.source' or '.sink' methods as appropriate on your JobTest to add test " + - "buffers for each source or sink." + "equal. Call the '.source' method on your JobTest to add test buffers for each source." - def apply(src: Source, fields: Fields, sinkMode: SinkMode = SinkMode.REPLACE): TestTapFactory = new TestTapFactory(src, sinkMode) { - override def sourceFields: Fields = fields - override def sinkFields: Fields = fields - } - def apply[A,B](src: Source, scheme: Scheme[JobConf, RecordReader[_,_], OutputCollector[_,_], A, B]): TestTapFactory = apply(src, scheme, SinkMode.REPLACE) - def apply[A,B](src: Source, - scheme: Scheme[JobConf, RecordReader[_,_], OutputCollector[_,_], A, B], sinkMode: SinkMode): TestTapFactory = - new TestTapFactory(src, sinkMode) { override def hdfsScheme = Some(scheme) } + val sinkNotFoundError: String = "Sink %s does not appear in your test sinks. Make sure " + + "each sink in your job has a corresponding sink in the test sinks that is EXACTLY " + + "equal. Call the '.sink' method on your JobTest to add test buffers for each sink." 
+ + def apply(src: Source, fields: Fields, sinkMode: SinkMode = SinkMode.REPLACE): TestTapFactory = + new TestTapFactory(src, sinkMode) { + override def sourceFields: Fields = fields + override def sinkFields: Fields = fields + } + def apply[A, B]( + src: Source, + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], A, B] + ): TestTapFactory = apply(src, scheme, SinkMode.REPLACE) + def apply[A, B]( + src: Source, + scheme: Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], A, B], + sinkMode: SinkMode + ): TestTapFactory = + new TestTapFactory(src, sinkMode) { override def hdfsScheme = Some(scheme) } } class TestTapFactory(src: Source, sinkMode: SinkMode) extends Serializable { def sourceFields: Fields = - hdfsScheme.map { _.getSourceFields }.getOrElse(sys.error("No sourceFields defined")) + hdfsScheme.map(_.getSourceFields).getOrElse(sys.error("No sourceFields defined")) def sinkFields: Fields = - hdfsScheme.map { _.getSinkFields }.getOrElse(sys.error("No sinkFields defined")) + hdfsScheme.map(_.getSinkFields).getOrElse(sys.error("No sinkFields defined")) - def hdfsScheme: Option[Scheme[JobConf,RecordReader[_,_],OutputCollector[_,_],_,_]] = None + def hdfsScheme: Option[Scheme[JobConf, RecordReader[_, _], OutputCollector[_, _], _, _]] = None - def createTap(readOrWrite : AccessMode)(implicit mode : Mode) : Tap[_,_,_] = { + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = mode match { case Test(buffers) => { /* - * There MUST have already been a registered sink or source in the Test mode. - * to access this. You must explicitly name each of your test sources in your - * JobTest. - */ - require( - buffers(src).isDefined, - TestTapFactory.sourceNotFoundError.format(src) - ) + * There MUST have already been a registered sink or source in the Test mode. + * to access this. You must explicitly name each of your test sources in your + * JobTest. 
+ */ + val errorMsg = readOrWrite match { + case Read => TestTapFactory.sourceNotFoundError + case Write => TestTapFactory.sinkNotFoundError + } + + require(buffers(src).isDefined, errorMsg.format(src)) val buffer = if (readOrWrite == Write) { val buf = buffers(src).get - //Make sure we wipe it out: + // Make sure we wipe it out: buf.clear() buf } else { // if the source is also used as a sink, we don't want its contents to get modified buffers(src).get.clone() } - new MemoryTap[InputStream, OutputStream]( - new NullScheme(sourceFields, sinkFields), - buffer) + new MemoryTap[InputStream, OutputStream](new NullScheme(sourceFields, sinkFields), buffer) } case hdfsTest @ HadoopTest(conf, buffers) => readOrWrite match { case Read => { val bufOpt = buffers(src) - if(bufOpt.isDefined) { + if (bufOpt.isDefined) { val buffer = bufOpt.get val fields = sourceFields - (new MemorySourceTap(buffer.toList.asJava, fields)).asInstanceOf[Tap[JobConf,_,_]] + (new MemorySourceTap(buffer.toList.asJava, fields)).asInstanceOf[Tap[JobConf, _, _]] } else { - CastHfsTap(new Hfs(hdfsScheme.get, hdfsTest.getWritePathFor(src), sinkMode)) + CastHfsTap(new ScaldingHfs(hdfsScheme.get, hdfsTest.getWritePathFor(src), sinkMode)) } } case Write => { val path = hdfsTest.getWritePathFor(src) - CastHfsTap(new Hfs(hdfsScheme.get, path, sinkMode)) + CastHfsTap(new ScaldingHfs(hdfsScheme.get, path, sinkMode)) } } case _ => { throw new RuntimeException("TestTapFactory doesn't support mode: " + mode.toString) } } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala index 3c09803220..0da81b54b5 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TimePathedSource.scala @@ -12,15 +12,12 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.TimeZone import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FileStatus -import org.apache.hadoop.fs.Path -import org.apache.hadoop.mapred.JobConf object TimePathedSource { val YEAR_MONTH_DAY = "/%1$tY/%1$tm/%1$td" @@ -30,40 +27,77 @@ object TimePathedSource { String.format(pattern, date.toCalendar(tz)) def stepSize(pattern: String, tz: TimeZone): Option[Duration] = - List("%1$tH" -> Hours(1), "%1$td" -> Days(1)(tz), - "%1$tm" -> Months(1)(tz), "%1$tY" -> Years(1)(tz)) - .find { unitDur : (String, Duration) => pattern.contains(unitDur._1) } + List("%1$tH" -> Hours(1), "%1$td" -> Days(1)(tz), "%1$tm" -> Months(1)(tz), "%1$tY" -> Years(1)(tz)) + .find { unitDur: (String, Duration) => pattern.contains(unitDur._1) } .map(_._2) + + /** + * Gives all paths in the given daterange with windows based on the provided duration. + */ + def allPathsWithDuration( + pattern: String, + duration: Duration, + dateRange: DateRange, + tz: TimeZone + ): Iterable[String] = + // This method is exhaustive, but too expensive for Cascading's JobConf writing. + dateRange + .each(duration) + .map { dr: DateRange => + toPath(pattern, dr.start, tz) + } + + /** + * Gives all read paths in the given daterange. + */ + def readPathsFor(pattern: String, dateRange: DateRange, tz: TimeZone): Iterable[String] = + TimePathedSource.stepSize(pattern, tz) match { + case Some(duration) => allPathsWithDuration(pattern, duration, dateRange, tz) + case None => sys.error(s"No suitable step size for pattern: $pattern") + } + + /** + * Gives the write path based on daterange end. 
+ */ + def writePathFor(pattern: String, dateRange: DateRange, tz: TimeZone): String = { + assert(pattern != "/*", "Pattern must not be /*") + assert(pattern.takeRight(2) == "/*", "Pattern must end with /* " + pattern) + val stripped = pattern.dropRight(2) + toPath(stripped, dateRange.end, tz) + } } -abstract class TimeSeqPathedSource(val patterns : Seq[String], val dateRange : DateRange, val tz : TimeZone) extends FileSource { +abstract class TimeSeqPathedSource(val patterns: Seq[String], val dateRange: DateRange, val tz: TimeZone) + extends FileSource { override def hdfsPaths = patterns - .flatMap{ pattern: String => + .flatMap { pattern: String => Globifier(pattern)(tz).globify(dateRange) } - protected def allPathsFor(pattern: String): Iterable[String] = + /** + * Override this if you have for instance an hourly pattern but want to run every 6 hours. By default, we + * call TimePathedSource.stepSize(pattern, tz) + */ + protected def defaultDurationFor(pattern: String): Option[Duration] = TimePathedSource.stepSize(pattern, tz) - .map { dur => - // This method is exhaustive, but too expensive for Cascading's JobConf writing. - dateRange.each(dur) - .map { dr: DateRange => - TimePathedSource.toPath(pattern, dr.start, tz) - } - } - .getOrElse(Nil) + + protected def allPathsFor(pattern: String): Iterable[String] = + defaultDurationFor(pattern) match { + case Some(duration) => TimePathedSource.allPathsWithDuration(pattern, duration, dateRange, tz) + case None => sys.error(s"No suitable step size for pattern: $pattern") + } /** These are all the paths we will read for this data completely enumerated */ def allPaths: Iterable[String] = patterns.flatMap(allPathsFor(_)) /** - * Get path statuses based on daterange. This tests each path with pathIsGood - * (which by default checks that there is at least on file in that directory) + * Get path statuses based on daterange. 
This tests each path with pathIsGood (which by default checks that + * there is at least on file in that directory) */ def getPathStatuses(conf: Configuration): Iterable[(String, Boolean)] = - allPaths.map { path => (path, pathIsGood(path, conf)) } + allPaths.map(path => (path, pathIsGood(path, conf))) // Override because we want to check UNGLOBIFIED paths that each are present. override def hdfsReadPathsAreGood(conf: Configuration): Boolean = @@ -75,14 +109,14 @@ abstract class TimeSeqPathedSource(val patterns : Seq[String], val dateRange : D } override def toString = "TimeSeqPathedSource(" + patterns.mkString(",") + - ", " + dateRange + ", " + tz + ")" + ", " + dateRange + ", " + tz + ")" - override def equals(that : Any) = + override def equals(that: Any) = (that != null) && - (this.getClass == that.getClass) && - this.patterns == that.asInstanceOf[TimeSeqPathedSource].patterns && - this.dateRange == that.asInstanceOf[TimeSeqPathedSource].dateRange && - this.tz == that.asInstanceOf[TimeSeqPathedSource].tz + (this.getClass == that.getClass) && + this.patterns == that.asInstanceOf[TimeSeqPathedSource].patterns && + this.dateRange == that.asInstanceOf[TimeSeqPathedSource].dateRange && + this.tz == that.asInstanceOf[TimeSeqPathedSource].tz override def hashCode = patterns.hashCode + 31 * dateRange.hashCode + @@ -90,43 +124,34 @@ abstract class TimeSeqPathedSource(val patterns : Seq[String], val dateRange : D } /** - * This will automatically produce a globbed version of the given path. - * THIS MEANS YOU MUST END WITH A / followed by * to match a file - * For writing, we write to the directory specified by the END time. + * This will automatically produce a globbed version of the given path. THIS MEANS YOU MUST END WITH A / + * followed by * to match a file For writing, we write to the directory specified by the END time. 
*/ -abstract class TimePathedSource(val pattern: String, - dateRange: DateRange, - tz: TimeZone) extends TimeSeqPathedSource(Seq(pattern), dateRange, tz) { - - //Write to the path defined by the end time: - override def hdfsWritePath = { - // TODO this should be required everywhere but works on read without it - // maybe in 0.9.0 be more strict - assert(pattern.takeRight(2) == "/*", "Pattern must end with /* " + pattern) - val lastSlashPos = pattern.lastIndexOf('/') - val stripped = pattern.slice(0, lastSlashPos) - TimePathedSource.toPath(stripped, dateRange.end, tz) - } - override def localPath = pattern +abstract class TimePathedSource(val pattern: String, dateRange: DateRange, tz: TimeZone) + extends TimeSeqPathedSource(Seq(pattern), dateRange, tz) { + + // Write to the path defined by the end time: + override def hdfsWritePath = TimePathedSource.writePathFor(pattern, dateRange, tz) + + override def localPaths = patterns + .flatMap { pattern: String => + Globifier(pattern)(tz).globify(dateRange) + } } /* * A source that contains the most recent existing path in this date range. 
*/ -abstract class MostRecentGoodSource(p : String, dr : DateRange, t : TimeZone) +abstract class MostRecentGoodSource(p: String, dr: DateRange, t: TimeZone) extends TimePathedSource(p, dr, t) { override def toString = "MostRecentGoodSource(" + p + ", " + dr + ", " + t + ")" - override protected def goodHdfsPaths(hdfsMode: Hdfs) = getPathStatuses(hdfsMode.jobConf) - .toList - .reverse + override protected def goodHdfsPaths(hdfsMode: Hdfs) = getPathStatuses(hdfsMode.jobConf).toList.reverse .find(_._2) .map(_._1) override def hdfsReadPathsAreGood(conf: Configuration) = getPathStatuses(conf) .exists(_._2) } - - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala b/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala index cbadb20e72..1a79939e13 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/Tool.scala @@ -12,66 +12,63 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.apache.hadoop -import cascading.tuple.Tuple -import collection.mutable.{ListBuffer, Buffer} +import cascading.flow.hadoop.HadoopFlow +import cascading.flow.planner.BaseFlowStep + +import org.apache.hadoop.conf.Configured +import org.apache.hadoop.mapred.JobConf +import org.apache.hadoop.util.{GenericOptionsParser, Tool => HTool, ToolRunner} + import scala.annotation.tailrec -import scala.util.Try -import java.io.{ BufferedWriter, File, FileOutputStream, OutputStreamWriter } -import java.util.UUID +import scala.collection.JavaConverters._ -class Tool extends hadoop.conf.Configured with hadoop.util.Tool { +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +class Tool extends Configured with HTool { // This mutable state is not my favorite, but we are constrained by the Hadoop API: - var rootJob : Option[(Args) => Job] = None + var rootJob: Option[(Args) => Job] = None // Allows you to set the job for the Tool to run - def setJobConstructor(jobc : (Args) => Job) { - if(rootJob.isDefined) { + def setJobConstructor(jobc: (Args) => Job): Unit = + if (rootJob.isDefined) { sys.error("Job is already defined") - } - else { + } else { rootJob = Some(jobc) } - } - protected def getJob(args : Args) : Job = { - if( rootJob.isDefined ) { - rootJob.get.apply(args) - } - else if(args.positional.isEmpty) { - sys.error("Usage: Tool --local|--hdfs [args...]") - } - else { - val jobName = args.positional(0) + protected def getJob(args: Args): Job = rootJob match { + case Some(job) => job(args) + case None if args.positional.isEmpty => + throw ArgsException("Usage: Tool --local|--hdfs [args...]") + case None => // has at least one arg + val jobName = args.positional.head // Remove the job name from the positional arguments: val nonJobNameArgs = args + ("" -> args.positional.tail) Job(jobName, nonJobNameArgs) - } } // This both updates the jobConf with hadoop arguments // and returns all the non-hadoop arguments. 
Should be called once if // you want to process hadoop arguments (like -libjars). - protected def nonHadoopArgsFrom(args : Array[String]) : Array[String] = { - (new hadoop.util.GenericOptionsParser(getConf, args)).getRemainingArgs - } + protected def nonHadoopArgsFrom(args: Array[String]): Array[String] = + (new GenericOptionsParser(getConf, args)).getRemainingArgs - def parseModeArgs(args : Array[String]) : (Mode, Args) = { + def parseModeArgs(args: Array[String]): (Mode, Args) = { val a = Args(nonHadoopArgsFrom(args)) (Mode(a, getConf), a) } // Parse the hadoop args, and if job has not been set, instantiate the job - def run(args : Array[String]) : Int = { + def run(args: Array[String]): Int = { val (mode, jobArgs) = parseModeArgs(args) // Connect mode with job Args run(getJob(Mode.putMode(mode, jobArgs))) } - protected def run(job : Job) : Int = { + protected def run(job: Job): Int = { val onlyPrintGraph = job.args.boolean("tool.graph") if (onlyPrintGraph) { @@ -80,72 +77,82 @@ class Tool extends hadoop.conf.Configured with hadoop.util.Tool { } /* - * This is a tail recursive loop that runs all the - * jobs spawned from this one - */ + * This is a tail recursive loop that runs all the + * jobs spawned from this one + */ val jobName = job.getClass.getName @tailrec - def start(j : Job, cnt : Int) { + def start(j: Job, cnt: Int): Unit = { val successful = if (onlyPrintGraph) { val flow = j.buildFlow /* - * This just writes out the graph representing - * all the cascading elements that are created for this - * flow. Use graphviz to render it as a PDF. - * The job is NOT run in this case. - */ + * This just writes out the graph representing + * all the cascading elements that are created for this + * flow. Use graphviz to render it as a PDF. + * The job is NOT run in this case. 
+ */ val thisDot = jobName + cnt + ".dot" println("writing DOT: " + thisDot) + + /* We add descriptions if they exist to the stepName so it appears in the .dot file */ + flow match { + case hadoopFlow: HadoopFlow => + val flowSteps = hadoopFlow.getFlowSteps.asScala + flowSteps.foreach { step => + val baseFlowStep: BaseFlowStep[JobConf] = step.asInstanceOf[BaseFlowStep[JobConf]] + val descriptions = baseFlowStep.getConfig.get(Config.StepDescriptions, "") + if (!descriptions.isEmpty) { + val stepXofYData = """\(\d+/\d+\)""".r.findFirstIn(baseFlowStep.getName).getOrElse("") + // Reflection is only temporary. Latest cascading has setName public: https://github.com/cwensel/cascading/commit/487a6e9ef#diff-0feab84bc8832b2a39312dbd208e3e69L175 + // https://github.com/twitter/scalding/issues/1294 + val x = classOf[BaseFlowStep[JobConf]].getDeclaredMethod("setName", classOf[String]) + x.setAccessible(true) + x.invoke(step, "%s %s".format(stepXofYData, descriptions)) + } + } + case _ => // descriptions not yet supported in other modes + } + flow.writeDOT(thisDot) val thisStepsDot = jobName + cnt + "_steps.dot" println("writing Steps DOT: " + thisStepsDot) flow.writeStepsDOT(thisStepsDot) true + } else { + j.validate() + j.run() } - else { - j.validate - j.run - } - j.clear - //When we get here, the job is finished - if(successful) { - j.next match { + j.clear() + // When we get here, the job is finished + if (successful) { + // we need to use match not foreach to get tail recursion + j.next match { // linter:disable:UseOptionForeachNotPatMatch case Some(nextj) => start(nextj, cnt + 1) - case None => Unit + case None => () } } else { - throw new RuntimeException("Job failed to run: " + jobName + - (if(cnt > 0) { " child: " + cnt.toString + ", class: " + j.getClass.getName } - else { "" }) + throw new RuntimeException( + "Job failed to run: " + jobName + + (if (cnt > 0) { " child: " + cnt.toString + ", class: " + j.getClass.getName } + else { "" }) ) } } - //start a counter to 
see how deep we recurse: + // start a counter to see how deep we recurse: start(job, 0) 0 } } object Tool { - def main(args: Array[String]) { + def main(args: Array[String]): Unit = try { - hadoop.util.ToolRunner.run(new hadoop.mapred.JobConf, new Tool, args) + ToolRunner.run(new JobConf, new Tool, ExpandLibJarsGlobs(args)) } catch { case t: Throwable => { - //create the exception URL link in GitHub wiki - val gitHubLink = RichXHandler.createXUrl(t) - val extraInfo = (if(RichXHandler().handlers.exists(h => h(t))) { - RichXHandler.mapping(t.getClass) + "\n" - } - else { - "" - }) + - "If you know what exactly caused this error, please consider contributing to GitHub via following link.\n" + gitHubLink - - //re-throw the exception with extra info - throw new Throwable(extraInfo, t) + // re-throw the exception with extra info + throw new Throwable(RichXHandler(t), t) } } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala b/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala new file mode 100644 index 0000000000..978f1eda51 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/Tracing.scala @@ -0,0 +1,87 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.scalding + +import java.lang.reflect.InvocationTargetException + +import org.slf4j.{Logger, LoggerFactory => LogManager} + +/** + * Calling init registers "com.twitter.scalding" as a "tracing boundary" for Cascading. 
That means that when + * Cascading sends trace information to a DocumentService such as Driven, the trace will have information + * about the caller of Scalding instead of about the internals of Scalding. com.twitter.scalding.Job and its + * subclasses will automatically initialize Tracing. + * + * register and unregister methods are provided for testing, but should not be needed for most development + */ +object Tracing { + private val LOG: Logger = LogManager.getLogger(this.getClass) + + // TODO: remove this once we no longer want backwards compatibility + // with cascading versions pre 2.6 + private val traceUtilClassName = "cascading.util.TraceUtil" + + /** + * Put a barrier at com.twitter.scalding, but exclude things like Tool that are common entry points for + * calling user code + */ + private val defaultRegex = """^com\.twitter\.scalding\.(?!Tool|Job|ExecutionContext).*""" + + register() + + /** + * Forces the initialization of the Tracing object which in turn causes the one time registration of + * "com.twitter.scalding" as a tracing boundary in Cascading + */ + def init(): Unit = { /* do nothing */ } + + /** + * Explicitly registers "com.twitter.scalding" as a Cascading tracing boundary. Normally not needed, but may + * be useful after a call to unregister() + */ + def register(regex: String = defaultRegex) = + invokeStaticMethod(traceUtilClassName, "registerApiBoundary", regex) + + /** + * Unregisters "com.twitter.scalding" as a Cascading tracing bounardy. After calling this, Cascading + * DocumentServices such as Driven will show nodes as being created by Scalding class such as RichPipe + * instead of end user written code. 
This should normally not be called but can be useful in testing the + * development of Scalding internals + */ + def unregister(regex: String = defaultRegex) = + invokeStaticMethod(traceUtilClassName, "unregisterApiBoundary", regex) + + /** + * Use reflection to register/unregister tracing boundaries so that cascading versions prior to 2.6 can be + * used without completely breaking + */ + private def invokeStaticMethod(clazz: String, methodName: String, args: AnyRef*): Unit = + try { + val argTypes = args.map(_.getClass()) + Class.forName(clazz).getMethod(methodName, argTypes: _*).invoke(null, args: _*) + } catch { + case e @ (_: NoSuchMethodException | _: SecurityException | _: IllegalAccessException | + _: IllegalArgumentException | _: InvocationTargetException | _: NullPointerException | + _: ClassNotFoundException) => + LOG.warn( + "There was an error initializing tracing. " + + "Tracing information in DocumentServices such as Driven may point to Scalding code instead of " + + "user code. The most likely cause is a mismatch in Cascading library version. Upgrading the " + + "Cascading library to at least 2.6 should fix this issue.The cause was [" + e + "]" + ) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala index b5970c8ffd..e7d11a3ace 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleArity.scala @@ -12,35 +12,36 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields /** -* Mixed in to both TupleConverter and TupleSetter to improve arity safety -* of cascading jobs before we run anything on Hadoop. 
-*/ + * Mixed in to both TupleConverter and TupleSetter to improve arity safety of cascading jobs before we run + * anything on Hadoop. + */ trait TupleArity { + /** - * Return the arity of product types, should probably only be used implicitly - * The use case here is to see how many fake field names we need in Cascading - * to hold an intermediate value for mapReduceMap - */ - def arity : Int + * Return the arity of product types, should probably only be used implicitly The use case here is to see + * how many fake field names we need in Cascading to hold an intermediate value for mapReduceMap + */ + def arity: Int /** - * assert that the arity of this setter matches the fields given. - * if arity == -1, we can't check, and if Fields is not a definite - * size, (such as Fields.ALL), we also cannot check, so this should - * only be considered a weak check. - */ - def assertArityMatches(f : Fields) { - //Fields.size == 0 for the indefinite Fields: ALL, GROUP, VALUES, UNKNOWN, etc.. - if(f.size > 0 && arity >= 0) { - assert(arity == f.size, "Arity of (" + super.getClass + ") is " - + arity + ", which doesn't match: + (" + f.toString + ")") + * assert that the arity of this setter matches the fields given. if arity == -1, we can't check, and if + * Fields is not a definite size, (such as Fields.ALL), we also cannot check, so this should only be + * considered a weak check. + */ + def assertArityMatches(f: Fields): Unit = + // Fields.size == 0 for the indefinite Fields: ALL, GROUP, VALUES, UNKNOWN, etc.. 
+ if (f.size > 0 && arity >= 0) { + assert( + arity == f.size, + "Arity of (" + super.getClass + ") is " + + arity + ", which doesn't match: + (" + f.toString + ")" + ) } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala index 8e8a47d638..600bb51a4a 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleConversions.scala @@ -12,17 +12,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tuple.TupleEntry -import cascading.tuple.TupleEntryIterator -import cascading.tuple.{Tuple => CTuple} -import cascading.tuple.Tuples - -import java.io.Serializable - -import scala.collection.JavaConverters._ - @deprecated("This trait does nothing now", "0.9.0") trait TupleConversions diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala index 1a4370f03c..da6ff34af0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleConverter.scala @@ -12,103 +12,117 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.TupleEntry import cascading.tuple.{Tuple => CTuple} - +import com.twitter.scalding.serialization.Externalizer import scala.collection.breakOut -/** Typeclass to represent converting from cascading TupleEntry to some type T. - * The most common application is to convert to scala Tuple objects for use - * with the Fields API. The typed API internally manually handles its mapping - * to cascading Tuples, so the implicit resolution mechanism is not used. +/** + * Typeclass to represent converting from cascading TupleEntry to some type T. The most common application is + * to convert to scala Tuple objects for use with the Fields API. The typed API internally manually handles + * its mapping to cascading Tuples, so the implicit resolution mechanism is not used. * - * WARNING: if you are seeing issues with the singleConverter being found when you - * expect something else, you may have an issue where the enclosing scope needs to - * take an implicit TupleConverter of the correct type. + * WARNING: if you are seeing issues with the singleConverter being found when you expect something else, you + * may have an issue where the enclosing scope needs to take an implicit TupleConverter of the correct type. * - * Unfortunately, the semantics we want (prefer to flatten tuples, but otherwise - * put everything into one postition in the tuple) are somewhat difficlut to - * encode in scala. + * Unfortunately, the semantics we want (prefer to flatten tuples, but otherwise put everything into one + * postition in the tuple) are somewhat difficlut to encode in scala. 
*/ -trait TupleConverter[@specialized(Int,Long,Float,Double)T] extends java.io.Serializable with TupleArity { self => - def apply(te : TupleEntry) : T - def andThen[U](fn: T => U): TupleConverter[U] = new TupleConverter[U] { - def apply(te: TupleEntry) = fn(self(te)) - def arity = self.arity - } +trait TupleConverter[@specialized(Int, Long, Float, Double) T] extends java.io.Serializable with TupleArity { + self => + def apply(te: TupleEntry): T + def andThen[U](fn: T => U): TupleConverter[U] = + TupleConverter.AndThen(this, fn) } trait LowPriorityTupleConverters extends java.io.Serializable { - implicit def singleConverter[@specialized(Int,Long,Float,Double)A](implicit g : TupleGetter[A]) = - new TupleConverter[A] { - def apply(tup : TupleEntry) = g.get(tup.getTuple, 0) - def arity = 1 - } + implicit def singleConverter[@specialized(Int, Long, Float, Double) A](implicit + g: TupleGetter[A] + ): TupleConverter[A] = + TupleConverter.Single[A](g) } object TupleConverter extends GeneratedTupleConverters { - /** Treat this TupleConverter as one for a superclass - * We do this because we want to use implicit resolution invariantly, - * but clearly, the operation is covariant - */ - def asSuperConverter[T,U>:T](tc: TupleConverter[T]): TupleConverter[U] = tc.asInstanceOf[TupleConverter[U]] + final case class Single[@specialized(Int, Long, Float, Double) A](getter: TupleGetter[A]) + extends TupleConverter[A] { + def apply(tup: TupleEntry): A = getter.get(tup.getTuple, 0) + def arity = 1 + } + + final case class AndThen[A, B](first: TupleConverter[A], @transient fn: A => B) extends TupleConverter[B] { + private val lockedFn = Externalizer(fn) - def build[T](thisArity: Int)(fn: TupleEntry => T): TupleConverter[T] = new TupleConverter[T] { - def apply(te: TupleEntry) = fn(te) - def arity = thisArity + def apply(te: TupleEntry) = lockedFn.get(first(te)) + def arity: Int = first.arity } + + final case class FromFn[A](@transient fn: TupleEntry => A, arity: Int) extends 
TupleConverter[A] { + private val lockedFn = Externalizer(fn) + + def apply(te: TupleEntry) = lockedFn.get(te) + } + + /** + * Treat this TupleConverter as one for a superclass We do this because we want to use implicit resolution + * invariantly, but clearly, the operation is covariant + */ + def asSuperConverter[T, U >: T](tc: TupleConverter[T]): TupleConverter[U] = + tc.asInstanceOf[TupleConverter[U]] + + def build[T](thisArity: Int)(fn: TupleEntry => T): TupleConverter[T] = FromFn(fn, thisArity) def fromTupleEntry[T](t: TupleEntry)(implicit tc: TupleConverter[T]): T = tc(t) def arity[T](implicit tc: TupleConverter[T]): Int = tc.arity def of[T](implicit tc: TupleConverter[T]): TupleConverter[T] = tc - /** Copies the tupleEntry, since cascading may change it after the end of an - * operation (and it is not safe to assume the consumer has not kept a ref - * to this tuple) + /** + * Copies the tupleEntry, since cascading may change it after the end of an operation (and it is not safe to + * assume the consumer has not kept a ref to this tuple) */ implicit lazy val TupleEntryConverter: TupleConverter[TupleEntry] = new TupleConverter[TupleEntry] { - override def apply(tup : TupleEntry) = new TupleEntry(tup) + override def apply(tup: TupleEntry) = new TupleEntry(tup) override def arity = -1 } - /** Copies the tuple, since cascading may change it after the end of an - * operation (and it is not safe to assume the consumer has not kept a ref - * to this tuple + /** + * Copies the tuple, since cascading may change it after the end of an operation (and it is not safe to + * assume the consumer has not kept a ref to this tuple */ implicit lazy val CTupleConverter: TupleConverter[CTuple] = new TupleConverter[CTuple] { - override def apply(tup : TupleEntry) = tup.getTupleCopy + override def apply(tup: TupleEntry) = tup.getTupleCopy override def arity = -1 } - - /** In the case where you don't know the arity, prefer to use this. 
+ /** + * In the case where you don't know the arity, prefer to use this. */ implicit lazy val ProductTupleConverter: TupleConverter[Product] = new TupleConverter[Product] { def wrap(tup: CTuple): Product = new Product { def canEqual(that: Any) = that match { case p: Product => true - case _ => false + case _ => false } def productArity = tup.size def productElement(idx: Int) = tup.getObject(idx) } - override def apply(tup : TupleEntry) = wrap(tup.getTupleCopy) + override def apply(tup: TupleEntry) = wrap(tup.getTupleCopy) override def arity = -1 } implicit lazy val UnitConverter: TupleConverter[Unit] = new TupleConverter[Unit] { - override def apply(arg : TupleEntry) = () + override def apply(arg: TupleEntry) = () override def arity = 0 } // Doesn't seem safe to make these implicit by default: - /** Convert a TupleEntry to a List of CTuple, of length 2, with key, value - * from the TupleEntry (useful for RichPipe.unpivot) + /** + * Convert a TupleEntry to a List of CTuple, of length 2, with key, value from the TupleEntry (useful for + * RichPipe.unpivot) */ object KeyValueList extends TupleConverter[List[CTuple]] { - def apply(tupe : TupleEntry): List[CTuple] = { + def apply(tupe: TupleEntry): List[CTuple] = { val keys = tupe.getFields (0 until keys.size).map { idx => new CTuple(keys.get(idx).asInstanceOf[Object], tupe.getObject(idx)) @@ -127,7 +141,8 @@ object TupleConverter extends GeneratedTupleConverters { def arity = -1 } - /** Utility to create a single item Tuple + /** + * Utility to create a single item Tuple */ def tupleAt(idx: Int)(tup: CTuple): CTuple = { val obj = tup.getObject(idx) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala index 252346485a..508f20f524 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleGetter.scala @@ -12,61 +12,62 @@ distributed under the 
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.{Tuple => CTuple} -/** Typeclass roughly equivalent to a Lens, which allows getting items out of a tuple. - * This is useful because cascading has type coercion (string to int, for instance) that - * users expect in the fields API. This code is not used in the typesafe API, which - * does not allow suc silent coercion. - * See the generated TupleConverters for an example of where this is used +/** + * Typeclass roughly equivalent to a Lens, which allows getting items out of a tuple. This is useful because + * cascading has type coercion (string to int, for instance) that users expect in the fields API. This code is + * not used in the typesafe API, which does not allow suc silent coercion. See the generated TupleConverters + * for an example of where this is used */ -trait TupleGetter[@specialized(Int,Long,Float,Double)T] extends java.io.Serializable { +trait TupleGetter[@specialized(Int, Long, Float, Double) T] extends java.io.Serializable { def get(tup: CTuple, i: Int): T } trait LowPriorityTupleGetter extends java.io.Serializable { - implicit def castingGetter[T]: TupleGetter[T] = new TupleGetter[T] { - def get(tup: CTuple, i: Int) = tup.getObject(i).asInstanceOf[T] - } + implicit def castingGetter[T]: TupleGetter[T] = TupleGetter.Casting() } object TupleGetter extends LowPriorityTupleGetter { + case class Casting[A]() extends TupleGetter[A] { + def get(tup: CTuple, i: Int) = tup.getObject(i).asInstanceOf[A] + } def get[T](tup: CTuple, i: Int)(implicit tg: TupleGetter[T]): T = tg.get(tup, i) def of[T](implicit tg: TupleGetter[T]): TupleGetter[T] = tg implicit object UnitGetter extends TupleGetter[Unit] { - override def get(tup : CTuple, i : Int) = () + override def get(tup: CTuple, i: Int) 
= () } implicit object BooleanGetter extends TupleGetter[Boolean] { - override def get(tup : CTuple, i : Int) = tup.getBoolean(i) + override def get(tup: CTuple, i: Int) = tup.getBoolean(i) } implicit object ShortGetter extends TupleGetter[Short] { - override def get(tup : CTuple, i : Int) = tup.getShort(i) + override def get(tup: CTuple, i: Int) = tup.getShort(i) } implicit object IntGetter extends TupleGetter[Int] { - override def get(tup : CTuple, i : Int) = tup.getInteger(i) + override def get(tup: CTuple, i: Int) = tup.getInteger(i) } implicit object LongGetter extends TupleGetter[Long] { - override def get(tup : CTuple, i : Int) = tup.getLong(i) + override def get(tup: CTuple, i: Int) = tup.getLong(i) } implicit object FloatGetter extends TupleGetter[Float] { - override def get(tup : CTuple, i : Int) = tup.getFloat(i) + override def get(tup: CTuple, i: Int) = tup.getFloat(i) } implicit object DoubleGetter extends TupleGetter[Double] { - override def get(tup : CTuple, i : Int) = tup.getDouble(i) + override def get(tup: CTuple, i: Int) = tup.getDouble(i) } implicit object StringGetter extends TupleGetter[String] { - override def get(tup : CTuple, i : Int) = tup.getString(i) + override def get(tup: CTuple, i: Int) = tup.getString(i) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala b/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala index 904bd071f6..5111b0ab04 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TuplePacker.scala @@ -12,11 +12,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import cascading.pipe._ -import cascading.pipe.joiner._ import cascading.tuple._ import java.lang.reflect.Method @@ -24,101 +22,110 @@ import java.lang.reflect.Constructor import scala.reflect.Manifest -/** Typeclass for packing a cascading Tuple into some type T, - * this is used to put fields of a cascading tuple into Thrift, Protobuf, - * or case classes, for instance, but you can add your own instances to control - * how this is done. - * - * @author Argyris Zymnis - * @author Oscar Boykin - */ +/** + * Typeclass for packing a cascading Tuple into some type T, this is used to put fields of a cascading tuple + * into Thrift, Protobuf, or case classes, for instance, but you can add your own instances to control how + * this is done. + * + * @author + * Argyris Zymnis + * @author + * Oscar Boykin + */ trait TuplePacker[T] extends java.io.Serializable { - def newConverter(fields : Fields) : TupleConverter[T] + def newConverter(fields: Fields): TupleConverter[T] } object TuplePacker extends CaseClassPackers trait CaseClassPackers extends LowPriorityTuplePackers { - implicit def caseClassPacker[T <: Product](implicit mf : Manifest[T]) = new OrderedTuplePacker[T] + implicit def caseClassPacker[T <: Product](implicit mf: Manifest[T]): OrderedTuplePacker[T] = + new OrderedTuplePacker[T] } trait LowPriorityTuplePackers extends java.io.Serializable { - implicit def genericTuplePacker[T : Manifest] = new ReflectionTuplePacker[T] + implicit def genericTuplePacker[T: Manifest]: ReflectionTuplePacker[T] = new ReflectionTuplePacker[T] } -/** Packs a tuple into any object with set methods, e.g. thrift or proto objects. - * TODO: verify that protobuf setters for field camel_name are of the form setCamelName. - * In that case this code works for proto. 
- * - * @author Argyris Zymnis - * @author Oscar Boykin - */ -class ReflectionTuplePacker[T](implicit m : Manifest[T]) extends TuplePacker[T] { - override def newConverter(fields : Fields) = new ReflectionTupleConverter[T](fields)(m) +/** + * Packs a tuple into any object with set methods, e.g. thrift or proto objects. TODO: verify that protobuf + * setters for field camel_name are of the form setCamelName. In that case this code works for proto. + * + * @author + * Argyris Zymnis + * @author + * Oscar Boykin + */ +class ReflectionTuplePacker[T](implicit m: Manifest[T]) extends TuplePacker[T] { + override def newConverter(fields: Fields) = new ReflectionTupleConverter[T](fields)(m) } -class ReflectionTupleConverter[T](fields : Fields)(implicit m : Manifest[T]) extends TupleConverter[T] { +class ReflectionTupleConverter[T](fields: Fields)(implicit m: Manifest[T]) extends TupleConverter[T] { override val arity = fields.size - def lowerFirst(s : String) = s.substring(0,1).toLowerCase + s.substring(1) + def lowerFirst(s: String) = s.substring(0, 1).toLowerCase + s.substring(1) // Cut out "set" and lower case the first after - def setterToFieldName(setter : Method) = lowerFirst(setter.getName.substring(3)) + def setterToFieldName(setter: Method) = lowerFirst(setter.getName.substring(3)) + + /* The `_.get` is safe because of the `_.isEmpty` check. ScalaTest does not + * seem to support a more type safe way of doing this. + */ + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def validate(): Unit = { + // We can't touch setters because that shouldn't be accessed until map/reduce side, not + // on submitter. + val missing = Dsl.asList(fields).find(f => !getSetters.contains(f.toString)) - def validate { - //We can't touch setters because that shouldn't be accessed until map/reduce side, not - //on submitter. 
- val missing = Dsl.asList(fields).filter { f => !getSetters.contains(f.toString) }.headOption assert(missing.isEmpty, "Field: " + missing.get.toString + " not in setters") } - validate + validate() - def getSetters = m.erasure - .getDeclaredMethods - .filter { _.getName.startsWith("set") } - .groupBy { setterToFieldName(_) } - .mapValues { _.head } + def getSetters = m.runtimeClass.getDeclaredMethods + .filter(_.getName.startsWith("set")) + .groupBy(setterToFieldName(_)) + .mapValues(_.head) // Do all the reflection for the setters we need: // This needs to be lazy because Method is not serializable // TODO: filter by isAccessible, which somehow seems to fail lazy val setters = getSetters - override def apply(input : TupleEntry) : T = { - val newInst = m.erasure.newInstance() + override def apply(input: TupleEntry): T = { + val newInst = m.runtimeClass.newInstance().asInstanceOf[T] val fields = input.getFields (0 until fields.size).map { idx => val thisField = fields.get(idx) val setMethod = setters(thisField.toString) setMethod.invoke(newInst, input.getObject(thisField)) } - newInst.asInstanceOf[T] + newInst } } /** * This just blindly uses the first public constructor with the same arity as the fields size */ -class OrderedTuplePacker[T](implicit m : Manifest[T]) extends TuplePacker[T] { - override def newConverter(fields : Fields) = new OrderedConstructorConverter[T](fields)(m) +class OrderedTuplePacker[T](implicit m: Manifest[T]) extends TuplePacker[T] { + override def newConverter(fields: Fields) = new OrderedConstructorConverter[T](fields)(m) } -class OrderedConstructorConverter[T](fields : Fields)(implicit mf : Manifest[T]) extends TupleConverter[T] { +class OrderedConstructorConverter[T](fields: Fields)(implicit mf: Manifest[T]) extends TupleConverter[T] { override val arity = fields.size // Keep this as a method, so we can validate by calling, but don't serialize it, and keep it lazy // below - def getConstructor = mf.erasure - .getConstructors - .filter 
{ _.getParameterTypes.size == fields.size } - .head.asInstanceOf[Constructor[T]] + def getConstructor = mf.runtimeClass.getConstructors + .filter(_.getParameterTypes.size == fields.size) + .head + .asInstanceOf[Constructor[T]] - //Make sure we can actually get a constructor: + // Make sure we can actually get a constructor: getConstructor lazy val cons = getConstructor - override def apply(input : TupleEntry) : T = { + override def apply(input: TupleEntry): T = { val tup = input.getTuple - val args = (0 until tup.size).map { tup.getObject(_) } - cons.newInstance(args : _*) + val args = (0 until tup.size).map(tup.getObject(_)) + cons.newInstance(args: _*) } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala index a9895f4815..fed3a690c7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleSetter.scala @@ -12,60 +12,71 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.{Tuple => CTuple} -/** Typeclass to represent converting back to (setting into) a cascading Tuple - * This looks like it can be contravariant, but it can't because of our approach - * of falling back to the singleSetter, you really want the most specific setter - * you can get. Put more directly: a TupleSetter[Any] is not just as good as TupleSetter[(Int, Int)] - * from the scalding DSL's point of view. The latter will flatten the (Int, Int), but the former - * won't. 
+/** + * Typeclass to represent converting back to (setting into) a cascading Tuple This looks like it can be + * contravariant, but it can't because of our approach of falling back to the singleSetter, you really want + * the most specific setter you can get. Put more directly: a TupleSetter[Any] is not just as good as + * TupleSetter[(Int, Int)] from the scalding DSL's point of view. The latter will flatten the (Int, Int), but + * the former won't. */ -trait TupleSetter[T] extends java.io.Serializable with TupleArity { - def apply(arg : T) : CTuple +trait TupleSetter[T] extends java.io.Serializable with TupleArity { self => + def apply(arg: T): CTuple + + def contraMap[U](fn: U => T): TupleSetter[U] = + TupleSetter.ContraMap(this, fn) } trait LowPriorityTupleSetters extends java.io.Serializable { - /** If it is not a scala Tuple, and not any defined in the object TupleSetter - * we just assume it is a single entry in the tuple - * For some reason, putting a val TupleSetter[Any] here messes up implicit resolution + + /** + * If it is not a scala Tuple, and not any defined in the object TupleSetter we just assume it is a single + * entry in the tuple For some reason, putting a val TupleSetter[Any] here messes up implicit resolution */ - implicit def singleSetter[A]: TupleSetter[A] = new TupleSetter[A] { - override def apply(arg : A) = { + implicit def singleSetter[A]: TupleSetter[A] = TupleSetter.Single[A]() +} + +object TupleSetter extends GeneratedTupleSetters { + + case class ContraMap[A, B](second: TupleSetter[B], fn: A => B) extends TupleSetter[A] { + def apply(arg: A) = second.apply(fn(arg)) + def arity = second.arity + } + + case class Single[A]() extends TupleSetter[A] { + override def apply(arg: A) = { val tup = CTuple.size(1) tup.set(0, arg) tup } override def arity = 1 } -} - -object TupleSetter extends GeneratedTupleSetters { - /** Treat this TupleSetter as one for a subclass - * We do this because we want to use implicit resolution invariantly, - * but 
clearly, the operation is contravariant + /** + * Treat this TupleSetter as one for a subclass We do this because we want to use implicit resolution + * invariantly, but clearly, the operation is contravariant */ - def asSubSetter[T,U<:T](ts: TupleSetter[T]): TupleSetter[U] = ts.asInstanceOf[TupleSetter[U]] + def asSubSetter[T, U <: T](ts: TupleSetter[T]): TupleSetter[U] = ts.asInstanceOf[TupleSetter[U]] def toCTuple[T](t: T)(implicit ts: TupleSetter[T]): CTuple = ts(t) def arity[T](implicit ts: TupleSetter[T]): Int = ts.arity def of[T](implicit ts: TupleSetter[T]): TupleSetter[T] = ts - //This is here for handling functions that return cascading tuples: + // This is here for handling functions that return cascading tuples: implicit lazy val CTupleSetter: TupleSetter[CTuple] = new TupleSetter[CTuple] { - override def apply(arg : CTuple) = new CTuple(arg) - //We return an invalid value here, so we must check returns + override def apply(arg: CTuple) = new CTuple(arg) + // We return an invalid value here, so we must check returns override def arity = -1 } - //Unit is like a Tuple0. It corresponds to Tuple.NULL + // Unit is like a Tuple0. It corresponds to Tuple.NULL implicit lazy val UnitSetter: TupleSetter[Unit] = new TupleSetter[Unit] { - override def apply(arg : Unit) = CTuple.NULL + override def apply(arg: Unit) = CTuple.NULL override def arity = 0 } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala b/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala index ca6d1405ce..58660946a0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TupleUnpacker.scala @@ -12,88 +12,80 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import cascading.pipe._ -import cascading.pipe.joiner._ import cascading.tuple._ import scala.reflect.Manifest -import scala.collection.JavaConverters._ - -/** Typeclass for objects which unpack an object into a tuple. - * The packer can verify the arity, types, and also the existence - * of the getter methods at plan time, without having the job - * blow up in the middle of a run. - * - * @author Argyris Zymnis - * @author Oscar Boykin - */ + +/** + * Typeclass for objects which unpack an object into a tuple. The packer can verify the arity, types, and also + * the existence of the getter methods at plan time, without having the job blow up in the middle of a run. + * + * @author + * Argyris Zymnis + * @author + * Oscar Boykin + */ object TupleUnpacker extends LowPriorityTupleUnpackers trait TupleUnpacker[T] extends java.io.Serializable { - def newSetter(fields : Fields) : TupleSetter[T] - def getResultFields(fields : Fields) : Fields = fields + def newSetter(fields: Fields): TupleSetter[T] + def getResultFields(fields: Fields): Fields = fields } trait LowPriorityTupleUnpackers { - implicit def genericUnpacker[T : Manifest] = new ReflectionTupleUnpacker[T] + implicit def genericUnpacker[T: Manifest]: ReflectionTupleUnpacker[T] = new ReflectionTupleUnpacker[T] } /** - * A helper for working with class reflection. - * Allows us to avoid code repetition. - */ + * A helper for working with class reflection. Allows us to avoid code repetition. + */ object ReflectionUtils { /** - * Returns the set of fields in the given class. - * We use a List to ensure fields are in the same - * order they were declared. + * Returns the set of fields in the given class. We use a List to ensure fields are in the same order they + * were declared. 
*/ def fieldsOf[T](c: Class[T]): List[String] = c.getDeclaredFields - .map { f => f.getName } - .toList - .distinct + .map(f => f.getName) + .toList + .distinct /** - * For a given class, give a function that takes - * a T, and a fieldname and returns the values. + * For a given class, give a function that takes a T, and a fieldname and returns the values. */ // def fieldGetters[T](c: Class[T]): (T,String) => AnyRef /** - * For a given class, give a function of T, fieldName, - * fieldValue that returns a new T (possibly a copy, - * if T is immutable). + * For a given class, give a function of T, fieldName, fieldValue that returns a new T (possibly a copy, if + * T is immutable). */ // def fieldSetters[T](c: Class[T]): (T,String,AnyRef) => T } -class ReflectionTupleUnpacker[T](implicit m : Manifest[T]) extends TupleUnpacker[T] { +class ReflectionTupleUnpacker[T](implicit m: Manifest[T]) extends TupleUnpacker[T] { // A Fields object representing all of m's // fields, in the declared field order. // Lazy because we need this twice or not at all. - lazy val allFields = new Fields(ReflectionUtils.fieldsOf(m.erasure).toSeq : _*) + lazy val allFields = new Fields(ReflectionUtils.fieldsOf(m.runtimeClass).toSeq: _*) /** - * A helper to check the passed-in - * fields to see if Fields.ALL is set. - * If it is, return lazy allFields. + * A helper to check the passed-in fields to see if Fields.ALL is set. If it is, return lazy allFields. 
*/ - def expandIfAll(fields : Fields) = + def expandIfAll(fields: Fields) = if (fields.isAll) allFields else fields - override def newSetter(fields : Fields) = + override def newSetter(fields: Fields) = new ReflectionSetter[T](expandIfAll(fields))(m) - override def getResultFields(fields : Fields) : Fields = + override def getResultFields(fields: Fields): Fields = expandIfAll(fields) } -class ReflectionSetter[T](fields : Fields)(implicit m : Manifest[T]) extends TupleSetter[T] { +class ReflectionSetter[T](fields: Fields)(implicit m: Manifest[T]) extends TupleSetter[T] { validate // Call the validation method at the submitter @@ -104,56 +96,52 @@ class ReflectionSetter[T](fields : Fields)(implicit m : Manifest[T]) extends Tup // Methods and Fields are not serializable so we // make these defs instead of vals // TODO: filter by isAccessible, which somehow seems to fail - def methodMap = m.erasure - .getDeclaredMethods + def methodMap = m.runtimeClass.getDeclaredMethods // Keep only methods with 0 parameter types - .filter { m => m.getParameterTypes.length == 0 } - .groupBy { _.getName } - .mapValues { _.head } + .filter(m => m.getParameterTypes.length == 0) + .groupBy(_.getName) + .mapValues(_.head) // TODO: filter by isAccessible, which somehow seems to fail - def fieldMap = m.erasure - .getDeclaredFields - .groupBy { _.getName } - .mapValues { _.head } + def fieldMap = m.runtimeClass.getDeclaredFields + .groupBy(_.getName) + .mapValues(_.head) - def makeSetters = { + def makeSetters = (0 until fields.size).map { idx => val fieldName = fields.get(idx).toString setterForFieldName(fieldName) } - } // This validation makes sure that the setters exist // but does not save them in a val (due to serialization issues) def validate = makeSetters - override def apply(input : T) : Tuple = { - val values = setters.map { setFn => setFn(input) } - new Tuple(values : _*) + override def apply(input: T): Tuple = { + val values = setters.map(setFn => setFn(input)) + new 
Tuple(values: _*) } override def arity = fields.size - private def setterForFieldName(fieldName : String) : (T => AnyRef) = { + private def setterForFieldName(fieldName: String): (T => AnyRef) = getValueFromMethod(createGetter(fieldName)) .orElse(getValueFromMethod(fieldName)) .orElse(getValueFromField(fieldName)) .getOrElse( - throw new TupleUnpackerException("Unrecognized field: " + fieldName + " for class: " + m.erasure.getName) + throw new TupleUnpackerException( + "Unrecognized field: " + fieldName + " for class: " + m.runtimeClass.getName + ) ) - } - private def getValueFromField(fieldName : String) : Option[(T => AnyRef)] = { - fieldMap.get(fieldName).map { f => (x : T) => f.get(x) } - } + private def getValueFromField(fieldName: String): Option[(T => AnyRef)] = + fieldMap.get(fieldName).map(f => (x: T) => f.get(x)) - private def getValueFromMethod(methodName : String) : Option[(T => AnyRef)] = { - methodMap.get(methodName).map { m => (x : T) => m.invoke(x) } - } + private def getValueFromMethod(methodName: String): Option[(T => AnyRef)] = + methodMap.get(methodName).map(m => (x: T) => m.invoke(x)) - private def upperFirst(s : String) = s.substring(0,1).toUpperCase + s.substring(1) - private def createGetter(s : String) = "get" + upperFirst(s) + private def upperFirst(s: String) = s.substring(0, 1).toUpperCase + s.substring(1) + private def createGetter(s: String) = "get" + upperFirst(s) } -class TupleUnpackerException(args : String) extends Exception(args) +class TupleUnpackerException(args: String) extends Exception(args) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala b/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala new file mode 100644 index 0000000000..ddd1714e54 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/TypeDescriptor.scala @@ -0,0 +1,49 @@ +/* +Copyright 2014 Twitter, Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import cascading.tuple.Fields +import scala.annotation.implicitNotFound +import scala.language.experimental.{macros => sMacros} + +/** + * This class is used to bind together a Fields instance which may contain a type array via getTypes, a + * TupleConverter and TupleSetter, which are inverses of one another. Note the size of the Fields object and + * the arity values for the converter and setter are all the same. Note in the com.twitter.scalding.macros + * package there are macros to generate this for case classes, which may be very convenient. + */ +@implicitNotFound( + """This class is used to bind together a Fields instance to an instance of type T. There is a implicit macro that generates a TypeDescriptor[T] for any type T where T is Boolean, String, Short, Int, Long, FLoat, or Double, or an option of these (with the exception of Option[String]), or a tuple or case class of a supported type. (Nested tuples and case classes are allowed.) Note: Option[String] specifically is not allowed as Some("") and None are indistinguishable. If your type T is not one of these, then you must write your own TypeDescriptor.""" +) +trait TypeDescriptor[T] extends java.io.Serializable { + def setter: TupleSetter[T] + def converter: TupleConverter[T] + def fields: Fields +} +object TypeDescriptor { + + /** + * This type descriptor flattens tuples and case classes left to right, depth first. 
It supports any type T + * where T is Boolean, String, Short, Int, Long, Float or Double, or an Option of these, or a tuple of a + * supported type. So, ((Int, Int), Int) is supported, and is flattened into a length 3 cascading + * Tuple/Fields. ((Int, Int), (Int, Int)) would be a length 4 cascading tuple, similarly with case classes. + * Note, the Fields types are populated at the end of this with the exception that Option[T] is recorded as + * Object (since recording it as the java type would have different consequences for Cascading's null + * handling. + */ + implicit def typeDescriptor[T]: TypeDescriptor[T] = + macro com.twitter.scalding.macros.impl.TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala b/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala index 92d9e4c2d0..2d062218b4 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/TypedDelimited.scala @@ -16,107 +16,150 @@ package com.twitter.scalding import java.io.Serializable +import java.lang.reflect.Type import cascading.tuple.Fields /** - * Trait to assist with creating objects such as [[TypedTsv]] to read from separated files. - * Override separator, skipHeader, writeHeader as needed. - */ + * Trait to assist with creating objects such as [[TypedTsv]] to read from separated files. Override + * separator, skipHeader, writeHeader as needed. 
+ */ trait TypedSeperatedFile extends Serializable { def separator: String def skipHeader: Boolean = false def writeHeader: Boolean = false - def apply[T : Manifest : TupleConverter : TupleSetter](path : String) : TypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter](path: String): FixedPathTypedDelimited[T] = apply(Seq(path)) - def apply[T : Manifest : TupleConverter : TupleSetter](paths : Seq[String]) : TypedDelimited[T] = { + def apply[T: Manifest: TupleConverter: TupleSetter](paths: Seq[String]): FixedPathTypedDelimited[T] = { val f = Dsl.intFields(0 until implicitly[TupleConverter[T]].arity) apply(paths, f) } - def apply[T : Manifest : TupleConverter : TupleSetter](path : String, f : Fields) : TypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter](path: String, f: Fields): FixedPathTypedDelimited[T] = apply(Seq(path), f) - def apply[T : Manifest : TupleConverter : TupleSetter](paths : Seq[String], f : Fields) : TypedDelimited[T] = - new TypedDelimited[T](paths, f, skipHeader, writeHeader, separator) + def apply[T: Manifest: TupleConverter: TupleSetter]( + paths: Seq[String], + f: Fields + ): FixedPathTypedDelimited[T] = + new FixedPathTypedDelimited[T](paths, f, skipHeader, writeHeader, separator) } /** - * Typed tab separated values file - */ + * Typed tab separated values file + */ object TypedTsv extends TypedSeperatedFile { val separator = "\t" } /** - * Typed comma separated values file - */ + * Typed comma separated values file + */ object TypedCsv extends TypedSeperatedFile { val separator = "," } /** - * Typed pipe separated values flile - */ + * Typed pipe separated values flile + */ object TypedPsv extends TypedSeperatedFile { val separator = "|" } /** - * Typed one separated values file (commonly used by Pig) - */ + * Typed one separated values file (commonly used by Pig) + */ object TypedOsv extends TypedSeperatedFile { - val separator = "\1" + val separator = "\u0001" } -object TypedDelimited { - def apply[T 
: Manifest : TupleConverter : TupleSetter](path : String, separator : String) : TypedDelimited[T] = +object FixedPathTypedDelimited { + def apply[T: Manifest: TupleConverter: TupleSetter]( + path: String, + separator: String + ): FixedPathTypedDelimited[T] = apply(Seq(path), separator) - def apply[T : Manifest : TupleConverter : TupleSetter](paths : Seq[String], separator : String) : TypedDelimited[T] = { + def apply[T: Manifest: TupleConverter: TupleSetter]( + paths: Seq[String], + separator: String + ): FixedPathTypedDelimited[T] = { val f = Dsl.intFields(0 until implicitly[TupleConverter[T]].arity) apply(paths, f, separator) } - def apply[T : Manifest : TupleConverter : TupleSetter](path : String, f : Fields, separator: String) : TypedDelimited[T] = + def apply[T: Manifest: TupleConverter: TupleSetter]( + path: String, + f: Fields, + separator: String + ): FixedPathTypedDelimited[T] = apply(Seq(path), f, separator) - def apply[T : Manifest : TupleConverter : TupleSetter](paths : Seq[String], f : Fields, separator : String) : TypedDelimited[T] = - new TypedDelimited[T](paths, f, false, false, separator) + def apply[T: Manifest: TupleConverter: TupleSetter]( + paths: Seq[String], + f: Fields, + separator: String + ): FixedPathTypedDelimited[T] = + new FixedPathTypedDelimited[T](paths, f, false, false, separator) } -/** Allows you to set the types, prefer this: - * If T is a subclass of Product, we assume it is a tuple. If it is not, wrap T in a Tuple1: - * e.g. 
TypedTsv[Tuple1[List[Int]]] - */ -class TypedDelimited[T](p : Seq[String], - override val fields : Fields = Fields.ALL, - override val skipHeader : Boolean = false, - override val writeHeader : Boolean = false, - override val separator : String = "\t") - (implicit mf : Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) extends FixedPathSource(p : _*) - with DelimitedScheme with Mappable[T] with TypedSink[T] { - - override def converter[U>:T] = TupleConverter.asSuperConverter[T,U](conv) - override def setter[U<:T] = TupleSetter.asSubSetter[T,U](tset) - - override val types : Array[Class[_]] = { - if (classOf[scala.Product].isAssignableFrom(mf.erasure)) { - //Assume this is a Tuple: - mf.typeArguments.map { _.erasure }.toArray - } - else { - //Assume there is only a single item - Array(mf.erasure) +/** + * Allows you to set the types, prefer this: If T is a subclass of Product, we assume it is a tuple. If it is + * not, wrap T in a Tuple1: e.g. TypedTsv[Tuple1[List[Int]]] + */ +@deprecated("Use TypedTextDelimited instead", "2015-07") +trait TypedDelimited[T] extends DelimitedScheme with Mappable[T] with TypedSink[T] { + + override val skipHeader: Boolean = false + override val writeHeader: Boolean = false + override val separator: String = "\t" + + implicit val mf: Manifest[T] + implicit val conv: TupleConverter[T] + implicit val tset: TupleSetter[T] + + override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset) + + override val types: Array[Class[_]] = + if (classOf[scala.Product].isAssignableFrom(mf.runtimeClass)) { + // Assume this is a Tuple: + mf.typeArguments.map(_.runtimeClass).toArray + } else { + // Assume there is only a single item + Array(mf.runtimeClass) } - } - override lazy val toString : String = "TypedDelimited" + - ((p,fields,skipHeader,writeHeader, separator,mf).toString) - override def equals(that : Any) : Boolean = Option(that) - .map { _.toString == 
this.toString }.getOrElse(false) + // This is used to add types to a Field, which Cascading now supports. While we do not do this much generally + // through the code, it is good practice and something that, ideally, we can do wherever possible. + def addTypes(sel: Array[Comparable[_]]) = new Fields(sel, types.map(_.asInstanceOf[Type])) + + override val fields: Fields = addTypes((0 until types.length).toArray.map(_.asInstanceOf[Comparable[_]])) + final override def sinkFields = fields +} - override lazy val hashCode : Int = toString.hashCode +@deprecated("Use FixedTypedText instead", "2015-07") +class FixedPathTypedDelimited[T]( + p: Seq[String], + override val fields: Fields = Fields.ALL, + override val skipHeader: Boolean = false, + override val writeHeader: Boolean = false, + override val separator: String = "\t" +)(implicit + override val mf: Manifest[T], + override val conv: TupleConverter[T], + override val tset: TupleSetter[T] +) extends FixedPathSource(p: _*) + with TypedDelimited[T] { + + override lazy val toString: String = "FixedPathTypedDelimited" + + ((p, fields, skipHeader, writeHeader, separator, mf).toString) + + override def equals(that: Any): Boolean = Option(that) + .map(_.toString == this.toString) + .getOrElse(false) + + override lazy val hashCode: Int = toString.hashCode } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala b/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala new file mode 100644 index 0000000000..cf165352ed --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/TypedPipeChecker.scala @@ -0,0 +1,37 @@ +package com.twitter.scalding + +/** + * This class is used to assist with testing a TypedPipe + */ +object TypedPipeChecker { + + /* + * Takes a List and a transform function. 
+ * The resulting TypedPipe form the transform will be run through asserts + */ + def checkOutputTransform[T, U, R](input: List[T])(transform: TypedPipe[T] => TypedPipe[U])( + assertions: List[U] => R + ): R = + assertions(inMemoryToList(transform(TypedPipe.from(input)))) + + /* + * Execute a TypedPipe in memory, convert the resulting Iterator to + * a list and run it through a function that makes arbitrary + * assertions on it. + */ + def checkOutput[T, R](output: TypedPipe[T])(assertions: List[T] => R): R = + assertions(inMemoryToList(output)) + + /** + * Execute a TypedPipe in memory and return the result as a List + */ + def inMemoryToList[T](output: TypedPipe[T]): List[T] = + output.toIterableExecution + .waitFor(Config.unitTestDefault, Local(strictSources = true)) + .get + .toList + + implicit class InMemoryToListEnrichment[T](val pipe: TypedPipe[T]) extends AnyVal { + def inMemoryToList: List[T] = TypedPipeChecker.inMemoryToList(pipe) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala b/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala index 908282cac5..25f6da7b47 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/WritableSequenceFile.scala @@ -12,20 +12,20 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import cascading.scheme.hadoop.{WritableSequenceFile => CHWritableSequenceFile } +import cascading.scheme.hadoop.{WritableSequenceFile => CHWritableSequenceFile} import cascading.tap.SinkMode import cascading.tuple.Fields import org.apache.hadoop.io.Writable trait WritableSequenceFileScheme extends SchemedSource { - /** There are three allowed cases: - * fields.size == 1 and keyType == null - * fields.size == 1 and valueType == null - * fields.size == 2 and keyType != null and valueType != null + + /** + * There are three allowed cases: fields.size == 1 and keyType == null fields.size == 1 and valueType == + * null fields.size == 2 and keyType != null and valueType != null */ def fields: Fields def keyType: Class[_ <: Writable] @@ -33,62 +33,68 @@ trait WritableSequenceFileScheme extends SchemedSource { // TODO Cascading doesn't support local mode yet override def hdfsScheme = - HadoopSchemeInstance(new CHWritableSequenceFile(fields, keyType, valueType)) + HadoopSchemeInstance( + new CHWritableSequenceFile(fields, keyType, valueType) + .asInstanceOf[cascading.scheme.Scheme[_, _, _, _, _]] + ) } - object WritableSequenceFile { + /** by default uses the first two fields in the tuple */ - def apply[K <: Writable : Manifest, V <: Writable : Manifest](path: String): WritableSequenceFile[K, V] = + def apply[K <: Writable: Manifest, V <: Writable: Manifest](path: String): WritableSequenceFile[K, V] = WritableSequenceFile(path, Dsl.intFields(0 to 1)) } -case class WritableSequenceFile[K <: Writable : Manifest, V <: Writable : Manifest]( - p : String, - f : Fields, - override val sinkMode: SinkMode = SinkMode.REPLACE) - extends FixedPathSource(p) +case class WritableSequenceFile[K <: Writable: Manifest, V <: Writable: Manifest]( + p: String, + f: Fields, + override val sinkMode: SinkMode = SinkMode.REPLACE +) extends FixedPathSource(p) with WritableSequenceFileScheme with LocalTapSource with TypedSink[(K, V)] - with TypedSource[(K, V)] 
{ + with Mappable[(K, V)] { override val fields = f - override val keyType = manifest[K].erasure.asInstanceOf[Class[_ <: Writable]] - override val valueType = manifest[V].erasure.asInstanceOf[Class[_ <: Writable]] + override val keyType = manifest[K].runtimeClass.asInstanceOf[Class[_ <: Writable]] + override val valueType = manifest[V].runtimeClass.asInstanceOf[Class[_ <: Writable]] - def setter[U <: (K,V)]: TupleSetter[U] = - TupleSetter.asSubSetter[(K,V), U](TupleSetter.tup2Setter[(K, V)]) + def setter[U <: (K, V)]: TupleSetter[U] = + TupleSetter.asSubSetter[(K, V), U](TupleSetter.tup2Setter[(K, V)]) override def sinkFields = f - def converter[U >: (K,V)]: TupleConverter[U] = - TupleConverter.asSuperConverter(TupleConverter.tuple2Converter[K,V]) + def converter[U >: (K, V)]: TupleConverter[U] = + TupleConverter.asSuperConverter(TupleConverter.tuple2Converter[K, V]) override def sourceFields = f } - object MultipleWritableSequenceFiles { + /** by default uses the first two fields in the tuple */ - def apply[K <: Writable : Manifest, V <: Writable : Manifest](paths: Seq[String]): - MultipleWritableSequenceFiles[K, V] = + def apply[K <: Writable: Manifest, V <: Writable: Manifest]( + paths: Seq[String] + ): MultipleWritableSequenceFiles[K, V] = MultipleWritableSequenceFiles(paths, Dsl.intFields(0 to 1)) } /** - * This is only a TypedSource as sinking into multiple directories is not well defined + * This is only a TypedSource (which is a superclass of Mappable) as sinking into multiple directories is not + * well defined */ -case class MultipleWritableSequenceFiles[K <: Writable : Manifest, V <: Writable : Manifest]( - p: Seq[String], f: Fields) - extends FixedPathSource(p:_*) +case class MultipleWritableSequenceFiles[K <: Writable: Manifest, V <: Writable: Manifest]( + p: Seq[String], + f: Fields +) extends FixedPathSource(p: _*) with WritableSequenceFileScheme with LocalTapSource - with TypedSource[(K, V)] { + with Mappable[(K, V)] { override val fields = f - 
override val keyType = manifest[K].erasure.asInstanceOf[Class[_ <: Writable]] - override val valueType = manifest[V].erasure.asInstanceOf[Class[_ <: Writable]] + override val keyType = manifest[K].runtimeClass.asInstanceOf[Class[_ <: Writable]] + override val valueType = manifest[V].runtimeClass.asInstanceOf[Class[_ <: Writable]] - def converter[U >: (K,V)]: TupleConverter[U] = - TupleConverter.asSuperConverter(TupleConverter.tuple2Converter[K,V]) + def converter[U >: (K, V)]: TupleConverter[U] = + TupleConverter.asSuperConverter(TupleConverter.tuple2Converter[K, V]) override def sourceFields = f } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala b/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala index 320391f38f..2a087de77d 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/XHandler.scala @@ -4,21 +4,23 @@ import cascading.flow.planner.PlannerException /** * Provide handlers and mapping for exceptions - * @param xMap - mapping as Map with Throwable class as key and String as value - * @param dVal - default value for undefined keys in mapping + * @param xMap + * - mapping as Map with Throwable class as key and String as value + * @param dVal + * - default value for undefined keys in mapping */ class XHandler(xMap: Map[Class[_ <: Throwable], String], dVal: String) { - def handlers = xMap.keys.map(kCls => ((t: Throwable) => kCls == t.getClass)).toList - - def mapping: Class[_ <: Throwable] => String = xMap.withDefaultValue(dVal) + def handlers: List[Throwable => Boolean] = + xMap.keys.map(kCls => ((t: Throwable) => kCls == t.getClass)).toList + def mapping: Class[_ <: Throwable] => String = + xMap.withDefaultValue(dVal) } - /** - * Provide apply method for creating XHandlers with default or custom settings - * and contain messages and mapping + * Provide apply method for creating XHandlers with default or custom settings and contain messages 
and + * mapping */ object RichXHandler { @@ -27,11 +29,16 @@ object RichXHandler { val BinaryProblem = "GUESS: This may be a problem with the binary version of a dependency. " + "Check which versions of dependencies you're pulling in." + val RequiredCascadingFabricNotInClassPath = + "GUESS: Required Cascading fabric is not supplied in the classpath." + + "Check which versions and variants of dependencies you're pulling in." + val DataIsMissing = "GUESS: Data is missing from the path you provided." val RequireSinks = "GUESS: Cascading requires all sources to have final sinks on disk." val mapping: Map[Class[_ <: Throwable], String] = Map( + classOf[ModeLoadException] -> RequiredCascadingFabricNotInClassPath, classOf[NoClassDefFoundError] -> BinaryProblem, classOf[AbstractMethodError] -> BinaryProblem, classOf[NoSuchMethodError] -> BinaryProblem, @@ -41,9 +48,32 @@ object RichXHandler { val gitHubUrl = "https://github.com/twitter/scalding/wiki/Common-Exceptions-and-possible-reasons#" - def createXUrl(t: Throwable) : String = { - gitHubUrl + t.getClass.getName.replace(".", "").toLowerCase - } + @annotation.tailrec + final def rootOf(t: Throwable): Throwable = + t.getCause match { + case null => t + case cause => rootOf(cause) + } + + @annotation.tailrec + final def peelUntilMappable(t: Throwable): Class[_ <: Throwable] = + (mapping.get(t.getClass), t.getCause) match { + case (Some(diag), _) => t.getClass // we're going to find a mappable cause. + case (None, null) => t.getClass // we're at the root. 
There won't be any cause + case (None, cause) => peelUntilMappable(cause) + } + + def createXUrl(t: Throwable): String = + gitHubUrl + (peelUntilMappable(t).getName.replace(".", "").toLowerCase) + + def apply(xMap: Map[Class[_ <: Throwable], String] = mapping, dVal: String = Default) = + new XHandler(xMap, dVal) - def apply(xMap: Map[Class[_ <: Throwable], String] = mapping, dVal: String = Default) = new XHandler(xMap, dVal) + def apply(t: Throwable): String = + mapping + .get(peelUntilMappable(t)) + .map(_ + "\n") + .getOrElse("") + + "If you know what exactly caused this error, please consider contributing to GitHub via following link.\n" + + createXUrl(t) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala index a22940f26d..881d1db7e7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/BddDsl.scala @@ -3,10 +3,7 @@ package com.twitter.scalding.bdd import com.twitter.scalding._ import scala.collection.mutable.Buffer import cascading.tuple.Fields -import scala.Predef._ import com.twitter.scalding.Tsv -import org.slf4j.LoggerFactory - trait BddDsl extends FieldConversions with PipeOperationsConversions { def Given(source: TestSource): TestCaseGiven1 = new TestCaseGiven1(source) @@ -19,19 +16,15 @@ trait BddDsl extends FieldConversions with PipeOperationsConversions { def withSchema(schema: Fields) = new TestSource(this, schema) } - - class ProductTestSourceWithoutSchema(val data: Iterable[Product]) extends TestSourceWithoutSchema { - def addSourceToJob(jobTest: JobTest, source: Source): JobTest = jobTest.source(source, data) - } - - class SimpleTypeTestSourceWithoutSchema[T](val data: Iterable[T])(implicit setter: TupleSetter[T]) extends TestSourceWithoutSchema { + class SimpleTypeTestSourceWithoutSchema[T](val data: Iterable[T])(implicit setter: TupleSetter[T]) + extends 
TestSourceWithoutSchema { def addSourceToJob(jobTest: JobTest, source: Source): JobTest = jobTest.source[T](source, data)(setter) } - implicit def fromProductDataToSourceWithoutSchema(data: Iterable[Product]) = new ProductTestSourceWithoutSchema(data) - - implicit def fromSimpleTypeDataToSourceWithoutSchema[T](data: Iterable[T])(implicit setter: TupleSetter[T]) = + implicit def fromSimpleTypeDataToSourceWithoutSchema[T](data: Iterable[T])(implicit + setter: TupleSetter[T] + ): SimpleTypeTestSourceWithoutSchema[T] = new SimpleTypeTestSourceWithoutSchema(data)(setter) class TestSource(data: TestSourceWithoutSchema, schema: Fields) { @@ -69,12 +62,17 @@ trait BddDsl extends FieldConversions with PipeOperationsConversions { } case class TestCaseWhen(sources: List[TestSource], operation: PipeOperation) { - def Then[OutputType](assertion: Buffer[OutputType] => Unit)(implicit conv: TupleConverter[OutputType]) : Unit = { + def Then[OutputType](assertion: Buffer[OutputType] => Unit)(implicit + conv: TupleConverter[OutputType] + ): Unit = CompleteTestCase(sources, operation, assertion).run() - } } - case class CompleteTestCase[OutputType](sources: List[TestSource], operation: PipeOperation, assertion: Buffer[OutputType] => Unit)(implicit conv: TupleConverter[OutputType]) { + case class CompleteTestCase[OutputType]( + sources: List[TestSource], + operation: PipeOperation, + assertion: Buffer[OutputType] => Unit + )(implicit conv: TupleConverter[OutputType]) { class DummyJob(args: Args) extends Job(args) { val inputPipes: List[RichPipe] = sources.map(testSource => RichPipe(testSource.asSource.read)) @@ -84,20 +82,19 @@ trait BddDsl extends FieldConversions with PipeOperationsConversions { outputPipe.write(Tsv("output")) } - def run() : Unit = { + def run(): Unit = { val jobTest = JobTest(new DummyJob(_)) // Add Sources - val op = sources.foreach { - _.addSourceDataToJobTest(jobTest) - } + sources.foreach(_.addSourceDataToJobTest(jobTest)) + // Add Sink 
jobTest.sink[OutputType](Tsv("output")) { assertion(_) } // Execute - jobTest.run.finish + jobTest.run.finish() } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala index 31f280af19..29e4c9965c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/PipeOperationsConversions.scala @@ -8,27 +8,30 @@ trait PipeOperationsConversions { trait PipeOperation { def assertPipeSize(pipes: List[RichPipe], expectedSize: Int) = - require(pipes.size == expectedSize, "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. " + - "Verify matching of given and when clauses in test case definition") + require( + pipes.size == expectedSize, + "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. " + + "Verify matching of given and when clauses in test case definition" + ) def apply(pipes: List[RichPipe]): Pipe } class OnePipeOperation(op: RichPipe => Pipe) extends PipeOperation { def apply(pipes: List[RichPipe]): Pipe = { - assertPipeSize(pipes, 1); op(pipes(0)) + assertPipeSize(pipes, 1); op(pipes.head) } } class TwoPipesOperation(op: (RichPipe, Pipe) => RichPipe) extends PipeOperation { def apply(pipes: List[RichPipe]): Pipe = { - assertPipeSize(pipes, 2); op(pipes(0), pipes(1)) + assertPipeSize(pipes, 2); op(pipes(0), pipes(1)) // linter:disable } } class ThreePipesOperation(op: (RichPipe, RichPipe, RichPipe) => Pipe) extends PipeOperation { def apply(pipes: List[RichPipe]): Pipe = { - assertPipeSize(pipes, 3); op(pipes(0), pipes(1), pipes(2)) + assertPipeSize(pipes, 3); op(pipes(0), pipes(1), pipes(2)) // linter:disable } } @@ -37,31 +40,50 @@ trait PipeOperationsConversions { } class ListPipesOperation(op: List[Pipe] => Pipe) extends PipeOperation { - def apply(pipes: 
List[RichPipe]): Pipe = op( pipes.map( _.pipe ).toList ) + def apply(pipes: List[RichPipe]): Pipe = op(pipes.map(_.pipe)) } - implicit val fromSingleRichPipeFunctionToOperation = (op: RichPipe => RichPipe) => new OnePipeOperation(op(_).pipe) - implicit val fromSingleRichPipeToPipeFunctionToOperation = (op: RichPipe => Pipe) => new OnePipeOperation(op(_)) - - implicit val fromTwoRichPipesFunctionToOperation = (op: (RichPipe, RichPipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) - implicit val fromTwoRichPipesToRichPipeFunctionToOperation = (op: (RichPipe, RichPipe) => Pipe) => new TwoPipesOperation(op(_, _)) - - implicit val fromThreeRichPipesFunctionToOperation = (op: (RichPipe, RichPipe, RichPipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) - implicit val fromThreeRichPipesToPipeFunctionToOperation = (op: (RichPipe, RichPipe, RichPipe) => Pipe) => new ThreePipesOperation(op(_, _, _)) - - implicit val fromRichPipeListFunctionToOperation = (op: List[RichPipe] => RichPipe) => new ListRichPipesOperation(op(_).pipe) - implicit val fromRichPipeListToPipeFunctionToOperation = (op: List[RichPipe] => Pipe) => new ListRichPipesOperation(op(_)) - - - implicit val fromSinglePipeFunctionToOperation = (op: Pipe => RichPipe) => new OnePipeOperation(op(_).pipe) - implicit val fromSinglePipeToRichPipeFunctionToOperation = (op: Pipe => Pipe) => new OnePipeOperation(op(_)) - - implicit val fromTwoPipeFunctionToOperation = (op: (Pipe, Pipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) - implicit val fromTwoRichPipeToPipeFunctionToOperation = (op: (Pipe, Pipe) => Pipe) => new TwoPipesOperation(op(_, _)) - - implicit val fromThreePipeFunctionToOperation = (op: (Pipe, Pipe, Pipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) - implicit val fromThreeRichPipeToPipeFunctionToOperation = (op: (Pipe, Pipe, Pipe) => Pipe) => new ThreePipesOperation(op(_, _, _)) - - implicit val fromListPipeFunctionToOperation = (op: List[Pipe] => RichPipe) => new 
ListPipesOperation(op(_).pipe) - implicit val fromListRichPipeToPipeFunctionToOperation = (op: List[Pipe] => Pipe) => new ListPipesOperation(op(_)) + implicit val fromSingleRichPipeFunctionToOperation: (RichPipe => RichPipe) => OnePipeOperation = + (op: RichPipe => RichPipe) => new OnePipeOperation(op(_).pipe) + implicit val fromSingleRichPipeToPipeFunctionToOperation: (RichPipe => Pipe) => OnePipeOperation = + (op: RichPipe => Pipe) => new OnePipeOperation(op(_)) + + implicit val fromTwoRichPipesFunctionToOperation: ((RichPipe, RichPipe) => RichPipe) => TwoPipesOperation = + (op: (RichPipe, RichPipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) + implicit val fromTwoRichPipesToRichPipeFunctionToOperation + : ((RichPipe, RichPipe) => Pipe) => TwoPipesOperation = (op: (RichPipe, RichPipe) => Pipe) => + new TwoPipesOperation(op(_, _)) + + implicit val fromThreeRichPipesFunctionToOperation + : ((RichPipe, RichPipe, RichPipe) => RichPipe) => ThreePipesOperation = + (op: (RichPipe, RichPipe, RichPipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) + implicit val fromThreeRichPipesToPipeFunctionToOperation + : ((RichPipe, RichPipe, RichPipe) => Pipe) => ThreePipesOperation = + (op: (RichPipe, RichPipe, RichPipe) => Pipe) => new ThreePipesOperation(op(_, _, _)) + + implicit val fromRichPipeListFunctionToOperation: (List[RichPipe] => RichPipe) => ListRichPipesOperation = + (op: List[RichPipe] => RichPipe) => new ListRichPipesOperation(op(_).pipe) + implicit val fromRichPipeListToPipeFunctionToOperation: (List[RichPipe] => Pipe) => ListRichPipesOperation = + (op: List[RichPipe] => Pipe) => new ListRichPipesOperation(op(_)) + + implicit val fromSinglePipeFunctionToOperation: (Pipe => RichPipe) => OnePipeOperation = + (op: Pipe => RichPipe) => new OnePipeOperation(op(_).pipe) + implicit val fromSinglePipeToRichPipeFunctionToOperation: (Pipe => Pipe) => OnePipeOperation = + (op: Pipe => Pipe) => new OnePipeOperation(op(_)) + + implicit val 
fromTwoPipeFunctionToOperation: ((Pipe, Pipe) => RichPipe) => TwoPipesOperation = + (op: (Pipe, Pipe) => RichPipe) => new TwoPipesOperation(op(_, _).pipe) + implicit val fromTwoRichPipeToPipeFunctionToOperation: ((Pipe, Pipe) => Pipe) => TwoPipesOperation = + (op: (Pipe, Pipe) => Pipe) => new TwoPipesOperation(op(_, _)) + + implicit val fromThreePipeFunctionToOperation: ((Pipe, Pipe, Pipe) => RichPipe) => ThreePipesOperation = + (op: (Pipe, Pipe, Pipe) => RichPipe) => new ThreePipesOperation(op(_, _, _).pipe) + implicit val fromThreeRichPipeToPipeFunctionToOperation + : ((Pipe, Pipe, Pipe) => Pipe) => ThreePipesOperation = (op: (Pipe, Pipe, Pipe) => Pipe) => + new ThreePipesOperation(op(_, _, _)) + + implicit val fromListPipeFunctionToOperation: (List[Pipe] => RichPipe) => ListPipesOperation = + (op: List[Pipe] => RichPipe) => new ListPipesOperation(op(_).pipe) + implicit val fromListRichPipeToPipeFunctionToOperation: (List[Pipe] => Pipe) => ListPipesOperation = + (op: List[Pipe] => Pipe) => new ListPipesOperation(op(_)) } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/README.md b/scalding-core/src/main/scala/com/twitter/scalding/bdd/README.md similarity index 85% rename from scalding-core/src/test/scala/com/twitter/scalding/bdd/README.md rename to scalding-core/src/main/scala/com/twitter/scalding/bdd/README.md index ab72a28c8a..3863bd5990 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/README.md +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/README.md @@ -6,10 +6,11 @@ each one testable independently before being assembled into the main Job that wi tests. It is not an alternative to the JobTest class but it covers a different part of the testing phase. The JobTest class can be used to test a full Job end to end while this framework allows you to test every single sub-step in isolation. 
+It supports fields API using the BddDsl package and typed API using TBddDsl ## What does it look like -A test written with scalding unit look as shown below: +A test written with using the BddDsl or the TBddDsl look as shown below: With Specs @@ -80,8 +81,57 @@ class SampleJobPipeTransformationsSpec2Spec extends mutableSpec.SpecificationWit Where `addUserInfo` is a function joining two pipes to generate an enriched one. +An example using the Typed API and Specs2, both using tuples or more complex types + +```scala +case class UserInfo(name: String, gender: String, age: Int) +case class EstimatedContribution(name: String, suggestedPensionContributionPerMonth: Double) + +class TypedApiTest extends Specification with TBddDsl { + + "A test with a single source" should { + + "accept an operation from working with a single tuple-typed pipe" in { + Given { + List(("Joe", "M", 40), ("Sarah", "F", 22)) + } When { + in: TypedPipe[(String, String, Int)] => + in.map[(String, Double)] { person => + person match { + case (name, "M", age) => (name, (1000.0 / (72 - age)).toDouble) + case (name, _, age) => (name, (1000.0 / (80 - age)).toDouble) + } + } + } Then { + buffer: mutable.Buffer[(String, Double)] => + buffer.toList mustEqual List(("Joe", 1000.0 / 32), ("Sarah", 1000.0 / 58)) + } + } + + "accept an operation from single case class-typed pipe" in { + Given { + List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + } When { + in: TypedPipe[UserInfo] => + in.map { person => + person match { + case UserInfo(name, "M", age) => EstimatedContribution(name, (1000.0 / (72 - age))) + case UserInfo(name, _, age) => EstimatedContribution(name, (1000.0 / (80 - age))) + } + } + } Then { + buffer: mutable.Buffer[EstimatedContribution] => + buffer.toList mustEqual List(EstimatedContribution("Joe", 1000.0 / 32), EstimatedContribution("Sarah", 1000.0 / 58)) + } + } + } +} +``` + ## Motivation and details +Please note that the discussion describes an example using the field API but 
everything maps to the typed API. + A Scalding job consists in a series of transformations applied to one or more sources in order to create one or more output resources or sinks. A very simple example taken from the Scalding documentations is as follows. diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala new file mode 100644 index 0000000000..9f52b95dbf --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TBddDsl.scala @@ -0,0 +1,123 @@ +package com.twitter.scalding.bdd + +import cascading.flow.FlowDef +import com.twitter.scalding._ +import com.twitter.scalding.source.TypedText +import scala.collection.mutable.Buffer +import TDsl._ + +trait TBddDsl extends FieldConversions with TypedPipeOperationsConversions { + + def Given[TypeIn](source: TypedTestSource[TypeIn]): TestCaseGiven1[TypeIn] = + new TestCaseGiven1[TypeIn](source) + + def GivenSources(sources: List[TypedTestSource[_]]): TestCaseGivenList = new TestCaseGivenList(sources) + + abstract class TypedTestSource[T] { + def data: Iterable[T] + + def asSource: Source = + IterableSource(data.map(Tuple1(_)), 'tuple) + + def readFromSourceAsTyped(implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = + asSource.read.toTypedPipe[Tuple1[T]]('tuple).map(_._1) + + def addSourceDataToJobTest(jobTest: JobTest) = jobTest.source(asSource, data) + } + + class SimpleTypedTestSource[T](val data: Iterable[T]) extends TypedTestSource[T] { + def addSourceToJob(jobTest: JobTest, source: Source): JobTest = + jobTest.source[T](source, data) + } + + implicit def fromSimpleTypeToTypedSource[T](data: Iterable[T]): SimpleTypedTestSource[T] = + new SimpleTypedTestSource(data) + + case class TestCaseGiven1[TypeIn](source: TypedTestSource[TypeIn]) { + def And[TypeIn2](other: TypedTestSource[TypeIn2]) = TestCaseGiven2[TypeIn, TypeIn2](source, other) + + def When[TypeOut: Manifest: TupleConverter: TupleSetter]( + op: 
OneTypedPipeOperation[TypeIn, TypeOut] + ): TestCaseWhen[TypeOut] = TestCaseWhen(List(source), op) + } + + case class TestCaseGiven2[TypeIn1, TypeIn2]( + source: TypedTestSource[TypeIn1], + other: TypedTestSource[TypeIn2] + ) { + def And[TypeIn3](third: TypedTestSource[TypeIn3]) = TestCaseGiven3(source, other, third) + + def When[TypeOut: Manifest: TupleConverter: TupleSetter]( + op: TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut] + ): TestCaseWhen[TypeOut] = TestCaseWhen(List(source, other), op) + } + + case class TestCaseGiven3[TypeIn1, TypeIn2, TypeIn3]( + source: TypedTestSource[TypeIn1], + other: TypedTestSource[TypeIn2], + third: TypedTestSource[TypeIn3] + ) { + def And(next: TypedTestSource[_]) = TestCaseGivenList(List(source, other, third, next)) + + def When[TypeOut: Manifest: TupleConverter: TupleSetter]( + op: ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut] + ): TestCaseWhen[TypeOut] = TestCaseWhen(List(source, other, third), op) + } + + case class TestCaseGivenList(sources: List[TypedTestSource[_]]) { + def And(next: TypedTestSource[_]) = TestCaseGivenList((next :: sources.reverse).reverse) + + def When[TypeOut: Manifest](op: ListOfTypedPipesOperations[TypeOut]): TestCaseWhen[TypeOut] = + TestCaseWhen(sources, op) + } + + case class TestCaseWhen[OutputType: Manifest]( + sources: List[TypedTestSource[_]], + operation: TypedPipeOperation[OutputType] + ) { + def Then(assertion: Buffer[OutputType] => Unit): Unit = + CompleteTestCase(sources, operation, assertion).run() + } + + case class CompleteTestCase[OutputType: Manifest]( + sources: List[TypedTestSource[_]], + operation: TypedPipeOperation[OutputType], + assertion: Buffer[OutputType] => Unit + ) { + + class DummyJob(args: Args) extends Job(args) { + val inputPipes: List[TypedPipe[_]] = sources.map(testSource => testSource.readFromSourceAsTyped) + + val outputPipe = operation(inputPipes) + + implicit val td: TypeDescriptor[OutputType] = new TypeDescriptor[OutputType] { + def converter = 
TupleConverter.singleConverter + def setter = TupleSetter.singleSetter + def fields = new cascading.tuple.Fields("item") + } + outputPipe.write(TypedText.tsv[OutputType]("output")) + } + + def run(): Unit = { + val jobTest = JobTest(new DummyJob(_)) + + // Add Sources + sources.foreach(_.addSourceDataToJobTest(jobTest)) + + implicit val td: TypeDescriptor[OutputType] = new TypeDescriptor[OutputType] { + def converter = TupleConverter.singleConverter + def setter = TupleSetter.singleSetter + def fields = new cascading.tuple.Fields("item") + } + + // Add Sink + jobTest.sink[OutputType](TypedText.tsv[OutputType]("output")) { buffer: Buffer[OutputType] => + assertion(buffer) + } + + // Execute + jobTest.run.finish() + } + } + +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala new file mode 100644 index 0000000000..cb0ef31537 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/bdd/TypedPipeOperationsConversions.scala @@ -0,0 +1,75 @@ +package com.twitter.scalding.bdd + +import com.twitter.scalding.TypedPipe + +trait TypedPipeOperationsConversions { + + trait TypedPipeOperation[TypeOut] { + def assertPipeSize(pipes: List[TypedPipe[_]], expectedSize: Int) = + require( + pipes.size == expectedSize, + "Cannot apply an operation for " + expectedSize + "pipes to " + pipes.size + " pipes. 
" + + "Verify matching of given and when clauses in test case definition" + ) + + def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] + } + + class OneTypedPipeOperation[TypeIn, TypeOut](op: TypedPipe[TypeIn] => TypedPipe[TypeOut]) + extends TypedPipeOperation[TypeOut] { + override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = { + assertPipeSize(pipes, 1) + op(pipes.head.asInstanceOf[TypedPipe[TypeIn]]) + } + } + + class TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2]) => TypedPipe[TypeOut] + ) extends TypedPipeOperation[TypeOut] { + override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = { + assertPipeSize(pipes, 2) + op( + pipes(0).asInstanceOf[TypedPipe[TypeIn1]], // linter:disable + pipes(1).asInstanceOf[TypedPipe[TypeIn2]] + ) + } + } + + class ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2], TypedPipe[TypeIn3]) => TypedPipe[TypeOut] + ) extends TypedPipeOperation[TypeOut] { + override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = { + assertPipeSize(pipes, 3) + op( + pipes(0).asInstanceOf[TypedPipe[TypeIn1]], // linter:disable + pipes(1).asInstanceOf[TypedPipe[TypeIn2]], + pipes(2).asInstanceOf[TypedPipe[TypeIn3]] + ) + } + } + + class ListOfTypedPipesOperations[TypeOut](op: List[TypedPipe[_]] => TypedPipe[TypeOut]) + extends TypedPipeOperation[TypeOut] { + override def apply(pipes: List[TypedPipe[_]]): TypedPipe[TypeOut] = op(pipes) + } + + implicit def fromSingleTypedPipeFunctionToOperation[TypeIn, TypeOut]( + op: TypedPipe[TypeIn] => TypedPipe[TypeOut] + ): OneTypedPipeOperation[TypeIn, TypeOut] = + new OneTypedPipeOperation[TypeIn, TypeOut](op) + + implicit def fromTwoTypedPipesFunctionToOperation[TypeIn1, TypeIn2, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2]) => TypedPipe[TypeOut] + ): TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut] = + new TwoTypedPipesOperation[TypeIn1, TypeIn2, TypeOut](op) 
+ + implicit def fromThreeTypedPipesFunctionToOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut]( + op: (TypedPipe[TypeIn1], TypedPipe[TypeIn2], TypedPipe[TypeIn3]) => TypedPipe[TypeOut] + ): ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut] = + new ThreeTypedPipesOperation[TypeIn1, TypeIn2, TypeIn3, TypeOut](op) + + implicit def fromListOfTypedPipesFunctionToOperation[TypeOut]( + op: List[TypedPipe[_]] => TypedPipe[TypeOut] + ): ListOfTypedPipesOperations[TypeOut] = + new ListOfTypedPipesOperations[TypeOut](op) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala new file mode 100644 index 0000000000..6b2a616a33 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Common.scala @@ -0,0 +1,36 @@ +package com.twitter.scalding.estimation + +import cascading.flow.FlowStep +import cascading.tap.hadoop.Hfs +import cascading.tap.{CompositeTap, Tap} +import com.twitter.scalding.tap.GlobHfs +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ + +object Common { + private[this] val LOG = LoggerFactory.getLogger(this.getClass) + + private def unrollTaps(taps: Seq[Tap[_, _, _]]): Seq[Tap[_, _, _]] = + taps.flatMap { + case multi: CompositeTap[_] => + unrollTaps(multi.getChildTaps.asScala.toSeq) + case t => Seq(t) + } + + def unrollTaps(step: FlowStep[JobConf]): Seq[Tap[_, _, _]] = + unrollTaps(step.getSources.asScala.toSeq) + + def inputSizes(step: FlowStep[JobConf]): Seq[(String, Long)] = { + val conf = step.getConfig + unrollTaps(step).flatMap { + case tap: GlobHfs => Some(tap.toString -> tap.getSize(conf)) + case tap: Hfs => Some(tap.toString -> GlobHfs.getSize(tap.getPath, conf)) + case tap => + LOG.warn("InputSizeReducerEstimator unable to calculate size: " + tap) + None + } + } + + def totalInputSize(step: FlowStep[JobConf]): Long = inputSizes(step).map(_._2).sum +} 
diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala new file mode 100644 index 0000000000..82b6036543 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/Estimator.scala @@ -0,0 +1,68 @@ +package com.twitter.scalding.estimation + +import cascading.flow.{Flow, FlowStep} +import com.twitter.algebird.Monoid +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory +import scala.util.{Failure, Success} + +case class FlowStrategyInfo( + flow: Flow[JobConf], + predecessorSteps: Seq[FlowStep[JobConf]], + step: FlowStep[JobConf] +) + +/** + * Trait for estimation some parameters of Job. + * @tparam T + * return type of estimation + */ +trait Estimator[T] { + def estimate(info: FlowStrategyInfo): Option[T] +} + +case class FallbackEstimator[T](first: Estimator[T], fallback: Estimator[T]) extends Estimator[T] { + private val LOG = LoggerFactory.getLogger(this.getClass) + + override def estimate(info: FlowStrategyInfo): Option[T] = + first.estimate(info).orElse { + LOG.warn(s"$first estimator failed. 
Falling back to $fallback.") + fallback.estimate(info) + } +} + +class FallbackEstimatorMonoid[T] extends Monoid[Estimator[T]] { + override def zero: Estimator[T] = new Estimator[T] { + override def estimate(info: FlowStrategyInfo): Option[T] = None + } + + override def plus(l: Estimator[T], r: Estimator[T]): Estimator[T] = FallbackEstimator(l, r) +} + +trait HistoryEstimator[T] extends Estimator[T] { + private val LOG = LoggerFactory.getLogger(this.getClass) + + def maxHistoryItems(conf: JobConf): Int + + def historyService: HistoryService + + override def estimate(info: FlowStrategyInfo): Option[T] = { + val conf = info.step.getConfig + + historyService.fetchHistory(info, maxHistoryItems(conf)) match { + case Success(history) if history.isEmpty => + LOG.warn(s"No matching history found for $info") + None + case Success(history) => + LOG.info(s"${history.length} history entries found for $info") + val estimation = estimate(info, conf, history) + LOG.info(s"$getClass estimate: $estimation") + estimation + case Failure(f) => + LOG.warn(s"Unable to fetch history in $getClass", f) + None + } + } + + protected def estimate(info: FlowStrategyInfo, conf: JobConf, history: Seq[FlowStepHistory]): Option[T] +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala new file mode 100644 index 0000000000..94beb9db3d --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/HistoryService.scala @@ -0,0 +1,51 @@ +package com.twitter.scalding.estimation + +import scala.util.Try + +/** + * Info about a prior FlowStep, provided by implementers of HistoryService + */ +final case class FlowStepHistory( + keys: FlowStepKeys, + submitTimeMillis: Long, + launchTimeMillis: Long, + finishTimeMillis: Long, + totalMaps: Long, + totalReduces: Long, + finishedMaps: Long, + finishedReduces: Long, + failedMaps: Long, + failedReduces: Long, + 
mapFileBytesRead: Long, + mapFileBytesWritten: Long, + mapOutputBytes: Long, + reduceFileBytesRead: Long, + hdfsBytesRead: Long, + hdfsBytesWritten: Long, + mapperTimeMillis: Long, + reducerTimeMillis: Long, + reduceShuffleBytes: Long, + cost: Double, + tasks: Seq[Task] +) + +final case class FlowStepKeys( + jobName: String, + user: String, + priority: String, + status: String, + version: String, + queue: String +) + +final case class Task(details: Map[String, Any], counters: Map[String, Long]) { + def taskType: Option[String] = details.get(Task.TaskType).map(_.asInstanceOf[String]) +} + +object Task { + val TaskType = "taskType" +} + +trait HistoryService { + def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala new file mode 100644 index 0000000000..10dc824d07 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorConfig.scala @@ -0,0 +1,47 @@ +package com.twitter.scalding.estimation.memory + +import org.apache.hadoop.mapred.JobConf + +object MemoryEstimatorConfig { + + /** Output param: what the original job map memory was. */ + val originalMapMemory = "scalding.map.memory.estimator.original" + + /** Output param: what the original job reduce memory was. */ + val originalReduceMemory = "scalding.reduce.memory.estimator.original" + + /** + * Value of alpha for exponential smoothing. 
Lower values ensure more smoothing and less importance to newer + * data Higher values provide lesser smoothing and more importance to newer data + */ + val alphaKey = "scalding.memory.estimator.alpha" + + /** Indicates how much to scale the memory estimate after it's calculated */ + val memoryScaleFactor = "scalding.memory.estimator.scale.factor" + + val XmxToMemoryScaleFactorKey = "scalding.memory.estimator.xmx.scale.factor" + + val maxContainerMemoryKey = "scalding.memory.estimator.container.max" + + val minContainerMemoryKey = "scalding.memory.estimator.container.min" + + /** yarn allocates in increments. So we might as well round up our container ask * */ + val yarnSchedulerIncrementAllocationMB = "yarn.scheduler.increment-allocation-mb" + + /** Maximum number of history items to use for memory estimation. */ + val maxHistoryKey = "scalding.memory.estimator.max.history" + + def getMaxContainerMemory(conf: JobConf): Long = conf.getLong(maxContainerMemoryKey, 8 * 1024) + + def getMinContainerMemory(conf: JobConf): Long = conf.getLong(minContainerMemoryKey, 1 * 1024) + + def getAlpha(conf: JobConf): Double = conf.getDouble(alphaKey, 1.0) + + def getScaleFactor(conf: JobConf): Double = conf.getDouble(memoryScaleFactor, 1.2) + + def getXmxScaleFactor(conf: JobConf): Double = conf.getDouble(XmxToMemoryScaleFactorKey, 1.25) + + def getYarnSchedulerIncrement(conf: JobConf): Int = conf.getInt(yarnSchedulerIncrementAllocationMB, 512) + + def getMaxHistory(conf: JobConf): Int = conf.getInt(maxHistoryKey, 5) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala new file mode 100644 index 0000000000..669fc4911b --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategy.scala @@ -0,0 +1,96 @@ +package com.twitter.scalding.estimation.memory + +import 
cascading.flow.{Flow, FlowStep, FlowStepStrategy} +import com.twitter.algebird.Monoid +import com.twitter.scalding.estimation.{Estimator, FallbackEstimatorMonoid, FlowStrategyInfo} +import com.twitter.scalding.{Config, StringUtility} +import java.util.{List => JList} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ + +object MemoryEstimatorStepStrategy extends FlowStepStrategy[JobConf] { + + private val LOG = LoggerFactory.getLogger(this.getClass) + + implicit val estimatorMonoid: Monoid[Estimator[MemoryEstimate]] = + new FallbackEstimatorMonoid[MemoryEstimate] + + /** + * Make memory estimate, possibly overriding explicitly-set memory settings, and save useful info (such as + * the original & estimate value of memory settings) in JobConf for later consumption. + * + * Called by Cascading at the start of each job step. + */ + final override def apply( + flow: Flow[JobConf], + preds: JList[FlowStep[JobConf]], + step: FlowStep[JobConf] + ): Unit = + if (skipMemoryEstimation(step)) { + LOG.info(s"Skipping memory estimation as ${Config.MemoryEstimators} is not set ") + } else { + estimate(flow, preds.asScala, step) + } + + private[estimation] def skipMemoryEstimation(step: FlowStep[JobConf]): Boolean = + step.getConfig.get(Config.MemoryEstimators, "").isEmpty + + private[estimation] def estimate( + flow: Flow[JobConf], + preds: Seq[FlowStep[JobConf]], + step: FlowStep[JobConf] + ): Unit = { + val conf = step.getConfig + + Option(conf.get(Config.MemoryEstimators)).foreach { clsNames => + val clsLoader = Thread.currentThread.getContextClassLoader + + val estimators = StringUtility + .fastSplit(clsNames, ",") + .map(clsLoader.loadClass(_).newInstance.asInstanceOf[Estimator[MemoryEstimate]]) + val combinedEstimator = Monoid.sum(estimators) + + val info = FlowStrategyInfo(flow, preds, step) + + // get memory estimate + val memoryEstimate: Option[MemoryEstimate] = combinedEstimator.estimate(info) + + 
memoryEstimate match { + case Some(MemoryEstimate(Some(mapMem), Some(reduceMem))) => + LOG.info(s"Overriding map memory to: $mapMem in Mb and reduce memory to: $reduceMem in Mb") + setMemory(mapMem, (Config.MapJavaOpts, Config.MapMemory), conf) + setMemory(reduceMem, (Config.ReduceJavaOpts, Config.ReduceMemory), conf) + case Some(MemoryEstimate(Some(mapMem), _)) => + LOG.info(s"Overriding only map memory to: $mapMem in Mb") + setMemory(mapMem, (Config.MapJavaOpts, Config.MapMemory), conf) + case Some(MemoryEstimate(_, Some(reduceMem))) => + LOG.info(s"Overriding only reduce memory to: $reduceMem in Mb") + setMemory(reduceMem, (Config.ReduceJavaOpts, Config.ReduceMemory), conf) + case _ => LOG.info("Memory estimators didn't calculate any value. Skipping setting memory overrides") + } + } + } + + private[estimation] def setMemory( + memorySettings: (Long, Long), + keys: (String, String), + conf: JobConf + ): Unit = { + val (xmxMemory, containerMemory) = memorySettings + val (xmxKey, containerKey) = keys + + conf.setLong(containerKey, containerMemory) + + setXmxMemory(xmxKey, xmxMemory, conf) + } + + private[estimation] def setXmxMemory(xmxKey: String, xmxMemory: Long, conf: JobConf): Unit = { + val xmxOpts = conf.get(xmxKey, "") + // remove existing xmx / xms + val xmxOptsWithoutXm = + xmxOpts.split(" ").filterNot(s => s.startsWith("-Xmx") || s.startsWith("-Xms")).mkString(" ") + + conf.set(xmxKey, xmxOptsWithoutXm + s" -Xmx${xmxMemory}m") + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala new file mode 100644 index 0000000000..9e1d12847d --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimator.scala @@ -0,0 +1,127 @@ +package com.twitter.scalding.estimation.memory + +import com.twitter.scalding.estimation.{FlowStepHistory, 
FlowStrategyInfo, HistoryEstimator, Task} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory + +// Tuple(MapMemory in MB for java process and container, ReduceMemory in MB for java process and container), +// or None to keep the default. +case class MemoryEstimate(mapMemoryInMB: Option[(Long, Long)], reduceMemoryInMB: Option[(Long, Long)]) + +object SmoothedHistoryMemoryEstimator { + val CommittedHeapBytes = "COMMITTED_HEAP_BYTES" + val CpuMs = "CPU_MILLISECONDS" + val PhysicalMemoryBytes = "PHYSICAL_MEMORY_BYTES" + val GCTimeMs = "GC_TIME_MILLIS" + + implicit class MemoryRichTask(val task: Task) extends AnyVal { + def committedHeapBytes: Option[Long] = task.counters.get(CommittedHeapBytes) + } +} + +trait SmoothedHistoryMemoryEstimator extends HistoryEstimator[MemoryEstimate] { + import SmoothedHistoryMemoryEstimator.MemoryRichTask + + private val LOG = LoggerFactory.getLogger(this.getClass) + + override def maxHistoryItems(conf: JobConf): Int = MemoryEstimatorConfig.getMaxHistory(conf) + + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[MemoryEstimate] = { + // iterate over mem history + // collect: for maps, list of max memory in past runs + // for reduce, list of max memory in past runs + // compute smoothed memory est + // multiple by scale factor + // return + val maxMemory = history.map(historyMemory) + + val xmxMemoryOfMapper = xmxMemory(maxMemory.flatMap(_._1), conf) + val xmxMemoryOfReducer = xmxMemory(maxMemory.flatMap(_._2), conf) + + val containerMemoryOfMapper = containerMemory(xmxMemoryOfMapper, conf) + val containerMemoryOfReducer = containerMemory(xmxMemoryOfReducer, conf) + + Some( + MemoryEstimate( + cappedMemory(containerMemoryOfMapper, conf), + cappedMemory(containerMemoryOfReducer, conf) + ) + ) + } + + private def xmxMemory(historyMemory: Seq[Long], conf: JobConf): Double = { + val scaleFactor = MemoryEstimatorConfig.getScaleFactor(conf) + val alpha 
= MemoryEstimatorConfig.getAlpha(conf) + + val smoothEstimation = smoothedAverage(historyMemory, alpha) + val scaledEstimation = smoothEstimation * scaleFactor + + // TODO handle gc + + LOG.info( + s"Calculated xmx memory for: $historyMemory smoothAvg = $smoothEstimation, scaled: $scaledEstimation" + ) + + scaledEstimation / (1024L * 1024) + } + + private def containerMemory(xmxMemory: Double, conf: JobConf): Double = + xmxMemory * MemoryEstimatorConfig.getXmxScaleFactor(conf) + + private def cappedMemory(containerMemory: Double, conf: JobConf): Option[(Long, Long)] = { + val schedulerIncrement = MemoryEstimatorConfig.getYarnSchedulerIncrement(conf) + val roundedContainerMemory = roundUp(containerMemory, schedulerIncrement) + + val maxContainerMemory = MemoryEstimatorConfig.getMaxContainerMemory(conf) + val minContainerMemory = MemoryEstimatorConfig.getMinContainerMemory(conf) + val scaleFactor = MemoryEstimatorConfig.getXmxScaleFactor(conf) + + if (roundedContainerMemory == 0) { + None + } else if (roundedContainerMemory > maxContainerMemory) { + Some(((maxContainerMemory / scaleFactor).toLong, maxContainerMemory)) + } else if (roundedContainerMemory < minContainerMemory) { + Some(((minContainerMemory / scaleFactor).toLong, minContainerMemory)) + } else { + Some((roundedContainerMemory / scaleFactor).toLong, roundedContainerMemory) + } + } + + private def historyMemory(history: FlowStepHistory): (Option[Long], Option[Long]) = { + LOG.debug(s"Processing tasks: ${history.tasks}") + val reduceTasks: Seq[Task] = history.tasks.filter(t => t.taskType.contains("REDUCE")) + val mapTasks: Seq[Task] = history.tasks.filter(t => t.taskType.contains("MAP")) + + // handle empty task list due to either no task history / lack of reducers + val maxReduceCommittedHeap: Option[Long] = + if (reduceTasks.isEmpty) + None + else + Some(reduceTasks.flatMap(_.committedHeapBytes).max) + + val maxMapCommittedHeap: Option[Long] = + if (mapTasks.isEmpty) + None + else + 
Some(mapTasks.flatMap(_.committedHeapBytes).max) + + LOG.info( + s"Calculated max committed heap for job: ${history.keys}, map: $maxMapCommittedHeap reduce: $maxReduceCommittedHeap" + ) + (maxMapCommittedHeap, maxReduceCommittedHeap) + } + + // memoryEstimate = (currentMemoryValue * alpha) + (1 - alpha) * oldEstimate + private def smoothedAverage(memoryList: Seq[Long], alpha: Double): Double = + memoryList + .foldLeft(0.0) { (oldEstimate, currentVal) => + (currentVal * alpha) + (1 - alpha) * oldEstimate + } + + private def roundUp(value: Double, block: Double): Long = + (Math.ceil(value / block) * block).toLong +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/examples/MergeTest.scala b/scalding-core/src/main/scala/com/twitter/scalding/examples/MergeTest.scala deleted file mode 100644 index 1d50f7b96c..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/examples/MergeTest.scala +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.scalding.examples - -import scala.annotation.tailrec - -import com.twitter.scalding._ - -/** -* This example job does not yet work. It is a test for Kyro serialization -*/ -class MergeTest(args : Args) extends Job(args) { - TextLine(args("input")).flatMapTo('word) { _.split("""\s+""") } - .groupBy('word) { _.size } - //Now, let's get the top 10 words: - .groupAll { - _.mapReduceMap(('word,'size)->'list) /* map1 */ { tup : (String,Long) => List(tup) } - /* reduce */ { (l1 : List[(String,Long)], l2 : List[(String,Long)]) => - mergeSort2(l1, l2, 10, cmpTup) - } /* map2 */ { - lout : List[(String,Long)] => lout - } - } - //Now expand out the list. 
- .flatMap('list -> ('word, 'cnt)) { list : List[(String,Long)] => list } - .project('word, 'cnt) - .write(Tsv(args("output"))) - - //Reverse sort to get the top items - def cmpTup( t1 : (String,Long), t2 : (String,Long) ) = t2._2.compareTo(t1._2) - - def mergeSort2[T](v1 : List[T], v2 : List[T], k : Int, cmp : Function2[T,T,Int]) = { - @tailrec - def mergeSortR(acc : List[T], list1 : List[T], list2 : List[T], k : Int) : List[T] = { - (list1, list2, k) match { - case (_,_,0) => acc - case (x1 :: t1, x2 :: t2, _) => { - if( cmp(x1,x2) < 0 ) { - mergeSortR(x1 :: acc, t1, list2, k-1) - } - else { - mergeSortR(x2 :: acc, list1, t2, k-1) - } - } - case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, k-1) - case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, k-1) - case (Nil, Nil, _) => acc - } - } - mergeSortR(Nil, v1, v2, k).reverse - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/examples/PageRank.scala b/scalding-core/src/main/scala/com/twitter/scalding/examples/PageRank.scala deleted file mode 100644 index ad68b5152c..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/examples/PageRank.scala +++ /dev/null @@ -1,179 +0,0 @@ -package com.twitter.scalding.examples - -import scala.annotation.tailrec -import com.twitter.scalding._ - -/** -* Options: -* --input: the three column TSV with node, comma-sep-out-neighbors, initial pagerank (set to 1.0 first) -* --ouput: the name for the TSV you want to write to, same as above. -* optional arguments: -* --errorOut: name of where to write the L1 error between the input page-rank and the output -* if this is omitted, we don't compute the error -* --iterations: how many iterations to run inside this job. Default is 1, 10 is about as -* much as cascading can handle. -* --jumpprob: probability of a random jump, default is 0.15 -* --convergence: if this is set, after every "--iterations" steps, we check the error and see -* if we should continue. 
Since the error check is expensive (involving a join), you should -* avoid doing this too frequently. 10 iterations is probably a good number to set. -* --temp: this is the name where we will store a temporary output so we can compare to the previous -* for convergence checking. If convergence is set, this MUST be. -*/ -class PageRank(args : Args) extends Job(args) { - - //How many steps - val STEPS = args.getOrElse("iterations","1").toInt - //Probability of taking a random jump in the network. - val ALPHA = args.getOrElse("jumpprob","0.15").toDouble - //How many times have we checked for convergence: - val JOB_COUNT = args.getOrElse("jobCount","0").toInt - //These are constants used by the algorithm: - val NODESET = 0 - val EDGE = 1 - - //Read the input, this can be subclassed, but should produce a pipe with three - //columns: source node, comma separated (no spaces) destination nodes as a string, and - //initial rank (default to 1.0 if you are starting from nothing) - initialize('src, 'dst, 'rank) - /* - * This algorithm works by having two types of rows that have the same column structure. - * the node -> list(neighbors), and node -> individual neighbor. - * We distinguish these two types with an id which nodes if this is a NODESET or an EDGE. - * The first step is to append that value. We also need to have a column for the degree. - * It doesn't matter what the initial degree is, we recompute below - */ - .map(() -> ('rowtype, 'd_src)) { (u:Unit) => (NODESET,-1) } - .thenDo( doPageRank(STEPS)_ ) - .thenDo( computeError _ ) - .thenDo( output _ ) - - /** - * Here is where we check for convergence and then run the next job if we're not converged - */ - override def next : Option[Job] = { - args.optional("convergence") - .flatMap { convErr => - /* - * It's easy for this to seem broken, so think about it twice: - * We are swapping between two writing files: temp and output, with the ultimate - * goal to land up at output. 
So, each next input is this output, but the temp - * and output should be swapping. - */ - val nextArgs = args + ("input", Some(args("output"))) + - ("temp", Some(args("output"))) + - ("output", Some(args("temp"))) + - ("jobCount", Some((JOB_COUNT + 1).toString)) - //Actually read the error: - val error = TypedTsv[Double](args("errorOut")).toIterator.next; - // The last job should be even numbered so output is not in temp. - // TODO: if we had a way to do HDFS operations easily (like rm, mv, tempname) - // this code would be cleaner and more efficient. As is, we may go a whole extra - // set of operations past the point of convergence. - if (error > convErr.toDouble || (JOB_COUNT % 2 == 1)) { - //try again to get under the error - Some(clone(nextArgs)) - } - else { - None - } - } - } - /** - * override this function to change how you generate a pipe of - * (Long, String, Double) - * where the first entry is the nodeid, the second is the list of neighbors, - * as a comma (no spaces) separated string representation of the numeric nodeids, - * the third is the initial page rank (if not starting from a previous run, this - * should be 1.0 - * - * NOTE: if you want to run until convergence, the initialize method must read the same - * EXACT format as the output method writes. This is your job! - */ - def initialize(nodeCol : Symbol, neighCol : Symbol, pageRank : Symbol) = { - Tsv(args("input")).read - //Just to name the columns: - .mapTo((0,1,2)->(nodeCol, neighCol, pageRank)) { - input : (Long, String, Double) => input - } - } - - /** - * The basic idea is to groupBy the dst key with BOTH the nodeset and the edge rows. - * the nodeset rows have the old page-rank, the edge rows are reversed, so we can get - * the incoming page-rank from the nodes that point to each destination. 
- */ - - @tailrec - final def doPageRank(steps : Int)(pagerank : RichPipe) : RichPipe = { - if( steps <= 0 ) { pagerank } - else { - val nodeRows = pagerank - //remove any EDGE rows from the previous loop - .filter('rowtype) { (rowtype : Int) => rowtype == NODESET } - //compute the incremental rank due to the random jump: - val randomJump = nodeRows.map('rank -> 'rank) { (rank : Double) => ALPHA } - //expand the neighbor list inte an edge list and out-degree of the src - val edges = nodeRows.flatMap(('dst,'d_src) -> ('dst,'d_src)) { args : (String, Long) => - if (args._1.length > 0) { - val dsts = args._1.split(",") - //Ignore the old degree: - val deg = dsts.size - dsts.map { str => (str.toLong, deg) } - } - else { - //Here is a node that points to no other nodes (dangling) - Nil - } - } - //Here we make a false row that we use to tell dst how much incoming - //Page rank it needs to add to itself: - .map(('src,'d_src,'dst,'rank,'rowtype)->('src,'d_src,'dst,'rank,'rowtype)) { - intup : (Long,Long,Long,Double,Int) => - val (src : Long, d_src : Long, dst : Long, rank : Double, row : Int) = intup - //The d_src, and dst are ignored in the merge below - //We swap destination into the source position - (dst, -1L, "", rank*(1.0 - ALPHA)/ d_src, EDGE) - } - /** - * Here we do the meat of the algorithm: - * if N = number of nodes, pr(N_i) prob of walking to node i, then: - * N pr(N_i) = (\sum_{j points to i} N pr(N_j) * (1-ALPHA)/d_j) + ALPHA - * N pr(N_i) is the page rank of node i. - */ - val nextPr = (edges ++ randomJump).groupBy('src) { - /* - * Note that NODESET < EDGE, so if we take the min(rowtype, ...) - * using dictionary ordering, we only keep NODESET rows UNLESS - * there are rows that had no outdegrees, so they had no NODESET row - * to begin with. To fix the later case, we have to additionally - * filter the result to keep only NODESET rows. 
- */ - _.min('rowtype, 'dst, 'd_src) - .sum[Double]('rank) //Sum the page-rank from both the nodeset and edge rows - } - //Must call ourselves in the tail position: - doPageRank(steps-1)(nextPr) - } - } - - //This outputs in the same format as the input, so you can run the job - //iteratively, subclass to change the final behavior - def output(pipe : RichPipe) = { - pipe.project('src, 'dst, 'rank).write(Tsv(args("output"))) - } - - //Optionally compute the average error: - def computeError(pr : RichPipe) : RichPipe = { - args.optional("errorOut").map { errOut => - Tsv(args("input")).read - .mapTo((0,1,2)->('src0, 'dst0, 'rank0)) { tup : (Long, String, Double) => tup } - .joinWithSmaller('src0 -> 'src, pr) - .mapTo(('rank0,'rank) -> 'err) { ranks : (Double, Double) => - scala.math.abs(ranks._1 - ranks._2) - } - .groupAll { _.average('err) } - .write(TypedTsv[Double](errOut)) - } - pr - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala b/scalding-core/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala deleted file mode 100644 index c7b6e9a0e8..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/examples/WeightedPageRank.scala +++ /dev/null @@ -1,220 +0,0 @@ -package com.twitter.scalding.examples - -import com.twitter.scalding._ - -/** - * weighted page rank for the given graph, start from the given pagerank, - * perform one iteartion, test for convergence, if not yet, clone itself - * and start the next page rank job with updated pagerank as input. - * - * This class is very similar to the PageRank class, main differences are: - * 1. supported weighted pagerank - * 2. the reset pagerank is pregenerated, possibly through a previous job - * 3. 
dead pagerank is evenly distributed - * - * Options: - * --pwd: working directory, will read/generate the following files there - * numnodes: total number of nodes - * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - * pagerank: the page rank file eg pagerank_0, pagerank_1 etc - * totaldiff: the current max pagerank delta - * Optional arguments: - * --weighted: do weighted pagerank, default false - * --curiteration: what is the current iteration, default 0 - * --maxiterations: how many iterations to run. Default is 20 - * --jumpprob: probability of a random jump, default is 0.1 - * --threshold: total difference before finishing early, default 0.001 - */ -class WeightedPageRank(args: Args) extends Job(args) { - val ROW_TYPE_1 = 1 - val ROW_TYPE_2 = 2 - - val PWD = args("pwd") - val ALPHA = args.getOrElse("jumpprob","0.1").toDouble - val WEIGHTED = args.getOrElse("weighted","false").toBoolean - val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble - val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt - val CURITERATION = args.getOrElse("curiteration", "0").toInt - - // 'size - val numNodes = getNumNodes(PWD + "/numnodes") - - // 'src_id, 'dst_ids, 'weights, 'mass_prior - val nodes = getNodes(PWD + "/nodes") - - // 'src_id_input, 'mass_input - val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION) - - // one iteration of pagerank - val outputPagerank = doPageRank(nodes, inputPagerank) - val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1) - outputPagerank - .project('src_id, 'mass_n) - .write(Tsv(outputFileName)) - - // detect convergence - val totalDiff = outputPagerank - .mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args : (Double, Double) => - scala.math.abs(args._1 - args._2) - } - .groupAll { _.sum[Double]('mass_diff) } - .write(TypedTsv[Double](PWD + "/totaldiff")) - - /** - * test convergence, if not yet, kick off the next iteration - */ - override def next = { - // the max diff generated above - val 
totalDiff = TypedTsv[Double](PWD + "/totaldiff").toIterator.next - - if (CURITERATION < MAXITERATIONS-1 && totalDiff > THRESHOLD) { - val newArgs = args + ("curiteration", Some( (CURITERATION+1).toString)) - Some(clone(newArgs)) - } else { - None - } - } - - def getInputPagerank(fileName: String) = { - Tsv(fileName).read - .mapTo((0,1) -> ('src_id_input, 'mass_input)) { - input : (Int, Double) => input - } - } - - /** - * read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - */ - def getNodes(fileName: String) = { - mode match { - case Hdfs(_, conf) => { - SequenceFile(fileName).read - .mapTo((0,1,2,3)->('src_id, 'dst_ids, 'weights, 'mass_prior)) { - input : (Int, Array[Int], Array[Float], Double) => input - } - } - case _ => { - Tsv(fileName).read - .mapTo((0,1,2,3)->('src_id, 'dst_ids, 'weights, 'mass_prior)) { - input : (Int, String, String, Double) => { - ( - input._1, - // convert string to int array - if (input._2 != null && input._2.length > 0) { - input._2.split(",").map { _.toInt } - } else { - Array[Int]() - }, - // convert string to float array - if (input._3 != null && input._3.length > 0) { - input._3.split(",").map { _.toFloat } - } else { - Array[Float]() - }, - input._4 - ) - } - } - } - } - } - - /** - * the total number of nodes, single line file - */ - def getNumNodes(fileName: String) = { - Tsv(fileName).read - .mapTo(0 -> 'size) { input: Int => input } - } - - /** - * one iteration of pagerank - * inputPagerank: <'src_id_input, 'mass_input> - * return <'src_id, 'mass_n, 'mass_input> - * - * Here is a highlevel view of the unweighted algorithm: - * let - * N: number of nodes - * inputPagerank(N_i): prob of walking to node i, - * d(N_j): N_j's out degree - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) / d_j) - * deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N - * randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA) - * pagerankOutput(N_i) = randomPagerank(N_i) + 
pagerankNext(N_i) * (1-ALPHA) - * - * For weighted algorithm: - * let - * w(N_j, N_i): weight from N_j to N_i - * tw(N_j): N_j's total out weights - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j)) - * - */ - def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe) : RichPipe = { - // 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input - val nodeJoined = nodeRows - .joinWithSmaller('src_id -> 'src_id_input, inputPagerank) - .discard('src_id_input) - - // 'src_id, 'mass_n - val pagerankNext = nodeJoined - .flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) { - args : (Array[Int], Array[Float], Double) => { - if (args._1.length > 0) { - if (WEIGHTED) { - // weighted distribution - val total: Double = args._2.sum - (args._1 zip args._2).map { idWeight : (Int, Float) => - (idWeight._1, args._3 * idWeight._2 / total) - } - } else { - // equal distribution - val dist: Double = args._3 / args._1.length - args._1.map { id: Int => (id, dist) } - } - } else { - //Here is a node that points to no other nodes (dangling) - Nil - } - } - } - .groupBy('src_id) { - _.sum[Double]('mass_n) - } - - // 'sum_mass - val sumPagerankNext = pagerankNext.groupAll { _.sum[Double]('mass_n -> 'sum_mass) } - - // 'deadMass - // single row jobs - // the dead page rank equally distributed to every node - val deadPagerank = sumPagerankNext - .crossWithTiny(numNodes) - .map(('sum_mass, 'size) -> 'deadMass) { input : (Double, Int) => - (1.0 - input._1) / input._2 - } - .discard('size, 'sum_mass) - - // 'src_id_r, 'mass_n_r - // random jump probability plus dead page rank - val randomPagerank = nodeJoined.crossWithTiny(deadPagerank) - .mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) { - ranks : (Int, Double, Double, Double) => - (ranks._1, ranks._2 * ALPHA + ranks._3 * (1-ALPHA), ranks._4) - } - - // 'src_id, 'mass_n - // scale next page rank to 1-ALPHA - val pagerankNextScaled = pagerankNext 
- .map('mass_n -> ('mass_n, 'mass_input)) { m: Double => ((1-ALPHA) * m, 0.0) } - - // 'src_id, 'mass_n, 'mass_input - // random probability + next probability - (randomPagerank ++ pagerankNextScaled) - .groupBy('src_id) { - _.sum[Double]('mass_input) // keep the input pagerank - .sum[Double]('mass_n) // take the sum - } - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala b/scalding-core/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala deleted file mode 100644 index 67a47d4e6e..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/examples/WordCountJob.scala +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.scalding.examples - -import com.twitter.scalding._ - -class WordCountJob(args : Args) extends Job(args) { - TextLine( args("input") ).read. - flatMap('line -> 'word) { line : String => line.split("\\s+") }. - groupBy('word) { _.size }. - write( Tsv( args("output") ) ) -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala b/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala index 4f548ba6c0..65810be7bc 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/filecache/DistributedCacheFile.scala @@ -5,11 +5,12 @@ import com.twitter.scalding._ import java.io.File import java.net.URI import java.nio.ByteBuffer + import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.filecache.{DistributedCache => HDistributedCache} +import org.apache.hadoop.mapreduce.MRJobConfig +import org.apache.hadoop.mapreduce.filecache.{DistributedCache => HDistributedCache} import org.apache.hadoop.fs.Path - object URIHasher { private[this] final val HashFunc = MurmurHash128(1L) @@ -21,9 +22,11 @@ object URIHasher { /** * generates hashes of hdfs URIs using algebird's MurmurHash128 - * @param uri the URI to generate a hash 
for - * @return a hex-encoded string of the bytes of the 128 bit hash. The results are zero padded on the left, so - * this string will always be 32 characters long. + * @param uri + * the URI to generate a hash for + * @return + * a hex-encoded string of the bytes of the 128 bit hash. The results are zero padded on the left, so this + * string will always be 32 characters long. */ def apply(uri: URI): String = { val (h1, h2) = HashFunc(uri.toASCIIString) @@ -33,103 +36,202 @@ object URIHasher { } /** - * The distributed cache is simply hadoop's method for allowing each node local access to a - * specific file. The registration of that file must be called with the Configuration of the job, - * and not when it's on a mapper or reducer. Additionally, a unique name for the node-local access - * path must be used to prevent collisions in the cluster. This class provides this functionality. + * The distributed cache is simply hadoop's method for allowing each node local access to a specific file. The + * registration of that file must be called with the Configuration of the job, and not when it's on a mapper + * or reducer. Additionally, a unique name for the node-local access path must be used to prevent collisions + * in the cluster. This class provides this functionality. * - * In the configuration phase, the file URI is used to construct an UncachedFile instance. The name - * of the symlink to use on the mappers is only available after calling the add() method, which - * registers the file and computes the unique symlink name and returns a CachedFile instance. - * The CachedFile instance is Serializable, it's designed to be assigned to a val and accessed later. + * In the configuration phase, the file URI is used to construct an UncachedFile instance. The name of the + * symlink to use on the mappers is only available after calling the add() method, which registers the file + * and computes the unique symlink name and returns a CachedFile instance. 
The CachedFile instance is + * Serializable, it's designed to be assigned to a val and accessed later. * * The local symlink is available thorugh .file or .path depending on what type you need. * * example: * * {{{ - * class YourJob(args: Args) extends Job(args) { - * val theCachedFile = DistributedCacheFile("hdfs://ur-namenode/path/to/your/file.txt") + * class YourJob(args: Args) extends Job(args) { + * val theCachedFile = DistributedCacheFile("/path/to/your/file.txt") * - * def somethingThatUsesTheCachedFile() { - * doSomethingWith(theCachedFile.path) // or theCachedFile.file - * } - * } - * }}} + * def somethingThatUsesTheCachedFile() { + * doSomethingWith(theCachedFile.path) // or theCachedFile.file + * } + * } + * + * example with Execution: + * + * {{{ + * object YourExecJob extends ExecutionApp { + * override def job = + * DistributedCacheFile + * .execution("/path/to/your/file.txt") { file => + * doSomething(theCachedFile.path) + * } + * } * + * example with Execution and multiple files: + * + * object YourExecJob extends ExecutionApp { + * override def job = + * DistributedCacheFile.execution("/path/to/your/one.txt") { one => + * DistributedCacheFile.execution("/path/to/your/second.txt") { second => + * doSomething(one.path, second.path) + * } + * } + * } + * + * }}} */ object DistributedCacheFile { + /** - * Create an object that can be used to register a given URI (representing an hdfs file) - * that should be added to the DistributedCache. + * Create an object that can be used to register a given URI (representing an hdfs file) that should be + * added to the DistributedCache. 
* - * @param uri The fully qualified URI that points to the hdfs file to add - * @return A CachedFile instance + * @param uri + * The fully qualified URI that points to the hdfs file to add + * @return + * A CachedFile instance */ - def apply(uri: URI)(implicit mode: Mode): CachedFile = - UncachedFile(Right(uri)).add() + def apply(uri: URI)(implicit mode: Mode): CachedFile = { + val cachedFile = UncachedFile(Right(uri)).cached(mode) + + addCachedFile(cachedFile, mode) - def apply(path: String)(implicit mode: Mode): CachedFile = - UncachedFile(Left(path)).add() + cachedFile + } + + def apply(path: String)(implicit mode: Mode): CachedFile = { + val cachedFile = UncachedFile(Left(path)).cached(mode) + + addCachedFile(cachedFile, mode) + + cachedFile + } + + private[scalding] def cachedFile(uri: URI, mode: Mode): CachedFile = + UncachedFile(Right(uri)).cached(mode) + + private[scalding] def cachedFile(path: String, mode: Mode): CachedFile = + UncachedFile(Left(path)).cached(mode) + + private[scalding] def addCachedFile(cachedFile: CachedFile, mode: Mode): Unit = + (cachedFile, mode) match { + case (hadoopFile: HadoopCachedFile, hadoopMode: HadoopMode) => + HDistributedCache.addCacheFile(symlinkedUriFor(hadoopFile.sourceUri), hadoopMode.jobConf) + case _ => + } def symlinkNameFor(uri: URI): String = { val hexsum = URIHasher(uri) val fileName = new File(uri.toString).getName - Seq(fileName, hexsum).mkString("-") + Seq(hexsum, fileName).mkString("-") } def symlinkedUriFor(sourceUri: URI): URI = new URI(sourceUri.getScheme, sourceUri.getSchemeSpecificPart, symlinkNameFor(sourceUri)) -} + /** + * Make a file available to an Execution + */ + def execution[A](path: String)(use: CachedFile => Execution[A]): Execution[A] = + Execution.getMode.flatMap { mode => + val cached = cachedFile(path, mode) + Execution.withConfig(use(cached))(addDistributedCacheFiles(_, cached)) + } -final case class UncachedFile private[scalding] (source: Either[String, URI]) { + /** + * Add files to be 
localized to the config. Intended to be used by user code. + * @param cachedFiles + * CachedFiles to be added + * @return + * new Config with cached files + */ + def addDistributedCacheFiles(config: Config, cachedFiles: CachedFile*): Config = + cachedFiles.foldLeft(config) { case (config, file) => + file match { + case hadoopFile: HadoopCachedFile => + /* + * @see + * basic logic from [[org.apache.hadoop.mapreduce.filecache.DistributedCache.addCacheFile]] + */ + val newFile = DistributedCacheFile + .symlinkedUriFor(hadoopFile.sourceUri) + .toString + + val newFiles = config + .get(MRJobConfig.CACHE_FILES) + .map(files => files + "," + newFile) + .getOrElse(newFile) + + config + (MRJobConfig.CACHE_FILES -> newFiles) + case _ => config + } + } - import DistributedCacheFile._ + /** + * Get cached files from config + */ + def getDistributedCachedFiles(config: Config): Seq[CachedFile] = + config + .get(MRJobConfig.CACHE_FILES) + .toSeq + .flatMap(_.split(",")) + .filter(_.nonEmpty) + .map { file => + val symlinkedUri = new URI(file) + val qualifiedUri = new URI(symlinkedUri.getScheme, symlinkedUri.getSchemeSpecificPart, null) + HadoopCachedFile(qualifiedUri) + } - def add()(implicit mode: Mode): CachedFile = +} + +final case class UncachedFile private[scalding] (source: Either[String, URI]) { + + def cached(mode: Mode): CachedFile = mode match { - case Hdfs(_, conf) => addHdfs(conf) - case HadoopTest(conf, _) => addHdfs(conf) + case Hdfs(_, conf) => addHdfs(conf) + case HadoopTest(conf, _) => addHdfs(conf) case (Local(_) | Test(_)) => addLocal() - case _ => throw new RuntimeException("unhandled mode: %s".format(mode)) + case _ => throw new RuntimeException("unhandled mode: %s".format(mode)) } private[this] def addLocal(): CachedFile = { val path = source match { case Left(strPath) => strPath - case Right(uri) => uri.getPath + case Right(uri) => uri.getPath } LocallyCachedFile(path) } private[this] def addHdfs(conf: Configuration): CachedFile = { - 
HDistributedCache.createSymlink(conf) - def makeQualifiedStr(path: String, conf: Configuration): URI = makeQualified(new Path(path), conf) def makeQualifiedURI(uri: URI, conf: Configuration): URI = makeQualified(new Path(uri.toString), conf) // uri.toString because hadoop 0.20.2 doesn't take a URI - def makeQualified(p: Path, conf: Configuration): URI = - p.makeQualified(p.getFileSystem(conf)).toUri // make sure we have fully-qualified URI + def makeQualified(p: Path, conf: Configuration): URI = { + val fileSystem = p.getFileSystem(conf) + p.makeQualified(fileSystem.getUri, fileSystem.getWorkingDirectory).toUri + } val sourceUri = source match { case Left(strPath) => makeQualifiedStr(strPath, conf) - case Right(uri) => makeQualifiedURI(uri, conf) + case Right(uri) => makeQualifiedURI(uri, conf) } - HDistributedCache.addCacheFile(symlinkedUriFor(sourceUri), conf) HadoopCachedFile(sourceUri) } } sealed abstract class CachedFile { + /** The path to the cahced file on disk (the symlink registered at configuration time) */ def path: String @@ -138,8 +240,8 @@ sealed abstract class CachedFile { } final case class LocallyCachedFile private[scalding] (sourcePath: String) extends CachedFile { - def path = file.getCanonicalPath - def file = new File(sourcePath).getCanonicalFile + def path: String = file.getCanonicalPath + def file: File = new File(sourcePath).getCanonicalFile } final case class HadoopCachedFile private[scalding] (sourceUri: URI) extends CachedFile { diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala new file mode 100644 index 0000000000..7a98dd380c --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/MacroImplicits.scala @@ -0,0 +1,34 @@ +/* + Copyright 2012 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros + +import scala.language.experimental.macros + +import com.twitter.scalding._ +import com.twitter.scalding.macros.impl._ + +object MacroImplicits { + + /** + * This method provides proof that the given type is a case class. + */ + implicit def materializeCaseClassTupleSetter[T]: TupleSetter[T] = + macro TupleSetterImpl.caseClassTupleSetterImpl[T] + implicit def materializeCaseClassTupleConverter[T]: TupleConverter[T] = + macro TupleConverterImpl.caseClassTupleConverterImpl[T] + implicit def materializeCaseClassTypeDescriptor[T]: TypeDescriptor[T] = + macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala new file mode 100644 index 0000000000..790fc767ae --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/Macros.scala @@ -0,0 +1,51 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package com.twitter.scalding.macros + +import scala.language.experimental.macros + +import com.twitter.scalding._ +import com.twitter.scalding.macros.impl._ +import cascading.tuple.Fields + +object Macros { + + // There is two flavors of the below functions, the pure vs withUnknown. + // In both cases recursive case classes, primitive types, and options are flattened down onto cascading tuples. + // In the unknown casehowever if a type is reached that we don't know what to do we store that type into the tuple. + + def caseClassTupleSetter[T]: TupleSetter[T] = macro TupleSetterImpl.caseClassTupleSetterImpl[T] + def caseClassTupleSetterWithUnknown[T]: TupleSetter[T] = + macro TupleSetterImpl.caseClassTupleSetterWithUnknownImpl[T] + + def caseClassTupleConverter[T]: TupleConverter[T] = macro TupleConverterImpl.caseClassTupleConverterImpl[T] + def caseClassTupleConverterWithUnknown[T]: TupleConverter[T] = + macro TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T] + + def toFields[T]: Fields = macro FieldsProviderImpl.toFieldsImpl[T] + def toFieldsWithUnknown[T]: Fields = macro FieldsProviderImpl.toFieldsWithUnknownImpl[T] + + def toNamedFields[T]: Fields = macro FieldsProviderImpl.toFieldsImpl[T] + def toNamedFieldsWithUnknown[T]: Fields = macro FieldsProviderImpl.toFieldsWithUnknownImpl[T] + + def toIndexedFields[T]: Fields = macro FieldsProviderImpl.toIndexedFieldsImpl[T] + def toIndexedFieldsWithUnknown[T]: Fields = macro FieldsProviderImpl.toIndexedFieldsWithUnknownImpl[T] + + def caseClassTypeDescriptor[T]: TypeDescriptor[T] = + macro TypeDescriptorProviderImpl.caseClassTypeDescriptorImpl[T] + def caseClassTypeDescriptorWithUnknown[T]: TypeDescriptor[T] = + macro TypeDescriptorProviderImpl.caseClassTypeDescriptorWithUnknownImpl[T] +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala new file mode 
100644 index 0000000000..1493c17172 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassBasedSetterImpl.scala @@ -0,0 +1,125 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros.impl + +import scala.reflect.macros.Context +import scala.util.{Failure, Success} + +/** + * Helper class for generating setters from case class to other types. E.g. cascading Tuple, jdbc + * PreparedStatement + */ +object CaseClassBasedSetterImpl { + + def apply[T](c: Context)(container: c.TermName, allowUnknownTypes: Boolean, fsetter: CaseClassFieldSetter)( + implicit T: c.WeakTypeTag[T] + ): (Int, c.Tree) = { + import c.universe._ + + sealed trait SetterBuilder { + def columns: Int + + /** + * This Tree assumes that "val $value = ..." has been set + */ + def setTree(value: Tree, offset: Int): Tree + } + final case class PrimitiveSetter(tpe: Type) extends SetterBuilder { + def columns = 1 + def setTree(value: Tree, offset: Int) = fsetter.from(c)(tpe, offset, container, value) match { + case Success(tree) => tree + case Failure(e) => + c.abort(c.enclosingPosition, s"Case class $T is supported. 
Error on $tpe, ${e.getMessage}") + } + } + case object DefaultSetter extends SetterBuilder { + def columns = 1 + def setTree(value: Tree, offset: Int) = fsetter.default(c)(offset, container, value) + } + final case class OptionSetter(inner: SetterBuilder) extends SetterBuilder { + def columns = inner.columns + def setTree(value: Tree, offset: Int) = { + val someVal = newTermName(c.fresh("someVal")) + val someValTree = q"$someVal" + q"""if($value.isDefined) { + val $someVal = $value.get + ${inner.setTree(someValTree, offset)} + } else { + ${fsetter.absent(c)(offset, container)} + }""" + } + } + final case class CaseClassSetter(members: Vector[(Tree => Tree, SetterBuilder)]) extends SetterBuilder { + val columns = members.map(_._2.columns).sum + def setTree(value: Tree, offset: Int) = { + val setters = members + .scanLeft((offset, Option.empty[Tree])) { case ((off, _), (access, sb)) => + val cca = newTermName(c.fresh("access")) + val ccaT = q"$cca" + (off + sb.columns, Some(q"val $cca = ${access(value)}; ${sb.setTree(ccaT, off)}")) + } + .collect { case (_, Some(tree)) => tree } + q"""..$setters""" + } + } + + @annotation.tailrec + def normalized(tpe: Type): Type = { + val norm = tpe.normalize + if (!(norm =:= tpe)) + normalized(norm) + else + tpe + } + + def matchField(outerType: Type): SetterBuilder = { + // we do this just to see if the setter matches. 
+ val dummyIdx = 0 + val dummyTree = q"t" + outerType match { + case tpe if fsetter.from(c)(tpe, dummyIdx, container, dummyTree).isSuccess => + PrimitiveSetter(tpe) + case tpe if tpe.erasure =:= typeOf[Option[Any]] => + val innerType = tpe.asInstanceOf[TypeRefApi].args.head + OptionSetter(matchField(innerType)) + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => + CaseClassSetter(expandMethod(normalized(tpe)).map { case (fn, tpe) => + (fn, matchField(tpe)) + }) + case tpe if allowUnknownTypes => + DefaultSetter + case _ => + c.abort(c.enclosingPosition, s"Case class ${T.tpe} is not supported at type: $outerType") + } + } + def expandMethod(outerTpe: Type): Vector[(Tree => Tree, Type)] = + outerTpe.declarations + .collect { case m: MethodSymbol if m.isCaseAccessor => m } + .map { accessorMethod => + val fieldType = + normalized(accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass)) + + ({ pTree: Tree => q"""$pTree.$accessorMethod""" }, fieldType) + } + .toVector + + // in TupleSetterImpl, the outer-most input val is called t, so we pass that in here: + val sb = matchField(normalized(T.tpe)) + if (sb.columns == 0) + c.abort(c.enclosingPosition, "Didn't consume any elements in the tuple, possibly empty case class?") + (sb.columns, sb.setTree(q"t", 0)) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala new file mode 100644 index 0000000000..abe07cbc66 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/CaseClassFieldSetter.scala @@ -0,0 +1,37 @@ +/* + Copyright 2015 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.scalding.macros.impl + +import scala.reflect.macros.Context +import scala.util.Try + +/** + * Helper to set fields from a case class to other "container" types E.g. cascading Tuple, jdbc + * PreparedStatement + */ +trait CaseClassFieldSetter { + + // mark the field as absent/null + def absent(c: Context)(idx: Int, container: c.TermName): c.Tree + + // use the default field setter (for when there is no type-specific setter) + def default(c: Context)(idx: Int, container: c.TermName, fieldValue: c.Tree): c.Tree + + // use the field setter known specific to the given field type + // return scala.util.Failure if no type specific setter in the container + def from(c: Context)(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala new file mode 100644 index 0000000000..8bc2ce9391 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/FieldsProviderImpl.scala @@ -0,0 +1,178 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros.impl + +import scala.annotation.tailrec +import scala.reflect.macros.Context + +/** + * Naming scheme for cascading Tuple fields used by FieldsProviderImpl macro. + */ +sealed trait NamingScheme + +/** + * Uses zero-based indexes for field names. + */ +case object Indexed extends NamingScheme + +/** + * Uses prefixes for naming nested fields. For e.g. for the following nested case class: + * {{{ + * case class Outer(id: Long, name: String, details: Inner) + * case class Inner(phone: Int) + * }}} + * the nested field's name will be "details.phone". + */ +case object NamedWithPrefix extends NamingScheme + +/** + * No prefixes for naming nested fields. For e.g. for the following nested case class: + * {{{ + * case class Outer(id: Long, name: String, details: Inner) + * case class Inner(phone: Int) + * }}} + * the nested field's name will remain "phone". + * + * Useful esp. for flattening nested case classes to SQL table columns. + */ +case object NamedNoPrefix extends NamingScheme + +/** + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. 
+ */ +object FieldsProviderImpl { + def toFieldsImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = + toFieldsCommonImpl(c, NamedWithPrefix, false)(T) + + def toFieldsWithUnknownImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = + toFieldsCommonImpl(c, NamedWithPrefix, true)(T) + + def toFieldsWithUnknownNoPrefixImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[cascading.tuple.Fields] = + toFieldsCommonImpl(c, NamedNoPrefix, true)(T) + + def toIndexedFieldsImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[cascading.tuple.Fields] = + toFieldsCommonImpl(c, Indexed, false)(T) + + def toIndexedFieldsWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[cascading.tuple.Fields] = + toFieldsCommonImpl(c, Indexed, true)(T) + + def toFieldsCommonImpl[T](c: Context, namingScheme: NamingScheme, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[cascading.tuple.Fields] = { + import c.universe._ + + import TypeDescriptorProviderImpl.optionInner + + @tailrec + def isNumbered(t: Type): Boolean = + t match { + case tpe if tpe =:= typeOf[Boolean] => true + case tpe if tpe =:= typeOf[Short] => true + case tpe if tpe =:= typeOf[Int] => true + case tpe if tpe =:= typeOf[Long] => true + case tpe if tpe =:= typeOf[Float] => true + case tpe if tpe =:= typeOf[Double] => true + case tpe if tpe =:= typeOf[String] => true + case tpe => + optionInner(c)(tpe) match { // linter:disable:UseOptionExistsNotPatMatch + case Some(t) => + // we need this match style to do tailrec + isNumbered(t) + case None => false + } + } + + object FieldBuilder { + // This is method on the object to work around this compiler bug: SI-6231 + def toFieldsTree(fb: FieldBuilder, scheme: NamingScheme): Tree = { + val nameTree = scheme match { + case Indexed => + val indices = fb.names.zipWithIndex.map(_._2) + q"""_root_.scala.Array.apply[_root_.java.lang.Comparable[_]](..$indices)""" + case _ => + 
q"""_root_.scala.Array.apply[_root_.java.lang.Comparable[_]](..${fb.names})""" + } + q"""new _root_.cascading.tuple.Fields($nameTree, + _root_.scala.Array.apply[_root_.java.lang.reflect.Type](..${fb.columnTypes})) + """ + } + } + sealed trait FieldBuilder { + def columnTypes: Vector[Tree] + def names: Vector[String] + } + final case class Primitive(name: String, tpe: Type) extends FieldBuilder { + def columnTypes = Vector(q"""_root_.scala.Predef.classOf[$tpe]""") + def names = Vector(name) + } + final case class OptionBuilder(of: FieldBuilder) extends FieldBuilder { + // Options just use Object as the type, due to the way cascading works on number types + def columnTypes = of.columnTypes.map(_ => q"""_root_.scala.Predef.classOf[_root_.java.lang.Object]""") + def names = of.names + } + final case class CaseClassBuilder(prefix: String, members: Vector[FieldBuilder]) extends FieldBuilder { + def columnTypes = members.flatMap(_.columnTypes) + def names = for { + member <- members + name <- member.names + } yield if (namingScheme == NamedWithPrefix && prefix.nonEmpty) s"$prefix.$name" else name + } + + /** + * This returns a List of pairs which flatten fieldType into (class, name) pairs + */ + def matchField(fieldType: Type, name: String): FieldBuilder = + fieldType match { + case tpe if tpe =:= typeOf[String] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Boolean] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Short] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Int] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Long] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Float] => Primitive(name, tpe) + case tpe if tpe =:= typeOf[Double] => Primitive(name, tpe) + case tpe if tpe.erasure =:= typeOf[Option[Any]] => + val innerType = tpe.asInstanceOf[TypeRefApi].args.head + OptionBuilder(matchField(innerType, name)) + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => + CaseClassBuilder(name, expandMethod(tpe).map { case 
(t, s) => matchField(t, s) }) + case tpe if allowUnknownTypes => Primitive(name, tpe) + case tpe => + c.abort(c.enclosingPosition, s"${T.tpe} is unsupported at $tpe") + } + + def expandMethod(outerTpe: Type): Vector[(Type, String)] = + outerTpe.declarations + .collect { case m: MethodSymbol if m.isCaseAccessor => m } + .map { accessorMethod => + val fieldName = accessorMethod.name.toString + val fieldType = accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass) + (fieldType, fieldName) + } + .toVector + + val builder = matchField(T.tpe, "") + if (builder.columnTypes.isEmpty) + c.abort(c.enclosingPosition, s"Case class ${T.tpe} has no primitive types we were able to extract") + val scheme = if (isNumbered(T.tpe)) Indexed else namingScheme + val tree = FieldBuilder.toFieldsTree(builder, scheme) + c.Expr[cascading.tuple.Fields](tree) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala new file mode 100644 index 0000000000..8886bb38ea --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleConverterImpl.scala @@ -0,0 +1,136 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros.impl + +import scala.reflect.macros.Context + +import com.twitter.scalding._ + +/** + * This class contains the core macro implementations. 
This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. + */ + +object TupleConverterImpl { + def caseClassTupleConverterImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleConverter[T]] = + caseClassTupleConverterCommonImpl(c, false) + + def caseClassTupleConverterWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleConverter[T]] = + caseClassTupleConverterCommonImpl(c, true) + + def caseClassTupleConverterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleConverter[T]] = { + import c.universe._ + + import TypeDescriptorProviderImpl.evidentColumn + + def membersOf(outerTpe: Type): Vector[Type] = + outerTpe.declarations + .collect { case m: MethodSymbol if m.isCaseAccessor => m } + .map { accessorMethod => + accessorMethod.returnType.asSeenFrom(outerTpe, outerTpe.typeSymbol.asClass) + } + .toVector + + sealed trait ConverterBuilder { + def columns: Int + def applyTree(offset: Int): Tree + } + final case class PrimitiveBuilder(primitiveGetter: Int => Tree) extends ConverterBuilder { + def columns = 1 + def applyTree(offset: Int) = primitiveGetter(offset) + } + final case class OptionBuilder(evidentCol: Int, of: ConverterBuilder) extends ConverterBuilder { + def columns = of.columns + def applyTree(offset: Int) = { + val testIdx = offset + evidentCol + q"""if (t.getObject($testIdx) == null) None + else Some(${of.applyTree(offset)})""" + } + } + final case class CaseClassBuilder(tpe: Type, members: Vector[ConverterBuilder]) extends ConverterBuilder { + val columns = members.map(_.columns).sum + def applyTree(offset: Int) = { + val trees = members + .scanLeft((offset, Option.empty[Tree])) { case ((o, _), cb) => + val nextOffset = o + cb.columns + (nextOffset, Some(cb.applyTree(o))) + } + .collect { case (_, Some(tree)) => tree } + + q"${tpe.typeSymbol.companionSymbol}(..$trees)" + } + 
} + + def matchField(outerTpe: Type): ConverterBuilder = + outerTpe match { + /* + * First we handle primitives, which never recurse + */ + case tpe if tpe =:= typeOf[String] && allowUnknownTypes => + PrimitiveBuilder(idx => q"""t.getString($idx)""") + case tpe if tpe =:= typeOf[String] => + // In this case, null is identical to empty, and we always return non-null + PrimitiveBuilder(idx => q"""{val s = t.getString($idx); if (s == null) "" else s}""") + case tpe if tpe =:= typeOf[Boolean] => + PrimitiveBuilder(idx => q"""t.getBoolean($idx)""") + case tpe if tpe =:= typeOf[Short] => + PrimitiveBuilder(idx => q"""t.getShort($idx)""") + case tpe if tpe =:= typeOf[Int] => + PrimitiveBuilder(idx => q"""t.getInteger($idx)""") + case tpe if tpe =:= typeOf[Long] => + PrimitiveBuilder(idx => q"""t.getLong($idx)""") + case tpe if tpe =:= typeOf[Float] => + PrimitiveBuilder(idx => q"""t.getFloat($idx)""") + case tpe if tpe =:= typeOf[Double] => + PrimitiveBuilder(idx => q"""t.getDouble($idx)""") + case tpe if tpe.erasure =:= typeOf[Option[Any]] => + val innerType = tpe.asInstanceOf[TypeRefApi].args.head + evidentColumn(c, allowUnknownTypes)(innerType) match { + case None => // there is no evident column, not supported. 
+ c.abort(c.enclosingPosition, s"$tpe has unsupported nesting of Options at: $innerType") + case Some(ev) => // we can recurse here + OptionBuilder(ev, matchField(innerType)) + } + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => + CaseClassBuilder(tpe, membersOf(tpe).map(matchField)) + case tpe if allowUnknownTypes => + PrimitiveBuilder(idx => q"""t.getObject($idx).asInstanceOf[$tpe]""") + case tpe => + c.abort( + c.enclosingPosition, + s"${T.tpe} is not pure primitives, Option of a primitive, nested case classes when looking at type $tpe" + ) + } + + val builder = matchField(T.tpe) + if (builder.columns == 0) + c.abort(c.enclosingPosition, "Didn't consume any elements in the tuple, possibly empty case class?") + + val res = q""" + new _root_.com.twitter.scalding.TupleConverter[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { + override def apply(t: _root_.cascading.tuple.TupleEntry): $T = { + ${builder.applyTree(0)} + } + override val arity: _root_.scala.Int = ${builder.columns} + } + """ + c.Expr[TupleConverter[T]](res) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala new file mode 100644 index 0000000000..d5a2726db0 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleFieldSetter.scala @@ -0,0 +1,59 @@ +/* + Copyright 2015 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros.impl + +import scala.reflect.macros.Context +import scala.util.Try + +/** + * Helper class for setting case class fields in cascading Tuple + */ +object TupleFieldSetter extends CaseClassFieldSetter { + + override def absent(c: Context)(idx: Int, container: c.TermName): c.Tree = { + import c.universe._ + /* A more defensive approach is to set to null, but since + * we always allocate an empty TupleEntry, which is initially null, + * this is unneeded. + * q"""$container.set($idx, null)""" + */ + q"""()""" + } + + override def default(c: Context)(idx: Int, container: c.TermName, fieldValue: c.Tree): c.Tree = { + import c.universe._ + q"""$container.set($idx, $fieldValue)""" + } + + override def from( + c: Context + )(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] = Try { + import c.universe._ + + def simpleType(accessor: Tree) = q"""$accessor($idx, $fieldValue)""" + + fieldType match { + case tpe if tpe =:= typeOf[String] => simpleType(q"$container.setString") + case tpe if tpe =:= typeOf[Boolean] => simpleType(q"$container.setBoolean") + case tpe if tpe =:= typeOf[Short] => simpleType(q"$container.setShort") + case tpe if tpe =:= typeOf[Int] => simpleType(q"$container.setInteger") + case tpe if tpe =:= typeOf[Long] => simpleType(q"$container.setLong") + case tpe if tpe =:= typeOf[Float] => simpleType(q"$container.setFloat") + case tpe if tpe =:= typeOf[Double] => simpleType(q"$container.setDouble") + case _ => sys.error(s"Unsupported primitive type $fieldType") + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala new file mode 100644 index 0000000000..89e29a76fe --- /dev/null +++ 
b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TupleSetterImpl.scala @@ -0,0 +1,56 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros.impl + +import scala.reflect.macros.Context + +import com.twitter.scalding._ + +/** + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. 
+ */ +object TupleSetterImpl { + + def caseClassTupleSetterImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TupleSetter[T]] = + caseClassTupleSetterCommonImpl(c, false) + + def caseClassTupleSetterWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleSetter[T]] = + caseClassTupleSetterCommonImpl(c, true) + + def caseClassTupleSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TupleSetter[T]] = { + import c.universe._ + + val tupTerm = newTermName(c.fresh("tup")) + val (finalIdx, set) = CaseClassBasedSetterImpl(c)(tupTerm, allowUnknownTypes, TupleFieldSetter) + + val res = q""" + new _root_.com.twitter.scalding.TupleSetter[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { + override def apply(t: $T): _root_.cascading.tuple.Tuple = { + val $tupTerm = _root_.cascading.tuple.Tuple.size($finalIdx) + $set + $tupTerm + } + override val arity: _root_.scala.Int = $finalIdx + } + """ + c.Expr[TupleSetter[T]](res) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala new file mode 100644 index 0000000000..df9cf0100b --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/macros/impl/TypeDescriptorProviderImpl.scala @@ -0,0 +1,217 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ +package com.twitter.scalding.macros.impl + +import scala.reflect.macros.Context + +import com.twitter.scalding._ + +/** + * This class contains the core macro implementations. This is in a separate module to allow it to be in a + * separate compilation unit, which makes it easier to provide helper methods interfacing with macros. + */ +object TypeDescriptorProviderImpl { + + def caseClassTypeDescriptorImpl[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[TypeDescriptor[T]] = + caseClassTypeDescriptorCommonImpl(c, false)(T) + + def caseClassTypeDescriptorWithUnknownImpl[T](c: Context)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TypeDescriptor[T]] = + caseClassTypeDescriptorCommonImpl(c, true)(T) + + /** + * When flattening a nested structure with Options, the evidentColumn is a column, relative to the the first + * 0-offset column, that represents evidence of this T, and hence set of columns, are present or absent. + * This is to handle Option types in text files such as CSV and TSV. a type T is evident if it the + * evidentColumn.exists + * + * primitive numbers are evident case classes are evident if they have at least one evident member. + * + * Strings are not evident (we can't distinguish Empty from "") Option[T] is not evident (we can't tell + * Some(None) from None). 
+ */ + def evidentColumn(c: Context, allowUnknown: Boolean = false)(tpe: c.universe.Type): Option[Int] = { + import c.universe._ + + def flattenOnce(t: Type): List[Type] = + t.declarations + .collect { case m: MethodSymbol if m.isCaseAccessor => m } + .map(_.returnType.asSeenFrom(t, t.typeSymbol.asClass)) + .toList + + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + def go(t: Type, offset: Int): (Int, Option[Int]) = { + val thisColumn = (offset + 1, Some(offset)) + t match { + case tpe if tpe =:= typeOf[String] => + // if we don't allowUnknown here, we treat null and "" is indistinguishable + // for text formats + if (allowUnknown) thisColumn + else (offset + 1, None) + case tpe if tpe =:= typeOf[Boolean] => thisColumn + case tpe if tpe =:= typeOf[Short] => thisColumn + case tpe if tpe =:= typeOf[Int] => thisColumn + case tpe if tpe =:= typeOf[Long] => thisColumn + case tpe if tpe =:= typeOf[Float] => thisColumn + case tpe if tpe =:= typeOf[Double] => thisColumn + // We recurse on Option and case classes + case tpe if tpe.erasure =:= typeOf[Option[Any]] => + val innerTpe = optionInner(c)(tpe).get + // we have no evidentColumn, but we need to compute the next index + (go(innerTpe, offset)._1, None) + case tpe if tpe.typeSymbol.isClass && tpe.typeSymbol.asClass.isCaseClass => + val flattened = flattenOnce(tpe) + .scanLeft((offset, Option.empty[Int])) { case ((off, _), t) => go(t, off) } + + val nextPos = flattened.last._1 + val ev = flattened.collectFirst { case (_, Some(col)) => col } + (nextPos, ev) + case _ if allowUnknown => thisColumn + case t => + c.abort(c.enclosingPosition, s"Case class $tpe at $t is not pure primitives or nested case classes") + } + } + go(tpe, 0)._2 + } + + def optionInner(c: Context)(opt: c.universe.Type): Option[c.universe.Type] = + if (opt.erasure =:= c.universe.typeOf[Option[Any]]) { + Some(opt.asInstanceOf[c.universe.TypeRefApi].args.head) + } else None + + def isTuple[T](c: Context)(implicit T: c.WeakTypeTag[T]): 
Boolean = { + import c.universe._ + val tupleTypes = List( + typeOf[Tuple1[Any]], + typeOf[Tuple2[Any, Any]], + typeOf[Tuple3[Any, Any, Any]], + typeOf[Tuple4[Any, Any, Any, Any]], + typeOf[Tuple5[Any, Any, Any, Any, Any]], + typeOf[Tuple6[Any, Any, Any, Any, Any, Any]], + typeOf[Tuple7[Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple8[Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple9[Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple10[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple11[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple12[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple13[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple14[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple15[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple16[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[Tuple17[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any]], + typeOf[ + Tuple18[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] + ], + typeOf[ + Tuple19[Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any, Any] + ], + typeOf[Tuple20[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Tuple21[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]], + typeOf[Tuple22[ + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any, + Any + ]] + ) + tupleTypes.exists(_ =:= T.tpe.erasure) + } + + def caseClassTypeDescriptorCommonImpl[T](c: Context, 
allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[TypeDescriptor[T]] = { + import c.universe._ + + val converter = TupleConverterImpl.caseClassTupleConverterCommonImpl[T](c, allowUnknownTypes) + val setter = TupleSetterImpl.caseClassTupleSetterCommonImpl[T](c, allowUnknownTypes) + + val namingScheme = if (isTuple[T](c)) Indexed else NamedWithPrefix + + val fields = FieldsProviderImpl.toFieldsCommonImpl[T](c, namingScheme, allowUnknownTypes) + + val res = q""" + new _root_.com.twitter.scalding.TypeDescriptor[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { + override val converter = $converter + override val setter = $setter + override val fields = $fields + } + """ + c.Expr[TypeDescriptor[T]](res) + } + +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala index dbe1f82c6a..82ffe1f779 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Combinatorics.scala @@ -2,226 +2,223 @@ package com.twitter.scalding.mathematics import com.twitter.scalding._ import com.twitter.scalding.Dsl._ import cascading.flow.FlowDef -import cascading.tuple.{Fields, TupleEntry} +import cascading.tuple.TupleEntry import cascading.pipe.Pipe /** -Serve as a repo for self-contained combinatorial functions with no dependencies -such as -combinations, aka n choose k, nCk -permutations , aka nPk -subset sum : numbers that add up to a finite sum -weightedSum: For weights (a,b,c, ...), want integers (x,y,z,...) to satisfy constraint |ax + by + cz + ... - result | < error -... 
- -@author : Krishnan Raman, kraman@twitter.com -*/ + * Serve as a repo for self-contained combinatorial functions with no dependencies such as combinations, aka n + * choose k, nCk permutations , aka nPk subset sum : numbers that add up to a finite sum weightedSum: For + * weights (a,b,c, ...), want integers (x,y,z,...) to satisfy constraint |ax + by + cz + ... - result | < + * error ... + * + * @author + * : Krishnan Raman, kraman@twitter.com + */ object Combinatorics { -/** - Given an int k, and an input of size n, - return a pipe with nCk combinations, with k columns per row - - - Computes nCk = n choose k, for large values of nCk - - Use-case: Say you have 100 hashtags sitting in an array - You want a table with 5 hashtags per row, all possible combinations - If the hashtags are sitting in a string array, then - combinations[String]( hashtags, 5) - will create the 100 chose 5 combinations. - - Algorithm: Use k pipes, cross pipes two at a time, filter out non-monotonic entries - - eg. 10C2 = 10 choose 2 - Use 2 pipes. 
- Pipe1 = (1,2,3,...10) - Pipe2 = (2,3,4....10) - Cross Pipe1 with Pipe2 for 10*9 = 90 tuples - Filter out tuples that are non-monotonic - For (t1,t2) we want t1 Symbol("n"+x)) // all column names + val allc = (1 to k).toList.map(x => Symbol("n" + x)) // all column names - val pipes = allc.zipWithIndex.map( x=> { - val num = x._2 + 1 - val pipe = IterableSource( (num to n), x._1 ).read - (pipe, num) - }) + val pipes = allc.zipWithIndex.map { x => + val num = x._2 + 1 + val pipe = IterableSource((num to n), x._1).read + (pipe, num) + } - val res = pipes.reduceLeft( (a,b) => { + val res = pipes.reduceLeft { (a, b) => val num = b._2 val prevname = Symbol("n" + (num - 1)) - val myname = Symbol( "n" + num) + val myname = Symbol("n" + num) val mypipe = a._1 - .crossWithSmaller(b._1) - .filter( prevname, myname ){ - foo:(Int, Int) => - val( nn1, nn2) = foo - nn1 < nn2 - } + .crossWithSmaller(b._1) + .filter(prevname, myname) { foo: (Int, Int) => + val (nn1, nn2) = foo + nn1 < nn2 + } (mypipe, -1) - })._1 + }._1 - (1 to k).foldLeft(res)((a,b)=>{ - val myname = Symbol( "n" + b) + (1 to k).foldLeft(res) { (a, b) => + val myname = Symbol("n" + b) val newname = Symbol("k" + b) - a.map(myname->newname){ - inpc:Int => input(inpc-1) + a.map(myname -> newname) { inpc: Int => + input(inpc - 1) }.discard(myname) - }) + } } /** - Return a pipe with all nCk combinations, with k columns per row - */ - def combinations(n:Int, k:Int)(implicit flowDef: FlowDef, mode: Mode) = combinations[Int]((1 to n).toArray, k) + * Return a pipe with all nCk combinations, with k columns per row + */ + def combinations(n: Int, k: Int)(implicit flowDef: FlowDef, mode: Mode) = + combinations[Int]((1 to n).toArray, k) /** - Return a pipe with all nPk permutations, with k columns per row - For details, see combinations(...) above - */ - - + * Return a pipe with all nPk permutations, with k columns per row For details, see combinations(...) 
above + */ - def permutations[T](input:IndexedSeq[T], k:Int)(implicit flowDef: FlowDef, mode: Mode):Pipe = { + def permutations[T](input: IndexedSeq[T], k: Int)(implicit flowDef: FlowDef, mode: Mode): Pipe = { val n = input.size - val allc = (1 to k).toList.map( x=> Symbol("n"+x)) // all column names + val allc = (1 to k).toList.map(x => Symbol("n" + x)) // all column names - val pipes = allc.map( x=> IterableSource(1 to n, x).read) + val pipes = allc.map(x => IterableSource(1 to n, x).read) // on a given row, we cannot have duplicate columns in a permutation val res = pipes - .reduceLeft( (a,b) => { a.crossWithSmaller(b) }) - .filter( allc ) { - x: TupleEntry => Boolean - val values = (0 until allc.size).map( i=> x.getInteger( i.asInstanceOf[java.lang.Integer])) - values.size == values.distinct.size - } - - // map numerals to actual data - (1 to k).foldLeft(res)((a,b)=>{ - val myname = Symbol( "n" + b) + .reduceLeft((a, b) => a.crossWithSmaller(b)) + .filter(allc) { x: TupleEntry => + Boolean + val values = (0 until allc.size).map(i => x.getInteger(i.asInstanceOf[java.lang.Integer])) + values.size == values.distinct.size + } + + // map numerals to actual data + (1 to k).foldLeft(res) { (a, b) => + val myname = Symbol("n" + b) val newname = Symbol("k" + b) - a.map(myname->newname){ - inpc:Int => input(inpc-1) + a.map(myname -> newname) { inpc: Int => + input(inpc - 1) }.discard(myname) - }) + } } /** - Return a pipe with all nPk permutations, with k columns per row - */ - def permutations(n:Int, k:Int)(implicit flowDef: FlowDef, mode: Mode) = permutations[Int]((1 to n).toArray, k) - + * Return a pipe with all nPk permutations, with k columns per row + */ + def permutations(n: Int, k: Int)(implicit flowDef: FlowDef, mode: Mode) = + permutations[Int]((1 to n).toArray, k) /** - Goal: Given weights (a,b,c, ...), we seek integers (x,y,z,...) to satisft - the constraint |ax + by + cz + ... - result | < error - - Parameters: The weights (a,b,c,...) 
must be non-negative doubles. - Our search space is 0 to result/min(weights) - The returned pipe will contain integer tuples (x,y,z,...) that satisfy ax+by+cz +... = result - - Note: This is NOT Simplex - WE use a slughtly-improved brute-force algorithm that performs well on account of parallelization. - Algorithm: - Create as many pipes as the number of weights - Each pipe copntains integral multiples of the weight w ie. (0,1w,2w,3w,4w,....) - Iterate as below - - Cross two pipes - Create a temp column that stores intermediate results - Apply progressive filtering on the temp column - Discard the temp column - Once all pipes are crossed, test for temp column within error bounds of result - Discard duplicates at end of process - - Usecase: We'd like to generate all integer tuples for typical usecases like - - 0. How many ways can you invest $1000 in facebook, microsoft, hp ? - val cash = 1000.0 - val error = 5.0 // max error $5, so its ok if we cannot invest the last $5 or less - val (FB, MSFT, HP) = (23.3,27.4,51.2) // share prices - val stocks = IndexedSeq( FB,MSFT,HP ) - weightedSum( stocks, cash, error).write( Tsv("invest.txt")) - - 1. find all (x,y,z) such that 2x+3y+5z = 23, with max error 1 - weightedSum( IndexedSeq(2.0,3.0,5.0), 23.0, 1.0) - - 2. find all (a,b,c,d) such that 2a+12b+12.5c+34.7d = 3490 with max error 3 - weightedSum( IndexedSeq(2.0,12.0,2.5,34.7),3490.0,3.0) - - This is at the heart of portfolio mgmt( Markowitz optimization), subset-sum, operations-research LP problems. - - */ - - def weightedSum( weights:IndexedSeq[Double], result:Double, error:Double)(implicit flowDef: FlowDef, mode: Mode):Pipe = { + * Goal: Given weights (a,b,c, ...), we seek integers (x,y,z,...) to satisft the constraint |ax + by + cz + + * ... - result | < error + * + * Parameters: The weights (a,b,c,...) must be non-negative doubles. Our search space is 0 to + * result/min(weights) The returned pipe will contain integer tuples (x,y,z,...) that satisfy ax+by+cz +... 
+ * \= result + * + * Note: This is NOT Simplex WE use a slughtly-improved brute-force algorithm that performs well on account + * of parallelization. Algorithm: Create as many pipes as the number of weights Each pipe copntains integral + * multiples of the weight w ie. (0,1w,2w,3w,4w,....) Iterate as below - Cross two pipes Create a temp + * column that stores intermediate results Apply progressive filtering on the temp column Discard the temp + * column Once all pipes are crossed, test for temp column within error bounds of result Discard duplicates + * at end of process + * + * Usecase: We'd like to generate all integer tuples for typical usecases like + * + * 0. How many ways can you invest $1000 in facebook, microsoft, hp ? val cash = 1000.0 val error = 5.0 // + * max error $5, so its ok if we cannot invest the last $5 or less val (FB, MSFT, HP) = (23.3,27.4,51.2) // + * share prices val stocks = IndexedSeq( FB,MSFT,HP ) weightedSum( stocks, cash, error).write( + * Tsv("invest.txt")) + * + * 1. find all (x,y,z) such that 2x+3y+5z = 23, with max error 1 weightedSum( IndexedSeq(2.0,3.0,5.0), + * 23.0, 1.0) + * + * 2. find all (a,b,c,d) such that 2a+12b+12.5c+34.7d = 3490 with max error 3 weightedSum( + * IndexedSeq(2.0,12.0,2.5,34.7),3490.0,3.0) + * + * This is at the heart of portfolio mgmt( Markowitz optimization), subset-sum, operations-research LP + * problems. 
+ */ + + def weightedSum(weights: IndexedSeq[Double], result: Double, error: Double)(implicit + flowDef: FlowDef, + mode: Mode + ): Pipe = { val numWeights = weights.size - val allColumns = (1 to numWeights).map( x=> Symbol("k"+x)) + val allColumns = (1 to numWeights).map(x => Symbol("k" + x)) // create as many single-column pipes as the number of weights - val pipes = allColumns.zip(weights).map( x=> { - val (name,wt) = x - IterableSource( (0.0 to result by wt), name).read - }).zip( allColumns ) + val pipes = allColumns + .zip(weights) + .map { x => + val (name, wt) = x + val points = Stream.iterate(0.0)(_ + wt).takeWhile(_ <= result) + IterableSource(points, name).read + } + .zip(allColumns) val first = pipes.head val accum = (first._1, List[Symbol](first._2)) val rest = pipes.tail - val res = rest.foldLeft(accum)((a,b)=>{ - - val (apipe, aname) = a - val (bpipe, bname) = b - val allc = (List(aname)).flatten ++ List[Symbol](bname) - - // Algorithm: - // Cross two pipes - // Create a temp column that stores intermediate results - // Apply progressive filtering on the temp column - // Discard the temp column - // Once all pipes are crossed, test for temp column within error bounds of result - // Discard duplicates at end of process - - ( apipe.crossWithSmaller(bpipe) - .map(allc->'temp){ - x:TupleEntry => - val values = (0 until allc.size).map( i=> x.getDouble( i.asInstanceOf[java.lang.Integer])) - values.sum - }.filter('temp){ - x:Double => if( allc.size == numWeights) (math.abs(x-result)<= error) else (x <= result) - }.discard('temp), allc ) - })._1.unique(allColumns) - - (1 to numWeights).zip(weights).foldLeft( res) ((a,b) => { - val (num,wt) = b - val myname = Symbol("k"+num) - a.map( myname->myname){ x:Int => (x/wt).toInt } - }) + val res = rest + .foldLeft(accum) { (a, b) => + val (apipe, aname) = a + val (bpipe, bname) = b + val allc = (List(aname)).flatten ++ List[Symbol](bname) + + // Algorithm: + // Cross two pipes + // Create a temp column that stores 
intermediate results + // Apply progressive filtering on the temp column + // Discard the temp column + // Once all pipes are crossed, test for temp column within error bounds of result + // Discard duplicates at end of process + + ( + apipe + .crossWithSmaller(bpipe) + .map(allc -> 'temp) { x: TupleEntry => + val values = (0 until allc.size).map(i => x.getDouble(i.asInstanceOf[java.lang.Integer])) + values.sum + } + .filter('temp) { x: Double => + if (allc.size == numWeights) (math.abs(x - result) <= error) else (x <= result) + } + .discard('temp), + allc + ) + } + ._1 + .unique(allColumns) + + (1 to numWeights) + .zip(weights) + .foldLeft(res) { (a, b) => + val (num, wt) = b + val myname = Symbol("k" + num) + a.map(myname -> myname) { x: Int => (x / wt).toInt } + } } /** - Does the exact same thing as weightedSum, but filters out tuples with a weight of 0 - The returned pipe contain only positive non-zero weights. - */ - def positiveWeightedSum( weights:IndexedSeq[Double], result:Double, error:Double)(implicit flowDef: FlowDef, mode: Mode):Pipe = { - val allColumns = (1 to weights.size).map( x=> Symbol("k"+x)) - weightedSum( weights, result, error).filter( allColumns ){ - x:TupleEntry => (0 until allColumns.size).map( i=> x.getDouble(i.asInstanceOf[java.lang.Integer])!=0.0).reduceLeft(_&&_) - } + * Does the exact same thing as weightedSum, but filters out tuples with a weight of 0 The returned pipe + * contain only positive non-zero weights. 
+ */ + def positiveWeightedSum(weights: IndexedSeq[Double], result: Double, error: Double)(implicit + flowDef: FlowDef, + mode: Mode + ): Pipe = { + val allColumns = (1 to weights.size).map(x => Symbol("k" + x)) + weightedSum(weights, result, error) + .filter(allColumns) { x: TupleEntry => + (0 until allColumns.size).forall(i => x.getDouble(java.lang.Integer.valueOf(i)) != 0.0) + } } - } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala index 6760d0b91e..29916be750 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Histogram.scala @@ -1,22 +1,24 @@ package com.twitter.scalding.mathematics -class Histogram(map : Map[Double,Long], binWidth : Double) { +class Histogram(map: Map[Double, Long], binWidth: Double) { lazy val size = map.values.sum - lazy val sum = map.foldLeft(0.0){case (acc, (bin, count)) => acc + bin * count} + lazy val sum = map.foldLeft(0.0) { case (acc, (bin, count)) => acc + bin * count } lazy val keys = map.keys.toList.sorted lazy val min = keys.head lazy val max = keys.last lazy val stdDev = { - val squaredDiff = map.foldLeft(0.0){case (acc, (bin, count)) => acc + count * math.pow(bin - mean, 2.0) } + val squaredDiff = map.foldLeft(0.0) { case (acc, (bin, count)) => + acc + count * math.pow(bin - mean, 2.0) + } math.sqrt(squaredDiff / size) } lazy val cdf = { var cumulative = 0L - var result = Map[Double,Double]() - keys.foreach {bin => + var result = Map[Double, Double]() + keys.foreach { bin => cumulative += map(bin) result += (bin -> (cumulative.toDouble / size)) } @@ -26,8 +28,8 @@ class Histogram(map : Map[Double,Long], binWidth : Double) { lazy val lorenz = { var cumulativeUnique = 0.0 var cumulativeTotal = 0.0 - var result = Map[Double,Double]() - keys.foreach {bin => + var result = Map[Double, Double]() + 
keys.foreach { bin => cumulativeUnique += map(bin) cumulativeTotal += bin * map(bin) result += (cumulativeUnique / size -> cumulativeTotal / sum) @@ -35,7 +37,7 @@ class Histogram(map : Map[Double,Long], binWidth : Double) { result } - def percentile(p : Int) = keys.find{bin => cdf(bin) * 100 >= p}.getOrElse(-1d) + def percentile(p: Int) = keys.find(bin => cdf(bin) * 100 >= p).getOrElse(-1d) lazy val median = percentile(50) lazy val q1 = percentile(25) diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala index 2a340ddbcc..98b63b9794 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Matrix.scala @@ -12,207 +12,229 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics -import com.twitter.algebird.{Monoid, Group, Ring, Field} +import com.twitter.algebird.{Field, Group, Monoid, Ring} +import com.twitter.algebird.field._ // backwards compatiblity support import com.twitter.scalding._ -import cascading.pipe.assembly._ import cascading.pipe.joiner._ import cascading.pipe.Pipe import cascading.tuple.Fields import cascading.tuple._ import cascading.flow._ -import cascading.tap._ import com.twitter.scalding.Dsl._ -import scala.math.max -import scala.annotation.tailrec + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ /** - * Matrix class - represents an infinite (hopefully sparse) matrix. - * any elements without a row are interpretted to be zero. 
- * the pipe hold ('rowIdx, 'colIdx, 'val) where in principle - * each row/col/value type is generic, with the constraint that ValT is a Ring[T] - * In practice, RowT and ColT are going to be Strings, Integers or Longs in the usual case. + * Matrix class - represents an infinite (hopefully sparse) matrix. any elements without a row are + * interpretted to be zero. the pipe hold ('rowIdx, 'colIdx, 'val) where in principle each row/col/value type + * is generic, with the constraint that ValT is a Ring[T] In practice, RowT and ColT are going to be Strings, + * Integers or Longs in the usual case. * - * WARNING: - * It is NOT OKAY to use the same instance of Matrix/Row/Col with DIFFERENT Monoids/Rings/Fields. - * If you want to change, midstream, the Monoid on your ValT, you have to construct a new Matrix. - * This is due to caching of internal computation graphs. + * WARNING: It is NOT OKAY to use the same instance of Matrix/Row/Col with DIFFERENT Monoids/Rings/Fields. If + * you want to change, midstream, the Monoid on your ValT, you have to construct a new Matrix. This is due to + * caching of internal computation graphs. * - * RowVector - handles matrices of row dimension one. It is the result of some of the matrix methods and has methods - * that return ColVector and diagonal matrix + * RowVector - handles matrices of row dimension one. It is the result of some of the matrix methods and has + * methods that return ColVector and diagonal matrix * - * ColVector - handles matrices of col dimension one. It is the result of some of the matrix methods and has methods - * that return RowVector and diagonal matrix + * ColVector - handles matrices of col dimension one. 
It is the result of some of the matrix methods and has + * methods that return RowVector and diagonal matrix */ // Implicit coversions // Add methods we want to add to pipes here: -class MatrixPipeExtensions(pipe : Pipe) { - def toMatrix[RowT,ColT,ValT](fields : Fields) - (implicit conv : TupleConverter[(RowT,ColT,ValT)], setter : TupleSetter[(RowT,ColT,ValT)]) = { - val matPipe = RichPipe(pipe).mapTo(fields -> ('row,'col,'val))((tup : (RowT,ColT,ValT)) => tup)(conv,setter) - new Matrix[RowT,ColT,ValT]('row, 'col, 'val, matPipe) - } - def mapToMatrix[T,RowT,ColT,ValT](fields : Fields)(mapfn : T => (RowT,ColT,ValT)) - (implicit conv : TupleConverter[T], setter : TupleSetter[(RowT,ColT,ValT)]) = { - val matPipe = RichPipe(pipe).mapTo(fields -> ('row,'col,'val))(mapfn)(conv,setter) - new Matrix[RowT,ColT,ValT]('row, 'col, 'val, matPipe) - } - def flatMapToMatrix[T,RowT,ColT,ValT](fields : Fields)(flatMapfn : T => Iterable[(RowT,ColT,ValT)]) - (implicit conv : TupleConverter[T], setter : TupleSetter[(RowT,ColT,ValT)]) = { - val matPipe = RichPipe(pipe).flatMapTo(fields -> ('row,'col,'val))(flatMapfn)(conv,setter) - new Matrix[RowT,ColT,ValT]('row, 'col, 'val, matPipe) - } - - private def groupPipeIntoMap[ColT, ValT](pipe: Pipe) : Pipe = { - pipe.groupBy('group, 'row) { - _.mapReduceMap[(ColT, ValT), Map[ColT, ValT], Map[ColT, ValT]](('col, 'val) -> 'val) - { (colval: (ColT, ValT)) => Map(colval._1 -> colval._2) } - { (l: Map[ColT, ValT], r: Map[ColT, ValT]) => l ++ r } - { (red: Map[ColT, ValT]) => red } - } +class MatrixPipeExtensions(pipe: Pipe) { + def toMatrix[RowT, ColT, ValT]( + fields: Fields + )(implicit conv: TupleConverter[(RowT, ColT, ValT)], setter: TupleSetter[(RowT, ColT, ValT)]) = { + val matPipe = + RichPipe(pipe).mapTo(fields -> ('row, 'col, 'val))((tup: (RowT, ColT, ValT)) => tup)(conv, setter) + new Matrix[RowT, ColT, ValT]('row, 'col, 'val, matPipe) + } + def mapToMatrix[T, RowT, ColT, ValT](fields: Fields)( + mapfn: T => (RowT, ColT, ValT) + 
)(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ColT, ValT)]) = { + val matPipe = RichPipe(pipe).mapTo(fields -> ('row, 'col, 'val))(mapfn)(conv, setter) + new Matrix[RowT, ColT, ValT]('row, 'col, 'val, matPipe) + } + def flatMapToMatrix[T, RowT, ColT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(RowT, ColT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ColT, ValT)]) = { + val matPipe = RichPipe(pipe).flatMapTo(fields -> ('row, 'col, 'val))(flatMapfn)(conv, setter) + new Matrix[RowT, ColT, ValT]('row, 'col, 'val, matPipe) + } + + private def groupPipeIntoMap[ColT, ValT](pipe: Pipe): Pipe = + pipe + .groupBy('group, 'row) { + _.mapReduceMap[(ColT, ValT), Map[ColT, ValT], Map[ColT, ValT]](('col, 'val) -> 'val) { + (colval: (ColT, ValT)) => Map(colval._1 -> colval._2) + }((l: Map[ColT, ValT], r: Map[ColT, ValT]) => l ++ r)((red: Map[ColT, ValT]) => red) + } .rename('group, 'col) - } - def toBlockMatrix[GroupT,RowT,ColT,ValT](fields : Fields) - (implicit conv : TupleConverter[(GroupT,RowT,ColT,ValT)], setter : TupleSetter[(GroupT,RowT,ColT,ValT)]) = { + def toBlockMatrix[GroupT, RowT, ColT, ValT](fields: Fields)(implicit + conv: TupleConverter[(GroupT, RowT, ColT, ValT)], + setter: TupleSetter[(GroupT, RowT, ColT, ValT)] + ) = { val matPipe = RichPipe(pipe) - .mapTo(fields -> ('group,'row,'col,'val))((tup : (GroupT,RowT,ColT,ValT)) => tup)(conv,setter) + .mapTo(fields -> ('group, 'row, 'col, 'val))((tup: (GroupT, RowT, ColT, ValT)) => tup)(conv, setter) - new BlockMatrix[GroupT,RowT,ColT,ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) + new BlockMatrix[GroupT, RowT, ColT, ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) } - def mapToBlockMatrix[T,GroupT,RowT,ColT,ValT](fields : Fields)(mapfn : T => (GroupT,RowT,ColT,ValT)) - (implicit conv : TupleConverter[T], setter : TupleSetter[(GroupT,RowT,ColT,ValT)]) = { + def mapToBlockMatrix[T, GroupT, RowT, ColT, ValT](fields: Fields)( + mapfn: T => 
(GroupT, RowT, ColT, ValT) + )(implicit conv: TupleConverter[T], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { val matPipe = RichPipe(pipe) - .mapTo(fields -> ('group,'row,'col,'val))(mapfn)(conv,setter) + .mapTo(fields -> ('group, 'row, 'col, 'val))(mapfn)(conv, setter) - new BlockMatrix[GroupT,RowT,ColT,ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) + new BlockMatrix[GroupT, RowT, ColT, ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) } - def flatMapToBlockMatrix[T,GroupT,RowT,ColT,ValT](fields : Fields)(flatMapfn : T => Iterable[(GroupT,RowT,ColT,ValT)]) - (implicit conv : TupleConverter[T], setter : TupleSetter[(GroupT,RowT,ColT,ValT)]) = { - val matPipe = RichPipe(pipe).flatMapTo(fields -> ('group,'row,'col,'val))(flatMapfn)(conv,setter) - new BlockMatrix[GroupT,RowT,ColT,ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) + def flatMapToBlockMatrix[T, GroupT, RowT, ColT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(GroupT, RowT, ColT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(GroupT, RowT, ColT, ValT)]) = { + val matPipe = RichPipe(pipe).flatMapTo(fields -> ('group, 'row, 'col, 'val))(flatMapfn)(conv, setter) + new BlockMatrix[GroupT, RowT, ColT, ValT](new Matrix('row, 'col, 'val, groupPipeIntoMap(matPipe))) } - def toColVector[RowT,ValT](fields : Fields) - (implicit conv : TupleConverter[(RowT,ValT)], setter : TupleSetter[(RowT,ValT)]) = { - val vecPipe = RichPipe(pipe).mapTo(fields -> ('row, 'val))((tup : (RowT, ValT)) => tup)(conv,setter) - new ColVector[RowT,ValT]('row, 'val, vecPipe) + def toColVector[RowT, ValT]( + fields: Fields + )(implicit conv: TupleConverter[(RowT, ValT)], setter: TupleSetter[(RowT, ValT)]) = { + val vecPipe = RichPipe(pipe).mapTo(fields -> ('row, 'val))((tup: (RowT, ValT)) => tup)(conv, setter) + new ColVector[RowT, ValT]('row, 'val, vecPipe) } - def mapToColVector[T,RowT,ValT](fields : Fields)(mapfn : T => (RowT,ValT)) - (implicit conv : 
TupleConverter[T], setter : TupleSetter[(RowT,ValT)]) = { - val vecPipe = RichPipe(pipe).mapTo(fields -> ('row, 'val))(mapfn)(conv,setter) - new ColVector[RowT,ValT]('row, 'val, vecPipe) + def mapToColVector[T, RowT, ValT]( + fields: Fields + )(mapfn: T => (RowT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ValT)]) = { + val vecPipe = RichPipe(pipe).mapTo(fields -> ('row, 'val))(mapfn)(conv, setter) + new ColVector[RowT, ValT]('row, 'val, vecPipe) } - def flatMapToColVector[T,RowT,ValT](fields : Fields)(flatMapfn : T => Iterable[(RowT,ValT)]) - (implicit conv : TupleConverter[T], setter : TupleSetter[(RowT,ValT)]) = { - val vecPipe = RichPipe(pipe).flatMapTo(fields -> ('row, 'val))(flatMapfn)(conv,setter) - new ColVector[RowT,ValT]('row, 'val, vecPipe) + def flatMapToColVector[T, RowT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(RowT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(RowT, ValT)]) = { + val vecPipe = RichPipe(pipe).flatMapTo(fields -> ('row, 'val))(flatMapfn)(conv, setter) + new ColVector[RowT, ValT]('row, 'val, vecPipe) } - def toRowVector[ColT,ValT](fields : Fields) - (implicit conv : TupleConverter[(ColT,ValT)], setter : TupleSetter[(ColT,ValT)]) = { - val vecPipe = RichPipe(pipe).mapTo(fields -> ('col, 'val))((tup : (ColT, ValT)) => tup)(conv,setter) - new RowVector[ColT,ValT]('col, 'val, vecPipe) + def toRowVector[ColT, ValT]( + fields: Fields + )(implicit conv: TupleConverter[(ColT, ValT)], setter: TupleSetter[(ColT, ValT)]) = { + val vecPipe = RichPipe(pipe).mapTo(fields -> ('col, 'val))((tup: (ColT, ValT)) => tup)(conv, setter) + new RowVector[ColT, ValT]('col, 'val, vecPipe) } - def mapToRowVector[T,ColT,ValT](fields : Fields)(mapfn : T => (ColT,ValT)) - (implicit conv : TupleConverter[T], setter : TupleSetter[(ColT,ValT)]) = { - val vecPipe = RichPipe(pipe).mapTo(fields -> ('col, 'val))(mapfn)(conv,setter) - new RowVector[ColT,ValT]('col, 'val, vecPipe) + def mapToRowVector[T, ColT, ValT]( + 
fields: Fields + )(mapfn: T => (ColT, ValT))(implicit conv: TupleConverter[T], setter: TupleSetter[(ColT, ValT)]) = { + val vecPipe = RichPipe(pipe).mapTo(fields -> ('col, 'val))(mapfn)(conv, setter) + new RowVector[ColT, ValT]('col, 'val, vecPipe) } - def flatMapToRowVector[T,ColT,ValT](fields : Fields)(flatMapfn : T => Iterable[(ColT,ValT)]) - (implicit conv : TupleConverter[T], setter : TupleSetter[(ColT,ValT)]) = { - val vecPipe = RichPipe(pipe).flatMapTo(fields -> ('col, 'val))(flatMapfn)(conv,setter) - new RowVector[ColT,ValT]('col, 'val, vecPipe) + def flatMapToRowVector[T, ColT, ValT](fields: Fields)( + flatMapfn: T => Iterable[(ColT, ValT)] + )(implicit conv: TupleConverter[T], setter: TupleSetter[(ColT, ValT)]) = { + val vecPipe = RichPipe(pipe).flatMapTo(fields -> ('col, 'val))(flatMapfn)(conv, setter) + new RowVector[ColT, ValT]('col, 'val, vecPipe) } } -/** This is the enrichment pattern on Mappable[T] for converting to Matrix types +/** + * This is the enrichment pattern on Mappable[T] for converting to Matrix types */ class MatrixMappableExtensions[T](mappable: Mappable[T])(implicit fd: FlowDef, mode: Mode) { - def toMatrix[Row,Col,Val](implicit ev: <:<[T,(Row,Col,Val)], - setter: TupleSetter[(Row,Col,Val)]) : Matrix[Row,Col,Val] = - mapToMatrix { _.asInstanceOf[(Row,Col,Val)] } - - def mapToMatrix[Row,Col,Val](fn: (T) => (Row,Col,Val)) - (implicit setter: TupleSetter[(Row,Col,Val)]) : Matrix[Row,Col,Val] = { + def toMatrix[Row, Col, Val](implicit + ev: <:<[T, (Row, Col, Val)], + setter: TupleSetter[(Row, Col, Val)] + ): Matrix[Row, Col, Val] = + mapToMatrix(_.asInstanceOf[(Row, Col, Val)]) + + def mapToMatrix[Row, Col, Val]( + fn: (T) => (Row, Col, Val) + )(implicit setter: TupleSetter[(Row, Col, Val)]): Matrix[Row, Col, Val] = { val fields = ('row, 'col, 'val) val matPipe = mappable.mapTo(fields)(fn) - new Matrix[Row,Col,Val]('row, 'col, 'val, matPipe) + new Matrix[Row, Col, Val]('row, 'col, 'val, matPipe) } - def 
toBlockMatrix[Group,Row,Col,Val](implicit ev: <:<[T,(Group,Row,Col,Val)], ord: Ordering[(Group, Row)], - setter: TupleSetter[(Group,Row,Col,Val)]) : BlockMatrix[Group,Row,Col,Val] = - mapToBlockMatrix { _.asInstanceOf[(Group,Row,Col,Val)] } + def toBlockMatrix[Group, Row, Col, Val](implicit + ev: <:<[T, (Group, Row, Col, Val)], + ord: Ordering[(Group, Row)], + setter: TupleSetter[(Group, Row, Col, Val)] + ): BlockMatrix[Group, Row, Col, Val] = + mapToBlockMatrix(_.asInstanceOf[(Group, Row, Col, Val)]) - def mapToBlockMatrix[Group,Row,Col,Val](fn: (T) => (Group,Row,Col,Val))(implicit ord: Ordering[(Group, Row)]) : BlockMatrix[Group,Row,Col,Val] = { + def mapToBlockMatrix[Group, Row, Col, Val]( + fn: (T) => (Group, Row, Col, Val) + )(implicit ord: Ordering[(Group, Row)]): BlockMatrix[Group, Row, Col, Val] = { val matPipe = TypedPipe .from(mappable) .map(fn) .groupBy(t => (t._1, t._2)) - .mapValueStream(s => Iterator(s.map{ case (_, _, c, v) => (c, v) }.toMap)) + .mapValueStream(s => Iterator(s.map { case (_, _, c, v) => (c, v) }.toMap)) .toTypedPipe - .map{ case ((g, r), m) => (r, g, m) } + .map { case ((g, r), m) => (r, g, m) } .toPipe(('row, 'col, 'val)) - new BlockMatrix[Group,Row,Col,Val](new Matrix('row, 'col, 'val, matPipe)) + new BlockMatrix[Group, Row, Col, Val](new Matrix('row, 'col, 'val, matPipe)) } - def toRow[Row,Val](implicit ev: <:<[T,(Row,Val)], setter: TupleSetter[(Row,Val)]) - : RowVector[Row,Val] = mapToRow { _.asInstanceOf[(Row,Val)] } + def toRow[Row, Val](implicit ev: <:<[T, (Row, Val)], setter: TupleSetter[(Row, Val)]): RowVector[Row, Val] = + mapToRow(_.asInstanceOf[(Row, Val)]) - def mapToRow[Row,Val](fn: (T) => (Row,Val)) - (implicit setter: TupleSetter[(Row,Val)], fd: FlowDef) : RowVector[Row,Val] = { + def mapToRow[Row, Val]( + fn: (T) => (Row, Val) + )(implicit setter: TupleSetter[(Row, Val)], fd: FlowDef): RowVector[Row, Val] = { val fields = ('row, 'val) val rowPipe = mappable.mapTo(fields)(fn) - new RowVector[Row,Val]('row,'val, 
rowPipe) + new RowVector[Row, Val]('row, 'val, rowPipe) } - def toCol[Col,Val](implicit ev: <:<[T,(Col,Val)], setter: TupleSetter[(Col,Val)]) : ColVector[Col,Val] = - mapToCol { _.asInstanceOf[(Col,Val)] } + def toCol[Col, Val](implicit ev: <:<[T, (Col, Val)], setter: TupleSetter[(Col, Val)]): ColVector[Col, Val] = + mapToCol(_.asInstanceOf[(Col, Val)]) - def mapToCol[Col,Val](fn: (T) => (Col,Val)) - (implicit setter: TupleSetter[(Col,Val)]) : ColVector[Col,Val] = { + def mapToCol[Col, Val]( + fn: (T) => (Col, Val) + )(implicit setter: TupleSetter[(Col, Val)]): ColVector[Col, Val] = { val fields = ('col, 'val) val colPipe = mappable.mapTo(fields)(fn) - new ColVector[Col,Val]('col,'val, colPipe) + new ColVector[Col, Val]('col, 'val, colPipe) } } object Matrix { // If this function is implicit, you can use the PipeExtensions methods on pipe - implicit def pipeExtensions[P <% Pipe](p : P) = new MatrixPipeExtensions(p) - implicit def mappableExtensions[T](mt: Mappable[T])(implicit fd: FlowDef, mode: Mode) = + implicit def pipeExtensions[P <% Pipe](p: P): MatrixPipeExtensions = new MatrixPipeExtensions(p) + implicit def mappableExtensions[T]( + mt: Mappable[T] + )(implicit fd: FlowDef, mode: Mode): MatrixMappableExtensions[T] = new MatrixMappableExtensions(mt)(fd, mode) - def filterOutZeros[ValT](fSym : Symbol, group : Monoid[ValT])(fpipe : Pipe) : Pipe = { - fpipe.filter(fSym) { tup : Tuple1[ValT] => group.isNonZero(tup._1) } - } + def filterOutZeros[ValT](fSym: Symbol, group: Monoid[ValT])(fpipe: Pipe): Pipe = + fpipe.filter(fSym) { tup: Tuple1[ValT] => group.isNonZero(tup._1) } - def meanCenter[T](vct: Iterable[(T,Double)]) : Iterable[(T,Double)] = { - val valList = vct.map { _._2 } + def meanCenter[T](vct: Iterable[(T, Double)]): Iterable[(T, Double)] = { + val valList = vct.map(_._2) val sum = valList.sum val count = valList.size val avg = sum / count - vct.map { tup => (tup._1, tup._2 - avg) } + vct.map(tup => (tup._1, tup._2 - avg)) } - implicit def 
literalToScalar[ValT](v : ValT) = new LiteralScalar(v) + implicit def literalToScalar[ValT](v: ValT): LiteralScalar[ValT] = new LiteralScalar(v) // Converts to Matrix for addition - implicit def diagonalToMatrix[RowT,ValT](diag : DiagonalMatrix[RowT,ValT]) : Matrix[RowT,RowT,ValT] = { + implicit def diagonalToMatrix[RowT, ValT](diag: DiagonalMatrix[RowT, ValT]): Matrix[RowT, RowT, ValT] = { val colSym = newSymbol(Set(diag.idxSym, diag.valSym), 'col) - val newPipe = diag.pipe.map(diag.idxSym -> colSym) { (x : RowT) => x } - new Matrix[RowT,RowT,ValT](diag.idxSym, colSym, diag.valSym, newPipe, diag.sizeHint) + val newPipe = diag.pipe.map(diag.idxSym -> colSym)((x: RowT) => x) + new Matrix[RowT, RowT, ValT](diag.idxSym, colSym, diag.valSym, newPipe, diag.sizeHint) } } @@ -220,89 +242,97 @@ object Matrix { // common properties. The main common pattern is the desire to write them to sources // without needless duplication of code. trait WrappedPipe { - def fields : Fields - def pipe : Pipe - def writePipe(src : Source, outFields : Fields = Fields.NONE)(implicit fd : FlowDef, mode: Mode) { + def fields: Fields + def pipe: Pipe + def writePipe(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode): Unit = { val toWrite = if (outFields.isNone) pipe else pipe.rename(fields -> outFields) toWrite.write(src) } } -class Matrix[RowT, ColT, ValT] - (val rowSym : Symbol, val colSym : Symbol, val valSym : Symbol, - inPipe : Pipe, val sizeHint : SizeHint = NoClue) - extends WrappedPipe with java.io.Serializable { +class Matrix[RowT, ColT, ValT]( + val rowSym: Symbol, + val colSym: Symbol, + val valSym: Symbol, + inPipe: Pipe, + val sizeHint: SizeHint = NoClue +) extends WrappedPipe + with java.io.Serializable { import Matrix._ import MatrixProduct._ import Dsl.ensureUniqueFields import Dsl.getField - //The access function for inPipe. 
Ensures the right order of: row,col,val - lazy val pipe = inPipe.project(rowSym,colSym,valSym) + // The access function for inPipe. Ensures the right order of: row,col,val + lazy val pipe = inPipe.project(rowSym, colSym, valSym) def fields = rowColValSymbols - def pipeAs(toFields : Fields) = pipe.rename((rowSym,colSym,valSym) -> toFields) + def pipeAs(toFields: Fields) = pipe.rename((rowSym, colSym, valSym) -> toFields) def hasHint = sizeHint != NoClue override def hashCode = inPipe.hashCode - override def equals(that : Any) : Boolean = { - (that != null) && (that.isInstanceOf[Matrix[_,_,_]]) && { - val thatM = that.asInstanceOf[Matrix[RowT,ColT,ValT]] + override def equals(that: Any): Boolean = + (that != null) && (that.isInstanceOf[Matrix[_, _, _]]) && { + val thatM = that.asInstanceOf[Matrix[RowT, ColT, ValT]] (this.rowSym == thatM.rowSym) && (this.colSym == thatM.colSym) && (this.valSym == thatM.valSym) && (this.pipe == thatM.pipe) } - } // Value operations - def mapValues[ValU](fn:(ValT) => ValU)(implicit mon : Monoid[ValU]) : Matrix[RowT,ColT,ValU] = { - val newPipe = pipe.flatMap(valSym -> valSym) { imp : Tuple1[ValT] => //Ensure an arity of 1 - //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. - mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): Matrix[RowT, ColT, ValU] = { + val newPipe = pipe.flatMap(valSym -> valSym) { imp: Tuple1[ValT] => // Ensure an arity of 1 + // This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } - new Matrix[RowT,ColT,ValU](this.rowSym, this.colSym, this.valSym, newPipe, sizeHint) + new Matrix[RowT, ColT, ValU](this.rowSym, this.colSym, this.valSym, newPipe, sizeHint) } - /** like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. - * Note you will only see non-zero elements on the matrix. 
This does not enumerate the zeros + + /** + * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. Note you will only see non-zero elements + * on the matrix. This does not enumerate the zeros */ - def mapWithIndex[ValNew](fn: (ValT,RowT,ColT) => ValNew)(implicit mon: Monoid[ValNew]): - Matrix[RowT,ColT,ValNew] = { - val newPipe = pipe.flatMap(fields -> fields) { imp : (RowT,ColT,ValT) => - mon.nonZeroOption(fn(imp._3, imp._1, imp._2)).map { (imp._1, imp._2, _) } + def mapWithIndex[ValNew]( + fn: (ValT, RowT, ColT) => ValNew + )(implicit mon: Monoid[ValNew]): Matrix[RowT, ColT, ValNew] = { + val newPipe = pipe.flatMap(fields -> fields) { imp: (RowT, ColT, ValT) => + mon.nonZeroOption(fn(imp._3, imp._1, imp._2)).map((imp._1, imp._2, _)) } - new Matrix[RowT,ColT,ValNew](rowSym, colSym, valSym, newPipe, sizeHint) + new Matrix[RowT, ColT, ValNew](rowSym, colSym, valSym, newPipe, sizeHint) } // Filter values - def filterValues(fn : (ValT) => Boolean) : Matrix[RowT,ColT,ValT] = { - val newPipe = pipe.filter(valSym) { imp : Tuple1[ValT] => //Ensure an arity of 1 - //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. + def filterValues(fn: (ValT) => Boolean): Matrix[RowT, ColT, ValT] = { + val newPipe = pipe.filter(valSym) { imp: Tuple1[ValT] => // Ensure an arity of 1 + // This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. 
fn(imp._1) } - new Matrix[RowT,ColT,ValT](this.rowSym, this.colSym, this.valSym, newPipe, sizeHint) + new Matrix[RowT, ColT, ValT](this.rowSym, this.colSym, this.valSym, newPipe, sizeHint) } // Binarize values, all x != 0 become 1 - def binarizeAs[NewValT](implicit mon : Monoid[ValT], ring : Ring[NewValT]) : Matrix[RowT,ColT,NewValT] = { - mapValues( x => if ( mon.isNonZero(x) ) { ring.one } else { ring.zero } )(ring) - } + def binarizeAs[NewValT](implicit mon: Monoid[ValT], ring: Ring[NewValT]): Matrix[RowT, ColT, NewValT] = + mapValues(x => + if (mon.isNonZero(x)) { ring.one } + else { ring.zero } + )(ring) // Row Operations // Get a specific row - def getRow (index : RowT) : RowVector[ColT,ValT] = { + def getRow(index: RowT): RowVector[ColT, ValT] = { val newPipe = inPipe - .filter(rowSym){ input : RowT => input == index } - .project(colSym,valSym) + .filter(rowSym) { input: RowT => input == index } + .project(colSym, valSym) val newHint = sizeHint.setRows(1L) - new RowVector[ColT,ValT](colSym,valSym,newPipe,newHint) + new RowVector[ColT, ValT](colSym, valSym, newPipe, newHint) } // Reduce all rows to a single row (zeros or ignored) - def reduceRowVectors(fn: (ValT,ValT) => ValT)(implicit mon : Monoid[ValT]) : RowVector[ColT,ValT] = { + def reduceRowVectors(fn: (ValT, ValT) => ValT)(implicit mon: Monoid[ValT]): RowVector[ColT, ValT] = { val newPipe = filterOutZeros(valSym, mon) { pipe.groupBy(colSym) { - _.reduce(valSym) { (x : Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(fn(x._1,y._1)) } + _.reduce(valSym)((x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(fn(x._1, y._1))) // Matrices are generally huge and cascading has problems with diverse key spaces and // mapside operations // TODO continually evaluate if this is needed to avoid OOM @@ -311,322 +341,302 @@ class Matrix[RowT, ColT, ValT] } } val newHint = sizeHint.setRows(1L) - new RowVector[ColT,ValT](colSym,valSym,newPipe,newHint) + new RowVector[ColT, ValT](colSym, valSym, newPipe, newHint) } // Sums all the rows 
per column - def sumRowVectors(implicit mon : Monoid[ValT]) : RowVector[ColT,ValT] = { - this.reduceRowVectors((x,y) => mon.plus(x,y)) - } + def sumRowVectors(implicit mon: Monoid[ValT]): RowVector[ColT, ValT] = + this.reduceRowVectors((x, y) => mon.plus(x, y)) // Maps rows using a per-row mapping function // Use this for non-decomposable vector processing functions // and with vectors that can fit in one-single machine memory - def mapRows (fn: Iterable[(ColT,ValT)] => Iterable[(ColT,ValT)])(implicit mon : Monoid[ValT]) - : Matrix[RowT,ColT,ValT] = { + def mapRows( + fn: Iterable[(ColT, ValT)] => Iterable[(ColT, ValT)] + )(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = { val newListSym = Symbol(colSym.name + "_" + valSym.name + "_list") // TODO, I think we can count the rows/cols for free here val newPipe = filterOutZeros(valSym, mon) { - pipe.groupBy(rowSym) { - _.toList[(ColT,ValT)]((colSym,valSym) -> newListSym) - } - .flatMapTo( (rowSym, newListSym) -> (rowSym,colSym,valSym) ) { tup : (RowT,List[(ColT,ValT)]) => + pipe + .groupBy(rowSym) { + _.toList[(ColT, ValT)]((colSym, valSym) -> newListSym) + } + .flatMapTo((rowSym, newListSym) -> (rowSym, colSym, valSym)) { tup: (RowT, List[(ColT, ValT)]) => val row = tup._1 val list = fn(tup._2) // Now flatten out to (row, col, val): - list.map{ imp : (ColT,ValT) => (row,imp._1,imp._2) } - } + list.map { imp: (ColT, ValT) => (row, imp._1, imp._2) } + } } - new Matrix[RowT,ColT,ValT](rowSym, colSym, valSym, newPipe, sizeHint) + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, newPipe, sizeHint) } - - def topRowElems( k : Int )(implicit ord : Ordering[ValT]) : Matrix[RowT,ColT,ValT] = { + def topRowElems(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = if (k < 1000) { topRowWithTiny(k) - } - else { - val newPipe = pipe.groupBy(rowSym){ _ - .sortBy(valSym) - .reverse - .take(k) + } else { + val newPipe = pipe + .groupBy(rowSym) { + _.sortBy(valSym).reverse + .take(k) } - 
.project(rowSym,colSym,valSym) - new Matrix[RowT,ColT,ValT](rowSym, colSym, valSym, newPipe, FiniteHint(-1L,k)) + .project(rowSym, colSym, valSym) + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, newPipe, FiniteHint(-1L, k)) } - } - protected def topRowWithTiny( k : Int )(implicit ord : Ordering[ValT]) : Matrix[RowT,ColT,ValT] = { + protected def topRowWithTiny(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = { val topSym = Symbol(colSym.name + "_topK") - val newPipe = pipe.groupBy(rowSym){ _ - .sortWithTake( (colSym, valSym) -> 'top_vals, k ) ( (t0 :(ColT,ValT), t1:(ColT,ValT)) => ord.gt(t0._2,t1._2) ) - } - .flatMapTo((0,1) ->(rowSym,topSym,valSym)) { imp:(RowT,List[(ColT,ValT)]) => - val row = imp._1 - val list = imp._2 - list.map{ imp : (ColT,ValT) => (row,imp._1,imp._2) } - } - new Matrix[RowT,ColT,ValT](rowSym, topSym, valSym, newPipe, FiniteHint(-1L,k)) + val newPipe = pipe + .groupBy(rowSym) { + _.sortWithTake((colSym, valSym) -> 'top_vals, k)((t0: (ColT, ValT), t1: (ColT, ValT)) => + ord.gt(t0._2, t1._2) + ) + } + .flatMapTo((0, 1) -> (rowSym, topSym, valSym)) { imp: (RowT, List[(ColT, ValT)]) => + val row = imp._1 + val list = imp._2 + list.map { imp: (ColT, ValT) => (row, imp._1, imp._2) } + } + new Matrix[RowT, ColT, ValT](rowSym, topSym, valSym, newPipe, FiniteHint(-1L, k)) } protected lazy val rowL0Norm = { - val matD = this.asInstanceOf[Matrix[RowT,ColT,Double]] - (matD.mapValues { x => 1.0 } - .sumColVectors - .diag - .inverse) * matD + val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] + (matD.mapValues(x => 1.0).sumColVectors.diag.inverse) * matD } - def rowL0Normalize(implicit ev : =:=[ValT,Double]) : Matrix[RowT,ColT,Double] = rowL0Norm + def rowL0Normalize(implicit ev: =:=[ValT, Double]): Matrix[RowT, ColT, Double] = rowL0Norm protected lazy val rowL1Norm = { - val matD = this.asInstanceOf[Matrix[RowT,ColT,Double]] - (matD.mapValues { x => x.abs } - .sumColVectors - .diag - .inverse) * matD + val matD = 
this.asInstanceOf[Matrix[RowT, ColT, Double]] + (matD.mapValues(x => x.abs).sumColVectors.diag.inverse) * matD } // Row L1 normalization, only makes sense for Doubles // At the end of L1 normalization, sum of row values is one - def rowL1Normalize(implicit ev : =:=[ValT,Double]) : Matrix[RowT,ColT,Double] = rowL1Norm + def rowL1Normalize(implicit ev: =:=[ValT, Double]): Matrix[RowT, ColT, Double] = rowL1Norm protected lazy val rowL2Norm = { - val matD = this.asInstanceOf[Matrix[RowT,ColT,Double]] - (matD.mapValues { x => x*x } + val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] + (matD + .mapValues(x => x * x) .sumColVectors .diag - .mapValues { x => scala.math.sqrt(x) } + .mapValues(x => scala.math.sqrt(x)) .diagonal .inverse) * matD } // Row L2 normalization (can only be called for Double) // After this operation, the sum(|x|^2) along each row will be 1. - def rowL2Normalize(implicit ev : =:=[ValT,Double]) : Matrix[RowT,ColT,Double] = rowL2Norm + def rowL2Normalize(implicit ev: =:=[ValT, Double]): Matrix[RowT, ColT, Double] = rowL2Norm // Remove the mean of each row from each value in a row. // Double ValT only (only over the observed values, not dividing by the unobserved ones) - def rowMeanCentering(implicit ev : =:=[ValT,Double]) = { - val matD = this.asInstanceOf[Matrix[RowT,ColT,Double]] - matD.mapRows { Matrix.meanCenter } + def rowMeanCentering(implicit ev: =:=[ValT, Double]) = { + val matD = this.asInstanceOf[Matrix[RowT, ColT, Double]] + matD.mapRows(Matrix.meanCenter) } - // Row non-zeroes, ave and standard deviation in one pass - Double ValT only // It produces a matrix with the same number of rows, but the cols are the three moments. 
// (moments are computed only over the observed values, not taking into account the unobserved ones) - def rowSizeAveStdev(implicit ev : =:=[ValT,Double]) = { + def rowSizeAveStdev(implicit ev: =:=[ValT, Double]) = { val newColSym = Symbol(colSym.name + "_newCol") val newValSym = Symbol(valSym.name + "_newVal") val newPipe = inPipe - .groupBy(rowSym) { _.sizeAveStdev((valSym)->('size,'ave,'stdev)) } - .flatMapTo( (rowSym,'size,'ave,'stdev) -> (rowSym,newColSym,newValSym) ) { tup : (RowT,Long,Double,Double) => + .groupBy(rowSym)(_.sizeAveStdev(valSym -> ('size, 'ave, 'stdev))) + .flatMapTo((rowSym, 'size, 'ave, 'stdev) -> (rowSym, newColSym, newValSym)) { + tup: (RowT, Long, Double, Double) => val row = tup._1 val size = tup._2.toDouble val avg = tup._3 val stdev = tup._4 - List((row,1,size),(row,2,avg),(row,3,stdev)) + List((row, 1, size), (row, 2, avg), (row, 3, stdev)) } val newHint = sizeHint.setCols(3L) - new Matrix[RowT,Int,Double](rowSym, newColSym, newValSym, newPipe, newHint) + new Matrix[RowT, Int, Double](rowSym, newColSym, newValSym, newPipe, newHint) } - def rowColValSymbols : Fields = (rowSym, colSym, valSym) + def rowColValSymbols: Fields = (rowSym, colSym, valSym) // Column operations - see Row operations above - def getCol (index : ColT) : ColVector[RowT,ValT] = { + def getCol(index: ColT): ColVector[RowT, ValT] = this.transpose.getRow(index).transpose - } - def reduceColVectors (fn: (ValT,ValT) => ValT)( implicit mon: Monoid[ValT] ) : ColVector[RowT,ValT] = { + def reduceColVectors(fn: (ValT, ValT) => ValT)(implicit mon: Monoid[ValT]): ColVector[RowT, ValT] = this.transpose.reduceRowVectors(fn)(mon).transpose - } - def sumColVectors( implicit mon : Monoid[ValT] ) : ColVector[RowT,ValT] = { + def sumColVectors(implicit mon: Monoid[ValT]): ColVector[RowT, ValT] = this.transpose.sumRowVectors(mon).transpose - } - def mapCols(fn: Iterable[(RowT,ValT)] => Iterable[(RowT,ValT)])( implicit mon : Monoid[ValT] ) : Matrix[RowT,ColT,ValT] = { + def 
mapCols(fn: Iterable[(RowT, ValT)] => Iterable[(RowT, ValT)])(implicit + mon: Monoid[ValT] + ): Matrix[RowT, ColT, ValT] = this.transpose.mapRows(fn)(mon).transpose - } - def topColElems( k : Int )(implicit ord : Ordering[ValT]) : Matrix[RowT,ColT,ValT] = { + def topColElems(k: Int)(implicit ord: Ordering[ValT]): Matrix[RowT, ColT, ValT] = this.transpose.topRowElems(k)(ord).transpose - } - - def colL0Normalize(implicit ev : =:=[ValT,Double]) = { + def colL0Normalize(implicit ev: =:=[ValT, Double]) = this.transpose.rowL0Normalize.transpose - } - def colL1Normalize(implicit ev : =:=[ValT,Double]) = { + def colL1Normalize(implicit ev: =:=[ValT, Double]) = this.transpose.rowL1Normalize.transpose - } - def colL2Normalize(implicit ev : =:=[ValT,Double]) = { + def colL2Normalize(implicit ev: =:=[ValT, Double]) = this.transpose.rowL2Normalize.transpose - } - def colMeanCentering(implicit ev : =:=[ValT,Double]) = { + def colMeanCentering(implicit ev: =:=[ValT, Double]) = this.transpose.rowMeanCentering.transpose - } - def colSizeAveStdev(implicit ev : =:=[ValT,Double]) = { + def colSizeAveStdev(implicit ev: =:=[ValT, Double]) = this.transpose.rowSizeAveStdev - } - def *[That,Res](that : That)(implicit prod : MatrixProduct[Matrix[RowT,ColT,ValT],That,Res]) : Res = { + def *[That, Res](that: That)(implicit prod: MatrixProduct[Matrix[RowT, ColT, ValT], That, Res]): Res = prod(this, that) - } - def /(that : LiteralScalar[ValT])(implicit field : Field[ValT]) = { + def /(that: LiteralScalar[ValT])(implicit field: Field[ValT]) = { field.assertNotZero(that.value) - mapValues(elem => field.div(elem, that.value))(field) + mapValues(elem => field.div(elem, that.value)) } - def /(that : Scalar[ValT])(implicit field : Field[ValT]) = { + def /(that: Scalar[ValT])(implicit field: Field[ValT]) = nonZerosWith(that) - .mapValues({leftRight : (ValT,ValT) => + .mapValues { leftRight: (ValT, ValT) => val (left, right) = leftRight field.div(left, right) - })(field) - } + } // Between Matrix 
value reduction - Generalizes matrix addition with an arbitrary value aggregation function // It assumes that the function fn(0,0) = 0 // This function assumes only one value in each matrix for a given row and column index. (no stacking of operations yet) // TODO: Optimize this later and be lazy on groups and joins. - def elemWiseOp(that : Matrix[RowT,ColT,ValT])(fn : (ValT,ValT) => ValT)(implicit mon : Monoid[ValT]) - : Matrix[RowT,ColT,ValT] = { + def elemWiseOp(that: Matrix[RowT, ColT, ValT])(fn: (ValT, ValT) => ValT)(implicit + mon: Monoid[ValT] + ): Matrix[RowT, ColT, ValT] = // If the following is not true, it's not clear this is meaningful // assert(mon.isZero(fn(mon.zero,mon.zero)), "f is illdefined") - zip(that).mapValues({ pair => fn(pair._1, pair._2) })(mon) - } + zip(that).mapValues(pair => fn(pair._1, pair._2))(mon) // Matrix summation - def +(that : Matrix[RowT,ColT,ValT])(implicit mon : Monoid[ValT]) : Matrix[RowT,ColT,ValT] = { + def +(that: Matrix[RowT, ColT, ValT])(implicit mon: Monoid[ValT]): Matrix[RowT, ColT, ValT] = if (equals(that)) { // No need to do any groupBy operation - mapValues { v => mon.plus(v,v) }(mon) - } - else { - elemWiseOp(that)((x,y) => mon.plus(x,y))(mon) + mapValues(v => mon.plus(v, v))(mon) + } else { + elemWiseOp(that)((x, y) => mon.plus(x, y))(mon) } - } // Matrix difference - def -(that : Matrix[RowT,ColT,ValT])(implicit grp : Group[ValT]) : Matrix[RowT,ColT,ValT] = { - elemWiseOp(that)((x,y) => grp.minus(x,y))(grp) - } + def -(that: Matrix[RowT, ColT, ValT])(implicit grp: Group[ValT]): Matrix[RowT, ColT, ValT] = + elemWiseOp(that)((x, y) => grp.minus(x, y))(grp) // Matrix elementwise product / Hadamard product // see http://en.wikipedia.org/wiki/Hadamard_product_(matrices) - def hProd(mat: Matrix[RowT,ColT,ValT])(implicit ring : Ring[ValT]) : Matrix[RowT,ColT,ValT] = { - elemWiseOp(mat)((x,y) => ring.times(x,y))(ring) - } + def hProd(mat: Matrix[RowT, ColT, ValT])(implicit ring: Ring[ValT]): Matrix[RowT, ColT, ValT] = + 
elemWiseOp(mat)((x, y) => ring.times(x, y))(ring) - /** Considering the matrix as a graph, propagate the column: - * Does the calculation: \sum_{j where M(i,j) == true) c_j + /** + * Considering the matrix as a graph, propagate the column: Does the calculation: \sum_{j where M(i,j) == + * true) c_j */ - def propagate[ColValT](vec: ColVector[ColT,ColValT])(implicit ev: =:=[ValT,Boolean], monT: Monoid[ColValT]) - : ColVector[RowT,ColValT] = { - //This cast will always succeed: - val boolMat = this.asInstanceOf[Matrix[RowT,ColT,Boolean]] - boolMat.zip(vec.transpose) - .mapValues { boolT => if (boolT._1) boolT._2 else monT.zero } - .sumColVectors + def propagate[ColValT]( + vec: ColVector[ColT, ColValT] + )(implicit ev: =:=[ValT, Boolean], monT: Monoid[ColValT]): ColVector[RowT, ColValT] = { + // This cast will always succeed: + val boolMat = this.asInstanceOf[Matrix[RowT, ColT, Boolean]] + boolMat.zip(vec.transpose).mapValues(boolT => if (boolT._1) boolT._2 else monT.zero).sumColVectors } // Compute the sum of the main diagonal. 
Only makes sense cases where the row and col type are // equal - def trace(implicit mon : Monoid[ValT], ev : =:=[RowT,ColT]) : Scalar[ValT] = { + def trace(implicit mon: Monoid[ValT], ev: =:=[RowT, ColT]): Scalar[ValT] = diagonal.trace(mon) - } // Compute the sum of all the elements in the matrix - def sum(implicit mon : Monoid[ValT]) : Scalar[ValT] = { + def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = sumRowVectors.sum - } - def transpose : Matrix[ColT, RowT, ValT] = { - new Matrix[ColT,RowT,ValT](colSym, rowSym, valSym, inPipe, sizeHint.transpose) - } + def transpose: Matrix[ColT, RowT, ValT] = + new Matrix[ColT, RowT, ValT](colSym, rowSym, valSym, inPipe, sizeHint.transpose) // This should only be called by def diagonal, which verifies that RowT == ColT - protected lazy val mainDiagonal : DiagonalMatrix[RowT,ValT] = { - val diagPipe = pipe.filter(rowSym, colSym) { input : (RowT, RowT) => + protected lazy val mainDiagonal: DiagonalMatrix[RowT, ValT] = { + val diagPipe = pipe + .filter(rowSym, colSym) { input: (RowT, RowT) => (input._1 == input._2) } .project(rowSym, valSym) - new DiagonalMatrix[RowT,ValT](rowSym, valSym, diagPipe, SizeHint.asDiagonal(sizeHint)) + new DiagonalMatrix[RowT, ValT](rowSym, valSym, diagPipe, SizeHint.asDiagonal(sizeHint)) } // This method will only work if the row type and column type are the same // the type constraint below means there is evidence that RowT and ColT are // the same type - def diagonal(implicit ev : =:=[RowT,ColT]) = mainDiagonal + def diagonal(implicit ev: =:=[RowT, ColT]) = mainDiagonal /* * This just removes zeros after the join inside a zip */ - private def cleanUpZipJoin[ValU](otherVSym : Fields, pairMonoid : Monoid[(ValT,ValU)])(joinedPipe : Pipe) - : Pipe = { + private def cleanUpZipJoin[ValU](otherVSym: Fields, pairMonoid: Monoid[(ValT, ValU)])( + joinedPipe: Pipe + ): Pipe = joinedPipe - //Make sure the zeros are set correctly: - .map(valSym -> valSym) { (x : ValT) => + // Make sure the zeros are set 
correctly: + .map(valSym -> valSym) { (x: ValT) => if (null == x) pairMonoid.zero._1 else x } - .map(otherVSym -> otherVSym) { (x : ValU) => + .map(otherVSym -> otherVSym) { (x: ValU) => if (null == x) pairMonoid.zero._2 else x } - //Put the pair into a single item, ugly in scalding sadly... - .map(valSym.append(otherVSym) -> valSym) { tup : (ValT,ValU) => Tuple1(tup) } + // Put the pair into a single item, ugly in scalding sadly... + .map(valSym.append(otherVSym) -> valSym) { tup: (ValT, ValU) => Tuple1(tup) } .project(rowColValSymbols) - } /* * This ensures both side rows and columns have correct indexes (fills in nulls from the other side * in the case of outerjoins) */ - private def cleanUpIndexZipJoin(fields : Fields, joinedPipe : RichPipe) - : Pipe = { + private def cleanUpIndexZipJoin(fields: Fields, joinedPipe: RichPipe): Pipe = { - def anyRefOr( tup : (AnyRef, AnyRef)) : (AnyRef, AnyRef) = { - val newRef = Option(tup._1).getOrElse(tup._2) - (newRef, newRef) - } + def anyRefOr(tup: (AnyRef, AnyRef)): (AnyRef, AnyRef) = { + val newRef = Option(tup._1).getOrElse(tup._2) + (newRef, newRef) + } - joinedPipe - .map(fields -> fields) { tup : (AnyRef, AnyRef) => anyRefOr(tup) } + joinedPipe + .map(fields -> fields) { tup: (AnyRef, AnyRef) => anyRefOr(tup) } } // Similar to zip, but combine the scalar on the right with all non-zeros in this matrix: - def nonZerosWith[ValU](that : Scalar[ValU]) : Matrix[RowT,ColT,(ValT,ValU)] = { + def nonZerosWith[ValU](that: Scalar[ValU]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, that.valSym, that.pipe) - val newPipe = inPipe.crossWithTiny(newRPipe) - .map(valSym.append(getField(newRFields, 0)) -> valSym) { leftRight : (ValT, ValU) => Tuple1(leftRight) } + val newPipe = inPipe + .crossWithTiny(newRPipe) + .map(valSym.append(getField(newRFields, 0)) -> valSym) { leftRight: (ValT, ValU) => Tuple1(leftRight) } .project(rowColValSymbols) - new 
Matrix[RowT,ColT,(ValT,ValU)](rowSym, colSym, valSym, newPipe, sizeHint) + new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, newPipe, sizeHint) } // Similar to zip, but combine the scalar on the right with all non-zeros in this matrix: - def nonZerosWith[ValU](that : LiteralScalar[ValU]) : Matrix[RowT,ColT,(ValT,ValU)] = { - val newPipe = inPipe.map(valSym -> valSym) { left : Tuple1[ValT] => + def nonZerosWith[ValU](that: LiteralScalar[ValU]): Matrix[RowT, ColT, (ValT, ValU)] = { + val newPipe = inPipe + .map(valSym -> valSym) { left: Tuple1[ValT] => Tuple1((left._1, that.value)) } .project(rowColValSymbols) - new Matrix[RowT,ColT,(ValT,ValU)](rowSym, colSym, valSym, newPipe, sizeHint) + new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, newPipe, sizeHint) } // Override the size hint - def withSizeHint(sh : SizeHint) : Matrix[RowT,ColT,ValT] = { - new Matrix[RowT,ColT,ValT](rowSym, colSym, valSym, pipe, sh) - } + def withSizeHint(sh: SizeHint): Matrix[RowT, ColT, ValT] = + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, pipe, sh) // Zip the given row with all the rows of the matrix - def zip[ValU](that : ColVector[RowT,ValU])(implicit pairMonoid : Monoid[(ValT,ValU)]) - : Matrix[RowT,ColT,(ValT,ValU)] = { + def zip[ValU]( + that: ColVector[RowT, ValU] + )(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, (that.rowS, that.valS), that.pipe) // we must do an outer join to preserve zeros on one side or the other. // joinWithTiny can't do outer. 
And since the number @@ -635,13 +645,14 @@ class Matrix[RowT, ColT, ValT] val zipped = cleanUpZipJoin(getField(newRFields, 1), pairMonoid) { pipe .joinWithSmaller(rowSym -> getField(newRFields, 0), newRPipe, new OuterJoin) - .thenDo{ p : RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)),p) } + .thenDo { p: RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)), p) } } - new Matrix[RowT,ColT,(ValT,ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeH) + new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeH) } // Zip the given row with all the rows of the matrix - def zip[ValU](that : RowVector[ColT,ValU])(implicit pairMonoid : Monoid[(ValT,ValU)]) - : Matrix[RowT,ColT,(ValT,ValU)] = { + def zip[ValU]( + that: RowVector[ColT, ValU] + )(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, (that.colS, that.valS), that.pipe) // we must do an outer join to preserve zeros on one side or the other. // joinWithTiny can't do outer. 
And since the number @@ -650,14 +661,15 @@ class Matrix[RowT, ColT, ValT] val zipped = cleanUpZipJoin(getField(newRFields, 1), pairMonoid) { pipe .joinWithSmaller(colSym -> getField(newRFields, 0), newRPipe, new OuterJoin) - .thenDo{ p : RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 0)),p) } + .thenDo { p: RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 0)), p) } } - new Matrix[RowT,ColT,(ValT,ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeH) + new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeH) } // This creates the matrix with pairs for the entries - def zip[ValU](that : Matrix[RowT,ColT,ValU])(implicit pairMonoid : Monoid[(ValT,ValU)]) - : Matrix[RowT,ColT,(ValT,ValU)] = { + def zip[ValU]( + that: Matrix[RowT, ColT, ValU] + )(implicit pairMonoid: Monoid[(ValT, ValU)]): Matrix[RowT, ColT, (ValT, ValU)] = { val (newRFields, newRPipe) = ensureUniqueFields(rowColValSymbols, that.rowColValSymbols, that.pipe) // we must do an outer join to preserve zeros on one side or the other. // joinWithTiny can't do outer. 
And since the number @@ -665,238 +677,284 @@ class Matrix[RowT, ColT, ValT] // TODO optimize the number of reducers val zipped = cleanUpZipJoin[ValU](getField(newRFields, 2), pairMonoid) { pipe - .joinWithSmaller((rowSym, colSym) -> - (getField(newRFields, 0).append(getField(newRFields, 1))), - newRPipe, new OuterJoin) - .thenDo{ p : RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields,0)),p) } - .thenDo{ p : RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields,1)),p) } + .joinWithSmaller( + (rowSym, colSym) -> + (getField(newRFields, 0).append(getField(newRFields, 1))), + newRPipe, + new OuterJoin + ) + .thenDo { p: RichPipe => cleanUpIndexZipJoin(rowSym.append(getField(newRFields, 0)), p) } + .thenDo { p: RichPipe => cleanUpIndexZipJoin(colSym.append(getField(newRFields, 1)), p) } } - new Matrix[RowT,ColT,(ValT,ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeHint) + new Matrix[RowT, ColT, (ValT, ValU)](rowSym, colSym, valSym, zipped, sizeHint + that.sizeHint) } - def toBlockMatrix[G](grouping: (RowT) => (G, RowT)) : BlockMatrix[G, RowT, ColT, ValT] = { + def toBlockMatrix[G](grouping: (RowT) => (G, RowT)): BlockMatrix[G, RowT, ColT, ValT] = inPipe.map('row -> ('group, 'row))(grouping).toBlockMatrix(('group, 'row, 'col, 'val)) - } /** * removes any elements in this matrix that also appear in the argument matrix */ - def removeElementsBy[ValU](that : Matrix[RowT,ColT,ValU]) : Matrix[RowT,ColT,ValT] = { + def removeElementsBy[ValU](that: Matrix[RowT, ColT, ValU]): Matrix[RowT, ColT, ValT] = { val filterR = '___filterR___ val filterC = '___filterC___ val filterV = '___filterV___ - val joined = pipe.joinWithSmaller((rowSym, colSym) -> (filterR, filterC), - that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (filterR, filterC, filterV)), new LeftJoin) - val filtered = joined.filter(filterV){ x : ValU => null == x } - new Matrix[RowT,ColT,ValT](rowSym,colSym,valSym, filtered.project(rowSym,colSym,valSym)) + val joined = 
pipe.joinWithSmaller( + (rowSym, colSym) -> (filterR, filterC), + that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (filterR, filterC, filterV)), + new LeftJoin + ) + val filtered = joined.filter(filterV) { x: ValU => null == x } + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, filtered.project(rowSym, colSym, valSym)) } /** * keep only elements in this matrix that also appear in the argument matrix */ - def keepElementsBy[ValU](that : Matrix[RowT,ColT,ValU]) : Matrix[RowT,ColT,ValT] = { + def keepElementsBy[ValU](that: Matrix[RowT, ColT, ValU]): Matrix[RowT, ColT, ValT] = { val keepR = '___keepR___ val keepC = '___keepC___ val keepV = '___keepV___ - val joined = pipe.joinWithSmaller((rowSym, colSym) -> (keepR, keepC), - that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (keepR, keepC, keepV))) - new Matrix[RowT,ColT,ValT](rowSym,colSym,valSym, joined.project(rowSym,colSym,valSym)) + val joined = pipe.joinWithSmaller( + (rowSym, colSym) -> (keepR, keepC), + that.pipe.rename((that.rowSym, that.colSym, that.valSym) -> (keepR, keepC, keepV)) + ) + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, joined.project(rowSym, colSym, valSym)) } - /** * keeps only those rows that are in the joining column */ - def keepRowsBy[ValU](that : ColVector[RowT,ValU]) : Matrix[RowT,ColT,ValT] = { + def keepRowsBy[ValU](that: ColVector[RowT, ValU]): Matrix[RowT, ColT, ValT] = { val index = '____index____ val joined = pipe.joinWithSmaller(rowSym -> index, that.pipe.rename(that.rowS -> index).project(index)) - new Matrix[RowT,ColT,ValT](rowSym,colSym,valSym, joined.project(rowSym,colSym,valSym)) + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, joined.project(rowSym, colSym, valSym)) } /** * keeps only those cols that are in the joining rows */ - def keepColsBy[ValU](that : RowVector[ColT,ValU]) : Matrix[RowT,ColT,ValT] = { + def keepColsBy[ValU](that: RowVector[ColT, ValU]): Matrix[RowT, ColT, ValT] = { val index = '____index____ val joined = 
pipe.joinWithSmaller(colSym -> index, that.pipe.rename(that.colS -> index).project(index)) - new Matrix[RowT,ColT,ValT](rowSym,colSym,valSym, joined.project(rowSym,colSym,valSym)) + new Matrix[RowT, ColT, ValT](rowSym, colSym, valSym, joined.project(rowSym, colSym, valSym)) } /** * removes those rows that are in the joining column */ - def removeRowsBy[ValU](that : ColVector[RowT,ValU]) : Matrix[RowT,ColT,ValT] = { + def removeRowsBy[ValU](that: ColVector[RowT, ValU]): Matrix[RowT, ColT, ValT] = { val index = '____index____ - val joined = pipe.joinWithSmaller(rowSym -> index, that.pipe.rename(that.rowS -> index).project(index), joiner = new LeftJoin) - new Matrix[RowT,ColT,ValT](rowSym,colSym,valSym, joined.filter(index){ x : RowT => null == x } - .project(rowSym,colSym,valSym)) + val joined = pipe.joinWithSmaller( + rowSym -> index, + that.pipe.rename(that.rowS -> index).project(index), + joiner = new LeftJoin + ) + new Matrix[RowT, ColT, ValT]( + rowSym, + colSym, + valSym, + joined + .filter(index) { x: RowT => null == x } + .project(rowSym, colSym, valSym) + ) } /** * removes those cols that are in the joining column */ - def removeColsBy[ValU](that : RowVector[ColT,ValU]) : Matrix[RowT,ColT,ValT] = { + def removeColsBy[ValU](that: RowVector[ColT, ValU]): Matrix[RowT, ColT, ValT] = { val index = '____index____ - val joined = pipe.joinWithSmaller(colSym -> index, that.pipe.rename(that.colS -> index).project(index), joiner = new LeftJoin) - new Matrix[RowT,ColT,ValT](rowSym,colSym,valSym, joined.filter(index){ x : ColT => null == x } - .project(rowSym,colSym,valSym)) + val joined = pipe.joinWithSmaller( + colSym -> index, + that.pipe.rename(that.colS -> index).project(index), + joiner = new LeftJoin + ) + new Matrix[RowT, ColT, ValT]( + rowSym, + colSym, + valSym, + joined + .filter(index) { x: ColT => null == x } + .project(rowSym, colSym, valSym) + ) } - - /** Write the matrix, optionally renaming row,col,val fields to the given fields - * then return this. 
+ /** + * Write the matrix, optionally renaming row,col,val fields to the given fields then return this. */ - def write(src : Source, outFields : Fields = Fields.NONE)(implicit fd : FlowDef, mode: Mode) - : Matrix[RowT,ColT,ValT] = { + def write(src: Source, outFields: Fields = Fields.NONE)(implicit + fd: FlowDef, + mode: Mode + ): Matrix[RowT, ColT, ValT] = { writePipe(src, outFields) this } } -class LiteralScalar[ValT](val value : ValT) extends java.io.Serializable { - def *[That,Res](that : That)(implicit prod : MatrixProduct[LiteralScalar[ValT],That,Res]) : Res - = { prod(this, that) } +class LiteralScalar[ValT](val value: ValT) extends java.io.Serializable { + def *[That, Res](that: That)(implicit prod: MatrixProduct[LiteralScalar[ValT], That, Res]): Res = + prod(this, that) } -class Scalar[ValT](val valSym : Symbol, inPipe : Pipe) extends WrappedPipe with java.io.Serializable { +class Scalar[ValT](val valSym: Symbol, inPipe: Pipe) extends WrappedPipe with java.io.Serializable { def pipe = inPipe def fields = valSym - def *[That,Res](that : That)(implicit prod : MatrixProduct[Scalar[ValT],That,Res]) : Res - = { prod(this, that) } - /** Write the Scalar, optionally renaming val fields to the given fields - * then return this. + def *[That, Res](that: That)(implicit prod: MatrixProduct[Scalar[ValT], That, Res]): Res = prod(this, that) + + /** + * Write the Scalar, optionally renaming val fields to the given fields then return this. 
*/ - def write(src : Source, outFields : Fields = Fields.NONE)(implicit fd : FlowDef, mode: Mode) = { + def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) this } } -class DiagonalMatrix[IdxT,ValT](val idxSym : Symbol, - val valSym : Symbol, inPipe : Pipe, val sizeHint : SizeHint = FiniteHint(1L, -1L)) - extends WrappedPipe with java.io.Serializable { +class DiagonalMatrix[IdxT, ValT]( + val idxSym: Symbol, + val valSym: Symbol, + inPipe: Pipe, + val sizeHint: SizeHint = FiniteHint(1L, -1L) +) extends WrappedPipe + with java.io.Serializable { - def *[That,Res](that : That)(implicit prod : MatrixProduct[DiagonalMatrix[IdxT,ValT],That,Res]) : Res - = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[DiagonalMatrix[IdxT, ValT], That, Res]): Res = + prod(this, that) def pipe = inPipe def fields = (idxSym, valSym) - def trace(implicit mon : Monoid[ValT]) : Scalar[ValT] = { + def trace(implicit mon: Monoid[ValT]): Scalar[ValT] = { val scalarPipe = inPipe.groupAll { - _.reduce(valSym -> valSym) { (left : Tuple1[ValT], right : Tuple1[ValT]) => + _.reduce(valSym -> valSym) { (left: Tuple1[ValT], right: Tuple1[ValT]) => Tuple1(mon.plus(left._1, right._1)) } } new Scalar[ValT](valSym, scalarPipe) } - def toCol : ColVector[IdxT,ValT] = { - new ColVector[IdxT,ValT](idxSym, valSym, inPipe, sizeHint.setRows(1L)) - } - def toRow : RowVector[IdxT,ValT] = { - new RowVector[IdxT,ValT](idxSym, valSym, inPipe, sizeHint.setCols(1L)) - } + def toCol: ColVector[IdxT, ValT] = + new ColVector[IdxT, ValT](idxSym, valSym, inPipe, sizeHint.setRows(1L)) + def toRow: RowVector[IdxT, ValT] = + new RowVector[IdxT, ValT](idxSym, valSym, inPipe, sizeHint.setCols(1L)) // Inverse of this matrix *IGNORING ZEROS* - def inverse(implicit field : Field[ValT]) : DiagonalMatrix[IdxT, ValT] = { - val diagPipe = inPipe.flatMap(valSym -> valSym) { element : ValT => - field.nonZeroOption(element) - .map { 
field.inverse } - } - new DiagonalMatrix[IdxT,ValT](idxSym, valSym, diagPipe, sizeHint) + def inverse(implicit field: Field[ValT]): DiagonalMatrix[IdxT, ValT] = { + val diagPipe = inPipe.flatMap(valSym -> valSym) { element: ValT => + field + .nonZeroOption(element) + .map(field.inverse) + } + new DiagonalMatrix[IdxT, ValT](idxSym, valSym, diagPipe, sizeHint) } // Value operations - def mapValues[ValU](fn:(ValT) => ValU)(implicit mon : Monoid[ValU]) : DiagonalMatrix[IdxT,ValU] = { - val newPipe = pipe.flatMap(valSym -> valSym) { imp : Tuple1[ValT] => // Ensure an arity of 1 - //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. - mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): DiagonalMatrix[IdxT, ValU] = { + val newPipe = pipe.flatMap(valSym -> valSym) { imp: Tuple1[ValT] => // Ensure an arity of 1 + // This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } - new DiagonalMatrix[IdxT,ValU](this.idxSym, this.valSym, newPipe, sizeHint) + new DiagonalMatrix[IdxT, ValU](this.idxSym, this.valSym, newPipe, sizeHint) } - /** Write optionally renaming val fields to the given fields - * then return this. + /** + * Write optionally renaming val fields to the given fields then return this. 
*/ - def write(src : Source, outFields : Fields = Fields.NONE)(implicit fd : FlowDef, mode: Mode) = { + def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) this } } -class RowVector[ColT,ValT] (val colS:Symbol, val valS:Symbol, inPipe: Pipe, val sizeH: SizeHint = FiniteHint(1L, -1L)) - extends java.io.Serializable with WrappedPipe { +class RowVector[ColT, ValT]( + val colS: Symbol, + val valS: Symbol, + inPipe: Pipe, + val sizeH: SizeHint = FiniteHint(1L, -1L) +) extends java.io.Serializable + with WrappedPipe { - def pipe = inPipe.project(colS,valS) - def fields = (colS,valS) + def pipe = inPipe.project(colS, valS) + def fields = (colS, valS) - def *[That,Res](that : That)(implicit prod : MatrixProduct[RowVector[ColT,ValT],That,Res]) : Res - = { prod(this, that) } + def *[That, Res](that: That)(implicit prod: MatrixProduct[RowVector[ColT, ValT], That, Res]): Res = + prod(this, that) - def +(that : RowVector[ColT,ValT])(implicit mon : Monoid[ValT]) = (this.toMatrix(true) + that.toMatrix(true)).getRow(true) + def +(that: RowVector[ColT, ValT])(implicit mon: Monoid[ValT]) = + (this.toMatrix(true) + that.toMatrix(true)).getRow(true) - def -(that : RowVector[ColT,ValT])(implicit group : Group[ValT]) = (this.toMatrix(true) - that.toMatrix(true)).getRow(true) + def -(that: RowVector[ColT, ValT])(implicit group: Group[ValT]) = + (this.toMatrix(true) - that.toMatrix(true)).getRow(true) - def hProd(that: RowVector[ColT,ValT])(implicit ring: Ring[ValT]) : RowVector[ColT,ValT] = (this.transpose hProd that.transpose).transpose + def hProd(that: RowVector[ColT, ValT])(implicit ring: Ring[ValT]): RowVector[ColT, ValT] = + this.transpose.hProd(that.transpose).transpose - def transpose : ColVector[ColT,ValT] = { - new ColVector[ColT,ValT](colS, valS, inPipe, sizeH.transpose) - } + def transpose: ColVector[ColT, ValT] = + new ColVector[ColT, ValT](colS, valS, inPipe, sizeH.transpose) - def diag : 
DiagonalMatrix[ColT,ValT] = { + def diag: DiagonalMatrix[ColT, ValT] = { val newHint = SizeHint.asDiagonal(sizeH.setRowsToCols) - new DiagonalMatrix[ColT,ValT](colS, valS, inPipe, newHint) + new DiagonalMatrix[ColT, ValT](colS, valS, inPipe, newHint) } - /** like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. - * Note you will only see non-zero elements on the vector. This does not enumerate the zeros + /** + * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. Note you will only see non-zero elements + * on the vector. This does not enumerate the zeros */ - def mapWithIndex[ValNew](fn: (ValT,ColT) => ValNew)(implicit mon: Monoid[ValNew]): - RowVector[ColT,ValNew] = { - val newPipe = pipe.mapTo((valS,colS) -> (valS,colS)) { tup: (ValT,ColT) => (fn(tup._1, tup._2), tup._2) } - .filter(valS) { (v: ValNew) => mon.isNonZero(v) } + def mapWithIndex[ValNew]( + fn: (ValT, ColT) => ValNew + )(implicit mon: Monoid[ValNew]): RowVector[ColT, ValNew] = { + val newPipe = pipe + .mapTo((valS, colS) -> (valS, colS)) { tup: (ValT, ColT) => (fn(tup._1, tup._2), tup._2) } + .filter(valS)((v: ValNew) => mon.isNonZero(v)) new RowVector(colS, valS, newPipe, sizeH) } // Value operations - def mapValues[ValU](fn:(ValT) => ValU)(implicit mon : Monoid[ValU]) : RowVector[ColT,ValU] = { - val newPipe = pipe.flatMap(valS -> valS) { imp : Tuple1[ValT] => // Ensure an arity of 1 - //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. - mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): RowVector[ColT, ValU] = { + val newPipe = pipe.flatMap(valS -> valS) { imp: Tuple1[ValT] => // Ensure an arity of 1 + // This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. 
+ mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } - new RowVector[ColT,ValU](this.colS, this.valS, newPipe, sizeH) + new RowVector[ColT, ValU](this.colS, this.valS, newPipe, sizeH) } - /** Do a right-propogation of a row, transpose of Matrix.propagate + /** + * Do a right-propogation of a row, transpose of Matrix.propagate */ - def propagate[MatColT](mat: Matrix[ColT,MatColT,Boolean])(implicit monT: Monoid[ValT]) - : RowVector[MatColT,ValT] = { + def propagate[MatColT](mat: Matrix[ColT, MatColT, Boolean])(implicit + monT: Monoid[ValT] + ): RowVector[MatColT, ValT] = mat.transpose.propagate(this.transpose).transpose - } - def L0Normalize(implicit ev : =:=[ValT,Double]) : RowVector[ColT,ValT] = { + def L0Normalize(implicit ev: =:=[ValT, Double]): RowVector[ColT, ValT] = { val normedMatrix = this.toMatrix(0).rowL0Normalize - new RowVector(normedMatrix.colSym, - normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym)) + new RowVector( + normedMatrix.colSym, + normedMatrix.valSym, + normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym) + ) } - def L1Normalize(implicit ev : =:=[ValT,Double]) : RowVector[ColT,ValT] = { + def L1Normalize(implicit ev: =:=[ValT, Double]): RowVector[ColT, ValT] = { val normedMatrix = this.toMatrix(0).rowL1Normalize - new RowVector(normedMatrix.colSym, - normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym)) + new RowVector( + normedMatrix.colSym, + normedMatrix.valSym, + normedMatrix.pipe.project(normedMatrix.colSym, normedMatrix.valSym) + ) } - def sum(implicit mon : Monoid[ValT]) : Scalar[ValT] = { - val scalarPipe = pipe.groupAll{ _.reduce(valS -> valS) { (left : Tuple1[ValT], right : Tuple1[ValT]) => + def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = { + val scalarPipe = pipe.groupAll { + _.reduce(valS -> valS) { (left: Tuple1[ValT], right: Tuple1[ValT]) => Tuple1(mon.plus(left._1, right._1)) } } new Scalar[ValT](valS, scalarPipe) } - def topElems( k : 
Int )(implicit ord : Ordering[ValT]) : RowVector[ColT,ValT] = { + def topElems(k: Int)(implicit ord: Ordering[ValT]): RowVector[ColT, ValT] = // TODO this should be tunable: if (k < 1000) { topWithTiny(k) } else { @@ -904,165 +962,187 @@ class RowVector[ColT,ValT] (val colS:Symbol, val valS:Symbol, inPipe: Pipe, val val ordValS = new Fields(fieldName) ordValS.setComparator(fieldName, ord) - val newPipe = pipe.groupAll{ _ - .sortBy(ordValS) - .reverse - .take(k) - }.project(colS,valS) - new RowVector[ColT,ValT](colS, valS, newPipe, sizeH.setCols(k).setRows(1L)) + val newPipe = pipe + .groupAll { + _.sortBy(ordValS).reverse + .take(k) + } + .project(colS, valS) + new RowVector[ColT, ValT](colS, valS, newPipe, sizeH.setCols(k).setRows(1L)) } - } - protected def topWithTiny( k : Int )(implicit ord : Ordering[ValT]) : RowVector[ColT,ValT] = { + protected def topWithTiny(k: Int)(implicit ord: Ordering[ValT]): RowVector[ColT, ValT] = { val topSym = Symbol(colS.name + "_topK") - val newPipe = pipe.groupAll{ _ - .sortWithTake( (colS, valS) -> 'top_vals, k ) ( (t0 :(ColT,ValT), t1:(ColT,ValT)) => ord.gt(t0._2,t1._2) ) - } - .flatMap('top_vals ->(topSym, valS)) { imp:List[(ColT,ValT)] => imp } - new RowVector[ColT,ValT](topSym, valS, newPipe, sizeH.setCols(k).setRows(1L)) + val newPipe = pipe + .groupAll { + _.sortWithTake((colS, valS) -> 'top_vals, k)((t0: (ColT, ValT), t1: (ColT, ValT)) => + ord.gt(t0._2, t1._2) + ) + } + .flatMap('top_vals -> (topSym, valS)) { imp: List[(ColT, ValT)] => imp } + new RowVector[ColT, ValT](topSym, valS, newPipe, sizeH.setCols(k).setRows(1L)) } - def toMatrix[RowT](rowId : RowT) : Matrix[RowT,ColT,ValT] = { - val rowSym = newSymbol(Set(colS, valS), 'row) //Matrix.newSymbol(Set(colS, valS), 'row) - val newPipe = inPipe.map(() -> rowSym){ u: Unit => rowId } + def toMatrix[RowT](rowId: RowT): Matrix[RowT, ColT, ValT] = { + val rowSym = newSymbol(Set(colS, valS), 'row) // Matrix.newSymbol(Set(colS, valS), 'row) + val newPipe = inPipe + .map(() -> 
rowSym) { u: Unit => rowId } .project(rowSym, colS, valS) - new Matrix[RowT,ColT,ValT](rowSym, colS, valS, newPipe, sizeH.setRows(1L)) + new Matrix[RowT, ColT, ValT](rowSym, colS, valS, newPipe, sizeH.setRows(1L)) } // Override the size hint - def withColsHint(cols : Long) : RowVector[ColT,ValT] = { - new RowVector[ColT,ValT](colS, valS, pipe, sizeH.setRows(1L).setCols(cols)) - } + def withColsHint(cols: Long): RowVector[ColT, ValT] = + new RowVector[ColT, ValT](colS, valS, pipe, sizeH.setRows(1L).setCols(cols)) - /** Write optionally renaming val fields to the given fields - * then return this. + /** + * Write optionally renaming val fields to the given fields then return this. */ - def write(src : Source, outFields : Fields = Fields.NONE)(implicit fd : FlowDef, mode: Mode) = { + def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) this } } -class ColVector[RowT,ValT] (val rowS:Symbol, val valS:Symbol, inPipe : Pipe, val sizeH: SizeHint = FiniteHint(-1L, 1L)) - extends java.io.Serializable with WrappedPipe { - - def pipe = inPipe.project(rowS,valS) - def fields = (rowS,valS) +class ColVector[RowT, ValT]( + val rowS: Symbol, + val valS: Symbol, + inPipe: Pipe, + val sizeH: SizeHint = FiniteHint(-1L, 1L) +) extends java.io.Serializable + with WrappedPipe { - def *[That,Res](that : That)(implicit prod : MatrixProduct[ColVector[RowT,ValT],That,Res]) : Res - = { prod(this, that) } + def pipe = inPipe.project(rowS, valS) + def fields = (rowS, valS) - def +(that : ColVector[RowT,ValT])(implicit mon : Monoid[ValT]) = (this.toMatrix(true) + that.toMatrix(true)).getCol(true) + def *[That, Res](that: That)(implicit prod: MatrixProduct[ColVector[RowT, ValT], That, Res]): Res = + prod(this, that) - def -(that : ColVector[RowT,ValT])(implicit group : Group[ValT]) = (this.toMatrix(true) - that.toMatrix(true)).getCol(true) + def +(that: ColVector[RowT, ValT])(implicit mon: Monoid[ValT]) = + (this.toMatrix(true) + 
that.toMatrix(true)).getCol(true) - def hProd(that: ColVector[RowT,ValT])(implicit ring: Ring[ValT]) : ColVector[RowT,ValT] = (this.toMatrix(true) hProd that.toMatrix(true)).getCol(true) + def -(that: ColVector[RowT, ValT])(implicit group: Group[ValT]) = + (this.toMatrix(true) - that.toMatrix(true)).getCol(true) + def hProd(that: ColVector[RowT, ValT])(implicit ring: Ring[ValT]): ColVector[RowT, ValT] = + this.toMatrix(true).hProd(that.toMatrix(true)).getCol(true) - def transpose : RowVector[RowT,ValT] = { - new RowVector[RowT,ValT](rowS, valS, inPipe, sizeH.transpose) - } + def transpose: RowVector[RowT, ValT] = + new RowVector[RowT, ValT](rowS, valS, inPipe, sizeH.transpose) - def diag : DiagonalMatrix[RowT,ValT] = { + def diag: DiagonalMatrix[RowT, ValT] = { val newHint = SizeHint.asDiagonal(sizeH.setColsToRows) - new DiagonalMatrix[RowT,ValT](rowS, valS, inPipe, newHint) + new DiagonalMatrix[RowT, ValT](rowS, valS, inPipe, newHint) } - /** like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. - * Note you will only see non-zero elements on the vector. This does not enumerate the zeros + /** + * like zipWithIndex.map but ONLY CHANGES THE VALUE not the index. Note you will only see non-zero elements + * on the vector. This does not enumerate the zeros */ - def mapWithIndex[ValNew](fn: (ValT,RowT) => ValNew)(implicit mon: Monoid[ValNew]): - ColVector[RowT,ValNew] = transpose.mapWithIndex(fn).transpose + def mapWithIndex[ValNew](fn: (ValT, RowT) => ValNew)(implicit + mon: Monoid[ValNew] + ): ColVector[RowT, ValNew] = transpose.mapWithIndex(fn).transpose // Value operations - def mapValues[ValU](fn:(ValT) => ValU)(implicit mon : Monoid[ValU]) : ColVector[RowT,ValU] = { - val newPipe = pipe.flatMap(valS -> valS) { imp : Tuple1[ValT] => // Ensure an arity of 1 - //This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. 
- mon.nonZeroOption(fn(imp._1)).map { Tuple1(_) } + def mapValues[ValU](fn: (ValT) => ValU)(implicit mon: Monoid[ValU]): ColVector[RowT, ValU] = { + val newPipe = pipe.flatMap(valS -> valS) { imp: Tuple1[ValT] => // Ensure an arity of 1 + // This annoying Tuple1 wrapping ensures we can handle ValT that may itself be a Tuple. + mon.nonZeroOption(fn(imp._1)).map(Tuple1(_)) } - new ColVector[RowT,ValU](this.rowS, this.valS, newPipe, sizeH) + new ColVector[RowT, ValU](this.rowS, this.valS, newPipe, sizeH) } - def sum(implicit mon : Monoid[ValT]) : Scalar[ValT] = { - val scalarPipe = pipe.groupAll{ _.reduce(valS -> valS) { (left : Tuple1[ValT], right : Tuple1[ValT]) => + def sum(implicit mon: Monoid[ValT]): Scalar[ValT] = { + val scalarPipe = pipe.groupAll { + _.reduce(valS -> valS) { (left: Tuple1[ValT], right: Tuple1[ValT]) => Tuple1(mon.plus(left._1, right._1)) } } new Scalar[ValT](valS, scalarPipe) } - def L0Normalize(implicit ev : =:=[ValT,Double]) : ColVector[RowT,ValT] = { + def L0Normalize(implicit ev: =:=[ValT, Double]): ColVector[RowT, ValT] = { val normedMatrix = this.toMatrix(0).colL0Normalize - new ColVector(normedMatrix.rowSym, - normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym)) + new ColVector( + normedMatrix.rowSym, + normedMatrix.valSym, + normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym) + ) } - def L1Normalize(implicit ev : =:=[ValT,Double]) : ColVector[RowT,ValT] = { + def L1Normalize(implicit ev: =:=[ValT, Double]): ColVector[RowT, ValT] = { val normedMatrix = this.toMatrix(0).colL1Normalize - new ColVector(normedMatrix.rowSym, - normedMatrix.valSym, - normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym)) + new ColVector( + normedMatrix.rowSym, + normedMatrix.valSym, + normedMatrix.pipe.project(normedMatrix.rowSym, normedMatrix.valSym) + ) } - def topElems( k : Int )(implicit ord : Ordering[ValT]) : ColVector[RowT,ValT] = { + def topElems(k: Int)(implicit ord: 
Ordering[ValT]): ColVector[RowT, ValT] = if (k < 1000) { topWithTiny(k) } else { - val newPipe = pipe.groupAll{ _ - .sortBy(valS) - .reverse - .take(k) - }.project(rowS,valS) - new ColVector[RowT,ValT](rowS, valS, newPipe, sizeH.setCols(1L).setRows(k)) + val newPipe = pipe + .groupAll { + _.sortBy(valS).reverse + .take(k) + } + .project(rowS, valS) + new ColVector[RowT, ValT](rowS, valS, newPipe, sizeH.setCols(1L).setRows(k)) } - } - protected def topWithTiny( k : Int )(implicit ord : Ordering[ValT]) : ColVector[RowT,ValT] = { + protected def topWithTiny(k: Int)(implicit ord: Ordering[ValT]): ColVector[RowT, ValT] = { val topSym = Symbol(rowS.name + "_topK") - val newPipe = pipe.groupAll{ _ - .sortWithTake( (rowS, valS) -> 'top_vals, k ) ( (t0 :(RowT,ValT), t1:(RowT,ValT)) => ord.gt(t0._2,t1._2) ) - } - .flatMap('top_vals ->(topSym, valS)) { imp:List[(RowT,ValT)] => imp } - new ColVector[RowT,ValT](topSym, valS, newPipe, sizeH.setCols(1L).setRows(k)) + val newPipe = pipe + .groupAll { + _.sortWithTake((rowS, valS) -> 'top_vals, k)((t0: (RowT, ValT), t1: (RowT, ValT)) => + ord.gt(t0._2, t1._2) + ) + } + .flatMap('top_vals -> (topSym, valS)) { imp: List[(RowT, ValT)] => imp } + new ColVector[RowT, ValT](topSym, valS, newPipe, sizeH.setCols(1L).setRows(k)) } - def toMatrix[ColT](colIdx : ColT) : Matrix[RowT,ColT,ValT] = { - val colSym = newSymbol(Set(rowS, valS), 'col) //Matrix.newSymbol(Set(rowS, valS), 'col) - val newPipe = inPipe.map(() -> colSym){ u:Unit => colIdx } + def toMatrix[ColT](colIdx: ColT): Matrix[RowT, ColT, ValT] = { + val colSym = newSymbol(Set(rowS, valS), 'col) // Matrix.newSymbol(Set(rowS, valS), 'col) + val newPipe = inPipe + .map(() -> colSym) { u: Unit => colIdx } .project(rowS, colSym, valS) - new Matrix[RowT,ColT,ValT](rowS, colSym, valS, newPipe, sizeH.setCols(1L)) + new Matrix[RowT, ColT, ValT](rowS, colSym, valS, newPipe, sizeH.setCols(1L)) } // Override the size hint - def withRowsHint(rows : Long) : ColVector[RowT,ValT] = { - new 
ColVector[RowT,ValT](rowS, valS, pipe, sizeH.setRows(rows).setCols(1L)) - } + def withRowsHint(rows: Long): ColVector[RowT, ValT] = + new ColVector[RowT, ValT](rowS, valS, pipe, sizeH.setRows(rows).setCols(1L)) - /** Write optionally renaming val fields to the given fields - * then return this. + /** + * Write optionally renaming val fields to the given fields then return this. */ - def write(src : Source, outFields : Fields = Fields.NONE)(implicit fd : FlowDef, mode: Mode) = { + def write(src: Source, outFields: Fields = Fields.NONE)(implicit fd: FlowDef, mode: Mode) = { writePipe(src, outFields) this } } -/** BlockMatrix is 3 dimensional matrix where the rows are grouped - * It is useful for when we want to multiply groups of vectors only between themselves. - * For example, grouping users by countries and calculating products only between users from the same country +/** + * BlockMatrix is 3 dimensional matrix where the rows are grouped It is useful for when we want to multiply + * groups of vectors only between themselves. 
For example, grouping users by countries and calculating + * products only between users from the same country */ -class BlockMatrix[RowT, GroupT, ColT, ValT](private val mat: Matrix[RowT,GroupT,Map[ColT,ValT]]) { - def dotProd[RowT2](that : BlockMatrix[GroupT, RowT2, ColT, ValT]) - (implicit prod : MatrixProduct[Matrix[RowT,GroupT,Map[ColT,ValT]],Matrix[GroupT,RowT2,Map[ColT,ValT]],Matrix[RowT,RowT2,Map[ColT,ValT]]], - mon: Monoid[ValT]) : Matrix[RowT, RowT2, ValT] = { +class BlockMatrix[RowT, GroupT, ColT, ValT](private val mat: Matrix[RowT, GroupT, Map[ColT, ValT]]) { + def dotProd[RowT2](that: BlockMatrix[GroupT, RowT2, ColT, ValT])(implicit + prod: MatrixProduct[ + Matrix[RowT, GroupT, Map[ColT, ValT]], + Matrix[GroupT, RowT2, Map[ColT, ValT]], + Matrix[RowT, RowT2, Map[ColT, ValT]] + ], + mon: Monoid[ValT] + ): Matrix[RowT, RowT2, ValT] = prod(mat, that.mat).mapValues(_.values.foldLeft(mon.zero)(mon.plus)) - } - def transpose : BlockMatrix[GroupT, RowT, ColT, ValT] = { + def transpose: BlockMatrix[GroupT, RowT, ColT, ValT] = new BlockMatrix(mat.transpose) - } - def withSizeHint(hint: SizeHint) = { + def withSizeHint(hint: SizeHint) = new BlockMatrix(mat.withSizeHint(hint)) - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala index 5d06166950..e9e91a5a99 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/MatrixProduct.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics @@ -20,337 +20,354 @@ package com.twitter.scalding.mathematics * Handles the implementation of various versions of MatrixProducts */ -import com.twitter.algebird.{Ring,Monoid,Group,Field} +import com.twitter.algebird.Ring import com.twitter.scalding.RichPipe import com.twitter.scalding.Dsl._ import cascading.pipe.Pipe import cascading.tuple.Fields -import scala.math.Ordering -import scala.annotation.tailrec - -/** Abstracts the approach taken to join the two matrices +/** + * Abstracts the approach taken to join the two matrices */ abstract class MatrixJoiner extends java.io.Serializable { - def apply(left : Pipe, joinFields : (Fields,Fields), right : Pipe) : Pipe + def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe } case object AnyToTiny extends MatrixJoiner { - override def apply(left : Pipe, joinFields : (Fields,Fields), right : Pipe) : Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = RichPipe(left).joinWithTiny(joinFields, right) - } } class BigToSmall(red: Int) extends MatrixJoiner { - override def apply(left : Pipe, joinFields : (Fields,Fields), right : Pipe) : Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = RichPipe(left).joinWithSmaller(joinFields, right, reducers = red) - } } case object TinyToAny extends MatrixJoiner { - override def apply(left : Pipe, joinFields : (Fields,Fields), right : Pipe) : Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = { val reversed = (joinFields._2, joinFields._1) RichPipe(right).joinWithTiny(reversed, left) } } class SmallToBig(red: Int) extends MatrixJoiner { - override def apply(left : Pipe, joinFields : (Fields,Fields), right : Pipe) : Pipe = { + override def apply(left: Pipe, joinFields: (Fields, Fields), right: Pipe): Pipe = RichPipe(left).joinWithLarger(joinFields, right, reducers = red) - } } abstract class 
MatrixCrosser extends java.io.Serializable { - def apply(left: Pipe, right: Pipe) : Pipe + def apply(left: Pipe, right: Pipe): Pipe } case object AnyCrossTiny extends MatrixCrosser { - override def apply(left: Pipe, right: Pipe) : Pipe = { + override def apply(left: Pipe, right: Pipe): Pipe = RichPipe(left).crossWithTiny(right) - } } case object AnyCrossSmall extends MatrixCrosser { - override def apply(left: Pipe, right: Pipe) : Pipe = { + override def apply(left: Pipe, right: Pipe): Pipe = RichPipe(left).crossWithSmaller(right) - } } -trait MatrixProduct[Left,Right,Result] extends java.io.Serializable { - def apply(left : Left, right : Right) : Result +trait MatrixProduct[Left, Right, Result] extends java.io.Serializable { + def apply(left: Left, right: Right): Result } /** - * TODO: Muliplication is the expensive stuff. We need to optimize the methods below: - * This object holds the implicits to handle matrix products between various types + * TODO: Multiplication is the expensive stuff.
We need to optimize the methods below: This object holds the + * implicits to handle matrix products between various types */ object MatrixProduct extends java.io.Serializable { // These are VARS, so you can set them before you start: var maxTinyJoin = 100000L // Bigger than this, and we use joinWithSmaller var maxReducers = 200 - def numOfReducers(hint: SizeHint) = { - hint.total.map { tot => - // + 1L is to make sure there is at least once reducer - (tot / MatrixProduct.maxTinyJoin + 1L).toInt min MatrixProduct.maxReducers - }.getOrElse(-1) - } + def numOfReducers(hint: SizeHint) = + hint.total + .map { tot => + // + 1L is to make sure there is at least one reducer + (tot / MatrixProduct.maxTinyJoin + 1L).toInt.min(MatrixProduct.maxReducers) + } + .getOrElse(-1) - def getJoiner(leftSize : SizeHint, rightSize : SizeHint) : MatrixJoiner = { + def getJoiner(leftSize: SizeHint, rightSize: SizeHint): MatrixJoiner = { val newHint = leftSize * rightSize if (SizeHintOrdering.lteq(leftSize, rightSize)) { // If leftsize is definite: - leftSize.total.map { t => if (t < maxTinyJoin) TinyToAny else new SmallToBig(numOfReducers(newHint)) } + leftSize.total + .map(t => if (t < maxTinyJoin) TinyToAny else new SmallToBig(numOfReducers(newHint))) // Else just assume the right is smaller, but both are unknown: .getOrElse(new BigToSmall(numOfReducers(newHint))) - } - else { + } else { // left > right - rightSize.total.map { rs => - if (rs < maxTinyJoin) AnyToTiny else new BigToSmall(numOfReducers(newHint)) - }.getOrElse(new BigToSmall(numOfReducers(newHint))) + rightSize.total + .map { rs => + if (rs < maxTinyJoin) AnyToTiny else new BigToSmall(numOfReducers(newHint)) + } + .getOrElse(new BigToSmall(numOfReducers(newHint))) } } - def getCrosser(rightSize: SizeHint) : MatrixCrosser = - rightSize.total.map { t => if (t < maxTinyJoin) AnyCrossTiny else AnyCrossSmall } + def getCrosser(rightSize: SizeHint): MatrixCrosser = + rightSize.total + .map(t => if (t < maxTinyJoin) AnyCrossTiny 
else AnyCrossSmall) .getOrElse(AnyCrossSmall) - implicit def literalScalarRightProduct[Row,Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Matrix[Row,Col,ValT],LiteralScalar[ValT],Matrix[Row,Col,ValT]] = - new MatrixProduct[Matrix[Row,Col,ValT],LiteralScalar[ValT],Matrix[Row,Col,ValT]] { - def apply(left : Matrix[Row,Col,ValT], right : LiteralScalar[ValT]) = { - val newPipe = left.pipe.map(left.valSym -> left.valSym) { (v : ValT) => + implicit def literalScalarRightProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[Row, Col, ValT], LiteralScalar[ValT], Matrix[Row, Col, ValT]] = + new MatrixProduct[Matrix[Row, Col, ValT], LiteralScalar[ValT], Matrix[Row, Col, ValT]] { + def apply(left: Matrix[Row, Col, ValT], right: LiteralScalar[ValT]) = { + val newPipe = left.pipe.map(left.valSym -> left.valSym) { (v: ValT) => ring.times(v, right.value) } - new Matrix[Row,Col,ValT](left.rowSym, left.colSym, left.valSym, newPipe, left.sizeHint) + new Matrix[Row, Col, ValT](left.rowSym, left.colSym, left.valSym, newPipe, left.sizeHint) } } - implicit def literalRightProduct[Row,Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Matrix[Row,Col,ValT],ValT,Matrix[Row,Col,ValT]] = - new MatrixProduct[Matrix[Row,Col,ValT],ValT,Matrix[Row,Col,ValT]] { - def apply(left : Matrix[Row,Col,ValT], right : ValT) = { - val newPipe = left.pipe.map(left.valSym -> left.valSym) { (v : ValT) => + implicit def literalRightProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[Row, Col, ValT], ValT, Matrix[Row, Col, ValT]] = + new MatrixProduct[Matrix[Row, Col, ValT], ValT, Matrix[Row, Col, ValT]] { + def apply(left: Matrix[Row, Col, ValT], right: ValT) = { + val newPipe = left.pipe.map(left.valSym -> left.valSym) { (v: ValT) => ring.times(v, right) } - new Matrix[Row,Col,ValT](left.rowSym, left.colSym, left.valSym, newPipe, left.sizeHint) + new Matrix[Row, Col, ValT](left.rowSym, left.colSym, left.valSym, newPipe, left.sizeHint) } } - - 
implicit def literalScalarLeftProduct[Row,Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[LiteralScalar[ValT],Matrix[Row,Col,ValT],Matrix[Row,Col,ValT]] = - new MatrixProduct[LiteralScalar[ValT],Matrix[Row,Col,ValT],Matrix[Row,Col,ValT]] { - def apply( left : LiteralScalar[ValT], right : Matrix[Row,Col,ValT]) = { - val newPipe = right.pipe.map(right.valSym -> right.valSym) { (v : ValT) => + implicit def literalScalarLeftProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] = + new MatrixProduct[LiteralScalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] { + def apply(left: LiteralScalar[ValT], right: Matrix[Row, Col, ValT]) = { + val newPipe = right.pipe.map(right.valSym -> right.valSym) { (v: ValT) => ring.times(left.value, v) } - new Matrix[Row,Col,ValT](right.rowSym, right.colSym, right.valSym, newPipe, right.sizeHint) + new Matrix[Row, Col, ValT](right.rowSym, right.colSym, right.valSym, newPipe, right.sizeHint) } } - implicit def scalarPipeRightProduct[Row,Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Matrix[Row,Col,ValT],Scalar[ValT],Matrix[Row,Col,ValT]] = - new MatrixProduct[Matrix[Row,Col,ValT],Scalar[ValT],Matrix[Row,Col,ValT]] { - def apply(left : Matrix[Row,Col,ValT], right : Scalar[ValT]) = { - left.nonZerosWith(right).mapValues({leftRight => - val (left, right) = leftRight - ring.times(left, right) - })(ring) - } + implicit def scalarPipeRightProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[Row, Col, ValT], Scalar[ValT], Matrix[Row, Col, ValT]] = + new MatrixProduct[Matrix[Row, Col, ValT], Scalar[ValT], Matrix[Row, Col, ValT]] { + def apply(left: Matrix[Row, Col, ValT], right: Scalar[ValT]) = + left + .nonZerosWith(right) + .mapValues { leftRight => + val (left, right) = leftRight + ring.times(left, right) + }(ring) } - implicit def scalarPipeLeftProduct[Row,Col,ValT](implicit ring : Ring[ValT]) : - 
MatrixProduct[Scalar[ValT],Matrix[Row,Col,ValT],Matrix[Row,Col,ValT]] = - new MatrixProduct[Scalar[ValT],Matrix[Row,Col,ValT],Matrix[Row,Col,ValT]] { - def apply(left : Scalar[ValT], right : Matrix[Row,Col,ValT]) = { - right.nonZerosWith(left).mapValues({matScal => - val (matVal, scalarVal) = matScal - ring.times(scalarVal, matVal) - })(ring) - } + implicit def scalarPipeLeftProduct[Row, Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] = + new MatrixProduct[Scalar[ValT], Matrix[Row, Col, ValT], Matrix[Row, Col, ValT]] { + def apply(left: Scalar[ValT], right: Matrix[Row, Col, ValT]) = + right + .nonZerosWith(left) + .mapValues { matScal => + val (matVal, scalarVal) = matScal + ring.times(scalarVal, matVal) + }(ring) } - implicit def scalarRowRightProduct[Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[RowVector[Col,ValT],Scalar[ValT],RowVector[Col,ValT]] = - new MatrixProduct[RowVector[Col,ValT],Scalar[ValT],RowVector[Col,ValT]] { - def apply(left : RowVector[Col,ValT], right : Scalar[ValT]) : RowVector[Col,ValT]= { - val prod = left.toMatrix(0)*right + implicit def scalarRowRightProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[Col, ValT], Scalar[ValT], RowVector[Col, ValT]] = + new MatrixProduct[RowVector[Col, ValT], Scalar[ValT], RowVector[Col, ValT]] { + def apply(left: RowVector[Col, ValT], right: Scalar[ValT]): RowVector[Col, ValT] = { + val prod = left.toMatrix(0) * right - new RowVector[Col,ValT](prod.colSym, prod.valSym, prod.pipe.project(prod.colSym, prod.valSym)) + new RowVector[Col, ValT](prod.colSym, prod.valSym, prod.pipe.project(prod.colSym, prod.valSym)) } } - implicit def scalarRowLeftProduct[Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Scalar[ValT],RowVector[Col,ValT],RowVector[Col,ValT]] = - new MatrixProduct[Scalar[ValT],RowVector[Col,ValT],RowVector[Col,ValT]] { - def apply(left : Scalar[ValT], right : RowVector[Col,ValT]) : 
RowVector[Col,ValT]= { - val prod = (right.transpose.toMatrix(0))*left + implicit def scalarRowLeftProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] = + new MatrixProduct[Scalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] { + def apply(left: Scalar[ValT], right: RowVector[Col, ValT]): RowVector[Col, ValT] = { + val prod = (right.transpose.toMatrix(0)) * left - new RowVector[Col,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new RowVector[Col, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def scalarColRightProduct[Row,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[ColVector[Row,ValT],Scalar[ValT],ColVector[Row,ValT]] = - new MatrixProduct[ColVector[Row,ValT],Scalar[ValT],ColVector[Row,ValT]] { - def apply(left : ColVector[Row,ValT], right : Scalar[ValT]) : ColVector[Row,ValT]= { - val prod = left.toMatrix(0)*right + implicit def scalarColRightProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[ColVector[Row, ValT], Scalar[ValT], ColVector[Row, ValT]] = + new MatrixProduct[ColVector[Row, ValT], Scalar[ValT], ColVector[Row, ValT]] { + def apply(left: ColVector[Row, ValT], right: Scalar[ValT]): ColVector[Row, ValT] = { + val prod = left.toMatrix(0) * right - new ColVector[Row,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new ColVector[Row, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def scalarColLeftProduct[Row,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Scalar[ValT],ColVector[Row,ValT],ColVector[Row,ValT]] = - new MatrixProduct[Scalar[ValT],ColVector[Row,ValT],ColVector[Row,ValT]] { - def apply(left : Scalar[ValT], right : ColVector[Row,ValT]) : ColVector[Row,ValT]= { - val prod = (right.toMatrix(0))*left + implicit def scalarColLeftProduct[Row, ValT](implicit + ring: Ring[ValT] + ): 
MatrixProduct[Scalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] = + new MatrixProduct[Scalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] { + def apply(left: Scalar[ValT], right: ColVector[Row, ValT]): ColVector[Row, ValT] = { + val prod = (right.toMatrix(0)) * left - new ColVector[Row,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new ColVector[Row, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def litScalarRowRightProduct[Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[RowVector[Col,ValT],LiteralScalar[ValT],RowVector[Col,ValT]] = - new MatrixProduct[RowVector[Col,ValT],LiteralScalar[ValT],RowVector[Col,ValT]] { - def apply(left : RowVector[Col,ValT], right : LiteralScalar[ValT]) : RowVector[Col,ValT]= { - val prod = left.toMatrix(0)*right + implicit def litScalarRowRightProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[Col, ValT], LiteralScalar[ValT], RowVector[Col, ValT]] = + new MatrixProduct[RowVector[Col, ValT], LiteralScalar[ValT], RowVector[Col, ValT]] { + def apply(left: RowVector[Col, ValT], right: LiteralScalar[ValT]): RowVector[Col, ValT] = { + val prod = left.toMatrix(0) * right - new RowVector[Col,ValT](prod.colSym, prod.valSym, prod.pipe.project(prod.colSym, prod.valSym)) + new RowVector[Col, ValT](prod.colSym, prod.valSym, prod.pipe.project(prod.colSym, prod.valSym)) } } - implicit def litScalarRowLeftProduct[Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[LiteralScalar[ValT],RowVector[Col,ValT],RowVector[Col,ValT]] = - new MatrixProduct[LiteralScalar[ValT],RowVector[Col,ValT],RowVector[Col,ValT]] { - def apply(left : LiteralScalar[ValT], right : RowVector[Col,ValT]) : RowVector[Col,ValT]= { - val prod = (right.transpose.toMatrix(0))*left + implicit def litScalarRowLeftProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] = + new 
MatrixProduct[LiteralScalar[ValT], RowVector[Col, ValT], RowVector[Col, ValT]] { + def apply(left: LiteralScalar[ValT], right: RowVector[Col, ValT]): RowVector[Col, ValT] = { + val prod = (right.transpose.toMatrix(0)) * left - new RowVector[Col,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new RowVector[Col, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def litScalarColRightProduct[Row,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[ColVector[Row,ValT],LiteralScalar[ValT],ColVector[Row,ValT]] = - new MatrixProduct[ColVector[Row,ValT],LiteralScalar[ValT],ColVector[Row,ValT]] { - def apply(left : ColVector[Row,ValT], right : LiteralScalar[ValT]) : ColVector[Row,ValT]= { - val prod = left.toMatrix(0)*right + implicit def litScalarColRightProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[ColVector[Row, ValT], LiteralScalar[ValT], ColVector[Row, ValT]] = + new MatrixProduct[ColVector[Row, ValT], LiteralScalar[ValT], ColVector[Row, ValT]] { + def apply(left: ColVector[Row, ValT], right: LiteralScalar[ValT]): ColVector[Row, ValT] = { + val prod = left.toMatrix(0) * right - new ColVector[Row,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new ColVector[Row, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def litScalarColLeftProduct[Row,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[LiteralScalar[ValT],ColVector[Row,ValT],ColVector[Row,ValT]] = - new MatrixProduct[LiteralScalar[ValT],ColVector[Row,ValT],ColVector[Row,ValT]] { - def apply(left : LiteralScalar[ValT], right : ColVector[Row,ValT]) : ColVector[Row,ValT]= { - val prod = (right.toMatrix(0))*left + implicit def litScalarColLeftProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], ColVector[Row, ValT], ColVector[Row, ValT]] = + new MatrixProduct[LiteralScalar[ValT], ColVector[Row, ValT], ColVector[Row, 
ValT]] { + def apply(left: LiteralScalar[ValT], right: ColVector[Row, ValT]): ColVector[Row, ValT] = { + val prod = (right.toMatrix(0)) * left - new ColVector[Row,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new ColVector[Row, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def scalarDiagRightProduct[Row,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[DiagonalMatrix[Row,ValT],Scalar[ValT], DiagonalMatrix[Row,ValT]] = - new MatrixProduct[DiagonalMatrix[Row,ValT],Scalar[ValT],DiagonalMatrix[Row,ValT]] { - def apply(left : DiagonalMatrix[Row,ValT], right : Scalar[ValT]) : DiagonalMatrix[Row,ValT]= { - val prod = (left.toCol.toMatrix(0))*right + implicit def scalarDiagRightProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[Row, ValT], Scalar[ValT], DiagonalMatrix[Row, ValT]] = + new MatrixProduct[DiagonalMatrix[Row, ValT], Scalar[ValT], DiagonalMatrix[Row, ValT]] { + def apply(left: DiagonalMatrix[Row, ValT], right: Scalar[ValT]): DiagonalMatrix[Row, ValT] = { + val prod = (left.toCol.toMatrix(0)) * right - new DiagonalMatrix[Row,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new DiagonalMatrix[Row, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def scalarDiagLeftProduct[Row,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Scalar[ValT],DiagonalMatrix[Row,ValT],DiagonalMatrix[Row,ValT]] = - new MatrixProduct[Scalar[ValT],DiagonalMatrix[Row,ValT],DiagonalMatrix[Row,ValT]] { - def apply(left : Scalar[ValT], right : DiagonalMatrix[Row,ValT]) : DiagonalMatrix[Row,ValT]= { - val prod = (right.toCol.toMatrix(0))*left + implicit def scalarDiagLeftProduct[Row, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Scalar[ValT], DiagonalMatrix[Row, ValT], DiagonalMatrix[Row, ValT]] = + new MatrixProduct[Scalar[ValT], DiagonalMatrix[Row, ValT], DiagonalMatrix[Row, ValT]] { + def apply(left: 
Scalar[ValT], right: DiagonalMatrix[Row, ValT]): DiagonalMatrix[Row, ValT] = { + val prod = (right.toCol.toMatrix(0)) * left - new DiagonalMatrix[Row,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new DiagonalMatrix[Row, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - implicit def litScalarDiagRightProduct[Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[DiagonalMatrix[Col,ValT],LiteralScalar[ValT],DiagonalMatrix[Col,ValT]] = - new MatrixProduct[DiagonalMatrix[Col,ValT],LiteralScalar[ValT],DiagonalMatrix[Col,ValT]] { - def apply(left : DiagonalMatrix[Col,ValT], right : LiteralScalar[ValT]) : DiagonalMatrix[Col,ValT]= { - val prod = (left.toRow.toMatrix(0))*right + implicit def litScalarDiagRightProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[Col, ValT], LiteralScalar[ValT], DiagonalMatrix[Col, ValT]] = + new MatrixProduct[DiagonalMatrix[Col, ValT], LiteralScalar[ValT], DiagonalMatrix[Col, ValT]] { + def apply(left: DiagonalMatrix[Col, ValT], right: LiteralScalar[ValT]): DiagonalMatrix[Col, ValT] = { + val prod = (left.toRow.toMatrix(0)) * right - new DiagonalMatrix[Col,ValT](prod.colSym, prod.valSym, prod.pipe.project(prod.colSym, prod.valSym)) + new DiagonalMatrix[Col, ValT](prod.colSym, prod.valSym, prod.pipe.project(prod.colSym, prod.valSym)) } } - implicit def litScalarDiagLeftProduct[Col,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[LiteralScalar[ValT],DiagonalMatrix[Col,ValT],DiagonalMatrix[Col,ValT]] = - new MatrixProduct[LiteralScalar[ValT],DiagonalMatrix[Col,ValT],DiagonalMatrix[Col,ValT]] { - def apply(left : LiteralScalar[ValT], right : DiagonalMatrix[Col,ValT]) : DiagonalMatrix[Col,ValT]= { - val prod = (right.toCol.toMatrix(0))*left + implicit def litScalarDiagLeftProduct[Col, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[LiteralScalar[ValT], DiagonalMatrix[Col, ValT], DiagonalMatrix[Col, ValT]] = + new 
MatrixProduct[LiteralScalar[ValT], DiagonalMatrix[Col, ValT], DiagonalMatrix[Col, ValT]] { + def apply(left: LiteralScalar[ValT], right: DiagonalMatrix[Col, ValT]): DiagonalMatrix[Col, ValT] = { + val prod = (right.toCol.toMatrix(0)) * left - new DiagonalMatrix[Col,ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) + new DiagonalMatrix[Col, ValT](prod.rowSym, prod.valSym, prod.pipe.project(prod.rowSym, prod.valSym)) } } - //TODO: remove in 0.9.0, only here just for compatibility. - def vectorInnerProduct[IdxT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[RowVector[IdxT,ValT],ColVector[IdxT,ValT],Scalar[ValT]] = - rowColProduct(ring) - - implicit def rowColProduct[IdxT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[RowVector[IdxT,ValT],ColVector[IdxT,ValT],Scalar[ValT]] = - new MatrixProduct[RowVector[IdxT,ValT],ColVector[IdxT,ValT],Scalar[ValT]] { - def apply(left : RowVector[IdxT,ValT], right : ColVector[IdxT,ValT]) : Scalar[ValT] = { + // TODO: remove in 0.9.0, only here just for compatibility. 
+ def vectorInnerProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] = + rowColProduct(ring) + + implicit def rowColProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] = + new MatrixProduct[RowVector[IdxT, ValT], ColVector[IdxT, ValT], Scalar[ValT]] { + def apply(left: RowVector[IdxT, ValT], right: ColVector[IdxT, ValT]): Scalar[ValT] = { // Normal matrix multiplication works here, but we need to convert to a Scalar - val prod = (left.toMatrix(0) * right.toMatrix(0)) : Matrix[Int,Int,ValT] + val prod = (left.toMatrix(0) * right.toMatrix(0)): Matrix[Int, Int, ValT] new Scalar[ValT](prod.valSym, prod.pipe.project(prod.valSym)) } } - implicit def rowMatrixProduct[Common, ColR, ValT](implicit ring: Ring[ValT]) : - MatrixProduct[RowVector[Common, ValT], Matrix[Common, ColR, ValT], RowVector[ColR, ValT]] = + implicit def rowMatrixProduct[Common, ColR, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[Common, ValT], Matrix[Common, ColR, ValT], RowVector[ColR, ValT]] = new MatrixProduct[RowVector[Common, ValT], Matrix[Common, ColR, ValT], RowVector[ColR, ValT]] { - def apply(left: RowVector[Common, ValT], right: Matrix[Common, ColR, ValT]) = { + def apply(left: RowVector[Common, ValT], right: Matrix[Common, ColR, ValT]) = (left.toMatrix(true) * right).getRow(true) - } } - implicit def matrixColProduct[RowR, Common, ValT](implicit ring: Ring[ValT]) : - MatrixProduct[Matrix[RowR, Common, ValT], ColVector[Common, ValT], ColVector[RowR, ValT]] = + implicit def matrixColProduct[RowR, Common, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[RowR, Common, ValT], ColVector[Common, ValT], ColVector[RowR, ValT]] = new MatrixProduct[Matrix[RowR, Common, ValT], ColVector[Common, ValT], ColVector[RowR, ValT]] { - def apply(left: Matrix[RowR, Common, ValT], right: ColVector[Common, ValT]) = { - (left * 
right.toMatrix(true)).getCol(true) - } + def apply(left: Matrix[RowR, Common, ValT], right: ColVector[Common, ValT]) = + (left * right.toMatrix(true)).getCol(true) } - implicit def vectorOuterProduct[RowT, ColT, ValT](implicit ring: Ring[ValT]) : - MatrixProduct[ColVector[RowT, ValT], RowVector[ColT, ValT], Matrix[RowT, ColT, ValT]] = + implicit def vectorOuterProduct[RowT, ColT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[ColVector[RowT, ValT], RowVector[ColT, ValT], Matrix[RowT, ColT, ValT]] = new MatrixProduct[ColVector[RowT, ValT], RowVector[ColT, ValT], Matrix[RowT, ColT, ValT]] { - def apply(left: ColVector[RowT, ValT], right: RowVector[ColT, ValT]) : Matrix[RowT, ColT, ValT] = { - val (newRightFields, newRightPipe) = ensureUniqueFields( - (left.rowS,left.valS), - (right.colS, right.valS), - right.pipe - ) + def apply(left: ColVector[RowT, ValT], right: RowVector[ColT, ValT]): Matrix[RowT, ColT, ValT] = { + val (newRightFields, newRightPipe) = + ensureUniqueFields((left.rowS, left.valS), (right.colS, right.valS), right.pipe) val newColSym = Symbol(right.colS.name + "_newCol") - val newHint = left.sizeH * right.sizeH - val productPipe = Matrix.filterOutZeros(left.valS, ring) { - getCrosser(right.sizeH) - .apply(left.pipe, newRightPipe) - .map(left.valS.append(getField(newRightFields,1)) -> left.valS) { pair: (ValT, ValT) => - ring.times(pair._1, pair._2) - } + val newHint = left.sizeH * right.sizeH + val productPipe = Matrix + .filterOutZeros(left.valS, ring) { + getCrosser(right.sizeH) + .apply(left.pipe, newRightPipe) + .map(left.valS.append(getField(newRightFields, 1)) -> left.valS) { pair: (ValT, ValT) => + ring.times(pair._1, pair._2) + } } - .rename(getField(newRightFields,0)->newColSym) - new Matrix[RowT,ColT,ValT](left.rowS, newColSym, left.valS, productPipe, newHint) + .rename(getField(newRightFields, 0) -> newColSym) + new Matrix[RowT, ColT, ValT](left.rowS, newColSym, left.valS, productPipe, newHint) } } - implicit def 
standardMatrixProduct[RowL,Common,ColR,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Matrix[RowL,Common,ValT],Matrix[Common,ColR,ValT],Matrix[RowL,ColR,ValT]] = - new MatrixProduct[Matrix[RowL,Common,ValT],Matrix[Common,ColR,ValT],Matrix[RowL,ColR,ValT]] { - def apply(left : Matrix[RowL,Common,ValT], right : Matrix[Common,ColR,ValT]) = { + implicit def standardMatrixProduct[RowL, Common, ColR, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[RowL, Common, ValT], Matrix[Common, ColR, ValT], Matrix[RowL, ColR, ValT]] = + new MatrixProduct[Matrix[RowL, Common, ValT], Matrix[Common, ColR, ValT], Matrix[RowL, ColR, ValT]] { + def apply(left: Matrix[RowL, Common, ValT], right: Matrix[Common, ColR, ValT]) = { val (newRightFields, newRightPipe) = ensureUniqueFields( - (left.rowSym,left.colSym,left.valSym), + (left.rowSym, left.colSym, left.valSym), (right.rowSym, right.colSym, right.valSym), right.pipe ) @@ -358,33 +375,35 @@ object MatrixProduct extends java.io.Serializable { // Hint of groupBy reducer size val grpReds = numOfReducers(newHint) - val productPipe = Matrix.filterOutZeros(left.valSym, ring) { - getJoiner(left.sizeHint, right.sizeHint) - // TODO: we should use the size hints to set the number of reducers: - .apply(left.pipe, (left.colSym -> getField(newRightFields, 0)), newRightPipe) - // Do the product: - .map((left.valSym.append(getField(newRightFields, 2))) -> left.valSym) { pair : (ValT,ValT) => - ring.times(pair._1, pair._2) - } - .groupBy(left.rowSym.append(getField(newRightFields, 1))) { - // We should use the size hints to set the number of reducers here - _.reduce(left.valSym) { (x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(ring.plus(x._1, y._1)) } - // There is a low chance that many (row,col) keys are co-located, and the keyspace - // is likely huge, just push to reducers - .forceToReducers - .reducers(grpReds) - } + val productPipe = Matrix + .filterOutZeros(left.valSym, ring) { + getJoiner(left.sizeHint, right.sizeHint) + // TODO: 
we should use the size hints to set the number of reducers: + .apply(left.pipe, (left.colSym -> getField(newRightFields, 0)), newRightPipe) + // Do the product: + .map((left.valSym.append(getField(newRightFields, 2))) -> left.valSym) { pair: (ValT, ValT) => + ring.times(pair._1, pair._2) + } + .groupBy(left.rowSym.append(getField(newRightFields, 1))) { + // We should use the size hints to set the number of reducers here + _.reduce(left.valSym)((x: Tuple1[ValT], y: Tuple1[ValT]) => Tuple1(ring.plus(x._1, y._1))) + // There is a low chance that many (row,col) keys are co-located, and the keyspace + // is likely huge, just push to reducers + .forceToReducers + .reducers(grpReds) + } } // Keep the names from the left: .rename(getField(newRightFields, 1) -> left.colSym) - new Matrix[RowL,ColR,ValT](left.rowSym, left.colSym, left.valSym, productPipe, newHint) + new Matrix[RowL, ColR, ValT](left.rowSym, left.colSym, left.valSym, productPipe, newHint) } - } + } - implicit def diagMatrixProduct[RowT,ColT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[DiagonalMatrix[RowT,ValT],Matrix[RowT,ColT,ValT],Matrix[RowT,ColT,ValT]] = - new MatrixProduct[DiagonalMatrix[RowT,ValT],Matrix[RowT,ColT,ValT],Matrix[RowT,ColT,ValT]] { - def apply(left : DiagonalMatrix[RowT,ValT], right : Matrix[RowT,ColT,ValT]) = { + implicit def diagMatrixProduct[RowT, ColT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[RowT, ValT], Matrix[RowT, ColT, ValT], Matrix[RowT, ColT, ValT]] = + new MatrixProduct[DiagonalMatrix[RowT, ValT], Matrix[RowT, ColT, ValT], Matrix[RowT, ColT, ValT]] { + def apply(left: DiagonalMatrix[RowT, ValT], right: Matrix[RowT, ColT, ValT]) = { val (newRightFields, newRightPipe) = ensureUniqueFields( (left.idxSym, left.valSym), (right.rowSym, right.colSym, right.valSym), @@ -396,64 +415,64 @@ object MatrixProduct extends java.io.Serializable { // TODO: we should use the size hints to set the number of reducers: .apply(left.pipe, (left.idxSym -> 
getField(newRightFields, 0)), newRightPipe) // Do the product: - .map((left.valSym.append(getField(newRightFields, 2))) -> getField(newRightFields,2)) { pair : (ValT,ValT) => - ring.times(pair._1, pair._2) + .map((left.valSym.append(getField(newRightFields, 2))) -> getField(newRightFields, 2)) { + pair: (ValT, ValT) => + ring.times(pair._1, pair._2) } // Keep the names from the right: .project(newRightFields) .rename(newRightFields -> (right.rowSym, right.colSym, right.valSym)) - } - new Matrix[RowT,ColT,ValT](right.rowSym, right.colSym, right.valSym, productPipe, newHint) + } + new Matrix[RowT, ColT, ValT](right.rowSym, right.colSym, right.valSym, productPipe, newHint) } } - implicit def matrixDiagProduct[RowT,ColT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[Matrix[RowT,ColT,ValT],DiagonalMatrix[ColT,ValT],Matrix[RowT,ColT,ValT]] = - new MatrixProduct[Matrix[RowT,ColT,ValT],DiagonalMatrix[ColT,ValT],Matrix[RowT,ColT,ValT]] { - def apply(left : Matrix[RowT,ColT,ValT], right : DiagonalMatrix[ColT,ValT]) = { + implicit def matrixDiagProduct[RowT, ColT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[Matrix[RowT, ColT, ValT], DiagonalMatrix[ColT, ValT], Matrix[RowT, ColT, ValT]] = + new MatrixProduct[Matrix[RowT, ColT, ValT], DiagonalMatrix[ColT, ValT], Matrix[RowT, ColT, ValT]] { + def apply(left: Matrix[RowT, ColT, ValT], right: DiagonalMatrix[ColT, ValT]) = // (A * B) = (B^T * A^T)^T // note diagonal^T = diagonal (right * (left.transpose)).transpose - } } - implicit def diagDiagProduct[IdxT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[DiagonalMatrix[IdxT,ValT],DiagonalMatrix[IdxT,ValT],DiagonalMatrix[IdxT,ValT]] = - new MatrixProduct[DiagonalMatrix[IdxT,ValT],DiagonalMatrix[IdxT,ValT],DiagonalMatrix[IdxT,ValT]] { - def apply(left : DiagonalMatrix[IdxT,ValT], right : DiagonalMatrix[IdxT,ValT]) = { - val (newRightFields, newRightPipe) = ensureUniqueFields( - (left.idxSym, left.valSym), - (right.idxSym, right.valSym), - right.pipe - ) + implicit 
def diagDiagProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT]] = + new MatrixProduct[DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT], DiagonalMatrix[IdxT, ValT]] { + def apply(left: DiagonalMatrix[IdxT, ValT], right: DiagonalMatrix[IdxT, ValT]) = { + val (newRightFields, newRightPipe) = + ensureUniqueFields((left.idxSym, left.valSym), (right.idxSym, right.valSym), right.pipe) val newHint = left.sizeHint * right.sizeHint - val productPipe = Matrix.filterOutZeros(left.valSym, ring) { - getJoiner(left.sizeHint, right.sizeHint) - // TODO: we should use the size hints to set the number of reducers: - .apply(left.pipe, (left.idxSym -> getField(newRightFields, 0)), newRightPipe) - // Do the product: - .map((left.valSym.append(getField(newRightFields, 1))) -> left.valSym) { pair : (ValT,ValT) => - ring.times(pair._1, pair._2) - } + val productPipe = Matrix + .filterOutZeros(left.valSym, ring) { + getJoiner(left.sizeHint, right.sizeHint) + // TODO: we should use the size hints to set the number of reducers: + .apply(left.pipe, (left.idxSym -> getField(newRightFields, 0)), newRightPipe) + // Do the product: + .map((left.valSym.append(getField(newRightFields, 1))) -> left.valSym) { pair: (ValT, ValT) => + ring.times(pair._1, pair._2) + } } // Keep the names from the left: .project(left.idxSym, left.valSym) - new DiagonalMatrix[IdxT,ValT](left.idxSym, left.valSym, productPipe, newHint) + new DiagonalMatrix[IdxT, ValT](left.idxSym, left.valSym, productPipe, newHint) } } - implicit def diagColProduct[IdxT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[DiagonalMatrix[IdxT,ValT],ColVector[IdxT,ValT],ColVector[IdxT,ValT]] = - new MatrixProduct[DiagonalMatrix[IdxT,ValT],ColVector[IdxT,ValT],ColVector[IdxT,ValT]] { - def apply(left : DiagonalMatrix[IdxT,ValT], right : ColVector[IdxT,ValT]) = { + implicit def diagColProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): 
MatrixProduct[DiagonalMatrix[IdxT, ValT], ColVector[IdxT, ValT], ColVector[IdxT, ValT]] = + new MatrixProduct[DiagonalMatrix[IdxT, ValT], ColVector[IdxT, ValT], ColVector[IdxT, ValT]] { + def apply(left: DiagonalMatrix[IdxT, ValT], right: ColVector[IdxT, ValT]) = (left * (right.diag)).toCol - } } - implicit def rowDiagProduct[IdxT,ValT](implicit ring : Ring[ValT]) : - MatrixProduct[RowVector[IdxT,ValT],DiagonalMatrix[IdxT,ValT],RowVector[IdxT,ValT]] = - new MatrixProduct[RowVector[IdxT,ValT],DiagonalMatrix[IdxT,ValT],RowVector[IdxT,ValT]] { - def apply(left : RowVector[IdxT,ValT], right : DiagonalMatrix[IdxT,ValT]) = { + implicit def rowDiagProduct[IdxT, ValT](implicit + ring: Ring[ValT] + ): MatrixProduct[RowVector[IdxT, ValT], DiagonalMatrix[IdxT, ValT], RowVector[IdxT, ValT]] = + new MatrixProduct[RowVector[IdxT, ValT], DiagonalMatrix[IdxT, ValT], RowVector[IdxT, ValT]] { + def apply(left: RowVector[IdxT, ValT], right: DiagonalMatrix[IdxT, ValT]) = ((left.diag) * right).toRow - } } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala index 09b47cc508..67f38cccf1 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/Poisson.scala @@ -3,23 +3,22 @@ package com.twitter.scalding.mathematics import scala.util.Random /** - * Generating Poisson-distributed random variables - * according to Donald Knuth's algorith as shown on Wikipedia's - * Poisson Distribution page + * Generating Poisson-distributed random variables according to Donald Knuth's algorithm as shown on + * Wikipedia's Poisson Distribution page */ -class Poisson(fraction : Double, seed : Int) { - +class Poisson(fraction: Double, seed: Int) { + val L = math.exp(-fraction) val randomGenerator = new Random(seed) - + def nextInt = { var k = 0 var p = 1.0 do { k = k + 1 p = p * 
randomGenerator.nextDouble - } while(p > L) + } while (p > L) k - 1 } -} \ No newline at end of file +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala deleted file mode 100644 index 0cb133ffdb..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/SizeHint.scala +++ /dev/null @@ -1,163 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding.mathematics - -object SizeHint { - implicit val ordering = SizeHintOrdering - // Return a sparsity assuming all the diagonal is present, but nothing else - def asDiagonal(h : SizeHint) : SizeHint = { - def make(r : BigInt, c : BigInt) = { - h.total.map { tot => - val maxElements = (r min c) - val sparsity = 1.0 / maxElements.doubleValue - SparseHint(sparsity, maxElements, maxElements) - }.getOrElse(NoClue) - } - h match { - case NoClue => NoClue - case FiniteHint(r,c) => make(r,c) - case SparseHint(sp,r,c) => make(r,c) - } - } -} - -sealed abstract class SizeHint { - def * (other : SizeHint) : SizeHint - def + (other : SizeHint) : SizeHint - // for estimating the Hadamard product - def #*# (other : SizeHint) : SizeHint - def total : Option[BigInt] - def setCols(cols : Long) : SizeHint - def setRows(rows : Long) : SizeHint - def setColsToRows : SizeHint - def setRowsToCols : SizeHint - def transpose : SizeHint -} - -// If we have no idea, we still don't have any idea, this is like NaN -case object NoClue extends SizeHint { - def * (other : SizeHint) = NoClue - def + (other : SizeHint) = NoClue - def #*# (other : SizeHint) = NoClue - def total = None - def setCols(cols : Long) = FiniteHint(-1L, cols) - def setRows(rows : Long) = FiniteHint(rows, -1L) - def setColsToRows = NoClue - def setRowsToCols = NoClue - def transpose = NoClue -} - -case class FiniteHint(rows : BigInt = -1L, cols : BigInt = -1L) extends SizeHint { - def *(other : SizeHint) = { - other match { - case NoClue => NoClue - case FiniteHint(orows, ocols) => FiniteHint(rows, ocols) - case sp@SparseHint(_,_,_) => (SparseHint(1.0, rows, cols) * sp) - } - } - def +(other : SizeHint) = { - other match { - case NoClue => NoClue - // In this case, a hint on one side, will overwrite lack of knowledge (-1L) - case FiniteHint(orows, ocols) => FiniteHint(rows.max(orows), cols.max(ocols)) - case sp@SparseHint(_,_,_) => (sp + this) - } - } - def #*#(other : SizeHint) = { - other 
match { - case NoClue => NoClue - // In this case, a hint on one side, will overwrite lack of knowledge (-1L) - case FiniteHint(orows, ocols) => FiniteHint(rows.min(orows), cols.min(ocols)) - case sp@SparseHint(_,_,_) => (sp #*# this) - } - } - def total = if(rows >= 0 && cols >= 0) { Some(rows * cols) } else None - def setCols(ncols : Long) = FiniteHint(rows, ncols) - def setRows(nrows : Long) = FiniteHint(nrows, cols) - def setColsToRows = FiniteHint(rows, rows) - def setRowsToCols = FiniteHint(cols, cols) - def transpose = FiniteHint(cols, rows) -} - -// sparsity is the fraction of the rows and columns that are expected to be present -case class SparseHint(sparsity : Double, rows : BigInt, cols : BigInt) extends SizeHint { - def * (other : SizeHint) : SizeHint = { - other match { - case NoClue => NoClue - case FiniteHint(r, c) => (this * SparseHint(1.0, r, c)) - case SparseHint(sp,r,c) => { - // if I occupy a bin with probability p, and you q, then both: pq - // There are cols samples of the, above, so the probability one is present: - // 1-(1-pq)^cols ~ (cols * p * q) min 1.0 - val newSp = (BigDecimal(cols) * sp * sparsity) - if(newSp >= 1.0) { - FiniteHint(rows, c) - } - else { - SparseHint(newSp.toDouble, rows, c) - } - } - } - } - def + (other : SizeHint) : SizeHint = { - other match { - case NoClue => NoClue - case FiniteHint(r, c) => (this + SparseHint(1.0, r, c)) - case SparseHint(sp,r,c) => { - // if I occupy a bin with probability p, and you q, then either: p + q - pq - if ((sparsity == 1.0) || (sp == 1.0)) { - FiniteHint(rows max r, cols max c) - } - else { - val newSp = sparsity + sp - sp*sparsity - SparseHint(newSp, rows max r, cols max c) - } - } - } - } - def #*# (other : SizeHint) : SizeHint = { - other match { - case NoClue => NoClue - case FiniteHint(r, c) => (this #*# SparseHint(1.0, r, c)) - case SparseHint(sp,r,c) => { - val newSp = sp min sparsity - SparseHint(newSp, rows min r, cols min c) - } - } - } - def total : Option[BigInt] = { - 
if((rows >= 0) && (cols >= 0)) { - Some((BigDecimal(rows) * BigDecimal(cols) * sparsity).toBigInt) - } - else - None - } - def setCols(c : Long) : SizeHint = copy(cols = c) - def setRows(r : Long) : SizeHint = copy(rows = r) - def setColsToRows : SizeHint = copy(cols = rows) - def setRowsToCols : SizeHint = copy(rows = cols) - def transpose : SizeHint = copy(cols = rows, rows = cols) -} - -/** Allows us to sort matrices by approximate type - */ -object SizeHintOrdering extends Ordering[SizeHint] with java.io.Serializable { - def compare(left : SizeHint, right : SizeHint) : Int = { - left.total.getOrElse(BigInt(-1L)) - .compare(right.total.getOrElse(BigInt(-1L))) - } -} - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala new file mode 100644 index 0000000000..42e343f173 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/mathematics/TypedSimilarity.scala @@ -0,0 +1,319 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.mathematics + +import com.twitter.scalding.typed.{Grouped, TypedPipe, WithReducers} + +import java.io.Serializable + +/** + * Implementation of DISCO and DIMSUM approximation similarity algorithm + * @author + * Oscar Boykin + * @author + * Kevin Lin + */ + +/** + * Represents an Edge in a graph with some edge data + */ +case class Edge[+N, +E](from: N, to: N, data: E) { + def mapData[F](fn: (E => F)): Edge[N, F] = Edge(from, to, fn(data)) + def reverse: Edge[N, E] = Edge(to, from, data) +} + +abstract sealed trait Degree { val degree: Int } +final case class InDegree(override val degree: Int) extends Degree +final case class OutDegree(override val degree: Int) extends Degree +final case class Weight(weight: Double) +final case class L2Norm(norm: Double) + +object GraphOperations extends Serializable { + + /** + * For each N, aggregate all the edges, and attach Edge state + */ + def joinAggregate[N, E, T]( + grouped: Grouped[N, Edge[N, E]] + )(agfn: Iterable[Edge[N, E]] => T): TypedPipe[Edge[N, (E, T)]] = + grouped + .cogroup(grouped) { (to: N, left: Iterator[Edge[N, E]], right: Iterable[Edge[N, E]]) => + val newState = agfn(right) + left.map(_.mapData { e: E => (e, newState) }) + } + .values + + // Returns all Vertices with non-zero in-degree + def withInDegree[N, E]( + g: TypedPipe[Edge[N, E]] + )(implicit ord: Ordering[N]): TypedPipe[Edge[N, (E, InDegree)]] = joinAggregate(g.groupBy(_.to)) { it => + InDegree(it.size) + } + + // Returns all Vertices with non-zero out-degree + def withOutDegree[N, E](g: TypedPipe[Edge[N, E]])(implicit + ord: Ordering[N] + ): TypedPipe[Edge[N, (E, OutDegree)]] = joinAggregate(g.groupBy(_.from)) { it => + OutDegree(it.size) + } + + // Returns all Vertices with weights and non-zero norms + def withInNorm[N, E](g: TypedPipe[Edge[N, Weight]])(implicit + ord: Ordering[N] + ): TypedPipe[Edge[N, (Weight, L2Norm)]] = joinAggregate(g.groupBy(_.to)) { it => + val norm = scala.math.sqrt(it.iterator.map { a 
=> + val x = a.data.weight + x * x + }.sum) + + L2Norm(norm) + } +} + +case class SetSimilarity(intersection: Int, sizeLeft: Int, sizeRight: Int) { + lazy val cosine: Option[Double] = + if (intersection == 0) + Some(0.0) + else { + val denom = scala.math.sqrt(sizeLeft.toDouble * sizeRight.toDouble) + if (denom == 0.0) { + None + } else { + Some(intersection.toDouble / denom) + } + } +} + +trait TypedSimilarity[N, E, S] extends Serializable { + def nodeOrdering: Ordering[N] + + /** + * Given a TypedPipe of edges, and a predicate for a smaller group (smallpred) of nodes and a bigger group + * (bigpred), compute the similarity between each item in the two sets The Edge.from nodes in the result + * will all satisfy smallpred, and the Edge.to will all satisfy bigpred. It is more efficient if you keep + * the smallpred set smaller. + */ + def apply(g: TypedPipe[Edge[N, E]], smallpred: N => Boolean, bigpred: N => Boolean): TypedPipe[Edge[N, S]] + // Do similarity on all the nodes + def apply(g: TypedPipe[Edge[N, E]]): TypedPipe[Edge[N, S]] = { + val always = { n: N => true } + apply(g, always, always) + } +} + +object TypedSimilarity extends Serializable { + private def maybeWithReducers[T <: WithReducers[T]](withReds: T, reds: Option[Int]) = + reds match { + case Some(i) => withReds.withReducers(i) + case None => withReds + } + + // key: document, + // value: (word, documentsWithWord) + // return: Edge of similarity between words measured by documents + def exactSetSimilarity[N: Ordering]( + g: Grouped[N, (N, Int)], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, SetSimilarity]] = + /* E_{ij} = 1 if document -> word exists + * (E^T E)_ij = # of shared documents of i,j + * = \sum_k E_ki E_kj + */ + // First compute (i,j) => E_{ki} E_{kj} + maybeWithReducers( + g.join(g) + .values + .flatMap { case ((node1, deg1), (node2, deg2)) => + if (smallpred(node1) && bigpred(node2)) Some(((node1, node2), (1, deg1, deg2))) else None + } + .group, + 
g.reducers + ) + // Use reduceLeft to push to reducers, no benefit in mapside here + .reduceLeft { (left, right) => + // The degrees we always take the left: + val (leftCnt, deg1, deg2) = left + (leftCnt + right._1, deg1, deg2) + } + .map { case ((node1, node2), (cnt, deg1, deg2)) => + Edge(node1, node2, SetSimilarity(cnt, deg1, deg2)) + } + + /* + * key: document, + * value: (word, documentsWithWord) + * return: Edge of similarity between words measured by documents + * See: https://arxiv.org/pdf/1206.2082v2.pdf + */ + def discoCosineSimilarity[N: Ordering]( + smallG: Grouped[N, (N, Int)], + bigG: Grouped[N, (N, Int)], + oversample: Double + ): TypedPipe[Edge[N, Double]] = { + // 1) make rnd lazy due to serialization, + // 2) fix seed so that map-reduce speculative execution does not give inconsistent results. + lazy val rnd = new scala.util.Random(1024) + maybeWithReducers( + smallG + .cogroup(bigG) { (n: N, leftit: Iterator[(N, Int)], rightit: Iterable[(N, Int)]) => + // Use a co-group to ensure this happens in the reducer: + leftit.flatMap { case (node1, deg1) => + rightit.iterator.flatMap { case (node2, deg2) => + val weight = 1.0 / scala.math.sqrt(deg1.toDouble * deg2.toDouble) + val prob = oversample * weight + if (prob >= 1.0) { + // Small degree case, just output all of them: + Iterator(((node1, node2), weight)) + } else if (rnd.nextDouble < prob) { + // Sample + Iterator(((node1, node2), 1.0 / oversample)) + } else + Iterator.empty + } + } + } + .values + .group, + smallG.reducers + ).forceToReducers.sum + .map { case ((node1, node2), sim) => Edge(node1, node2, sim) } + } + + /* + * key: document, + * value: (word, word weight in the document, norm of the word) + * return: Edge of similarity between words measured by documents + * See: https://stanford.edu/~rezab/papers/dimsum.pdf + */ + def dimsumCosineSimilarity[N: Ordering]( + smallG: Grouped[N, (N, Double, Double)], + bigG: Grouped[N, (N, Double, Double)], + oversample: Double + ): TypedPipe[Edge[N, 
Double]] = { + lazy val rnd = new scala.util.Random(1024) + maybeWithReducers( + smallG + .cogroup(bigG) { + (n: N, leftit: Iterator[(N, Double, Double)], rightit: Iterable[(N, Double, Double)]) => + // Use a co-group to ensure this happens in the reducer: + leftit.flatMap { case (node1, weight1, norm1) => + rightit.iterator.flatMap { case (node2, weight2, norm2) => + val weight = 1.0 / (norm1 * norm2) + val prob = oversample * weight + if (prob >= 1.0) { + // Small degree case, just output all of them: + Iterator(((node1, node2), weight * weight1 * weight2)) + } else if (rnd.nextDouble < prob) { + // Sample + Iterator(((node1, node2), 1.0 / oversample * weight1 * weight2)) + } else + Iterator.empty + } + } + } + .values + .group, + smallG.reducers + ).forceToReducers.sum + .map { case ((node1, node2), sim) => Edge(node1, node2, sim) } + } +} + +/** + * This algothm is just matrix multiplication done by hand to make it clearer when we do the sampling + * implementation + */ +class ExactInCosine[N](reducers: Int = -1)(implicit override val nodeOrdering: Ordering[N]) + extends TypedSimilarity[N, InDegree, Double] { + + def apply( + graph: TypedPipe[Edge[N, InDegree]], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, Double]] = { + val groupedOnSrc = graph + .filter(e => smallpred(e.to) || bigpred(e.to)) + .map(e => (e.from, (e.to, e.data.degree))) + .group + .withReducers(reducers) + TypedSimilarity + .exactSetSimilarity(groupedOnSrc, smallpred, bigpred) + .flatMap(e => e.data.cosine.map(c => e.mapData(s => c))) + } +} + +/** + * Params: minCos: the minimum cosine similarity you care about accuracy for delta: the error on the + * approximated cosine (e.g. 
0.05 = 5%) boundedProb: the probability we have larger than delta error see: + * https://arxiv.org/pdf/1206.2082v2.pdf for more details + */ +class DiscoInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reducers: Int = -1)(implicit + override val nodeOrdering: Ordering[N] +) extends TypedSimilarity[N, InDegree, Double] { + + // The probability of being more than delta error is approx: + // boundedProb ~ exp(-p delta^2 / 2) + private val oversample = (-2.0 * scala.math.log(boundedProb) / (delta * delta)) / minCos + + def apply( + graph: TypedPipe[Edge[N, InDegree]], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, Double]] = { + val bigGroupedOnSrc = graph + .filter(e => bigpred(e.to)) + .map(e => (e.from, (e.to, e.data.degree))) + .group + .withReducers(reducers) + val smallGroupedOnSrc = graph + .filter(e => smallpred(e.to)) + .map(e => (e.from, (e.to, e.data.degree))) + .group + .withReducers(reducers) + + TypedSimilarity.discoCosineSimilarity(smallGroupedOnSrc, bigGroupedOnSrc, oversample) + } + +} + +class DimsumInCosine[N](minCos: Double, delta: Double, boundedProb: Double, reducers: Int = -1)(implicit + override val nodeOrdering: Ordering[N] +) extends TypedSimilarity[N, (Weight, L2Norm), Double] { + + // The probability of being more than delta error is approx: + // boundedProb ~ exp(-p delta^2 / 2) + private val oversample = (-2.0 * scala.math.log(boundedProb) / (delta * delta)) / minCos + + def apply( + graph: TypedPipe[Edge[N, (Weight, L2Norm)]], + smallpred: N => Boolean, + bigpred: N => Boolean + ): TypedPipe[Edge[N, Double]] = { + val bigGroupedOnSrc = graph + .filter(e => bigpred(e.to)) + .map(e => (e.from, (e.to, e.data._1.weight, e.data._2.norm))) + .group + .withReducers(reducers) + val smallGroupedOnSrc = graph + .filter(e => smallpred(e.to)) + .map(e => (e.from, (e.to, e.data._1.weight, e.data._2.norm))) + .group + .withReducers(reducers) + + TypedSimilarity.dimsumCosineSimilarity(smallGroupedOnSrc, 
bigGroupedOnSrc, oversample) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala new file mode 100644 index 0000000000..1a5d89eb05 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/InputSizeReducerEstimator.scala @@ -0,0 +1,74 @@ +package com.twitter.scalding.reducer_estimation + +import cascading.tap.hadoop.Hfs +import com.twitter.scalding.estimation.{Common, Estimator, FlowStrategyInfo} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory + +object InputSizeReducerEstimator { + private[this] val LOG = LoggerFactory.getLogger(this.getClass) + + val BytesPerReducer = "scalding.reducer.estimator.bytes.per.reducer" + val defaultBytesPerReducer = 1L << 32 // 4 GB + + /** + * Get the target bytes/reducer from the JobConf. Supported formats are long or human readable + * format. For human readable format you can use the following suffix (case insensitive): k(kilo), m(mega), + * g(giga), t(tera), p(peta), e(exa). + * + * Examples: 1024, 128m, 1g. + */ + def getBytesPerReducer(conf: JobConf): Long = + conf.getLongBytes(BytesPerReducer, defaultBytesPerReducer) + + /** + * Same as estimateReducers, except doesn't round or ceil the result. This is useful for composing with + * other estimation strategies that don't want to lose the fractional number of reducers. Especially helpful + * for when less than 1 reducer is needed, but this fraction will be multiplied by a scaling factor later. 
+ */ + def estimateReducersWithoutRounding(info: FlowStrategyInfo): Option[Double] = + Common.inputSizes(info.step) match { + case Nil => + LOG.warn( + "InputSizeReducerEstimator unable to estimate reducers; " + + "cannot compute size of (is it a non hfs tap?):\n - " + + Common.unrollTaps(info.step).filterNot(_.isInstanceOf[Hfs]).mkString("\n - ") + ) + None + case inputSizes => + val bytesPerReducer = + InputSizeReducerEstimator.getBytesPerReducer(info.step.getConfig) + + val totalBytes = inputSizes.map(_._2).sum + val nReducers = totalBytes.toDouble / bytesPerReducer.toDouble + + lazy val logStr = inputSizes + .map { case (name, bytes) => + s" - $name\t$bytes" + } + .mkString("\n") + + LOG.info( + "\nInputSizeReducerEstimator" + + "\n - input size (bytes): " + totalBytes + + "\n - reducer estimate: " + nReducers + + "\n - Breakdown:\n" + + logStr + ) + + Some(nReducers) + } + +} + +/** + * Estimator that uses the input size and a fixed "bytesPerReducer" target. + * + * Bytes per reducer can be configured with configuration parameter, defaults to 4 GB. 
+ */ +class InputSizeReducerEstimator extends Estimator[Int] { + import InputSizeReducerEstimator._ + + override def estimate(info: FlowStrategyInfo): Option[Int] = + estimateReducersWithoutRounding(info).map(_.ceil.toInt.max(1)) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala new file mode 100644 index 0000000000..719d653747 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimator.scala @@ -0,0 +1,85 @@ +package com.twitter.scalding.reducer_estimation + +import com.twitter.scalding.estimation.{Common, FlowStepHistory, FlowStrategyInfo} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory + +object RatioBasedEstimator { + + /** + * RatioBasedEstimator optionally ignores history items whose input size is drastically different than the + * current job. This parameter specifies the lower bound on allowable input size ratio. Defaults to 0.10 + * (10%), which sets the upper bound to 10x. + */ + val inputRatioThresholdKey = "scalding.reducer.estimator.input.ratio.threshold" + def getInputRatioThreshold(conf: JobConf) = conf.getFloat(inputRatioThresholdKey, 0.10f) +} + +abstract class RatioBasedEstimator extends ReducerHistoryEstimator { + private val LOG = LoggerFactory.getLogger(this.getClass) + + /** + * Determines if this input and the previous input are close enough. If they're drastically different, we + * have no business trying to make an estimate based on the past job. + * + * @param threshold + * Specify lower bound on ratio (e.g. 
0.10 for 10%) + */ + private def acceptableInputRatio(current: Long, past: Long, threshold: Double): Boolean = { + val ratio = current / past.toDouble + if (threshold > 0 && (ratio < threshold || ratio > 1 / threshold)) { + LOG.warn( + "Input sizes differ too much to use for estimation: " + + "current: " + current + ", past: " + past + ) + false + } else true + } + + /** + * Compute the average ratio of mapper bytes to reducer bytes and use that to scale the estimate produced by + * InputSizeReducerEstimator. + */ + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = { + val threshold = RatioBasedEstimator.getInputRatioThreshold(conf) + val inputBytes = Common.totalInputSize(info.step) + + if (inputBytes == 0) { + LOG.warn("No input detected.") + None + } else { + val ratios = for { + h <- history + if h.mapOutputBytes > 0 + if acceptableInputRatio(inputBytes, h.hdfsBytesRead, threshold) + } yield { + h.mapOutputBytes / h.hdfsBytesRead.toDouble + } + + if (ratios.isEmpty) { + LOG.warn(s"No matching history found within input ratio threshold: $threshold") + None + } else { + val reducerRatio = ratios.sum / ratios.length + LOG.info("Getting base estimate from InputSizeReducerEstimator") + val inputSizeBasedEstimate = InputSizeReducerEstimator.estimateReducersWithoutRounding(info) + inputSizeBasedEstimate.map { baseEstimate => + // scale reducer estimate based on the historical input ratio + val e = (baseEstimate * reducerRatio).ceil.toInt.max(1) + + LOG.info( + "\nRatioBasedEstimator" + + "\n - past reducer ratio: " + reducerRatio + + "\n - reducer estimate: " + e + ) + + e + } + } + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala new file mode 100644 index 0000000000..62c6cac3be --- /dev/null +++ 
b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorConfig.scala @@ -0,0 +1,32 @@ +package com.twitter.scalding.reducer_estimation + +import org.apache.hadoop.mapred.JobConf + +object ReducerEstimatorConfig { + + /** Output param: what the Reducer Estimator recommended, regardless of if it was used. */ + val estimatedNumReducers = "scalding.reducer.estimator.result" + + /** + * Output param: same as estimatedNumReducers but with the cap specified by maxEstimatedReducersKey applied. + * Can be used to determine whether a cap was applied to the estimated number of reducers and potentially to + * trigger alerting / logging. + */ + val cappedEstimatedNumReducersKey = "scalding.reducer.estimator.result.capped" + + /** Output param: what the original job config was. */ + val originalNumReducers = "scalding.reducer.estimator.original.mapred.reduce.tasks" + + /** + * If we estimate more than this number of reducers, we will use this number instead of the estimated value + */ + val maxEstimatedReducersKey = "scalding.reducer.estimator.max.estimated.reducers" + + /* fairly arbitrary choice here -- you will probably want to configure this in your cluster defaults */ + val defaultMaxEstimatedReducers = 5000 + + /** Maximum number of history items to use for reducer estimation. 
*/ + val maxHistoryKey = "scalding.reducer.estimator.max.history" + + def getMaxHistory(conf: JobConf): Int = conf.getInt(maxHistoryKey, 1) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala new file mode 100644 index 0000000000..557cddfd6f --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorStepStrategy.scala @@ -0,0 +1,106 @@ +package com.twitter.scalding.reducer_estimation + +import cascading.flow.{Flow, FlowStep, FlowStepStrategy} +import com.twitter.algebird.Monoid +import com.twitter.scalding.estimation.{Estimator, FallbackEstimatorMonoid, FlowStrategyInfo} +import com.twitter.scalding.{Config, StringUtility} +import java.util.{List => JList} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ + +object ReducerEstimatorStepStrategy extends FlowStepStrategy[JobConf] { + + private val LOG = LoggerFactory.getLogger(this.getClass) + + implicit val estimatorMonoid: Monoid[Estimator[Int]] = + new FallbackEstimatorMonoid[Int] + + /** + * Make reducer estimate, possibly overriding explicitly-set numReducers, and save useful info (such as the + * default & estimate) in JobConf for later consumption. + * + * Called by Cascading at the start of each job step. + */ + final override def apply( + flow: Flow[JobConf], + preds: JList[FlowStep[JobConf]], + step: FlowStep[JobConf] + ): Unit = { + val conf = step.getConfig + // for steps with reduce phase, mapred.reduce.tasks is set in the jobconf at this point + // so we check that to determine if this is a map-only step. + conf.getNumReduceTasks match { + case 0 => LOG.info(s"${flow.getName} is a map-only step. 
Skipping reducer estimation.") + case _ => + if (skipReducerEstimation(step)) { + LOG.info(s""" + |Flow step ${step.getName} was configured with reducers + |set explicitly (${Config.WithReducersSetExplicitly}=true) and the estimator + |explicit override turned off (${Config.ReducerEstimatorOverride}=false). Skipping + |reducer estimation. + """.stripMargin) + } else { + estimate(flow, preds.asScala, step) + } + } + } + + // whether the reducers have been set explicitly with `withReducers` + private def reducersSetExplicitly(step: FlowStep[JobConf]) = + step.getConfig.getBoolean(Config.WithReducersSetExplicitly, false) + + // whether we should override explicitly-specified numReducers + private def overrideExplicitReducers(step: FlowStep[JobConf]) = + step.getConfig.getBoolean(Config.ReducerEstimatorOverride, false) + + private def skipReducerEstimation(step: FlowStep[JobConf]) = + reducersSetExplicitly(step) && !overrideExplicitReducers(step) + + private def estimate(flow: Flow[JobConf], preds: Seq[FlowStep[JobConf]], step: FlowStep[JobConf]): Unit = { + val conf = step.getConfig + + val stepNumReducers = conf.get(Config.HadoopNumReducers) + Option(conf.get(Config.ReducerEstimators)).foreach { clsNames => + val clsLoader = Thread.currentThread.getContextClassLoader + + val estimators = StringUtility + .fastSplit(clsNames, ",") + .map(clsLoader.loadClass(_).newInstance.asInstanceOf[Estimator[Int]]) + val combinedEstimator = Monoid.sum(estimators) + + val info = FlowStrategyInfo(flow, preds, step) + + // get estimate + val estimatedNumReducers = combinedEstimator.estimate(info) + + // apply cap if needed + val cappedNumReducers = estimatedNumReducers.map { n => + val configuredMax = conf.getInt( + ReducerEstimatorConfig.maxEstimatedReducersKey, + ReducerEstimatorConfig.defaultMaxEstimatedReducers + ) + + if (n > configuredMax) { + LOG.warn(s""" + |Reducer estimator estimated $n reducers, which is more than the configured maximum of $configuredMax. 
+ |Will use $configuredMax instead. + """.stripMargin) + configuredMax + } else { + n + } + } + + // save the estimate and capped estimate in the JobConf which should be saved by hRaven + conf.setInt(ReducerEstimatorConfig.estimatedNumReducers, estimatedNumReducers.getOrElse(-1)) + conf.setInt(ReducerEstimatorConfig.cappedEstimatedNumReducersKey, cappedNumReducers.getOrElse(-1)) + // set number of reducers + cappedNumReducers.foreach(conf.setNumReduceTasks) + // log in JobConf what was explicitly set by 'withReducers' + if (reducersSetExplicitly(step)) { + conf.set(ReducerEstimatorConfig.originalNumReducers, stepNumReducers) + } + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala new file mode 100644 index 0000000000..f7dd2291f5 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/ReducerHistoryEstimator.scala @@ -0,0 +1,20 @@ +package com.twitter.scalding.reducer_estimation + +import com.twitter.scalding.estimation.{HistoryEstimator, Task} +import org.apache.hadoop.mapred.JobConf + +object ReducerHistoryEstimator { + val Status = "status" + val StartTime = "startTime" + val FinishTime = "finishTime" + + implicit class ReducerRichTask(val task: Task) { + def status: Option[String] = task.details.get(Status).map(_.asInstanceOf[String]) + def startTime: Option[Long] = task.details.get(StartTime).map(_.asInstanceOf[Long]) + def finishTime: Option[Long] = task.details.get(FinishTime).map(_.asInstanceOf[Long]) + } +} + +trait ReducerHistoryEstimator extends HistoryEstimator[Int] { + override def maxHistoryItems(conf: JobConf): Int = ReducerEstimatorConfig.getMaxHistory(conf) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala 
b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala new file mode 100644 index 0000000000..eb9b56a0ee --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimator.scala @@ -0,0 +1,224 @@ +package com.twitter.scalding.reducer_estimation + +import com.twitter.scalding.estimation.{Common, FlowStepHistory, FlowStrategyInfo} +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory + +/** + * Estimator that uses the input size and a fixed "bytesPerReducer" target. + * + * Bytes per reducer can be configured with configuration parameter, defaults to 1 GB. + */ +trait RuntimeEstimationScheme { + + /** + * Given a list of times that each reducer took in a certain FlowStep, aggregates these times into a single + * estimate of the time that a "typical" reducer took. Suggested implementation: mean or median. + */ + def estimateTaskTime(times: Seq[Double]): Option[Double] + + /** + * Given a list of "typical" times observed in a series of jobs of the same FlowStep, aggregates these times + * into a single estimate of the time that a "typical" reducer took in a "typical" job. Suggested + * implementation: mean or median. 
+ */ + def estimateJobTime(times: Seq[Double]): Option[Double] +} + +object MedianEstimationScheme extends RuntimeEstimationScheme { + def estimateJobTime(times: Seq[Double]) = median(times) + def estimateTaskTime(times: Seq[Double]) = median(times) +} + +object MeanEstimationScheme extends RuntimeEstimationScheme { + def estimateJobTime(times: Seq[Double]) = mean(times) + def estimateTaskTime(times: Seq[Double]) = mean(times) +} + +object RuntimeReducerEstimator { + import ReducerHistoryEstimator.ReducerRichTask + + val RuntimePerReducer = "scalding.reducer.estimator.runtime.per.reducer" + val EstimationScheme = "scalding.reducer.estimator.runtime.estimation.scheme" + val IgnoreInputSize = "scalding.reducer.estimator.runtime.ignore.input.size" + + /** Get the target bytes/reducer from the JobConf */ + def getRuntimePerReducer(conf: JobConf): Long = { + val default = 10 * 60 * 1000 // 10 mins + conf.getLong(RuntimePerReducer, default) + } + + /** + * Whether to use the median or the mean in the runtime estimation process. Default is median. + */ + def getRuntimeEstimationScheme(conf: JobConf): RuntimeEstimationScheme = { + val default = "median" + conf.get(EstimationScheme, default) match { + case "mean" => MeanEstimationScheme + case "median" => MedianEstimationScheme + case _ => + throw new Exception(s"""Value of $EstimationScheme must be "mean", "median", or not specified.""") + } + } + + /** + * Whether to ignore the input size of the data. If true, RuntimeReducerEstimator uses a non-input scaled + * estimator. If false, RuntimeReducerEstimator uses an input-scaled estimator first, and uses a + * non-input-scaled estimator as a fallback. Default is false. 
+ */ + def getRuntimeIgnoreInputSize(conf: JobConf): Boolean = { + val default = false + conf.getBoolean(IgnoreInputSize, default) + } + + def getReduceTimes(history: Seq[FlowStepHistory]): Seq[Seq[Double]] = + history.map { h => + h.tasks + .filter(t => t.taskType.contains("REDUCE") && t.status.contains("SUCCEEDED")) + .flatMap { t => + t.finishTime + .zip(t.startTime) + .filter { case (finishedTime, startTime) => + finishedTime > startTime + } + .map { case (finishedTime, startTime) => + (finishedTime - startTime).toDouble + } + } + } +} + +trait BasicRuntimeReducerEstimator extends ReducerHistoryEstimator { + import RuntimeReducerEstimator._ + + private val LOG = LoggerFactory.getLogger(this.getClass) + + def runtimeEstimationScheme: RuntimeEstimationScheme + + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = { + val reduceTimes: Seq[Seq[Double]] = getReduceTimes(history) + + LOG.info(s"""| + |History items have the following numbers of tasks: + | ${history.map(_.tasks.length)}, + |and the following numbers of tasks have valid task histories: + | ${reduceTimes.map(_.length)}""".stripMargin) + + // total time taken in the step = time per reducer * number of reducers + val jobTimes: Seq[Option[Double]] = reduceTimes + .map(xs => runtimeEstimationScheme.estimateTaskTime(xs).map(_ * xs.length)) + + // time per step, averaged over all the steps + val typicalJobTime: Option[Double] = runtimeEstimationScheme.estimateJobTime(jobTimes.flatten) + + val desiredRuntime: Long = getRuntimePerReducer(info.step.getConfig) + + val estimate = typicalJobTime.map { t: Double => (t / desiredRuntime).ceil.toInt } + + LOG.info(s""" + | - Typical job time: $typicalJobTime + | - Desired runtime: $desiredRuntime + | - Estimate: $estimate + """.stripMargin) + + estimate + } +} + +trait InputScaledRuntimeReducerEstimator extends ReducerHistoryEstimator { + import RuntimeReducerEstimator._ + + private val LOG = 
LoggerFactory.getLogger(this.getClass) + + def runtimeEstimationScheme: RuntimeEstimationScheme + + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = { + val reduceTimes: Seq[Seq[Double]] = getReduceTimes(history) + + LOG.info(s"""| + |History items have the following numbers of tasks: + | ${history.map(_.tasks.length)}, + |and the following numbers of tasks have valid task histories: + | ${reduceTimes.map(_.length)}""".stripMargin) + + // total time taken in the step = time per reducer * number of reducers + val jobTimes: Seq[Option[Double]] = reduceTimes + .map(xs => runtimeEstimationScheme.estimateTaskTime(xs).map(_ * xs.length)) + + // time-to-byte ratio for a step = time per reducer * number of reducers / number of bytes + val timeToByteRatios: Seq[Double] = jobTimes + .zip { + history.map(_.hdfsBytesRead) + } + .collect { case (Some(time), bytes) => time / bytes } + + // time-to-byte ratio, averaged over all the steps + val typicalTimeToByteRatio: Option[Double] = runtimeEstimationScheme + .estimateJobTime(timeToByteRatios) + + val desiredRuntime = getRuntimePerReducer(info.step.getConfig) + val inputBytes = Common.totalInputSize(info.step) + + if (inputBytes == 0) { + LOG.warn("Input bytes is zero in current step.") + None + } else { + // numReducers = time-per-byte * numBytes / desiredRuntime + val estimate = typicalTimeToByteRatio.map { t: Double => + (t * inputBytes / desiredRuntime).ceil.toInt + } + + LOG.info(s""" + | - HDFS bytes read: ${history.map(_.hdfsBytesRead)} + | - Time-to-byte-ratios: $timeToByteRatios + | - Typical type-to-byte-ratio: $typicalTimeToByteRatio + | - Desired runtime: $desiredRuntime + | - Input bytes: $inputBytes + | - Estimate: $estimate + """.stripMargin) + estimate + } + } +} + +trait RuntimeReducerEstimator extends ReducerHistoryEstimator { + override def estimate(info: FlowStrategyInfo): Option[Int] = { + val estimationScheme = 
RuntimeReducerEstimator.getRuntimeEstimationScheme(info.step.getConfig) + + val runtimeHistoryService = historyService + + val basicEstimator = new BasicRuntimeReducerEstimator { + def runtimeEstimationScheme = estimationScheme + + def historyService = runtimeHistoryService + } + + val combinedEstimator = if (RuntimeReducerEstimator.getRuntimeIgnoreInputSize(info.step.getConfig)) { + basicEstimator + } else { + val inputScaledEstimator = new InputScaledRuntimeReducerEstimator { + def runtimeEstimationScheme = estimationScheme + + def historyService = runtimeHistoryService + } + ReducerEstimatorStepStrategy.estimatorMonoid.plus(inputScaledEstimator, basicEstimator) + } + + combinedEstimator.estimate(info) + } + + override protected def estimate( + info: FlowStrategyInfo, + conf: JobConf, + history: Seq[FlowStepHistory] + ): Option[Int] = + estimate(info) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/package.scala b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/package.scala new file mode 100644 index 0000000000..eb998f4071 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/reducer_estimation/package.scala @@ -0,0 +1,6 @@ +package com.twitter.scalding + +package object reducer_estimation { + def median(xs: Seq[Double]): Option[Double] = xs.sorted.lift(xs.length / 2) + def mean(xs: Seq[Double]): Option[Double] = if (xs.isEmpty) None else Some(xs.sum / xs.length) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/AlgebirdSerializers.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/AlgebirdSerializers.scala deleted file mode 100644 index fb76c761c2..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/AlgebirdSerializers.scala +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding.serialization - -import com.esotericsoftware.kryo.Kryo -import com.esotericsoftware.kryo.{Serializer => KSerializer} -import com.esotericsoftware.kryo.io.{Input, Output} - -import com.twitter.algebird.{AveragedValue, DecayedValue, HLL, HyperLogLog, - HyperLogLogMonoid, Moments} - -import scala.collection.mutable.{Map => MMap} - -class AveragedValueSerializer extends KSerializer[AveragedValue] { - setImmutable(true) - def write(kser: Kryo, out : Output, s : AveragedValue) { - out.writeLong(s.count, true) - out.writeDouble(s.value) - } - def read(kser : Kryo, in : Input, cls : Class[AveragedValue]) : AveragedValue = - AveragedValue(in.readLong(true), in.readDouble) -} - -class MomentsSerializer extends KSerializer[Moments] { - setImmutable(true) - def write(kser: Kryo, out : Output, s : Moments) { - out.writeLong(s.m0, true) - out.writeDouble(s.m1) - out.writeDouble(s.m2) - out.writeDouble(s.m3) - out.writeDouble(s.m4) - } - def read(kser : Kryo, in : Input, cls : Class[Moments]) : Moments = { - Moments(in.readLong(true), - in.readDouble, - in.readDouble, - in.readDouble, - in.readDouble) - } -} - - -class DecayedValueSerializer extends KSerializer[DecayedValue] { - setImmutable(true) - def write(kser: Kryo, out : Output, s : DecayedValue) { - out.writeDouble(s.value) - out.writeDouble(s.scaledTime) - } - def read(kser : Kryo, in : Input, cls : Class[DecayedValue]) : DecayedValue = - DecayedValue(in.readDouble, 
in.readDouble) -} - -class HLLSerializer extends KSerializer[HLL] { - setImmutable(true) - def write(kser: Kryo, out : Output, s : HLL) { - val bytes = HyperLogLog.toBytes(s) - out.writeInt(bytes.size, true) - out.writeBytes(bytes) - } - def read(kser : Kryo, in : Input, cls : Class[HLL]) : HLL = { - HyperLogLog.fromBytes(in.readBytes(in.readInt(true))) - } -} - -class HLLMonoidSerializer extends KSerializer[HyperLogLogMonoid] { - setImmutable(true) - val hllMonoids = MMap[Int,HyperLogLogMonoid]() - def write(kser: Kryo, out : Output, mon : HyperLogLogMonoid) { - out.writeInt(mon.bits, true) - } - def read(kser : Kryo, in : Input, cls : Class[HyperLogLogMonoid]) : HyperLogLogMonoid = { - val bits = in.readInt(true) - hllMonoids.getOrElseUpdate(bits, new HyperLogLogMonoid(bits)) - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala new file mode 100644 index 0000000000..791bdbac73 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/CascadingBinaryComparator.scala @@ -0,0 +1,116 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding.serialization + +import cascading.flow.Flow +import cascading.flow.planner.BaseFlowStep +import cascading.tuple.{Hasher => CHasher, StreamComparator} +import com.twitter.scalding.ExecutionContext.getDesc +import java.io.InputStream +import java.util.Comparator +import scala.util.{Failure, Success, Try} +import org.slf4j.LoggerFactory + +/** + * This is the type that should be fed to cascading to enable binary comparators + */ +class CascadingBinaryComparator[T](ob: OrderedSerialization[T]) + extends Comparator[T] + with StreamComparator[InputStream] + with CHasher[T] + with Serializable { + + override def compare(a: T, b: T) = ob.compare(a, b) + override def hashCode(t: T): Int = ob.hash(t) + override def compare(a: InputStream, b: InputStream) = + ob.compareBinary(a, b).unsafeToInt +} + +object CascadingBinaryComparator { + + private val LOG = LoggerFactory.getLogger(this.getClass) + + /** + * This method will walk the flowDef and make sure all the groupBy/cogroups are using a + * CascadingBinaryComparator + */ + private[scalding] def checkForOrderedSerialization[T]( + flow: Flow[T], + mode: RequireOrderedSerializationMode + ): Try[Unit] = { + import collection.JavaConverters._ + import cascading.pipe._ + + // all successes or empty returns success + def reduce(it: TraversableOnce[Try[Unit]]): Try[Unit] = + it.find(_.isFailure).getOrElse(Success(())) + + def failure(s: String): Try[Unit] = { + val message = + s"Cannot verify OrderedSerialization: $s. 
Add `import com.twitter.scalding.serialization.RequiredBinaryComparators._`" + mode match { + case RequireOrderedSerializationMode.Fail => + Failure(new RuntimeException(message)) + case RequireOrderedSerializationMode.Log => + LOG.warn(message) + Try(()) + } + } + + def check(s: Splice): Try[Unit] = { + val m = s.getKeySelectors.asScala + val sortingSelectors = s.getSortingSelectors.asScala + + if (m.isEmpty) failure(s"Splice must have KeySelectors: $s") + else { + reduce(m.map { case (pipename, fields) => + /* + * Scalding typed-API ALWAYS puts the key into field position 0. + * If OrderedSerialization is enabled, this must be a CascadingBinaryComparator + */ + if (fields.getComparators()(0).isInstanceOf[CascadingBinaryComparator[_]]) + Success(()) + else failure(s"pipe: $s, fields: $fields, comparators: ${fields.getComparators.toList}") + }) + } + } + + def getDescriptionsForMissingOrdSer[U](bfs: BaseFlowStep[U]): Option[String] = + // does this job have any Splices without OrderedSerialization: + if ( + bfs.getGraph.vertexSet.asScala.exists { + case gb: GroupBy => check(gb).isFailure + case cg: CoGroup => check(cg).isFailure + case _ => false // only do sorting in groupBy/cogroupBy + } + ) { + Some(getDesc(bfs).mkString(", ")) + } else None + + // Get all the steps that have missing OrderedSerializations + val missing = flow.getFlowSteps.asScala + .map { case bfs: BaseFlowStep[_] => getDescriptionsForMissingOrdSer(bfs) } + .collect { case Some(desc) => desc } + + if (missing.isEmpty) Success(()) + else { + val badSteps = missing.size + val msg = missing.zipWithIndex.map { case (msg, idx) => s"$msg" }.mkString + failure(s"There are $badSteps missing OrderedSerializations: $msg") + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala index a2b4eee2a4..d0e503ec0b 100644 --- 
a/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/Externalizer.scala @@ -12,14 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization -import com.twitter.chill.{ Externalizer => ChillExtern} + +import com.twitter.chill.{Externalizer => ChillExtern} import com.esotericsoftware.kryo.DefaultSerializer import com.esotericsoftware.kryo.serializers.JavaSerializer import com.twitter.chill.config.ScalaAnyRefMapConfig + /** * We need to control the Kryo created */ @@ -36,5 +38,3 @@ class Externalizer[T] extends ChillExtern[T] { protected override def kryo = new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) } - - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala index 361bfc1f2a..8a9caaf12c 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoHadoop.scala @@ -12,48 +12,30 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.serialization -import java.io.InputStream -import java.io.OutputStream -import java.io.Serializable -import java.nio.ByteBuffer - -import org.apache.hadoop.io.serializer.{Serialization, Deserializer, Serializer, WritableSerialization} - import com.esotericsoftware.kryo.Kryo -import com.esotericsoftware.kryo.{Serializer => KSerializer} -import com.esotericsoftware.kryo.io.{Input, Output} import com.esotericsoftware.kryo.serializers.FieldSerializer - -import cascading.tuple.hadoop.TupleSerialization -import cascading.tuple.hadoop.io.BufferedInputStream - -import scala.annotation.tailrec -import scala.collection.immutable.ListMap -import scala.collection.immutable.HashMap - -import com.twitter.scalding.DateRange -import com.twitter.scalding.RichDate -import com.twitter.scalding.Args - -import com.twitter.chill._ +import com.twitter.scalding.{Args, CascadingTokenUpdater, Config => ScaldingConfig, DateRange, RichDate} +import com.twitter.chill.algebird._ import com.twitter.chill.config.Config - -class KryoHadoop(config: Config) extends KryoInstantiator { - - /** TODO!!! - * Deal with this issue. The problem is grouping by Kryo serialized - * objects silently breaks the results. If Kryo gets in front of TupleSerialization - * (and possibly Writable, unclear at this time), grouping is broken. - * There are two issues here: - * 1) Kryo objects not being compared properly. - * 2) Kryo being used instead of cascading. +import com.twitter.chill.{IKryoRegistrar, KryoInstantiator, ScalaKryoInstantiator, SingletonSerializer} + +class KryoHadoop(@transient config: Config) extends KryoInstantiator { + // keeping track of references is costly for memory, and often triggers OOM on Hadoop + val useRefs = config.getBoolean("scalding.kryo.setreferences", false) + val cascadingSerializationTokens = config.get(ScaldingConfig.CascadingSerializationTokens) + + /** + * TODO!!! Deal with this issue. 
The problem is grouping by Kryo serialized objects silently breaks the + * results. If Kryo gets in front of TupleSerialization (and possibly Writable, unclear at this time), + * grouping is broken. There are two issues here: 1) Kryo objects not being compared properly. 2) Kryo being + * used instead of cascading. * * We must identify each and fix these bugs. */ - override def newKryo : Kryo = { + override def newKryo: Kryo = { val newK = (new ScalaKryoInstantiator).newKryo // These are scalding objects: newK.register(classOf[RichDate], new RichDateSerializer()) @@ -65,41 +47,97 @@ class KryoHadoop(config: Config) extends KryoInstantiator { newK.register(classOf[com.twitter.algebird.HyperLogLogMonoid], new HLLMonoidSerializer) newK.register(classOf[com.twitter.algebird.Moments], new MomentsSerializer) newK.addDefaultSerializer(classOf[com.twitter.algebird.HLL], new HLLSerializer) + // Don't serialize Boxed instances using Kryo. + newK.addDefaultSerializer(classOf[com.twitter.scalding.serialization.Boxed[_]], new ThrowingSerializer) + newK.addDefaultSerializer(classOf[com.twitter.scalding.typed.TypedPipe[_]], new SerializeAsUnit) + newK.addDefaultSerializer(classOf[com.twitter.scalding.typed.ReduceStep[_, _, _]], new SerializeAsUnit) + + // Register every boxed class so they are given cascading tokens + for { + boxedClass <- Boxed.allClasses + } { + newK.register(boxedClass, new ThrowingSerializer) + } - /** AdaptiveVector is IndexedSeq, which picks up the chill IndexedSeq serializer - * (which is its own bug), force using the fields serializer here + /** + * AdaptiveVector is IndexedSeq, which picks up the chill IndexedSeq serializer (which is its own bug), + * force using the fields serializer here */ - newK.register(classOf[com.twitter.algebird.DenseVector[_]], - new FieldSerializer[com.twitter.algebird.DenseVector[_]](newK, - classOf[com.twitter.algebird.DenseVector[_]])) - - newK.register(classOf[com.twitter.algebird.SparseVector[_]], - new 
FieldSerializer[com.twitter.algebird.SparseVector[_]](newK, - classOf[com.twitter.algebird.SparseVector[_]])) - - newK.addDefaultSerializer(classOf[com.twitter.algebird.AdaptiveVector[_]], - classOf[FieldSerializer[_]]) + newK.register( + classOf[com.twitter.algebird.DenseVector[_]], + new FieldSerializer[com.twitter.algebird.DenseVector[_]]( + newK, + classOf[com.twitter.algebird.DenseVector[_]] + ) + ) + + newK.register( + classOf[com.twitter.algebird.SparseVector[_]], + new FieldSerializer[com.twitter.algebird.SparseVector[_]]( + newK, + classOf[com.twitter.algebird.SparseVector[_]] + ) + ) + + newK.addDefaultSerializer(classOf[com.twitter.algebird.AdaptiveVector[_]], classOf[FieldSerializer[_]]) /** - * Pipes can be swept up into closures inside of case classes. This can generally - * be safely ignored. If the case class has a method that actually accesses something - * in the pipe (what would that even be?), you will get a null pointer exception, - * so it shouldn't cause data corruption. - * a more robust solution is to use Spark's closure cleaner approach on every object that - * is serialized, but that's very expensive. + * Pipes can be swept up into closures inside of case classes. This can generally be safely ignored. If + * the case class has a method that actually accesses something in the pipe (what would that even be?), + * you will get a null pointer exception, so it shouldn't cause data corruption. a more robust solution is + * to use Spark's closure cleaner approach on every object that is serialized, but that's very expensive. 
*/ - newK.addDefaultSerializer(classOf[cascading.pipe.Pipe], new SingletonSerializer(null)) - // keeping track of references is costly for memory, and often triggers OOM on Hadoop - val useRefs = config.getBoolean("scalding.kryo.setreferences", false) + newK.addDefaultSerializer(classOf[cascading.pipe.Pipe], new SingletonSerializer(null)) + newK.addDefaultSerializer(classOf[com.twitter.scalding.typed.TypedPipe[_]], new SingletonSerializer(null)) + newK.addDefaultSerializer(classOf[com.twitter.scalding.Execution[_]], new SingletonSerializer(null)) + newK.addDefaultSerializer( + classOf[com.twitter.scalding.Execution.ToWrite[_]], + new SingletonSerializer(null) + ) + newK.setReferences(useRefs) /** - * Make sure we use the thread's context class loader to ensure the classes of the - * submitted jar and any -libjars arguments can be found + * Make sure we use the thread's context class loader to ensure the classes of the submitted jar and any + * -libjars arguments can be found */ val classLoader = Thread.currentThread.getContextClassLoader newK.setClassLoader(classLoader) + customRegistrar(newK) + + /** + * Register any cascading tokenized classes not already registered + */ + val tokenizedClasses = + CascadingTokenUpdater + .parseTokens(cascadingSerializationTokens) + .toList + .sorted // Go through this list in order the tokens were allocated + + for { + (id, className) <- tokenizedClasses + clazz <- getClassOpt(className) + if !newK.alreadyRegistered(clazz) + } { + newK.register(clazz) + } + newK } + + private def getClassOpt(name: String): Option[Class[_]] = + try { + Some(Class.forName(name)) + } catch { + case _: ClassNotFoundException => None + } + + /** + * If you override KryoHadoop, prefer to add registrations here instead of overriding [[newKryo]]. That way, + * any additional default serializers will be used for registering cascading tokenized classes. 
+ */ + def customRegistrar: IKryoRegistrar = new IKryoRegistrar { + override def apply(k: Kryo): Unit = {} + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala index 9579d268f0..407ae5cafe 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/KryoSerializers.scala @@ -12,35 +12,33 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.serialization -import java.io.InputStream -import java.io.OutputStream -import java.io.Serializable -import java.nio.ByteBuffer - -import org.apache.hadoop.io.serializer.{Serialization, Deserializer, Serializer, WritableSerialization} - import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.{Serializer => KSerializer} import com.esotericsoftware.kryo.io.{Input, Output} -import scala.annotation.tailrec -import scala.collection.immutable.ListMap -import scala.collection.mutable.{Map => MMap} - import com.twitter.scalding._ -/*** - * Below are some serializers for objects in the scalding project. +/** + * This is a runtime check for types we should never be serializing + */ +class ThrowingSerializer[T] extends KSerializer[T] { + override def write(kryo: Kryo, output: Output, t: T): Unit = + sys.error(s"Kryo should never be used to serialize an instance: $t") + override def read(kryo: Kryo, input: Input, t: Class[T]): T = + sys.error(s"Kryo should never be used to serialize an instance, class: $t") +} + +/** + * * Below are some serializers for objects in the scalding project. 
*/ class RichDateSerializer extends KSerializer[RichDate] { // RichDates are immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out: Output, date: RichDate) { - out.writeLong(date.timestamp, true); - } + def write(kser: Kryo, out: Output, date: RichDate): Unit = + out.writeLong(date.timestamp, true) def read(kser: Kryo, in: Input, cls: Class[RichDate]): RichDate = RichDate(in.readLong(true)) @@ -49,35 +47,33 @@ class RichDateSerializer extends KSerializer[RichDate] { class DateRangeSerializer extends KSerializer[DateRange] { // DateRanges are immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out: Output, range: DateRange) { - out.writeLong(range.start.timestamp, true); - out.writeLong(range.end.timestamp, true); + def write(kser: Kryo, out: Output, range: DateRange): Unit = { + out.writeLong(range.start.timestamp, true) + out.writeLong(range.end.timestamp, true) } - def read(kser: Kryo, in: Input, cls: Class[DateRange]): DateRange = { - DateRange(RichDate(in.readLong(true)), RichDate(in.readLong(true))); - } + def read(kser: Kryo, in: Input, cls: Class[DateRange]): DateRange = + DateRange(RichDate(in.readLong(true)), RichDate(in.readLong(true))) } class ArgsSerializer extends KSerializer[Args] { // Args are immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out : Output, a : Args) { + def write(kser: Kryo, out: Output, a: Args): Unit = out.writeString(a.toString) - } - def read(kser : Kryo, in : Input, cls : Class[Args]) : Args = + def read(kser: Kryo, in: Input, cls: Class[Args]): Args = Args(in.readString) } class IntFieldSerializer extends KSerializer[IntField[_]] { - //immutable, no need to copy them + // immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out : Output, a : IntField[_]) { + def write(kser: Kryo, out: Output, a: IntField[_]): Unit = { out.writeInt(a.id) kser.writeClassAndObject(out, a.ord) kser.writeClassAndObject(out, a.mf) } - def read(kser : Kryo, 
in : Input, cls : Class[IntField[_]]) : IntField[_] = { + def read(kser: Kryo, in: Input, cls: Class[IntField[_]]): IntField[_] = { val id = in.readInt val ord = kser.readClassAndObject(in).asInstanceOf[Ordering[Any]] val mf = kser.readClassAndObject(in).asInstanceOf[Option[Manifest[Any]]] @@ -86,19 +82,17 @@ class IntFieldSerializer extends KSerializer[IntField[_]] { } class StringFieldSerializer extends KSerializer[StringField[_]] { - //immutable, no need to copy them + // immutable, no need to copy them setImmutable(true) - def write(kser: Kryo, out : Output, a : StringField[_]) { + def write(kser: Kryo, out: Output, a: StringField[_]): Unit = { out.writeString(a.id) kser.writeClassAndObject(out, a.ord) kser.writeClassAndObject(out, a.mf) } - def read(kser : Kryo, in : Input, cls : Class[StringField[_]]) : StringField[_] = { + def read(kser: Kryo, in: Input, cls: Class[StringField[_]]): StringField[_] = { val id = in.readString val ord = kser.readClassAndObject(in).asInstanceOf[Ordering[Any]] val mf = kser.readClassAndObject(in).asInstanceOf[Option[Manifest[Any]]] StringField[Any](id)(ord, mf) } } - - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/MultiJoinExternalizer.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/MultiJoinExternalizer.scala new file mode 100644 index 0000000000..088ecf5680 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/MultiJoinExternalizer.scala @@ -0,0 +1,33 @@ +package com.twitter.scalding.serialization + +import com.twitter.scalding.typed.MultiJoinFunction + +object MultiJoinExternalizer { + import MultiJoinFunction.Transformer + + final case class ExternalizeMapGroup[A, B, C](@transient fn: (A, Iterator[B]) => Iterator[C]) + extends Function2[A, Iterator[B], Iterator[C]] { + private[this] val fnEx = Externalizer(fn) + + def apply(a: A, it: Iterator[B]) = fnEx.get(a, it) + } + + final case class ExternalizeJoin[A, B, C, D](@transient fn: (A, 
Iterator[B], Iterable[C]) => Iterator[D]) + extends Function3[A, Iterator[B], Iterable[C], Iterator[D]] { + private[this] val fnEx = Externalizer(fn) + + def apply(a: A, bs: Iterator[B], cs: Iterable[C]) = fnEx.get(a, bs, cs) + } + + private[this] object ExtTrans extends Transformer { + def transformJoin[A, B, C, D]( + fn: (A, Iterator[B], Iterable[C]) => Iterator[D] + ): (A, Iterator[B], Iterable[C]) => Iterator[D] = + ExternalizeJoin(fn) + def transformMap[A, B, C](fn: (A, Iterator[B]) => Iterator[C]): (A, Iterator[B]) => Iterator[C] = + ExternalizeMapGroup(fn) + } + + def externalize[A, B](mjf: MultiJoinFunction[A, B]): MultiJoinFunction[A, B] = + ExtTrans(mjf) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala new file mode 100644 index 0000000000..652df358d2 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparators.scala @@ -0,0 +1,37 @@ +package com.twitter.scalding.serialization + +import com.twitter.scalding._ + +import scala.language.experimental.{macros => smacros} + +/** + * RequiredBinaryComparators provide comparators (or Ordering in Scala) that are capable of comparing keys in + * their serialized form reducing the amount of time spent in serialization/deserialization. These comparators + * are implemented using Scala macros, and currently provide binary comparators for primitives, strings, + * Options, tuples, collections, case classes and Scrooge objects. 
+ */ +trait RequiredBinaryComparators extends RequiredBinaryComparatorsConfig { + + implicit def ordSer[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + +} + +object RequiredBinaryComparators { + + implicit def orderedSerialization[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] +} + +/** + * Use this for an ExecutionApp. + */ +trait RequiredBinaryComparatorsExecutionApp extends ExecutionApp { + implicit def ordSer[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] + def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail + override def config(inputArgs: Array[String]): (Config, Mode) = { + val (conf, m) = super.config(inputArgs) + (conf.setRequireOrderedSerializationMode(Some(requireOrderedSerializationMode)), m) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala new file mode 100644 index 0000000000..60d00d7025 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/RequiredBinaryComparatorsConfig.scala @@ -0,0 +1,9 @@ +package com.twitter.scalding.serialization + +import com.twitter.scalding.{Config, Job} + +trait RequiredBinaryComparatorsConfig extends Job { + def requireOrderedSerializationMode: RequireOrderedSerializationMode = RequireOrderedSerializationMode.Fail + override def config = + super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode.toString) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/SerializeAsUnit.scala.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/SerializeAsUnit.scala.scala new 
file mode 100644 index 0000000000..023b21a418 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/SerializeAsUnit.scala.scala @@ -0,0 +1,11 @@ +package com.twitter.scalding.serialization + +import com.esotericsoftware.kryo.Kryo +import com.esotericsoftware.kryo.{Serializer => KSerializer} +import com.esotericsoftware.kryo.io.{Input, Output} + +// We use this for TypedPipe subclasses which should never be needed when we run +class SerializeAsUnit[T >: Null] extends KSerializer[T] { + override def write(kryo: Kryo, output: Output, t: T): Unit = () + override def read(kryo: Kryo, input: Input, t: Class[T]): T = null +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala b/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala new file mode 100644 index 0000000000..a81841f8cd --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/serialization/WrappedSerialization.scala @@ -0,0 +1,126 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.serialization + +import org.apache.hadoop.io.serializer.{Deserializer, Serialization => HSerialization, Serializer} +import org.apache.hadoop.conf.{Configurable, Configuration} + +import java.io.{InputStream, OutputStream} +import com.twitter.bijection.{Base64String, Injection, JavaSerializationInjection} +import scala.collection.JavaConverters._ + +/** + * WrappedSerialization wraps a value in a wrapper class that has an associated Binary that is used to + * deserialize items wrapped in the wrapper + */ +class WrappedSerialization[T] extends HSerialization[T] with Configurable { + + private var conf: Option[Configuration] = None + private var serializations: Map[Class[_], Serialization[_]] = Map.empty + + /* This use of `_.get` can't be fixed since this is constrained by + * Hadoop's `Configurable` interface. + */ + @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + override def getConf: Configuration = conf.get + override def setConf(config: Configuration): Unit = { + conf = Some(config) + serializations = WrappedSerialization.getBinary(config) + } + + def accept(c: Class[_]): Boolean = serializations.contains(c) + + def getSerialization(c: Class[T]): Option[Serialization[T]] = + serializations + .get(c) + // This cast should never fail since we matched the class + .asInstanceOf[Option[Serialization[T]]] + + def getSerializer(c: Class[T]): Serializer[T] = + new BinarySerializer( + getSerialization(c) + .getOrElse(sys.error(s"Serialization for class: $c not found")) + ) + + def getDeserializer(c: Class[T]): Deserializer[T] = + new BinaryDeserializer( + getSerialization(c) + .getOrElse(sys.error(s"Serialization for class: $c not found")) + ) + +} + +class BinarySerializer[T](buf: Serialization[T]) extends Serializer[T] { + private var out: OutputStream = _ + def open(os: OutputStream): Unit = + out = os + def close(): Unit = out = null + def serialize(t: T): Unit = { + if (out == null) throw new 
NullPointerException("OutputStream is null") + buf.write(out, t).get + } +} + +class BinaryDeserializer[T](buf: Serialization[T]) extends Deserializer[T] { + private var is: InputStream = _ + def open(i: InputStream): Unit = is = i + def close(): Unit = is = null + def deserialize(t: T): T = { + if (is == null) throw new NullPointerException("InputStream is null") + buf.read(is).get + } +} + +object WrappedSerialization { + type ClassSerialization[T] = (Class[T], Serialization[T]) + + private def getSerializer[U]: Injection[Externalizer[U], String] = { + implicit val initialInj: Injection[Externalizer[U], Array[Byte]] = + JavaSerializationInjection[Externalizer[U]] + Injection.connect[Externalizer[U], Array[Byte], Base64String, String] + } + + private def serialize[T](b: T): String = + getSerializer[T](Externalizer(b)) + + private def deserialize[T](str: String): T = + getSerializer[T].invert(str).get.get + + private val confKey = "com.twitter.scalding.serialization.WrappedSerialization" + + def rawSetBinary(bufs: Iterable[ClassSerialization[_]], fn: (String, String) => Unit) = + fn(confKey, bufs.map { case (cls, buf) => s"${cls.getName}:${serialize(buf)}" }.mkString(",")) + def setBinary(conf: Configuration, bufs: Iterable[ClassSerialization[_]]): Unit = + rawSetBinary(bufs, { case (k, v) => conf.set(k, v) }) + + def getBinary(conf: Configuration): Map[Class[_], Serialization[_]] = + conf.iterator.asScala + .map { it => + (it.getKey, it.getValue) + } + .filter(_._1.startsWith(confKey)) + .map { case (_, clsbuf) => + clsbuf.split(":") match { + case Array(className, serialization) => + // Jump through a hoop to get scalac happy + def deser[T](cls: Class[T]): ClassSerialization[T] = + (cls, deserialize[Serialization[T]](serialization)) + deser(conf.getClassByName(className)) + case _ => sys.error(s"ill formed bufferables: $clsbuf") + } + } + .toMap +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala 
b/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala index e60cb3e581..a4b4eab19f 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/CheckedInversion.scala @@ -12,18 +12,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.source import com.twitter.bijection.Injection import java.io.Serializable -/** Handles the error checking for Injection inversion - * if check fails, it will throw an unrecoverable exception stopping the job - * TODO: probably belongs in Bijection +/** + * Handles the error checking for Injection inversion if check fails, it will throw an unrecoverable exception + * stopping the job TODO: probably belongs in Bijection */ -trait CheckedInversion[T,U] extends Serializable { - def injection: Injection[T,U] +trait CheckedInversion[T, U] extends Serializable { + def injection: Injection[T, U] def apply(input: U): Option[T] } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala index 5c48b9a410..23d5c1f887 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/CodecSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.source @@ -26,10 +26,10 @@ import com.twitter.scalding._ import java.util.Arrays import org.apache.hadoop.io.BytesWritable +import scala.collection.JavaConverters._ /** - * Source used to write some type T into a WritableSequenceFile using a codec on T - * for serialization. + * Source used to write some type T into a WritableSequenceFile using a codec on T for serialization. */ object BytesWritableCodec { @@ -45,24 +45,40 @@ object CodecSource { def apply[T](paths: String*)(implicit codec: Injection[T, Array[Byte]]) = new CodecSource[T](paths) } -class CodecSource[T] private (val hdfsPaths: Seq[String], val maxFailures: Int = 0)(implicit @transient injection: Injection[T, Array[Byte]]) -extends FileSource -with Mappable[T] { +class CodecSource[T] private (val hdfsPaths: Seq[String], val maxFailures: Int = 0)(implicit + @transient injection: Injection[T, Array[Byte]] +) extends FileSource + with Mappable[T] + with LocalTapSource { import Dsl._ val fieldSym = 'encodedBytes lazy val field = new Fields(fieldSym.name) - val injectionBox = Externalizer(injection andThen BytesWritableCodec.get) + val injectionBox = Externalizer(injection.andThen(BytesWritableCodec.get)) + + def localPaths = hdfsPaths override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](TupleConverter.singleConverter[T]) - override def localPath = sys.error("Local mode not yet supported.") override def hdfsScheme = - HadoopSchemeInstance(new WritableSequenceFile(field, classOf[BytesWritable]).asInstanceOf[Scheme[_, _, _, _, _]]) + HadoopSchemeInstance( + new WritableSequenceFile(field, classOf[BytesWritable]).asInstanceOf[Scheme[_, _, _, _, _]] + ) protected lazy val checkedInversion = new MaxFailuresCheck[T, BytesWritable](maxFailures)(injectionBox.get) override def transformForRead(pipe: Pipe) = - pipe.flatMap((fieldSym) -> (fieldSym)) { (bw: BytesWritable) => checkedInversion(bw) } + pipe.flatMap(fieldSym -> fieldSym)((bw: BytesWritable) => 
checkedInversion(bw)) override def transformForWrite(pipe: Pipe) = - pipe.mapTo((0) -> (fieldSym)) { injectionBox.get.apply(_: T) } + pipe.mapTo(0 -> fieldSym)(injectionBox.get.apply(_: T)) + + override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { + val tap = createTap(Read)(mode) + CascadingMode + .cast(mode) + .openForRead(config, tap) + .asScala + .flatMap { te => + checkedInversion(te.selectTuple(sourceFields).getObject(0).asInstanceOf[BytesWritable]) + } + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala index d05d5154d0..3224f9d6d0 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/DailySources.scala @@ -1,65 +1,95 @@ /** * Copyright 2012 Twitter, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
*/ package com.twitter.scalding.source import com.twitter.scalding._ -import Dsl._ import cascading.tuple.Fields abstract class DailyPrefixSuffixSource(prefixTemplate: String, suffixTemplate: String, dateRange: DateRange) - extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", dateRange, DateOps.UTC) + extends TimePathedSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", + dateRange, + DateOps.UTC + ) -abstract class DailyPrefixSuffixMostRecentSource(prefixTemplate: String, suffixTemplate: String, dateRange: DateRange) - extends MostRecentGoodSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", dateRange, DateOps.UTC) +abstract class DailyPrefixSuffixMostRecentSource( + prefixTemplate: String, + suffixTemplate: String, + dateRange: DateRange +) extends MostRecentGoodSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + suffixTemplate + "/*", + dateRange, + DateOps.UTC + ) abstract class DailySuffixSource(prefixTemplate: String, dateRange: DateRange) - extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", dateRange, DateOps.UTC) + extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", dateRange, DateOps.UTC) abstract class DailySuffixMostRecentSource(prefixTemplate: String, dateRange: DateRange) - extends MostRecentGoodSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", dateRange, DateOps.UTC) + extends MostRecentGoodSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY + "/*", + dateRange, + DateOps.UTC + ) object DailySuffixTsv { - def apply(prefix: String, fs: Fields = Fields.ALL) - (implicit dateRange: DateRange) = new DailySuffixTsv(prefix, fs) + def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = + new DailySuffixTsv(prefix, fs) } class DailySuffixTsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends 
DailySuffixSource(prefix, dateRange) with DelimitedScheme { + extends DailySuffixSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs } +object DailySuffixTypedTsv { + def apply[T]( + prefix: String + )(implicit dateRange: DateRange, mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) = + new DailySuffixTypedTsv[T](prefix) +} + +class DailySuffixTypedTsv[T](prefix: String)(implicit + override val dateRange: DateRange, + override val mf: Manifest[T], + override val conv: TupleConverter[T], + override val tset: TupleSetter[T] +) extends DailySuffixSource(prefix, dateRange) + with TypedDelimited[T] + object DailySuffixCsv { - def apply(prefix: String, fs: Fields = Fields.ALL) - (implicit dateRange: DateRange) = new DailySuffixCsv(prefix, fs) + def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = + new DailySuffixCsv(prefix, fs) } class DailySuffixCsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) -extends DailySuffixSource(prefix, dateRange) with DelimitedScheme { + extends DailySuffixSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs override val separator = "," } object DailySuffixMostRecentCsv { - def apply(prefix: String, fs: Fields = Fields.ALL) - (implicit dateRange: DateRange) = new DailySuffixMostRecentCsv(prefix, fs) + def apply(prefix: String, fs: Fields = Fields.ALL)(implicit dateRange: DateRange) = + new DailySuffixMostRecentCsv(prefix, fs) } -class DailySuffixMostRecentCsv(prefix: String, fs: Fields = Fields.ALL)(override implicit val dateRange: DateRange) - extends DailySuffixMostRecentSource(prefix, dateRange) with DelimitedScheme { +class DailySuffixMostRecentCsv(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange +) extends DailySuffixMostRecentSource(prefix, dateRange) + with DelimitedScheme { override val fields = fs override val separator = "," } diff --git 
a/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala index c954da211e..06fa819487 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/HourlySources.scala @@ -1,17 +1,14 @@ /** * Copyright 2012 Twitter, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. + * Unless required by applicable law or agreed to in writing, software distributed under the License is + * distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and limitations under the License. 
*/ package com.twitter.scalding.source @@ -19,23 +16,48 @@ package com.twitter.scalding.source import com.twitter.scalding._ abstract class HourlySuffixSource(prefixTemplate: String, dateRange: DateRange) - extends TimePathedSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", dateRange, DateOps.UTC) + extends TimePathedSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", + dateRange, + DateOps.UTC + ) abstract class HourlySuffixMostRecentSource(prefixTemplate: String, dateRange: DateRange) - extends MostRecentGoodSource(prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", dateRange, DateOps.UTC) + extends MostRecentGoodSource( + prefixTemplate + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*", + dateRange, + DateOps.UTC + ) object HourlySuffixTsv { def apply(prefix: String)(implicit dateRange: DateRange) = new HourlySuffixTsv(prefix) } class HourlySuffixTsv(prefix: String)(override implicit val dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with DelimitedScheme + extends HourlySuffixSource(prefix, dateRange) + with DelimitedScheme + +object HourlySuffixTypedTsv { + def apply[T]( + prefix: String + )(implicit dateRange: DateRange, mf: Manifest[T], conv: TupleConverter[T], tset: TupleSetter[T]) = + new HourlySuffixTypedTsv[T](prefix) +} + +class HourlySuffixTypedTsv[T](prefix: String)(implicit + override val dateRange: DateRange, + override val mf: Manifest[T], + override val conv: TupleConverter[T], + override val tset: TupleSetter[T] +) extends HourlySuffixSource(prefix, dateRange) + with TypedDelimited[T] object HourlySuffixCsv { def apply(prefix: String)(implicit dateRange: DateRange) = new HourlySuffixCsv(prefix) } class HourlySuffixCsv(prefix: String)(override implicit val dateRange: DateRange) - extends HourlySuffixSource(prefix, dateRange) with DelimitedScheme { + extends HourlySuffixSource(prefix, dateRange) + with DelimitedScheme { override val separator = "," } diff --git 
a/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala index 2ad143c191..90e640f820 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/MaxFailuresCheck.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.source @@ -20,24 +20,18 @@ import com.twitter.bijection.Injection import java.util.concurrent.atomic.AtomicInteger // TODO: this should actually increment an read a Hadoop counter -class MaxFailuresCheck[T,U](val maxFailures: Int) - (implicit override val injection: Injection[T,U]) - extends CheckedInversion[T,U] { +class MaxFailuresCheck[T, U](val maxFailures: Int)(implicit override val injection: Injection[T, U]) + extends CheckedInversion[T, U] { private val failures = new AtomicInteger(0) - def apply(input: U): Option[T] = { + def apply(input: U): Option[T] = try { Some(injection.invert(input).get) - } - catch { - case e => + } catch { + case e: Exception => // TODO: use proper logging e.printStackTrace() - assert( - failures.incrementAndGet <= maxFailures, - "maximum decoding errors exceeded" - ) + assert(failures.incrementAndGet <= maxFailures, "maximum decoding errors exceeded") None } - } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala new file mode 100644 index 0000000000..4cde1a85fe --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/NullSink.scala @@ -0,0 +1,12 @@ +package com.twitter.scalding.source + +import com.twitter.scalding.typed.TypedSink +import 
com.twitter.scalding.{BaseNullSource, TupleSetter} + +/** + * This can be used to cause cascading to run a flow, but discard the output. The only place this is likely of + * use is to do some (non-recommended, but sometimes the most expediant way to accomplish some task). + */ +object NullSink extends BaseNullSource with TypedSink[Any] { + def setter[U <: Any] = TupleSetter.asSubSetter[Any, U](TupleSetter.singleSetter) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala new file mode 100644 index 0000000000..f635244f5e --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedSequenceFile.scala @@ -0,0 +1,47 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.scalding.source + +import cascading.tuple.Fields +import com.twitter.scalding._ +import com.twitter.scalding.SequenceFile + +/** + * SequenceFile with explicit types. Useful for debugging flows using the Typed API. Not to be used for + * permanent storage: uses Kryo serialization which may not be consistent across JVM instances. Use Thrift + * sources instead. 
+ */ +class TypedSequenceFile[T](val path: String) + extends SequenceFile(path, Fields.FIRST) + with Mappable[T] + with TypedSink[T] { + override def converter[U >: T] = + TupleConverter.asSuperConverter[T, U](TupleConverter.singleConverter[T]) + override def setter[U <: T] = + TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + override def toString: String = "TypedSequenceFile(%s)".format(path) + override def equals(that: Any): Boolean = that match { + case null => false + case t: TypedSequenceFile[_] => t.p == p // horribly named fields in the SequenceFile case class + case _ => false + } + override def hashCode = path.hashCode +} + +object TypedSequenceFile { + def apply[T](path: String): TypedSequenceFile[T] = new TypedSequenceFile[T](path) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala new file mode 100644 index 0000000000..fb091f15b5 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/source/TypedText.scala @@ -0,0 +1,162 @@ +package com.twitter.scalding.source + +import cascading.scheme.Scheme +import cascading.scheme.hadoop.{TextDelimited => CHTextDelimited} +import cascading.scheme.local.{TextDelimited => CLTextDelimited} +import com.twitter.scalding._ +import com.twitter.scalding.typed.TypedSink + +/** + * This object gives you easy access to text formats (possibly LZO compressed) by using a case class to + * describe the field names and types. 
+ */ +case class TypedSep(str: String) extends AnyVal + +object TypedText { + + val TAB = TypedSep("\t") + val ONE = TypedSep("\u0001") + val COMMA = TypedSep(",") + + def tsv[T: TypeDescriptor](path: String*): Source with TypedTextDelimited[T] = + new FixedTypedText[T](TAB, path: _*) + def osv[T: TypeDescriptor](path: String*): Source with TypedTextDelimited[T] = + new FixedTypedText[T](ONE, path: _*) + def csv[T: TypeDescriptor](path: String*): Source with TypedTextDelimited[T] = + new FixedTypedText[T](COMMA, path: _*) + + /** + * Prefix might be "/logs/awesome" + */ + private def hourly[T](sep: TypedSep, prefix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathTypedText[T](sep, prefix + TimePathedSource.YEAR_MONTH_DAY_HOUR + "/*") + } + + def hourlyTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + hourly(TAB, prefix) + + def hourlyOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + hourly(ONE, prefix) + + def hourlyCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + hourly(COMMA, prefix) + + private def daily[T](sep: TypedSep, prefix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + new TimePathTypedText[T](sep, prefix + TimePathedSource.YEAR_MONTH_DAY + "/*") + } + + def dailyTsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + daily(TAB, prefix) + + def dailyOsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + daily(ONE, prefix) + + def dailyCsv[T](prefix: String)(implicit dr: DateRange, td: TypeDescriptor[T]): TypedTextDelimited[T] = + daily(COMMA, prefix) + + private def dailyPrefixSuffix[T](sep: 
TypedSep, prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = { + require(prefix.last != '/', "prefix should not include trailing /") + require(suffix.head == '/', "suffix should include a preceding /") + new TimePathTypedText[T](sep, prefix + TimePathedSource.YEAR_MONTH_DAY + suffix + "/*") + } + + def dailyPrefixSuffixTsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = + dailyPrefixSuffix(TAB, prefix, suffix) + + def dailyPrefixSuffixOsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = + dailyPrefixSuffix(ONE, prefix, suffix) + + def dailyPrefixSuffixCsv[T](prefix: String, suffix: String)(implicit + dr: DateRange, + td: TypeDescriptor[T] + ): TypedTextDelimited[T] = + dailyPrefixSuffix(COMMA, prefix, suffix) + +} + +trait TypedTextDelimited[T] extends SchemedSource with Mappable[T] with TypedSink[T] { + def typeDescriptor: TypeDescriptor[T] + + protected def separator: TypedSep + + /* + * These options make the string parsing strict. 
If you want + * to try to ignore some errors, you can change them, but refer + * to the cascading documentation on TextDelimited + */ + protected def strict: Boolean = true + protected def safe: Boolean = true + + /* + * Implemented in terms of the above + */ + override def converter[U >: T] = TupleConverter.asSuperConverter(typeDescriptor.converter) + override def setter[U <: T] = TupleSetter.asSubSetter(typeDescriptor.setter) + override def sinkFields = typeDescriptor.fields + override def sourceFields = typeDescriptor.fields + + override def localScheme = + new CLTextDelimited( + typeDescriptor.fields, + false, + false, + separator.str, + strict, + null /* quote */, + typeDescriptor.fields.getTypesClasses, + safe + ) + + override def hdfsScheme = + HadoopSchemeInstance( + new CHTextDelimited( + typeDescriptor.fields, + null /* compression */, + false, + false, + separator.str, + strict, + null /* quote */, + typeDescriptor.fields.getTypesClasses, + safe + ).asInstanceOf[Scheme[_, _, _, _, _]] + ) +} + +class TimePathTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) + extends TimePathedSource(path, dr, DateOps.UTC) + with TypedTextDelimited[T] { + override def typeDescriptor = td + protected override def separator = sep +} + +class MostRecentTypedText[T](sep: TypedSep, path: String)(implicit dr: DateRange, td: TypeDescriptor[T]) + extends MostRecentGoodSource(path, dr, DateOps.UTC) + with TypedTextDelimited[T] { + override def typeDescriptor = td + protected override def separator = sep +} + +class FixedTypedText[T](sep: TypedSep, path: String*)(implicit td: TypeDescriptor[T]) + extends FixedPathSource(path: _*) + with TypedTextDelimited[T] { + override def typeDescriptor = td + protected override def separator = sep +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala new file mode 100644 index 
0000000000..2794a32f22 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/BijectedSourceSink.scala @@ -0,0 +1,46 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +import cascading.flow.FlowDef +import cascading.pipe.Pipe + +import com.twitter.bijection.ImplicitBijection +import com.twitter.scalding._ +import serialization.Externalizer + +object BijectedSourceSink { + type SourceSink[T] = TypedSource[T] with TypedSink[T] + def apply[T, U](parent: SourceSink[T])(implicit + transformer: ImplicitBijection[T, U] + ): BijectedSourceSink[T, U] = + new BijectedSourceSink(parent)(transformer) +} + +class BijectedSourceSink[T, U](parent: BijectedSourceSink.SourceSink[T])(implicit + @transient transformer: ImplicitBijection[T, U] +) extends TypedSource[U] + with TypedSink[U] { + + val lockedBij = Externalizer(transformer) + + def setter[V <: U] = parent.setter.contraMap(lockedBij.get.invert(_)) + + override def converter[W >: U] = parent.converter.andThen { t: T => lockedBij.get(t) }: TupleConverter[W] + + override def read(implicit flowDef: FlowDef, mode: Mode): Pipe = parent.read + override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode) = parent.writeFrom(pipe) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/CoGrouped.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/CoGrouped.scala deleted file mode 100644 index 812234a7c4..0000000000 --- 
a/scalding-core/src/main/scala/com/twitter/scalding/typed/CoGrouped.scala +++ /dev/null @@ -1,276 +0,0 @@ -/* -Copyright 2014 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding.typed - -import cascading.tuple.{Tuple => CTuple, Fields} -import cascading.pipe.joiner.{Joiner => CJoiner, JoinerClosure} - -import cascading.pipe.{CoGroup, Pipe} - -import com.twitter.scalding._ - -import scala.collection.JavaConverters._ - -object CoGrouped { - // distinct by mapped, but don't reorder if the list is unique - final def distinctBy[T,U](list: List[T])(fn: T => U): List[T] = { - @annotation.tailrec - def go(l: List[T], seen: Set[U] = Set[U](), acc: List[T] = Nil): List[T] = l match { - case Nil => acc.reverse // done - case h::tail => - val uh = fn(h) - if(seen(uh)) - go(tail, seen, acc) - else - go(tail, seen + uh, h::acc) - } - go(list) - } -} - -/** - * Represents something than can be CoGrouped with another CoGroupable - */ -trait CoGroupable[K, +R] extends HasReducers with java.io.Serializable { - /** This is the list of mapped pipes, just before the (reducing) joinFunction is applied - */ - def inputs: List[TypedPipe[(K, Any)]] - - def keyOrdering: Ordering[K] - - /** - * This function is not type-safe for others to call, but it should - * never have an error. By construction, we never call it with incorrect - * types. 
- * It would be preferable to have stronger type safety here, but unclear - * how to achieve, and since it is an internal function, not clear it - * would actually help anyone for it to be type-safe - */ - protected def joinFunction: (K, Iterator[CTuple], Seq[Iterable[CTuple]]) => Iterator[R] - - /** - * Smaller is about average values/key not total size (that does not matter, but is - * clearly related). - * - * Note that from the type signature we see that the right side is iterated (or may be) - * over and over, but the left side is not. That means that you want the side with - * fewer values per key on the right. If both sides are similar, no need to worry. - * If one side is a one-to-one mapping, that should be the "smaller" side. - */ - def cogroup[R1,R2](smaller: CoGroupable[K, R1])(fn: (K, Iterator[R], Iterable[R1]) => Iterator[R2]): CoGrouped[K, R2] = { - val self = this - val leftSeqCount = self.inputs.size - 1 - - new CoGrouped[K, R2] { - val inputs = self.inputs ++ smaller.inputs - val reducers = (self.reducers.toIterable ++ smaller.reducers.toIterable).reduceOption(_ max _) - def keyOrdering = smaller.keyOrdering - - def joinFunction = { (k: K, leftMost: Iterator[CTuple], joins: Seq[Iterable[CTuple]]) => - val joinedLeft = self.joinFunction(k, leftMost, joins.take(leftSeqCount)) - - val smallerIns = joins.drop(leftSeqCount) - val joinedRight = new Iterable[R1] { - def iterator = smaller.joinFunction(k, smallerIns.head.iterator, smallerIns.tail) - } - - fn(k, joinedLeft, joinedRight) - } - } - } - - def join[W](smaller: CoGroupable[K,W]) = - cogroup[W,(R,W)](smaller)(Joiner.inner2) - def leftJoin[W](smaller: CoGroupable[K,W]) = - cogroup[W,(R,Option[W])](smaller)(Joiner.left2) - def rightJoin[W](smaller: CoGroupable[K,W]) = - cogroup[W,(Option[R],W)](smaller)(Joiner.right2) - def outerJoin[W](smaller: CoGroupable[K,W]) = - cogroup[W,(Option[R],Option[W])](smaller)(Joiner.outer2) - // TODO: implement blockJoin -} - -trait CoGrouped[K,+R] extends 
KeyedListLike[K,R,CoGrouped] with CoGroupable[K, R] with WithReducers[CoGrouped[K,R]] { - override def withReducers(reds: Int) = { - val self = this // the usual self => trick leads to serialization errors - val joinF = joinFunction // can't access this on self, since it is protected - new CoGrouped[K, R] { - def inputs = self.inputs - def reducers = Some(reds) - def keyOrdering = self.keyOrdering - def joinFunction = joinF - } - } - - // Filter the keys before doing the join - override def filterKeys(fn: K => Boolean): CoGrouped[K, R] = { - val self = this // the usual self => trick leads to serialization errors - val joinF = joinFunction // can't access this on self, since it is protected - new CoGrouped[K, R] { - val inputs = self.inputs.map(_.filterKeys(fn)) - def reducers = self.reducers - def keyOrdering = self.keyOrdering - def joinFunction = joinF - } - } - - override def mapGroup[R1](fn: (K, Iterator[R]) => Iterator[R1]): CoGrouped[K, R1] = { - val self = this // the usual self => trick leads to serialization errors - val joinF = joinFunction // can't access this on self, since it is protected - new CoGrouped[K, R1] { - def inputs = self.inputs - def reducers = self.reducers - def keyOrdering = self.keyOrdering - def joinFunction = { (k: K, leftMost: Iterator[CTuple], joins: Seq[Iterable[CTuple]]) => - fn(k, joinF(k, leftMost, joins)) - } - } - } - - override lazy val toTypedPipe: TypedPipe[(K, R)] = { - // Cascading handles the first item in join differently, we have to see if it is repeated - val firstCount = inputs.count(_ == inputs.head) - - import Dsl._ - import RichPipe.assignName - - /* - * we only want key and value. - * Cascading requires you have the same number coming in as out. 
- * in the first case, we introduce (null0, null1), in the second - * we have (key1, value1), but they are then discarded: - */ - def outFields(inCount: Int): Fields = - List("key", "value") ++ (0 until (2*(inCount - 1))).map("null%d".format(_)) - - // Make this stable so the compiler does not make a closure - val ord = keyOrdering - - val newPipe = if(firstCount == inputs.size) { - /** This is a self-join - * Cascading handles this by sending the data only once, spilling to disk if - * the groups don't fit in RAM, then doing the join on this one set of data. - * This is fundamentally different than the case where the first item is - * not repeated. That case is below - */ - val NUM_OF_SELF_JOINS = firstCount - 1 - new CoGroup(assignName(inputs.head.toPipe[(Any, Any)](("key", "value"))), - RichFields(StringField("key")(ord, None)), - NUM_OF_SELF_JOINS, - outFields(firstCount), - new DistinctCoGroupJoiner(firstCount, joinFunction)) - } - else if(firstCount == 1) { - /** - * As long as the first one appears only once, we can handle self joins on the others: - * Cascading does this by maybe spilling all the streams other than the first item. - * This is handled by a different CoGroup constructor than the above case. 
- */ - def renamePipe(idx: Int, p: TypedPipe[(K, Any)]): Pipe = - p.toPipe[(K,Any)](List("key%d".format(idx), "value%d".format(idx))) - - // This is tested for the properties we need (non-reordering) - val distincts = CoGrouped.distinctBy(inputs)(identity) - val dsize = distincts.size - val isize = inputs.size - - val groupFields: Array[Fields] = (0 until dsize) - .map { idx => RichFields(StringField("key%d".format(idx))(ord, None)) } - .toArray - - val pipes: Array[Pipe] = distincts - .zipWithIndex - .map { case (item, idx) => assignName(renamePipe(idx, item)) } - .toArray - - val cjoiner = if(isize != dsize) { - // avoid capturing anything other than the mapping ints: - val mapping: Map[Int, Int] = inputs.zipWithIndex.map { case (item, idx) => - idx -> distincts.indexWhere(_ == item) - }.toMap - - new CoGroupedJoiner(isize, joinFunction) { - val distinctSize = dsize - def distinctIndexOf(orig: Int) = mapping(orig) - } - } - else new DistinctCoGroupJoiner(isize, joinFunction) - - new CoGroup(pipes, groupFields, outFields(dsize), cjoiner) - } - else { - /** This is non-trivial to encode in the type system, so we throw this exception - * at the planning phase. - */ - sys.error("Except for self joins, where you are joining something with only itself,\n" + - "left-most pipe can only appear once. Firsts: " + - inputs.collect { case x if x == inputs.head => x }.toString) - } - /* - * the CoGrouped only populates the first two fields, the second two - * are null. We then project out at the end of the method. - */ - val pipeWithRed = RichPipe.setReducers(newPipe, reducers.getOrElse(-1)).project('key, 'value) - //Construct the new TypedPipe - TypedPipe.from[(K,R)](pipeWithRed, ('key, 'value)) - } -} - -abstract class CoGroupedJoiner[K](inputSize: Int, joinFunction: (K, Iterator[CTuple], Seq[Iterable[CTuple]]) => Iterator[Any]) extends CJoiner { - val distinctSize: Int - def distinctIndexOf(originalPos: Int): Int - - // This never changes. 
Compute it once - protected val restIndices: IndexedSeq[Int] = (1 until inputSize).map { idx => - val didx = distinctIndexOf(idx) - assert(didx > 0, "the left most can only be iterated once") - didx - } - - override def getIterator(jc: JoinerClosure) = { - val iters = (0 until distinctSize).map { jc.getIterator(_).asScala.buffered } - val key = iters - .collectFirst { case iter if iter.nonEmpty => iter.head } - .get // One of these must have a key - .getObject(0) - .asInstanceOf[K] - - val leftMost = iters.head - - def toIterable(didx: Int) = - new Iterable[CTuple] { def iterator = jc.getIterator(didx).asScala } - - val rest = restIndices.map(toIterable(_)) - joinFunction(key, leftMost, rest).map { rval => - // There always has to be the same number of resulting fields as input - // or otherwise the flow planner will throw - val res = CTuple.size(distinctSize) - res.set(0, key) - res.set(1, rval) - res - }.asJava - } - - override def numJoins = distinctSize - 1 -} - -// If all the input pipes are unique, this works: -class DistinctCoGroupJoiner[K](count: Int, - joinFunction: (K, Iterator[CTuple], Seq[Iterable[CTuple]]) => Iterator[Any]) - extends CoGroupedJoiner[K](count, joinFunction) { - val distinctSize = count - def distinctIndexOf(idx: Int) = idx -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/FlatMappedFn.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/FlatMappedFn.scala deleted file mode 100644 index a21e63913f..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/FlatMappedFn.scala +++ /dev/null @@ -1,58 +0,0 @@ -/* -Copyright 2013 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding.typed - -import java.io.Serializable - -import com.twitter.scalding.TupleConverter -import cascading.tuple.TupleEntry - -/** Closures are difficult for serialization. This class avoids that. */ -sealed trait FlatMapFn[+R] extends Function1[TupleEntry, TraversableOnce[R]] - with java.io.Serializable { - - def filter(fn2: R => Boolean): FlatMapFn[R] = - FilteredFn(this, fn2) - def flatMap[R1](fn2: R => TraversableOnce[R1]): FlatMapFn[R1] = - FlatMappedFn(this, fn2) - def map[R1](fn2: R => R1): FlatMapFn[R1] = - MapFn(this, fn2) -} - -/* This is the initial way we get a FlatMapFn */ -case class Converter[R](conv: TupleConverter[R]) extends FlatMapFn[R] { - // make sure not to start with an Iterator to keep everything lazy - def apply(te: TupleEntry) = Iterator(conv(te)) -} - -/* This is the mzero of this Monad */ -case object Empty extends FlatMapFn[Nothing] { - def apply(te: TupleEntry) = Iterator.empty - - override def filter(fn2: Nothing => Boolean): FlatMapFn[Nothing] = this - override def flatMap[R1](fn2: Nothing => TraversableOnce[R1]): FlatMapFn[R1] = this - override def map[R1](fn2: Nothing => R1): FlatMapFn[R1] = this -} -case class MapFn[T,R](fmap: FlatMapFn[T], fn: T => R) extends FlatMapFn[R] { - def apply(te: TupleEntry) = fmap(te).map(fn) -} -case class FlatMappedFn[T,R](fmap: FlatMapFn[T], fn: T => TraversableOnce[R]) extends FlatMapFn[R] { - def apply(te: TupleEntry) = fmap(te).flatMap(fn) -} -case class FilteredFn[R](fmap: FlatMapFn[R], fn: R => Boolean) extends FlatMapFn[R] { - def apply(te: TupleEntry) = fmap(te).filter(fn) 
-} - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala index b7508dda00..dc5cb89924 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/GeneratedTypedSource.scala @@ -5,176 +5,257 @@ trait TypedSource1[A] extends TypedSource[Tuple1[A]] { def converter[Z >: Tuple1[A]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple1[A]]) } -trait TypedSource2[A,B] extends TypedSource[Tuple2[A,B]] { - def converter[Z >: Tuple2[A,B]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple2[A,B]]) +trait TypedSource2[A, B] extends TypedSource[Tuple2[A, B]] { + def converter[Z >: Tuple2[A, B]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple2[A, B]]) } -trait TypedSource3[A,B,C] extends TypedSource[Tuple3[A,B,C]] { - def converter[Z >: Tuple3[A,B,C]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple3[A,B,C]]) +trait TypedSource3[A, B, C] extends TypedSource[Tuple3[A, B, C]] { + def converter[Z >: Tuple3[A, B, C]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple3[A, B, C]]) } -trait TypedSource4[A,B,C,D] extends TypedSource[Tuple4[A,B,C,D]] { - def converter[Z >: Tuple4[A,B,C,D]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A,B,C,D]]) +trait TypedSource4[A, B, C, D] extends TypedSource[Tuple4[A, B, C, D]] { + def converter[Z >: Tuple4[A, B, C, D]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple4[A, B, C, D]]) } -trait TypedSource5[A,B,C,D,E] extends TypedSource[Tuple5[A,B,C,D,E]] { - def converter[Z >: Tuple5[A,B,C,D,E]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A,B,C,D,E]]) +trait TypedSource5[A, B, C, D, E] extends TypedSource[Tuple5[A, B, C, D, E]] { + def converter[Z >: Tuple5[A, B, C, D, E]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple5[A, B, C, D, E]]) } 
-trait TypedSource6[A,B,C,D,E,F] extends TypedSource[Tuple6[A,B,C,D,E,F]] { - def converter[Z >: Tuple6[A,B,C,D,E,F]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A,B,C,D,E,F]]) +trait TypedSource6[A, B, C, D, E, F] extends TypedSource[Tuple6[A, B, C, D, E, F]] { + def converter[Z >: Tuple6[A, B, C, D, E, F]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple6[A, B, C, D, E, F]]) } -trait TypedSource7[A,B,C,D,E,F,G] extends TypedSource[Tuple7[A,B,C,D,E,F,G]] { - def converter[Z >: Tuple7[A,B,C,D,E,F,G]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A,B,C,D,E,F,G]]) +trait TypedSource7[A, B, C, D, E, F, G] extends TypedSource[Tuple7[A, B, C, D, E, F, G]] { + def converter[Z >: Tuple7[A, B, C, D, E, F, G]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple7[A, B, C, D, E, F, G]]) } -trait TypedSource8[A,B,C,D,E,F,G,H] extends TypedSource[Tuple8[A,B,C,D,E,F,G,H]] { - def converter[Z >: Tuple8[A,B,C,D,E,F,G,H]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A,B,C,D,E,F,G,H]]) +trait TypedSource8[A, B, C, D, E, F, G, H] extends TypedSource[Tuple8[A, B, C, D, E, F, G, H]] { + def converter[Z >: Tuple8[A, B, C, D, E, F, G, H]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple8[A, B, C, D, E, F, G, H]]) } -trait TypedSource9[A,B,C,D,E,F,G,H,I] extends TypedSource[Tuple9[A,B,C,D,E,F,G,H,I]] { - def converter[Z >: Tuple9[A,B,C,D,E,F,G,H,I]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A,B,C,D,E,F,G,H,I]]) +trait TypedSource9[A, B, C, D, E, F, G, H, I] extends TypedSource[Tuple9[A, B, C, D, E, F, G, H, I]] { + def converter[Z >: Tuple9[A, B, C, D, E, F, G, H, I]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) } -trait TypedSource10[A,B,C,D,E,F,G,H,I,J] extends TypedSource[Tuple10[A,B,C,D,E,F,G,H,I,J]] { - def converter[Z >: Tuple10[A,B,C,D,E,F,G,H,I,J]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A,B,C,D,E,F,G,H,I,J]]) +trait 
TypedSource10[A, B, C, D, E, F, G, H, I, J] extends TypedSource[Tuple10[A, B, C, D, E, F, G, H, I, J]] { + def converter[Z >: Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) } -trait TypedSource11[A,B,C,D,E,F,G,H,I,J,K] extends TypedSource[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] { - def converter[Z >: Tuple11[A,B,C,D,E,F,G,H,I,J,K]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A,B,C,D,E,F,G,H,I,J,K]]) +trait TypedSource11[A, B, C, D, E, F, G, H, I, J, K] + extends TypedSource[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + def converter[Z >: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) } -trait TypedSource12[A,B,C,D,E,F,G,H,I,J,K,L] extends TypedSource[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] { - def converter[Z >: Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]]) +trait TypedSource12[A, B, C, D, E, F, G, H, I, J, K, L] + extends TypedSource[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + def converter[Z >: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) } -trait TypedSource13[A,B,C,D,E,F,G,H,I,J,K,L,M] extends TypedSource[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] { - def converter[Z >: Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]]) +trait TypedSource13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends TypedSource[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + def converter[Z >: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) } -trait TypedSource14[A,B,C,D,E,F,G,H,I,J,K,L,M,N] extends TypedSource[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] { - def 
converter[Z >: Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]]) +trait TypedSource14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends TypedSource[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + def converter[Z >: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) } -trait TypedSource15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O] extends TypedSource[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] { - def converter[Z >: Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]]) +trait TypedSource15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends TypedSource[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + def converter[Z >: Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] = + TupleConverter.asSuperConverter(TupleConverter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) } -trait TypedSource16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P] extends TypedSource[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] { - def converter[Z >: Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]]) +trait TypedSource16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends TypedSource[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + def converter[Z >: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] + ) } -trait TypedSource17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q] extends TypedSource[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] { - def converter[Z >: Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]]) +trait TypedSource17[A, B, C, 
D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends TypedSource[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + def converter[Z >: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] + ) } -trait TypedSource18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R] extends TypedSource[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] { - def converter[Z >: Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]]) +trait TypedSource18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends TypedSource[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + def converter[Z >: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] + ) } -trait TypedSource19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S] extends TypedSource[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] { - def converter[Z >: Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]]) +trait TypedSource19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends TypedSource[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + def converter[Z >: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] + ) } -trait TypedSource20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T] extends TypedSource[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] { - def converter[Z >: Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]]) +trait 
TypedSource20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends TypedSource[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + def converter[Z >: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + ) } -trait TypedSource21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U] extends TypedSource[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] { - def converter[Z >: Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]]) +trait TypedSource21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends TypedSource[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + def converter[Z >: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] + ) } -trait TypedSource22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V] extends TypedSource[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] { - def converter[Z >: Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] = TupleConverter.asSuperConverter(TupleConverter.of[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]]) +trait TypedSource22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends TypedSource[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + def converter[Z >: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleConverter.asSuperConverter( + TupleConverter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + ) } trait TypedSink1[A] extends TypedSink[Tuple1[A]] { final def setter[Z <: Tuple1[A]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple1[A]]) } 
-trait TypedSink2[A,B] extends TypedSink[Tuple2[A,B]] { - final def setter[Z <: Tuple2[A,B]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple2[A,B]]) +trait TypedSink2[A, B] extends TypedSink[Tuple2[A, B]] { + final def setter[Z <: Tuple2[A, B]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple2[A, B]]) } -trait TypedSink3[A,B,C] extends TypedSink[Tuple3[A,B,C]] { - final def setter[Z <: Tuple3[A,B,C]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple3[A,B,C]]) +trait TypedSink3[A, B, C] extends TypedSink[Tuple3[A, B, C]] { + final def setter[Z <: Tuple3[A, B, C]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple3[A, B, C]]) } -trait TypedSink4[A,B,C,D] extends TypedSink[Tuple4[A,B,C,D]] { - final def setter[Z <: Tuple4[A,B,C,D]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple4[A,B,C,D]]) +trait TypedSink4[A, B, C, D] extends TypedSink[Tuple4[A, B, C, D]] { + final def setter[Z <: Tuple4[A, B, C, D]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple4[A, B, C, D]]) } -trait TypedSink5[A,B,C,D,E] extends TypedSink[Tuple5[A,B,C,D,E]] { - final def setter[Z <: Tuple5[A,B,C,D,E]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple5[A,B,C,D,E]]) +trait TypedSink5[A, B, C, D, E] extends TypedSink[Tuple5[A, B, C, D, E]] { + final def setter[Z <: Tuple5[A, B, C, D, E]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple5[A, B, C, D, E]]) } -trait TypedSink6[A,B,C,D,E,F] extends TypedSink[Tuple6[A,B,C,D,E,F]] { - final def setter[Z <: Tuple6[A,B,C,D,E,F]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple6[A,B,C,D,E,F]]) +trait TypedSink6[A, B, C, D, E, F] extends TypedSink[Tuple6[A, B, C, D, E, F]] { + final def setter[Z <: Tuple6[A, B, C, D, E, F]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple6[A, B, C, D, E, F]]) } -trait TypedSink7[A,B,C,D,E,F,G] extends TypedSink[Tuple7[A,B,C,D,E,F,G]] { - final def setter[Z <: Tuple7[A,B,C,D,E,F,G]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple7[A,B,C,D,E,F,G]]) +trait TypedSink7[A, B, C, D, E, F, G] extends TypedSink[Tuple7[A, B, C, D, E, F, G]] { + final 
def setter[Z <: Tuple7[A, B, C, D, E, F, G]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple7[A, B, C, D, E, F, G]]) } -trait TypedSink8[A,B,C,D,E,F,G,H] extends TypedSink[Tuple8[A,B,C,D,E,F,G,H]] { - final def setter[Z <: Tuple8[A,B,C,D,E,F,G,H]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple8[A,B,C,D,E,F,G,H]]) +trait TypedSink8[A, B, C, D, E, F, G, H] extends TypedSink[Tuple8[A, B, C, D, E, F, G, H]] { + final def setter[Z <: Tuple8[A, B, C, D, E, F, G, H]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple8[A, B, C, D, E, F, G, H]]) } -trait TypedSink9[A,B,C,D,E,F,G,H,I] extends TypedSink[Tuple9[A,B,C,D,E,F,G,H,I]] { - final def setter[Z <: Tuple9[A,B,C,D,E,F,G,H,I]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple9[A,B,C,D,E,F,G,H,I]]) +trait TypedSink9[A, B, C, D, E, F, G, H, I] extends TypedSink[Tuple9[A, B, C, D, E, F, G, H, I]] { + final def setter[Z <: Tuple9[A, B, C, D, E, F, G, H, I]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple9[A, B, C, D, E, F, G, H, I]]) } -trait TypedSink10[A,B,C,D,E,F,G,H,I,J] extends TypedSink[Tuple10[A,B,C,D,E,F,G,H,I,J]] { - final def setter[Z <: Tuple10[A,B,C,D,E,F,G,H,I,J]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple10[A,B,C,D,E,F,G,H,I,J]]) +trait TypedSink10[A, B, C, D, E, F, G, H, I, J] extends TypedSink[Tuple10[A, B, C, D, E, F, G, H, I, J]] { + final def setter[Z <: Tuple10[A, B, C, D, E, F, G, H, I, J]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple10[A, B, C, D, E, F, G, H, I, J]]) } -trait TypedSink11[A,B,C,D,E,F,G,H,I,J,K] extends TypedSink[Tuple11[A,B,C,D,E,F,G,H,I,J,K]] { - final def setter[Z <: Tuple11[A,B,C,D,E,F,G,H,I,J,K]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple11[A,B,C,D,E,F,G,H,I,J,K]]) +trait TypedSink11[A, B, C, D, E, F, G, H, I, J, K] + extends TypedSink[Tuple11[A, B, C, D, E, F, G, H, I, J, K]] { + final def setter[Z <: Tuple11[A, B, C, D, E, F, G, H, I, J, K]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple11[A, B, C, D, E, F, G, H, I, J, K]]) } -trait TypedSink12[A,B,C,D,E,F,G,H,I,J,K,L] 
extends TypedSink[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] { - final def setter[Z <: Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple12[A,B,C,D,E,F,G,H,I,J,K,L]]) +trait TypedSink12[A, B, C, D, E, F, G, H, I, J, K, L] + extends TypedSink[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] { + final def setter[Z <: Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple12[A, B, C, D, E, F, G, H, I, J, K, L]]) } -trait TypedSink13[A,B,C,D,E,F,G,H,I,J,K,L,M] extends TypedSink[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] { - final def setter[Z <: Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple13[A,B,C,D,E,F,G,H,I,J,K,L,M]]) +trait TypedSink13[A, B, C, D, E, F, G, H, I, J, K, L, M] + extends TypedSink[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] { + final def setter[Z <: Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple13[A, B, C, D, E, F, G, H, I, J, K, L, M]]) } -trait TypedSink14[A,B,C,D,E,F,G,H,I,J,K,L,M,N] extends TypedSink[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] { - final def setter[Z <: Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple14[A,B,C,D,E,F,G,H,I,J,K,L,M,N]]) +trait TypedSink14[A, B, C, D, E, F, G, H, I, J, K, L, M, N] + extends TypedSink[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] { + final def setter[Z <: Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple14[A, B, C, D, E, F, G, H, I, J, K, L, M, N]]) } -trait TypedSink15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O] extends TypedSink[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] { - final def setter[Z <: Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple15[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O]]) +trait TypedSink15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O] + extends TypedSink[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]] { + final def setter[Z <: Tuple15[A, B, C, D, 
E, F, G, H, I, J, K, L, M, N, O]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple15[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]]) } -trait TypedSink16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P] extends TypedSink[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] { - final def setter[Z <: Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple16[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P]]) +trait TypedSink16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P] + extends TypedSink[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] { + final def setter[Z <: Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple16[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P]]) } -trait TypedSink17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q] extends TypedSink[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] { - final def setter[Z <: Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple17[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q]]) +trait TypedSink17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q] + extends TypedSink[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] { + final def setter[Z <: Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple17[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q]]) } -trait TypedSink18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R] extends TypedSink[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] { - final def setter[Z <: Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple18[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R]]) +trait TypedSink18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R] + extends TypedSink[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] { + final def setter[Z <: Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple18[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R]]) } -trait 
TypedSink19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S] extends TypedSink[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] { - final def setter[Z <: Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple19[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S]]) +trait TypedSink19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S] + extends TypedSink[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] { + final def setter[Z <: Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]] = + TupleSetter.asSubSetter(TupleSetter.of[Tuple19[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S]]) } -trait TypedSink20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T] extends TypedSink[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] { - final def setter[Z <: Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple20[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T]]) +trait TypedSink20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T] + extends TypedSink[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] { + final def setter[Z <: Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] = + TupleSetter.asSubSetter( + TupleSetter.of[Tuple20[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T]] + ) } -trait TypedSink21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U] extends TypedSink[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] { - final def setter[Z <: Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple21[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U]]) +trait TypedSink21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U] + extends TypedSink[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] { + final def setter[Z <: Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U]] = + TupleSetter.asSubSetter( + TupleSetter.of[Tuple21[A, B, C, D, E, F, G, H, I, J, K, L, M, N, 
O, P, Q, R, S, T, U]] + ) } -trait TypedSink22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V] extends TypedSink[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] { - final def setter[Z <: Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]] = TupleSetter.asSubSetter(TupleSetter.of[Tuple22[A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V]]) +trait TypedSink22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V] + extends TypedSink[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] { + final def setter[Z <: Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] = + TupleSetter.asSubSetter( + TupleSetter.of[Tuple22[A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V]] + ) } // end of autogenerated diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala deleted file mode 100644 index d4befed9be..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Grouped.scala +++ /dev/null @@ -1,283 +0,0 @@ -/* -Copyright 2013 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding.typed - -import java.io.Serializable - -import com.twitter.algebird.Semigroup -import com.twitter.scalding.TupleConverter.tuple2Converter - -import com.twitter.scalding._ - -import cascading.pipe.Pipe -import cascading.tuple.Fields - -import Dsl._ - -/** - * This encodes the rules that - * 1) sorting is only possible before doing any reduce, - * 2) reversing is only possible after sorting. - * 3) unsorted Groups can be CoGrouped or HashJoined - * - * This may appear a complex type, but it makes - * sure that code won't compile if it breaks the rule - */ -trait Grouped[K,+V] - extends KeyedListLike[K,V,UnsortedGrouped] - with HashJoinable[K,V] - with Sortable[V, ({type t[+x] = SortedGrouped[K, x] with Reversable[SortedGrouped[K, x]]})#t] - with WithReducers[Grouped[K,V]] - -/** After sorting, we are no longer CoGroupable, and we can only call reverse - * in the initial SortedGrouped created from the Sortable: - * .sortBy(_._2).reverse - * for instance - * - * Once we have sorted, we cannot do a HashJoin or a CoGrouping - */ -trait SortedGrouped[K,+V] - extends KeyedListLike[K,V,SortedGrouped] - with WithReducers[SortedGrouped[K,V]] - -/** This is the state after we have done some reducing. It is - * not possible to sort at this phase, but it is possible to - * do a CoGrouping or a HashJoin. 
- */ -trait UnsortedGrouped[K,+V] - extends KeyedListLike[K,V,UnsortedGrouped] - with HashJoinable[K,V] - with WithReducers[UnsortedGrouped[K,V]] - -object Grouped { - val ValuePosition: Int = 1 // The values are kept in this position in a Tuple - val valueField: Fields = new Fields("value") - val kvFields: Fields = new Fields("key", "value") - - def apply[K,V](pipe: TypedPipe[(K,V)])(implicit ordering: Ordering[K]): Grouped[K,V] = - IdentityReduce(ordering, pipe, None) - - def keySorting[T](ord : Ordering[T]): Fields = sorting("key", ord) - def valueSorting[T](implicit ord : Ordering[T]) : Fields = sorting("value", ord) - - def sorting[T](key : String, ord : Ordering[T]) : Fields = { - val f = new Fields(key) - f.setComparator(key, ord) - f - } -} - -trait Sortable[+T, +Sorted[+_]] { - def withSortOrdering[U >: T](so: Ordering[U]): Sorted[T] - - def sortBy[B:Ordering](fn : (T) => B): Sorted[T] = - withSortOrdering(Ordering.by(fn)) - - // Sorts the values for each key - def sorted[B >: T](implicit ord : Ordering[B]): Sorted[T] = - withSortOrdering(ord) - - def sortWith(lt : (T,T) => Boolean): Sorted[T] = - withSortOrdering(Ordering.fromLessThan(lt)) -} - -// Represents something that when we call reverse changes type to R -trait Reversable[+R] { - def reverse: R -} - -/** Represents anything that starts as a TypedPipe of Key Value, where - * the value type has been erased. Acts as proof that the K in the tuple - * has an Ordering - */ -trait KeyedPipe[K] { - def keyOrdering: Ordering[K] - def mapped: TypedPipe[(K, Any)] -} - -/** - * This is a class that models the logical portion of the reduce step. - * details like where this occurs, the number of reducers, etc... 
are - * left in the Grouped class - */ -sealed trait ReduceStep[K, V1] extends KeyedPipe[K] { - /** - * Note, this satisfies KeyedPipe.mapped: TypedPipe[(K, Any)] - */ - def mapped: TypedPipe[(K, V1)] - // make the pipe and group it, only here because it is common - protected def groupOp(gb: GroupBuilder => GroupBuilder): Pipe = - mapped.toPipe(Grouped.kvFields).groupBy(Grouped.keySorting(keyOrdering))(gb) -} - -case class IdentityReduce[K, V1]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - override val reducers: Option[Int]) - extends ReduceStep[K, V1] - with Grouped[K, V1] { - - - override def withSortOrdering[U >: V1](so: Ordering[U]): IdentityValueSortedReduce[K, V1] = - IdentityValueSortedReduce[K, V1](keyOrdering, mapped, so, reducers) - - override def withReducers(red: Int): IdentityReduce[K, V1] = - copy(reducers = Some(red)) - - override def filterKeys(fn: K => Boolean) = - IteratorMappedReduce(keyOrdering, mapped.filterKeys(fn), {(_, iter: Iterator[V1]) => iter}, reducers) - - override def mapGroup[V3](fn: (K, Iterator[V1]) => Iterator[V3]) = - IteratorMappedReduce(keyOrdering, mapped, fn, reducers) - - // It would be nice to return IdentityReduce here, but - // the type constraints prevent it currently - override def mapValues[V2](fn: V1 => V2) = - IteratorMappedReduce(keyOrdering, mapped.mapValues(fn), {(k, iter:Iterator[V2]) => iter}, reducers) - - // This is not correct in the type-system, but would be nice to encode - //override def mapValues[V3](fn: V1 => V3) = IdentityReduce(keyOrdering, mapped.mapValues(fn), reducers) - - override def sum[U >: V1](implicit sg: Semigroup[U]) = { - // there is no sort, mapValueStream or force to reducers: - val upipe: TypedPipe[(K, U)] = mapped // use covariance to set the type - IdentityReduce(keyOrdering, upipe.sumByLocalKeys, reducers).sumLeft - } - - override lazy val toTypedPipe = reducers match { - case None => mapped // free case - case Some(reds) => - // This is wierd, 
but it is sometimes used to force a partition - val reducedPipe = groupOp { _.reducers(reds) } - TypedPipe.from(reducedPipe, Grouped.kvFields)(tuple2Converter[K,V1]) - } - - /** This is just an identity that casts the result to V1 */ - override def joinFunction = { (k, iter, empties) => - assert(empties.isEmpty, "this join function should never be called with non-empty right-most") - iter.map(_.getObject(Grouped.ValuePosition).asInstanceOf[V1]) - } -} - -case class IdentityValueSortedReduce[K, V1]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - valueSort: Ordering[_ >: V1], - override val reducers: Option[Int] - ) extends ReduceStep[K, V1] - with SortedGrouped[K, V1] - with Reversable[IdentityValueSortedReduce[K, V1]] { - - override def reverse: IdentityValueSortedReduce[K, V1] = - IdentityValueSortedReduce[K, V1](keyOrdering, mapped, valueSort.reverse, reducers) - - override def withReducers(red: Int): IdentityValueSortedReduce[K, V1] = - // copy fails to get the types right, :/ - IdentityValueSortedReduce[K, V1](keyOrdering, mapped, valueSort, reducers = Some(red)) - - override def filterKeys(fn: K => Boolean) = - // copy fails to get the types right, :/ - IdentityValueSortedReduce[K, V1](keyOrdering, mapped.filterKeys(fn), valueSort, reducers) - - override def mapGroup[V3](fn: (K, Iterator[V1]) => Iterator[V3]) = - ValueSortedReduce[K, V1, V3](keyOrdering, mapped, valueSort, fn, reducers) - - override lazy val toTypedPipe = { - val reducedPipe = groupOp { - _.sortBy(Grouped.valueSorting(valueSort)) - .reducers(reducers.getOrElse(-1)) - } - TypedPipe.from(reducedPipe, Grouped.kvFields)(tuple2Converter[K,V1]) - } -} - -case class ValueSortedReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - valueSort: Ordering[_ >: V1], - reduceFn: (K, Iterator[V1]) => Iterator[V2], - override val reducers: Option[Int]) - extends ReduceStep[K, V1] with SortedGrouped[K, V2] { - - override 
def withReducers(red: Int) = - // copy infers loose types. :( - ValueSortedReduce[K, V1, V2]( - keyOrdering, mapped, valueSort, reduceFn, Some(red)) - - override def filterKeys(fn: K => Boolean) = - // copy fails to get the types right, :/ - ValueSortedReduce[K, V1, V2](keyOrdering, mapped.filterKeys(fn), valueSort, reduceFn, reducers) - - override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { - // don't make a closure - val localRed = reduceFn - val newReduce = {(k: K, iter: Iterator[V1]) => fn(k, localRed(k, iter))} - ValueSortedReduce[K, V1, V3]( - keyOrdering, mapped, valueSort, newReduce, reducers) - } - - override lazy val toTypedPipe = { - val vSort = Grouped.valueSorting(valueSort) - - val reducedPipe = groupOp { - _.sortBy(vSort) - .every(new cascading.pipe.Every(_, Grouped.valueField, - new TypedBufferOp(reduceFn, Grouped.valueField), Fields.REPLACE)) - .reducers(reducers.getOrElse(-1)) - } - TypedPipe.from(reducedPipe, Grouped.kvFields)(tuple2Converter[K,V2]) - } -} - -case class IteratorMappedReduce[K, V1, V2]( - override val keyOrdering: Ordering[K], - override val mapped: TypedPipe[(K, V1)], - reduceFn: (K, Iterator[V1]) => Iterator[V2], - override val reducers: Option[Int]) - extends ReduceStep[K, V1] with UnsortedGrouped[K, V2] { - - override def withReducers(red: Int): IteratorMappedReduce[K, V1, V2] = - copy(reducers = Some(red)) - - override def filterKeys(fn: K => Boolean) = - copy(mapped = mapped.filterKeys(fn)) - - override def mapGroup[V3](fn: (K, Iterator[V2]) => Iterator[V3]) = { - // don't make a closure - val localRed = reduceFn - val newReduce = {(k: K, iter: Iterator[V1]) => fn(k, localRed(k, iter))} - copy(reduceFn = newReduce) - } - - override lazy val toTypedPipe = { - val reducedPipe = groupOp { - _.every(new cascading.pipe.Every(_, Grouped.valueField, - new TypedBufferOp(reduceFn, Grouped.valueField), Fields.REPLACE)) - .reducers(reducers.getOrElse(-1)) - } - TypedPipe.from(reducedPipe, 
Grouped.kvFields)(tuple2Converter[K,V2]) - } - - override def joinFunction = { - // don't make a closure - val localRed = reduceFn; - { (k, iter, empties) => - assert(empties.isEmpty, "this join function should never be called with non-empty right-most") - localRed(k, iter.map(_.getObject(Grouped.ValuePosition).asInstanceOf[V1])) - } - } -} - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/HashJoinable.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/HashJoinable.scala deleted file mode 100644 index 44b45efdee..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/HashJoinable.scala +++ /dev/null @@ -1,53 +0,0 @@ -/* -Copyright 2014 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding.typed - -import cascading.pipe.HashJoin -import com.twitter.scalding._ - -// For the Fields conversions -import Dsl._ - -/** If we can HashJoin, then we can CoGroup, but not vice-versa - * i.e., HashJoinable is a strict subset of CoGroupable (CoGrouped, for instance - * is CoGroupable, but not HashJoinable). - */ -trait HashJoinable[K, +V] extends CoGroupable[K, V] with KeyedPipe[K] { - /** A HashJoinable has a single input into to the cogroup */ - override def inputs = List(mapped) - /** This fully replicates this entire Grouped to the argument: mapside. - * This means that we never see the case where the key is absent in the pipe. This - * means implementing a right-join (from the pipe) is impossible. 
- * Note, there is no reduce-phase in this operation. - * The next issue is that obviously, unlike a cogroup, for a fixed key, each joiner will - * NOT See all the tuples with those keys. This is because the keys on the left are - * distributed across many machines - * See hashjoin: - * http://docs.cascading.org/cascading/2.0/javadoc/cascading/pipe/HashJoin.html - */ - def hashCogroupOn[V1,R](mapside: TypedPipe[(K, V1)])(joiner: (K, V1, Iterable[V]) => Iterator[R]): TypedPipe[(K,R)] = { - // Note, the Ordering must have that compare(x,y)== 0 being consistent with hashCode and .equals to - // otherwise, there may be funky issues with cascading - val newPipe = new HashJoin(RichPipe.assignName(mapside.toPipe(('key, 'value))), - RichFields(StringField("key")(keyOrdering, None)), - mapped.toPipe(('key1, 'value1)), - RichFields(StringField("key1")(keyOrdering, None)), - new HashJoiner(joinFunction, joiner)) - - //Construct the new TypedPipe - TypedPipe.from[(K,R)](newPipe.project('key,'value), ('key, 'value)) - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala deleted file mode 100644 index 2b8b477490..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Joiner.scala +++ /dev/null @@ -1,51 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding.typed - -import com.twitter.scalding._ - -object Joiner extends java.io.Serializable { - def toCogroupJoiner2[K,V,U,R](hashJoiner : (K,V,Iterable[U]) => Iterator[R]) - : (K,Iterator[V], Iterable[U]) => Iterator[R] = { - (k : K, itv : Iterator[V], itu : Iterable[U]) => - itv.flatMap { hashJoiner(k,_,itu) } - } - - def hashInner2[K,V,U] = { (key: K, v: V, itu: Iterable[U]) => itu.iterator.map { (v,_) } } - def hashLeft2[K,V,U] = { (key: K, v: V, itu: Iterable[U]) => asOuter(itu.iterator).map { (v,_) } } - - def inner2[K,V,U] = { (key: K, itv: Iterator[V], itu: Iterable[U]) => - itv.flatMap { v => itu.map { u => (v,u) } } - } - def asOuter[U](it : Iterator[U]) : Iterator[Option[U]] = { - if(it.isEmpty) { - Iterator(None) - } - else { - it.map { Some(_) } - } - } - def outer2[K,V,U] = { (key: K, itv: Iterator[V], itu: Iterable[U]) => - asOuter(itv).flatMap { v => asOuter(itu.iterator).map { u => (v,u) } } - } - def left2[K,V,U] = { (key: K, itv: Iterator[V], itu: Iterable[U]) => - itv.flatMap { v => asOuter(itu.iterator).map { u => (v,u) } } - } - def right2[K,V,U] = { (key: K, itv: Iterator[V], itu: Iterable[U]) => - asOuter(itv).flatMap { v => itu.map { u => (v,u) } } - } -} - diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala deleted file mode 100644 index f58de3c140..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/KeyedList.scala +++ /dev/null @@ -1,235 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding.typed - -import java.io.Serializable -import java.util.PriorityQueue -import scala.collection.JavaConverters._ - -import com.twitter.algebird.{Semigroup, Ring, Aggregator} -import com.twitter.algebird.mutable.PriorityQueueMonoid - -import com.twitter.scalding._ - -object KeyedListLike { - implicit def toTypedPipe[K,V,S[K,+V] <: KeyedListLike[K,V,S]] - (keyed: KeyedListLike[K, V, S]): TypedPipe[(K, V)] = keyed.toTypedPipe -} - -/** This is for the case where you don't want to expose any structure - * but the ability to operate on an iterator of the values - */ -trait KeyedList[K, +T] extends KeyedListLike[K,T,KeyedList] - -/** Represents sharded lists of items of type T - * There are exactly two the fundamental operations: - * toTypedPipe: marks the end of the grouped-on-key operations. - * mapValueStream: further transforms all values, in order, one at a time, - * with a function from Iterator to another Iterator - */ -trait KeyedListLike[K, +T, +This[K,+T] <: KeyedListLike[K,T,This]] - extends java.io.Serializable { - - /** End of the operations on values. From this point on the keyed structure - * is lost and another shuffle is generally required to reconstruct it - */ - def toTypedPipe: TypedPipe[(K, T)] - - /** filter keys on a predicate. 
More efficient than filter if you are - * only looking at keys - */ - def filterKeys(fn: K => Boolean): This[K, T] - /* an inefficient implementation is below, but - * since this can always be pushed mapside, we should avoid - * using this implementation, lest we accidentally forget to - * implement the smart thing - */ - // mapGroup { (k: K, items: Iterator[T]) => if (fn(k)) items else Iterator.empty } - - - /** Operate on an Iterator[T] of all the values for each key at one time. - * Avoid accumulating the whole list in memory if you can. Prefer sum, - * which is partially executed map-side by default. - */ - def mapGroup[V](smfn : (K, Iterator[T]) => Iterator[V]): This[K, V] - - /////////// - /// The below are all implemented in terms of the above: - /////////// - - /** Use Algebird Aggregator to do the reduction - */ - def aggregate[B,C](agg: Aggregator[T,B,C]): This[K,C] = - mapValues[B](agg.prepare(_)) - .reduce[B](agg.reduce _) - .mapValues[C](agg.present(_)) - - /** .filter(fn).toTypedPipe == .toTypedPipe.filter(fn) - * It is generally better to avoid going back to a TypedPipe - * as long as possible: this minimizes the times we go in - * and out of cascading/hadoop types. - */ - def filter(fn: ((K, T)) => Boolean): This[K, T] = - mapGroup { (k: K, items: Iterator[T]) => items.filter { t => fn((k, t)) } } - - /** This is just short hand for mapValueStream(identity), it makes sure the - * planner sees that you want to force a shuffle. For expert tuning - */ - def forceToReducers: This[K,T] = - mapValueStream(identity) - - /** Use this to get the first value encountered. - * prefer this to take(1). 
- */ - def head: This[K, T] = sum { - new Semigroup[T] { - override def plus(left: T, right: T) = left - // Don't enumerate every item, just take the first - override def sumOption(to: TraversableOnce[T]): Option[T] = - if(to.isEmpty) None - else Some(to.toIterator.next) - } - } - - /** This is a special case of mapValueStream, but can be optimized because it doesn't need - * all the values for a given key at once. An unoptimized implementation is: - * mapValueStream { _.map { fn } } - * but for Grouped we can avoid resorting to mapValueStream - */ - def mapValues[V](fn : T => V): This[K, V] = - mapGroup { (_, iter) => iter.map(fn) } - - /** Use this when you don't care about the key for the group, - * otherwise use mapGroup - */ - def mapValueStream[V](smfn : Iterator[T] => Iterator[V]): This[K, V] = - mapGroup { (k: K, items: Iterator[T]) => smfn(items) } - - /** - * If there is no ordering, we default to assuming the Semigroup is - * commutative. If you don't want that, define an ordering on the Values, - * or .forceToReducers. - * - * Semigroups MAY have a faster implementation of sum for iterators, - * so prefer using sum/sumLeft to reduce - */ - def sum[U >: T](implicit sg: Semigroup[U]): This[K, U] = sumLeft[U] - - /** reduce with fn which must be associative and commutative. - * Like the above this can be optimized in some Grouped cases. - * If you don't have a commutative operator, use reduceLeft - */ - def reduce[U >: T](fn : (U,U) => U): This[K, U] = sum(Semigroup.from(fn)) - - /** Take the largest k things according to the implicit ordering. - * Useful for top-k without having to call ord.reverse - */ - def sortedReverseTake(k: Int)(implicit ord: Ordering[_ >: T]): This[K, Seq[T]] = - sortedTake(k)(ord.reverse) - - /** This implements bottom-k (smallest k items) on each mapper for each key, then - * sends those to reducers to get the result. This is faster - * than using .take if k * (number of Keys) is small enough - * to fit in memory. 
- */ - def sortedTake(k: Int)(implicit ord: Ordering[_ >: T]): This[K, Seq[T]] = { - // cast because Ordering is not contravariant, but could be (and this cast is safe) - val ordT: Ordering[T] = ord.asInstanceOf[Ordering[T]] - val mon = new PriorityQueueMonoid[T](k)(ordT) - mapValues(mon.build(_)) - .sum(mon) // results in a PriorityQueue - // scala can't infer the type, possibly due to the view bound on TypedPipe - .mapValues(_.iterator.asScala.toList.sorted(ordT)) - } - - /** Like the above, but with a less than operation for the ordering */ - def sortWithTake[U >: T](k: Int)(lessThan: (U, U) => Boolean): This[K, Seq[T]] = - sortedTake(k)(Ordering.fromLessThan(lessThan)) - - def product[U >: T](implicit ring : Ring[U]): This[K, U] = reduce(ring.times) - - def count(fn : T => Boolean) : This[K, Long] = - mapValues { t => if (fn(t)) 1L else 0L }.sum - - def forall(fn : T => Boolean): This[K, Boolean] = - mapValues { fn(_) }.product - - /** - * Selects all elements except first n ones. - */ - def drop(n: Int): This[K, T] = - mapValueStream { _.drop(n) } - - /** - * Drops longest prefix of elements that satisfy the given predicate. - */ - def dropWhile(p: (T) => Boolean): This[K, T] = - mapValueStream {_.dropWhile(p)} - - /** - * Selects first n elements. Don't use this if n == 1, head is faster in that case. - */ - def take(n: Int): This[K, T] = - mapValueStream {_.take(n)} - - /** - * Takes longest prefix of elements that satisfy the given predicate. - */ - def takeWhile(p: (T) => Boolean): This[K, T] = - mapValueStream {_.takeWhile(p)} - - def foldLeft[B](z : B)(fn : (B,T) => B): This[K, B] = - mapValueStream { stream => Iterator(stream.foldLeft(z)(fn)) } - - def scanLeft[B](z : B)(fn : (B,T) => B): This[K, B] = - mapValueStream { _.scanLeft(z)(fn) } - - // Similar to reduce but always on the reduce-side (never optimized to mapside), - // and named for the scala function. fn need not be associative and/or commutative. 
- // Makes sense when you want to reduce, but in a particular sorted order. - // the old value comes in on the left. - def reduceLeft[U >: T](fn : (U,U) => U): This[K, U] = - sumLeft[U](Semigroup.from(fn)) - - /** - * Semigroups MAY have a faster implementation of sum for iterators, - * so prefer using sum/sumLeft to reduce/reduceLeft - */ - def sumLeft[U >: T](implicit sg: Semigroup[U]): This[K, U] = - mapValueStream[U](Semigroup.sumOption[U](_).iterator) - - def size : This[K,Long] = mapValues { x => 1L }.sum - def toList : This[K,List[T]] = mapValues { List(_) }.sum - // Note that toSet needs to be parameterized even though toList does not. - // This is because List is covariant in its type parameter in the scala API, - // but Set is invariant. See: - // http://stackoverflow.com/questions/676615/why-is-scalas-immutable-set-not-covariant-in-its-type - def toSet[U >: T] : This[K,Set[U]] = mapValues { Set[U](_) }.sum - def max[B >: T](implicit cmp : Ordering[B]): This[K, T] = - reduce(cmp.max).asInstanceOf[This[K, T]] - - def maxBy[B](fn : T => B)(implicit cmp : Ordering[B]): This[K, T] = - reduce(Ordering.by(fn).max) - - def min[B >: T](implicit cmp: Ordering[B]): This[K, T] = - reduce(cmp.min).asInstanceOf[This[K,T]] - - def minBy[B](fn : T => B)(implicit cmp: Ordering[B]): This[K,T] = - reduce(Ordering.by(fn).min) - - def keys: TypedPipe[K] = toTypedPipe.keys - def values: TypedPipe[T] = toTypedPipe.values -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala deleted file mode 100644 index a40121aeeb..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/LookupJoin.scala +++ /dev/null @@ -1,124 +0,0 @@ -/* - Copyright 2013 Twitter, Inc. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package com.twitter.scalding.typed - -import java.io.Serializable - -/** - * lookupJoin simulates the behavior of a realtime system attempting - * to leftJoin (K, V) pairs against some other value type (JoinedV) - * by performing realtime lookups on a key-value Store. - * - * An example would join (K, V) pairs of (URL, Username) against a - * service of (URL, ImpressionCount). The result of this join would - * be a pipe of (ShortenedURL, (Username, - * Option[ImpressionCount])). - * - * To simulate this behavior, lookupJoin accepts pipes of key-value - * pairs with an explicit time value T attached. T must have some - * sensical ordering. The semantics are, if one were to hit the - * right pipe's simulated realtime service at any time between - * T(tuple) T(tuple + 1), one would receive Some((K, - * JoinedV)(tuple)). - * - * The entries in the left pipe's tuples have the following - * meaning: - * - * T: The the time at which the (K, W) lookup occurred. - * K: the join key. - * W: the current value for the join key. - * - * The right pipe's entries have the following meaning: - * - * T: The time at which the "service" was fed an update - * K: the join K. - * V: value of the key at time T - * - * Before the time T in the right pipe's very first entry, the - * simulated "service" will return None. After this time T, the - * right side will return None only if the key is absent, - * else, the service will return Some(joinedV). 
- */ -object LookupJoin extends Serializable { - def apply[T:Ordering, K:Ordering, V, JoinedV](left: TypedPipe[(T, (K, V))], right: TypedPipe[(T, (K, JoinedV))]): - TypedPipe[(T, (K, (V, Option[JoinedV])))] = { - /** - * Implicit ordering on an either that doesn't care about the - * actual container values, puts the lookups before the service - * writes Since we assume it takes non-zero time to do a lookup. - */ - implicit def eitherOrd[T, U]: Ordering[Either[T, U]] = - new Ordering[Either[T, U]] { - def compare(l: Either[T, U], r: Either[T, U]) = - (l, r) match { - case (Left(_), Right(_)) => -1 - case (Right(_), Left(_)) => 1 - case (Left(_), Left(_)) => 0 - case (Right(_), Right(_)) => 0 - } - } - - val joined: TypedPipe[(K, (Option[JoinedV], Option[(T, V, Option[JoinedV])]))] = - left.map { case (t, (k, v)) => (k, (t, Left(v): Either[V, JoinedV])) } - .++(right.map { case (t, (k, joinedV)) => (k, (t, Right(joinedV): Either[V, JoinedV])) }) - .group - .sortBy(identity) // time then left before right - /** - * Grouping by K leaves values of (T, Either[V, JoinedV]). Sort - * by time and scanLeft. The iterator will now represent pairs of - * T and either new values to join against or updates to the - * simulated "realtime store" described above. - */ - .scanLeft( - /** - * In the simulated realtime store described above, this - * None is the value in the store at the current - * time. Because we sort by time and scan forward, this - * value will be updated with a new value every time a - * Right(delta) shows up in the iterator. - * - * The second entry in the pair will be None when the - * JoinedV is updated and Some(newValue) when a (K, V) - * shows up and a new join occurs. 
- */ - (None: Option[JoinedV], None: Option[(T, V, Option[JoinedV])]) - ) { case ((lastJoined, _), (thisTime, leftOrRight)) => - leftOrRight match { - // Left(v) means that we have a new value from the left - // pipe that we need to join against the current - // "lastJoined" value sitting in scanLeft's state. This - // is equivalent to a lookup on the data in the right - // pipe at time "thisTime". - case Left(v) => (lastJoined, Some((thisTime, v, lastJoined))) - - // Right(joinedV) means that we've received a new value - // to use in the simulated realtime service described in - // the comments above - case Right(joined) => (Some(joined), None) - } - }.toTypedPipe - - for { - // Now, get rid of residual state from the scanLeft above: - (k, (_, optV)) <- joined - - // filter out every event that produced a Right(delta) above, - // leaving only the leftJoin events that occurred above: - (t, v, optJoined) <- optV - } yield (t, (k, (v, optJoined))) - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala new file mode 100644 index 0000000000..cf8c39b7ec --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/MemorySink.scala @@ -0,0 +1,51 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +import com.twitter.scalding._ + +import scala.collection.mutable.Buffer + +import java.util.UUID + +import cascading.pipe.Pipe +import cascading.flow.FlowDef +import cascading.scheme.NullScheme +import cascading.tuple.Tuple + +/* + * This is useful for in-memory testing with Execution + * It only works for CascadingLocal mode. + */ +class MemorySink[T] extends TypedSink[T] { + private[this] val buf = Buffer[Tuple]() + private[this] val name: String = UUID.randomUUID.toString + + // takes a copy as of NOW. Don't call this before the job has run + def readResults: Iterable[T] = + buf.iterator.map(_.getObject(0).asInstanceOf[T]).toList + + def setter[U <: T] = TupleSetter.asSubSetter(TupleSetter.singleSetter[T]) + def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = + mode match { + case cl: CascadingLocal => + val tap = new MemoryTap(new NullScheme(sinkFields, sinkFields), buf) + flowDef.addSink(name, tap) + flowDef.addTail(new Pipe(name, pipe)) + pipe + case _ => sys.error("MemorySink only usable with cascading local") + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala new file mode 100644 index 0000000000..1abd31d573 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionSchemed.scala @@ -0,0 +1,85 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package com.twitter.scalding +package typed + +import cascading.tap.hadoop.PartitionTap +import cascading.tap.local.{FileTap, PartitionTap => LocalPartitionTap} +import cascading.tap.{SinkMode, Tap} +import cascading.tuple.Fields + +/** + * Trait to assist with creating partitioned sources. + * + * Apart from the abstract members below, `hdfsScheme` and `localScheme` also need to be set. Note that for + * both of them the sink fields need to be set to only include the actual fields that should be written to + * file and not the partition fields. + */ +trait PartitionSchemed[P, T] + extends SchemedSource + with TypedSink[(P, T)] + with Mappable[(P, T)] + with HfsTapProvider { + def path: String + def template: String + def valueSetter: TupleSetter[T] + def valueConverter: TupleConverter[T] + def partitionSetter: TupleSetter[P] + def partitionConverter: TupleConverter[P] + def fields: Fields + + // The partition fields, offset by the value arity. + def partitionFields = + PartitionUtil.toFields(valueSetter.arity, valueSetter.arity + partitionSetter.arity) + + /* + Advertise all the sinkFields, both the value and partition ones, this needs to be like this even + though it is the incorrect sink fields, otherwise scalding validation falls over. The sink fields + of the scheme itself then to be over written to only include the actual sink fields. + */ + override def sinkFields: Fields = fields.append(partitionFields) + + /** + * Combine both the partition and value converter to extract the data from a flat cascading tuple into a + * pair of `P` and `T`. + */ + override def converter[U >: (P, T)] = + PartitionUtil.converter[P, T, U](valueConverter, partitionConverter) + + /** Flatten a pair of `P` and `T` into a cascading tuple. 
*/ + override def setter[U <: (P, T)] = + PartitionUtil.setter[P, T, U](valueSetter, partitionSetter) + + /** Creates the taps for local and hdfs mode. */ + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = + mode match { + case Local(_) => { + val fileTap = new FileTap(localScheme, path, SinkMode.REPLACE) + new LocalPartitionTap(fileTap, new TemplatePartition(partitionFields, template), SinkMode.UPDATE) + .asInstanceOf[Tap[_, _, _]] + } + case Hdfs(_, _) => { + val hfs = createHfsTap(hdfsScheme, path, SinkMode.REPLACE) + new PartitionTap(hfs, new TemplatePartition(partitionFields, template), SinkMode.UPDATE) + .asInstanceOf[Tap[_, _, _]] + } + case hdfsTest @ HadoopTest(_, _) => { + val hfs = createHfsTap(hdfsScheme, hdfsTest.getWritePathFor(this), SinkMode.REPLACE) + new PartitionTap(hfs, new TemplatePartition(partitionFields, template), SinkMode.UPDATE) + .asInstanceOf[Tap[_, _, _]] + } + case _ => TestTapFactory(this, hdfsScheme).createTap(readOrWrite) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala new file mode 100644 index 0000000000..a53135ff89 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionUtil.scala @@ -0,0 +1,66 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package com.twitter.scalding +package typed + +import cascading.tuple.{Fields, Tuple, TupleEntry} + +/** Utility functions to assist with creating partitioned sourced. */ +object PartitionUtil { + // DO NOT USE intFields, scalding / cascading Fields.merge is broken and gets called in bowels of + // TemplateTap. See scalding/#803. + def toFields(start: Int, end: Int): Fields = + Dsl.strFields((start until end).map(_.toString)) + + /** A tuple converter that splits a cascading tuple into a pair of types. */ + def converter[P, T, U >: (P, T)](valueConverter: TupleConverter[T], partitionConverter: TupleConverter[P]) = + TupleConverter.asSuperConverter[(P, T), U](new TupleConverter[(P, T)] { + val arity = valueConverter.arity + partitionConverter.arity + + def apply(te: TupleEntry): (P, T) = { + val value = Tuple.size(valueConverter.arity) + val partition = Tuple.size(partitionConverter.arity) + + (0 until valueConverter.arity).foreach(idx => value.set(idx, te.getObject(idx))) + (0 until partitionConverter.arity) + .foreach(idx => partition.set(idx, te.getObject(idx + valueConverter.arity))) + + val valueTE = new TupleEntry(toFields(0, valueConverter.arity), value) + val partitionTE = new TupleEntry(toFields(0, partitionConverter.arity), partition) + + (partitionConverter(partitionTE), valueConverter(valueTE)) + } + }) + + /** A tuple setter for a pair of types which are flattened into a cascading tuple. 
*/ + def setter[P, T, U <: (P, T)]( + valueSetter: TupleSetter[T], + partitionSetter: TupleSetter[P] + ): TupleSetter[U] = + TupleSetter.asSubSetter[(P, T), U](new TupleSetter[(P, T)] { + val arity = valueSetter.arity + partitionSetter.arity + + def apply(in: (P, T)) = { + val partition = partitionSetter(in._1) + val value = valueSetter(in._2) + val output = Tuple.size(partition.size + value.size) + + (0 until value.size).foreach(idx => output.set(idx, value.getObject(idx))) + (0 until partition.size).foreach(idx => output.set(idx + value.size, partition.getObject(idx))) + + output + } + }) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala new file mode 100644 index 0000000000..3144db6842 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedDelimitedSource.scala @@ -0,0 +1,146 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.twitter.scalding +package typed + +import java.util.Properties +import java.io.{InputStream, OutputStream, Serializable} + +import cascading.scheme.Scheme +import cascading.scheme.hadoop.TextDelimited +import cascading.scheme.local.{TextDelimited => LocalTextDelimited} +import cascading.tuple.Fields + +/** + * Scalding source to read or write partitioned delimited text. 
+ * + * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and `T` is the + * output to write out. Below is an example. + * {{{ + * val data = List( + * (("a", "x"), ("i", 1)), + * (("a", "y"), ("j", 2)), + * (("b", "z"), ("k", 3)) + * ) + * IterablePipe(data, flowDef, mode) + * .write(PartitionedDelimited[(String, String), (String, Int)](args("out"), "col1=%s/col2=%s")) + * }}} + * + * For reading it produces a pair `(P, T)` where `P` is the partition data and `T` is data in the files. Below + * is an example. + * {{{ + * val in: TypedPipe[((String, String), (String, Int))] = PartitionedDelimited[(String, String), (String, Int)](args("in"), "col1=%s/col2=%s") + * }}} + */ +case class PartitionedDelimitedSource[P, T]( + path: String, + template: String, + separator: String, + fields: Fields, + skipHeader: Boolean = false, + writeHeader: Boolean = false, + quote: String = "\"", + strict: Boolean = true, + safe: Boolean = true +)(implicit + mt: Manifest[T], + val valueSetter: TupleSetter[T], + val valueConverter: TupleConverter[T], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends PartitionSchemed[P, T] + with Serializable { + assert( + fields.size == valueSetter.arity, + "The number of fields needs to be the same as the arity of the value setter" + ) + + val types: Array[Class[_]] = + if (classOf[scala.Product].isAssignableFrom(mt.runtimeClass)) { + // Assume this is a Tuple: + mt.typeArguments.map(_.runtimeClass).toArray + } else { + // Assume there is only a single item + Array(mt.runtimeClass) + } + + // Create the underlying scheme and explicitly set the sink fields to be only the specified fields + // see sinkFields in PartitionSchemed for other half of this work around. 
+ override def hdfsScheme = { + val scheme = + HadoopSchemeInstance( + new TextDelimited(fields, null, skipHeader, writeHeader, separator, strict, quote, types, safe) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) + scheme.setSinkFields(fields) + scheme + } + + // Create the underlying scheme and explicitly set the sink fields to be only the specified fields + // see sinkFields in PartitionSchemed for other half of this work around. + override def localScheme = { + val scheme = + new LocalTextDelimited(fields, skipHeader, writeHeader, separator, strict, quote, types, safe) + .asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] + scheme.setSinkFields(fields) + scheme + } +} + +/** + * Trait to assist with creating objects such as [[PartitionedTsv]] to read from separated files. Override + * separator, skipHeader, writeHeader as needed. + */ +trait PartitionedDelimited extends Serializable { + def separator: String + + def apply[P: Manifest: TupleConverter: TupleSetter, T: Manifest: TupleConverter: TupleSetter]( + path: String, + template: String + ): PartitionedDelimitedSource[P, T] = + PartitionedDelimitedSource( + path, + template, + separator, + PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity) + ) + + def apply[P: Manifest: TupleConverter: TupleSetter, T: Manifest: TupleConverter: TupleSetter]( + path: String, + template: String, + fields: Fields + ): PartitionedDelimitedSource[P, T] = + PartitionedDelimitedSource(path, template, separator, fields) +} + +/** Partitioned typed tab separated source. */ +object PartitionedTsv extends PartitionedDelimited { + val separator = "\t" +} + +/** Partitioned typed commma separated source. */ +object PartitionedCsv extends PartitionedDelimited { + val separator = "," +} + +/** Partitioned typed pipe separated source. */ +object PartitionedPsv extends PartitionedDelimited { + val separator = "|" +} + +/** Partitioned typed `\1` separated source (commonly used by Pig). 
*/ +object PartitionedOsv extends PartitionedDelimited { + val separator = "\u0001" +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala new file mode 100644 index 0000000000..dece665146 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/PartitionedTextLine.scala @@ -0,0 +1,137 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.twitter.scalding +package typed + +import java.util.Properties +import java.io.{InputStream, OutputStream} + +import cascading.scheme.Scheme +import cascading.scheme.hadoop.TextLine +import cascading.scheme.local.{TextLine => LocalTextLine} +import cascading.tap.{SinkMode, Tap} +import cascading.tap.hadoop.PartitionTap +import cascading.tap.local.{FileTap, PartitionTap => LocalPartitionTap} +import cascading.tuple.Fields + +/** + * Scalding source to read or write partitioned text. + * + * For writing it expects a pair of `(P, String)`, where `P` is the data used for partitioning and `String` is + * the output to write out. Below is an example. 
+ * {{{ + * val data = List( + * (("a", "x"), "line1"), + * (("a", "y"), "line2"), + * (("b", "z"), "line3") + * ) + * IterablePipe(data, flowDef, mode) + * .write(PartitionTextLine[(String, String)](args("out"), "col1=%s/col2=%s")) + * }}} + * + * For reading it produces a pair `(P, (Long, String))` where `P` is the partition data, `Long` is the offset + * into the file and `String` is a line from the file. Below is an example. + * {{{ + * val in: TypedPipe[((String, String), (Long, String))] = PartitionTextLine[(String, String)](args("in"), "col1=%s/col2=%s") + * }}} + * + * @param path + * Base path of the partitioned directory + * @param template + * Template for the partitioned path + * @param encoding + * Text encoding of the file content + */ +case class PartitionedTextLine[P]( + path: String, + template: String, + encoding: String = TextLine.DEFAULT_CHARSET +)(implicit + val valueSetter: TupleSetter[String], + val valueConverter: TupleConverter[(Long, String)], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends SchemedSource + with TypedSink[(P, String)] + with Mappable[(P, (Long, String))] + with HfsTapProvider + with java.io.Serializable { + + // The partition fields, offset by the value arity. + val partitionFields = + PartitionUtil.toFields(valueSetter.arity, valueSetter.arity + partitionSetter.arity) + + // Create the underlying scheme and explicitly set the sink fields to be only the specified fields + // see sinkFields in PartitionSchemed for other half of this work around. + override def hdfsScheme = { + val scheme = + HadoopSchemeInstance( + new TextLine(TextLine.DEFAULT_SOURCE_FIELDS, encoding) + .asInstanceOf[Scheme[_, _, _, _, _]] + ) + scheme.setSinkFields(PartitionUtil.toFields(0, valueSetter.arity)) + scheme + } + + // Create the underlying scheme and explicitly set the sink fields to be only the specified fields + // see sinkFields in PartitionSchemed for other half of this work around. 
+ override def localScheme = { + val scheme = + new LocalTextLine(TextLine.DEFAULT_SOURCE_FIELDS, encoding) + .asInstanceOf[Scheme[Properties, InputStream, OutputStream, _, _]] + scheme.setSinkFields(PartitionUtil.toFields(0, valueSetter.arity)) + scheme + } + + /* + Advertise all the sinkFields, both the value and partition ones, this needs to be like this even + though it is the incorrect sink fields, otherwise scalding validation falls over, see hdfsScheme + for other part of tweak to narrow fields back to value again to work around this. + */ + override def sinkFields: Fields = + PartitionUtil.toFields(0, valueSetter.arity + partitionSetter.arity) + + /** Creates the taps for local and hdfs mode. */ + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = + mode match { + case Local(_) => { + val fileTap = new FileTap(localScheme, path, SinkMode.REPLACE) + new LocalPartitionTap(fileTap, new TemplatePartition(partitionFields, template), SinkMode.UPDATE) + .asInstanceOf[Tap[_, _, _]] + } + case Hdfs(_, _) => { + val hfs = createHfsTap(hdfsScheme, path, SinkMode.REPLACE) + new PartitionTap(hfs, new TemplatePartition(partitionFields, template), SinkMode.UPDATE) + .asInstanceOf[Tap[_, _, _]] + } + case hdfsTest @ HadoopTest(_, _) => { + val hfs = createHfsTap(hdfsScheme, hdfsTest.getWritePathFor(this), SinkMode.REPLACE) + new PartitionTap(hfs, new TemplatePartition(partitionFields, template), SinkMode.UPDATE) + .asInstanceOf[Tap[_, _, _]] + } + case _ => TestTapFactory(this, hdfsScheme).createTap(readOrWrite) + } + + /** + * Combine both the partition and value converter to extract the data from a flat cascading tuple into a + * pair of `P` and `(offset, line)`. + */ + override def converter[U >: (P, (Long, String))] = + PartitionUtil.converter[P, (Long, String), U](valueConverter, partitionConverter) + + /** Flatten a pair of `P` and `line` into a cascading tuple. 
*/ + override def setter[U <: (P, String)] = + PartitionUtil.setter[P, String, U](valueSetter, partitionSetter) +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala deleted file mode 100644 index d1d778ff85..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/Sketched.scala +++ /dev/null @@ -1,90 +0,0 @@ -/* -Copyright 2014 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding.typed - -import com.twitter.algebird.{CMS,MurmurHash128} - -case class Sketched[K,V] - (pipe: TypedPipe[(K,V)], - numReducers: Int, - delta: Double, - eps: Double, - seed: Int) - (implicit serialization: K => Array[Byte], - ordering: Ordering[K]) - extends HasReducers { - - val reducers = Some(numReducers) - - private lazy val murmurHash = MurmurHash128(seed) - def hash(key: K) : Long = murmurHash(serialization(key))._1 - - private lazy implicit val cms = CMS.monoid(eps, delta, seed) - lazy val sketch : TypedPipe[CMS] = - pipe - .map{kv => cms.create(hash(kv._1))} - .groupAll - .sum - .values - - def cogroup[V2,R](right: TypedPipe[(K,V2)]) - (joiner: (K, V, Iterable[V2]) => Iterator[R]) : SketchJoined[K,V,V2,R] = - new SketchJoined(this, right, numReducers)(joiner) - - def join[V2](right: TypedPipe[(K,V2)]) = cogroup(right)(Joiner.hashInner2) - def leftJoin[V2](right: TypedPipe[(K,V2)]) = cogroup(right)(Joiner.hashLeft2) -} - -case class 
SketchJoined[K:Ordering,V,V2,R] - (left: Sketched[K,V], - right: TypedPipe[(K,V2)], - numReducers: Int) - (joiner: (K, V, Iterable[V2]) => Iterator[R]) - extends HasReducers { - - val reducers = Some(numReducers) - - //the most of any one reducer we want to try to take up with a single key - private val maxReducerFraction = 0.1 - - private def flatMapWithReplicas[W](pipe: TypedPipe[(K,W)])(fn: Int => Iterable[Int]) = - pipe.cross(left.sketch).flatMap{case (v,cms) => - val maxPerReducer = (cms.totalCount / numReducers) * maxReducerFraction + 1 - val maxReplicas = (cms.frequency(left.hash(v._1)).estimate.toDouble / maxPerReducer) - - //if the frequency is 0, maxReplicas.ceil will be 0 so we will filter out this key entirely - //if it's < maxPerReducer, the ceil will round maxReplicas up to 1 to ensure we still see it - val replicas = fn(maxReplicas.ceil.toInt.min(numReducers)) - replicas.map{i => (i,v._1) -> v._2} - } - - lazy val toTypedPipe : TypedPipe[(K, R)] = { - lazy val rand = new scala.util.Random(left.seed) - val lhs = flatMapWithReplicas(left.pipe){n => Some(rand.nextInt(n) + 1)} - val rhs = flatMapWithReplicas(right){n => 1.to(n)} - - lhs - .group - .cogroup(rhs.group){(k,itv,itu) => itv.flatMap{v => joiner(k._2,v,itu)}} - .withReducers(numReducers) - .map{case ((r,k),v) => (k,v)} - } -} - -object SketchJoined { - implicit def toTypedPipe[K,V, V2, R] - (joined: SketchJoined[K, V, V2, R]): TypedPipe[(K, R)] = joined.toTypedPipe -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala index e0ff1f6f70..422b4696b2 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TDsl.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed import java.io.Serializable @@ -21,29 +21,38 @@ import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions + import com.twitter.scalding._ -/** implicits for the type-safe DSL - * import TDsl._ to get the implicit conversions from Grouping/CoGrouping to Pipe, - * to get the .toTypedPipe method on standard cascading Pipes. - * to get automatic conversion of Mappable[T] to TypedPipe[T] +/** + * implicits for the type-safe DSL import TDsl._ to get the implicit conversions from Grouping/CoGrouping to + * Pipe, to get the .toTypedPipe method on standard cascading Pipes. to get automatic conversion of + * Mappable[T] to TypedPipe[T] */ -object TDsl extends Serializable with GeneratedTupleAdders { - implicit def pipeTExtensions(pipe : Pipe) : PipeTExtensions = new PipeTExtensions(pipe) +object TDsl extends Serializable with GeneratedTupleAdders with CascadingExtensions { + implicit def pipeTExtensions(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): PipeTExtensions = + new PipeTExtensions(pipe, flowDef, mode) + + implicit def mappableToTypedPipe[T](src: Mappable[T]): TypedPipe[T] = + TypedPipe.from(src) - implicit def mappableToTypedPipe[T](src: Mappable[T]) - (implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = - TypedPipe.from(src)(flowDef, mode) + implicit def sourceToTypedPipe[T](src: TypedSource[T]): TypedPipe[T] = + TypedPipe.from(src) - implicit def sourceToTypedPipe[T](src: TypedSource[T]) - (implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = - TypedPipe.from(src)(flowDef, mode) + implicit def mappableToTypedPipeKeyed[K, V](src: Mappable[(K, V)]): TypedPipe.Keyed[K, V] = + new TypedPipe.Keyed(TypedPipe.from(src)) + + implicit def sourceToTypedPipeKeyed[K, V](src: TypedSource[(K, V)]): TypedPipe.Keyed[K, V] = + new 
TypedPipe.Keyed(TypedPipe.from(src)) } /* * This is an Enrichment pattern of adding methods to Pipe relevant to TypedPipe */ -class PipeTExtensions(pipe : Pipe) extends Serializable { +class PipeTExtensions(pipe: Pipe, flowDef: FlowDef, mode: Mode) extends Serializable { + import CascadingExtensions._ + /* Give you a syntax (you must put the full type on the TypedPipe, else type inference fails * pipe.typed(('in0, 'in1) -> 'out) { tpipe : TypedPipe[(Int,Int)] => * // let's group all: @@ -54,14 +63,15 @@ class PipeTExtensions(pipe : Pipe) extends Serializable { * } * The above sums all the tuples and returns a TypedPipe[Int] which has the total sum. */ - def typed[T,U](fielddef : (Fields, Fields))(fn : TypedPipe[T] => TypedPipe[U]) - (implicit conv : TupleConverter[T], setter : TupleSetter[U]) : Pipe = { - fn(TypedPipe.from(pipe, fielddef._1)(conv)).toPipe(fielddef._2)(setter) - } - def toTypedPipe[T](fields : Fields)(implicit conv : TupleConverter[T]) : TypedPipe[T] = { - TypedPipe.from[T](pipe, fields)(conv) - } - def packToTypedPipe[T](fields : Fields)(implicit tp : TuplePacker[T]) : TypedPipe[T] = { + def typed[T, U](fielddef: (Fields, Fields))( + fn: TypedPipe[T] => TypedPipe[U] + )(implicit conv: TupleConverter[T], setter: TupleSetter[U]): Pipe = + fn(TypedPipe.fromPipe(pipe, fielddef._1)(flowDef, mode, conv)).toPipe(fielddef._2)(flowDef, mode, setter) + + def toTypedPipe[T](fields: Fields)(implicit conv: TupleConverter[T]): TypedPipe[T] = + TypedPipe.fromPipe[T](pipe, fields)(flowDef, mode, conv) + + def packToTypedPipe[T](fields: Fields)(implicit tp: TuplePacker[T]): TypedPipe[T] = { val conv = tp.newConverter(fields) toTypedPipe(fields)(conv) } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala new file mode 100644 index 0000000000..1a8bd50bd0 --- /dev/null +++ 
b/scalding-core/src/main/scala/com/twitter/scalding/typed/TemplatePartition.scala @@ -0,0 +1,60 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.twitter.scalding +package typed + +import scala.collection.JavaConverters._ + +import cascading.tap.partition.Partition +import cascading.tuple.{Fields, TupleEntry} + +/** + * Creates a partition using the given template string. + * + * The template string needs to have %s as placeholder for a given field. + */ +case class TemplatePartition(partitionFields: Fields, template: String) extends Partition { + assert( + partitionFields.size == "%s".r.findAllIn(template).length, + "Number of partition fields %s does not correspond to template (%s)".format(partitionFields, template) + ) + + /** Regex pattern created from the template to extract the partition values from a path. */ + lazy val pattern = template.replaceAll("%s", "(.*)").r.pattern + + /** Returns the path depth. In this case the number of partition fields. */ + override def getPathDepth(): Int = partitionFields.size + + /** Returns the partition fields. */ + override def getPartitionFields(): Fields = partitionFields + + /** + * Converts the given partition string to field values and populates the supplied tuple entry with it. 
+ */ + override def toTuple(partition: String, tupleEntry: TupleEntry): Unit = { + val m = pattern.matcher(partition) + m.matches + val parts: Array[Object] = (1 to partitionFields.size).map(i => m.group(i)).toArray + tupleEntry.setCanonicalValues(parts) + } + + /** + * Given the specified tuple entry fill in the supplied template entry to create the partition path. + */ + override def toPartition(tupleEntry: TupleEntry): String = { + val fields = tupleEntry.asIterableOf(classOf[String]).asScala.toList + template.format(fields: _*) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala deleted file mode 100644 index 98ddb1b03c..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedPipe.scala +++ /dev/null @@ -1,588 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding.typed - -import java.io.Serializable - -import com.twitter.algebird.{Semigroup, MapAlgebra, Monoid, Ring, Aggregator} - -import com.twitter.scalding.TupleConverter.{singleConverter, tuple2Converter, CTupleConverter, TupleEntryConverter} -import com.twitter.scalding.TupleSetter.{singleSetter, tup2Setter} - -import com.twitter.scalding._ - -import cascading.flow.FlowDef -import cascading.pipe.Pipe -import cascading.tuple.{Fields, Tuple => CTuple, TupleEntry} -import util.Random - -/** - * factory methods for TypedPipe, which is the typed representation of distributed lists in scalding. - * This object is here rather than in the typed package because a lot of code was written using - * the functions in the object, which we do not see how to hide with package object tricks. - */ -object TypedPipe extends Serializable { - def from[T](pipe: Pipe, fields: Fields)(implicit conv: TupleConverter[T]): TypedPipe[T] = - TypedPipeInst[T](pipe, fields, Converter(conv)) - - def from[T](mappable: TypedSource[T])(implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = - TypedPipeInst[T](mappable.read, mappable.sourceFields, Converter(mappable.converter)) - - // It might pay to use a view here, but you should experiment - def from[T](iter: Iterable[T])(implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = - IterablePipe[T](iter, flowDef, mode) - - /** Input must be a Pipe with exactly one Field */ - def fromSingleField[T](pipe: Pipe): TypedPipe[T] = - TypedPipeInst[T](pipe, new Fields(0), Converter(singleConverter[T])) - - def empty(implicit flowDef: FlowDef, mode: Mode): TypedPipe[Nothing] = - EmptyTypedPipe(flowDef, mode) -} - -/** Think of a TypedPipe as a distributed unordered list that may or may not yet - * have been materialized in memory or disk. 
- * - * Represents a phase in a distributed computation on an input data source - * Wraps a cascading Pipe object, and holds the transformation done up until that point - */ -trait TypedPipe[+T] extends Serializable { - - // Implements a cross product. The right side should be tiny - def cross[U](tiny: TypedPipe[U]): TypedPipe[(T,U)] - - def flatMap[U](f: T => TraversableOnce[U]): TypedPipe[U] - - /** If you are going to create two branches or forks, - * it may be more efficient to call this method first - * which will create a node in the cascading graph. - * Without this, both full branches of the fork will be - * put into separate cascading pipes, which can, in some cases, - * be slower. - * - * Ideally the planner would see this - */ - def fork: TypedPipe[T] - - /** limit the output to at most count items. - * useful for debugging, but probably that's about it. - * The number may be less than count, and not sampled particular method - */ - def limit(count: Int): TypedPipe[T] - - /** This does a sum of values WITHOUT triggering a shuffle. - * the contract is, if followed by a group.sum the result is the same - * with or without this present, and it never increases the number of - * items. BUT due to the cost of caching, it might not be faster if - * there is poor key locality. - * - * It is only useful for expert tuning, - * and best avoided unless you are struggling with performance problems. - * If you are not sure you need this, you probably don't. - * - * The main use case is to reduce the values down before a key expansion - * such as is often done in a data cube. - */ - def sumByLocalKeys[K,V](implicit ev : T <:< (K,V), sg: Semigroup[V]): TypedPipe[(K,V)] - - def sample(percent : Double): TypedPipe[T] - def sample(percent : Double, seed : Long): TypedPipe[T] - - /** Export back to a raw cascading Pipe. useful for interop with the scalding - * Fields API or with Cascading code. 
- */ - def toPipe[U >: T](fieldNames: Fields)(implicit setter: TupleSetter[U]): Pipe - -///////////////////////////////////////////// -// -// The following have default implementations in terms of the above -// -///////////////////////////////////////////// - - import Dsl._ - - def ++[U >: T](other: TypedPipe[U]): TypedPipe[U] = other match { - case EmptyTypedPipe(_,_) => this - case IterablePipe(thatIter,_,_) if thatIter.isEmpty => this - case _ => MergedTypedPipe(this, other) - } - - /** Same as groupAll.aggregate.values - */ - def aggregate[B,C](agg: Aggregator[T, B, C]): ValuePipe[C] = - ComputedValue(groupAll.aggregate(agg).values) - - def collect[U](fn: PartialFunction[T, U]): TypedPipe[U] = - filter(fn.isDefinedAt(_)).map(fn(_)) - - def cross[V](p: ValuePipe[V]) : TypedPipe[(T, V)] = - p match { - case e@EmptyValue() => e.toTypedPipe - case LiteralValue(v) => map { (_, v) } - case ComputedValue(pipe) => cross(pipe) - } - - // prints the current pipe to stdout - def debug: TypedPipe[T] = map { t => println(t); t } - - /** - * Returns the set of distinct elements in the TypedPipe - */ - @annotation.implicitNotFound(msg = "For distinct method to work, the type in TypedPipe must have an Ordering.") - def distinct(implicit ord: Ordering[_ >: T]): TypedPipe[T] = { - // cast because Ordering is not contravariant, but should be (and this cast is safe) - implicit val ordT: Ordering[T] = ord.asInstanceOf[Ordering[T]] - map{ (_, ()) }.group.sum.keys - } - - def either[R](that: TypedPipe[R]): TypedPipe[Either[T, R]] = - map(Left(_)) ++ (that.map(Right(_))) - - /** Sometimes useful for implementing custom joins with groupBy + mapValueStream when you know - * that the value/key can fit in memory. Beware. 
- */ - def eitherValues[K,V,R](that: TypedPipe[(K, R)])(implicit ev: T <:< (K,V)): TypedPipe[(K, Either[V, R])] = - mapValues { (v: V) => Left(v) } ++ (that.mapValues { (r: R) => Right(r) }) - - def map[U](f: T => U): TypedPipe[U] = flatMap { t => Iterator(f(t)) } - - def mapValues[K, V, U](f : V => U)(implicit ev: T <:< (K, V)): TypedPipe[(K, U)] = - map { t: T => - val (k, v) = t.asInstanceOf[(K, V)] //No need to capture ev and deal with serialization - (k, f(v)) - } - - /** Keep only items that satisfy this predicate - */ - def filter(f: T => Boolean): TypedPipe[T] = - flatMap { Iterator(_).filter(f) } - - /** If T is a (K, V) for some V, then we can use this function to filter. - * This is here to match the function in KeyedListLike, where it is optimized - */ - def filterKeys[K](fn: K => Boolean)(implicit ev: T <:< (K, Any)): TypedPipe[T] = - filter { t => fn(t.asInstanceOf[(K, Any)]._1) } - - /** Keep only items that don't satisfy the predicate. - * `filterNot` is the same as `filter` with a negated predicate. - */ - def filterNot(f: T => Boolean): TypedPipe[T] = - filter(!f(_)) - - /** flatten an Iterable */ - def flatten[U](implicit ev: T <:< TraversableOnce[U]): TypedPipe[U] = - flatMap { _.asInstanceOf[TraversableOnce[U]] } // don't use ev which may not be serializable - - /** Force a materialization of this pipe prior to the next operation. - * This is useful if you filter almost everything before a hashJoin, for instance. - */ - def forceToDisk: TypedPipe[T] = - TypedPipe.fromSingleField(fork.toPipe(0).forceToDisk) - - def group[K,V](implicit ev : <:<[T,(K,V)], ord : Ordering[K]): Grouped[K,V] = - //If the type of T is not (K,V), then at compile time, this will fail. It uses implicits to do - //a compile time check that one type is equivalent to another. If T is not (K,V), we can't - //automatically group. We cast because it is safe to do so, and we need to convert to K,V, but - //the ev is not needed for the cast. 
In fact, you can do the cast with ev(t) and it will return - //it as (K,V), but the problem is, ev is not serializable. So we do the cast, which due to ev - //being present, will always pass. - Grouped(this.asInstanceOf[TypedPipe[(K, V)]]) - - def groupAll: Grouped[Unit,T] = groupBy(x => ()).withReducers(1) - - def groupBy[K](g: T => K)(implicit ord: Ordering[K]): Grouped[K,T] = - map { t => (g(t), t) }.group - - /** Forces a shuffle by randomly assigning each item into one - * of the partitions. - * - * This is for the case where you mappers take a long time, and - * it is faster to shuffle them to more reducers and then operate. - * - * You probably want shard if you are just forcing a shuffle. - */ - def groupRandomly(partitions: Int): Grouped[Int, T] = { - // Make it lazy so all mappers get their own: - lazy val rng = new java.util.Random(123) // seed this so it is repeatable - groupBy { _ => rng.nextInt(partitions) } - .withReducers(partitions) - } - - /** Used to force a shuffle into a given size of nodes. - * Only use this if your mappers are taking far longer than - * the time to shuffle. - */ - def shard(partitions: Int): TypedPipe[T] = - groupRandomly(partitions).forceToReducers.values - - /** Reasonably common shortcut for cases of associative/commutative reduction - * returns a typed pipe with only one element. - */ - def sum[U >: T](implicit plus: Semigroup[U]): ValuePipe[U] = ComputedValue(groupAll.sum[U].values) - - /** Reasonably common shortcut for cases of associative/commutative reduction by Key - */ - def sumByKey[K,V](implicit ev: T<:<(K,V), ord: Ordering[K], plus: Semigroup[V]): TypedPipe[(K, V)] = - group[K, V].sum[V] - - def unpackToPipe[U >: T](fieldNames: Fields)(implicit up: TupleUnpacker[U]): Pipe = { - val setter = up.newSetter(fieldNames) - toPipe[U](fieldNames)(setter) - } - - /** Safely write to a TypedSink[T]. 
If you want to write to a Source (not a Sink) - * you need to do something like: toPipe(fieldNames).write(dest) - * @return a pipe equivalent to the current pipe. - */ - def write(dest: TypedSink[T]) - (implicit flowDef : FlowDef, mode : Mode): TypedPipe[T] = { - // Make sure that we don't render the whole pipeline twice: - val res = fork - dest.writeFrom(res.toPipe[T](dest.sinkFields)(dest.setter)) - res - } - - def keys[K](implicit ev : <:<[T,(K,_)]) : TypedPipe[K] = - // avoid capturing ev in the closure: - map { t => t.asInstanceOf[(K, _)]._1 } - - // swap the keys with the values - def swap[K,V](implicit ev: <:<[T,(K,V)]) : TypedPipe[(V,K)] = map { tup => - tup.asInstanceOf[(K,V)].swap - } - - def values[V](implicit ev : <:<[T,(_,V)]) : TypedPipe[V] = - // avoid capturing ev in the closure: - map { t => t.asInstanceOf[(_, V)]._2 } - - def leftCross[V](p: ValuePipe[V]): TypedPipe[(T, Option[V])] = - p match { - case EmptyValue() => map { (_, None) } - case LiteralValue(v) => map { (_, Some(v)) } - case ComputedValue(pipe) => leftCross(pipe) - } - - def leftCross[V](thatPipe: TypedPipe[V]): TypedPipe[(T, Option[V])] = - map(((), _)).hashLeftJoin(thatPipe.groupAll).values - - def mapWithValue[U, V](value: ValuePipe[U])(f: (T, Option[U]) => V) : TypedPipe[V] = - leftCross(value).map(t => f(t._1, t._2)) - - def flatMapWithValue[U, V](value: ValuePipe[U])(f: (T, Option[U]) => TraversableOnce[V]) : TypedPipe[V] = - leftCross(value).flatMap(t => f(t._1, t._2)) - - def filterWithValue[U](value: ValuePipe[U])(f: (T, Option[U]) => Boolean) : TypedPipe[T] = - leftCross(value).filter(t => f(t._1, t._2)).map(_._1) - - /** - * These operations look like joins, but they do not force any communication - * of the current TypedPipe. They are mapping operations where this pipe is streamed - * through one item at a time. - * - * WARNING These behave semantically very differently than cogroup. - * This is because we handle (K,V) tuples on the left as we see them. 
- * The iterable on the right is over all elements with a matching key K, and it may be empty - * if there are no values for this key K. - */ - def hashCogroup[K,V,W,R](smaller: HashJoinable[K,W]) - (joiner: (K, V, Iterable[W]) => Iterator[R]) - (implicit ev: TypedPipe[T] <:< TypedPipe[(K,V)]): TypedPipe[(K,R)] = - smaller.hashCogroupOn(ev(this))(joiner) - - def hashJoin[K,V,W](smaller: HashJoinable[K,W]) - (implicit ev: TypedPipe[T] <:< TypedPipe[(K,V)]): TypedPipe[(K,(V,W))] = - hashCogroup[K,V,W,(V,W)](smaller)(Joiner.hashInner2) - - def hashLeftJoin[K,V,W](smaller: HashJoinable[K,W]) - (implicit ev: TypedPipe[T] <:< TypedPipe[(K,V)]): TypedPipe[(K,(V,Option[W]))] = - hashCogroup[K,V,W,(V,Option[W])](smaller)(Joiner.hashLeft2) - - /** For each element, do a map-side (hash) left join to look up a value - */ - def hashLookup[K>:T,V](grouped: HashJoinable[K, V]): TypedPipe[(K, Option[V])] = - map((_, ())) - .hashLeftJoin(grouped) - .map { case (t, (_, optV)) => (t, optV) } - - def sketch[K,V] - (reducers: Int, - eps: Double = 1.0E-5, //272k width = 1MB per row - delta: Double = 0.01, //5 rows (= 5 hashes) - seed: Int = 12345) - (implicit ev: TypedPipe[T] <:< TypedPipe[(K,V)], - serialization: K => Array[Byte], - ordering: Ordering[K]): Sketched[K,V] = - Sketched(ev(this), reducers, delta, eps, seed) -} - - -final case class EmptyTypedPipe(@transient fd: FlowDef, @transient mode: Mode) extends TypedPipe[Nothing] { - import Dsl._ - - override def aggregate[B, C](agg: Aggregator[Nothing, B, C]): ValuePipe[C] = - EmptyValue()(fd, mode) - - // Cross product with empty is always empty. 
- override def cross[U](tiny : TypedPipe[U]): TypedPipe[(Nothing,U)] = - EmptyTypedPipe(fd, mode) - - override def distinct(implicit ord: Ordering[_ >: Nothing]) = - this - - override def flatMap[U](f: Nothing => TraversableOnce[U]) = - EmptyTypedPipe(fd, mode) - - override def fork: TypedPipe[Nothing] = this - - override def leftCross[V](p: ValuePipe[V]) = - EmptyTypedPipe(fd, mode) - - /** limit the output to at most count items. - * useful for debugging, but probably that's about it. - * The number may be less than count, and not sampled particular method - */ - override def limit(count: Int) = this - - override def sample(percent: Double) = this - override def sample(percent: Double, seed: Long) = this - - // prints the current pipe to either stdout or stderr - override def debug: TypedPipe[Nothing] = this - - override def ++[U >: Nothing](other : TypedPipe[U]): TypedPipe[U] = other - - override def toPipe[U >: Nothing](fieldNames: Fields)(implicit setter: TupleSetter[U]): Pipe = - IterableSource(Iterable.empty, fieldNames)(setter, singleConverter[U]).read(fd, mode) - - override def sum[U >: Nothing](implicit plus: Semigroup[U]): ValuePipe[U] = - EmptyValue()(fd, mode) - - override def sumByLocalKeys[K,V](implicit ev : Nothing <:< (K,V), sg: Semigroup[V]) = - EmptyTypedPipe(fd, mode) - - override def hashCogroup[K,V,W,R](smaller: HashJoinable[K,W]) - (joiner: (K, V, Iterable[W]) => Iterator[R]) - (implicit ev: TypedPipe[Nothing] <:< TypedPipe[(K,V)]): TypedPipe[(K,R)] = - EmptyTypedPipe(fd, mode) -} - -/** You should use a view here - * If you avoid toPipe, this class is more efficient than IterableSource. 
- */ -final case class IterablePipe[T](iterable: Iterable[T], - @transient fd: FlowDef, - @transient mode: Mode) extends TypedPipe[T] { - - override def aggregate[B, C](agg: Aggregator[T, B, C]): ValuePipe[C] = - Some(iterable) - .filterNot(_.isEmpty) - .map(it => LiteralValue(agg(it))(fd, mode)) - .getOrElse(EmptyValue()(fd, mode)) - - override def ++[U >: T](other: TypedPipe[U]): TypedPipe[U] = other match { - case IterablePipe(thatIter,_,_) => IterablePipe(iterable ++ thatIter, fd, mode) - case EmptyTypedPipe(_,_) => this - case _ if iterable.isEmpty => other - case _ => MergedTypedPipe(this, other) - } - - // Implements a cross product. - override def cross[U](tiny : TypedPipe[U]) = - tiny.flatMap { u => iterable.map { (_, u) } } - - override def filter(f: T => Boolean): TypedPipe[T] = - IterablePipe(iterable.filter(f), fd, mode) - - override def flatMap[U](f: T => TraversableOnce[U]) = - IterablePipe(iterable.flatMap(f), fd, mode) - - override def fork: TypedPipe[T] = this - - override def limit(count: Int): TypedPipe[T] = IterablePipe(iterable.take(count), fd, mode) - - private def defaultSeed: Long = System.identityHashCode(this) * 2654435761L ^ System.currentTimeMillis - def sample(percent: Double): TypedPipe[T] = sample(percent, defaultSeed) - def sample(percent: Double, seed: Long): TypedPipe[T] = { - val rand = new Random(seed) - IterablePipe(iterable.filter(_ => rand.nextDouble < percent), fd, mode) - } - - override def map[U](f: T => U): TypedPipe[U] = - IterablePipe(iterable.map(f), fd, mode) - - override def toPipe[U >: T](fieldNames: Fields)(implicit setter: TupleSetter[U]): Pipe = - IterableSource[U](iterable, fieldNames)(setter, singleConverter[U]).read(fd, mode) - - override def sum[U >: T](implicit plus: Semigroup[U]): ValuePipe[U] = - Semigroup.sumOption[U](iterable).map(LiteralValue(_)(fd, mode)) - .getOrElse(EmptyValue()(fd, mode)) - - override def sumByLocalKeys[K,V](implicit ev: T <:< (K,V), sg: Semigroup[V]) = - 
IterablePipe(MapAlgebra.sumByKey(iterable.map(ev(_))), fd, mode) -} - -/** This is an instance of a TypedPipe that wraps a cascading Pipe - */ -final case class TypedPipeInst[T](@transient inpipe: Pipe, - fields: Fields, - flatMapFn: FlatMapFn[T]) extends TypedPipe[T] { - - import Dsl._ - - // The output pipe has a single item CTuple with an object of type T in position 0 - @transient protected lazy val pipe: Pipe = toPipe(0)(singleSetter[T]) - - // Implements a cross product. The right side should be tiny (< 100MB) - override def cross[U](tiny: TypedPipe[U]): TypedPipe[(T,U)] = tiny match { - case EmptyTypedPipe(fd, m) => EmptyTypedPipe(fd, m) - case tpi@TypedPipeInst(_,_,_) => - map(((), _)).hashJoin(tiny.groupAll).values - case MergedTypedPipe(l, r) => - MergedTypedPipe(cross(l), cross(r)) - case IterablePipe(iter, _, _) => - flatMap { t => iter.map { (t, _) } } - } - - // prints the current pipe to either stdout or stderr - override def debug: TypedPipe[T] = - TypedPipe.fromSingleField(pipe.debug) - - override def filter(f: T => Boolean): TypedPipe[T] = - TypedPipeInst[T](inpipe, fields, flatMapFn.filter(f)) - - override def flatMap[U](f: T => TraversableOnce[U]): TypedPipe[U] = - TypedPipeInst[U](inpipe, fields, flatMapFn.flatMap(f)) - - /** If you are going to create two branches or forks, - * it may be more efficient to call this method first - * which will create a node in the cascading graph. - * Without this, both full branches of the fork will be - * put into separate cascading. - * - * Ideally the planner would see this - */ - override def fork: TypedPipe[T] = - TypedPipe.fromSingleField(pipe) - - /** Force a materialization of this pipe prior to the next operation. - * This is useful if you filter almost everything before a hashJoin, for instance. - */ - override lazy val forceToDisk: TypedPipe[T] = - TypedPipe.fromSingleField(pipe.forceToDisk) - - /** limit the output to at most count items. - * useful for debugging, but probably that's about it. 
- * The number may be less than count, and not sampled particular method - */ - override def limit(count: Int): TypedPipe[T] = - TypedPipe.fromSingleField(pipe.limit(count)) - - override def sample(percent: Double): TypedPipe[T] = TypedPipe.fromSingleField(pipe.sample(percent)) - override def sample(percent: Double, seed: Long): TypedPipe[T] = TypedPipe.fromSingleField(pipe.sample(percent, seed)) - - override def map[U](f: T => U): TypedPipe[U] = - TypedPipeInst[U](inpipe, fields, flatMapFn.map(f)) - - override def sumByLocalKeys[K,V](implicit ev : T <:< (K,V), sg: Semigroup[V]): TypedPipe[(K,V)] = { - val fields = ('key, 'value) - val msr = new MapsideReduce(sg, 'key, 'value, None)(singleConverter[V], singleSetter[V]) - TypedPipe.from[(K,V)]( - map(_.asInstanceOf[(K,V)]) - .toPipe[(K, V)](fields).eachTo(fields -> fields) { _ => msr }, - fields) - } - /** This actually runs all the pure map functions in one Cascading Each - * This approach is more efficient than untyped scalding because we - * don't use TupleConverters/Setters after each map. - */ - override def toPipe[U >: T](fieldNames: Fields)(implicit setter: TupleSetter[U]): Pipe = - inpipe.flatMapTo[TupleEntry, U](fields -> fieldNames)(flatMapFn) -} - -final case class MergedTypedPipe[T](left: TypedPipe[T], right: TypedPipe[T]) extends TypedPipe[T] { - import Dsl._ - - // Implements a cross project. 
The right side should be tiny - def cross[U](tiny : TypedPipe[U]): TypedPipe[(T,U)] = tiny match { - case EmptyTypedPipe(fd, m) => EmptyTypedPipe(fd, m) - case _ => MergedTypedPipe(left.cross(tiny), right.cross(tiny)) - } - - // prints the current pipe to either stdout or stderr - override def debug: TypedPipe[T] = - MergedTypedPipe(left.debug, right.debug) - - override def filter(f: T => Boolean): TypedPipe[T] = - MergedTypedPipe(left.filter(f), right.filter(f)) - - def flatMap[U](f: T => TraversableOnce[U]): TypedPipe[U] = - MergedTypedPipe(left.flatMap(f), right.flatMap(f)) - - def limit(count: Int): TypedPipe[T] = - TypedPipe.fromSingleField(fork.toPipe(0).limit(count)) - - def sample(percent: Double): TypedPipe[T] = - MergedTypedPipe(left.sample(percent), right.sample(percent)) - - def sample(percent: Double, seed: Long): TypedPipe[T] = - MergedTypedPipe(left.sample(percent, seed), right.sample(percent, seed)) - - override def sumByLocalKeys[K,V](implicit ev : T <:< (K,V), sg: Semigroup[V]): - TypedPipe[(K, V)] = - MergedTypedPipe(left.sumByLocalKeys, right.sumByLocalKeys) - - override def map[U](f: T => U): TypedPipe[U] = - MergedTypedPipe(left.map(f), right.map(f)) - - override def fork: TypedPipe[T] = - MergedTypedPipe(left.fork, right.fork) - - override def toPipe[U >: T](fieldNames: Fields)(implicit setter: TupleSetter[U]): Pipe = { - if(left == right) { - //use map: - left.flatMap {t => List(t, t)}.toPipe[U](fieldNames) - } - else { - import RichPipe.assignName - new cascading.pipe.Merge(assignName(left.toPipe[U](fieldNames)), - assignName(right.toPipe[U](fieldNames))) - } - } - - override def hashCogroup[K,V,W,R](smaller: HashJoinable[K,W]) - (joiner: (K, V, Iterable[W]) => Iterator[R]) - (implicit ev: TypedPipe[T] <:< TypedPipe[(K,V)]): TypedPipe[(K,R)] = - MergedTypedPipe(left.hashCogroup(smaller)(joiner), right.hashCogroup(smaller)(joiner)) -} - -class TuplePipeJoinEnrichment[K, V](pipe: TypedPipe[(K, V)])(implicit ord: Ordering[K]) { - def 
join[W](smaller : TypedPipe[(K, W)], reducers: Int = -1) : CoGrouped[K, (V, W)] = pipe.group.withReducers(reducers).join(smaller.group) - def leftJoin[W](smaller : TypedPipe[(K, W)], reducers: Int = -1) : CoGrouped[K, (V, Option[W])] = pipe.group.withReducers(reducers).leftJoin(smaller.group) - def rightJoin[W](smaller : TypedPipe[(K, W)], reducers: Int = -1) : CoGrouped[K, (Option[V], W)] = pipe.group.withReducers(reducers).rightJoin(smaller.group) - def outerJoin[W](smaller : TypedPipe[(K, W)], reducers: Int = -1) : CoGrouped[K, (Option[V], Option[W])] = pipe.group.withReducers(reducers).outerJoin(smaller.group) -} - -class MappablePipeJoinEnrichment[T](pipe: TypedPipe[T]) { - def joinBy[K, U](smaller : TypedPipe[U])(g : (T => K), h : (U => K), reducers: Int = -1)(implicit ord: Ordering[K]) : CoGrouped[K, (T, U)] = pipe.groupBy(g).withReducers(reducers).join(smaller.groupBy(h)) - def leftJoinBy[K, U](smaller : TypedPipe[U])(g : (T => K), h : (U => K), reducers: Int = -1)(implicit ord: Ordering[K]) : CoGrouped[K, (T, Option[U])] = pipe.groupBy(g).withReducers(reducers).leftJoin(smaller.groupBy(h)) - def rightJoinBy[K, U](smaller : TypedPipe[U])(g : (T => K), h : (U => K), reducers: Int = -1)(implicit ord: Ordering[K]) : CoGrouped[K, (Option[T], U)] = pipe.groupBy(g).withReducers(reducers).rightJoin(smaller.groupBy(h)) - def outerJoinBy[K, U](smaller : TypedPipe[U])(g : (T => K), h : (U => K), reducers: Int = -1)(implicit ord: Ordering[K]) : CoGrouped[K, (Option[T], Option[U])] = pipe.groupBy(g).withReducers(reducers).outerJoin(smaller.groupBy(h)) -} - -object Syntax { - implicit def joinOnTuplePipe[K, V](p: TypedPipe[(K, V)])(implicit ord: Ordering[K]) : TuplePipeJoinEnrichment[K, V] = new TuplePipeJoinEnrichment(p) - implicit def joinOnMappablePipe[T](p: TypedPipe[T]) : MappablePipeJoinEnrichment[T] = new MappablePipeJoinEnrichment(p) -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala 
b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala index 0b5d1f9d2e..876025db75 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSink.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed @@ -23,28 +23,43 @@ import cascading.pipe.Pipe import cascading.tuple.Fields object TypedSink extends java.io.Serializable { - /** Build a TypedSink by declaring a concrete type for the Source - * Here because of the late addition of TypedSink to scalding to make it - * easier to port segacy code + + /** + * Build a TypedSink by declaring a concrete type for the Source Here because of the late addition of + * TypedSink to scalding to make it easier to port legacy code */ def apply[T](s: Source)(implicit tset: TupleSetter[T]): TypedSink[T] = new TypedSink[T] { - def setter[U <:T] = TupleSetter.asSubSetter[T, U](tset) - def writeFrom(pipe : Pipe)(implicit flowDef : FlowDef, mode : Mode): Pipe = + def setter[U <: T] = TupleSetter.asSubSetter[T, U](tset) + def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = s.writeFrom(pipe) } } -/** Opposite of TypedSource, used for writing into +/** + * Opposite of TypedSource, used for writing into */ -trait TypedSink[-T] extends java.io.Serializable { +trait TypedSink[-T] extends Output[T] { def setter[U <: T]: TupleSetter[U] // These are the fields the write function is expecting - def sinkFields : Fields = Dsl.intFields(0 until setter.arity) + def sinkFields: Fields = Dsl.intFields(0 until setter.arity) - /** pipe is assumed to have the schema above, otherwise an error may occur - * The exact same pipe is returned to match the legacy Source API. 
+ /** + * pipe is assumed to have the schema above, otherwise an error may occur The exact same pipe is returned to + * match the legacy Source API. */ - def writeFrom(pipe : Pipe)(implicit flowDef : FlowDef, mode : Mode): Pipe -} + def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe + /** + * Transform this sink into another type by applying a function first + */ + def contraMap[U](fn: U => T): TypedSink[U] = { + val self = this // compiler generated self can cause problems with serialization + new TypedSink[U] { + override def sinkFields = self.sinkFields + def setter[V <: U]: TupleSetter[V] = self.setter.contraMap(fn) + def writeFrom(pipe: Pipe)(implicit fd: FlowDef, mode: Mode): Pipe = self.writeFrom(pipe) + override def contraMap[U1](fn2: U1 => U) = self.contraMap(fn2.andThen(fn)) + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala index 3d5b7117dd..1e3ac198c7 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/TypedSource.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.typed @@ -22,17 +22,30 @@ import cascading.flow.FlowDef import cascading.pipe.Pipe import cascading.tuple.Fields -trait TypedSource[+T] extends java.io.Serializable { +trait TypedSource[+T] extends Input[T] { + /** - * Because TupleConverter cannot be covariant, we need to jump through this hoop. - * A typical implementation might be: - * (implicit conv: TupleConverter[T]) - * and then: + * Because TupleConverter cannot be covariant, we need to jump through this hoop. 
A typical implementation + * might be: (implicit conv: TupleConverter[T]) and then: * * override def converter[U >: T] = TupleConverter.asSuperConverter[T, U](conv) */ def converter[U >: T]: TupleConverter[U] def read(implicit flowDef: FlowDef, mode: Mode): Pipe // These are the default column number YOU MAY NEED TO OVERRIDE! - def sourceFields : Fields = Dsl.intFields(0 until converter.arity) + def sourceFields: Fields = Dsl.intFields(0 until converter.arity) + + /** + * Transform this TypedSource into another by mapping after. We don't call this map because of conflicts + * with Mappable, unfortunately + */ + def andThen[U](fn: T => U): TypedSource[U] = { + val self = this // compiler generated self can cause problems with serialization + new TypedSource[U] { + override def sourceFields = self.sourceFields + def converter[V >: U]: TupleConverter[V] = self.converter.andThen(fn) + def read(implicit fd: FlowDef, mode: Mode): Pipe = self.read + override def andThen[U1](fn2: U => U1) = self.andThen(fn.andThen(fn2)) + } + } } diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala deleted file mode 100644 index 00fb0ee248..0000000000 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/ValuePipe.scala +++ /dev/null @@ -1,79 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding.typed - -import com.twitter.algebird._ -import com.twitter.scalding.{Mode, IterableSource} -import cascading.flow.FlowDef - - -object ValuePipe extends java.io.Serializable { - implicit def toTypedPipe[V](v: ValuePipe[V]): TypedPipe[V] = v.toTypedPipe - - def fold[T, U, V](l: ValuePipe[T], r: ValuePipe[U])(f: (T, U) => V): ValuePipe[V] = - l.leftCross(r).collect { case (t, Some(u)) => f(t,u) } -} - -/** ValuePipe is special case of a TypedPipe of just a optional single element. - * It is like a distribute Option type - * It allows to perform scalar based operations on pipes like normalization. - */ -sealed trait ValuePipe[+T] extends java.io.Serializable { - def leftCross[U](that: ValuePipe[U]): ValuePipe[(T, Option[U])] = that match { - case EmptyValue() => map((_, None)) - case LiteralValue(v2) => map((_, Some(v2))) - // We don't know if a computed value is empty or not. We need to run the MR job: - case _ => ComputedValue(toTypedPipe.leftCross(that)) - } - def collect[U](fn: PartialFunction[T, U]): ValuePipe[U] = - filter(fn.isDefinedAt(_)).map(fn(_)) - - def map[U](fn: T => U): ValuePipe[U] - def filter(fn: T => Boolean): ValuePipe[T] - def toTypedPipe: TypedPipe[T] - - def debug: ValuePipe[T] -} -case class EmptyValue(implicit val flowDef: FlowDef, mode: Mode) extends ValuePipe[Nothing] { - override def leftCross[U](that: ValuePipe[U]) = EmptyValue() - override def map[U](fn: Nothing => U): ValuePipe[U] = EmptyValue() - override def filter(fn: Nothing => Boolean) = EmptyValue() - override def toTypedPipe: TypedPipe[Nothing] = TypedPipe.empty - - def debug: ValuePipe[Nothing] = { - println("EmptyValue") - this - } -} -case class LiteralValue[T](value: T)(implicit val flowDef: FlowDef, mode: Mode) extends ValuePipe[T] { - override def map[U](fn: T => U) = LiteralValue(fn(value)) - override def filter(fn: T => Boolean) = if(fn(value)) this else EmptyValue() - override lazy val toTypedPipe = TypedPipe.from(Iterable(value)) - - def 
debug: ValuePipe[T] = map { v => - println("LiteralValue(" + v.toString + ")") - v - } -} -case class ComputedValue[T](override val toTypedPipe: TypedPipe[T]) extends ValuePipe[T] { - override def map[U](fn: T => U) = ComputedValue(toTypedPipe.map(fn)) - override def filter(fn: T => Boolean) = ComputedValue(toTypedPipe.filter(fn)) - - def debug: ValuePipe[T] = map { value => - println("ComputedValue(" + value.toString + ")") - value - } -} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala new file mode 100644 index 0000000000..2f9f5bf65d --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/AsyncFlowDefRunner.scala @@ -0,0 +1,426 @@ +package com.twitter.scalding.typed.cascading_backend + +import cascading.flow.{Flow, FlowDef} +import com.twitter.scalding.{ + source, + typed, + CascadingLocal, + CascadingMode, + Config, + Execution, + ExecutionContext, + ExecutionCounters, + FlowStateMap, + HadoopMode, + JobStats, + Mappable, + TypedPipe +} +import com.twitter.scalding.{CFuture, CPromise, CancellationHandler} +import com.twitter.scalding.typed.{Output, TypedSink} +import com.twitter.scalding.cascading_interop.FlowListenerPromise +import com.twitter.scalding.dagon.{HMap, Rule} +import java.util.UUID +import java.util.concurrent.LinkedBlockingQueue +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.slf4j.LoggerFactory +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +import Execution.{ToWrite, Writer} + +import CascadingExtensions._ + +object AsyncFlowDefRunner { + + /** + * We send messages from other threads into the submit thread here + */ + private sealed trait FlowDefAction + private final 
case class RunFlowDef(conf: Config, fd: FlowDef, result: CPromise[(Long, JobStats)]) + extends FlowDefAction + private final case class StopFlow(flow: Flow[_], result: Promise[Unit]) extends FlowDefAction + private case object Stop extends FlowDefAction + + /** + * This is a Thread used as a shutdown hook to clean up temporary files created by some Execution + * + * If the job is aborted the shutdown hook may not run and the temporary files will not get cleaned up + */ + final case class TempFileCleanup(filesToCleanup: List[String], mode: CascadingMode) extends Thread { + + val LOG = LoggerFactory.getLogger(this.getClass) + + override def run(): Unit = { + val fs = mode match { + case localMode: CascadingLocal => FileSystem.getLocal(new Configuration) + case hdfsMode: HadoopMode => FileSystem.get(hdfsMode.jobConf) + } + + filesToCleanup.foreach { file: String => + try { + val path = new Path(file) + if (fs.exists(path)) { + // The "true" parameter here indicates that we should recursively delete everything under the given path + fs.delete(path, true) + } + } catch { + // If we fail in deleting a temp file, log the error but don't fail the run + case e: Throwable => LOG.warn(s"Unable to delete temp file $file", e) + } + } + } + } +} + +/** + * This holds an internal thread to run This holds an internal thread to submit run a Config, Mode, FlowDef + * and return a Future holding the JobStats + */ +class AsyncFlowDefRunner(mode: CascadingMode) extends Writer { + import AsyncFlowDefRunner._ + + private[this] val mutex = new AnyRef + + type StateKey[T] = (Config, TypedPipe[T]) + type WorkVal[T] = Future[TypedPipe[T]] + + private case class FilesToCleanUp(onFinish: Set[String], onShutdown: Set[String]) { + def addFile(conf: Config, s: String): FilesToCleanUp = + if (conf.getExecutionCleanupOnFinish) copy(onFinish = onFinish + s) + else copy(onShutdown = onShutdown + s) + } + private object FilesToCleanUp { + def empty: FilesToCleanUp = FilesToCleanUp(Set.empty, 
Set.empty) + } + + /** + * @param filesToCleanup + * temporary files created by forceToDiskExecution + * @param initToOpt + * this is the mapping between user's TypedPipes and their optimized versions which are actually run. + * @param forcedPipes + * these are all the side effecting forcing of TypedPipes into simple SourcePipes or IterablePipes. These + * are for both toIterableExecution and forceToDiskExecution + */ + private case class State( + filesToCleanup: FilesToCleanUp, + initToOpt: HMap[StateKey, TypedPipe], + forcedPipes: HMap[StateKey, WorkVal] + ) { + + def addFilesToCleanup(conf: Config, s: Option[String]): State = + s match { + case Some(path) => + val ftc1 = filesToCleanup.addFile(conf, path) + copy(filesToCleanup = ftc1) + case None => this + } + + /** + * Returns true if we actually add this optimized pipe. We do this because we don't want to take the side + * effect twice. + */ + def addForce[T]( + c: Config, + init: TypedPipe[T], + opt: TypedPipe[T], + p: Future[TypedPipe[T]] + ): (State, Boolean) = + forcedPipes.get((c, opt)) match { + case None => + ( + copy(forcedPipes = forcedPipes + ((c, opt) -> p), initToOpt = initToOpt + ((c, init) -> opt)), + true + ) + case Some(_) => + (copy(initToOpt = initToOpt + ((c, init) -> opt)), false) + } + + def getForce[T](c: Config, init: TypedPipe[T]): Option[Future[TypedPipe[T]]] = + initToOpt.get((c, init)).map { opt => + forcedPipes.get((c, opt)) match { + case None => + sys.error(s"invariant violation: initToOpt mapping exists for $init, but no forcedPipe") + case Some(p) => p + } + } + } + + private[this] var state: State = State(FilesToCleanUp.empty, HMap.empty, HMap.empty) + + private def updateState[S](fn: State => (State, S)): S = + mutex.synchronized { + val s0 = state + val (st1, s) = fn(s0) + require(state eq s0, "updateState has recursively modified state, programming error") + state = st1 + s + } + private def getState: State = + updateState(s => (s, s)) + + private val messageQueue: 
LinkedBlockingQueue[AsyncFlowDefRunner.FlowDefAction] = + new LinkedBlockingQueue[AsyncFlowDefRunner.FlowDefAction]() + + /** + * Hadoop and/or cascading has some issues, it seems, with starting jobs from multiple threads. This thread + * does all the Flow starting. + */ + private lazy val thread = new Thread(new Runnable { + def run(): Unit = { + @annotation.tailrec + def go(id: Long): Unit = messageQueue.take match { + case Stop => () + case StopFlow(flow, promise) => + promise.complete(Try(flow.stop())) + go(id) + case RunFlowDef(conf, fd, cpromise) => + try { + val ctx = ExecutionContext.newContext(conf.setScaldingFlowCounterValue(id))(fd, mode) + ctx.buildFlow match { + case Success(Some(flow)) => + val future = FlowListenerPromise + .start(flow, { f: Flow[_] => (id, JobStats(f.getFlowStats)) }) + // we want to stop the flow when the execution is cancelled + val cancel = CancellationHandler.fromFn { cec => + val done = Promise[Unit]() + messageQueue.put(StopFlow(flow, done)) + done.future + } + cpromise.completeWith(CFuture(future, cancel)) + case Success(None) => + // These is nothing to do: + cpromise.promise.success((id, JobStats.empty)) + cpromise.cancellationHandler.success(CancellationHandler.empty) + case Failure(err) => + cpromise.promise.failure(err) + cpromise.cancellationHandler.success(CancellationHandler.empty) + } + } catch { + case t: Throwable => + // something bad happened, but this thread is a daemon + // that should only stop if all others have stopped or + // we have received the stop message. + // Stopping this thread prematurely can deadlock + // futures from the promise we have. + // In a sense, this thread does not exist logically and + // must forward all exceptions to threads that requested + // this work be started. 
+ cpromise.promise.tryFailure(t) + cpromise.cancellationHandler.success(CancellationHandler.empty) + } + // Loop + go(id + 1) + } + + // Now we actually run the recursive loop + go(0) + } + }) + + def runFlowDef(conf: Config, fd: FlowDef): CFuture[(Long, JobStats)] = + try { + val cpromise = CPromise[(Long, JobStats)]() + // fill in flow once it is built + messageQueue.put(RunFlowDef(conf, fd, cpromise)) + // Don't do any work after the .put call, we want no chance for exception + // after the put + cpromise.cfuture + } catch { + case NonFatal(e) => + CFuture.failed(e) + } + + def start(): Unit = { + // Make sure this thread can't keep us running if all others are gone + thread.setDaemon(true) + thread.start() + } + /* + * This is called after we are done submitting all jobs + */ + def finished(): Unit = { + messageQueue.put(Stop) + // get an immutable copy + val filesToRm = getState.filesToCleanup + if (filesToRm.onShutdown.nonEmpty) { + Runtime.getRuntime.addShutdownHook(TempFileCleanup(filesToRm.onShutdown.toList, mode)) + } + if (filesToRm.onFinish.nonEmpty) { + val cleanUpThread = TempFileCleanup(filesToRm.onFinish.toList, mode) + // run it that the outer most execution is complete + cleanUpThread.start() + } + } + + /** + * This evaluates the fn in a Try, validates the sources calls runFlowDef, then clears the FlowStateMap + */ + def validateAndRun( + conf: Config + )(fn: Config => FlowDef)(implicit cec: ConcurrentExecutionContext): CFuture[(Long, ExecutionCounters)] = { + val tFlowDef = Try(fn(conf)).map { flowDef => + FlowStateMap.validateSources(flowDef, mode) + flowDef + } + + tFlowDef match { + case Success(flowDef) => + runFlowDef(conf, flowDef).map { case (id, jobStats) => + FlowStateMap.clear(flowDef) + (id, ExecutionCounters.fromJobStats(jobStats)) + } + case Failure(e) => + CFuture.failed(e) + } + } + + def execute(conf: Config, writes: List[ToWrite[_]])(implicit + cec: ConcurrentExecutionContext + ): CFuture[(Long, ExecutionCounters)] = { + + 
import Execution.ToWrite._ + + val done = Promise[Unit]() + + val phases: Seq[Rule[TypedPipe]] = + CascadingBackend.defaultOptimizationRules(conf) + + val optimizedWrites = ToWrite.optimizeWriteBatch(writes, phases) + + def prepareFD(c: Config): FlowDef = { + val fd = new FlowDef + + def write[A](tpipe: TypedPipe[A], out: Output[A]): Unit = + out match { + case dest: TypedSink[A] @unchecked => + // We have already applied the optimizations to the batch of writes above + val pipe = CascadingBackend.toPipeUnoptimized(tpipe, dest.sinkFields)(fd, mode, dest.setter) + dest.writeFrom(pipe)(fd, mode) + case _ => + throw new IllegalArgumentException( + s"cascading mode requires all outputs to be TypedSink, found: $out of class: ${out.getClass}" + ) + } + + def force[A](init: TypedPipe[A], opt: TypedPipe[A]): Unit = { + val pipePromise = Promise[TypedPipe[A]]() + val fut = pipePromise.future + // This updates mutable state + val sinkOpt = updateState { s => + val (nextState, added) = s.addForce(conf, init, opt, fut) + if (added) { + val uuid = UUID.randomUUID + val (sink, forcedPipe, clean) = forceToDisk(uuid, c, opt) + (nextState.addFilesToCleanup(conf, clean), Some((sink, forcedPipe))) + } else { + (nextState, None) + } + } + + sinkOpt.foreach { case (sink, fp) => + // We write the optimized pipe + write(opt, sink) + val pipeFut = done.future.map(_ => fp()) + pipePromise.completeWith(pipeFut) + } + } + def addIter[A](init: TypedPipe[A], optimized: Either[Iterable[A], Mappable[A]]): Unit = { + val result = optimized match { + case Left(iter) if iter.isEmpty => TypedPipe.EmptyTypedPipe + case Left(iter) => TypedPipe.IterablePipe(iter) + case Right(mappable) => TypedPipe.SourcePipe(mappable) + } + val fut = Future.successful(result) + updateState(_.addForce(conf, init, result, fut)) + } + + optimizedWrites.foreach { + case OptimizedWrite(init, Force(opt)) => + force(init, opt) + case OptimizedWrite(init, ToIterable(opt)) => + def step[A](init: TypedPipe[A], opt: 
TypedPipe[A]): Unit = + opt match { + case TypedPipe.EmptyTypedPipe => addIter(init, Left(Nil)) + case TypedPipe.IterablePipe(as) => addIter(init, Left(as)) + case TypedPipe.SourcePipe(src: Mappable[A]) => addIter(init, Right(src)) + case other => + // we need to write the pipe out first. + force(init, opt) + // now, when we go to check for the pipe later, it + // will be a SourcePipe of a Mappable by construction + } + step(init, opt) + + case OptimizedWrite(_, SimpleWrite(pipe, sink)) => + write(pipe, sink) + } + + fd + } + + val cfuture = validateAndRun(conf)(prepareFD _) + + // When we are done, the forced pipes are ready: + done.completeWith(cfuture.future.map(_ => ())) + cfuture + } + + def getForced[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[TypedPipe[T]] = + getState.getForce(conf, initial) match { + case Some(fut) => fut + case None => + val msg = + s"logic error: getForced($conf, $initial) does not have a forced pipe." + Future.failed(new IllegalStateException(msg)) + } + + def getIterable[T](conf: Config, initial: TypedPipe[T])(implicit + cec: ConcurrentExecutionContext + ): Future[Iterable[T]] = + getForced(conf, initial).flatMap { + case TypedPipe.EmptyTypedPipe => Future.successful(Nil) + case TypedPipe.IterablePipe(iter) => Future.successful(iter) + case TypedPipe.SourcePipe(src: Mappable[T]) => + Future.successful(new Iterable[T] { + def iterator = src.toIterator(conf, mode) + }) + case other => + val msg = + s"logic error: expected an Iterable pipe. 
($conf, $initial) -> $other is not iterable" + Future.failed(new IllegalStateException(msg)) + } + + private def forceToDisk[T]( // linter:disable:UnusedParameter + uuid: UUID, + conf: Config, + pipe: TypedPipe[T] // note, we don't use this, but it fixes the type T + ): (typed.TypedSink[T], () => TypedPipe[T], Option[String]) = + mode match { + case _: CascadingLocal => // Local or Test mode + val inMemoryDest = new typed.MemorySink[T] + + /** + * This is a bit tricky. readResults has to be called after the job has run, so we need to do this + * inside the function which will be called after the job has run + */ + (inMemoryDest, () => TypedPipe.from(inMemoryDest.readResults), None) + case _: HadoopMode => + val temporaryPath: String = { + val tmpDir = conf + .get("hadoop.tmp.dir") + .orElse(conf.get("cascading.tmp.dir")) + .getOrElse("/tmp") + + tmpDir + "/scalding/snapshot-" + uuid + ".seq" + } + val cleanup = Some(temporaryPath) + val srcSink = source.TypedSequenceFile[T](temporaryPath) + (srcSink, () => TypedPipe.from(srcSink), cleanup) + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala new file mode 100644 index 0000000000..72248e0649 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingBackend.scala @@ -0,0 +1,930 @@ +package com.twitter.scalding.typed.cascading_backend + +import cascading.flow.FlowDef +import cascading.operation.Debug +import cascading.pipe.{CoGroup, Each, HashJoin, Pipe} +import cascading.tuple.{Fields, Tuple => CTuple} +import com.twitter.scalding.dagon.{Dag, FunctionK, HCache, Id, Rule} +import com.twitter.scalding.TupleConverter.{singleConverter, tuple2Converter} +import com.twitter.scalding.TupleSetter.{singleSetter, tup2Setter} +import com.twitter.scalding.{ + CleanupIdentityFunction, + Config, + Dsl, + Execution, + Field, + 
FlowState, + FlowStateMap, + GroupBuilder, + IncrementCounters, + IterableSource, + MapsideReduce, + Mode, + RichFlowDef, + RichPipe, + Source, + TupleConverter, + TupleGetter, + TupleSetter, + TypedBufferOp, + WrappedJoiner, + Write +} +import com.twitter.scalding.typed._ +import com.twitter.scalding.typed.functions.{FilterKeysToFilter, FlatMapValuesToFlatMap, MapValuesToMap} +import com.twitter.scalding.serialization.{ + Boxed, + BoxedOrderedSerialization, + CascadingBinaryComparator, + EquivSerialization, + OrderedSerialization, + WrappedSerialization +} +import java.util.WeakHashMap +import org.slf4j.LoggerFactory + +object CascadingBackend { + private[this] val logger = LoggerFactory.getLogger(getClass) + + def converterFrom[A](ts: TupleSetter[A]): Option[TupleConverter[A]] = + ts match { + case TupleSetter.Single() => + Some(TupleConverter.Single(TupleGetter.Casting()).asInstanceOf[TupleConverter[A]]) + case _ => TupleSetter.converterFromSetter(ts, TupleConverter) + } + + def areDefiniteInverse[A, B](t: TupleConverter[A], s: TupleSetter[B]): Boolean = + converterFrom(s).exists(_ == t) + + import TypedPipe._ + + private val valueField: Fields = new Fields("value") + private val kvFields: Fields = new Fields("key", "value") + private val f0: Fields = new Fields(java.lang.Integer.valueOf(0)) + + private def tuple2Conv[K, V](ord: Ordering[K]): TupleConverter[(K, V)] = + ord match { + case _: OrderedSerialization[_] => + tuple2Converter[Boxed[K], V].andThen { kv => + (kv._1.get, kv._2) + } + case _ => tuple2Converter[K, V] + } + + private def valueConverter[V](optOrd: Option[Ordering[V]]): TupleConverter[V] = + optOrd + .map { + case _: OrderedSerialization[_] => + TupleConverter.singleConverter[Boxed[V]].andThen(_.get) + case _ => TupleConverter.singleConverter[V] + } + .getOrElse(TupleConverter.singleConverter[V]) + + private def keyConverter[K](ord: Ordering[K]): TupleConverter[K] = + ord match { + case _: OrderedSerialization[_] => + 
TupleConverter.singleConverter[Boxed[K]].andThen(_.get) + case _ => TupleConverter.singleConverter[K] + } + + private def keyGetter[K](ord: Ordering[K]): TupleGetter[K] = + ord match { + case _: OrderedSerialization[K] => + new TupleGetter[K] { + def get(tup: CTuple, i: Int) = tup.getObject(i).asInstanceOf[Boxed[K]].get + } + case _ => TupleGetter.castingGetter + } + + /** + * If we are using OrderedComparable, we need to box the key to prevent other serializers from handling the + * key + */ + private def getBoxFnAndOrder[K]( + ordser: OrderedSerialization[K], + flowDef: FlowDef + ): (K => Boxed[K], BoxedOrderedSerialization[K]) = { + // We can only supply a cacheKey if the equals and hashcode are known sane + val (boxfn, cls) = + Boxed.nextCached[K](if (ordser.isInstanceOf[EquivSerialization[_]]) Some(ordser) else None) + val boxordSer = BoxedOrderedSerialization(boxfn, ordser) + + WrappedSerialization.rawSetBinary( + List((cls, boxordSer)), + { case (k: String, v: String) => + FlowStateMap.merge(flowDef, FlowState.withConfigSetting(k + cls, v)) + } + ) + (boxfn, boxordSer) + } + + /** + * Check if the Ordering is an OrderedSerialization, if so box in a Boxed so hadoop and cascading can + * dispatch the right serialization + */ + private def maybeBox[K, V](ord: Ordering[K], flowDef: FlowDef)( + op: (TupleSetter[(K, V)], Fields) => Pipe + ): Pipe = + ord match { + case ordser: OrderedSerialization[K] => + val (boxfn, boxordSer) = getBoxFnAndOrder[K](ordser, flowDef) + + val ts = tup2Setter[(Boxed[K], V)].contraMap { kv1: (K, V) => (boxfn(kv1._1), kv1._2) } + val keyF = new Fields("key") + keyF.setComparator("key", new CascadingBinaryComparator(boxordSer)) + op(ts, keyF) + case _ => + val ts = tup2Setter[(K, V)] + val keyF = Field.singleOrdered("key")(ord) + op(ts, keyF) + } + + // TODO we could probably optimize this further by just composing + // the toPipe function directly, so we don't actually create the pipe until + // the TupleSetter comes in. 
With this, we can make sure to use the right + // TupleSetter on the final pipe + private case class CascadingPipe[+T]( + pipe: Pipe, + fields: Fields, + @transient localFlowDef: FlowDef, // not serializable. + converter: TupleConverter[_ <: T] + ) { + + /** + * merge the flowDef into this new flowdef an make sure the tuples have the structure defined by setter + */ + def toPipe[U >: T](f: Fields, fd: FlowDef, setter: TupleSetter[U]): Pipe = { + val resFd = new RichFlowDef(fd) + resFd.mergeFrom(localFlowDef) + + if (!RichPipe.isPassthrough(pipe) && areDefiniteInverse(converter, setter) && (fields == f)) { + // Note that some custom scalding sources have a bug + // that the fields on the TypedSource don't match the fields on underlying + // scheme. To work around this, we don't do this optimization on sources. + // + // We have extended this to cover filters, since we can push those down + // and they will proxy through the previous fields which may be UNKNOWN + // + // Note, cascading appears to not care if it sees UNKNOWN fields as input to + // a filter or map, but it does care about creating a Merge with + // unknown fields, since we always call toPipe before creating the Merge, + // as long as the chain of operations contains at least one non-filter, + // we should be okay + // + // in this branch, the TupleConverter is what the Setter is expecting, so we don't + // need to do any transform + // we are already in the right format + pipe + } else { + // we need to convert + RichPipe(pipe).mapTo[T, U](fields -> f)(t => t)(TupleConverter.asSuperConverter(converter), setter) + } + } + } + + private object CascadingPipe { + def single[T](pipe: Pipe, fd: FlowDef): CascadingPipe[T] = + CascadingPipe(pipe, f0, fd, singleConverter[T]) + } + + /** + * we want to cache renderings of some TypedPipe to Pipe so cascading will see them as the same. Without + * this, it is very easy to have a lot of recomputation. 
Ideally we would plan an entire graph at once, and + * not need a static cache here, but currently we still plan one TypedPipe at a time. + */ + private class CompilerCache { + + private[this] val cache = new WeakHashMap[FlowDef, FunctionK[TypedPipe, CascadingPipe]]() + + def get(fd: FlowDef, m: Mode): FunctionK[TypedPipe, CascadingPipe] = + cache.synchronized { + cache.get(fd) match { + case null => + val c = compile(m) + cache.put(fd, c) + c + case nonNull => nonNull + } + } + } + private[this] val cache = new CompilerCache + + /** + * Method to compile scalding's `TypedPipe`s to cascading's `Pipe`s. + * + * Since equal `TypedPipe`s define same computation we would like to compile them into referentially the + * same cascading's `Pipe` instance. This logic breaks if one typed pipe is really big and has two forked + * different computations both of which significantly decrease size of the data. If we will cache common + * part of this two computations in the same cascading's `Pipe` instance we end up with common part being + * materialized. Therefore for some kind of `TypedPipe`s we want to avoid their caching. + * + * `.cross` `TypedPipe` is one of examples of such `TypedPipe`s we never want to materialize and, therefore, + * cache. + * + * `compile` logic is separated into next functions: + * - `transform` which defines main transformation logic, without any caching applied. This method accepts + * `rec` parameter which is being called to transform children pipes. + * - `withCachePolicy` which defines transformation logic with caching applied. + * - `notCached` to support use case with `.cross` pipe, where pipe itself shouldn't be cached but `left` + * and `right` sides of it should be. 
+ */ + private def compile(mode: Mode): FunctionK[TypedPipe, CascadingPipe] = + new FunctionK[TypedPipe, CascadingPipe] { + + private val cache = HCache.empty[TypedPipe, CascadingPipe] + + override def toFunction[U]: TypedPipe[U] => CascadingPipe[U] = withCachePolicy + + private def withCachePolicy[U]: TypedPipe[U] => CascadingPipe[U] = { + // Don't cache `CrossPipe`, but cache `left` and `right` side of it + case cp @ CrossPipe(left, right) => + notCached(excludes = Set(left, right))(cp) + // Don't cache `Fork` and `WithDescriptionTypedPipe` + // since if we do cache them `CrossPipe` will end up being cached as well + case tp @ Fork(_) => + transform(tp, this) + case tp @ WithDescriptionTypedPipe(_, _) => + transform(tp, this) + // Cache all other typed pipes + case tp => + cache.getOrElseUpdate(tp, transform(tp, this)) + } + + private def notCached(excludes: Set[TypedPipe[_]]): FunctionK[TypedPipe, CascadingPipe] = + new FunctionK[TypedPipe, CascadingPipe] { + override def toFunction[U]: TypedPipe[U] => CascadingPipe[U] = { tp => + if (excludes.contains(tp)) withCachePolicy(tp) else transform(tp, this) + } + } + + private def transform[T]( + pipe: TypedPipe[T], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[T] = pipe match { + case cp @ CounterPipe(_) => + def go[A](cp: CounterPipe[A]): CascadingPipe[A] = { + val CascadingPipe(pipe0, initF, fd, conv) = rec(cp.pipe) + val cpipe = RichPipe(pipe0) + .eachTo(initF -> f0)( + new IncrementCounters[A]( + _, + TupleConverter + .asSuperConverter(conv) + ) + ) + CascadingPipe.single[A](cpipe, fd) + } + + go(cp) + case cp @ CrossPipe(_, _) => + rec(cp.viaHashJoin) + case cv @ CrossValue(_, _) => + rec(cv.viaHashJoin) + case DebugPipe(p) => + val inner = rec(p) + inner.copy(pipe = new Each(inner.pipe, new Debug)) + case EmptyTypedPipe => + // just use an empty iterable pipe. 
+ rec(IterablePipe(List.empty[T])) + case fk @ FilterKeys(_, _) => + def go[K, V](node: FilterKeys[K, V]): CascadingPipe[(K, V)] = { + val rewrite = Filter[(K, V)](node.input, FilterKeysToFilter(node.fn)) + rec(rewrite) + } + + go(fk) + case f @ Filter(_, _) => + // hand holding for type inference + def go[T1 <: T](f: Filter[T1]): CascadingPipe[T] = { + val Filter(input, fn) = f + val CascadingPipe(pipe, initF, fd, conv) = rec(input) + // This does not need a setter, which is nice. + val fpipe = RichPipe(pipe).filter[T1](initF)(fn)(TupleConverter.asSuperConverter(conv)) + CascadingPipe[T](fpipe, initF, fd, conv) + } + + go(f) + case f @ FlatMapValues(_, _) => + def go[K, V, U](node: FlatMapValues[K, V, U]): CascadingPipe[T] = + rec(FlatMapped[(K, V), (K, U)](node.input, FlatMapValuesToFlatMap(node.fn))) + + go(f) + case fm @ FlatMapped(_, _) => + // TODO we can optimize a flatmapped input directly and skip some tupleconverters + def go[A, B <: T](fm: FlatMapped[A, B]): CascadingPipe[T] = { + val CascadingPipe(pipe, initF, fd, conv) = rec(fm.input) + val fmpipe = RichPipe(pipe).flatMapTo[A, T](initF -> f0)(fm.fn)( + TupleConverter + .asSuperConverter(conv), + singleSetter + ) + CascadingPipe.single[B](fmpipe, fd) + } + + go(fm) + case ForceToDisk(input) => + val cp = rec(input) + cp.copy(pipe = RichPipe(cp.pipe).forceToDisk) + case Fork(input) => + // fork doesn't mean anything here since we are already planning each TypedPipe to + // something in cascading. 
Fork is an optimizer level operation + rec(input) + case IterablePipe(iter) => + val fd = new FlowDef + val pipe = IterableSource[T](iter, f0)(singleSetter, singleConverter).read(fd, mode) + CascadingPipe.single[T](pipe, fd) + case f @ MapValues(_, _) => + def go[K, A, B](fn: MapValues[K, A, B]): CascadingPipe[_ <: (K, B)] = + rec(Mapped[(K, A), (K, B)](fn.input, MapValuesToMap(fn.fn))) + + go(f) + case m @ Mapped(_, _) => + def go[A, B <: T](m: Mapped[A, B]): CascadingPipe[T] = { + val Mapped(input, fn) = m + val CascadingPipe(pipe, initF, fd, conv) = rec(input) + val fmpipe = RichPipe(pipe).mapTo[A, T](initF -> f0)(fn)( + TupleConverter + .asSuperConverter(conv), + singleSetter + ) + CascadingPipe.single[B](fmpipe, fd) + } + + go(m) + + case m @ MergedTypedPipe(_, _) => + OptimizationRules.unrollMerge(m) match { + case Nil => rec(EmptyTypedPipe) + case h :: Nil => rec(h) + case nonEmpty => + // TODO: a better optimization is to not materialize this + // node at all if there is no fan out since groupBy and cogroupby + // can accept multiple inputs + + val flowDef = new FlowDef + // if all of the converters are the same, we could skip some work + // here, but need to be able to see that correctly + val pipes = nonEmpty.map(p => rec(p).toPipe(f0, flowDef, singleSetter)) + val merged = new cascading.pipe.Merge(pipes.map(RichPipe.assignName): _*) + CascadingPipe.single[T](merged, flowDef) + } + case SourcePipe(input) => + input match { + case ts: TypedSource[_] => + val typedSrc = ts.asInstanceOf[TypedSource[T]] + val fd = new FlowDef + val pipe = typedSrc.read(fd, mode) + CascadingPipe[T](pipe, typedSrc.sourceFields, fd, typedSrc.converter[T]) + case notCascading => + throw new IllegalArgumentException( + s"cascading mode requires TypedSource, found: $notCascading of class ${notCascading.getClass}" + ) + } + case sblk @ SumByLocalKeys(_, _) => + def go[K, V](sblk: SumByLocalKeys[K, V]): CascadingPipe[(K, V)] = { + val cp = rec(sblk.input) + val localFD = new FlowDef 
+ val cpKV: Pipe = cp.toPipe(kvFields, localFD, tup2Setter) + val msr = new MapsideReduce(sblk.semigroup, new Fields("key"), valueField, None)( + singleConverter[V], + singleSetter[V] + ) + val kvpipe = RichPipe(cpKV).eachTo(kvFields -> kvFields)(_ => msr) + CascadingPipe(kvpipe, kvFields, localFD, tuple2Converter[K, V]) + } + + go(sblk) + case trapped: TrappedPipe[u] => + val cp: CascadingPipe[_ <: u] = rec(trapped.input) + import trapped._ + // TODO: with diamonds in the graph, this might not be correct + // it seems cascading requires puts the immediate tuple that + // caused the exception, so if you addTrap( ).map(f).map(g) + // and f changes the tuple structure, if we don't collapse the + // maps into 1 operation, cascading can write two different + // schemas into the trap, making it unreadable. + // this basically means there can only be one operation in between + // a trap and a forceToDisk or a groupBy/cogroupBy (any barrier). + (sink, sink) match { + case (src: Source, tsink: TypedSink[u @unchecked]) => + val optTC: Option[TupleConverter[u]] = + (sink match { + case tsrc: TypedSource[u @unchecked] if tsrc.converter.arity == tsink.setter.arity => + Some(tsrc.converter) + case _ => + converterFrom(tsink.setter) + }).map(TupleConverter.asSuperConverter(_)) + + optTC match { + case Some(tc) => + val fd = new FlowDef + val pp: Pipe = cp.toPipe[u](tsink.sinkFields, fd, TupleSetter.asSubSetter(tsink.setter)) + val pipe = RichPipe.assignName(pp) + fd.addTrap(pipe, src.createTap(Write)(mode)) + CascadingPipe[u](pipe, tsink.sinkFields, fd, tc) + case None => + logger.warn( + s"No TupleConverter found for ${trapped}. Use a TypedSink that is also a TypedSource. Found sink: ${sink}" + ) + // we just ignore the trap in this case. 
+ // if the job doesn't fail, the trap would be empty anyway, + // if the job does fail, we will see the failure + cp + } + case _ => + // it should be safe to only warn here because + // if the trap is removed and there is a failure the job should fail + logger.warn( + s"Trap on ${trapped.input} does not have a valid output: ${trapped.sink}" + + ", a subclass of Source and TypedSink is required\nTrap ignored" + ) + cp + } + case WithDescriptionTypedPipe(input, descs) => + @annotation.tailrec + def loop[A]( + t: TypedPipe[A], + acc: List[(String, Boolean)] + ): (TypedPipe[A], List[(String, Boolean)]) = + t match { + case WithDescriptionTypedPipe(i, descs) => + loop(i, descs ::: acc) + case notDescr => (notDescr, acc) + } + + val (root, allDesc) = loop(input, descs) + val cp = rec(root) + cp.copy(pipe = applyDescriptions(cp.pipe, allDesc)) + + case WithOnComplete(input, fn) => + val cp = rec(input) + val next = new Each(cp.pipe, Fields.ALL, new CleanupIdentityFunction(fn)) + cp.copy(pipe = next) + + case hcg @ HashCoGroup(_, _, _) => + def go[K, V1, V2, R](hcg: HashCoGroup[K, V1, V2, R]): CascadingPipe[(K, R)] = + planHashJoin(hcg.left, hcg.right, hcg.joiner, rec) + + go(hcg) + case ReduceStepPipe(rs) => + planReduceStep(rs, rec) + + case CoGroupedPipe(cg) => + planCoGroup(cg, rec) + } + } + + private def applyDescriptions(p: Pipe, descriptions: List[(String, Boolean)]): Pipe = { + val ordered = descriptions.collect { case (d, false) => d }.reverse + val unordered = descriptions.collect { case (d, true) => d }.distinct.sorted + + RichPipe.setPipeDescriptions(p, ordered ::: unordered) + } + + /** + * These are rules we should apply to any TypedPipe before handing to cascading. These should be a bit + * conservative in that they should be highly likely to improve the graph. 
+ */ + def defaultOptimizationRules(config: Config): Seq[Rule[TypedPipe]] = { + + def std(forceHash: Rule[TypedPipe]) = + OptimizationRules.standardMapReduceRules ::: + List( + OptimizationRules.FilterLocally, // after filtering, we may have filtered to nothing, lets see + OptimizationRules.simplifyEmpty, + // add any explicit forces to the optimized graph + Rule.orElse(List(forceHash, OptimizationRules.RemoveDuplicateForceFork)) + ) + + config.getOptimizationPhases match { + case Some(tryPhases) => tryPhases.get.phases + case None => + val force = + if (config.getHashJoinAutoForceRight) OptimizationRules.ForceToDiskBeforeHashJoin + else Rule.empty[TypedPipe] + std(force) + } + } + + final def toPipe[U](p: TypedPipe[U], fieldNames: Fields)(implicit + flowDef: FlowDef, + mode: Mode, + setter: TupleSetter[U] + ): Pipe = { + + val phases = defaultOptimizationRules(Config.defaultFrom(mode)) + val (d, id) = Dag(p, OptimizationRules.toLiteral) + val d1 = d.applySeq(phases) + val p1 = d1.evaluate(id) + + // Now that we have an optimized pipe, convert it to a Pipe + toPipeUnoptimized(p1, fieldNames) + } + + /** + * This needs to be called, to plan any writes from the TypedPipe API. This is here to globally optimize the + * entire flow, and not optimize on a write-by-write basis. + * + * This uses the FlowStateMap this method is idempotent. + * + * It is called by default in ExecutionContext at the last step before building the Flow. Job also needs to + * call this method in validate to make sure validation works. 
+ */ + def planTypedWrites(fd: FlowDef, mode: Mode): Unit = { + def doWrites(writes: List[FlowStateMap.TypedWrite[_]]): Unit = { + val empty = Dag.empty(OptimizationRules.toLiteral) + type ToDo[A] = (Id[A], TypedSink[A]) + val (rootedDag, todos) = writes.foldLeft((empty, List.empty[ToDo[_]])) { case ((dag, items), tw) => + val (nextDag, id) = dag.addRoot(tw.pipe) + require(tw.mode == mode, s"${tw.mode} should be equal to $mode") + (nextDag, (id, tw.sink) :: items) + } + val phases = defaultOptimizationRules(Config.defaultFrom(mode)) + val optDag = rootedDag.applySeq(phases) + def doWrite[A](pair: (Id[A], TypedSink[A])): Unit = { + val optPipe = optDag.evaluate(pair._1) + val dest = pair._2 + val cpipe = toPipeUnoptimized[A](optPipe, dest.sinkFields)(fd, mode, dest.setter) + dest.writeFrom(cpipe)(fd, mode) + } + todos.foreach(doWrite(_)) + } + + doWrites(FlowStateMap.removeWrites(fd).pendingTypedWrites) + + // We do writes twice because customer can use Typed API in their TypedSink implementation. + val pendingWritesAfterPlan = FlowStateMap.removeWrites(fd).pendingTypedWrites + if (pendingWritesAfterPlan.nonEmpty) { + logger.warn( + "Using Typed API in TypedSink implementation is prohibited and " + + "might be removed in later releases of Scalding." + ) + doWrites(pendingWritesAfterPlan) + } + + if (FlowStateMap.removeWrites(fd).pendingTypedWrites.nonEmpty) { + throw new IllegalStateException("You can't use Typed API in you TypedSink implementation nestedly.") + } + } + + /** + * Convert a cascading FlowDef to an Option[Execution[Unit]] + * + * Return None if the Execution is empty. If the FlowDef includes things other than TypedPipe.writes, this + * will return Some failed Execution. + * + * This method is useful for people who have used the Typed-API with FlowDefs, but not Executions and want + * to convert to an Execution without rewriting all the code. An example of this is summingbird which + * constructs a single FlowDef for the entire plan it makes. 
For large plans from summingbird you may want + * to use write partitioning. + */ + def flowDefToExecution( + fd: FlowDef, + partitionOptimizations: Option[Seq[Rule[TypedPipe]]] + ): Option[Execution[Unit]] = { + val rfd = new RichFlowDef(fd) + // TypedPipe jobs should not have modified + // the FlowDef yet, only the FlowState should + // be updated + if (rfd.isEmpty) { + FlowStateMap + .get(fd) + .flatMap { // Note, we want this to be a pure function so we don't mutate the FlowStateMap + case FlowState(srcs, confs, writes) if srcs.isEmpty && confs.isEmpty => + writes match { + case Nil => None + case nonEmpty => + partitionOptimizations match { + case None => + def write[A](w: FlowStateMap.TypedWrite[A]): Execution[Unit] = + w.pipe.writeExecution(w.sink) + + Some(Execution.sequence(nonEmpty.map(write(_))).unit) + case Some(rules) => + def toPair[A]( + f: FlowStateMap.TypedWrite[A] + ): WritePartitioner.PairK[TypedPipe, TypedSink, A] = + (f.pipe, f.sink) + + val pairs: List[WritePartitioner.PairK[TypedPipe, TypedSink, _]] = nonEmpty.map(toPair(_)) + Some(WritePartitioner.materialize[Execution](rules, pairs)) + } + } + case fs => + // we can't handle if there have been anything other than TypedPipe.write on the + // TypedPipe + Some( + Execution.failed(new Exception(s"expected empty FlowState other than TypedWrites, found: $fs")) + ) + } + } else + Some( + Execution.failed( + new Exception(s"We can only convert Typed-API Jobs to Execution. Found non-empty FlowDef: $fd") + ) + ) + } + + /** + * This converts the TypedPipe to a cascading Pipe doing the most direct possible translation we can. This + * is useful for testing or for expert cases where you want more direct control of the TypedPipe than the + * default method gives you. 
+ */ + final def toPipeUnoptimized[U](input: TypedPipe[U], fieldNames: Fields)(implicit + flowDef: FlowDef, + mode: Mode, + setter: TupleSetter[U] + ): Pipe = { + + val compiler = cache.get(flowDef, mode) + + /** + * These rules are not optimizations, but actually required for Cascading to not throw. Cascading requires + * certain shapes of the graphs + */ + val p = OptimizationRules( + input, + OptimizationRules.DescribeLater + .orElse(OptimizationRules.DeferMerge) + .orElse(OptimizationRules.DiamondToFlatMap) + ) + + val cp: CascadingPipe[U] = compiler(p) + + RichPipe(cp.toPipe(fieldNames, flowDef, TupleSetter.asSubSetter(setter))) + // TODO: this indirection may not be needed anymore, we could directly track config changes + // rather than using FlowStateMap. This is the only call of this method, so maybe we can + // remove it. + .applyFlowConfigProperties(flowDef) + } + + private def planCoGroup[K, R]( + cg: CoGrouped[K, R], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[(K, R)] = { + + // This has the side effect of planning all inputs now + // before we need to call them below + val inputsCR = cg.inputs.map(rec(_)) + + import cg.{inputs, joinFunction} + // Cascading handles the first item in join differently, we have to see if it is repeated + val firstCount = inputs.count(_ == inputs.head) + + import Dsl._ + import RichPipe.assignName + + val flowDef = new FlowDef + + def toPipe[A, B](t: TypedPipe[(A, B)], f: Fields, setter: TupleSetter[(A, B)]): Pipe = + rec(t).toPipe(f, flowDef, TupleSetter.asSubSetter(setter)) + /* + * we only want key and value. + * Cascading requires you have the same number coming in as out. 
+ * in the first case, we introduce (null0, null1), in the second + * we have (key1, value1), but they are then discarded: + */ + def outFields(inCount: Int): Fields = + List("key", "value") ++ (0 until (2 * (inCount - 1))).map("null%d".format(_)) + + // Make this stable so the compiler does not make a closure + val ord = cg.keyOrdering + + val newPipe = maybeBox[K, Any](ord, flowDef) { (tupset, ordKeyField) => + if (firstCount == inputs.size) { + + /** + * This is a self-join Cascading handles this by sending the data only once, spilling to disk if the + * groups don't fit in RAM, then doing the join on this one set of data. This is fundamentally + * different than the case where the first item is not repeated. That case is below + */ + val NUM_OF_SELF_JOINS = firstCount - 1 + new CoGroup( + assignName(toPipe[K, Any](inputs.head, kvFields, tupset)), + ordKeyField, + NUM_OF_SELF_JOINS, + outFields(firstCount), + WrappedJoiner(new DistinctCoGroupJoiner(firstCount, keyGetter(ord), joinFunction)) + ) + } else if (firstCount == 1) { + + def keyId(idx: Int): String = "key%d".format(idx) + + /** + * As long as the first one appears only once, we can handle self joins on the others: Cascading does + * this by maybe spilling all the streams other than the first item. This is handled by a different + * CoGroup constructor than the above case. 
+ */ + def renamePipe(idx: Int, p: TypedPipe[(K, Any)]): Pipe = + toPipe[K, Any](p, List(keyId(idx), "value%d".format(idx)), tupset) + + // This is tested for the properties we need (non-reordering) + val distincts = CoGrouped.distinctBy(inputs)(identity) + val dsize = distincts.size + val isize = inputs.size + + def makeFields(id: Int): Fields = { + val comp = ordKeyField.getComparators.apply(0) + val fieldName = keyId(id) + val f = new Fields(fieldName) + f.setComparator(fieldName, comp) + f + } + + val groupFields: Array[Fields] = (0 until dsize) + .map(makeFields) + .toArray + + val pipes: Array[Pipe] = distincts.zipWithIndex.map { case (item, idx) => + assignName(renamePipe(idx, item)) + }.toArray + + val cjoiner = if (isize != dsize) { + // avoid capturing anything other than the mapping ints: + val mapping: Map[Int, Int] = inputs.zipWithIndex.map { case (item, idx) => + idx -> distincts.indexWhere(_ == item) + }.toMap + + new CoGroupedJoiner(isize, keyGetter(ord), joinFunction) { + val distinctSize = dsize + def distinctIndexOf(orig: Int) = mapping(orig) + } + } else { + new DistinctCoGroupJoiner(isize, keyGetter(ord), joinFunction) + } + + new CoGroup(pipes, groupFields, outFields(dsize), WrappedJoiner(cjoiner)) + } else { + + /** + * This is non-trivial to encode in the type system, so we throw this exception at the planning phase. + */ + sys.error( + "Except for self joins, where you are joining something with only itself,\n" + + "left-most pipe can only appear once. Firsts: " + + inputs.collect { case x if x == inputs.head => x }.toString + ) + } + } + /* + * the CoGrouped only populates the first two fields, the second two + * are null. We then project out at the end of the method. 
+ */ + val pipeWithRedAndDescriptions = { + RichPipe.setReducers(newPipe, cg.reducers.getOrElse(-1)) + RichPipe.setPipeDescriptions(newPipe, cg.descriptions) + newPipe.project(kvFields) + } + + CascadingPipe[(K, R)](pipeWithRedAndDescriptions, kvFields, flowDef, tuple2Converter[K, R]) + } + + /** + * TODO: most of the complexity of this method should be rewritten as an optimization rule that works on the + * scalding typed AST. the code in here gets pretty complex and depending on the details of cascading and + * also how we compile to cascading. + * + * But the optimization is somewhat general: we often want a checkpoint before a hashjoin is replicated + */ + private def planHashJoin[K, V1, V2, R]( + left: TypedPipe[(K, V1)], + right: HashJoinable[K, V2], + joiner: (K, V1, Iterable[V2]) => Iterator[R], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[(K, R)] = { + + val fd = new FlowDef + val leftPipe = rec(left).toPipe(kvFields, fd, tup2Setter) + val mappedPipe = rec(right.mapped).toPipe(new Fields("key1", "value1"), fd, tup2Setter) + + val singleValuePerRightKey = CoGroupable.atMostOneValue(right) + val keyOrdering = right.keyOrdering + val hashPipe = new HashJoin( + RichPipe.assignName(leftPipe), + Field.singleOrdered("key")(keyOrdering), + mappedPipe, + Field.singleOrdered("key1")(keyOrdering), + WrappedJoiner(new HashJoiner(singleValuePerRightKey, right.joinFunction, joiner)) + ) + + CascadingPipe[(K, R)](RichPipe(hashPipe).project(kvFields), kvFields, fd, tuple2Converter[K, R]) + } + + private def planReduceStep[K, V1, V2]( + rs: ReduceStep[K, V1, V2], + rec: FunctionK[TypedPipe, CascadingPipe] + ): CascadingPipe[(K, V2)] = { + + val mapped = rec(rs.mapped) + + def groupOp(gb: GroupBuilder => GroupBuilder): CascadingPipe[_ <: (K, V2)] = + groupOpWithValueSort(None)(gb) + + def groupOpWithValueSort( + valueSort: Option[Ordering[V1]] + )(gb: GroupBuilder => GroupBuilder): CascadingPipe[_ <: (K, V2)] = { + val flowDef = new FlowDef + val pipe = 
maybeBox[K, V1](rs.keyOrdering, flowDef) { (tupleSetter, fields) => + val (sortOpt, ts) = valueSort + .map { + case ordser: OrderedSerialization[V1 @unchecked] => + // We get in here when we do a secondary sort + // and that sort is an ordered serialization + // We now need a boxed serializer for this type + // Then we set the comparator on the field, and finally we box the value with our tupleSetter + val (boxfn, boxordSer) = getBoxFnAndOrder[V1](ordser, flowDef) + val valueF = new Fields("value") + valueF.setComparator("value", new CascadingBinaryComparator(boxordSer)) + val ts2 = tupleSetter.asInstanceOf[TupleSetter[(K, Boxed[V1])]].contraMap { kv1: (K, V1) => + (kv1._1, boxfn(kv1._2)) + } + (Some(valueF), ts2) + case vs => + val vord = Field.singleOrdered("value")(vs) + (Some(vord), tupleSetter) + } + .getOrElse((None, tupleSetter)) + + val p = mapped.toPipe(kvFields, flowDef, TupleSetter.asSubSetter(ts)) + + RichPipe(p).groupBy(fields) { inGb => + val withSort = sortOpt.fold(inGb)(inGb.sortBy) + gb(withSort) + } + } + + val tupConv = tuple2Conv[K, V2](rs.keyOrdering) + CascadingPipe(pipe, kvFields, flowDef, tupConv) + } + + rs match { + case ir @ IdentityReduce(_, _, None, descriptions, _) => + type CP[V] = CascadingPipe[_ <: (K, V)] + // Not doing anything + ir.evidence.subst[CP](mapped.copy(pipe = RichPipe.setPipeDescriptions(mapped.pipe, descriptions))) + case uir @ UnsortedIdentityReduce(_, _, None, descriptions, _) => + type CP[V] = CascadingPipe[_ <: (K, V)] + // Not doing anything + uir.evidence.subst[CP](mapped.copy(pipe = RichPipe.setPipeDescriptions(mapped.pipe, descriptions))) + case IdentityReduce(_, _, Some(reds), descriptions, _) => + groupOp(_.reducers(reds).setDescriptions(descriptions)) + case UnsortedIdentityReduce(_, _, Some(reds), descriptions, _) => + // This is weird, but it is sometimes used to force a partition + groupOp(_.reducers(reds).setDescriptions(descriptions)) + case ivsr @ IdentityValueSortedReduce(_, _, _, _, _, _) => + 
groupOpWithValueSort(Some(ivsr.valueSort)) { gb => + // If its an ordered serialization we need to unbox + val mappedGB = + if (ivsr.valueSort.isInstanceOf[OrderedSerialization[_]]) + gb.mapStream[Boxed[V1], V1](valueField -> valueField) { it: Iterator[Boxed[V1]] => + it.map(_.get) + } + else + gb + + mappedGB + .reducers(ivsr.reducers.getOrElse(-1)) + .setDescriptions(ivsr.descriptions) + } + case vsr @ ValueSortedReduce(_, _, _, _, _, _) => + val optVOrdering = Some(vsr.valueSort) + groupOpWithValueSort(optVOrdering) { + // If its an ordered serialization we need to unbox + // the value before handing it to the users operation + _.every( + new cascading.pipe.Every( + _, + valueField, + new TypedBufferOp[K, V1, V2]( + keyConverter(vsr.keyOrdering), + valueConverter(optVOrdering), + vsr.reduceFn, + valueField + ), + Fields.REPLACE + ) + ) + .reducers(vsr.reducers.getOrElse(-1)) + .setDescriptions(vsr.descriptions) + } + case imr @ IteratorMappedReduce(_, _, _, _, _) => + groupOp { + _.every( + new cascading.pipe.Every( + _, + valueField, + new TypedBufferOp( + keyConverter(imr.keyOrdering), + TupleConverter.singleConverter[V1], + imr.reduceFn, + valueField + ), + Fields.REPLACE + ) + ) + .reducers(imr.reducers.getOrElse(-1)) + .setDescriptions(imr.descriptions) + } + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingExtensions.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingExtensions.scala new file mode 100644 index 0000000000..b034927bae --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CascadingExtensions.scala @@ -0,0 +1,556 @@ +package com.twitter.scalding.typed.cascading_backend + +import cascading.flow.{Flow, FlowConnector, FlowDef} +import cascading.pipe.Pipe +import cascading.tap.Tap +import cascading.tuple.{Fields, TupleEntryIterator} +import com.twitter.scalding.cascading_interop.FlowListenerPromise +import 
com.twitter.scalding.filecache.{CachedFile, DistributedCacheFile} +import com.twitter.scalding.mathematics.{Matrix2, MatrixLiteral} +import com.twitter.scalding.typed.KeyedListLike +import org.apache.hadoop.conf.Configuration +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future} +import scala.util.Try + +import com.twitter.scalding._ + +import TupleConverter.singleConverter +import scala.collection.JavaConverters._ + +trait CascadingExtensions { + implicit class TypedPipeCompanionCascadingExtensions(tp: TypedPipe.type) { + + /** + * Create a TypedPipe from a cascading Pipe, some Fields and the type T Avoid this if you can. Prefer + * from(TypedSource). + */ + def fromPipe[T](pipe: Pipe, fields: Fields)(implicit + flowDef: FlowDef, + mode: Mode, + conv: TupleConverter[T] + ): TypedPipe[T] = { + + /* + * This could be in TypedSource, but we don't want to encourage users + * to work directly with Pipe + */ + case class WrappingSource[T]( + pipe: Pipe, + fields: Fields, + @transient localFlow: FlowDef, // FlowDef is not serializable. We shouldn't need to, but being paranoid + mode: Mode, + conv: TupleConverter[T] + ) extends TypedSource[T] { + + def converter[U >: T]: TupleConverter[U] = + TupleConverter.asSuperConverter[T, U](conv) + + def read(implicit fd: FlowDef, m: Mode): Pipe = { + // This check is not likely to fail unless someone does something really strange. + // for historical reasons, it is not checked by the typed system + require( + m == mode, + s"Cannot switch Mode between TypedPipe.from and toPipe calls. 
Pipe: $pipe, pipe mode: $m, outer mode: $mode" + ) + Dsl.flowDefToRichFlowDef(fd).mergeFrom(localFlow) + pipe + } + + override def sourceFields: Fields = fields + } + + val localFlow = Dsl.flowDefToRichFlowDef(flowDef).onlyUpstreamFrom(pipe) + TypedPipe.from(WrappingSource(pipe, fields, localFlow, mode, conv)) + } + + /** + * Input must be a Pipe with exactly one Field Avoid this method and prefer from(TypedSource) if possible + */ + def fromSingleField[T](pipe: Pipe)(implicit fd: FlowDef, mode: Mode): TypedPipe[T] = + fromPipe(pipe, new Fields(0))(fd, mode, singleConverter[T]) + + } + + abstract class TypedPipeLikeExtensions[A, T <: A] { + def toTypedPipe: TypedPipe[T] + + /** + * Export back to a raw cascading Pipe. useful for interop with the scalding Fields API or with Cascading + * code. Avoid this if possible. Prefer to write to TypedSink. + */ + final def toPipe[U >: T]( + fieldNames: Fields + )(implicit flowDef: FlowDef, mode: Mode, setter: TupleSetter[U]): Pipe = + // we have to be cafeful to pass the setter we want since a low priority implicit can always be + // found :( + CascadingBackend.toPipe[U](toTypedPipe.withLine, fieldNames)(flowDef, mode, setter) + + /** use a TupleUnpacker to flatten U out into a cascading Tuple */ + def unpackToPipe[U >: T]( + fieldNames: Fields + )(implicit fd: FlowDef, mode: Mode, up: TupleUnpacker[U]): Pipe = { + val setter = up.newSetter(fieldNames) + toPipe[U](fieldNames)(fd, mode, setter) + } + + /** + * If you want to writeThrough to a specific file if it doesn't already exist, and otherwise just read + * from it going forward, use this. + */ + def make[U >: T](dest: Source with TypedSink[T] with TypedSource[U]): Execution[TypedPipe[U]] = + Execution.getMode.flatMap { mode => + try { + dest.validateTaps(mode) + Execution.from(TypedPipe.from(dest)) + } catch { + case ivs: InvalidSourceException => toTypedPipe.writeThrough(dest) + } + } + + /** + * Safely write to a TypedSink[T]. 
If you want to write to a Source (not a Sink) you need to do something + * like: toPipe(fieldNames).write(dest) + * @return + * a pipe equivalent to the current pipe. + */ + def write(dest: TypedSink[T])(implicit flowDef: FlowDef, mode: Mode): TypedPipe[T] = { + // We do want to record the line number that this occurred at + val next = toTypedPipe.withLine + FlowStateMap.merge(flowDef, FlowState.withTypedWrite(next, dest, mode)) + next + } + } + + implicit class TypedPipeCascadingExtensions[T](val toTypedPipe: TypedPipe[T]) + extends TypedPipeLikeExtensions[Any, T] + + implicit class KeyedListCascadingExtensions[K, V, S[K, +V] <: KeyedListLike[K, V, S]]( + keyed: KeyedListLike[K, V, S] + ) extends TypedPipeLikeExtensions[(K, V), (K, V)] { + def toTypedPipe = keyed.toTypedPipe + } + + implicit class ValuePipeCascadingExtensions[T](value: ValuePipe[T]) + extends TypedPipeLikeExtensions[Any, T] { + def toTypedPipe = value.toTypedPipe + } + + implicit class MappableCascadingExtensions[T](mappable: Mappable[T]) + extends TypedPipeLikeExtensions[Any, T] { + def toTypedPipe = TypedPipe.from(mappable) + } + + implicit class ExecutionCompanionCascadingExtensions(ex: Execution.type) { + def fromFn(fn: (Config, Mode) => FlowDef): Execution[Unit] = + Execution.backendSpecific(CascadingExtensions.FromFnToBackend(fn)) + + /** + * Distributes the file onto each map/reduce node, so you can use it for Scalding source creation and + * TypedPipe, KeyedList, etc. transformations. Using the [[com.twitter.scalding.filecache.CachedFile]] + * outside of Execution will probably not work. + * + * For multiple files you must nested your execution, see docs of + * [[com.twitter.scalding.filecache.DistributedCacheFile]] + */ + def withCachedFile[T](path: String)(fn: CachedFile => Execution[T]): Execution[T] = + DistributedCacheFile.execution(path)(fn) + + /* + * This runs a Flow using Cascading's built in threads. 
The resulting JobStats + * are put into a promise when they are ready + */ + def run[C](flow: Flow[C]): Future[JobStats] = + // This is in Java because of the cascading API's raw types on FlowListener + FlowListenerPromise.start(flow, { f: Flow[C] => JobStats(f.getFlowStats) }) + private def run[L, C](label: L, flow: Flow[C]): Future[(L, JobStats)] = + // This is in Java because of the cascading API's raw types on FlowListener + FlowListenerPromise.start(flow, { f: Flow[C] => (label, JobStats(f.getFlowStats)) }) + + /* + * This blocks the current thread until the job completes with either success or + * failure. + */ + def waitFor[C](flow: Flow[C]): Try[JobStats] = + Try { + flow.complete() + JobStats(flow.getStats) + } + + } + + @deprecated( + "Use CascadingMode.cast(mode) or pattern match directly on known CascadingModes (e.g. Hdfs, Local)", + "0.18.0" + ) + implicit class DeprecatedCascadingModeExtensions(mode: Mode) { + private def cmode: CascadingMode = CascadingMode.cast(mode) + + def openForRead(config: Config, tap: Tap[_, _, _]): TupleEntryIterator = + cmode.openForRead(config, tap) + + def openForRead(tap: Tap[_, _, _]): TupleEntryIterator = + openForRead(Config.defaultFrom(mode), tap) + + // Returns true if the file exists on the current filesystem. 
+ def fileExists(filename: String): Boolean = + cmode.fileExists(filename) + + /** Create a new FlowConnector for this cascading planner */ + def newFlowConnector(props: Config): FlowConnector = + cmode.newFlowConnector(props) + } + + implicit class ModeCompanionExtensions(m: Mode.type) { + def CascadingFlowConnectorClassKey = "cascading.flow.connector.class" + def CascadingFlowProcessClassKey = "cascading.flow.process.class" + + def DefaultHadoopFlowConnector = "cascading.flow.hadoop.HadoopFlowConnector" + def DefaultHadoopFlowProcess = "cascading.flow.hadoop.HadoopFlowProcess" + + def DefaultHadoop2Mr1FlowConnector = "cascading.flow.hadoop2.Hadoop2MR1FlowConnector" + def DefaultHadoop2Mr1FlowProcess = + "cascading.flow.hadoop.HadoopFlowProcess" // no Hadoop2MR1FlowProcess as of Cascading 3.0.0-wip-75? + + def DefaultHadoop2TezFlowConnector = "cascading.flow.tez.Hadoop2TezFlowConnector" + def DefaultHadoop2TezFlowProcess = "cascading.flow.tez.Hadoop2TezFlowProcess" + + // This should be passed ALL the args supplied after the job name + def apply(args: Args, config: Configuration): Mode = { + val strictSources = args.boolean("tool.partialok") == false + if (!strictSources) { + // TODO we should do smarter logging here + println("[Scalding:INFO] using --tool.partialok. Missing log data won't cause errors.") + } + + if (args.boolean("local")) + Local(strictSources) + else if ( + args.boolean("hdfs") + ) /* FIXME: should we start printing deprecation warnings ? 
It's okay to set manually c.f.*.class though */ + Hdfs(strictSources, config) + else if (args.boolean("hadoop1")) { + config.set(CascadingFlowConnectorClassKey, DefaultHadoopFlowConnector) + config.set(CascadingFlowProcessClassKey, DefaultHadoopFlowProcess) + Hdfs(strictSources, config) + } else if (args.boolean("hadoop2-mr1")) { + config.set(CascadingFlowConnectorClassKey, DefaultHadoop2Mr1FlowConnector) + config.set(CascadingFlowProcessClassKey, DefaultHadoop2Mr1FlowProcess) + Hdfs(strictSources, config) + } else if (args.boolean("hadoop2-tez")) { + config.set(CascadingFlowConnectorClassKey, DefaultHadoop2TezFlowConnector) + config.set(CascadingFlowProcessClassKey, DefaultHadoop2TezFlowProcess) + Hdfs(strictSources, config) + } else + throw ArgsException( + "[ERROR] Mode must be one of --local, --hadoop1, --hadoop2-mr1, --hadoop2-tez or --hdfs, you provided none" + ) + } + + } + + implicit class JobStatsCompanionCascadingExtensions(jobstat: JobStats.type) { + + import cascading.stats.{CascadeStats, CascadingStats, FlowStats} + + def apply(stats: CascadingStats): JobStats = { + val m: Map[String, Any] = statsMap(stats) + new JobStats(stats match { + case cs: CascadeStats => m + case fs: FlowStats => m + ("flow_step_stats" -> fs.getFlowStepStats.asScala.map(statsMap)) + }) + } + + private def counterMap(stats: CascadingStats): Map[String, Map[String, Long]] = + stats.getCounterGroups.asScala.map { group => + ( + group, + stats + .getCountersFor(group) + .asScala + .map { counter => + (counter, stats.getCounterValue(group, counter)) + } + .toMap + ) + }.toMap + + private def statsMap(stats: CascadingStats): Map[String, Any] = + Map( + "counters" -> counterMap(stats), + "duration" -> stats.getDuration, + "finished_time" -> stats.getFinishedTime, + "id" -> stats.getID, + "name" -> stats.getName, + "run_time" -> stats.getRunTime, + "start_time" -> stats.getStartTime, + "submit_time" -> stats.getSubmitTime, + "failed" -> stats.isFailed, + "skipped" -> stats.isSkipped, + 
"stopped" -> stats.isStopped, + "successful" -> stats.isSuccessful + ) + + } + + implicit class ConfigCascadingExtensions(config: Config) { + import cascading.flow.{FlowListener, FlowProps, FlowStepListener, FlowStepStrategy} + import com.twitter.bijection.{Base64String, Injection} + import com.twitter.chill.{Externalizer, ExternalizerCodec, ExternalizerInjection, KryoInstantiator} + import com.twitter.chill.config.{ConfiguredInstantiator, ScalaMapConfig} + import com.twitter.scalding.filecache.{CachedFile, DistributedCacheFile} + import org.apache.hadoop.mapred.JobConf + import org.apache.hadoop.io.serializer.{Serialization => HSerialization} + + /** + * Add files to be localized to the config. Intended to be used by user code. + * @param cachedFiles + * CachedFiles to be added + * @return + * new Config with cached files + */ + def addDistributedCacheFiles(cachedFiles: CachedFile*): Config = + DistributedCacheFile.addDistributedCacheFiles(config, cachedFiles: _*) + + /** + * Get cached files from config + */ + def getDistributedCachedFiles: Seq[CachedFile] = + DistributedCacheFile.getDistributedCachedFiles(config) + + def getCascadingSerializationTokens: Map[Int, String] = + config + .get(Config.CascadingSerializationTokens) + .map(CascadingTokenUpdater.parseTokens) + .getOrElse(Map.empty[Int, String]) + + /* + * If a ConfiguredInstantiator has been set up, this returns it + */ + def getKryo: Option[KryoInstantiator] = + if (config.toMap.contains(ConfiguredInstantiator.KEY)) + Some((new ConfiguredInstantiator(ScalaMapConfig(config.toMap))).getDelegate) + else None + + /** + * This function gets the set of classes that have been registered to Kryo. 
They may or may not be used in + * this job, but Cascading might want to be made aware that these classes exist + */ + def getKryoRegisteredClasses: Set[Class[_]] = + // Get an instance of the Kryo serializer (which is populated with registrations) + config.getKryo + .map { kryo => + val cr = kryo.newKryo.getClassResolver + + @annotation.tailrec + def kryoClasses(idx: Int, acc: Set[Class[_]]): Set[Class[_]] = + Option(cr.getRegistration(idx)) match { + case Some(reg) => kryoClasses(idx + 1, acc + reg.getType) + case None => acc // The first null is the end of the line + } + + kryoClasses(0, Set[Class[_]]()) + } + .getOrElse(Set()) + + /* + * Hadoop and Cascading serialization needs to be first, and the Kryo serialization + * needs to be last and this method handles this for you: + * hadoop, cascading, [userHadoop,] kyro + * is the order. + * + * Kryo uses the ConfiguredInstantiator, which is configured either by reflection: + * Right(classOf[MyInstantiator]) or by serializing given Instantiator instance + * with a class to serialize to bootstrap the process: + * Left((classOf[serialization.KryoHadoop], myInstance)) + */ + def setSerialization( + kryo: Either[(Class[_ <: KryoInstantiator], KryoInstantiator), Class[_ <: KryoInstantiator]], + userHadoop: Seq[Class[_ <: HSerialization[_]]] = Nil + ): Config = { + + // Hadoop and Cascading should come first + val first: Seq[Class[_ <: HSerialization[_]]] = + Seq( + classOf[org.apache.hadoop.io.serializer.WritableSerialization], + classOf[cascading.tuple.hadoop.TupleSerialization], + classOf[serialization.WrappedSerialization[_]] + ) + // this must come last + val last: Seq[Class[_ <: HSerialization[_]]] = Seq(classOf[com.twitter.chill.hadoop.KryoSerialization]) + val required = (first ++ last).toSet[AnyRef] // Class is invariant, but we use it as a function + // Make sure we keep the order correct and don't add the required fields twice + val hadoopSer = first ++ (userHadoop.filterNot(required)) ++ last + + val hadoopKV 
= Config.IoSerializationsKey -> hadoopSer.map(_.getName).mkString(",") + + // Now handle the Kryo portion which uses another mechanism + val chillConf = ScalaMapConfig(config.toMap) + kryo match { + case Left((bootstrap, inst)) => ConfiguredInstantiator.setSerialized(chillConf, bootstrap, inst) + case Right(refl) => ConfiguredInstantiator.setReflect(chillConf, refl) + } + val withKryo = Config(chillConf.toMap + hadoopKV) + + val kryoClasses = withKryo.getKryoRegisteredClasses + .filterNot(_.isPrimitive) // Cascading handles primitives and arrays + .filterNot(_.isArray) + + withKryo.addCascadingClassSerializationTokens(kryoClasses) + } + + def setDefaultComparator(clazz: Class[_ <: java.util.Comparator[_]]): Config = + config + (FlowProps.DEFAULT_ELEMENT_COMPARATOR -> clazz.getName) + + /** + * The serialization of your data will be smaller if any classes passed between tasks in your job are + * listed here. Without this, strings are used to write the types IN EACH RECORD, which compression + * probably takes care of, but compression acts AFTER the data is serialized into buffers and spilling has + * been triggered. + */ + def addCascadingClassSerializationTokens(clazzes: Set[Class[_]]): Config = + CascadingTokenUpdater.update(config, clazzes) + + def getSubmittedTimestamp: Option[RichDate] = + config.get(Config.ScaldingFlowSubmittedTimestamp).map(ts => RichDate(ts.toLong)) + /* + * Sets the timestamp only if it was not already set. 
This is here + * to prevent overwriting the submission time if it was set by an + * previously (or externally) + */ + def maybeSetSubmittedTimestamp(date: RichDate = RichDate.now): (Option[RichDate], Config) = + config.update(Config.ScaldingFlowSubmittedTimestamp) { + case s @ Some(ts) => (s, Some(RichDate(ts.toLong))) + case None => (Some(date.timestamp.toString), None) + } + + /** + * configure flow listeneres for observability + */ + def addFlowListener(flowListenerProvider: (Mode, Config) => FlowListener): Config = { + val serializedListener = flowListenerSerializer(flowListenerProvider) + config + .update(Config.FlowListeners) { + case None => (Some(serializedListener), ()) + case Some(lst) => (Some(s"$serializedListener,$lst"), ()) + } + ._2 + } + + def getFlowListeners: List[Try[(Mode, Config) => FlowListener]] = + config + .get(Config.FlowListeners) + .toList + .flatMap(s => StringUtility.fastSplit(s, ",")) + .map(flowListenerSerializer.invert(_)) + + def addFlowStepListener(flowListenerProvider: (Mode, Config) => FlowStepListener): Config = { + val serializedListener = flowStepListenerSerializer(flowListenerProvider) + config + .update(Config.FlowStepListeners) { + case None => (Some(serializedListener), ()) + case Some(lst) => (Some(s"$serializedListener,$lst"), ()) + } + ._2 + } + + def getFlowStepListeners: List[Try[(Mode, Config) => FlowStepListener]] = + config + .get(Config.FlowStepListeners) + .toList + .flatMap(s => StringUtility.fastSplit(s, ",")) + .map(flowStepListenerSerializer.invert(_)) + + def addFlowStepStrategy(flowStrategyProvider: (Mode, Config) => FlowStepStrategy[JobConf]): Config = { + val serializedListener = flowStepStrategiesSerializer(flowStrategyProvider) + config + .update(Config.FlowStepStrategies) { + case None => (Some(serializedListener), ()) + case Some(lst) => (Some(s"$serializedListener,$lst"), ()) + } + ._2 + } + + def clearFlowStepStrategies: Config = + config.-(Config.FlowStepStrategies) + + def getFlowStepStrategies: 
List[Try[(Mode, Config) => FlowStepStrategy[JobConf]]] = + config + .get(Config.FlowStepStrategies) + .toList + .flatMap(s => StringUtility.fastSplit(s, ",")) + .map(flowStepStrategiesSerializer.invert(_)) + + private[this] def buildInj[T: ExternalizerInjection: ExternalizerCodec]: Injection[T, String] = + Injection.connect[T, Externalizer[T], Array[Byte], Base64String, String] + + private[scalding] def flowStepListenerSerializer = + buildInj[(Mode, Config) => FlowStepListener] + private[scalding] def flowListenerSerializer = buildInj[(Mode, Config) => FlowListener] + private[scalding] def flowStepStrategiesSerializer = + buildInj[(Mode, Config) => FlowStepStrategy[JobConf]] + private[scalding] def argsSerializer = buildInj[Map[String, List[String]]] + } + + implicit class ConfigCompanionCascadingExtensions(config: Config.type) { + import org.apache.hadoop.conf.Configuration + /* + * Note that Hadoop Configuration is mutable, but Config is not. So a COPY is + * made on calling here. If you need to update Config, you do it by modifying it. + * This copy also forces all expressions in values to be evaluated, freezing them + * as well. + */ + def fromHadoop(conf: Configuration): Config = + // use `conf.get` to force JobConf to evaluate expressions + Config(conf.asScala.map(e => e.getKey -> conf.get(e.getKey)).toMap) + + /* + * For everything BUT SERIALIZATION, this prefers values in conf, + * but serialization is generally required to be set up with Kryo + * (or some other system that handles general instances at runtime). 
+ */ + def hadoopWithDefaults(conf: Configuration): Config = + (Config.default ++ fromHadoop(conf)) + .setSerialization(Right(classOf[serialization.KryoHadoop])) + .setScaldingVersion + .setHRavenHistoryUserName + } + + implicit class UniqueIDCompanionCascadingExtensions(uid: UniqueID.type) { + def getIDFor(implicit fd: FlowDef): UniqueID = + /* + * In real deploys, this can even be a constant, but for testing + * we need to allocate unique IDs to prevent different jobs running + * at the same time from touching each other's counters. + */ + UniqueID.fromSystemHashCode(fd) + } + + implicit class Matrix2Extensions[R, C, V](mat: Matrix2[R, C, V]) { + def write(sink: TypedSink[(R, C, V)])(implicit fd: FlowDef, m: Mode): Matrix2[R, C, V] = { + import mat.{rowOrd, colOrd} + MatrixLiteral(mat.toTypedPipe.write(sink), mat.sizeHint) + } + } +} + +object CascadingExtensions extends CascadingExtensions { + // This case class preserves equality on Executions + private case class FromFnToBackend(fn: (Config, Mode) => FlowDef) + extends Function4[Config, Mode, Execution.Writer, ConcurrentExecutionContext, CFuture[ + (Long, ExecutionCounters, Unit) + ]] { + def apply(conf: Config, mode: Mode, writer: Execution.Writer, ec: ConcurrentExecutionContext) = + writer match { + case afdr: AsyncFlowDefRunner => + afdr + .validateAndRun(conf)(fn(_, mode))(ec) + .map { case (id, cnt) => (id, cnt, ()) }(ec) + case _ => + CFuture.failed( + new IllegalArgumentException( + s"Execution.fromFn requires cascading Mode producing AsyncFlowDefRunner, found mode: $mode and writer ${writer.getClass}: $writer" + ) + ) + } + } +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala new file mode 100644 index 0000000000..7b84a9ded6 --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/CoGroupJoiner.scala @@ -0,0 +1,64 
@@ +package com.twitter.scalding.typed.cascading_backend + +import cascading.pipe.joiner.{Joiner => CJoiner, JoinerClosure} +import cascading.tuple.{Tuple => CTuple} +import com.twitter.scalding.TupleGetter +import com.twitter.scalding.serialization.{Externalizer, MultiJoinExternalizer} +import scala.collection.JavaConverters._ +import com.twitter.scalding.typed.MultiJoinFunction + +abstract class CoGroupedJoiner[K]( + inputSize: Int, + getter: TupleGetter[K], + inJoinFunction: MultiJoinFunction[K, Any] +) extends CJoiner { + + /** + * We have a test that should fail if Externalizer is not used here. you can test failure of that test by + * replacing Externalizer with Some + */ + val joinFunction = Externalizer(MultiJoinExternalizer.externalize(inJoinFunction)) + val distinctSize: Int + def distinctIndexOf(originalPos: Int): Int + + // This never changes. Compute it once + protected val restIndices: IndexedSeq[Int] = (1 until inputSize).map { idx => + val didx = distinctIndexOf(idx) + assert(didx > 0, "the left most can only be iterated once") + didx + } + + override def getIterator(jc: JoinerClosure) = { + val iters = (0 until distinctSize).map(jc.getIterator(_).asScala.buffered) + // This use of `_.get` is safe, but difficult to prove in the types. 
+ @SuppressWarnings(Array("org.wartremover.warts.OptionPartial")) + val keyTuple = + iters.collectFirst { case iter if iter.nonEmpty => iter.head }.get // One of these must have a key + val key = getter.get(keyTuple, 0) + + def unbox(it: Iterator[CTuple]): Iterator[Any] = + it.map(_.getObject(1): Any) + + val leftMost = unbox(iters.head) // linter:disable:UndesirableTypeInference + + def toIterable(didx: Int) = + new Iterable[Any] { + def iterator = unbox(jc.getIterator(didx).asScala) + } + + val rest = restIndices.map(toIterable(_)) + joinFunction + .get(key, leftMost, rest) + .map { rval => + // There always has to be the same number of resulting fields as input + // or otherwise the flow planner will throw + val res = CTuple.size(distinctSize) + res.set(0, key) + res.set(1, rval) + res + } + .asJava + } + + override def numJoins = distinctSize - 1 +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala new file mode 100644 index 0000000000..f1335b562e --- /dev/null +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/DistinctCoGroupJoiner.scala @@ -0,0 +1,14 @@ +package com.twitter.scalding.typed.cascading_backend + +import com.twitter.scalding.TupleGetter +import com.twitter.scalding.typed.MultiJoinFunction + +// If all the input pipes are unique, this works: +class DistinctCoGroupJoiner[K]( + count: Int, + getter: TupleGetter[K], + @transient joinF: MultiJoinFunction[K, Any] +) extends CoGroupedJoiner[K](count, getter, joinF) { + val distinctSize = count + def distinctIndexOf(idx: Int) = idx +} diff --git a/scalding-core/src/main/scala/com/twitter/scalding/typed/HashJoiner.scala b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala similarity index 56% rename from 
scalding-core/src/main/scala/com/twitter/scalding/typed/HashJoiner.scala rename to scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala index 1d89013fd3..71475503bc 100644 --- a/scalding-core/src/main/scala/com/twitter/scalding/typed/HashJoiner.scala +++ b/scalding-core/src/main/scala/com/twitter/scalding/typed/cascading_backend/HashJoiner.scala @@ -12,42 +12,58 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ -package com.twitter.scalding.typed + */ +package com.twitter.scalding.typed.cascading_backend import cascading.pipe.joiner.{Joiner => CJoiner, JoinerClosure} -import cascading.tuple.{Tuple => CTuple, Fields, TupleEntry} +import cascading.tuple.{Tuple => CTuple} -import com.twitter.scalding._ +import com.twitter.scalding.serialization.{Externalizer, MultiJoinExternalizer} +import com.twitter.scalding.typed.MultiJoinFunction import scala.collection.JavaConverters._ /** * Only intended to be use to implement the hashCogroup on TypedPipe/Grouped */ -class HashJoiner[K,V,W,R](rightGetter: (K, Iterator[CTuple], Seq[Iterable[CTuple]]) => Iterator[W], - joiner: (K, V, Iterable[W]) => Iterator[R]) extends CJoiner { +class HashJoiner[K, V, W, R]( + rightHasSingleValue: Boolean, + rightGetter: MultiJoinFunction[K, W], + joiner: (K, V, Iterable[W]) => Iterator[R] +) extends CJoiner { + + private[this] val rightGetterEx = Externalizer(MultiJoinExternalizer.externalize(rightGetter)) + private[this] val joinEx = Externalizer(joiner) override def getIterator(jc: JoinerClosure) = { // The left one cannot be iterated multiple times on Hadoop: val leftIt = jc.getIterator(0).asScala // should only be 0 or 1 here - if(leftIt.isEmpty) { + if (leftIt.isEmpty) { (Iterator.empty: Iterator[CTuple]).asJava // java is not covariant so we need this - } - 
else { + } else { + // In this branch there must be at least one item on the left in a hash-join val left = leftIt.buffered - // There must be at least one item on the left in a hash-join val key = left.head.getObject(0).asInstanceOf[K] // It is safe to iterate over the right side again and again - val rightIterable = new Iterable[W] { - def iterator = rightGetter(key, jc.getIterator(1).asScala, Nil) - } + + val rightIterable = + if (rightHasSingleValue) { + // Materialize this once for all left values + rightGetterEx.get(key, jc.getIterator(1).asScala.map(_.getObject(1): Any), Nil).toList + } else { + // TODO: it might still be good to count how many there are and materialize + // in memory without reducing again + new Iterable[W] { + def iterator = rightGetterEx.get(key, jc.getIterator(1).asScala.map(_.getObject(1): Any), Nil) + } + } left.flatMap { kv => val leftV = kv.getObject(1).asInstanceOf[V] // get just the Vs - joiner(key, leftV, rightIterable) + joinEx + .get(key, leftV, rightIterable) .map { rval => // There always has to be four resulting fields // or otherwise the flow planner will throw @@ -59,5 +75,6 @@ class HashJoiner[K,V,W,R](rightGetter: (K, Iterator[CTuple], Seq[Iterable[CTuple }.asJava } } + override val numJoins = 1 } diff --git a/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_SUCCESS b/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_ignored b/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/05/_ignored new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/2013-07.txt b/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/2013-07.txt new 
file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/_SUCCESS/_ignored b/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/07/_SUCCESS/_ignored new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/2013-08.txt b/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/2013-08.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/_SUCCESS b/scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/test_data/2013/08/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala index 6a3e643aa0..6b30c0328c 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/AlgebraicReductionsTest.scala @@ -12,61 +12,59 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.specs._ -/** - */ -class AlgebraJob(args : Args) extends Job(args) { - Tsv("input", ('x,'y,'z,'w)) - .map('w -> 'w) { w : Int => Set(w) } +import org.scalatest.{Matchers, WordSpec} + +class AlgebraJob(args: Args) extends Job(args) { + Tsv("input", ('x, 'y, 'z, 'w)) + .map('w -> 'w) { w: Int => Set(w) } .groupBy('x) { - _.sum[(Int,Int)](('y,'z) -> ('sy, 'sz)) - .sum[Set[Int]]('w -> 'setw) - .times[(Int,Int)](('y, 'z) -> ('py, 'pz)) - .dot[Int]('y,'z,'ydotz) + _.sum[(Int, Int)](('y, 'z) -> ('sy, 'sz)) + .sum[Set[Int]]('w -> 'setw) + .times[(Int, Int)](('y, 'z) -> ('py, 'pz)) + .dot[Int]('y, 'z, 'ydotz) } .write(Tsv("output")) } -class ComplicatedAlgebraJob(args : Args) extends Job(args) { - Tsv("input", ('x,'y,'z,'w,'v)) - .map('w -> 'w) { w : Int => Set(w) } +class ComplicatedAlgebraJob(args: Args) extends Job(args) { + Tsv("input", ('x, 'y, 'z, 'w, 'v)) + .map('w -> 'w) { w: Int => Set(w) } .groupBy('x) { - _.sum[(Int,Int,Set[Int],Double)](('y,'z,'w,'v) -> ('sy,'sz,'sw,'sv)) + _.sum[(Int, Int, Set[Int], Double)](('y, 'z, 'w, 'v) -> ('sy, 'sz, 'sw, 'sv)) } .write(Tsv("output")) } -class AlgebraJobTest extends Specification { - noDetailedDiffs() +class AlgebraJobTest extends WordSpec with Matchers { import Dsl._ - val inputData = List((1,2,3,5),(1,4,5,7),(2,1,0,7)) - val correctOutput = List((1,6,8,Set(5,7), 8,15,(6 + 20)),(2,1,0,Set(7),1,0,0)) + val inputData = List((1, 2, 3, 5), (1, 4, 5, 7), (2, 1, 0, 7)) + val correctOutput = List((1, 6, 8, Set(5, 7), 8, 15, (6 + 20)), (2, 1, 0, Set(7), 1, 0, 0)) "A AlgebraJob" should { - JobTest("com.twitter.scalding.AlgebraJob") - .source(Tsv("input",('x,'y,'z,'w)), inputData) + JobTest(new AlgebraJob(_)) + .source(Tsv("input", ('x, 'y, 'z, 'w)), inputData) .sink[(Int, Int, Int, Set[Int], Int, Int, Int)](Tsv("output")) { buf => "correctly do algebra" in { - buf.toList must be_==(correctOutput) + buf.toList shouldBe correctOutput } } .run - .finish + .finish() } - val inputData2 
= List((1,2,3,5,1.2),(1,4,5,7,0.1),(2,1,0,7,3.2)) - val correctOutput2 = List((1,6,8,Set(5,7),1.3),(2,1,0,Set(7),3.2)) + val inputData2 = List((1, 2, 3, 5, 1.2), (1, 4, 5, 7, 0.1), (2, 1, 0, 7, 3.2)) + val correctOutput2 = List((1, 6, 8, Set(5, 7), 1.3), (2, 1, 0, Set(7), 3.2)) "A ComplicatedAlgebraJob" should { - JobTest("com.twitter.scalding.ComplicatedAlgebraJob") - .source(Tsv("input",('x,'y,'z,'w,'v)), inputData2) + JobTest(new ComplicatedAlgebraJob(_)) + .source(Tsv("input", ('x, 'y, 'z, 'w, 'v)), inputData2) .sink[(Int, Int, Int, Set[Int], Double)](Tsv("output")) { buf => "correctly do complex algebra" in { - buf.toList must be_==(correctOutput2) + buf.toList shouldBe correctOutput2 } } .run - .finish + .finish() } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala new file mode 100644 index 0000000000..9acb3d5272 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ArgHelpTest.scala @@ -0,0 +1,80 @@ +package com.twitter.scalding + +import org.scalatest.{Matchers, WordSpec} + +case class ArgHelperTest(testFn: Seq[DescribedArg] => Unit) extends ArgHelper { + override def helpRequest(describedArgs: Seq[DescribedArg]): Nothing = { + testFn(describedArgs) + throw new HelpException() + } +} + +class ArgHelpTest extends WordSpec with Matchers { + def job = TypedPipe.from(List(1, 2, 3)).toIterableExecution + + "ArgHelper" should { + "print help when asked" in { + var helpCalled = false + val helper = ArgHelperTest((describeArgs: Seq[DescribedArg]) => helpCalled = true) + + val args = List(OptionalArg("name", "Name of person")) + val config = Config.unitTestDefault.setArgs(Args("--help")) + + intercept[HelpException] { + helper.describe(args, job).waitFor(config, Local(true)).get + } + assert(helpCalled, "Help function was called") + } + } + + it should { + "run job without help" in { + var helpCalled = false + val helper = 
ArgHelperTest((describeArgs: Seq[DescribedArg]) => helpCalled = true) + + val args = List(OptionalArg("name", "Name of person")) + val config = Config.unitTestDefault.setArgs(Args("")) + + val returnValues = helper.describe(args, job).waitFor(config, Local(true)).get.toList + assert(!helpCalled, "Help function was not called") + assert(returnValues == List(1, 2, 3)) + } + } + + it should { + "call help even when given missing args" in { + var helpCalled = false + val helper = ArgHelperTest((describeArgs: Seq[DescribedArg]) => helpCalled = true) + + val args = List(OptionalArg("name", "Name of person")) + val config = Config.unitTestDefault.setArgs(Args(List("--help", "--name", "Bill", "--phone", "111"))) + + intercept[HelpException] { + helper.validatedDescribe(args, job).waitFor(config, Local(true)).get + } + assert(helpCalled, "Help was output") + } + } + + it should { + "not fail when all args are described" in { + val args = List(OptionalArg("name", "Name of person"), OptionalArg("phone", "Person's phone")) + val config = Config.unitTestDefault.setArgs(Args(List("--name", "Bill", "--phone", "111"))) + + val returnValues = ArgHelp.validatedDescribe(args, job).waitFor(config, Local(true)).get + assert(returnValues == List(1, 2, 3)) + } + } + + it should { + "fail when all args are not described" in { + val args = List(OptionalArg("name", "Name of person"), OptionalArg("phone", "Person's phone")) + val config = + Config.unitTestDefault.setArgs(Args(List("--name", "Bill", "--phone", "111", "--address", "123"))) + + intercept[DescriptionValidationException] { + ArgHelp.validatedDescribe(args, job.unit).waitFor(config, Local(true)).get + } + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala index a7f0a72b3e..16aa6a383a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala +++ 
b/scalding-core/src/test/scala/com/twitter/scalding/BlockJoinTest.scala @@ -12,18 +12,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import cascading.pipe.joiner._ -import java.lang.reflect.InvocationTargetException - import scala.collection.mutable.Buffer -class InnerProductJob(args : Args) extends Job(args) { +class InnerProductJob(args: Args) extends Job(args) { val l = args.getOrElse("left", "1").toInt val r = args.getOrElse("right", "1").toInt val j = args.getOrElse("joiner", "i") match { @@ -33,72 +31,67 @@ class InnerProductJob(args : Args) extends Job(args) { case "o" => new OuterJoin } - val in0 = Tsv("input0").read.mapTo((0,1,2) -> ('x1, 'y1, 's1)) { input : (Int, Int, Int) => input } - val in1 = Tsv("input1").read.mapTo((0,1,2) -> ('x2, 'y2, 's2)) { input : (Int, Int, Int) => input } + val in0 = Tsv("input0").read.mapTo((0, 1, 2) -> ('x1, 'y1, 's1)) { input: (Int, Int, Int) => input } + val in1 = Tsv("input1").read.mapTo((0, 1, 2) -> ('x2, 'y2, 's2)) { input: (Int, Int, Int) => input } in0 .blockJoinWithSmaller('y1 -> 'y2, in1, leftReplication = l, rightReplication = r, joiner = j) - .map(('s1, 's2) -> 'score) { v : (Int, Int) => + .map(('s1, 's2) -> 'score) { v: (Int, Int) => v._1 * v._2 } - .groupBy('x1, 'x2) { _.sum[Double]('score) } + .groupBy('x1, 'x2)(_.sum[Double]('score)) .write(Tsv("output")) } -class BlockJoinPipeTest extends Specification { - noDetailedDiffs() - +class BlockJoinPipeTest extends WordSpec with Matchers { "An InnerProductJob" should { val in1 = List(("0", "0", "1"), ("0", "1", "1"), ("1", "0", "2"), ("2", "0", "4")) val in2 = List(("0", "1", "1"), ("1", "0", "2"), ("2", "4", "5")) val correctOutput = Set((0, 1, 2.0), (0, 0, 1.0), (1, 1, 
4.0), (2, 1, 8.0)) - def runJobWithArguments(left : Int = 1, right : Int = 1, joiner : String = "i") - (callback : Buffer[(Int,Int,Double)] => Unit ) { - JobTest("com.twitter.scalding.InnerProductJob") + def runJobWithArguments(left: Int = 1, right: Int = 1, joiner: String = "i")( + callback: Buffer[(Int, Int, Double)] => Unit + ): Unit = + JobTest(new InnerProductJob(_)) .source(Tsv("input0"), in1) .source(Tsv("input1"), in2) .arg("left", left.toString) .arg("right", right.toString) .arg("joiner", joiner) - .sink[(Int,Int,Double)](Tsv("output")) { outBuf => + .sink[(Int, Int, Double)](Tsv("output")) { outBuf => callback(outBuf) } .run - .finish - } + .finish() "correctly compute product with 1 left block and 1 right block" in { runJobWithArguments() { outBuf => - val unordered = outBuf.toSet - unordered must_== correctOutput + outBuf.toSet shouldBe correctOutput } } "correctly compute product with multiple left and right blocks" in { runJobWithArguments(left = 3, right = 7) { outBuf => - val unordered = outBuf.toSet - unordered must_== correctOutput + outBuf.toSet shouldBe correctOutput } } "correctly compute product with a valid LeftJoin" in { runJobWithArguments(right = 7, joiner = "l") { outBuf => - val unordered = outBuf.toSet - unordered must_== correctOutput + outBuf.toSet shouldBe correctOutput } } "throw an exception when used with OuterJoin" in { - runJobWithArguments(joiner = "o") { g => g } must throwA[InvocationTargetException] + an[InvalidJoinModeException] should be thrownBy runJobWithArguments(joiner = "o") { _ => } } "throw an exception when used with an invalid LeftJoin" in { - runJobWithArguments(joiner = "l", left = 2) { g => g } must throwA[InvocationTargetException] + an[InvalidJoinModeException] should be thrownBy runJobWithArguments(joiner = "l", left = 2) { _ => } } "throw an exception when used with an invalid RightJoin" in { - runJobWithArguments(joiner = "r", right = 2) { g => g } must throwA[InvocationTargetException] + 
an[InvalidJoinModeException] should be thrownBy runJobWithArguments(joiner = "r", right = 2) { _ => } } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala index 9b001e02e6..3b5106cb8f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CascadeTest.scala @@ -12,54 +12,56 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding +import org.scalatest.{Matchers, WordSpec} + import java.io.BufferedWriter import java.io.FileWriter import scala.io.Source.fromFile import java.io.File -import org.specs._ import cascading.cascade.Cascade import cascading.flow.FlowSkipIfSinkNotStale -import cascading.tuple.Fields -class Job1(args : Args) extends Job(args) { - Tsv(args("input0"), ('line)).pipe.map[String, String]('line -> 'line)( (x: String) => "job1:"+x).write(Tsv(args("output0"), fields='line ) ) +class Job1(args: Args) extends Job(args) { + Tsv(args("input0"), 'line).pipe + .map[String, String]('line -> 'line)((x: String) => "job1:" + x) + .write(Tsv(args("output0"), fields = 'line)) } -class Job2(args : Args) extends Job(args) { - Tsv(args("output0"), ('line)).pipe.map[String, String]('line -> 'line)( (x: String) => "job2"+x).write(Tsv(args("output1"))) +class Job2(args: Args) extends Job(args) { + Tsv(args("output0"), 'line).pipe + .map[String, String]('line -> 'line)((x: String) => "job2" + x) + .write(Tsv(args("output1"))) } class CascadeTestJob(args: Args) extends CascadeJob(args) { val jobs = List(new Job1(args), new Job2(args)) - override def preProcessCascade(cascade: Cascade) = { + override def preProcessCascade(cascade: Cascade) = 
cascade.setFlowSkipStrategy(new FlowSkipIfSinkNotStale()) - } - override def postProcessCascade(cascade: Cascade) = { + override def postProcessCascade(cascade: Cascade) = println(cascade.getCascadeStats()) - } } -class TwoPhaseCascadeTest extends Specification with FieldConversions { +class TwoPhaseCascadeTest extends WordSpec with Matchers with FieldConversions { "A Cascade job" should { CascadeTest("com.twitter.scalding.CascadeTestJob") .arg("input0", "input0") .arg("output0", "output0") .arg("output1", "output1") - .source(Tsv("input0", ('line)), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) + .source(Tsv("input0", 'line), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) .sink[String](Tsv("output1")) { ob => "verify output got changed by both flows" in { - ob.toList must_== List("job2job1:line1", "job2job1:line2", "job2job1:line3", "job2job1:line4") + ob.toList shouldBe List("job2job1:line1", "job2job1:line2", "job2job1:line3", "job2job1:line4") } } .runHadoop - .finish + .finish() } "A Cascade job run though Tool.main" should { @@ -76,17 +78,23 @@ class TwoPhaseCascadeTest extends Specification with FieldConversions { val output1 = File.createTempFile("cascading-job-output1-", "") output1.mkdir() - val args = Array[String]("com.twitter.scalding.CascadeTestJob", "--local", - "--input0", input0.getAbsolutePath, - "--output0", output0.getAbsolutePath, - "--output1", output1.getAbsolutePath) + val args = Array[String]( + "com.twitter.scalding.CascadeTestJob", + "--local", + "--input0", + input0.getAbsolutePath, + "--output0", + output0.getAbsolutePath, + "--output1", + output1.getAbsolutePath + ) Tool.main(args) val lines = fromFile(output1.getAbsolutePath).getLines.toList "verify output got changed by both flows" in { - lines must_== List("job2job1:a", "job2job1:b", "job2job1:c", "job2job1:d", "job2job1:e") + lines shouldBe List("job2job1:a", "job2job1:b", "job2job1:c", "job2job1:d", "job2job1:e") } 
input0.delete() diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala index 60bd2d01af..dfae14822b 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CoGroupTest.scala @@ -12,31 +12,30 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.pipe.joiner._ -import org.specs._ +import org.scalatest.{Matchers, WordSpec} -class StarJoinJob(args : Args) extends Job(args) { - val in0 = Tsv("input0").read.mapTo((0,1) -> ('x0, 'a)) { input : (Int, Int) => input } - val in1 = Tsv("input1").read.mapTo((0,1) -> ('x1, 'b)) { input : (Int, Int) => input } - val in2 = Tsv("input2").read.mapTo((0,1) -> ('x2, 'c)) { input : (Int, Int) => input } - val in3 = Tsv("input3").read.mapTo((0,1) -> ('x3, 'd)) { input : (Int, Int) => input } +class StarJoinJob(args: Args) extends Job(args) { + val in0 = Tsv("input0").read.mapTo((0, 1) -> ('x0, 'a)) { input: (Int, Int) => input } + val in1 = Tsv("input1").read.mapTo((0, 1) -> ('x1, 'b)) { input: (Int, Int) => input } + val in2 = Tsv("input2").read.mapTo((0, 1) -> ('x2, 'c)) { input: (Int, Int) => input } + val in3 = Tsv("input3").read.mapTo((0, 1) -> ('x3, 'd)) { input: (Int, Int) => input } - in0.coGroupBy('x0) { - _.coGroup('x1, in1, OuterJoinMode) - .coGroup('x2, in2, OuterJoinMode) - .coGroup('x3, in3, OuterJoinMode) - } - .project('x0, 'a, 'b, 'c, 'd) - .write(Tsv("output")) + in0 + .coGroupBy('x0) { + _.coGroup('x1, in1, OuterJoinMode) + .coGroup('x2, in2, OuterJoinMode) + .coGroup('x3, in3, OuterJoinMode) + } + .project('x0, 'a, 'b, 'c, 'd) + .write(Tsv("output")) } -class CoGroupTest extends Specification { - 
noDetailedDiffs() +class CoGroupTest extends WordSpec with Matchers { "A StarJoinJob" should { - JobTest("com.twitter.scalding.StarJoinJob") + JobTest(new StarJoinJob(_)) .source(Tsv("input0"), List((0, 1), (1, 1), (2, 1), (3, 2))) .source(Tsv("input1"), List((0, 1), (2, 5), (3, 2))) .source(Tsv("input2"), List((1, 1), (2, 8))) @@ -44,11 +43,11 @@ class CoGroupTest extends Specification { .sink[(Int, Int, Int, Int, Int)](Tsv("output")) { outputBuf => "be able to work" in { val out = outputBuf.toSet - val expected = Set((0,1,1,0,9), (1,1,0,1,0), (2,1,5,8,11), (3,2,2,0,0)) - out must_== expected + val expected = Set((0, 1, 1, 0, 9), (1, 1, 0, 1, 0), (2, 1, 5, 8, 11), (3, 2, 2, 0, 0)) + out shouldBe expected } } .run - .finish + .finish() } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala new file mode 100644 index 0000000000..d702cd078e --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ConfigTest.scala @@ -0,0 +1,185 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import com.twitter.scalding.filecache.{HadoopCachedFile, URIHasher} +import java.net.URI +import org.apache.hadoop.mapreduce.MRJobConfig +import org.apache.hadoop.conf.Configuration + +import org.scalatest.{Matchers, WordSpec} +import org.scalacheck.Arbitrary +import org.scalacheck.Properties +import org.scalacheck.Prop.forAll + +import scala.util.Success + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCascadingExtensions + +class ConfigTest extends WordSpec with Matchers { + + val defaultHdfs = Config.defaultFrom(Hdfs(true, new Configuration())) + + "A Config" should { + "cascadingAppJar works" in { + val cls = getClass + Config.default.setCascadingAppJar(cls).getCascadingAppJar should contain(Success(cls)) + } + "default has serialization set" in { + val sers = defaultHdfs.get("io.serializations").get.split(",").toList + sers.last shouldBe (classOf[com.twitter.chill.hadoop.KryoSerialization].getName) + } + "default has chill configured" in { + defaultHdfs.get(com.twitter.chill.config.ConfiguredInstantiator.KEY) should not be empty + } + "setting timestamp twice does not change it" in { + val date = RichDate.now + val (oldDate, newConf) = Config.empty.maybeSetSubmittedTimestamp(date) + oldDate shouldBe empty + newConf.getSubmittedTimestamp should contain(date) + val (stillOld, new2) = newConf.maybeSetSubmittedTimestamp(date + Seconds(1)) + stillOld should contain(date) + new2 shouldBe newConf + } + "adding UniqueIDs works" in { + assert(Config.empty.getUniqueIds.size === 0) + val (id, conf) = Config.empty.ensureUniqueId + assert(conf.getUniqueIds === (Set(id))) + } + "roundtrip Args" in { + val config = Config.empty + val args = Args(Array("--hello", "party people")) + + assert(config.setArgs(args).getArgs === args) + } + "throw when Args has been manually modified" in { + val config = Config.empty + (Config.ScaldingJobArgsSerialized -> " ") + intercept[RuntimeException](config.getArgs) + } + 
"Default serialization should have tokens" in { + defaultHdfs.getCascadingSerializationTokens should not be empty + defaultHdfs.getCascadingSerializationTokens.values + .map(Class.forName) + .filter(c => c.isPrimitive || c.isArray) shouldBe empty + + Config.empty.getCascadingSerializationTokens shouldBe empty + + // tokenClasses are a subset that don't include primitives or arrays. + val tokenClasses = defaultHdfs.getCascadingSerializationTokens.values.toSet + val kryoClasses = defaultHdfs.getKryoRegisteredClasses.map(_.getName) + // Tokens are a subset of Kryo registered classes + (kryoClasses & tokenClasses) shouldBe tokenClasses + // the only Kryo classes we don't assign tokens for are the primitives + array + (kryoClasses -- tokenClasses).forall { c => + // primitives cannot be forName'd + val prim = Set( + classOf[Boolean], + classOf[Byte], + classOf[Short], + classOf[Int], + classOf[Long], + classOf[Float], + classOf[Double], + classOf[Char], + classOf[Unit] + ) + .map(_.getName) + + prim(c) || Class.forName(c).isArray + } shouldBe true + } + "addDistributedCacheFile works" in { + val (cachedFile, path) = ConfigTest.makeCachedFileAndPath("test.txt") + + Config.empty + .addDistributedCacheFiles(cachedFile) + .get(MRJobConfig.CACHE_FILES) shouldBe Some(path) + } + "multiple addDistributedCacheFile work" in { + val (cachedFileFirst, pathFirst) = ConfigTest.makeCachedFileAndPath("first.txt") + val (cachedFileSecond, pathSecond) = ConfigTest.makeCachedFileAndPath("second.txt") + + Config.empty + .addDistributedCacheFiles(cachedFileFirst, cachedFileSecond) + .get(MRJobConfig.CACHE_FILES) shouldBe Some(s"$pathFirst,$pathSecond") + + Config.empty + .addDistributedCacheFiles(cachedFileFirst) + .addDistributedCacheFiles(cachedFileSecond) + .get(MRJobConfig.CACHE_FILES) shouldBe Some(s"$pathFirst,$pathSecond") + } + + "constants match cascading values" in { + // We do this to avoid depending on cascading to get these strings, but + // want to ensure they match + + 
Config.CascadingSpillablePropListThreshold shouldBe + cascading.tuple.collect.SpillableProps.LIST_THRESHOLD + + Config.CascadingSpillablePropMapThreshold shouldBe + cascading.tuple.collect.SpillableProps.MAP_THRESHOLD + + Config.CascadingAggregateByThreshold shouldBe + cascading.pipe.assembly.AggregateBy.AGGREGATE_BY_THRESHOLD + + Config.CascadingAppAppJarClass shouldBe + cascading.property.AppProps.APP_JAR_CLASS + + Config.CascadingAppFrameworks shouldBe + cascading.property.AppProps.APP_FRAMEWORKS + } + } +} + +object ConfigTest { + def makeCachedFileAndPath(name: String): (HadoopCachedFile, String) = { + val uriString = s"hdfs://foo.example:1234/path/to/the/stuff/$name" + val uri = new URI(uriString) + val hashHex = URIHasher(uri) + val hashedFilename = hashHex + s"-$name" + val cachedFile = HadoopCachedFile(uri) + + (cachedFile, s"$uriString#$hashedFilename") + } +} + +object ConfigProps extends Properties("Config") { + implicit def arbConfig: Arbitrary[Config] = + Arbitrary(Arbitrary.arbitrary[Map[String, String]].map(Config(_))) + + property(".+(k, v).get(k) == Some(v)") = forAll { (c: Config, k: String, v: String) => + (c + (k, v)).get(k) == Some(v) + } + property(".-(k).get(k) == None") = forAll { (c: Config, k: String) => + (c - k).get(k) == None + } + property("++ unions keys") = forAll { (c1: Config, c2: Config) => + (c1 ++ c2).toMap.keySet == (c1.toMap.keySet | c2.toMap.keySet) + } + property("++ == c2.orElse(c1)") = forAll { (c1: Config, c2: Config, keys: Set[String]) => + val merged = c1 ++ c2 + val testKeys = c1.toMap.keySet | c2.toMap.keySet ++ keys + testKeys.forall(k => merged.get(k) == c2.get(k).orElse(c1.get(k))) + } + property("adding many UniqueIDs works") = forAll { (l: List[String]) => + val uids = l.filterNot(s => s.isEmpty || s.contains(",")).map(UniqueID(_)) + (uids + .foldLeft(Config.empty) { (conf, id) => + conf.addUniqueId(id) + } + .getUniqueIds == uids.toSet) + } +} diff --git 
a/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala index 2832dfd372..409036d893 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/CoreTest.scala @@ -12,71 +12,76 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding +import org.scalatest.{Matchers, WordSpec} import cascading.tuple.Fields import cascading.tuple.TupleEntry -import java.util.concurrent.TimeUnit - -import org.specs._ +import com.twitter.algebird.{Fold, Semigroup} +import com.twitter.scalding.source.DailySuffixTsv +import com.twitter.scalding.typed.TypedPipeGen import java.lang.{Integer => JInt} - -class NumberJoinerJob(args : Args) extends Job(args) { - val in0 = TypedTsv[(Int,Int)]("input0").read.rename((0,1) -> ('x0, 'y0)) - val in1 = Tsv("input1").read.mapTo((0,1) -> ('x1, 'y1)) { input : (Long, Long) => input } - in0.joinWithSmaller('x0 -> 'x1, in1) - .write(Tsv("output")) +import org.scalacheck.{Arbitrary, Gen} +import org.scalatest.prop.PropertyChecks + +class NumberJoinerJob(args: Args) extends Job(args) { + val in0 = TypedTsv[(Int, Int)]("input0").read.rename((0, 1) -> ('x0, 'y0)) + val in1 = Tsv("input1").read.mapTo((0, 1) -> ('x1, 'y1)) { input: (Long, Long) => input } + in0 + .joinWithSmaller('x0 -> 'x1, in1) + .write(Tsv("output")) } -class NumberJoinTest extends Specification { - import Dsl._ +class NumberJoinTest extends WordSpec with Matchers { "A NumberJoinerJob" should { - //Set up the job: + // Set up the job: "not throw when joining longs with ints" in { - JobTest("com.twitter.scalding.NumberJoinerJob") - .source(TypedTsv[(Int,Int)]("input0"), List((0,1), (1,2), (2,4))) - .source(Tsv("input1"), 
List(("0","1"), ("1","3"), ("2","9"))) - .sink[(Int,Int,Long,Long)](Tsv("output")) { outBuf => + JobTest(new NumberJoinerJob(_)) + .source(TypedTsv[(Int, Int)]("input0"), List((0, 1), (1, 2), (2, 4))) + .source(Tsv("input1"), List(("0", "1"), ("1", "3"), ("2", "9"))) + .sink[(Int, Int, Long, Long)](Tsv("output")) { outBuf => val unordered = outBuf.toSet - unordered.size must be_==(3) - unordered((0,1,0L,1L)) must be_==(true) - unordered((1,2,1L,3L)) must be_==(true) - unordered((2,4,2L,9L)) must be_==(true) + unordered should have size 3 + unordered should contain(0, 1, 0L, 1L) + unordered should contain(1, 2, 1L, 3L) + unordered should contain(2, 4, 2L, 9L) } .run .runHadoop - .finish + .finish() } } } class SpillingJob(args: Args) extends Job(args) { - TypedTsv[(Int, Int)]("input").read.rename((0,1) -> ('n, 'v)) + TypedTsv[(Int, Int)]("input").read + .rename((0, 1) -> ('n, 'v)) .groupBy('n) { group => - group.spillThreshold(3).sum[Int]('v).size - }.write(Tsv("output")) + group.spillThreshold(3).sum[Int]('v).size + } + .write(Tsv("output")) } - -class SpillingTest extends Specification { - import Dsl._ +class SpillingTest extends WordSpec with Matchers { "A SpillingJob" should { val src = (0 to 9).map(_ -> 1) ++ List(0 -> 4) - val result = src.groupBy(_._1) - .mapValues { v => (v.map(_._2).sum, v.size) } + val result = src + .groupBy(_._1) + .mapValues(v => (v.map(_._2).sum, v.size)) .map { case (a, (b, c)) => (a, b, c) } .toSet - //Set up the job: + // Set up the job: "work when number of keys exceeds spill threshold" in { JobTest(new SpillingJob(_)) .source(TypedTsv[(Int, Int)]("input"), src) .sink[(Int, Int, Int)](Tsv("output")) { outBuf => - outBuf.toSet must be_==(result) - }.run + outBuf.toSet shouldBe result + } + .run .runHadoop - .finish + .finish() } } } @@ -89,114 +94,123 @@ class GroupRandomlyJob(args: Args) extends Job(args) { import GroupRandomlyJob.NumShards Tsv("fakeInput").read - .mapTo(0 -> 'num) { (line: String) => line.toInt } - 
.groupRandomly(NumShards) { _.max('num) } - .groupAll { _.size } + .mapTo(0 -> 'num)((line: String) => line.toInt) + .groupRandomly(NumShards)(_.max('num)) + .groupAll(_.size) .write(Tsv("fakeOutput")) } -class GroupRandomlyJobTest extends Specification { +class GroupRandomlyJobTest extends WordSpec with Matchers { import GroupRandomlyJob.NumShards - noDetailedDiffs() "A GroupRandomlyJob" should { - val input = (0 to 10000).map { _.toString }.map { Tuple1(_) } - JobTest("com.twitter.scalding.GroupRandomlyJob") + val input = (0 to 10000).map(i => Tuple1(i.toString)) + JobTest(new GroupRandomlyJob(_)) .source(Tsv("fakeInput"), input) - .sink[(Int)](Tsv("fakeOutput")) { outBuf => + .sink[Int](Tsv("fakeOutput")) { outBuf => val numShards = outBuf(0) - numShards must be_==(NumShards) + numShards shouldBe NumShards } - .run.finish + .run + .finish() } } class ShuffleJob(args: Args) extends Job(args) { - Tsv("fakeInput") - .read - .mapTo(0 -> 'num) { (line: String) => line.toInt } + Tsv("fakeInput").read + .mapTo(0 -> 'num)((line: String) => line.toInt) .shuffle(shards = 1, seed = 42L) - .groupAll{ _.toList[Int]('num -> 'num) } + .groupAll(_.toList[Int]('num -> 'num)) .write(Tsv("fakeOutput")) } -class ShuffleJobTest extends Specification { - noDetailedDiffs() - - val expectedShuffle : List[Int] = List(10, 5, 9, 12, 0, 1, 4, 8, 11, 6, 2, 3, 7) +class ShuffleJobTest extends WordSpec with Matchers { + val expectedShuffle: List[Int] = List(10, 5, 9, 12, 0, 1, 4, 8, 11, 6, 2, 3, 7) "A ShuffleJob" should { - val input = (0 to 12).map { Tuple1(_) } - JobTest("com.twitter.scalding.ShuffleJob") + val input = (0 to 12).map(Tuple1(_)) + JobTest(new ShuffleJob(_)) .source(Tsv("fakeInput"), input) .sink[(List[Int])](Tsv("fakeOutput")) { outBuf => - outBuf(0) must be_==(expectedShuffle) + outBuf(0) shouldBe expectedShuffle } - .run.finish + .run + .finish() } } class MapToGroupBySizeSumMaxJob(args: Args) extends Job(args) { - TextLine(args("input")).read. 
- //1 is the line - mapTo(1-> ('kx,'x)) { line : String => - val x = line.toDouble - ((x > 0.5),x) - }. - groupBy('kx) { _.size.sum[Double]('x->'sx).max('x) }. - write( Tsv(args("output")) ) + TextLine(args("input")).read + . + // 1 is the line + mapTo(1 -> ('kx, 'x)) { line: String => + val x = line.toDouble + ((x > 0.5), x) + } + .groupBy('kx)(_.size.sum[Double]('x -> 'sx).max('x)) + .write(Tsv(args("output"))) } -class MapToGroupBySizeSumMaxTest extends Specification { - noDetailedDiffs() +class MapToGroupBySizeSumMaxTest extends WordSpec with Matchers { "A MapToGroupBySizeSumMaxJob" should { val r = new java.util.Random - //Here is our input data: - val input = (0 to 100).map { i : Int => (i.toString, r.nextDouble.toString) } - //Here is our expected output: - val goldenOutput = input.map { case (line : String, x : String) => - val xv = x.toDouble; - ((xv > 0.5), xv) - }. - groupBy { case (kx : Boolean, x : Double) => kx }. - mapValues { vals => - val vlist = vals.map { case (k:Boolean, x:Double) => x }.toList + // Here is our input data: + val input = (0 to 100).map { i: Int => (i.toString, r.nextDouble.toString) } + // Here is our expected output: + val goldenOutput = input + .map { case (line: String, x: String) => + val xv = x.toDouble; + ((xv > 0.5), xv) + } + .groupBy { case (kx: Boolean, x: Double) => kx } + .mapValues { vals => + val vlist = vals.map { case (k: Boolean, x: Double) => x }.toList val size = vlist.size val sum = vlist.sum val max = vlist.max (size, sum, max) } - //Now we have the expected input and output: - JobTest("com.twitter.scalding.MapToGroupBySizeSumMaxJob"). - arg("input","fakeInput"). - arg("output","fakeOutput"). - source(TextLine("fakeInput"), input). 
- sink[(Boolean,Int,Double,Double)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k:Boolean, sz : Int, sm : Double, mx : Double) => - (k, (sz,sm,mx) ) + // Now we have the expected input and output: + JobTest(new MapToGroupBySizeSumMaxJob(_)) + .arg("input", "fakeInput") + .arg("output", "fakeOutput") + .source(TextLine("fakeInput"), input) + .sink[(Boolean, Int, Double, Double)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { case (k: Boolean, sz: Int, sm: Double, mx: Double) => + (k, (sz, sm, mx)) }.toMap "produce correct size, sum, max" in { - goldenOutput must be_==(actualOutput) + goldenOutput shouldBe actualOutput } - }. - run. - finish + } + .run + .finish() } } class PartitionJob(args: Args) extends Job(args) { Tsv("input", new Fields("age", "weight")) - .partition('age -> 'isAdult) { (_:Int) > 18 } { _.average('weight) } + .partition('age -> 'isAdult)((_: Int) > 18)(_.average('weight)) .project('isAdult, 'weight) .write(Tsv("output")) } -class PartitionJobTest extends Specification { - noDetailedDiffs() +class PartitionJobTest extends WordSpec with Matchers { "A PartitionJob" should { - val input = List((3, 23),(23,154),(15,123),(53,143),(7,85),(19,195), - (42,187),(35,165),(68,121),(13,103),(17,173),(2,13)) + val input = List( + (3, 23), + (23, 154), + (15, 123), + (53, 143), + (7, 85), + (19, 195), + (42, 187), + (35, 165), + (68, 121), + (13, 103), + (17, 173), + (2, 13) + ) val (adults, minors) = input.partition { case (age, _) => age > 18 } val Seq(adultWeights, minorWeights) = Seq(adults, minors).map { list => @@ -208,1253 +222,1799 @@ class PartitionJobTest extends Specification { ) JobTest(new com.twitter.scalding.PartitionJob(_)) .source(Tsv("input", new Fields("age", "weight")), input) - .sink[(Boolean,Double)](Tsv("output")) { outBuf => - outBuf.toMap must be_==(expectedOutput) + .sink[(Boolean, Double)](Tsv("output")) { outBuf => + outBuf.toMap shouldBe expectedOutput } - .run.finish + .run + 
.finish() } } -class MRMJob(args : Args) extends Job(args) { - val in = Tsv("input").read.mapTo((0,1) -> ('x,'y)) { xy : (Int,Int) => xy } - // XOR reduction (insane, I guess: - in.groupBy('x) { _.reduce('y) { (left : Int, right : Int) => left ^ right } } +class MRMJob(args: Args) extends Job(args) { + val in = Tsv("input").read.mapTo((0, 1) -> ('x, 'y)) { xy: (Int, Int) => xy } + // XOR reduction (insane, I guess: + in.groupBy('x)(_.reduce('y)((left: Int, right: Int) => left ^ right)) .write(Tsv("outputXor")) - // set - val setPipe = in.groupBy('x) { _.mapReduceMap('y -> 'y) { (input : Int) => Set(input) } - { (left : Set[Int], right : Set[Int]) => left ++ right } - { (output : Set[Int]) => output.toList } + // set + val setPipe = in.groupBy('x) { + _.mapReduceMap('y -> 'y)((input: Int) => Set(input)) { (left: Set[Int], right: Set[Int]) => + left ++ right + }((output: Set[Int]) => output.toList) } - setPipe.flatten[Int]('y -> 'y) - .write(Tsv("outputSet")) + setPipe + .flatten[Int]('y -> 'y) + .write(Tsv("outputSet")) - setPipe.flattenTo[Int]('y -> 'y) - .write(Tsv("outputSetTo")) + setPipe + .flattenTo[Int]('y -> 'y) + .write(Tsv("outputSetTo")) } -class MRMTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class MRMTest extends WordSpec with Matchers { "A MRMJob" should { - val input = List((0,1),(0,2),(1,3),(1,1)) + val input = List((0, 1), (0, 2), (1, 3), (1, 1)) - JobTest("com.twitter.scalding.MRMJob") + JobTest(new MRMJob(_)) .source(Tsv("input"), input) - .sink[(Int,Int)](Tsv("outputXor")) { outBuf => + .sink[(Int, Int)](Tsv("outputXor")) { outBuf => "use reduce to compute xor" in { - outBuf.toList.sorted must be_==(List((0,3),(1,2))) + outBuf.toList.sorted shouldBe List((0, 3), (1, 2)) } } - .sink[(Int,Int)](Tsv("outputSet")) { outBuf => + .sink[(Int, Int)](Tsv("outputSet")) { outBuf => "use mapReduceMap to round-trip input" in { - outBuf.toList.sorted must be_==(input.sorted) + outBuf.toList.sorted shouldBe (input.sorted) } 
} .sink[Int](Tsv("outputSetTo")) { outBuf => "use flattenTo" in { - outBuf.toList.sorted must be_==(input.map { _._2 }.sorted) + outBuf.toList.sorted shouldBe (input.map(_._2).sorted) } } .run - .finish + .finish() } } class JoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read - .mapTo((0, 1) -> ('k1, 'v1)) { v : (String, Int) => v } - val p2 = Tsv(args("input2")) - .read - .mapTo((0, 1) -> ('k2, 'v2)) { v : (String, Int) => v } + val p1 = Tsv(args("input1")).read + .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } + val p2 = Tsv(args("input2")).read + .mapTo((0, 1) -> ('k2, 'v2)) { v: (String, Int) => v } p1.joinWithSmaller('k1 -> 'k2, p2) .project('k1, 'v1, 'v2) - .write( Tsv(args("output")) ) + .write(Tsv(args("output"))) } -class JoinTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class JoinTest extends WordSpec with Matchers with PropertyChecks { "A JoinJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) val correctOutput = Map("b" -> (2, -1), "c" -> (3, 5)) - JobTest("com.twitter.scalding.JoinJob") + JobTest(new JoinJob(_)) .arg("input1", "fakeInput1") .arg("input2", "fakeInput2") .arg("output", "fakeOutput") .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) - .sink[(String,Int,Int)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k : String, v1 : Int, v2 : Int) => - (k,(v1, v2)) + .sink[(String, Int, Int)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { case (k: String, v1: Int, v2: Int) => + (k, (v1, v2)) }.toMap "join tuples with the same key" in { - correctOutput must be_==(actualOutput) + correctOutput shouldBe actualOutput } } .run - .finish + .finish() + } + + "inner join" should { + case class Value(opt: Option[Int] = None) + case class TotalValue(k: String, opt: Option[Int]) + + "respect symmetric law" in { + val srcGen: Gen[TypedPipe[Int]] = { + val ints = 
Gen.listOf(Arbitrary.arbitrary[Int]).map(TypedPipe.from(_)) + Gen.oneOf(ints, Gen.const(TypedPipe.empty)) + } + + forAll(TypedPipeGen.keyed(srcGen), TypedPipeGen.keyed(srcGen)) { (left, right) => + val leftWithRight = left.join(right).values + val rightWithLeft = right.join(left).values.swap + + assert( + TypedPipeChecker.inMemoryToList(leftWithRight).sorted == + TypedPipeChecker.inMemoryToList(rightWithLeft).sorted + ) + } + } + + "correctly work with mapValueStream" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapValueStream { _ => + Iterator.single(scala.util.Random.nextInt()) + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + // left is random + val (k, (_, rV)) = result.head + assert(k == "b") + assert(rV == 2) + } + } + + "correctly work with mapValues" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapValues(identity) + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + assert(result.head == ("b", (1, 2))) + } + } + + "correctly work with mapGroup" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapGroup { (_, _) => + Iterator.single(scala.util.Random.nextInt()) + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + // left is random + val (k, (_, rV)) = result.head + assert(k == "b") + assert(rV == 2) + } + } + + "correctly work with sumLeft and custom semigroup" in { + implicit val sg: Semigroup[Int] = new Semigroup[Int] { + override def plus(x: Int, y: Int): Int = x + y + + // doing crazy + override def sumOption(iter: TraversableOnce[Int]): Option[Int] = + if (iter.isEmpty) Some(0) + else super.sumOption(iter) + 
} + + val left = + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sumLeft + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + assert(result.head == ("b", (1, 2))) + } + } + + "correctly work with sum" in { + val left = + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sum + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + assert(result.head == ("b", (1, 2))) + } + } + + "correctly work with foldWithKey" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .foldWithKey { _ => + Fold.foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) + } + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + assert(result.head == ("b", (Value(Some(1)), 2))) + } + } + + "correctly work with foldLeft" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + assert(result.head == ("b", (Value(Some(1)), 2))) + } + } + } + + "left join" should { + case class Value(opt: Option[Int] = None) + case class TotalValue(k: String, opt: Option[Int]) + + "correctly work with mapValueStream" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapValueStream { _ => + Iterator.single(scala.util.Random.nextInt()) + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.leftJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + 
"correctly work with mapValues" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapValues(identity) + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.leftJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with mapGroup" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapGroup { (_, _) => + Iterator.single(scala.util.Random.nextInt()) + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.leftJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with sumLeft and custom semigroup" in { + implicit val sg: Semigroup[Int] = new Semigroup[Int] { + override def plus(x: Int, y: Int): Int = x + y + + // doing crazy stuff + override def sumOption(iter: TraversableOnce[Int]): Option[Int] = + if (iter.isEmpty) Some(0) + else super.sumOption(iter) + } + + val left = + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sumLeft + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.leftJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with sum" in { + val left = + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sum + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.join(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 1) + + assert(result.head == ("b", (1, 2))) + } + } + + "correctly work with foldWithKey" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .foldWithKey { _ => + Fold.foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) + } + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = 
left.leftJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with foldLeft" in { + val left = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) + } + + val right = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val res = left.leftJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + } + + "right join" should { + case class Value(opt: Option[Int] = None) + case class TotalValue(k: String, opt: Option[Int]) + + "correctly work with mapValueStream" in { + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapValueStream { _ => + Iterator.single(scala.util.Random.nextInt()) + } + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with mapValues" in { + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapValues(identity) + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with mapGroup" in { + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .mapGroup { (_, _) => + Iterator.single(scala.util.Random.nextInt()) + } + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with sumLeft and custom semigroup" in { + implicit val sg: Semigroup[Int] = new Semigroup[Int] { + override def 
plus(x: Int, y: Int): Int = x + y + + // doing crazy stuff + override def sumOption(iter: TraversableOnce[Int]): Option[Int] = + if (iter.isEmpty) Some(scala.util.Random.nextInt()) + else super.sumOption(iter) + } + + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sumLeft + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with sum" in { + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe.from(List("a" -> 1, "b" -> 1)).group.sum + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with foldWithKey" in { + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .foldWithKey { _ => + Fold.foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) + } + } + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } + + "correctly work with foldLeft" in { + val left = + TypedPipe.from(List("c" -> 3, "b" -> 2)) + + val right = + TypedPipe + .from(List("a" -> 1, "b" -> 1)) + .group + .foldLeft(Value()) { case (acc, int) => + acc.copy(Some(int)) + } + + val res = left.rightJoin(right) + + TypedPipeChecker.checkOutput(res) { result => + assert(result.size == 2) + + assert(result.map(_._1) == List("a", "b")) + } + } } } class CollidingKeyJoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read - .mapTo((0, 1) -> ('k1, 'v1)) { v : (String, Int) => v } + val p1 = Tsv(args("input1")).read + .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } // An an extra fake key to do a join - .map('k1 -> 'k2) { (k : 
String) => k + k } - val p2 = Tsv(args("input2")) - .read - .mapTo((0, 1) -> ('k1, 'v2)) { v : (String, Int) => v } + .map('k1 -> 'k2)((k: String) => k + k) + val p2 = Tsv(args("input2")).read + .mapTo((0, 1) -> ('k1, 'v2)) { v: (String, Int) => v } // An an extra fake key to do a join - .map('k1 -> 'k3) { (k : String) => k + k } - p1.joinWithSmaller(('k1,'k2) -> ('k1,'k3), p2) - .write( Tsv(args("output")) ) + .map('k1 -> 'k3)((k: String) => k + k) + p1.joinWithSmaller(('k1, 'k2) -> ('k1, 'k3), p2) + .write(Tsv(args("output"))) } -class CollidingKeyJoinTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class CollidingKeyJoinTest extends WordSpec with Matchers { "A CollidingKeyJoinJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) val correctOutput = Map("b" -> (2, "bb", -1, "bb"), "c" -> (3, "cc", 5, "cc")) - JobTest("com.twitter.scalding.CollidingKeyJoinJob") + JobTest(new CollidingKeyJoinJob(_)) .arg("input1", "fakeInput1") .arg("input2", "fakeInput2") .arg("output", "fakeOutput") .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) - .sink[(String,Int,String,Int,String)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k : String, v1 : Int, k2 : String, v2 : Int, k3 : String) => - (k,(v1, k2, v2, k3)) + .sink[(String, Int, String, Int, String)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { case (k: String, v1: Int, k2: String, v2: Int, k3: String) => + (k, (v1, k2, v2, k3)) }.toMap "join tuples with the same key" in { - correctOutput must be_==(actualOutput) + correctOutput shouldBe actualOutput } } .run - .finish + .finish() } } class TinyJoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read - .mapTo((0, 1) -> ('k1, 'v1)) { v : (String, Int) => v } - val p2 = Tsv(args("input2")) - .read - .mapTo((0, 1) -> ('k2, 'v2)) { v : (String, Int) => v } + val p1 = Tsv(args("input1")).read + 
.mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } + val p2 = Tsv(args("input2")).read + .mapTo((0, 1) -> ('k2, 'v2)) { v: (String, Int) => v } p1.joinWithTiny('k1 -> 'k2, p2) .project('k1, 'v1, 'v2) - .write( Tsv(args("output")) ) + .write(Tsv(args("output"))) } -class TinyJoinTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class TinyJoinTest extends WordSpec with Matchers { "A TinyJoinJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) val correctOutput = Map("b" -> (2, -1), "c" -> (3, 5)) - - JobTest("com.twitter.scalding.TinyJoinJob") + var idx = 0 + JobTest(new TinyJoinJob(_)) .arg("input1", "fakeInput1") .arg("input2", "fakeInput2") .arg("output", "fakeOutput") .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) - .sink[(String,Int,Int)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k : String, v1 : Int, v2 : Int) => - (k,(v1, v2)) + .sink[(String, Int, Int)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { case (k: String, v1: Int, v2: Int) => + (k, (v1, v2)) }.toMap - "join tuples with the same key" in { - correctOutput must be_==(actualOutput) + (idx + ": join tuples with the same key") in { + actualOutput shouldBe correctOutput } + idx += 1 } .run .runHadoop - .finish + .finish() } } class TinyCollisionJoinJob(args: Args) extends Job(args) { - val p1 = Tsv(args("input1")) - .read - .mapTo((0, 1) -> ('k1, 'v1)) { v : (String, Int) => v } - val p2 = Tsv(args("input2")) - .read - .mapTo((0, 1) -> ('k1, 'v2)) { v : (String, Int) => v } + val p1 = Tsv(args("input1")).read + .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } + val p2 = Tsv(args("input2")).read + .mapTo((0, 1) -> ('k1, 'v2)) { v: (String, Int) => v } p1.joinWithTiny('k1 -> 'k1, p2) - .write( Tsv(args("output")) ) + .write(Tsv(args("output"))) } -class TinyCollisionJoinTest extends Specification { - noDetailedDiffs() //Fixes an issue 
with scala 2.9 +class TinyCollisionJoinTest extends WordSpec with Matchers { "A TinyCollisionJoinJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) val correctOutput = Map("b" -> (2, -1), "c" -> (3, 5)) - JobTest("com.twitter.scalding.TinyCollisionJoinJob") + JobTest(new TinyCollisionJoinJob(_)) .arg("input1", "fakeInput1") .arg("input2", "fakeInput2") .arg("output", "fakeOutput") .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) - .sink[(String,Int,Int)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { - case (k : String, v1 : Int, v2 : Int) => - (k,(v1, v2)) + .sink[(String, Int, Int)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { case (k: String, v1: Int, v2: Int) => + (k, (v1, v2)) }.toMap "join tuples with the same key" in { - correctOutput must be_==(actualOutput) + correctOutput shouldBe actualOutput } } .run - .finish + .finish() } } -class TinyThenSmallJoin(args : Args) extends Job(args) { - val pipe0 = Tsv("in0",('x0,'y0)).read - val pipe1 = Tsv("in1",('x1,'y1)).read - val pipe2 = Tsv("in2",('x2,'y2)).read +class TinyThenSmallJoin(args: Args) extends Job(args) { + val pipe0 = Tsv("in0", ('x0, 'y0)).read + val pipe1 = Tsv("in1", ('x1, 'y1)).read + val pipe2 = Tsv("in2", ('x2, 'y2)).read - pipe0.joinWithTiny('x0 -> 'x1, pipe1) + pipe0 + .joinWithTiny('x0 -> 'x1, pipe1) .joinWithSmaller('x0 -> 'x2, pipe2) - .map(('y0, 'y1, 'y2) -> ('y0, 'y1, 'y2)) { v : (TC,TC,TC) => + .map(('y0, 'y1, 'y2) -> ('y0, 'y1, 'y2)) { v: (TC, TC, TC) => (v._1.n, v._2.n, v._3.n) } .project('x0, 'y0, 'x1, 'y1, 'x2, 'y2) .write(Tsv("out")) } -case class TC(val n : Int) +case class TC(val n: Int) -class TinyThenSmallJoinTest extends Specification with FieldConversions { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class TinyThenSmallJoinTest extends WordSpec with Matchers with FieldConversions { "A TinyThenSmallJoin" should { - val input0 = 
List((1,TC(2)),(2,TC(3)),(3,TC(4))) - val input1 = List((1,TC(20)),(2,TC(30)),(3,TC(40))) - val input2 = List((1,TC(200)),(2,TC(300)),(3,TC(400))) - val correct = List((1,2,1,20,1,200), - (2,3,2,30,2,300),(3,4,3,40,3,400)) - - JobTest("com.twitter.scalding.TinyThenSmallJoin") - .source(Tsv("in0",('x0,'y0)), input0) - .source(Tsv("in1",('x1,'y1)), input1) - .source(Tsv("in2",('x2,'y2)), input2) - .sink[(Int,Int,Int,Int,Int,Int)](Tsv("out")) { outBuf => - val actualOutput = outBuf.toList.sorted - println(actualOutput) - "join tuples with the same key" in { - correct must be_==(actualOutput) + val input0 = List((1, TC(2)), (2, TC(3)), (3, TC(4))) + val input1 = List((1, TC(20)), (2, TC(30)), (3, TC(40))) + val input2 = List((1, TC(200)), (2, TC(300)), (3, TC(400))) + val correct = List((1, 2, 1, 20, 1, 200), (2, 3, 2, 30, 2, 300), (3, 4, 3, 40, 3, 400)) + var idx = 0 + JobTest(new TinyThenSmallJoin(_)) + .source(Tsv("in0", ('x0, 'y0)), input0) + .source(Tsv("in1", ('x1, 'y1)), input1) + .source(Tsv("in2", ('x2, 'y2)), input2) + .sink[(Int, Int, Int, Int, Int, Int)](Tsv("out")) { outBuf => + (idx + ": join tuples with the same key") in { + outBuf.toList.sorted shouldBe correct } + idx += 1 } .run .runHadoop - .finish + .finish() } } class LeftJoinJob(args: Args) extends Job(args) { val p1 = Tsv(args("input1")) - .mapTo((0, 1) -> ('k1, 'v1)) { v : (String, Int) => v } + .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } val p2 = Tsv(args("input2")) - .mapTo((0, 1) -> ('k2, 'v2)) { v : (String, Int) => v } + .mapTo((0, 1) -> ('k2, 'v2)) { v: (String, Int) => v } p1.leftJoinWithSmaller('k1 -> 'k2, p2) .project('k1, 'v1, 'v2) // Null sent to TSV will not be read in properly - .map('v2 -> 'v2) { v : AnyRef => Option(v).map { _.toString }.getOrElse("NULL") } - .write( Tsv(args("output")) ) + .map('v2 -> 'v2) { v: AnyRef => Option(v).map(_.toString).getOrElse("NULL") } + .write(Tsv(args("output"))) } -class LeftJoinTest extends Specification { - noDetailedDiffs() //Fixes 
an issue with scala 2.9 +class LeftJoinTest extends WordSpec with Matchers { "A LeftJoinJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) - val correctOutput = Map[String,(Int,AnyRef)]("a" -> (1,"NULL"), "b" -> (2, "-1"), - "c" -> (3, "5")) - - JobTest("com.twitter.scalding.LeftJoinJob") + val correctOutput = Map[String, (Int, AnyRef)]("a" -> (1, "NULL"), "b" -> (2, "-1"), "c" -> (3, "5")) + var idx = 0 + JobTest(new LeftJoinJob(_)) .arg("input1", "fakeInput1") .arg("input2", "fakeInput2") .arg("output", "fakeOutput") .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) - .sink[(String,Int,JInt)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { input : (String,Int,AnyRef) => + .sink[(String, Int, JInt)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { input: (String, Int, AnyRef) => println(input) val (k, v1, v2) = input - (k,(v1, v2)) + (k, (v1, v2)) }.toMap - "join tuples with the same key" in { - correctOutput must be_==(actualOutput) + (idx + ": join tuples with the same key") in { + correctOutput shouldBe actualOutput } + idx += 1 } .run .runHadoop - .finish + .finish() } } class LeftJoinWithLargerJob(args: Args) extends Job(args) { val p1 = Tsv(args("input1")) - .mapTo((0, 1) -> ('k1, 'v1)) { v : (String, Int) => v } + .mapTo((0, 1) -> ('k1, 'v1)) { v: (String, Int) => v } val p2 = Tsv(args("input2")) - .mapTo((0, 1) -> ('k2, 'v2)) { v : (String, Int) => v } + .mapTo((0, 1) -> ('k2, 'v2)) { v: (String, Int) => v } // Note i am specifying the joiner explicitly since this did not work properly before (leftJoinWithLarger always worked) p1.joinWithLarger('k1 -> 'k2, p2, new cascading.pipe.joiner.LeftJoin) .project('k1, 'v1, 'v2) // Null sent to TSV will not be read in properly - .map('v2 -> 'v2) { v : AnyRef => Option(v).map { _.toString }.getOrElse("NULL") } - .write( Tsv(args("output")) ) + .map('v2 -> 'v2) { v: AnyRef => 
Option(v).map(_.toString).getOrElse("NULL") } + .write(Tsv(args("output"))) } -class LeftJoinWithLargerTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class LeftJoinWithLargerTest extends WordSpec with Matchers { "A LeftJoinWithLargerJob" should { val input1 = List("a" -> 1, "b" -> 2, "c" -> 3) val input2 = List("b" -> -1, "c" -> 5, "d" -> 4) - val correctOutput = Map[String,(Int,AnyRef)]("a" -> (1,"NULL"), "b" -> (2, "-1"), - "c" -> (3, "5")) - - JobTest("com.twitter.scalding.LeftJoinWithLargerJob") + val correctOutput = Map[String, (Int, AnyRef)]("a" -> (1, "NULL"), "b" -> (2, "-1"), "c" -> (3, "5")) + var idx = 0 + JobTest(new LeftJoinWithLargerJob(_)) .arg("input1", "fakeInput1") .arg("input2", "fakeInput2") .arg("output", "fakeOutput") .source(Tsv("fakeInput1"), input1) .source(Tsv("fakeInput2"), input2) - .sink[(String,Int,JInt)](Tsv("fakeOutput")) { outBuf => - val actualOutput = outBuf.map { input : (String,Int,AnyRef) => + .sink[(String, Int, JInt)](Tsv("fakeOutput")) { outBuf => + val actualOutput = outBuf.map { input: (String, Int, AnyRef) => println(input) val (k, v1, v2) = input - (k,(v1, v2)) + (k, (v1, v2)) }.toMap - "join tuples with the same key" in { - correctOutput must be_==(actualOutput) + s"$idx: join tuples with the same key" in { + correctOutput shouldBe actualOutput } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class MergeTestJob(args : Args) extends Job(args) { - val in = TextLine(args("in")).read.mapTo(1->('x,'y)) { line : String => - val p = line.split(" ").map { _.toDouble } - (p(0),p(1)) +class MergeTestJob(args: Args) extends Job(args) { + val in = TextLine(args("in")).read.mapTo(1 -> ('x, 'y)) { line: String => + val p = line.split(" ").map(_.toDouble) + (p(0), p(1)) } - val big = in.filter('x) { (x:Double) => (x > 0.5) } - val small = in.filter('x) { (x:Double) => (x <= 0.5) } - (big ++ small).groupBy('x) { _.max('y) } - .write(Tsv(args("out"))) + val big = in.filter('x)((x: Double) => 
(x > 0.5)) + val small = in.filter('x)((x: Double) => (x <= 0.5)) + (big ++ small) + .groupBy('x)(_.max('y)) + .write(Tsv(args("out"))) // Self merge should work - (big ++ big).groupBy('x) { _.max('y) } - .write(Tsv("out2")) + (big ++ big) + .groupBy('x)(_.max('y)) + .write(Tsv("out2")) } -class MergeTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class MergeTest extends WordSpec with Matchers { "A MergeTest" should { val r = new java.util.Random - //Here is our input data: - val input = (0 to 100).map { i => (i.toString, r.nextDouble.toString +" "+ r.nextDouble.toString) } - //Here is our expected output: - val parsed = input.map { case (line : String, x : String) => - val t = x.split(" ").map { _.toDouble } - (t(0),t(1)) + // Here is our input data: + val input = (0 to 100).map(i => (i.toString, r.nextDouble.toString + " " + r.nextDouble.toString)) + // Here is our expected output: + val parsed = input.map { case (line: String, x: String) => + val t = x.split(" ").map(_.toDouble) + (t(0), t(1)) } - val big = parsed.filter( _._1 > 0.5 ) - val small = parsed.filter( _._1 <= 0.5 ) - val golden = (big ++ small).groupBy{ _._1 }.mapValues { itup => (itup.map{ _._2 }.max) } - //Now we have the expected input and output: - JobTest("com.twitter.scalding.MergeTestJob"). - arg("in","fakeInput"). - arg("out","fakeOutput"). - source(TextLine("fakeInput"), input). - sink[(Double,Double)](Tsv("fakeOutput")) { outBuf => + val big = parsed.filter(_._1 > 0.5) + val small = parsed.filter(_._1 <= 0.5) + val golden = (big ++ small).groupBy(_._1).mapValues(itup => (itup.map(_._2).max)) + // Now we have the expected input and output: + JobTest(new MergeTestJob(_)) + .arg("in", "fakeInput") + .arg("out", "fakeOutput") + .source(TextLine("fakeInput"), input) + .sink[(Double, Double)](Tsv("fakeOutput")) { outBuf => "correctly merge two pipes" in { - golden must be_==(outBuf.toMap) + golden shouldBe outBuf.toMap } - }. 
- sink[(Double,Double)](Tsv("out2")) { outBuf => + } + .sink[(Double, Double)](Tsv("out2")) { outBuf => "correctly self merge" in { - outBuf.toMap must be_==(big.groupBy(_._1).mapValues{iter => iter.map(_._2).max}) + outBuf.toMap shouldBe big.groupBy(_._1).mapValues(iter => iter.map(_._2).max) } - }. - run. - finish + } + .run + .finish() } } -class SizeAveStdJob(args : Args) extends Job(args) { - TextLine(args("input")).mapTo('x,'y) { line => - val p = line.split(" ").map { _.toDouble }.slice(0,2) - (p(0),p(1)) - }.map('x -> 'x) { (x : Double) => (4 * x).toInt } - .groupBy('x) { - _.sizeAveStdev('y->('size,'yave,'ystdev)) - //Make sure this doesn't ruin the calculation - .sizeAveStdev('y->('size2,'yave2,'ystdev2)) - .average('y) - } - .project('x,'size,'yave,'ystdev,'y) - .write(Tsv(args("output"))) +class SizeAveStdJob(args: Args) extends Job(args) { + TextLine(args("input")) + .mapTo('x, 'y) { line => + val p = line.split(" ").map(_.toDouble).slice(0, 2) + (p(0), p(1)) + } + .map('x -> 'x)((x: Double) => (4 * x).toInt) + .groupBy('x) { + _.sizeAveStdev('y -> ('size, 'yave, 'ystdev)) + // Make sure this doesn't ruin the calculation + .sizeAveStdev('y -> ('size2, 'yave2, 'ystdev2)) + .average('y) + } + .project('x, 'size, 'yave, 'ystdev, 'y) + .write(Tsv(args("output"))) } -class SizeAveStdSpec extends Specification { +class SizeAveStdSpec extends WordSpec with Matchers { "A sizeAveStd job" should { - "correctly compute aves and standard deviations" in { - val r = new java.util.Random - def powerLawRand = { - // Generates a 1/x powerlaw with a max value or 1e40 - scala.math.pow(1e40, r.nextDouble) - } - //Here is our input data: - val input = (0 to 10000).map { i => (i.toString, r.nextDouble.toString +" "+ powerLawRand.toString) } - val output = input.map { numline => numline._2.split(" ").map { _.toDouble } } - .map { vec => ((vec(0)*4).toInt, vec(1)) } - .groupBy { tup => tup._1 } - .mapValues { tups => - val all = tups.map { tup => tup._2.toDouble }.toList - 
val size = all.size.toLong - val ave = all.sum / size - //Compute the standard deviation: - val vari = all.map { x => (x-ave)*(x-ave) }.sum / (size) - val stdev = scala.math.sqrt(vari) - (size, ave, stdev) - } - JobTest(new SizeAveStdJob(_)). - arg("input","fakeInput"). - arg("output","fakeOutput"). - source(TextLine("fakeInput"), input). - sink[(Int,Long,Double,Double,Double)](Tsv("fakeOutput")) { outBuf => - "correctly compute size, ave, stdev" in { - outBuf.foreach { computed => - val correctTup = output(computed._1) - //Size - computed._2 must be_== (correctTup._1) - //Ave - computed._3/correctTup._2 must beCloseTo(1.0, 1e-6) - //Stdev - computed._4/correctTup._3 must beCloseTo(1.0, 1e-6) - //Explicitly calculated Average: - computed._5/computed._3 must beCloseTo(1.0, 1e-6) - } + val r = new java.util.Random + def powerLawRand = + // Generates a 1/x powerlaw with a max value or 1e40 + scala.math.pow(1e40, r.nextDouble) + // Here is our input data: + val input = (0 to 10000).map(i => (i.toString, r.nextDouble.toString + " " + powerLawRand.toString)) + val output = input + .map { numline => + val vec = numline._2.split(" ").map(_.toDouble) + ((vec(0) * 4).toInt, vec(1)) + } + .groupBy(_._1) + .mapValues { tups => + val all = tups.map(_._2).toList + val size = all.size.toLong + val ave = all.sum / size + // Compute the standard deviation: + val vari = all.map(x => (x - ave) * (x - ave)).sum / size + val stdev = scala.math.sqrt(vari) + (size, ave, stdev) + } + JobTest(new SizeAveStdJob(_)) + .arg("input", "fakeInput") + .arg("output", "fakeOutput") + .source(TextLine("fakeInput"), input) + .sink[(Int, Long, Double, Double, Double)](Tsv("fakeOutput")) { outBuf => + "correctly compute size, ave, stdev" in { + outBuf.foreach { computed => + val correctTup = output(computed._1) + // Size + computed._2 shouldBe (correctTup._1) + // Ave + computed._3 / correctTup._2 shouldBe 1.0 +- 1e-6 + // Stdev + computed._4 / correctTup._3 shouldBe 1.0 +- 1e-6 + // Explicitly 
calculated Average: + computed._5 / computed._3 shouldBe 1.0 +- 1e-6 } - }. - run. - finish - } + } + } + .run + .finish() } } -class DoubleGroupJob(args : Args) extends Job(args) { - TextLine(args("in")).mapTo('x, 'y) { line => +class DoubleGroupJob(args: Args) extends Job(args) { + TextLine(args("in")) + .mapTo('x, 'y) { line => val p = line.split(" ") - (p(0),p(1)) + (p(0), p(1)) } - .groupBy('x) { _.size } - .groupBy('size ) { _.size('cnt) } + .groupBy('x)(_.size) + .groupBy('size)(_.size('cnt)) .write(Tsv(args("out"))) } -class DoubleGroupSpec extends Specification { +class DoubleGroupSpec extends WordSpec with Matchers { "A DoubleGroupJob" should { - "correctly generate output" in { - JobTest("com.twitter.scalding.DoubleGroupJob"). - arg("in","fakeIn"). - arg("out","fakeOut"). - source(TextLine("fakeIn"), List("0" -> "one 1", - "1" -> "two 1", - "2" -> "two 2", - "3" -> "three 3", - "4" -> "three 4", - "5" -> "three 5", - "6" -> "just one" - )). - sink[(Long,Long)](Tsv("fakeOut")) { outBuf => - "correctly build histogram" in { - val outM = outBuf.toMap - outM(1) must be_== (2) //both one and just keys occur only once - outM(2) must be_== (1) - outM(3) must be_== (1) - } - }. - run. 
- finish - } + JobTest(new DoubleGroupJob(_)) + .arg("in", "fakeIn") + .arg("out", "fakeOut") + .source( + TextLine("fakeIn"), + List( + "0" -> "one 1", + "1" -> "two 1", + "2" -> "two 2", + "3" -> "three 3", + "4" -> "three 4", + "5" -> "three 5", + "6" -> "just one" + ) + ) + .sink[(Long, Long)](Tsv("fakeOut")) { outBuf => + "correctly build histogram" in { + val outM = outBuf.toMap + outM(1) shouldBe 2 // both one and just keys occur only once + outM(2) shouldBe 1 + outM(3) shouldBe 1 + } + } + .run + .finish() } } -class GroupUniqueJob(args : Args) extends Job(args) { - TextLine(args("in")).mapTo('x, 'y) { line => +class GroupUniqueJob(args: Args) extends Job(args) { + TextLine(args("in")) + .mapTo('x, 'y) { line => val p = line.split(" ") - (p(0),p(1)) + (p(0), p(1)) } - .groupBy('x) { _.size } - .unique('size ) + .groupBy('x)(_.size) + .unique('size) .write(Tsv(args("out"))) } -class GroupUniqueSpec extends Specification { +class GroupUniqueSpec extends WordSpec with Matchers { "A GroupUniqueJob" should { - JobTest("com.twitter.scalding.GroupUniqueJob"). - arg("in","fakeIn"). - arg("out","fakeOut"). - source(TextLine("fakeIn"), List("0" -> "one 1", - "1" -> "two 1", - "2" -> "two 2", - "3" -> "three 3", - "4" -> "three 4", - "5" -> "three 5", - "6" -> "just one" - )). - sink[(Long)](Tsv("fakeOut")) { outBuf => + JobTest(new GroupUniqueJob(_)) + .arg("in", "fakeIn") + .arg("out", "fakeOut") + .source( + TextLine("fakeIn"), + List( + "0" -> "one 1", + "1" -> "two 1", + "2" -> "two 2", + "3" -> "three 3", + "4" -> "three 4", + "5" -> "three 5", + "6" -> "just one" + ) + ) + .sink[Long](Tsv("fakeOut")) { outBuf => "correctly count unique sizes" in { - val outSet = outBuf.toSet - outSet.size must_== 3 + outBuf.toSet should have size 3 } - }. - run. 
- finish + } + .run + .finish() } } -class DiscardTestJob(args : Args) extends Job(args) { - TextLine(args("in")).flatMapTo('words) { line => line.split("\\s+") } - .map('words -> 'wsize) { word : String => word.length } +class DiscardTestJob(args: Args) extends Job(args) { + TextLine(args("in")) + .flatMapTo('words)(line => line.split("\\s+")) + .map('words -> 'wsize) { word: String => word.length } .discard('words) - .map('* -> 'correct) { te : TupleEntry => !te.getFields.contains('words) } - .groupAll { _.forall('correct -> 'correct) { x : Boolean => x } } + .map('* -> 'correct) { te: TupleEntry => !te.getFields.contains('words) } + .groupAll(_.forall('correct -> 'correct) { x: Boolean => x }) .write(Tsv(args("out"))) } -class DiscardTest extends Specification { +class DiscardTest extends WordSpec with Matchers { "A DiscardTestJob" should { - JobTest("com.twitter.scalding.DiscardTestJob") - .arg("in","fakeIn") - .arg("out","fakeOut") + JobTest(new DiscardTestJob(_)) + .arg("in", "fakeIn") + .arg("out", "fakeOut") .source(TextLine("fakeIn"), List("0" -> "hello world", "1" -> "foo", "2" -> "bar")) .sink[Boolean](Tsv("fakeOut")) { outBuf => "must reduce down to one line" in { - outBuf.size must_== 1 + outBuf should have size 1 } "must correctly discard word column" in { - outBuf(0) must beTrue + outBuf(0) shouldBe true } } .run - .finish + .finish() } } -class HistogramJob(args : Args) extends Job(args) { +class HistogramJob(args: Args) extends Job(args) { TextLine(args("in")).read - .groupBy('line) { _.size } - .groupBy('size) { _.size('freq) } + .groupBy('line)(_.size) + .groupBy('size)(_.size('freq)) .write(Tsv(args("out"))) } -class HistogramTest extends Specification { +class HistogramTest extends WordSpec with Matchers { "A HistogramJob" should { - JobTest("com.twitter.scalding.HistogramJob") - .arg("in","fakeIn") - .arg("out","fakeOut") + JobTest(new HistogramJob(_)) + .arg("in", "fakeIn") + .arg("out", "fakeOut") .source(TextLine("fakeIn"), List("0" -> 
"single", "1" -> "single")) - .sink[(Long,Long)](Tsv("fakeOut")) { outBuf => + .sink[(Long, Long)](Tsv("fakeOut")) { outBuf => "must reduce down to a single line for a trivial input" in { - outBuf.size must_== 1 + outBuf should have size 1 } "must get the result right" in { - outBuf(0) must_== (2L,1L) + outBuf(0) shouldBe (2L, 1L) } } .run - .finish + .finish() } } -class ForceReducersJob(args : Args) extends Job(args) { +class ForceReducersJob(args: Args) extends Job(args) { TextLine("in").read .rename((0, 1) -> ('num, 'line)) - .flatMap('line -> 'words){l : String => l.split(" ")} - .groupBy('num){ _.toList[String]('words -> 'wordList).forceToReducers } - .map('wordList -> 'wordList){w : List[String] => w.mkString(" ")} + .flatMap('line -> 'words) { l: String => l.split(" ") } + .groupBy('num)(_.toList[String]('words -> 'wordList).forceToReducers) + .map('wordList -> 'wordList) { w: List[String] => w.mkString(" ") } .project('num, 'wordList) .write(Tsv("out")) } -class ForceReducersTest extends Specification { +class ForceReducersTest extends WordSpec with Matchers { "A ForceReducersJob" should { - JobTest("com.twitter.scalding.ForceReducersJob") + var idx = 0 + JobTest(new ForceReducersJob(_)) .source(TextLine("in"), List("0" -> "single test", "1" -> "single result")) - .sink[(Int,String)](Tsv("out")) { outBuf => - "must get the result right" in { - //need to convert to sets because order - outBuf(0)._2.split(" ").toSet must_== Set("single", "test") - outBuf(1)._2.split(" ").toSet must_== Set("single", "result") + .sink[(Int, String)](Tsv("out")) { outBuf => + (idx + ": must get the result right") in { + // need to convert to sets because order + outBuf(0)._2.split(" ").toSet shouldBe Set("single", "test") + outBuf(1)._2.split(" ").toSet shouldBe Set("single", "result") } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class ToListJob(args : Args) extends Job(args) { +class ToListJob(args: Args) extends Job(args) { TextLine(args("in")).read - 
.flatMap('line -> 'words){l : String => l.split(" ")} - .groupBy('offset){ _.toList[String]('words -> 'wordList) } - .map('wordList -> 'wordList){w : List[String] => w.mkString(" ")} + .flatMap('line -> 'words) { l: String => l.split(" ") } + .groupBy('offset)(_.toList[String]('words -> 'wordList)) + .map('wordList -> 'wordList) { w: List[String] => w.mkString(" ") } .project('offset, 'wordList) .write(Tsv(args("out"))) } -class NullListJob(args : Args) extends Job(args) { +class NullListJob(args: Args) extends Job(args) { TextLine(args("in")).read - .groupBy('offset){ _.toList[String]('line -> 'lineList).spillThreshold(100) } - .map('lineList -> 'lineList) { ll : List[String] => ll.mkString(" ") } + .groupBy('offset)(_.toList[String]('line -> 'lineList).spillThreshold(100)) + .map('lineList -> 'lineList) { ll: List[String] => ll.mkString(" ") } .write(Tsv(args("out"))) } -class ToListTest extends Specification { +class ToListTest extends WordSpec with Matchers { "A ToListJob" should { - JobTest("com.twitter.scalding.ToListJob") - .arg("in","fakeIn") - .arg("out","fakeOut") + JobTest(new ToListJob(_)) + .arg("in", "fakeIn") + .arg("out", "fakeOut") .source(TextLine("fakeIn"), List("0" -> "single test", "1" -> "single result")) - .sink[(Int,String)](Tsv("fakeOut")) { outBuf => + .sink[(Int, String)](Tsv("fakeOut")) { outBuf => "must have the right number of lines" in { - outBuf.size must_== 2 + outBuf should have size 2 } "must get the result right" in { - //need to convert to sets because order - outBuf(0)._2.split(" ").toSet must_== Set("single", "test") - outBuf(1)._2.split(" ").toSet must_== Set("single", "result") + // need to convert to sets because order + outBuf(0)._2.split(" ").toSet shouldBe Set("single", "test") + outBuf(1)._2.split(" ").toSet shouldBe Set("single", "result") } } .run - .finish + .finish() } "A NullListJob" should { - JobTest("com.twitter.scalding.NullListJob") - .arg("in","fakeIn") - .arg("out","fakeOut") + JobTest(new NullListJob(_)) + 
.arg("in", "fakeIn") + .arg("out", "fakeOut") .source(TextLine("fakeIn"), List("0" -> null, "0" -> "a", "0" -> null, "0" -> "b")) - .sink[(Int,String)](Tsv("fakeOut")) { outBuf => + .sink[(Int, String)](Tsv("fakeOut")) { outBuf => "must have the right number of lines" in { - outBuf.size must_== 1 + outBuf should have size 1 } "must return an empty list for null key" in { val sSet = outBuf(0)._2.split(" ").toSet - sSet must_== Set("a", "b") + sSet shouldBe Set("a", "b") } } .run - .finish + .finish() } } -class CrossJob(args : Args) extends Job(args) { +class CrossJob(args: Args) extends Job(args) { val p1 = Tsv(args("in1")).read - .mapTo((0,1) -> ('x,'y)) { tup : (Int, Int) => tup } + .mapTo((0, 1) -> ('x, 'y)) { tup: (Int, Int) => tup } val p2 = Tsv(args("in2")).read - .mapTo(0->'z) { (z : Int) => z} + .mapTo(0 -> 'z)((z: Int) => z) p1.crossWithTiny(p2).write(Tsv(args("out"))) } -class CrossTest extends Specification { - noDetailedDiffs() - +class CrossTest extends WordSpec with Matchers { "A CrossJob" should { - JobTest("com.twitter.scalding.CrossJob") - .arg("in1","fakeIn1") - .arg("in2","fakeIn2") - .arg("out","fakeOut") - .source(Tsv("fakeIn1"), List(("0","1"),("1","2"),("2","3"))) - .source(Tsv("fakeIn2"), List("4","5").map { Tuple1(_) }) - .sink[(Int,Int,Int)](Tsv("fakeOut")) { outBuf => - "must look exactly right" in { - outBuf.size must_==6 - outBuf.toSet must_==(Set((0,1,4),(0,1,5),(1,2,4),(1,2,5),(2,3,4),(2,3,5))) + var idx = 0 + JobTest(new com.twitter.scalding.CrossJob(_)) + .arg("in1", "fakeIn1") + .arg("in2", "fakeIn2") + .arg("out", "fakeOut") + .source(Tsv("fakeIn1"), List(("0", "1"), ("1", "2"), ("2", "3"))) + .source(Tsv("fakeIn2"), List("4", "5").map(Tuple1(_))) + .sink[(Int, Int, Int)](Tsv("fakeOut")) { outBuf => + (idx + ": must look exactly right") in { + outBuf should have size 6 + outBuf.toSet shouldBe (Set((0, 1, 4), (0, 1, 5), (1, 2, 4), (1, 2, 5), (2, 3, 4), (2, 3, 5))) } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class 
GroupAllCrossJob(args : Args) extends Job(args) { +class GroupAllCrossJob(args: Args) extends Job(args) { val p1 = Tsv(args("in1")).read - .mapTo((0,1) -> ('x,'y)) { tup : (Int, Int) => tup } - .groupAll { _.max('x) } - .map('x -> 'x) { x : Int => List(x) } + .mapTo((0, 1) -> ('x, 'y)) { tup: (Int, Int) => tup } + .groupAll(_.max('x)) + .map('x -> 'x) { x: Int => List(x) } val p2 = Tsv(args("in2")).read - .mapTo(0->'z) { (z : Int) => z} + .mapTo(0 -> 'z)((z: Int) => z) p2.crossWithTiny(p1) .map('x -> 'x) { l: List[Int] => l.size } .project('x, 'z) .write(Tsv(args("out"))) } -class GroupAllCrossTest extends Specification { - noDetailedDiffs() - +class GroupAllCrossTest extends WordSpec with Matchers { "A GroupAllCrossJob" should { + var idx = 0 JobTest(new GroupAllCrossJob(_)) - .arg("in1","fakeIn1") - .arg("in2","fakeIn2") - .arg("out","fakeOut") - .source(Tsv("fakeIn1"), List(("0","1"),("1","2"),("2","3"))) - .source(Tsv("fakeIn2"), List("4","5").map { Tuple1(_) }) - .sink[(Int,Int)](Tsv("fakeOut")) { outBuf => - "must look exactly right" in { - outBuf.size must_==2 - outBuf.toSet must_==(Set((1,4), (1,5))) + .arg("in1", "fakeIn1") + .arg("in2", "fakeIn2") + .arg("out", "fakeOut") + .source(Tsv("fakeIn1"), List(("0", "1"), ("1", "2"), ("2", "3"))) + .source(Tsv("fakeIn2"), List("4", "5").map(Tuple1(_))) + .sink[(Int, Int)](Tsv("fakeOut")) { outBuf => + (idx + ": must look exactly right") in { + outBuf should have size 2 + outBuf.toSet shouldBe Set((1, 4), (1, 5)) } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class SmallCrossJob(args : Args) extends Job(args) { +class SmallCrossJob(args: Args) extends Job(args) { val p1 = Tsv(args("in1")).read - .mapTo((0,1) -> ('x,'y)) { tup : (Int, Int) => tup } + .mapTo((0, 1) -> ('x, 'y)) { tup: (Int, Int) => tup } val p2 = Tsv(args("in2")).read - .mapTo(0->'z) { (z : Int) => z} + .mapTo(0 -> 'z)((z: Int) => z) p1.crossWithSmaller(p2).write(Tsv(args("out"))) } -class SmallCrossTest extends Specification { - 
noDetailedDiffs() - +class SmallCrossTest extends WordSpec with Matchers { "A SmallCrossJob" should { - JobTest("com.twitter.scalding.SmallCrossJob") - .arg("in1","fakeIn1") - .arg("in2","fakeIn2") - .arg("out","fakeOut") - .source(Tsv("fakeIn1"), List(("0","1"),("1","2"),("2","3"))) - .source(Tsv("fakeIn2"), List("4","5").map { Tuple1(_) }) - .sink[(Int,Int,Int)](Tsv("fakeOut")) { outBuf => - "must look exactly right" in { - outBuf.size must_==6 - outBuf.toSet must_==(Set((0,1,4),(0,1,5),(1,2,4),(1,2,5),(2,3,4),(2,3,5))) + var idx = 0 + JobTest(new SmallCrossJob(_)) + .arg("in1", "fakeIn1") + .arg("in2", "fakeIn2") + .arg("out", "fakeOut") + .source(Tsv("fakeIn1"), List(("0", "1"), ("1", "2"), ("2", "3"))) + .source(Tsv("fakeIn2"), List("4", "5").map(Tuple1(_))) + .sink[(Int, Int, Int)](Tsv("fakeOut")) { outBuf => + (idx + ": must look exactly right") in { + outBuf should have size 6 + outBuf.toSet shouldBe Set((0, 1, 4), (0, 1, 5), (1, 2, 4), (1, 2, 5), (2, 3, 4), (2, 3, 5)) } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class TopKJob(args : Args) extends Job(args) { +class TopKJob(args: Args) extends Job(args) { Tsv(args("in")).read - .mapTo(0 -> 'x) { (tup : Int) => tup } - //Take the smallest 3 values: - .groupAll { _.sortedTake[Int]('x->'x, 3) } + .mapTo(0 -> 'x)((tup: Int) => tup) + // Take the smallest 3 values: + .groupAll(_.sortedTake[Int]('x -> 'x, 3)) .write(Tsv(args("out"))) } -class TopKTest extends Specification { +class TopKTest extends WordSpec with Matchers { "A TopKJob" should { - JobTest("com.twitter.scalding.TopKJob") - .arg("in","fakeIn") - .arg("out","fakeOut") - .source(Tsv("fakeIn"), List(3,24,1,4,5).map { Tuple1(_) } ) + JobTest(new TopKJob(_)) + .arg("in", "fakeIn") + .arg("out", "fakeOut") + .source(Tsv("fakeIn"), List(3, 24, 1, 4, 5).map(Tuple1(_))) .sink[List[Int]](Tsv("fakeOut")) { outBuf => "must look exactly right" in { - outBuf.size must_==1 - outBuf(0) must be_==(List(1,3,4)) + outBuf should have size 1 + outBuf(0) 
shouldBe List(1, 3, 4) } } .run - .finish + .finish() } } -class ScanJob(args : Args) extends Job(args) { - Tsv("in",('x,'y,'z)) +class ScanJob(args: Args) extends Job(args) { + Tsv("in", ('x, 'y, 'z)) .groupBy('x) { _.sortBy('y) - .scanLeft('y -> 'ys)(0) { (oldV : Int, newV : Int) => oldV + newV } + .scanLeft('y -> 'ys)(0)((oldV: Int, newV: Int) => oldV + newV) } - .project('x,'ys,'z) - .map('z -> 'z) { z : Int => z } //Make sure the null z is converted to an int + .project('x, 'ys, 'z) + .map('z -> 'z) { z: Int => z } // Make sure the null z is converted to an int .write(Tsv("out")) } -class ScanTest extends Specification { +class ScanTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() + "A ScanJob" should { - JobTest("com.twitter.scalding.ScanJob") - .source(Tsv("in",('x,'y,'z)), List((3,0,1),(3,1,10),(3,5,100)) ) - .sink[(Int,Int,Int)](Tsv("out")) { outBuf => () - val correct = List((3,0,0),(3,0,1),(3,1,10),(3,6,100)) - "have a working scanLeft" in { - outBuf.toList must be_== (correct) + var idx = 0 + JobTest(new ScanJob(_)) + .source(Tsv("in", ('x, 'y, 'z)), List((3, 0, 1), (3, 1, 10), (3, 5, 100))) + .sink[(Int, Int, Int)](Tsv("out")) { outBuf => + val correct = List((3, 0, 0), (3, 0, 1), (3, 1, 10), (3, 6, 100)) + (idx + ": have a working scanLeft") in { + outBuf.toList shouldBe correct } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class TakeJob(args : Args) extends Job(args) { +class TakeJob(args: Args) extends Job(args) { val input = Tsv("in").read - .mapTo((0,1,2) -> ('x,'y,'z)) { tup : (Int,Int,Int) => tup } + .mapTo((0, 1, 2) -> ('x, 'y, 'z)) { tup: (Int, Int, Int) => tup } - input.groupBy('x) { _.take(2) }.write(Tsv("out2")) + input.groupBy('x)(_.take(2)).write(Tsv("out2")) input.groupAll.write(Tsv("outall")) } -class TakeTest extends Specification { - noDetailedDiffs() +class TakeTest extends WordSpec with Matchers { "A TakeJob" should { - JobTest("com.twitter.scalding.TakeJob") - .source(Tsv("in"), 
List((3,0,1),(3,1,10),(3,5,100)) ) - .sink[(Int,Int,Int)](Tsv("outall")) { outBuf => () + JobTest(new TakeJob(_)) + .source(Tsv("in"), List((3, 0, 1), (3, 1, 10), (3, 5, 100))) + .sink[(Int, Int, Int)](Tsv("outall")) { outBuf => "groupAll must see everything in same order" in { - outBuf.size must_==3 - outBuf.toList must be_== (List((3,0,1),(3,1,10),(3,5,100))) + outBuf should have size 3 + outBuf.toList shouldBe List((3, 0, 1), (3, 1, 10), (3, 5, 100)) } } - .sink[(Int,Int,Int)](Tsv("out2")) { outBuf => + .sink[(Int, Int, Int)](Tsv("out2")) { outBuf => "take(2) must only get 2" in { - outBuf.size must_==2 - outBuf.toList must be_== (List((3,0,1),(3,1,10))) + outBuf should have size 2 + outBuf.toList shouldBe List((3, 0, 1), (3, 1, 10)) } } .run - .finish + .finish() } } -class DropJob(args : Args) extends Job(args) { +class DropJob(args: Args) extends Job(args) { val input = Tsv("in").read - .mapTo((0,1,2) -> ('x,'y,'z)) { tup : (Int,Int,Int) => tup } + .mapTo((0, 1, 2) -> ('x, 'y, 'z)) { tup: (Int, Int, Int) => tup } - input.groupBy('x) { _.drop(2) }.write(Tsv("out2")) + input.groupBy('x)(_.drop(2)).write(Tsv("out2")) input.groupAll.write(Tsv("outall")) } -class DropTest extends Specification { - noDetailedDiffs() +class DropTest extends WordSpec with Matchers { "A DropJob" should { - JobTest("com.twitter.scalding.DropJob") - .source(Tsv("in"), List((3,0,1),(3,1,10),(3,5,100)) ) - .sink[(Int,Int,Int)](Tsv("outall")) { outBuf => () + JobTest(new DropJob(_)) + .source(Tsv("in"), List((3, 0, 1), (3, 1, 10), (3, 5, 100))) + .sink[(Int, Int, Int)](Tsv("outall")) { outBuf => "groupAll must see everything in same order" in { - outBuf.size must_==3 - outBuf.toList must be_== (List((3,0,1),(3,1,10),(3,5,100))) + outBuf should have size 3 + outBuf.toList shouldBe List((3, 0, 1), (3, 1, 10), (3, 5, 100)) } } - .sink[(Int,Int,Int)](Tsv("out2")) { outBuf => + .sink[(Int, Int, Int)](Tsv("out2")) { outBuf => "drop(2) must only get 1" in { - outBuf.toList must be_== 
(List((3,5,100))) + outBuf.toList shouldBe List((3, 5, 100)) } } .run - .finish + .finish() } } -class PivotJob(args : Args) extends Job(args) { - Tsv("in",('k,'w,'y,'z)).read - .unpivot(('w,'y,'z) -> ('col, 'val)) +class PivotJob(args: Args) extends Job(args) { + Tsv("in", ('k, 'w, 'y, 'z)).read + .unpivot(('w, 'y, 'z) -> ('col, 'val)) .write(Tsv("unpivot")) .groupBy('k) { - _.pivot(('col,'val) -> ('w,'y,'z)) - }.write(Tsv("pivot")) - .unpivot(('w,'y,'z) -> ('col, 'val)) + _.pivot(('col, 'val) -> ('w, 'y, 'z)) + } + .write(Tsv("pivot")) + .unpivot(('w, 'y, 'z) -> ('col, 'val)) .groupBy('k) { - _.pivot(('col,'val) -> ('w,'y,'z,'default), 2.0) - }.write(Tsv("pivot_with_default")) + _.pivot(('col, 'val) -> ('w, 'y, 'z, 'default), 2.0) + } + .write(Tsv("pivot_with_default")) } -class PivotTest extends Specification with FieldConversions { - noDetailedDiffs() - val input = List(("1","a","b","c"),("2","d","e","f")) +class PivotTest extends WordSpec with Matchers with FieldConversions { + val input = List(("1", "a", "b", "c"), ("2", "d", "e", "f")) "A PivotJob" should { JobTest("com.twitter.scalding.PivotJob") - .source(Tsv("in",('k,'w,'y,'z)), input) - .sink[(String,String,String)](Tsv("unpivot")) { outBuf => + .source(Tsv("in", ('k, 'w, 'y, 'z)), input) + .sink[(String, String, String)](Tsv("unpivot")) { outBuf => "unpivot columns correctly" in { - outBuf.size must_== 6 - outBuf.toList.sorted must be_== (List(("1","w","a"),("1","y","b"),("1","z","c"), - ("2","w","d"),("2","y","e"),("2","z","f")).sorted) + outBuf should have size 6 + outBuf.toList.sorted shouldBe (List( + ("1", "w", "a"), + ("1", "y", "b"), + ("1", "z", "c"), + ("2", "w", "d"), + ("2", "y", "e"), + ("2", "z", "f") + ).sorted) } } - .sink[(String,String,String,String)](Tsv("pivot")) { outBuf => + .sink[(String, String, String, String)](Tsv("pivot")) { outBuf => "pivot back to the original" in { - outBuf.size must_==2 - outBuf.toList.sorted must be_== (input.sorted) + outBuf should have size 2 + 
outBuf.toList.sorted shouldBe (input.sorted) } } - .sink[(String,String,String,String,Double)](Tsv("pivot_with_default")) { outBuf => + .sink[(String, String, String, String, Double)](Tsv("pivot_with_default")) { outBuf => "pivot back to the original with the missing column replace by the specified default" in { - outBuf.size must_==2 - outBuf.toList.sorted must be_== (List(("1","a","b","c",2.0),("2","d","e","f",2.0)).sorted) + outBuf should have size 2 + outBuf.toList.sorted shouldBe (List(("1", "a", "b", "c", 2.0), ("2", "d", "e", "f", 2.0)).sorted) } } .run - .finish + .finish() } } -class IterableSourceJob(args : Args) extends Job(args) { - val list = List((1,2,3),(4,5,6),(3,8,9)) - val iter = IterableSource(list, ('x,'y,'z)) - Tsv("in",('x,'w)) - .joinWithSmaller('x->'x, iter) +class IterableSourceJob(args: Args) extends Job(args) { + val list = List((1, 2, 3), (4, 5, 6), (3, 8, 9)) + val iter = IterableSource(list, ('x, 'y, 'z)) + Tsv("in", ('x, 'w)) + .joinWithSmaller('x -> 'x, iter) .write(Tsv("out")) - Tsv("in",('x,'w)) - .joinWithTiny('x->'x, iter) + Tsv("in", ('x, 'w)) + .joinWithTiny('x -> 'x, iter) .write(Tsv("tiny")) - //Now without fields and using the implicit: - Tsv("in",('x,'w)) - .joinWithTiny('x -> 0, list).write(Tsv("imp")) + // Now without fields and using the implicit: + Tsv("in", ('x, 'w)) + .joinWithTiny('x -> 0, list) + .write(Tsv("imp")) } -class IterableSourceTest extends Specification with FieldConversions { - noDetailedDiffs() - val input = List((1,10),(2,20),(3,30)) +class IterableSourceTest extends WordSpec with Matchers with FieldConversions { + val input = List((1, 10), (2, 20), (3, 30)) "A IterableSourceJob" should { - JobTest("com.twitter.scalding.IterableSourceJob") - .source(Tsv("in",('x,'w)), input) - .sink[(Int,Int,Int,Int)](Tsv("out")) { outBuf => - "Correctly joinWithSmaller" in { - outBuf.toList.sorted must be_== (List((1,10,2,3),(3,30,8,9))) + var idx = 0 + JobTest(new IterableSourceJob(_)) + .source(Tsv("in", ('x, 'w)), 
input) + .sink[(Int, Int, Int, Int)](Tsv("out")) { outBuf => + s"$idx: Correctly joinWithSmaller" in { + outBuf.toList.sorted shouldBe List((1, 10, 2, 3), (3, 30, 8, 9)) } + idx += 1 } - .sink[(Int,Int,Int,Int)](Tsv("tiny")) { outBuf => - "Correctly joinWithTiny" in { - outBuf.toList.sorted must be_== (List((1,10,2,3),(3,30,8,9))) + .sink[(Int, Int, Int, Int)](Tsv("tiny")) { outBuf => + s"$idx: correctly joinWithTiny" in { + outBuf.toList.sorted shouldBe List((1, 10, 2, 3), (3, 30, 8, 9)) } + idx += 1 } - .sink[(Int,Int,Int,Int,Int)](Tsv("imp")) { outBuf => - "Correctly implicitly joinWithTiny" in { - outBuf.toList.sorted must be_== (List((1,10,1,2,3),(3,30,3,8,9))) + .sink[(Int, Int, Int, Int, Int)](Tsv("imp")) { outBuf => + s"$idx: correctly implicitly joinWithTiny" in { + outBuf.toList.sorted shouldBe List((1, 10, 1, 2, 3), (3, 30, 3, 8, 9)) } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class HeadLastJob(args : Args) extends Job(args) { - Tsv("input",('x,'y)).groupBy('x) { - _.sortBy('y) - .head('y -> 'yh).last('y -> 'yl) - }.write(Tsv("output")) +class HeadLastJob(args: Args) extends Job(args) { + Tsv("input", ('x, 'y)) + .groupBy('x) { + _.sortBy('y) + .head('y -> 'yh) + .last('y -> 'yl) + } + .write(Tsv("output")) } -class HeadLastTest extends Specification { +class HeadLastTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() - val input = List((1,10),(1,20),(1,30),(2,0)) + val input = List((1, 10), (1, 20), (1, 30), (2, 0)) "A HeadLastJob" should { - JobTest("com.twitter.scalding.HeadLastJob") - .source(Tsv("input",('x,'y)), input) - .sink[(Int,Int,Int)](Tsv("output")) { outBuf => + JobTest(new HeadLastJob(_)) + .source(Tsv("input", ('x, 'y)), input) + .sink[(Int, Int, Int)](Tsv("output")) { outBuf => "Correctly do head/last" in { - outBuf.toList must be_==(List((1,10,30),(2,0,0))) + outBuf.toList shouldBe List((1, 10, 30), (2, 0, 0)) } } .run - .finish + .finish() } } -class HeadLastUnsortedJob(args : Args) extends Job(args) { 
- Tsv("input",('x,'y)).groupBy('x) { - _.head('y -> 'yh).last('y -> 'yl) - }.write(Tsv("output")) +class HeadLastUnsortedJob(args: Args) extends Job(args) { + Tsv("input", ('x, 'y)) + .groupBy('x) { + _.head('y -> 'yh).last('y -> 'yl) + } + .write(Tsv("output")) } -class HeadLastUnsortedTest extends Specification { +class HeadLastUnsortedTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() - val input = List((1,10),(1,20),(1,30),(2,0)) + val input = List((1, 10), (1, 20), (1, 30), (2, 0)) "A HeadLastUnsortedTest" should { - JobTest("com.twitter.scalding.HeadLastUnsortedJob") - .source(Tsv("input",('x,'y)), input) - .sink[(Int,Int,Int)](Tsv("output")) { outBuf => + JobTest(new HeadLastUnsortedJob(_)) + .source(Tsv("input", ('x, 'y)), input) + .sink[(Int, Int, Int)](Tsv("output")) { outBuf => "Correctly do head/last" in { - outBuf.toList must be_==(List((1,10,30),(2,0,0))) + outBuf.toList shouldBe List((1, 10, 30), (2, 0, 0)) } } .run - .finish + .finish() } } -class MkStringToListJob(args : Args) extends Job(args) { - Tsv("input", ('x,'y)).groupBy('x) { - _.sortBy('y) - .mkString('y -> 'ystring,",") - .toList[Int]('y -> 'ylist) - }.write(Tsv("output")) +class MkStringToListJob(args: Args) extends Job(args) { + Tsv("input", ('x, 'y)) + .groupBy('x) { + _.sortBy('y) + .mkString('y -> 'ystring, ",") + .toList[Int]('y -> 'ylist) + } + .write(Tsv("output")) } -class MkStringToListTest extends Specification with FieldConversions { - noDetailedDiffs() - val input = List((1,30),(1,10),(1,20),(2,0)) +class MkStringToListTest extends WordSpec with Matchers with FieldConversions { + val input = List((1, 30), (1, 10), (1, 20), (2, 0)) "A IterableSourceJob" should { - JobTest("com.twitter.scalding.MkStringToListJob") - .source(Tsv("input",('x,'y)), input) - .sink[(Int,String,List[Int])](Tsv("output")) { outBuf => + JobTest(new MkStringToListJob(_)) + .source(Tsv("input", ('x, 'y)), input) + .sink[(Int, String, List[Int])](Tsv("output")) { outBuf => "Correctly 
do mkString/toList" in { - outBuf.toSet must be_==(Set((1,"10,20,30",List(10,20,30)),(2,"0",List(0)))) + outBuf.toSet shouldBe Set((1, "10,20,30", List(10, 20, 30)), (2, "0", List(0))) } } .run // This can't be run in Hadoop mode because we can't serialize the list to Tsv - .finish + .finish() } } -class InsertJob(args : Args) extends Job(args) { - Tsv("input", ('x, 'y)).insert(('z, 'w), (1,2)).write(Tsv("output")) +class InsertJob(args: Args) extends Job(args) { + Tsv("input", ('x, 'y)).insert(('z, 'w), (1, 2)).write(Tsv("output")) } -class InsertJobTest extends Specification { +class InsertJobTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() - val input = List((2,2), (3,3)) + val input = List((2, 2), (3, 3)) "An InsertJob" should { - JobTest(new com.twitter.scalding.InsertJob(_)) + JobTest(new InsertJob(_)) .source(Tsv("input", ('x, 'y)), input) .sink[(Int, Int, Int, Int)](Tsv("output")) { outBuf => "Correctly insert a constant" in { - outBuf.toSet must be_==(Set((2,2,1,2), (3,3,1,2))) + outBuf.toSet shouldBe Set((2, 2, 1, 2), (3, 3, 1, 2)) } } .run - .finish + .finish() } } -class FoldJob(args : Args) extends Job(args) { +class FoldJob(args: Args) extends Job(args) { import scala.collection.mutable.{Set => MSet} - Tsv("input", ('x,'y)).groupBy('x) { + Tsv("input", ('x, 'y)) + .groupBy('x) { // DON'T USE MUTABLE, IT IS UNCOOL AND DANGEROUS!, but we test, just in case - _.foldLeft('y -> 'yset)(MSet[Int]()){(ms : MSet[Int], y : Int) => + _.foldLeft('y -> 'yset)(MSet[Int]()) { (ms: MSet[Int], y: Int) => ms += y ms } - }.write(Tsv("output")) + } + .write(Tsv("output")) } -class FoldJobTest extends Specification { +class FoldJobTest extends WordSpec with Matchers { import Dsl._ import scala.collection.mutable.{Set => MSet} - noDetailedDiffs() - val input = List((1,30),(1,10),(1,20),(2,0)) + val input = List((1, 30), (1, 10), (1, 20), (2, 0)) "A FoldTestJob" should { - JobTest("com.twitter.scalding.FoldJob") - .source(Tsv("input",('x,'y)), input) - 
.sink[(Int,MSet[Int])](Tsv("output")) { outBuf => + JobTest(new FoldJob(_)) + .source(Tsv("input", ('x, 'y)), input) + .sink[(Int, MSet[Int])](Tsv("output")) { outBuf => "Correctly do a fold with MutableSet" in { - outBuf.toSet must be_==(Set((1,MSet(10,20,30)),(2,MSet(0)))) + outBuf.toSet shouldBe Set((1, MSet(10, 20, 30)), (2, MSet(0))) } } .run // This can't be run in Hadoop mode because we can't serialize the list to Tsv - .finish + .finish() } } // TODO make a Product serializer that clean $outer parameters -case class V(v : Int) -class InnerCaseJob(args : Args) extends Job(args) { - val res = TypedTsv[Int]("input") - .mapTo(('xx, 'vx)) { x => (x*x, V(x)) } - .groupBy('xx) { _.head('vx) } - .map('vx -> 'x) { v : V => v.v } - .project('x, 'xx) - .write(Tsv("output")) +case class V(v: Int) +class InnerCaseJob(args: Args) extends Job(args) { + val res = TypedTsv[Int]("input") + .mapTo(('xx, 'vx))(x => (x * x, V(x))) + .groupBy('xx)(_.head('vx)) + .map('vx -> 'x) { v: V => v.v } + .project('x, 'xx) + .write(Tsv("output")) } -class InnerCaseTest extends Specification { - import Dsl._ +class InnerCaseTest extends WordSpec with Matchers { - noDetailedDiffs() - val input = List(Tuple1(1),Tuple1(2),Tuple1(2),Tuple1(4)) + val input = List(Tuple1(1), Tuple1(2), Tuple1(2), Tuple1(4)) "An InnerCaseJob" should { - JobTest(new com.twitter.scalding.InnerCaseJob(_)) + JobTest(new InnerCaseJob(_)) .source(TypedTsv[Int]("input"), input) - .sink[(Int,Int)](Tsv("output")) { outBuf => + .sink[(Int, Int)](Tsv("output")) { outBuf => "Correctly handle inner case classes" in { - outBuf.toSet must be_==(Set((1,1),(2,4),(4,16))) + outBuf.toSet shouldBe Set((1, 1), (2, 4), (4, 16)) } } .runHadoop - .finish + .finish() } } -class NormalizeJob(args : Args) extends Job(args) { - Tsv("in") - .read - .mapTo((0,1) -> ('x,'y)) { tup : (Double, Int) => tup } +class NormalizeJob(args: Args) extends Job(args) { + Tsv("in").read + .mapTo((0, 1) -> ('x, 'y)) { tup: (Double, Int) => tup } 
.normalize('x) .project('x, 'y) .write(Tsv("out")) } -class NormalizeTest extends Specification { - noDetailedDiffs() - +class NormalizeTest extends WordSpec with Matchers { "A NormalizeJob" should { - JobTest("com.twitter.scalding.NormalizeJob") - .source(Tsv("in"), List(("0.3", "1"), ("0.3", "1"), ("0.3", -"1"), ("0.3", "1"))) + JobTest(new NormalizeJob(_)) + .source(Tsv("in"), List(("0.3", "1"), ("0.3", "1"), ("0.3", "1"), ("0.3", "1"))) .sink[(Double, Int)](Tsv("out")) { outBuf => "must be normalized" in { - outBuf.size must_== 4 - outBuf.toSet must_==(Set((0.25,1),(0.25,1),(0.25,1),(0.25,1))) - } - } - .run - .finish - } -} - -class ApproxUniqJob(args : Args) extends Job(args) { - Tsv("in",('x,'y)) - .read - .groupBy('x) { _.approxUniques('y -> 'ycnt) } - .write(Tsv("out")) -} - -class ApproxUniqTest extends Specification { - import Dsl._ - noDetailedDiffs() - - "A ApproxUniqJob" should { - val input = (1 to 1000).flatMap { i => List(("x0", i), ("x1", i)) }.toList - JobTest("com.twitter.scalding.ApproxUniqJob") - .source(Tsv("in",('x,'y)), input) - .sink[(String, Double)](Tsv("out")) { outBuf => - "must approximately count" in { - outBuf.size must_== 2 - val kvresult = outBuf.groupBy { _._1 }.mapValues { _.head._2 } - kvresult("x0") must beCloseTo(1000.0, 30.0) //We should be 1%, but this is on average, so - kvresult("x1") must beCloseTo(1000.0, 30.0) //We should be 1%, but this is on average, so + outBuf should have size 4 + outBuf.toSet shouldBe Set((0.25, 1), (0.25, 1), (0.25, 1), (0.25, 1)) } } .run - .finish + .finish() } } -class ForceToDiskJob(args : Args) extends Job(args) { - val x = Tsv("in", ('x,'y)) - .read - .filter('x) { x : Int => x > 0 } +class ForceToDiskJob(args: Args) extends Job(args) { + val x = Tsv("in", ('x, 'y)).read + .filter('x) { x: Int => x > 0 } .rename('x -> 'x1) - Tsv("in",('x,'y)) - .read + Tsv("in", ('x, 'y)).read .joinWithTiny('y -> 'y, x.forceToDisk) - .project('x,'x1,'y) + .project('x, 'x1, 'y) .write(Tsv("out")) } -class 
ForceToDiskTest extends Specification { +class ForceToDiskTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() "A ForceToDiskJob" should { - val input = (1 to 1000).flatMap { i => List((-1, i), (1, i)) }.toList + var idx = 0 + val input = (1 to 1000).flatMap(i => List((-1, i), (1, i))).toList JobTest(new ForceToDiskJob(_)) - .source(Tsv("in",('x,'y)), input) - .sink[(Int,Int,Int)](Tsv("out")) { outBuf => - "run correctly when combined with joinWithTiny" in { - outBuf.size must_== 2000 - val correct = (1 to 1000).flatMap { y => List((1,1,y),(-1,1,y)) }.sorted - outBuf.toList.sorted must_== correct + .source(Tsv("in", ('x, 'y)), input) + .sink[(Int, Int, Int)](Tsv("out")) { outBuf => + (idx + ": run correctly when combined with joinWithTiny") in { + outBuf should have size 2000 + val correct = (1 to 1000).flatMap(y => List((1, 1, y), (-1, 1, y))).sorted + outBuf.toList.sorted shouldBe correct } + idx += 1 } .run .runHadoop - .finish + .finish() } } -class ThrowsErrorsJob(args : Args) extends Job(args) { - Tsv("input",('letter, 'x)) - .read +class ThrowsErrorsJob(args: Args) extends Job(args) { + Tsv("input", ('letter, 'x)).read .addTrap(Tsv("trapped")) - .map(('letter, 'x) -> 'yPrime){ fields : Product => - val x = fields.productElement(1).asInstanceOf[Int] - if (x == 1) throw new Exception("Erroneous Ones") else x } + .map(('letter, 'x) -> 'yPrime) { fields: Product => + val x = fields.productElement(1).asInstanceOf[Int] + if (x == 1) throw new Exception("Erroneous Ones") else x + } .write(Tsv("output")) } - -class ItsATrapTest extends Specification { +class ItsATrapTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() //Fixes an issue with scala 2.9 "An AddTrap" should { - val input = List(("a", 1),("b", 2), ("c", 3), ("d", 1), ("e", 2)) + val input = List(("a", 1), ("b", 2), ("c", 3), ("d", 1), ("e", 2)) JobTest(new ThrowsErrorsJob(_)) - .source(Tsv("input",('letter,'x)), input) + .source(Tsv("input", ('letter, 'x)), input) 
.sink[(String, Int)](Tsv("output")) { outBuf => "must contain all numbers in input except for 1" in { - outBuf.toList.sorted must be_==(List(("b", 2), ("c", 3), ("e", 2))) + outBuf.toList.sorted shouldBe List(("b", 2), ("c", 3), ("e", 2)) } } .sink[(String, Int)](Tsv("trapped")) { outBuf => "must contain all 1s and fields in input" in { - outBuf.toList.sorted must be_==(List(("a", 1), ("d", 1))) + outBuf.toList.sorted shouldBe List(("a", 1), ("d", 1)) } } .run - .finish + .finish() + } +} + +object TypedThrowsErrorsJob { + val input = TypedTsv[(String, Int)]("input") + val output = TypedTsv[(String, Int)]("output") + + def trans1(x: (String, Int)) = x match { case (str, int) => (str, int, int) } + val trap1 = TypedTsv[(String, Int, Int)]("trapped1") + + val trap2 = TypedTsv[(String, Int, Int, String)]("trapped2") + def trans2(x: (String, Int, Int)) = x match { case (str, int1, int2) => (str, int1, int2 * int1, str) } + + def trans3(x: (String, Int, Int, String)) = x match { case (str, int, _, _) => (str, int) } +} + +class TypedThrowsErrorsJob(args: Args) extends Job(args) { + import TypedThrowsErrorsJob._ + + TypedPipe + .from(input) + .map(trans1(_)) + .addTrap(trap1) + .map(tup => if (tup._2 == 1) throw new Exception("Oh no!") else trans2(tup)) + .addTrap(trap2) + .map(tup => if (tup._2 % 2 == 0) throw new Exception("Oh no!") else trans3(tup)) + .write(output) +} + +object TypedThrowsErrorsJob2 { + val input = TypedTsv[(String, Int)]("input") + val output = TypedTsv[(String, Int)]("output") + val trap = TypedTsv[(String, Int, Int)]("trapped1") + + def trans1(x: (String, Int)) = x match { case (str, int) => (str, int, int) } + def trans2(x: (String, Int, Int)) = x match { case (str, int1, int2) => (str, int1, int2 * int1, str) } + def trans3(x: (String, Int, Int, String)) = x match { case (str, int, _, _) => (str, int) } +} + +class TypedThrowsErrorsJob2(args: Args) extends Job(args) { + import TypedThrowsErrorsJob2._ + + TypedPipe + .from(input) + 
.map(trans1(_)) + .addTrap(trap) + .map(tup => if (tup._2 == 1) throw new Exception("Oh no!") else trans2(tup)) + .map(tup => if (tup._2 % 2 == 0) throw new Exception("Oh no!") else trans3(tup)) + .write(output) +} + +class TypedItsATrapTest extends WordSpec with Matchers { + + "A Typed AddTrap with many traps" should { + import TypedThrowsErrorsJob._ + + val data = List(("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5)) + + JobTest(new TypedThrowsErrorsJob(_)) + .source(input, data) + .typedSink(output) { outBuf => + "output must contain all odd except first" in { + outBuf.toList.sorted shouldBe List(("c", 3), ("e", 5)) + } + } + .typedSink(trap1) { outBuf => + "trap1 must contain only the first" in { + outBuf.toList.sorted shouldBe List(("a", 1, 1)) + } + } + .typedSink(trap2) { outBuf => + "trap2 must contain the even numbered" in { + outBuf.toList.sorted shouldBe List(("b", 2, 4, "b"), ("d", 4, 16, "d")) + } + } + .run + .finish() + } + + "A Typed AddTrap with many erroneous maps" should { + import TypedThrowsErrorsJob2._ + + val data = List(("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5)) + + JobTest(new TypedThrowsErrorsJob2(_)) + .source(input, data) + .typedSink(output) { outBuf => + "output must contain all odd except first" in { + outBuf.toList.sorted shouldBe List(("c", 3), ("e", 5)) + } + } + .typedSink(TypedThrowsErrorsJob2.trap) { outBuf => + "trap must contain the first and the evens" in { + outBuf.toList.sorted shouldBe List(("a", 1, 1), ("b", 2, 2), ("d", 4, 4)) + } + } + .run + .finish() } } class GroupAllToListTestJob(args: Args) extends Job(args) { TypedTsv[(Long, String, Double)]("input") - .mapTo('a, 'b) { case(id, k, v) => (id, Map(k -> v)) } - .groupBy('a) { _.sum[Map[String, Double]]('b) } + .mapTo('a, 'b) { case (id, k, v) => (id, Map(k -> v)) } + .groupBy('a)(_.sum[Map[String, Double]]('b)) .groupAll { _.toList[(Long, Map[String, Double])](('a, 'b) -> 'abList) } - .map('abList -> 'abMap) { - list : List[(Long, Map[String, Double])] => 
list.toMap + .map('abList -> 'abMap) { list: List[(Long, Map[String, Double])] => + list.toMap } .project('abMap) .map('abMap -> 'abMap) { x: AnyRef => x.toString } .write(Tsv("output")) } -class GroupAllToListTest extends Specification { - import Dsl._ - - noDetailedDiffs() +class GroupAllToListTest extends WordSpec with Matchers { "A GroupAllToListTestJob" should { val input = List((1L, "a", 1.0), (1L, "b", 2.0), (2L, "a", 1.0), (2L, "b", 2.0)) @@ -1463,19 +2023,19 @@ class GroupAllToListTest extends Specification { .source(TypedTsv[(Long, String, Double)]("input"), input) .sink[String](Tsv("output")) { outBuf => "must properly aggregate stuff into a single map" in { - outBuf.size must_== 1 - outBuf(0) must be_==(output.toString) + outBuf should have size 1 + outBuf(0) shouldBe output.toString } } .runHadoop - .finish + .finish() } } class ToListGroupAllToListTestJob(args: Args) extends Job(args) { TypedTsv[(Long, String)]("input") - .mapTo('b, 'c) { case(k, v) => (k, v) } - .groupBy('c) { _.toList[Long]('b -> 'bList) } + .mapTo('b, 'c) { case (k, v) => (k, v) } + .groupBy('c)(_.toList[Long]('b -> 'bList)) .groupAll { _.toList[(String, List[Long])](('c, 'bList) -> 'cbList) } @@ -1483,37 +2043,40 @@ class ToListGroupAllToListTestJob(args: Args) extends Job(args) { .write(Tsv("output")) } -class ToListGroupAllToListSpec extends Specification { - import Dsl._ - - noDetailedDiffs() +class ToListGroupAllToListSpec extends WordSpec with Matchers { val expected = List(("us", List(1)), ("jp", List(3, 2)), ("gb", List(3, 1))) "A ToListGroupAllToListTestJob" should { JobTest(new ToListGroupAllToListTestJob(_)) - .source(TypedTsv[(Long, String)]("input"), List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb"))) + .source( + TypedTsv[(Long, String)]("input"), + List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb")) + ) .sink[String](Tsv("output")) { outBuf => "must properly aggregate stuff in hadoop mode" in { - outBuf.size must_== 1 - outBuf.head must_== 
expected.toString + outBuf should have size 1 + outBuf.head shouldBe (expected.toString) println(outBuf.head) } } .runHadoop - .finish + .finish() JobTest(new ToListGroupAllToListTestJob(_)) - .source(TypedTsv[(Long, String)]("input"), List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb"))) + .source( + TypedTsv[(Long, String)]("input"), + List((1L, "us"), (1L, "gb"), (2L, "jp"), (3L, "jp"), (3L, "gb")) + ) .sink[List[(String, List[Long])]](Tsv("output")) { outBuf => "must properly aggregate stuff in local model" in { - outBuf.size must_== 1 - outBuf.head must_== expected + outBuf should have size 1 + outBuf.head shouldBe expected println(outBuf.head) } } .run - .finish + .finish() } } @@ -1551,137 +2114,151 @@ class HangingTest extends Specification { } .run .runHadoop - .finish + .finish() } } -*/ + */ -class Function2Job(args : Args) extends Job(args) { +class Function2Job(args: Args) extends Job(args) { import FunctionImplicits._ - Tsv("in", ('x,'y)).mapTo(('x, 'y) -> 'xy) { (x: String, y: String) => x + y }.write(Tsv("output")) + Tsv("in", ('x, 'y)).mapTo(('x, 'y) -> 'xy)((x: String, y: String) => x + y).write(Tsv("output")) } -class Function2Test extends Specification { +class Function2Test extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() //Fixes an issue with scala 2.9 "A Function2Job" should { val input = List(("a", "b")) - JobTest("com.twitter.scalding.Function2Job") - .source(Tsv("in",('x,'y)), input) + JobTest(new Function2Job(_)) + .source(Tsv("in", ('x, 'y)), input) .sink[String](Tsv("output")) { outBuf => "convert a function2 to tupled function1" in { - outBuf must be_==(List("ab")) + outBuf shouldBe List("ab") } } .run - .finish + .finish() } } - -class SampleWithReplacementJob(args : Args) extends Job(args) { +class SampleWithReplacementJob(args: Args) extends Job(args) { val input = Tsv("in").read .sampleWithReplacement(1.0, 0) .write(Tsv("output")) } -class SampleWithReplacementTest extends Specification { +class 
SampleWithReplacementTest extends WordSpec with Matchers { import com.twitter.scalding.mathematics.Poisson val p = new Poisson(1.0, 0) - val simulated = (1 to 100).map{ - i => i -> p.nextInt - }.filterNot(_._2 == 0).toSet + val simulated = (1 to 100) + .map { i => + i -> p.nextInt + } + .filterNot(_._2 == 0) + .toSet - noDetailedDiffs() "A SampleWithReplacementJob" should { - JobTest("com.twitter.scalding.SampleWithReplacementJob") - .source(Tsv("in"), (1 to 100).map(i => i) ) - .sink[Int](Tsv("output")) { outBuf => () + JobTest(new SampleWithReplacementJob(_)) + .source(Tsv("in"), (1 to 100).map(i => i)) + .sink[Int](Tsv("output")) { outBuf => "sampleWithReplacement must sample items according to a poisson distribution" in { - outBuf.toList.groupBy(i => i) - .map(p => p._1 -> p._2.size) - .filterNot(_._2 == 0).toSet must_== simulated + outBuf.toList + .groupBy(i => i) + .map(p => p._1 -> p._2.size) + .filterNot(_._2 == 0) + .toSet shouldBe simulated } } .run - .finish + .finish() } } class VerifyTypesJob(args: Args) extends Job(args) { Tsv("input", new Fields("age", "weight")) - .addTrap(Tsv("trap")) + .addTrap(Tsv("trap")) .verifyTypes[(Int, Int)]('age -> 'weight) .verifyTypes[Int]('weight) .write(Tsv("output")) } -class VerifyTypesJobTest extends Specification { +class VerifyTypesJobTest extends WordSpec with Matchers { "Verify types operation" should { "put bad records in a trap" in { - val input = List((3, "aaa"),(23,154),(15,"123"),(53,143),(7,85),(19,195), - (42,187),(35,165),(68,121),(13,"34"),(17,173),(2,13),(2,"break")) - - JobTest(new com.twitter.scalding.VerifyTypesJob(_)) - .source(Tsv("input", new Fields("age", "weight")), input) - .sink[(Int, Int)](Tsv("output")) { outBuf => - outBuf.toList.size must_== input.size - 2 - } - .sink[(Any, Any)](Tsv("trap")) { outBuf => - outBuf.toList.size must_== 2 - } - .run - .finish + val input = List( + (3, "aaa"), + (23, 154), + (15, "123"), + (53, 143), + (7, 85), + (19, 195), + (42, 187), + (35, 165), + (68, 
121), + (13, "34"), + (17, 173), + (2, 13), + (2, "break") + ) + + JobTest(new VerifyTypesJob(_)) + .source(Tsv("input", new Fields("age", "weight")), input) + .sink[(Int, Int)](Tsv("output")) { outBuf => + outBuf.toList should have size (input.size - 2) + } + .sink[(Any, Any)](Tsv("trap")) { outBuf => + outBuf.toList should have size 2 + } + .run + .finish() - } - } + } + } } -class SortingJob(args : Args) extends Job(args) { - Tsv("in", ('x, 'y, 'z)) - .read +class SortingJob(args: Args) extends Job(args) { + Tsv("in", ('x, 'y, 'z)).read .groupAll(_.sortBy('y)) .write(Tsv("output")) } -class SortingJobTest extends Specification { +class SortingJobTest extends WordSpec with Matchers { import Dsl._ - noDetailedDiffs() "A SortingJob" should { JobTest(new SortingJob(_)) - .source(Tsv("in", ('x, 'y, 'z)), (1 to 100).map(i => (i, i*i % 5, i*i*i)) ) - .sink[(Int,Int,Int)](Tsv("output")) { outBuf => + .source(Tsv("in", ('x, 'y, 'z)), (1 to 100).map(i => (i, i * i % 5, i * i * i))) + .sink[(Int, Int, Int)](Tsv("output")) { outBuf => "keep all the columns" in { - val correct = (1 to 100).map(i => (i, i*i % 5, i*i*i)).toList.sortBy(_._2) - outBuf.toList must_==(correct) + val correct = (1 to 100).map(i => (i, i * i % 5, i * i * i)).toList.sortBy(_._2) + outBuf.toList shouldBe correct } } .run - .finish + .finish() } } class CollectJob(args: Args) extends Job(args) { Tsv("input", new Fields("name", "age")) - .collectTo[(String, Int), String](('name, 'age) -> 'adultFirstNames) - { case (name, age) if age > 18 => name.split(" ").head } + .collectTo[(String, Int), String](('name, 'age) -> 'adultFirstNames) { + case (name, age) if age > 18 => name.split(" ").head + } .write(Tsv("output")) } -class CollectJobTest extends Specification { - noDetailedDiffs() +class CollectJobTest extends WordSpec with Matchers { "A CollectJob" should { - val input = List(("steve m", 21),("john f",89),("s smith", 12),("jill q",55),("some child",8)) - val expectedOutput = input.collect{ case (name, 
age) if age > 18 => name.split(" ").head } + val input = List(("steve m", 21), ("john f", 89), ("s smith", 12), ("jill q", 55), ("some child", 8)) + val expectedOutput = input.collect { case (name, age) if age > 18 => name.split(" ").head } - JobTest(new com.twitter.scalding.CollectJob(_)) + JobTest(new CollectJob(_)) .source(Tsv("input", new Fields("name", "age")), input) .sink[String](Tsv("output")) { outBuf => - outBuf.toList must be_==(expectedOutput) + outBuf.toList shouldBe expectedOutput } - .run.finish + .run + .finish() } } @@ -1691,8 +2268,7 @@ class FilterJob(args: Args) extends Job(args) { .write(Tsv("output")) } -class FilterJobTest extends Specification { - noDetailedDiffs() +class FilterJobTest extends WordSpec with Matchers { "A FilterJob" should { val input = List(("steve m", 21), ("john f", 89), ("s smith", 12), ("jill q", 55), ("some child", 8)) val expectedOutput = input.filter(_._2 > 18) @@ -1700,9 +2276,10 @@ class FilterJobTest extends Specification { JobTest(new com.twitter.scalding.FilterJob(_)) .source(Tsv("input", new Fields("name", "age")), input) .sink[(String, Int)](Tsv("output")) { outBuf => - outBuf.toList must be_==(expectedOutput) + outBuf.toList shouldBe expectedOutput } - .run.finish + .run + .finish() } } @@ -1712,8 +2289,7 @@ class FilterNotJob(args: Args) extends Job(args) { .write(Tsv("output")) } -class FilterNotJobTest extends Specification { - noDetailedDiffs() +class FilterNotJobTest extends WordSpec with Matchers { "A FilterNotJob" should { val input = List(("steve m", 21), ("john f", 89), ("s smith", 12), ("jill q", 55), ("some child", 8)) val expectedOutput = input.filterNot(_._2 > 18) @@ -1721,9 +2297,10 @@ class FilterNotJobTest extends Specification { JobTest(new com.twitter.scalding.FilterNotJob(_)) .source(Tsv("input", new Fields("name", "age")), input) .sink[(String, Int)](Tsv("output")) { outBuf => - outBuf.toList must be_==(expectedOutput) + outBuf.toList shouldBe expectedOutput } - .run.finish + .run + 
.finish() } } @@ -1731,49 +2308,81 @@ class CounterJob(args: Args) extends Job(args) { val foo_bar = Stat("foo_bar") val age_group_older_than_18 = Stat("age_group_older_than_18") val reduce_hit = Stat("reduce_hit") - age_group_older_than_18 Tsv("input", new Fields("name", "age")) - .filter('age){ age : Int => + .filter('age) { age: Int => foo_bar.incBy(2) true } - .collect[(String, Int), String](('name, 'age) -> 'adultFirstNames) { case (name, age) if age > 18 => - age_group_older_than_18.inc - name.split(" ").head + .collect[(String, Int), String](('name, 'age) -> 'adultFirstNames) { + case (name, age) if age > 18 => + age_group_older_than_18.inc() + name.split(" ").head } - .groupAll{ - _.reduce('age -> 'sum_of_ages) { - (acc : Int, age : Int) => - reduce_hit.inc - acc + age + .groupAll { + _.reduce('age -> 'sum_of_ages) { (acc: Int, age: Int) => + reduce_hit.inc() + acc + age } } .write(Tsv("output")) } -class CounterJobTest extends Specification { - noDetailedDiffs() +class CounterJobTest extends WordSpec with Matchers { "A CounterJob" should { - val input = List(("steve m", 21),("john f",89),("s smith", 12),("jill q",55),("some child",8)) - val expectedOutput = input.collect{ case(name, age) if age > 18 => age}.sum.toString + val input = List(("steve m", 21), ("john f", 89), ("s smith", 12), ("jill q", 55), ("some child", 8)) + val expectedOutput = input.collect { case (name, age) if age > 18 => age }.sum.toString "have the right counter and output values" in { - JobTest(new com.twitter.scalding.CounterJob(_)) + JobTest(new CounterJob(_)) .source(Tsv("input", new Fields("name", "age")), input) - .sink[String](Tsv("output")) { outBuf => outBuf(0) must be_==(expectedOutput)} - .counter("foo_bar") { _ must_== 10 } - .counter("age_group_older_than_18") { _ must_== 3 } - .counter("reduce_hit") { _ must_== 2 } - .counter("bad_group_bad_counter") { _ must_== 0 } + .sink[String](Tsv("output"))(outBuf => outBuf(0) shouldBe expectedOutput) + .counter("foo_bar")(_ 
shouldBe 10) + .counter("age_group_older_than_18")(_ shouldBe 3) + .counter("reduce_hit")(_ shouldBe 2) + .counter("bad_group_bad_counter")(_ shouldBe 0) // This is redundant but just added here to show both methods for counter tests - .counters { _ must_== Map( - "foo_bar" -> 10, - "age_group_older_than_18" -> 3, - "reduce_hit" -> 2 - ) + .counters { + _ shouldBe Map("foo_bar" -> 10, "age_group_older_than_18" -> 3, "reduce_hit" -> 2) } .run - .finish + .finish() } } } + +object DailySuffixTsvJob { + val strd1 = "2014-05-01" + val strd2 = "2014-05-02" + implicit val tz: java.util.TimeZone = DateOps.UTC + implicit val parser: DateParser = DateParser.default + implicit val dr: DateRange = DateRange(RichDate(strd1), RichDate(strd2)) + + def source(str: String) = DailySuffixTsv(str) +} + +class DailySuffixTsvJob(args: Args) extends Job(args) with UtcDateRangeJob { + import TDsl._ + DailySuffixTsvJob + .source("input0") + .read + .toTypedPipe[(String, Int)]((0, 1)) + .write(TypedTsv[(String, Int)]("output0")) +} + +class DailySuffixTsvTest extends WordSpec with Matchers { + val data = List(("aaa", 1), ("bbb", 2)) + + "A DailySuffixTsv Source" should { + import DailySuffixTsvJob._ + JobTest(new DailySuffixTsvJob(_)) + .arg("date", strd1 + " " + strd2) + .source(source("input0"), data) + .sink[(String, Int)](TypedTsv[(String, Int)]("output0")) { buf => + "read and write data" in { + buf shouldBe data + } + } + .run + .finish() + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala new file mode 100644 index 0000000000..f8158e785a --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/CumulativeSumTest.scala @@ -0,0 +1,92 @@ +package com.twitter.scalding + +import org.scalatest.WordSpec + +import com.twitter.scalding.typed.CumulativeSum._ + +class AddRankingWithCumulativeSum(args: Args) extends Job(args) { + TypedPipe + .from(TypedTsv[(String, 
Double)]("input1")) + .map { case (gender, height) => + (gender, (height, 1L)) + } + .cumulativeSum + .map { case (gender, (height, rank)) => + (gender, height, rank) + } + .write(TypedTsv("result1")) +} + +class AddRankingWithPartitionedCumulativeSum(args: Args) extends Job(args) { + TypedPipe + .from(TypedTsv[(String, Double)]("input1")) + .map { case (gender, height) => + (gender, (height, 1L)) + } + .cumulativeSum(h => (h / 100).floor.toLong) + .map { case (gender, (height, rank)) => + (gender, height, rank) + } + .write(TypedTsv("result1")) +} + +class CumulativeSumTest1 extends WordSpec { + + // --- A simple ranking job + val sampleInput1 = List( + ("male", "165.2"), + ("female", "172.2"), + ("male", "184.1"), + ("male", "125.4"), + ("female", "128.6"), + ("male", "265.2"), + ("female", "272.2"), + ("male", "284.1"), + ("male", "225.4"), + ("female", "228.6") + ) + + // Each group sorted and ranking added highest person to shortest + val expectedOutput1 = Set( + ("male", 184.1, 3), + ("male", 165.2, 2), + ("male", 125.4, 1), + ("female", 172.2, 2), + ("female", 128.6, 1), + ("male", 284.1, 6), + ("male", 265.2, 5), + ("male", 225.4, 4), + ("female", 272.2, 4), + ("female", 228.6, 3) + ) + + "A simple ranking cumulative sum job" should { + JobTest("com.twitter.scalding.AddRankingWithCumulativeSum") + .source(TypedTsv[(String, Double)]("input1"), sampleInput1) + .sink[(String, Double, Long)](TypedTsv[(String, Double, Long)]("result1")) { outBuf1 => + "produce correct number of records when filtering out null values" in { + assert(outBuf1.size === 10) + } + "create correct ranking per group, 1st being the heighest person of that group" in { + assert(outBuf1.toSet === expectedOutput1) + } + } + .run + .finish() + } + + "A partitioned ranking cumulative sum job" should { + JobTest("com.twitter.scalding.AddRankingWithPartitionedCumulativeSum") + .source(TypedTsv[(String, Double)]("input1"), sampleInput1) + .sink[(String, Double, Long)](TypedTsv[(String, Double, 
Long)]("result1")) { outBuf1 => + "produce correct number of records when filtering out null values" in { + assert(outBuf1.size === 10) + } + "create correct ranking per group, 1st being the heighest person of that group" in { + assert(outBuf1.toSet === expectedOutput1) + } + } + .run + .finish() + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala new file mode 100644 index 0000000000..e61ec12c0a --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionAppProperties.scala @@ -0,0 +1,67 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import org.scalacheck.Properties +import org.scalacheck.Prop.forAll +import org.scalacheck.Prop._ + +// Be careful here in that Array[String] equality isn't contents based. its java referenced based. 
+object ExecutionAppProperties extends Properties("ExecutionApp Properties") { + def debugPrint( + inputArgs: Array[String], + resultingHadoop: HadoopArgs, + resultingNonHadoop: NonHadoopArgs + ): Unit = { + val errorMsg = "Input Args: " + inputArgs.map("\"" + _ + "\"").mkString(",") + "\n" + + "Hadoop Args: " + resultingHadoop.toArray.mkString(",") + "\n" + + "Non-Hadoop Args: " + resultingNonHadoop.toArray.mkString(",") + "\n" + sys.error(errorMsg) + } + + property("Non-hadoop random args will all end up in the right bucket") = forAll { (args: Array[String]) => + val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(args) + val res = hadoopArgs.toArray.isEmpty && nonHadoop.toArray.sameElements(args) + if (!res) debugPrint(args, hadoopArgs, nonHadoop) + res + } + + property("adding an hadoop lib jars in the middle will extract it right") = forAll { + (leftArgs: Array[String], rightArgs: Array[String]) => + // in the process of validating the hadoop args we give this to generic options parser + // as a result this file must exist. the parser enforces this. 
+ val inputHadoopArgs = Array("-libjars", "/etc/hosts") + val totalArgStr = leftArgs ++ inputHadoopArgs ++ rightArgs + val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(totalArgStr) + val res = (!hadoopArgs.toArray.isEmpty) && + (nonHadoop.toArray.sameElements(leftArgs ++ rightArgs)) && + (inputHadoopArgs.sameElements(hadoopArgs.toArray)) + if (!res) debugPrint(totalArgStr, hadoopArgs, nonHadoop) + res + } + + property("adding an hadoop -D parameter in the middle will extract it right") = forAll { + (leftArgs: Array[String], rightArgs: Array[String]) => + val inputHadoopArgs = Array("-Dx.y.z=123") + val totalArgStr = leftArgs ++ inputHadoopArgs ++ rightArgs + val (hadoopArgs, nonHadoop) = ExecutionApp.extractUserHadoopArgs(totalArgStr) + val res = (!hadoopArgs.toArray.isEmpty) && + (nonHadoop.toArray.sameElements(leftArgs ++ rightArgs)) && + (inputHadoopArgs.sameElements(hadoopArgs.toArray)) + if (!res) debugPrint(totalArgStr, hadoopArgs, nonHadoop) + res + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala new file mode 100644 index 0000000000..237746ff51 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionOptimizationRulesTest.scala @@ -0,0 +1,314 @@ +package com.twitter.scalding + +import cascading.flow.FlowDef +import cascading.pipe.Pipe +import cascading.scheme.NullScheme +import cascading.tap.Tap +import cascading.tuple.{Fields, Tuple} +import com.twitter.scalding.dagon.{Dag, Rule} +import com.twitter.maple.tap.MemorySourceTap +import com.twitter.scalding.typed.TypedPipeGen +import java.io.{InputStream, OutputStream} +import java.util.UUID +import org.scalacheck.{Arbitrary, Gen} +import org.scalatest.FunSuite +import org.scalatest.prop.PropertyChecks +import scala.collection.JavaConverters._ +import scala.collection.mutable.Buffer + +class ExecutionOptimizationRulesTest extends 
FunSuite with PropertyChecks { + class MemorySource[T: TupleConverter](inFields: Fields = Fields.NONE) + extends Mappable[T] + with TypedSink[T] { + private[this] val buf = Buffer[Tuple]() + private[this] val name: String = UUID.randomUUID.toString + + def setter[U <: T] = TupleSetter.asSubSetter(TupleSetter.singleSetter[T]) + + override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = + mode match { + case cl: CascadingLocal => + val tap = new MemoryTap(new NullScheme(sinkFields, sinkFields), buf) + flowDef.addSink(name, tap) + flowDef.addTail(new Pipe(name, pipe)) + pipe + case _ => sys.error("MemorySink only usable with cascading local") + } + + def fields = + if (inFields.isNone && setter.arity > 0) { + Dsl.intFields(0 until setter.arity) + } else inFields + + override def converter[U >: T]: TupleConverter[U] = + TupleConverter.asSuperConverter[T, U](implicitly[TupleConverter[T]]) + + private lazy val hdfsTap: Tap[_, _, _] = new MemorySourceTap(buf.asJava, fields) + + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = { + if (readOrWrite == Write) { + sys.error("IterableSource is a Read-only Source") + } + mode match { + case Local(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), buf) + case Test(_) => new MemoryTap[InputStream, OutputStream](new NullScheme(fields, fields), buf) + case Hdfs(_, _) => hdfsTap + case HadoopTest(_, _) => hdfsTap + case _ => throw ModeException("Unsupported mode for IterableSource: " + mode.toString) + } + } + } + + val pipe: Gen[Execution[TypedPipe[Int]]] = + TypedPipeGen.genWithIterableSources.map(pipe => Execution.from(pipe)) + + case class PlusOne() extends (TypedPipe[Int] => TypedPipe[Int]) { + override def apply(p: TypedPipe[Int]): TypedPipe[Int] = p.map(_ + 1) + } + + case class PlusI(i: Int) extends (TypedPipe[Int] => TypedPipe[Int]) { + override def apply(p: TypedPipe[Int]): TypedPipe[Int] = p.map(_ + i) + } + + def mapped(exec: 
Gen[Execution[TypedPipe[Int]]]): Gen[Execution[TypedPipe[Int]]] = + exec.flatMap { pipe => + Gen.frequency( + (1, Execution.Mapped(pipe, PlusOne())), + (5, Arbitrary.arbitrary[Int].map(i => Execution.Mapped(pipe, PlusI(i)))) + ) + } + + case class ReplaceTo[T](to: Execution[TypedPipe[Int]]) + extends (TypedPipe[Int] => Execution[TypedPipe[Int]]) { + override def apply(v1: TypedPipe[Int]): Execution[TypedPipe[Int]] = to + } + + def flatMapped(exec: Gen[Execution[TypedPipe[Int]]]): Gen[Execution[TypedPipe[Int]]] = + exec.flatMap { from => + exec.map { to => + from.flatMap(ReplaceTo(to)) + } + } + + def zipped[A, B](left: Gen[Execution[A]], right: Gen[Execution[B]]): Gen[Execution[(A, B)]] = + for { + one <- left + two <- right + } yield Execution.Zipped(one, two) + + def write(pipe: Gen[TypedPipe[Int]]): Gen[Execution[TypedPipe[Int]]] = + pipe.map(_.writeThrough(new MemorySource[Int]())) + + val mappedOrFlatMapped = + Gen.oneOf(mapped(pipe), flatMapped(pipe)) + + val zippedWrites = + zipped(write(TypedPipeGen.genWithIterableSources), write(TypedPipeGen.genWithIterableSources)) + + val mappedWrites = + mapped(write(TypedPipeGen.genWithIterableSources)) + + val zippedFlatMapped = + Gen.oneOf( + zipped(flatMapped(pipe), flatMapped(pipe)), + zipped(mappedWrites, flatMapped(pipe)), + zipped(flatMapped(pipe), mappedWrites) + ) + + val zippedMapped = + Gen.oneOf( + zipped(mappedWrites, mappedOrFlatMapped), + zipped(mappedOrFlatMapped, mappedWrites) + ) + + val genExec = + Gen.oneOf( + zippedWrites, + zipped(mappedOrFlatMapped, write(TypedPipeGen.genWithIterableSources)), + zipped(write(TypedPipeGen.genWithIterableSources), mappedOrFlatMapped) + ) + + val iterableExec = + Gen + .oneOf( + zippedWrites, + zippedFlatMapped, + zippedMapped, + zipped(mappedOrFlatMapped, write(TypedPipeGen.genWithIterableSources)), + zipped(write(TypedPipeGen.genWithIterableSources), mappedOrFlatMapped) + ) + .map { exec => + exec + .flatMap { case (left, right) => + 
left.toIterableExecution.zip(right.toIterableExecution) + } + .map { case (left, right) => + left ++ right + } + .map { + _.toList.sorted + } + } + + import ExecutionOptimizationRules._ + + val allRules = List( + ZipWrite, + ZipMap, + ZipFlatMap, + MapWrite, + FuseMaps + ) + + def genRuleFrom(rs: List[Rule[Execution]]): Gen[Rule[Execution]] = + for { + c <- Gen.choose(1, rs.size) + rs <- Gen.pick(c, rs) + } yield rs.reduce(_.orElse(_)) + + val genRule = genRuleFrom(allRules) + + def invert[T](exec: Execution[T]) = + assert(toLiteral(exec).evaluate == exec) + + // how many writes (not hidden inside FlatMap) are there + def writeCount[A](ex: Execution[A]): Int = { + val (dag, _) = Dag(ex, ExecutionOptimizationRules.toLiteral) + dag.allNodes.count { + case Execution.WriteExecution(_, _, _) => true + case _ => false + } + } + + test("randomly generated executions trees are invertible") { + forAll(genExec) { exec => + invert(exec) + } + } + + test("optimization rules are reproducible") { + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) + + forAll(genExec, genRule) { (exec, rule) => + val optimized = ExecutionOptimizationRules.apply(exec, rule) + val optimized2 = ExecutionOptimizationRules.apply(exec, rule) + assert(optimized == optimized2) + } + } + + test("standard rules are reproducible") { + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) + + forAll(genExec) { exec => + val optimized = ExecutionOptimizationRules.stdOptimizations(exec) + val optimized2 = ExecutionOptimizationRules.stdOptimizations(exec) + assert(optimized == optimized2) + } + } + + def runAndCompare[A](origin: Execution[A], opt: Execution[A]) = { + val config = Config.unitTestDefault.setExecutionOptimization(false) + + assert( + origin.waitFor(config, Local(true)).get == + opt.waitFor(config, Local(true)).get + ) + } + + test("all optimization rules don't change 
results") { + forAll(iterableExec, genRule) { (e, r) => + val opt = ExecutionOptimizationRules.apply(e, r) + runAndCompare(e, opt) + } + } + + test("zip of writes merged") { + forAll(zippedWrites) { e => + val opt = ExecutionOptimizationRules.apply(e, ZipWrite) + + assert(e.isInstanceOf[Execution.Zipped[_, _]]) + assert(writeCount(opt) == 1) + } + } + + test("zip with const is optimized") { + val pipe = TypedPipe.from(List(1, 2, 3)) + val sink = new MemorySource[Int]() + + val job0 = pipe + .writeExecution(sink) + .zip(Execution.from("hello")) + .zip(pipe.writeExecution(sink)) + + assert(writeCount(job0) == 2) + + assert(writeCount(ExecutionOptimizationRules.stdOptimizations(job0)) == 1) + + val job1 = pipe + .writeExecution(sink) + .zip(Execution.from("hello").zip(pipe.writeExecution(sink))) + assert(writeCount(job1) == 2) + + assert(writeCount(ExecutionOptimizationRules.stdOptimizations(job1)) == 1) + + val job2 = pipe + .writeExecution(sink) + .zip(Execution.from("world")) + .zip(Execution.from("hello").zip(pipe.writeExecution(sink))) + + assert(writeCount(job2) == 2) + assert(writeCount(ExecutionOptimizationRules.stdOptimizations(job2)) == 1) + } + + test("push map fn into write") { + forAll(mappedWrites) { e => + val opt = ExecutionOptimizationRules.apply(e, MapWrite) + + assert(e.isInstanceOf[Execution.Mapped[_, _]]) + assert(opt.isInstanceOf[Execution.WriteExecution[_]]) + } + } + + test("push map into down after zip") { + forAll(zippedMapped) { e => + val opt = ExecutionOptimizationRules.apply(e, ZipMap) + + e match { + case Execution.Zipped(one: Execution.Mapped[s, t], two) => + assert(true) + case Execution.Zipped(one, two: Execution.Mapped[s, t]) => + assert(true) + case _ => + fail(s"$e is not zipped with map") + } + + opt match { + case Execution.Zipped(one: Execution.Mapped[s, t], two) => + fail(s"$opt didn't push map into zip") + case Execution.Zipped(one, two: Execution.Mapped[s, t]) => + fail(s"$opt didn't push map into zip") + case _ => + 
assert(true) + } + } + } + + test("push zip into flat map") { + forAll(zippedFlatMapped) { e => + val opt = ExecutionOptimizationRules.apply(e, ZipFlatMap) + + e match { + case Execution.Zipped(one: Execution.FlatMapped[s, t], two) => + assert(true) + case Execution.Zipped(one, two: Execution.FlatMapped[s, t]) => + assert(true) + case _ => + fail(s"$e is not zipped with flat mapped") + } + + assert(opt.isInstanceOf[Execution.FlatMapped[_, _]]) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala new file mode 100644 index 0000000000..80018c2af1 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionTest.scala @@ -0,0 +1,1256 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MacroEqualityOrderedSerialization +import com.twitter.scalding.serialization.OrderedSerialization +import java.nio.file.Files +import java.io.File +import java.util +import java.util.concurrent.CountDownLatch +import org.scalatest.{Matchers, WordSpec} +import scala.collection.JavaConverters._ +import scala.concurrent.{ExecutionContext => ConcurrentExecutionContext, Future, Promise} +import scala.util.{Failure, Success, Try} +import cascading.flow.{Flow, FlowDef, FlowListener} +import com.twitter.scalding.typed.cascading_backend.AsyncFlowDefRunner.TempFileCleanup +import com.twitter.scalding.cascading_interop.FlowListenerPromise.FlowStopException +import org.apache.hadoop.conf.Configuration + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +object ExecutionTestJobs { + def wordCount(in: String, out: String) = + TypedPipe + .from(TextLine(in)) + .flatMap(_.split("\\s+")) + .map((_, 1L)) + .sumByKey + .writeExecution(TypedTsv(out)) + + def wordCount2(in: TypedPipe[String]) = + in + .flatMap(_.split("\\s+")) + .map((_, 1L)) + .sumByKey + .toIterableExecution + + def zipped(in1: TypedPipe[Int], in2: TypedPipe[Int]) = + in1.groupAll.sum.values.toIterableExecution + .zip(in2.groupAll.sum.values.toIterableExecution) + + def mergeFanout(in: List[Int]): Execution[Iterable[(Int, Int)]] = { + // Force a reduce, so no fancy optimizations kick in + val source = TypedPipe.from(in).groupBy(_ % 3).head + + (source.mapValues(_ * 2) ++ (source.mapValues(_ * 3))).toIterableExecution + } + + def writeExecutionWithTempFile(tempFile: String, testData: List[String]): Execution[List[String]] = { + val forced = TypedPipe.from(testData).map(s => s).forceToDiskExecution + + Execution + .withConfig(forced)(conf => conf + ("hadoop.tmp.dir" -> tempFile)) + .flatMap(_.toIterableExecution) + .map(_.toList) + } +} + +abstract class 
TestExecutionJob[+T](args: Args) extends ExecutionJob[T](args) { + // In tests, classloader issues with sbt mean we should not + // really use threads, so we run immediately + override def concurrentExecutionContext = new scala.concurrent.ExecutionContext { + def execute(r: Runnable) = r.run + def reportFailure(t: Throwable) = () + } +} + +class WordCountEc(args: Args) extends TestExecutionJob[Unit](args) { + def execution = ExecutionTestJobs.wordCount(args("input"), args("output")) +} + +class ExecutionWithTempFiles(args: Args, tempFile: String, testData: List[String]) + extends TestExecutionJob[List[String]](args) { + override def execution = ExecutionTestJobs.writeExecutionWithTempFile(tempFile, testData) +} + +class ZippedExecutionWithTempFiles( + args: Args, + tempFileOne: String, + tempFileTwo: String, + testDataOne: List[String], + testDataTwo: List[String] +) extends TestExecutionJob[(List[String], List[String])](args) { + override def execution = { + val executionOne = ExecutionTestJobs.writeExecutionWithTempFile(tempFileOne, testDataOne) + val executionTwo = ExecutionTestJobs.writeExecutionWithTempFile(tempFileTwo, testDataTwo) + executionOne.zip(executionTwo) + } +} + +case class MyCustomType(s: String) + +class NormalJobToExecutionTestJob(args: Args) extends Job(args) { + TypedPipe + .from(0 to 100) + .groupBy(_ % 3) + .sum + .write(source.NullSink) +} + +class FlowListenerWithException extends FlowListener { + override def onStarting(flow: Flow[_]): Unit = + throw new RuntimeException("something goes wrong") + + override def onCompleted(flow: Flow[_]): Unit = {} + + override def onStopping(flow: Flow[_]): Unit = {} + + override def onThrowable(flow: Flow[_], throwable: Throwable): Boolean = false +} + +class ExecutionTest extends WordSpec with Matchers { + implicit class ExecutionTestHelper[T](ex: Execution[T]) { + def shouldSucceed(): T = { + val r = ex.waitFor(Config.default, Local(true)) + r match { + case Success(s) => s + case Failure(e) => 
fail(s"Failed running execution, exception:\n$e") + } + } + def shouldSucceedHadoop(): T = { + val mode = Hdfs(true, new Configuration) + val r = ex.waitFor(Config.defaultFrom(mode), mode) + r match { + case Success(s) => s + case Failure(e) => fail(s"Failed running execution, exception:\n$e") + } + } + def shouldFail(): Unit = { + val r = ex.waitFor(Config.default, Local(true)) + assert(r.isFailure) + } + def shouldFailWith(message: String): Unit = { + val r = ex.waitFor(Config.default, Local(true)) + assert(r.isFailure) + r.failed.get.getMessage shouldBe message + } + } + + def getShutdownHooks: Seq[Thread] = { + // The list of attached shutdown hooks are not accessible normally, so we must use reflection to get them + val clazz = Class.forName("java.lang.ApplicationShutdownHooks") + val hooksField = clazz.getDeclaredField("hooks") + hooksField.setAccessible(true) + hooksField.get(null).asInstanceOf[util.IdentityHashMap[Thread, Thread]].asScala.keys.toSeq + } + + def isTempFileCleanupHook(hook: Thread): Boolean = + classOf[TempFileCleanup].isAssignableFrom(hook.getClass) + + "An Execution" should { + "run" in { + ExecutionTestJobs + .wordCount2(TypedPipe.from(List("a b b c c c", "d d d d"))) + .waitFor(Config.default, Local(false)) + .get + .toMap shouldBe Map("a" -> 1L, "b" -> 2L, "c" -> 3L, "d" -> 4L) + } + "run with zip" in { + (ExecutionTestJobs + .zipped(TypedPipe.from(0 until 100), TypedPipe.from(100 until 200)) + .shouldSucceed() match { + case (it1, it2) => (it1.head, it2.head) + }) shouldBe ((0 until 100).sum, (100 until 200).sum) + } + "run with exception in flow listener" in { + val exec = ExecutionTestJobs.wordCount2(TypedPipe.from(List("a", "b"))) + + Execution + .withConfig(exec) { config => + config.addFlowListener((_, _) => new FlowListenerWithException()) + } + .shouldFailWith("Flow was stopped") + } + "lift to try" in { + val res = ExecutionTestJobs + .wordCount2(TypedPipe.from(List("a", "b"))) + .liftToTry + .shouldSucceed() + + 
assert(res.isSuccess) + } + "lift to try on exception" in { + val res: Try[Nothing] = ExecutionTestJobs + .wordCount2(TypedPipe.from(List("a", "b"))) + .map(_ => throw new RuntimeException("Something went wrong")) + .liftToTry + .shouldSucceed() + + assert(res.isFailure) + } + "merge fanouts without error" in { + def unorderedEq[T](l: Iterable[T], r: Iterable[T]): Boolean = + (l.size == r.size) && (l.toSet == r.toSet) + + def correct(l: List[Int]): List[(Int, Int)] = { + val in = l.groupBy(_ % 3).mapValues(_.head) + in.mapValues(_ * 2).toList ++ in.mapValues(_ * 3) + } + val input = (0 to 100).toList + val result = ExecutionTestJobs.mergeFanout(input).waitFor(Config.default, Local(false)).get + val cres = correct(input) + unorderedEq(cres, result.toList) shouldBe true + } + "If either fails, zip fails, else we get success" in { + val neverHappens = Promise[Int]().future + Execution + .fromFuture(_ => neverHappens) + .zip(Execution.failed(new Exception("oh no"))) + .shouldFail() + + Execution + .failed(new Exception("oh no")) + .zip(Execution.fromFuture(_ => neverHappens)) + .shouldFail() + // If both are good, we succeed: + Execution + .from(1) + .zip(Execution.from("1")) + .shouldSucceed() shouldBe (1, "1") + } + + "If one write fails, the other gets cancelled" in { + @volatile var cancelledEx: Option[Throwable] = None + + val failedTp: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) + val failedEx: Execution[Iterable[Int]] = failedTp.toIterableExecution + + val mapCountDownLatch = new CountDownLatch(1) + + val blockingTp: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values.map { i => + // block until we are done + mapCountDownLatch.await() + i + } + + val onCompleteCountDownLatch = new CountDownLatch(1) + val otherEx: Execution[Iterable[Int]] = blockingTp.toIterableExecution.onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx = t.failed.toOption + } + 
onCompleteCountDownLatch.countDown() + } + + val zipped = failedEx.zip(otherEx) + + zipped.shouldFail() + + // wait for onComplete to finish + onCompleteCountDownLatch.await() + + // execution should be cancelled and the flow stopped + assert(cancelledEx.get.isInstanceOf[FlowStopException]) + + // finish counting down on the map to release the thread + mapCountDownLatch.countDown() + + // do the same on the other side + @volatile var cancelledEx2: Option[Throwable] = None + + val failedTp2: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) + val failedEx2: Execution[Iterable[Int]] = failedTp2.toIterableExecution + + val mapCountDownLatch2 = new CountDownLatch(1) + + val blockingTp2: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values.map { i => + // block until we are done + mapCountDownLatch2.await() + i + } + + val onCompleteCountDownLatch2 = new CountDownLatch(1) + val otherEx2: Execution[Iterable[Int]] = blockingTp2.toIterableExecution.onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx2 = t.failed.toOption + } + onCompleteCountDownLatch2.countDown() + } + + val zipped2 = otherEx2.zip(failedEx2) + + zipped2.shouldFail() + + // wait for onComplete to finish + onCompleteCountDownLatch2.await() + + // execution should be cancelled and the flow stopped + assert(cancelledEx2.get.isInstanceOf[FlowStopException]) + + // finish counting down on the map to release the thread + mapCountDownLatch2.countDown() + } + + "If one write fails, the flatmapped execution gets cancelled" in { + @volatile var cancelledEx: Option[Throwable] = None + + val failedTp: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) + val failedEx: Execution[Iterable[Int]] = failedTp.toIterableExecution + + val mapCountDownLatch = new CountDownLatch(1) + + val otherTp: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values + + val onCompleteCountDownLatch = new 
CountDownLatch(1) + val otherEx: Execution[Iterable[Int]] = otherTp.toIterableExecution + .flatMap { _ => + TypedPipe + .from(Seq(2)) + .groupAll + .sum + .values + .map { i => + // block until we are done + mapCountDownLatch.await() + i + } + .toIterableExecution + } + .onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx = t.failed.toOption + } + onCompleteCountDownLatch.countDown() + } + + val zipped = failedEx.zip(otherEx) + + zipped.shouldFail() + + // wait for onComplete to finish + onCompleteCountDownLatch.await() + + // execution should be cancelled and the flow stopped + assert(cancelledEx.get.isInstanceOf[FlowStopException]) + + // finish counting down on the map to release the thread + mapCountDownLatch.countDown() + + // do the same on the other side + @volatile var cancelledEx2: Option[Throwable] = None + + val failedTp2: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) + val failedEx2: Execution[Iterable[Int]] = failedTp2.toIterableExecution + + val mapCountDownLatch2 = new CountDownLatch(1) + + val otherTp2: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values + + val onCompleteCountDownLatch2 = new CountDownLatch(1) + val otherEx2: Execution[Iterable[Int]] = otherTp2.toIterableExecution + .flatMap { _ => + TypedPipe + .from(Seq(2)) + .groupAll + .sum + .values + .map { i => + // block until we are done + mapCountDownLatch2.await() + i + } + .toIterableExecution + } + .onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx2 = t.failed.toOption + } + onCompleteCountDownLatch2.countDown() + } + + val zipped2 = otherEx2.zip(failedEx2) + + zipped2.shouldFail() + + // wait for onComplete to finish + onCompleteCountDownLatch2.await() + + // execution should be cancelled and the flow stopped + assert(cancelledEx2.get.isInstanceOf[FlowStopException]) + + // finish counting down on the map to release the thread + mapCountDownLatch2.countDown() + } + 
+ "recoverWith may fail to match" in { + val exception = new RuntimeException() + + val result = Execution + .from[Unit] { + throw exception + } + .recoverWith { case _: NullPointerException => + Execution.unit + } + .waitFor(Config.default, Local(true)) + + result shouldBe Failure(exception) + } + + "recover from failure" in { + val tp = TypedPipe.from(Seq(1)).groupAll.sum.values.map(_ => throw new Exception("oh no")) + val recoveredTp = TypedPipe.from(Seq(2)).groupAll.sum.values + val recoveredEx = tp.toIterableExecution.recoverWith { case t: Throwable => + recoveredTp.toIterableExecution + } + + val res = recoveredEx.shouldSucceed() + res shouldBe List(2) + } + + "not recover when cancelled by another execution" in { + @volatile var cancelledEx: Option[Throwable] = None + + val failedTp: TypedPipe[Int] = + TypedPipe.from(Seq(0)).groupAll.sum.values.map(_ => throw new Exception("oh no")) + val failedEx: Execution[Iterable[Int]] = failedTp.toIterableExecution + + val mapCountDownLatch = new CountDownLatch(1) + + val blockingTp: TypedPipe[Int] = TypedPipe.from(Seq(1)).groupAll.sum.values.map { i => + // block until we are done + mapCountDownLatch.await() + i + } + + val onCompleteCountDownLatch = new CountDownLatch(1) + val recoveredTp = TypedPipe.from(Seq(2)) + val otherEx: Execution[Iterable[Int]] = blockingTp.toIterableExecution + .recoverWith { case t: Throwable => + recoveredTp.toIterableExecution + } + .onComplete { t => + if (t.isFailure) { + // capture the exception + cancelledEx = t.failed.toOption + } + onCompleteCountDownLatch.countDown() + } + + val zipped = failedEx.zip(otherEx) + + zipped.shouldFail() + + // wait for onComplete to finish + onCompleteCountDownLatch.await() + + // execution should be cancelled and the flow stopped + assert(cancelledEx.get.isInstanceOf[FlowStopException]) + + // finish counting down on the map to release the thread + mapCountDownLatch.countDown() + } + + "Config transformer will isolate Configs" in { + def 
doesNotHaveVariable(message: String) = Execution.getConfig.flatMap { cfg => + if (cfg.get("test.cfg.variable").isDefined) + Execution.failed(new Exception(s"$message\n: var: ${cfg.get("test.cfg.variable")}")) + else + Execution.from(()) + } + + val hasVariable = Execution.getConfig.flatMap { cfg => + if (cfg.get("test.cfg.variable").isEmpty) + Execution.failed(new Exception("Should see variable inside of transform")) + else + Execution.from(()) + } + + def addOption(cfg: Config) = cfg.+("test.cfg.variable", "dummyValue") + + doesNotHaveVariable("Should not see variable before we've started transforming") + .flatMap(_ => Execution.withConfig(hasVariable)(addOption)) + .flatMap(_ => doesNotHaveVariable("Should not see variable in flatMap's after the isolation")) + .map(_ => true) + .shouldSucceed() shouldBe true + } + + "Config transformer will interact correctly with the cache" in { + var incrementIfDefined = 0 + var totalEvals = 0 + + val incrementor = Execution.getConfig.flatMap { cfg => + totalEvals += 1 + if (cfg.get("test.cfg.variable").isDefined) + incrementIfDefined += 1 + Execution.from(()) + } + + def addOption(cfg: Config) = cfg.+("test.cfg.variable", "dummyValue") + + // Here we run without the option, with the option, and finally without again. + incrementor + .flatMap(_ => Execution.withConfig(incrementor)(addOption)) + .flatMap(_ => incrementor) + .map(_ => true) + .shouldSucceed() shouldBe true + + assert(incrementIfDefined === 1) + // We should evaluate once for the default config, and once for the modified config. 
+ assert(totalEvals === 2) + } + + "Config transformer will interact correctly with the cache when writing" in { + import java.io._ + val srcF = File.createTempFile("tmpoutputLocation", ".tmp").getAbsolutePath + val sinkF = File.createTempFile("tmpoutputLocation2", ".tmp").getAbsolutePath + + def writeNums(nums: List[Int]): Unit = { + val pw = new PrintWriter(new File(srcF)) + pw.write(nums.mkString("\n")) + pw.close + } + + writeNums(List(1, 2, 3)) + + val sink = TypedTsv[Int](sinkF) + val src = TypedTsv[Int](srcF) + val operationTP = (TypedPipe.from(src) ++ TypedPipe + .from((1 until 100).toList)).writeExecution(sink).getCounters.map(_._2.toMap) + + def addOption(cfg: Config) = cfg.+("test.cfg.variable", "dummyValue") + + // Here we run without the option, with the option, and finally without again. + val (oldCounters, newCounters) = operationTP + .flatMap { oc => + writeNums(List(1, 2, 3, 4, 5, 6, 7)) + Execution.withConfig(operationTP)(addOption).map(nc => (oc, nc)) + } + .shouldSucceed() + + assert( + oldCounters != newCounters, + "With new configs given the source changed we shouldn't cache so the counters should be different" + ) + + } + + "correctly add cached file into config" in { + val execution = Execution.withCachedFile("/path/to/your/file.txt") { cachedFile => + Execution.getConfig.map { config => + config.getDistributedCachedFiles should contain only cachedFile + } + } + + execution.waitFor(Config.default, Hdfs(strict = true, new Configuration(false))) match { + case Success(s) => s + case Failure(e) => fail(s"Failed running execution, exception:\n$e") + } + } + + "correctly add cached files into config" in { + val execution = + Execution.withCachedFile("/path/to/your/one.txt") { one => + Execution.withCachedFile("/path/to/your/second.txt") { second => + Execution.getConfig.map { config => + config.getDistributedCachedFiles should contain only (one, second) + } + } + } + + execution.waitFor(Config.default, Hdfs(strict = true, new 
Configuration(false))) match { + case Success(s) => s + case Failure(e) => fail(s"Failed running execution, exception:\n$e") + } + } + } + + "ExecutionApp" should { + val parser = new ExecutionApp { def job = Execution.from(()) } + "parse hadoop args correctly" in { + val conf = parser.config(Array("-Dmapred.reduce.tasks=100", "--local"))._1 + conf.get("mapred.reduce.tasks") should contain("100") + conf.getArgs.boolean("local") shouldBe true + + val (conf1, Hdfs(_, hconf)) = parser.config(Array("--test", "-Dmapred.reduce.tasks=110", "--hdfs")) + conf1.get("mapred.reduce.tasks") should contain("110") + conf1.getArgs.boolean("test") shouldBe true + hconf.get("mapred.reduce.tasks") shouldBe "110" + } + } + "An ExecutionJob" should { + "run correctly" in { + JobTest(new WordCountEc(_)) + .arg("input", "in") + .arg("output", "out") + .source(TextLine("in"), List((0, "hello world"), (1, "goodbye world"))) + .typedSink(TypedTsv[(String, Long)]("out")) { outBuf => + outBuf.toMap shouldBe Map("hello" -> 1L, "world" -> 2L, "goodbye" -> 1L) + } + .run + .runHadoop + .finish() + } + } + "Executions" should { + "work correctly with flowDef from user" in { + class PipeBuilderJob(args: Args) extends TestExecutionJob[Unit](args) { + override def execution: Execution[Unit] = + Execution.getMode.flatMap { mode => + val flowDef: FlowDef = new FlowDef + + pipeBuilder(flowDef, mode) + + Execution.fromFn((_, _) => flowDef) + } + + def pipeBuilder(implicit flowDef: FlowDef, mode: Mode): TypedPipe[Int] = + TypedPipe + .from(TextLine(args("input"))) + .map(_.toInt) + .map(_ * 2) + .write(TypedTsv[Int]("out")) + } + + val input = List((0, "1"), (1, "2"), (2, "3"), (3, "4"), (4, "5")) + val expected = input.map(_._2).map(_.toInt).map(_ * 2) + + JobTest(new PipeBuilderJob(_)) + .arg("input", "in") + .source(TextLine("in"), input) + .typedSink(TypedTsv[Int]("out")) { outBuf => + outBuf.toList shouldBe expected + } + .run + .runHadoop + .finish() + } + + "shutdown hook should clean up temporary 
files" in { + val tempFileOne = Files.createTempDirectory("scalding-execution-test") + val tempFileTwo = Files.createTempDirectory("scalding-execution-test") + val mode = Test(Map()) + + Files.exists(tempFileOne) should be(true) + Files.exists(tempFileTwo) should be(true) + + val cleanupThread = + TempFileCleanup(List(tempFileOne.toFile.getAbsolutePath, tempFileTwo.toFile.getAbsolutePath), mode) + cleanupThread.run() + + Files.exists(tempFileOne) should be(false) + Files.exists(tempFileTwo) should be(false) + } + + "clean up temporary files on exit" in { + val tempFile = Files.createTempDirectory("scalding-execution-test").toFile.getAbsolutePath + val testData = List("a", "b", "c") + getShutdownHooks.foreach { hook: Thread => + isTempFileCleanupHook(hook) should be(false) + } + + ExecutionTestJobs + .writeExecutionWithTempFile(tempFile, testData) + .shouldSucceedHadoop() + + // This is hacky, but there's a small chance that the new cleanup hook isn't registered by the time we get here + // A small sleep like this appears to be sufficient to ensure we can see it + Thread.sleep(1000) + val cleanupHook = getShutdownHooks.find(isTempFileCleanupHook) + cleanupHook shouldBe defined + + val files = cleanupHook.get.asInstanceOf[TempFileCleanup].filesToCleanup + + assert(files.size == 1) + assert(files.head.contains(tempFile)) + cleanupHook.get.run() + // Remove the hook so it doesn't show up in the list of shutdown hooks for other tests + Runtime.getRuntime.removeShutdownHook(cleanupHook.get) + } + + "clean up temporary files on finish" in { + val tempFile = Files.createTempDirectory("scalding-execution-test").toFile.getAbsolutePath + val testData = List("a", "b", "c") + + val ex = ExecutionTestJobs.writeExecutionWithTempFile(tempFile, testData) + val onFinish = Execution.withConfig(ex)(_.setExecutionCleanupOnFinish(true)) + onFinish.shouldSucceedHadoop() + + // This is hacky, but there's a small chance that the cleanup thread has not finished + // running by the time we 
check below + // A small sleep like this appears to be sufficient to ensure we can see it + Thread.sleep(1000) + val f = new File(tempFile) + def allChildren(f: File): List[File] = + if (f.isDirectory) f.listFiles().toList.flatMap(allChildren(_)) + else List(f) + + assert(allChildren(f).isEmpty, f.toString) + } + + "clean up temporary files on exit with a zip" in { + val tempFileOne = Files.createTempDirectory("scalding-execution-test").toFile.getAbsolutePath + val tempFileTwo = Files.createTempDirectory("scalding-execution-test").toFile.getAbsolutePath + val testDataOne = List("a", "b", "c") + val testDataTwo = List("x", "y", "z") + getShutdownHooks.foreach { hook: Thread => + isTempFileCleanupHook(hook) should be(false) + } + + ExecutionTestJobs + .writeExecutionWithTempFile(tempFileOne, testDataOne) + .zip(ExecutionTestJobs.writeExecutionWithTempFile(tempFileTwo, testDataTwo)) + .shouldSucceedHadoop() + + // This is hacky, but there's a small chance that the new cleanup hook isn't registered by the time we get here + // A small sleep like this appears to be sufficient to ensure we can see it + Thread.sleep(1000) + val cleanupHook = getShutdownHooks.find(isTempFileCleanupHook) + cleanupHook shouldBe defined + + val files = cleanupHook.get.asInstanceOf[TempFileCleanup].filesToCleanup + + assert(files.size == 2) + assert(files.head.contains(tempFileOne) || files.head.contains(tempFileTwo)) + assert(files(1).contains(tempFileOne) || files(1).contains(tempFileTwo)) + cleanupHook.get.run() + // Remove the hook so it doesn't show up in the list of shutdown hooks for other tests + Runtime.getRuntime.removeShutdownHook(cleanupHook.get) + } + + "evaluate once per run" in { + var first = 0 + var second = 0 + var third = 0 + val e1 = Execution.from { first += 1; 42 } + val e2 = e1.flatMap { x => + second += 1 + Execution.from(2 * x) + } + val e3 = e1.map { x => third += 1; x * 3 } + + /** + * Notice both e3 and e2 need to evaluate e1. 
+ */ + val res = e3.zip(e2) + res.shouldSucceed() + assert((first, second, third) == (1, 1, 1)) + } + "zip does not duplicate counters" in { + val c1 = Execution + .withId { implicit uid => + val stat = Stat("test") + val e1 = TypedPipe + .from(0 until 100) + .map { x => + stat.inc + x + } + .writeExecution(source.NullSink) + + e1.zip(e1) + } + .getCounters + .map { case (_, c) => c("test") } + + val c2 = Execution + .withId { implicit uid => + val stat = Stat("test") + val e2 = TypedPipe + .from(0 until 100) + .map { x => + stat.inc + x + } + .writeExecution(source.NullSink) + + e2.flatMap(Execution.from(_)).zip(e2) + } + .getCounters + .map { case (_, c) => c("test") } + + c1.shouldSucceed() should ===(100) + c2.shouldSucceed() should ===(100) + } + "zip does not duplicate pure counters" in { + val c1 = { + val e1 = TypedPipe + .from(0 until 100) + .tallyAll("scalding", "test") + .writeExecution(source.NullSink) + + e1.zip(e1).getCounters.map { case (_, c) => + println(c.toMap) + c(("test", "scalding")) + } + } + + val c2 = { + val e2 = TypedPipe + .from(0 until 100) + .tallyAll("scalding", "test") + .writeExecution(source.NullSink) + + e2.flatMap(Execution.from(_)).zip(e2).getCounters.map { case (_, c) => + println(c.toMap) + c(("test", "scalding")) + } + } + + c1.shouldSucceed() should ===(100) + c2.shouldSucceed() should ===(100) + } + + "Running a large loop won't exhaust boxed instances" in { + var timesEvaluated = 0 + import com.twitter.scalding.serialization.macros.impl.BinaryOrdering._ + // Attempt to use up 4 boxed classes for every execution + def baseExecution(idx: Int): Execution[Unit] = TypedPipe + .from(0 until 1000) + .map(_.toShort) + .flatMap { i => + timesEvaluated += 1 + List((i, i), (i, i)) + } + .sumByKey + .map { case (k, v) => + (k.toInt, v) + } + .sumByKey + .map { case (k, v) => + (k.toLong, v) + } + .sumByKey + .map { case (k, v) => + (k.toString, v) + } + .sumByKey + .map { case (k, v) => + (MyCustomType(k), v) + } + .sumByKey + 
.writeExecution(TypedTsv(s"/tmp/asdf_$idx")) + + implicitly[OrderedSerialization[MyCustomType]] match { + case mos: MacroEqualityOrderedSerialization[_] => + assert(mos.uniqueId == "com.twitter.scalding.MyCustomType") + case _ => + sys.error( + "Ordered serialization should have been the MacroEqualityOrderedSerialization for this test" + ) + } + def executionLoop(idx: Int): Execution[Unit] = + if (idx > 0) + baseExecution(idx).flatMap(_ => executionLoop(idx - 1)) + else + Execution.unit + + executionLoop(55).shouldSucceed() + assert(timesEvaluated == 55 * 1000, "Should run the 55 execution loops for 1000 elements") + } + + "evaluate shared portions just once, writeExecution" in { + + var timesEvaluated = 0 + val baseTp = TypedPipe + .from(0 until 1000) + .flatMap { i => + timesEvaluated += 1 + List(i, i) + } + .fork + + val fde1 = baseTp.map(_ * 3).writeExecution(TypedTsv("/tmp/asdf")) + val fde2 = baseTp.map(_ * 5).writeExecution(TypedTsv("/tmp/asdf2")) + + val res = fde1.zip(fde2) + + res.shouldSucceed() + assert( + timesEvaluated == 1000, + "Should share the common sub section of the graph when we zip two write Executions" + ) + } + + "evaluate shared portions just once, forceToDiskExecution" in { + + var timesEvaluated = 0 + val baseTp = TypedPipe + .from(0 until 1000) + .flatMap { i => + timesEvaluated += 1 + List(i, i) + } + .fork + + val fde1 = baseTp.map(_ * 3).forceToDiskExecution + val fde2 = baseTp.map(_ * 5).forceToDiskExecution + + val res = fde1.zip(fde2) + + res.shouldSucceed() + assert( + timesEvaluated == 1000, + "Should share the common sub section of the graph when we zip two write Executions" + ) + } + + "evaluate shared portions just once, forceToDiskExecution with execution cache" in { + + var timesEvaluated = 0 + val baseTp = TypedPipe + .from(0 until 1000) + .flatMap { i => + timesEvaluated += 1 + List(i, i) + } + .fork + + val fde1 = baseTp.map(_ * 3).forceToDiskExecution + val fde2 = baseTp.map(_ * 5).forceToDiskExecution + + val res = 
fde1.zip(fde2).flatMap(_ => fde1).flatMap(_.toIterableExecution) + + res.shouldSucceed() + assert( + timesEvaluated == 1000, + "Should share the common sub section of the graph when we zip two write Executions and then flatmap" + ) + } + + "Ability to do isolated caches so we don't exhaust memory" in { + + def memoryWastingExecutionGenerator(id: Int): Execution[Array[Long]] = + Execution.withNewCache(Execution.from(id).flatMap { idx => + Execution.from(Array.fill(4000000)(idx.toLong)) + }) + + def writeAll(numExecutions: Int): Execution[Unit] = + if (numExecutions > 0) { + memoryWastingExecutionGenerator(numExecutions).flatMap { _ => + writeAll(numExecutions - 1) + } + } else { + Execution.from(()) + } + + writeAll(400).shouldSucceed() + } + "handle failure" in { + val result = Execution.withParallelism(Seq(Execution.failed(new Exception("failed"))), 1) + + result.shouldFail() + } + + "handle an error running in parallel" in { + val executions = + Execution.failed(new Exception("failed")) :: 0.to(10).map(i => Execution.from[Int](i)).toList + + val result = Execution.withParallelism(executions, 3) + + result.shouldFail() + } + + "run in parallel" in { + val executions = 0.to(10).map(i => Execution.from[Int](i)).toList + + val result = Execution.withParallelism(executions, 3) + + assert(result.shouldSucceed() == 0.to(10).toSeq) + } + + "block correctly" in { + var seen = 0 + def updateSeen(idx: Int): Unit = { + assert(seen === idx) + seen += 1 + } + + val executions = 0 + .to(10) + .map { i => + Execution + .from[Int](i) + .map { i => Thread.sleep(10 - i); i } + .onComplete(t => updateSeen(t.get)) + } + .toList + .reverse + + val result = Execution.withParallelism(executions, 1) + + assert(result.shouldSucceed() == 0.to(10).reverse) + } + + "can hashCode, compare, and run a long sequence" in { + val execution = Execution.sequence((1 to 100000).toList.map(Execution.from(_))) + assert(execution.hashCode == execution.hashCode) + + assert(execution == execution) + + 
assert(execution.shouldSucceed() == (1 to 100000).toList) + } + + "caches a withId Execution computation" in { + var called = false + val execution = Execution.withId { id => + assert(!called) + called = true + Execution.from("foobar") + } + + val doubleExecution = execution.zip(execution) + + assert(doubleExecution.shouldSucceed() == ("foobar", "foobar")) + assert(called) + } + + "maintains equality and hashCode after reconstruction" when { + // Make two copies of these. Comparison by reference + // won't match between the two. + val futureF = { _: ConcurrentExecutionContext => Future.successful(10) } + val futureF2 = { _: ConcurrentExecutionContext => Future.successful(10) } + val fnF = { (_: Config, _: Mode) => null } + val fnF2 = { (_: Config, _: Mode) => null } + val withIdF = { _: UniqueID => Execution.unit } + val withIdF2 = { _: UniqueID => Execution.unit } + val mapF = { _: Int => 12 } + val mapF2 = { _: Int => 12 } + + def reconstructibleLaws[T](ex: => Execution[T], ex2: Execution[T]): Unit = { + assert(ex == ex) + assert(ex.hashCode == ex.hashCode) + assert(ex != ex2) + } + + "Execution.fromFuture" in { + reconstructibleLaws(Execution.fromFuture(futureF), Execution.fromFuture(futureF2)) + } + + "Execution.fromFn" in { + reconstructibleLaws(Execution.fromFn(fnF), Execution.fromFn(fnF2)) + } + + "Execution.withId" in { + reconstructibleLaws(Execution.withId(withIdF), Execution.withId(withIdF2)) + } + + "Execution#map" in { + reconstructibleLaws(Execution.fromFuture(futureF).map(mapF), Execution.fromFuture(futureF).map(mapF2)) + } + + "Execution.zip" in { + reconstructibleLaws( + Execution.zip(Execution.fromFuture(futureF2), Execution.withId(withIdF)), + Execution.zip(Execution.fromFuture(futureF2), Execution.withId(withIdF2)) + ) + } + + "Execution.sequence" in { + reconstructibleLaws( + Execution.sequence( + Seq( + Execution.fromFuture(futureF), + Execution.withId(withIdF), + Execution.fromFuture(futureF2).map(mapF) + ) + ), + Execution.sequence( + 
Seq(Execution.fromFuture(futureF), Execution.withId(withIdF), Execution.fromFn(fnF)) + ) + ) + } + } + + "Has consistent hashCode and equality for mutable" when { + // These cases are a bit convoluted, but we still + // want equality to be consistent + trait MutableX[T] { + protected var x: Int + def setX(newX: Int): Unit = x = newX + def makeExecution: Execution[T] + } + + case class FromFutureMutable(var x: Int = 0) + extends Function1[ConcurrentExecutionContext, Future[Int]] + with MutableX[Int] { + def apply(context: ConcurrentExecutionContext) = Future.successful(x) + def makeExecution = Execution.fromFuture(this) + } + case class FromFnMutable(var x: Int = 0) extends Function2[Config, Mode, Null] with MutableX[Unit] { + def apply(config: Config, mode: Mode) = null + def makeExecution = Execution.fromFn(this) + } + case class WithIdMutable(var x: Int = 0) + extends Function1[UniqueID, Execution[Int]] + with MutableX[Int] { + def apply(id: UniqueID) = Execution.fromFuture(FromFutureMutable(x)) + def makeExecution = Execution.withId(this) + } + val mapFunction = { x: Int => x * x } + case class MapMutable(var x: Int = 0) extends MutableX[Int] { + val m = FromFutureMutable(x) + override def setX(newX: Int) = { + x = newX + m.setX(x) + } + def makeExecution = m.makeExecution.map(mapFunction) + } + case class ZipMutable(var x: Int = 0) extends MutableX[(Int, Int)] { + val m1 = FromFutureMutable(x) + val m2 = WithIdMutable(x) + override def setX(newX: Int) = { + x = newX + m1.setX(x) + m2.setX(x + 20) + } + def makeExecution = m1.makeExecution.zip(m2.makeExecution) + } + case class SequenceMutable(var x: Int = 0) extends MutableX[Seq[Int]] { + val m1 = FromFutureMutable(x) + val m2 = WithIdMutable(x) + override def setX(newX: Int) = { + x = newX + m1.setX(x) + m2.setX(x * 3) + } + def makeExecution = Execution.sequence(Seq(m1.makeExecution, m2.makeExecution)) + } + + def mutableLaws[T, U <: MutableX[T]](mutableGen: => U, expectedOpt: Option[Int => T] = None): Unit = 
{ + expectedOpt.foreach { expected => + require(expected(10) != expected(20)) + } + def validate(ex: Execution[T], seed: Int): Unit = + expectedOpt.foreach { expected => + assert(ex.shouldSucceed() == expected(seed)) + } + + val mutable1 = mutableGen + mutable1.setX(10) + val ex1 = mutable1.makeExecution + + val mutable2 = mutableGen + mutable2.setX(10) + val ex2 = mutable2.makeExecution + + assert(ex1 == ex2) + assert(ex1.hashCode == ex2.hashCode) + + validate(ex1, 10) + validate(ex2, 10) + + mutable2.setX(20) + // We may have the same hashCode still, but we don't need to + assert(ex1 != ex2) + validate(ex2, 20) + + val mutable3 = mutableGen + mutable3.setX(20) + val ex3 = mutable3.makeExecution + + assert(ex1 != ex3) + validate(ex3, 20) + + mutable3.setX(10) + if (ex1 == ex3) { + // If they are made equal again, the hashCodes must match + assert(ex1.hashCode == ex3.hashCode) + } + validate(ex3, 10) + } + + "Execution.fromFuture" in { + mutableLaws(FromFutureMutable(), Some { x: Int => x }) + } + + "Execution.fromFn" in { + mutableLaws(FromFnMutable(), Option.empty[Int => Unit]) + } + + "Execution.withId" in { + mutableLaws(WithIdMutable(), Some { x: Int => x }) + } + + "Execution#map" in { + mutableLaws(MapMutable(), Some { x: Int => x * x }) + } + + "Execution#zip" in { + mutableLaws(ZipMutable(), Some { x: Int => (x, x + 20) }) + } + + "Execution.sequence" in { + mutableLaws(SequenceMutable(), Some { x: Int => Seq(x, x * 3) }) + } + } + } + + "Simple jobs" should { + "convert to Execution and run" in { + val ex = Job.toExecutionFromClass( + classOf[NormalJobToExecutionTestJob], + Execution.failed(new Exception("couldn't run")) + ) + val res = ex.waitFor(Config.empty, Local(true)) + assert(res.isSuccess) + } + "convert ExecutionJob to Execution" in { + val test = JobTest(new WordCountEc(_)) + .arg("input", "in") + .arg("output", "out") + .source(TextLine("in"), List((0, "hello world"), (1, "goodbye world"))) + .typedSink(TypedTsv[(String, Long)]("out")) { outBuf 
=> + outBuf.toMap shouldBe Map("hello" -> 1L, "world" -> 2L, "goodbye" -> 1L) + } + val ex = Job.toExecutionFromClass(classOf[WordCountEc], Execution.failed(new Exception("oh no"))) + val check = + for { + _ <- ex + mode <- Execution.getMode + _ = test.postRunChecks(mode) + } yield () + + val conf = Config.empty.setArgs(test.getArgs) + val mode = test.getTestMode(useHadoop = false) + assert(check.waitFor(conf, mode).isSuccess) + } + } + + "toIterableExecution" should { + "work in TypedSource" in { + val workingDir = System.getProperty("user.dir") + val job = TypedPipe.from(TextLine(workingDir + "/../tutorial/data/hello.txt")).toIterableExecution + assert(job.waitFor(Config.empty, Local(true)).get.toList == List("Hello world", "Goodbye world")) + } + "work in a mapped TypedSource" in { + val workingDir = System.getProperty("user.dir") + val job = + TypedPipe.from(TextLine(workingDir + "/../tutorial/data/hello.txt")).map(_.size).toIterableExecution + assert( + job.waitFor(Config.empty, Local(true)).get.toList == List("Hello world", "Goodbye world").map(_.size) + ) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala new file mode 100644 index 0000000000..a0b88aa207 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExecutionUtilTest.scala @@ -0,0 +1,47 @@ +package com.twitter.scalding + +import org.scalatest.{Matchers, WordSpec} + +class ExecutionUtilTest extends WordSpec with Matchers { + import ExecutionUtil._ + + implicit val tz: java.util.TimeZone = DateOps.UTC + implicit val dp: DateParser = DateParser.default + implicit val dateRange: DateRange = DateRange.parse("2015-01-01", "2015-01-10") + + def run[T](e: Execution[T]) = { + val mode = Local(true) + e.waitFor(Config.defaultFrom(mode), mode) + } + + def testJob(dr: DateRange) = { + assert(dr != null) + TypedPipe + .from[Int](Seq(1, 2, 3)) + .toIterableExecution + 
.map(_.head) + } + + def testJobFailure(dr: DateRange) = + throw new Exception(s"failed: $dr") + + "ExecutionUtil" should { + "run multiple jobs" in { + val days = dateRange.each(Days(1)).toSeq + val result = runDatesWithParallelism(Days(1))(testJob) + assert(run(result).get == days.map(d => (d, 1))) + } + + "run multiple jobs with executions" in { + val days = dateRange.each(Days(1)).toSeq + val result = runDateRangeWithParallelism(Days(1))(testJob) + assert(run(result).get == days.map(d => 1)) + } + + "run multiple jobs with executions and sum results" in { + val days = dateRange.each(Days(1)).toSeq + val result = runDateRangeWithParallelismSum(Days(1))(testJob) + assert(run(result).get == days.map(d => 1).sum) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala new file mode 100644 index 0000000000..e478924bb7 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ExpandLibJarsGlobsTest.scala @@ -0,0 +1,91 @@ +package com.twitter.scalding + +import java.io.File +import org.scalatest.{Matchers, WordSpec} + +class ExpandLibJarsGlobsTest extends WordSpec with Matchers { + def touch(parent: File, p: String): String = { + val f = new File(parent, p) + f.createNewFile + f.getAbsolutePath + } + + def getTmpRoot = { + val tmpRoot = + new File(System.getProperty("java.io.tmpdir"), scala.util.Random.nextInt(Int.MaxValue).toString) + require(tmpRoot.mkdirs(), "Failed to make temporary directory") + tmpRoot.deleteOnExit() + tmpRoot + } + + "ExpandLibJarsGlobs" should { + "expand entries" in { + + val tmpRoot = getTmpRoot + // Has a side effect, but returns us the jars absolute paths + val jars = (0 until 20).map { idx => + touch(tmpRoot, s"myF_$idx.jar") + } ++ (0 until 20).map { idx => + touch(tmpRoot, s".myHidden.jar.myF_$idx.jar") + } + + val resultingLibJars1 = + ExpandLibJarsGlobs(Array("-libjars", 
s"${tmpRoot.getAbsolutePath}/*.jar"))(1).split(",") + assert(resultingLibJars1.sorted.toList == jars.sorted.toList) + + val resultingLibJars2 = + ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/"))(1).split(",") + assert(resultingLibJars2.sorted.toList == jars.sorted.toList) + + val resultingLibJars3 = + ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*"))(1).split(",") + assert(resultingLibJars3.sorted.toList == jars.sorted.toList) + } + + "Skips over unmatched entries" in { + val tmpRoot = getTmpRoot + + // Has a side effect, but returns us the jars absolute paths + val jars = (0 until 20).map { idx => + touch(tmpRoot, s"myF_$idx.jar") + } ++ (0 until 20).map { idx => + touch(tmpRoot, s".myHidden.jar.myF_$idx.jar") + } + + val resultingLibJars1 = ExpandLibJarsGlobs(Array("-libjars", s"${tmpRoot.getAbsolutePath}/*.zip"))(1) + .split(",") + .filter(_.nonEmpty) + assert(resultingLibJars1.isEmpty) + } + + "Multiple paths in libjars" in { + val tmpRoot1 = getTmpRoot + val tmpRoot2 = getTmpRoot + + // Has a side effect, but returns us the jars absolute paths + val jars1 = (0 until 20).map { idx => + touch(tmpRoot1, s"myF_$idx.jar") + } ++ (0 until 20).map { idx => + touch(tmpRoot1, s".myHidden.jar.myF_$idx.jar") + } + + val jars2 = (0 until 1).map { idx => + touch(tmpRoot2, s"myF_$idx.jar") + } + + // Using wildcards for both + val resultingLibJars1 = ExpandLibJarsGlobs( + Array("-libjars", s"${tmpRoot1.getAbsolutePath}/*.jar,${tmpRoot2.getAbsolutePath}/*.jar") + )(1).split(",") + assert(resultingLibJars1.sorted.toList == (jars1 ++ jars2).sorted.toList) + + // No wildcards for second dir + val resultingLibJars2 = ExpandLibJarsGlobs( + Array("-libjars", s"${tmpRoot1.getAbsolutePath}/*.jar,${tmpRoot2.getAbsolutePath}/myF_0.jar") + )(1).split(",") + assert(resultingLibJars2.sorted.toList == (jars1 ++ jars2).sorted.toList) + + } + + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala 
b/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala index fd1667f5b4..bd4a60eb6e 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/FieldImpsTest.scala @@ -12,62 +12,49 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.tuple.Fields -import org.specs._ +import org.scalatest.{Matchers, WordSpec} -class FieldImpsTest extends Specification with FieldConversions { - noDetailedDiffs() //Fixes issue for scala 2.9 - def setAndCheck[T <: Comparable[_]](v : T)(implicit conv : (T) => Fields) { - val vF = conv(v) - vF.equals(new Fields(v)) must beTrue - } - def setAndCheckS[T <: Comparable[_]](v : Seq[T])(implicit conv : (Seq[T]) => Fields) { - val vF = conv(v) - vF.equals(new Fields(v : _*)) must beTrue - } - def setAndCheckSym(v : Symbol) { - val vF : Fields = v - vF.equals(new Fields(v.toString.tail)) must beTrue - } - def setAndCheckSymS(v : Seq[Symbol]) { - val vF : Fields = v - vF.equals(new Fields(v.map(_.toString.tail) : _*)) must beTrue - } - def setAndCheckField(v : Field[_]) { - val vF : Fields = v +class FieldImpsTest extends WordSpec with Matchers with FieldConversions { + def setAndCheck[T <: Comparable[_]](v: T)(implicit conv: (T) => Fields): Unit = + conv(v) shouldBe (new Fields(v)) + def setAndCheckS[T <: Comparable[_]](v: Seq[T])(implicit conv: (Seq[T]) => Fields): Unit = + conv(v) shouldBe (new Fields(v: _*)) + def setAndCheckSym(v: Symbol): Unit = + (v: Fields) shouldBe (new Fields(v.toString.tail)) + def setAndCheckSymS(v: Seq[Symbol]): Unit = + (v: Fields) shouldBe (new Fields(v.map(_.toString.tail): _*)) + def setAndCheckField(v: Field[_]): Unit = { + val vF: Fields = v val fields = new Fields(v.id) 
fields.setComparators(v.ord) checkFieldsWithComparators(vF, fields) } - def setAndCheckFieldS(v : Seq[Field[_]]) { - val vF : Fields = v - val fields = new Fields(v.map(_.id) : _*) - fields.setComparators(v.map(_.ord) : _*) + def setAndCheckFieldS(v: Seq[Field[_]]): Unit = { + val vF: Fields = v + val fields = new Fields(v.map(_.id): _*) + fields.setComparators(v.map(_.ord): _*) checkFieldsWithComparators(vF, fields) } - def setAndCheckEnumValue(v : Enumeration#Value) { - val vF : Fields = v - vF.equals(new Fields(v.toString)) must beTrue - } - def setAndCheckEnumValueS(v : Seq[Enumeration#Value]) { - val vF : Fields = v - vF.equals(new Fields(v.map(_.toString) : _*)) must beTrue - } - def checkFieldsWithComparators(actual: Fields, expected: Fields) { + def setAndCheckEnumValue(v: Enumeration#Value): Unit = + (v: Fields) shouldBe (new Fields(v.toString)) + def setAndCheckEnumValueS(v: Seq[Enumeration#Value]): Unit = + (v: Fields) shouldBe (new Fields(v.map(_.toString): _*)) + def checkFieldsWithComparators(actual: Fields, expected: Fields): Unit = { // sometimes one or the other is actually a RichFields, so rather than test for // actual.equals(expected), we just check that all the field names and comparators line up - actual.size must_== expected.size - (asList(actual), asList(expected)).zipped.forall(_.equals(_)) must beTrue - actual.getComparators.toSeq.equals(expected.getComparators.toSeq) must beTrue + actual should have size (expected.size) + asList(actual) shouldBe asList(expected) + actual.getComparators.toSeq shouldBe (expected.getComparators.toSeq) } "Field" should { "contain manifest" in { val field = Field[Long]("foo") - field.mf mustEqual Some(implicitly[Manifest[Long]]) + field.mf should contain(implicitly[Manifest[Long]]) } } "RichFields" should { @@ -76,24 +63,27 @@ class FieldImpsTest extends Specification with FieldConversions { val f2 = Field[String]('bar) val rf = RichFields(f1, f2) val fields: Fields = rf - fields.size mustEqual 2 - f1.id 
mustEqual fields.get(0) - f2.id mustEqual fields.get(1) - f1.ord mustEqual fields.getComparators()(0) - f2.ord mustEqual fields.getComparators()(1) + fields should have size 2 + f1.id shouldBe (fields.get(0)) + f2.id shouldBe (fields.get(1)) + f1.ord shouldBe (fields.getComparators()(0)) + f2.ord shouldBe (fields.getComparators()(1)) } "convert from Fields" in { val fields = new Fields("foo", "bar") val comparator = implicitly[Ordering[String]] fields.setComparators(comparator, comparator) val fieldList: List[Field[_]] = fields.toFieldList - fieldList mustEqual List(new StringField[String]("foo")(comparator, None), new StringField[String]("bar")(comparator, None)) + fieldList shouldBe List( + new StringField[String]("foo")(comparator, None), + new StringField[String]("bar")(comparator, None) + ) } "throw an exception on when converting a virtual Fields instance" in { import Fields._ List(ALL, ARGS, FIRST, GROUP, LAST, NONE, REPLACE, RESULTS, SWAP, UNKNOWN, VALUES).foreach { fields => - fields.toFieldList must throwA[Exception] + an[Exception] should be thrownBy fields.toFieldList } } } @@ -101,21 +91,21 @@ class FieldImpsTest extends Specification with FieldConversions { "convert from ints" in { setAndCheck(int2Integer(0)) setAndCheck(int2Integer(5)) - setAndCheckS(List(1,23,3,4).map(int2Integer)) + setAndCheckS(List(1, 23, 3, 4).map(int2Integer)) setAndCheckS((0 until 10).map(int2Integer)) } "convert from strings" in { setAndCheck("hey") setAndCheck("world") - setAndCheckS(List("one","two","three")) - //Synonym for list - setAndCheckS(Seq("one","two","three")) + setAndCheckS(List("one", "two", "three")) + // Synonym for list + setAndCheckS(Seq("one", "two", "three")) } "convert from symbols" in { setAndCheckSym('hey) - //Shortest length to make sure the tail stuff is working: + // Shortest length to make sure the tail stuff is working: setAndCheckSym('h) - setAndCheckSymS(List('hey,'world,'symbols)) + setAndCheckSymS(List('hey, 'world, 'symbols)) } "convert from 
com.twitter.scalding.Field instances" in { // BigInteger is just a convenient non-primitive ordered type @@ -140,40 +130,39 @@ class FieldImpsTest extends Specification with FieldConversions { object Schema extends Enumeration { val one, two, three = Value } - var vf : Fields = Schema - vf must be_==(new Fields("one","two","three")) + (Schema: Fields) shouldBe (new Fields("one", "two", "three")) } "convert from general int tuples" in { - var vf : Fields = Tuple1(1) - vf must be_==(new Fields(int2Integer(1))) - vf = (1,2) - vf must be_==(new Fields(int2Integer(1),int2Integer(2))) - vf = (1,2,3) - vf must be_==(new Fields(int2Integer(1),int2Integer(2),int2Integer(3))) - vf = (1,2,3,4) - vf must be_==(new Fields(int2Integer(1),int2Integer(2),int2Integer(3),int2Integer(4))) + var vf: Fields = Tuple1(1) + vf shouldBe (new Fields(int2Integer(1))) + vf = (1, 2) + vf shouldBe (new Fields(int2Integer(1), int2Integer(2))) + vf = (1, 2, 3) + vf shouldBe (new Fields(int2Integer(1), int2Integer(2), int2Integer(3))) + vf = (1, 2, 3, 4) + vf shouldBe (new Fields(int2Integer(1), int2Integer(2), int2Integer(3), int2Integer(4))) } "convert from general string tuples" in { - var vf : Fields = Tuple1("hey") - vf must be_==(new Fields("hey")) - vf = ("hey","world") - vf must be_==(new Fields("hey","world")) - vf = ("foo","bar","baz") - vf must be_==(new Fields("foo","bar","baz")) + var vf: Fields = Tuple1("hey") + vf shouldBe (new Fields("hey")) + vf = ("hey", "world") + vf shouldBe (new Fields("hey", "world")) + vf = ("foo", "bar", "baz") + vf shouldBe (new Fields("foo", "bar", "baz")) } "convert from general symbol tuples" in { - var vf : Fields = Tuple1('hey) - vf must be_==(new Fields("hey")) - vf = ('hey,'world) - vf must be_==(new Fields("hey","world")) - vf = ('foo,'bar,'baz) - vf must be_==(new Fields("foo","bar","baz")) + var vf: Fields = Tuple1('hey) + vf shouldBe (new Fields("hey")) + vf = ('hey, 'world) + vf shouldBe (new Fields("hey", "world")) + vf = ('foo, 'bar, 'baz) + 
vf shouldBe (new Fields("foo", "bar", "baz")) } "convert from general com.twitter.scalding.Field tuples" in { val foo = Field[java.math.BigInteger]("foo") val bar = Field[java.math.BigDecimal]("bar") - var vf : Fields = Tuple1(foo) + var vf: Fields = Tuple1(foo) var fields = new Fields("foo") fields.setComparators(foo.ord) checkFieldsWithComparators(vf, fields) @@ -194,53 +183,53 @@ class FieldImpsTest extends Specification with FieldConversions { val one, two, three = Value } import Schema._ - var vf : Fields = Tuple1(one) - vf must be_==(new Fields("one")) + var vf: Fields = Tuple1(one) + vf shouldBe (new Fields("one")) vf = (one, two) - vf must be_==(new Fields("one","two")) + vf shouldBe (new Fields("one", "two")) vf = (one, two, three) - vf must be_==(new Fields("one","two","three")) + vf shouldBe (new Fields("one", "two", "three")) } "convert to a pair of Fields from a pair of values" in { - var f2 : (Fields,Fields) = "hey"->"you" - f2 must be_==((new Fields("hey"),new Fields("you"))) + var f2: (Fields, Fields) = "hey" -> "you" + f2 shouldBe (new Fields("hey"), new Fields("you")) f2 = 'hey -> 'you - f2 must be_==((new Fields("hey"),new Fields("you"))) + f2 shouldBe (new Fields("hey"), new Fields("you")) f2 = (0 until 10) -> 'you - f2 must be_==((new Fields((0 until 10).map(int2Integer) : _*),new Fields("you"))) + f2 shouldBe (new Fields((0 until 10).map(int2Integer): _*), new Fields("you")) - f2 = (('hey, 'world) -> 'other) - f2 must be_==((new Fields("hey","world"),new Fields("other"))) + f2 = ('hey, 'world) -> 'other + f2 shouldBe (new Fields("hey", "world"), new Fields("other")) - f2 = 0 -> 2 - f2 must be_==((new Fields(int2Integer(0)),new Fields(int2Integer(2)))) + f2 = 0 -> 2 + f2 shouldBe (new Fields(int2Integer(0)), new Fields(int2Integer(2))) - f2 = (0, (1,"you")) - f2 must be_==((new Fields(int2Integer(0)),new Fields(int2Integer(1),"you"))) + f2 = (0, (1, "you")) + f2 shouldBe (new Fields(int2Integer(0)), new Fields(int2Integer(1), "you")) val foo = 
Field[java.math.BigInteger]("foo") val bar = Field[java.math.BigDecimal]("bar") - f2 = ((foo,bar) -> 'bell) + f2 = (foo, bar) -> 'bell var fields = new Fields("foo", "bar") fields.setComparators(foo.ord, bar.ord) - f2 must be_==((fields, new Fields("bell"))) + f2 shouldBe (fields, new Fields("bell")) - f2 = (foo -> ('bar,'bell)) + f2 = foo -> ('bar, 'bell) fields = RichFields(foo) fields.setComparators(foo.ord) - f2 must be_==((fields, new Fields("bar", "bell"))) + f2 shouldBe (fields, new Fields("bar", "bell")) - f2 = Seq("one","two","three") -> Seq("1","2","3") - f2 must be_==((new Fields("one","two","three"),new Fields("1","2","3"))) - f2 = List("one","two","three") -> List("1","2","3") - f2 must be_==((new Fields("one","two","three"),new Fields("1","2","3"))) - f2 = List('one,'two,'three) -> List('n1,'n2,'n3) - f2 must be_==((new Fields("one","two","three"),new Fields("n1","n2","n3"))) - f2 = List(4,5,6) -> List(1,2,3) - f2 must be_==((new Fields(int2Integer(4),int2Integer(5),int2Integer(6)), - new Fields(int2Integer(1),int2Integer(2),int2Integer(3)))) + f2 = Seq("one", "two", "three") -> Seq("1", "2", "3") + f2 shouldBe (new Fields("one", "two", "three"), new Fields("1", "2", "3")) + f2 = List("one", "two", "three") -> List("1", "2", "3") + f2 shouldBe (new Fields("one", "two", "three"), new Fields("1", "2", "3")) + f2 = List('one, 'two, 'three) -> List('n1, 'n2, 'n3) + f2 shouldBe (new Fields("one", "two", "three"), new Fields("n1", "n2", "n3")) + f2 = List(4, 5, 6) -> List(1, 2, 3) + f2 shouldBe (new Fields(int2Integer(4), int2Integer(5), int2Integer(6)), + new Fields(int2Integer(1), int2Integer(2), int2Integer(3))) object Schema extends Enumeration { val one, two, three = Value @@ -248,40 +237,40 @@ class FieldImpsTest extends Specification with FieldConversions { import Schema._ f2 = one -> two - f2 must be_==((new Fields("one"),new Fields("two"))) + f2 shouldBe (new Fields("one"), new Fields("two")) f2 = (one, two) -> three - f2 must be_==((new 
Fields("one","two"),new Fields("three"))) + f2 shouldBe (new Fields("one", "two"), new Fields("three")) f2 = one -> (two, three) - f2 must be_==((new Fields("one"),new Fields("two", "three"))) + f2 shouldBe (new Fields("one"), new Fields("two", "three")) } "correctly see if there are ints" in { - hasInts(0) must beTrue - hasInts((0,1)) must beTrue - hasInts('hey) must beFalse - hasInts((0,'hey)) must beTrue - hasInts(('hey,9)) must beTrue - hasInts(('a,'b)) must beFalse - def i(xi : Int) = new java.lang.Integer(xi) - asSet(0) must be_==(Set(i(0))) - asSet((0,1,2)) must be_==(Set(i(0),i(1),i(2))) - asSet((0,1,'hey)) must be_==(Set(i(0),i(1),"hey")) + hasInts(0) shouldBe true + hasInts((0, 1)) shouldBe true + hasInts('hey) shouldBe false + hasInts((0, 'hey)) shouldBe true + hasInts(('hey, 9)) shouldBe true + hasInts(('a, 'b)) shouldBe false + def i(xi: Int) = new java.lang.Integer(xi) + asSet(0) shouldBe Set(i(0)) + asSet((0, 1, 2)) shouldBe Set(i(0), i(1), i(2)) + asSet((0, 1, 'hey)) shouldBe Set(i(0), i(1), "hey") } "correctly determine default modes" in { - //Default case: - defaultMode(0,'hey) must be_==(Fields.ALL) - defaultMode((0,'t),'x) must be_==(Fields.ALL) - defaultMode(('hey,'x),'y) must be_==(Fields.ALL) - //Equal: - defaultMode('hey,'hey) must be_==(Fields.REPLACE) - defaultMode(('hey,'x),('hey,'x)) must be_==(Fields.REPLACE) - defaultMode(0,0) must be_==(Fields.REPLACE) - //Subset/superset: - defaultMode(('hey,'x),'x) must be_==(Fields.SWAP) - defaultMode('x, ('hey,'x)) must be_==(Fields.SWAP) - defaultMode(0, ('hey,0)) must be_==(Fields.SWAP) - defaultMode(('hey,0),0) must be_==(Fields.SWAP) + // Default case: + defaultMode(0, 'hey) shouldBe Fields.ALL + defaultMode((0, 't), 'x) shouldBe Fields.ALL + defaultMode(('hey, 'x), 'y) shouldBe Fields.ALL + // Equal: + defaultMode('hey, 'hey) shouldBe Fields.REPLACE + defaultMode(('hey, 'x), ('hey, 'x)) shouldBe Fields.REPLACE + defaultMode(0, 0) shouldBe Fields.REPLACE + // Subset/superset: + 
defaultMode(('hey, 'x), 'x) shouldBe Fields.SWAP + defaultMode('x, ('hey, 'x)) shouldBe Fields.SWAP + defaultMode(0, ('hey, 0)) shouldBe Fields.SWAP + defaultMode(('hey, 0), 0) shouldBe Fields.SWAP } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala index 3b56a5a0ab..dcfdafa7df 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/FileSourceTest.scala @@ -12,17 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import cascading.scheme.NullScheme +import cascading.tuple.Fields import org.apache.hadoop.conf.Configuration +import org.scalatest.{Matchers, WordSpec} class MultiTsvInputJob(args: Args) extends Job(args) { try { MultipleTsvFiles(List("input0", "input1"), ('query, 'queryStats)).read.write(Tsv("output0")) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } @@ -30,145 +32,308 @@ class MultiTsvInputJob(args: Args) extends Job(args) { class SequenceFileInputJob(args: Args) extends Job(args) { try { SequenceFile("input0").read.write(SequenceFile("output0")) - WritableSequenceFile("input1", ('query, 'queryStats)).read.write(WritableSequenceFile("output1", ('query, 'queryStats))) + WritableSequenceFile("input1", ('query, 'queryStats)).read + .write(WritableSequenceFile("output1", ('query, 'queryStats))) } catch { case e: Exception => e.printStackTrace() } } -class FileSourceTest extends Specification { - noDetailedDiffs() +class MultipleTextLineFilesJob(args: Args) extends Job(args) { + try { + MultipleTextLineFiles(args.list("input"): _*).pipe.write(Tsv("output0")) + } 
catch { + case e: Exception => e.printStackTrace() + } + +} + +class FileSourceTest extends WordSpec with Matchers { import Dsl._ "A MultipleTsvFile Source" should { - JobTest(new MultiTsvInputJob(_)). - source(MultipleTsvFiles(List("input0", "input1"), ('query, 'queryStats)), - List(("foobar", 1), ("helloworld", 2))). - sink[(String, Int)](Tsv("output0")) { - outBuf => - "take multiple Tsv files as input sources" in { - outBuf.length must be_==(2) - outBuf.toList must be_==(List(("foobar", 1), ("helloworld", 2))) - } + JobTest(new MultiTsvInputJob(_)) + .source( + MultipleTsvFiles(List("input0", "input1"), ('query, 'queryStats)), + List(("foobar", 1), ("helloworld", 2)) + ) + .sink[(String, Int)](Tsv("output0")) { outBuf => + "take multiple Tsv files as input sources" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List(("foobar", 1), ("helloworld", 2)) + } } .run - .finish + .finish() } "A WritableSequenceFile Source" should { - JobTest(new SequenceFileInputJob(_)). - source(SequenceFile("input0"), - List(("foobar0", 1), ("helloworld0", 2))). - source(WritableSequenceFile("input1", ('query, 'queryStats)), - List(("foobar1", 1), ("helloworld1", 2))). 
- sink[(String, Int)](SequenceFile("output0")) { - outBuf => - "sequence file input" in { - outBuf.length must be_==(2) - outBuf.toList must be_==(List(("foobar0", 1), ("helloworld0", 2))) - } + JobTest(new SequenceFileInputJob(_)) + .source(SequenceFile("input0"), List(("foobar0", 1), ("helloworld0", 2))) + .source(WritableSequenceFile("input1", ('query, 'queryStats)), List(("foobar1", 1), ("helloworld1", 2))) + .sink[(String, Int)](SequenceFile("output0")) { outBuf => + "sequence file input" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List(("foobar0", 1), ("helloworld0", 2)) + } + } + .sink[(String, Int)](WritableSequenceFile("output1", ('query, 'queryStats))) { outBuf => + "writable sequence file input" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List(("foobar1", 1), ("helloworld1", 2)) + } } - .sink[(String, Int)](WritableSequenceFile("output1", ('query, 'queryStats))) { - outBuf => - "writable sequence file input" in { - outBuf.length must be_==(2) - outBuf.toList must be_==(List(("foobar1", 1), ("helloworld1", 2))) - } + .run + .finish() + } + + "A MultipleTextLineFiles Source" should { + JobTest(new MultipleTextLineFilesJob(_)) + .arg("input", List("input0", "input1")) + .source(MultipleTextLineFiles("input0", "input1"), List("foobar", "helloworld")) + .sink[String](Tsv("output0")) { outBuf => + "take multiple text files as input sources" in { + (outBuf should have).length(2) + outBuf.toList shouldBe List("foobar", "helloworld") + } } .run - .finish + .finish() + } + + "TextLine.toIterator" should { + "correctly read strings" in { + TextLine("../tutorial/data/hello.txt").toIterator(Config.default, Local(true)).toList shouldBe List( + "Hello world", + "Goodbye world" + ) + } } /** - * The layout of the test data looks like this: + * The layout of the test data looks like this: /test_data/2013/02 does not exist * - * /test_data/2013/03 (dir with a single data file in it) - * /test_data/2013/03/2013-03.txt - - * 
/test_data/2013/04 (dir with a single data file and a _SUCCESS file) - * /test_data/2013/04/2013-04.txt + * /test_data/2013/03 (dir with a single data file in it) /test_data/2013/03/2013-03.txt + * + * /test_data/2013/04 (dir with a single data file and a _SUCCESS file) /test_data/2013/04/2013-04.txt * /test_data/2013/04/_SUCCESS - - * /test_data/2013/05 (empty dir) - - * /test_data/2013/06 (dir with only a _SUCCESS file) - * /test_data/2013/06/_SUCCESS + * + * /test_data/2013/05 (logically empty dir: git does not support empty dirs) + * + * /test_data/2013/06 (dir with only a _SUCCESS file) /test_data/2013/06/_SUCCESS + * + * /test_data/2013/07 /test_data/2013/07/2013-07.txt /test_data/2013/07/_SUCCESS */ "default pathIsGood" should { import TestFileSource.pathIsGood + "reject a non-existing directory" in { + pathIsGood("test_data/2013/02/") shouldBe false + pathIsGood("test_data/2013/02/*") shouldBe false + } "accept a directory with data in it" in { - pathIsGood("test_data/2013/03/") must be_==(true) - pathIsGood("test_data/2013/03/*") must be_==(true) + pathIsGood("test_data/2013/03/") shouldBe true + pathIsGood("test_data/2013/03/*") shouldBe true } "accept a directory with data and _SUCCESS in it" in { - pathIsGood("test_data/2013/04/") must be_==(true) - pathIsGood("test_data/2013/04/*") must be_==(true) + pathIsGood("test_data/2013/04/") shouldBe true + pathIsGood("test_data/2013/04/*") shouldBe true + } + + "accept a single directory without glob" in { + pathIsGood("test_data/2013/05/") shouldBe true } - "reject an empty directory" in { - pathIsGood("test_data/2013/05/") must be_==(false) - pathIsGood("test_data/2013/05/*") must be_==(false) + "reject a single directory glob with ignored files" in { + pathIsGood("test_data/2013/05/*") shouldBe false } "reject a directory with only _SUCCESS when specified as a glob" in { - pathIsGood("test_data/2013/06/*") must be_==(false) + pathIsGood("test_data/2013/06/*") shouldBe false } "accept a directory with only 
_SUCCESS when specified without a glob" in { - pathIsGood("test_data/2013/06/") must be_==(true) + pathIsGood("test_data/2013/06/") shouldBe true + } + } + + "FileSource.globHasSuccessFile" should { + import TestFileSource.globHasSuccessFile + + "accept a directory glob with only _SUCCESS" in { + globHasSuccessFile("test_data/2013/06/*") shouldBe true + } + + "accept a directory glob with _SUCCESS and other hidden files" in { + globHasSuccessFile("test_data/2013/05/*") shouldBe true + } + + "accept a directory glob with _SUCCESS and other non-hidden files" in { + globHasSuccessFile("test_data/2013/04/*") shouldBe true + } + + "reject a path without glob" in { + globHasSuccessFile("test_data/2013/04/") shouldBe false + } + + "reject a multi-dir glob without _SUCCESS" in { + globHasSuccessFile("test_data/2013/{02,03}/*") shouldBe false } } "success file source pathIsGood" should { import TestSuccessFileSource.pathIsGood + "reject a non-existing directory" in { + pathIsGood("test_data/2013/02/") shouldBe false + pathIsGood("test_data/2013/02/*") shouldBe false + } + "reject a directory with data in it but no _SUCCESS file" in { - pathIsGood("test_data/2013/03/") must be_==(false) - pathIsGood("test_data/2013/03/*") must be_==(false) + pathIsGood("test_data/2013/03/") shouldBe false + pathIsGood("test_data/2013/03/*") shouldBe false } - "accept a directory with data and _SUCCESS in it when specified as a glob" in { - pathIsGood("test_data/2013/04/*") must be_==(true) + "reject a single directory without glob" in { + pathIsGood("test_data/2013/05/") shouldBe false } - "reject a directory with data and _SUCCESS in it when specified without a glob" in { - pathIsGood("test_data/2013/04/") must be_==(false) + "reject a single directory glob with only _SUCCESS and ignored files" in { + pathIsGood("test_data/2013/05/*") shouldBe false + } + + "accept a directory with data and _SUCCESS in it when specified as a glob" in { + pathIsGood("test_data/2013/04/*") shouldBe true } - 
"reject an empty directory" in { - pathIsGood("test_data/2013/05/") must be_==(false) - pathIsGood("test_data/2013/05/*") must be_==(false) + "reject a directory with data and _SUCCESS in it when specified without a glob" in { + pathIsGood("test_data/2013/04/") shouldBe false } "reject a directory with only _SUCCESS when specified as a glob" in { - pathIsGood("test_data/2013/06/*") must be_==(false) + pathIsGood("test_data/2013/06/*") shouldBe false } "reject a directory with only _SUCCESS when specified without a glob" in { - pathIsGood("test_data/2013/06/") must be_==(false) + pathIsGood("test_data/2013/06/") shouldBe false + } + + "reject a multi-dir glob with only one _SUCCESS" in { + pathIsGood("test_data/2013/{03,04}/*") shouldBe false + } + + "accept a multi-dir glob if every dir has _SUCCESS" in { + pathIsGood("test_data/2013/{04,08}/*") shouldBe true + } + + "accept a multi-dir glob if all dirs with non-hidden files have _SUCCESS while dirs with " + + "hidden ones don't" in { + pathIsGood("test_data/2013/{04,05}/*") shouldBe true + } + + // NOTE: this is an undesirable limitation of SuccessFileSource, and is encoded here + // as a demonstration. This isn't a great behavior that we'd want to keep. 
+ "accept a multi-dir glob if all dirs with non-hidden files have _SUCCESS while other dirs " + + "are empty or don't exist" in { + pathIsGood("test_data/2013/{02,04,05}/*") shouldBe true + } + } + + "FixedPathSource.hdfsWritePath" should { + "crib if path == *" in { + intercept[AssertionError](TestFixedPathSource("*").hdfsWritePath) + } + + "crib if path == /*" in { + intercept[AssertionError](TestFixedPathSource("/*").hdfsWritePath) + } + + "remove /* from a path ending in /*" in { + TestFixedPathSource("test_data/2013/06/*").hdfsWritePath shouldBe "test_data/2013/06" } + "leave path as-is when it ends in a directory name" in { + TestFixedPathSource("test_data/2013/06").hdfsWritePath shouldBe "test_data/2013/06" + } + + "leave path as-is when it ends in a directory name/" in { + TestFixedPathSource("test_data/2013/06/").hdfsWritePath shouldBe "test_data/2013/06/" + } + + "leave path as-is when it ends in * without a preceding /" in { + TestFixedPathSource("test_data/2013/06*").hdfsWritePath shouldBe "test_data/2013/06*" + } } + + "invalid source input" should { + "Throw in validateTaps in strict mode" in { + val e = intercept[InvalidSourceException] { + TestInvalidFileSource.validateTaps(Hdfs(strict = true, new Configuration())) + } + assert(e.getMessage.endsWith("Data is missing from one or more paths in: List(invalid_hdfs_path)")) + } + + "Throw in validateTaps in non-strict mode" in { + val e = intercept[InvalidSourceException] { + TestInvalidFileSource.validateTaps(Hdfs(strict = false, new Configuration())) + } + assert(e.getMessage.endsWith("No good paths in: List(invalid_hdfs_path)")) + } + + "Throw in toIterator because no data is present in strict mode" in { + val e = intercept[InvalidSourceException] { + TestInvalidFileSource.toIterator(Config.default, Hdfs(strict = true, new Configuration())) + } + assert(e.getMessage.endsWith("Data is missing from one or more paths in: List(invalid_hdfs_path)")) + } + + "Throw in toIterator because no data is present 
in non-strict mode" in { + val e = intercept[InvalidSourceException] { + TestInvalidFileSource.toIterator(Config.default, Hdfs(strict = false, new Configuration())) + } + assert(e.getMessage.endsWith("No good paths in: List(invalid_hdfs_path)")) + } + } +} + +object TestPath { + def getCurrentDirectory = new java.io.File(".").getCanonicalPath + def prefix = getCurrentDirectory.split("/").last match { + case "scalding-core" => getCurrentDirectory + case _ => getCurrentDirectory + "/scalding-core" + } + val testfsPathRoot = prefix + "/src/test/resources/com/twitter/scalding/test_filesystem/" } object TestFileSource extends FileSource { + import TestPath.testfsPathRoot + override def hdfsPaths: Iterable[String] = Iterable.empty - override def localPath: String = "" + override def localPaths: Iterable[String] = Iterable.empty - val testfsPathRoot = "scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/" val conf = new Configuration() def pathIsGood(p: String) = super.pathIsGood(testfsPathRoot + p, conf) + def globHasSuccessFile(p: String) = FileSource.globHasSuccessFile(testfsPathRoot + p, conf) } object TestSuccessFileSource extends FileSource with SuccessFileSource { + import TestPath.testfsPathRoot override def hdfsPaths: Iterable[String] = Iterable.empty - override def localPath: String = "" + override def localPaths: Iterable[String] = Iterable.empty - val testfsPathRoot = "scalding-core/src/test/resources/com/twitter/scalding/test_filesystem/" val conf = new Configuration() def pathIsGood(p: String) = super.pathIsGood(testfsPathRoot + p, conf) -} \ No newline at end of file +} + +object TestInvalidFileSource extends FileSource with Mappable[String] { + override def hdfsPaths: Iterable[String] = Iterable("invalid_hdfs_path") + override def localPaths: Iterable[String] = Iterable("invalid_local_path") + override def hdfsScheme = new NullScheme(Fields.ALL, Fields.NONE) + override def converter[U >: String] = + TupleConverter.asSuperConverter[String, 
U](implicitly[TupleConverter[String]]) +} + +case class TestFixedPathSource(path: String*) extends FixedPathSource(path: _*) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala new file mode 100644 index 0000000000..422db9eb0b --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/FlowStateMapTest.scala @@ -0,0 +1,20 @@ +package com.twitter.scalding + +import org.scalatest.FunSuite + +import cascading.flow.FlowDef +import com.twitter.scalding.source.{NullSink, TypedText} +import com.twitter.scalding.typed.cascading_backend.{CascadingBackend, CascadingExtensions} + +import CascadingExtensions._ + +class FlowStateMapTest extends FunSuite { + test("make sure sure sourcemap isn't empty after planning") { + implicit val fd = new FlowDef + implicit val m = Local(false) + val t = TypedPipe.from(TypedText.tsv[String]("")).write(NullSink) + CascadingBackend.planTypedWrites(fd, m) + val state = FlowStateMap(fd) + assert(state.sourceMap.nonEmpty) + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala index 843025f02a..c1895c6322 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/IntegralCompTest.scala @@ -12,56 +12,55 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.specs._ -class IntegralCompTest extends Specification { - def box[T](t : T) = t.asInstanceOf[AnyRef] +import org.scalatest.{Matchers, WordSpec} + +class IntegralCompTest extends WordSpec with Matchers { + def box[T](t: T) = t.asInstanceOf[AnyRef] "IntegralComparator" should { val intComp = new IntegralComparator "recognize integral types" in { - intComp.isIntegral(box(1)) must beTrue - intComp.isIntegral(box(1L)) must beTrue - intComp.isIntegral(box(1.asInstanceOf[Short])) must beTrue - //Boxed - intComp.isIntegral(new java.lang.Long(2)) must beTrue - intComp.isIntegral(new java.lang.Integer(2)) must beTrue - intComp.isIntegral(new java.lang.Short(2.asInstanceOf[Short])) must beTrue - intComp.isIntegral(new java.lang.Long(2)) must beTrue - intComp.isIntegral(new java.lang.Long(2)) must beTrue - //These are not integrals - intComp.isIntegral(box(0.0)) must beFalse - intComp.isIntegral(box("hey")) must beFalse - intComp.isIntegral(box(Nil)) must beFalse - intComp.isIntegral(box(None)) must beFalse + intComp.isIntegral(box(1)) shouldBe true + intComp.isIntegral(box(1L)) shouldBe true + intComp.isIntegral(box(1: Short)) shouldBe true + // Boxed + intComp.isIntegral(new java.lang.Long(2)) shouldBe true + intComp.isIntegral(new java.lang.Integer(2)) shouldBe true + intComp.isIntegral(new java.lang.Short(2: Short)) shouldBe true + // These are not integrals + intComp.isIntegral(box(0.0)) shouldBe false + intComp.isIntegral(box("hey")) shouldBe false + intComp.isIntegral(box(Nil)) shouldBe false + intComp.isIntegral(box(None)) shouldBe false } "handle null inputs" in { - intComp.hashCode(null) must be_==(0) - List(box(1),box("hey"),box(2L),box(0.0)).foreach { x => - intComp.compare(null, x) must be_<(0) - intComp.compare(x,null) must be_>(0) - intComp.compare(x, x) must be_==(0) - } - intComp.compare(null,null) must be_==(0) + intComp.hashCode(null) shouldBe 0 + List(box(1), box("hey"), box(2L), box(0.0)).foreach { x => + 
intComp.compare(null, x) should be < 0 + intComp.compare(x, null) should be > 0 + intComp.compare(x, x) shouldBe 0 + } + intComp.compare(null, null) shouldBe 0 } "have consistent hashcode" in { - List( (box(1),box(1L)), (box(2),box(2L)), (box(3),box(3L)) ) + List((box(1), box(1L)), (box(2), box(2L)), (box(3), box(3L))) .foreach { pair => - intComp.compare(pair._1, pair._2) must be_==(0) - intComp.hashCode(pair._1) must be_==(intComp.hashCode(pair._2)) + intComp.compare(pair._1, pair._2) shouldBe 0 + intComp.hashCode(pair._1) shouldBe (intComp.hashCode(pair._2)) } - List( (box(1),box(2L)), (box(2),box(3L)), (box(3),box(4L)) ) + List((box(1), box(2L)), (box(2), box(3L)), (box(3), box(4L))) .foreach { pair => - intComp.compare(pair._1, pair._2) must be_<(0) - intComp.compare(pair._2, pair._1) must be_>(0) + intComp.compare(pair._1, pair._2) should be < 0 + intComp.compare(pair._2, pair._1) should be > 0 } } "Compare strings properly" in { - intComp.compare("hey","you") must be_==("hey".compareTo("you")) - intComp.compare("hey","hey") must be_==("hey".compareTo("hey")) - intComp.compare("you","hey") must be_==("you".compareTo("hey")) + intComp.compare("hey", "you") shouldBe ("hey".compareTo("you")) + intComp.compare("hey", "hey") shouldBe ("hey".compareTo("hey")) + intComp.compare("you", "hey") shouldBe ("you".compareTo("hey")) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala new file mode 100644 index 0000000000..2719ac817e --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/IterableExecutionSerializationTest.scala @@ -0,0 +1,33 @@ +package com.twitter.scalding + +import com.twitter.bijection.JavaSerializationInjection +import com.twitter.chill.KryoPool +import com.twitter.chill.config.ScalaAnyRefMapConfig +import com.twitter.scalding.serialization.{Externalizer, KryoHadoop} +import 
com.twitter.scalding.source.TypedText +import org.scalatest.FunSuite + +class ToIterableSerializationTest extends FunSuite { + + class Foo { + val field = 42 + } + + val myFoo = new Foo + val testIterableExecution = + Execution.toIterable(TypedPipe.from(TypedText.tsv[Int]("foo")).map(_ * myFoo.field)) + + test("toIterableExecution should roundtrip") { + + val jInjection = JavaSerializationInjection[Externalizer[Execution[Iterable[Int]]]] + val externalizer = Externalizer(testIterableExecution) + + assert(jInjection.invert(jInjection(externalizer)).isSuccess) + } + test("testing kryo") { + val kryo = new KryoHadoop(ScalaAnyRefMapConfig(Map("scalding.kryo.setreferences" -> "true"))) + val kryoPool = KryoPool.withByteArrayOutputStream(1, kryo) + assert(scala.util.Try(kryoPool.deepCopy(testIterableExecution)).isSuccess) + } + +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala index 70f900724b..ab93c91258 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/JobTestTest.scala @@ -1,17 +1,19 @@ package com.twitter.scalding -import org.specs.Specification +import com.twitter.scalding.source.TypedText +import org.scalatest.{Matchers, WordSpec} /** * Simple identity job that reads from a Tsv and writes to a Tsv with no change. * - * @param args to the job. "input" specifies the input file, and "output" the output file. + * @param args + * to the job. "input" specifies the input file, and "output" the output file. 
*/ class SimpleTestJob(args: Args) extends Job(args) { Tsv(args("input")).read.write(Tsv(args("output"))) } -class JobTestTest extends Specification { +class JobTestTest extends WordSpec with Matchers { "A JobTest" should { "error helpfully when a source in the job doesn't have a corresponding .source call" in { val testInput: List[(String, Int)] = List(("a", 1), ("b", 2)) @@ -28,14 +30,40 @@ class JobTestTest extends Specification { .arg("input", "input") .arg("output", "output") .source(incorrectSource, testInput) - .sink[(String, Int)](Tsv("output")){ outBuf => { assert(outBuf == testInput) }} + .sink[(String, Int)](Tsv("output"))(outBuf => outBuf shouldBe testInput) .run - runJobTest() must throwA[IllegalArgumentException].like { - case iae: IllegalArgumentException => - iae.getMessage mustVerify( - _.contains( TestTapFactory.sourceNotFoundError.format(requiredSource))) + (the[IllegalArgumentException] thrownBy { + runJobTest() + } should have).message( + s"Failed to create tap for: $requiredSource, with error: requirement failed: " + TestTapFactory.sourceNotFoundError + .format(requiredSource) + ) + } + "use local mode by default" in { + JobTest(new SimpleTestJob(_)).getTestMode(true, None) match { + case m: HadoopTest => m.jobConf.get("mapreduce.framework.name") shouldBe "local" + } + } + + "work with a lot of sinks at the same time" in { + val elements = List(1, 2, 3, 4, 5) + val sinks: Seq[TypedSink[Int] with Source] = (1 to 100).map { num => + TypedText.tsv[Int]("output" + num) + } + + val writes = sinks.map { sink => + TypedPipe.from(elements).writeExecution(sink) + } + val writesExecution: Execution[Unit] = Execution.sequence(writes).unit + + var jobTest = JobTest(new ExecutionJob[Unit](_) { + override def execution: Execution[Unit] = writesExecution + }) + sinks.foreach { sink => + jobTest = jobTest.sink[Int](sink)(_.toList == elements) } + jobTest.run.finish() } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala 
b/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala index 9468adf191..46070a7a04 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/KryoTest.scala @@ -12,43 +12,41 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import com.twitter.scalding.serialization._ +import org.scalatest.{Matchers, WordSpec} -import org.specs._ - -import java.io.{ByteArrayOutputStream=>BOS} -import java.io.{ByteArrayInputStream=>BIS} +import java.io.{ByteArrayOutputStream => BOS} +import java.io.{ByteArrayInputStream => BIS} import scala.collection.immutable.ListMap import scala.collection.immutable.HashMap -import com.twitter.algebird.{AveragedValue, DecayedValue, - HyperLogLog, HyperLogLogMonoid, Moments, Monoid} +import com.twitter.algebird.{AveragedValue, DecayedValue, HyperLogLogMonoid, Moments, Monoid} -import com.twitter.chill.config.ConfiguredInstantiator +import com.twitter.chill.config.{ConfiguredInstantiator, ScalaMapConfig} import com.twitter.chill.hadoop.HadoopConfig import com.twitter.chill.hadoop.KryoSerialization +import com.esotericsoftware.kryo.io.{Input, Output} + import org.apache.hadoop.conf.Configuration + /* -* This is just a test case for Kryo to deal with. It should -* be outside KryoTest, otherwise the enclosing class, KryoTest -* will also need to be serialized -*/ -case class TestCaseClassForSerialization(x : String, y : Int) + * This is just a test case for Kryo to deal with. 
It should + * be outside KryoTest, otherwise the enclosing class, KryoTest + * will also need to be serialized + */ +case class TestCaseClassForSerialization(x: String, y: Int) -case class TestValMap(val map : Map[String,Double]) -case class TestValHashMap(val map : HashMap[String,Double]) +case class TestValMap(val map: Map[String, Double]) +case class TestValHashMap(val map: HashMap[String, Double]) -class KryoTest extends Specification { +class KryoTest extends WordSpec with Matchers { implicit def dateParser: DateParser = DateParser.default - noDetailedDiffs() //Fixes issue for scala 2.9 - def getSerialization = { val conf = new Configuration val chillConf = new HadoopConfig(conf) @@ -56,7 +54,7 @@ class KryoTest extends Specification { new KryoSerialization(conf) } - def serObj[T <: AnyRef](in : T) = { + def serObj[T <: AnyRef](in: T) = { val khs = getSerialization val ks = khs.getSerializer(in.getClass.asInstanceOf[Class[AnyRef]]) val out = new BOS @@ -66,7 +64,7 @@ class KryoTest extends Specification { out.toByteArray } - def deserObj[T <: AnyRef](cls : Class[_], input : Array[Byte]) : T = { + def deserObj[T <: AnyRef](cls: Class[_], input: Array[Byte]): T = { val khs = getSerialization val ks = khs.getDeserializer(cls.asInstanceOf[Class[AnyRef]]) val in = new BIS(input) @@ -76,88 +74,103 @@ class KryoTest extends Specification { ks.close res.asInstanceOf[T] } - def singleRT[T <: AnyRef](in : T) : T = { + def singleRT[T <: AnyRef](in: T): T = deserObj[T](in.getClass, serObj(in)) - } - - //These are analogous to how Hadoop will serialize - def serialize(ins : List[AnyRef]) = { - ins.map { v => (v.getClass, serObj(v)) } - } - def deserialize(input : List[(Class[_], Array[Byte])]) = { - input.map { tup => deserObj[AnyRef](tup._1, tup._2) } - } - def serializationRT(ins : List[AnyRef]) = deserialize(serialize(ins)) + // These are analogous to how Hadoop will serialize + def serialize(ins: List[AnyRef]) = + ins.map(v => (v.getClass, serObj(v))) + def 
deserialize(input: List[(Class[_], Array[Byte])]) = + input.map(tup => deserObj[AnyRef](tup._1, tup._2)) + def serializationRT(ins: List[AnyRef]) = deserialize(serialize(ins)) "KryoSerializers and KryoDeserializers" should { + "round trip for KryoHadoop" in { + val kryoHadoop = new serialization.KryoHadoop(new HadoopConfig(new Configuration)) + val bootstrapKryo = new serialization.KryoHadoop(new ScalaMapConfig(Map.empty)).newKryo + + val buffer = new Array[Byte](1024 * 1024) + val output = new Output(buffer) + bootstrapKryo.writeClassAndObject(output, kryoHadoop) + + val input = new Input(buffer) + val deserialized = bootstrapKryo.readClassAndObject(input).asInstanceOf[serialization.KryoHadoop] + deserialized.newKryo + } + "round trip any non-array object" in { - import HyperLogLog._ - implicit val hllmon = new HyperLogLogMonoid(4) - val test = List(1,2,"hey",(1,2),Args("--this is --a --b --test 34"), - ("hey","you"), - ("slightly", 1L, "longer", 42, "tuple"), - Map(1->2,4->5), - 0 to 100, - (0 to 42).toList, Seq(1,100,1000), - Map("good" -> 0.5, "bad" -> -1.0), - Set(1,2,3,4,10), - ListMap("good" -> 0.5, "bad" -> -1.0), - HashMap("good" -> 0.5, "bad" -> -1.0), - TestCaseClassForSerialization("case classes are: ", 10), - TestValMap(Map("you" -> 1.0, "every" -> 2.0, "body" -> 3.0, "a" -> 1.0, - "b" -> 2.0, "c" -> 3.0, "d" -> 4.0)), - TestValHashMap(HashMap("you" -> 1.0)), - Vector(1,2,3,4,5), - TestValMap(null), - Some("junk"), - DecayedValue(1.0, 2.0), - Moments(100.0), Monoid.plus(Moments(100), Moments(2)), - AveragedValue(100, 32.0), - // Serialize an instance of the HLL monoid - hllmon.apply(42), - Monoid.sum(List(1,2,3,4).map { hllmon(_) }), - 'hai) + implicit val hllmon: HyperLogLogMonoid = new HyperLogLogMonoid(4) + val test = List( + 1, + 2, + "hey", + (1, 2), + Args("--this is --a --b --test 34"), + ("hey", "you"), + ("slightly", 1L, "longer", 42, "tuple"), + Map(1 -> 2, 4 -> 5), + 0 to 100, + (0 to 42).toList, + Seq(1, 100, 1000), + Map("good" -> 0.5, 
"bad" -> -1.0), + Set(1, 2, 3, 4, 10), + ListMap("good" -> 0.5, "bad" -> -1.0), + HashMap("good" -> 0.5, "bad" -> -1.0), + TestCaseClassForSerialization("case classes are: ", 10), + TestValMap( + Map("you" -> 1.0, "every" -> 2.0, "body" -> 3.0, "a" -> 1.0, "b" -> 2.0, "c" -> 3.0, "d" -> 4.0) + ), + TestValHashMap(HashMap("you" -> 1.0)), + Vector(1, 2, 3, 4, 5), + TestValMap(null), + Some("junk"), + DecayedValue(1.0, 2.0), + Moments(100.0), + Monoid.plus(Moments(100), Moments(2)), + AveragedValue(100, 32.0), + // Serialize an instance of the HLL monoid + hllmon.toHLL(42), + Monoid.sum(List(1, 2, 3, 4).map(hllmon.toHLL(_))), + 'hai + ) .asInstanceOf[List[AnyRef]] - serializationRT(test) must be_==(test) + serializationRT(test) shouldBe test // HyperLogLogMonoid doesn't have a good equals. :( - singleRT(new HyperLogLogMonoid(5)).bits must be_==(5) + singleRT(new HyperLogLogMonoid(5)).bits shouldBe 5 } "handle arrays" in { - def arrayRT[T](arr : Array[T]) { - serializationRT(List(arr))(0) - .asInstanceOf[Array[T]].toList must be_==(arr.toList) - } + def arrayRT[T](arr: Array[T]): Unit = + serializationRT(List(arr)).head + .asInstanceOf[Array[T]] + .toList shouldBe (arr.toList) arrayRT(Array(0)) arrayRT(Array(0.1)) arrayRT(Array("hey")) - arrayRT(Array((0,1))) + arrayRT(Array((0, 1))) arrayRT(Array(None, Nil, None, Nil)) } "handle scala singletons" in { val test = List(Nil, None) - //Serialize each: - serializationRT(test) must be_==(test) - //Together in a list: - singleRT(test) must be_==(test) + // Serialize each: + serializationRT(test) shouldBe test + // Together in a list: + singleRT(test) shouldBe test } "handle Date, RichDate and DateRange" in { import DateOps._ - implicit val tz = PACIFIC - val myDate : RichDate = "1999-12-30T14" - val simpleDate : java.util.Date = myDate.value + implicit val tz: java.util.TimeZone = PACIFIC + val myDate: RichDate = "1999-12-30T14" + val simpleDate: java.util.Date = myDate.value val myDateRange = DateRange("2012-01-02", 
"2012-06-09") - singleRT(myDate) must be_==(myDate) - singleRT(simpleDate) must be_==(simpleDate) - singleRT(myDateRange) must be_==(myDateRange) + singleRT(myDate) shouldBe myDate + singleRT(simpleDate) shouldBe simpleDate + singleRT(myDateRange) shouldBe myDateRange } "Serialize a giant list" in { val bigList = (1 to 100000).toList val list2 = deserObj[List[Int]](bigList.getClass, serObj(bigList)) - //Specs, it turns out, also doesn't deal with giant lists well: - list2.zip(bigList).foreach { tup => - tup._1 must be_==(tup._2) - } + // Specs, it turns out, also doesn't deal with giant lists well: + list2.zip(bigList).foreach { case (l, r) => l shouldBe r } } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala new file mode 100644 index 0000000000..496ba79de5 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/LargePlanTest.scala @@ -0,0 +1,63 @@ +package com.twitter.scalding + +import org.scalatest.FunSuite + +import scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.Await +import scala.concurrent.duration._ + +/** + * on branch 0.17.x: + * - size=2 took 0.5 seconds + * - size=4 took 0.2 seconds + * - size=8 took 0.3 seconds + * - size=16 took 0.4 seconds + * - size=32 took 0.7 seconds + * - size=64 took 18.9 seconds + * - size=128 timed out (after 60 seconds) + * + * on branch cascading3: + * - size=2 took 0.6 seconds + * - size=4 took 0.3 seconds + * - size=8 took 0.3 seconds + * - size=16 took 0.4 seconds + * - size=32 took 0.5 seconds + * - size=64 took 1.2 seconds + * - size=128 took 2.7 seconds + */ + +class LargePlanTest extends FunSuite { + + val ns = List((1, 100), (2, 200)) + + // build a small pipe (only 2 keys) composed of a potentially large + // number of joins. 
+ def build(size: Int): TypedPipe[(Int, Int)] = { + val pipe = TypedPipe.from(ns) + if (size <= 0) pipe + else pipe.join(build(size - 1)).mapValues { case (x, y) => x + y } + } + + // each test might run for up to this long + val Timeout = 60.seconds // one minute + + // run a test at a particular size + def run(size: Int): Unit = { + val t0 = System.currentTimeMillis() + val pipe = build(size) + val exec = pipe.toIterableExecution + val fut = exec.run(Config.empty, Local(true)) + val values = Await.result(fut, Timeout) + val secs = "%.1f".format((System.currentTimeMillis() - t0) / 1000.0) + assert(values.nonEmpty) + println(s"size=$size took $secs seconds") + } + + test("size=2")(run(2)) + test("size=4")(run(4)) + test("size=8")(run(8)) + test("size=16")(run(16)) + test("size=32")(run(32)) + test("size=64")(run(64)) + // test("size=128") { run(128) } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala index b4cdd64946..e4a24062d3 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/LookupJoinTest.scala @@ -12,61 +12,205 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import com.twitter.scalding.typed.LookupJoin -import org.specs._ +import org.scalatest.{Matchers, WordSpec} -class LookupJoinerJob(args : Args) extends Job(args) { - import TDsl._ +import com.twitter.algebird.Semigroup - val in0 = TypedTsv[(Int,Int,Int)]("input0") - val in1 = TypedTsv[(Int,Int,Int)]("input1") +object LookupJoinedTest { - LookupJoin(TypedPipe.from(in0).map { case (t,k,v) => (t, (k, v)) }, - TypedPipe.from(in1).map { case (t,k,v) => (t, (k, v)) }) + // Not defined if there is a collision in K and T, so make those unique: + def genList(maxTime: Int, maxKey: Int, sz: Int): List[(Int, Int, Int)] = { + val rng = new java.util.Random + (0 until sz).view + .map { _ => + (rng.nextInt(maxTime), rng.nextInt(maxKey), rng.nextInt) + } + .groupBy { case (t, k, v) => (t, k) } + .mapValues(_.headOption.toList) + .values + .flatten + .toList + } +} + +class LookupJoinerJob(args: Args) extends Job(args) { + + val in0 = TypedTsv[(Int, Int, Int)]("input0") + val in1 = TypedTsv[(Int, Int, Int)]("input1") + + LookupJoin( + TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, + TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) } + ) .map { case (t, (k, (v, opt))) => (t.toString, k.toString, v.toString, opt.toString) } - .write(TypedTsv[(String,String,String,String)]("output")) + .write(TypedTsv[(String, String, String, String)]("output")) + + LookupJoin + .rightSumming( + TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, + TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) } + ) + .map { case (t, (k, (v, opt))) => + (t.toString, k.toString, v.toString, opt.toString) + } + .write(TypedTsv[(String, String, String, String)]("output2")) } -class LookupJoinedTest extends Specification { - noDetailedDiffs() - import Dsl._ - def lookupJoin[T:Ordering,K,V,W](in0: Iterable[(T,K,V)], in1: Iterable[(T,K,W)]) = { - // super inefficient, but easy to verify: +class LookupJoinedTest extends WordSpec with Matchers { + + 
import LookupJoinedTest.genList + + def lookupJoin[T: Ordering, K, V, W](in0: Iterable[(T, K, V)], in1: Iterable[(T, K, W)]) = { + val serv = in1.groupBy(_._2) def lookup(t: T, k: K): Option[W] = { - implicit val ord = Ordering.by { tkw: (T, K, W) => tkw._1 } - in1.filter { case (t1, k1, _) => (k1 == k) && Ordering[T].lt(t1, t) } - .reduceOption(Ordering[(T,K,W)].max(_, _)) - .map { _._3 } + val ord = Ordering.by { tkw: (T, K, W) => tkw._1 } + serv.get(k).flatMap { in1s => + in1s + .filter { case (t1, _, _) => Ordering[T].lt(t1, t) } + .reduceOption(ord.max(_, _)) + .map { + _._3 + } + } } - in0.map { case (t,k,v) => (t.toString, k.toString, v.toString, lookup(t, k).toString) } + in0.map { case (t, k, v) => (t.toString, k.toString, v.toString, lookup(t, k).toString) } } + + def lookupSumJoin[T: Ordering, K, V, W: Semigroup](in0: Iterable[(T, K, V)], in1: Iterable[(T, K, W)]) = { + implicit val ord: Ordering[(T, K, W)] = Ordering.by { + _._1 + } + val serv: Map[K, List[(T, K, W)]] = in1.groupBy(_._2).map { case (k, v) => + ( + k, + v.toList.sorted + .scanLeft(None: Option[(T, K, W)]) { (old, newer) => + old + .map { case (_, _, w) => (newer._1, newer._2, Semigroup.plus(w, newer._3)) } + .orElse(Some(newer)) + } + .collect { case Some(v) => v } + ) + } + + def lookup(t: T, k: K): Option[W] = { + val ord = Ordering.by { tkw: (T, K, W) => tkw._1 } + serv.get(k).flatMap { in1s => + in1s + .filter { case (t1, _, _) => Ordering[T].lt(t1, t) } + .reduceOption(ord.max(_, _)) + .map { + _._3 + } + } + } + in0.map { case (t, k, v) => (t.toString, k.toString, v.toString, lookup(t, k).toString) } + } + "A LookupJoinerJob" should { "correctly lookup" in { - val rng = new java.util.Random - val MAX_KEY = 10 - def genList(sz: Int): List[(Int, Int, Int)] = { - (0 until sz).map { _ => - (rng.nextInt, rng.nextInt(MAX_KEY), rng.nextInt) - }.toList - } - val in0 = genList(1000) - val in1 = genList(1000) + val MAX_KEY = 100 + val VAL_COUNT = 10000 + val in0 = genList(Int.MaxValue, 
MAX_KEY, VAL_COUNT) + val in1 = genList(Int.MaxValue, MAX_KEY, VAL_COUNT) JobTest(new LookupJoinerJob(_)) - .source(TypedTsv[(Int,Int,Int)]("input0"), in0) - .source(TypedTsv[(Int,Int,Int)]("input1"), in1) - .sink[(String, String, String, String)]( - TypedTsv[(String,String,String,String)]("output")) { outBuf => - outBuf.toSet must be_==(lookupJoin(in0, in1).toSet) - in0.size must be_==(outBuf.size) + .source(TypedTsv[(Int, Int, Int)]("input0"), in0) + .source(TypedTsv[(Int, Int, Int)]("input1"), in1) + .sink[(String, String, String, String)](TypedTsv[(String, String, String, String)]("output")) { + outBuf => + outBuf.toSet should equal(lookupJoin(in0, in1).toSet) + in0.size should equal(outBuf.size) + } + .sink[(String, String, String, String)](TypedTsv[(String, String, String, String)]("output2")) { + outBuf => + outBuf.toSet should equal(lookupSumJoin(in0, in1).toSet) + in0.size should equal(outBuf.size) + } + .run + // .runHadoop + .finish() + } + } +} + +class WindowLookupJoinerJob(args: Args) extends Job(args) { + + val in0 = TypedTsv[(Int, Int, Int)]("input0") + val in1 = TypedTsv[(Int, Int, Int)]("input1") + val window = args("window").toInt + + def gate(left: Int, right: Int) = + (left.toLong - right.toLong) < window + + LookupJoin + .withWindow( + TypedPipe.from(in0).map { case (t, k, v) => (t, (k, v)) }, + TypedPipe.from(in1).map { case (t, k, v) => (t, (k, v)) } + )(gate _) + .map { case (t, (k, (v, opt))) => + (t.toString, k.toString, v.toString, opt.toString) + } + .write(TypedTsv[(String, String, String, String)]("output")) +} + +class WindowLookupJoinedTest extends WordSpec with Matchers { + + import LookupJoinedTest.genList + + def windowLookupJoin[K, V, W](in0: Iterable[(Int, K, V)], in1: Iterable[(Int, K, W)], win: Int) = { + val serv = in1.groupBy(_._2) + // super inefficient, but easy to verify: + def lookup(t: Int, k: K): Option[W] = { + val ord = Ordering.by { tkw: (Int, K, W) => tkw._1 } + serv.get(k).flatMap { in1s => + in1s + .filter { 
case (t1, _, _) => + (t1 < t) && ((t.toLong - t1.toLong) < win) + } + .reduceOption(ord.max(_, _)) + .map { + _._3 + } + } + } + in0.map { case (t, k, v) => (t.toString, k.toString, v.toString, lookup(t, k).toString) } + } + + "A WindowLookupJoinerJob" should { + // Set up the job: + "correctly lookup" in { + val MAX_KEY = 10 + val MAX_TIME = 10000 + val sz: Int = 10000 + val in0 = genList(MAX_TIME, MAX_KEY, sz) + val in1 = genList(MAX_TIME, MAX_KEY, sz) + JobTest(new WindowLookupJoinerJob(_)) + .arg("window", "100") + .source(TypedTsv[(Int, Int, Int)]("input0"), in0) + .source(TypedTsv[(Int, Int, Int)]("input1"), in1) + .sink[(String, String, String, String)](TypedTsv[(String, String, String, String)]("output")) { + outBuf => + val results = outBuf.toList.sorted + val correct = windowLookupJoin(in0, in1, 100).toList.sorted + def some(it: List[(String, String, String, String)]) = + it.filter(_._4.startsWith("Some")) + + def none(it: List[(String, String, String, String)]) = + it.filter(_._4.startsWith("None")) + + some(results) shouldBe (some(correct)) + none(results) shouldBe (none(correct)) + in0.size should equal(outBuf.size) } .run - .runHadoop - .finish + // .runHadoop + .finish() } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala index 601042bd88..fb61419a48 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/PackTest.scala @@ -12,27 +12,28 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import cascading.tuple.TupleEntry -import org.specs._ -import scala.reflect.BeanProperty +import org.scalatest.{Matchers, WordSpec} +import scala.beans.BeanProperty import scala.collection.mutable.Buffer class IntContainer { private var firstValue = 0 def getFirstValue = firstValue - def setFirstValue(v : Int) { firstValue = v } + def setFirstValue(v: Int): Unit = + firstValue = v @BeanProperty // Test the other syntax var secondValue = 0 } object FatContainer { - def fromFibonacci(first : Int, second : Int) = { + def fromFibonacci(first: Int, second: Int) = { val fc = new FatContainer fc.f1 = first fc.f2 = second @@ -87,12 +88,11 @@ class FatContainer { @BeanProperty var f23 = 0 } -case class IntCaseClass(firstValue : Int, secondValue : Int) +case class IntCaseClass(firstValue: Int, secondValue: Int) -class ContainerPopulationJob (args : Args) extends Job(args) { - Tsv("input") - .read - .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v : (Int, Int) => v} +class ContainerPopulationJob(args: Args) extends Job(args) { + Tsv("input").read + .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .pack[IntContainer](('firstValue, 'secondValue) -> 'combined) .project('combined) .unpack[IntContainer]('combined -> ('firstValue, 'secondValue)) @@ -100,27 +100,24 @@ class ContainerPopulationJob (args : Args) extends Job(args) { .write(Tsv("output")) } -class ContainerToPopulationJob (args : Args) extends Job(args) { - Tsv("input") - .read - .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v : (Int, Int) => v} +class ContainerToPopulationJob(args: Args) extends Job(args) { + Tsv("input").read + .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .packTo[IntContainer](('firstValue, 'secondValue) -> 'combined) .unpackTo[IntContainer]('combined -> ('firstValue, 'secondValue)) .write(Tsv("output")) - Tsv("input") - .read - .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v : (Int, Int) => v} + Tsv("input").read + 
.mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } .packTo[IntCaseClass](('firstValue, 'secondValue) -> 'combined) .unpackTo[IntCaseClass]('combined -> ('firstValue, 'secondValue)) .write(Tsv("output-cc")) } -class FatContainerPopulationJob (args : Args) extends Job(args) { - Tsv("input") - .read - .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v : (Int, Int) => v} - .map(('firstValue, 'secondValue) -> 'fatContainer) { v : (Int, Int) => +class FatContainerPopulationJob(args: Args) extends Job(args) { + Tsv("input").read + .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } + .map(('firstValue, 'secondValue) -> 'fatContainer) { v: (Int, Int) => FatContainer.fromFibonacci(v._1, v._2) } .unpack[FatContainer]('fatContainer -> '*) @@ -128,37 +125,30 @@ class FatContainerPopulationJob (args : Args) extends Job(args) { .write(Tsv("output")) } -class FatContainerToPopulationJob (args : Args) extends Job(args) { - Tsv("input") - .read - .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v : (Int, Int) => v} - .map(('firstValue, 'secondValue) -> 'fatContainer) { v : (Int, Int) => +class FatContainerToPopulationJob(args: Args) extends Job(args) { + Tsv("input").read + .mapTo((0, 1) -> ('firstValue, 'secondValue)) { v: (Int, Int) => v } + .map(('firstValue, 'secondValue) -> 'fatContainer) { v: (Int, Int) => FatContainer.fromFibonacci(v._1, v._2) } .unpackTo[FatContainer]('fatContainer -> '*) .write(Tsv("output")) } -class PackTest extends Specification { - noDetailedDiffs() - - val inputData = List( - (1, 2), - (2, 2), - (3, 2) - ) +class PackTest extends WordSpec with Matchers { + val inputData = List((1, 2), (2, 2), (3, 2)) "A ContainerPopulationJob" should { - JobTest("com.twitter.scalding.ContainerPopulationJob") + JobTest(new ContainerPopulationJob(_)) .source(Tsv("input"), inputData) .sink[(Int, Int)](Tsv("output")) { buf => "correctly populate container objects" in { - buf.size must_== 3 - buf.toSet must_== inputData.toSet + buf should 
have size 3 + buf.toSet shouldBe inputData.toSet } } .run - .finish + .finish() } "A ContainerToPopulationJob" should { @@ -166,50 +156,51 @@ class PackTest extends Specification { .source(Tsv("input"), inputData) .sink[(Int, Int)](Tsv("output")) { buf => "correctly populate container objects" in { - buf.size must_== 3 - buf.toSet must_== inputData.toSet + buf should have size 3 + buf.toSet shouldBe inputData.toSet } } .sink[(Int, Int)](Tsv("output-cc")) { buf => "correctly populate container case class objects" in { - buf.size must_== 3 - buf.toSet must_== inputData.toSet + buf should have size 3 + buf.toSet shouldBe inputData.toSet } } .run - .finish + .finish() } val fatInputData = List((8, 13)) - val fatCorrect = List(8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811) + val fatCorrect = List(8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, + 28657, 46368, 75025, 121393, 196418, 317811) "A FatContainerPopulationJob" should { - JobTest("com.twitter.scalding.FatContainerPopulationJob") + JobTest(new FatContainerPopulationJob(_)) .source(Tsv("input"), fatInputData) - .sink[TupleEntry](Tsv("output")) { buf : Buffer[TupleEntry] => + .sink[TupleEntry](Tsv("output")) { buf: Buffer[TupleEntry] => "correctly populate a fat container object" in { val te = buf.head for (idx <- fatCorrect.indices) { - te.getInteger(idx) must_== fatCorrect(idx) + te.getInteger(idx) shouldBe fatCorrect(idx) } } } .run - .finish + .finish() } "A FatContainerToPopulationJob" should { - JobTest("com.twitter.scalding.FatContainerPopulationJob") + JobTest(new FatContainerPopulationJob(_)) .source(Tsv("input"), fatInputData) - .sink[TupleEntry](Tsv("output")) { buf : Buffer[TupleEntry] => + .sink[TupleEntry](Tsv("output")) { buf: Buffer[TupleEntry] => "correctly populate a fat container object" in { val te = buf.head for (idx <- fatCorrect.indices) { - te.getInteger(idx) must_== 
fatCorrect(idx) + te.getInteger(idx) shouldBe fatCorrect(idx) } } } .run - .finish + .finish() } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PageRankTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PageRankTest.scala deleted file mode 100644 index 115700188b..0000000000 --- a/scalding-core/src/test/scala/com/twitter/scalding/PageRankTest.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding - -import org.specs._ - -class PageRankTest extends Specification { - "A PageRank job" should { - JobTest(new com.twitter.scalding.examples.PageRank(_)). - arg("input", "inputFile"). - arg("output", "outputFile"). - arg("errorOut", "error"). - arg("temp", "tempBuffer"). - //How many iterations to do each time: - arg("iterations", "6"). - arg("convergence", "0.05"). - source(Tsv("inputFile"), List((1L,"2",1.0),(2L,"1,3",1.0),(3L,"2",1.0))). - //Don't check the tempBuffer: - sink[(Long,String,Double)](Tsv("tempBuffer")) { ob => () }. - sink[Double](TypedTsv[Double]("error")) { ob => - "have low error" in { - ob.head must be_<=(0.05) - } - }. 
- sink[(Long,String,Double)](Tsv("outputFile")){ outputBuffer => - val pageRank = outputBuffer.map { res => (res._1,res._3) }.toMap - "correctly compute pagerank" in { - val d = 0.85 - val twoPR = ( 1.0 + 2*d ) / (1.0 + d) - val otherPR = ( 1.0 + d / 2.0 ) / (1.0 + d) - println(pageRank) - (pageRank(1L) + pageRank(2L) + pageRank(3L)) must beCloseTo(3.0, 0.1) - pageRank(1L) must beCloseTo(otherPR, 0.1) - pageRank(2L) must beCloseTo(twoPR, 0.1) - pageRank(3L) must beCloseTo(otherPR, 0.1) - } - }. - run. - finish - } -} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala new file mode 100644 index 0000000000..4987c62904 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/PartitionSourceTest.scala @@ -0,0 +1,183 @@ +/* +Copyright 2014 Snowplow Analytics Ltd + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding + +import java.io.File +import scala.io.{Source => ScalaSource} + +import org.scalatest.{Matchers, WordSpec} + +import cascading.tap.SinkMode +import cascading.tuple.Fields +import cascading.tuple.TupleEntry +import cascading.util.Util +import cascading.tap.partition.Partition + +import com.twitter.scalding.{PartitionedTsv => StandardPartitionedTsv} + +object PartitionSourceTestHelpers { + import Dsl._ + + class CustomPartition(val partitionFields: Fields) extends Partition { + + def getPartitionFields(): Fields = partitionFields + def getPathDepth(): Int = 1 + + def toPartition(tupleEntry: TupleEntry): String = + "{" + Util.join(tupleEntry.asIterableOf(classOf[String]), "}->{", true) + "}" + + def toTuple(partition: String, tupleEntry: TupleEntry): Unit = + throw new RuntimeException("toTuple for reading not implemented") + } + + // Define once, here, otherwise testMode.getWritePathFor() won't work + val DelimitedPartitionedTsv = StandardPartitionedTsv("base", "/", 'col1) + val CustomPartitionedTsv = + StandardPartitionedTsv("base", new CustomPartition('col1, 'col2), false, Fields.ALL, SinkMode.REPLACE) + val PartialPartitionedTsv = StandardPartitionedTsv("base", "/", ('col1, 'col2), false, ('col1, 'col3)) +} + +class DelimitedPartitionTestJob(args: Args) extends Job(args) { + import PartitionSourceTestHelpers._ + try { + Tsv("input", ('col1, 'col2)).read.write(DelimitedPartitionedTsv) + } catch { + case e: Exception => e.printStackTrace() + } +} + +class CustomPartitionTestJob(args: Args) extends Job(args) { + import PartitionSourceTestHelpers._ + try { + Tsv("input", ('col1, 'col2, 'col3)).read.write(CustomPartitionedTsv) + } catch { + case e: Exception => e.printStackTrace() + } +} + +class PartialPartitionTestJob(args: Args) extends Job(args) { + import PartitionSourceTestHelpers._ + + try { + Tsv("input", ('col1, 'col2, 'col3)).read.write(PartialPartitionedTsv) + } catch { + case e: Exception => e.printStackTrace() + } +} 
+ +class DelimitedPartitionSourceTest extends WordSpec with Matchers { + import Dsl._ + import PartitionSourceTestHelpers._ + "PartitionedTsv fed a DelimitedPartition" should { + "split output by the delimited path" in { + val input = Seq(("A", 1), ("A", 2), ("B", 3)) + + // Need to save the job to allow, find the temporary directory data was written to + var job: Job = null; + def buildJob(args: Args): Job = { + job = new DelimitedPartitionTestJob(args) + job + } + + JobTest(buildJob(_)) + .source(Tsv("input", ('col1, 'col2)), input) + .runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = new File(testMode.getWritePathFor(DelimitedPartitionedTsv)) + + directory.listFiles().map(_.getName()).toSet shouldBe Set("A", "B") + + val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000-00000")) + val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000-00001")) + + aSource.getLines.toSeq shouldBe Seq("A\t1", "A\t2") + bSource.getLines.toSeq shouldBe Seq("B\t3") + } + } +} + +class CustomPartitionSourceTest extends WordSpec with Matchers { + import Dsl._ + import PartitionSourceTestHelpers._ + "PartitionedTsv fed a CustomPartition" should { + "split output by the custom path" in { + val input = Seq(("A", "x", 1), ("A", "x", 2), ("B", "y", 3)) + + // Need to save the job to allow, find the temporary directory data was written to + var job: Job = null; + def buildJob(args: Args): Job = { + job = new CustomPartitionTestJob(args) + job + } + + JobTest(buildJob(_)) + .source(Tsv("input", ('col1, 'col2, 'col3)), input) + .runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = new File(testMode.getWritePathFor(CustomPartitionedTsv)) + + directory.listFiles().map(_.getName()).toSet shouldBe Set("{A}->{x}", "{B}->{y}") + + val aSource = ScalaSource.fromFile(new File(directory, "{A}->{x}/part-00000-00000")) + val bSource = ScalaSource.fromFile(new File(directory, 
"{B}->{y}/part-00000-00001")) + + aSource.getLines.toSeq shouldBe Seq("A\tx\t1", "A\tx\t2") + bSource.getLines.toSeq shouldBe Seq("B\ty\t3") + } + } +} + +class PartialPartitionSourceTest extends WordSpec with Matchers { + import Dsl._ + import PartitionSourceTestHelpers._ + "PartitionedTsv fed a DelimitedPartition and only a subset of fields" should { + "split output by the delimited path, discarding the unwanted fields" in { + + val input = Seq(("A", "x", 1), ("A", "x", 2), ("B", "y", 3)) + + // Need to save the job to allow, find the temporary directory data was written to + var job: Job = null; + def buildJob(args: Args): Job = { + job = new PartialPartitionTestJob(args) + job + } + + JobTest(buildJob(_)) + .source(Tsv("input", ('col1, 'col2, 'col3)), input) + .runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = new File(testMode.getWritePathFor(PartialPartitionedTsv)) + + directory.listFiles().map(_.getName()).toSet shouldBe Set("A", "B") + + val aSource = ScalaSource.fromFile(new File(directory, "A/x/part-00000-00000")) + val bSource = ScalaSource.fromFile(new File(directory, "B/y/part-00000-00001")) + + aSource.getLines.toSeq shouldBe Seq("A\t1", "A\t2") + bSource.getLines.toSeq shouldBe Seq("B\t3") + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala index e6f6e70f33..88f0d0470d 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/PathFilterTest.scala @@ -1,37 +1,37 @@ package com.twitter.scalding -import org.specs.Specification -import org.apache.hadoop.fs.{Path => HadoopPath,PathFilter} +import org.scalatest.{Matchers, WordSpec} +import org.apache.hadoop.fs.{Path => HadoopPath, PathFilter} -class PathFilterTest extends Specification { +class PathFilterTest extends WordSpec with Matchers { "RichPathFilter" should { 
import RichPathFilter.toRichPathFilter val p = new HadoopPath("/nowhere") "compose ands" in { - AlwaysTrue.and(AlwaysTrue).accept(p) must be_==(true) - AlwaysTrue.and(AlwaysFalse).accept(p) must be_==(false) - AlwaysFalse.and(AlwaysTrue).accept(p) must be_==(false) - AlwaysFalse.and(AlwaysFalse).accept(p) must be_==(false) + AlwaysTrue.and(AlwaysTrue).accept(p) shouldBe true + AlwaysTrue.and(AlwaysFalse).accept(p) shouldBe false + AlwaysFalse.and(AlwaysTrue).accept(p) shouldBe false + AlwaysFalse.and(AlwaysFalse).accept(p) shouldBe false - AlwaysTrue.and(AlwaysTrue, AlwaysTrue).accept(p) must be_==(true) - AlwaysTrue.and(AlwaysTrue, AlwaysFalse).accept(p) must be_==(false) + AlwaysTrue.and(AlwaysTrue, AlwaysTrue).accept(p) shouldBe true + AlwaysTrue.and(AlwaysTrue, AlwaysFalse).accept(p) shouldBe false } "compose ors" in { - AlwaysTrue.or(AlwaysTrue).accept(p) must be_==(true) - AlwaysTrue.or(AlwaysFalse).accept(p) must be_==(true) - AlwaysFalse.or(AlwaysTrue).accept(p) must be_==(true) - AlwaysFalse.or(AlwaysFalse).accept(p) must be_==(false) + AlwaysTrue.or(AlwaysTrue).accept(p) shouldBe true + AlwaysTrue.or(AlwaysFalse).accept(p) shouldBe true + AlwaysFalse.or(AlwaysTrue).accept(p) shouldBe true + AlwaysFalse.or(AlwaysFalse).accept(p) shouldBe false - AlwaysFalse.or(AlwaysTrue, AlwaysTrue).accept(p) must be_==(true) - AlwaysTrue.or(AlwaysFalse, AlwaysFalse).accept(p) must be_==(true) + AlwaysFalse.or(AlwaysTrue, AlwaysTrue).accept(p) shouldBe true + AlwaysTrue.or(AlwaysFalse, AlwaysFalse).accept(p) shouldBe true } "negate nots" in { - AlwaysTrue.not.accept(p) must be_==(false) - AlwaysFalse.not.accept(p) must be_==(true) - AlwaysTrue.not.not.accept(p) must be_==(true) + AlwaysTrue.not.accept(p) shouldBe false + AlwaysFalse.not.accept(p) shouldBe true + AlwaysTrue.not.not.accept(p) shouldBe true } } @@ -43,4 +43,4 @@ object AlwaysTrue extends PathFilter { object AlwaysFalse extends PathFilter { override def accept(p: HadoopPath): Boolean = false -} \ No newline 
at end of file +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala index 528f2c483b..6bb6229c39 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ReduceOperationsTest.scala @@ -12,22 +12,23 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ -import com.twitter.scalding._ +import org.scalatest.{Matchers, WordSpec} class SortWithTakeJob(args: Args) extends Job(args) { try { Tsv("input0", ('key, 'item_id, 'score)).read .groupBy('key) { _.sortWithTake[(Long, Double)]((('item_id, 'score), 'top_items), 5) { - (item_0: (Long, Double), item_1: (Long, Double)) => if (item_0._2 == item_1._2) { item_0._1 > item_1._1 } else { item_0._2 > item_1._2 } + (item_0: (Long, Double), item_1: (Long, Double)) => + if (item_0._2 == item_1._2) { item_0._1 > item_1._1 } + else { item_0._2 > item_1._2 } } } .map('top_items -> 'top_items) { - //used to test that types are correct + // used to test that types are correct topItems: List[(Long, Double)] => topItems } .project('key, 'top_items) @@ -44,7 +45,7 @@ class SortedReverseTakeJob(args: Args) extends Job(args) { _.sortedReverseTake[(Long, Double)]((('item_id, 'score), 'top_items), 5) } .map('top_items -> 'top_items) { - //used to test that types are correct + // used to test that types are correct topItems: List[(Long, Double)] => topItems } .project('key, 'top_items) @@ -61,7 +62,7 @@ class SortedTakeJob(args: Args) extends Job(args) { _.sortedTake[(Long, Double)]((('item_id, 'score), 'top_items), 5) } .map('top_items -> 'top_items) { - //used to test that types are correct + // 
used to test that types are correct topItems: List[(Long, Double)] => topItems } .project('key, 'top_items) @@ -72,72 +73,87 @@ class SortedTakeJob(args: Args) extends Job(args) { } class ApproximateUniqueCountJob(args: Args) extends Job(args) { - implicit def utf8ToBytes(s: String) = com.twitter.bijection.Injection.utf8(s) + implicit def utf8ToBytes(s: String): Array[Byte] = com.twitter.bijection.Injection.utf8(s) try { Tsv("input0", ('category, 'model, 'os)).read .groupBy('category) { _.approximateUniqueCount[String]('os -> 'os_count) } + .map('os_count -> 'os_count) { osCount: Double => + osCount.toLong + } .write(Tsv("output0")) } catch { case e: Exception => e.printStackTrace() } } -class ReduceOperationsTest extends Specification { - noDetailedDiffs() +class ReduceOperationsTest extends WordSpec with Matchers { import Dsl._ - val inputData = List(("a", 2L, 3.0), ("a", 3L, 3.0), ("a", 1L, 3.5), ("b", 1L, 6.0), ("b", 2L, 5.0), ("b", 3L, 4.0), ("b", 4L, 3.0), ("b", 5L, 2.0), ("b", 6L, 1.0)) + val inputData = List( + ("a", 2L, 3.0), + ("a", 3L, 3.0), + ("a", 1L, 3.5), + ("b", 1L, 6.0), + ("b", 2L, 5.0), + ("b", 3L, 4.0), + ("b", 4L, 3.0), + ("b", 5L, 2.0), + ("b", 6L, 1.0) + ) "A sortWithTake job" should { - JobTest("com.twitter.scalding.SortWithTakeJob") + JobTest(new SortWithTakeJob(_)) .source(Tsv("input0", ('key, 'item_id, 'score)), inputData) .sink[(String, List[(Long, Double)])](Tsv("output0")) { buf => "grouped list" in { val whatWeWant: Map[String, String] = Map( - "a" -> List((1L, 3.5), (3L, 3.0), (2L, 3.0)).toString, - "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString) + "a" -> List((1L, 3.5), (3L, 3.0), (2L, 3.0)).toString, + "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString + ) val whatWeGet: Map[String, List[(Long, Double)]] = buf.toMap - whatWeGet.get("a").getOrElse("apples") must be_==(whatWeWant.get("a").getOrElse("oranges")) - whatWeGet.get("b").getOrElse("apples") must 
be_==(whatWeWant.get("b").getOrElse("oranges")) + whatWeGet.get("a").getOrElse("apples") shouldBe (whatWeWant.get("a").getOrElse("oranges")) + whatWeGet.get("b").getOrElse("apples") shouldBe (whatWeWant.get("b").getOrElse("oranges")) } } .runHadoop - .finish + .finish() } "A sortedTake job" should { - JobTest("com.twitter.scalding.SortedTakeJob") + JobTest(new SortedTakeJob(_)) .source(Tsv("input0", ('key, 'item_id, 'score)), inputData) .sink[(String, List[(Long, Double)])](Tsv("output0")) { buf => "grouped list" in { val whatWeWant: Map[String, String] = Map( - "a" -> List((1L, 3.5), (2L, 3.0), (3L, 3.0)).toString, - "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString) + "a" -> List((1L, 3.5), (2L, 3.0), (3L, 3.0)).toString, + "b" -> List((1L, 6.0), (2L, 5.0), (3L, 4.0), (4L, 3.0), (5L, 2.0)).toString + ) val whatWeGet: Map[String, List[(Long, Double)]] = buf.toMap - whatWeGet.get("a").getOrElse("apples") must be_==(whatWeWant.get("a").getOrElse("oranges")) - whatWeGet.get("b").getOrElse("apples") must be_==(whatWeWant.get("b").getOrElse("oranges")) + whatWeGet.get("a").getOrElse("apples") shouldBe (whatWeWant.get("a").getOrElse("oranges")) + whatWeGet.get("b").getOrElse("apples") shouldBe (whatWeWant.get("b").getOrElse("oranges")) } } .runHadoop - .finish + .finish() } "A sortedReverseTake job" should { - JobTest("com.twitter.scalding.SortedReverseTakeJob") + JobTest(new SortedReverseTakeJob(_)) .source(Tsv("input0", ('key, 'item_id, 'score)), inputData) .sink[(String, List[(Long, Double)])](Tsv("output0")) { buf => "grouped list" in { val whatWeWant: Map[String, String] = Map( - "a" -> List((3L, 3.0), (2L, 3.0), (1L, 3.5)).toString, - "b" -> List((6L, 1.0), (5L, 2.0), (4L, 3.0), (3L, 4.0), (2L, 5.0)).toString) + "a" -> List((3L, 3.0), (2L, 3.0), (1L, 3.5)).toString, + "b" -> List((6L, 1.0), (5L, 2.0), (4L, 3.0), (3L, 4.0), (2L, 5.0)).toString + ) val whatWeGet: Map[String, List[(Long, Double)]] = buf.toMap - 
whatWeGet.get("a").getOrElse("apples") must be_==(whatWeWant.get("a").getOrElse("oranges")) - whatWeGet.get("b").getOrElse("apples") must be_==(whatWeWant.get("b").getOrElse("oranges")) + whatWeGet.get("a").getOrElse("apples") shouldBe (whatWeWant.get("a").getOrElse("oranges")) + whatWeGet.get("b").getOrElse("apples") shouldBe (whatWeWant.get("b").getOrElse("oranges")) } } .runHadoop - .finish + .finish() } "An approximateUniqueCount job" should { @@ -147,21 +163,18 @@ class ReduceOperationsTest extends Specification { ("mobile", "droid x", "android") ) - JobTest("com.twitter.scalding.ApproximateUniqueCountJob") + JobTest(new ApproximateUniqueCountJob(_)) .source(Tsv("input0", ('category, 'model, 'os)), inputData) - .sink[(String, Double)](Tsv("output0")) { buf => + .sink[(String, Long)](Tsv("output0")) { buf => "grouped OS count" in { - val whatWeWant: Map[String, Double] = Map( - "laptop" -> 1.0, - "mobile" -> 2.0 - ) - val whatWeGet: Map[String, Double] = buf.toMap - whatWeGet.size must be_==(2) - whatWeGet.get("laptop").getOrElse("apples") must be_==(whatWeWant.get("laptop").getOrElse("oranges")) - whatWeGet.get("mobile").getOrElse("apples") must be_==(whatWeWant.get("mobile").getOrElse("oranges")) + val whatWeWant: Map[String, Long] = Map("laptop" -> 1, "mobile" -> 2) + val whatWeGet: Map[String, Long] = buf.toMap + whatWeGet should have size 2 + whatWeGet.get("laptop").getOrElse("apples") shouldBe (whatWeWant.get("laptop").getOrElse("oranges")) + whatWeGet.get("mobile").getOrElse("apples") shouldBe (whatWeWant.get("mobile").getOrElse("oranges")) } } .runHadoop - .finish + .finish() } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala new file mode 100644 index 0000000000..b6553fc786 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/ReferencedClassFinderTest.scala @@ -0,0 +1,74 @@ +package com.twitter.scalding + 
+import org.apache.hadoop.io.BytesWritable +import org.scalatest.{Matchers, WordSpec} + +case class C1(a: Int) +case class C2(b: Int) +case class C3(c: Int) +case class C4(d: Int) + +trait TraitType { + val tp2 = TypedPipe.from(List(C4(0), C4(1))) +} + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +class ReferencedClassFinderExample(args: Args) extends Job(args) with TraitType { + case class C5(e: Int) + + val tp = TypedPipe.from(List(C1(1), C1(1), C1(2), C1(3), C1(5))) + val grouped = tp.groupBy(c => C2(c.a))(new Ordering[C2] { + override def compare(a: C2, b: C2) = b.b - a.b + }) + // Verify that we can inspect private[this] fields + private[this] val withTuple = grouped.toList.mapValues(list => C3(list.length)) + // Verify that we don't assign a >= 128 token to a class that has a < 128 token + val bw = TypedPipe.from(List(new BytesWritable(Array[Byte](0, 1, 2)))) + // Verify we don't tokenize scala's array & primitive wrappers. + val ints = TypedPipe.from(List(0, 1, 2)) + val arr = TypedPipe.from(List(Array(0L), Array(1L), Array(2L))) + + val innerClass = TypedPipe.from(List(C5(2), C5(3), C5(5), C5(8))) + + withTuple.write(TypedTsv[(C2, C3)](args("output"))) +} + +class ReferencedClassFinderTest extends WordSpec with Matchers { + "JobClassFinder" should { + "Identify and tokenize used case classes" in { + val job = JobTest(new ReferencedClassFinderExample(_)) + .arg("output", "outputFile") + .sink[(C2, C3)](TypedTsv[(C2, C3)]("outputFile")) { _: Any => Unit } + .initJob(false) + + val config = Config.tryFrom(job.config).get + val tokenizedClasses = config.getCascadingSerializationTokens.values.toSet + val kryoRegisteredClasses = config.getKryoRegisteredClasses + + tokenizedClasses should contain(classOf[C1].getName) + tokenizedClasses should contain(classOf[C2].getName) + tokenizedClasses should contain(classOf[C3].getName) + tokenizedClasses should contain(classOf[C4].getName) + tokenizedClasses should 
contain(classOf[ReferencedClassFinderExample#C5].getName) + kryoRegisteredClasses should contain(classOf[C1]) + kryoRegisteredClasses should contain(classOf[C2]) + kryoRegisteredClasses should contain(classOf[C3]) + kryoRegisteredClasses should contain(classOf[C4]) + kryoRegisteredClasses should contain(classOf[ReferencedClassFinderExample#C5]) + + tokenizedClasses should not contain (classOf[BytesWritable].getName) + kryoRegisteredClasses should not contain (classOf[BytesWritable]) + // classOf[Int] will return the primitive int, so manually pass in scala's wrapper + tokenizedClasses should not contain "scala.Int" + tokenizedClasses should not contain "scala.Array" + } + + "Run successfully" in { + JobTest(new ReferencedClassFinderExample(_)) + .arg("output", "outputFile") + .sink[(C2, C3)](TypedTsv[(C2, C3)]("outputFile")) { _: Any => Unit } + .runHadoop + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala b/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala new file mode 100644 index 0000000000..a2a01837c3 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/RegressionTests.scala @@ -0,0 +1,26 @@ +package com.twitter.scalding + +import org.scalatest.FunSuite + +class RegressionTests extends FunSuite { + test("hashJoins + merges that fail in cascading 3") { + val p1 = + TypedPipe + .from(List(1, 2)) + .cross(TypedPipe.from(List(3, 4))) + + val p2 = + TypedPipe + .from(List(5, 6)) + .cross(TypedPipe.from(List(8, 9))) + + val p3 = p1 ++ p2 + val p4 = TypedPipe.from(List((8, 1), (10, 2))) ++ p3 + + val expected = List((1, 3), (1, 4), (2, 3), (2, 4), (5, 8), (5, 9), (6, 8), (6, 9), (8, 1), (10, 2)) + val values = p4.toIterableExecution + .waitFor(Config.empty, Local(true)) + .get + assert(values.toList.sorted == expected) + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala 
b/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala new file mode 100644 index 0000000000..6b0f1d5229 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/RichPipeSpecification.scala @@ -0,0 +1,113 @@ +package com.twitter.scalding + +import java.util.UUID + +import org.scalacheck.Prop._ +import org.scalacheck.{Gen, Properties} + +object RichPipeSpecification extends Properties("RichPipe") { + + import Gen._ + import cascading.pipe.{Pipe => CPipe} + + def extractPipeNumber(pipeName: String) = pipeName match { + case RichPipe.FormerAssignedPipeNamePattern(pipenum) => pipenum.toInt + case _ => 0 + } + + /* Note: in these tests, we can never compare to equality with the basePipeNumber or offsets from that; as the pipe + assigned names number sequence is a global atomic integer, and the test framework might run other tests in parallel + to this, we can only count on it being monotonically increasing. */ + + property( + "assignName carries over the old number " + + "if it was already an assigned name" + ) = forAll(posNum[Int]) { (oldNum: Int) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + + val p = new CPipe(s"_pipe_$oldNum") + val ap = RichPipe.assignName(p) + + val newNum = extractPipeNumber(ap.getName) + + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$oldNum") + } + + property( + "assignName carries over the last (12-hexdigits) group from the UUID " + + "if the old name included one" + ) = forAll(alphaStr, uuid, alphaStr) { (prefix: String, uuid: UUID, suffix: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + + val lastGroup = uuid.toString.split("-").last + val p = new CPipe(prefix + uuid + suffix) + val ap = RichPipe.assignName(p) + + val newNum = extractPipeNumber(ap.getName) + + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$lastGroup") + } + + property( + "assignName carries over the last (12-hexdigits) group from the *last* UUID " + + "if the old name 
included more than one" + ) = forAll(alphaStr, uuid, alphaStr, uuid, alphaStr) { + (prefix: String, uuid1: UUID, middle: String, uuid: UUID, suffix: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + + val lastGroup = uuid.toString.split("-").last + val p = new CPipe(prefix + uuid1 + middle + uuid + suffix) + val ap = RichPipe.assignName(p) + + val newNum = extractPipeNumber(ap.getName) + + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$lastGroup") + } + + property( + "assignName carries over the over the old number " + + "if it was already an assigned name carrying bits from a UUID" + ) = forAll(posNum[Int], uuid) { (oldNum: Int, uuid: UUID) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + val lastGroup = uuid.toString.split("-").last + + val p = new CPipe(s"_pipe_$oldNum-$lastGroup") + val ap = RichPipe.assignName(p) + + val newNum = extractPipeNumber(ap.getName) + + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$oldNum") + } + + val smallNames = Gen.choose(0, 12).flatMap(sz => Gen.listOfN(sz, alphaChar)).map(_.mkString) + + val longNames = Gen.choose(13, 256).flatMap(sz => Gen.listOfN(sz, alphaChar)).map(_.mkString) + + property( + "assignName carries over the whole old name " + + "if it's 12 characters or less" + ) = forAll(smallNames) { (name: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + val p = new CPipe(name) + val ap = RichPipe.assignName(p) + + val newNum = extractPipeNumber(ap.getName) + + (newNum > basePipeNumber) && ap.getName.endsWith(s"-$name") + } + + property( + "assignName carries over the last 12 characters of the old name " + + "if it's more than 12 characters" + ) = forAll(longNames) { (name: String) => + val basePipeNumber = extractPipeNumber(RichPipe.getNextName) + val nameEnd = name.subSequence(name.length - 12, name.length) + val p = new CPipe(name) + val ap = RichPipe.assignName(p) + + val newNum = extractPipeNumber(ap.getName) + + (newNum > 
basePipeNumber) && ap.getName.endsWith(s"-$nameEnd") + } + +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala index 6533753640..9e415cee3f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/ScanLeftTest.scala @@ -1,77 +1,31 @@ package com.twitter.scalding -import org.specs._ -import com.twitter.scalding._ +import org.scalatest.{Matchers, WordSpec} /** - * Simple Example: First group data by gender and then sort by height reverse order. - * Then add another column for each group which is the rank order of the height. + * Simple Example: First group data by gender and then sort by height reverse order. Then add another column + * for each group which is the rank order of the height. */ class AddRankingWithScanLeft(args: Args) extends Job(args) { - - Tsv("input1", ('gender, 'height)) - .read + Tsv("input1", ('gender, 'height)).read .groupBy('gender) { group => group.sortBy('height).reverse - group.scanLeft(('height) -> ('rank))((0L)) { - (rank: Long, user_id: Double) => - { - (rank + 1L) - } + group.scanLeft('height -> 'rank)(0L) { (rank: Long, user_id: Double) => + (rank + 1L) } } - // scanLeft generates an extra line per group, thus remove it + // scanLeft generates an extra line per group, thus remove it .filter('height) { x: String => x != null } .debug .write(Tsv("result1")) - } -/** - * Advanced example: Count seconds each user spent reading a blog article (using scanLeft) - * For the sake of simplicity we assume that you have converted date-time into epoch - */ -//class ScanLeftTimeExample(args: Args) extends Job(args) { -// -// Tsv("input2", ('epoch, 'user, 'event)) -// // Create a helper symbol first -// .insert('temp, 0L) -// // Group by user and sort by epoch in reverse, so that most recent event comes first -// .groupBy('user) { group => -// 
group.sortBy('epoch).reverse -// .scanLeft(('epoch, 'temp) -> ('originalEpoch, 'duration))((0L, 0L)) { -// (firstLine: (Long, Long), secondLine: (Long, Long)) => -// var delta = firstLine._1 - secondLine._1 -// // scanLeft is initialised with (0L,0L) so first subtraction -// // will result into a negative number! -// if (delta < 0L) delta = -delta -// (secondLine._1, delta) -// } -// } -// .project('epoch, 'user, 'event, 'duration) -// // Remove lines introduced by scanLeft and discard helping symbols -// .filter('epoch) { x: Any => x != null } -// // Order in ascending time -// .groupBy('user) { group => -// group.sortBy('epoch) -// } -// // You can now remove most recent events where we are uncertain of time spent -// .filter('duration) { x: Long => x < 10000L } -// .debug -// .write(Tsv("result2")) -// -//} - -class ScanLeftTest extends Specification { +class ScanLeftTest extends WordSpec with Matchers { import Dsl._ // --- A simple ranking job - val sampleInput1 = List( - ("male", "165.2"), - ("female", "172.2"), - ("male", "184.1"), - ("male", "125.4"), - ("female", "128.6")) + val sampleInput1 = + List(("male", "165.2"), ("female", "172.2"), ("male", "184.1"), ("male", "125.4"), ("female", "128.6")) // Each group sorted and ranking added highest person to shortest val expectedOutput1 = Set( @@ -79,54 +33,21 @@ class ScanLeftTest extends Specification { ("male", 165.2, 2), ("male", 125.4, 3), ("female", 172.2, 1), - ("female", 128.6, 2)) + ("female", 128.6, 2) + ) "A simple ranking scanleft job" should { - JobTest("com.twitter.scalding.AddRankingWithScanLeft") + JobTest(new AddRankingWithScanLeft(_)) .source(Tsv("input1", ('gender, 'height)), sampleInput1) .sink[(String, Double, Long)](Tsv("result1")) { outBuf1 => "produce correct number of records when filtering out null values" in { - outBuf1.size must_== 5 + outBuf1 should have size 5 } "create correct ranking per group, 1st being the heighest person of that group" in { - outBuf1.toSet must_== 
expectedOutput1 + outBuf1.toSet shouldBe expectedOutput1 } } .run - .finish + .finish() } - -// // --- A trickier duration counting job -// var sampleInput2 = List( -// (1370737000L, "userA", "/read/blog/123"), -// (1370737002L, "userB", "/read/blog/781"), -// (1370737028L, "userA", "/read/blog/621"), -// (1370737067L, "userB", "/add/comment/"), -// (1370737097L, "userA", "/read/blog/888"), -// (1370737103L, "userB", "/read/blog/999")) -// -// // Each group sorted and ranking added highest person to shortest -// val expectedOutput2 = Set( -// (1370737000L, "userA", "/read/blog/123", 28), // userA was reading blog/123 for 28 seconds -// (1370737028L, "userA", "/read/blog/621", 69), // userA was reading blog/621 for 69 seconds -// (1370737002L, "userB", "/read/blog/781", 65), // userB was reading blog/781 for 65 seconds -// (1370737067L, "userB", "/add/comment/", 36)) // userB was posting a comment for 36 seconds -// // Note that the blog/999 is not recorded as we can't tell how long userB spend on it based on the input -// -// "A more advanced time extraction scanleft job" should { -// JobTest("com.twitter.scalding.ScanLeftTimeExample") -// .source(Tsv("input2", ('epoch, 'user, 'event)), sampleInput2) -// .sink[(Long, String, String, Long)](Tsv("result2")) { outBuf2 => -// "produce correct number of records when filtering out null values" in { -// outBuf2.size must_== 4 -// } -// "create correct output per user" in { -// outBuf2.toSet must_== expectedOutput2 -// } -// } -// .run -// .finish -// } - } - diff --git a/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala index b5adf52279..f578ffc2ba 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/SideEffectTest.scala @@ -12,109 +12,112 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 
ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import scala.annotation.tailrec -import cascading.pipe._ -import org.specs._ +import org.scalatest.{Matchers, WordSpec} /* * Zip uses side effect construct to create zipped list. */ -class Zip(args : Args) extends Job(args) { +class Zip(args: Args) extends Job(args) { - //import RichPipe._ + // import RichPipe._ def createState = new { var lastLine: String = null - def release() {} + def release(): Unit = () } - val zipped = Tsv("line",('line)).pipe - .using { createState } - .flatMap[String, (String, String)] ('line -> ('l1, 'l2)) { case (accu, line) => - if (accu.lastLine == null) { - accu.lastLine = line - List() - } else { - val zipped = List((accu.lastLine, line)) - accu.lastLine = line - zipped - } + val zipped = Tsv("line", 'line).pipe + .using(createState) + .flatMap[String, (String, String)]('line -> ('l1, 'l2)) { case (accu, line) => + if (accu.lastLine == null) { + accu.lastLine = line + List() + } else { + val zipped = List((accu.lastLine, line)) + accu.lastLine = line + zipped } + } .project('l1, 'l2) zipped.write(Tsv("zipped")) } -class SideEffectTest extends Specification with FieldConversions { +class SideEffectTest extends WordSpec with Matchers with FieldConversions { "Zipper should do create zipped sequence. 
Coded with side effect" should { - JobTest("com.twitter.scalding.Zip") - .source(Tsv("line",('line)), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) + JobTest(new Zip(_)) + .source(Tsv("line", 'line), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"))) .sink[(String, String)](Tsv("zipped")) { ob => "correctly compute zipped sequence" in { val res = ob.toList val expected = List(("line1", "line2"), ("line2", "line3"), ("line3", "line4")) - res.zip(expected) foreach { - case ((a, b), (c, d)) => - a must be_== ( c ) - b must be_== ( d ) - } + res shouldBe expected } } .run - .finish + .finish() } } /* - * ZipBuffer uses (unneccessary) side effect to construct zipped. + * ZipBuffer uses (unnecessary) side effect to construct zipped. */ -class ZipBuffer(args : Args) extends Job(args) { +class ZipBuffer(args: Args) extends Job(args) { - //import RichPipe._ + // import RichPipe._ def createState = new { var lastLine: String = null - def release() {} + def release(): Unit = () } - val zipped = Tsv("line",('line)).pipe - .map('line -> 'oddOrEven) { line : String => line.substring(line.length-1).toInt % 2 match { - case 0 => "even" - case 1 => "odd" - }} + val zipped = Tsv("line", 'line).pipe + .map('line -> 'oddOrEven) { line: String => + line.substring(line.length - 1).toInt % 2 match { + case 0 => "even" + case 1 => "odd" + } + } .groupBy('oddOrEven) { - _.using { createState } - .mapStream('line -> ('l1, 'l2)) { (accu, iter : Iterator[String]) => { - accu.lastLine = iter.next() - for (line <- iter) yield { - val result = (accu.lastLine, line) - accu.lastLine = line - result + _.using(createState) + .mapStream('line -> ('l1, 'l2)) { (accu, iter: Iterator[String]) => + accu.lastLine = iter.next() + for (line <- iter) yield { + val result = (accu.lastLine, line) + accu.lastLine = line + result + } } - }} } - .project('l1, 'l2) + .project('l1, 'l2) zipped.write(Tsv("zipped")) } -class SideEffectBufferTest extends Specification 
with FieldConversions { +class SideEffectBufferTest extends WordSpec with Matchers with FieldConversions { "ZipBuffer should do create two zipped sequences, one for even lines and one for odd lines. Coded with side effect" should { JobTest("com.twitter.scalding.ZipBuffer") - .source(Tsv("line",('line)), List(Tuple1("line1"), Tuple1("line2"), Tuple1("line3"), Tuple1("line4"), Tuple1("line5"), Tuple1("line6"))) + .source( + Tsv("line", 'line), + List( + Tuple1("line1"), + Tuple1("line2"), + Tuple1("line3"), + Tuple1("line4"), + Tuple1("line5"), + Tuple1("line6") + ) + ) .sink[(String, String)](Tsv("zipped")) { ob => "correctly compute zipped sequence" in { val res = ob.toList.sorted - val expected = List(("line1", "line3"), ("line3", "line5"), ("line2", "line4"), ("line4", "line6")).sorted - res.zip(expected) foreach { - case ((a, b), (c, d)) => - a must be_== ( c ) - b must be_== ( d ) - } + val expected = + List(("line1", "line3"), ("line3", "line5"), ("line2", "line4"), ("line4", "line6")).sorted + res shouldBe expected } } .run - .finish + .finish() } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala index f507cce49b..d6cd93d32f 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/SkewJoinTest.scala @@ -12,53 +12,54 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.specs._ - -import cascading.pipe.joiner._ - -import java.lang.reflect.InvocationTargetException +import org.scalatest.{Matchers, WordSpec} import scala.collection.mutable.Buffer -class SkewJoinJob(args : Args) extends Job(args) { +class SkewJoinJob(args: Args) extends Job(args) { val sampleRate = args.getOrElse("sampleRate", "0.001").toDouble val reducers = args.getOrElse("reducers", "-1").toInt val replicationFactor = args.getOrElse("replicationFactor", "1").toInt - val replicator = if (args.getOrElse("replicator", "a") == "a") - SkewReplicationA(replicationFactor) - else - SkewReplicationB() + val replicator = + if (args.getOrElse("replicator", "a") == "a") + SkewReplicationA(replicationFactor) + else + SkewReplicationB() - val in0 = Tsv("input0").read.mapTo((0,1,2) -> ('x1, 'y1, 's1)) { input : (Int, Int, Int) => input } - val in1 = Tsv("input1").read.mapTo((0,1,2) -> ('x2, 'y2, 's2)) { input : (Int, Int, Int) => input } + val in0 = Tsv("input0").read.mapTo((0, 1, 2) -> ('x1, 'y1, 's1)) { input: (Int, Int, Int) => input } + val in1 = Tsv("input1").read.mapTo((0, 1, 2) -> ('x2, 'y2, 's2)) { input: (Int, Int, Int) => input } in0 .skewJoinWithSmaller('y1 -> 'y2, in1, sampleRate, reducers, replicator) .project('x1, 'y1, 's1, 'x2, 'y2, 's2) .write(Tsv("output")) // Normal inner join: - in0 + in0 .joinWithSmaller('y1 -> 'y2, in1) .project('x1, 'y1, 's1, 'x2, 'y2, 's2) .write(Tsv("jws-output")) } object JoinTestHelper { - import Dsl._ val rng = new java.util.Random - def generateInput(size: Int, max: Int): List[(String,String,String)] = { + def generateInput(size: Int, max: Int): List[(String, String, String)] = { def next: String = rng.nextInt(max).toString - (0 to size).map { i => (next, next, next) }.toList + (0 to size).map(i => (next, next, next)).toList } - type JoinResult = (Int,Int,Int,Int,Int,Int) + type JoinResult = (Int, Int, Int, Int, Int, Int) - def runJobWithArguments(fn: (Args) => Job, sampleRate : Double 
= 0.001, reducers : Int = -1, - replicationFactor : Int = 1, replicator : String = "a"): (List[JoinResult], List[JoinResult]) = { + def runJobWithArguments( + fn: (Args) => Job, + sampleRate: Double = 0.001, + reducers: Int = -1, + replicationFactor: Int = 1, + replicator: String = "a" + ): (List[JoinResult], List[JoinResult]) = { val skewResult = Buffer[JoinResult]() val innerResult = Buffer[JoinResult]() @@ -66,83 +67,81 @@ object JoinTestHelper { .arg("sampleRate", sampleRate.toString) .arg("reducers", reducers.toString) .arg("replicationFactor", replicationFactor.toString) - .arg("replicator", replicator.toString) + .arg("replicator", replicator) .source(Tsv("input0"), generateInput(1000, 100)) - .source(Tsv("input1"), generateInput(1000, 100)) - .sink[(Int,Int,Int,Int,Int,Int)](Tsv("output")) { outBuf => skewResult ++ outBuf } - .sink[(Int,Int,Int,Int,Int,Int)](Tsv("jws-output")) { outBuf => innerResult ++ outBuf } + .source(Tsv("input1"), generateInput(100, 100)) + .sink[(Int, Int, Int, Int, Int, Int)](Tsv("output"))(outBuf => skewResult ++= outBuf) + .sink[(Int, Int, Int, Int, Int, Int)](Tsv("jws-output"))(outBuf => innerResult ++= outBuf) .run - //.runHadoop //this takes MUCH longer to run. Commented out by default, but tests pass on my machine - .finish + // .runHadoop //this takes MUCH longer to run. 
Commented out by default, but tests pass on my machine + .finish() (skewResult.toList.sorted, innerResult.toList.sorted) } } -class SkewJoinPipeTest extends Specification { - noDetailedDiffs() - +class SkewJoinPipeTest extends WordSpec with Matchers { import JoinTestHelper._ "A SkewInnerProductJob" should { - "compute skew join with sampleRate = 0.001, using strategy A" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), sampleRate = 0.001, replicator = "a") - sk must_== inner + sk shouldBe inner } "compute skew join with sampleRate = 0.001, using strategy B" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), sampleRate = 0.001, replicator = "b") - sk must_== inner + sk shouldBe inner } "compute skew join with sampleRate = 0.1, using strategy A" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), sampleRate = 0.1, replicator = "a") - sk must_== inner + sk shouldBe inner } "compute skew join with sampleRate = 0.1, using strategy B" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), sampleRate = 0.1, replicator = "b") - sk must_== inner + sk shouldBe inner } "compute skew join with sampleRate = 0.9, using strategy A" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), sampleRate = 0.9, replicator = "a") - sk must_== inner + sk shouldBe inner } "compute skew join with sampleRate = 0.9, using strategy B" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), sampleRate = 0.9, replicator = "b") - sk must_== inner + sk shouldBe inner } "compute skew join with replication factor 5, using strategy A" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), replicationFactor = 5, replicator = "a") - sk must_== inner + sk shouldBe inner } "compute skew join with reducers = 10, using strategy A" in { val (sk, inner) = runJobWithArguments(new SkewJoinJob(_), reducers = 10, replicator = "a") - sk must_== inner + sk shouldBe inner } "compute skew join with reducers = 10, using strategy B" in { val (sk, inner) 
= runJobWithArguments(new SkewJoinJob(_), reducers = 10, replicator = "b") - sk must_== inner + sk shouldBe inner } } } -class CollidingKeySkewJoinJob(args : Args) extends Job(args) { +class CollidingKeySkewJoinJob(args: Args) extends Job(args) { val sampleRate = args.getOrElse("sampleRate", "0.001").toDouble val reducers = args.getOrElse("reducers", "-1").toInt val replicationFactor = args.getOrElse("replicationFactor", "1").toInt - val replicator = if (args.getOrElse("replicator", "a") == "a") - SkewReplicationA(replicationFactor) - else - SkewReplicationB() + val replicator = + if (args.getOrElse("replicator", "a") == "a") + SkewReplicationA(replicationFactor) + else + SkewReplicationB() - val in0 = Tsv("input0").read.mapTo((0,1,2) -> ('k1, 'k3, 'v1)) { input : (Int, Int, Int) => input } - val in1 = Tsv("input1").read.mapTo((0,1,2) -> ('k2, 'k3, 'v2)) { input : (Int, Int, Int) => input } + val in0 = Tsv("input0").read.mapTo((0, 1, 2) -> ('k1, 'k3, 'v1)) { input: (Int, Int, Int) => input } + val in1 = Tsv("input1").read.mapTo((0, 1, 2) -> ('k2, 'k3, 'v2)) { input: (Int, Int, Int) => input } in0 .skewJoinWithSmaller('k3 -> 'k3, in1, sampleRate, reducers, replicator) @@ -150,27 +149,25 @@ class CollidingKeySkewJoinJob(args : Args) extends Job(args) { .insert('z, 0) // Make it have the same schema as the non-colliding job .write(Tsv("output")) // Normal inner join: - in0 + in0 .joinWithSmaller('k3 -> 'k3, in1) .project('k1, 'k3, 'v1, 'k2, 'v2) .insert('z, 0) // Make it have the same schema as the non-colliding job .write(Tsv("jws-output")) } -class CollidingKeySkewJoinTest extends Specification { - noDetailedDiffs() +class CollidingKeySkewJoinTest extends WordSpec with Matchers { import JoinTestHelper._ "A CollidingSkewInnerProductJob" should { - "compute skew join with colliding fields, using strategy A" in { val (sk, inn) = runJobWithArguments(new CollidingKeySkewJoinJob(_), replicator = "a") - sk must_== inn + sk shouldBe inn } "compute skew join with colliding 
fields, using strategy B" in { val (sk, inn) = runJobWithArguments(new CollidingKeySkewJoinJob(_), replicator = "b") - sk must_== inn + sk shouldBe inn } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala b/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala index 0ba46d5a18..9f6c8c78a5 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/SourceSpec.scala @@ -12,21 +12,21 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import cascading.pipe.Pipe import cascading.tuple.Fields +import com.twitter.scalding.source._ -class SourceSpec extends Specification { - import Dsl._ +class SourceSpec extends WordSpec with Matchers { "A case class Source" should { "inherit equality properly from TimePathedSource" in { - implicit val tz = DateOps.UTC - implicit val parser = DateParser.default + implicit val tz: java.util.TimeZone = DateOps.UTC + implicit val parser: DateParser = DateParser.default val d1 = RichDate("2012-02-01") val d2 = RichDate("2012-02-02") @@ -37,70 +37,129 @@ class SourceSpec extends Specification { val a = DailySuffixTsv("/test")(dr1) val b = DailySuffixTsv("/test")(dr2) val c = DailySuffixTsv("/testNew")(dr1) - val d = DailySuffixTsvSecond("/test")(dr1) + val d = new DailySuffixTsvSecond("/testNew")(dr1) val e = DailySuffixTsv("/test")(dr1) - (a == b) must beFalse - (b == c) must beFalse - (a == d) must beFalse - (a == e) must beTrue + a should not be b + b should not be c + a should not be d + a shouldBe e } } + class DailySuffixTsvSecond(prefix: String, fs: Fields = Fields.ALL)( + override implicit val dateRange: DateRange + ) extends 
DailySuffixSource(prefix, dateRange) + with DelimitedScheme { + override val fields = fs + } + "A Source with overriden transformForRead and transformForWrite" should { "respect these overrides even for tests" in { JobTest(new AddRemoveOneJob(_)) .source(AddOneTsv("input"), List((0, "0"), (1, "1"))) .sink[(String, String)](RemoveOneTsv("output")) { buf => - buf.toSet must_== Set(("0", "0"), ("1", "1")) + buf.toSet shouldBe Set(("0", "0"), ("1", "1")) } .run - .finish + .finish() } } } -case class DailySuffixTsv(p : String)(dr : DateRange) - extends TimePathedSource(p + TimePathedSource.YEAR_MONTH_DAY + "/*", dr, DateOps.UTC) - -case class DailySuffixTsvSecond(p : String)(dr : DateRange) - extends TimePathedSource(p + TimePathedSource.YEAR_MONTH_DAY + "/*", dr, DateOps.UTC) - -case class AddOneTsv(p : String) extends FixedPathSource(p) - with DelimitedScheme with Mappable[(Int, String, String)] { +case class AddOneTsv(p: String) + extends FixedPathSource(p) + with DelimitedScheme + with Mappable[(Int, String, String)] { import Dsl._ import TDsl._ override val transformInTest = true override val sourceFields = new Fields("one", "two", "three") override def converter[U >: (Int, String, String)] = - TupleConverter.asSuperConverter[(Int, String, String), U](implicitly[TupleConverter[(Int, String, String)]]) - override def transformForRead(p: Pipe) = { - p.mapTo((0, 1) -> ('one, 'two, 'three)) { - t: (Int, String) => t :+ "1" + TupleConverter.asSuperConverter[(Int, String, String), U]( + implicitly[TupleConverter[(Int, String, String)]] + ) + override def transformForRead(p: Pipe) = + p.mapTo((0, 1) -> ('one, 'two, 'three)) { t: (Int, String) => + t :+ "1" } - } } -case class RemoveOneTsv(p : String) extends FixedPathSource(p) - with DelimitedScheme with Mappable[(Int, String, String)] { +case class RemoveOneTsv(p: String) + extends FixedPathSource(p) + with DelimitedScheme + with Mappable[(Int, String, String)] { override val transformInTest = true import Dsl._ override 
val sourceFields = new Fields("one", "two", "three") override def converter[U >: (Int, String, String)] = - TupleConverter.asSuperConverter[(Int, String, String), U](implicitly[TupleConverter[(Int, String, String)]]) - override def transformForWrite(p: Pipe) = { - p.mapTo(('one, 'two, 'three) -> (0, 1)) { - t: (Int, String, String) => (t._1, t._2) + TupleConverter.asSuperConverter[(Int, String, String), U]( + implicitly[TupleConverter[(Int, String, String)]] + ) + override def transformForWrite(p: Pipe) = + p.mapTo(('one, 'two, 'three) -> (0, 1)) { t: (Int, String, String) => + (t._1, t._2) } - } } class AddRemoveOneJob(args: Args) extends Job(args) { - AddOneTsv("input") - .read + AddOneTsv("input").read - //just for fun lets just switch all 1s with 2s + // just for fun lets just switch all 1s with 2s .map('three -> 'three) { s: String => "2" } - .write(RemoveOneTsv("output")) } + +class MapTypedPipe(args: Args) extends Job(args) { + TypedPipe + .from(TypedText.tsv[(Int, String)]("input")) + .map(MapFunctionAndThenTest.mapFunction) + .write(TypedText.tsv[(Int, String, Int)]("output")) +} + +class IdentityTypedPipe(args: Args) extends Job(args) { + TypedPipe + .from( + TypedText + .tsv[(Int, String)]("input") + .andThen(MapFunctionAndThenTest.mapFunction) + ) + .write(TypedText.tsv[(Int, String, Int)]("output")) +} + +object MapFunctionAndThenTest { + def mapFunction(input: (Int, String)): (Int, String, Int) = + (input._1, input._2, input._1) + + val input: List[(Int, String)] = List((0, "a"), (1, "b"), (2, "c")) + val output: List[(Int, String, Int)] = List((0, "a", 0), (1, "b", 1), (2, "c", 2)) +} +class TypedPipeAndThenTest extends WordSpec with Matchers { + import MapFunctionAndThenTest._ + "Mappable.andThen is like TypedPipe.map" should { + JobTest(new MapTypedPipe(_)) + .source(TypedText.tsv[(Int, String)]("input"), input) + .typedSink(TypedText.tsv[(Int, String, Int)]("output")) { outputBuffer => + val outMap = outputBuffer.toList + "TypedPipe return proper 
results" in { + outMap should have size 3 + outMap shouldBe output + } + } + .run + .finish() + + JobTest(new IdentityTypedPipe(_)) + .source(TypedText.tsv[(Int, String)]("input"), input) + .typedSink(TypedText.tsv[(Int, String, Int)]("output")) { outputBuffer => + val outMap = outputBuffer.toList + "Mappable.andThen return proper results" in { + outMap should have size 3 + outMap shouldBe output + } + } + .run + .finish() + + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala new file mode 100644 index 0000000000..59fa003e32 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/StatsTest.scala @@ -0,0 +1,68 @@ +package com.twitter.scalding + +import cascading.flow.FlowException +import org.scalatest.{Matchers, WordSpec} + +import scala.util.Try + +class StatsTestJob1(args: Args) extends Job(args) with CounterVerification { + val nonZero = Stat("number of non-zero records", "stats") + + TypedPipe + .from(TypedTsv[(String, Int)](args("input"))) + .map { kv => + if (kv._2 != 0) nonZero.inc() + (kv._1.toLowerCase, kv._2) + } + .write(TypedTsv[(String, Int)](args("output"))) + + override def verifyCounters(counters: Map[StatKey, Long]): Try[Unit] = Try { + assert(counters(nonZero) > 0) + } +} + +class StatsTestJob2(args: Args) extends StatsTestJob1(args) { + override def verifyCountersInTest: Boolean = false +} + +class StatsTest extends WordSpec with Matchers { + + val goodInput = List(("a", 0), ("b", 1), ("c", 2)) + val badInput = List(("a", 0), ("b", 0), ("c", 0)) + + def runJobTest[T: TupleSetter](f: Args => Job, input: List[T]): Unit = + JobTest(f) + .arg("input", "input") + .arg("output", "output") + .source(TypedTsv[(String, Int)]("input"), input) + .sink[(String, Int)](TypedTsv[(String, Int)]("output"))(outBuf => outBuf shouldBe input) + .run + + "StatsTestJob" should { + "pass if verifyCounters() is true" in { + runJobTest(new StatsTestJob1(_), 
goodInput) + } + } + + it should { + "fail if verifyCounters() is false" in { + an[FlowException] should be thrownBy runJobTest(new StatsTestJob1(_), badInput) + } + } + + it should { + "skip verifyCounters() if job fails" in { + (the[FlowException] thrownBy runJobTest( + new StatsTestJob1(_), + List((null, 0)) + )).getCause.getCause shouldBe a[NullPointerException] + } + } + + it should { + "skip verifyCounters() if verifyCountersInTest is false" in { + runJobTest(new StatsTestJob2(_), badInput) + } + } + +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala index e151aa7ced..551681aff4 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TemplateSourceTest.scala @@ -12,28 +12,24 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import java.io.File import scala.io.{Source => ScalaSource} -import org.specs._ - -import cascading.tap.SinkMode -import cascading.tuple.Fields +import org.scalatest.{Matchers, WordSpec} class TemplateTestJob(args: Args) extends Job(args) { try { Tsv("input", ('col1, 'col2)).read.write(TemplatedTsv("base", "%s", 'col1)) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } -class TemplateSourceTest extends Specification { - noDetailedDiffs() +class TemplateSourceTest extends WordSpec with Matchers { import Dsl._ "TemplatedTsv" should { "split output by template" in { @@ -49,19 +45,19 @@ class TemplateSourceTest extends Specification { JobTest(buildJob(_)) .source(Tsv("input", ('col1, 'col2)), input) .runHadoop - .finish + .finish() val testMode = job.mode.asInstanceOf[HadoopTest] val directory = new File(testMode.getWritePathFor(TemplatedTsv("base", "%s", 'col1))) - directory.listFiles().map({ _.getName() }).toSet mustEqual Set("A", "B") + directory.listFiles().map(_.getName()).toSet shouldBe Set("A", "B") val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000")) val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000")) - aSource.getLines.toList mustEqual Seq("A\t1", "A\t2") - bSource.getLines.toList mustEqual Seq("B\t3") + aSource.getLines.toList shouldBe Seq("A\t1", "A\t2") + bSource.getLines.toList shouldBe Seq("B\t3") } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala index 8978be7b9e..d92c0f02c7 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TestTapFactoryTest.scala @@ -2,41 +2,31 @@ package com.twitter.scalding import cascading.tap.Tap import cascading.tuple.{Fields, Tuple} -import java.lang.IllegalArgumentException import 
scala.collection.mutable.Buffer -import org.specs.Specification +import org.scalatest.{Matchers, WordSpec} -class TestTapFactoryTest extends Specification { +class TestTapFactoryTest extends WordSpec with Matchers { "A test tap created by TestTapFactory" should { - "error helpfully when a source is not in the map for test buffers" >> { + "error helpfully when a source is not in the map for test buffers" in { // Source to use for this test. - val testSource = new Tsv("path") + val testSource = Tsv("path") // Map of sources to use when creating the tap-- does not contain testSource val emptySourceMap = Map[Source, Buffer[Tuple]]() - def buffers(s: Source): Option[Buffer[Tuple]] = { - if (emptySourceMap.contains(s)) { - Some(emptySourceMap(s)) - } else { - None - } - } - val testFields = new Fields() + val testMode = Test(emptySourceMap.get(_)) + val testTapFactory = TestTapFactory(testSource, new Fields()) - val testMode = Test(buffers) - val testTapFactory = TestTapFactory(testSource, testFields) + def createIllegalTap(accessMode: AccessMode): Tap[Any, Any, Any] = + testTapFactory.createTap(accessMode)(testMode).asInstanceOf[Tap[Any, Any, Any]] - def createIllegalTap(): Tap[Any, Any, Any] = - testTapFactory.createTap(Read)(testMode).asInstanceOf[Tap[Any, Any, Any]] + (the[IllegalArgumentException] thrownBy { + createIllegalTap(Read) + } should have).message("requirement failed: " + TestTapFactory.sourceNotFoundError.format(testSource)) - createIllegalTap() must throwA[IllegalArgumentException].like { - case iae: IllegalArgumentException => - iae.getMessage mustVerify( - _.contains(TestTapFactory.sourceNotFoundError.format(testSource))) - } + (the[IllegalArgumentException] thrownBy { + createIllegalTap(Write) + } should have).message("requirement failed: " + TestTapFactory.sinkNotFoundError.format(testSource)) } - } - } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala 
b/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala new file mode 100644 index 0000000000..561a17e3a1 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/TimePathedSourceTest.scala @@ -0,0 +1,41 @@ +/* +Copyright 2016 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import java.util.TimeZone + +import org.scalatest.{Matchers, WordSpec} + +class TimePathedSourceTest extends WordSpec with Matchers { + "TimePathedSource.hdfsWritePath" should { + val dateRange = DateRange(RichDate(0L), RichDate(0L)) + val utcTZ = DateOps.UTC + + "crib if path == /*" in { + intercept[AssertionError](TestTimePathedSource("/*", dateRange, utcTZ).hdfsWritePath) + } + + "crib if path doesn't end with /*" in { + intercept[AssertionError](TestTimePathedSource("/my/invalid/path", dateRange, utcTZ).hdfsWritePath) + } + + "work for path ending with /*" in { + TestTimePathedSource("/my/path/*", dateRange, utcTZ).hdfsWritePath.startsWith("/my/path") + } + } +} + +case class TestTimePathedSource(p: String, dr: DateRange, t: TimeZone) extends TimePathedSource(p, dr, t) diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala index be63e8c3ba..f32ac670aa 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TupleTest.scala @@ -12,80 +12,119 @@ distributed under 
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import cascading.tuple.{TupleEntry,Tuple=>CTuple} +import cascading.tuple.{Tuple => CTuple, TupleEntry} -import org.specs._ +import org.scalatest.{Matchers, WordSpec} -class TupleTest extends Specification { - noDetailedDiffs() //Fixes issue for scala 2.9 +class TupleTest extends WordSpec with Matchers { + def get[T](ctup: CTuple)(implicit tc: TupleConverter[T]) = tc(new TupleEntry(ctup)) + def set[T](t: T)(implicit ts: TupleSetter[T]): CTuple = ts(t) - def get[T](ctup : CTuple)(implicit tc : TupleConverter[T]) = tc(new TupleEntry(ctup)) - def set[T](t : T)(implicit ts : TupleSetter[T]) : CTuple = ts(t) - - def arityConvMatches[T](t : T, ar : Int)(implicit tc : TupleConverter[T]) : Boolean = { + def arityConvMatches[T](t: T, ar: Int)(implicit tc: TupleConverter[T]): Boolean = { + assert(t != null) tc.arity == ar } - def aritySetMatches[T](t : T, ar : Int)(implicit tc : TupleSetter[T]) : Boolean = { + def aritySetMatches[T](t: T, ar: Int)(implicit tc: TupleSetter[T]): Boolean = { + assert(t != null) tc.arity == ar } - def roundTrip[T](t : T)(implicit tc : TupleConverter[T], ts : TupleSetter[T]) : Boolean = { + def roundTrip[T](t: T)(implicit tc: TupleConverter[T], ts: TupleSetter[T]): Boolean = tc(new TupleEntry(ts(t))) == t - } "TupleConverters" should { "TupleGetter should work as a type-class" in { val emptyTup = new CTuple - val ctup = new CTuple("hey",new java.lang.Long(2), new java.lang.Integer(3), emptyTup) - TupleGetter.get[String](ctup, 0) must be_==("hey") - TupleGetter.get[Long](ctup, 1) must be_==(2L) - TupleGetter.get[Int](ctup, 2) must be_==(3) - TupleGetter.get[CTuple](ctup, 3) must be_==(emptyTup) + val ctup = new CTuple("hey", new java.lang.Long(2), new java.lang.Integer(3), emptyTup) + 
TupleGetter.get[String](ctup, 0) shouldBe "hey" + TupleGetter.get[Long](ctup, 1) shouldBe 2L + TupleGetter.get[Int](ctup, 2) shouldBe 3 + TupleGetter.get[CTuple](ctup, 3) shouldBe emptyTup } "get primitives out of cascading tuples" in { - val ctup = new CTuple("hey",new java.lang.Long(2), new java.lang.Integer(3)) - get[(String,Long,Int)](ctup) must be_==(("hey",2L,3)) - - roundTrip[Int](3) must beTrue - arityConvMatches(3,1) must beTrue - aritySetMatches(3,1) must beTrue - roundTrip[Long](42L) must beTrue - arityConvMatches(42L,1) must beTrue - aritySetMatches(42L,1) must beTrue - roundTrip[String]("hey") must beTrue - arityConvMatches("hey",1) must beTrue - aritySetMatches("hey",1) must beTrue - roundTrip[(Int,Int)]((4,2)) must beTrue - arityConvMatches((2,3),2) must beTrue - aritySetMatches((2,3),2) must beTrue + val ctup = new CTuple("hey", new java.lang.Long(2), new java.lang.Integer(3)) + get[(String, Long, Int)](ctup) shouldBe ("hey", 2L, 3) + + roundTrip[Int](3) shouldBe true + arityConvMatches(3, 1) shouldBe true + aritySetMatches(3, 1) shouldBe true + roundTrip[Long](42L) shouldBe true + arityConvMatches(42L, 1) shouldBe true + aritySetMatches(42L, 1) shouldBe true + roundTrip[String]("hey") shouldBe true + arityConvMatches("hey", 1) shouldBe true + aritySetMatches("hey", 1) shouldBe true + roundTrip[(Int, Int)]((4, 2)) shouldBe true + arityConvMatches((2, 3), 2) shouldBe true + aritySetMatches((2, 3), 2) shouldBe true } "get non-primitives out of cascading tuples" in { - val ctup = new CTuple(None,List(1,2,3), 1->2 ) - get[(Option[Int],List[Int],(Int,Int))](ctup) must be_==((None,List(1,2,3), 1->2 )) + val ctup = new CTuple(None, List(1, 2, 3), 1 -> 2) + get[(Option[Int], List[Int], (Int, Int))](ctup) shouldBe (None, List(1, 2, 3), 1 -> 2) - roundTrip[(Option[Int],List[Int])]((Some(1),List())) must beTrue - arityConvMatches((None,Nil),2) must beTrue - aritySetMatches((None,Nil),2) must beTrue + roundTrip[(Option[Int], List[Int])]((Some(1), List())) 
shouldBe true + arityConvMatches((None, Nil), 2) shouldBe true + aritySetMatches((None, Nil), 2) shouldBe true - arityConvMatches(None,1) must beTrue - aritySetMatches(None,1) must beTrue - arityConvMatches(List(1,2,3),1) must beTrue - aritySetMatches(List(1,2,3),1) must beTrue + arityConvMatches(None, 1) shouldBe true + aritySetMatches(None, 1) shouldBe true + arityConvMatches(List(1, 2, 3), 1) shouldBe true + aritySetMatches(List(1, 2, 3), 1) shouldBe true } "deal with AnyRef" in { - val ctup = new CTuple(None,List(1,2,3), 1->2 ) - get[(AnyRef,AnyRef,AnyRef)](ctup) must be_==((None,List(1,2,3), 1->2 )) - get[AnyRef](new CTuple("you")) must be_==("you") - - roundTrip[AnyRef]("hey") must beTrue - roundTrip[(AnyRef,AnyRef)]((Nil,Nil)) must beTrue - arityConvMatches[(AnyRef,AnyRef)](("hey","you"),2) must beTrue - aritySetMatches[(AnyRef,AnyRef)](("hey","you"),2) must beTrue + val ctup = new CTuple(None, List(1, 2, 3), 1 -> 2) + get[(AnyRef, AnyRef, AnyRef)](ctup) shouldBe (None, List(1, 2, 3), 1 -> 2) + get[AnyRef](new CTuple("you")) shouldBe "you" + + roundTrip[AnyRef]("hey") shouldBe true + roundTrip[(AnyRef, AnyRef)]((Nil, Nil)) shouldBe true + arityConvMatches[(AnyRef, AnyRef)](("hey", "you"), 2) shouldBe true + aritySetMatches[(AnyRef, AnyRef)](("hey", "you"), 2) shouldBe true + } + + "TupleConverter/Setters have good equality" in { + assert(TupleConverter.singleConverter[Int] == TupleConverter.singleConverter[Int]) + assert(TupleConverter.singleConverter[String] == TupleConverter.singleConverter[String]) + assert(TupleConverter.singleConverter[(Int, String)] == TupleConverter.singleConverter[(Int, String)]) + + assert(TupleConverter.tuple2Converter[Int, Int] == TupleConverter.tuple2Converter[Int, Int]) + assert(TupleConverter.tuple2Converter[Int, String] == TupleConverter.tuple2Converter[Int, String]) + assert( + TupleConverter.tuple2Converter[Int, (Int, String)] == TupleConverter + .tuple2Converter[Int, (Int, String)] + ) + + 
assert(TupleSetter.singleSetter[Int] == TupleSetter.singleSetter[Int]) + assert(TupleSetter.singleSetter[String] == TupleSetter.singleSetter[String]) + assert(TupleSetter.singleSetter[(Int, String)] == TupleSetter.singleSetter[(Int, String)]) + + assert(TupleSetter.tup2Setter[(Int, Int)] == TupleSetter.tup2Setter[(Int, Int)]) + assert(TupleSetter.tup2Setter[(String, Int)] == TupleSetter.tup2Setter[(String, Int)]) + assert( + TupleSetter.tup2Setter[((Int, String), String)] == TupleSetter.tup2Setter[((Int, String), String)] + ) + } + + "CascadingBackend can tell Converter/Setter inverses" in { + import com.twitter.scalding.typed.cascading_backend.CascadingBackend + + assert( + CascadingBackend + .areDefiniteInverse(TupleConverter.singleConverter[Any], TupleSetter.singleSetter[Any]) + ) + assert( + !CascadingBackend + .areDefiniteInverse(TupleConverter.singleConverter[Any], TupleSetter.tup2Setter[(Any, Any)]) + ) + assert( + CascadingBackend + .areDefiniteInverse(TupleConverter.tuple2Converter[Any, Any], TupleSetter.tup2Setter[(Any, Any)]) + ) } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala index 6179abfee8..cf204690ef 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedDelimitedTest.scala @@ -12,17 +12,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.specs._ -import com.twitter.scalding._ +import org.scalatest.{Matchers, WordSpec} +import com.twitter.scalding.source.DailySuffixTypedTsv class TypedTsvJob(args: Args) extends Job(args) { try { TypedTsv[(String, Int)]("input0").read.write(TypedTsv[(String, Int)]("output0")) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } @@ -30,7 +30,7 @@ class TypedCsvJob(args: Args) extends Job(args) { try { TypedCsv[(String, Int)]("input0").read.write(TypedCsv[(String, Int)]("output0")) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } @@ -38,7 +38,7 @@ class TypedPsvJob(args: Args) extends Job(args) { try { TypedPsv[(String, Int)]("input0").read.write(TypedPsv[(String, Int)]("output0")) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } @@ -46,73 +46,92 @@ class TypedOsvJob(args: Args) extends Job(args) { try { TypedOsv[(String, Int)]("input0").read.write(TypedOsv[(String, Int)]("output0")) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } -class TypedDelimitedTest extends Specification { - noDetailedDiffs() - import Dsl._ +object DailySuffixTypedTsvJob { + val strd1 = "2014-05-01" + val strd2 = "2014-05-02" + implicit val tz: java.util.TimeZone = DateOps.UTC + implicit val parser: DateParser = DateParser.default + implicit val dr1: DateRange = DateRange(RichDate(strd1), RichDate(strd2)) + + def source(str: String) = DailySuffixTypedTsv[(String, Int)](str) + +} + +class DailySuffixTypedTsvJob(args: Args) extends Job(args) with UtcDateRangeJob { + try { + DailySuffixTypedTsvJob.source("input0").read.write(TypedTsv[(String, Int)]("output0")) + } catch { + case e: Exception => e.printStackTrace() + } +} + +class TypedDelimitedTest extends WordSpec with Matchers { val data = List(("aaa", 1), ("bbb", 2)) "A TypedTsv Source" should { 
JobTest(new TypedTsvJob(_)) .source(TypedTsv[(String, Int)]("input0"), data) - .sink[(String, Int)](TypedTsv[(String, Int)]("output0")) { buf => + .typedSink(TypedTsv[(String, Int)]("output0")) { buf => "read and write data" in { - buf must be_==(data) + buf shouldBe data } } .run - .finish + .finish() } "A TypedCsv Source" should { JobTest(new TypedCsvJob(_)) .source(TypedCsv[(String, Int)]("input0"), data) - .sink[(String, Int)](TypedCsv[(String, Int)]("output0")) { buf => + .typedSink(TypedCsv[(String, Int)]("output0")) { buf => "read and write data" in { - buf must be_==(data) + buf shouldBe data } } .run - .finish + .finish() } "A TypedPsv Source" should { JobTest(new TypedPsvJob(_)) .source(TypedPsv[(String, Int)]("input0"), data) - .sink[(String, Int)](TypedPsv[(String, Int)]("output0")) { buf => + .typedSink(TypedPsv[(String, Int)]("output0")) { buf => "read and write data" in { - buf must be_==(data) + buf shouldBe data } } .run - .finish + .finish() } - "A TypedTsv Source" should { - JobTest(new TypedTsvJob(_)) - .source(TypedTsv[(String, Int)]("input0"), data) - .sink[(String, Int)](TypedTsv[(String, Int)]("output0")) { buf => + "A TypedOsv Source" should { + JobTest(new TypedOsvJob(_)) + .source(TypedOsv[(String, Int)]("input0"), data) + .typedSink(TypedOsv[(String, Int)]("output0")) { buf => "read and write data" in { - buf must be_==(data) + buf shouldBe data } } .run - .finish + .finish() } - "A TypedOsv Source" should { - JobTest(new TypedOsvJob(_)) - .source(TypedOsv[(String, Int)]("input0"), data) - .sink[(String, Int)](TypedOsv[(String, Int)]("output0")) { buf => + "A DailySuffixTypedTsv Source" should { + import DailySuffixTypedTsvJob._ + JobTest(new DailySuffixTypedTsvJob(_)) + .arg("date", strd1 + " " + strd2) + .source(source("input0"), data) + .typedSink(TypedTsv[(String, Int)]("output0")) { buf => "read and write data" in { - buf must be_==(data) + buf shouldBe data } } .run - .finish + .finish() } } diff --git 
a/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala index ed10aaba39..b7387a23c2 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedFieldsTest.scala @@ -12,13 +12,13 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import cascading.flow.FlowException -import org.specs._ +import org.scalatest.{Matchers, WordSpec} -class TypedFieldsTest extends Specification { +class TypedFieldsTest extends WordSpec with Matchers { "A fields API job" should { @@ -26,52 +26,49 @@ class TypedFieldsTest extends Specification { // the Opaque class has no comparator "throw an exception if a field is not comparable" in { - - untypedJob must throwA(new FlowException("local step failed")) - + val thrown = the[FlowException] thrownBy untypedJob() + thrown.getMessage shouldBe "local step failed" } // Now run the typed fields version "group by custom comparator correctly" in { - - JobTest("com.twitter.scalding.TypedFieldsJob"). - arg("input", "inputFile"). - arg("output", "outputFile"). - source(TextLine("inputFile"), List("0" -> "5,foo", "1" -> "6,bar", "2" -> "9,foo")). - sink[(Opaque,Int)](Tsv("outputFile")){ outputBuffer => + JobTest(new TypedFieldsJob(_)) + .arg("input", "inputFile") + .arg("output", "outputFile") + .source(TextLine("inputFile"), List("0" -> "5,foo", "1" -> "6,bar", "2" -> "9,foo")) + .sink[(Opaque, Int)](Tsv("outputFile")) { outputBuffer => val outMap = outputBuffer.map { case (opaque: Opaque, i: Int) => (opaque.str, i) }.toMap - outMap.size must_== 2 - outMap("foo") must be_==(14) - outMap("bar") must be_==(6) - }. - run. 
- finish + outMap should have size 2 + outMap("foo") shouldBe 14 + outMap("bar") shouldBe 6 + } + .run + .finish() } } - def untypedJob { - JobTest("com.twitter.scalding.UntypedFieldsJob"). - arg("input", "inputFile"). - arg("output", "outputFile"). - source(TextLine("inputFile"), List("0" -> "5,foo", "1" -> "6,bar", "2" -> "9,foo")). - sink[(Opaque,Int)](Tsv("outputFile")){ _ => }. - run. - finish - } + def untypedJob(): Unit = + JobTest(new UntypedFieldsJob(_)) + .arg("input", "inputFile") + .arg("output", "outputFile") + .source(TextLine("inputFile"), List("0" -> "5,foo", "1" -> "6,bar", "2" -> "9,foo")) + .sink[(Opaque, Int)](Tsv("outputFile")) { _ => } + .run + .finish() } class UntypedFieldsJob(args: Args) extends Job(args) { TextLine(args("input")).read - .map('line -> ('x,'y)) { line: String => + .map('line -> ('x, 'y)) { line: String => val split = line.split(",") (split(0).toInt, new Opaque(split(1))) } - .groupBy('y) { _.sum[Double]('x) } + .groupBy('y)(_.sum[Double]('x)) .write(Tsv(args("output"))) } @@ -80,8 +77,8 @@ class UntypedFieldsJob(args: Args) extends Job(args) { class TypedFieldsJob(args: Args) extends Job(args) { - implicit val ordering = new Ordering[Opaque] { - def compare(a: Opaque, b: Opaque) = a.str compare b.str + implicit val ordering: Ordering[Opaque] = new Ordering[Opaque] { + def compare(a: Opaque, b: Opaque) = a.str.compare(b.str) } val xField = Field[String]('x) @@ -89,10 +86,10 @@ class TypedFieldsJob(args: Args) extends Job(args) { TextLine(args("input")).read .map('line -> (xField, yField)) { line: String => - val split = line.split(",") + val split = line.split(",") (split(0).toInt, new Opaque(split(1))) } - .groupBy(yField) { _.sum[Double](xField -> xField) } + .groupBy(yField)(_.sum[Double](xField -> xField)) .write(Tsv(args("output"))) } @@ -102,8 +99,8 @@ class TypedFieldsJob(args: Args) extends Job(args) { class Opaque(val str: String) { override def equals(other: Any) = other match { - case other: Opaque => str equals 
other.str - case _ => false + case other: Opaque => str.equals(other.str) + case _ => false } override def hashCode = str.hashCode } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala new file mode 100644 index 0000000000..accbd489f2 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeCheckerTest.scala @@ -0,0 +1,33 @@ +package com.twitter.scalding + +import org.scalatest.{Matchers, WordSpec} + +class TypedPipeCheckerTest extends WordSpec with Matchers { + import TypedPipeChecker._ + + "TypedPipeChecker" should { + "run asserts on pipe" in { + checkOutput(TypedPipe.from(List(1, 2, 3, 4))) { rows => + assert(rows.size == 4) + assert(rows == List(1, 2, 3, 4)) + } + } + } + + it should { + "give back a list" in { + val list = inMemoryToList(TypedPipe.from(List(1, 2, 3, 4))) + assert(list == List(1, 2, 3, 4)) + } + } + + it should { + "allow for a list of input to be run through a transform function" in { + def transform(pipe: TypedPipe[Int]) = pipe.map(identity) + + checkOutputTransform(List(1, 2, 3))(transform) { rows => + assert(rows == List(1, 2, 3)) + } + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala index e23ca6d405..d2efa310c1 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedPipeTest.scala @@ -12,818 +12,1141 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.specs._ - +import org.scalatest.{FunSuite, Matchers, WordSpec} +import com.twitter.scalding.source.{FixedTypedText, TypedText} +import scala.collection.mutable // Use the scalacheck generators import org.scalacheck.Gen import scala.collection.mutable.Buffer import TDsl._ +import typed.MultiJoin + object TUtil { - def printStack( fn: => Unit ) { - try { fn } catch { case e : Throwable => e.printStackTrace; throw e } + def printStack(fn: => Unit): Unit = + try { fn } + catch { case e: Throwable => e.printStackTrace; throw e } + + implicit class JobTestExt(test: JobTest) { + def writesLessDataThen(limitInBytes: Int): JobTest = test + .counter("BYTES_WRITTEN", group = "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter") { + value => assert(value < limitInBytes, s"Job wrote $value bytes of data with limit $limitInBytes") + } } } class TupleAdderJob(args: Args) extends Job(args) { - TypedTsv[(String, String)]("input", ('a, 'b)) - .map{ f => + TypedText + .tsv[(String, String)]("input") + .map { f => (1 +: f) ++ (2, 3) } - .write(TypedTsv[(Int,String,String,Int,Int)]("output")) + .write(TypedText.tsv[(Int, String, String, Int, Int)]("output")) } -class TupleAdderTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TupleAdderTest extends WordSpec with Matchers { "A TupleAdderJob" should { JobTest(new TupleAdderJob(_)) - .source(TypedTsv[(String, String)]("input", ('a, 'b)), List(("a", "a"), ("b", "b"))) - .sink[(Int, String, String, Int, Int)](TypedTsv[(Int,String,String,Int,Int)]("output")) { outBuf => - "be able to use generated tuple adders" in { - outBuf.size must_== 2 - outBuf.toSet must_== Set((1, "a", "a", 2, 3), (1, "b", "b", 2, 3)) - } + .source(TypedText.tsv[(String, String)]("input"), List(("a", "a"), ("b", "b"))) + .sink[(Int, String, String, Int, Int)](TypedText.tsv[(Int, String, String, Int, Int)]("output")) { + outBuf => + "be able to use generated tuple adders" in { + outBuf 
should have size 2 + outBuf.toSet shouldBe Set((1, "a", "a", 2, 3), (1, "b", "b", 2, 3)) + } } .run - .finish + .finish() } } -class TypedPipeJob(args : Args) extends Job(args) { - //Word count using TypedPipe +class TypedPipeJob(args: Args) extends Job(args) { + // Word count using TypedPipe TextLine("inputFile") - .flatMap { _.split("\\s+") } - .map { w => (w, 1L) } + .flatMap(_.split("\\s+")) + .map(w => (w, 1L)) .forceToDisk .group - //.forceToReducers + // .forceToReducers .sum .debug - .write(TypedTsv[(String,Long)]("outputFile")) + .write(TypedText.tsv[(String, Long)]("outputFile")) } -class TypedPipeTest extends Specification { - import Dsl._ - noDetailedDiffs() //Fixes an issue with scala 2.9 +class TypedPipeTest extends WordSpec with Matchers { "A TypedPipe" should { + var idx = 0 TUtil.printStack { - JobTest(new com.twitter.scalding.TypedPipeJob(_)). - source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")). - sink[(String,Long)](TypedTsv[(String,Long)]("outputFile")){ outputBuffer => - val outMap = outputBuffer.toMap - "count words correctly" in { - outMap("hack") must be_==(4) - outMap("and") must be_==(1) + JobTest(new TypedPipeJob(_)) + .source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")) + .sink[(String, Long)](TypedText.tsv[(String, Long)]("outputFile")) { outputBuffer => + val outMap = outputBuffer.toMap + (idx + ": count words correctly") in { + outMap("hack") shouldBe 4 + outMap("and") shouldBe 1 + } + idx += 1 } - }. - run. - runHadoop. 
- finish + .run + .runHadoop + .finish() } } } -class TypedSumByKeyJob(args : Args) extends Job(args) { - //Word count using TypedPipe +class TypedSumByKeyJob(args: Args) extends Job(args) { + // Word count using TypedPipe TextLine("inputFile") - .flatMap { l => l.split("\\s+").map((_, 1L)) } + .flatMap(l => l.split("\\s+").map((_, 1L))) .sumByKey - .write(TypedTsv[(String,Long)]("outputFile")) + .write(TypedText.tsv[(String, Long)]("outputFile")) } -class TypedSumByKeyTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class TypedSumByKeyTest extends WordSpec with Matchers { "A TypedSumByKeyPipe" should { + var idx = 0 TUtil.printStack { - JobTest(new com.twitter.scalding.TypedSumByKeyJob(_)). - source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")). - sink[(String,Long)](TypedTsv[(String,Long)]("outputFile")){ outputBuffer => - val outMap = outputBuffer.toMap - "count words correctly" in { - outMap("hack") must be_==(4) - outMap("and") must be_==(1) + JobTest(new TypedSumByKeyJob(_)) + .source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")) + .sink[(String, Long)](TypedText.tsv[(String, Long)]("outputFile")) { outputBuffer => + val outMap = outputBuffer.toMap + (idx + ": count words correctly") in { + outMap("hack") shouldBe 4 + outMap("and") shouldBe 1 + } + idx += 1 } - }. - run. - runHadoop. 
- finish + .run + .runHadoop + .finish() } } } -class TypedPipeJoinJob(args : Args) extends Job(args) { - (Tsv("inputFile0").read.toTypedPipe[(Int,Int)](0, 1).group - leftJoin TypedPipe.from[(Int,Int)](Tsv("inputFile1").read, (0, 1)).group) +class TypedPipeSortByJob(args: Args) extends Job(args) { + TypedPipe + .from(TypedText.tsv[(Int, Float, String)]("input")) + .groupBy(_._1) + .sortBy(_._2) + .mapValues(_._3) + .sum + .write(TypedText.tsv[(Int, String)]("output")) +} + +class TypedPipeSortByTest extends FunSuite { + test("groups should not be disturbed by sortBy") { + JobTest(new TypedPipeSortByJob(_)) + .source( + TypedText.tsv[(Int, Float, String)]("input"), + List( + (0, 0.6f, "6"), + (0, 0.5f, "5"), + (0, 0.1f, "1"), + (1, 0.1f, "10"), + (1, 0.5f, "50"), + (1, 0.51f, "510") + ) + ) + .sink[(Int, String)](TypedText.tsv[(Int, String)]("output")) { outputBuffer => + val map = outputBuffer.toList.groupBy(_._1) + assert(map.size == 2, "should be two keys") + assert(map.forall { case (_, vs) => vs.size == 1 }, "only one key per value") + assert(map.get(0) == Some(List((0, "156"))), "key(0) is correct") + assert(map.get(1) == Some(List((1, "1050510"))), "key(1) is correct") + } + .run + .runHadoop + .finish() + } +} + +class TypedPipeJoinJob(args: Args) extends Job(args) { + Tsv("inputFile0").read + .toTypedPipe[(Int, Int)](0, 1) + .group + .leftJoin(TypedPipe.fromPipe[(Int, Int)](Tsv("inputFile1").read, (0, 1)).group) .toTypedPipe - .write(TypedTsv[(Int,(Int,Option[Int]))]("outputFile")) + .write(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) } -class TypedPipeJoinTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedPipeJoinTest extends WordSpec with Matchers { "A TypedPipeJoin" should { JobTest(new com.twitter.scalding.TypedPipeJoinJob(_)) - .source(Tsv("inputFile0"), List((0,0), (1,1), (2,2), (3,3), (4,5))) - .source(Tsv("inputFile1"), List((0,1), (1,2), (2,3), (3,4))) - 
.sink[(Int,(Int,Option[Int]))](TypedTsv[(Int,(Int,Option[Int]))]("outputFile")){ outputBuffer => + .source(Tsv("inputFile0"), List((0, 0), (1, 1), (2, 2), (3, 3), (4, 5))) + .source(Tsv("inputFile1"), List((0, 1), (1, 2), (2, 3), (3, 4))) + .typedSink[(Int, (Int, Option[Int]))](TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) { + outputBuffer => + val outMap = outputBuffer.toMap + "correctly join" in { + outMap should have size 5 + outMap(0) shouldBe (0, Some(1)) + outMap(1) shouldBe (1, Some(2)) + outMap(2) shouldBe (2, Some(3)) + outMap(3) shouldBe (3, Some(4)) + outMap(4) shouldBe (5, None) + } + }(implicitly[TypeDescriptor[(Int, (Int, Option[Int]))]].converter) + .run + .finish() + } +} + +// This is a non-serializable class +class OpaqueJoinBox(i: Int) { def get = i } + +class TypedPipeJoinKryoJob(args: Args) extends Job(args) { + val box = new OpaqueJoinBox(2) + TypedPipe + .from(TypedText.tsv[(Int, Int)]("inputFile0")) + .join(TypedPipe.from(TypedText.tsv[(Int, Int)]("inputFile1"))) + .mapValues { case (x, y) => x * y * box.get } + .write(TypedText.tsv[(Int, Int)]("outputFile")) +} + +class TypedPipeJoinKryoTest extends WordSpec with Matchers { + "OpaqueJoinBox" should { + "not be serializable" in { + serialization.Externalizer(new OpaqueJoinBox(1)).javaWorks shouldBe false + } + "closure not be serializable" in { + val box = new OpaqueJoinBox(2) + + val fn = { v: Int => v * box.get } + + serialization.Externalizer(fn).javaWorks shouldBe false + } + } + "A TypedPipeJoinKryo" should { + JobTest(new com.twitter.scalding.TypedPipeJoinKryoJob(_)) + .source(TypedText.tsv[(Int, Int)]("inputFile0"), List((0, 0), (1, 1), (2, 2), (3, 3), (4, 5))) + .source(TypedText.tsv[(Int, Int)]("inputFile1"), List((0, 1), (1, 2), (2, 3), (3, 4))) + .typedSink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "correctly join" in { - outMap(0) must be_==((0,Some(1))) - outMap(1) must be_==((1,Some(2))) - outMap(2) must 
be_==((2,Some(3))) - outMap(3) must be_==((3,Some(4))) - outMap(4) must be_==((5,None)) - outMap.size must be_==(5) + outMap should have size 4 + outMap(0) shouldBe 0 + outMap(1) shouldBe 4 + outMap(2) shouldBe 12 + outMap(3) shouldBe 24 } - }. - run. - finish + }(implicitly[TypeDescriptor[(Int, Int)]].converter) + .runHadoop // need hadoop to test serialization + .finish() } } +class TypedPipeDistinctJob(args: Args) extends Job(args) { + Tsv("inputFile").read + .toTypedPipe[(Int, Int)](0, 1) + .distinct + .write(TypedText.tsv[(Int, Int)]("outputFile")) +} + +class TypedPipeDistinctTest extends WordSpec with Matchers { + "A TypedPipeDistinctJob" should { + JobTest(new TypedPipeDistinctJob(_)) + .source(Tsv("inputFile"), List((0, 0), (1, 1), (2, 2), (2, 2), (2, 5))) + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile")) { outputBuffer => + val outMap = outputBuffer.toMap + "correctly count unique item sizes" in { + outputBuffer.toSet should have size 4 + } + } + .run + .finish() + } +} -class TypedPipeDistinctJob(args : Args) extends Job(args) { - Tsv("inputFile").read.toTypedPipe[(Int,Int)](0, 1) +class TypedPipeDistinctWordsJob(args: Args) extends Job(args) { + TextLine("inputFile") + .flatMap(_.split("\\s+")) .distinct - .write(TypedTsv[(Int, Int)]("outputFile")) + .write(TextLine("outputFile")) } +class TypedPipeDistinctWordsTest extends WordSpec with Matchers { + "A TypedPipeDistinctWordsJob" should { + var idx = 0 + JobTest(new TypedPipeDistinctWordsJob(_)) + .source(TextLine("inputFile"), List(1 -> "a b b c", 2 -> "c d e")) + .sink[String](TextLine("outputFile")) { outputBuffer => + s"$idx: correctly count unique item sizes" in { + outputBuffer.toSet should have size 5 + } + idx += 1 + } + .run + .runHadoop + .finish() + } +} -class TypedPipeDistinctTest extends Specification { -noDetailedDiffs() //Fixes an issue with scala 2.9 -import Dsl._ -"A TypedPipeDistinctJob" should { - JobTest(new com.twitter.scalding.TypedPipeDistinctJob(_)) - 
.source(Tsv("inputFile"), List((0,0), (1,1), (2,2), (2,2), (2,5))) - .sink[(Int, Int)](TypedTsv[(Int, Int)]("outputFile")){ outputBuffer => - val outMap = outputBuffer.toMap - "correctly count unique item sizes" in { - val outSet = outputBuffer.toSet - outSet.size must_== 4 - } - }. - run. - finish +class TypedPipeDistinctByJob(args: Args) extends Job(args) { + Tsv("inputFile").read + .toTypedPipe[(Int, Int)](0, 1) + .distinctBy(_._2) + .write(TypedText.tsv[(Int, Int)]("outputFile")) +} + +class TypedPipeDistinctByTest extends WordSpec with Matchers { + "A TypedPipeDistinctByJob" should { + JobTest(new TypedPipeDistinctByJob(_)) + .source(Tsv("inputFile"), List((0, 1), (1, 1), (2, 2), (2, 2), (2, 5))) + .typedSink(TypedText.tsv[(Int, Int)]("outputFile")) { outputBuffer => + "correctly count unique item sizes" in { + val outSet = outputBuffer.toSet + outSet should have size 3 + (List(outSet) should contain).oneOf(Set((0, 1), (2, 2), (2, 5)), Set((1, 1), (2, 2), (2, 5))) + } + } + .run + .finish() + } } + +class TypedPipeGroupedDistinctJob(args: Args) extends Job(args) { + val groupedTP = Tsv("inputFile").read.toTypedPipe[(Int, Int)](0, 1).group + + groupedTP.distinctValues + .write(TypedText.tsv[(Int, Int)]("outputFile1")) + groupedTP.distinctSize + .write(TypedText.tsv[(Int, Long)]("outputFile2")) } +class TypedPipeGroupedDistinctJobTest extends WordSpec with Matchers { + "A TypedPipeGroupedDistinctJob" should { + JobTest(new TypedPipeGroupedDistinctJob(_)) + .source(Tsv("inputFile"), List((0, 0), (0, 1), (0, 1), (1, 0), (1, 1))) + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("outputFile1")) { outputBuffer => + val outSet = outputBuffer.toSet + "correctly generate unique items" in { + outSet should have size 4 + } + } + .sink[(Int, Int)](TypedText.tsv[(Int, Long)]("outputFile2")) { outputBuffer => + val outMap = outputBuffer.toMap + "correctly count unique item sizes" in { + outMap(0) shouldBe 2 + outMap(1) shouldBe 2 + } + } + .run + .finish() + } +} -class 
TypedPipeHashJoinJob(args : Args) extends Job(args) { - TypedTsv[(Int,Int)]("inputFile0") +class TypedPipeHashJoinJob(args: Args) extends Job(args) { + TypedText + .tsv[(Int, Int)]("inputFile0") .group - .hashLeftJoin(TypedTsv[(Int,Int)]("inputFile1").group) - .write(TypedTsv[(Int,(Int,Option[Int]))]("outputFile")) + .hashLeftJoin(TypedText.tsv[(Int, Int)]("inputFile1").group) + .write(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) } -class TypedPipeHashJoinTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedPipeHashJoinTest extends WordSpec with Matchers { "A TypedPipeHashJoinJob" should { - JobTest(new com.twitter.scalding.TypedPipeHashJoinJob(_)) - .source(TypedTsv[(Int,Int)]("inputFile0"), List((0,0), (1,1), (2,2), (3,3), (4,5))) - .source(TypedTsv[(Int,Int)]("inputFile1"), List((0,1), (1,2), (2,3), (3,4))) - .sink[(Int,(Int,Option[Int]))](TypedTsv[(Int,(Int,Option[Int]))]("outputFile")){ outputBuffer => + JobTest(new TypedPipeHashJoinJob(_)) + .source(TypedText.tsv[(Int, Int)]("inputFile0"), List((0, 0), (1, 1), (2, 2), (3, 3), (4, 5))) + .source(TypedText.tsv[(Int, Int)]("inputFile1"), List((0, 1), (1, 2), (2, 3), (3, 4))) + .typedSink(TypedText.tsv[(Int, (Int, Option[Int]))]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "correctly join" in { - outMap(0) must be_==((0,Some(1))) - outMap(1) must be_==((1,Some(2))) - outMap(2) must be_==((2,Some(3))) - outMap(3) must be_==((3,Some(4))) - outMap(4) must be_==((5,None)) - outMap.size must be_==(5) + outMap should have size 5 + outMap(0) shouldBe (0, Some(1)) + outMap(1) shouldBe (1, Some(2)) + outMap(2) shouldBe (2, Some(3)) + outMap(3) shouldBe (3, Some(4)) + outMap(4) shouldBe (5, None) } - }. - run. 
- finish + }(implicitly[TypeDescriptor[(Int, (Int, Option[Int]))]].converter) + .run + .finish() } } -class TypedImplicitJob(args : Args) extends Job(args) { - def revTup[K,V](in : (K,V)) : (V,K) = (in._2, in._1) - TextLine("inputFile").read.typed(1 -> ('maxWord, 'maxCnt)) { tpipe : TypedPipe[String] => - tpipe.flatMap { _.split("\\s+") } - .map { w => (w, 1L) } - .group - .sum - .groupAll - // Looks like swap, but on the values in the grouping: - .mapValues { revTup _ } - .forceToReducers - .max - // Throw out the Unit key and reverse the value tuple - .values - .swap - }.write(TypedTsv[(String,Int)]("outputFile")) +class TypedPipeTwoHashJoinsInARowTest extends WordSpec with Matchers { + "Two hashJoins" should { + "work correctly" in { + val elements = List(1, 2, 3) + val tp1 = TypedPipe.from(elements.map(v => (v, v))) + val tp2 = TypedPipe.from(elements.map(v => (v, 2 * v))) + val tp3 = TypedPipe.from(elements.map(v => (v, 3 * v))) + TypedPipeChecker.checkOutput(tp1.hashJoin(tp2).hashJoin(tp3))(result => + result shouldBe elements.map(v => (v, ((v, 2 * v), 3 * v))) + ) + } + } } -class TypedPipeTypedTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedImplicitJob(args: Args) extends Job(args) { + def revTup[K, V](in: (K, V)): (V, K) = (in._2, in._1) + TextLine("inputFile").read + .typed(1 -> ('maxWord, 'maxCnt)) { tpipe: TypedPipe[String] => + tpipe + .flatMap(_.split("\\s+")) + .map(w => (w, 1L)) + .group + .sum + .groupAll + // Looks like swap, but on the values in the grouping: + .mapValues(revTup _) + .forceToReducers + .max + // Throw out the Unit key and reverse the value tuple + .values + .swap + } + .write(TypedText.tsv[(String, Int)]("outputFile")) +} + +class TypedPipeTypedTest extends WordSpec with Matchers { "A TypedImplicitJob" should { - JobTest(new com.twitter.scalding.TypedImplicitJob(_)) + JobTest(new TypedImplicitJob(_)) .source(TextLine("inputFile"), List("0" -> "hack hack hack and hack")) 
- .sink[(String,Int)](TypedTsv[(String,Int)]("outputFile")){ outputBuffer => + .typedSink(TypedText.tsv[(String, Int)]("outputFile")) { outputBuffer => val outMap = outputBuffer.toMap "find max word" in { - outMap("hack") must be_==(4) - outMap.size must be_==(1) + outMap should have size 1 + outMap("hack") shouldBe 4 + } + } + .run + .finish() + } +} + +class TypedWithOnCompleteJob(args: Args) extends Job(args) { + val onCompleteMapperStat = Stat("onCompleteMapper") + val onCompleteReducerStat = Stat("onCompleteReducer") + def onCompleteMapper() = onCompleteMapperStat.inc() + def onCompleteReducer() = onCompleteReducerStat.inc() + // find repeated words ignoring case + TypedText + .tsv[String]("input") + .map(_.toUpperCase) + .onComplete(onCompleteMapper) + .groupBy(identity) + .mapValueStream(words => Iterator(words.size)) + .filter { case (word, occurrences) => occurrences > 1 } + .keys + .onComplete(onCompleteReducer) + .write(TypedText.tsv[String]("output")) +} + +class TypedPipeWithOnCompleteTest extends WordSpec with Matchers { + import Dsl._ + val inputText = "the quick brown fox jumps over the lazy LAZY dog" + "A TypedWithOnCompleteJob" should { + JobTest(new TypedWithOnCompleteJob(_)) + .source(TypedText.tsv[String]("input"), inputText.split("\\s+").map(Tuple1(_))) + .counter("onCompleteMapper")(cnt => "have onComplete called on mapper" in { assert(cnt == 1) }) + .counter("onCompleteReducer")(cnt => "have onComplete called on reducer" in { assert(cnt == 1) }) + .sink[String](TypedText.tsv[String]("output")) { outbuf => + "have the correct output" in { + val correct = inputText + .split("\\s+") + .map(_.toUpperCase) + .groupBy(x => x) + .filter(_._2.size > 1) + .keys + .toList + .sorted + val sortedL = outbuf.toList.sorted + assert(sortedL == correct) + } + } + .runHadoop + .finish() + } +} + +class TypedPipeWithOuterAndLeftJoin(args: Args) extends Job(args) { + val userNames = TypedText.tsv[(Int, String)]("inputNames").group + val userData = 
TypedText.tsv[(Int, Double)]("inputData").group + val optionalData = TypedText.tsv[(Int, Boolean)]("inputOptionalData").group + + userNames + .outerJoin(userData) + .leftJoin(optionalData) + .map { case (id, ((nameOpt, userDataOption), optionalDataOpt)) => id } + .write(TypedText.tsv[Int]("output")) +} + +class TypedPipeWithOuterAndLeftJoinTest extends WordSpec with Matchers { + + "A TypedPipeWithOuterAndLeftJoin" should { + JobTest(new TypedPipeWithOuterAndLeftJoin(_)) + .source(TypedText.tsv[(Int, String)]("inputNames"), List((1, "Jimmy Foursquare"))) + .source(TypedText.tsv[(Int, Double)]("inputData"), List((1, 0.1), (5, 0.5))) + .source(TypedText.tsv[(Int, Boolean)]("inputOptionalData"), List((1, true), (99, false))) + .sink[Long](TypedText.tsv[Int]("output")) { outbuf => + "have output for user 1" in { + assert(outbuf.toList.contains(1) == true) + } + "have output for user 5" in { + assert(outbuf.toList.contains(5) == true) + } + "not have output for user 99" in { + assert(outbuf.toList.contains(99) == false) } } .run - .finish + .finish() } } -class TJoinCountJob(args : Args) extends Job(args) { - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)).group - join TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1)).group) +class TJoinCountJob(args: Args) extends Job(args) { + TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .group + .join(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) .size - .write(TypedTsv[(Int,Long)]("out")) + .write(TypedText.tsv[(Int, Long)]("out")) - //Also check simple joins: - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)).group - join TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1)).group) - //Flatten out to three values: + // Also check simple joins: + TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .group + .join(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) + // Flatten out to three values: .toTypedPipe - .map { kvw => (kvw._1, kvw._2._1, kvw._2._2) } - 
.write(TypedTsv[(Int,Int,Int)]("out2")) + .map(kvw => (kvw._1, kvw._2._1, kvw._2._2)) + .write(TypedText.tsv[(Int, Int, Int)]("out2")) - //Also check simple leftJoins: - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)).group - leftJoin TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1)).group) - //Flatten out to three values: + // Also check simple leftJoins: + TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .group + .leftJoin(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1)).group) + // Flatten out to three values: .toTypedPipe - .map { kvw : (Int,(Int,Option[Int])) => + .map { kvw: (Int, (Int, Option[Int])) => (kvw._1, kvw._2._1, kvw._2._2.getOrElse(-1)) } - .write(TypedTsv[(Int,Int,Int)]("out3")) + .write(TypedText.tsv[(Int, Int, Int)]("out3")) } -class TNiceJoinCountJob(args : Args) extends Job(args) { - import com.twitter.scalding.typed.Syntax.joinOnTuplePipe +/** + * This test exercises the implicit from TypedPipe to HashJoinabl + */ +class TNiceJoinCountJob(args: Args) extends Job(args) { - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)) - join TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1))) + TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .join(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) .size - .write(TypedTsv[(Int,Long)]("out")) + .write(TypedText.tsv[(Int, Long)]("out")) - //Also check simple joins: - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)) - join TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1))) - //Flatten out to three values: + // Also check simple joins: + TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .join(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) + // Flatten out to three values: .toTypedPipe - .map { kvw => (kvw._1, kvw._2._1, kvw._2._2) } - .write(TypedTsv[(Int,Int,Int)]("out2")) - - //Also check simple leftJoins: - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)) - leftJoin TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), 
(0,1))) - //Flatten out to three values: + .map(kvw => (kvw._1, kvw._2._1, kvw._2._2)) + .write(TypedText.tsv[(Int, Int, Int)]("out2")) + + // Also check simple leftJoins: + TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .leftJoin(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))) + // Flatten out to three values: .toTypedPipe - .map { kvw : (Int,(Int,Option[Int])) => - (kvw._1, kvw._2._1, kvw._2._2.getOrElse(-1)) - } - .write(TypedTsv[(Int,Int,Int)]("out3")) + .map { kvw: (Int, (Int, Option[Int])) => + (kvw._1, kvw._2._1, kvw._2._2.getOrElse(-1)) + } + .write(TypedText.tsv[(Int, Int, Int)]("out3")) } -class TNiceJoinByCountJob(args : Args) extends Job(args) { +class TNiceJoinByCountJob(args: Args) extends Job(args) { import com.twitter.scalding.typed.Syntax._ - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)) - joinBy TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1)))(_._1, _._1) + (TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .joinBy(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))))(_._1, _._1) .size - .write(TypedTsv[(Int,Long)]("out")) + .write(TypedText.tsv[(Int, Long)]("out")) - //Also check simple joins: - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)) - joinBy TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1)))(_._1, _._1) - //Flatten out to three values: + // Also check simple joins: + (TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .joinBy(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))))(_._1, _._1) + // Flatten out to three values: .toTypedPipe - .map { kvw => (kvw._1, kvw._2._1._2, kvw._2._2._2) } - .write(TypedTsv[(Int,Int,Int)]("out2")) - - //Also check simple leftJoins: - (TypedPipe.from[(Int,Int)](Tsv("in0",(0,1)), (0,1)) - leftJoinBy TypedPipe.from[(Int,Int)](Tsv("in1", (0,1)), (0,1)))(_._1, _._1) - //Flatten out to three values: + .map(kvw => (kvw._1, kvw._2._1._2, kvw._2._2._2)) + .write(TypedText.tsv[(Int, Int, Int)]("out2")) + + // Also check simple 
leftJoins: + (TypedPipe + .fromPipe[(Int, Int)](Tsv("in0", (0, 1)), (0, 1)) + .leftJoinBy(TypedPipe.fromPipe[(Int, Int)](Tsv("in1", (0, 1)), (0, 1))))(_._1, _._1) + // Flatten out to three values: .toTypedPipe - .map { kvw : (Int,((Int,Int),Option[(Int,Int)])) => - (kvw._1, kvw._2._1._2, kvw._2._2.getOrElse((-1,-1))._2) - } - .write(TypedTsv[(Int,Int,Int)]("out3")) + .map { kvw: (Int, ((Int, Int), Option[(Int, Int)])) => + (kvw._1, kvw._2._1._2, kvw._2._2.getOrElse((-1, -1))._2) + } + .write(TypedText.tsv[(Int, Int, Int)]("out3")) } -class TypedPipeJoinCountTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 +class TypedPipeJoinCountTest extends WordSpec with Matchers { import Dsl._ - val joinTests = List("com.twitter.scalding.TJoinCountJob", "com.twitter.scalding.TNiceJoinCountJob", "com.twitter.scalding.TNiceJoinByCountJob") - - joinTests.foreach{ jobName => - "A " + jobName should { - JobTest(jobName) - .source(Tsv("in0",(0,1)), List((0,1),(0,2),(1,1),(1,5),(2,10))) - .source(Tsv("in1",(0,1)), List((0,10),(1,20),(1,10),(1,30))) - .sink[(Int,Long)](TypedTsv[(Int,Long)]("out")) { outbuf => - val outMap = outbuf.toMap - "correctly reduce after cogroup" in { - outMap(0) must be_==(2) - outMap(1) must be_==(6) - outMap.size must be_==(2) + val joinTests = List( + "com.twitter.scalding.TJoinCountJob", + "com.twitter.scalding.TNiceJoinCountJob", + "com.twitter.scalding.TNiceJoinByCountJob" + ) + + joinTests.foreach { jobName => + "A " + jobName should { + var idx = 0 + JobTest(jobName) + .source(Tsv("in0", (0, 1)), List((0, 1), (0, 2), (1, 1), (1, 5), (2, 10))) + .source(Tsv("in1", (0, 1)), List((0, 10), (1, 20), (1, 10), (1, 30))) + .typedSink(TypedText.tsv[(Int, Long)]("out")) { outbuf => + val outMap = outbuf.toMap + (idx + ": correctly reduce after cogroup") in { + outMap should have size 2 + outMap(0) shouldBe 2 + outMap(1) shouldBe 6 + } + idx += 1 } - } - .sink[(Int,Int,Int)](TypedTsv[(Int,Int,Int)]("out2")) { outbuf2 => - val outMap 
= outbuf2.groupBy { _._1 } - "correctly do a simple join" in { - outMap.size must be_==(2) - outMap(0).toList.sorted must be_==(List((0,1,10),(0,2,10))) - outMap(1).toList.sorted must be_==(List((1,1,10),(1,1,20),(1,1,30),(1,5,10),(1,5,20),(1,5,30))) + .typedSink(TypedText.tsv[(Int, Int, Int)]("out2")) { outbuf2 => + val outMap = outbuf2.groupBy(_._1) + (idx + ": correctly do a simple join") in { + outMap should have size 2 + outMap(0).toList.sorted shouldBe List((0, 1, 10), (0, 2, 10)) + outMap(1).toList.sorted shouldBe List( + (1, 1, 10), + (1, 1, 20), + (1, 1, 30), + (1, 5, 10), + (1, 5, 20), + (1, 5, 30) + ) + } + idx += 1 } - } - .sink[(Int,Int,Int)](TypedTsv[(Int,Int,Int)]("out3")) { outbuf => - val outMap = outbuf.groupBy { _._1 } - "correctly do a simple leftJoin" in { - outMap.size must be_==(3) - outMap(0).toList.sorted must be_==(List((0,1,10),(0,2,10))) - outMap(1).toList.sorted must be_==(List((1,1,10),(1,1,20),(1,1,30),(1,5,10),(1,5,20),(1,5,30))) - outMap(2).toList.sorted must be_==(List((2,10,-1))) + .typedSink(TypedText.tsv[(Int, Int, Int)]("out3")) { outbuf => + val outMap = outbuf.groupBy(_._1) + (idx + ": correctly do a simple leftJoin") in { + outMap should have size 3 + outMap(0).toList.sorted shouldBe List((0, 1, 10), (0, 2, 10)) + outMap(1).toList.sorted shouldBe List( + (1, 1, 10), + (1, 1, 20), + (1, 1, 30), + (1, 5, 10), + (1, 5, 20), + (1, 5, 30) + ) + outMap(2).toList.sorted shouldBe List((2, 10, -1)) + } + idx += 1 } - } - .run - .runHadoop - .finish - }} + .run + .runHadoop + .finish() + } + } } -class TCrossJob(args : Args) extends Job(args) { - (TextLine("in0") cross TextLine("in1")) - .write(TypedTsv[(String,String)]("crossed")) +class TCrossJob(args: Args) extends Job(args) { + TextLine("in0") + .cross(TextLine("in1")) + .write(TypedText.tsv[(String, String)]("crossed")) } -class TypedPipeCrossTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedPipeCrossTest extends WordSpec 
with Matchers { "A TCrossJob" should { + var idx = 0 TUtil.printStack { - JobTest(new com.twitter.scalding.TCrossJob(_)) - .source(TextLine("in0"), List((0,"you"),(1,"all"))) - .source(TextLine("in1"), List((0,"every"),(1,"body"))) - .sink[(String,String)](TypedTsv[(String,String)]("crossed")) { outbuf => - val sortedL = outbuf.toList.sorted - "create a cross-product" in { - sortedL must be_==(List(("all","body"), - ("all","every"), - ("you","body"), - ("you","every"))) + JobTest(new TCrossJob(_)) + .source(TextLine("in0"), List((0, "you"), (1, "all"))) + .source(TextLine("in1"), List((0, "every"), (1, "body"))) + .typedSink(TypedText.tsv[(String, String)]("crossed")) { outbuf => + val sortedL = outbuf.toList.sorted + (idx + ": create a cross-product") in { + sortedL shouldBe List(("all", "body"), ("all", "every"), ("you", "body"), ("you", "every")) + } + idx += 1 } - } - .run - .runHadoop - .finish + .run + .runHadoop + .finish() } } } -class TJoinTakeJob(args : Args) extends Job(args) { - val items0 = TextLine("in0").flatMap { s => (1 to 10).map((_, s)) }.group - val items1 = TextLine("in1").map { s => (s.toInt, ()) }.group +class TJoinTakeJob(args: Args) extends Job(args) { + val items0 = TextLine("in0").flatMap(s => (1 to 10).map((_, s))).group + val items1 = TextLine("in1").map(s => (s.toInt, ())).group - items0.join(items1.take(1)) + items0 + .join(items1.take(1)) .mapValues(_._1) // discard the () .toTypedPipe - .write(TypedTsv[(Int,String)]("joined")) + .write(TypedText.tsv[(Int, String)]("joined")) } -class TypedJoinTakeTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedJoinTakeTest extends WordSpec with Matchers { "A TJoinTakeJob" should { + var idx = 0 TUtil.printStack { - JobTest(new TJoinTakeJob(_)) - .source(TextLine("in0"), List((0,"you"),(1,"all"))) - .source(TextLine("in1"), List((0,"3"),(1,"2"),(0,"3"))) - .sink[(Int,String)](TypedTsv[(Int,String)]("joined")) { outbuf => - val sortedL = 
outbuf.toList.sorted - "dedup keys by using take" in { - sortedL must be_==( - List((3,"you"), (3, "all"), (2, "you"), (2, "all")).sorted) + JobTest(new TJoinTakeJob(_)) + .source(TextLine("in0"), List((0, "you"), (1, "all"))) + .source(TextLine("in1"), List((0, "3"), (1, "2"), (0, "3"))) + .typedSink(TypedText.tsv[(Int, String)]("joined")) { outbuf => + val sortedL = outbuf.toList.sorted + (idx + ": dedup keys by using take") in { + sortedL shouldBe (List((3, "you"), (3, "all"), (2, "you"), (2, "all")).sorted) + } + idx += 1 } - } - .run - .runHadoop - .finish + .run + .runHadoop + .finish() } } } -class TGroupAllJob(args : Args) extends Job(args) { - TextLine("in") - .groupAll - .sorted - .values - .write(TypedTsv[String]("out")) +class TGroupAllJob(args: Args) extends Job(args) { + TextLine("in").groupAll.sorted.values + .write(TypedText.tsv[String]("out")) } -class TypedGroupAllTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedGroupAllTest extends WordSpec with Matchers { "A TGroupAllJob" should { + var idx = 0 TUtil.printStack { - val input = List((0,"you"),(1,"all"), (2,"everybody")) - JobTest(new TGroupAllJob(_)) - .source(TextLine("in"), input) - .sink[String](TypedTsv[String]("out")) { outbuf => - val sortedL = outbuf.toList - val correct = input.map { _._2 }.sorted - "create sorted output" in { - sortedL must_==(correct) + val input = List((0, "you"), (1, "all"), (2, "everybody")) + JobTest(new TGroupAllJob(_)) + .source(TextLine("in"), input) + .typedSink(TypedText.tsv[String]("out")) { outbuf => + val sortedL = outbuf.toList + val correct = input.map(_._2).sorted + (idx + ": create sorted output") in { + sortedL shouldBe correct + } + idx += 1 } - } - .run - .runHadoop - .finish + .run + .runHadoop + .finish() } } } class TSelfJoin(args: Args) extends Job(args) { - val g = TypedTsv[(Int,Int)]("in").group - g.join(g).values.write(TypedTsv[(Int,Int)]("out")) + val g = TypedText.tsv[(Int, 
Int)]("in").group + g.join(g).values.write(TypedText.tsv[(Int, Int)]("out")) } -class TSelfJoinTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TSelfJoinTest extends WordSpec with Matchers { "A TSelfJoin" should { JobTest(new TSelfJoin(_)) - .source(TypedTsv[(Int,Int)]("in"), List((1,2), (1,3), (2,1))) - .sink[(Int,Int)](TypedTsv[(Int,Int)]("out")) { outbuf => - outbuf.toList.sorted must be_==(List((1,1),(2,2),(2,3),(3,2),(3,3))) + .source(TypedText.tsv[(Int, Int)]("in"), List((1, 2), (1, 3), (2, 1))) + .typedSink(TypedText.tsv[(Int, Int)]("out")) { outbuf => + outbuf.toList.sorted shouldBe List((1, 1), (2, 2), (2, 3), (3, 2), (3, 3)) } .run .runHadoop - .finish + .finish() } } -class TJoinWordCount(args : Args) extends Job(args) { +class TJoinWordCount(args: Args) extends Job(args) { - def countWordsIn(pipe: TypedPipe[(String)]) = { - pipe.flatMap { _.split("\\s+"). map(_.toLowerCase) } + def countWordsIn(pipe: TypedPipe[String]) = + pipe + .flatMap(_.split("\\s+").map(_.toLowerCase)) .groupBy(identity) .mapValueStream(input => Iterator(input.size)) .forceToReducers - } val first = countWordsIn(TypedPipe.from(TextLine("in0"))) val second = countWordsIn(TypedPipe.from(TextLine("in1"))) - first.outerJoin(second) + first + .outerJoin(second) .toTypedPipe .map { case (word, (firstCount, secondCount)) => - (word, firstCount.getOrElse(0), secondCount.getOrElse(0)) + (word, firstCount.getOrElse(0), secondCount.getOrElse(0)) } - .write(TypedTsv[(String,Int,Int)]("out")) + .write(TypedText.tsv[(String, Int, Int)]("out")) } -class TypedJoinWCTest extends Specification { - noDetailedDiffs() //Fixes an issue with scala 2.9 - import Dsl._ +class TypedJoinWCTest extends WordSpec with Matchers { "A TJoinWordCount" should { TUtil.printStack { - val in0 = List((0,"you all everybody"),(1,"a b c d"), (2,"a b c")) - val in1 = List((0,"you"),(1,"a b c d"), (2,"a a b b c c")) - def count(in : List[(Int,String)]) : Map[String, Int] = 
{ - in.flatMap { _._2.split("\\s+").map { _.toLowerCase } }.groupBy { identity }.mapValues { _.size } - } - def outerjoin[K,U,V](m1 : Map[K,U], z1 : U, m2 : Map[K,V], z2 : V) : Map[K,(U,V)] = { - (m1.keys ++ m2.keys).map { k => (k, (m1.getOrElse(k, z1), m2.getOrElse(k, z2))) }.toMap - } - val correct = outerjoin(count(in0), 0, count(in1), 0) - .toList - .map { tup => (tup._1, tup._2._1, tup._2._2) } - .sorted - - JobTest(new TJoinWordCount(_)) - .source(TextLine("in0"), in0) - .source(TextLine("in1"), in1) - .sink[(String,Int,Int)](TypedTsv[(String,Int,Int)]("out")) { outbuf => - val sortedL = outbuf.toList - "create sorted output" in { - sortedL must_==(correct) + val in0 = List((0, "you all everybody"), (1, "a b c d"), (2, "a b c")) + val in1 = List((0, "you"), (1, "a b c d"), (2, "a a b b c c")) + def count(in: List[(Int, String)]): Map[String, Int] = + in.flatMap(_._2.split("\\s+").map(_.toLowerCase)).groupBy(identity).mapValues(_.size) + def outerjoin[K, U, V](m1: Map[K, U], z1: U, m2: Map[K, V], z2: V): Map[K, (U, V)] = + (m1.keys ++ m2.keys).map(k => (k, (m1.getOrElse(k, z1), m2.getOrElse(k, z2)))).toMap + val correct = outerjoin(count(in0), 0, count(in1), 0).toList.map { tup => + (tup._1, tup._2._1, tup._2._2) + }.sorted + + JobTest(new TJoinWordCount(_)) + .source(TextLine("in0"), in0) + .source(TextLine("in1"), in1) + .typedSink(TypedText.tsv[(String, Int, Int)]("out")) { outbuf => + val sortedL = outbuf.toList + "create sorted output" in { + sortedL shouldBe correct + } } - } - .run - .finish + .run + .finish() } } } class TypedLimitJob(args: Args) extends Job(args) { - val p = TypedTsv[String]("input").limit(10): TypedPipe[String] - p.write(TypedTsv[String]("output")) + val p = TypedText.tsv[String]("input").limit(10): TypedPipe[String] + p.write(TypedText.tsv[String]("output")) } -class TypedLimitTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedLimitTest extends WordSpec with Matchers { "A TypedLimitJob" should { JobTest(new 
TypedLimitJob(_)) - .source(TypedTsv[String]("input"), (0 to 100).map { i => Tuple1(i.toString) }) - .sink[String](TypedTsv[String]("output")) { outBuf => + .source(TypedText.tsv[String]("input"), (0 to 100).map(i => Tuple1(i.toString))) + .typedSink(TypedText.tsv[String]("output")) { outBuf => "not have more than the limited outputs" in { - outBuf.size must be_<=(10) + outBuf.size should be <= 10 } } .runHadoop - .finish + .finish() } } class TypedFlattenJob(args: Args) extends Job(args) { - TypedTsv[String]("input").map { _.split(" ").toList } + TypedText + .tsv[String]("input") + .map(_.split(" ").toList) .flatten - .write(TypedTsv[String]("output")) + .write(TypedText.tsv[String]("output")) } -class TypedFlattenTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedFlattenTest extends WordSpec with Matchers { "A TypedLimitJob" should { JobTest(new TypedFlattenJob(_)) - .source(TypedTsv[String]("input"), List(Tuple1("you all"), Tuple1("every body"))) - .sink[String](TypedTsv[String]("output")) { outBuf => + .source(TypedText.tsv[String]("input"), List(Tuple1("you all"), Tuple1("every body"))) + .typedSink(TypedText.tsv[String]("output")) { outBuf => "correctly flatten" in { - outBuf.toSet must be_==(Set("you", "all", "every", "body")) + outBuf.toSet shouldBe Set("you", "all", "every", "body") } } .runHadoop - .finish + .finish() } } class TypedMergeJob(args: Args) extends Job(args) { - val tp = TypedPipe.from(TypedTsv[String]("input")) + val tp = TypedPipe.from(TypedText.tsv[String]("input")) + // This exercise a self merge (tp ++ tp) - .write(TypedTsv[String]("output")) + .write(TypedText.tsv[String]("output")) (tp ++ (tp.map(_.reverse))) - .write(TypedTsv[String]("output2")) + .write(TypedText.tsv[String]("output2")) } -class TypedMergeTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedMergeTest extends WordSpec with Matchers { "A TypedMergeJob" should { + var idx = 0 JobTest(new TypedMergeJob(_)) - 
.source(TypedTsv[String]("input"), List(Tuple1("you all"), Tuple1("every body"))) - .sink[String](TypedTsv[String]("output")) { outBuf => - "correctly flatten" in { - outBuf.toSet must be_==(Set("you all", "every body")) + .source(TypedText.tsv[String]("input"), List(Tuple1("you all"), Tuple1("every body"))) + .typedSink(TypedText.tsv[String]("output")) { outBuf => + (idx + ": correctly flatten") in { + outBuf.toSet shouldBe Set("you all", "every body") } + idx += 1 } - .sink[String](TypedTsv[String]("output2")) { outBuf => - "correctly flatten" in { + .typedSink(TypedText.tsv[String]("output2")) { outBuf => + (idx + ": correctly flatten") in { val correct = Set("you all", "every body") - outBuf.toSet must be_==(correct ++ correct.map(_.reverse)) + outBuf.toSet shouldBe (correct ++ correct.map(_.reverse)) } + idx += 1 } .runHadoop - .finish + .finish() } } class TypedShardJob(args: Args) extends Job(args) { - (TypedPipe.from(TypedTsv[String]("input")) ++ - (TypedPipe.empty.map { _ => "hey" }) ++ - TypedPipe.from(List("item"))) + (TypedPipe.from(TypedText.tsv[String]("input")) ++ + TypedPipe.empty.map(_ => "hey") ++ + TypedPipe.from(List("item"))) .shard(10) - .write(TypedTsv[String]("output")) + .write(TypedText.tsv[String]("output")) } -class TypedShardTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedShardTest extends WordSpec with Matchers { "A TypedShardJob" should { val genList = Gen.listOf(Gen.identifier) // Take one random sample lazy val mk: List[String] = genList.sample.getOrElse(mk) JobTest(new TypedShardJob(_)) - .source(TypedTsv[String]("input"), mk) - .sink[String](TypedTsv[String]("output")) { outBuf => + .source(TypedText.tsv[String]("input"), mk) + .typedSink(TypedText.tsv[String]("output")) { outBuf => "correctly flatten" in { - outBuf.size must be_==(mk.size + 1) - outBuf.toSet must be_==(mk.toSet + "item") + outBuf should have size (mk.size + 1) + outBuf.toSet shouldBe (mk.toSet + "item") } } .run - .finish + .finish() } 
} class TypedLocalSumJob(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[String]("input")) - .flatMap { s => s.split(" ").map((_, 1L)) } + TypedPipe + .from(TypedText.tsv[String]("input")) + .flatMap(s => s.split(" ").map((_, 1L))) .sumByLocalKeys - .write(TypedTsv[(String, Long)]("output")) + .write(TypedText.tsv[(String, Long)]("output")) } -class TypedLocalSumTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedLocalSumTest extends WordSpec with Matchers { "A TypedLocalSumJob" should { + var idx = 0 val genList = Gen.listOf(Gen.identifier) // Take one random sample lazy val mk: List[String] = genList.sample.getOrElse(mk) JobTest(new TypedLocalSumJob(_)) - .source(TypedTsv[String]("input"), mk) - .sink[(String, Long)](TypedTsv[(String, Long)]("output")) { outBuf => - "not expand and have correct total sum" in { + .source(TypedText.tsv[String]("input"), mk) + .typedSink(TypedText.tsv[(String, Long)]("output")) { outBuf => + s"$idx: not expand and have correct total sum" in { import com.twitter.algebird.MapAlgebra.sumByKey val lres = outBuf.toList - val fmapped = mk.flatMap { s => s.split(" ").map((_, 1L)) } - lres.size must be_<=(fmapped.size) - sumByKey(lres) must be_==(sumByKey(fmapped)) + val fmapped = mk.flatMap(s => s.split(" ").map((_, 1L))) + lres.size should be <= (fmapped.size) + sumByKey(lres) shouldBe (sumByKey(fmapped)) } + idx += 1 } .run .runHadoop - .finish + .finish() } } class TypedHeadJob(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[(Int, Int)]("input")) + TypedPipe + .from(TypedText.tsv[(Int, Int)]("input")) .group .head - .write(TypedTsv[(Int, Int)]("output")) + .write(TypedText.tsv[(Int, Int)]("output")) } -class TypedHeadTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedHeadTest extends WordSpec with Matchers { "A TypedHeadJob" should { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + 
val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) JobTest(new TypedHeadJob(_)) - .source(TypedTsv[(Int, Int)]("input"), mk) - .sink[(Int,Int)](TypedTsv[(Int, Int)]("output")) { outBuf => + .source(TypedText.tsv[(Int, Int)]("input"), mk) + .typedSink(TypedText.tsv[(Int, Int)]("output")) { outBuf => "correctly take the first" in { val correct = mk.groupBy(_._1).mapValues(_.head._2) - outBuf.size must be_==(correct.size) - outBuf.toMap must be_==(correct) + outBuf should have size (correct.size) + outBuf.toMap shouldBe correct } } .run - .finish + .finish() } } class TypedSortWithTakeJob(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[(Int, Int)]("input")) - .group + val in = TypedPipe.from(TypedText.tsv[(Int, Int)]("input")) + + in.group .sortedReverseTake(5) - .mapValues { (s: Seq[Int]) => s.toString } - .write(TypedTsv[(Int, String)]("output")) + .flattenValues + .write(TypedText.tsv[(Int, Int)]("output")) + + in.group.sorted.reverse + .bufferedTake(5) + .write(TypedText.tsv[(Int, Int)]("output2")) } -class TypedSortWithTakeTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedSortWithTakeTest extends WordSpec with Matchers { "A TypedSortWithTakeJob" should { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) JobTest(new TypedSortWithTakeJob(_)) - .source(TypedTsv[(Int, Int)]("input"), mk) - .sink[(Int,String)](TypedTsv[(Int, String)]("output")) { outBuf => + .source(TypedText.tsv[(Int, Int)]("input"), mk) + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("output")) { outBuf => "correctly take the first" in { - val correct = mk.groupBy(_._1).mapValues(_.map(i => i._2).sorted.reverse.take(5).toList.toString) - outBuf.size must be_==(correct.size) - outBuf.toMap must be_==(correct) + val correct = mk.groupBy(_._1).mapValues(_.map(i => i._2).sorted.reverse.take(5).toSet) + 
outBuf.groupBy(_._1).mapValues(_.map { case (k, v) => v }.toSet) shouldBe correct + } + } + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("output2")) { outBuf => + "correctly take the first using sorted.reverse.take" in { + val correct = mk.groupBy(_._1).mapValues(_.map(i => i._2).sorted.reverse.take(5).toSet) + outBuf.groupBy(_._1).mapValues(_.map { case (k, v) => v }.toSet) shouldBe correct } } .run - .finish + .finish() } } class TypedLookupJob(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[Int]("input0")) - .hashLookup(TypedPipe.from(TypedTsv[(Int, String)]("input1")).group) - .write(TypedTsv[(Int, Option[String])]("output")) + TypedPipe + .from(TypedText.tsv[Int]("input0")) + .hashLookup(TypedPipe.from(TypedText.tsv[(Int, String)]("input1")).group) + .mapValues { o: Option[String] => o.getOrElse("") } + .write(TypedText.tsv[(Int, String)]("output")) } -class TypedLookupJobTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedLookupJobTest extends WordSpec with Matchers { "A TypedLookupJob" should { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt.toString) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt.toString)) JobTest(new TypedLookupJob(_)) - .source(TypedTsv[Int]("input0"), (-1 to 100)) - .source(TypedTsv[(Int, String)]("input1"), mk) - .sink[(Int,Option[String])](TypedTsv[(Int, Option[String])]("output")) { outBuf => + .source(TypedText.tsv[Int]("input0"), (-1 to 100)) + .source(TypedText.tsv[(Int, String)]("input1"), mk) + .typedSink(TypedText.tsv[(Int, String)]("output")) { outBuf => "correctly TypedPipe.hashLookup" in { val data = mk.groupBy(_._1) - .mapValues(kvs => kvs.map { case (k, v) => (k, Some(v)) }) - val correct = (-1 to 100).flatMap { k => - data.get(k).getOrElse(List((k, None))) - }.toList.sorted - outBuf.size must be_==(correct.size) - outBuf.toList.sorted must be_==(correct) + val correct = (-1 to 100) + 
.flatMap { k => + data.get(k).getOrElse(List((k, ""))) + } + .toList + .sorted + outBuf should have size (correct.size) + outBuf.toList.sorted shouldBe correct } - } + }(implicitly[TypeDescriptor[(Int, String)]].converter) .run - .finish + .finish() } } class TypedLookupReduceJob(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[Int]("input0")) - .hashLookup(TypedPipe.from(TypedTsv[(Int, String)]("input1")).group.max) - .write(TypedTsv[(Int, Option[String])]("output")) + TypedPipe + .from(TypedText.tsv[Int]("input0")) + .hashLookup(TypedPipe.from(TypedText.tsv[(Int, String)]("input1")).group.max) + .mapValues { o: Option[String] => o.getOrElse("") } + .write(TypedText.tsv[(Int, String)]("output")) } -class TypedLookupReduceJobTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedLookupReduceJobTest extends WordSpec with Matchers { "A TypedLookupJob" should { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt.toString) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt.toString)) JobTest(new TypedLookupReduceJob(_)) - .source(TypedTsv[Int]("input0"), (-1 to 100)) - .source(TypedTsv[(Int, String)]("input1"), mk) - .sink[(Int,Option[String])](TypedTsv[(Int, Option[String])]("output")) { outBuf => + .source(TypedText.tsv[Int]("input0"), (-1 to 100)) + .source(TypedText.tsv[(Int, String)]("input1"), mk) + .typedSink(TypedText.tsv[(Int, String)]("output")) { outBuf => "correctly TypedPipe.hashLookup" in { - val data = mk.groupBy(_._1) + val data = mk + .groupBy(_._1) .mapValues { kvs => val (k, v) = kvs.maxBy(_._2) - (k, Some(v)) + (k, v) } - val correct = (-1 to 100).map { k => - data.get(k).getOrElse((k, None)) - }.toList.sorted - outBuf.size must be_==(correct.size) - outBuf.toList.sorted must be_==(correct) + val correct = (-1 to 100) + .map { k => + data.get(k).getOrElse((k, "")) + } + .toList + .sorted + outBuf should have size 
(correct.size) + outBuf.toList.sorted shouldBe correct } - } + }(implicitly[TypeDescriptor[(Int, String)]].converter) .run - .finish + .finish() } } -class TypedFilterJob(args : Args) extends Job(args) { - TypedPipe.from(TypedTsv[Int]("input")) - .filter { _ > 50 } - .filterNot { _ % 2 == 0 } - .write(TypedTsv[Int]("output")) +class TypedFilterJob(args: Args) extends Job(args) { + TypedPipe + .from(TypedText.tsv[Int]("input")) + .filter(_ > 50) + .filterNot(_ % 2 == 0) + .write(TypedText.tsv[Int]("output")) } -class TypedFilterTest extends Specification { - import Dsl._ - noDetailedDiffs() //Fixes an issue with scala 2.9 +class TypedFilterTest extends WordSpec with Matchers { "A TypedPipe" should { "filter and filterNot elements" in { val input = -1 to 100 val isEven = (i: Int) => i % 2 == 0 - val expectedOutput = input filter { _ > 50 } filterNot isEven + val expectedOutput = input.filter(_ > 50).filterNot(isEven) + + TUtil.printStack { + JobTest(new com.twitter.scalding.TypedFilterJob(_)) + .source(TypedText.tsv[Int]("input"), input) + .typedSink(TypedText.tsv[Int]("output")) { outBuf => + outBuf.toList shouldBe expectedOutput + } + .run + .runHadoop + .finish() + } + } + } +} + +class TypedPartitionJob(args: Args) extends Job(args) { + val (p1, p2) = TypedPipe.from(TypedText.tsv[Int]("input")).partition(_ > 50) + p1.write(TypedText.tsv[Int]("output1")) + p2.write(TypedText.tsv[Int]("output2")) +} + +class TypedPartitionTest extends WordSpec with Matchers { + "A TypedPipe" should { + "partition elements" in { + val input = -1 to 100 + val (expected1, expected2) = input.partition(_ > 50) TUtil.printStack { - JobTest(new com.twitter.scalding.TypedFilterJob(_)). - source(TypedTsv[Int]("input"), input). - sink[Int](TypedTsv[Int]("output")) { outBuf => - outBuf.toList must be_==(expectedOutput) - }. - run. - runHadoop. 
- finish + JobTest(new com.twitter.scalding.TypedPartitionJob(_)) + .source(TypedText.tsv[Int]("input"), input) + .typedSink(TypedText.tsv[Int]("output1")) { outBuf => + outBuf.toList shouldBe expected1 + } + .typedSink(TypedText.tsv[Int]("output2")) { outBuf => + outBuf.toList shouldBe expected2 + } + .run + .runHadoop + .finish() } } } } class TypedMultiJoinJob(args: Args) extends Job(args) { - val zero = TypedPipe.from(TypedTsv[(Int, Int)]("input0")) - val one = TypedPipe.from(TypedTsv[(Int, Int)]("input1")) - val two = TypedPipe.from(TypedTsv[(Int, Int)]("input2")) + val zero = TypedPipe.from(TypedText.tsv[(Int, Int)]("input0")) + val one = TypedPipe.from(TypedText.tsv[(Int, Int)]("input1")) + val two = TypedPipe.from(TypedText.tsv[(Int, Int)]("input2")) - val cogroup = zero.group - .join(one.group.max) - .join(two.group.max) + val cogroup = MultiJoin(zero, one.group.max, two.group.max) // make sure this is indeed a case with no self joins // distinct by mapped @@ -831,32 +1154,27 @@ class TypedMultiJoinJob(args: Args) extends Job(args) { assert(distinct.size == cogroup.inputs.size) cogroup - .map { case (k, ((v0, v1), v2)) => (k, v0, v1, v2) } - .write(TypedTsv[(Int, Int, Int, Int)]("output")) + .map { case (k, (v0, v1, v2)) => (k, v0, v1, v2) } + .write(TypedText.tsv[(Int, Int, Int, Int)]("output")) } -class TypedMultiJoinJobTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedMultiJoinJobTest extends WordSpec with Matchers { "A TypedMultiJoinJob" should { val rng = new java.util.Random - val COUNT = 100*100 + val COUNT = 100 * 100 val KEYS = 10 - def mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + def mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) val mk0 = mk val mk1 = mk val mk2 = mk JobTest(new TypedMultiJoinJob(_)) - .source(TypedTsv[(Int, Int)]("input0"), mk0) - .source(TypedTsv[(Int, Int)]("input1"), mk1) - .source(TypedTsv[(Int, Int)]("input2"), mk2) - .sink[(Int,Int,Int,Int)](TypedTsv[(Int, 
Int, Int, Int)]("output")) { outBuf => + .source(TypedText.tsv[(Int, Int)]("input0"), mk0) + .source(TypedText.tsv[(Int, Int)]("input1"), mk1) + .source(TypedText.tsv[(Int, Int)]("input2"), mk2) + .typedSink(TypedText.tsv[(Int, Int, Int, Int)]("output")) { outBuf => "correctly do a multi-join" in { def groupMax(it: Seq[(Int, Int)]): Map[Int, Int] = - it.groupBy(_._1).mapValues { kvs => - val (k, v) = kvs.maxBy(_._2) - v - }.toMap + it.groupBy(_._1).map { case (_, kvs) => kvs.maxBy(_._2) } val d0 = mk0.groupBy(_._1).mapValues(_.map { case (_, v) => v }) val d1 = groupMax(mk1) @@ -871,24 +1189,26 @@ class TypedMultiJoinJobTest extends Specification { } yield (v0s, (k, v1, v2))) } .flatMap { case (v0s, (k, v1, v2)) => - v0s.map { (k, _, v1, v2) } + v0s.map((k, _, v1, v2)) } - .toList.sorted + .sorted - outBuf.size must be_==(correct.size) - outBuf.toList.sorted must be_==(correct) + outBuf should have size (correct.size) + outBuf.sorted shouldBe correct } } .runHadoop - .finish + .finish() } } class TypedMultiSelfJoinJob(args: Args) extends Job(args) { - val zero = TypedPipe.from(TypedTsv[(Int, Int)]("input0")) - val one = TypedPipe.from(TypedTsv[(Int, Int)]("input1")) - // forceToReducers makes sure the first and the second part of - .group.forceToReducers + val zero = TypedPipe.from(TypedText.tsv[(Int, Int)]("input0")) + val one = TypedPipe + .from(TypedText.tsv[(Int, Int)]("input1")) + // forceToReducers makes sure the first and the second part of + .group + .forceToReducers val cogroup = zero.group .join(one.max) @@ -901,32 +1221,30 @@ class TypedMultiSelfJoinJob(args: Args) extends Job(args) { cogroup .map { case (k, ((v0, v1), v2)) => (k, v0, v1, v2) } - .write(TypedTsv[(Int, Int, Int, Int)]("output")) + .write(TypedText.tsv[(Int, Int, Int, Int)]("output")) } -class TypedMultiSelfJoinJobTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedMultiSelfJoinJobTest extends WordSpec with Matchers { "A TypedMultiSelfJoinJob" should { val rng = new 
java.util.Random val COUNT = 10000 val KEYS = 100 - def mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + def mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) val mk0 = mk val mk1 = mk JobTest(new TypedMultiSelfJoinJob(_)) - .source(TypedTsv[(Int, Int)]("input0"), mk0) - .source(TypedTsv[(Int, Int)]("input1"), mk1) - .sink[(Int,Int,Int,Int)](TypedTsv[(Int, Int, Int, Int)]("output")) { outBuf => + .source(TypedText.tsv[(Int, Int)]("input0"), mk0) + .source(TypedText.tsv[(Int, Int)]("input1"), mk1) + .typedSink(TypedText.tsv[(Int, Int, Int, Int)]("output")) { outBuf => "correctly do a multi-self-join" in { def group(it: Seq[(Int, Int)])(red: (Int, Int) => Int): Map[Int, Int] = - it.groupBy(_._1).mapValues { kvs => - kvs.map(_._2).reduce(red) - }.toMap + it.groupBy(_._1).map { case (k, kvs) => + (k, kvs.map(_._2).reduce(red)) + } val d0 = mk0.groupBy(_._1).mapValues(_.map { case (_, v) => v }) val d1 = group(mk1)(_ max _) - val d2 = group(mk1)( _ min _) + val d2 = group(mk1)(_ min _) val correct = (d0.keySet ++ d1.keySet ++ d2.keySet).toList .flatMap { k => @@ -937,251 +1255,460 @@ class TypedMultiSelfJoinJobTest extends Specification { } yield (v0s, (k, v1, v2))) } .flatMap { case (v0s, (k, v1, v2)) => - v0s.map { (k, _, v1, v2) } + v0s.map((k, _, v1, v2)) } - .toList.sorted + .sorted - outBuf.size must be_==(correct.size) - outBuf.toList.sorted must be_==(correct) + outBuf should have size (correct.size) + outBuf.sorted shouldBe correct } } .runHadoop - .finish + .finish() } } class TypedMapGroup(args: Args) extends Job(args) { - TypedPipe.from(TypedTsv[(Int, Int)]("input")) + TypedPipe + .from(TypedText.tsv[(Int, Int)]("input")) .group - .mapGroup { (k, iters) => iters.map(_ * k) } + .mapGroup((k, iters) => iters.map(_ * k)) .max - .write(TypedTsv[(Int, Int)]("output")) + .write(TypedText.tsv[(Int, Int)]("output")) } -class TypedMapGroupTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedMapGroupTest extends 
WordSpec with Matchers { "A TypedMapGroup" should { val rng = new java.util.Random val COUNT = 10000 val KEYS = 100 - val mk = (1 to COUNT).map { _ => (rng.nextInt % KEYS, rng.nextInt) } + val mk = (1 to COUNT).map(_ => (rng.nextInt % KEYS, rng.nextInt)) JobTest(new TypedMapGroup(_)) - .source(TypedTsv[(Int, Int)]("input"), mk) - .sink[(Int,Int)](TypedTsv[(Int, Int)]("output")) { outBuf => + .source(TypedText.tsv[(Int, Int)]("input"), mk) + .typedSink(TypedText.tsv[(Int, Int)]("output")) { outBuf => "correctly do a mapGroup" in { def mapGroup(it: Seq[(Int, Int)]): Map[Int, Int] = - it.groupBy(_._1).mapValues { kvs => - kvs.map { case (k, v) => k * v }.max - }.toMap + it.groupBy(_._1).map { case (k, kvs) => + (k, kvs.map { case (k, v) => k * v }.max) + } + val correct = mapGroup(mk).toList.sorted - outBuf.size must be_==(correct.size) - outBuf.toList.sorted must be_==(correct) + outBuf should have size (correct.size) + outBuf.sorted shouldBe correct } } .runHadoop - .finish + .finish() } } class TypedSelfCrossJob(args: Args) extends Job(args) { - val pipe = TypedPipe.from(TypedTsv[Int]("input")) + val pipe = TypedPipe.from(TypedText.tsv[Int]("input")) pipe .cross(pipe.groupAll.sum.values) - .write(TypedTsv[(Int, Int)]("output")) + .write(TypedText.tsv[(Int, Int)]("output")) } - -class TypedSelfCrossTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedSelfCrossTest extends WordSpec with Matchers { val input = (1 to 100).toList "A TypedSelfCrossJob" should { + var idx = 0 JobTest(new TypedSelfCrossJob(_)) - .source(TypedTsv[Int]("input"), input) - .sink[(Int,Int)](TypedTsv[(Int, Int)]("output")) { outBuf => - "not change the length of the input" in { - outBuf.size must_== input.size + .source(TypedText.tsv[Int]("input"), input) + .typedSink(TypedText.tsv[(Int, Int)]("output")) { outBuf => + (idx + ": not change the length of the input") in { + outBuf should have size (input.size) } + idx += 1 } .run .runHadoop - .finish + .finish() } } class 
TypedSelfLeftCrossJob(args: Args) extends Job(args) { - val pipe = TypedPipe.from(TypedTsv[Int]("input")) + val pipe = TypedPipe.from(TypedText.tsv[Int]("input")) pipe .leftCross(pipe.sum) - .write(TypedTsv[(Int, Option[Int])]("output")) + .write(TypedText.tsv[(Int, Option[Int])]("output")) } - -class TypedSelfLeftCrossTest extends Specification { - import Dsl._ - noDetailedDiffs() +class TypedSelfLeftCrossTest extends WordSpec with Matchers { val input = (1 to 100).toList "A TypedSelfLeftCrossJob" should { + var idx = 0 JobTest(new TypedSelfLeftCrossJob(_)) - .source(TypedTsv[Int]("input"), input) - .sink[(Int, Option[Int])](TypedTsv[(Int, Option[Int])]("output")) { outBuf => - "attach the sum of all values correctly" in { - outBuf.size must_== input.size + .source(TypedText.tsv[Int]("input"), input) + .typedSink(TypedText.tsv[(Int, Option[Int])]("output")) { outBuf => + s"$idx: attach the sum of all values correctly" in { + outBuf should have size (input.size) val sum = input.reduceOption(_ + _) // toString to deal with our hadoop testing jank - outBuf.toList.sortBy(_._1).toString must be_== (input.sorted.map((_, sum)).toString) + outBuf.toList.sortBy(_._1).toString shouldBe (input.sorted.map((_, sum)).toString) } - } + idx += 1 + }(implicitly[TypeDescriptor[(Int, Option[Int])]].converter) .run .runHadoop - .finish + .finish() + } +} + +class JoinMapGroupJob(args: Args) extends Job(args) { + def r1 = TypedPipe.from(Seq((1, 10))) + def r2 = TypedPipe.from(Seq((1, 1), (2, 2), (3, 3))) + r1.groupBy(_._1) + .join(r2.groupBy(_._1)) + .mapGroup { case (a, b) => Iterator("a") } + .write(TypedText.tsv("output")) +} + +class JoinMapGroupJobTest extends WordSpec with Matchers { + + "A JoinMapGroupJob" should { + JobTest(new JoinMapGroupJob(_)) + .typedSink(TypedText.tsv[(Int, String)]("output")) { outBuf => + "not duplicate keys" in { + outBuf.toList shouldBe List((1, "a")) + } + } + .run + .finish() } } +class MapValueStreamNonEmptyIteratorJob(args: Args) extends 
Job(args) { + val input = TypedPipe.from[(Int, String)](Seq((1, "a"), (1, "b"), (3, "a"))) + val extraKeys = TypedPipe.from[(Int, String)](Seq((4, "a"))) + + input + .groupBy(_._1) + .mapValueStream(values => List(values.size).toIterator) + .leftJoin(extraKeys.group) + .toTypedPipe + .map { case (key, (iteratorSize, extraOpt)) => (key, iteratorSize) } + .write(TypedText.tsv[(Int, Int)]("output")) +} + +class MapValueStreamNonEmptyIteratorTest extends WordSpec with Matchers { + + "A MapValueStreamNonEmptyIteratorJob" should { + JobTest(new MapValueStreamNonEmptyIteratorJob(_)) + .sink[(Int, Int)](TypedText.tsv[(Int, Int)]("output")) { outBuf => + "not have iterators of size 0" in { + assert(outBuf.toList.filter(_._2 == 0) === Nil) + } + } + .run + .finish() + } +} + +class NullSinkJob(args: Args, m: scala.collection.mutable.Buffer[Int]) extends Job(args) { + TypedPipe + .from(0 to 100) + .map { i => m += i; i } // side effect + .write(source.NullSink) +} + +class NullSinkJobTest extends WordSpec with Matchers { + "A NullSinkJob" should { + val buf = scala.collection.mutable.Buffer[Int]() + JobTest(new NullSinkJob(_, buf)) + .typedSink[Any](source.NullSink) { _ => + "have a side effect" in { + assert(buf.toSet === (0 to 100).toSet) + } + } + .run + .finish() + } +} class TypedSketchJoinJob(args: Args) extends Job(args) { - val zero = TypedPipe.from(TypedTsv[(Int, Int)]("input0")) - val one = TypedPipe.from(TypedTsv[(Int, Int)]("input1")) + val zero = TypedPipe.from(TypedText.tsv[(Int, Int)]("input0")) + val one = TypedPipe.from(TypedText.tsv[(Int, Int)]("input1")) - implicit def serialize(k:Int) = k.toString.getBytes + implicit def serialize(k: Int): Array[Byte] = k.toString.getBytes zero .sketch(args("reducers").toInt) .join(one) - .map{case (k, (v0,v1)) => (k, v0, v1)} - .write(TypedTsv[(Int, Int, Int)]("output-sketch")) + .map { case (k, (v0, v1)) => (k, v0, v1) } + .write(TypedText.tsv[(Int, Int, Int)]("output-sketch")) - zero - .group + zero.group 
.join(one.group) - .map{case (k, (v0,v1)) => (k, v0, v1)} - .write(TypedTsv[(Int, Int, Int)]("output-join")) + .map { case (k, (v0, v1)) => (k, v0, v1) } + .write(TypedText.tsv[(Int, Int, Int)]("output-join")) } class TypedSketchLeftJoinJob(args: Args) extends Job(args) { - val zero = TypedPipe.from(TypedTsv[(Int, Int)]("input0")) - val one = TypedPipe.from(TypedTsv[(Int, Int)]("input1")) + val zero = TypedPipe.from(TypedText.tsv[(Int, Int)]("input0")) + val one = TypedPipe.from(TypedText.tsv[(Int, Int)]("input1")) - implicit def serialize(k:Int) = k.toString.getBytes + implicit def serialize(k: Int): Array[Byte] = k.toString.getBytes zero .sketch(args("reducers").toInt) .leftJoin(one) - .map{case (k, (v0,v1)) => (k, v0, v1.getOrElse(-1))} - .write(TypedTsv[(Int, Int, Int)]("output-sketch")) + .map { case (k, (v0, v1)) => (k, v0, v1.getOrElse(-1)) } + .write(TypedText.tsv[(Int, Int, Int)]("output-sketch")) - zero - .group + zero.group .leftJoin(one.group) - .map{case (k, (v0,v1)) => (k, v0, v1.getOrElse(-1))} - .write(TypedTsv[(Int, Int, Int)]("output-join")) + .map { case (k, (v0, v1)) => (k, v0, v1.getOrElse(-1)) } + .write(TypedText.tsv[(Int, Int, Int)]("output-join")) } - object TypedSketchJoinTestHelper { - import Dsl._ val rng = new java.util.Random - def generateInput(size: Int, max: Int, dist: (Int) => Int): List[(Int,Int)] = { + def generateInput(size: Int, max: Int, dist: (Int) => Int): List[(Int, Int)] = { def next: Int = rng.nextInt(max) (0 to size).flatMap { i => val k = next - (1 to dist(k)).map { j => (k, next) } + (1 to dist(k)).map(j => (k, next)) }.toList } - def runJobWithArguments(fn: (Args) => Job, reducers : Int, dist: (Int) => Int): (List[(Int,Int,Int)], List[(Int,Int,Int)]) = { + def runJobWithArguments( + fn: (Args) => Job, + reducers: Int, + dist: (Int) => Int + ): (List[(Int, Int, Int)], List[(Int, Int, Int)]) = { - val sketchResult = Buffer[(Int,Int,Int)]() - val innerResult = Buffer[(Int,Int,Int)]() + val sketchResult = Buffer[(Int, 
Int, Int)]() + val innerResult = Buffer[(Int, Int, Int)]() JobTest(fn) .arg("reducers", reducers.toString) - .source(TypedTsv[(Int,Int)]("input0"), generateInput(1000, 100, dist)) - .source(TypedTsv[(Int,Int)]("input1"), generateInput(100, 100, x => 1)) - .sink[(Int,Int,Int)](TypedTsv[(Int,Int,Int)]("output-sketch")) { outBuf => sketchResult ++= outBuf } - .sink[(Int,Int,Int)](TypedTsv[(Int,Int,Int)]("output-join")) { outBuf => innerResult ++= outBuf } + .source(TypedText.tsv[(Int, Int)]("input0"), generateInput(1000, 100, dist)) + .source(TypedText.tsv[(Int, Int)]("input1"), generateInput(100, 100, x => 1)) + .typedSink(TypedText.tsv[(Int, Int, Int)]("output-sketch"))(outBuf => sketchResult ++= outBuf) + .typedSink(TypedText.tsv[(Int, Int, Int)]("output-join"))(outBuf => innerResult ++= outBuf) .run .runHadoop - .finish + .finish() (sketchResult.toList.sorted, innerResult.toList.sorted) } } -class TypedSketchJoinJobTest extends Specification { - import Dsl._ - noDetailedDiffs() - +class TypedSketchJoinJobTest extends WordSpec with Matchers { import TypedSketchJoinTestHelper._ "A TypedSketchJoinJob" should { "get the same result as an inner join" in { val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => 1) - sk must_== inner + sk shouldBe inner } "get the same result when half the left keys are missing" in { - val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if(x < 50) 0 else 1) - sk must_== inner + val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if (x < 50) 0 else 1) + sk shouldBe inner } "get the same result with a massive skew to one key" in { - val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if(x == 50) 1000 else 1) - sk must_== inner + val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if (x == 50) 1000 else 1) + sk shouldBe inner } "still work with only one reducer" in { val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 1, x 
=> 1) - sk must_== inner + sk shouldBe inner } "still work with massive skew and only one reducer" in { - val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 1, x => if(x == 50) 1000 else 1) - sk must_== inner + val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 1, x => if (x == 50) 1000 else 1) + sk shouldBe inner } } } -class TypedSketchLeftJoinJobTest extends Specification { - import Dsl._ - noDetailedDiffs() - +class TypedSketchLeftJoinJobTest extends WordSpec with Matchers { import TypedSketchJoinTestHelper._ "A TypedSketchLeftJoinJob" should { "get the same result as a left join" in { val (sk, left) = runJobWithArguments(new TypedSketchLeftJoinJob(_), 10, x => 1) - sk must_== left + sk shouldBe left } "get the same result when half the left keys are missing" in { - val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if(x < 50) 0 else 1) - sk must_== inner + val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if (x < 50) 0 else 1) + sk shouldBe inner } "get the same result with a massive skew to one key" in { - val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if(x == 50) 1000 else 1) - sk must_== inner + val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 10, x => if (x == 50) 1000 else 1) + sk shouldBe inner } - "still work with only one reducer" in { val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 1, x => 1) - sk must_== inner + sk shouldBe inner } "still work with massive skew and only one reducer" in { - val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 1, x => if(x == 50) 1000 else 1) - sk must_== inner + val (sk, inner) = runJobWithArguments(new TypedSketchJoinJob(_), 1, x => if (x == 50) 1000 else 1) + sk shouldBe inner + } + } +} + +class TypedPipeRequireTest extends FunSuite { + test("requireSingleValuePerKey should not cause a job to fail") { + + def ex(req: Boolean) = { + val ex = + TypedPipe + .from((1 to 
1000)) + .map(k => (k.toString, k)) + .join(TypedPipe.from((1 to 1000 by 5)).map(_.toString).asKeys) + val g = + if (req) ex.group.requireSingleValuePerKey.toTypedPipe + else ex.group.toTypedPipe + + g.toIterableExecution } + + assert( + ex(false).waitFor(Config.empty, Local(true)).get.toList.sorted == + ex(true).waitFor(Config.empty, Local(true)).get.toList.sorted + ) + } +} + +object TypedPipeConverterTest { + class TypedTsvWithCustomConverter[T: TypeDescriptor](nonSerializableObj: Any, path: String*) + extends FixedTypedText[T](TypedText.TAB, path: _*) { + override def converter[U >: T]: TupleConverter[U] = + super.converter.andThen { t: T => nonSerializableObj; t } + } + + class NonSerializableObj + + val source = new TypedTsvWithCustomConverter[Int](new NonSerializableObj(), "input") + + class JobWithCustomConverter(args: Args) extends Job(args) { + TypedPipe + .from(source) + .map(i => i + 1) + .write(TypedText.tsv[Int]("output")) } } +class TypedPipeConverterTest extends FunSuite { + import TypedPipeConverterTest._ + + test("any converter should be serializable") { + val expected = mutable.Buffer[Int](0 to 10: _*) + val result = mutable.Buffer[Int]() + + JobTest(new JobWithCustomConverter(_)) + .source(source, expected.map(_ - 1)) + .typedSink(TypedText.tsv[Int]("output"))(outBuf => result ++= outBuf) + .runHadoop + .finish() + + assert(result == expected) + } +} + +object TypedPipeCrossWithMapWithToPipeTest { + val source = TypedText.tsv[Int]("source") + + val sink1 = TypedText.tsv[Int]("sink1") + val sink2 = TypedText.tsv[Int]("sink2") + + class TestJob(args: Args) extends Job(args) { + val mapPipe: TypedPipe[Map[Int, Int]] = TypedPipe + .from(source) + .groupAll + .toList + .mapValues(values => values.map(v => (v, v)).toMap) + .values + + val crossedMapped = TypedPipe + .from(source) + .cross(mapPipe) + .map { case (value, map) => map(value) } + + crossedMapped.toPipe('value).write(sink1) + crossedMapped.map(identity).toPipe('value).write(sink2) + } +} + 
+class TypedPipeCrossWithMapWithToPipeTest extends FunSuite { + import TypedPipeCrossWithMapWithToPipeTest._ + import TUtil._ + + test("data between cross and subsequent map shouldn't be materialized") { + val n = 3000 + val bytesPerElement = 100 // we shouldn't write more than 100 bytes per element + val values = 1 to n + + JobTest(new TestJob(_)) + .source(source, values) + .typedSink(sink1) { outBuf => + assert(outBuf.toSet == values.toSet) + } + .typedSink(sink2) { outBuf => + assert(outBuf.toSet == values.toSet) + } + .writesLessDataThen(bytesPerElement * n) + .runHadoop + .finish() + } +} + +object TypedPipeCrossWithDifferentMapsAfterTest { + val source = TypedText.tsv[Int]("source") + + val sink1 = TypedText.tsv[Int]("sink1") + val sink2 = TypedText.tsv[Int]("sink2") + + class TestJob(args: Args) extends Job(args) { + val mapPipe: TypedPipe[Map[Int, Int]] = TypedPipe + .from(source) + .groupAll + .toList + .mapValues(values => values.map(v => (v, v)).toMap) + .values + + val crossed = TypedPipe.from(source).cross(mapPipe) + crossed.map { case (value, map) => map(value) }.write(sink1) + crossed.map { case (value, map) => map(identity(value)) }.write(sink2) + } +} + +class TypedPipeCrossWithDifferentMapsAfterTest extends FunSuite { + import TypedPipeCrossWithDifferentMapsAfterTest._ + import TUtil._ + + test("cross data shouldn't be materialized") { + val n = 3000 + val bytesPerElement = 100 // we shouldn't write more than 100 bytes per element + val values = 1 to n + + JobTest(new TestJob(_)) + .source(source, values) + .typedSink(sink1) { outBuf => + assert(outBuf.toSet == values.toSet) + } + .typedSink(sink2) { outBuf => + assert(outBuf.toSet == values.toSet) + } + .writesLessDataThen(bytesPerElement * n) + .runHadoop + .finish() + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala new file mode 100644 
index 0000000000..adbdebeeff --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/TypedSinkWithTypedImplementationTest.scala @@ -0,0 +1,109 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import cascading.flow.FlowDef +import cascading.pipe.Pipe +import cascading.tuple.Fields +import org.apache.hadoop.conf.Configuration +import org.scalatest.{Matchers, WordSpec} + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +class TypedSinkWithTypedImplementation(path: String) extends TypedSink[String] { + private val fields = new Fields(0) + + override def setter[U <: String]: TupleSetter[U] = TupleSetter.singleSetter[U] + + override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = { + TypedPipe.fromPipe[String](pipe, fields).write(TypedTsv[String](path)) + pipe + } +} + +class TypedSinkWithTypedImplementationRecursive(path: String) extends TypedSink[String] { + private val fields = new Fields(0) + + override def setter[U <: String]: TupleSetter[U] = TupleSetter.singleSetter[U] + + override def writeFrom(pipe: Pipe)(implicit flowDef: FlowDef, mode: Mode): Pipe = { + TypedPipe.fromPipe[String](pipe, fields).write(new TypedSinkWithTypedImplementationRecursive(path)) + pipe + } +} + +class TypedSinkWithTypedImplementationJob(args: Args) extends Job(args) { + TypedPipe + .from(List("test")) + .write(new TypedSinkWithTypedImplementation("output")) +} + +class 
TypedSinkWithTypedImplementationTest extends WordSpec with Matchers { + "A TypedSinkWithTypedImplementationJob" should { + "should produce correct results" in { + JobTest(new TypedSinkWithTypedImplementationJob(_)) + .sink[String](TypedTsv[String]("output"))(_.toList == List("test")) + .runHadoop + .finish() + } + } + + "A TypedSinkWithTypedImplementation" should { + "should work with .writeExecution" in { + val elements = List("test") + val elementsFromExecution = TypedPipe + .from(elements) + .writeExecution(new TypedSinkWithTypedImplementation("output")) + .flatMap(_ => TypedPipe.from(TypedTsv[String]("output")).toIterableExecution) + .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) + .get + .toList + + assert(elements == elementsFromExecution) + } + } + + "A TypedSinkWithTypedImplementation" should { + "should work with Execution.fromFn" in { + val elements = List("test") + val elementsFromExecution = Execution + .fromFn { case (confArg, modeArg) => + implicit val flowDef = new FlowDef + implicit val mode = modeArg + TypedPipe.from(elements).write(new TypedSinkWithTypedImplementation("output")) + flowDef + } + .flatMap(_ => TypedPipe.from(TypedTsv[String]("output")).toIterableExecution) + .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) + .get + .toList + + assert(elements == elementsFromExecution) + } + } + + "A TypedSinkWithTypedImplementationRecursive" should { + "should fail" in { + assert( + TypedPipe + .from(List("test")) + .writeExecution(new TypedSinkWithTypedImplementationRecursive("output")) + .waitFor(Config.default, HadoopTest(new Configuration(), _ => None)) + .isFailure + ) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala new file mode 100644 index 0000000000..4aa144eef9 --- /dev/null +++ 
b/scalding-core/src/test/scala/com/twitter/scalding/TypedSketchJoinJobForEmptyKeysTest.scala @@ -0,0 +1,39 @@ +package com.twitter.scalding + +import org.scalatest.{Matchers, WordSpec} + +class TypedSketchJoinJobForEmptyKeys(args: Args) extends Job(args) { + // Deal with when a key appears in left but not right + val leftTypedPipe = TypedPipe.from(List((1, 1111))) + val rightTypedPipe = TypedPipe.from(List((3, 3333), (4, 4444))) + + implicit def serialize(k: Int): Array[Byte] = k.toString.getBytes + + val sketched = leftTypedPipe + .sketch(1) + .leftJoin(rightTypedPipe) + + // this is test that a TypedPipe.Keyed method works: + sketched.values + + sketched + .map { case (a, (b, c)) => + (a, b, c.getOrElse(-1)) + } + .write(TypedTsv("output")) +} + +class TypedSketchJoinJobForEmptyKeysTest extends WordSpec with Matchers { + "A TypedSketchJoinJobForEmptyKeysTest" should { + "Sketch leftJoin with a single left key should be correct" in { + JobTest(new TypedSketchJoinJobForEmptyKeys(_)) + .sink[(Int, Int, Int)](TypedTsv[(Int, Int, Int)]("output")) { outBuf => + outBuf should have size 1 + val unordered = outBuf.toSet + unordered should contain(1, 1111, -1) + } + .run + .finish() + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala deleted file mode 100644 index 22f6ddad1b..0000000000 --- a/scalding-core/src/test/scala/com/twitter/scalding/WeightedPageRankTest.scala +++ /dev/null @@ -1,54 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package com.twitter.scalding - -import org.specs._ - -class WeightedPageRankSpec extends Specification { - "Weighted PageRank job" should { - JobTest(new com.twitter.scalding.examples.WeightedPageRank(_)). - arg("pwd", "."). - arg("weighted", "true"). - arg("maxiterations", "1"). - arg("jumpprob","0.1"). - source(Tsv("./nodes"), List((1,"2,3","1,2",0.26),(2,"3","1",0.54),(3,"","",0.2))). - source(Tsv("./numnodes"), List((3))). - source(Tsv("./pagerank_0"), List((1,0.086),(2,0.192),(3,0.722))). - sink[Double](TypedTsv[Double]("./totaldiff")) { ob => - "have low error" in { - ob.head must beCloseTo(0.722-0.461+0.2964-0.192+0.2426-0.086, 0.001) - } - }. - sink[(Int,Double)](Tsv("./pagerank_1")){ outputBuffer => - val pageRank = outputBuffer.map { res => (res._1,res._2) }.toMap - "correctly compute pagerank" in { - val deadMass = 0.722/3*0.9 - val userMass = List(0.26, 0.54, 0.2).map { _*0.1 } - val massNext = List(0, 0.086/3, (0.086*2/3+0.192)).map { _*0.9 } - val expected = (userMass zip massNext) map { a : (Double, Double) => a._1 + a._2 + deadMass } - - println(pageRank) - (pageRank(1) + pageRank(2) + pageRank(3)) must beCloseTo(1.0, 0.001) - pageRank(1) must beCloseTo(expected(0), 0.001) - pageRank(2) must beCloseTo(expected(1), 0.001) - pageRank(3) must beCloseTo(expected(2), 0.001) - } - }. - runWithoutNext(useHadoop=false). - runWithoutNext(useHadoop=true). 
- finish - } -} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala new file mode 100644 index 0000000000..d951a7451b --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/WrappedJoinerTest.scala @@ -0,0 +1,71 @@ +package com.twitter.scalding + +import cascading.flow.FlowException +import cascading.flow.FlowProcess +import cascading.pipe.CoGroup +import cascading.pipe.joiner.{InnerJoin, JoinerClosure} +import cascading.tuple.Tuple +import org.scalatest.{Matchers, WordSpec} +import java.util.{Iterator => JIterator} + +class CheckFlowProcessJoiner(uniqueID: UniqueID) extends InnerJoin { + override def getIterator(joinerClosure: JoinerClosure): JIterator[Tuple] = { + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueID) + if (flowProcess == FlowProcess.NULL) { + throw new NullPointerException("No active FlowProcess was available.") + } + + super.getIterator(joinerClosure) + } +} + +class TestWrappedJoinerJob(args: Args) extends Job(args) { + val uniqueID = UniqueID.getIDFor(flowDef) + + val inA = Tsv("inputA", ('a, 'b)) + val inB = Tsv("inputB", ('x, 'y)) + + val joiner = { + val checkJoiner = new CheckFlowProcessJoiner(uniqueID) + if (args.boolean("wrapJoiner")) WrappedJoiner(checkJoiner) else checkJoiner + } + + val p1 = new CoGroup(inA, 'a, inB, 'x, joiner) + + // The .forceToDisk is necessary to have the test work properly. 
+ p1.forceToDisk.write(Tsv("output")) +} + +class WrappedJoinerTest extends WordSpec with Matchers { + "Methods called from a Joiner" should { + "have access to a FlowProcess when WrappedJoiner is used" in { + JobTest(new TestWrappedJoinerJob(_)) + .arg("wrapJoiner", "true") + .source(Tsv("inputA"), Seq(("1", "alpha"), ("2", "beta"))) + .source(Tsv("inputB"), Seq(("1", "first"), ("2", "second"))) + .sink[(Int, String)](Tsv("output")) { outBuf => + // The job will fail with an exception if the FlowProcess is unavailable. + } + .runHadoop + .finish() + } + + "have no access to a FlowProcess when WrappedJoiner is not used" in { + try { + JobTest(new TestWrappedJoinerJob(_)) + .source(Tsv("inputA"), Seq(("1", "alpha"), ("2", "beta"))) + .source(Tsv("inputB"), Seq(("1", "first"), ("2", "second"))) + .sink[(Int, String)](Tsv("output")) { outBuf => + // The job will fail with an exception if the FlowProcess is unavailable. + } + .runHadoop + .finish() + + fail("The test Job without WrappedJoiner should fail.") + } catch { + case ex: FlowException => + ex.getCause.getMessage should include("No active FlowProcess was available") + } + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala index 35e8ce6b8c..d84edbce49 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/XHandlerTest.scala @@ -12,41 +12,45 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import cascading.flow.planner.PlannerException -class XHandlerTest extends Specification { +class XHandlerTest extends WordSpec with Matchers { "Throwable classes" should { "be handled if exist in default mapping" in { val rxh = RichXHandler() - rxh.handlers.find(h => h(new PlannerException)).isDefined must beTrue - rxh.handlers.find(h => h(new InvalidSourceException("Invalid Source"))).isDefined must beTrue - rxh.handlers.find(h => h(new NoSuchMethodError)).isDefined must beTrue - rxh.handlers.find(h => h(new AbstractMethodError)).isDefined must beTrue - rxh.handlers.find(h => h(new NoClassDefFoundError)).isDefined must beTrue + rxh.handlers.find(h => h(new PlannerException)) should not be empty + rxh.handlers.find(h => h(new InvalidSourceException("Invalid Source"))) should not be empty + rxh.handlers.find(h => h(new NoSuchMethodError)) should not be empty + rxh.handlers.find(h => h(new AbstractMethodError)) should not be empty + rxh.handlers.find(h => h(new NoClassDefFoundError)) should not be empty + rxh.handlers.find(h => + h(new ModeLoadException("dummy", new ClassNotFoundException)) + ) should not be empty } "be handled if exist in custom mapping" in { val cRxh = RichXHandler(RichXHandler.mapping ++ Map(classOf[NullPointerException] -> "NPE")) - cRxh.handlers.find(h => h(new NullPointerException)).isDefined must beTrue - cRxh.mapping(classOf[NullPointerException]) must_== "NPE" + cRxh.handlers.find(h => h(new NullPointerException)) should not be empty + cRxh.mapping(classOf[NullPointerException]) shouldBe "NPE" } "not be handled if missing in mapping" in { val rxh = RichXHandler() - rxh.handlers.find(h => h(new NullPointerException)).isDefined must beFalse - rxh.handlers.find(h => h(new IndexOutOfBoundsException)).isDefined must beFalse + rxh.handlers.find(h => h(new NullPointerException)) shouldBe empty + rxh.handlers.find(h => h(new 
IndexOutOfBoundsException)) shouldBe empty } "be valid keys in mapping if defined" in { val rxh = RichXHandler() - rxh.mapping(classOf[PlannerException]) must_== RichXHandler.RequireSinks - rxh.mapping(classOf[InvalidSourceException]) must_== RichXHandler.DataIsMissing - rxh.mapping(classOf[NoSuchMethodError]) must_== RichXHandler.BinaryProblem - rxh.mapping(classOf[AbstractMethodError]) must_== RichXHandler.BinaryProblem - rxh.mapping(classOf[NoClassDefFoundError]) must_== RichXHandler.BinaryProblem - rxh.mapping(classOf[NullPointerException]) must_== RichXHandler.Default + rxh.mapping(classOf[ModeLoadException]) shouldBe RichXHandler.RequiredCascadingFabricNotInClassPath + rxh.mapping(classOf[PlannerException]) shouldBe RichXHandler.RequireSinks + rxh.mapping(classOf[InvalidSourceException]) shouldBe RichXHandler.DataIsMissing + rxh.mapping(classOf[NoSuchMethodError]) shouldBe RichXHandler.BinaryProblem + rxh.mapping(classOf[AbstractMethodError]) shouldBe RichXHandler.BinaryProblem + rxh.mapping(classOf[NoClassDefFoundError]) shouldBe RichXHandler.BinaryProblem + rxh.mapping(classOf[NullPointerException]) shouldBe RichXHandler.Default } "create a URL link in GitHub wiki" in { val NoClassDefFoundErrorString = "javalangnoclassdeffounderror" @@ -54,12 +58,23 @@ class XHandlerTest extends Specification { val NoSuchMethodErrorString = "javalangnosuchmethoderror" val InvalidSouceExceptionString = "comtwitterscaldinginvalidsourceexception" val PlannerExceptionString = "cascadingflowplannerplannerexception" - RichXHandler.createXUrl(new PlannerException) must_== RichXHandler.gitHubUrl + PlannerExceptionString - RichXHandler.createXUrl(new InvalidSourceException("Invalid Source")) must_== RichXHandler.gitHubUrl + InvalidSouceExceptionString - RichXHandler.createXUrl(new NoSuchMethodError) must_== RichXHandler.gitHubUrl + NoSuchMethodErrorString - RichXHandler.createXUrl(new AbstractMethodError) must_== RichXHandler.gitHubUrl + AbstractMethodErrorString - 
RichXHandler.createXUrl(new NoClassDefFoundError) must_== RichXHandler.gitHubUrl + NoClassDefFoundErrorString + val ModeLoadExceptionString = "comtwitterscaldingmodeloadexception" + RichXHandler.createXUrl(new PlannerException) shouldBe (RichXHandler.gitHubUrl + PlannerExceptionString) + RichXHandler.createXUrl( + new InvalidSourceException("Invalid Source") + ) shouldBe (RichXHandler.gitHubUrl + InvalidSouceExceptionString) + RichXHandler.createXUrl( + new NoSuchMethodError + ) shouldBe (RichXHandler.gitHubUrl + NoSuchMethodErrorString) + RichXHandler.createXUrl( + new AbstractMethodError + ) shouldBe (RichXHandler.gitHubUrl + AbstractMethodErrorString) + RichXHandler.createXUrl( + new NoClassDefFoundError + ) shouldBe (RichXHandler.gitHubUrl + NoClassDefFoundErrorString) + RichXHandler.createXUrl( + ModeLoadException("dummy", new ClassNotFoundException) + ) shouldBe (RichXHandler.gitHubUrl + ModeLoadExceptionString) } - } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala index f6619cade7..8be0829dbe 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/MultipleSourcesSpecTest.scala @@ -1,135 +1,114 @@ package com.twitter.scalding.bdd -import org.specs.Specification +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding.Dsl._ import com.twitter.scalding.RichPipe import scala.collection.mutable.Buffer import cascading.tuple.Tuple -class MultipleSourcesSpecTest extends Specification with BddDsl { - +class MultipleSourcesSpecTest extends WordSpec with Matchers with BddDsl { "A test with two sources" should { "accept an operation with two input pipes" in { Given { - List(("Stefano", "110"), ("Rajah", "220")) withSchema('name, 'points) - } And { - List(("Stefano", "home1"), ("Rajah", "home2")) withSchema('name, 
'address) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => { - pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { - address: String => address + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String, String)] => { - buffer.forall({ - case (_, _, _, addressTransf) => addressTransf.endsWith("_transf") - }) mustBe true + List(("Stefano", "110"), ("Rajah", "220")).withSchema('name, 'points) + }.And { + List(("Stefano", "home1"), ("Rajah", "home2")).withSchema('name, 'address) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { address: String => + address + "_transf" } + }.Then { buffer: Buffer[(String, String, String, String)] => + buffer.forall { case (_, _, _, addressTransf) => + addressTransf.endsWith("_transf") + } shouldBe true } } - "accept an operation with two input pipes using Tuples" in { Given { - List(new Tuple("Stefano", "110"), new Tuple("Rajah", "220")) withSchema('name, 'points) - } And { - List(new Tuple("Stefano", "home1"), new Tuple("Rajah", "home2")) withSchema('name, 'address) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => { - pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { - address: String => address + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String, String)] => { - buffer.forall({ - case (_, _, _, addressTransf) => addressTransf.endsWith("_transf") - }) mustBe true + List(new Tuple("Stefano", "110"), new Tuple("Rajah", "220")).withSchema('name, 'points) + }.And { + List(new Tuple("Stefano", "home1"), new Tuple("Rajah", "home2")).withSchema('name, 'address) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1.joinWithSmaller('name -> 'name, pipe2).map('address -> 'address_transf) { address: String => + address + "_transf" } + }.Then { buffer: Buffer[(String, String, String, String)] => + buffer.forall { case (_, _, _, addressTransf) => + 
addressTransf.endsWith("_transf") + } shouldBe true } } } - "A test with three sources" should { "accept an operation with three input pipes" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col4) - } When { - (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .joinWithSmaller('col1 -> 'col1, pipe3) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4) + }.When { (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .joinWithSmaller('col1 -> 'col1, pipe3) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true - } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } "A test with four sources" should { "compile mixing an operation with inconsistent number of input pipes but fail at runtime" in { - Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col4) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col5) - } When { - (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => { + 
an[IllegalArgumentException] should be thrownBy { + Given { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col5) + }.When { (pipe1: RichPipe, pipe2: RichPipe, pipe3: RichPipe) => pipe1 .joinWithSmaller('col1 -> 'col1, pipe2) .joinWithSmaller('col1 -> 'col1, pipe3) .joinWithSmaller('col1 -> 'col1, pipe3) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true - } - } must throwA[IllegalArgumentException] + } } "be used with a function accepting a list of sources because there is no implicit for functions with more than three input pipes" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col4) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col5) - } And { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col6) - } When { - (pipes: List[RichPipe]) => { - pipes(0) - .joinWithSmaller('col1 -> 'col1, pipes(1)) - .joinWithSmaller('col1 -> 'col1, pipes(2)) - .joinWithSmaller('col1 -> 'col1, pipes(3)) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4) + }.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col5) + 
}.And { + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col6) + }.When { (pipes: List[RichPipe]) => + pipes.head + .joinWithSmaller('col1 -> 'col1, pipes(1)) + .joinWithSmaller('col1 -> 'col1, pipes(2)) + .joinWithSmaller('col1 -> 'col1, pipes(3)) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true - } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala index df1a45906e..dc6d1452ae 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SingleSourceSpecTest.scala @@ -1,96 +1,75 @@ package com.twitter.scalding.bdd -import org.specs.Specification -import com.twitter.scalding.{Dsl, RichPipe} +import org.scalatest.{Matchers, WordSpec} +import com.twitter.scalding.RichPipe import scala.collection.mutable.Buffer import cascading.pipe.Pipe import cascading.tuple.Tuple import com.twitter.scalding.Dsl._ -class SingleSourceSpecTest extends Specification with BddDsl { - +class SingleSourceSpecTest extends WordSpec with Matchers with BddDsl { "A test with single source" should { "accept an operation with a single input rich pipe" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: RichPipe => { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String)] => { - buffer.forall({ - case (_, _, transformed) => transformed.endsWith("_transf") - }) mustBe true + List(("col1_1", 
"col2_1"), ("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[(String, String, String)] => + buffer.forall { case (_, _, transformed) => + transformed.endsWith("_transf") + } shouldBe true } } "accept an operation with a single input pipe" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: Pipe => { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[(String, String, String)] => { - buffer.forall({ - case (_, _, transformed) => transformed.endsWith("_transf") - }) mustBe true + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: Pipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[(String, String, String)] => + buffer.forall { case (_, _, transformed) => + transformed.endsWith("_transf") + } shouldBe true } } "work with output as Tuple" in { Given { - List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: RichPipe => { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true + List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work with input as simple type" in { Given { - List("col1_1", "col1_2") withSchema ('col1) - } When { - pipe: RichPipe => { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => 
tuple.getString(1).endsWith("_transf")) mustBe true + List("col1_1", "col1_2").withSchema('col1) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(1).endsWith("_transf")) shouldBe true } } "work with input as Tuple" in { Given { - List(new Tuple("col1_1", "col2_1"), new Tuple("col1_2", "col2_2")) withSchema (('col1, 'col2)) - } When { - pipe: RichPipe => { - pipe.map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true + List(new Tuple("col1_1", "col2_1"), new Tuple("col1_2", "col2_2")).withSchema(('col1, 'col2)) + }.When { pipe: RichPipe => + pipe.map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala index db998637e4..7bef024c43 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/SourceListSpecTest.scala @@ -1,122 +1,104 @@ package com.twitter.scalding.bdd -import org.specs.Specification +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding.RichPipe import scala.collection.mutable.Buffer import cascading.tuple.Tuple import cascading.pipe.Pipe import com.twitter.scalding.Dsl._ -class SourceListSpecTest extends Specification with BddDsl { +class SourceListSpecTest extends WordSpec with Matchers with BddDsl { "A test with a list of sources" should { "compile mixing it with a multi pipe function but fail if not same cardinality between given and when clause" in { - Given { - List( - 
(List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col4)) - ) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => { + an[IllegalArgumentException] should be thrownBy { + Given { + List( + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col4)) + ) + }.When { (pipe1: RichPipe, pipe2: RichPipe) => pipe1 .joinWithSmaller('col1 -> 'col1, pipe2) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" - } - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" + } + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } - } must throwA[IllegalArgumentException] + } } "work properly with a multi rich-pipe function with same cardinality" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3)) + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) ) - } When { - (pipe1: RichPipe, pipe2: RichPipe) => { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" + }.When { (pipe1: RichPipe, pipe2: RichPipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => 
tuple.getString(2).endsWith("_transf")) mustBe true - } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work properly with a multi pipe function with same cardinality" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3)) + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) ) - } When { - (pipe1: Pipe, pipe2: Pipe) => { - pipe1 - .joinWithSmaller('col1 -> 'col1, pipe2) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" + }.When { (pipe1: Pipe, pipe2: Pipe) => + pipe1 + .joinWithSmaller('col1 -> 'col1, pipe2) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true - } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work properly with a function accepting a list of rich pipes" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3)) + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) ) - } When { - (pipes: List[RichPipe]) => { - pipes(0) - .joinWithSmaller('col1 -> 'col1, pipes(1)) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" + }.When { (pipes: List[RichPipe]) => + pipes.head + .joinWithSmaller('col1 -> 'col1, pipes(1)) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } - 
.project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true - } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } "work properly with a function accepting a list of pipes" in { Given { List( - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col2)), - (List(("col1_1", "col2_1"), ("col1_2", "col2_2")) withSchema('col1, 'col3)) + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col2)), + (List(("col1_1", "col2_1"), ("col1_2", "col2_2")).withSchema('col1, 'col3)) ) - } When { - (pipes: List[Pipe]) => { - pipes(0) - .joinWithSmaller('col1 -> 'col1, pipes(1)) - .map('col1 -> 'col1_transf) { - col1: String => col1 + "_transf" + }.When { (pipes: List[Pipe]) => + pipes.head + .joinWithSmaller('col1 -> 'col1, pipes(1)) + .map('col1 -> 'col1_transf) { col1: String => + col1 + "_transf" } - .project(('col1, 'col2, 'col1_transf)) - } - } Then { - buffer: Buffer[Tuple] => { - buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) mustBe true - } + .project(('col1, 'col2, 'col1_transf)) + }.Then { buffer: Buffer[Tuple] => + buffer.forall(tuple => tuple.getString(2).endsWith("_transf")) shouldBe true } } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala new file mode 100644 index 0000000000..cd72c1023f --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/bdd/TypedApiTest.scala @@ -0,0 +1,177 @@ +package com.twitter.scalding.bdd + +import cascading.flow.FlowException +import com.twitter.scalding._ +import org.scalatest.{Matchers, WordSpec} +import scala.math._ + +import scala.collection.mutable + +case class UserWithGender(name: String, gender: String) +case class UserWithAge(name: String, age: Int) 
+case class UserInfo(name: String, gender: String, age: Int) +case class EstimatedContribution(name: String, suggestedPensionContributionPerMonth: Double) + +class TypedApiTest extends WordSpec with Matchers with TBddDsl { + "A test with a single source" should { + "accept an operation from working with a single tuple-typed pipe" in { + Given { + List(("Joe", "M", 40), ("Sarah", "F", 22)) + }.When { in: TypedPipe[(String, String, Int)] => + in.map[(String, Double)] { + case (name, "M", age) => (name, (1000.0 / (72 - age))) + case (name, _, age) => (name, (1000.0 / (80 - age))) + } + }.Then { buffer: mutable.Buffer[(String, Double)] => + buffer.toList shouldBe List(("Joe", 1000.0 / 32), ("Sarah", 1000.0 / 58)) + } + } + + "accept an operation from single case class-typed pipe" in { + Given { + List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + }.When { in: TypedPipe[UserInfo] => + in.map { + case UserInfo(name, "M", age) => EstimatedContribution(name, (1000.0 / (72 - age))) + case UserInfo(name, _, age) => EstimatedContribution(name, (1000.0 / (80 - age))) + } + }.Then { buffer: mutable.Buffer[EstimatedContribution] => + buffer.toList shouldBe List( + EstimatedContribution("Joe", 1000.0 / 32), + EstimatedContribution("Sarah", 1000.0 / 58) + ) + } + } + } + + "A test with a two sources" should { + + "accept an operation from two tuple-typed pipes" in { + Given { + List(("Joe", "M"), ("Sarah", "F")) + }.And { + List(("Joe", 40), ("Sarah", 22)) + }.When { (gender: TypedPipe[(String, String)], age: TypedPipe[(String, Int)]) => + gender.group + .join(age.group) + .toTypedPipe + .map { value: (String, (String, Int)) => + val (name, (gender, age)) = value + (name, gender, age) + } + }.Then { buffer: mutable.Buffer[(String, String, Int)] => + buffer.toList shouldBe List(("Joe", "M", 40), ("Sarah", "F", 22)) + } + } + + "accept an operation from two case classes-typed pipes" in { + Given { + List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")) + }.And { + 
List(UserWithAge("Joe", 40), UserWithAge("Sarah", 22)) + }.When { (gender: TypedPipe[UserWithGender], age: TypedPipe[UserWithAge]) => + gender + .groupBy(_.name) + .join(age.groupBy(_.name)) + .mapValues { value: (UserWithGender, UserWithAge) => + val (withGender, withAge) = value + UserInfo(withGender.name, withGender.gender, withAge.age) + } + .values + }.Then { buffer: mutable.Buffer[UserInfo] => + buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + } + } + } + + "A test with a list of sources" should { + "Work as if combining the sources with the And operator but requires explicit cast of the input pipes" in { + GivenSources { + List( + List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")), + List(UserWithAge("Joe", 40), UserWithAge("Sarah", 22)) + ) + }.When { pipes: List[TypedPipe[_]] => + val gender = pipes(0).asInstanceOf[TypedPipe[UserWithGender]] // linter:ignore + val age = pipes(1).asInstanceOf[TypedPipe[UserWithAge]] // linter:ignore + + gender + .groupBy(_.name) + .join(age.groupBy(_.name)) + .mapValues { value: (UserWithGender, UserWithAge) => + val (withGender, withAge) = value + UserInfo(withGender.name, withGender.gender, withAge.age) + } + .values + }.Then { buffer: mutable.Buffer[UserInfo] => + buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + } + } + + "not checking the types of the sources and fail if any error occurs" in { + an[FlowException] should be thrownBy { + GivenSources { + List( + List(UserWithGender("Joe", "M"), UserWithGender("Sarah", "F")), + List(("Joe", 40), ("Sarah", 22)) + ) + }.When { pipes: List[TypedPipe[_]] => + val gender = pipes(0).asInstanceOf[TypedPipe[UserWithGender]] // linter:ignore + val age = pipes(1).asInstanceOf[TypedPipe[UserWithAge]] // linter:ignore + + gender + .groupBy(_.name) + .join(age.groupBy(_.name)) + .mapValues { value: (UserWithGender, UserWithAge) => + val (withGender, withAge) = value + UserInfo(withGender.name, 
withGender.gender, withAge.age) + } + .values + }.Then { buffer: mutable.Buffer[UserInfo] => + buffer.toList shouldBe List(UserInfo("Joe", "M", 40), UserInfo("Sarah", "F", 22)) + } + } + } + + "be created when adding a source to four sources" in { + Given { + List(("Joe", "user1"), ("Sarah", "user2")) + }.And { + List(("user1", "M"), ("user2", "F")) + }.And { + List(("user1", 40), ("user2", 22)) + }.And { + List(("user1", 1000L), ("user2", 800L)) + }.And { + List(("user1", true), ("user2", false)) + }.When { pipes: List[TypedPipe[_]] => + val withUserID = pipes(0).asInstanceOf[TypedPipe[(String, String)]] // linter:ignore + val withGender = pipes(1).asInstanceOf[TypedPipe[(String, String)]] + val withAge = pipes(2).asInstanceOf[TypedPipe[(String, Int)]] + val withIncome = pipes(3).asInstanceOf[TypedPipe[(String, Long)]] + val withSmoker = pipes(4).asInstanceOf[TypedPipe[(String, Boolean)]] + + withUserID.swap.group + .join(withGender.group) + .join(withAge.group) + .join(withIncome.group) + .join(withSmoker.group) + .flatMapValues { + case ((((name: String, gender: String), age: Int), income: Long), smoker) => + val lifeExpectancy = (gender, smoker) match { + case ("M", true) => 68 + case ("M", false) => 72 + case (_, true) => 76 + case (_, false) => 80 + } + + Some(EstimatedContribution(name, floor(income / (lifeExpectancy - age)))) + case _ => None + } + .values + }.Then { buffer: mutable.Buffer[EstimatedContribution] => + buffer.toList shouldBe List(EstimatedContribution("Joe", 35.0), EstimatedContribution("Sarah", 13.0)) + } + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala new file mode 100644 index 0000000000..0b8ce6e6ec --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorStepStrategyTest.scala @@ -0,0 +1,37 @@ +package 
com.twitter.scalding.estimation.memory + +import org.apache.hadoop.mapred.JobConf +import org.scalatest.{Matchers, WordSpec} + +class MemoryEstimatorStepStrategyTest extends WordSpec with Matchers { + "A Memory estimator step strategy" should { + "set xmx settings correctly" in { + val conf = confWith("test.opts", "-Xmx3500m -Djava.net.preferIPv4Stack=true -Xms34m") + + MemoryEstimatorStepStrategy.setXmxMemory("test.opts", 1024, conf) + + conf.get("test.opts") shouldBe "-Djava.net.preferIPv4Stack=true -Xmx1024m" + } + + "set xmx settings correctly with empty original config" in { + val conf = confWith(Map.empty) + + MemoryEstimatorStepStrategy.setXmxMemory("test.opts", 1024, conf) + + conf.get("test.opts") shouldBe " -Xmx1024m" + } + } + + def confWith(key: String, value: String): JobConf = + confWith(Map(key -> value)) + + def confWith(values: Map[String, String]): JobConf = { + val conf = new JobConf(false) + + values.foreach { case (k, v) => + conf.set(k, v) + } + + conf + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala new file mode 100644 index 0000000000..ac73f9878a --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/estimation/memory/SmoothedHistoryMemoryEstimatorTest.scala @@ -0,0 +1,173 @@ +package com.twitter.scalding.estimation.memory + +import cascading.flow.FlowStep +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryService, Task} +import org.apache.hadoop.mapred.JobConf +import org.mockito.Mockito._ +import org.mockito.Matchers._ +import org.scalatest.{Matchers, WordSpec} +import scala.util.{Success, Try} + +class SmoothedHistoryMemoryEstimatorTest extends WordSpec with Matchers { + import Utils._ + + "A memory history estimator" should { + "return None without history" in { + 
SmoothedMemoryEstimator.empty.estimate(TestFlowStrategyInfo.dummy) shouldBe None + } + + "estimate correct numbers for only reducers" in { + val estimation = SmoothedMemoryEstimator + .makeHistory(Seq("REDUCE" -> 1024.megabytes)) + .estimate(TestFlowStrategyInfo.dummy) + + estimation shouldBe reduceEstimate((1228, 1536)) + } + + "estimate correct numbers for only mappers" in { + val estimation = SmoothedMemoryEstimator + .makeHistory(Seq("MAP" -> 1024.megabytes)) + .estimate(TestFlowStrategyInfo.dummy) + + estimation shouldBe mapEstimate((1228, 1536)) + } + + "estimate correct numbers" in { + val estimation = SmoothedMemoryEstimator + .makeHistory( + Seq( + "MAP" -> 800.megabytes, + "REDUCE" -> 800.megabytes, + "MAP" -> 1024.megabytes, + "REDUCE" -> 1024.megabytes, + "MAP" -> 1300.megabytes, + "REDUCE" -> 1300.megabytes, + "MAP" -> 723.megabytes, + "REDUCE" -> 723.megabytes + ) + ) + .estimate(TestFlowStrategyInfo.dummy) + + estimation shouldBe Some(MemoryEstimate(Some((1228, 1536)), Some((1228, 1536)))) + } + + "estimate less than max cap" in { + val conf = TestFlowStrategyInfo.dummy.step.getConfig + val estimation = SmoothedMemoryEstimator + .makeHistory(Seq("MAP" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte))) + .estimate(TestFlowStrategyInfo.dummy) + + val expectedEstimation = ( + (MemoryEstimatorConfig.getMaxContainerMemory(conf) / MemoryEstimatorConfig.getXmxScaleFactor( + conf + )).toLong, + MemoryEstimatorConfig.getMaxContainerMemory(conf) + ) + + estimation shouldBe mapEstimate(expectedEstimation) + } + + "estimate not less than min cap" in { + val conf = TestFlowStrategyInfo.dummy.step.getConfig + val estimation = SmoothedMemoryEstimator + .makeHistory( + Seq("MAP" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte)) + ) + .estimate(TestFlowStrategyInfo.dummy) + + val expectedEstimation = ( + (MemoryEstimatorConfig.getMinContainerMemory(conf) / MemoryEstimatorConfig.getXmxScaleFactor( + conf + 
)).toLong, + MemoryEstimatorConfig.getMinContainerMemory(conf) + ) + + estimation shouldBe mapEstimate(expectedEstimation) + } + } +} + +object EmptyHistoryService extends HistoryService { + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + Success(Seq.empty) +} + +class DummyHistoryService(val history: Seq[(String, Long)]) extends HistoryService { + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + Success(history.map { case (taskType, memory) => + val task = Task( + details = Map(Task.TaskType -> taskType), + counters = Map(SmoothedHistoryMemoryEstimator.CommittedHeapBytes -> memory) + ) + val tasks = Seq(task) + FlowStepHistory( + keys = null, + submitTimeMillis = 0, + launchTimeMillis = 0L, + finishTimeMillis = 0L, + totalMaps = 0L, + totalReduces = 0L, + finishedMaps = 0L, + finishedReduces = 0L, + failedMaps = 0L, + failedReduces = 0L, + mapFileBytesRead = 0L, + mapFileBytesWritten = 0L, + mapOutputBytes = 0L, + reduceFileBytesRead = 0L, + hdfsBytesRead = 0L, + hdfsBytesWritten = 0L, + mapperTimeMillis = 0L, + reducerTimeMillis = 0L, + reduceShuffleBytes = 0L, + cost = 1.1, + tasks = tasks + ) + }) +} + +class SmoothedMemoryEstimator(override val historyService: HistoryService) + extends SmoothedHistoryMemoryEstimator + +object SmoothedMemoryEstimator { + def empty: SmoothedMemoryEstimator = new SmoothedMemoryEstimator(EmptyHistoryService) + + def makeHistory(history: Seq[(String, Long)]): SmoothedMemoryEstimator = + new SmoothedMemoryEstimator(new DummyHistoryService(history)) +} + +object TestFlowStrategyInfo { + def dummy: FlowStrategyInfo = { + val mockedConf = spy(new JobConf()) + val mockedStep = mock(classOf[FlowStep[JobConf]]) + val mockedInfo = mock(classOf[FlowStrategyInfo]) + + when(mockedConf.get(anyString())).thenReturn(null) + when(mockedStep.getConfig).thenReturn(mockedConf) + when(mockedInfo.step).thenReturn(mockedStep) + + mockedInfo + } +} + 
+object Utils { + implicit class StorageUnit(val wrapped: Long) extends AnyVal { + def fromMegabytes(megabytes: Long): Long = megabytes * 1024 * 1024 + def fromGigabytes(gigabytes: Long): Long = gigabytes * 1024 * 1024 * 1024 + + def megabyte: Long = megabytes + def megabytes: Long = fromMegabytes(wrapped) + def gigabyte: Long = gigabytes + def gigabytes: Long = fromGigabytes(wrapped) + + def inMegabytes: Long = wrapped / (1024L * 1024) + } + + implicit def doubleToLong(value: Double): StorageUnit = new StorageUnit(value.toLong) + + def mapEstimate(value: (Long, Long)): Some[MemoryEstimate] = + Some(MemoryEstimate(mapMemoryInMB = Some(value), None)) + + def reduceEstimate(value: (Long, Long)): Some[MemoryEstimate] = + Some(MemoryEstimate(None, reduceMemoryInMB = Some(value))) +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrixTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrixTest.scala deleted file mode 100644 index 4ce154b041..0000000000 --- a/scalding-core/src/test/scala/com/twitter/scalding/examples/WeightedPageRankFromMatrixTest.scala +++ /dev/null @@ -1,167 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ -package com.twitter.scalding.examples - -import scala.collection._ - -import org.specs._ - -import com.twitter.scalding._ -import com.twitter.scalding.Dsl._ - -import WeightedPageRankFromMatrixSpec._ - -class WeightedPageRankFromMatrixSpec extends Specification { - - "Weighted PageRank from Matrix job" should { - - // 0.0 0.0 0.0 0.0 1.0 - // 0.5 0.0 0.0 0.0 0.0 - // 0.5 0.0 0.0 0.0 0.0 - // 0.0 1.0 0.5 0.0 0.0 - // 0.0 0.0 0.5 1.0 0.0 - val edges = List( - (0, 4, 1.0), - (1, 0, 0.5), - (2, 0, 0.5), - (3, 1, 1.0), - (3, 2, 0.5), - (4, 2, 0.5), - (4, 3, 1.0)) - - val d = 0.4d // damping factor - val n = 5 // number of nodes - val onesVector = filledColumnVector(1d, n) - val iterationZeroVector = filledColumnVector(1d / n, n) - - val expectedSolution = Array(0.28, 0.173333, 0.173333, 0.173333, 0.2) - - JobTest("com.twitter.scalding.examples.WeightedPageRankFromMatrix"). - arg("d", d.toString). - arg("n", n.toString). - arg("convergenceThreshold", "0.0001"). - arg("maxIterations", "1"). - arg("currentIteration", "0"). - arg("rootDir", "root"). - source(TypedTsv[(Int, Int, Double)]("root/edges"), edges). - source(TypedTsv[(Int, Double)]("root/onesVector"), onesVector). - source(TypedTsv[(Int, Double)]("root/iterations/0"), iterationZeroVector). - sink[(Int, Int, Double)](Tsv("root/constants/M_hat")) { outputBuffer => - outputBuffer.size must be (7) - val outputMap = toSparseMap(outputBuffer) - outputMap((0 -> 1)) must beCloseTo (0.4, 0) - outputMap((0 -> 2)) must beCloseTo (0.4, 0) - outputMap((1 -> 3)) must beCloseTo (0.26666, 0.00001) - outputMap((2 -> 3)) must beCloseTo (0.13333, 0.00001) - outputMap((2 -> 4)) must beCloseTo (0.13333, 0.00001) - outputMap((3 -> 4)) must beCloseTo (0.26666, 0.00001) - outputMap((4 -> 0)) must beCloseTo (0.4, 0) - }. 
- sink[(Int, Double)](Tsv("root/constants/priorVector")) { outputBuffer => - outputBuffer.size must be (5) - val expectedValue = ((1 - d) / 2) * d - assertVectorsEqual( - new Array[Double](5).map { v => expectedValue }, - outputBuffer.map(_._2).toArray) - }. - sink[(Int, Double)](Tsv("root/iterations/1")) { outputBuffer => - outputBuffer.size must be (5) - assertVectorsEqual( - expectedSolution, - outputBuffer.map(_._2).toArray, - 0.00001) - }. - sink[Double](TypedTsv[Double]("root/diff")) { outputBuffer => - outputBuffer.size must be (1) - - val expectedDiff = - expectedSolution.zip(iterationZeroVector.map(_._2)). - map { case (a, b) => math.abs(a - b) }. - sum - outputBuffer.head must beCloseTo (expectedDiff, 0.00001) - }. - run. - finish - } - - private def assertVectorsEqual(expected: Array[Double], actual: Array[Double], variance: Double) { - actual.zipWithIndex.foreach { case (value, i) => - value must beCloseTo (expected(i), variance) - } - } - - private def assertVectorsEqual(expected: Array[Double], actual: Array[Double]) { - actual.zipWithIndex.foreach { case (value, i) => - value must beCloseTo (expected(i), 0) - } - } -} - -object WeightedPageRankFromMatrixSpec { - - def toSparseMap[Row, Col, V](iterable: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = - iterable.map { entry => ((entry._1, entry._2), entry._3) }.toMap - - def filledColumnVector(value: Double, size: Int): List[(Int, Double)] = { - val vector = mutable.ListBuffer[(Int, Double)]() - (0 until size).foreach { row => - vector += new Tuple2(row, value) - } - - vector.toList - } -} - -/** - * Octave/Matlab implementations to provide the expected ranks. 
This comes from - * the Wikipedia page on PageRank: - * http://en.wikipedia.org/wiki/PageRank#Computation - -function [v] = iterate(A, sv, d) - -N = size(A, 2) -M = (spdiags(1 ./ sum(A, 2), 0, N, N) * A)'; -v = (d * M * sv) + (((1 - d) / N) .* ones(N, 1)); - -endfunction - -iterate([0 0 0 0 1; 0.5 0 0 0 0; 0.5 0 0 0 0; 0 1 0.5 0 0; 0 0 0.5 1 0], [0.2; 0.2; 0.2; 0.2; 0.2], 0.4) - -% Parameter M adjacency matrix where M_i,j represents the link from 'j' to 'i', such that for all 'j' sum(i, M_i,j) = 1 -% Parameter d damping factor -% Parameter v_quadratic_error quadratic error for v -% Return v, a vector of ranks such that v_i is the i-th rank from [0, 1] - -function [v] = rank(M, d, v_quadratic_error) - -N = size(M, 2); % N is equal to half the size of M -v = rand(N, 1); -v = v ./ norm(v, 2); -last_v = ones(N, 1) * inf; -M_hat = (d .* M) + (((1 - d) / N) .* ones(N, N)); - -while(norm(v - last_v, 2) > v_quadratic_error) - last_v = v; - v = M_hat * v; - v = v ./ norm(v, 2); -end - -endfunction - -M = [0 0 0 0 1 ; 0.5 0 0 0 0 ; 0.5 0 0 0 0 ; 0 1 0.5 0 0 ; 0 0 0.5 1 0]; -rank(M, 0.4, 0.001) - -*/ diff --git a/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala b/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala index c2bbca5cef..2db8e0c6cc 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/filecache/DistributedCacheFileSpec.scala @@ -12,22 +12,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.filecache import cascading.tuple.Tuple import com.twitter.scalding._ -import java.io.File import java.net.URI -import org.apache.hadoop.conf.Configuration -import org.specs.Specification -import org.specs.mock.Mockito +import org.scalatest.{Matchers, WordSpec} import scala.collection.mutable - -class DistributedCacheFileSpec extends Specification with Mockito { +// TODO: fix? is it worth having the dep on mockito just for this? +class DistributedCacheFileSpec extends WordSpec with Matchers { case class UnknownMode(buffers: Map[Source, mutable.Buffer[Tuple]]) extends TestMode with CascadingLocal - + /* val conf = smartMock[Configuration] lazy val hdfsMode = { @@ -45,18 +42,19 @@ class DistributedCacheFileSpec extends Specification with Mockito { lazy val testMode = smartMock[Test] lazy val localMode = smartMock[Local] - + */ val uriString = "hdfs://foo.example:1234/path/to/the/stuff/thefilename.blah" val uri = new URI(uriString) val hashHex = URIHasher(uri) - val hashedFilename = "thefilename.blah-" + hashHex + val hashedFilename = hashHex + "-thefilename.blah" "DistributedCacheFile" should { "symlinkNameFor must return a hashed name" in { - DistributedCacheFile.symlinkNameFor(uri) must_== hashedFilename + DistributedCacheFile.symlinkNameFor(uri) shouldBe hashedFilename } } + /* "UncachedFile.add" should { val dcf = new UncachedFile(Right(uri)) @@ -64,8 +62,8 @@ class DistributedCacheFileSpec extends Specification with Mockito { "use the local file path" in { val cf = dcf.add()(mode) - cf.path must_== uri.getPath - cf.file must_== new File(uri.getPath).getCanonicalFile + cf.path shouldBe (uri.getPath) + cf.file shouldBe (new File(uri.getPath).getCanonicalFile) } } @@ -79,7 +77,8 @@ class DistributedCacheFileSpec extends Specification with Mockito { "throw RuntimeException when the current mode isn't recognized" in { val mode = smartMock[UnknownMode] - dcf.add()(mode) must throwA[RuntimeException] + an[RuntimeException] should be 
thrownBy (dcf.add()(mode)) } } + */ } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/macros/MacroDepHygiene.scala b/scalding-core/src/test/scala/com/twitter/scalding/macros/MacroDepHygiene.scala new file mode 100644 index 0000000000..1343732509 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/macros/MacroDepHygiene.scala @@ -0,0 +1,69 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros + +import org.scalatest.WordSpec + +/** + * This test is intended to ensure that the macros do not require any imported code in scope. This is why all + * references are via absolute paths. 
+ */ +class MacroDepHygiene extends WordSpec { + + case class A(x: Int, y: String) + case class B(x: A, y: String, z: A) + class C + + def isMg(a: Any) = a.isInstanceOf[com.twitter.bijection.macros.MacroGenerated] + + "TupleSetter macro" should { + def isTupleSetterAvailable[T](implicit proof: com.twitter.scalding.TupleSetter[T]) = isMg(proof) + + "work fine without any imports" in { + com.twitter.scalding.macros.Macros.caseClassTupleSetter[A] + com.twitter.scalding.macros.Macros.caseClassTupleSetter[B] + } + + "implicitly work fine without any imports" in { + import com.twitter.scalding.macros.MacroImplicits.materializeCaseClassTupleSetter + assert(isTupleSetterAvailable[A]) + assert(isTupleSetterAvailable[B]) + } + + "fail if not a case class" in { + assert(!isTupleSetterAvailable[C]) + } + } + + "TupleConverter macro" should { + def isTupleConverterAvailable[T](implicit proof: com.twitter.scalding.TupleConverter[T]) = isMg(proof) + + "work fine without any imports" in { + com.twitter.scalding.macros.Macros.caseClassTupleConverter[A] + com.twitter.scalding.macros.Macros.caseClassTupleConverter[B] + } + + "implicitly work fine without any imports" in { + import com.twitter.scalding.macros.MacroImplicits.materializeCaseClassTupleConverter + assert(isTupleConverterAvailable[A]) + assert(isTupleConverterAvailable[B]) + } + + "fail if not a case class" in { + assert(!isTupleConverterAvailable[C]) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala b/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala new file mode 100644 index 0000000000..2054f1bc97 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/macros/MacrosUnitTests.scala @@ -0,0 +1,317 @@ +/* + Copyright 2014 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.macros + +import cascading.tuple.{Tuple => CTuple, TupleEntry} +import com.twitter.bijection.macros.{IsCaseClass, MacroGenerated} +import com.twitter.scalding._ +import com.twitter.scalding.serialization.Externalizer +import org.scalacheck.Arbitrary +import org.scalacheck.Prop +import org.scalacheck.Prop.forAll +import org.scalacheck.Properties +import org.scalatest.{Matchers, WordSpec} +import scala.reflect.runtime.universe._ + +// We avoid nesting these just to avoid any complications in the serialization test +case class SampleClassA(x: Int, y: String) +case class SampleClassB(a1: SampleClassA, a2: SampleClassA, y: String) +case class SampleClassC(a: SampleClassA, b: SampleClassB, c: SampleClassA, d: SampleClassB, e: SampleClassB) +case class SampleClassD(a: Option[SampleClassC]) +case class SampleClassE(a: String, b: Boolean, c: Short, d: Int, e: Long, f: Float, g: Double) +case class SampleClassF(a: Option[Int]) +case class SampleClassG(a: java.util.Date) + +case class SampleClassFail(a: Option[Option[Int]]) // linter:ignore + +object MacroProperties extends Properties("TypeDescriptor.roundTrip") { + def roundTrip[T: Arbitrary: TypeDescriptor]: Prop = forAll { t: T => + val setter = implicitly[TypeDescriptor[T]].setter + val converter = implicitly[TypeDescriptor[T]].converter + val fields = implicitly[TypeDescriptor[T]].fields + converter(new TupleEntry(fields, setter(t))) == t + } + + def propertyFor[T: TypeTag: Arbitrary: TypeDescriptor]: Unit = + property(typeTag[T].tpe.toString) = roundTrip[T] + + propertyFor[Int] + 
propertyFor[Option[Int]] + propertyFor[Option[(Int, String, Option[Long])]] + propertyFor[Option[(Option[Boolean], Int, String, Option[Long])]] + propertyFor[(Int, Double, String, Option[(String, Int, Option[Long])])] +} + +class MacrosUnitTests extends WordSpec with Matchers { + import MacroImplicits._ + def isMg[T](t: T): T = { + t shouldBe a[MacroGenerated] + t + } + + private val dummy = new TupleConverter[Nothing] { + def apply(te: TupleEntry) = sys.error("dummy") + override val arity = 1 + } + + private val dummy2 = new TypeDescriptor[Nothing] { + def setter = sys.error("dummy") + def converter = sys.error("dummy") + def fields = sys.error("dummy") + } + + def isMacroTupleConverterAvailable[T](implicit + proof: TupleConverter[T] = dummy.asInstanceOf[TupleConverter[T]] + ) = + proof.isInstanceOf[MacroGenerated] + + def isMacroTypeDescriptorAvailable[T](implicit + proof: TypeDescriptor[T] = dummy2.asInstanceOf[TypeDescriptor[T]] + ) = + proof.isInstanceOf[MacroGenerated] + + def mgConv[T](te: TupleEntry)(implicit conv: TupleConverter[T]): T = isMg(conv)(te) + def mgSet[T](t: T)(implicit set: TupleSetter[T]): TupleEntry = new TupleEntry(isMg(set)(t)) + + def shouldRoundTrip[T: IsCaseClass: TupleSetter: TupleConverter](t: T): Unit = + t shouldBe mgConv(mgSet(t)) + + def shouldRoundTripOther[T: IsCaseClass: TupleSetter: TupleConverter](te: TupleEntry, t: T): Unit = { + val inter = mgConv(te) + inter shouldBe t + mgSet(inter) shouldBe te + } + + def canExternalize(t: AnyRef): Unit = + Externalizer(t).javaWorks shouldBe true + + "MacroGenerated TupleConverter" should { + "Not compile for Option[Option[Int]]" in { + // TODO figure out a way to test this does not compile. 
See: + // https://github.com/milessabin/shapeless/blob/master/core/src/main/scala/shapeless/test/typechecking.scala + // uncommenting fails to compile, but we want to be more sure + // Macros.caseClassTupleConverter[Option[Option[Int]]] + // Macros.caseClassTupleConverter[Option[String]] + } + } + + "MacroGenerated TupleSetter" should { + + "Generate the setter SampleClassA" in { Macros.caseClassTupleSetter[SampleClassA] } + "Generate the setter SampleClassB" in { Macros.caseClassTupleSetter[SampleClassB] } + "Generate the setter SampleClassC" in { Macros.caseClassTupleSetter[SampleClassC] } + "Generate the setter SampleClassD" in { Macros.caseClassTupleSetter[SampleClassD] } + "Generate the setter SampleClassE" in { Macros.caseClassTupleSetter[SampleClassE] } + "Generate the setter SampleClassF" in { Macros.caseClassTupleSetter[SampleClassF] } + "Generate the setter SampleClassG" in { Macros.caseClassTupleSetterWithUnknown[SampleClassG] } + + def doesJavaWork[T](implicit set: TupleSetter[T]): Unit = + canExternalize(isMg(set)) + "be serializable for case class A" in { doesJavaWork[SampleClassA] } + "be serializable for case class B" in { doesJavaWork[SampleClassB] } + "be serializable for case class C" in { doesJavaWork[SampleClassC] } + "be serializable for case class D" in { doesJavaWork[SampleClassD] } + "be serializable for case class E" in { doesJavaWork[SampleClassE] } + "be serializable for case class F" in { doesJavaWork[SampleClassF] } + } + + "MacroGenerated TupleConverter" should { + "Generate the converter SampleClassA" in { Macros.caseClassTupleConverter[SampleClassA] } + "Generate the converter SampleClassB" in { Macros.caseClassTupleConverter[SampleClassB] } + "Generate the converter SampleClassC" in { Macros.caseClassTupleConverter[SampleClassC] } + "Generate the converter SampleClassD" in { Macros.caseClassTupleConverter[SampleClassD] } + "Generate the converter SampleClassE" in { Macros.caseClassTupleConverter[SampleClassE] } + "Generate the 
converter SampleClassF" in { Macros.caseClassTupleConverter[SampleClassF] } + "Generate the converter SampleClassG" in { Macros.caseClassTupleConverterWithUnknown[SampleClassG] } + "Generate the converter Option[(Int, String)]" in { + Macros.caseClassTupleConverter[Option[(Int, String)]] + } + "Generate the converter Option[(Int, Option[(Long, String)])]" in { + Macros.caseClassTupleConverter[Option[(Int, Option[(Long, String)])]] + } + + "Not generate a convertor for SampleClassFail" in { + isMacroTupleConverterAvailable[SampleClassFail] shouldBe false + } + + def doesJavaWork[T](implicit conv: TupleConverter[T]): Unit = + canExternalize(isMg(conv)) + "be serializable for case class A" in { doesJavaWork[SampleClassA] } + "be serializable for case class B" in { doesJavaWork[SampleClassB] } + "be serializable for case class C" in { doesJavaWork[SampleClassC] } + "be serializable for case class D" in { doesJavaWork[SampleClassD] } + "be serializable for case class E" in { doesJavaWork[SampleClassE] } + "be serializable for case class F" in { doesJavaWork[SampleClassF] } + } + + "MacroGenerated TypeDescriptor" should { + "Generate the converter SampleClassA" in { Macros.caseClassTypeDescriptor[SampleClassA] } + "Generate the converter SampleClassB" in { Macros.caseClassTypeDescriptor[SampleClassB] } + "Generate the converter SampleClassC" in { Macros.caseClassTypeDescriptor[SampleClassC] } + "Generate the converter SampleClassD" in { Macros.caseClassTypeDescriptor[SampleClassD] } + "Generate the converter SampleClassE" in { Macros.caseClassTypeDescriptor[SampleClassE] } + "Generate the converter SampleClassF" in { Macros.caseClassTypeDescriptor[SampleClassF] } + "Generate the converter SampleClassG" in { Macros.caseClassTypeDescriptorWithUnknown[SampleClassG] } + + "Not generate a convertor for SampleClassFail" in { + isMacroTypeDescriptorAvailable[SampleClassFail] shouldBe false + } + + def doesJavaWork[T](implicit conv: TypeDescriptor[T]): Unit = + 
canExternalize(isMg(conv)) + "be serializable for case class A" in { doesJavaWork[SampleClassA] } + "be serializable for case class B" in { doesJavaWork[SampleClassB] } + "be serializable for case class C" in { doesJavaWork[SampleClassC] } + "be serializable for case class D" in { doesJavaWork[SampleClassD] } + "be serializable for case class E" in { doesJavaWork[SampleClassE] } + "be serializable for case class F" in { doesJavaWork[SampleClassF] } + } + + "MacroGenerated TupleSetter and TupleConverter" should { + "round trip class -> tupleentry -> class" in { + shouldRoundTrip(SampleClassA(100, "onehundred")) + shouldRoundTrip(SampleClassB(SampleClassA(100, "onehundred"), SampleClassA(-1, "zero"), "what")) + val a = SampleClassA(73, "hrmA1") + val b = SampleClassB(a, a, "hrmB1") + val c = SampleClassC( + a, + b, + SampleClassA(123980, "heyA2"), + SampleClassB(a, SampleClassA(-1, "zeroA3"), "zooB2"), + b + ) + shouldRoundTrip(b) + shouldRoundTrip(c) + shouldRoundTrip(SampleClassD(Some(c))) + shouldRoundTrip(SampleClassD(None)) + + implicit val tupSetterG = Macros.caseClassTupleSetterWithUnknown[SampleClassG] + implicit val tupConverterG = Macros.caseClassTupleConverterWithUnknown[SampleClassG] + shouldRoundTrip(SampleClassG(new java.util.Date(123412L))) + } + + "Case Class should form expected tuple" in { + val input = SampleClassC( + SampleClassA(1, "asdf"), + SampleClassB(SampleClassA(2, "bcdf"), SampleClassA(5, "jkfs"), "wetew"), + SampleClassA(9, "xcmv"), + SampleClassB(SampleClassA(23, "ck"), SampleClassA(13, "dafk"), "xcv"), + SampleClassB(SampleClassA(34, "were"), SampleClassA(654, "power"), "adsfmx") + ) + val setter = implicitly[TupleSetter[SampleClassC]] + val tup = setter(input) + assert(tup.size == 19) + assert(tup.getInteger(0) === 1) + assert(tup.getString(18) === "adsfmx") + } + + "round trip tupleentry -> class -> tupleEntry" in { + val a_tup = CTuple.size(2) + a_tup.setInteger(0, 100) + a_tup.setString(1, "onehundred") + val a_te = new 
TupleEntry(a_tup) + val a = SampleClassA(100, "onehundred") + shouldRoundTripOther(a_te, a) + + val b_tup = CTuple.size(5) + b_tup.setInteger(0, 100) + b_tup.setString(1, "onehundred") + b_tup.setInteger(2, 100) + b_tup.setString(3, "onehundred") + b_tup.setString(4, "what") + val b_te = new TupleEntry(b_tup) + val b = SampleClassB(a, a, "what") + shouldRoundTripOther(b_te, b) + + val c_tup = CTuple.size(19) + c_tup.setInteger(0, 100) + c_tup.setString(1, "onehundred") + + c_tup.setInteger(2, 100) + c_tup.setString(3, "onehundred") + c_tup.setInteger(4, 100) + c_tup.setString(5, "onehundred") + c_tup.setString(6, "what") + + c_tup.setInteger(7, 100) + c_tup.setString(8, "onehundred") + + c_tup.setInteger(9, 100) + c_tup.setString(10, "onehundred") + c_tup.setInteger(11, 100) + c_tup.setString(12, "onehundred") + c_tup.setString(13, "what") + + c_tup.setInteger(14, 100) + c_tup.setString(15, "onehundred") + c_tup.setInteger(16, 100) + c_tup.setString(17, "onehundred") + c_tup.setString(18, "what") + + val c_te = new TupleEntry(c_tup) + val c = SampleClassC(a, b, a, b, b) + shouldRoundTripOther(c_te, c) + } + + "Case Class should form expected Fields" in { + val fields = Macros.toFields[SampleClassB] + assert(fields.size === 5) + assert( + fields.getTypes === Array[java.lang.reflect.Type]( + classOf[Int], + classOf[String], + classOf[Int], + classOf[String], + classOf[String] + ) + ) + val names = List("a1.x", "a1.y", "a2.x", "a2.y", "y") + names.zipWithIndex.foreach { case (name, indx) => + assert(fields.get(indx) === name) + } + } + + "Case Class should form expected Fields with Options" in { + val fields = Macros.toFields[SampleClassD] + assert(fields.size === 19) + assert(fields.getTypes === Array.fill[java.lang.reflect.Type](19)(classOf[java.lang.Object])) + } + + "Case Class should form expected Fields with Unknown types" in { + val fields = Macros.toFieldsWithUnknown[SampleClassG] + assert(fields.size === 1) + assert(fields.getTypes === 
Array[java.lang.reflect.Type](classOf[java.util.Date])) + } + + "Case Class should form expected Indexed Fields" in { + val fields = Macros.toIndexedFields[SampleClassB] + assert(fields.size === 5) + assert( + fields.getTypes === Array[java.lang.reflect.Type]( + classOf[Int], + classOf[String], + classOf[Int], + classOf[String], + classOf[String] + ) + ) + val names = 0 until fields.size + names.zipWithIndex.foreach { case (name, indx) => + assert(fields.get(indx) === name) + } + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala index d45d992b78..9a9f179971 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/CombinatoricsTest.scala @@ -12,61 +12,55 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ -class CombinatoricsJob(args : Args) extends Job(args) { +class CombinatoricsJob(args: Args) extends Job(args) { val C = Combinatorics - C.permutations( 10,3 ).write(Tsv("perms.txt")) + C.permutations(10, 3).write(Tsv("perms.txt")) - C.combinations( 5,2 ).write(Tsv("combs.txt")) + C.combinations(5, 2).write(Tsv("combs.txt")) // how many ways can you invest $10000 in KR,ABT,DLTR,MNST ? 
val cash = 1000.0 val error = 1.0 // max error $1, so its ok if we cannot invest the last dollar - val (kr,abt,dltr,mnst) = (27.0,64.0,41.0,52.0) // share prices - val stocks = IndexedSeq( kr,abt,dltr,mnst) + val (kr, abt, dltr, mnst) = (27.0, 64.0, 41.0, 52.0) // share prices + val stocks = IndexedSeq(kr, abt, dltr, mnst) - - C.weightedSum( stocks, cash,error).write( Tsv("invest.txt")) - C.positiveWeightedSum( stocks, cash,error).write( Tsv("investpos.txt")) + C.weightedSum(stocks, cash, error).write(Tsv("invest.txt")) + C.positiveWeightedSum(stocks, cash, error).write(Tsv("investpos.txt")) } -class CombinatoricsJobTest extends Specification { - noDetailedDiffs() - import Dsl._ +class CombinatoricsJobTest extends WordSpec with Matchers { "A Combinatorics Job" should { - JobTest( new CombinatoricsJob(_)) - .sink[(Int,Int)](Tsv("perms.txt")) { pbuf => - val psize = pbuf.toList.size + JobTest(new CombinatoricsJob(_)) + .sink[(Int, Int)](Tsv("perms.txt")) { pbuf => "correctly compute 10 permute 3 equals 720" in { - psize must be_==(720) + pbuf.toList should have size 720 } } - .sink[(Int,Int)](Tsv("combs.txt")) { buf => + .sink[(Int, Int)](Tsv("combs.txt")) { buf => val csize = buf.toList.size "correctly compute 5 choose 2 equals 10" in { - csize must be_==(10) + buf.toList should have size 10 } } - .sink[(Int,Int,Int,Int)](Tsv("invest.txt")) { buf => - val isize = buf.toList.size + .sink[(Int, Int, Int, Int)](Tsv("invest.txt")) { buf => "correctly compute 169 tuples that allow you to invest $1000 among the 4 given stocks" in { - isize must be_==(169) + buf.toList should have size 169 } } - .sink[(Int,Int,Int,Int)](Tsv("investpos.txt")) { buf => - val ipsize = buf.toList.size + .sink[(Int, Int, Int, Int)](Tsv("investpos.txt")) { buf => "correctly compute 101 non-zero tuples that allow you to invest $1000 among the 4 given stocks" in { - ipsize must be_==(101) + buf.toList should have size 101 } } .run - .finish + .finish() } } diff --git 
a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala index d7e7812d79..35d7495fd8 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/HistogramTest.scala @@ -12,63 +12,64 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ -class HistogramJob(args : Args) extends Job(args) { +class HistogramJob(args: Args) extends Job(args) { try { val hist = Tsv("input", 'n) - .groupAll{ _.histogram('n -> 'hist) } + .groupAll(_.histogram('n -> 'hist)) hist - .flatMapTo('hist -> ('bin, 'cdf)){h : Histogram => h.cdf} + .flatMapTo('hist -> ('bin, 'cdf)) { h: Histogram => h.cdf } .write(Tsv("cdf-output")) hist - .mapTo('hist -> ('min, 'max, 'sum, 'mean, 'stdDev)){h : Histogram => (h.min, h.max, h.sum, h.mean, h.stdDev)} + .mapTo('hist -> ('min, 'max, 'sum, 'mean, 'stdDev)) { h: Histogram => + (h.min, h.max, h.sum, h.mean, h.stdDev) + } .write(Tsv("stats-output")) - } catch { - case e : Exception => e.printStackTrace() - } + } catch { + case e: Exception => e.printStackTrace() + } } -class HistogramJobTest extends Specification { - noDetailedDiffs() +class HistogramJobTest extends WordSpec with Matchers { import Dsl._ val values = List(1.0, 2.5, 1.5, 3.0, 3.0, 3.0, 4.2, 2.0, 8.0, 1.0) val inputData = values.map(Tuple1(_)) val cdfOutput = Set((1.0, 0.3), (2.0, 0.5), (3.0, 0.8), (4.0, 0.9), (8.0, 1.0)) "A HistogramJob" should { - JobTest("com.twitter.scalding.mathematics.HistogramJob") - .source(Tsv("input",('n)), inputData) + JobTest(new 
HistogramJob(_)) + .source(Tsv("input", 'n), inputData) .sink[(Double, Double, Double, Double, Double)](Tsv("stats-output")) { buf => val (min, max, sum, mean, stdDev) = buf.head "correctly compute the min" in { - min must be_==(values.map(_.floor).min) + min shouldBe (values.map(_.floor).min) } "correctly compute the max" in { - max must be_==(values.map(_.floor).max) + max shouldBe (values.map(_.floor).max) } "correctly compute the sum" in { - sum must be_==(values.map(_.floor).sum) + sum shouldBe (values.map(_.floor).sum) } "correctly compute the mean" in { - mean must be_==(values.map(_.floor).sum/values.size) + mean shouldBe (values.map(_.floor).sum / values.size) } "correctly compute the stdDev" in { - stdDev must beCloseTo(1.989974874, 0.000000001) + stdDev shouldBe 1.989974874 +- 0.000000001 } } .sink[(Double, Double)](Tsv("cdf-output")) { buf => "correctly compute a CDF" in { - buf.toSet must be_==(cdfOutput) + buf.toSet shouldBe cdfOutput } } .run - .finish + .finish() } } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala index 406d7b7cea..8e11d6812a 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2OptimizationTest.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics import org.scalacheck.Arbitrary @@ -21,7 +21,7 @@ import org.scalacheck.Properties import org.scalacheck.Prop.forAll import org.scalacheck._ import org.scalacheck.Gen._ -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import com.twitter.scalding._ import Matrix2._ import cascading.flow.FlowDef @@ -29,15 +29,13 @@ import com.twitter.algebird.Ring import com.twitter.scalding.IterableSource /** - * Unit tests used in development - * (stronger properties are tested in ScalaCheck tests at the end) + * Unit tests used in development (stronger properties are tested in ScalaCheck tests at the end) */ -class Matrix2OptimizationSpec extends Specification { - import Dsl._ +class Matrix2OptimizationSpec extends WordSpec with Matchers { import com.twitter.scalding.Test - implicit val mode = Test(Map()) - implicit val fd = new FlowDef + implicit val mode: Test = Test(Map()) + implicit val fd: FlowDef = new FlowDef val globM = TypedPipe.from(IterableSource(List((1, 2, 3.0), (2, 2, 4.0)))) @@ -45,143 +43,187 @@ class Matrix2OptimizationSpec extends Specification { implicit val ord1: Ordering[Int] = Ordering.Int implicit val ord2: Ordering[(Int, Int)] = Ordering.Tuple2[Int, Int] - def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] - def product(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double], optimal: Boolean = false): Product[Any, Any, Any, Double] = Product(left, right, ring) - def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = Sum(left, right, ring) + def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = + MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] + def product( + left: Matrix2[Any, Any, Double], + right: Matrix2[Any, Any, Double] + ): 
Product[Any, Any, Any, Double] = Product(left, right, ring) + def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = + Sum(left, right, ring) /** * Values used in tests */ // ((A1(A2 A3))((A4 A5) A6) - val optimizedPlan = product( - product(literal(globM, FiniteHint(30, 35)), - product(literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)), true), true), + val optimizedPlan = product( // linter:ignore + product( + literal(globM, FiniteHint(30, 35)), + product(literal(globM, FiniteHint(35, 15)), literal(globM, FiniteHint(15, 5))) + ), product( - product(literal(globM, FiniteHint(5, 10)), - literal(globM, FiniteHint(10, 20)), true), - literal(globM, FiniteHint(20, 25)), true), true) + product(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(10, 20))), + literal(globM, FiniteHint(20, 25)) + ) + ) val optimizedPlanCost = 1850 // originally 15125.0 // A1(A2(A3(A4(A5 A6)))) - val unoptimizedPlan = product(literal(globM, FiniteHint(30, 35)), - product(literal(globM, FiniteHint(35, 15)), - product(literal(globM, FiniteHint(15, 5)), - product(literal(globM, FiniteHint(5, 10)), - product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))))))) + val unoptimizedPlan = product( + literal(globM, FiniteHint(30, 35)), // linter:ignore + product( + literal(globM, FiniteHint(35, 15)), + product( + literal(globM, FiniteHint(15, 5)), + product( + literal(globM, FiniteHint(5, 10)), + product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))) + ) + ) + ) + ) - val simplePlan = product(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25)), true) + val simplePlan = + product(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25))) // linter:ignore - val simplePlanCost = 750 //originally 26250 + val simplePlanCost = 750 // originally 26250 - val combinedUnoptimizedPlan = sum(unoptimizedPlan, simplePlan) + val combinedUnoptimizedPlan = 
sum(unoptimizedPlan, simplePlan) // linter:ignore - val combinedOptimizedPlan = sum(optimizedPlan, simplePlan) + val combinedOptimizedPlan = sum(optimizedPlan, simplePlan) // linter:ignore val combinedOptimizedPlanCost = optimizedPlanCost + simplePlanCost // A1 * (A2 * (A3 * ( A4 + A4 ) * (A5 * (A6)))) - val unoptimizedGlobalPlan = product(literal(globM, FiniteHint(30, 35)), - product(literal(globM, FiniteHint(35, 15)), - product(literal(globM, FiniteHint(15, 5)), - product(sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), - product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))))))) + val unoptimizedGlobalPlan = product( + literal(globM, FiniteHint(30, 35)), // linter:ignore + product( + literal(globM, FiniteHint(35, 15)), + product( + literal(globM, FiniteHint(15, 5)), + product( + sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), + product(literal(globM, FiniteHint(10, 20)), literal(globM, FiniteHint(20, 25))) + ) + ) + ) + ) // ((A1(A2 A3))(((A4 + A4) A5) A6) - val optimizedGlobalPlan = product( - product(literal(globM, FiniteHint(30, 35)), - product(literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)), true), true), + val optimizedGlobalPlan = product( // linter:ignore product( - product(sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), - literal(globM, FiniteHint(10, 20)), true), - literal(globM, FiniteHint(20, 25)), true), true) - - val productSequence = IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)), literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(10, 20)), - literal(globM, FiniteHint(20, 25))) - - val combinedSequence = List(IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 15)), - literal(globM, FiniteHint(15, 5)), literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(10, 20)), - literal(globM, FiniteHint(20, 25))), 
IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25)))) - - val planWithSum = product(literal(globM, FiniteHint(30, 35)), sum(literal(globM, FiniteHint(35, 25)), literal(globM, FiniteHint(35, 25))), true) - - val g = literal(globM, FiniteHint(30, 30)) - val g2 = product(g, g, true) - val g4 = product(g2, g2, true) - val optimizedGraph8 = product(g4, g4, true) - - val unoptimizedGraphVectorPlan = (g ^ (5)) * literal(globM, FiniteHint(Long.MaxValue, 1)) - - val optimizedGraphVectorPlan = product( + literal(globM, FiniteHint(30, 35)), + product(literal(globM, FiniteHint(35, 15)), literal(globM, FiniteHint(15, 5))) + ), product( - literal(globM, FiniteHint(30, 30)), - literal(globM, FiniteHint(30, 30))), + product( + sum(literal(globM, FiniteHint(5, 10)), literal(globM, FiniteHint(5, 10))), + literal(globM, FiniteHint(10, 20)) + ), + literal(globM, FiniteHint(20, 25)) + ) + ) + + val productSequence = IndexedSeq( + literal(globM, FiniteHint(30, 35)), + literal(globM, FiniteHint(35, 15)), + literal(globM, FiniteHint(15, 5)), + literal(globM, FiniteHint(5, 10)), + literal(globM, FiniteHint(10, 20)), + literal(globM, FiniteHint(20, 25)) + ) + + val combinedSequence = List( + IndexedSeq( + literal(globM, FiniteHint(30, 35)), + literal(globM, FiniteHint(35, 15)), + literal(globM, FiniteHint(15, 5)), + literal(globM, FiniteHint(5, 10)), + literal(globM, FiniteHint(10, 20)), + literal(globM, FiniteHint(20, 25)) + ), + IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25))) + ) + + val planWithSum = product( + literal(globM, FiniteHint(30, 35)), + sum(literal(globM, FiniteHint(35, 25)), literal(globM, FiniteHint(35, 25))) + ) // linter:ignore + + val g = literal(globM, FiniteHint(30, 30)) // linter:ignore + val g2 = product(g, g) // linter:ignore + val g4 = product(g2, g2) // linter:ignore + val optimizedGraph8 = product(g4, g4) // linter:ignore + + val unoptimizedGraphVectorPlan = (g ^ 5) * literal(globM, 
FiniteHint(Long.MaxValue, 1)) + + val optimizedGraphVectorPlan = product( // linter:ignore + product(literal(globM, FiniteHint(30, 30)), literal(globM, FiniteHint(30, 30))), product( literal(globM, FiniteHint(30, 30)), product( literal(globM, FiniteHint(30, 30)), - product( - literal(globM, FiniteHint(30, 30)), - literal(globM, FiniteHint(Long.MaxValue, 1)))))) + product(literal(globM, FiniteHint(30, 30)), literal(globM, FiniteHint(Long.MaxValue, 1))) + ) + ) + ) "Matrix multiplication chain optimization" should { "handle a single matrix" in { val p = IndexedSeq(literal(globM, FiniteHint(30, 35))) val result = optimizeProductChain(p, Some(ring, MatrixJoiner2.default)) - (result == (0, literal(globM, FiniteHint(30, 35)))) must beTrue + result shouldBe (0, literal(globM, FiniteHint(30, 35))) } "handle two matrices" in { val p = IndexedSeq(literal(globM, FiniteHint(30, 35)), literal(globM, FiniteHint(35, 25))) val result = optimizeProductChain(p, Some(ring, MatrixJoiner2.default)) - ((simplePlanCost, simplePlan) == result) must beTrue + (simplePlanCost, simplePlan) shouldBe result } "handle an example with 6 matrices" in { val result = optimizeProductChain(productSequence, Some(ring, MatrixJoiner2.default)) - ((optimizedPlanCost, optimizedPlan) == result) must beTrue + (optimizedPlanCost, optimizedPlan) shouldBe result } "not change an optimized plan" in { - ((optimizedPlanCost, optimizedPlan) == optimize(optimizedPlan)) must beTrue + (optimizedPlanCost, optimizedPlan) shouldBe optimize(optimizedPlan) } "change an unoptimized plan" in { - ((optimizedPlanCost, optimizedPlan) == optimize(unoptimizedPlan)) must beTrue + (optimizedPlanCost, optimizedPlan) shouldBe optimize(unoptimizedPlan) } "handle an optimized plan with sum" in { - ((combinedOptimizedPlanCost, combinedOptimizedPlan) == optimize(combinedOptimizedPlan)) must beTrue + (combinedOptimizedPlanCost, combinedOptimizedPlan) shouldBe optimize(combinedOptimizedPlan) } "handle an unoptimized plan with sum" in { - 
((combinedOptimizedPlanCost, combinedOptimizedPlan) == optimize(combinedUnoptimizedPlan)) must beTrue + (combinedOptimizedPlanCost, combinedOptimizedPlan) shouldBe (optimize(combinedUnoptimizedPlan)) } "not break A*(B+C)" in { - (planWithSum == optimize(planWithSum)._2) must beTrue + planWithSum shouldBe (optimize(planWithSum)._2) } "handle an unoptimized global plan" in { - (optimizedGlobalPlan == optimize(unoptimizedGlobalPlan)._2) must beTrue + optimizedGlobalPlan shouldBe (optimize(unoptimizedGlobalPlan)._2) } "handle an optimized global plan" in { - (optimizedGlobalPlan == optimize(optimizedGlobalPlan)._2) must beTrue + optimizedGlobalPlan shouldBe (optimize(optimizedGlobalPlan)._2) } "handle a G^5 V plan" in { - (optimizedGraphVectorPlan == optimize(unoptimizedGraphVectorPlan)._2) must beTrue + optimizedGraphVectorPlan shouldBe (optimize(unoptimizedGraphVectorPlan)._2) } "handle an optimized G^5 V plan" in { - (optimizedGraphVectorPlan == optimize(optimizedGraphVectorPlan)._2) must beTrue + optimizedGraphVectorPlan shouldBe (optimize(optimizedGraphVectorPlan)._2) } "handle a G^8 plan" in { - (optimizedGraph8 == optimize(g ^ 8)._2) must beTrue + optimizedGraph8 shouldBe (optimize(g ^ 8)._2) } } @@ -190,16 +232,21 @@ class Matrix2OptimizationSpec extends Specification { object Matrix2Props extends Properties("Matrix2") { import com.twitter.scalding.Test - implicit val mode = Test(Map()) - implicit val fd = new FlowDef + implicit val mode: Test = Test(Map()) + implicit val fd: FlowDef = new FlowDef val globM = TypedPipe.from(IterableSource(List((1, 2, 3.0), (2, 2, 4.0)))) implicit val ring: Ring[Double] = Ring.doubleRing implicit val ord1: Ordering[Int] = Ordering.Int - def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] - def product(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double], optimal: Boolean = false): 
Product[Any, Any, Any, Double] = Product(left, right, ring) - def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = Sum(left, right, ring) + def literal(tpipe: TypedPipe[(Int, Int, Double)], sizeHint: SizeHint): MatrixLiteral[Any, Any, Double] = + MatrixLiteral(tpipe, sizeHint).asInstanceOf[MatrixLiteral[Any, Any, Double]] + def product( + left: Matrix2[Any, Any, Double], + right: Matrix2[Any, Any, Double] + ): Product[Any, Any, Any, Double] = Product(left, right, ring) + def sum(left: Matrix2[Any, Any, Double], right: Matrix2[Any, Any, Double]): Sum[Any, Any, Double] = + Sum(left, right, ring) /** * Helper methods used in tests for randomized generations @@ -219,30 +266,37 @@ object Matrix2Props extends Properties("Matrix2") { } } - def productChainGen(current: Int, target: Int, prevCol: Long, result: List[MatrixLiteral[Any, Any, Double]]): List[MatrixLiteral[Any, Any, Double]] = { + def productChainGen( + current: Int, + target: Int, + prevCol: Long, + result: List[MatrixLiteral[Any, Any, Double]] + ): List[MatrixLiteral[Any, Any, Double]] = if (current == target) result else { - val (randomMatrix, cols) = genLeaf((prevCol, 0)) + val (randomMatrix, cols) = genLeaf((prevCol, 0)) // linter:ignore productChainGen(current + 1, target, cols, result ++ List(randomMatrix)) } - } - def randomProduct(p: Int): Matrix2[Any, Any, Double] = { + def randomProduct(p: Int): Matrix2[Any, Any, Double] = if (p == 1) genLeaf((0, 0))._1 else { val full = productChainGen(0, p, 0, Nil).toIndexedSeq generateRandomPlan(0, full.size - 1, full) } - } - def genNode(depth: Int) = for { + def genNode(depth: Int): Gen[Matrix2[Any, Any, Double]] = for { v <- arbitrary[Int] p <- Gen.choose(1, 10) left <- genFormula(depth + 1) right <- genFormula(depth + 1) - } yield if (depth > 5) randomProduct(p) else (if (v > 0) randomProduct(p) else Sum(left, right, ring)) + } yield if (depth > 5 || v > 0) randomProduct(p) else Sum(left, right, ring) - def 
genFormula(depth: Int): Gen[Matrix2[Any, Any, Double]] = if (depth > 5) genLeaf((0, 0))._1 else (oneOf(genNode(depth + 1), genLeaf((0, 0))._1)) + def genFormula(depth: Int): Gen[Matrix2[Any, Any, Double]] = + if (depth > 5) + genLeaf((0, 0))._1 + else + (oneOf(genNode(depth + 1), Gen.const(genLeaf((0, 0))._1))) implicit def arbT: Arbitrary[Matrix2[Any, Any, Double]] = Arbitrary(genFormula(0)) @@ -252,33 +306,37 @@ object Matrix2Props extends Properties("Matrix2") { implicit def arbSeq: Arbitrary[IndexedSeq[MatrixLiteral[Any, Any, Double]]] = Arbitrary(genProdSeq) - def generateRandomPlan(i: Int, j: Int, p: IndexedSeq[MatrixLiteral[Any, Any, Double]]): Matrix2[Any, Any, Double] = { + def generateRandomPlan( + i: Int, + j: Int, + p: IndexedSeq[MatrixLiteral[Any, Any, Double]] + ): Matrix2[Any, Any, Double] = if (i == j) p(i) else { val genK = Gen.choose(i, j - 1) val k = genK.sample.getOrElse(i) - val X = generateRandomPlan(i, k, p) - val Y = generateRandomPlan(k + 1, j, p) + val X = generateRandomPlan(i, k, p) // linter:ignore + val Y = generateRandomPlan(k + 1, j, p) // linter:ignore Product(X, Y, ring) } - } /** - * Function that recursively estimates a cost of a given MatrixFormula / plan. - * This is the used in the tests for checking whether an optimized plan has - * a cost <= a randomized plan. - * The cost estimation of this evaluation should return the same values as the one - * used in building optimized plans -- this is checked in the tests below. - * @return resulting cost + * Function that recursively estimates a cost of a given MatrixFormula / plan. This is the used in the tests + * for checking whether an optimized plan has a cost <= a randomized plan. The cost estimation of this + * evaluation should return the same values as the one used in building optimized plans -- this is checked + * in the tests below. 
+ * @return + * resulting cost */ def evaluate(mf: Matrix2[Any, Any, Double]): BigInt = { /** - * This function strips off the formula into a list of independent product chains - * (i.e. same as matrixFormulaToChains in Prototype, but has Products - * instead of IndexedSeq[Literal]) + * This function strips off the formula into a list of independent product chains (i.e. same as + * matrixFormulaToChains in Prototype, but has Products instead of IndexedSeq[Literal]) */ - def toProducts(mf: Matrix2[Any, Any, Double]): (Option[Product[Any, Any, Any, Double]], List[Product[Any, Any, Any, Double]]) = { + def toProducts( + mf: Matrix2[Any, Any, Double] + ): (Option[Product[Any, Any, Any, Double]], List[Product[Any, Any, Any, Double]]) = mf match { case element @ MatrixLiteral(_, _) => (None, Nil) case Sum(left, right, _) => { @@ -307,13 +365,14 @@ object Matrix2Props extends Properties("Matrix2") { if (lastLP.isDefined && lastRP.isDefined) { (Some(Product(lastLP.get, lastRP.get, ring)), leftR ++ rightR) } else { - val newP = if (lastLP.isDefined) List(lastLP.get) else if (lastRP.isDefined) List(lastRP.get) else Nil + val newP = + if (lastLP.isDefined) List(lastLP.get) else if (lastRP.isDefined) List(lastRP.get) else Nil (None, newP ++ leftR ++ rightR) } } + case HadamardProduct(_, _, _) => sys.error("Hadamard unexpected here") } - } /** * To create a companion tree which has respective ranges of each product @@ -322,7 +381,7 @@ object Matrix2Props extends Properties("Matrix2") { def diff: Int = range._2 - range._1 } - def labelTree(p: Matrix2[Any, Any, Double], start: Int): Option[LabeledTree] = { + def labelTree(p: Matrix2[Any, Any, Double], start: Int): Option[LabeledTree] = p match { case Product(left @ MatrixLiteral(_, _), right @ MatrixLiteral(_, _), _, _) => { Some(new LabeledTree((start, start + 1), None, None)) @@ -342,42 +401,50 @@ object Matrix2Props extends Properties("Matrix2") { } case _ => None } - } /** - * This function evaluates a product chain in the 
same way - * as the dynamic programming procedure computes cost - * (optimizeProductChain - computeCosts in Prototype) + * This function evaluates a product chain in the same way as the dynamic programming procedure computes + * cost (optimizeProductChain - computeCosts in Prototype) */ - def evaluateProduct(p: Matrix2[Any, Any, Double], labels: LabeledTree): Option[(BigInt, Matrix2[Any, Any, Double], Matrix2[Any, Any, Double])] = { + def evaluateProduct( + p: Matrix2[Any, Any, Double], + labels: LabeledTree + ): Option[(BigInt, Matrix2[Any, Any, Double], Matrix2[Any, Any, Double])] = p match { case Product(left @ MatrixLiteral(_, _), right @ MatrixLiteral(_, _), _, _) => { // reflects optimize when k==i: p(i).sizeHint * (p(k).sizeHint * p(j).sizeHint) - Some((left.sizeHint * (left.sizeHint * right.sizeHint)).total.get, - left, right) + Some((left.sizeHint * (left.sizeHint * right.sizeHint)).total.get, left, right) } case Product(left @ MatrixLiteral(_, _), right @ Product(_, _, _, _), _, _) => { - val (cost, pLeft, pRight) = evaluateProduct(right, labels.right.get).get + val (cost, pLeft, pRight) = evaluateProduct(right, labels.right.get).get // linter:ignore // reflects optimize when k==i: p(i).sizeHint * (p(k).sizeHint * p(j).sizeHint) // diff is computed in the labeled tree - it measures "spread" of the tree // diff corresponds to (k - i) or (j - k - 1) in optimize: (k - i) * computeCosts(p, i, k) + (j - k - 1) * computeCosts(p, k + 1, j) - Some(labels.right.get.diff * cost + (left.sizeHint * (left.sizeHint * pRight.sizeHint)).total.get, - left, pRight) + Some( + labels.right.get.diff * cost + (left.sizeHint * (left.sizeHint * pRight.sizeHint)).total.get, + left, + pRight + ) } case Product(left @ Product(_, _, _, _), right @ MatrixLiteral(_, _), _, _) => { - val (cost, pLeft, pRight) = evaluateProduct(left, labels.left.get).get - Some(labels.left.get.diff * cost + (pLeft.sizeHint * (pRight.sizeHint * right.sizeHint)).total.get, - pLeft, right) + val (cost, 
pLeft, pRight) = evaluateProduct(left, labels.left.get).get // linter:ignore + Some( + labels.left.get.diff * cost + (pLeft.sizeHint * (pRight.sizeHint * right.sizeHint)).total.get, + pLeft, + right + ) } case Product(left, right, _, _) => { - val (cost1, p1Left, p1Right) = evaluateProduct(left, labels.left.get).get - val (cost2, p2Left, p2Right) = evaluateProduct(right, labels.right.get).get - Some(labels.left.get.diff * cost1 + labels.right.get.diff * cost2 + (p1Left.sizeHint * (p1Right.sizeHint * p2Right.sizeHint)).total.get, - p1Left, p2Right) + val (cost1, p1Left, p1Right) = evaluateProduct(left, labels.left.get).get // linter:ignore + val (cost2, p2Left, p2Right) = evaluateProduct(right, labels.right.get).get // linter:ignore + Some( + labels.left.get.diff * cost1 + labels.right.get.diff * cost2 + (p1Left.sizeHint * (p1Right.sizeHint * p2Right.sizeHint)).total.get, + p1Left, + p2Right + ) } case _ => None } - } val (last, productList) = toProducts(mf) val products = if (last.isDefined) last.get :: productList else productList @@ -386,20 +453,21 @@ object Matrix2Props extends Properties("Matrix2") { // ScalaCheck properties /** - * Verifying "evaluate" function - that it does return - * the same overall costs as what is estimated in the optimization procedure + * Verifying "evaluate" function - that it does return the same overall costs as what is estimated in the + * optimization procedure */ property("evaluate function returns the same cost as optimize") = forAll { (a: Matrix2[Any, Any, Double]) => optimize(a)._1 == evaluate(optimize(a)._2) } /** - * "Proof": the goal property that estimated costs of optimized plans or product chains - * are less than or equal to costs of randomized equivalent plans or product chains + * "Proof": the goal property that estimated costs of optimized plans or product chains are less than or + * equal to costs of randomized equivalent plans or product chains */ - property("a cost of an optimized chain of matrix products is <= a 
random one") = forAll { (a: IndexedSeq[MatrixLiteral[Any, Any, Double]]) => - optimizeProductChain(a, Some(ring, MatrixJoiner2.default))._1 <= - evaluate(generateRandomPlan(0, a.length - 1, a)) + property("a cost of an optimized chain of matrix products is <= a random one") = forAll { + (a: IndexedSeq[MatrixLiteral[Any, Any, Double]]) => + optimizeProductChain(a, Some(ring, MatrixJoiner2.default))._1 <= + evaluate(generateRandomPlan(0, a.length - 1, a)) } property("cost of a random plan is <= a random one") = forAll { (a: Matrix2[Any, Any, Double]) => diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala index b513864f72..5027401fda 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/Matrix2Test.scala @@ -12,19 +12,18 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding.mathematics import com.twitter.scalding._ -import cascading.pipe.joiner._ -import org.specs._ -import com.twitter.algebird.{Ring,Group} +import com.twitter.scalding.serialization._ +import com.twitter.scalding.source.TypedText +import org.scalatest.{Matchers, WordSpec} +import com.twitter.algebird.field._ class Matrix2Sum(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -36,14 +35,28 @@ class Matrix2Sum(args: Args) extends Job(args) { val mat2 = MatrixLiteral(tp2, NoClue) val sum = mat1 + mat2 - sum.write(TypedTsv[(Int,Int,Double)]("sum")) + sum.write(TypedText.tsv[(Int, Int, Double)]("sum")) +} + +class Matrix2SumOrderedSerialization(args: Args) extends Job(args) { + import RequiredBinaryComparators.orderedSerialization + + override def config = super.config + (Config.ScaldingRequireOrderedSerialization -> "true") + implicit val intOS = orderedSerialization[Int] + + val tp1 = TypedPipe.from(TypedText.tsv[(Int, Int, Double)]("mat1")) + val mat1 = MatrixLiteral(tp1, NoClue) + + val tp2 = TypedPipe.from(TypedText.tsv[(Int, Int, Double)]("mat2")) + val mat2 = MatrixLiteral(tp2, NoClue) + + val sum = mat1 + mat2 + sum.write(TypedText.tsv[(Int, Int, Double)]("sum")) } class Matrix2Sum3(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -51,14 +64,12 @@ class Matrix2Sum3(args: Args) extends Job(args) { val mat1 = MatrixLiteral(tp1, NoClue) val sum = mat1 + mat1 - sum.write(TypedTsv[(Int,Int,(Double, Double, Double))]("sum")) + sum.write(TypedText.tsv[(Int, Int, (Double, Double, Double))]("sum")) } class Matrix2SumChain(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import 
com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -74,14 +85,12 @@ class Matrix2SumChain(args: Args) extends Job(args) { val mat3 = MatrixLiteral(tp3, NoClue) val sum = mat1 + mat2 + mat3 - sum.write(TypedTsv[(Int,Int,Double)]("sum")) + sum.write(TypedText.tsv[(Int, Int, Double)]("sum")) } -class Matrix2RowRowHad(args : Args) extends Job(args) { +class Matrix2RowRowHad(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -90,14 +99,12 @@ class Matrix2RowRowHad(args : Args) extends Job(args) { val row1 = mat1.getRow(1) val rowSum = row1 #*# row1 - rowSum.toTypedPipe.map { case (x, idx, v) => (idx, v) }.write(TypedTsv[(Int,Double)]("rowRowHad")) + rowSum.toTypedPipe.map { case (x, idx, v) => (idx, v) }.write(TypedText.tsv[(Int, Double)]("rowRowHad")) } -class Matrix2ZeroHad(args : Args) extends Job(args) { +class Matrix2ZeroHad(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -109,14 +116,12 @@ class Matrix2ZeroHad(args : Args) extends Job(args) { val mat2 = MatrixLiteral(tp2, NoClue) val rowSum = mat1 #*# mat2 - rowSum.write(TypedTsv[(Int,Int,Double)]("zeroHad")) + rowSum.write(TypedText.tsv[(Int, Int, Double)]("zeroHad")) } class Matrix2HadSum(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -132,14 +137,12 @@ class Matrix2HadSum(args: Args) extends Job(args) { val mat3 = MatrixLiteral(tp3, NoClue) val sum = mat1 #*# (mat2 + mat3) - sum.write(TypedTsv[(Int,Int,Double)]("hadSum")) + sum.write(TypedText.tsv[(Int, Int, Double)]("hadSum")) } class Matrix2Prod(args: Args) extends Job(args) { - import 
Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -147,14 +150,13 @@ class Matrix2Prod(args: Args) extends Job(args) { val mat1 = MatrixLiteral(tp1, NoClue) val gram = mat1 * mat1.transpose - gram.write(TypedTsv[(Int,Int,Double)]("product")) + gram.write(TypedText.tsv[(Int, Int, Double)]("product")) } class Matrix2JProd(args: Args) extends Job(args) { import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -162,14 +164,12 @@ class Matrix2JProd(args: Args) extends Job(args) { val mat1 = MatrixLiteral(tp1, SparseHint(0.75, 2, 2)) val gram = mat1 * J[Int, Int, Double] * mat1.transpose - gram.write(TypedTsv[(Int,Int,Double)]("product")) + gram.write(TypedText.tsv[(Int, Int, Double)]("product")) } class Matrix2ProdSum(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -181,35 +181,39 @@ class Matrix2ProdSum(args: Args) extends Job(args) { val mat2 = MatrixLiteral(tp2, NoClue) val gram = (mat1 * mat1.transpose) + mat2 - gram.write(TypedTsv[(Int,Int,Double)]("product-sum")) + gram.write(TypedText.tsv[(Int, Int, Double)]("product-sum")) } class Matrix2PropJob(args: Args) extends Job(args) { - import Matrix2._ - import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ - val tsv1 = TypedTsv[(Int,Int,Int)]("graph") + val tsv1 = TypedPipe.from(TypedText.tsv[(Int, Int, Int)]("graph")) val p1 = tsv1.toPipe(('x1, 'y1, 'v1)) val tp1 = p1.toTypedPipe[(Int, Int, Int)](('x1, 'y1, 'v1)) val mat = MatrixLiteral(tp1, NoClue) - val tsv2 = TypedTsv[(Int,Double)]("col") + val tsv2 = TypedText.tsv[(Int, Double)]("col") val col = MatrixLiteral(TypedPipe.from(tsv2).map { case (idx, v) => (idx, (), v) }, 
NoClue) - val tsv3 = TypedTsv[(Int,Double)]("row") + val tsv3 = TypedText.tsv[(Int, Double)]("row") val row = MatrixLiteral(TypedPipe.from(tsv3).map { case (idx, v) => ((), idx, v) }, NoClue) - mat.binarizeAs[Boolean].propagate(col).toTypedPipe.map { case (idx, x, v) => (idx, v) }.write(TypedTsv[(Int,Double)]("prop-col")) - row.propagateRow(mat.binarizeAs[Boolean]).toTypedPipe.map { case (x, idx, v) => (idx, v) }.write(TypedTsv[(Int,Double)]("prop-row")) + mat + .binarizeAs[Boolean] + .propagate(col) + .toTypedPipe + .map { case (idx, x, v) => (idx, v) } + .write(TypedText.tsv[(Int, Double)]("prop-col")) + row + .propagateRow(mat.binarizeAs[Boolean]) + .toTypedPipe + .map { case (x, idx, v) => (idx, v) } + .write(TypedText.tsv[(Int, Double)]("prop-row")) } -class Matrix2Cosine(args : Args) extends Job(args) { +class Matrix2Cosine(args: Args) extends Job(args) { - import Matrix2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read @@ -218,270 +222,354 @@ class Matrix2Cosine(args : Args) extends Job(args) { val matL2Norm = mat1.rowL2Normalize val cosine = matL2Norm * matL2Norm.transpose - cosine.write(TypedTsv[(Int,Int,Double)]("cosine")) + cosine.write(TypedText.tsv[(Int, Int, Double)]("cosine")) +} + +class Matrix2Normalize(args: Args) extends Job(args) { + + val tp1 = TypedPipe.from(TypedText.tsv[(Int, Int, Double)]("mat1")) + val mat1 = MatrixLiteral(tp1, NoClue) + + // Now test for the case when value is Long type + val matL1Norm = mat1.rowL1Normalize + matL1Norm.write(TypedText.tsv[(Int, Int, Double)]("normalized")) + + // val p2: Pipe = Tsv("mat2", ('x2, 'y2, 'v2)).read // test Long type as value is OK + val tp2 = TypedPipe.from(TypedText.tsv[(Int, Int, Long)]("mat2")) + // val tp2 = p2.toTypedPipe[(Int, Int, Long)](('x2, 'y2, 'v2)) + val mat2 = MatrixLiteral(tp2, NoClue) + + val mat2L1Norm = mat2.rowL1Normalize + mat2L1Norm.write(TypedText.tsv[(Int, Int, 
Double)]("long_normalized")) } class Scalar2Ops(args: Args) extends Job(args) { - import Matrix2._ import Scalar2._ import cascading.pipe.Pipe - import cascading.tuple.Fields import com.twitter.scalding.TDsl._ val p1: Pipe = Tsv("mat1", ('x1, 'y1, 'v1)).read val tp1 = p1.toTypedPipe[(Int, Int, Double)](('x1, 'y1, 'v1)) val mat1 = MatrixLiteral(tp1, NoClue) - (mat1 * 3.0).write(TypedTsv[(Int,Int,Double)]("times3")) - (mat1 / 3.0).write(TypedTsv[(Int,Int,Double)]("div3")) + (mat1 * 3.0).write(TypedText.tsv[(Int, Int, Double)]("times3")) + (mat1 / 3.0).write(TypedText.tsv[(Int, Int, Double)]("div3")) // implicit conversion still doesn't work? - (Scalar2(3.0) * mat1).write(TypedTsv[(Int,Int,Double)]("3times")) + (Scalar2(3.0) * mat1).write(TypedText.tsv[(Int, Int, Double)]("3times")) - // Now with Scalar objects: - (mat1.trace * mat1).write(TypedTsv[(Int,Int,Double)]("tracetimes")) - (mat1 * mat1.trace).write(TypedTsv[(Int,Int,Double)]("timestrace")) - (mat1 / mat1.trace).write(TypedTsv[(Int,Int,Double)]("divtrace")) + // Now with Scalar objects: + (mat1.trace * mat1).write(TypedText.tsv[(Int, Int, Double)]("tracetimes")) + (mat1 * mat1.trace).write(TypedText.tsv[(Int, Int, Double)]("timestrace")) + (mat1 / mat1.trace).write(TypedText.tsv[(Int, Int, Double)]("divtrace")) } -class Matrix2Test extends Specification { - noDetailedDiffs() // For scala 2.9 +class Matrix2Test extends WordSpec with Matchers { import Dsl._ - def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = { - iter.map { it => ((it._1, it._2), it._3) }.toMap - } - def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = { - iter.map { it => ((it._1, it._1), it._2) }.toMap - } + def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = + iter.map(it => ((it._1, it._2), it._3)).toMap + def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = + iter.map(it => ((it._1, it._1), it._2)).toMap "A MatrixSum job" should { 
TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2Sum") + JobTest(new Matrix2Sum(_)) .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("sum")) { ob => + .typedSink(TypedText.tsv[(Int, Int, Double)]("sum")) { ob => "correctly compute sums" in { - val pMap = toSparseMat(ob) - pMap must be_==(Map((1, 1) -> 1.0, (1, 2) -> 8.0, (1, 3) -> 3.0, (2, 1) -> 8.0, (2, 2) -> 3.0)) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 8.0, + (1, 3) -> 3.0, + (2, 1) -> 8.0, + (2, 2) -> 3.0 + ) } } .runHadoop - .finish + .finish() + } + } + + "A MatrixSum job with Orderedserialization" should { + TUtil.printStack { + JobTest(new Matrix2SumOrderedSerialization(_)) + .source(TypedText.tsv[(Int, Int, Double)]("mat1"), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .source(TypedText.tsv[(Int, Int, Double)]("mat2"), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) + .typedSink(TypedText.tsv[(Int, Int, Double)]("sum")) { ob => + "correctly compute sums" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 8.0, + (1, 3) -> 3.0, + (2, 1) -> 8.0, + (2, 2) -> 3.0 + ) + } + } + .runHadoop + .finish() } } "A Matrix2Sum3 job, where the Matrix contains tuples as values," should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2Sum3") - .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1,1,(1.0, 3.0, 5.0)),(2,2,(3.0, 2.0, 1.0)),(1,2,(4.0, 5.0, 2.0)))) - .sink[(Int, Int, String)](TypedTsv[(Int,Int,(Double, Double, Double))]("sum")) { ob => + JobTest(new Matrix2Sum3(_)) + .source( + Tsv("mat1", ('x1, 'y1, 'v1)), + List((1, 1, (1.0, 3.0, 5.0)), (2, 2, (3.0, 2.0, 1.0)), (1, 2, (4.0, 5.0, 2.0))) + ) + .typedSink(TypedText.tsv[(Int, Int, (Double, Double, Double))]("sum")) { ob => "correctly compute sums" in { // Treat (Double, Double, Double) as string because that is what is 
actually returned // when using runHadoop - val pMap = toSparseMat(ob) - val result = Map((1,1)->(2.0, 6.0, 10.0), (2,2)->(6.0, 4.0, 2.0), (1,2)->(8.0, 10.0, 4.0)).mapValues(_.toString) - pMap must be_==(result) + val result = + Map((1, 1) -> (2.0, 6.0, 10.0), (2, 2) -> (6.0, 4.0, 2.0), (1, 2) -> (8.0, 10.0, 4.0)) + toSparseMat(ob) shouldBe result } - } + }(implicitly[TypeDescriptor[(Int, Int, (Double, Double, Double))]].converter) .runHadoop - .finish + .finish() } } "A Matrix2SumChain job" should { TUtil.printStack { - JobTest(new com.twitter.scalding.mathematics.Matrix2SumChain(_)) + JobTest(new Matrix2SumChain(_)) .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) .source(Tsv("mat3", ('x3, 'y3, 'v3)), List((1, 3, 4.0), (2, 1, 1.0), (1, 2, 4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("sum")) { ob => + .typedSink(TypedText.tsv[(Int, Int, Double)]("sum")) { ob => "correctly compute sums" in { - val pMap = toSparseMat(ob) - pMap must be_==(Map((1, 1) -> 1.0, (1, 2) -> 12.0, (1, 3) -> 7.0, (2, 1) -> 9.0, (2, 2) -> 3.0)) + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 12.0, + (1, 3) -> 7.0, + (2, 1) -> 9.0, + (2, 2) -> 3.0 + ) } } .runHadoop - .finish + .finish() } } "A Matrix2HadSum job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2HadSum") + JobTest(new Matrix2HadSum(_)) .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 3, 1.0), (2, 2, 3.0))) .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) .source(Tsv("mat3", ('x3, 'y3, 'v3)), List((1, 3, 4.0), (2, 1, 1.0), (1, 2, 4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("hadSum")) { ob => + .typedSink(TypedText.tsv[(Int, Int, Double)]("hadSum")) { ob => "correctly compute a combination of a Hadamard product and a sum" in { - val pMap = toSparseMat(ob) - pMap must be_==(Map((1, 3) -> 7.0)) + 
toSparseMat(ob) shouldBe Map((1, 3) -> 7.0) } } .runHadoop - .finish + .finish() } } "A Matrix2 RowRowHad job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2RowRowHad") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](TypedTsv[(Int,Double)]("rowRowHad")) { ob => - "correctly compute a Hadamard product of row vectors" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (2,2)->16.0) ) + JobTest(new Matrix2RowRowHad(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .typedSink(TypedText.tsv[(Int, Double)]("rowRowHad")) { ob => + "correctly compute a Hadamard product of row vectors" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (2, 2) -> 16.0) + } } - } - .runHadoop - .finish + .runHadoop + .finish() } } "A Matrix2 ZeroHad job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2ZeroHad") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .source(Tsv("mat2",('x2,'y2,'v2)), List()) - .sink[(Int,Int,Double)](TypedTsv[(Int,Int,Double)]("zeroHad")) { ob => - "correctly compute a Hadamard product with a zero matrix" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map( )) + JobTest(new Matrix2ZeroHad(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .source[(Int, Int, Double)](Tsv("mat2", ('x2, 'y2, 'v2)), List()) + .typedSink(TypedText.tsv[(Int, Int, Double)]("zeroHad")) { ob => + "correctly compute a Hadamard product with a zero matrix" in { + toSparseMat(ob) shouldBe empty + } } - } - .runHadoop - .finish + .runHadoop + .finish() } } - "A Matrix2Prod job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2Prod") + JobTest(new Matrix2Prod(_)) .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("product")) { ob => + 
.typedSink(TypedText.tsv[(Int, Int, Double)]("product")) { ob => "correctly compute products" in { - val pMap = toSparseMat(ob) - pMap must be_==(Map((1, 1) -> 17.0, (1, 2) -> 12.0, (2, 1) -> 12.0, (2, 2) -> 9.0)) + toSparseMat(ob) shouldBe Map((1, 1) -> 17.0, (1, 2) -> 12.0, (2, 1) -> 12.0, (2, 2) -> 9.0) } } .runHadoop - .finish + .finish() } } "A Matrix2JProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2JProd") + JobTest(new Matrix2JProd(_)) .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("product")) { ob => + .typedSink(TypedText.tsv[(Int, Int, Double)]("product")) { ob => "correctly compute products with infinite matrices" in { - val pMap = toSparseMat(ob) - pMap must be_==(Map((1,1) -> 5.0, (1,2) -> 35.0, (2,1) -> 3.0, (2,2) -> 21.0)) + toSparseMat(ob) shouldBe Map((1, 1) -> 5.0, (1, 2) -> 35.0, (2, 1) -> 3.0, (2, 2) -> 21.0) } } .runHadoop - .finish + .finish() } } - "A Matrix2Prod job" should { + "A Matrix2ProdSum job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2ProdSum") + JobTest(new Matrix2ProdSum(_)) .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 1, 1.0), (1, 2, 1.0), (2, 1, 1.0), (2, 2, 1.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("product-sum")) { ob => + .typedSink(TypedText.tsv[(Int, Int, Double)]("product-sum")) { ob => "correctly compute products" in { - val pMap = toSparseMat(ob) - pMap must be_==(Map((1, 1) -> 18.0, (1, 2) -> 13.0, (2, 1) -> 13.0, (2, 2) -> 10.0)) + toSparseMat(ob) shouldBe Map((1, 1) -> 18.0, (1, 2) -> 13.0, (2, 1) -> 13.0, (2, 2) -> 10.0) } } .runHadoop - .finish + .finish() } } "A Matrix2 Propagation job" should { TUtil.printStack { - JobTest(new Matrix2PropJob(_)) - /* Sparse representation of the input matrix: - * [[0 1 1], - * [0 0 1], - * [1 0 0]] = List((0,1,1), 
(0,2,1), (1,2,1), (2,0,1)) - * - * Sparse representation of the input vector: - * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) - */ - .source(TypedTsv[(Int,Int,Int)]("graph"), List((0,1,1), (0,2,1), (1,2,1), (2,0,1))) - .source(TypedTsv[(Int,Double)]("row"), List((0,1.0), (1,2.0), (2,4.0))) - .source(TypedTsv[(Int,Double)]("col"), List((0,1.0), (1,2.0), (2,4.0))) - .sink[(Int, Double)](TypedTsv[(Int,Double)]("prop-col")) { ob => - "correctly propagate columns" in { - ob.toMap must be_==(Map(0 -> 6.0, 1 -> 4.0, 2 -> 1.0)) + JobTest(new Matrix2PropJob(_)) + /* Sparse representation of the input matrix: + * [[0 1 1], + * [0 0 1], + * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) + * + * Sparse representation of the input vector: + * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) + */ + .source(TypedText.tsv[(Int, Int, Int)]("graph"), List((0, 1, 1), (0, 2, 1), (1, 2, 1), (2, 0, 1))) + .source(TypedText.tsv[(Int, Double)]("row"), List((0, 1.0), (1, 2.0), (2, 4.0))) + .source(TypedText.tsv[(Int, Double)]("col"), List((0, 1.0), (1, 2.0), (2, 4.0))) + .typedSink(TypedText.tsv[(Int, Double)]("prop-col")) { ob => + "correctly propagate columns" in { + ob.toMap shouldBe Map(0 -> 6.0, 1 -> 4.0, 2 -> 1.0) + } } - } - .sink[(Int,Double)](TypedTsv[(Int,Double)]("prop-row")) { ob => - "correctly propagate rows" in { - ob.toMap must be_==(Map(0 -> 4.0, 1 -> 1.0, 2 -> 3.0)) + .typedSink(TypedText.tsv[(Int, Double)]("prop-row")) { ob => + "correctly propagate rows" in { + ob.toMap shouldBe Map(0 -> 4.0, 1 -> 1.0, 2 -> 3.0) + } } - } - .runHadoop - .finish + .runHadoop + .finish() } } "A Matrix2 Cosine job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Matrix2Cosine") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("cosine")) { ob => - "correctly compute cosine similarity" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (1,2)->0.9701425001453319, 
(2,1)->0.9701425001453319, (2,2)->1.0 )) + JobTest(new Matrix2Cosine(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .typedSink(TypedText.tsv[(Int, Int, Double)]("cosine")) { ob => + "correctly compute cosine similarity" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 0.9701425001453319, + (2, 1) -> 0.9701425001453319, + (2, 2) -> 1.0 + ) + } + } + .runHadoop + .finish() + } + } + + "A Matrix2 Normalize job" should { + TUtil.printStack { + JobTest(new Matrix2Normalize(_)) + .source( + TypedText.tsv[(Int, Int, Double)]("mat1"), + List((1, 1, 4.0), (1, 2, 1.0), (2, 2, 1.0), (3, 1, 1.0), (3, 2, 3.0), (3, 3, 4.0)) + ) + .source( + TypedText.tsv[(Int, Int, Long)]("mat2"), + List((1, 1, 4L), (1, 2, 1L), (2, 2, 1L), (3, 1, 1L), (3, 2, 3L), (3, 3, 4L)) + ) + .typedSink(TypedText.tsv[(Int, Int, Double)]("normalized")) { ob => + "correctly compute l1 normalization for matrix with double values" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> 0.8, + (1, 2) -> 0.2, + (2, 2) -> 1.0, + (3, 1) -> 0.125, + (3, 2) -> 0.375, + (3, 3) -> 0.5 + ) + } } - } - .runHadoop - .finish + .typedSink(TypedText.tsv[(Int, Int, Double)]("long_normalized")) { ob => + "correctly compute l1 normalization for matrix with long values" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> 0.8, + (1, 2) -> 0.2, + (2, 2) -> 1.0, + (3, 1) -> 0.125, + (3, 2) -> 0.375, + (3, 3) -> 0.5 + ) + } + + } + .runHadoop + .finish() } } "A Matrix2 Scalar2Ops job" should { TUtil.printStack { - JobTest(new com.twitter.scalding.mathematics.Scalar2Ops(_)) - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("times3")) { ob => - "correctly compute M * 3" in { - toSparseMat(ob) must be_==( Map((1,1)->3.0, (2,2)->9.0, (1,2)->12.0) ) + JobTest(new Scalar2Ops(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .typedSink(TypedText.tsv[(Int, Int, 
Double)]("times3")) { ob => + "correctly compute M * 3" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 3.0, (2, 2) -> 9.0, (1, 2) -> 12.0) + } } - } - .sink[(Int,Int,Double)](TypedTsv[(Int,Int,Double)]("div3")) { ob => - "correctly compute M / 3" in { - toSparseMat(ob) must be_==( Map((1,1)->(1.0/3.0), (2,2)->(3.0/3.0), (1,2)->(4.0/3.0)) ) + .typedSink(TypedText.tsv[(Int, Int, Double)]("div3")) { ob => + "correctly compute M / 3" in { + toSparseMat(ob) shouldBe Map((1, 1) -> (1.0 / 3.0), (2, 2) -> (3.0 / 3.0), (1, 2) -> (4.0 / 3.0)) + } } - } - .sink[(Int, Int, Double)](TypedTsv[(Int,Int,Double)]("3times")) { ob => - "correctly compute 3 * M" in { - toSparseMat(ob) must be_==( Map((1,1)->3.0, (2,2)->9.0, (1,2)->12.0) ) + .typedSink(TypedText.tsv[(Int, Int, Double)]("3times")) { ob => + "correctly compute 3 * M" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 3.0, (2, 2) -> 9.0, (1, 2) -> 12.0) + } } - } - .sink[(Int,Int,Double)](TypedTsv[(Int,Int,Double)]("timestrace")) { ob => - "correctly compute M * Tr(M)" in { - toSparseMat(ob) must be_==( Map((1,1)->4.0, (2,2)->12.0, (1,2)->16.0) ) + .typedSink(TypedText.tsv[(Int, Int, Double)]("timestrace")) { ob => + "correctly compute M * Tr(M)" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 4.0, (2, 2) -> 12.0, (1, 2) -> 16.0) + } } - } - .sink[(Int,Int,Double)](TypedTsv[(Int,Int,Double)]("tracetimes")) { ob => - "correctly compute Tr(M) * M" in { - toSparseMat(ob) must be_==( Map((1,1)->4.0, (2,2)->12.0, (1,2)->16.0) ) + .typedSink(TypedText.tsv[(Int, Int, Double)]("tracetimes")) { ob => + "correctly compute Tr(M) * M" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 4.0, (2, 2) -> 12.0, (1, 2) -> 16.0) + } } - } - .sink[(Int,Int,Double)](TypedTsv[(Int,Int,Double)]("divtrace")) { ob => - "correctly compute M / Tr(M)" in { - toSparseMat(ob) must be_==( Map((1,1)->(1.0/4.0), (2,2)->(3.0/4.0), (1,2)->(4.0/4.0)) ) + .typedSink(TypedText.tsv[(Int, Int, Double)]("divtrace")) { ob => + "correctly compute M / Tr(M)" in { + 
toSparseMat(ob) shouldBe Map((1, 1) -> (1.0 / 4.0), (2, 2) -> (3.0 / 4.0), (1, 2) -> (4.0 / 4.0)) + } } - } - .runHadoop - .finish + .runHadoop + .finish() } } - } diff --git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala index 6410f4d47a..a5de270d41 100644 --- a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/MatrixTest.scala @@ -12,126 +12,112 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.mathematics import com.twitter.scalding._ -import cascading.pipe.joiner._ -import org.specs._ +import org.scalatest.{Matchers, WordSpec} import com.twitter.algebird.Group +import com.twitter.algebird.field._ object TUtil { - def printStack( fn: => Unit ) { - try { fn } catch { case e : Throwable => e.printStackTrace; throw e } - } + def printStack(fn: => Unit): Unit = + try { fn } + catch { case e: Throwable => e.printStackTrace; throw e } } -class MatrixProd(args : Args) extends Job(args) { +class MatrixProd(args: Args) extends Job(args) { import Matrix._ - val mat1 = Tsv("mat1",('x1,'y1,'v1)) - .toMatrix[Int,Int,Double]('x1,'y1,'v1) + val mat1 = Tsv("mat1", ('x1, 'y1, 'v1)) + .toMatrix[Int, Int, Double]('x1, 'y1, 'v1) val gram = mat1 * mat1.transpose gram.pipe.write(Tsv("product")) } -class MatrixBlockProd(args : Args) extends Job(args) { +class MatrixBlockProd(args: Args) extends Job(args) { import Matrix._ - val mat1 = Tsv("mat1",('x1,'y1,'v1)) - .mapToBlockMatrix(('x1,'y1,'v1)) - {(rcv: (String, Int, Double)) => (rcv._1(0), rcv._1, rcv._2, rcv._3)} + val mat1 = Tsv("mat1", ('x1, 'y1, 'v1)) + .mapToBlockMatrix(('x1, 'y1, 'v1))((rcv: 
(String, Int, Double)) => (rcv._1(0), rcv._1, rcv._2, rcv._3)) - val mat2 = Tsv("mat1",('x1,'y1,'v1)) - .toMatrix[String,Int,Double]('x1,'y1,'v1) + val mat2 = Tsv("mat1", ('x1, 'y1, 'v1)) + .toMatrix[String, Int, Double]('x1, 'y1, 'v1) .toBlockMatrix(s => (s(0), s)) - val gram = mat1 dotProd mat2.transpose + val gram = mat1.dotProd(mat2.transpose) gram.pipe.write(Tsv("product")) } -class MatrixSum(args : Args) extends Job(args) { +class MatrixSum(args: Args) extends Job(args) { import Matrix._ - val mat1 = Tsv("mat1",('x1,'y1,'v1)) - .mapToMatrix('x1,'y1,'v1) { rowColVal : (Int,Int,Double) => rowColVal } - val mat2 = Tsv("mat2",('x2,'y2,'v2)) - .mapToMatrix('x2,'y2,'v2) { rowColVal : (Int,Int,Double) => rowColVal } + val mat1 = Tsv("mat1", ('x1, 'y1, 'v1)) + .mapToMatrix('x1, 'y1, 'v1) { rowColVal: (Int, Int, Double) => rowColVal } + val mat2 = Tsv("mat2", ('x2, 'y2, 'v2)) + .mapToMatrix('x2, 'y2, 'v2) { rowColVal: (Int, Int, Double) => rowColVal } val sum = mat1 + mat2 sum.pipe.write(Tsv("sum")) } -class MatrixSum3(args : Args) extends Job(args) { - - import Matrix._ +class MatrixSum3(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,(Double, Double, Double)]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, (Double, Double, Double)]('x1, 'y1, 'v1, p1) val sum = mat1 + mat1 sum.pipe.write(Tsv("sum")) } +class Randwalk(args: Args) extends Job(args) { -class Randwalk(args : Args) extends Job(args) { - - import Matrix._ - - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val mat1L1Norm = mat1.rowL1Normalize val randwalk = mat1L1Norm * mat1L1Norm randwalk.pipe.write(Tsv("randwalk")) } -class Cosine(args : Args) extends Job(args) { +class Cosine(args: Args) extends Job(args) { - import Matrix._ - - val p1 = 
Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val matL2Norm = mat1.rowL2Normalize val cosine = matL2Norm * matL2Norm.transpose cosine.pipe.write(Tsv("cosine")) } -class Covariance(args : Args) extends Job(args) { - - import Matrix._ +class Covariance(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val matCentered = mat1.colMeanCentering val cov = matCentered * matCentered.transpose cov.pipe.write(Tsv("cov")) } -class VctProd(args : Args) extends Job(args) { - - import Matrix._ +class VctProd(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row = mat1.getRow(1) val rowProd = row * row.transpose rowProd.pipe.write(Tsv("vctProd")) } -class VctDiv(args : Args) extends Job(args) { +class VctDiv(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row = mat1.getRow(1).diag val row2 = mat1.getRow(2).diag.inverse @@ -141,8 +127,8 @@ class VctDiv(args : Args) extends Job(args) { class ScalarOps(args: Args) extends Job(args) { import Matrix._ - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) (mat1 * 3.0).pipe.write(Tsv("times3")) (mat1 / 3.0).pipe.write(Tsv("div3")) (3.0 * 
mat1).pipe.write(Tsv("3times")) @@ -152,11 +138,10 @@ class ScalarOps(args: Args) extends Job(args) { (mat1 / mat1.trace).pipe.write(Tsv("divtrace")) } -class DiagonalOps(args : Args) extends Job(args) { +class DiagonalOps(args: Args) extends Job(args) { import Matrix._ - val mat = Tsv("mat1",('x1,'y1,'v1)) - .read - .toMatrix[Int,Int,Double]('x1,'y1,'v1) + val mat = Tsv("mat1", ('x1, 'y1, 'v1)).read + .toMatrix[Int, Int, Double]('x1, 'y1, 'v1) (mat * mat.diagonal).write(Tsv("mat-diag")) (mat.diagonal * mat).write(Tsv("diag-mat")) (mat.diagonal * mat.diagonal).write(Tsv("diag-diag")) @@ -167,9 +152,9 @@ class DiagonalOps(args : Args) extends Job(args) { class PropJob(args: Args) extends Job(args) { import Matrix._ - val mat = TypedTsv[(Int,Int,Int)]("graph").toMatrix - val row = TypedTsv[(Int,Double)]("row").toRow - val col = TypedTsv[(Int,Double)]("col").toCol + val mat = TypedTsv[(Int, Int, Int)]("graph").toMatrix + val row = TypedTsv[(Int, Double)]("row").toRow + val col = TypedTsv[(Int, Double)]("col").toCol mat.binarizeAs[Boolean].propagate(col).write(Tsv("prop-col")) row.propagate(mat.binarizeAs[Boolean]).write(Tsv("prop-row")) @@ -178,55 +163,47 @@ class PropJob(args: Args) extends Job(args) { class MatrixMapWithVal(args: Args) extends Job(args) { import Matrix._ - val mat = TypedTsv[(Int,Int,Int)]("graph").toMatrix - val row = TypedTsv[(Int,Double)]("row").toRow + val mat = TypedTsv[(Int, Int, Int)]("graph").toMatrix + val row = TypedTsv[(Int, Double)]("row").toRow - mat.mapWithIndex { (v,r,c) => if (r == c) v else 0 }.write(Tsv("diag")) - row.mapWithIndex { (v,c) => if (c == 0) v else 0.0 }.write(Tsv("first")) + mat.mapWithIndex((v, r, c) => if (r == c) v else 0).write(Tsv("diag")) + row.mapWithIndex((v, c) => if (c == 0) v else 0.0).write(Tsv("first")) } -class RowMatProd(args : Args) extends Job(args) { - - import Matrix._ +class RowMatProd(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new 
Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row = mat1.getRow(1) val rowProd = row * mat1 rowProd.pipe.write(Tsv("rowMatPrd")) } -class MatColProd(args : Args) extends Job(args) { - - import Matrix._ +class MatColProd(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val col = mat1.getCol(1) val colProd = mat1 * col colProd.pipe.write(Tsv("matColPrd")) } -class RowRowSum(args : Args) extends Job(args) { +class RowRowSum(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row1 = mat1.getRow(1) val rowSum = row1 + row1 rowSum.pipe.write(Tsv("rowRowSum")) } -class RowRowDiff(args : Args) extends Job(args) { - - import Matrix._ +class RowRowDiff(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row1 = mat1.getRow(1) val row2 = mat1.getRow(2) @@ -234,830 +211,812 @@ class RowRowDiff(args : Args) extends Job(args) { rowSum.pipe.write(Tsv("rowRowDiff")) } -class RowRowHad(args : Args) extends Job(args) { - - import Matrix._ +class RowRowHad(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row1 = mat1.getRow(1) - val rowSum = row1 hProd row1 + val rowSum = row1.hProd(row1) 
rowSum.pipe.write(Tsv("rowRowHad")) } -class VctOuterProd(args : Args) extends Job(args) { +class VctOuterProd(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("mat1",('x1,'y1,'v1)).read - val mat1 = new Matrix[Int,Int,Double]('x1,'y1,'v1, p1) + val p1 = Tsv("mat1", ('x1, 'y1, 'v1)).read + val mat1 = new Matrix[Int, Int, Double]('x1, 'y1, 'v1, p1) val row1 = mat1.getRow(1) - val outerProd = row1.transpose * row1 + val outerProd = row1.transpose * row1 outerProd.pipe.write(Tsv("outerProd")) } -class FilterMatrix(args : Args) extends Job(args) { - - import Matrix._ +class FilterMatrix(args: Args) extends Job(args) { - val p1 = Tsv("mat1",('x,'y,'v)).read - val p2 = Tsv("mat2",('x,'y,'v)).read - val mat1 = new Matrix[Int,Int,Double]('x,'y,'v, p1) - val mat2 = new Matrix[Int,Int,Double]('x,'y,'v, p2) + val p1 = Tsv("mat1", ('x, 'y, 'v)).read + val p2 = Tsv("mat2", ('x, 'y, 'v)).read + val mat1 = new Matrix[Int, Int, Double]('x, 'y, 'v, p1) + val mat2 = new Matrix[Int, Int, Double]('x, 'y, 'v, p2) - mat1.removeElementsBy(mat2).write(Tsv("removeMatrix")) - mat1.keepElementsBy(mat2).write(Tsv("keepMatrix")) + mat1.removeElementsBy(mat2).write(Tsv("removeMatrix")) + mat1.keepElementsBy(mat2).write(Tsv("keepMatrix")) } -class KeepRowsCols(args : Args) extends Job(args) { +class KeepRowsCols(args: Args) extends Job(args) { - import Matrix._ + val p1 = Tsv("mat1", ('x, 'y, 'v)).read + val mat1 = new Matrix[Int, Int, Double]('x, 'y, 'v, p1) + val p2 = Tsv("col1", ('x, 'v)).read + val col1 = new ColVector[Int, Double]('x, 'v, p2) - val p1 = Tsv("mat1",('x,'y,'v)).read - val mat1 = new Matrix[Int,Int,Double]('x,'y,'v, p1) - val p2 = Tsv("col1", ('x, 'v)).read - val col1 = new ColVector[Int, Double]('x, 'v, p2) - - mat1.keepRowsBy(col1).write(Tsv("keepRows")) - mat1.keepColsBy(col1.transpose).write(Tsv("keepCols")) + mat1.keepRowsBy(col1).write(Tsv("keepRows")) + mat1.keepColsBy(col1.transpose).write(Tsv("keepCols")) } -class RemoveRowsCols(args : Args) extends 
Job(args) { +class RemoveRowsCols(args: Args) extends Job(args) { - import Matrix._ + val p1 = Tsv("mat1", ('x, 'y, 'v)).read + val mat1 = new Matrix[Int, Int, Double]('x, 'y, 'v, p1) + val p2 = Tsv("col1", ('x, 'v)).read + val col1 = new ColVector[Int, Double]('x, 'v, p2) - val p1 = Tsv("mat1",('x,'y,'v)).read - val mat1 = new Matrix[Int,Int,Double]('x,'y,'v, p1) - val p2 = Tsv("col1", ('x, 'v)).read - val col1 = new ColVector[Int, Double]('x, 'v, p2) - - mat1.removeRowsBy(col1).write(Tsv("removeRows")) - mat1.removeColsBy(col1.transpose).write(Tsv("removeCols")) + mat1.removeRowsBy(col1).write(Tsv("removeRows")) + mat1.removeColsBy(col1.transpose).write(Tsv("removeCols")) } -class ScalarRowRight(args : Args) extends Job(args) { +class ScalarRowRight(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("row1",('x,'v)).read + val p1 = Tsv("row1", ('x, 'v)).read val row1 = new RowVector[Int, Double]('x, 'v, p1) - - (row1*new LiteralScalar[Double](3.0)).write(Tsv("scalarRowRight")) + + (row1 * new LiteralScalar[Double](3.0)).write(Tsv("scalarRowRight")) // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) - (row1*sca1).write(Tsv("scalarObjRowRight")) + (row1 * sca1).write(Tsv("scalarObjRowRight")) } -class ScalarRowLeft(args : Args) extends Job(args) { +class ScalarRowLeft(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("row1",('x,'v)).read + val p1 = Tsv("row1", ('x, 'v)).read val row1 = new RowVector[Int, Double]('x, 'v, p1) - + (new LiteralScalar[Double](3.0) * row1).write(Tsv("scalarRowLeft")) // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) - (sca1*row1).write(Tsv("scalarObjRowLeft")) + (sca1 * row1).write(Tsv("scalarObjRowLeft")) } -class ScalarColRight(args : Args) extends Job(args) { +class ScalarColRight(args: Args) extends Job(args) { - import Matrix._ - - val p1 = 
Tsv("col1",('x,'v)).read + val p1 = Tsv("col1", ('x, 'v)).read val col1 = new ColVector[Int, Double]('x, 'v, p1) - - (col1*new LiteralScalar[Double](3.0)).write(Tsv("scalarColRight")) + + (col1 * new LiteralScalar[Double](3.0)).write(Tsv("scalarColRight")) // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) - (col1*sca1).write(Tsv("scalarObjColRight")) + (col1 * sca1).write(Tsv("scalarObjColRight")) } -class ScalarColLeft(args : Args) extends Job(args) { +class ScalarColLeft(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("col1",('x,'v)).read + val p1 = Tsv("col1", ('x, 'v)).read val col1 = new ColVector[Int, Double]('x, 'v, p1) - + (new LiteralScalar[Double](3.0) * col1).write(Tsv("scalarColLeft")) // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) - (sca1*col1).write(Tsv("scalarObjColLeft")) + (sca1 * col1).write(Tsv("scalarObjColLeft")) } -class ScalarDiagRight(args : Args) extends Job(args) { +class ScalarDiagRight(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("diag1",('x,'v)).read + val p1 = Tsv("diag1", ('x, 'v)).read val diag1 = new DiagonalMatrix[Int, Double]('x, 'v, p1) - - (diag1*new LiteralScalar[Double](3.0)).write(Tsv("scalarDiagRight")) + + (diag1 * new LiteralScalar[Double](3.0)).write(Tsv("scalarDiagRight")) // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) - (diag1*sca1).write(Tsv("scalarObjDiagRight")) + (diag1 * sca1).write(Tsv("scalarObjDiagRight")) } -class ScalarDiagLeft(args : Args) extends Job(args) { +class ScalarDiagLeft(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("diag1",('x,'v)).read + val p1 = Tsv("diag1", ('x, 'v)).read val diag1 = new DiagonalMatrix[Int, Double]('x, 'v, p1) - + (new LiteralScalar[Double](3.0) * 
diag1).write(Tsv("scalarDiagLeft")) // now with a scalar object - val p2 = Tsv("sca1", ('v)).read + val p2 = Tsv("sca1", 'v).read val sca1 = new Scalar[Double]('v, p2) - (sca1*diag1).write(Tsv("scalarObjDiagLeft")) + (sca1 * diag1).write(Tsv("scalarObjDiagLeft")) } -class ColNormalize(args : Args) extends Job(args) { - - import Matrix._ +class ColNormalize(args: Args) extends Job(args) { - val p1 = Tsv("col1",('x,'v)).read + val p1 = Tsv("col1", ('x, 'v)).read val col1 = new ColVector[Int, Double]('x, 'v, p1) - + col1.L0Normalize.write(Tsv("colLZeroNorm")) col1.L1Normalize.write(Tsv("colLOneNorm")) } -class ColDiagonal(args : Args) extends Job(args) { - - import Matrix._ +class ColDiagonal(args: Args) extends Job(args) { val col1 = new ColVector[Int, Double]('x, 'v, null, FiniteHint(100, 1)) val sizeHintTotal = col1.diag.sizeHint.total.get } -class RowNormalize(args : Args) extends Job(args) { +class RowNormalize(args: Args) extends Job(args) { - import Matrix._ - - val p1 = Tsv("row1",('x,'v)).read + val p1 = Tsv("row1", ('x, 'v)).read val row1 = new RowVector[Int, Double]('x, 'v, p1) - + row1.L0Normalize.write(Tsv("rowLZeroNorm")) row1.L1Normalize.write(Tsv("rowLOneNorm")) } -class MatrixTest extends Specification { - noDetailedDiffs() // For scala 2.9 +class MatrixTest extends WordSpec with Matchers { import Dsl._ - def toSparseMat[Row,Col,V](iter : Iterable[(Row,Col,V)]) : Map[(Row,Col),V] = { - iter.map { it => ((it._1, it._2),it._3) }.toMap - } - def oneDtoSparseMat[Idx,V](iter : Iterable[(Idx,V)]) : Map[(Idx,Idx),V] = { - iter.map { it => ((it._1, it._1), it._2) }.toMap - } + def toSparseMat[Row, Col, V](iter: Iterable[(Row, Col, V)]): Map[(Row, Col), V] = + iter.map(it => ((it._1, it._2), it._3)).toMap + def oneDtoSparseMat[Idx, V](iter: Iterable[(Idx, V)]): Map[(Idx, Idx), V] = + iter.map(it => ((it._1, it._1), it._2)).toMap "A MatrixProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.MatrixProd") - 
.source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("product")) { ob => - "correctly compute products" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->17.0, (1,2)->12.0, (2,1)->12.0, (2,2)->9.0)) + JobTest(new MatrixProd(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("product")) { ob => + "correctly compute products" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 17.0, (1, 2) -> 12.0, (2, 1) -> 12.0, (2, 2) -> 9.0) + } } - } - .run - .finish + .run + .finish() } } "A MatrixBlockProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.MatrixBlockProd") - .source(Tsv("mat1",('x1,'y1,'v1)), List(("alpha1",1,1.0),("alpha1",2,2.0),("beta1",1,5.0),("beta1",2,6.0),("alpha2",1,3.0),("alpha2",2,4.0),("beta2",1,7.0),("beta2",2,8.0))) - .sink[(String,String,Double)](Tsv("product")) { ob => - "correctly compute block products" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map( - ("alpha1", "alpha1") -> 5.0, - ("alpha1", "alpha2") -> 11.0, - ("alpha2", "alpha1") -> 11.0, - ("alpha2", "alpha2") -> 25.0, - ("beta1", "beta1") -> 61.0, - ("beta1", "beta2") -> 83.0, - ("beta2", "beta1") -> 83.0, - ("beta2", "beta2") -> 113.0)) + JobTest(new MatrixBlockProd(_)) + .source( + Tsv("mat1", ('x1, 'y1, 'v1)), + List( + ("alpha1", 1, 1.0), + ("alpha1", 2, 2.0), + ("beta1", 1, 5.0), + ("beta1", 2, 6.0), + ("alpha2", 1, 3.0), + ("alpha2", 2, 4.0), + ("beta2", 1, 7.0), + ("beta2", 2, 8.0) + ) + ) + .sink[(String, String, Double)](Tsv("product")) { ob => + "correctly compute block products" in { + toSparseMat(ob) shouldBe Map( + ("alpha1", "alpha1") -> 5.0, + ("alpha1", "alpha2") -> 11.0, + ("alpha2", "alpha1") -> 11.0, + ("alpha2", "alpha2") -> 25.0, + ("beta1", "beta1") -> 61.0, + ("beta1", "beta2") -> 83.0, + ("beta2", "beta1") -> 83.0, + ("beta2", "beta2") -> 113.0 + ) + } } - } .run - .finish + .finish() } } "A MatrixSum 
job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.MatrixSum") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .source(Tsv("mat2",('x2,'y2,'v2)), List((1,3,3.0),(2,1,8.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("sum")) { ob => - "correctly compute sums" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (1,2)->8.0, (1,3)->3.0, (2,1)->8.0, (2,2)->3.0)) + JobTest(new MatrixSum(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .source(Tsv("mat2", ('x2, 'y2, 'v2)), List((1, 3, 3.0), (2, 1, 8.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("sum")) { ob => + "correctly compute sums" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 8.0, + (1, 3) -> 3.0, + (2, 1) -> 8.0, + (2, 2) -> 3.0 + ) + } } - } - .run - .finish + .run + .finish() } } - + "A MatrixSum job, where the Matrix contains tuples as values," should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.MatrixSum3") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,(1.0, 3.0, 5.0)),(2,2,(3.0, 2.0, 1.0)),(1,2,(4.0, 5.0, 2.0)))) - .sink[(Int,Int,(Double, Double, Double))](Tsv("sum")) { ob => - "correctly compute sums" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->(2.0, 6.0, 10.0), (2,2)->(6.0, 4.0, 2.0), (1,2)->(8.0, 10.0, 4.0))) + JobTest("com.twitter.scalding.mathematics.MatrixSum3") + .source( + Tsv("mat1", ('x1, 'y1, 'v1)), + List((1, 1, (1.0, 3.0, 5.0)), (2, 2, (3.0, 2.0, 1.0)), (1, 2, (4.0, 5.0, 2.0))) + ) + .sink[(Int, Int, (Double, Double, Double))](Tsv("sum")) { ob => + "correctly compute sums" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> (2.0, 6.0, 10.0), + (2, 2) -> (6.0, 4.0, 2.0), + (1, 2) -> (8.0, 10.0, 4.0) + ) + } } - } - .run - .finish + .run + .finish() } } "A Matrix Randwalk job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Randwalk") - /* - * 1.0 4.0 - * 0.0 3.0 - * row normalized: - * 1.0/5.0 4.0/5.0 - * 
0.0 1.0 - * product with itself: - * 1.0/25.0 (4.0/25.0 + 4.0/5.0) - * 0.0 1.0 - */ - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("randwalk")) { ob => - "correctly compute matrix randwalk" in { - val pMap = toSparseMat(ob) - val exact = Map((1,1)->(1.0/25.0) , (1,2)->(4.0/25.0 + 4.0/5.0), (2,2)->1.0) - val grp = implicitly[Group[Map[(Int,Int),Double]]] - // doubles are hard to compare - grp.minus(pMap, exact) - .mapValues { x => x*x } - .map { _._2 } - .sum must be_<(0.0001) + JobTest(new Randwalk(_)) + /* + * 1.0 4.0 + * 0.0 3.0 + * row normalized: + * 1.0/5.0 4.0/5.0 + * 0.0 1.0 + * product with itself: + * 1.0/25.0 (4.0/25.0 + 4.0/5.0) + * 0.0 1.0 + */ + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("randwalk")) { ob => + "correctly compute matrix randwalk" in { + val pMap = toSparseMat(ob) + val exact = Map((1, 1) -> (1.0 / 25.0), (1, 2) -> (4.0 / 25.0 + 4.0 / 5.0), (2, 2) -> 1.0) + val grp = implicitly[Group[Map[(Int, Int), Double]]] + // doubles are hard to compare + grp + .minus(pMap, exact) + .mapValues(x => x * x) + .map(_._2) + .sum should be < 0.0001 + } } - } - .run - .finish + .run + .finish() } } "A Matrix Cosine job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Cosine") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("cosine")) { ob => - "correctly compute cosine similarity" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (1,2)->0.9701425001453319, (2,1)->0.9701425001453319, (2,2)->1.0 )) + JobTest(new Cosine(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("cosine")) { ob => + "correctly compute cosine similarity" in { + toSparseMat(ob) shouldBe Map( + (1, 1) -> 1.0, + (1, 2) -> 0.9701425001453319, + (2, 1) -> 0.9701425001453319, + (2, 2) -> 1.0 + ) + 
} } - } - .run - .finish + .run + .finish() } } "A Matrix Covariance job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.Covariance") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("cov")) { ob => - "correctly compute matrix covariance" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->0.25, (1,2)-> -0.25, (2,1)-> -0.25, (2,2)->0.25 )) + JobTest(new Covariance(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("cov")) { ob => + "correctly compute matrix covariance" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 0.25, (1, 2) -> -0.25, (2, 1) -> -0.25, (2, 2) -> 0.25) + } } - } - .run - .finish + .run + .finish() } } "A Matrix VctProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.VctProd") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[Double](Tsv("vctProd")) { ob => - "correctly compute vector inner products" in { - ob(0) must be_==(17.0) + JobTest(new VctProd(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[Double](Tsv("vctProd")) { ob => + "correctly compute vector inner products" in { + ob(0) shouldBe 17.0 + } } - } - .run - .finish + .run + .finish() } } "A Matrix VctDiv job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.VctDiv") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](Tsv("vctDiv")) { ob => - "correctly compute vector element-wise division" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((2,2)->1.3333333333333333) ) + JobTest(new VctDiv(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Double)](Tsv("vctDiv")) { ob => + "correctly compute vector element-wise division" in { + oneDtoSparseMat(ob) shouldBe Map((2, 2) -> 1.3333333333333333) + } } 
- } - .run - .finish + .run + .finish() } } "A Matrix ScalarOps job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarOps") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("times3")) { ob => - "correctly compute M * 3" in { - toSparseMat(ob) must be_==( Map((1,1)->3.0, (2,2)->9.0, (1,2)->12.0) ) + JobTest(new ScalarOps(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("times3")) { ob => + "correctly compute M * 3" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 3.0, (2, 2) -> 9.0, (1, 2) -> 12.0) + } + } + .sink[(Int, Int, Double)](Tsv("3times")) { ob => + "correctly compute 3 * M" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 3.0, (2, 2) -> 9.0, (1, 2) -> 12.0) + } + } + .sink[(Int, Int, Double)](Tsv("div3")) { ob => + "correctly compute M / 3" in { + toSparseMat(ob) shouldBe Map((1, 1) -> (1.0 / 3.0), (2, 2) -> (3.0 / 3.0), (1, 2) -> (4.0 / 3.0)) + } + } + .sink[(Int, Int, Double)](Tsv("timestrace")) { ob => + "correctly compute M * Tr(M)" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 4.0, (2, 2) -> 12.0, (1, 2) -> 16.0) + } + } + .sink[(Int, Int, Double)](Tsv("tracetimes")) { ob => + "correctly compute Tr(M) * M" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 4.0, (2, 2) -> 12.0, (1, 2) -> 16.0) + } + } + .sink[(Int, Int, Double)](Tsv("divtrace")) { ob => + "correctly compute M / Tr(M)" in { + toSparseMat(ob) shouldBe Map((1, 1) -> (1.0 / 4.0), (2, 2) -> (3.0 / 4.0), (1, 2) -> (4.0 / 4.0)) + } } - } - .sink[(Int,Int,Double)](Tsv("3times")) { ob => - "correctly compute 3 * M" in { - toSparseMat(ob) must be_==( Map((1,1)->3.0, (2,2)->9.0, (1,2)->12.0) ) - } - } - .sink[(Int,Int,Double)](Tsv("div3")) { ob => - "correctly compute M / 3" in { - toSparseMat(ob) must be_==( Map((1,1)->(1.0/3.0), (2,2)->(3.0/3.0), (1,2)->(4.0/3.0)) ) - } - } - .sink[(Int,Int,Double)](Tsv("timestrace")) { ob => - "correctly compute M * 
Tr(M)" in { - toSparseMat(ob) must be_==( Map((1,1)->4.0, (2,2)->12.0, (1,2)->16.0) ) - } - } - .sink[(Int,Int,Double)](Tsv("tracetimes")) { ob => - "correctly compute Tr(M) * M" in { - toSparseMat(ob) must be_==( Map((1,1)->4.0, (2,2)->12.0, (1,2)->16.0) ) - } - } - .sink[(Int,Int,Double)](Tsv("divtrace")) { ob => - "correctly compute M / Tr(M)" in { - toSparseMat(ob) must be_==( Map((1,1)->(1.0/4.0), (2,2)->(3.0/4.0), (1,2)->(4.0/4.0)) ) - } - } - .run - .finish + .run + .finish() } } "A Matrix Diagonal job" should { TUtil.printStack { - JobTest(new DiagonalOps(_)) - /* [[1.0 4.0] - * [0.0 3.0]] - */ - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("diag-mat")) { ob => - "correctly compute diag * matrix" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (1,2)->4.0, (2,2)->9.0) ) - } - } - .sink[(Int,Double)](Tsv("diag-diag")) { ob => - "correctly compute diag * diag" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (2,2)->9.0) ) - } - } - .sink[(Int,Int,Double)](Tsv("mat-diag")) { ob => - "correctly compute matrix * diag" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (1,2)->12.0, (2,2)->9.0) ) - } - } - .sink[(Int,Double)](Tsv("diag-col")) { ob => - "correctly compute diag * col" in { - ob.toMap must be_==( Map(1->1.0)) - } - } - .sink[(Int,Double)](Tsv("row-diag")) { ob => - "correctly compute row * diag" in { - ob.toMap must be_==( Map(1->1.0, 2 -> 12.0)) + JobTest(new DiagonalOps(_)) + /* [[1.0 4.0] + * [0.0 3.0]] + */ + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("diag-mat")) { ob => + "correctly compute diag * matrix" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 4.0, (2, 2) -> 9.0) + } + } + .sink[(Int, Double)](Tsv("diag-diag")) { ob => + "correctly compute diag * diag" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (2, 2) -> 9.0) + } + } + 
.sink[(Int, Int, Double)](Tsv("mat-diag")) { ob => + "correctly compute matrix * diag" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 12.0, (2, 2) -> 9.0) + } + } + .sink[(Int, Double)](Tsv("diag-col")) { ob => + "correctly compute diag * col" in { + ob.toMap shouldBe Map(1 -> 1.0) + } + } + .sink[(Int, Double)](Tsv("row-diag")) { ob => + "correctly compute row * diag" in { + ob.toMap shouldBe Map(1 -> 1.0, 2 -> 12.0) + } } - } - .run - .finish + .run + .finish() } } "A Propagation job" should { TUtil.printStack { - JobTest(new PropJob(_)) - /* [[0 1 1], - * [0 0 1], - * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) - * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) - */ - .source(TypedTsv[(Int,Int,Int)]("graph"), List((0,1,1), (0,2,1), (1,2,1), (2,0,1))) - .source(TypedTsv[(Int,Double)]("row"), List((0,1.0), (1,2.0), (2,4.0))) - .source(TypedTsv[(Int,Double)]("col"), List((0,1.0), (1,2.0), (2,4.0))) - .sink[(Int,Double)](Tsv("prop-col")) { ob => - "correctly propagate columns" in { - ob.toMap must be_==(Map(0 -> 6.0, 1 -> 4.0, 2 -> 1.0)) + JobTest(new PropJob(_)) + /* [[0 1 1], + * [0 0 1], + * [1 0 0]] = List((0,1,1), (0,2,1), (1,2,1), (2,0,1)) + * [1.0 2.0 4.0] = List((0,1.0), (1,2.0), (2,4.0)) + */ + .source(TypedTsv[(Int, Int, Int)]("graph"), List((0, 1, 1), (0, 2, 1), (1, 2, 1), (2, 0, 1))) + .source(TypedTsv[(Int, Double)]("row"), List((0, 1.0), (1, 2.0), (2, 4.0))) + .source(TypedTsv[(Int, Double)]("col"), List((0, 1.0), (1, 2.0), (2, 4.0))) + .sink[(Int, Double)](Tsv("prop-col")) { ob => + "correctly propagate columns" in { + ob.toMap shouldBe Map(0 -> 6.0, 1 -> 4.0, 2 -> 1.0) + } + } + .sink[(Int, Double)](Tsv("prop-row")) { ob => + "correctly propagate rows" in { + ob.toMap shouldBe Map(0 -> 4.0, 1 -> 1.0, 2 -> 3.0) + } } - } - .sink[(Int,Double)](Tsv("prop-row")) { ob => - "correctly propagate rows" in { - ob.toMap must be_==(Map(0 -> 4.0, 1 -> 1.0, 2 -> 3.0)) - } - } - .run - .finish + .run + .finish() } } "A MapWithIndex job" 
should { JobTest(new MatrixMapWithVal(_)) - .source(TypedTsv[(Int,Int,Int)]("graph"), List((0,1,1), (1,1,3), (0,2,1), (1,2,1), (2,0,1))) - .source(TypedTsv[(Int,Double)]("row"), List((0,1.0), (1,2.0), (2,4.0))) - .sink[(Int,Double)](Tsv("first")) { ob => + .source(TypedTsv[(Int, Int, Int)]("graph"), List((0, 1, 1), (1, 1, 3), (0, 2, 1), (1, 2, 1), (2, 0, 1))) + .source(TypedTsv[(Int, Double)]("row"), List((0, 1.0), (1, 2.0), (2, 4.0))) + .sink[(Int, Double)](Tsv("first")) { ob => "correctly mapWithIndex on Row" in { - ob.toMap must be_==(Map(0 -> 1.0)) + ob.toMap shouldBe Map(0 -> 1.0) } } - .sink[(Int,Int,Int)](Tsv("diag")) { ob => + .sink[(Int, Int, Int)](Tsv("diag")) { ob => "correctly mapWithIndex on Matrix" in { - toSparseMat(ob) must be_==(Map((1,1) -> 3)) + toSparseMat(ob) shouldBe Map((1, 1) -> 3) } } .run - .finish + .finish() } "A Matrix RowMatProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.RowMatProd") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](Tsv("rowMatPrd")) { ob => - "correctly compute a new row vector" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (2,2)->16.0) ) + JobTest(new RowMatProd(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Double)](Tsv("rowMatPrd")) { ob => + "correctly compute a new row vector" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (2, 2) -> 16.0) + } } - } - .run - .finish + .run + .finish() } } "A Matrix MatColProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.MatColProd") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](Tsv("matColPrd")) { ob => - "correctly compute a new column vector" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((1,1)->1.0) ) + JobTest(new MatColProd(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + 
.sink[(Int, Double)](Tsv("matColPrd")) { ob => + "correctly compute a new column vector" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 1.0) + } } - } - .run - .finish + .run + .finish() } } "A Matrix RowRowDiff job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.RowRowDiff") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](Tsv("rowRowDiff")) { ob => - "correctly subtract row vectors" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (2,2)->1.0) ) + JobTest(new RowRowDiff(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Double)](Tsv("rowRowDiff")) { ob => + "correctly subtract row vectors" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (2, 2) -> 1.0) + } } - } - .run - .finish + .run + .finish() } } "A Matrix VctOuterProd job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.VctOuterProd") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Int,Double)](Tsv("outerProd")) { ob => - "correctly compute the outer product of a column and row vector" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (1,2)->4.0, (2,1) -> 4.0, (2,2)->16.0) ) + JobTest(new VctOuterProd(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Int, Double)](Tsv("outerProd")) { ob => + "correctly compute the outer product of a column and row vector" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (1, 2) -> 4.0, (2, 1) -> 4.0, (2, 2) -> 16.0) + } } - } - .run - .finish + .run + .finish() } } "A Matrix RowRowSum job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.RowRowSum") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](Tsv("rowRowSum")) { ob => - "correctly add row vectors" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( 
Map((1,1)->2.0, (2,2)->8.0) ) + JobTest(new RowRowSum(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Double)](Tsv("rowRowSum")) { ob => + "correctly add row vectors" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 2.0, (2, 2) -> 8.0) + } } - } - .run - .finish + .run + .finish() } } "A Matrix RowRowHad job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.RowRowHad") - .source(Tsv("mat1",('x1,'y1,'v1)), List((1,1,1.0),(2,2,3.0),(1,2,4.0))) - .sink[(Int,Double)](Tsv("rowRowHad")) { ob => - "correctly compute a Hadamard product of row vectors" in { - val pMap = oneDtoSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (2,2)->16.0) ) + JobTest(new RowRowHad(_)) + .source(Tsv("mat1", ('x1, 'y1, 'v1)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0))) + .sink[(Int, Double)](Tsv("rowRowHad")) { ob => + "correctly compute a Hadamard product of row vectors" in { + oneDtoSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (2, 2) -> 16.0) + } } - } - .run - .finish + .run + .finish() } - } - + } + "A FilterMatrix job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.FilterMatrix") - .source(Tsv("mat1",('x,'y,'v)), List((1,1,1.0),(2,2,3.0),(1,2,4.0),(2,1,2.0))) - .source(Tsv("mat2",('x,'y,'v)), List((1,1,5.0),(2,2,9.0))) - .sink[(Int,Int,Double)](Tsv("removeMatrix")) { ob => - "correctly remove elements" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,2)->4.0, (2,1)->2.0) ) - } - } - .sink[(Int,Int,Double)](Tsv("keepMatrix")) { ob => - "correctly keep elements" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,1)->1.0, (2,2)->3.0) ) + JobTest(new FilterMatrix(_)) + .source(Tsv("mat1", ('x, 'y, 'v)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0), (2, 1, 2.0))) + .source(Tsv("mat2", ('x, 'y, 'v)), List((1, 1, 5.0), (2, 2, 9.0))) + .sink[(Int, Int, Double)](Tsv("removeMatrix")) { ob => + "correctly remove elements" in { + toSparseMat(ob) shouldBe Map((1, 2) -> 4.0, (2, 1) -> 
2.0) + } + } + .sink[(Int, Int, Double)](Tsv("keepMatrix")) { ob => + "correctly keep elements" in { + toSparseMat(ob) shouldBe Map((1, 1) -> 1.0, (2, 2) -> 3.0) + } } - } - .run - .finish + .run + .finish() } } "A KeepRowsCols job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.KeepRowsCols") - .source(Tsv("mat1",('x,'y,'v)), List((1,1,1.0),(2,2,3.0),(1,2,4.0),(2,1,2.0))) - .source(Tsv("col1",('x,'v)), List((1,5.0))) - .sink[(Int,Int,Double)](Tsv("keepRows")) { ob => - "correctly keep row vectors" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((1,2)->4.0, (1,1)->1.0) ) + JobTest(new KeepRowsCols(_)) + .source(Tsv("mat1", ('x, 'y, 'v)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0), (2, 1, 2.0))) + .source(Tsv("col1", ('x, 'v)), List((1, 5.0))) + .sink[(Int, Int, Double)](Tsv("keepRows")) { ob => + "correctly keep row vectors" in { + toSparseMat(ob) shouldBe Map((1, 2) -> 4.0, (1, 1) -> 1.0) + } + } + .sink[(Int, Int, Double)](Tsv("keepCols")) { ob => + "correctly keep col vectors" in { + toSparseMat(ob) shouldBe Map((2, 1) -> 2.0, (1, 1) -> 1.0) + } } - } - .sink[(Int,Int,Double)](Tsv("keepCols")) { ob => - "correctly keep col vectors" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((2,1)->2.0, (1,1)->1.0) ) - } - } - .run - .finish + .run + .finish() } } "A RemoveRowsCols job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.RemoveRowsCols") - .source(Tsv("mat1",('x,'y,'v)), List((1,1,1.0),(2,2,3.0),(1,2,4.0),(2,1,2.0))) - .source(Tsv("col1",('x,'v)), List((1,5.0))) - .sink[(Int,Int,Double)](Tsv("removeRows")) { ob => - "correctly keep row vectors" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((2,2)->3.0, (2,1)->2.0) ) + JobTest(new RemoveRowsCols(_)) + .source(Tsv("mat1", ('x, 'y, 'v)), List((1, 1, 1.0), (2, 2, 3.0), (1, 2, 4.0), (2, 1, 2.0))) + .source(Tsv("col1", ('x, 'v)), List((1, 5.0))) + .sink[(Int, Int, Double)](Tsv("removeRows")) { ob => + "correctly keep row vectors" in { + 
toSparseMat(ob) shouldBe Map((2, 2) -> 3.0, (2, 1) -> 2.0) + } + } + .sink[(Int, Int, Double)](Tsv("removeCols")) { ob => + "correctly keep col vectors" in { + toSparseMat(ob) shouldBe Map((2, 2) -> 3.0, (1, 2) -> 4.0) + } } - } - .sink[(Int,Int,Double)](Tsv("removeCols")) { ob => - "correctly keep col vectors" in { - val pMap = toSparseMat(ob) - pMap must be_==( Map((2,2)->3.0, (1,2)->4.0) ) - } - } - .run - .finish + .run + .finish() } } "A Scalar Row Right job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarRowRight") - .source(Tsv("sca1", ('v)), List(3.0)) - .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("scalarRowRight")) { ob => - "correctly compute a new row vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .sink[(Int, Double)](Tsv("scalarObjRowRight")) { ob => - "correctly compute a new row vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ScalarRowRight(_)) + .source(Tsv("sca1", 'v), List(3.0)) + .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("scalarRowRight")) { ob => + s"$idx: correctly compute a new row vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("scalarObjRowRight")) { ob => + s"$idx: correctly compute a new row vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .run + .finish() } } "A Scalar Row Left job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarRowLeft") - .source(Tsv("sca1", ('v)), List(3.0)) - .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("scalarRowLeft")) { ob => - "correctly compute a new row vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - 
.sink[(Int, Double)](Tsv("scalarObjRowLeft")) { ob => - "correctly compute a new row vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ScalarRowLeft(_)) + .source(Tsv("sca1", 'v), List(3.0)) + .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("scalarRowLeft")) { ob => + s"$idx: correctly compute a new row vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("scalarObjRowLeft")) { ob => + s"$idx: correctly compute a new row vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .run + .finish() } } "A Scalar Col Right job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarColRight") - .source(Tsv("sca1", ('v)), List(3.0)) - .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("scalarColRight")) { ob => - "correctly compute a new col vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .sink[(Int, Double)](Tsv("scalarObjColRight")) { ob => - "correctly compute a new col vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ScalarColRight(_)) + .source(Tsv("sca1", 'v), List(3.0)) + .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("scalarColRight")) { ob => + s"$idx: correctly compute a new col vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("scalarObjColRight")) { ob => + s"$idx: correctly compute a new col vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .run + .finish() } } "A Scalar Col Left job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarColLeft") - 
.source(Tsv("sca1", ('v)), List(3.0)) - .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("scalarColLeft")) { ob => - "correctly compute a new col vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .sink[(Int, Double)](Tsv("scalarObjColLeft")) { ob => - "correctly compute a new col vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ScalarColLeft(_)) + .source(Tsv("sca1", 'v), List(3.0)) + .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("scalarColLeft")) { ob => + s"$idx: correctly compute a new col vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("scalarObjColLeft")) { ob => + s"$idx: correctly compute a new col vector" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .run + .finish() } } "A Scalar Diag Right job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarDiagRight") - .source(Tsv("sca1", ('v)), List(3.0)) - .source(Tsv("diag1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("scalarDiagRight")) { ob => - "correctly compute a new diag matrix" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .sink[(Int, Double)](Tsv("scalarObjDiagRight")) { ob => - "correctly compute a new diag matrix" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ScalarDiagRight(_)) + .source(Tsv("sca1", 'v), List(3.0)) + .source(Tsv("diag1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("scalarDiagRight")) { ob => + s"$idx: correctly compute a new diag matrix" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .sink[(Int, 
Double)](Tsv("scalarObjDiagRight")) { ob => + s"$idx: correctly compute a new diag matrix" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .run + .finish() } } "A Scalar Diag Left job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ScalarDiagLeft") - .source(Tsv("sca1", ('v)), List(3.0)) - .source(Tsv("diag1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("scalarDiagLeft")) { ob => - "correctly compute a new diag matrix" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .sink[(Int, Double)](Tsv("scalarObjDiagLeft")) { ob => - "correctly compute a new diag matrix" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ScalarDiagLeft(_)) + .source(Tsv("sca1", 'v), List(3.0)) + .source(Tsv("diag1", ('x, 'v)), List((1, 1.0), (2, 2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("scalarDiagLeft")) { ob => + s"$idx: correctly compute a new diag matrix" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("scalarObjDiagLeft")) { ob => + s"$idx: correctly compute a new diag matrix" in { + ob.toMap shouldBe Map(1 -> 3.0, 2 -> 6.0, 3 -> 18.0) + } + idx += 1 + } + .run + .finish() } } - "A Col Normalizing job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.ColNormalize") - .source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, -2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("colLZeroNorm")) { ob => - "correctly compute a new col vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> (1.0/3.0), 2 -> (-2.0/3.0), 3 -> (6.0/3.0)) ) - } - } - .sink[(Int, Double)](Tsv("colLOneNorm")) { ob => - "correctly compute a new col vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> (1.0/9.0), 2 -> (-2.0/9.0), 3 -> (6.0/9.0)) ) - } - } - .run - .finish + var idx = 0 + JobTest(new ColNormalize(_)) + 
.source(Tsv("col1", ('x, 'v)), List((1, 1.0), (2, -2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("colLZeroNorm")) { ob => + s"$idx: correctly compute a new col vector" in { + ob.toMap shouldBe Map(1 -> (1.0 / 3.0), 2 -> (-2.0 / 3.0), 3 -> (6.0 / 3.0)) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("colLOneNorm")) { ob => + s"$idx: correctly compute a new col vector" in { + ob.toMap shouldBe Map(1 -> (1.0 / 9.0), 2 -> (-2.0 / 9.0), 3 -> (6.0 / 9.0)) + } + idx += 1 + } + .run + .finish() } } "A Col Diagonal job" should { TUtil.printStack { "correctly compute the size of the diagonal matrix" in { - val col = new ColDiagonal(Mode.putMode(new Test(Map.empty), new Args(Map.empty))) - col.sizeHintTotal must be_==(100L) - } + val col = new ColDiagonal(Mode.putMode(new Test(Map.empty), new Args(Map.empty))) + col.sizeHintTotal shouldBe 100L + } } } "A Row Normalizing job" should { TUtil.printStack { - JobTest("com.twitter.scalding.mathematics.RowNormalize") - .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, -2.0), (3, 6.0))) - .sink[(Int, Double)](Tsv("rowLZeroNorm")) { ob => - "correctly compute a new row vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> (1.0/3.0), 2 -> (-2.0/3.0), 3 -> (6.0/3.0)) ) - } - } - .sink[(Int, Double)](Tsv("rowLOneNorm")) { ob => - "correctly compute a new row vector" in { - val pMap = ob.toMap - pMap must be_==( Map(1 -> (1.0/9.0), 2 -> (-2.0/9.0), 3 -> (6.0/9.0)) ) - } - } - .run - .finish + var idx = 0 + JobTest(new RowNormalize(_)) + .source(Tsv("row1", ('x, 'v)), List((1, 1.0), (2, -2.0), (3, 6.0))) + .sink[(Int, Double)](Tsv("rowLZeroNorm")) { ob => + s"$idx: correctly compute a new row vector" in { + ob.toMap shouldBe Map(1 -> (1.0 / 3.0), 2 -> (-2.0 / 3.0), 3 -> (6.0 / 3.0)) + } + idx += 1 + } + .sink[(Int, Double)](Tsv("rowLOneNorm")) { ob => + s"$idx: correctly compute a new row vector" in { + ob.toMap shouldBe Map(1 -> (1.0 / 9.0), 2 -> (-2.0 / 9.0), 3 -> (6.0 / 9.0)) + } + idx += 1 + } + .run + .finish() } } - - - - } diff 
--git a/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala new file mode 100644 index 0000000000..7324630bd5 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/mathematics/TypedSimilarityTest.scala @@ -0,0 +1,126 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.mathematics + +import com.twitter.scalding._ +import com.twitter.algebird.MapAlgebra.dot +import com.twitter.algebird.Group + +import TDsl._ + +import org.scalatest.{Matchers, WordSpec} + +import GraphOperations._ + +class TypedCosineSimJob(args: Args) extends Job(args) { + + // val simOf = new ExactInCosine[Int]() + val simOf = new DiscoInCosine[Int](0.001, 0.1, 0.01) + val graph = withInDegree { + TypedTsv[(Int, Int)]("ingraph") + .map { case (from, to) => Edge(from, to, ()) } + } + // Just keep the degree + .map(edge => edge.mapData(_._2)) + + simOf(graph, { n: Int => n % 2 == 0 }, { n: Int => n % 2 == 1 }) + .map(edge => (edge.from, edge.to, edge.data)) + .write(TypedTsv[(Int, Int, Double)]("out")) +} + +class TypedDimsumCosineSimJob(args: Args) extends Job(args) { + + val simOf = new DimsumInCosine[Int](0.001, 0.1, 0.01) + val graph = withInNorm { + TypedTsv[(Int, Int, Double)]("ingraph") + .map { case (from, to, weight) => Edge(from, to, Weight(weight)) } + } + + simOf(graph, { n: Int => n % 2 == 0 }, { n: Int => 
n % 2 == 1 }) + .map(edge => (edge.from, edge.to, edge.data)) + .toPipe('from, 'to, 'data) + .write(TypedTsv[(Int, Int, Double)]("out")) +} + +class TypedSimilarityTest extends WordSpec with Matchers { + val nodes = 50 + val rand = new java.util.Random(1) + val edges = (0 to nodes).flatMap { n => + // try to get at least 6 edges for each node + (0 to ((nodes / 5).max(6))).foldLeft(Set[(Int, Int)]()) { (set, idx) => + if (set.size > 6) { set } + else { + set + (n -> rand.nextInt(nodes)) + } + } + }.toSeq + + val MaxWeight = 2 + val weightedEdges = (0 to nodes).flatMap { n => + // try to get at least 10 edges for each node + (0 to ((nodes / 5).max(10))).foldLeft(Set[(Int, Int, Double)]()) { (set, idx) => + if (set.size > 10) { set } + else { + set + ((n, rand.nextInt(nodes), rand.nextDouble * MaxWeight)) + } + } + }.toSeq + + def cosineOf(es: Seq[(Int, Int)]): Map[(Int, Int), Double] = { + // Get followers of each node: + val matrix: Map[Int, Map[Int, Double]] = + es.groupBy(_._2).mapValues(seq => seq.map { case (from, to) => (from, 1.0) }.toMap) + for { + (k1, v1) <- matrix if k1 % 2 == 0 + (k2, v2) <- matrix if k2 % 2 == 1 + } yield ((k1, k2) -> (dot(v1, v2) / scala.math.sqrt(dot(v1, v1) * dot(v2, v2)))) + } + + def weightedCosineOf(es: Seq[(Int, Int, Double)]): Map[(Int, Int), Double] = { + // Get followers of each node: + val matrix: Map[Int, Map[Int, Double]] = + es.groupBy(_._2).mapValues(seq => seq.map { case (from, to, weight) => (from, weight) }.toMap) + for { + (k1, v1) <- matrix if k1 % 2 == 0 + (k2, v2) <- matrix if k2 % 2 == 1 + } yield ((k1, k2) -> (dot(v1, v2) / scala.math.sqrt(dot(v1, v1) * dot(v2, v2)))) + } + + "A TypedCosineJob" should { + "compute cosine similarity" in { + JobTest(new TypedCosineSimJob(_)) + .source(TypedTsv[(Int, Int)]("ingraph"), edges) + .sink[(Int, Int, Double)](TypedTsv[(Int, Int, Double)]("out")) { ob => + val result = ob.map { case (n1, n2, d) => ((n1 -> n2) -> d) }.toMap + val error = Group.minus(result, cosineOf(edges)) + 
dot(error, error) should be < 0.001 + } + .run + .finish() + } + "compute dimsum cosine similarity" in { + JobTest(new TypedDimsumCosineSimJob(_)) + .source(TypedTsv[(Int, Int, Double)]("ingraph"), weightedEdges) + .sink[(Int, Int, Double)](TypedTsv[(Int, Int, Double)]("out")) { ob => + val result = ob.map { case (n1, n2, d) => ((n1 -> n2) -> d) }.toMap + val error = Group.minus(result, weightedCosineOf(weightedEdges)) + dot(error, error) should be < (0.01 * error.size) + } + .run + .finish() + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/source/TypedTextTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/source/TypedTextTest.scala new file mode 100644 index 0000000000..a36a818c88 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/source/TypedTextTest.scala @@ -0,0 +1,38 @@ +package com.twitter.scalding.source + +import org.scalatest.FunSuite + +case class Test1(a: Int, b: Long, c: Option[Double]) +case class Test2(one: Test1, d: String) + +class TypedTextTest extends FunSuite { + test("Test with a flat tuple") { + val source = TypedText.tsv[Test1]("myPath") + assert(source.sourceFields.size == 3) + } + + test("Test with a nested tuple") { + val source = TypedText.tsv[Test2]("myPath") + assert(source.sourceFields.size == 4) + } + + test("Test with a raw type") { + val source = TypedText.tsv[String]("myPath") + assert(source.sourceFields.size == 1) + } + + test("Test with a tuple") { + val source = TypedText.tsv[(Int, Int)]("myPath") + assert(source.sourceFields.size == 2) + } + + test("Test with an Optional Int") { + val source = TypedText.tsv[Option[Int]]("myPath") + assert(source.sourceFields.size == 1) + } + + test("Test with an Int") { + val source = TypedText.tsv[Int]("myPath") + assert(source.sourceFields.size == 1) + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala 
b/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala new file mode 100644 index 0000000000..7327ccda84 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/BijectedSourceSinkTest.scala @@ -0,0 +1,97 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.typed + +import org.scalatest.{Matchers, WordSpec} + +import com.twitter.scalding._ + +private[typed] object LongIntPacker { + def lr(l: Int, r: Int): Long = (l.toLong << 32) | r + def l(rowCol: Long) = (rowCol >>> 32).toInt + def r(rowCol: Long) = (rowCol & 0xffffffff).toInt +} + +class MutatedSourceJob(args: Args) extends Job(args) { + import com.twitter.bijection._ + implicit val bij: AbstractBijection[Long, (Int, Int)] = new AbstractBijection[Long, (Int, Int)] { + override def apply(x: Long) = (LongIntPacker.l(x), LongIntPacker.r(x)) + override def invert(y: (Int, Int)) = LongIntPacker.lr(y._1, y._2) + } + + val in0: TypedPipe[(Int, Int)] = TypedPipe.from(BijectedSourceSink(TypedTsv[Long]("input0"))) + + in0 + .map { tup: (Int, Int) => + (tup._1 * 2, tup._2 * 2) + } + .write(BijectedSourceSink(TypedTsv[Long]("output"))) +} + +class MutatedSourceTest extends WordSpec with Matchers { + "A MutatedSourceJob" should { + "Not throw when using a converted source" in { + JobTest(new MutatedSourceJob(_)) + .source(TypedTsv[Long]("input0"), List(8L, 4123423431L, 12L)) + .typedSink(TypedTsv[Long]("output")) { outBuf => + val 
unordered = outBuf.toSet + // Size should be unchanged + unordered should have size 3 + + // Simple case, 2*8L won't run into the packer logic + unordered should contain(16L) + // Big one that should be in both the high and low 4 bytes of the Long + val big = 4123423431L + val newBig = LongIntPacker.lr(LongIntPacker.l(big) * 2, LongIntPacker.r(big) * 2) + unordered should contain(newBig) + } + .run + .runHadoop + .finish() + } + } +} + +class ContraMappedAndThenSourceJob(args: Args) extends Job(args) { + TypedPipe + .from(TypedTsv[Long]("input0").andThen(x => (LongIntPacker.l(x), LongIntPacker.r(x)))) + .map { case (l, r) => (l * 2, r * 2) } + .write(TypedTsv[Long]("output").contraMap { case (l, r) => LongIntPacker.lr(l, r) }) +} + +class ContraMappedAndThenSourceTest extends WordSpec with Matchers { + "A ContraMappedAndThenSourceJob" should { + "Not throw when using a converted source" in { + JobTest(new ContraMappedAndThenSourceJob(_)) + .source(TypedTsv[Long]("input0"), List(8L, 4123423431L, 12L)) + .typedSink(TypedTsv[Long]("output")) { outBuf => + val unordered = outBuf.toSet + // Size should be unchanged + unordered should have size 3 + + // Simple case, 2*8L won't run into the packer logic + unordered should contain(16L) + // Big one that should be in both the high and low 4 bytes of the Long + val big = 4123423431L + val newBig = LongIntPacker.lr(LongIntPacker.l(big) * 2, LongIntPacker.r(big) * 2) + unordered should contain(newBig) + } + .run + .runHadoop + .finish() + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala new file mode 100644 index 0000000000..207a24adaf --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/InAnotherPackage.scala @@ -0,0 +1,14 @@ +package com.twitter.example.scalding.typed + +import com.twitter.scalding._ +import scala.concurrent.{ExecutionContext => SExecutionContext, _} 
+import SExecutionContext.Implicits.global + +object InAnotherPackage { + def buildF: Future[TypedPipe[(Int, Int)]] = + Future { + TypedPipe + .from(List(1, 2, 3, 4, 555, 3)) + .map { case x => (x, x) } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala new file mode 100644 index 0000000000..9025f4523c --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/MultiJoinTest.scala @@ -0,0 +1,73 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +import org.scalatest.WordSpec + +import com.twitter.scalding.typed.FlattenGroup._ + +class MultiJoinTest extends WordSpec { + + def addKeys[V](t: Seq[V]): Seq[(Int, V)] = t.iterator.zipWithIndex.map { case (v, k) => (k, v) }.toSeq + + val doubles = TypedPipe.from(addKeys(List(1.0d, 2.0d, 3.0d))) + val longs = TypedPipe.from(addKeys(List(10L, 20L, 30L))) + val strings = TypedPipe.from(addKeys(List("one", "two", "three"))) + val sets = TypedPipe.from(addKeys(List(Set(1), Set(2), Set(3)))) + val maps = TypedPipe.from(addKeys(List(Map(1 -> 1), Map(2 -> 2), Map(3 -> 3)))) + + val joined = doubles.join(longs).join(strings).join(sets).join(maps) + val leftJoined = doubles.leftJoin(longs).leftJoin(strings).leftJoin(sets).leftJoin(maps) + val outerJoined = doubles.outerJoin(longs).outerJoin(strings).outerJoin(sets).outerJoin(maps) + + // note that these tests are essentially compile-time tests, all + // we are testing is that this compiles + + "The flatten methods" should { + "actually match the outputs of joins" in { + + val joinedFlat: CoGrouped[Int, (Double, Long, String, Set[Int], Map[Int, Int])] = + joined.mapValues(x => flattenNestedTuple(x)) + + val leftJoinedFlat + : CoGrouped[Int, (Double, Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = + leftJoined.mapValues(x => flattenNestedTuple(x)) + + val outerJoinedFlat: CoGrouped[ + Int, + (Option[Double], Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]]) + ] = + outerJoined.mapValues(x => flattenNestedOptionTuple(x)) + } + + "Have implicit flattenValueTuple methods for low arity" in { + + val joinedFlat: CoGrouped[Int, (Double, Long, String, Set[Int], Map[Int, Int])] = + joined.flattenValueTuple + + val leftJoinedFlat + : CoGrouped[Int, (Double, Option[Long], Option[String], Option[Set[Int]], Option[Map[Int, Int]])] = + leftJoined.flattenValueTuple + + val outerJoinedFlat: CoGrouped[ + Int, + (Option[Double], Option[Long], 
Option[String], Option[Set[Int]], Option[Map[Int, Int]]) + ] = + outerJoined.flattenValueTuple + } + + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala new file mode 100644 index 0000000000..97ce3360a8 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/NoStackLineNumberTest.scala @@ -0,0 +1,52 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.typed + +import org.scalatest.WordSpec + +import com.twitter.scalding._ +import scala.concurrent.{ExecutionContext => SExecutionContext, _} +import SExecutionContext.Implicits.global +import scala.concurrent.duration.{Duration => SDuration} + +import cascading.flow.FlowDef +import org.apache.hadoop.conf.Configuration + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +class NoStackLineNumberTest extends WordSpec { + + "No Stack Shouldn't block getting line number info" should { + "actually get the no stack info" in { + import Dsl._ + implicit val fd: FlowDef = new FlowDef + implicit val m: Hdfs = new Hdfs(false, new Configuration) + + val pipeFut = com.twitter.example.scalding.typed.InAnotherPackage.buildF.map { tp => + tp.toPipe('a, 'b) + } + val pipe = Await.result(pipeFut, SDuration.Inf) + // We pick up line number info via TypedPipe.withLine + // So this should have some non-scalding info in it. + val allDesc = RichPipe(pipe).upstreamPipes + .map(RichPipe.getPipeDescriptions(_).toSet) + .foldLeft(Set.empty[String])(_ | _) + + assert(allDesc.size > 0) + assert(allDesc.exists(_.contains("com.twitter.example.scalding.typed.InAnotherPackage"))) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala new file mode 100644 index 0000000000..bf5b890aff --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/OptimizationRulesTest.scala @@ -0,0 +1,785 @@ +package com.twitter.scalding.typed + +import cascading.flow.FlowDef +import cascading.tuple.Fields +import com.twitter.scalding.dagon.{Dag, Rule} +import com.twitter.algebird.Monoid +import com.twitter.scalding.source.{NullSink, TypedText} +import org.apache.hadoop.conf.Configuration +import com.twitter.scalding.{Config, ExecutionContext, FlowState, FlowStateMap, Hdfs, IterableSource, Local} +import 
com.twitter.scalding.typed.cascading_backend.{CascadingBackend, CascadingExtensions} +import com.twitter.scalding.typed.memory_backend.MemoryMode +import org.scalatest.FunSuite +import org.scalatest.prop.PropertyChecks +import org.scalacheck.{Arbitrary, Gen} +import scala.util.{Failure, Success, Try} + +import CascadingExtensions._ + +object TypedPipeGen { + val srcGen: Gen[TypedPipe[Int]] = { + val g1 = Gen.listOf(Arbitrary.arbitrary[Int]).map(TypedPipe.from(_)) + val src = Gen.identifier.map(f => TypedPipe.from(TypedText.tsv[Int](f))) + Gen.oneOf(g1, src, Gen.const(TypedPipe.empty)) + } + + def mapped(srcGen: Gen[TypedPipe[Int]]): Gen[TypedPipe[Int]] = { + val commonFreq = 10 + val next1: Gen[TypedPipe[Int] => TypedPipe[Int]] = + Gen.frequency( + ( + 1, + tpGen(srcGen).map { p: TypedPipe[Int] => x: TypedPipe[Int] => + x.cross(p).keys + } + ), + ( + 2, + tpGen(srcGen).map { p: TypedPipe[Int] => x: TypedPipe[Int] => + x.cross(ValuePipe(2)).values + } + ), + // Gen.const({ t: TypedPipe[Int] => t.debug }), debug spews a lot to the terminal + ( + commonFreq, + Arbitrary.arbitrary[Int => Boolean].map { fn => t: TypedPipe[Int] => + t.filter(fn) + } + ), + ( + commonFreq, + Arbitrary.arbitrary[Int => Int].map { fn => t: TypedPipe[Int] => + t.map(fn) + } + ), + ( + commonFreq, + Arbitrary.arbitrary[Int => List[Int]].map { fn => t: TypedPipe[Int] => + t.flatMap(fn.andThen(_.take(4))) // the take is to not get too big + } + ), + (2, Gen.const { t: TypedPipe[Int] => t.forceToDisk }), + (2, Gen.const { t: TypedPipe[Int] => t.fork }), + ( + 5, + tpGen(srcGen).map { p: TypedPipe[Int] => x: TypedPipe[Int] => + x ++ p + } + ), + ( + 1, + Gen.identifier.map { id => t: TypedPipe[Int] => + t.withDescription(id) + } + ) + ) + + val one = for { + n <- next1 + p <- tpGen(srcGen) + } yield n(p) + + val next2: Gen[TypedPipe[(Int, Int)] => TypedPipe[Int]] = + Gen.oneOf( + Gen.const { p: TypedPipe[(Int, Int)] => p.values }, + Gen.const { p: TypedPipe[(Int, Int)] => p.keys } + ) + + val two 
= for { + n <- next2 + p <- keyed(srcGen) + } yield n(p) + + Gen.frequency((4, one), (1, two)) + } + + def keyed(srcGen: Gen[TypedPipe[Int]]): Gen[TypedPipe[(Int, Int)]] = { + val keyRec = Gen.lzy(keyed(srcGen)) + val one = Gen.oneOf( + for { + single <- tpGen(srcGen) + fn <- Arbitrary.arbitrary[Int => (Int, Int)] + } yield single.map(fn), + for { + single <- tpGen(srcGen) + fn <- Arbitrary.arbitrary[Int => List[(Int, Int)]] + } yield single.flatMap(fn.andThen(_.take(4))) // take to not get too big + ) + + val two = Gen.oneOf( + for { + fn <- Arbitrary.arbitrary[Int => Boolean] + pair <- keyRec + } yield pair.filterKeys(fn), + for { + fn <- Arbitrary.arbitrary[Int => List[Int]] + pair <- keyRec + } yield pair.flatMapValues(fn.andThen(_.take(4))), // take to not get too big + for { + fn <- Arbitrary.arbitrary[Int => Int] + pair <- keyRec + } yield pair.mapValues(fn), + for { + pair <- keyRec + } yield pair.sumByKey.toTypedPipe, + for { + pair <- keyRec + } yield pair.sumByLocalKeys, + for { + pair <- keyRec + } yield pair.group.mapGroup((k, its) => its).toTypedPipe, + for { + pair <- keyRec + } yield pair.group.sorted.mapGroup((k, its) => its).toTypedPipe, + for { + pair <- keyRec + } yield pair.group.sorted.withReducers(2).mapGroup((k, its) => its).toTypedPipe, + for { + p1 <- keyRec + p2 <- keyRec + } yield p1.hashJoin(p2).mapValues { case (a, b) => 31 * a + b }, + for { + p1 <- keyRec + p2 <- keyRec + } yield p1.join(p2).values, + for { + p1 <- keyRec + p2 <- keyRec + } yield p1.join(p2).mapValues { case (a, b) => a + 31 * b }.toTypedPipe + ) + + // bias to consuming Int, since the we can stack overflow with the (Int, Int) + // cases + Gen.frequency((2, one), (1, two)) + } + + def tpGen(srcGen: Gen[TypedPipe[Int]]): Gen[TypedPipe[Int]] = + Gen.lzy(Gen.frequency((1, srcGen), (1, mapped(srcGen)))) + + /** + * This generates a TypedPipe that can't necessarily be run because it has fake sources + */ + val genWithFakeSources: Gen[TypedPipe[Int]] = tpGen(srcGen) + + 
/** + * This can always be run because all the sources are Iterable sources + */ + val genWithIterableSources: Gen[TypedPipe[Int]] = + Gen + .choose(0, 16) // don't make giant lists which take too long to evaluate + .flatMap { sz => + tpGen(Gen.listOfN(sz, Arbitrary.arbitrary[Int]).map(TypedPipe.from(_))) + } + + val genKeyedWithFake: Gen[TypedPipe[(Int, Int)]] = + keyed(srcGen) + + import OptimizationRules._ + + val allRules = List( + AddExplicitForks, + ComposeDescriptions, + ComposeFlatMap, + ComposeMap, + ComposeFilter, + ComposeWithOnComplete, + ComposeMapFlatMap, + ComposeFilterFlatMap, + ComposeFilterMap, + ComposeReduceSteps, + DescribeLater, + DiamondToFlatMap, + RemoveDuplicateForceFork, + IgnoreNoOpGroup, + DeferMerge, + FilterKeysEarly, + FilterLocally, + // EmptyIsOftenNoOp, this causes confluence problems when combined with other rules randomly. + // Have to be careful about the order it is applied + EmptyIterableIsEmpty, + HashToShuffleCoGroup, + ForceToDiskBeforeHashJoin, + MapValuesInReducers + ) + + def genRuleFrom(rs: List[Rule[TypedPipe]]): Gen[Rule[TypedPipe]] = + for { + c <- Gen.choose(1, rs.size) + rs <- Gen.pick(c, rs) + } yield rs.reduce(_.orElse(_)) + + val genRule: Gen[Rule[TypedPipe]] = genRuleFrom(allRules) + + // How many steps would this be in Hadoop on Cascading + def steps[A](p: TypedPipe[A]): Int = { + val mode = Hdfs.default + val fd = new FlowDef + val pipe = CascadingBackend.toPipeUnoptimized(p, NullSink.sinkFields)(fd, mode, NullSink.setter) + NullSink.writeFrom(pipe)(fd, mode) + val ec = ExecutionContext.newContext(Config.defaultFrom(mode))(fd, mode) + val flow = ec.buildFlow.get.get + flow.getFlowSteps.size + } + + // How many steps would this be in Hadoop on Cascading + def optimizedSteps[A](rs: List[Rule[TypedPipe]], maxSteps: Int)(pipe: TypedPipe[A]) = { + val (dag, id) = Dag(pipe, OptimizationRules.toLiteral) + val optDag = dag.applySeq(rs) + val optPipe = optDag.evaluate(id) + val s = steps(optPipe) + assert(s <= 
maxSteps, s"$s > $maxSteps. optimized: $optPipe") + } +} + +/** + * Used to test that we call phases + */ +class ThrowingOptimizer extends OptimizationPhases { + def phases = sys.error("booom") +} + +/** + * Just convert everything to a constant so we can check that the optimization was applied + */ +class ConstantOptimizer extends OptimizationPhases { + def phases = List(new Rule[TypedPipe] { + def apply[T](on: Dag[TypedPipe]) = { t => + Some(TypedPipe.empty) + } + }) +} + +// we need to extend PropertyChecks, it seems, to control the number of successful runs +// for optimization rules, we want to do many tests +class OptimizationRulesTest extends FunSuite with PropertyChecks { + import OptimizationRules.toLiteral + import TypedPipeGen.optimizedSteps + + def invert[T](t: TypedPipe[T]) = + assert(toLiteral(t).evaluate == t) + + test("randomly generated TypedPipe trees are invertible") { + forAll(TypedPipeGen.genWithFakeSources) { (t: TypedPipe[Int]) => + invert(t) + } + } + + test("optimization rules are reproducible") { + import TypedPipeGen.{genWithFakeSources, genRule} + + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) + forAll(genWithFakeSources, genRule) { (t, rule) => + val optimized = Dag.applyRule(t, toLiteral, rule) + val optimized2 = Dag.applyRule(t, toLiteral, rule) + assert(optimized == optimized2) + } + } + + test("standard rules are reproducible") { + import TypedPipeGen.genWithFakeSources + + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) + forAll(genWithFakeSources) { t => + val (dag1, id1) = Dag(t, toLiteral) + val opt1 = dag1.applySeq(OptimizationRules.standardMapReduceRules) + val t1 = opt1.evaluate(id1) + + val (dag2, id2) = Dag(t, toLiteral) + val opt2 = dag2.applySeq(OptimizationRules.standardMapReduceRules) + val t2 = opt2.evaluate(id2) + assert(t1 == t2) + } + } + + def optimizationLaw[T: Ordering](t: 
TypedPipe[T], rule: Rule[TypedPipe]) = { + val optimized = Dag.applyRule(t, toLiteral, rule) + + // We don't want any further optimization on this job + val conf = Config.empty.setOptimizationPhases(classOf[EmptyOptimizationPhases]) + assert( + TypedPipeDiff + .diff(t, optimized) + .toIterableExecution + .waitFor(conf, Local(true)) + .get + .isEmpty + ) + } + + def optimizationLawMemory[T: Ordering](t: TypedPipe[T], rule: Rule[TypedPipe]) = { + val optimized = Dag.applyRule(t, toLiteral, rule) + + // We don't want any further optimization on this job + val conf = Config.empty.setOptimizationPhases(classOf[EmptyOptimizationPhases]) + assert( + TypedPipeDiff + .diff(t, optimized) + .toIterableExecution + .waitFor(conf, MemoryMode.empty) + .get + .isEmpty + ) + } + + def optimizationReducesSteps[T](init: TypedPipe[T], rule: Rule[TypedPipe]) = { + val optimized = Dag.applyRule(init, toLiteral, rule) + + assert(TypedPipeGen.steps(init) >= TypedPipeGen.steps(optimized)) + } + + test("all optimization rules don't change results") { + import TypedPipeGen.{genWithIterableSources, genRule} + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 50) + forAll(genWithIterableSources, genRule)(optimizationLaw[Int] _) + } + + test("dediamonding never changes results") { + import TypedPipeGen.genWithIterableSources + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 50) + forAll(genWithIterableSources)(optimizationLawMemory[Int](_, OptimizationRules.DeDiamondMappers)) + } + + test("some past failures of the optimizationLaw") { + val arg01 = TypedPipe.empty.withDescription("foo") ++ TypedPipe.empty.withDescription("bar") + optimizationLaw[Int](arg01, Rule.empty) + } + + test("all optimization rules do not increase steps") { + import TypedPipeGen.{allRules, genWithIterableSources, genRuleFrom} + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + 
PropertyCheckConfiguration(minSuccessful = 200) + + val possiblyIncreasesSteps: Set[Rule[TypedPipe]] = + Set( + OptimizationRules.AddExplicitForks, // explicit forks can cause cascading to add steps instead of recomputing values + OptimizationRules.ForceToDiskBeforeHashJoin, // adding a forceToDisk can increase the number of steps + OptimizationRules.HashToShuffleCoGroup // obviously changing a hashjoin to a cogroup can increase steps + ) + + val gen = genRuleFrom(allRules.filterNot(possiblyIncreasesSteps)) + + forAll(genWithIterableSources, gen)(optimizationReducesSteps[Int] _) + } + + test("ThrowingOptimizer is triggered") { + forAll(TypedPipeGen.genWithFakeSources) { t => + val conf = new Configuration() + conf.set(Config.OptimizationPhases, classOf[ThrowingOptimizer].getName) + implicit val mode = Hdfs(true, conf) + implicit val fd = new FlowDef + Try(CascadingBackend.toPipe(t, new Fields("value"))) match { + case Failure(ex) => assert(ex.getMessage == "booom") + case Success(res) => fail(s"expected failure, got $res") + } + } + + forAll(TypedPipeGen.genWithFakeSources) { t => + val ex = t.toIterableExecution + + val config = Config.empty.setOptimizationPhases(classOf[ThrowingOptimizer]) + ex.waitFor(config, Local(true)) match { + case Failure(ex) => assert(ex.getMessage == "booom") + case Success(res) => fail(s"expected failure, got $res") + } + } + } + + test("ConstantOptimizer is triggered") { + forAll(TypedPipeGen.genWithFakeSources) { t => + val conf = new Configuration() + conf.set(Config.OptimizationPhases, classOf[ConstantOptimizer].getName) + implicit val mode = Hdfs(true, conf) + implicit val fd = new FlowDef + Try(CascadingBackend.toPipe(t, new Fields("value"))) match { + case Failure(ex) => fail(s"$ex") + case Success(pipe) => + FlowStateMap.get(fd) match { + case None => fail("expected a flow state") + case Some(FlowState(m, _, _)) => + assert(m.size == 1) + m.head._2 match { + case it: IterableSource[_] => + assert(it.iter == Nil) + case _ => + 
fail(s"$m") + } + } + } + } + + forAll(TypedPipeGen.genWithFakeSources) { t => + val ex = t.toIterableExecution + + val config = Config.empty.setOptimizationPhases(classOf[ConstantOptimizer]) + ex.waitFor(config, Local(true)) match { + case Failure(ex) => fail(s"$ex") + case Success(res) => assert(res.isEmpty) + } + } + } + + test("OptimizationRules.toLiteral is invertible on some specific instances") { + + invert(TypedPipe.from(TypedText.tsv[Int]("foo"))) + invert(TypedPipe.from(List(1, 2, 3))) + invert(TypedPipe.from(List(1, 2, 3)).map(_ * 2)) + invert { + TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey.toTypedPipe + } + + invert { + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey + + p.mapGroup((k, its) => Iterator.single(its.sum * k)) + } + + invert { + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey + p.cross(TypedPipe.from(List("a", "b", "c")).sum) + } + + invert { + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey + p.cross(TypedPipe.from(List("a", "b", "c"))) + } + + invert { + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey + p.forceToDisk + } + + invert { + val p = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)).sumByKey + p.fork + } + + invert { + val p1 = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)) + val p2 = TypedPipe.from(TypedText.tsv[(Int, String)]("foo")) + + p1.join(p2).toTypedPipe + } + + invert { + val p1 = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)) + val p2 = TypedPipe.from(TypedText.tsv[(Int, String)]("foo")) + + p1.hashJoin(p2) + } + + invert { + val p1 = TypedPipe.from(List(1, 2, 3)).map(i => (i, i)) + val p2 = TypedPipe.from(TypedText.tsv[(Int, String)]("foo")) + + p1.join(p2).filterKeys(_ % 2 == 0) + } + } + + test("all transforms preserve equality") { + + forAll(TypedPipeGen.genWithFakeSources, TypedPipeGen.genKeyedWithFake) { (tp, keyed) => + val fn0 = { i: Int => i * 2 } + val filterFn = { i: Int => i % 2 == 0 } + val fn1 = { i: Int => (0 to i) } + + def 
eqCheck[T](t: => T) = + assert(t == t) + + eqCheck(tp.map(fn0)) + eqCheck(tp.filter(filterFn)) + eqCheck(tp.flatMap(fn1)) + + eqCheck(keyed.mapValues(fn0)) + eqCheck(keyed.flatMapValues(fn1)) + eqCheck(keyed.filterKeys(filterFn)) + + eqCheck(tp.groupAll) + eqCheck(tp.groupBy(fn0)) + eqCheck(tp.asKeys) + eqCheck(tp.either(keyed)) + eqCheck(keyed.eitherValues(keyed.mapValues(fn0))) + eqCheck(tp.map(fn1).flatten) + eqCheck(keyed.swap) + eqCheck(keyed.keys) + eqCheck(keyed.values) + + val valueFn: (Int, Option[Int]) => String = { (a, b) => a.toString + b.toString } + val valueFn2: (Int, Option[Int]) => List[Int] = { (a, b) => a :: (b.toList) } + val valueFn3: (Int, Option[Int]) => Boolean = { (a, b) => true } + + eqCheck(tp.mapWithValue(LiteralValue(1))(valueFn)) + eqCheck(tp.flatMapWithValue(LiteralValue(1))(valueFn2)) + eqCheck(tp.filterWithValue(LiteralValue(1))(valueFn3)) + + eqCheck(tp.hashLookup(keyed)) + eqCheck(tp.groupRandomly(100)) + val ordInt = implicitly[Ordering[Int]] + eqCheck(tp.distinctBy(fn0)(ordInt)) + } + } + + @annotation.tailrec + final def fib[A](t0: A, t1: A, n: Int)(fn: (A, A) => A): A = + if (n <= 0) t0 + else if (n == 1) t1 + else { + val t2 = fn(t0, t1) + fib(t1, t2, n - 1)(fn) + } + + def isFasterThan[A](millis: Int)(a: => A) = { + val start = System.nanoTime() + val res = a + val end = System.nanoTime() + assert((end - start) / (1000L * 1000L) < millis) + } + + test( + "Dagon relies on fast hashCodes and fast equality. 
Test some example ones to make sure they are not exponential" + ) { + + def testFib(fn: (TypedPipe[Int], TypedPipe[Int]) => TypedPipe[Int]) = + isFasterThan(1000) { + fib(TypedPipe.from(List(0)), TypedPipe.from(List(1, 2)), 45)(fn).hashCode + } + + // Test the ways we can combine pipes + testFib(_ ++ _) + testFib(_.cross(_).map { case (a, b) => a * b }) + testFib { (left, right) => + left.asKeys.join(right.asKeys).keys + } + + assert(TypedPipe.empty == TypedPipe.empty) + // now test equality on a fib merge + // without linear time equality, this fails when fib count is 35, at 50 + // it would take a huge amount of time + isFasterThan(1000) { + assert( + fib(TypedPipe.from(List(0)), TypedPipe.from(List(1)), 50)(_ ++ _) == + fib(TypedPipe.from(List(0)), TypedPipe.from(List(1)), 50)(_ ++ _) + ) + } + } + + test("write should be fast for large graphs") { + val fd = new FlowDef + isFasterThan(1000) { + // for non-linear complexity this will fail + fib(TypedPipe.from(List(0)), TypedPipe.from(List(1, 2)), 45) { (a, b) => + // write should be fast too + a.write(NullSink)(fd, Local(true)) ++ b + } + } + } + + test("joins are merged") { + def kv(s: String) = + TypedPipe.from(TypedText.tsv[(Int, Int)](s)) + + optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + kv("a").join(kv("b")).join(kv("c")) + } + } + + test("needless toTypedPipe is removed") { + + def kv(s: String) = + TypedPipe.from(TypedText.tsv[(Int, Int)](s)) + + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").sumByKey.toTypedPipe.group.max + } + + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").join(kv("b")).toTypedPipe.group.max + } + + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").sumByKey.toTypedPipe.join(kv("b")).toTypedPipe.group.max + } + + optimizedSteps( + 
OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").join(kv("b").sumByKey.toTypedPipe).toTypedPipe.group.max + } + + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").join(kv("b").sumByKey.toTypedPipe.mapValues(_ * 2)).toTypedPipe.group.max + } + + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").join(kv("b").sumByKey.toTypedPipe.flatMapValues(0 to _)).toTypedPipe.group.max + } + + optimizedSteps( + OptimizationRules.standardMapReduceRules ::: List(OptimizationRules.ComposeReduceSteps), + 1 + ) { + kv("a").join(kv("b").sumByKey.toTypedPipe.filterKeys(_ > 2)).toTypedPipe.group.max + } + } + + test("merge before reduce is one step") { + def kv(s: String) = + TypedPipe.from(TypedText.tsv[(Int, Int)](s)) + + optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + (kv("a") ++ kv("b")).sumByKey + } + + optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + (kv("a") ++ kv("b")).join(kv("c")) + } + optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + kv("a").join(kv("b") ++ kv("c")) + } + + optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + val input = kv("a") + val pipe1 = input.mapValues(_ * 2) + val pipe2 = input.mapValues(_ * 3) + (pipe1 ++ pipe2).sumByKey + } + + optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + val input = kv("a") + val pipe1 = input.mapValues(_ * 2) + val pipe2 = input.mapValues(_ * 3) + (pipe1 ++ pipe2).mapValues(_ + 42).sumByKey + } + } + + test("merge after identical reduce reduce is one step") { + def kv(s: String) = + TypedPipe.from(TypedText.tsv[(Int, Int)](s)) + + // TODO: currently fails, this is now 3 steps. 
+ // optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + // (kv("a").join(kv("b")) ++ kv("a").join(kv("c"))) + // } + + // TODO: currently fails, this is now 3 steps. + // optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + // (kv("a").join(kv("b")) ++ kv("c").join(kv("b"))) + // } + + // TODO: currently fails, this is now 3 steps. + // optimizedSteps(OptimizationRules.standardMapReduceRules, 1) { + // (kv("a").sumByKey.mapValues(_ * 10) ++ kv("a").sumByKey) + // } + + } + + test("merging pipes does not make them unplannable") { + val pipe1 = TypedPipe.from(0 to 1000).map { x => + (x, x) + } ++ + (TypedPipe.from(0 to 2000).groupBy(_ % 17).sum.toTypedPipe) + + val pipe2 = TypedPipe.from(0 to 1000) ++ + TypedPipe.from(0 to 2000).filter(_ % 17 == 0) + + val pipe3 = TypedPipe.from(TypedText.tsv[Int]("src1")).map { x => + (x, x) + } ++ + (TypedPipe.from(TypedText.tsv[Int]("src2")).groupBy(_ % 17).sum.toTypedPipe) + + val pipe4 = TypedPipe.from(TypedText.tsv[Int]("src1")) ++ + TypedPipe.from(TypedText.tsv[Int]("src2")).filter(_ % 17 == 0) + + optimizedSteps(OptimizationRules.standardMapReduceRules, 2)(pipe1) + optimizedSteps(OptimizationRules.standardMapReduceRules, 1)(pipe2) + optimizedSteps(OptimizationRules.standardMapReduceRules, 2)(pipe3) + optimizedSteps(OptimizationRules.standardMapReduceRules, 1)(pipe4) + } + + test("we can plan an enormous list of combined typedPipes") { + // set this to 10,000 and use the default Monoid.plus + // and this fails fast, but still planning a giant graph + // is quadratic to apply the optimizations, so it takes + // a long time, but does not stack overflow. 
+ val pipes = (0 to 1000).map(i => TypedPipe.from(List(i))) + val pipe = Monoid.sum(pipes) + + optimizedSteps(OptimizationRules.standardMapReduceRules, 1)(pipe) + } + + test("dediamond map only diamonds") { + def law[A, B: Ordering](root: TypedPipe[A], diamond: TypedPipe[B]) = { + val dag0 = Dag.empty(OptimizationRules.toLiteral) + val (dag1, id1) = dag0.addRoot(root) + val (dag2, id2) = dag1.addRoot(diamond) + val dag3 = dag2.applySeq(OptimizationRules.standardMapReduceRules) + val optDiamond = dag3.evaluate(id2) + val optRoot = dag3.evaluate(id1) + + import TypedPipe._ + optDiamond match { + case WithDescriptionTypedPipe(FlatMapped(r1, _), _) => + assert(r1 == optRoot) + case err => fail(s"expected a single mapper: $err") + } + + // Make sure running this with dediamounding doesn't change the rules + optimizationLawMemory(diamond, OptimizationRules.DeDiamondMappers) + } + + { + val pipe = TypedPipe.from(List(1, 2)) + val diamond = pipe.map(_ + 1) ++ pipe.map(_ + 2) + law(pipe, diamond) + } + + { + // we need to use a flatMap to make sure that none of the optimizations are applied + val pipe: TypedPipe[(Int, Int)] = TypedPipe + .from(0 to 1000) + .flatMap(k => (k % 11, k % 13) :: Nil) + .sumByKey + .toTypedPipe + // Do all kinds of different map only operations, but see them merged down to one flatMap + val diamond = pipe.map(identity) ++ + pipe.mapValues(_ ^ 11) ++ + pipe.flatMapValues(i => (0 until (i % 7))) ++ + pipe.flatMap { case (k, v) => (k, v) :: (v, k) :: Nil } ++ + pipe.filter { case (k, v) => k > v } ++ + pipe.filterKeys(_ % 3 == 0) + law(pipe, diamond) + } + + { + // single forks are removed + val pipe = TypedPipe.from(List(1, 2)).asKeys.sum + val diamond = pipe.map(identity) ++ pipe.map(identity).fork + law(pipe, diamond) + } + + { + // here is a merge that doesn't dediamond, don't mess this us + val pipe = TypedPipe.from(List(1, 2)).map(_ * 2) ++ TypedPipe.from(0 to 100).map(_ * Int.MaxValue) + optimizationLawMemory(pipe, 
OptimizationRules.DeDiamondMappers) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala new file mode 100644 index 0000000000..e38c5aea50 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedDelimitedSourceTest.scala @@ -0,0 +1,69 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package com.twitter.scalding.typed + +import java.io.File + +import scala.io.{Source => ScalaSource} + +import org.scalatest.{Matchers, WordSpec} + +import com.twitter.scalding._ +import TDsl._ + +object PartitionedDelimitedTestSources { + val singlePartition = PartitionedCsv[String, (String, String)]("out", "%s") +} + +class PartitionedDelimitedWriteJob(args: Args) extends Job(args) { + import PartitionedDelimitedTestSources._ + TypedCsv[(String, String, String)]("in") + .map { case (v1, v2, v3) => (v1, (v2, v3)) } + .write(singlePartition) +} + +class PartitionedDelimitedTest extends WordSpec with Matchers { + import PartitionedDelimitedTestSources._ + + "PartitionedDelimited" should { + "write out CSVs" in { + val input = Seq(("A", "X", "1"), ("A", "Y", "2"), ("B", "Z", "3")) + + // Need to save the job to allow, find the temporary directory data was written to + var job: Job = null; + def buildJob(args: Args): Job = { + job = new PartitionedDelimitedWriteJob(args) + job + } + + JobTest(buildJob(_)) + .source(TypedCsv[(String, String, String)]("in"), input) + .runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = new File(testMode.getWritePathFor(singlePartition)) + + directory.listFiles().map(_.getName()).toSet shouldBe Set("A", "B") + + val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000-00000")) + val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000-00001")) + + aSource.getLines.toList shouldBe Seq("X,1", "Y,2") + bSource.getLines.toList shouldBe Seq("Z,3") + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala new file mode 100644 index 0000000000..b318fcc6aa --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/PartitionedTextLineTest.scala @@ -0,0 +1,111 @@ +// Copyright 2014 Commonwealth Bank of Australia +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package com.twitter.scalding.typed + +import java.io.File + +import scala.io.{Source => ScalaSource} + +import org.scalatest.{Matchers, WordSpec} + +import com.twitter.scalding._ + +import TDsl._ + +object PartitionedTextLineTestSources { + val singlePartition = PartitionedTextLine[String]("out", "%s") + val multiplePartition = PartitionedTextLine[(String, String)]("out", "%s/%s") +} + +class PartitionedTextLineSingleWriteJob(args: Args) extends Job(args) { + import PartitionedTextLineTestSources._ + TypedCsv[(String, String)]("in").write(singlePartition) +} + +class PartitionedTextLineMultipleWriteJob(args: Args) extends Job(args) { + import PartitionedTextLineTestSources._ + TypedCsv[(String, String, String)]("in") + .map { case (v1, v2, v3) => ((v1, v2), v3) } + .write(multiplePartition) +} + +class PartitionedTextLineTest extends WordSpec with Matchers { + import PartitionedTextLineTestSources._ + + "PartitionedTextLine" should { + "be able to split output by a single partition" in { + val input = Seq(("A", "1"), ("A", "2"), ("B", "3")) + + // Need to save the job to allow, find the temporary directory data was written to + var job: Job = null; + def buildJob(args: Args): Job = { + job = new PartitionedTextLineSingleWriteJob(args) + job + } + + JobTest(buildJob(_)) + .source(TypedCsv[(String, String)]("in"), input) + .runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = 
new File(testMode.getWritePathFor(singlePartition)) + println(directory) + + directory.listFiles().map(_.getName()).toSet shouldBe Set("A", "B") + + val aSource = ScalaSource.fromFile(new File(directory, "A/part-00000-00000")) + val bSource = ScalaSource.fromFile(new File(directory, "B/part-00000-00001")) + + aSource.getLines.toList shouldBe Seq("1", "2") + bSource.getLines.toList shouldBe Seq("3") + } + "be able to split output by multiple partitions" in { + val input = Seq(("A", "X", "1"), ("A", "Y", "2"), ("B", "Z", "3")) + + // Need to save the job to allow, find the temporary directory data was written to + var job: Job = null; + def buildJob(args: Args): Job = { + job = new PartitionedTextLineMultipleWriteJob(args) + job + } + + JobTest(buildJob(_)) + .source(TypedCsv[(String, String, String)]("in"), input) + .runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = new File(testMode.getWritePathFor(multiplePartition)) + println(directory) + + directory.listFiles.flatMap(d => d.listFiles.map(d.getName + "/" + _.getName)).toSet shouldBe Set( + "A/X", + "A/Y", + "B/Z" + ) + + val axSource = ScalaSource.fromFile(new File(directory, "A/X/part-00000-00000")) + val aySource = ScalaSource.fromFile(new File(directory, "A/Y/part-00000-00001")) + val bzSource = ScalaSource.fromFile(new File(directory, "B/Z/part-00000-00002")) + + axSource.getLines.toList shouldBe Seq("1") + aySource.getLines.toList shouldBe Seq("2") + bzSource.getLines.toList shouldBe Seq("3") + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala new file mode 100644 index 0000000000..1f66fb0df8 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/RequireOrderedSerializationTest.scala @@ -0,0 +1,112 @@ +/* +Copyright 2015 Twitter, Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding + +import com.twitter.scalding.serialization.OrderedSerialization +import com.twitter.scalding.serialization.StringOrderedSerialization +import com.twitter.scalding.serialization.RequireOrderedSerializationMode + +import org.scalatest.{Matchers, WordSpec} + +class NoOrderdSerJob(args: Args, requireOrderedSerializationMode: String) extends Job(args) { + + override def config = + super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode) + + TypedPipe + .from(TypedTsv[(String, String)]("input")) + .group + .max + .write(TypedTsv[(String, String)]("output")) +} + +class OrderdSerJob(args: Args, requireOrderedSerializationMode: String) extends Job(args) { + + implicit def stringOS: OrderedSerialization[String] = new StringOrderedSerialization + + override def config = + super.config + (Config.ScaldingRequireOrderedSerialization -> requireOrderedSerializationMode) + + TypedPipe + .from(TypedTsv[(String, String)]("input")) + .group + .sorted + .max + .write(TypedTsv[(String, String)]("output")) +} + +class RequireOrderedSerializationTest extends WordSpec with Matchers { + + "A NoOrderedSerJob" should { + + def test(job: Args => Job) = + JobTest(job) + .source(TypedTsv[(String, String)]("input"), List(("a", "a"), ("b", "b"))) + .sink[(String, String)](TypedTsv[(String, String)]("output"))(outBuf => ()) + .run + .finish() + + "throw when mode is Fail" in { + val ex = 
the[Exception] thrownBy { + test(new NoOrderdSerJob(_, RequireOrderedSerializationMode.Fail.toString)) + } + ex.getMessage should include("SerializationTest.scala:") + } + + "not throw when mode is Log" in { + test(new NoOrderdSerJob(_, RequireOrderedSerializationMode.Log.toString)) + } + + "throw when mode is true" in { + val ex = the[Exception] thrownBy { + test(new NoOrderdSerJob(_, "true")) + } + ex.getMessage should include("SerializationTest.scala:") + } + + "not throw when mode is false" in { + test(new NoOrderdSerJob(_, "false")) + } + } + + "A OrderedSerJob" should { + + def test(job: Args => Job) = + JobTest(job) + .source(TypedTsv[(String, String)]("input"), List(("a", "a"), ("a", "b"), ("b", "b"))) + .sink[(String, String)](TypedTsv[(String, String)]("output")) { outBuf => + outBuf.toSet shouldBe Set(("a", "b"), ("b", "b")) + } + .run + .finish() + + "run when mode is Fail" in { + test(new OrderdSerJob(_, RequireOrderedSerializationMode.Fail.toString)) + } + + "run when mode is Log" in { + test(new OrderdSerJob(_, RequireOrderedSerializationMode.Log.toString)) + } + + "run when mode is true" in { + test(new OrderdSerJob(_, "true")) + } + + "run when mode is false" in { + test(new OrderdSerJob(_, "false")) + } + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala new file mode 100644 index 0000000000..e72ad7bc5c --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/WritePartitionerTest.scala @@ -0,0 +1,324 @@ +package com.twitter.scalding.typed + +import com.twitter.algebird.Monoid +import com.twitter.scalding.{Config, Execution, Local} +import com.twitter.scalding.source.{NullSink, TypedText} +import com.twitter.scalding.typed.cascading_backend.CascadingBackend +import com.twitter.scalding.dagon.Dag +import org.scalatest.FunSuite +import org.scalatest.prop.PropertyChecks + +class WritePartitionerTest 
extends FunSuite with PropertyChecks { + def fakeSource[T](id: Int): TypedSource[T] = + TypedText.tsv[String](s"source_$id").asInstanceOf[TypedSource[T]] + + case class WriteState( + writes: List[WritePartitioner.PairK[TypedPipe, Output, _]], + materializations: List[WritePartitioner.PairK[TypedPipe, TypedSource, _]] + ) { + + def ++(that: WriteState): WriteState = + WriteState(writes ::: that.writes, materializations ::: that.materializations) + } + + object WriteState { + def empty: WriteState = WriteState(List.empty, List.empty) + } + + case class State[+T](writes: WriteState, value: T) + object State { + implicit val materializer: WritePartitioner.Materializer[State] = + new WritePartitioner.Materializer[State] { + def pure[A](a: A) = State(WriteState(List.empty, List.empty), a) + def map[A, B](ma: State[A])(fn: A => B) = + State(ma.writes, fn(ma.value)) + + def zip[A, B](ma: State[A], mb: State[B]) = + State(ma.writes ++ mb.writes, (ma.value, mb.value)) + + def materialize[A](t: State[TypedPipe[A]]): State[TypedPipe[A]] = { + val fakeReader = fakeSource[A](t.writes.materializations.size) + val newMats: List[WritePartitioner.PairK[TypedPipe, TypedSource, _]] = + (t.value, fakeReader) :: t.writes.materializations + State(t.writes.copy(materializations = newMats), TypedPipe.from(fakeReader)) + } + def write[A](tp: State[TypedPipe[A]], sink: Output[A]): State[Unit] = { + val newWrites: List[WritePartitioner.PairK[TypedPipe, Output, _]] = + (tp.value, sink) :: tp.writes.writes + State(tp.writes.copy(writes = newWrites), ()) + } + def sequence_[A](as: Seq[State[A]]): State[Unit] = + // just merge them all together: + State( + as.foldLeft(WriteState.empty) { (old, n) => + n.writes ++ old + }, + () + ) + } + } + + test("When we break at forks we have at most 2 + hashJoin steps") { + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 100) + + def afterPartitioningEachStepIsSize1[T](init: TypedPipe[T]) = { + val 
phases = CascadingBackend.defaultOptimizationRules(Config.empty) + + val writes = WritePartitioner.materialize[State](phases, List((init, NullSink))).writes + + writes.writes.foreach { case (tp, _) => + val (dag, _) = Dag(tp, OptimizationRules.toLiteral) + val hcg = dag.allNodes.collect { case h: TypedPipe.HashCoGroup[_, _, _, _] => 1 }.sum + // we can have at most 2 + hcg jobs + assert(TypedPipeGen.steps(tp) <= 2 + hcg, s"optimized: ${tp.toString}") + } + writes.materializations.foreach { case (tp, _) => + val (dag, _) = Dag(tp, OptimizationRules.toLiteral) + val hcg = dag.allNodes.collect { case h: TypedPipe.HashCoGroup[_, _, _, _] => 1 }.sum + // we can have at most 1 + hcg jobs + assert(TypedPipeGen.steps(tp) <= 2 + hcg, s"optimized: ${tp.toString}") + } + } + + forAll(TypedPipeGen.genWithFakeSources)(afterPartitioningEachStepIsSize1(_)) + } + + test("the total number of steps is not more than cascading") { + def notMoreSteps[T](t: TypedPipe[T]) = { + val phases = CascadingBackend.defaultOptimizationRules(Config.empty) + + val writes = WritePartitioner.materialize[State](phases, List((t, NullSink))).writes + + val writeSteps = writes.writes.map { case (tp, _) => + TypedPipeGen.steps(tp) + }.sum + val matSteps = writes.materializations.map { case (tp, _) => + TypedPipeGen.steps(tp) + }.sum + val (dag, id) = Dag(t, OptimizationRules.toLiteral) + val optDag = dag.applySeq(phases) + val optT = optDag.evaluate(id) + assert(writeSteps + matSteps <= TypedPipeGen.steps(optT)) + } + + { + import TypedPipe._ + + val pipe = WithDescriptionTypedPipe( + Mapped( + ReduceStepPipe( + ValueSortedReduce[Int, Int, Int]( + implicitly[Ordering[Int]], + WithDescriptionTypedPipe( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + MergedTypedPipe( + WithDescriptionTypedPipe( + Fork( + WithDescriptionTypedPipe( + TrappedPipe( + SourcePipe(TypedText.tsv[Int]("oyg")), + TypedText.tsv[Int]("a3QasphTfqhd1namjb") + ), + List(("org.scalacheck.Gen$R 
$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + IterablePipe( + List(-930762680, -1495455462, -1, -903011942, -2147483648, 1539778843, -2147483648) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Ordering[Int]], + null /**/, + Some(2), + List() + ) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + + notMoreSteps(pipe) + } + + { + import TypedPipe._ + + val pipe = WithDescriptionTypedPipe( + ForceToDisk( + WithDescriptionTypedPipe( + Mapped( + ReduceStepPipe( + ValueSortedReduce[Int, Int, Int]( + implicitly[Ordering[Int]], + WithDescriptionTypedPipe( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + MergedTypedPipe( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + CrossValue(SourcePipe(TypedText.tsv[Int]("yumwd")), LiteralValue(2)), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + FilterKeys( + WithDescriptionTypedPipe( + SumByLocalKeys( + WithDescriptionTypedPipe( + FlatMapped(IterablePipe(List(943704575)), null /**/ ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Monoid[Int]] + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + 
List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Ordering[Int]], + null /**/, + None, + List() + ) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + + notMoreSteps(pipe) + } + + { + import TypedPipe._ + + val pipe = WithDescriptionTypedPipe( + Fork( + WithDescriptionTypedPipe( + Mapped( + WithDescriptionTypedPipe( + CrossValue( + WithDescriptionTypedPipe( + TrappedPipe[Int]( + WithDescriptionTypedPipe( + ForceToDisk( + WithDescriptionTypedPipe( + Mapped( + ReduceStepPipe( + ValueSortedReduce[Int, Int, Int]( + implicitly[Ordering[Int]], + WithDescriptionTypedPipe( + WithDescriptionTypedPipe( + FilterKeys( + WithDescriptionTypedPipe( + FlatMapValues( + WithDescriptionTypedPipe( + Mapped( + IterablePipe(List(1533743286, 0, -1, 0, 1637692751)), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + implicitly[Ordering[Int]], + null /**/, + Some(2), + List() + ) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + TypedText.tsv[Int]("mndlSTwuEmwqhJk7ac") + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + LiteralValue(2) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ), + null /**/ + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + ), + List(("org.scalacheck.Gen$R$class.map(Gen.scala:237)", true)) + ) + + notMoreSteps(pipe) + } + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 100) + 
forAll(TypedPipeGen.genWithFakeSources)(notMoreSteps(_)) + } + + test("breaking things up does not change the results") { + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 100) + + def partitioningDoesNotChange[T: Ordering](init: TypedPipe[T]) = { + val phases = CascadingBackend.defaultOptimizationRules(Config.empty) + + // We don't want any further optimization on this job + val ex: Execution[TypedPipe[T]] = WritePartitioner.partitionSingle(phases, init) + assert( + ex.flatMap(TypedPipeDiff.diff[T](init, _).toIterableExecution) + .waitFor(Config.empty, Local(true)) + .get + .isEmpty + ) + } + + forAll(TypedPipeGen.genWithIterableSources)(partitioningDoesNotChange(_)) + } +} diff --git a/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala b/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala new file mode 100644 index 0000000000..6e573e4786 --- /dev/null +++ b/scalding-core/src/test/scala/com/twitter/scalding/typed/memory_backend/MemoryTest.scala @@ -0,0 +1,113 @@ +package com.twitter.scalding.typed.memory_backend + +import org.scalatest.FunSuite +import org.scalatest.prop.PropertyChecks +import com.twitter.scalding.{Config, Execution, Local, TypedPipe} +import com.twitter.scalding.typed.TypedPipeGen + +class MemoryTest extends FunSuite with PropertyChecks { + + private def mapMatch[K, V](ex: Execution[Iterable[(K, V)]]) = { + val mm = MemoryMode.empty + + val mkv = ex.waitFor(Config.empty, mm) + + val lkv = ex.waitFor(Config.empty, Local(true)) + assert(mkv.get.toMap == lkv.get.toMap) + } + + private def timeit[A](msg: String, a: => A): A = { + val start = System.nanoTime() + val res = a + val diff = System.nanoTime() - start + val ms = diff / 1e6 + // uncomment this for some poor version of benchmarking, + // but scalding in-memory mode seems about 3-100x faster + // + // println(s"$msg: $ms ms") + res + } + + private def 
sortMatch[A: Ordering](ex: Execution[Iterable[A]]) = { + val mm = MemoryMode.empty + + val mkv = timeit("scalding", ex.waitFor(Config.empty, mm)) + + val lkv = timeit("cascading", ex.waitFor(Config.empty, Local(true))) + assert(mkv.get.toList.sorted == lkv.get.toList.sorted) + } + + test("basic word count") { + val x = TypedPipe + .from(0 until 100) + .groupBy(_ % 2) + .sum + .toIterableExecution + + mapMatch(x) + } + + test("mapGroup works") { + val x = TypedPipe + .from(0 until 100) + .groupBy(_ % 2) + .mapGroup((k, vs) => Iterator.single(vs.foldLeft(k)(_ + _))) + .toIterableExecution + + mapMatch(x) + } + + test("hashJoin works") { + val input = TypedPipe.from(0 until 100) + val left = input.map(k => (k, k % 2)) + val right = input.map(k => (k, k % 3)) + + mapMatch(left.hashJoin(right).toIterableExecution) + } + + test("join works") { + val input = TypedPipe.from(0 until 100) + val left = input.map(k => (k, k % 2)) + val right = input.map(k => (k, k % 3)) + + mapMatch(left.join(right).toIterableExecution) + } + + test("scalding memory mode matches cascading local mode") { + import TypedPipeGen.genWithIterableSources + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 50) + forAll(genWithIterableSources)(pipe => sortMatch(pipe.toIterableExecution)) + } + + test("writing gives the same result as toIterableExecution") { + import TypedPipeGen.genWithIterableSources + // we can afford to test a lot more in just memory mode because it is faster than cascading + implicit val generatorDrivenConfig: PropertyCheckConfiguration = + PropertyCheckConfiguration(minSuccessful = 500) + forAll(genWithIterableSources) { pipe => + val sink = new MemorySink.LocalVar[Int] + + val ex1 = pipe.writeExecution(SinkT("my_sink")) + val ex2 = pipe.toIterableExecution + + val mm = MemoryMode.empty.addSink(SinkT("my_sink"), sink) + val res1 = ex1.waitFor(Config.empty, mm) + val res2 = ex2.waitFor(Config.empty, MemoryMode.empty) + + 
assert(sink.reset().get.toList.sorted == res2.get.toList.sorted) + + } + } + + test("using sources work") { + val srctag = SourceT[Int]("some_source") + + val job = TypedPipe.from(srctag).map(i => (i % 31, i)).sumByKey.toIterableExecution + + val jobRes = job.waitFor(Config.empty, MemoryMode.empty.addSourceIterable(srctag, (0 to 10000))) + + val expected = (0 to 10000).groupBy(_ % 31).mapValues(_.sum).toList.sorted + assert(jobRes.get.toList.sorted == expected) + } +} diff --git a/scalding-dagon/src/main/scala-2.12-/com/twitter/scalding/dagon/ScalaVersionCompat.scala b/scalding-dagon/src/main/scala-2.12-/com/twitter/scalding/dagon/ScalaVersionCompat.scala new file mode 100644 index 0000000000..d7a366e47b --- /dev/null +++ b/scalding-dagon/src/main/scala-2.12-/com/twitter/scalding/dagon/ScalaVersionCompat.scala @@ -0,0 +1,23 @@ +package com.twitter.scalding.dagon + +object ScalaVersionCompat { + type LazyList[+A] = scala.collection.immutable.Stream[A] + val LazyList = scala.collection.immutable.Stream + + type IterableOnce[+A] = scala.collection.TraversableOnce[A] + + def iterateOnce[A](as: IterableOnce[A]): Iterator[A] = + as.toIterator + + def lazyList[A](as: A*): LazyList[A] = + Stream(as: _*) + + def lazyListToIterator[A](lst: LazyList[A]): Iterator[A] = + lst.iterator + + def lazyListFromIterator[A](it: Iterator[A]): LazyList[A] = + it.toStream + + implicit val ieeeDoubleOrdering: Ordering[Double] = + Ordering.Double +} diff --git a/scalding-dagon/src/main/scala-2.13+/com/twitter/scalding/dagon/ScalaVersionCompat.scala b/scalding-dagon/src/main/scala-2.13+/com/twitter/scalding/dagon/ScalaVersionCompat.scala new file mode 100644 index 0000000000..b1b55d9057 --- /dev/null +++ b/scalding-dagon/src/main/scala-2.13+/com/twitter/scalding/dagon/ScalaVersionCompat.scala @@ -0,0 +1,23 @@ +package com.twitter.scalding.dagon + +object ScalaVersionCompat { + type LazyList[+A] = scala.collection.immutable.LazyList[A] + val LazyList = scala.collection.immutable.LazyList + + 
type IterableOnce[+A] = scala.collection.IterableOnce[A] + + def iterateOnce[A](as: IterableOnce[A]): Iterator[A] = + as.iterator + + def lazyList[A](as: A*): LazyList[A] = + LazyList(as: _*) + + def lazyListToIterator[A](lst: LazyList[A]): Iterator[A] = + lst.iterator + + def lazyListFromIterator[A](it: Iterator[A]): LazyList[A] = + LazyList.from(it) + + implicit val ieeeDoubleOrdering: Ordering[Double] = + Ordering.Double.IeeeOrdering +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Cache.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Cache.scala new file mode 100644 index 0000000000..93d15e5029 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Cache.scala @@ -0,0 +1,63 @@ +package com.twitter.scalding.dagon + +import java.io.Serializable + +/** + * This is a useful cache for memoizing function. + * + * The cache is implemented using a mutable pointer to an immutable map value. In the worst-case, race + * conditions might cause us to lose cache values (i.e. compute some keys twice), but we will never produce + * incorrect values. + */ +sealed class Cache[K, V] private (init: Map[K, V]) extends Serializable { + + private[this] var map: Map[K, V] = init + + /** + * Given a key, either return a cached value, or compute, store, and return a new value. + * + * This method is what justifies the existence of Cache. Its second parameter (`v`) is by-name: it will only + * be evaluated in cases where the key is not cached. + * + * For example: + * + * def greet(i: Int): Int = { println("hi") i + 1 } + * + * val c = Cache.empty[Int, Int] c.getOrElseUpdate(1, greet(1)) // says hi, returns 2 c.getOrElseUpdate(1, + * greet(1)) // just returns 2 + */ + def getOrElseUpdate(k: K, v: => V): V = + map.get(k) match { + case Some(exists) => exists + case None => + val res = v + map = map.updated(k, res) + res + } + + /** + * Create a second cache with the same values as this one. 
+ * + * The two caches will start with the same values, but will be independently updated. + */ + def duplicate: Cache[K, V] = + new Cache(map) + + /** + * Access the currently-cached keys and values as a map. + */ + def toMap: Map[K, V] = + map + + /** + * Forget all cached keys and values. + * + * After calling this method, the resulting cache is equivalent to Cache.empty[K, V]. + */ + def reset(): Unit = + map = Map.empty +} + +object Cache { + def empty[K, V]: Cache[K, V] = new Cache(Map.empty) +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Dag.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Dag.scala new file mode 100644 index 0000000000..af1e24a9fc --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Dag.scala @@ -0,0 +1,785 @@ +/* + Copyright 2014 Twitter, Inc. + Copyright 2017 Stripe, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.scalding.dagon + +import com.twitter.scalding.dagon.ScalaVersionCompat.{lazyListToIterator, LazyList} + +import java.io.Serializable +import scala.util.control.TailCalls + +/** + * Represents a directed acyclic graph (DAG). + * + * The type N[_] represents the type of nodes in the graph. 
+ */ +sealed abstract class Dag[N[_]] extends Serializable { self => + + /** + * These have package visibility to test the law that for all Expr, the node they evaluate to is unique + */ + protected def idToExp: HMap[Id, Expr[N, *]] + + /** + * The set of roots that were added by addRoot. These are Ids that will always evaluate such that + * roots.forall(evaluateOption(_).isDefined) + */ + protected def roots: Set[Id[_]] + + /** + * Convert a N[T] to a Literal[T, N]. + */ + def toLiteral: FunctionK[N, Literal[N, *]] + + // Caches polymorphic functions of type Id[T] => Option[N[T]] + private val idToN: HCache[Id, Lambda[t => Option[N[t]]]] = + HCache.empty[Id, Lambda[t => Option[N[t]]]] + + // Caches polymorphic functions of type Literal[N, T] => Option[Id[T]] + private val litToId: HCache[Literal[N, *], Lambda[t => Option[Id[t]]]] = + HCache.empty[Literal[N, *], Lambda[t => Option[Id[t]]]] + + // Caches polymorphic functions of type Expr[N, T] => N[T] + private val evalMemo = Expr.evaluateMemo(idToExp) + + /** + * String representation of this DAG. + */ + override def toString: String = + s"Dag(idToExp = $idToExp, roots = $roots)" + + /** + * Which ids are reachable from the roots? + */ + def reachableIds: Set[Id[_]] = + rootsUp.map(_._2).toSet + + /** + * Apply the given rule to the given dag until the graph no longer changes. + */ + def apply(rule: Rule[N]): Dag[N] = { + + @annotation.tailrec + def loop(d: Dag[N]): Dag[N] = { + val next = d.applyOnce(rule) + if (next eq d) next + else loop(next) + } + + loop(this) + } + + /** + * Apply a sequence of rules, which you may think of as phases, in order First apply one rule until it does + * not apply, then the next, etc.. 
+ */ + def applySeq(phases: Seq[Rule[N]]): Dag[N] = + phases.foldLeft(this)((dag, rule) => dag(rule)) + + def applySeqOnce(phases: Seq[Rule[N]]): Dag[N] = + phases.iterator + .map(rule => applyOnce(rule)) + .filter(_ ne this) + .take(1) + .toList + .headOption + .getOrElse(this) + + /** + * apply the rule at the first place that satisfies it, and return from there. + */ + def applyOnce(rule: Rule[N]): Dag[N] = { + type DagT[T] = Dag[N] + + val f = new FunctionK[HMap[Id, Expr[N, *]]#Pair, Lambda[x => Option[DagT[x]]]] { + def toFunction[U] = { (kv: (Id[U], Expr[N, U])) => + val (id, expr) = kv + + if (expr.isVar) None // Vars always point somewhere, apply the rule there + else { + val n1 = evaluate(id) + rule + .apply[U](self)(n1) + .filter(_ != n1) + .map { n2 => + // A node can have several Ids. + // we need to point ALL of the old ids to the new one + val oldIds = + findAll(n1) match { + case absent if absent.isEmpty => + sys.error(s"unreachable code, $n1 should have id $id") + case existing => existing + } + // If n2 depends on n1, the Var trick fails and introduces + // loops. To avoid this, we have to work in an edge based + // approach. For all n3 != n2, if they depend on n1, replace + // with n2. Leave n2 alone. + + // Get an ID for the new node + // if this new node points to the old node + // we are going to create a cycle, since + // below we point the old nodes back to the + // new id. To fix this, re-reassign + // n1 to a new id, since that new id won't be + // updated to point to itself, we prevent a loop + val newIdN1 = Id.next[U]() + val dag1 = replaceId(newIdN1, expr, n1) + val (dag2, newId) = dag1.ensure(n2) + + // We can't delete Ids which may have been shared + // publicly, and the ids may be embedded in many + // nodes. Instead we remap 'ids' to be a pointer + // to 'newid'. 
+ dag2.repointIds(n1, oldIds, newId, n2) + } + } + } + } + + // We want to apply rules + // in a deterministic order so they are reproducible + rootsUp + .map { case (_, id) => + // use the method to fix the types below + // if we don't use DagT here, scala thinks + // it is unused even though we use it above + def go[A](id: Id[A]): Option[DagT[A]] = { + val expr = idToExp(id) + f.toFunction[A]((id, expr)) + } + go(id) + } + .collectFirst { case Some(dag) => dag } + .getOrElse(this) + } + + /** + * Apply a rule at most cnt times. + */ + def applyMax(rule: Rule[N], cnt: Int): Dag[N] = { + + @annotation.tailrec + def loop(d: Dag[N], cnt: Int): Dag[N] = + if (cnt <= 0) d + else { + val next = d.applyOnce(rule) + if (next eq d) d + else loop(next, cnt - 1) + } + + loop(this, cnt) + } + + def depthOfId[A](i: Id[A]): Option[Int] = + depth.get(i) + + def depthOf[A](n: N[A]): Option[Int] = + find(n).flatMap(depthOfId(_)) + + private lazy val depth: Map[Id[_], Int] = { + sealed trait Rest { + def dependsOn(id: Id[_]): Boolean + } + sealed case class Same(asId: Id[_]) extends Rest { + def dependsOn(id: Id[_]) = id == asId + } + sealed case class MaxInc(a: Id[_], b: Id[_]) extends Rest { + def dependsOn(id: Id[_]) = (id == a) || (id == b) + } + sealed case class Inc(of: Id[_]) extends Rest { + def dependsOn(id: Id[_]) = id == of + } + sealed case class Variadic(ids: List[Id[_]]) extends Rest { + def dependsOn(id: Id[_]) = ids.contains(id) + } + + @annotation.tailrec + def lookup( + state: Map[Id[_], Int], + todo: List[(Id[_], Rest)], + nextRound: List[(Id[_], Rest)] + ): Map[Id[_], Int] = + todo match { + case Nil => + nextRound match { + case Nil => state + case repeat => + val sortRepeat = repeat.sortWith { case ((i0, r0), (i1, r1)) => + r1.dependsOn(i0) || (!r0.dependsOn(i1)) + } + lookup(state, sortRepeat, Nil) + } + case (h @ (id, Same(a))) :: rest => + state.get(a) match { + case Some(depth) => + val state1 = state.updated(id, depth) + lookup(state1, rest, nextRound) + 
case None => + lookup(state, rest, h :: nextRound) + } + case (h @ (id, Inc(a))) :: rest => + state.get(a) match { + case Some(depth) => + val state1 = state.updated(id, depth + 1) + lookup(state1, rest, nextRound) + case None => + lookup(state, rest, h :: nextRound) + } + case (h @ (id, MaxInc(a, b))) :: rest => + (state.get(a), state.get(b)) match { + case (Some(da), Some(db)) => + val depth = math.max(da, db) + 1 + val state1 = state.updated(id, depth) + lookup(state1, rest, nextRound) + case _ => + lookup(state, rest, h :: nextRound) + } + case (id, Variadic(Nil)) :: rest => + val depth = 0 + val state1 = state.updated(id, depth) + lookup(state1, rest, nextRound) + case (item @ (id, Variadic(h :: t))) :: rest => + // max can't throw here because ids is non-empty + def maxId(head: Id[_], tail: List[Id[_]], acc: Int): Option[Int] = + state.get(head) match { + case None => None + case Some(d) => + val newAcc = Math.max(acc, d) + tail match { + case Nil => Some(newAcc) + case h :: t => maxId(h, t, newAcc) + } + } + + maxId(h, t, 0) match { + case Some(depth) => + val state1 = state.updated(id, depth + 1) + lookup(state1, rest, nextRound) + case None => + lookup(state, rest, item :: nextRound) + } + } + + @annotation.tailrec + def loop( + stack: List[Id[_]], + seen: Set[Id[_]], + state: Map[Id[_], Int], + todo: List[(Id[_], Rest)] + ): Map[Id[_], Int] = + stack match { + case Nil => + lookup(state, todo, Nil) + case h :: tail if seen(h) => loop(tail, seen, state, todo) + case h :: tail => + val seen1 = seen + h + idToExp.get(h) match { + case None => + loop(tail, seen1, state, todo) + case Some(Expr.Const(_)) => + loop(tail, seen1, state.updated(h, 0), todo) + case Some(Expr.Var(id)) => + loop(id :: tail, seen1, state, (h, Same(id)) :: todo) + case Some(Expr.Unary(id, _)) => + loop(id :: tail, seen1, state, (h, Inc(id)) :: todo) + case Some(Expr.Binary(id0, id1, _)) => + loop(id0 :: id1 :: tail, seen1, state, (h, MaxInc(id0, id1)) :: todo) + case 
Some(Expr.Variadic(ids, _)) => + loop(ids reverse_::: tail, seen1, state, (h, Variadic(ids)) :: todo) + } + } + + loop(roots.toList, Set.empty, Map.empty, Nil) + } + + /** + * Find all the nodes currently in the graph + */ + lazy val allNodes: Set[N[_]] = { + type Node = Either[Id[_], Expr[N, _]] + def deps(n: Node): List[Node] = n match { + case Right(Expr.Const(_)) => Nil + case Right(Expr.Var(id)) => Left(id) :: Nil + case Right(Expr.Unary(id, _)) => Left(id) :: Nil + case Right(Expr.Binary(id0, id1, _)) => Left(id0) :: Left(id1) :: Nil + case Right(Expr.Variadic(ids, _)) => ids.map(Left(_)) + case Left(id) => idToExp.get(id).map(Right(_): Node).toList + } + val all = Graphs.reflexiveTransitiveClosure(roots.toList.map(Left(_): Node))(deps _) + + all.iterator.collect { case Right(expr) => evalMemo(expr) }.toSet + } + + //////////////////////////// + // + // These following methods are the only methods that directly + // allocate new Dag instances. These are where all invariants + // must be maintained + // + //////////////////////////// + + /** + * Add a GC root, or tail in the DAG, that can never be deleted. + */ + def addRoot[T](node: N[T]): (Dag[N], Id[T]) = { + val (dag, id) = ensure(node) + (dag.copy(gcroots = dag.roots + id), id) + } + + // Convenient method to produce new, modified DAGs based on this + // one. 
+ private def copy( + id2Exp: HMap[Id, Expr[N, *]] = self.idToExp, + node2Literal: FunctionK[N, Literal[N, *]] = self.toLiteral, + gcroots: Set[Id[_]] = self.roots + ): Dag[N] = new Dag[N] { + def idToExp = id2Exp + def roots = gcroots + def toLiteral = node2Literal + } + + // these are included for binary compatibility + + // $COVERAGE-OFF$ + private[dagon] def com$stripe$dagon$Dag$$copy$default$2() + : com.twitter.scalding.dagon.FunctionK[N, Literal[N, *]] = + self.toLiteral + + private[dagon] def com$stripe$dagon$Dag$$copy$default$3(): scala.collection.immutable.Set[Id[_]] = + self.roots + // $COVERAGE-ON$ + + // Produce a new DAG that is equivalent to this one, but which frees + // orphaned nodes and other internal state which may no longer be + // needed. + private def gc: Dag[N] = { + val keepers = reachableIds + if (idToExp.forallKeys(keepers)) this + else copy(id2Exp = idToExp.filterKeys(keepers)) + } + + /* + * This updates the canonical Id for a given node and expression + */ + protected def replaceId[A](newId: Id[A], expr: Expr[N, A], node: N[A]): Dag[N] = + copy(id2Exp = idToExp.updated(newId, expr)) + + protected def repointIds[A](orig: N[A], oldIds: Iterable[Id[A]], newId: Id[A], newNode: N[A]): Dag[N] = + if (oldIds.nonEmpty) { + val newIdToExp = oldIds.foldLeft(idToExp) { (mapping, origId) => + mapping.updated(origId, Expr.Var[N, A](newId)) + } + copy(id2Exp = newIdToExp).gc + } else this + + /** + * This is only called by ensure + * + * Note, Expr must never be a Var + */ + private def addExp[T](exp: Expr[N, T]): (Dag[N], Id[T]) = { + require(!exp.isVar) + val nodeId = Id.next[T]() + (copy(id2Exp = idToExp.updated(nodeId, exp)), nodeId) + } + + //////////////////////////// + // + // End of methods that direcly allocate new Dag instances + // + //////////////////////////// + + /** + * This finds an Id[T] in the current graph that is equivalent to the given N[T] + */ + def find[T](node: N[T]): Option[Id[T]] = + findLiteral(toLiteral(node), node) + + 
private def findLiteral[T](lit: Literal[N, T], n: => N[T]): Option[Id[T]] = + litToId.getOrElseUpdate( + lit, { + // It's important to not compare equality in the Literal + // space because it can have function members that are + // equivalent, but not .equals + val lst = findAll(n).filterNot(id => idToExp(id).isVar) + val it = lazyListToIterator(lst) + if (it.hasNext) { + // there can be duplicate ids. Consider this case: + // Id(0) -> Expr.Unary(Id(1), fn) + // Id(1) -> Expr.Const(n1) + // Id(2) -> Expr.Unary(Id(3), fn) + // Id(3) -> Expr.Const(n2) + // + // then, a rule replaces n1 and n2 both with n3 Then, we'd have + // Id(1) -> Var(Id(4)) + // Id(4) -> Expr.Const(n3) + // Id(3) -> Var(Id(4)) + // + // and now, Id(0) and Id(2) both point to non-Var nodes, but also + // both are equal + + // We use the maximum ID which is important to deal with + // cycle avoidance in applyRule since we guarantee + // that all the nodes that are repointed are computed + // before we add a new node to graph + Some(it.max) + } else { + // if the node is the in the graph it has at least + // one non-Var node + None + } + } + ) + + /** + * Nodes can have multiple ids in the graph, this gives all of them + */ + def findAll[T](node: N[T]): LazyList[Id[T]] = { + // TODO: this computation is really expensive, 60% of CPU in a recent benchmark + // maintaining these mappings would be nice, but maybe expensive as we are rewriting + // nodes + val f = new FunctionK[HMap[Id, Expr[N, *]]#Pair, Lambda[x => Option[Id[x]]]] { + def toFunction[T1] = { case (thisId, expr) => + if (node == evalMemo(expr)) Some(thisId) else None + } + } + + // this cast is safe if node == expr.evaluate(idToExp) implies types match + idToExp.optionMap(f).asInstanceOf[LazyList[Id[T]]] + } + + /** + * This throws if the node is missing, use find if this is not a logic error in your programming. With + * dependent types we could possibly get this to not compile if it could throw. 
+ */ + def idOf[T](node: N[T]): Id[T] = + find(node).getOrElse { + val msg = s"could not get node: $node\n from $this" + throw new NoSuchElementException(msg) + } + + /** + * ensure the given literal node is present in the Dag Note: it is important that at each moment, each node + * has at most one id in the graph. Put another way, for all Id[T] in the graph evaluate(id) is distinct. + */ + protected def ensure[T](node: N[T]): (Dag[N], Id[T]) = { + val lit = toLiteral(node) + val litMemo = Literal.evaluateMemo[N] + try ensureFast(lit, litMemo) + catch { + case _: Throwable => // StackOverflowError should work, but not on scala.js + ensureRec(lit, litMemo).result + } + } + + /* + * This does recursion on the stack, which is faster, but can overflow + */ + protected def ensureFast[T](lit: Literal[N, T], memo: FunctionK[Literal[N, *], N]): (Dag[N], Id[T]) = + findLiteral(lit, memo(lit)) match { + case Some(id) => (this, id) + case None => + lit match { + case Literal.Const(n) => + addExp(Expr.Const(n)) + case Literal.Unary(prev, fn) => + val (exp1, idprev) = ensureFast(prev, memo) + exp1.addExp(Expr.Unary(idprev, fn)) + case Literal.Binary(n1, n2, fn) => + val (exp1, id1) = ensureFast(n1, memo) + val (exp2, id2) = exp1.ensureFast(n2, memo) + exp2.addExp(Expr.Binary(id1, id2, fn)) + case Literal.Variadic(args, fn) => + @annotation.tailrec + def go[A](dag: Dag[N], args: List[Literal[N, A]], acc: List[Id[A]]): (Dag[N], List[Id[A]]) = + args match { + case Nil => (dag, acc.reverse) + case h :: tail => + val (dag1, hid) = dag.ensureFast(h, memo) + go(dag1, tail, hid :: acc) + } + + val (d, ids) = go(this, args, Nil) + d.addExp(Expr.Variadic(ids, fn)) + } + } + + protected def ensureRec[T]( + lit: Literal[N, T], + memo: FunctionK[Literal[N, *], N] + ): TailCalls.TailRec[(Dag[N], Id[T])] = + findLiteral(lit, memo(lit)) match { + case Some(id) => TailCalls.done((this, id)) + case None => + lit match { + case Literal.Const(n) => + TailCalls.done(addExp(Expr.Const(n))) + case 
Literal.Unary(prev, fn) => + TailCalls.tailcall(ensureRec(prev, memo)).map { case (exp1, idprev) => + exp1.addExp(Expr.Unary(idprev, fn)) + } + case Literal.Binary(n1, n2, fn) => + for { + p1 <- TailCalls.tailcall(ensureRec(n1, memo)) + (exp1, id1) = p1 + p2 <- TailCalls.tailcall(exp1.ensureRec(n2, memo)) + (exp2, id2) = p2 + } yield exp2.addExp(Expr.Binary(id1, id2, fn)) + case Literal.Variadic(args, fn) => + def go[A](dag: Dag[N], args: List[Literal[N, A]]): TailCalls.TailRec[(Dag[N], List[Id[A]])] = + args match { + case Nil => TailCalls.done((dag, Nil)) + case h :: tail => + for { + rest <- go(dag, tail) + (dag1, its) = rest + dagH <- TailCalls.tailcall(dag1.ensureRec(h, memo)) + (dag2, idh) = dagH + } yield (dag2, idh :: its) + } + + go(this, args).map { case (d, ids) => + d.addExp(Expr.Variadic(ids, fn)) + } + } + } + + /** + * After applying rules to your Dag, use this method to get the original node type. Only call this on an + * Id[T] that was generated by this dag or a parent. + */ + def evaluate[T](id: Id[T]): N[T] = + evaluateOption(id).getOrElse { + val msg = s"Could not evaluate: $id\nin $this" + throw new NoSuchElementException(msg) + } + + def evaluateOption[T](id: Id[T]): Option[N[T]] = + idToN.getOrElseUpdate(id, idToExp.get(id).map(evalMemo(_))) + + /** + * Return the number of nodes that depend on the given Id, TODO we might want to cache these. 
We need to + * garbage collect nodes that are no longer reachable from the root + */ + def fanOut(id: Id[_]): Int = + evaluateOption(id) + .map(fanOut) + .getOrElse(0) + + /** + * Returns 0 if the node is absent, which is true use .contains(n) to check for containment + */ + def fanOut(node: N[_]): Int = { + val interiorFanOut = dependentsOf(node).size + val tailFanOut = if (isRoot(node)) 1 else 0 + interiorFanOut + tailFanOut + } + + /** + * Is this node a root of this graph + */ + def isRoot(n: N[_]): Boolean = + roots.iterator.exists(evaluatesTo(_, n)) + + // This is a roots up iterator giving the depth + // to the nearest root and in sorted by Id.serial + // within each depth + private def rootsUp: Iterator[(Int, Id[_])] = { + type State = (Int, Id[_], List[Id[_]], List[Id[_]], Set[Id[_]]) + + def sort(l: List[Id[_]]): List[Id[_]] = + l.asInstanceOf[List[Id[Any]]].sorted + + def computeNext(s: List[Id[_]], seen: Set[Id[_]]): (List[Id[_]], Set[Id[_]]) = + s.foldLeft((List.empty[Id[_]], seen)) { case ((l, s), id) => + val newIds = Expr.dependsOnIds(idToExp(id)).filterNot(seen) + (newIds reverse_::: l, s ++ newIds) + } + + def initState: Option[State] = { + val rootList = roots.toList + val (next, seen) = computeNext(rootList, rootList.toSet) + sort(rootList) match { + case Nil => None + case h :: tail => Some((0, h, tail, next, seen)) + } + } + + def nextState(current: State): Option[State] = + current match { + case (_, _, Nil, Nil, _) => + None + case (depth, _, Nil, nextBatch, seen) => + sort(nextBatch) match { + case h :: tail => + val (nextBatch1, seen1) = computeNext(nextBatch, seen) + Some((depth + 1, h, tail, nextBatch1, seen1)) + case Nil => + // nextBatch has at least one item, and sorting preserves that + sys.error("impossible") + } + case (d, _, h :: tail, next, seen) => + Some((d, h, tail, next, seen)) + } + + new Iterator[(Int, Id[_])] { + var state: Option[State] = initState + + def hasNext = state.isDefined + def next: (Int, Id[_]) = + state match 
{ + case None => throw new NoSuchElementException("roots up has no more items") + case Some(s) => + state = nextState(s) + (s._1, s._2) + } + } + } + + /** + * Is this node in this DAG + */ + def contains(node: N[_]): Boolean = + find(node).isDefined + + /** + * What nodes do we depend directly on + */ + def dependenciesOf(node: N[_]): List[N[_]] = + toLiteral(node) match { + case Literal.Const(_) => + Nil + case Literal.Unary(n, _) => + n.evaluate :: Nil + case Literal.Binary(n1, n2, _) => + val evalLit = Literal.evaluateMemo[N] + evalLit(n1) :: evalLit(n2) :: Nil + case Literal.Variadic(inputs, _) => + val evalLit = Literal.evaluateMemo[N] + inputs.map(evalLit(_)) + } + + /** + * It is as expensive to compute this for the whole graph as it is to answer a single query we already cache + * the N pointed to, so this structure should be small + */ + private lazy val dependencyMap: Map[N[_], Set[N[_]]] = { + def dependsOnSet(expr: Expr[N, _]): Set[N[_]] = expr match { + case Expr.Const(_) => Set.empty + case Expr.Var(id) => sys.error(s"logic error: Var($id)") + case Expr.Unary(id, _) => Set(evaluate(id)) + case Expr.Binary(id0, id1, _) => Set(evaluate(id0), evaluate(id1)) + case Expr.Variadic(ids, _) => ids.iterator.map(evaluate(_)).toSet + } + + type SetConst[T] = (N[T], Set[N[_]]) + val pointsToNode = new FunctionK[HMap[Id, Expr[N, *]]#Pair, Lambda[x => Option[SetConst[x]]]] { + def toFunction[T] = { case (id, expr) => + // here are the nodes we depend on: + + // We can ignore Vars here, since all vars point to a final expression + if (!expr.isVar) { + val depSet = dependsOnSet(expr) + Some((evalMemo(expr), depSet)) + } else None + } + } + + idToExp + .optionMap[SetConst](pointsToNode) + .flatMap { case (n, deps) => + deps.map((_, n): (N[_], N[_])) + } + .groupBy(_._1) + .iterator + .map { case (k, vs) => (k, vs.iterator.map(_._2).toSet) } + .toMap + } + + /** + * list all the nodes that depend on the given node + */ + def dependentsOf(node: N[_]): Set[N[_]] = + 
dependencyMap.getOrElse(node, Set.empty) + + private def evaluatesTo[A, B](id: Id[A], n: N[B]): Boolean = { + val idN = evaluate(id) + // since we cache, reference equality will often work + val refEq = idN.asInstanceOf[AnyRef] eq id.asInstanceOf[AnyRef] + refEq || (idN == n) + } + + /** + * equivalent to (but maybe faster than) fanOut(n) <= 1 + */ + def hasSingleDependent(n: N[_]): Boolean = + fanOut(n) <= 1 + + /** + * Return all dependents of a given node. Does not include itself + */ + def transitiveDependentsOf(p: N[_]): Set[N[_]] = { + def nfn(n: N[Any]): List[N[Any]] = + dependentsOf(n).toList.asInstanceOf[List[N[Any]]] + + Graphs.depthFirstOf(p.asInstanceOf[N[Any]])(nfn _).toSet + } + + /** + * Return the transitive dependencies of a given node + */ + def transitiveDependenciesOf(p: N[_]): Set[N[_]] = { + def nfn(n: N[Any]): List[N[Any]] = + dependenciesOf(n).toList.asInstanceOf[List[N[Any]]] + + Graphs.depthFirstOf(p.asInstanceOf[N[Any]])(nfn _).toSet + } +} + +object Dag { + def empty[N[_]](n2l: FunctionK[N, Literal[N, *]]): Dag[N] = + new Dag[N] { + val idToExp = HMap.empty[Id, Expr[N, *]] + val toLiteral = n2l + val roots = Set.empty[Id[_]] + } + + /** + * This creates a new Dag rooted at the given tail node + */ + def apply[T, N[_]](n: N[T], nodeToLit: FunctionK[N, Literal[N, *]]): (Dag[N], Id[T]) = + empty(nodeToLit).addRoot(n) + + /** + * This is the most useful function. Given a N[T] and a way to convert to Literal[T, N], apply the given + * rule until it no longer applies, and return the N[T] which is equivalent under the given rule + */ + def applyRule[T, N[_]](n: N[T], nodeToLit: FunctionK[N, Literal[N, *]], rule: Rule[N]): N[T] = { + val (dag, id) = apply(n, nodeToLit) + dag(rule).evaluate(id) + } + + /** + * This is useful when you have rules you want applied in a certain order. 
Given a N[T] and a way to convert + * to Literal[T, N], for each rule in the sequence, apply the given rule until it no longer applies, and + * return the N[T] which is equivalent under the given rule + */ + def applyRuleSeq[T, N[_]](n: N[T], nodeToLit: FunctionK[N, Literal[N, *]], rules: Seq[Rule[N]]): N[T] = { + val (dag, id) = apply(n, nodeToLit) + dag.applySeq(rules).evaluate(id) + } +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Expr.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Expr.scala new file mode 100644 index 0000000000..b2dd46c7bd --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Expr.scala @@ -0,0 +1,147 @@ +/* + Copyright 2014 Twitter, Inc. + Copyright 2017 Stripe, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.scalding.dagon + +import java.io.Serializable +import scala.util.control.TailCalls +import scala.util.hashing.MurmurHash3 + +/** + * Expr[N, T] is an expression of a graph of container nodes N[_] with result type N[T]. These expressions are + * like the Literal[T, N] graphs except that functions always operate with an indirection of a Id[T] where + * N[T] is the type of the input node. + * + * Nodes can be deleted from the graph by replacing an Expr at Id = idA with Var(idB) pointing to some + * upstream node. + * + * To add nodes to the graph, add depth to the final node returned in a Unary or Binary expression. 
+ * + * TODO: see the approach here: https://gist.github.com/pchiusano/1369239 Which seems to show a way to do + * currying, so we can handle general arity + */ +sealed trait Expr[N[_], T] extends Serializable { self: Product => + + def evaluate(idToExp: HMap[Id, Expr[N, *]]): N[T] = + Expr.evaluate(idToExp, this) + + /** + * Memoize the hashCode, but notice that Expr is not recursive on itself (only via the Id graph) so it does + * not have the DAG-exponential-equality-and-hashcode issue that Literal and other DAGs have. + * + * We use a lazy val instead of a val for binary compatibility. + */ + override lazy val hashCode: Int = + MurmurHash3.productHash(self) + + final def isVar: Boolean = + this match { + case Expr.Var(_) => true + case _ => false + } +} + +object Expr { + + sealed case class Const[N[_], T](value: N[T]) extends Expr[N, T] { + override def evaluate(idToExp: HMap[Id, Expr[N, *]]): N[T] = + value + } + + sealed case class Var[N[_], T](name: Id[T]) extends Expr[N, T] + + sealed case class Unary[N[_], T1, T2](arg: Id[T1], fn: N[T1] => N[T2]) extends Expr[N, T2] + + sealed case class Binary[N[_], T1, T2, T3](arg1: Id[T1], arg2: Id[T2], fn: (N[T1], N[T2]) => N[T3]) + extends Expr[N, T3] + + sealed case class Variadic[N[_], T1, T2](args: List[Id[T1]], fn: List[N[T1]] => N[T2]) extends Expr[N, T2] + + /** + * What Ids does this expression depend on + */ + def dependsOnIds[N[_], A](expr: Expr[N, A]): List[Id[_]] = + expr match { + case Const(_) => Nil + case Var(id) => id :: Nil + case Unary(id, _) => id :: Nil + case Binary(id0, id1, _) => id0 :: id1 :: Nil + case Variadic(ids, _) => ids + } + + /** + * Evaluate the given expression with the given mapping of Id to Expr. + */ + def evaluate[N[_], T](idToExp: HMap[Id, Expr[N, *]], expr: Expr[N, T]): N[T] = + evaluateMemo(idToExp)(expr) + + /** + * Build a memoized FunctionK for this particular idToExp. Clearly, this FunctionK is only valid for the + * given idToExp which is captured in this closure. 
+ */ + def evaluateMemo[N[_]](idToExp: HMap[Id, Expr[N, *]]): FunctionK[Expr[N, *], N] = { + val fast = Memoize.functionK[Expr[N, *], N](new Memoize.RecursiveK[Expr[N, *], N] { + def toFunction[T] = { + case (Const(n), _) => n + case (Var(id), rec) => rec(idToExp(id)) + case (Unary(id, fn), rec) => + fn(rec(idToExp(id))) + case (Binary(id1, id2, fn), rec) => + fn(rec(idToExp(id1)), rec(idToExp(id2))) + case (Variadic(args, fn), rec) => + fn(args.map(id => rec(idToExp(id)))) + } + }) + + import TailCalls._ + + val slowAndSafe = Memoize.functionKTailRec[Expr[N, *], N](new Memoize.RecursiveKTailRec[Expr[N, *], N] { + def toFunction[T] = { + case (Const(n), _) => done(n) + case (Var(id), rec) => rec(idToExp(id)) + case (Unary(id, fn), rec) => rec(idToExp(id)).map(fn) + case (Binary(id1, id2, fn), rec) => + for { + nn1 <- rec(idToExp(id1)) + nn2 <- rec(idToExp(id2)) + } yield fn(nn1, nn2) + case (Variadic(args, fn), rec) => + def loop[A](as: List[Id[A]]): TailRec[List[N[A]]] = + as match { + case Nil => done(Nil) + case h :: t => loop(t).flatMap(tt => rec(idToExp(h)).map(_ :: tt)) + } + loop(args).map(fn) + } + }) + + def onStackGoSlow[A](lit: Expr[N, A]): N[A] = + try fast(lit) + catch { + case _: Throwable => // StackOverflowError should work, but not on scala.js + slowAndSafe(lit).result + } + + /* + * We *non-recursively* use either the fast approach or the slow approach + */ + Memoize.functionK[Expr[N, *], N](new Memoize.RecursiveK[Expr[N, *], N] { + def toFunction[T] = { case (u, _) => onStackGoSlow(u) } + }) + } + +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/FunctionK.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/FunctionK.scala new file mode 100644 index 0000000000..1498d6dad7 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/FunctionK.scala @@ -0,0 +1,22 @@ +package com.twitter.scalding.dagon + +import java.io.Serializable + +/** + * This is a Natural transformation. 
+ * + * For any type X, this type can produce a function from T[X] to R[X]. + */ +trait FunctionK[T[_], R[_]] extends Serializable { + def apply[U](tu: T[U]): R[U] = + toFunction[U](tu) + + def toFunction[U]: T[U] => R[U] +} + +object FunctionK { + def andThen[A[_], B[_], C[_]](first: FunctionK[A, B], second: FunctionK[B, C]): FunctionK[A, C] = + new FunctionK[A, C] { + def toFunction[U] = first.toFunction[U].andThen(second.toFunction[U]) + } +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Graphs.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Graphs.scala new file mode 100644 index 0000000000..b854840d57 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Graphs.scala @@ -0,0 +1,90 @@ +package com.twitter.scalding.dagon + +import scala.collection.mutable + +object Graphs { + + /** + * Return the depth first enumeration of reachable nodes, NOT INCLUDING INPUT, unless it can be reached via + * neighbors + */ + def depthFirstOf[T](t: T)(nf: NeighborFn[T]): List[T] = + reflexiveTransitiveClosure(nf(t).toList)(nf) + + /** + * All the nodes we can reach from this start, including the initial nodes + */ + def reflexiveTransitiveClosure[T](start: List[T])(nf: NeighborFn[T]): List[T] = { + @annotation.tailrec + def loop(stack: List[T], deps: List[T], acc: Set[T]): List[T] = + stack match { + case Nil => deps + case h :: tail => + val newStack = nf(h).filterNot(acc).foldLeft(tail) { (s, it) => + it :: s + } + val newDeps = if (acc(h)) deps else h :: deps + loop(newStack, newDeps, acc + h) + } + + loop(start, start.distinct, start.toSet).reverse + } + + /** + * Return a NeighborFn for the graph of reversed edges defined by this set of nodes and nf We avoid Sets + * which use hash-codes which may depend on addresses which are not stable from one run to the next. 
+ */ + def reversed[T](nodes: Iterable[T])(nf: NeighborFn[T]): NeighborFn[T] = { + val graph: Map[T, List[T]] = nodes + .foldLeft(Map.empty[T, List[T]]) { (g, child) => + val gWithChild = g + (child -> g.getOrElse(child, Nil)) + nf(child).foldLeft(gWithChild) { (innerg, parent) => + innerg + (parent -> (child :: innerg.getOrElse(parent, Nil))) + } + } + // make sure the values are sets, not .mapValues is lazy in scala + .map { case (k, v) => (k, v.distinct) } + + graph.getOrElse(_, Nil) + } + + /** + * Return the depth of each node in the dag. a node that has no dependencies has depth == 0 else it is max + * of parent + 1 + * + * Behavior is not defined if the graph is not a DAG (for now, it runs forever, may throw later) + */ + def dagDepth[T](nodes: Iterable[T])(nf: NeighborFn[T]): Map[T, Int] = { + val acc = mutable.Map.empty[T, Int] + + @annotation.tailrec + def computeDepth(todo: Set[T]): Unit = + if (!todo.isEmpty) { + def withParents(n: T) = (n :: (nf(n).toList)).filterNot(acc.contains(_)).distinct + + val (doneThisStep, rest) = todo + .map { + withParents(_) + } + .partition { + _.size == 1 + } + + acc ++= doneThisStep.flatten.map { n => + val depth = nf(n) // n is done now, so all it's neighbors must be too. + .map { + acc(_) + 1 + } + .reduceOption { + _ max _ + } + .getOrElse(0) + n -> depth + } + computeDepth(rest.flatten) + } + + computeDepth(nodes.toSet) + acc.toMap + } +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/HCache.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/HCache.scala new file mode 100644 index 0000000000..83a6a050f8 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/HCache.scala @@ -0,0 +1,63 @@ +package com.twitter.scalding.dagon + +import java.io.Serializable + +/** + * This is a useful cache for memoizing natural transformations. + * + * The cache is implemented using a mutable pointer to an immutable map value. 
In the worst-case, race + * conditions might cause us to lose cache values (i.e. compute some keys twice), but we will never produce + * incorrect values. + */ +sealed class HCache[K[_], V[_]] private (init: HMap[K, V]) extends Serializable { + + private[this] var hmap: HMap[K, V] = init + + /** + * Given a key, either return a cached value, or compute, store, and return a new value. + * + * This method is what justifies the existence of Cache. Its second parameter (`v`) is by-name: it will only + * be evaluated in cases where the key is not cached. + * + * For example: + * + * def greet(i: Int): Option[Int] = { println("hi") Option(i + 1) } + * + * val c = Cache.empty[Option, Option] c.getOrElseUpdate(Some(1), greet(1)) // says hi, returns Some(2) + * c.getOrElseUpdate(Some(1), greet(1)) // just returns Some(2) + */ + def getOrElseUpdate[T](k: K[T], v: => V[T]): V[T] = + hmap.get(k) match { + case Some(exists) => exists + case None => + val res = v + hmap = hmap + (k -> res) + res + } + + /** + * Create a second cache with the same values as this one. + * + * The two caches will start with the same values, but will be independently updated. + */ + def duplicate: HCache[K, V] = + new HCache(hmap) + + /** + * Access the currently-cached keys and values as a map. + */ + def toHMap: HMap[K, V] = + hmap + + /** + * Forget all cached keys and values. + * + * After calling this method, the resulting cache is equivalent to Cache.empty[K, V]. + */ + def reset(): Unit = + hmap = HMap.empty[K, V] +} + +object HCache { + def empty[K[_], V[_]]: HCache[K, V] = new HCache(HMap.empty) +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/HMap.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/HMap.scala new file mode 100644 index 0000000000..addc260e81 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/HMap.scala @@ -0,0 +1,100 @@ +/* + Copyright 2014 Twitter, Inc. + Copyright 2017 Stripe, Inc. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.scalding.dagon + +import com.twitter.scalding.dagon.ScalaVersionCompat.{lazyListFromIterator, LazyList} + +import java.io.Serializable + +/** + * This is a weak heterogenous map. It uses equals on the keys, so it is your responsibilty that if k: K[_] == + * k2: K[_] then the types are actually equal (either be careful or store a type identifier). + */ +final class HMap[K[_], V[_]](protected val map: Map[K[_], V[_]]) extends Serializable { + + type Pair[t] = (K[t], V[t]) + + override def toString: String = + "H%s".format(map) + + override def equals(that: Any): Boolean = + that match { + case null => false + case h: HMap[_, _] => map.equals(h.map) + case _ => false + } + + override def hashCode: Int = + map.hashCode + + def updated[T](k: K[T], v: V[T]): HMap[K, V] = + HMap.from[K, V](map.updated(k, v)) + + def +[T](kv: (K[T], V[T])): HMap[K, V] = + HMap.from[K, V](map + kv) + + def ++(other: HMap[K, V]): HMap[K, V] = + HMap.from[K, V](map ++ other.map) + + def -(k: K[_]): HMap[K, V] = + HMap.from[K, V](map - k) + + def apply[T](id: K[T]): V[T] = + map(id).asInstanceOf[V[T]] + + def get[T](id: K[T]): Option[V[T]] = + map.get(id).asInstanceOf[Option[V[T]]] + + def contains[T](id: K[T]): Boolean = + map.contains(id) + + def isEmpty: Boolean = map.isEmpty + + def size: Int = map.size + + def forallKeys(p: K[_] => Boolean): Boolean = + map.forall { case (k, _) => p(k) } + + def filterKeys(p: K[_] => Boolean): 
HMap[K, V] = + HMap.from[K, V](map.filter { case (k, _) => p(k) }) + + def keySet: Set[K[_]] = + map.keySet + + def keysOf[T](v: V[T]): Set[K[T]] = + map.iterator.collect { + case (k, w) if v == w => k.asInstanceOf[K[T]] + }.toSet + + def optionMap[R[_]](f: FunctionK[Pair, Lambda[x => Option[R[x]]]]): LazyList[R[_]] = { + val fnAny = f.toFunction[Any].andThen(_.iterator) + lazyListFromIterator(map.iterator.asInstanceOf[Iterator[(K[Any], V[Any])]].flatMap(fnAny)) + } + + def mapValues[V1[_]](f: FunctionK[V, V1]): HMap[K, V1] = + HMap.from[K, V1](map.map { case (k, v) => k -> f(v) }.toMap) +} + +object HMap { + + def empty[K[_], V[_]]: HMap[K, V] = + from[K, V](Map.empty[K[_], V[_]]) + + private[dagon] def from[K[_], V[_]](m: Map[K[_], V[_]]): HMap[K, V] = + new HMap[K, V](m) +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Id.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Id.scala new file mode 100644 index 0000000000..7546f71034 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Id.scala @@ -0,0 +1,31 @@ +package com.twitter.scalding.dagon + +import java.io.Serializable +import java.util.concurrent.atomic.AtomicLong + +/** + * The Expressions are assigned Ids. Each Id is associated with an expression of inner type T. + * + * This is done to put an indirection in the Dag that allows us to rewrite nodes by simply replacing the + * expressions associated with given Ids. 
+ * + * T is a phantom type used by the type system + */ +final class Id[T] private (val serial: Long) extends Serializable { + require(serial >= 0, s"counter overflow has occurred: $serial") + override def toString: String = s"Id($serial)" +} + +object Id { + + @transient private[this] val counter = new AtomicLong(0) + + def next[T](): Id[T] = + new Id[T](counter.getAndIncrement()) + + implicit def idOrdering[T]: Ordering[Id[T]] = + new Ordering[Id[T]] { + def compare(a: Id[T], b: Id[T]) = + java.lang.Long.compare(a.serial, b.serial) + } +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Literal.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Literal.scala new file mode 100644 index 0000000000..ad17f0153b --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Literal.scala @@ -0,0 +1,126 @@ +package com.twitter.scalding.dagon + +import java.io.Serializable +import scala.util.control.TailCalls +import scala.util.hashing.MurmurHash3 + +/** + * This represents literal expressions (no variable redirection) of container nodes of type N[T] + */ +sealed trait Literal[N[_], T] extends Serializable { self: Product => + def evaluate: N[T] = Literal.evaluate(this) + + override val hashCode: Int = MurmurHash3.productHash(self) + + /** + * Here we memoize as we check equality and always check reference equality first. 
This can dramatically + * improve performance on graphs that merge back often + */ + override def equals(that: Any) = that match { + case thatF: Literal[_, _] => + if (thatF eq this) true + else if (thatF.hashCode != hashCode) false + else Literal.eqFn[N](RefPair(this, thatF.asInstanceOf[Literal[N, _]])) + case _ => false + } +} + +object Literal { + + sealed case class Const[N[_], T](override val evaluate: N[T]) extends Literal[N, T] + + sealed case class Unary[N[_], T1, T2](arg: Literal[N, T1], fn: N[T1] => N[T2]) extends Literal[N, T2] + + sealed case class Binary[N[_], T1, T2, T3]( + arg1: Literal[N, T1], + arg2: Literal[N, T2], + fn: (N[T1], N[T2]) => N[T3] + ) extends Literal[N, T3] + + sealed case class Variadic[N[_], T1, T2](args: List[Literal[N, T1]], fn: List[N[T1]] => N[T2]) + extends Literal[N, T2] + + /** + * This evaluates a literal formula back to what it represents being careful to handle diamonds by creating + * referentially equivalent structures (not just structurally equivalent) + */ + def evaluate[N[_], T](lit: Literal[N, T]): N[T] = + evaluateMemo[N](lit) + + /** + * Memoized version of evaluation to handle diamonds + * + * Each call to this creates a new internal memo. 
+ */ + def evaluateMemo[N[_]]: FunctionK[Literal[N, *], N] = { + import TailCalls._ + + val slowAndSafe = + Memoize.functionKTailRec[Literal[N, *], N](new Memoize.RecursiveKTailRec[Literal[N, *], N] { + def toFunction[T] = { + case (Const(n), _) => done(n) + case (Unary(n, fn), rec) => rec(n).map(fn) + case (Binary(n1, n2, fn), rec) => + for { + nn1 <- rec(n1) + nn2 <- rec(n2) + } yield fn(nn1, nn2) + case (Variadic(args, fn), rec) => + def loop[A](as: List[Literal[N, A]]): TailRec[List[N[A]]] = + as match { + case Nil => done(Nil) + case h :: t => loop(t).flatMap(tt => rec(h).map(_ :: tt)) + } + loop(args).map(fn) + } + }) + + val fast = Memoize.functionK[Literal[N, *], N](new Memoize.RecursiveK[Literal[N, *], N] { + def toFunction[T] = { + case (Const(n), _) => n + case (Unary(n, fn), rec) => fn(rec(n)) + case (Binary(n1, n2, fn), rec) => fn(rec(n1), rec(n2)) + case (Variadic(args, fn), rec) => fn(args.map(rec(_))) + } + }) + + def onStackGoSlow[A](lit: Literal[N, A]): N[A] = + try fast(lit) + catch { + case _: Throwable => // StackOverflowError should work, but not on scala.js + slowAndSafe(lit).result + } + + /* + * We *non-recursively* use either the fast approach or the slow approach + */ + Memoize.functionK[Literal[N, *], N](new Memoize.RecursiveK[Literal[N, *], N] { + def toFunction[T] = { case (u, _) => onStackGoSlow(u) } + }) + } + + /** + * Note that this is a def, not a val, so the cache only lives as long as a single outermost equality check + */ + private def eqFn[N[_]]: Function[RefPair[Literal[N, _], Literal[N, _]], Boolean] = + Memoize.function[RefPair[Literal[N, _], Literal[N, _]], Boolean] { + case (pair, _) if pair.itemsEq => true + case (RefPair(Const(a), Const(b)), _) => a == b + case (RefPair(Unary(left, fa), Unary(right, fb)), rec) => + (fa == fb) && rec(RefPair(left, right)) + case (RefPair(Binary(lefta, righta, fa), Binary(leftb, rightb, fb)), rec) => + (fa == fb) && rec(RefPair(lefta, leftb)) && rec(RefPair(righta, rightb)) + case 
(RefPair(Variadic(argsa, fa), Variadic(argsb, fb)), rec) => + @annotation.tailrec + def loop(left: List[Literal[N, _]], right: List[Literal[N, _]]): Boolean = + (left, right) match { + case (lh :: ltail, rh :: rtail) => + rec(RefPair(lh, rh)) && loop(ltail, rtail) + case (Nil, Nil) => true + case _ => false + } + + (fa == fb) && loop(argsa, argsb) + case other => false + } +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Memoize.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Memoize.scala new file mode 100644 index 0000000000..352e6a402b --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Memoize.scala @@ -0,0 +1,71 @@ +package com.twitter.scalding.dagon + +import scala.util.control.TailCalls.{done, tailcall, TailRec} + +object Memoize { + + /** + * Allow the user to create memoized recursive functions, by providing a function which can operate values + * as well as references to "itself". + * + * For example, we can translate the naive recursive Fibonnaci definition (which is exponential) into an + * opimized linear-time (and linear-space) form: + * + * Memoize.function[Int, Long] { case (0, _) => 1 case (1, _) => 1 case (i, f) => f(i - 1) + f(i - 2) } + */ + def function[A, B](f: (A, A => B) => B): A => B = { + // It is tempting to use a mutable.Map here, + // but mutating the Map inside of the call-by-name value causes + // some issues in some versions of scala. It is + // safer to use a mutable pointer to an immutable Map. + val cache = Cache.empty[A, B] + lazy val g: A => B = (a: A) => cache.getOrElseUpdate(a, f(a, g)) + g + } + + type RecursiveK[A[_], B[_]] = FunctionK[Lambda[x => (A[x], FunctionK[A, B])], B] + + /** + * Memoize a FunctionK using an HCache internally. 
+ */ + def functionK[A[_], B[_]](f: RecursiveK[A, B]): FunctionK[A, B] = { + val hcache = HCache.empty[A, B] + lazy val hg: FunctionK[A, B] = new FunctionK[A, B] { + def toFunction[T]: A[T] => B[T] = + at => hcache.getOrElseUpdate(at, f((at, hg))) + } + hg + } + + private def cacheCall[A](t: => TailRec[A]): TailRec[A] = { + var res: Option[TailRec[A]] = None + tailcall { + res match { + case Some(a) => a + case None => + t.flatMap { a => + val d = done(a) + res = Some(d) + d + } + } + } + } + + type FunctionKRec[A[_], B[_]] = FunctionK[A, Lambda[x => TailRec[B[x]]]] + type RecursiveKTailRec[A[_], B[_]] = + FunctionK[Lambda[x => (A[x], FunctionKRec[A, B])], Lambda[x => TailRec[B[x]]]] + + /** + * Memoize a FunctionK using an HCache, and tailCalls, which are slower but make things stack safe + */ + def functionKTailRec[A[_], B[_]](f: RecursiveKTailRec[A, B]): FunctionKRec[A, B] = { + type TailB[Z] = TailRec[B[Z]] + val hcache = HCache.empty[A, TailB] + lazy val hg: FunctionK[A, TailB] = new FunctionK[A, TailB] { + def toFunction[T]: A[T] => TailB[T] = + at => hcache.getOrElseUpdate(at, cacheCall(f((at, hg)))) + } + hg + } +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/PartialRule.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/PartialRule.scala new file mode 100644 index 0000000000..6ef8fd0e5f --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/PartialRule.scala @@ -0,0 +1,11 @@ +package com.twitter.scalding.dagon + +/** + * Often a partial function is an easier way to express rules + */ +trait PartialRule[N[_]] extends Rule[N] { + final def apply[T](on: Dag[N]): N[T] => Option[N[T]] = + applyWhere[T](on).lift + + def applyWhere[T](on: Dag[N]): PartialFunction[N[T], N[T]] +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/RefPair.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/RefPair.scala new file mode 100644 index 0000000000..d217b324a2 --- /dev/null +++ 
b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/RefPair.scala @@ -0,0 +1,24 @@ +package com.twitter.scalding.dagon + +import scala.util.hashing.MurmurHash3 + +/** + * A tuple2 that uses reference equality on items to do equality useful for caching the results of pair-wise + * functions on DAGs. + * + * Without this, you can easily get exponential complexity on recursion on DAGs. + */ +case class RefPair[A <: AnyRef, B <: AnyRef](_1: A, _2: B) { + + override lazy val hashCode: Int = MurmurHash3.productHash(this) + + override def equals(that: Any) = that match { + case RefPair(thatA, thatB) => (_1 eq thatA) && (_2 eq thatB) + case _ => false + } + + /** + * true if the left is referentially equal to the right + */ + def itemsEq: Boolean = _1 eq _2 +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Rule.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Rule.scala new file mode 100644 index 0000000000..90f7e4ae20 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/Rule.scala @@ -0,0 +1,55 @@ +package com.twitter.scalding.dagon + +import java.io.Serializable + +/** + * This implements a simplification rule on Dags + */ +trait Rule[N[_]] extends Serializable { self => + + /** + * If the given Id can be replaced with a simpler expression, return Some(expr) else None. + * + * If it is convenient, you might write a partial function and then call .lift to get the correct Function + * type + */ + def apply[T](on: Dag[N]): N[T] => Option[N[T]] + + /** + * If the current rule cannot apply, then try the argument here. Note, this applies in series at a given + * node, not on the whole Dag after the first rule has run. For that, see Dag.applySeq. 
+ */ + def orElse(that: Rule[N]): Rule[N] = + new Rule[N] { + def apply[T](on: Dag[N]) = { n => + self.apply(on)(n) match { + case Some(n1) if n1 == n => + // If the rule emits the same as input fall through + that.apply(on)(n) + case None => + that.apply(on)(n) + case s @ Some(_) => s + } + } + + override def toString: String = + s"$self.orElse($that)" + } +} + +object Rule { + + /** + * A Rule that never applies + */ + def empty[N[_]]: Rule[N] = + new Rule[N] { + def apply[T](on: Dag[N]) = { _ => None } + } + + /** + * Build a new Rule out of several using orElse to compose + */ + def orElse[N[_]](it: Iterable[Rule[N]]): Rule[N] = + it.reduceOption(_ orElse _).getOrElse(empty) +} diff --git a/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/package.scala b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/package.scala new file mode 100644 index 0000000000..1dbdcdf230 --- /dev/null +++ b/scalding-dagon/src/main/scala/com/twitter/scalding/dagon/package.scala @@ -0,0 +1,7 @@ +package com.twitter.scalding + +/** Collection of graph algorithms */ +package object dagon { + type BoolT[T] = Boolean + type NeighborFn[T] = T => Iterable[T] +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/CacheTests.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/CacheTests.scala new file mode 100644 index 0000000000..9335747fe2 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/CacheTests.scala @@ -0,0 +1,43 @@ +package com.twitter.scalding.dagon + +import org.scalacheck.Prop._ +import org.scalacheck.{Arbitrary, Cogen, Properties} + +abstract class CacheTests[K: Cogen: Arbitrary, V: Arbitrary](name: String) extends Properties(name) { + + def buildMap(c: Cache[K, V], ks: Iterable[K], f: K => V): Map[K, V] = + ks.iterator.foldLeft(Map.empty[K, V]) { (m, k) => + m.updated(k, c.getOrElseUpdate(k, f(k))) + } + + property("getOrElseUpdate") = forAll { (f: K => V, k: K, v1: V, v2: V) => + val c = Cache.empty[K, V] 
+ var count = 0 + val x = c.getOrElseUpdate(k, { count += 1; v1 }) + val y = c.getOrElseUpdate(k, { count += 1; v2 }) + x == v1 && y == v1 && count == 1 + } + + property("toMap") = forAll { (f: K => V, ks: Set[K]) => + val c = Cache.empty[K, V] + val m = buildMap(c, ks, f) + c.toMap == m + } + + property("duplicate") = forAll { (f: K => V, ks: Set[K]) => + val c = Cache.empty[K, V] + val d = c.duplicate + buildMap(c, ks, f) + d.toMap.isEmpty + } + + property("reset works") = forAll { (f: K => V, ks: Set[K]) => + val c = Cache.empty[K, V] + buildMap(c, ks, f) + val d = c.duplicate + c.reset() + c.toMap.isEmpty && d.toMap.size == ks.size + } +} + +object CacheTestsSL extends CacheTests[String, Long]("CacheTests[String, Long]") diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/DataFlowTest.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/DataFlowTest.scala new file mode 100644 index 0000000000..2433d1b5b2 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/DataFlowTest.scala @@ -0,0 +1,901 @@ +package com.twitter.scalding.dagon + +import org.scalatest.FunSuite +import org.scalatest.prop.GeneratorDrivenPropertyChecks._ +import org.scalacheck.{Arbitrary, Cogen, Gen} +import scala.util.control.TailCalls + +import ScalaVersionCompat.{iterateOnce, lazyListFromIterator, IterableOnce} + +object DataFlowTest { + sealed abstract class Flow[+T] extends Product { + def filter(fn: T => Boolean): Flow[T] = + optionMap(Flow.FilterFn(fn)) + + def map[U](fn: T => U): Flow[U] = + optionMap(Flow.MapFn(fn)) + + def optionMap[U](fn: T => Option[U]): Flow[U] = + Flow.OptionMapped(this, fn) + + def concatMap[U](fn: T => IterableOnce[U]): Flow[U] = + Flow.ConcatMapped(this, fn) + + def ++[U >: T](that: Flow[U]): Flow[U] = + Flow.Merge(this, that) + + def tagged[A](a: A): Flow[T] = + Flow.Tagged(this, a) + + /* + * For large dags you need to eagerly cache hashcode + * or you will stack overflow + */ + override val hashCode = 
scala.util.hashing.MurmurHash3.productHash(this) + + /* + * You need a custom equals to avoid stack overflow + */ + override def equals(that: Any) = { + type Pair = (Flow[Any], Flow[Any]) + import Flow._ + @annotation.tailrec + def loop(pairs: List[Pair]): Boolean = + pairs match { + case Nil => true + case h :: tail => + val (h1, h2) = h + if (h1 eq h2) loop(tail) + else + h match { + case (IteratorSource(as), IteratorSource(bs)) => (as == bs) && loop(tail) + case (OptionMapped(f1, fn1), OptionMapped(f2, fn2)) => + (fn1 == fn2) && { + val pair = (f1, f2) + loop(pair :: tail) + } + case (ConcatMapped(f1, fn1), ConcatMapped(f2, fn2)) => + (fn1 == fn2) && { + val pair = (f1, f2) + loop(pair :: tail) + } + case (Merge(m1, m2), Merge(m3, m4)) => + val pair1 = (m1, m3) + val pair2 = (m2, m4) + loop(pair1 :: pair2 :: tail) + case (Merged(it0), Merged(it1)) => + (it0.size == it1.size) && { + val pairs = it0.zip(it1) + loop(pairs ::: tail) + } + case (Fork(f1), Fork(f2)) => + val pair = (f1, f2) + loop(pair :: tail) + case (Tagged(f1, a1), Tagged(f2, a2)) => + a1 == a2 && { + val pair = (f1, f2) + loop(pair :: tail) + } + case (_, _) => false + } + } + + that match { + case f: Flow[_] => + // since hashCode is computed, let's use this first + // after this, we are dealing with collisions and equality + (this eq f) || ((this.hashCode == f.hashCode) && loop((this, f) :: Nil)) + case _ => false + } + } + } + + object Flow { + def apply[T](it: Iterator[T]): Flow[T] = IteratorSource(it) + + def dependenciesOf(f: Flow[Any]): List[Flow[Any]] = + f match { + case IteratorSource(_) => Nil + case OptionMapped(f, _) => f :: Nil + case ConcatMapped(f, _) => f :: Nil + case Tagged(f, _) => f :: Nil + case Fork(f) => f :: Nil + case Merge(left, right) => left :: right :: Nil + case Merged(ins) => ins + } + + def transitiveDeps(f: Flow[Any]): List[Flow[Any]] = + Graphs.reflexiveTransitiveClosure(List(f))(dependenciesOf _) + + case class IteratorSource[T](it: Iterator[T]) extends Flow[T] 
+ case class OptionMapped[T, U](input: Flow[T], fn: T => Option[U]) extends Flow[U] + case class ConcatMapped[T, U](input: Flow[T], fn: T => IterableOnce[U]) extends Flow[U] + case class Merge[T](left: Flow[T], right: Flow[T]) extends Flow[T] + case class Merged[T](inputs: List[Flow[T]]) extends Flow[T] + case class Tagged[A, T](input: Flow[T], tag: A) extends Flow[T] + case class Fork[T](input: Flow[T]) extends Flow[T] + + def toLiteral: FunctionK[Flow, Literal[Flow, *]] = + Memoize.functionK[Flow, Literal[Flow, *]](new Memoize.RecursiveK[Flow, Literal[Flow, *]] { + import Literal._ + + def toFunction[T] = { + case (it @ IteratorSource(_), _) => Const(it) + case (o: OptionMapped[s, T], rec) => Unary(rec[s](o.input), { f: Flow[s] => OptionMapped(f, o.fn) }) + case (c: ConcatMapped[s, T], rec) => Unary(rec[s](c.input), { f: Flow[s] => ConcatMapped(f, c.fn) }) + case (t: Tagged[a, s], rec) => Unary(rec[s](t.input), { f: Flow[s] => Tagged(f, t.tag) }) + case (f: Fork[s], rec) => Unary(rec[s](f.input), { f: Flow[s] => Fork(f) }) + case (m: Merge[s], rec) => + Binary(rec(m.left), rec(m.right), (l: Flow[s], r: Flow[s]) => Merge(l, r)) + case (m: Merged[s], rec) => Variadic(m.inputs.map(rec(_)), { fs: List[Flow[s]] => Merged(fs) }) + } + }) + + def toLiteralTail: FunctionK[Flow, Literal[Flow, *]] = + FunctionK.andThen[Flow, Lambda[x => TailCalls.TailRec[Literal[Flow, x]]], Literal[Flow, *]]( + Memoize.functionKTailRec[Flow, Literal[Flow, *]]( + new Memoize.RecursiveKTailRec[Flow, Literal[Flow, *]] { + import Literal._ + + def toFunction[T] = { + case (it @ IteratorSource(_), _) => TailCalls.done(Const(it)) + case (o: OptionMapped[s, T], rec) => + rec[s](o.input).map(Unary(_, { f: Flow[s] => OptionMapped(f, o.fn) })) + case (c: ConcatMapped[s, T], rec) => + rec[s](c.input).map(Unary(_, { f: Flow[s] => ConcatMapped(f, c.fn) })) + case (t: Tagged[a, s], rec) => rec[s](t.input).map(Unary(_, { f: Flow[s] => Tagged(f, t.tag) })) + case (f: Fork[s], rec) => 
rec[s](f.input).map(Unary(_, { f: Flow[s] => Fork(f) })) + case (m: Merge[s], rec) => + for { + l <- rec(m.left) + r <- rec(m.right) + } yield Binary(l, r, (l: Flow[s], r: Flow[s]) => Merge(l, r)) + case (m: Merged[s], rec) => + def loop(ins: List[Flow[s]]): TailCalls.TailRec[List[Literal[Flow, s]]] = + ins match { + case Nil => TailCalls.done(Nil) + case h :: tail => + for { + lh <- rec(h) + lt <- loop(tail) + } yield lh :: lt + } + + loop(m.inputs).map(Variadic(_, { fs: List[Flow[s]] => Merged(fs) })) + } + } + ), + new FunctionK[Lambda[x => TailCalls.TailRec[Literal[Flow, x]]], Literal[Flow, *]] { + def toFunction[T] = _.result + } + ) + /* + * use case class functions to preserve equality where possible + */ + private case class FilterFn[A](fn: A => Boolean) extends Function1[A, Option[A]] { + def apply(a: A): Option[A] = if (fn(a)) Some(a) else None + } + + private case class MapFn[A, B](fn: A => B) extends Function1[A, Option[B]] { + def apply(a: A): Option[B] = Some(fn(a)) + } + + private case class ComposedOM[A, B, C](fn1: A => Option[B], fn2: B => Option[C]) + extends Function1[A, Option[C]] { + def apply(a: A): Option[C] = { + // TODO this would be 2x faster if we do it repeatedly and we right associate once in + // advance + // this type checks, but can't be tailrec + // def loop[A1, B1](start: A1, first: A1 => Option[B1], next: B1 => Option[C]): Option[C] = + @annotation.tailrec + def loop(start: Any, first: Any => Option[Any], next: Any => Option[C]): Option[C] = + first match { + case ComposedOM(f1, f2) => + loop(start, f1, ComposedOM(f2, next)) + case notComp => + notComp(start) match { + case None => None + case Some(b) => + next match { + case ComposedOM(f1, f2) => + loop(b, f1, f2) + case notComp => notComp(b) + } + } + } + + loop(a, fn1.asInstanceOf[Any => Option[Any]], fn2.asInstanceOf[Any => Option[C]]) + } + } + private case class ComposedCM[A, B, C](fn1: A => IterableOnce[B], fn2: B => IterableOnce[C]) + extends Function1[A, IterableOnce[C]] { 
+ def apply(a: A): IterableOnce[C] = iterateOnce(fn1(a)).flatMap(fn2) + } + private case class OptionToConcatFn[A, B](fn: A => Option[B]) extends Function1[A, IterableOnce[B]] { + def apply(a: A): IterableOnce[B] = fn(a) match { + case Some(a) => Iterator.single(a) + case None => Iterator.empty + } + } + + /** + * Add explicit fork this is useful if you don't want to have to check each rule for fanout + * + * This rule has to be applied from lower down on the graph looking up to avoid cases where Fork(f) exists + * and f has a fanOut. + */ + object explicitFork extends Rule[Flow] { + def needsFork[N[_]](on: Dag[N], n: N[_]): Boolean = + n match { + case Fork(_) => false + case n => !on.hasSingleDependent(n) + } + def apply[T](on: Dag[Flow]) = { + case OptionMapped(flow, fn) if needsFork(on, flow) => + Some(OptionMapped(Fork(flow), fn)) + case ConcatMapped(flow, fn) if needsFork(on, flow) => + Some(ConcatMapped(Fork(flow), fn)) + case Tagged(flow, tag) if needsFork(on, flow) => + Some(Tagged(Fork(flow), tag)) + case Merge(lhs, rhs) => + // check each side independently: nr must come from rhs (was lhs twice) + val (nl, nr) = (needsFork(on, lhs), needsFork(on, rhs)) + if (!nl && !nr) None + else Some(Merge(if (nl) Fork(lhs) else lhs, if (nr) Fork(rhs) else rhs)) + case Merged(inputs) => + val nx = inputs.map(needsFork(on, _)) + if (nx.forall(_ == false)) None + else Some(Merged(inputs.zip(nx).map { case (n, b) => if (b) Fork(n) else n })) + case _ => + None + } + } + + /** + * f.optionMap(fn1).optionMap(fn2) == f.optionMap { t => fn1(t).flatMap(fn2) } we use object to get good + * toString for debugging + */ + object composeOptionMapped extends Rule[Flow] { + // This recursively scoops up as much as we can into one OptionMapped + // the Any here is to make tailrec work, which until 2.13 does not allow + // the types to change on the calls + @annotation.tailrec + private def compose[B]( + dag: Dag[Flow], + flow: Flow[Any], + fn: Any => Option[B], + diff: Boolean + ): (Boolean, OptionMapped[_, B]) = + flow match { + case OptionMapped(inner,
fn1) if dag.hasSingleDependent(flow) => + compose(dag, inner, ComposedOM(fn1, fn), true) + case _ => (diff, OptionMapped(flow, fn)) + } + + def apply[T](on: Dag[Flow]) = { + case OptionMapped(inner, fn) => + val (changed, res) = compose(on, inner, fn, false) + if (changed) Some(res) + else None + case _ => None + } + } + + /** + * f.concatMap(fn1).concatMap(fn2) == f.concatMap { t => fn1(t).flatMap(fn2) } + */ + object composeConcatMap extends PartialRule[Flow] { + def applyWhere[T](on: Dag[Flow]) = { + case (ConcatMapped(inner @ ConcatMapped(s, fn0), fn1)) if on.hasSingleDependent(inner) => + ConcatMapped(s, ComposedCM(fn0, fn1)) + } + } + + /** + * (a ++ b).concatMap(fn) == (a.concatMap(fn) ++ b.concatMap(fn)) (a ++ b).optionMap(fn) == + * (a.optionMap(fn) ++ b.optionMap(fn)) + */ + object mergePullDown extends PartialRule[Flow] { + def applyWhere[T](on: Dag[Flow]) = { + case (ConcatMapped(merge @ Merge(a, b), fn)) if on.hasSingleDependent(merge) => + a.concatMap(fn) ++ b.concatMap(fn) + case (OptionMapped(merge @ Merge(a, b), fn)) if on.hasSingleDependent(merge) => + a.optionMap(fn) ++ b.optionMap(fn) + } + } + + /** + * we can convert optionMap to concatMap if we don't care about maintaining the knowledge about which fns + * potentially expand the size + */ + object optionMapToConcatMap extends PartialRule[Flow] { + def applyWhere[T](on: Dag[Flow]) = { case OptionMapped(of, fn) => + ConcatMapped(of, OptionToConcatFn(fn)) + } + } + + /** + * right associate merges + */ + object CombineMerges extends Rule[Flow] { + def apply[T](on: Dag[Flow]) = { + @annotation.tailrec + def flatten(f: Flow[T], toCheck: List[Flow[T]], acc: List[Flow[T]]): List[Flow[T]] = + f match { + case m @ Merge(a, b) if on.hasSingleDependent(m) => + // on the inner merges, we only destroy them if they have no fanout + flatten(a, b :: toCheck, acc) + case noSplit => + toCheck match { + case h :: tail => flatten(h, tail, noSplit :: acc) + case Nil => (noSplit :: acc).reverse + } + } + + { node: 
Flow[T] => + node match { + case Merge(a, b) => + flatten(a, b :: Nil, Nil) match { + case a1 :: a2 :: Nil => + None // could not simplify + case many => Some(Merged(many)) + } + case Merged(list @ (h :: tail)) => + val res = flatten(h, tail, Nil) + if (res != list) Some(Merged(res)) + else None + case _ => None + } + } + } + } + + /** + * evaluate single fanout sources + */ + object evalSource extends PartialRule[Flow] { + def applyWhere[T](on: Dag[Flow]) = { + case OptionMapped(src @ IteratorSource(it), fn) if on.hasSingleDependent(src) => + IteratorSource(it.flatMap(fn(_).iterator)) + case ConcatMapped(src @ IteratorSource(it), fn) if on.hasSingleDependent(src) => + IteratorSource(it.flatMap(fn)) + case Merge(src1 @ IteratorSource(it1), src2 @ IteratorSource(it2)) + if it1 != it2 && on.hasSingleDependent(src1) && on.hasSingleDependent(src2) => + IteratorSource(it1 ++ it2) + case Merge(src1 @ IteratorSource(it1), src2 @ IteratorSource(it2)) + if it1 == it2 && on.hasSingleDependent(src1) && on.hasSingleDependent(src2) => + // we need to materialize the left + val left = lazyListFromIterator(it1) + IteratorSource((left ++ left).iterator) + case Merged(Nil) => IteratorSource(Iterator.empty) + case Merged(single :: Nil) => single + case Merged((src1 @ IteratorSource(it1)) :: (src2 @ IteratorSource(it2)) :: tail) + if it1 != it2 && on.hasSingleDependent(src1) && on.hasSingleDependent(src2) => + Merged(IteratorSource(it1 ++ it2) :: tail) + case Merged((src1 @ IteratorSource(it1)) :: (src2 @ IteratorSource(it2)) :: tail) + if it1 == it2 && on.hasSingleDependent(src1) && on.hasSingleDependent(src2) => + // we need to materialize the left + val left = lazyListFromIterator(it1) + Merged(IteratorSource((left ++ left).iterator) :: tail) + } + } + + object removeTag extends PartialRule[Flow] { + def applyWhere[T](on: Dag[Flow]) = { case Tagged(in, _) => + in + } + } + + /** + * these are all optimization rules to simplify + */ + val allRulesList: List[Rule[Flow]] = + 
List(composeOptionMapped, composeConcatMap, mergePullDown, CombineMerges, removeTag, evalSource) + + val allRules = Rule.orElse(allRulesList) + + val ruleGen: Gen[Rule[Flow]] = { + val allRules = List( + composeOptionMapped, + composeConcatMap, + optionMapToConcatMap, + mergePullDown, + CombineMerges, + evalSource, + removeTag + ) + for { + n <- Gen.choose(0, allRules.size) + gen = if (n == 0) Gen.const(List(Rule.empty[Flow])) else Gen.pick(n, allRules) + rs <- gen + } yield rs.reduce((r1: Rule[Flow], r2: Rule[Flow]) => r1.orElse(r2)) + } + + implicit val arbRule: Arbitrary[Rule[Flow]] = + Arbitrary(ruleGen) + + def genFlow[T](g: Gen[T])(implicit cogen: Cogen[T]): Gen[Flow[T]] = { + implicit val arb: Arbitrary[T] = Arbitrary(g) + + def genSource: Gen[Flow[T]] = + Gen.listOf(g).map(l => Flow(l.iterator)) + + /** + * We want to create DAGs, so we need to sometimes select a parent + */ + def reachable(f: Flow[T]): Gen[Flow[T]] = + Gen.lzy(Gen.oneOf(Flow.transitiveDeps(f).asInstanceOf[List[Flow[T]]])) + + val optionMap: Gen[Flow[T]] = + for { + parent <- Gen.lzy(genFlow(g)) + fn <- implicitly[Arbitrary[T => Option[T]]].arbitrary + } yield parent.optionMap(fn) + + val concatMap: Gen[Flow[T]] = + for { + parent <- Gen.lzy(genFlow(g)) + fn <- implicitly[Arbitrary[T => List[T]]].arbitrary + } yield parent.concatMap(fn) + + val merge: Gen[Flow[T]] = + for { + left <- Gen.lzy(genFlow(g)) + right <- Gen.frequency((3, genFlow(g)), (2, reachable(left))) + swap <- Gen.choose(0, 1) + res = if (swap == 1) (right ++ left) else (left ++ right) + } yield res + + val tagged: Gen[Flow[T]] = + for { + tag <- g + input <- genFlow(g) + } yield input.tagged(tag) + + Gen.frequency((4, genSource), (1, optionMap), (1, concatMap), (1, tagged), (1, merge)) + } + + implicit def arbFlow[T: Arbitrary: Cogen]: Arbitrary[Flow[T]] = + Arbitrary(genFlow[T](implicitly[Arbitrary[T]].arbitrary)) + + def expDagGen[T: Cogen](g: Gen[T]): Gen[Dag[Flow]] = { + val empty = Dag.empty[Flow](toLiteral) + + 
Gen.frequency((1, Gen.const(empty)), (10, genFlow(g).map(f => empty.addRoot(f)._1))) + } + + def arbExpDag[T: Arbitrary: Cogen]: Arbitrary[Dag[Flow]] = + Arbitrary(expDagGen[T](implicitly[Arbitrary[T]].arbitrary)) + } +} + +class DataFlowTest extends FunSuite { + + implicit val generatorDrivenConfig = + PropertyCheckConfiguration(minSuccessful = 5000) + + import DataFlowTest._ + + test("basic test 1") { + val f1 = Flow((0 to 100).iterator) + + val branch1 = f1.map(_ * 2).filter(_ % 6 != 0) + val branch2 = f1.map(_ * Int.MaxValue).filter(_ % 6 == 0) + + val tail = (branch1 ++ branch2).map(_ / 3) + + import Flow._ + + val res = Dag.applyRule(tail, toLiteral, mergePullDown.orElse(composeOptionMapped)) + + res match { + case Merge(OptionMapped(s1, fn1), OptionMapped(s2, fn2)) => + assert(s1 == s2) + case other => fail(s"$other") + } + } + + test("basic test 2") { + def it1: Iterator[Int] = (0 to 100).iterator + def it2: Iterator[Int] = (1000 to 2000).iterator + + val f = Flow(it1).map(_ * 2) ++ Flow(it2).filter(_ % 7 == 0) + + Dag.applyRule(f, Flow.toLiteral, Flow.allRules) match { + case Flow.IteratorSource(it) => + assert(it.toList == (it1.map(_ * 2) ++ (it2.filter(_ % 7 == 0))).toList) + case nonSrc => + fail(s"expected total evaluation $nonSrc") + } + + } + + test("fanOut matches") { + + def law(f: Flow[Int], rule: Rule[Flow], maxApplies: Int) = { + val (dag, id) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, maxApplies) + val optF = optimizedDag.evaluate(id) + + val depGraph = SimpleDag[Flow[Any]](Flow.transitiveDeps(optF))(Flow.dependenciesOf _) + + def fanOut(f: Flow[Any]): Int = { + val internal = depGraph.fanOut(f).getOrElse(0) + val external = if (depGraph.isTail(f)) 1 else 0 + internal + external + } + + optimizedDag.allNodes.foreach { n => + assert(depGraph.depth(n) == optimizedDag.depthOf(n), s"$n inside\n$optimizedDag") + assert(optimizedDag.fanOut(n) == fanOut(n), s"$n in $optimizedDag") + assert( + optimizedDag.isRoot(n) == (n == 
optF), + s"$n should not be a root, only $optF is, $optimizedDag" + ) + assert( + depGraph.isTail(n) == optimizedDag.isRoot(n), + s"$n is seen as a root, but shouldn't, $optimizedDag" + ) + } + } + + forAll(law(_, _, _)) + + /** + * Here we have a list of past regressions + */ + val it1 = List(1, 2, 3).iterator + val fn1 = { i: Int => if (i % 2 == 0) Some(i + 1) else None } + val it2 = List(2, 3, 4).iterator + val it3 = List(3, 4, 5).iterator + val fn2 = { i: Int => None } + val fn3 = { i: Int => (0 to i) } + + import Flow._ + + val g = ConcatMapped( + Merge( + OptionMapped(IteratorSource(it1), fn1), + OptionMapped(Merge(IteratorSource(it2), IteratorSource(it3)), fn2) + ), + fn3 + ) + law(g, Flow.allRules, 2) + } + + test("we either totally evaluate or have Iterators with fanOut") { + + def law(f: Flow[Int], ap: Dag[Flow] => Dag[Flow]) = { + val (dag, id) = Dag(f, Flow.toLiteral) + val optDag = ap(dag) + val optF = optDag.evaluate(id) + + optF match { + case Flow.IteratorSource(_) => succeed + case nonEval => + val depGraph = SimpleDag[Flow[Any]](Flow.transitiveDeps(nonEval))(Flow.dependenciesOf _) + + val fansOut = depGraph.nodes + .collect { case src @ Flow.IteratorSource(_) => + src + } + .exists(depGraph.fanOut(_).get > 1) + + assert(fansOut, s"should have fanout: $nonEval") + } + } + + forAll(law(_: Flow[Int], dag => dag(Flow.allRules))) + forAll(law(_: Flow[Int], dag => dag.applySeq(Flow.allRulesList))) + } + + test("addRoot adds roots") { + + def law[T](d: Dag[Flow], f: Flow[T], p: Boolean) = { + val (next, id) = d.addRoot(f) + if (p) { + println((next, id)) + println(next.evaluate(id)) + println(next.evaluate(id) == f) + println(next.idOf(f)) + } + assert(next.isRoot(f)) + assert(next.evaluate(id) == f) + assert(next.evaluate(next.idOf(f)) == f) + } + + { + import Flow._ + val (dag, id0) = Dag(IteratorSource(Iterator.empty), toLiteral) + val iter0 = IteratorSource(Iterator(0)) + val merged0 = Merge(iter0, iter0) + val tagged0 = Tagged(merged0, 638667334) + 
val merged1 = Merge(iter0, Merge(tagged0, merged0)) + val merged2 = Merge(iter0, merged1) + assert(merged0 != merged2) + val tagged1 = Tagged(ConcatMapped(merged2, { i: Int => List(i) }), -2147483648) + val optMapped0 = OptionMapped(tagged1, { i: Int => Some(i) }) + val flow = Merge(tagged0, optMapped0) + + law(dag, flow, false) + assert(flow != null) + } + + implicit val dagArb = Flow.arbExpDag[Int] + forAll { (d: Dag[Flow], f: Flow[Int]) => + law(d, f, false) + } + } + + test("all Dag.allNodes agrees with Flow.transitiveDeps") { + forAll { (f: Flow[Int], rule: Rule[Flow], max: Int) => + val (dag, id) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + + val optF = optimizedDag.evaluate(id) + assert(optimizedDag.allNodes == Flow.transitiveDeps(optF).toSet, s"optimized: $optF $optimizedDag") + } + } + + test("transitiveDependenciesOf matches Flow.transitiveDeps") { + forAll { (f: Flow[Int], rule: Rule[Flow], max: Int) => + val (dag, id) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + + val optF = optimizedDag.evaluate(id) + assert( + optimizedDag.transitiveDependenciesOf(optF) == (Flow.transitiveDeps(optF).toSet - optF), + s"optimized: $optF $optimizedDag" + ) + } + } + + test("Dag: findAll(n).forall(evaluate(_) == n)") { + forAll { (f: Flow[Int], rule: Rule[Flow], max: Int) => + val (dag, id) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + + optimizedDag.allNodes.foreach { n => + optimizedDag.findAll(n).foreach { id => + assert(optimizedDag.evaluate(id) == n, s"$id does not eval to $n in $optimizedDag") + } + } + } + } + + test("apply the empty rule returns eq dag") { + implicit val dag = Flow.arbExpDag[Int] + + forAll { (d: Dag[Flow]) => + assert(d(Rule.empty[Flow]) eq d) + } + } + + test("rules are idempotent") { + def law(f: Flow[Int], rule: Rule[Flow]) = { + val (dag, id) = Dag(f, Flow.toLiteral) + val optimizedDag = dag(rule) + val optF = optimizedDag.evaluate(id) + + val (dag2, id2) = 
Dag(optF, Flow.toLiteral) + val optimizedDag2 = dag2(rule) + val optF2 = optimizedDag2.evaluate(id2) + + assert(optF2 == optF, s"dag1: $optimizedDag -- dag1: $optimizedDag2") + } + + forAll(law _) + } + + test("dependentsOf matches SimpleDag.dependantsOf") { + forAll { (f: Flow[Int], rule: Rule[Flow], max: Int) => + val (dag, id) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + val depGraph = + SimpleDag[Flow[Any]](Flow.transitiveDeps(optimizedDag.evaluate(id)))(Flow.dependenciesOf _) + + optimizedDag.allNodes.foreach { n => + assert(depGraph.depth(n) == optimizedDag.depthOf(n)) + assert( + optimizedDag.dependentsOf(n) == depGraph.dependantsOf(n).fold(Set.empty[Flow[Any]])(_.toSet), + s"node: $n" + ) + assert( + optimizedDag.transitiveDependentsOf(n) == + depGraph.transitiveDependantsOf(n).toSet, + s"node: $n" + ) + } + } + } + + test("dependenciesOf matches toLiteral") { + forAll { (f: Flow[Int]) => + val (dag, id) = Dag(f, Flow.toLiteral) + + def contract(n: Flow[_]): List[Flow[_]] = + Flow.toLiteral(n) match { + case Literal.Const(_) => Nil + case Literal.Unary(n, _) => n.evaluate :: Nil + case Literal.Binary(n1, n2, _) => n1.evaluate :: n2.evaluate :: Nil + case Literal.Variadic(ns, _) => ns.map(_.evaluate) + } + + dag.allNodes.foreach { n => + assert(dag.dependenciesOf(n) == contract(n)) + } + } + } + + test("hasSingleDependent matches fanOut") { + forAll { (f: Flow[Int]) => + val (dag, id) = Dag(f, Flow.toLiteral) + + dag.allNodes.foreach { n => + assert(dag.hasSingleDependent(n) == (dag.fanOut(n) <= 1)) + } + + dag.allNodes + .filter(dag.hasSingleDependent) + .foreach { n => + assert(dag.dependentsOf(n).size <= 1) + } + } + } + + test("findAll works as expected") { + def law(f: Flow[Int], rule: Rule[Flow], max: Int, check: List[Flow[Int]]) = { + val (dag, _) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + + optimizedDag.allNodes.iterator.foreach { n => + assert(optimizedDag.findAll(n).nonEmpty, s"findAll: $n 
$optimizedDag") + } + check.filterNot(optimizedDag.allNodes).foreach { n => + assert(optimizedDag.findAll(n).isEmpty, s"findAll: $n $optimizedDag") + } + } + + law(Flow.IteratorSource(Iterator(1, 2, 3)), Flow.allRules, 1, Nil) + law(Flow.IteratorSource(Iterator(1, 2, 3)).map(_ * 2), Flow.allRules, 1, Nil) + law(Flow.IteratorSource(Iterator(1, 2, 3)).map(_ * 2).tagged(100), Flow.allRules, 1, Nil) + forAll(law _) + } + + test("contains(n) is the same as allNodes.contains(n)") { + forAll { (f: Flow[Int], rule: Rule[Flow], max: Int, check: List[Flow[Int]]) => + val (dag, _) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + + (optimizedDag.allNodes.iterator ++ check.iterator).foreach { n => + assert(optimizedDag.contains(n) == optimizedDag.allNodes(n), s"$n $optimizedDag") + } + } + } + + test("all roots can be evaluated") { + forAll { (roots: List[Flow[Int]], rule: Rule[Flow], max: Int) => + val dag = Dag.empty[Flow](Flow.toLiteral) + + // This is pretty slow with tons of roots, take 10 + val (finalDag, allRoots) = roots.take(10).foldLeft((dag, Set.empty[Id[Int]])) { case ((d, s), f) => + val (nextDag, id) = d.addRoot(f) + (nextDag, s + id) + } + + val optimizedDag = finalDag.applyMax(rule, max) + + allRoots.foreach { id => + assert(optimizedDag.evaluateOption(id).isDefined, s"$optimizedDag $id") + } + } + } + + test("removeTag removes all .tagged") { + forAll { f: Flow[Int] => + val (dag, id) = Dag(f, Flow.toLiteral) + val optDag = dag(Flow.allRules) // includes removeTagged + + optDag.allNodes.foreach { + case Flow.Tagged(_, _) => fail(s"expected no Tagged, but found one") + case _ => succeed + } + } + } + + test("reachableIds are only the set of nodes") { + forAll { (f: Flow[Int], rule: Rule[Flow], max: Int) => + val (dag, id) = Dag(f, Flow.toLiteral) + + val optimizedDag = dag.applyMax(rule, max) + + assert( + optimizedDag.reachableIds.map(optimizedDag.evaluate(_)) == optimizedDag.allNodes, + s"$optimizedDag" + ) + } + } + + test("adding 
explicit forks does not loop") { + forAll { (f: Flow[Int]) => + Dag.applyRule(f, Flow.toLiteral, Flow.explicitFork) + // we are just testing that this does not throw + } + + // Here are some explicit examples: + import Flow._ + val src = IteratorSource(Iterator(1)) + val example = ConcatMapped( + Tagged(Merge(OptionMapped(src, { x: Int => Option(2 * x) }), src), 0), + { x: Int => List(x) } + ) + Dag.applyRule(example, Flow.toLiteral, Flow.explicitFork) + + // Here is an example where we have a root that has fanOut + val d0 = Dag.empty(Flow.toLiteral) + val (d1, id0) = d0.addRoot(src) + val (d2, id1) = d1.addRoot(example) + + d2.apply(Flow.explicitFork) + } + + test("a particular hard case for explicit forks") { + // + // Here we have an implicit fork just before an explicit + // fork, but then try to add explicit forks. This should + // move the implicit fork down to the explicit fork. + import Flow._ + val src = IteratorSource(Iterator(1, 2, 3)) + val fn1: Int => Option[Int] = { x => Option(x + 1) } + val f1 = OptionMapped(src, fn1) + val f2 = Fork(src) + val f3 = OptionMapped(f2, { x: Int => Option(x + 2) }) + val f4 = OptionMapped(f2, { x: Int => Option(x + 3) }) + + // Here is an example where we have a root that has fanOut + val d0 = Dag.empty(Flow.toLiteral) + val (d1, id1) = d0.addRoot(f1) + val (d2, id3) = d1.addRoot(f3) + val (d3, id4) = d2.addRoot(f4) + + val d4 = d3.apply(Flow.explicitFork) + assert(d4.evaluate(id1) == OptionMapped(Fork(src), fn1)) + } + + test("test a giant graph") { + import Flow._ + + @annotation.tailrec + def incrementChain(f: Flow[Int], incs: Int): Flow[Int] = + if (incs <= 0) f + else incrementChain(f.map(_ + 1), incs - 1) + + // val incCount = if (catalysts.Platform.isJvm) 10000 else 1000 + val incCount = 1000 + + val incFlow = incrementChain(IteratorSource((0 to 100).iterator), incCount) + val (dag, id) = Dag(incFlow, Flow.toLiteralTail) + + // make sure we can evaluate the id: + val node1 = dag.evaluate(id) + assert(node1 == 
incFlow) + + assert(dag.depthOfId(id) == Some(incCount)) + assert(dag.depthOf(incFlow) == Some(incCount)) + + val optimizedDag = dag(allRules) + + optimizedDag.evaluate(id) match { + case IteratorSource(it) => + assert(it.toList == (0 to 100).map(_ + incCount).toList) + case other => + fail(s"expected to be optimized: $other") + } + } +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/ExpressionDagTests.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/ExpressionDagTests.scala new file mode 100644 index 0000000000..a2d5e79868 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/ExpressionDagTests.scala @@ -0,0 +1,294 @@ +/* + Copyright 2014 Twitter, Inc. + Copyright 2017 Stripe, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package com.twitter.scalding.dagon + +import org.scalacheck.Prop._ +import org.scalacheck.{Gen, Prop, Properties} + +import ScalaVersionCompat.ieeeDoubleOrdering + +object DagTests extends Properties("Dag") { + + /* + * Here we test with a simple algebra optimizer + */ + + sealed trait Formula[T] { // we actually will ignore T + def evaluate: Int + def closure: Set[Formula[T]] + + def inc(n: Int): Formula[T] = Inc(this, n) + def +(that: Formula[T]): Formula[T] = Sum(this, that) + def *(that: Formula[T]): Formula[T] = Product(this, that) + + override def equals(that: Any) = that match { + case thatF: Formula[_] => eqFn(RefPair(this, thatF)) + case _ => false + } + } + + object Formula { + def apply(n: Int): Formula[Unit] = Constant(n) + + def inc[T](by: Int): Formula[T] => Formula[T] = Inc(_, by) + def sum[T]: (Formula[T], Formula[T]) => Formula[T] = Sum(_, _) + def product[T]: (Formula[T], Formula[T]) => Formula[T] = Product(_, _) + } + + case class Constant[T](override val evaluate: Int) extends Formula[T] { + def closure = Set(this) + } + case class Inc[T](in: Formula[T], by: Int) extends Formula[T] { + override val hashCode = (getClass, in, by).hashCode + def evaluate = in.evaluate + by + def closure = in.closure + this + } + case class Sum[T](left: Formula[T], right: Formula[T]) extends Formula[T] { + override val hashCode = (getClass, left, right).hashCode + def evaluate = left.evaluate + right.evaluate + def closure = (left.closure ++ right.closure) + this + } + case class Product[T](left: Formula[T], right: Formula[T]) extends Formula[T] { + override val hashCode = (getClass, left, right).hashCode + def evaluate = left.evaluate * right.evaluate + def closure = (left.closure ++ right.closure) + this + } + + def eqFn: Function[RefPair[Formula[_], Formula[_]], Boolean] = + Memoize.function[RefPair[Formula[_], Formula[_]], Boolean] { + case (pair, _) if pair.itemsEq => true + case (RefPair(Constant(a), Constant(b)), _) => a == b + case (RefPair(Inc(ia, 
ca), Inc(ib, cb)), rec) => (ca == cb) && rec(RefPair(ia, ib)) + case (RefPair(Sum(lefta, leftb), Sum(righta, rightb)), rec) => + rec(RefPair(lefta, righta)) && rec(RefPair(leftb, rightb)) + case (RefPair(Product(lefta, leftb), Product(righta, rightb)), rec) => + rec(RefPair(lefta, righta)) && rec(RefPair(leftb, rightb)) + case other => false + } + + def testRule[T](start: Formula[T], expected: Formula[T], rule: Rule[Formula]): Prop = { + val got = Dag.applyRule(start, toLiteral, rule) + (got == expected) :| s"$got == $expected" + } + + def genForm: Gen[Formula[Int]] = + Gen.frequency((1, genProd), (1, genSum), (4, genInc), (4, genConst)) + + def genConst: Gen[Formula[Int]] = Gen.chooseNum(Int.MinValue, Int.MaxValue).map(Constant(_)) + + def genInc: Gen[Formula[Int]] = + for { + by <- Gen.chooseNum(Int.MinValue, Int.MaxValue) + f <- Gen.lzy(genForm) + } yield Inc(f, by) + + def genSum: Gen[Formula[Int]] = + for { + left <- Gen.lzy(genForm) + // We have to make dags, so select from the closure of left sometimes + right <- Gen.oneOf(genForm, Gen.oneOf(left.closure.toSeq)) + } yield Sum(left, right) + + def genProd: Gen[Formula[Int]] = + for { + left <- Gen.lzy(genForm) + // We have to make dags, so select from the closure of left sometimes + right <- Gen.oneOf(genForm, Gen.oneOf(left.closure.toSeq)) + } yield Product(left, right) + + /** + * Here we convert our dag nodes into Literal[Formula, T] + */ + def toLiteral: FunctionK[Formula, Literal[Formula, *]] = + Memoize.functionK[Formula, Literal[Formula, *]](new Memoize.RecursiveK[Formula, Literal[Formula, *]] { + def toFunction[T] = { + case (c @ Constant(_), _) => Literal.Const(c) + case (Inc(in, by), f) => Literal.Unary(f(in), Formula.inc(by)) + case (Sum(lhs, rhs), f) => Literal.Binary(f(lhs), f(rhs), Formula.sum) + case (Product(lhs, rhs), f) => Literal.Binary(f(lhs), f(rhs), Formula.product) + } + }) + + /** + * Inc(Inc(a, b), c) = Inc(a, b + c) + */ + object CombineInc extends Rule[Formula] { + def apply[T](on: 
Dag[Formula]) = { + case Inc(i @ Inc(a, b), c) if on.fanOut(i) == 1 => Some(Inc(a, b + c)) + case _ => None + } + } + + object RemoveInc extends PartialRule[Formula] { + def applyWhere[T](on: Dag[Formula]) = { case Inc(f, by) => + Sum(f, Constant(by)) + } + } + + /** + * We should be able to totally evaluate these formulas + */ + object EvaluationRule extends Rule[Formula] { + def apply[T](on: Dag[Formula]) = { + case Sum(Constant(a), Constant(b)) => Some(Constant(a + b)) + case Product(Constant(a), Constant(b)) => Some(Constant(a * b)) + case Inc(Constant(a), b) => Some(Constant(a + b)) + case _ => None + } + } + + @annotation.tailrec + final def fib[A](a0: A, a1: A, n: Int)(fn: (A, A) => A): A = + if (n <= 0) a0 + else if (n == 1) a1 + else fib(a1, fn(a0, a1), n - 1)(fn) + + def timeit[A](a: => A): (Double, A) = { + val start = System.nanoTime() + val res = a + val end = System.nanoTime() + ((end - start).toDouble, res) + } + + // This is a bit noisey due to timing, but often passes + property("Evaluation is at most n^(3.0)") = { + def fibFormula(n: Int): Formula[Unit] = fib(Formula(1), Formula(1), n)(Sum(_, _)) + + def runit(n: Int): (Double, Int) = + timeit(Dag.applyRule(fibFormula(n), toLiteral, EvaluationRule)) match { + case (t, Constant(res)) => (t, res) + case (_, other) => sys.error(s"unexpected result: $other") + } + + def check = { + val (t10, res10) = runit(10) + val (t20, res20) = runit(20) + val (t40, res40) = runit(40) + val (t80, res80) = runit(80) + val (t160, res160) = runit(160) + // if this is polynomial = t(n) ~ Cn^k, so t(20)/t(10) == t(40)/t(20) == 2^k + val k = List(t160 / t80, t80 / t40, t40 / t20, t20 / t10).map(math.log(_) / math.log(2.0)).max + println(s"${t10}, ${t20}, ${t40}, ${t80}, ${t160}, $k") + (res10 == fib(1, 1, 10)(_ + _)) && + (res20 == fib(1, 1, 20)(_ + _)) && + (res40 == fib(1, 1, 40)(_ + _)) && + (res80 == fib(1, 1, 80)(_ + _)) && + (res160 == fib(1, 1, 160)(_ + _)) && + (k < 3.0) // without properly memoized equality 
checks, this rule becomes exponential + } + check || check || check || check // try 4 times if needed to warm up the jit + } + + // Check the Node[T] <=> Id[T] is an Injection for all nodes reachable from the root + + property("toLiteral/Literal.evaluate is a bijection") = forAll(genForm) { form => + toLiteral.apply(form).evaluate == form + } + + property("Going to Dag round trips") = forAll(genForm) { form => + val (dag, id) = Dag(form, toLiteral) + dag.evaluate(id) == form + } + + property("CombineInc does not change results") = forAll(genForm) { form => + val simplified = Dag.applyRule(form, toLiteral, CombineInc) + form.evaluate == simplified.evaluate + } + + property("RemoveInc removes all Inc") = forAll(genForm) { form => + val noIncForm = Dag.applyRule(form, toLiteral, RemoveInc) + def noInc(f: Formula[Int]): Boolean = f match { + case Constant(_) => true + case Inc(_, _) => false + case Sum(l, r) => noInc(l) && noInc(r) + case Product(l, r) => noInc(l) && noInc(r) + } + noInc(noIncForm) && (noIncForm.evaluate == form.evaluate) + } + + // The normal Inc gen recursively calls the general dag Generator + def genChainInc: Gen[Formula[Int]] = + for { + by <- Gen.chooseNum(Int.MinValue, Int.MaxValue) + chain <- genChain + } yield Inc(chain, by) + + def genChain: Gen[Formula[Int]] = Gen.frequency((1, genConst), (3, genChainInc)) + + property("CombineInc compresses linear Inc chains") = forAll(genChain) { chain => + Dag.applyRule(chain, toLiteral, CombineInc) match { + case Constant(n) => true + case Inc(Constant(n), b) => true + case _ => false // All others should have been compressed + } + } + + property("EvaluationRule totally evaluates") = forAll(genForm) { form => + testRule(form, Constant(form.evaluate), EvaluationRule) + } + + property("Crush down explicit diamond") = forAll { (xs0: List[Int], ys0: List[Int]) => + val a = Formula(123) + + // ensure that we won't ever use the same constant on the LHS and RHS + // because we want all our inc nodes to fan out 
to only one other node. + def munge(xs: List[Int]): List[Int] = xs.take(10).map(_ % 10) + val (xs, ys) = (munge(0 :: xs0), munge(0 :: ys0).map(_ + 1000)) + val (x, y) = (xs.sum, ys.sum) + + val complex = xs.foldLeft(a)(_ inc _) + ys.foldLeft(a)(_ inc _) + val expected = a.inc(x) + a.inc(y) + testRule(complex, expected, CombineInc) + } + + property("all tails have fanOut of 1") = forAll { (n1: Int, ns: List[Int]) => + // Make sure we have a set of distinct nodes + val tails = (n1 :: ns).zipWithIndex.map { case (i, idx) => Formula(i).inc(idx) } + + val (dag, roots) = + tails.foldLeft((Dag.empty[Formula](toLiteral), Set.empty[Id[_]])) { case ((d, s), f) => + val (dnext, id) = d.addRoot(f) + (dnext, s + id) + } + + roots.forall(dag.fanOut(_) == 1) + } + + property("depth is non-decreasing further down the graph") = forAll(genForm) { form => + val (dag, id) = Dag(form, toLiteral) + + import dag.depthOf + + val di = dag.depthOfId(id) + val df = depthOf(form) + val prop1 = di.isDefined + val prop2 = di == df + + def prop3 = form match { + case Constant(_) => di == Some(0) + case Inc(a, _) => + (di.get == (depthOf(a).get + 1)) + case Sum(a, b) => + (di.get == (depthOf(a).get + 1)) || (di.get == (depthOf(b).get + 1)) + case Product(a, b) => + (di.get == (depthOf(a).get + 1)) || (di.get == (depthOf(b).get + 1)) + } + + prop1 && prop2 && prop3 + } +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HCacheTests.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HCacheTests.scala new file mode 100644 index 0000000000..dc8461e334 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HCacheTests.scala @@ -0,0 +1,47 @@ +package com.twitter.scalding.dagon + +import org.scalacheck.Prop._ +import org.scalacheck.{Arbitrary, Cogen, Properties} + +abstract class HCacheTests[K[_], V[_]](name: String)(implicit + ka: Arbitrary[K[Int]], + kc: Cogen[K[Int]], + va: Arbitrary[V[Int]] +) extends Properties(name) { + + def buildHMap(c: 
HCache[K, V], ks: Iterable[K[Int]], f: K[Int] => V[Int]): HMap[K, V] = + ks.iterator.foldLeft(HMap.empty[K, V]) { (m, k) => + m.updated(k, c.getOrElseUpdate(k, f(k))) + } + + property("getOrElseUpdate") = forAll { (f: K[Int] => V[Int], k: K[Int], v1: V[Int], v2: V[Int]) => + val c = HCache.empty[K, V] + var count = 0 + val x = c.getOrElseUpdate(k, { count += 1; v1 }) + val y = c.getOrElseUpdate(k, { count += 1; v2 }) + x == v1 && y == v1 && count == 1 + } + + property("toHMap") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => + val c = HCache.empty[K, V] + val m = buildHMap(c, ks, f) + c.toHMap == m + } + + property("duplicate") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => + val c = HCache.empty[K, V] + val d = c.duplicate + buildHMap(c, ks, f) + d.toHMap.isEmpty + } + + property("reset works") = forAll { (f: K[Int] => V[Int], ks: Set[K[Int]]) => + val c = HCache.empty[K, V] + buildHMap(c, ks, f) + val d = c.duplicate + c.reset() + c.toHMap.isEmpty && d.toHMap.size == ks.size + } +} + +object HCacheTestsLL extends HCacheTests[List, List]("HCacheTests[List, List]") diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HMapTests.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HMapTests.scala new file mode 100644 index 0000000000..a26c6b1635 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/HMapTests.scala @@ -0,0 +1,170 @@ +/* + Copyright 2014 Twitter, Inc. + Copyright 2017 Stripe, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.scalding.dagon + +import org.scalacheck.Prop._ +import org.scalacheck.{Arbitrary, Cogen, Gen, Properties} +import Arbitrary.arbitrary + +/** + * This tests the HMap. We use the type system to prove the types are correct and don't (yet?) engage in the + * problem of higher kinded Arbitraries. + */ +object HMapTests extends Properties("HMap") { + + case class Key[T](key: Int) + + object Key { + implicit def arbitraryKey[A]: Arbitrary[Key[A]] = + Arbitrary(arbitrary[Int].map(n => Key(n & 0xff))) + implicit def cogenKey[A]: Cogen[Key[A]] = + Cogen[Int].contramap(_.key) + } + + case class Value[T](value: Int) + + object Value { + implicit def arbitraryValue[A]: Arbitrary[Value[A]] = + Arbitrary(arbitrary[Int].map(n => Value(n & 0xff))) + implicit def cogenValue[A]: Cogen[Value[A]] = + Cogen[Int].contramap(_.value) + } + + type H = HMap[Key, Value] + type K = Key[Int] + type V = Value[Int] + + def fromPairs(kvs: Iterable[(K, V)]): H = + kvs.foldLeft(HMap.empty[Key, Value])(_ + _) + + implicit val arbitraryHmap: Arbitrary[H] = + Arbitrary( + Gen + .listOf(for { + k <- arbitrary[K] + v <- arbitrary[V] + } yield (k, v)) + .map(fromPairs) + ) + + type FK = FunctionK[H#Pair, Lambda[x => Option[Value[x]]]] + type FKValues = FunctionK[Value, Value] + + implicit val arbitraryFunctionK: Arbitrary[FK] = + Arbitrary(arbitrary[(Int, Int) => Option[Int]].map { f => + new FK { + def toFunction[T] = { case (Key(m), Value(n)) => f(m, n).map(Value(_)) } + } + }) + + implicit val arbitraryFunctionKValues: Arbitrary[FKValues] = + Arbitrary(arbitrary[Int => Int].map { f => + new FKValues { + override def toFunction[T] = v => Value(f(v.value)) + } + }) + + property("equals works") = forAll { (m0: Map[K, V], m1: Map[K, V]) => + (fromPairs(m0) == fromPairs(m1)) == (m0 == m1) + } + + property("hashCode/equals consistency") = forAll { (h0: H, h1: H) => + if (h0 == 
h1) h0.hashCode == h1.hashCode else true + } + + property("contains/get consistency") = forAll { (h: H, k: K) => + h.get(k).isDefined == h.contains(k) + } + + property("+/updated consistency") = forAll { (h: H, k: K, v: V) => + h.updated(k, v) == h + (k -> v) + } + + property("adding a pair works") = forAll { (h0: H, k: K, v: V) => + val h1 = h0.updated(k, v) + val expectedSize = if (h0.contains(k)) h0.size else h0.size + 1 + (h1.get(k) == Some(v)) && (h1.size == expectedSize) + } + + property("apply works") = forAll { (h: H, k: K) => + scala.util.Try(h(k)).toOption == h.get(k) + } + + property("size works") = forAll { (m: Map[K, V]) => + fromPairs(m).size == m.size + } + + property("removing a key works") = forAll { (h0: H, k: K) => + val h1 = h0 - k + val expectedSize = if (h0.contains(k)) h0.size - 1 else h0.size + (h1.get(k) == None) && (h1.size == expectedSize) + } + + property("keysOf works") = forAll { (h0: H, k: K, v: V) => + val h1 = h0.updated(k, v) + val newKeys = h1.keysOf(v) -- h0.keysOf(v) + + val sizeIsConsistent = newKeys.size match { + case 0 => h0.contains(k) // k was already set to v + case 1 => h0.get(k).forall(_ != v) // k was not set to v + case _ => false // this should not happen + } + + h1.contains(k) && sizeIsConsistent + } + + property("optionMap works") = forAll { (m: Map[K, V], f: FK) => + val h = fromPairs(m) + val got = h.optionMap(f).map { case Value(v) => v }.toSet + val expected = m.flatMap(f(_)).map { case Value(v) => v }.toSet + got == expected + } + + property("keySet works") = forAll { (m: Map[K, V]) => + m.keySet == fromPairs(m).keySet + } + + property("filterKeys works") = forAll { (h: H, p0: K => Boolean) => + val p = p0.asInstanceOf[Key[_] => Boolean] + val a = h.filterKeys(p) + h.keySet.forall(k => p(k) == a.contains(k)) + } + + property("forallKeys works") = forAll { (h: H, p0: K => Boolean) => + val p = p0.asInstanceOf[Key[_] => Boolean] + h.forallKeys(p) == h.keySet.forall(p) + } + + property("HMap.from works") = forAll 
{ (m: Map[K, V]) => + HMap.from[Key, Value](m.asInstanceOf[Map[Key[_], Value[_]]]) == fromPairs(m) + } + + property("heterogenous equality is false") = forAll { (h: H) => + h != null && h != 33 + } + + property("++ works") = forAll { (m1: Map[K, V], m2: Map[K, V]) => + fromPairs(m1) ++ fromPairs(m2) == fromPairs(m1 ++ m2) + } + + property("mapValues works") = forAll { (m: Map[K, V], fk: FKValues) => + val h = fromPairs(m) + val got = fromPairs(m).mapValues(fk) + got.forallKeys(k => got.get(k) == h.get(k).map(fk(_))) + } +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/LiteralTests.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/LiteralTests.scala new file mode 100644 index 0000000000..afb7e408cb --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/LiteralTests.scala @@ -0,0 +1,139 @@ +/* + Copyright 2014 Twitter, Inc. + Copyright 2017 Stripe, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ */ + +package com.twitter.scalding.dagon + +import org.scalacheck.Prop._ +import org.scalacheck.{Arbitrary, Gen, Properties} + +import Literal.{Binary, Const, Unary, Variadic} + +object LiteralTests extends Properties("Literal") { + case class Box[T](get: T) + + def transitiveClosure[N[_]]( + l: Literal[N, _], + acc: Set[Literal[N, _]] = Set.empty[Literal[N, _]] + ): Set[Literal[N, _]] = l match { + case c @ Const(_) => acc + c + case u @ Unary(prev, _) => if (acc(u)) acc else transitiveClosure(prev, acc + u) + case b @ Binary(p1, p2, _) => + if (acc(b)) acc else transitiveClosure(p2, transitiveClosure(p1, acc + b)) + case v @ Variadic(ins, fn) => + val newNodes = ins.filterNot(acc) + newNodes.foldLeft(acc + v)((res, n) => transitiveClosure(n, res)) + } + + def genBox: Gen[Box[Int]] = Gen.chooseNum(0, 10).map(Box(_)) + + def genConst: Gen[Literal[Box, Int]] = genBox.map(Const(_)) + def genUnary: Gen[Literal[Box, Int]] = + for { + fn <- Arbitrary.arbitrary[(Int) => Int] + bfn = { case Box(b) => Box(fn(b)) }: Box[Int] => Box[Int] + input <- genLiteral + } yield Unary(input, bfn) + + def mk(fn: (Int, Int) => Int) = fn + def genBinary: Gen[Literal[Box, Int]] = + for { + fn <- Gen.oneOf[(Int, Int) => Int](mk(_ * _), mk(_ + _)) + bfn = { case (Box(l), Box(r)) => Box(fn(l, r)) }: (Box[Int], Box[Int]) => Box[Int] + left <- genLiteral + // We have to make dags, so select from the closure of left sometimes + right <- Gen.oneOf(genLiteral, genChooseFrom(transitiveClosure[Box](left))) + } yield Binary(left, right, bfn) + + def genVariadic: Gen[Literal[Box, Int]] = { + def append(cnt: Int, items: List[Literal[Box, Int]]): Gen[List[Literal[Box, Int]]] = + if (cnt > 0) { + + val hGen: Gen[Literal[Box, Int]] = + if (items.nonEmpty) { + val inner = Gen + .oneOf(items.flatMap(transitiveClosure[Box](_))) + .asInstanceOf[Gen[Literal[Box, Int]]] + Gen.frequency((4, Gen.lzy(genLiteral)), (1, inner)) + } else Gen.lzy(genLiteral) + + for { + head <- hGen + rest <- append(cnt - 1, head 
:: items) + } yield rest + } else Gen.const(items) + + for { + argc <- Gen.choose(0, 4) + args <- append(argc, Nil) + fn <- Arbitrary.arbitrary[List[Int] => Int] + bfn = { boxes: List[Box[Int]] => Box(fn(boxes.map { case Box(b) => b })) } + } yield Variadic(args, bfn) + } + + def genChooseFrom[N[_]](s: Set[Literal[N, _]]): Gen[Literal[N, Int]] = + Gen.oneOf(s.toSeq.asInstanceOf[Seq[Literal[N, Int]]]) + + /* + * Create dags. Don't use binary too much as it can create exponentially growing dags + */ + def genLiteral: Gen[Literal[Box, Int]] = + Gen.frequency((6, genConst), (12, genUnary), (2, genBinary), (1, genVariadic)) + + // This evaluates by recursively walking the tree without memoization + // as lit.evaluate should do + def slowEvaluate[T](lit: Literal[Box, T]): Box[T] = lit match { + case Const(n) => n + case Unary(in, fn) => fn(slowEvaluate(in)) + case Binary(a, b, fn) => fn(slowEvaluate(a), slowEvaluate(b)) + case Variadic(ins, fn) => fn(ins.map(slowEvaluate(_))) + } + + property("Literal.evaluate must match simple explanation") = forAll(genLiteral) { (l: Literal[Box, Int]) => + l.evaluate == slowEvaluate(l) + } + + property("equality is transitive") = forAll(genLiteral, genLiteral, genLiteral) { (a, b, c) => + if (a == b) { + if (b == c) (a == c) else true + } else if (b == c) { + (a != c) // otherwise, a == b + } else true + } + + property("binary equality regression check") = forAll(genLiteral, genLiteral) { (a, b) => + if (a != b) { + val fn: (Box[Int], Box[Int]) => Box[Int] = null + Binary(a, b, fn) != Binary(a, a, fn) + } else true + } + + property("reflexive equality") = forAll(genLiteral, genLiteral)((a, b) => (a == b) == (b == a)) + + property("equality spec") = forAll(genLiteral, genLiteral) { (a, b) => + (a, b) match { + case (Const(ca), Const(cb)) => + ((a == b) == (ca == cb)) + case (Unary(ua, fa), Unary(ub, fb)) => + ((a == b) == ((ua == ub) && (fa == fb))) + case (Binary(aa, ab, fa), Binary(ba, bb, fb)) => + ((a == b) == ((aa == ba) && (ab == 
bb) && (fa == fb))) + case (Variadic(as, fa), Variadic(bs, fb)) => + ((a == b) == ((as == bs) && (fa == fb))) + case (_, _) => a != b + } + } +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/MemoizeTests.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/MemoizeTests.scala new file mode 100644 index 0000000000..d943bd8d7b --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/MemoizeTests.scala @@ -0,0 +1,52 @@ +package com.twitter.scalding.dagon + +import org.scalatest.FunSuite + +class MemoizeTests extends FunSuite { + test("fibonacci is linear in time") { + + var calls = 0 + + val fib = + Memoize.function[Int, Long] { (i, f) => + calls += 1 + + i match { + case 0 => 0 + case 1 => 1 + case i => f(i - 1) + f(i - 2) + } + } + + def fib2(n: Int, x: Long, y: Long): Long = + if (n == 0) x + else fib2(n - 1, y, x + y) + + assert(fib(100) == fib2(100, 0L, 1L)) + assert(calls == 101) + } + + test("functionK repeated calls only evaluate once") { + + var calls = 0 + val fn = + Memoize.functionK[BoolT, BoolT](new Memoize.RecursiveK[BoolT, BoolT] { + def toFunction[T] = { case (b, rec) => + calls += 1 + + !b + } + }) + + assert(fn(true) == false) + assert(calls == 1) + assert(fn(true) == false) + assert(calls == 1) + + assert(fn(false) == true) + assert(calls == 2) + assert(fn(false) == true) + assert(calls == 2) + + } +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/ReadmeTest.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/ReadmeTest.scala new file mode 100644 index 0000000000..e1c4313529 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/ReadmeTest.scala @@ -0,0 +1,69 @@ +package readme + +object Example { + + import com.twitter.scalding.dagon._ + + // 1. 
set up an AST type + + sealed trait Eqn[T] { + def unary_-(): Eqn[T] = Negate(this) + def +(that: Eqn[T]): Eqn[T] = Add(this, that) + def -(that: Eqn[T]): Eqn[T] = Add(this, Negate(that)) + } + + case class Const[T](value: Int) extends Eqn[T] + case class Var[T](name: String) extends Eqn[T] + case class Negate[T](eqn: Eqn[T]) extends Eqn[T] + case class Add[T](lhs: Eqn[T], rhs: Eqn[T]) extends Eqn[T] + + object Eqn { + // these function constructors make the definition of + // toLiteral a lot nicer. + def negate[T]: Eqn[T] => Eqn[T] = Negate(_) + def add[T]: (Eqn[T], Eqn[T]) => Eqn[T] = Add(_, _) + } + + // 2. set up a transfromation from AST to Literal + + val toLiteral: FunctionK[Eqn, Literal[Eqn, *]] = + Memoize.functionK[Eqn, Literal[Eqn, *]](new Memoize.RecursiveK[Eqn, Literal[Eqn, *]] { + def toFunction[T] = { + case (c @ Const(_), f) => Literal.Const(c) + case (v @ Var(_), f) => Literal.Const(v) + case (Negate(x), f) => Literal.Unary(f(x), Eqn.negate) + case (Add(x, y), f) => Literal.Binary(f(x), f(y), Eqn.add) + } + }) + + // 3. set up rewrite rules + + object SimplifyNegation extends PartialRule[Eqn] { + def applyWhere[T](on: Dag[Eqn]) = { + case Negate(Negate(e)) => e + case Negate(Const(x)) => Const(-x) + } + } + + object SimplifyAddition extends PartialRule[Eqn] { + def applyWhere[T](on: Dag[Eqn]) = { + case Add(Const(x), Const(y)) => Const(x + y) + case Add(Add(e, Const(x)), Const(y)) => Add(e, Const(x + y)) + case Add(Add(Const(x), e), Const(y)) => Add(e, Const(x + y)) + case Add(Const(x), Add(Const(y), e)) => Add(Const(x + y), e) + case Add(Const(x), Add(e, Const(y))) => Add(Const(x + y), e) + } + } + + val rules = SimplifyNegation.orElse(SimplifyAddition) + + // 4. 
apply rewrite rules to a particular AST value + + val a: Eqn[Unit] = Var("x") + Const(1) + val b1: Eqn[Unit] = a + Const(2) + val b2: Eqn[Unit] = a + Const(5) + Var("y") + val c: Eqn[Unit] = b1 - b2 + + val simplified: Eqn[Unit] = + Dag.applyRule(c, toLiteral, rules) +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/RealNumberTest.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/RealNumberTest.scala new file mode 100644 index 0000000000..7835c79bf7 --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/RealNumberTest.scala @@ -0,0 +1,1109 @@ +package com.twitter.scalding.dagon + +import org.scalatest.FunSuite +import org.scalacheck.{Arbitrary, Gen, Shrink} +import org.scalatest.prop.GeneratorDrivenPropertyChecks._ + +object RealNumbers { + + class SortedList[+A] private (val toList: List[A]) { + def apply[A1 >: A](a: A1): Boolean = + toList.contains(a) + + def tail: SortedList[A] = + new SortedList(toList.tail) + + def filter(fn: A => Boolean): SortedList[A] = + new SortedList(toList.filter(fn)) + + def filterNot(fn: A => Boolean): SortedList[A] = + new SortedList(toList.filterNot(fn)) + + def collect[B: Ordering](fn: PartialFunction[A, B]): SortedList[B] = + new SortedList(toList.collect(fn).sorted) + + def |[A1 >: A: Ordering](that: SortedList[A1]): SortedList[A1] = + // could do a merge-sort here in linear time + new SortedList((toList reverse_::: that.toList).sorted) + + def +[A1 >: A: Ordering](a: A1): SortedList[A1] = + new SortedList((a :: toList).sorted) + + def removeFirst[A1 >: A](a: A1): SortedList[A1] = + toList match { + case Nil => new SortedList(Nil) + case h :: tail if h == a => new SortedList(tail) + case h :: tail => + val tl = new SortedList(tail) + new SortedList(h :: (tl.removeFirst(a)).toList) + } + + def ++[A1 >: A: Ordering](that: Iterable[A1]): SortedList[A1] = + new SortedList((toList ++ that).sorted) + + override def equals(that: Any) = + that match { + case sl: SortedList[_] => 
toList == sl.toList + case _ => false + } + override def hashCode: Int = toList.hashCode + } + object SortedList { + val empty: SortedList[Nothing] = new SortedList(Nil) + + def unapply[A](list: SortedList[A]): Some[List[A]] = + Some(list.toList) + + def fromList[A: Ordering](as: List[A]): SortedList[A] = + new SortedList(as.sorted) + + def apply[A: Ordering](as: A*): SortedList[A] = + new SortedList(as.toList.sorted) + + implicit def sortedListOrd[A: Ordering]: Ordering[SortedList[A]] = { + val ordList: Ordering[Iterable[A]] = Ordering.Iterable[A] + ordList.on { ls: SortedList[A] => ls.toList } + } + + // Get all the list methods + implicit def toList[A](sl: SortedList[A]): List[A] = sl.toList + } + + sealed abstract class Real { self: Product => + import Real._ + + // cache the hashcode + override val hashCode = scala.util.hashing.MurmurHash3.productHash(self) + override def equals(that: Any) = that match { + case thatF: Real => + if (thatF eq this) true + else if (thatF.hashCode != hashCode) false + else { + @annotation.tailrec + def loop(todo: List[RefPair[Real, Real]], seen: Set[RefPair[Real, Real]]): Boolean = + todo match { + case Nil => true + case rf :: tail => + if (rf.itemsEq) loop(tail, seen) + else + rf match { + case RefPair(Const(a), Const(b)) => + (a == b) && loop(tail, seen) + case RefPair(Variable(a), Variable(b)) => + (a == b) && loop(tail, seen) + case RefPair(Sum(as), Sum(bs)) => + (as.size == bs.size) && { + val stack = + as.iterator + .zip(bs.iterator) + .map { case (a, b) => + RefPair(a, b) + } + .filterNot(seen) + .toList + + loop(stack reverse_::: tail, seen ++ stack) + } + case RefPair(Prod(as), Prod(bs)) => + (as.size == bs.size) && { + val stack = + as.iterator + .zip(bs.iterator) + .map { case (a, b) => + RefPair(a, b) + } + .filterNot(seen) + .toList + + loop(stack reverse_::: tail, seen ++ stack) + } + case _ => false + } + } + loop(RefPair[Real, Real](this, thatF) :: Nil, Set.empty) + } + case _ => false + } + + /** + * This 
multiplicativily increases the size of the Real, be careful, consider (a1 + a2 + .. an) * (b1 + b2 + * + .. + bk) we wind up with n*k items from n + k. We can wind up exponentially larger: (a0 + a1)*(b0 + + * b1)*.... will be exponentially larger number of terms after expand + */ + def expand: Real = + this match { + case Const(_) | Variable(_) => this + case Sum(s) => Real.sum(SortedList.fromList(s.map(_.expand))) + case Prod(p) => + // for all the sums in here we need do the full cross product + val nonSums = p.filter(_.toSum.isEmpty) + val sums = p.toList.collect { case Sum(s) => s.toList } + def cross(ls: List[List[Real]]): List[Real] = + // (a + b), (c + d)... = a * cross(tail) + b * cross(tail) ... + ls match { + case Nil => Const(1.0) :: Nil + case h0 :: Nil => h0 + case h :: tail => + cross(tail).flatMap { term => + h.map(h => Real.prod(SortedList.fromList(h :: term :: Nil))) + } + } + val sum1 = Real.sum(SortedList.fromList(cross(sums))) + if (nonSums.isEmpty) sum1 + else { + Real.prod(Real.prod(nonSums), sum1) + } + } + + def +(that: Real): Real = + Real.sum(this, that) + + def unary_-(): Real = + Real.prod(Const(-1.0), this) + + def -(that: Real): Real = + this + (-that) + + def *(that: Real): Real = + Real.prod(this, that) + + def evaluate(m: Map[String, Double]): Option[Double] = + this match { + case Const(d) => Some(d) + case Variable(v) => m.get(v) + case Sum(s) => + s.iterator.foldLeft(Option(0.0)) { + case (None, _) => None + case (Some(d), v) => v.evaluate(m).map(_ + d) + } + case Prod(p) => + p.iterator.foldLeft(Option(1.0)) { + case (None, _) => None + case (Some(d), v) => v.evaluate(m).map(_ * d) + } + } + + def freeVars: Set[String] = + this match { + case Const(_) => Set.empty + case Variable(v) => Set(v) + case Sum(v) => v.iterator.flatMap(_.freeVars).toSet + case Prod(v) => v.iterator.flatMap(_.freeVars).toSet + } + + def toSum: Option[Sum] = + this match { + case s @ Sum(_) => Some(s) + case _ => None + } + def toProd: Option[Prod] = + 
this match { + case p @ Prod(_) => Some(p) + case _ => None + } + def toConst: Option[Const] = + this match { + case c @ Const(_) => Some(c) + case _ => None + } + + /** + * If this divides the current value return a Some(res) such that res * r0 == this + */ + def divOpt(r0: Real): Option[Real] = + r0 match { + case z if z.isDefinitelyZero => None + case c @ Const(_) if !c.isFinite => None + case same if same == self => Some(one) + case Prod(ps) => + // to divide by a product all must divide + @annotation.tailrec + def loop(r: Real, ps: SortedList[Real]): Option[Real] = + if (ps.isEmpty) Some(r) + else + r.divOpt(ps.head) match { + case None => None + case Some(r1) => loop(r1, ps.tail) + } + loop(this, ps) + case nonProd => + // note r is not a product, so we can do the naive thing: + this match { + case Prod(ps) => + // if any of these ps can be divided by r0, we are good + def allFocii[A]( + head: List[A], + focus: A, + tail: List[A], + acc: List[(List[A], A, List[A])] + ): List[(List[A], A, List[A])] = + tail match { + case Nil => (head, focus, Nil) :: acc + case h :: t => allFocii(focus :: head, h, t, (head, focus, tail) :: acc) + } + ps.toList match { + case Nil => Const(1.0).divOpt(nonProd) + case h :: tail => + val trials = allFocii(Nil, h, tail, Nil) + trials.iterator + .map { case (l, f, r) => + f.divOpt(nonProd).map(div => Real.prod(SortedList.fromList(div :: l reverse_::: r))) + } + .collectFirst { case Some(res) => res } + } + case c @ Const(d) if c.isFinite => + nonProd match { + // we want to make progress, not do a naive division + case c1 @ Const(d1) if c1.isFinite && d1 != 1.0 => Some(Const(d / d1)) + case _ => None + } + case _ => None + } + } + + override def toString = { + def loop(r: Real): String = + r match { + case Variable(x) => x + case Const(d) => d.toString + case Sum(s) => + s.iterator.map(loop(_)).mkString("(", " + ", ")") + case Prod(p) => + p.iterator.map(loop(_)).mkString("(", "*", ")") + } + loop(this) + } + + def isDefinitelyZero: 
Boolean = + this match { + case Const(d) => d == 0.0 + case Variable(_) => false + case Sum(s) => s.isEmpty || s.forall(_.isDefinitelyZero) + case Prod(s) => s.exists(_.isDefinitelyZero) + } + + def cost: Int = { + def costOp(s: SortedList[Real]): Int = + if (s.isEmpty) 0 + else s.iterator.map(_.cost).sum + (s.iterator.length - 1) + + this match { + case Variable(_) | Const(_) => 0 + case Sum(s) => costOp(s) + case Prod(p) => costOp(p) + } + } + + /** + * What is the order of polynomial for each variable + */ + def orderMap: Map[String, Int] = + this match { + case Const(_) => Map.empty + case Variable(x) => Map((x, 1)) + case Sum(items) => + items + .foldLeft(Map.empty[String, Int]) { (o, v) => + val ov = v.orderMap + (o.keySet ++ ov.keySet).foldLeft(o) { case (o, k) => + o.updated(k, o.getOrElse(k, 0).max(ov.getOrElse(k, 0))) + } + } + case Prod(items) => + items + .foldLeft(Map.empty[String, Int]) { (o, v) => + val ov = v.orderMap + (o.keySet ++ ov.keySet).foldLeft(o) { case (o, k) => + o.updated(k, o.getOrElse(k, 0) + ov.getOrElse(k, 0)) + } + } + } + } + object Real { + case class Const(toDouble: Double) extends Real { + def isFinite: Boolean = + java.lang.Double.isFinite(toDouble) + } + case class Variable(name: String) extends Real + // use a sorted set, we have unique representations + case class Sum(terms: SortedList[Real]) extends Real + case class Prod(terms: SortedList[Real]) extends Real + + val zero: Real = Const(0.0) + val one: Real = Const(1.0) + + def const(d: Double): Real = Const(d) + def variable(v: String): Real = Variable(v) + + // What things can a given number + def divisors(r: Real): List[Real] = + (r match { + case p @ Prod(ps) => + p :: ps.flatMap(divisors(_)) + case nonProd => nonProd :: Nil + }).filterNot(_.isDefinitelyZero) + + def sum(a: Real, b: Real): Real = + sum(SortedList(a, b)) + + def sum(s0: SortedList[Real]): Real = { + val s = s0.filterNot(_.isDefinitelyZero) + if (s.isEmpty) zero + else if (s.size == 1) s.head + else { + 
val sums = s.iterator.map(_.toSum).collect { case Some(Sum(ps)) => ps }.toList.flatten + val nonSum = s.filterNot(_.toSum.isDefined) + if (sums.isEmpty) Sum(nonSum) + else sum(nonSum ++ sums) + } + } + + def prod(a: Real, b: Real): Real = + Prod(SortedList(a, b)) + + def prod(s0: SortedList[Real]): Real = { + def isOne(a: Real): Boolean = + a match { + case Sum(s) => false + case Const(d) => d == 1.0 + case Variable(_) => false + case Prod(s) => s.forall(isOne) + } + + val s = s0.filterNot(isOne) + if (s.isEmpty) one + else if (s.size == 1) s.head + else if (s.exists(_.isDefinitelyZero)) zero + else { + val prods = s.iterator.map(_.toProd).collect { case Some(Prod(ps)) => ps }.toList.flatten + val nonProd = s.filterNot(_.toProd.isDefined) + if (prods.isEmpty) Prod(nonProd) + else prod(nonProd ++ prods) + } + } + + implicit def ordReal[R <: Real]: Ordering[R] = + new Ordering[R] { + def compareIt(a: Iterator[Real], b: Iterator[Real]): Int = { + @annotation.tailrec + def loop(): Int = + (a.hasNext, b.hasNext) match { + case (true, true) => + val c = compareReal(a.next, b.next) + if (c == 0) loop() else c + case (false, true) => -1 + case (true, false) => 1 + case (false, false) => 0 + } + + loop() + } + def compare(a: R, b: R) = compareReal(a, b) + + def compareReal(a: Real, b: Real) = + (a, b) match { + case (Const(a), Const(b)) => java.lang.Double.compare(a, b) + case (Const(_), _) => -1 + case (Variable(a), Variable(b)) => a.compareTo(b) + case (Variable(_), Const(_)) => 1 + case (Variable(_), _) => -1 + case (Sum(a), Sum(b)) => compareIt(a.iterator, b.iterator) + case (Sum(_), Const(_) | Variable(_)) => 1 + case (Sum(_), Prod(_)) => -1 + case (Prod(a), Prod(b)) => compareIt(a.iterator, b.iterator) + case (Prod(_), Const(_) | Variable(_) | Sum(_)) => 1 + } + } + + def genReal(depth: Int): Gen[Real] = { + val const = Gen.choose(-1000, 1000).map(i => Const(i.toDouble)) + val variable = Gen.choose('a', 'z').map(v => Variable(v.toString)) + if (depth <= 0) 
Gen.oneOf(const, variable) + else { + val rec = Gen.lzy(genReal(depth - 1)) + val items = Gen.choose(0, 10).flatMap(Gen.listOfN(_, rec)) + val sum = items.map(ls => Real.sum(SortedList(ls: _*))) + val prod = items.map(ls => Real.prod(SortedList(ls: _*))) + Gen.oneOf(const, variable, sum, prod) + } + } + + implicit val arbReal: Arbitrary[Real] = Arbitrary(genReal(4)) + + implicit val shrinkReal: Shrink[Real] = + Shrink { + case Const(_) => Stream.empty + case Variable(_) => Const(1.0) #:: Stream.empty + case Sum(items) if items.isEmpty => Const(0.0) #:: Stream.empty + case Sum(items) => + val smaller = Sum(items.tail) + smaller #:: shrinkReal.shrink(smaller) + case Prod(items) if items.isEmpty => Const(1.0) #:: Stream.empty + case Prod(items) => + val smaller = Prod(items.tail) + smaller #:: shrinkReal.shrink(smaller) + } + } + + type RealN[A] = Real + def toLiteral: FunctionK[RealN, Literal[RealN, *]] = + Memoize.functionK[RealN, Literal[RealN, *]](new Memoize.RecursiveK[RealN, Literal[RealN, *]] { + import Real._ + def toFunction[T] = { + case (r @ (Const(_) | Variable(_)), _) => Literal.Const(r) + case (Sum(rs), rec) => + Literal.Variadic[RealN, T, T](rs.iterator.map(rec[T](_)).toList, rs => Sum(SortedList(rs: _*))) + case (Prod(rs), rec) => + Literal.Variadic[RealN, T, T](rs.iterator.map(rec[T](_)).toList, rs => Prod(SortedList(rs: _*))) + } + }) + + sealed trait Parser[+A] { + def apply(s: String): Option[(String, A)] + def map[B](fn: A => B): Parser[B] = Parser.Map(this, fn) + def zip[B](that: Parser[B]): Parser[(A, B)] = Parser.Zip(this, that) + def |[A1 >: A](that: Parser[A1]): Parser[A1] = + (this, that) match { + case (Parser.OneOf(l), Parser.OneOf(r)) => Parser.OneOf(l ::: r) + case (l, Parser.OneOf(r)) => Parser.OneOf(l :: r) + case (Parser.OneOf(l), r) => Parser.OneOf(l :+ r) + case (l, r) => Parser.OneOf(List(l, r)) + } + + def ? 
: Parser[Option[A]] = + map(Some(_)) | Parser.Pure(None) + + def *>[B](that: Parser[B]): Parser[B] = + zip(that).map(_._2) + + def <*[B](that: Parser[B]): Parser[A] = + zip(that).map(_._1) + } + + object Parser { + final case class Pure[A](a: A) extends Parser[A] { + def apply(s: String) = Some((s, a)) + } + final case class Map[A, B](p: Parser[A], fn: A => B) extends Parser[B] { + def apply(s: String) = p(s).map { case (s, a) => (s, fn(a)) } + } + final case class Zip[A, B](a: Parser[A], b: Parser[B]) extends Parser[(A, B)] { + def apply(s: String) = a(s).flatMap { case (s, a) => b(s).map { case (s, b) => (s, (a, b)) } } + } + + final case class OneOf[A](ls: List[Parser[A]]) extends Parser[A] { + def apply(s: String) = { + @annotation.tailrec + def loop(ls: List[Parser[A]]): Option[(String, A)] = + ls match { + case Nil => None + case h :: tail => + h(s) match { + case None => loop(tail) + case some => some + } + } + loop(ls) + } + } + + final case class Rep[A](a: Parser[A]) extends Parser[List[A]] { + def apply(str: String) = { + @annotation.tailrec + def loop(str: String, acc: List[A]): (String, List[A]) = + a(str) match { + case None => (str, acc.reverse) + case Some((rest, a)) => loop(rest, a :: acc) + } + + Some(loop(str, Nil)) + } + } + + final case class StringParser(expect: String) extends Parser[String] { + val len = expect.length + def apply(s: String) = + if (s.startsWith(expect)) Some((s.drop(len), expect)) + else None + } + + final case class LazyParser[A](p: () => Parser[A]) extends Parser[A] { + private lazy val pa: Parser[A] = { + @annotation.tailrec + def loop(p: Parser[A]): Parser[A] = + p match { + case LazyParser(lp) => loop(lp()) + case nonLazy => nonLazy + } + + loop(p()) + } + + def apply(s: String) = pa(s) + } + + def str(s: String): Parser[String] = StringParser(s) + def chr(c: Char): Parser[String] = StringParser(c.toString) + def number(n: Int): Parser[Int] = StringParser(n.toString).map(_ => n) + def defer[A](p: => Parser[A]): Parser[A] 
= LazyParser(() => p) + } + + val realParser: Parser[Real] = { + val variable: Parser[Real] = + Parser + .OneOf( + ('a' to 'z').toList.map(Parser.chr(_)) + ) + .map(Real.Variable(_)) + + val digit: Parser[Int] = Parser.OneOf((0 to 9).toList.map(Parser.number(_))) + val intP: Parser[Double] = + digit + .zip(Parser.Rep(digit)) + .map { case (d, ds) => + (d :: ds).foldLeft(0.0)((acc, d) => acc * 10.0 + d) + } + + val constP = + (Parser + .chr('-') + .? + .zip(intP.zip((Parser.chr('.') *> Parser.Rep(digit)).?))) + .map { + case (s, (h, None)) => s.fold(h)(_ => -h) + case (s, (h, Some(rest))) => + val num = rest.reverse.foldLeft(0.0)((acc, d) => acc / 10.0 + d) + val pos = h + (num / 10.0) + s.fold(pos)(_ => -pos) + } + .map(Real.const(_)) + + val recurse = Parser.defer(realParser) + + def op(str: String): Parser[SortedList[Real]] = { + val left = Parser.chr('(') + val right = Parser.chr(')') + val rest = Parser.Rep(Parser.str(str) *> recurse) + (left *> recurse.zip(rest) <* right) + .map { case (h, t) => + SortedList((h :: t): _*) + } + } + + variable | constP | op(" + ").map(Real.Sum(_)) | op("*").map(Real.Prod(_)) + } + + object CombineProdSum extends Rule[RealN] { + import Real._ + + def apply[A](dag: Dag[RealN]) = { + case Sum(inner) if inner.exists(_.toSum.isDefined) => + val nonSum = inner.filter(_.toSum.isEmpty) + val innerSums = inner.flatMap(_.toSum match { + case Some(Sum(s)) => s + case None => SortedList.empty + }) + Some(sum(nonSum ++ innerSums)) + + case Prod(inner) if inner.exists(_.toProd.isDefined) => + val nonProd = inner.filter(_.toProd.isEmpty) + val innerProds = inner.flatMap(_.toProd match { + case Some(Prod(s)) => s + case None => SortedList.empty + }) + Some(prod(nonProd ++ innerProds)) + + case _ => None + } + } + + object CombineConst extends Rule[RealN] { + import Real._ + + def combine(r: Real): Option[Real] = r match { + case Sum(inner) if inner.count(_.toConst.isDefined) > 1 => + val nonConst = 
inner.filter(_.toConst.isEmpty).filterNot(_.isDefinitelyZero) + val c = inner.collect { case Const(d) => d }.sum + Some(sum(nonConst + Const(c))) + case Prod(inner) if inner.count(_.toConst.isDefined) > 1 => + val nonConst = inner.filter(_.toConst.isEmpty) + val c = inner.collect { case Const(d) => d }.product + Some(prod(nonConst + Const(c))) + case Prod(inner) if inner.exists(_.isDefinitelyZero) => + Some(zero) + case _ => None + } + + def apply[A](dag: Dag[RealN]) = combine(_) + } + + object RemoveNoOp extends Rule[RealN] { + import Real._ + def apply[A](dag: Dag[RealN]) = { + case Sum(rs) if rs.exists(_.isDefinitelyZero) => + Some(sum(rs.filterNot(_.isDefinitelyZero))) + case Prod(rs) if rs.collectFirst { case Const(1.0) => () }.nonEmpty => + Some(prod(rs)) + case _ => None + } + } + + /* + * The idea here is to take (ab + ac) to a(b + c) + * + */ + object ReduceProd extends Rule[RealN] { + import Real._ + + def bestPossible(c0: Int, r: Real): Option[(Int, Real)] = + r match { + case Sum(maybeProd) if maybeProd.lengthCompare(1) > 0 => + // println(s"trying: $r") + // these are all the things that divide at least one item + val allProds = maybeProd.flatMap(divisors(_)).distinct.sorted + // each of the products are candidates for reducing: + allProds.iterator + .map { p => + // we could try dividing each term by p, but maybe we need to group them + // into sums to make it work: + // e.g. (a + b + c + 1*(a + b + c)), here, (a+b+c) does not + // divide any of the rest, but it does divide the union. 
+ // + // To handle this case, if we have a sum, subtract p + val (hadP, maybeProdNotP) = p match { + case Sum(ps) if ps.nonEmpty && ps.forall(maybeProd(_)) => + (true, ps.foldLeft(maybeProd)(_.removeFirst(_))) + case _ => (false, maybeProd) + } + val divO = maybeProdNotP.toList.map(pr => (pr.divOpt(p), pr)) + val canDiv = divO.collect { case (Some(res), _) => res } + val noDiv = divO.collect { case (None, pr) => pr } + // we don't want to use Real.sum here which can + // do normalizations we don't want in a rule + val cd = if (hadP) one :: canDiv else canDiv + val canDiv1 = Sum(SortedList.fromList(cd)) + val res = (p, canDiv1, noDiv) + + // println(s"$r => $res") + res + } + // we want to factor from at least two items + .filter { + case (_, Sum(items), _) => items.lengthCompare(2) >= 0 + case _ => false + } + .map { case (p, canDiv1, noDiv) => + /* + * p*canDiv + noDiv + */ + val noDiv1 = sum(SortedList.fromList(noDiv)) + val r1 = sum(prod(p, canDiv1), noDiv1) + val c1 = r1.cost + if (c1 < c0) { + // println(s"decreased cost: $c1 from $c0: $r => $r1") + (c1, r1) + } else { + // println(s"did not decrease cost: $c1 from $c0: $r") + // this is ad-hoc... 
other rules can lower cost as well here + val canDiv2 = CombineConst.combine(canDiv1).getOrElse(canDiv1) + val r1 = sum(prod(p, canDiv2), noDiv1) + val res = (r1.cost, r1) + // println(s"try 2 to decrease cost: ${res._1} from $c0: $r => $r1") + res + } + } + .filter { case (c1, _) => c1 <= c0 } // allow groupings that don't reduce cost + .toList + .sorted + .headOption + case _ => None + } + + def apply[A](dag: Dag[RealN]) = { r => + bestPossible(r.cost, r).map(_._2) + } + } + + val allRules0: Rule[RealN] = + CombineConst.orElse(CombineProdSum).orElse(RemoveNoOp) + + // ReduceProd is a bit expensive, do it after everything else can't be applied + val allRules: List[Rule[RealN]] = + allRules0 :: (ReduceProd.orElse(allRules0)) :: Nil + + implicit val arbRule: Arbitrary[Rule[RealN]] = + Arbitrary(Gen.oneOf(CombineConst, CombineProdSum, RemoveNoOp, ReduceProd)) + + /** + * Unsafe string parsing, to be used in testing + */ + def real(s: String): Real = + realParser(s) match { + case None => sys.error(s"couldn't parse: $s") + case Some(("", r)) => r + case Some((rest, _)) => sys.error(s"still need to parse: $rest") + } + + def optimize(r: Real, n: Int): Real = { + val (dag, id) = Dag[Any, RealN](r, toLiteral) + val optDag = dag.applyMax(allRules0, n) + optDag.evaluate(id) + } + + def optimizeAll(r: Real): Real = { + val (dag, id) = Dag[Any, RealN](r, toLiteral) + var seen: Set[Real] = Set(dag.evaluate(id)) + + val maxSteps = 1000 + + def loop(d: Dag[RealN], max: Int): Dag[RealN] = + if (max <= 0) { + println(s"exhausted on $r at ${d.evaluate(id)}") + d + } else { + // System.out.print('.') + // System.out.flush() + + // prefer to use allRules0 until it no longer applies + val d1 = d.applySeqOnce(allRules) + val r1 = d1.evaluate(id) + if (d1 == d) d1 + else if (seen(r1)) { + // TODO: the rules currently can create loops, :( + // System.out.println(s"loop (step ${maxSteps - max}): $r from ${d.evaluate(id)} to ${r1}") + d1 + // loop(d1, max - 1) + } else { + seen += r1 + 
loop(d1, max - 1) + } + } + val optDag = loop(dag, maxSteps) + val d1 = optDag.evaluate(id) + d1 + } +} + +class RealNumberTest extends FunSuite { + import RealNumbers._ + + implicit val generatorDrivenConfig = + // PropertyCheckConfiguration(minSuccessful = 5000) + PropertyCheckConfiguration(minSuccessful = 500) + + def close(a: Double, b: Double, msg: => String) = { + val diff = Math.abs(a - b) + if (diff < 1e-6) succeed + else { + // this should really only happen for giant numbers + assert(Math.abs(a) > 1e9 || Math.abs(b) > 1e9, msg) + } + } + + def closeOpt(opt: Option[Double], nonOpt: Option[Double], msg: => String) = + (opt, nonOpt) match { + case (None, None) => succeed + case (Some(_), None) => + // optimization can make things succeed: 0.0 * a = 0.0, so we don't need to know a + () + case (None, Some(_)) => fail(s"unoptimized succeded: $msg") + case (Some(a), Some(b)) => close(a, b, s"$msg, $a, $b") + } + + test("can parse") { + assert(real("1") == Real.Const(1.0)) + assert(real("1.0") == Real.Const(1.0)) + assert(real("1.5") == Real.Const(1.5)) + assert(real("-1.5") == Real.Const(-1.5)) + assert(real("x") == Real.Variable("x")) + assert(real("(1 + 2)") == Real.Sum(SortedList(Real.const(1.0), Real.const(2.0)))) + assert(real("(1*2)") == Real.Prod(SortedList(Real.const(1.0), Real.const(2.0)))) + } + + test("combine const") { + assert(optimizeAll(real("(1 + 2 + 3)")) == real("6")) + } + + test("we can parse anything") { + forAll { r: Real => + assert(real(r.toString) == r, s"couldn't parse: $r") + } + } + + test("optimization reduces cost") { + def law(r: Real, strong: Boolean, n: Int) = { + val optR = optimize(r, n) + assert(r.cost >= optR.cost, s"$r => $optR") + } + forAll(Real.genReal(3), Gen.choose(0, 1000))(law(_, false, _)) + } + + test("optimizeAll reduces cost") { + def law(r: Real, strong: Boolean) = { + val optR = optimizeAll(r) + if (strong) assert(r.cost > optR.cost, s"$r => $optR") + else assert(r.cost >= optR.cost, s"$r => $optR") + } + 
forAll(Real.genReal(3))(law(_, false)) + + val strongCases = List("((x*1) + (x*2))", "((x*1) + (x*2) + (y*3))", "(1 + 2)") + + strongCases.foreach(s => law(real(s), true)) + } + + test("rules don't loop") { + def neverLoop(r: Real, rules: List[Rule[RealN]]): Unit = { + val (dag, id) = Dag[Any, RealN](r, toLiteral) + var seen: Set[Real] = Set(dag.evaluate(id)) + def loop(d: Dag[RealN]): Unit = { + // val d1 = d.applySeqOnce(allRules) + val d1 = d.applySeqOnce(rules) + val r1 = d1.evaluate(id) + if (seen(r1)) { + assert(d1 eq d, s"we have seen: $r1 before. Previous: ${d.evaluate(id)} working on $r") + () + } else { + seen += r1 + loop(d1) + } + } + + loop(dag) + } + + forAll { (r: Real, rules: Set[Rule[RealN]]) => + neverLoop(r, rules.toList) + } + } + + test("optimization does not change evaluation") { + def law(r: Real, vars: Map[String, Double], ruleSeq: Seq[Rule[RealN]]) = { + val optR = Dag.applyRuleSeq[Any, RealN](r, toLiteral, ruleSeq) + closeOpt(optR.evaluate(vars), r.evaluate(vars), s"$optR, $r") + } + + val ruleSeqGen: Gen[Seq[Rule[RealN]]] = + implicitly[Arbitrary[Set[Rule[RealN]]]].arbitrary.map(_.toSeq) + + val genMap = Gen.mapOf(Gen.zip(Gen.choose('a', 'z').map(_.toString), Gen.choose(-1000.0, 1000.0))) + + def genCompleteMap(r: Real): Gen[Map[String, Double]] = { + val vars = r.freeVars + val sz = vars.size + Gen.listOfN(sz, Gen.choose(-1000.0, 1000.0)).map { ds => + vars.zip(ds).toMap + } + } + + val completeEval = + Real + .genReal(3) + .flatMap(r => genCompleteMap(r).map((r, _))) + + forAll(Real.genReal(3), genMap, ruleSeqGen)(law(_, _, _)) + + forAll(completeEval, implicitly[Arbitrary[Set[Rule[RealN]]]].arbitrary) { case ((r, m), rules) => + val res0 = r.evaluate(m) + val rOpt = Dag.applyRuleSeq[Any, RealN](r, toLiteral, rules.toList) + val resOpt = rOpt.evaluate(m) + + // scalacheck's broken shrinking kills this, if you see + // fishy failures, comment it out, until you debug + assert(res0.isDefined, s"expected unoptimized to work") + 
assert(resOpt.isDefined, s"expected optimized to work") + closeOpt(resOpt, res0, s"$rOpt, $r") + } + + forAll(completeEval, Real.genReal(3)) { case ((r0, vars), r1) => + r0.divOpt(r1) match { + case None => () + case Some(r2) => + // r0/r1 == r2, so r0 == r1 * r2 + closeOpt(Real.prod(r1, r2).evaluate(vars), r0.evaluate(vars), s"numerator: $r2") + } + } + + // past failures here + List( + ("((-986.0*x) + (-325.0*363.0*z) + (530.0*928.0*x))", Map("x" -> 1.0, "z" -> 10.0), allRules0 :: Nil), + ( + "(q + q + r + x + (-755.0*-394.0*674.0))", + Map("x" -> 0.0, "q" -> 0.0, "r" -> 3.3953184045350335e-5), + ReduceProd :: Nil + ), + ("(x + (y + ((x*2) + (y*2))))", Map("x" -> 1.0, "y" -> 10.0), allRules) + ).foreach { case (inputS, vars, ruleSet) => law(real(inputS), vars, ruleSet) } + + } + + test("evaluation works when all vars are present") { + val genMap = Gen.mapOf(Gen.zip(Gen.choose('a', 'z').map(_.toString), Gen.choose(-1000.0, 1000.0))) + forAll(Real.genReal(5), genMap) { (r, vars) => + r.evaluate(vars) match { + case None => assert((r.freeVars -- vars.keys).nonEmpty) + case Some(_) => assert((r.freeVars -- vars.keys).isEmpty) + } + } + } + + test("cost is expected") { + List( + ("(x + (1 + 2))", 2), + ("(x + (1*2))", 2), + ("(2.0*(2.0*2.0))", 2), + ("x", 0), + ("((a + b)*(c + d))", 3), + ("((a*c) + (b*c) + (a*d) + (b*d))", 7), + ("(4*2)", 1) + ).foreach { case (inputS, expCost) => + val input = real(inputS) + assert(input.cost == expCost, s"$inputS -> $input has cost: ${input.cost} not $expCost") + } + } + + test("we fold in constants") { + List( + ("(x + x)", "(2*x)"), + ("(x + (1 + 2))", "(x + 3)"), + ("(x + (1*2))", "(x + 2)"), + ("(2.0*(2.0*2.0))", "8.0"), + ("(1*2)", "2") + ).foreach { case (input, exp) => + val opt = optimizeAll(real(input)) + assert(opt == real(exp), s"$input => $opt not $exp") + } + } + + test("expand works as expected") { + val genMap = Gen.mapOf(Gen.zip(Gen.choose('a', 'z').map(_.toString), Gen.choose(-1000.0, 1000.0))) + + // expand can 
exponentially increase the size of the real so we can't make it too big + forAll(Real.genReal(2), genMap) { (r, vars) => + val rexp = r.expand + closeOpt(rexp.evaluate(vars), r.evaluate(vars), s"$rexp, $r") + } + + assert(real("((a + b)*(c + d))").expand == real("((a*c) + (a*d) + (b*c) + (b*d))")) + } + + test("Real.divOpt and divisors agree") { + forAll { r: Real => + Real.divisors(r).foreach { div => + r.divOpt(div) match { + case None => fail(s"expected to divide: $r with $div") + case Some(_) => succeed + } + + r.divOpt(r) match { + case None => fail(s"we expect a self division: $r") + case Some(_) => succeed + } + } + } + } + + test("optimization de-foils easy cases") { + val var1 = Gen.choose('a', 'm').map(c => Real.Variable(c.toString)) + val var2 = Gen.choose('n', 'z').map(c => Real.Variable(c.toString)) + def sumOf(g: Gen[Real]) = + Gen + .choose(2, 4) + .flatMap(Gen.listOfN(_, g)) + .map(ls => Real.sum(SortedList(ls: _*))) + + val s1 = sumOf(var1) + val s2 = sumOf(var2) + val prod = Gen.zip(s1, s2).map { case (a, b) => Real.prod(a, b) } + + // these products of sums + forAll(prod) { r: Real => + val cost0 = r.cost + // expanded has exponentially more cost than r + val expanded = r.expand + val expCost = expanded.cost + val optR = optimizeAll(expanded) + val (dag, id) = Dag[Any, RealN](optR, toLiteral) + + // we cannot apply the ReduceProd rule: + assert(dag.applyOnce(ReduceProd) == dag) + + assert( + optR.cost <= cost0, + s"$r ($cost0) optimized to $optR expanded to: $expanded expanded cost: $expCost" + ) + } + } + + test("optimization nearly de-foils") { + val sum = Gen + .choose(2, 4) + .flatMap(Gen.listOfN(_, Real.genReal(0))) + .map(ls => Real.sum(SortedList(ls: _*))) + val prod = Gen + .choose(2, 3) + .flatMap(Gen.listOfN(_, sum)) + .map(ls => Real.prod(SortedList(ls: _*))) + // these products of sums + forAll(prod) { r: Real => + val cost0 = r.cost + // expanded has exponentially more cost than r + val expanded = r.expand + val expCost = 
expanded.cost + // we should get close to minimal, which is cost0: + val closeness = 0.5 // 1.0 means all the way to minimal + val prettyGood = cost0 * closeness + expCost.toDouble * (1.0 - closeness) + val optR = optimizeAll(expanded) + assert( + optR.cost.toDouble <= prettyGood, + s"$r ($cost0) optimized to $optR expanded to: $expanded expanded cost: $expCost" + ) + } + } + + test("orderMap works") { + List( + ("x", Map("x" -> 1)), + ("(x*x)", Map("x" -> 2)), + ("((x + 1)*(x + 2))", Map("x" -> 2)), + ("((x + x)*(x + 2))", Map("x" -> 2)) + ) + .foreach { case (r, o) => + assert(real(r).orderMap == o, s"$r") + } + } + + test("test L2 norm example") { + import Real._ + val terms = 100 + val points = (1 to terms).map(i => const(i.toDouble)) + // sum_i (d_i - x)*(d_i - x) + val x = Variable("x") + val l2 = Real.sum(SortedList.fromList(points.map(d => (d - x) * (d - x)).toList)) + + object ExpandWhenOrderMatches extends Rule[RealN] { + def apply[T](on: Dag[RealN]) = { + case s @ Sum(items) => + val newItems = items + .groupBy(_.orderMap) + .iterator + .map { case (_, vs) => + // we can combine sums of the same order: + if (vs.size > 1) { + sum(SortedList.fromList(vs.iterator.map(_.expand).toList)) + } else vs.head + } + .toList + + val newSum = sum(SortedList.fromList(newItems)) + if (items == newSum) None + else Some(newSum) + case _ => None + } + } + + val optL2 = Dag.applyRuleSeq[Any, RealN](l2, toLiteral, ExpandWhenOrderMatches :: allRules) + + val optC = optL2.cost + assert(optC == 4) // we can convert polynomial order 2 to a*(b + x*(c + x)) + } +} diff --git a/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/SimpleDag.scala b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/SimpleDag.scala new file mode 100644 index 0000000000..0b6ed09f0a --- /dev/null +++ b/scalding-dagon/src/test/scala/com/twitter/scalding/dagon/SimpleDag.scala @@ -0,0 +1,66 @@ +/* + Copyright 2013 Twitter, Inc. + Copyright 2017 Stripe, Inc. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package com.twitter.scalding.dagon + +import Graphs._ + +/** + * Given Dag and a List of immutable nodes, and a function to get dependencies, compute the dependants + * (reverse the graph) + */ +abstract class SimpleDag[T] { + def nodes: List[T] + def dependenciesOf(t: T): Iterable[T] + + lazy val allTails: List[T] = nodes.filter(fanOut(_).get == 0) + private lazy val nodeSet: Set[T] = nodes.toSet + + /** + * This is the dependants graph. Each node knows who it depends on but not who depends on it without doing + * this computation + */ + private lazy val graph: NeighborFn[T] = reversed(nodes)(dependenciesOf(_)) + + private lazy val depths: Map[T, Int] = dagDepth(nodes)(dependenciesOf(_)) + + /** + * The max of zero and 1 + depth of all parents if the node is the graph + */ + def isNode(p: T): Boolean = nodeSet.contains(p) + def depth(p: T): Option[Int] = depths.get(p) + + def dependantsOf(p: T): Option[List[T]] = + if (isNode(p)) Some(graph(p).toList) else None + + def fanOut(p: T): Option[Int] = dependantsOf(p).map(_.size) + + def isTail(t: T): Boolean = allTails.contains(t) + + /** + * Return all dependendants of a given node. 
Does not include itself + */ + def transitiveDependantsOf(p: T): List[T] = depthFirstOf(p)(graph) +} + +object SimpleDag { + def apply[T](nodes0: List[T])(nfn: T => Iterable[T]): SimpleDag[T] = + new SimpleDag[T] { + def nodes = nodes0 + def dependenciesOf(t: T) = nfn(t) + } +} diff --git a/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala b/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala index f7d84b97b7..599ba3f8e8 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/AbsoluteDuration.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.Calendar @@ -25,23 +25,20 @@ import scala.annotation.tailrec */ object AbsoluteDuration extends java.io.Serializable { - def max(a : AbsoluteDuration, b : AbsoluteDuration) = if(a > b) a else b + def max(a: AbsoluteDuration, b: AbsoluteDuration) = if (a > b) a else b type TimeCons = ((Int) => AbsoluteDuration, Int) val SEC_IN_MS = 1000 val MIN_IN_MS = 60 * SEC_IN_MS val HOUR_IN_MS = 60 * MIN_IN_MS - val UTC_UNITS = List[TimeCons]((Hours,HOUR_IN_MS), - (Minutes,MIN_IN_MS), - (Seconds,SEC_IN_MS), - (Millisecs,1)).reverse + val UTC_UNITS = + List[TimeCons]((Hours, HOUR_IN_MS), (Minutes, MIN_IN_MS), (Seconds, SEC_IN_MS), (Millisecs, 1)).reverse def exact(fnms: TimeCons): (Long) => Option[AbsoluteDuration] = { ms: Long => - if( ms % fnms._2 == 0 ) { - Some(fnms._1( (ms / fnms._2).toInt )) - } - else { + if (ms % fnms._2 == 0) { + Some(fnms._1((ms / fnms._2).toInt)) + } else { None } } @@ -52,29 +49,29 @@ object AbsoluteDuration extends java.io.Serializable { def fromMillisecs(diffInMs: Long): AbsoluteDuration = fromMillisecs(diffInMs, 
UTC_UNITS, Nil) @tailrec - private def fromMillisecs(diffInMs: Long, units: List[TimeCons], acc: List[AbsoluteDuration]): - AbsoluteDuration = { - - if(diffInMs == 0L) { - //We are done: + private def fromMillisecs( + diffInMs: Long, + units: List[TimeCons], + acc: List[AbsoluteDuration] + ): AbsoluteDuration = + if (diffInMs == 0L) { + // We are done: acc match { - case Nil => units.head._1(0) + case Nil => units.head._1(0) case (h :: Nil) => h - case _ => AbsoluteDurationList(acc) + case _ => AbsoluteDurationList(acc) } - } - else { + } else { units match { case (tc0 :: tc1 :: tail) => { - //Only get as many as the next guy can't get: + // Only get as many as the next guy can't get: val nextSize = tc1._2 val thisDiff = diffInMs % nextSize // Keep only this amount of millis for this unit val theseUnits = thisDiff / tc0._2 val (newDiff, newAcc) = if (theseUnits != 0L) { val dur = tc0._1(theseUnits.toInt) - (diffInMs - dur.toMillisecs, dur::acc) - } - else { + (diffInMs - dur.toMillisecs, dur :: acc) + } else { (diffInMs, acc) } fromMillisecs(newDiff, (tc1 :: tail), newAcc) @@ -83,9 +80,11 @@ object AbsoluteDuration extends java.io.Serializable { // We can't go any further, try to jam the rest into this unit: val (fn, cnt) = tc val theseUnits = diffInMs / cnt - require((theseUnits <= Int.MaxValue) && (theseUnits >= Int.MinValue), + require( + (theseUnits <= Int.MaxValue) && (theseUnits >= Int.MinValue), "diff not representable in an Int: " + theseUnits + AbsoluteDurationList(acc) + - "total: " + (diffInMs + AbsoluteDurationList(acc).toMillisecs)) + "total: " + (diffInMs + AbsoluteDurationList(acc).toMillisecs) + ) val thisPart = fn(theseUnits.toInt) if (acc.isEmpty) thisPart @@ -98,74 +97,72 @@ object AbsoluteDuration extends java.io.Serializable { } } } - } } sealed trait AbsoluteDuration extends Duration with Ordered[AbsoluteDuration] { // Here are the abstracts: - def toMillisecs : Long + def toMillisecs: Long // These are all in terms of toMillisecs - def 
toSeconds : Double = toMillisecs/1000.0 - override def addTo(that : RichDate) = RichDate(that.timestamp + toMillisecs) - override def subtractFrom(that : RichDate) = RichDate(that.timestamp - toMillisecs) + def toSeconds: Double = toMillisecs / 1000.0 + override def addTo(that: RichDate) = RichDate(that.timestamp + toMillisecs) + override def subtractFrom(that: RichDate) = RichDate(that.timestamp - toMillisecs) - def compare(that : AbsoluteDuration) : Int = + def compare(that: AbsoluteDuration): Int = this.toMillisecs.compareTo(that.toMillisecs) - def +(that : AbsoluteDuration): AbsoluteDuration = + def +(that: AbsoluteDuration): AbsoluteDuration = AbsoluteDuration.fromMillisecs(this.toMillisecs + that.toMillisecs) - def -(that : AbsoluteDuration): AbsoluteDuration = + def -(that: AbsoluteDuration): AbsoluteDuration = AbsoluteDuration.fromMillisecs(this.toMillisecs - that.toMillisecs) def *(that: Long): AbsoluteDuration = AbsoluteDuration.fromMillisecs(this.toMillisecs * that) - /** Returns the number of times that divides this and the remainder - * The law is: that * result_.1 + result._2 == this + /** + * Returns the number of times that divides this and the remainder The law is: that * result_.1 + result._2 + * \== this */ def /(that: AbsoluteDuration): (Long, AbsoluteDuration) = { - val divs = (this.toMillisecs / that.toMillisecs) + val divs = this.toMillisecs / that.toMillisecs val rem = this - (that * divs) (divs, rem) } - override def equals(eq: Any): Boolean = { + override def equals(eq: Any): Boolean = eq match { case eqo: AbsoluteDuration => (eqo.toMillisecs) == this.toMillisecs - case _ => false + case _ => false } - } override def hashCode: Int = toMillisecs.hashCode } -case class Millisecs(cnt : Int) extends Duration(Calendar.MILLISECOND, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Millisecs(cnt: Int) + extends Duration(Calendar.MILLISECOND, cnt, DateOps.UTC) + with AbsoluteDuration { override def toSeconds = cnt / 1000.0 override def 
toMillisecs = cnt.toLong } -case class Seconds(cnt : Int) extends Duration(Calendar.SECOND, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Seconds(cnt: Int) extends Duration(Calendar.SECOND, cnt, DateOps.UTC) with AbsoluteDuration { override def toSeconds = cnt.toDouble override def toMillisecs = (cnt.toLong) * 1000L } -case class Minutes(cnt : Int) extends Duration(Calendar.MINUTE, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Minutes(cnt: Int) extends Duration(Calendar.MINUTE, cnt, DateOps.UTC) with AbsoluteDuration { override def toSeconds = cnt * 60.0 override def toMillisecs = cnt.toLong * 60L * 1000L } -case class Hours(cnt : Int) extends Duration(Calendar.HOUR, cnt, DateOps.UTC) - with AbsoluteDuration { +final case class Hours(cnt: Int) extends Duration(Calendar.HOUR, cnt, DateOps.UTC) with AbsoluteDuration { override def toSeconds = cnt * 60.0 * 60.0 override def toMillisecs = cnt.toLong * 60L * 60L * 1000L } -case class AbsoluteDurationList(parts : List[AbsoluteDuration]) - extends AbstractDurationList[AbsoluteDuration](parts) with AbsoluteDuration { - override def toSeconds = parts.map{ _.toSeconds }.sum - override def toMillisecs : Long = parts.map{ _.toMillisecs }.sum +final case class AbsoluteDurationList(parts: List[AbsoluteDuration]) + extends AbstractDurationList[AbsoluteDuration](parts) + with AbsoluteDuration { + override def toSeconds = parts.map(_.toSeconds).sum + override def toMillisecs: Long = parts.map(_.toMillisecs).sum } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala b/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala index 2a403b2934..ff3ada1f0f 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/CalendarOps.scala @@ -1,24 +1,20 @@ package com.twitter.scalding -import java.util.{Date, Calendar} +import java.util.{Calendar, Date} import scala.annotation.tailrec /** - * */ 
object CalendarOps { def truncate(date: Calendar, field: Int): Calendar = { @tailrec - def truncateIter(cal: Calendar, field: Int, currentField: Int): Calendar = { + def truncateIter(cal: Calendar, field: Int, currentField: Int): Calendar = if (currentField > field) { currentField match { case Calendar.DAY_OF_MONTH => cal.set(currentField, 1) - case Calendar.DAY_OF_WEEK_IN_MONTH => Unit // Skip - case Calendar.DAY_OF_WEEK => Unit // Skip - case Calendar.DAY_OF_YEAR => Unit // Skip - case Calendar.WEEK_OF_MONTH => Unit // Skip - case Calendar.WEEK_OF_YEAR => Unit // Skip - case Calendar.HOUR_OF_DAY => Unit // Skip + case Calendar.DAY_OF_WEEK_IN_MONTH | Calendar.DAY_OF_WEEK | Calendar.DAY_OF_YEAR | + Calendar.WEEK_OF_MONTH | Calendar.WEEK_OF_YEAR | Calendar.HOUR_OF_DAY => + () // Skip case _ => cal.set(currentField, 0) } @@ -26,7 +22,6 @@ object CalendarOps { } else { cal } - } val cloned = date.clone().asInstanceOf[Calendar] @@ -34,10 +29,10 @@ object CalendarOps { } def truncate(date: Date, field: Int): Date = { - val cal = Calendar.getInstance(); - cal.setTime(date); + val cal = Calendar.getInstance() + cal.setTime(date) - truncate(cal, field).getTime(); + truncate(cal, field).getTime() } } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala b/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala index b71110e044..57118c5691 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/DateOps.scala @@ -12,52 +12,103 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding - -import java.util.Calendar -import java.util.Date import java.util.TimeZone import java.text.SimpleDateFormat import scala.util.matching.Regex /** -* Holds some coversion functions for dealing with strings as RichDate objects -*/ + * Holds some coversion functions for dealing with strings as RichDate objects + */ object DateOps extends java.io.Serializable { val PACIFIC = TimeZone.getTimeZone("America/Los_Angeles") val UTC = TimeZone.getTimeZone("UTC") + val DATE_WITHOUT_DASH = "yyyyMMdd" val DATE_WITH_DASH = "yyyy-MM-dd" + val DATEHOUR_WITHOUT_DASH = "yyyyMMddHH" val DATEHOUR_WITH_DASH = "yyyy-MM-dd HH" + val DATETIME_WITHOUT_DASH = "yyyyMMddHHmm" val DATETIME_WITH_DASH = "yyyy-MM-dd HH:mm" + val DATETIME_HMS_WITHOUT_DASH = "yyyyMMddHHmmss" val DATETIME_HMS_WITH_DASH = "yyyy-MM-dd HH:mm:ss" val DATETIME_HMSM_WITH_DASH = "yyyy-MM-dd HH:mm:ss.SSS" - private val DATE_RE = """\d{4}-\d{2}-\d{2}""" - private val SEP_RE = """(T?|\s*)""" - private val DATE_FORMAT_VALIDATORS = List(DATE_WITH_DASH -> new Regex("""^\s*""" + DATE_RE + """\s*$"""), - DATEHOUR_WITH_DASH -> new Regex("""^\s*""" + DATE_RE + - SEP_RE + """\d\d\s*$"""), - DATETIME_WITH_DASH -> new Regex("""^\s*""" + DATE_RE + - SEP_RE + """\d\d:\d\d\s*$"""), - DATETIME_HMS_WITH_DASH -> new Regex("""^\s*""" + DATE_RE + - SEP_RE + """\d\d:\d\d:\d\d\s*$"""), - DATETIME_HMSM_WITH_DASH -> new Regex("""^\s*""" + DATE_RE + - SEP_RE + """\d\d:\d\d:\d\d\.\d{1,3}\s*$""")) + private[scalding] sealed abstract class Format(val pattern: String, val validator: Regex) { + def matches(s: String): Boolean = validator.findFirstIn(s).isDefined + } + + private[scalding] object Format { + private val date = """\d{4}-\d{2}-\d{2}""" + private val sep = """(T?|\s*)""" + private val emptyBegin = """^\s*""" + private val emptyEnd = """\s*$""" + + case object DATE_WITHOUT_DASH + extends Format(DateOps.DATE_WITHOUT_DASH, new Regex(emptyBegin + """\d{8}""" + emptyEnd)) + case object DATE_WITH_DASH extends 
Format(DateOps.DATE_WITH_DASH, new Regex(emptyBegin + date + emptyEnd)) + case object DATEHOUR_WITHOUT_DASH + extends Format(DateOps.DATEHOUR_WITHOUT_DASH, new Regex(emptyBegin + """\d{10}""" + emptyEnd)) + case object DATEHOUR_WITH_DASH + extends Format(DateOps.DATEHOUR_WITH_DASH, new Regex(emptyBegin + date + sep + """\d\d""" + emptyEnd)) + case object DATETIME_WITHOUT_DASH + extends Format(DateOps.DATETIME_WITHOUT_DASH, new Regex(emptyBegin + """\d{12}""" + emptyEnd)) + case object DATETIME_WITH_DASH + extends Format( + DateOps.DATETIME_WITH_DASH, + new Regex(emptyBegin + date + sep + """\d\d:\d\d""" + emptyEnd) + ) + case object DATETIME_HMS_WITHOUT_DASH + extends Format(DateOps.DATETIME_HMS_WITHOUT_DASH, new Regex(emptyBegin + """\d{14}""" + emptyEnd)) + case object DATETIME_HMS_WITH_DASH + extends Format( + DateOps.DATETIME_HMS_WITH_DASH, + new Regex(emptyBegin + date + sep + """\d\d:\d\d:\d\d""" + emptyEnd) + ) + case object DATETIME_HMSM_WITH_DASH + extends Format( + DateOps.DATETIME_HMSM_WITH_DASH, + new Regex(emptyBegin + date + sep + """\d\d:\d\d:\d\d\.\d{1,3}""" + emptyEnd) + ) + } + private val prepare: String => String = { (str: String) => - str.replace("T"," ") //We allow T to separate dates and times, just remove it and then validate - .replaceAll("[/_]", "-") // Allow for slashes and underscores + str + .replace("T", " ") // We allow T to separate dates and times, just remove it and then validate + .replaceAll("[/_]", "-") // Allow for slashes and underscores } + /** - * Return the guessed format for this datestring - */ - def getFormat(s : String) : Option[String] = { - DATE_FORMAT_VALIDATORS.find{_._2.findFirstIn(prepare(s)).isDefined}.map(_._1) + * Return the guessed format for this datestring + */ + private[scalding] def getFormatObject(s: String): Option[Format] = { + val formats: List[Format] = List[Format]( + Format.DATE_WITH_DASH, + Format.DATEHOUR_WITH_DASH, + Format.DATETIME_WITH_DASH, + Format.DATETIME_HMS_WITH_DASH, + 
Format.DATETIME_HMSM_WITH_DASH, + Format.DATE_WITHOUT_DASH, + Format.DATEHOUR_WITHOUT_DASH, + Format.DATETIME_WITHOUT_DASH, + Format.DATETIME_HMS_WITHOUT_DASH + ) + + formats.find(_.matches(prepare(s))) } + /** + * Return the guessed format for this datestring + */ + def getFormat(s: String): Option[String] = getFormatObject(s).map(_.pattern) + + /** + * The DateParser returned here is based on SimpleDateFormat, which is not thread-safe. Do not share the + * result across threads. + */ def getDateParser(s: String): Option[DateParser] = - getFormat(s).map { fmt => DateParser.from(new SimpleDateFormat(fmt)).contramap(prepare) } + getFormat(s).map(fmt => DateParser.from(new SimpleDateFormat(fmt)).contramap(prepare)) } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala b/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala index 77540745a0..48b4999841 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/DateParser.scala @@ -1,4 +1,3 @@ - /* Copyright 2012 Twitter, Inc. @@ -13,11 +12,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import scala.util.{Try, Failure} +import scala.util.{Failure, Try} import java.util.TimeZone import java.text.DateFormat @@ -31,30 +30,37 @@ trait DateParser extends java.io.Serializable { self => def rescueWith(second: DateParser): DateParser = new DateParser { - def parse(s: String)(implicit tz: TimeZone) = { - self.parse(s) orElse second.parse(s) - } + def parse(s: String)(implicit tz: TimeZone) = + self.parse(s).orElse(second.parse(s)) } } object DateParser { - /** This is scalding's default date parser. You can choose this - * by setting an implicit val DateParser. 
+ + /** + * This is scalding's default date parser. You can choose this by setting an implicit val DateParser. Note + * that DateParsers using SimpleDateFormat from Java are not thread-safe, thus the def here. You can cache + * the result if you are sure */ - val default: DateParser = new DateParser { + def default: DateParser = new DateParser { def parse(s: String)(implicit tz: TimeZone) = - DateOps.getDateParser(s) - .map { p => p.parse(s) } + DateOps + .getDateParser(s) + .map(p => p.parse(s)) .getOrElse(Failure(new IllegalArgumentException("Could not find parser for: " + s))) } /** Try these Parsers in order */ def apply(items: Iterable[DateParser]): DateParser = - items.reduce { _.rescueWith(_) } + items.reduce(_.rescueWith(_)) /** Using the type-class pattern */ def parse(s: String)(implicit tz: TimeZone, p: DateParser): Try[RichDate] = p.parse(s)(tz) + /** + * Note that DateFormats in Java are generally not thread-safe, so you should not share the result here + * across threads + */ implicit def from(df: DateFormat): DateParser = new DateParser { def parse(s: String)(implicit tz: TimeZone) = Try { df.setTimeZone(tz) @@ -62,6 +68,9 @@ object DateParser { } } + /** + * This ignores the time-zone assuming it must be in the String + */ def from(fn: String => RichDate) = new DateParser { def parse(s: String)(implicit tz: TimeZone) = Try(fn(s)) } @@ -71,22 +80,12 @@ object DateParser { } /** - //Scalding used to support Natty, this is removed. To add it back, use something like this in your code, - //possibly with: - //implicit val myParser = DateParser(Seq(DateParser.default, NattyParser)) - -object NattyParser extends DateParser { - def parse(s: String)(implicit tz: TimeZone) = Try { - val timeParser = new natty.Parser(tz) - val dateGroups = timeParser.parse(s) - if (dateGroups.size == 0) { - throw new IllegalArgumentException("Could not convert string: '" + str + "' into a date.") - } - // a DateGroup can have more than one Date (e.g. if you do "Sept. 
11th or 12th"), - // but we're just going to take the first - val dates = dateGroups.get(0).getDates() - RichDate(dates.get(0)) - } -} - -*/ + * //Scalding used to support Natty, this is removed. To add it back, use something like this in your code, + * //possibly with: //implicit val myParser = DateParser(Seq(DateParser.default, NattyParser)) + * + * object NattyParser extends DateParser { def parse(s: String)(implicit tz: TimeZone) = Try { val timeParser + * \= new natty.Parser(tz) val dateGroups = timeParser.parse(s) if (dateGroups.size == 0) { throw new + * IllegalArgumentException("Could not convert string: '" + str + "' into a date.") } // a DateGroup can have + * more than one Date (e.g. if you do "Sept. 11th or 12th"), // but we're just going to take the first val + * dates = dateGroups.get(0).getDates() RichDate(dates.get(0)) } } + */ diff --git a/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala b/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala index 234440b76b..a61970212d 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/DateRange.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import scala.annotation.tailrec @@ -20,99 +20,112 @@ import scala.annotation.tailrec import java.util.TimeZone object DateRange extends java.io.Serializable { - /** Parse this string into a range. - * 2009-10-01 is interpetted as the whole day - * 2009-10-01T12 is interpetted as the whole hour - * 2009-10-01T12:00 is interpetted as a single minute - * 2009-10-01T12:00:02 is interpretted as a single second + + /** + * Parse this string into a range. 
2009-10-01 is interpetted as the whole day 2009-10-01T12 is interpetted + * as the whole hour 2009-10-01T12:00 is interpetted as a single minute 2009-10-01T12:00:02 is interpretted + * as a single second * - * This is called parse to avoid a collision with implicit conversions - * from String to RichDate + * This is called parse to avoid a collision with implicit conversions from String to RichDate */ def parse(truncatediso8601: String)(implicit tz: TimeZone, dp: DateParser): DateRange = DateRange(RichDate(truncatediso8601), RichDate.upperBound(truncatediso8601)) /** - * We take the upper bound of the second parameter, so we take the latest time that - * could be construed as matching the string passed, e.g. - * ("2011-01-02T04", "2011-01-02T05") includes two full hours (all of 4 and all of 5) + * We take the upper bound of the second parameter, so we take the latest time that could be construed as + * matching the string passed, e.g. ("2011-01-02T04", "2011-01-02T05") includes two full hours (all of 4 and + * all of 5) */ - def parse(iso8601start: String, - iso8601inclusiveUpper: String)(implicit tz: TimeZone, dp: DateParser): DateRange = { + def parse(iso8601start: String, iso8601inclusiveUpper: String)(implicit + tz: TimeZone, + dp: DateParser + ): DateRange = { val start = RichDate(iso8601start) val end = RichDate.upperBound(iso8601inclusiveUpper) - //Make sure the end is not before the beginning: + // Make sure the end is not before the beginning: assert(start <= end, "end of date range must occur after the start") DateRange(start, end) } - /** Pass one or two args (from a scalding.Args .list) to parse into a DateRange + /** + * Pass one or two args (from a scalding.Args .list) to parse into a DateRange */ def parse(fromArgs: Seq[String])(implicit tz: TimeZone, dp: DateParser): DateRange = fromArgs match { case Seq(s, e) => parse(s, e) - case Seq(o) => parse(o) - case x => sys.error("--date must have exactly one or two date[time]s. 
Got: " + x.toString) + case Seq(o) => parse(o) + case x => sys.error("--date must have exactly one or two date[time]s. Got: " + x.toString) } + + /** + * DateRanges are inclusive. Use this to create a DateRange that excludes the last millisecond from the + * second argument. + */ + def exclusiveUpper(include: RichDate, exclude: RichDate): DateRange = + DateRange(include, exclude - Millisecs(1)) } /** -* represents a closed interval of time. -* -* TODO: This should be Range[RichDate, Duration] for an appropriate notion -* of Range -*/ -case class DateRange(val start : RichDate, val end : RichDate) { - import DateOps._ + * represents a closed interval of time. + * + * TODO: This should be Range[RichDate, Duration] for an appropriate notion of Range + */ +case class DateRange(val start: RichDate, val end: RichDate) { + require(start <= end, s"""The start "$start" must be before or on the end "$end".""") + + /** + * shift this by the given unit + */ + def +(timespan: Duration) = DateRange(start + timespan, end + timespan) + def -(timespan: Duration) = DateRange(start - timespan, end - timespan) + + def isBefore(d: RichDate) = end < d + def isAfter(d: RichDate) = d < start + /** - * shift this by the given unit - */ - def +(timespan : Duration) = DateRange(start + timespan, end + timespan) - def -(timespan : Duration) = DateRange(start - timespan, end - timespan) + * make the range wider by delta on each side. Good to catch events which might spill over. + */ + def embiggen(delta: Duration) = DateRange(start - delta, end + delta) - def isBefore(d : RichDate) = end < d - def isAfter(d : RichDate) = d < start /** - * make the range wider by delta on each side. Good to catch events which - * might spill over. - */ - def embiggen(delta : Duration) = DateRange(start - delta, end + delta) + * Extend the length by moving the end. We can keep the party going, but we can't start it earlier. 
+ */ + def extend(delta: Duration) = DateRange(start, end + delta) + /** - * Extend the length by moving the end. We can keep the party going, but we - * can't start it earlier. - */ - def extend(delta : Duration) = DateRange(start, end + delta) + * Extend the length by moving the start. Turns out, we can start the party early. + */ + def prepend(delta: Duration) = DateRange(start - delta, end) + + def contains(point: RichDate) = (start <= point) && (point <= end) - def contains(point : RichDate) = (start <= point) && (point <= end) /** - * Is the given Date range a (non-strict) subset of the given range - */ - def contains(dr : DateRange) = start <= dr.start && dr.end <= end + * Is the given Date range a (non-strict) subset of the given range + */ + def contains(dr: DateRange) = start <= dr.start && dr.end <= end /** - * produce a contiguous non-overlapping set of DateRanges - * whose union is equivalent to this. - * If it is passed an integral unit of time (not a DurationList), it stops at boundaries - * which are set by the start timezone, else break at start + k * span. + * produce a contiguous non-overlapping set of DateRanges whose union is equivalent to this. If it is passed + * an integral unit of time (not a DurationList), it stops at boundaries which are set by the start + * timezone, else break at start + k * span. */ - def each(span : Duration) : Iterable[DateRange] = { - //tail recursive method which produces output (as a stack, so it is - //reversed). acc is the accumulated list so far: - @tailrec def eachRec(acc : List[DateRange], nextDr : DateRange) : List[DateRange] = { + def each(span: Duration): Iterable[DateRange] = { + // tail recursive method which produces output (as a stack, so it is + // reversed). 
acc is the accumulated list so far: + @tailrec def eachRec(acc: List[DateRange], nextDr: DateRange): List[DateRange] = { val next_start = span.floorOf(nextDr.start) + span - //the smallest grain of time we count is 1 millisecond + // the smallest grain of time we count is 1 millisecond val this_end = next_start - Millisecs(1) - if( nextDr.end <= this_end ) { - //This is the last block, output and end: + if (nextDr.end <= this_end) { + // This is the last block, output and end: nextDr :: acc - } - else { - //Put today's portion, and then start on tomorrow: + } else { + // Put today's portion, and then start on tomorrow: val today = DateRange(nextDr.start, this_end) eachRec(today :: acc, DateRange(next_start, nextDr.end)) } } - //have to reverse because eachDayRec produces backwards + // have to reverse because eachDayRec produces backwards eachRec(Nil, this).reverse } diff --git a/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala b/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala index c4729468c1..663041acb0 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/Duration.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-*/ + */ package com.twitter.scalding import java.util.Calendar @@ -21,74 +21,72 @@ import java.util.TimeZone import scala.annotation.tailrec /** -* Represents millisecond based duration (non-calendar based): seconds, minutes, hours -* calField should be a java.util.Calendar field -*/ + * Represents millisecond based duration (non-calendar based): seconds, minutes, hours calField should be a + * java.util.Calendar field + */ object Duration extends java.io.Serializable { // TODO: remove this in 0.9.0 - val SEC_IN_MS = 1000 - val MIN_IN_MS = 60 * SEC_IN_MS - val HOUR_IN_MS = 60 * MIN_IN_MS - val UTC_UNITS = List((Hours,HOUR_IN_MS),(Minutes,MIN_IN_MS),(Seconds,SEC_IN_MS),(Millisecs,1)) + val SEC_IN_MS: Int = 1000 + val MIN_IN_MS: Int = 60 * SEC_IN_MS + val HOUR_IN_MS: Int = 60 * MIN_IN_MS + val UTC_UNITS: List[(Int => AbsoluteDuration, Int)] = + List[(Int => AbsoluteDuration, Int)]( + (Hours, HOUR_IN_MS), + (Minutes, MIN_IN_MS), + (Seconds, SEC_IN_MS), + (Millisecs, 1) + ) } -abstract class Duration(val calField : Int, val count : Int, val tz : TimeZone) - extends java.io.Serializable { - protected def calAdd(that : RichDate, steps : Int) = { +abstract class Duration(val calField: Int, val count: Int, val tz: TimeZone) extends java.io.Serializable { + protected def calAdd(that: RichDate, steps: Int) = { val cal = that.toCalendar(tz) cal.setLenient(true) cal.add(calField, steps) RichDate(cal) } - def addTo(that : RichDate) = calAdd(that, count) + def addTo(that: RichDate) = calAdd(that, count) - def subtractFrom(that : RichDate) = calAdd(that, -count) + def subtractFrom(that: RichDate) = calAdd(that, -count) // Return the latest RichDate at boundary of this time unit, ignoring // the count of the units. Like a truncation. // Only makes sense for non-mixed durations. 
- def floorOf(that : RichDate) : RichDate = { + def floorOf(that: RichDate): RichDate = { val cal = that.toCalendar(tz) RichDate(CalendarOps.truncate(cal, calField)) } } -case class Days(cnt : Int)(implicit tz : TimeZone) - extends Duration(Calendar.DAY_OF_MONTH, cnt, tz) +case class Days(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.DAY_OF_MONTH, cnt, tz) -case class Weeks(cnt : Int)(implicit tz : TimeZone) - extends Duration(Calendar.WEEK_OF_YEAR, cnt, tz) { +case class Weeks(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.WEEK_OF_YEAR, cnt, tz) { // The library we are using can't handle week truncation... - override def floorOf(that : RichDate) = { + override def floorOf(that: RichDate) = { val step = Days(1) - @tailrec def recentMonday(rd : RichDate) : RichDate = { + @tailrec def recentMonday(rd: RichDate): RichDate = rd.toCalendar(tz).get(Calendar.DAY_OF_WEEK) match { case Calendar.MONDAY => rd - case _ => recentMonday(step.subtractFrom(rd)) + case _ => recentMonday(step.subtractFrom(rd)) } - } - //Set it to the earliest point in the day: + // Set it to the earliest point in the day: step.floorOf(recentMonday(that)) } } -case class Months(cnt : Int)(implicit tz : TimeZone) - extends Duration(Calendar.MONTH, cnt, tz) +case class Months(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.MONTH, cnt, tz) -case class Years(cnt : Int)(implicit tz : TimeZone) - extends Duration(Calendar.YEAR, cnt, tz) +case class Years(cnt: Int)(implicit tz: TimeZone) extends Duration(Calendar.YEAR, cnt, tz) -abstract class AbstractDurationList[T <: Duration](parts : List[T]) extends Duration(-1,-1, null) { - override def addTo(that : RichDate) = { - parts.foldLeft(that) { (curdate, next) => next.addTo(curdate) } - } - override def subtractFrom(that : RichDate) = { - parts.foldLeft(that) { (curdate, next) => next.subtractFrom(curdate) } - } - //This does not make sense for a DurationList interval, pass through - override def floorOf(that : RichDate) = 
that +abstract class AbstractDurationList[T <: Duration](parts: List[T]) extends Duration(-1, -1, null) { + override def addTo(that: RichDate) = + parts.foldLeft(that)((curdate, next) => next.addTo(curdate)) + override def subtractFrom(that: RichDate) = + parts.foldLeft(that)((curdate, next) => next.subtractFrom(curdate)) + // This does not make sense for a DurationList interval, pass through + override def floorOf(that: RichDate) = that } -case class DurationList(parts : List[Duration]) extends AbstractDurationList[Duration](parts) +case class DurationList(parts: List[Duration]) extends AbstractDurationList[Duration](parts) diff --git a/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala b/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala index 9c595794a9..57ab8d1d5e 100644 --- a/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/Globifier.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.util.TimeZone @@ -25,29 +25,28 @@ import java.util.regex.Pattern * current range. This children must be ordered from largest * to smallest in size. 
*/ -class BaseGlobifier(dur : Duration, val sym: String, pattern : String, tz : TimeZone, child : Option[BaseGlobifier]) - extends java.io.Serializable { - import DateOps._ +class BaseGlobifier( + dur: Duration, + val sym: String, + pattern: String, + tz: TimeZone, + child: Option[BaseGlobifier] +) extends java.io.Serializable { // result <= rd - private def greatestLowerBound(rd : RichDate) = dur.floorOf(rd) + private def greatestLowerBound(rd: RichDate) = dur.floorOf(rd) // rd <= result - private def leastUpperBound(rd : RichDate) : RichDate = { - val lb = greatestLowerBound(rd) - if (lb == rd) - rd - else - lb + dur - } + private def leastUpperBound(rd: RichDate): RichDate = + greatestLowerBound(rd) + dur def format(rd: RichDate) = rd.format(pattern)(tz) // Generate a lazy list of all children - final def children : Stream[BaseGlobifier] = child match { + final def children: Stream[BaseGlobifier] = child match { case Some(c) => Stream.cons(c, c.children) - case None => Stream.empty + case None => Stream.empty } - final def asteriskChildren(rd : RichDate) : String = { + final def asteriskChildren(rd: RichDate): String = { val childStarPattern = children.foldLeft(pattern) { (this_pat, child) => this_pat.replaceAll(Pattern.quote(child.sym), "*") } @@ -56,37 +55,37 @@ class BaseGlobifier(dur : Duration, val sym: String, pattern : String, tz : Time // Handles the case of zero interior boundaries // with potential boundaries only at the end points. - private def simpleCase(dr : DateRange) : List[String] = { + private def simpleCase(dr: DateRange): List[String] = { val sstr = format(dr.start) val estr = format(dr.end) if (dr.end < dr.start) { Nil - } - else if (child.isEmpty) { - //There is only one block: - assert(sstr == estr, "Malformed heirarchy" + sstr + " != " + estr) - List(sstr) - } - else { - /* - * Two cases: we should asterisk our children, or we need - * to recurse. 
If we fill this entire range, just asterisk, - */ - val bottom = children.last - val fillsright = format(leastUpperBound(dr.end)) == - format(bottom.leastUpperBound(dr.end)) - val fillsleft = format(greatestLowerBound(dr.start)) == - format(bottom.greatestLowerBound(dr.start)) - if (fillsright && fillsleft) { - List(asteriskChildren(dr.start)) - } - else { - child.get.globify(dr) + } else { + child match { + case None => + // There is only one block: + assert(sstr == estr, "Malformed hierarchy" + sstr + " != " + estr) + List(sstr) + case Some(c) => + /* + * Two cases: we should asterisk our children, or we need + * to recurse. If we fill this entire range, just asterisk, + */ + val bottom = children.last + val fillsright = format(leastUpperBound(dr.end)) == + format(bottom.leastUpperBound(dr.end)) + val fillsleft = format(greatestLowerBound(dr.start)) == + format(bottom.greatestLowerBound(dr.start)) + if (fillsright && fillsleft) { + List(asteriskChildren(dr.start)) + } else { + c.globify(dr) + } } } } - def globify(dr : DateRange) : List[String] = { + def globify(dr: DateRange): List[String] = { /* We know: * start <= end : by assumption * mid1 - start < delta : mid1 is least upper bound @@ -97,49 +96,45 @@ class BaseGlobifier(dur : Duration, val sym: String, pattern : String, tz : Time */ val mid1 = leastUpperBound(dr.start) val mid2 = greatestLowerBound(dr.end) - //Imprecise patterns may not need to drill down, let's see if we can stop early: + // Imprecise patterns may not need to drill down, let's see if we can stop early: val sstr = format(dr.start) val estr = format(dr.end) if (sstr == estr) { List(sstr) - } - else if (dr.end < dr.start) { - //This is nonsense: + } else if (dr.end < dr.start) { + // This is nonsense: Nil - } - else if (mid2 < mid1) { - //We do not contain a boundary point: + } else if (mid2 < mid1) { + // We do not contain a boundary point: simpleCase(dr) - } - // otherwise we contain one or more than one boundary points + } // otherwise we 
contain one or more than one boundary points else if (mid1 == mid2) { - //we contain exactly one boundary point: + // we contain exactly one boundary point: simpleCase(DateRange(dr.start, mid1 - Millisecs(1))) ++ simpleCase(DateRange(mid1, dr.end)) - } - else { - //We contain 2 or more boundary points: + } else { + // We contain 2 or more boundary points: // [start <= mid1 < mid2 <= end] // First check to see if we even need to check our children: simpleCase(DateRange(dr.start, mid1 - Millisecs(1))) ++ (asteriskChildren(mid1) :: - globify(DateRange(mid1 + dur, dr.end))) + globify(DateRange(mid1 + dur, dr.end))) } } } -case class HourGlob(pat : String)(implicit tz : TimeZone) - extends BaseGlobifier(Hours(1),"%1$tH", pat, tz, None) +case class HourGlob(pat: String)(implicit tz: TimeZone) + extends BaseGlobifier(Hours(1), "%1$tH", pat, tz, None) -case class DayGlob(pat : String)(implicit tz: TimeZone) - extends BaseGlobifier(Days(1)(tz), "%1$td", pat, tz, Some(HourGlob(pat))) +case class DayGlob(pat: String)(implicit tz: TimeZone) + extends BaseGlobifier(Days(1)(tz), "%1$td", pat, tz, Some(HourGlob(pat))) -case class MonthGlob(pat : String)(implicit tz: TimeZone) - extends BaseGlobifier(Months(1)(tz), "%1$tm", pat, tz, Some(DayGlob(pat))) +case class MonthGlob(pat: String)(implicit tz: TimeZone) + extends BaseGlobifier(Months(1)(tz), "%1$tm", pat, tz, Some(DayGlob(pat))) /* * This is the outermost globifier and should generally be used to globify */ -case class Globifier(pat : String)(implicit tz: TimeZone) - extends BaseGlobifier(Years(1)(tz), "%1$tY", pat, tz, Some(MonthGlob(pat))) - with java.io.Serializable +case class Globifier(pat: String)(implicit tz: TimeZone) + extends BaseGlobifier(Years(1)(tz), "%1$tY", pat, tz, Some(MonthGlob(pat))) + with java.io.Serializable diff --git a/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala b/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala index 35ffe8607a..fcc848f25c 100644 --- 
a/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala +++ b/scalding-date/src/main/scala/com/twitter/scalding/RichDate.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.text.SimpleDateFormat @@ -22,78 +22,91 @@ import java.util.Date import java.util.TimeZone /** -* RichDate adds some nice convenience functions to the Java date/calendar classes -* We commonly do Date/Time work in analysis jobs, so having these operations convenient -* is very helpful. -*/ + * RichDate adds some nice convenience functions to the Java date/calendar classes We commonly do Date/Time + * work in analysis jobs, so having these operations convenient is very helpful. + */ object RichDate { // Implicits to Java types: - implicit def toDate(rd : RichDate) = rd.value - implicit def toCalendar(rd : RichDate)(implicit tz : TimeZone): Calendar = { + implicit def toDate(rd: RichDate): Date = rd.value + implicit def toCalendar(rd: RichDate)(implicit tz: TimeZone): Calendar = { val cal = Calendar.getInstance(tz) cal.setTime(rd.value) cal } - implicit def apply(d : Date): RichDate = RichDate(d.getTime) - implicit def apply(d : Calendar): RichDate = RichDate(d.getTime) + implicit def apply(d: Date): RichDate = RichDate(d.getTime) + implicit def apply(d: Calendar): RichDate = RichDate(d.getTime) + /** - * Parse the string with one of the value DATE_FORMAT_VALIDATORS in the order listed in DateOps. - * We allow either date, date with time in minutes, date with time down to seconds. - * The separator between date and time can be a space or "T". - */ - implicit def apply(str : String)(implicit tz : TimeZone, dp: DateParser): RichDate = + * Parse the string with one of the value DATE_FORMAT_VALIDATORS in the order listed in DateOps. 
We allow + * either date, date with time in minutes, date with time down to seconds. The separator between date and + * time can be a space or "T". + */ + implicit def apply(str: String)(implicit tz: TimeZone, dp: DateParser): RichDate = dp.parse(str).get /* If the format is one of the truncated DateOps formats, we can do * the upper bound, else go to the end of the day */ - def upperBound(s : String)(implicit tz : TimeZone, dp: DateParser) = { + def upperBound(s: String)(implicit tz: TimeZone, dp: DateParser) = { val end = apply(s) - (DateOps.getFormat(s) match { - case Some(DateOps.DATE_WITH_DASH) => end + Days(1) - case Some(DateOps.DATEHOUR_WITH_DASH) => end + Hours(1) - case Some(DateOps.DATETIME_WITH_DASH) => end + Minutes(1) - case Some(DateOps.DATETIME_HMS_WITH_DASH) => end + Seconds(1) - case Some(DateOps.DATETIME_HMSM_WITH_DASH) => end + Millisecs(2) - case None => Days(1).floorOf(end + Days(1)) + (DateOps.getFormatObject(s) match { + case Some(DateOps.Format.DATE_WITHOUT_DASH) => end + Days(1) + case Some(DateOps.Format.DATE_WITH_DASH) => end + Days(1) + case Some(DateOps.Format.DATEHOUR_WITHOUT_DASH) => end + Hours(1) + case Some(DateOps.Format.DATEHOUR_WITH_DASH) => end + Hours(1) + case Some(DateOps.Format.DATETIME_WITHOUT_DASH) => end + Minutes(1) + case Some(DateOps.Format.DATETIME_WITH_DASH) => end + Minutes(1) + case Some(DateOps.Format.DATETIME_HMS_WITHOUT_DASH) => end + Seconds(1) + case Some(DateOps.Format.DATETIME_HMS_WITH_DASH) => end + Seconds(1) + case Some(DateOps.Format.DATETIME_HMSM_WITH_DASH) => end + Millisecs(2) + case None => Days(1).floorOf(end + Days(1)) }) - Millisecs(1) } def now: RichDate = RichDate(System.currentTimeMillis()) + + implicit def richDateOrdering: Ordering[RichDate] = new Ordering[RichDate] { + def compare(a: RichDate, b: RichDate) = java.lang.Long.compare(a.timestamp, b.timestamp) + } } -/** A value class wrapper for milliseconds since the epoch +/** + * A value class wrapper for milliseconds since the epoch. 
Its tempting to extend this with AnyVal but this + * causes problem with Java code. */ -case class RichDate(val timestamp : Long) extends Ordered[RichDate] { +case class RichDate(val timestamp: Long) extends Ordered[RichDate] { // these are mutable, don't keep them around def value: Date = new java.util.Date(timestamp) - def +(interval : Duration) = interval.addTo(this) - def -(interval : Duration) = interval.subtractFrom(this) + def +(interval: Duration) = interval.addTo(this) + def -(interval: Duration) = interval.subtractFrom(this) - //Inverse of the above, d2 + (d1 - d2) == d1 - def -(that : RichDate) = AbsoluteDuration.fromMillisecs(timestamp - that.timestamp) + // Inverse of the above, d2 + (d1 - d2) == d1 + def -(that: RichDate) = AbsoluteDuration.fromMillisecs(timestamp - that.timestamp) - override def compare(that : RichDate) : Int = - Ordering[Long].compare(timestamp, that.timestamp) + override def compare(that: RichDate): Int = + java.lang.Long.compare(timestamp, that.timestamp) - //True of the other is a RichDate with equal value, or a Date equal to value - override def equals(that : Any) = + // True of the other is a RichDate with equal value, or a Date equal to value + override def equals(that: Any) = that match { - case d: Date => d.getTime == timestamp + case d: Date => d.getTime == timestamp case RichDate(ts) => ts == timestamp - case _ => false + case _ => false } - /** Use String.format to format the date, as opposed to toString with uses SimpleDateFormat + def before(that: RichDate): Boolean = compare(that) < 0 + def after(that: RichDate): Boolean = compare(that) > 0 + + /** + * Use String.format to format the date, as opposed to toString, which uses SimpleDateFormat. 
*/ - def format(pattern: String)(implicit tz: TimeZone) : String = String.format(pattern, toCalendar(tz)) + def format(pattern: String)(implicit tz: TimeZone): String = String.format(pattern, toCalendar(tz)) /** - * Make sure the hashCode is the same as Date for the (questionable) choice - * to make them equal. this is the same as what java does (and only sane thing): - * http://grepcode.com/file/repository.grepcode.com/java/root/jdk/openjdk/6-b14/java/util/Date.java#989 + * Make sure the hashCode is the same as Date for the (questionable) choice to make them equal. This is the + * same as what java does (and only sane thing). */ override def hashCode = (timestamp.toInt) ^ ((timestamp >> 32).toInt) @@ -105,13 +118,13 @@ case class RichDate(val timestamp : Long) extends Ordered[RichDate] { } override def toString = value.toString - /** Use SimpleDateFormat to print the string + /** + * Use SimpleDateFormat to print the string */ - def toString(fmt : String)(implicit tz : TimeZone) : String = { + def toString(fmt: String)(implicit tz: TimeZone): String = { val cal = toCalendar(tz) val sdfmt = new SimpleDateFormat(fmt) sdfmt.setCalendar(cal) sdfmt.format(cal.getTime) } } - diff --git a/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala b/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala index c8dc72f13d..a929071ca7 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/CalendarOpsTest.scala @@ -3,88 +3,117 @@ package com.twitter.scalding import java.text.SimpleDateFormat import java.util._ -import org.specs._ +import org.scalatest.WordSpec -class CalendarOpsTest extends Specification { - noDetailedDiffs() - - val cal = Calendar.getInstance(); - - val dateParser = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); - val dateTimeParser = new SimpleDateFormat("MMM dd, yyyy H:mm:ss.SSS", Locale.ENGLISH); +class CalendarOpsTest extends 
WordSpec { + val cal = Calendar.getInstance() + val dateParser = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH) + val dateTimeParser = new SimpleDateFormat("MMM dd, yyyy H:mm:ss.SSS", Locale.ENGLISH) "The CalendarOps truncate method" should { "not truncate if the specified field is milliseconds" in { cal.setTime(new Date(1384819200555L)) - cal.get(Calendar.MILLISECOND) must be equalTo 555 + assert(cal.get(Calendar.MILLISECOND) === 555) } "truncate to a year" in { - dateParser.parse("January 1, 2002") must be equalTo - CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.YEAR) - - dateParser.parse("January 1, 2001") must be equalTo - CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.YEAR) + assert( + dateParser.parse("January 1, 2002") === + CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.YEAR) + ) + + assert( + dateParser.parse("January 1, 2001") === + CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.YEAR) + ) } "truncate to a month" in { - dateParser.parse("February 1, 2002") must be equalTo - CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.MONTH) - - dateParser.parse("November 1, 2001") must be equalTo - CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.MONTH) + assert( + dateParser.parse("February 1, 2002") === + CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.MONTH) + ) + + assert( + dateParser.parse("November 1, 2001") === + CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.MONTH) + ) } "truncate to a date" in { - dateParser.parse("February 12, 2002") must be equalTo - CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.DATE) - - dateParser.parse("November 18, 2001") must be equalTo - CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.DATE) + 
assert( + dateParser.parse("February 12, 2002") == + CalendarOps.truncate(dateParser.parse("February 12, 2002 12:34:56.789"), Calendar.DATE) + ) + + assert( + dateParser.parse("November 18, 2001") === + CalendarOps.truncate(dateParser.parse("November 18, 2001 1:23:11.321"), Calendar.DATE) + ) } "truncate to a minute" in { - dateTimeParser.parse("February 12, 2002 12:34:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.MINUTE) - - dateTimeParser.parse("November 18, 2001 1:23:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.MINUTE) + assert( + dateTimeParser.parse("February 12, 2002 12:34:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.MINUTE) + ) + + assert( + dateTimeParser.parse("November 18, 2001 1:23:00.000") === + CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.MINUTE) + ) } "truncate to a second" in { - dateTimeParser.parse("February 12, 2002 12:34:56.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.SECOND) - - dateTimeParser.parse("November 18, 2001 1:23:11.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.SECOND) + assert( + dateTimeParser.parse("February 12, 2002 12:34:56.000") === + CalendarOps.truncate(dateTimeParser.parse("February 12, 2002 12:34:56.789"), Calendar.SECOND) + ) + + assert( + dateTimeParser.parse("November 18, 2001 1:23:11.000") === + CalendarOps.truncate(dateTimeParser.parse("November 18, 2001 1:23:11.321"), Calendar.SECOND) + ) } "truncate to AM" in { - dateTimeParser.parse("February 3, 2002 00:00:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 01:10:00.000"), Calendar.AM_PM) - - dateTimeParser.parse("February 3, 2002 00:00:00.000") must be equalTo - 
CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 11:10:00.000"), Calendar.AM_PM) + assert( + dateTimeParser.parse("February 3, 2002 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 01:10:00.000"), Calendar.AM_PM) + ) + + assert( + dateTimeParser.parse("February 3, 2002 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 11:10:00.000"), Calendar.AM_PM) + ) } "truncate to PM" in { - dateTimeParser.parse("February 3, 2002 12:00:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 13:10:00.000"), Calendar.AM_PM) - - dateTimeParser.parse("February 3, 2002 12:00:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 19:10:00.000"), Calendar.AM_PM) + assert( + dateTimeParser.parse("February 3, 2002 12:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 13:10:00.000"), Calendar.AM_PM) + ) + + assert( + dateTimeParser.parse("February 3, 2002 12:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("February 3, 2002 19:10:00.000"), Calendar.AM_PM) + ) } "truncate respects DST" in { TimeZone.setDefault(TimeZone.getTimeZone("MET")) dateTimeParser.setTimeZone(TimeZone.getTimeZone("MET")) - dateTimeParser.parse("March 30, 2003 00:00:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("March 30, 2003 05:30:45.000"), Calendar.DATE) + assert( + dateTimeParser.parse("March 30, 2003 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("March 30, 2003 05:30:45.000"), Calendar.DATE) + ) - dateTimeParser.parse("October 26, 2003 00:00:00.000") must be equalTo - CalendarOps.truncate(dateTimeParser.parse("October 26, 2003 05:30:45.000"), Calendar.DATE) + assert( + dateTimeParser.parse("October 26, 2003 00:00:00.000") === + CalendarOps.truncate(dateTimeParser.parse("October 26, 2003 05:30:45.000"), Calendar.DATE) + ) } } } diff --git 
a/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala b/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala index 5da9a568be..61773c8989 100644 --- a/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/DateProperties.scala @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import org.scalacheck.Arbitrary @@ -21,7 +21,6 @@ import org.scalacheck.Prop.forAll import org.scalacheck.Gen.choose import org.scalacheck.Prop._ -import scala.util.control.Exception.allCatch import AbsoluteDuration.fromMillisecs object DateProperties extends Properties("Date Properties") { @@ -29,23 +28,24 @@ object DateProperties extends Properties("Date Properties") { implicit def dateParser: DateParser = DateParser.default implicit val durationArb: Arbitrary[Duration] = - Arbitrary { choose(0, 10000).map { Millisecs(_) } } + Arbitrary(choose(0, 10000).map(Millisecs(_))) implicit val richDateArb: Arbitrary[RichDate] = Arbitrary { - for(v <- choose(0L, 1L<<32)) yield RichDate(v) + for (v <- choose(0L, 1L << 32)) yield RichDate(v) } implicit val dateRangeArb: Arbitrary[DateRange] = Arbitrary { - for(v1 <- choose(0L, 1L<<33); - v2 <- choose(v1, 1L<<33)) yield DateRange(RichDate(v1), RichDate(v2)) + for { + v1 <- choose(0L, 1L << 33) + v2 <- choose(v1, 1L << 33) + } yield DateRange(RichDate(v1), RichDate(v2)) } implicit val absdur: Arbitrary[AbsoluteDuration] = Arbitrary { - implicitly[Arbitrary[Long]] - .arbitrary + implicitly[Arbitrary[Long]].arbitrary // Ignore Longs that are too big to fit, and make sure we can add any random 3 together // Long.MaxValue / 1200 ms is the biggest that will fit, we divide by 3 to make sure // we can add three 
together in tests - .map { ms => fromMillisecs(ms/(1200*3)) } + .map(ms => fromMillisecs(ms / (1200 * 3))) } property("Shifting DateRanges breaks containment") = forAll { (dr: DateRange, r: Duration) => @@ -55,31 +55,31 @@ object DateProperties extends Properties("Date Properties") { property("Arithmetic works as expected") = forAll { (dr: DateRange, r: Duration) => (dr + r) - r == dr && - (dr.start + r) - r == dr.start + (dr.start + r) - r == dr.start } property("fromMillisecs toMillisecs") = forAll { (ad: AbsoluteDuration) => val ms = ad.toMillisecs (fromMillisecs(ms) == ad) } - def asInt(b: Boolean) = if(b) 1 else 0 + def asInt(b: Boolean) = if (b) 1 else 0 property("Before/After works") = forAll { (dr: DateRange, rd: RichDate) => (asInt(dr.contains(rd)) + asInt(dr.isBefore(rd)) + asInt(dr.isAfter(rd)) == 1) && - (dr.isBefore(dr.end + (dr.end - dr.start))) && - (dr.isAfter(dr.start - (dr.end - dr.start))) + (dr.isBefore(dr.end + (dr.end - dr.start))) && + (dr.isAfter(dr.start - (dr.end - dr.start))) } - def divDur(ad: AbsoluteDuration, div: Int) = fromMillisecs(ad.toMillisecs/div) + def divDur(ad: AbsoluteDuration, div: Int) = fromMillisecs(ad.toMillisecs / div) property("each output is contained") = forAll { (dr: DateRange) => val r = divDur(dr.end - dr.start, 10) - dr.each(r).forall { dr.contains(_) } + dr.each(r).forall(dr.contains(_)) } property("Embiggen/extend always contains") = forAll { (dr: DateRange, d: Duration) => dr.embiggen(d).contains(dr) && - dr.extend(d).contains(dr) + dr.extend(d).contains(dr) } property("RichDate subtraction Roundtrip") = forAll { (timestamp0: Long, delta: AbsoluteDuration) => @@ -91,30 +91,41 @@ object DateProperties extends Properties("Date Properties") { Millisecs(ms).toMillisecs.toInt == ms } - property("AbsoluteDuration group properties") = - forAll { (a: AbsoluteDuration, b: AbsoluteDuration, c: AbsoluteDuration) => + property("AbsoluteDuration group properties") = forAll { + (a: AbsoluteDuration, b: AbsoluteDuration, c: 
AbsoluteDuration) => (a + b) - c == a + (b - c) && (a + b) + c == a + (b + c) && (a - a) == fromMillisecs(0) && (b - b) == fromMillisecs(0) && - (c - c) == fromMillisecs(0) && - { b.toMillisecs == 0 || { + (c - c) == fromMillisecs(0) && { + b.toMillisecs == 0 || { // Don't divide by zero: - val (d, rem) = (a/b) + val (d, rem) = a / b a == b * d + rem && (rem.toMillisecs.abs < b.toMillisecs.abs) } } - } + } property("DateRange.length is correct") = forAll { (dr: DateRange) => dr.start + dr.length - AbsoluteDuration.fromMillisecs(1L) == dr.end } - def toRegex(glob: String) = (glob.flatMap { c => if(c == '*') ".*" else c.toString }).r + property("DateRange.exclusiveUpper works") = forAll { (a: RichDate, b: RichDate) => + val lower = Ordering[RichDate].min(a, b) + val upper = Ordering[RichDate].max(a, b) + val ex = DateRange.exclusiveUpper(lower, upper) + val in = DateRange(lower, upper) + val upperPred = upper - Millisecs(1) + + (false == ex.contains(upper)) && + (ex.contains(upperPred) || (lower == upper)) + } + + def toRegex(glob: String) = glob.flatMap(c => if (c == '*') ".*" else c.toString).r def matches(l: List[String], arg: String): Int = l - .map { toRegex _ } - .map { _.findFirstMatchIn(arg).map { _ => 1 }.getOrElse(0) } + .map(toRegex _) + .map(_.findFirstMatchIn(arg).map(_ => 1).getOrElse(0)) .sum // Make sure globifier always contains: @@ -123,7 +134,8 @@ object DateProperties extends Properties("Date Properties") { property("Globifying produces matching patterns") = forAll { (dr: DateRange) => val globbed = glob.globify(dr) // Brute force - dr.each(Hours(1)).map { _.start.format(pattern)(DateOps.UTC) } - .forall { matches(globbed, _) == 1 } + dr.each(Hours(1)) + .map(_.start.format(pattern)(DateOps.UTC)) + .forall(matches(globbed, _) == 1) } } diff --git a/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala b/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala index c2da64e43f..5de6b50f8c 100644 --- 
a/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala +++ b/scalding-date/src/test/scala/com/twitter/scalding/DateTest.scala @@ -12,158 +12,217 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding -import org.specs._ +import org.scalatest.WordSpec import java.util.Calendar +import java.util.TimeZone -class DateTest extends Specification { - noDetailedDiffs() - implicit val tz = DateOps.PACIFIC +class DateTest extends WordSpec { + implicit val tz: TimeZone = DateOps.PACIFIC implicit def dateParser: DateParser = DateParser.default "A RichDate" should { "implicitly convert strings" in { - val rd1 : RichDate = "2011-10-20" - val rd2 : RichDate = "2011-10-20" - rd1 must be_==(rd2) + val rd1: RichDate = "2011-10-20" + val rd2: RichDate = "2011-10-20" + val rd3: RichDate = "20111020" + val rd4: RichDate = "20111020" + assert(rd1 === rd2) + assert(rd3 === rd4) + assert(rd1 === rd3) } "implicitly convert calendars" in { - val rd1 : RichDate = "2011-10-20" + val rd1WithoutDash: RichDate = "20111020" + val calWithoutDash = Calendar.getInstance(tz) + calWithoutDash.setTime(rd1WithoutDash.value) + val rd2WithoutDash: RichDate = calWithoutDash + + val rd1: RichDate = "2011-10-20" val cal = Calendar.getInstance(tz) cal.setTime(rd1.value) - val rd2 : RichDate = cal - rd1 must_== rd2 + val rd2: RichDate = cal + + assert(rd1WithoutDash == rd2WithoutDash) + assert(rd1 === rd2) } "deal with strings with spaces" in { - val rd1 : RichDate = " 2011-10-20 " - val rd2 : RichDate = "2011-10-20 " - val rd3 : RichDate = " 2011-10-20 " - rd1 must be_==(rd2) - rd1 must be_==(rd3) + val rd1: RichDate = " 2011-10-20 " + val rd2: RichDate = "2011-10-20 " + val rd3: RichDate = " 2011-10-20 " + assert(rd1 === rd2) + assert(rd1 === rd3) } "handle dates with slashes 
and underscores" in { - val rd1 : RichDate = "2011-10-20" - val rd2 : RichDate = "2011/10/20" - val rd3 : RichDate = "2011_10_20" - rd1 must be_==(rd2) - rd1 must be_==(rd3) + val rd1: RichDate = "2011-10-20" + val rd2: RichDate = "2011/10/20" + val rd3: RichDate = "2011_10_20" + assert(rd1 === rd2) + assert(rd1 === rd3) } "be able to parse milliseconds" in { - val rd1 : RichDate = "2011-10-20 20:01:11.0" - val rd2 : RichDate = "2011-10-20 22:11:24.23" - val rd3 : RichDate = "2011-10-20 22:11:24.023 " - rd2 must_== rd3 + val rd1: RichDate = "2011-10-20 20:01:11.0" + val rd2: RichDate = "2011-10-20 22:11:24.23" + val rd3: RichDate = "2011-10-20 22:11:24.023 " + assert(rd2 === rd3) } "throw an exception when trying to parse illegal strings" in { // Natty is *really* generous about what it accepts - RichDate("jhbjhvhjv") must throwAn[IllegalArgumentException] - RichDate("99-99-99") must throwAn[IllegalArgumentException] + intercept[IllegalArgumentException](RichDate("jhbjhvhjv")) + intercept[IllegalArgumentException](RichDate("99-99-99")) } "be able to deal with arithmetic operations with whitespace" in { - val rd1 : RichDate = RichDate("2010-10-02") + Seconds(1) - val rd2 : RichDate = " 2010-10-02 T 00:00:01 " - rd1 must be_==(rd2) + val rd1: RichDate = RichDate("2010-10-02") + Seconds(1) + val rd2: RichDate = " 2010-10-02 T 00:00:01 " + assert(rd1 === rd2) + } + "be able to deal with arithmetic operations without hyphens and whitespaces" in { + val rd1: RichDate = RichDate("20101002") + Seconds(1) + val rd2: RichDate = " 2010-10-02 T 00:00:01 " + assert(rd1 === rd2) } "Have same equals & hashCode as Date (crazy?)" in { - val rd1 : RichDate = "2011-10-20" - rd1.equals(rd1.value) must beTrue - rd1.hashCode must be_==(rd1.value.hashCode) + val rd1: RichDate = "2011-10-20" + assert(rd1 === rd1.value) + assert(rd1.hashCode === rd1.value.hashCode) } "be well ordered" in { - val rd1 : RichDate = "2011-10-20" - val rd2 : RichDate = "2011-10-21" - rd1 must be_<(rd2) - rd1 
must be_<=(rd2) - rd2 must be_>(rd1) - rd2 must be_>=(rd1) - rd1 must be_>=(rd1) - rd2 must be_>=(rd2) + val rd1: RichDate = "2011-10-20" + val rd2: RichDate = "2011-10-21" + assert(rd1 < rd2) + assert(rd1 <= rd2) + assert(rd2 > rd1) + assert(rd2 >= rd1) + assert(rd1 >= rd1) + assert(rd2 >= rd2) + } + "be able to compare with before() and after() with TimeZone in context" in { + implicit val tz: TimeZone = TimeZone.getDefault + val rd1: RichDate = "2011-01-01" + val rd2: RichDate = "2012-01-01" + assert(rd1.before(rd2)) } "implicitly convert from long" in { // This kind of implicit is not safe (what does the long mean?) implicit def longToDate(l: Long): RichDate = RichDate(l) - //This is close to: Mon Oct 24 20:03:13 PDT 2011 + // This is close to: Mon Oct 24 20:03:13 PDT 2011 val long_val = 1319511818135L val rd1 = "2011-10-24T20:03:00" val rd2 = "2011-10-24T20:04:00" - DateRange(rd1, rd2).contains(RichDate(long_val)) must beTrue - //Check edge cases: - DateRange(rd1, long_val).contains(long_val) must beTrue - DateRange(rd1, (long_val+1)).contains(long_val) must beTrue - DateRange(long_val, rd2).contains(long_val) must beTrue - DateRange((long_val-1), rd2).contains(long_val) must beTrue + assert(DateRange(rd1, rd2).contains(RichDate(long_val))) + // Check edge cases: + assert(DateRange(rd1, long_val).contains(long_val)) + assert(DateRange(rd1, (long_val + 1)).contains(long_val)) + assert(DateRange(long_val, rd2).contains(long_val)) + assert(DateRange((long_val - 1), rd2).contains(long_val)) - DateRange(rd1, "2011-10-24T20:03:01").contains(long_val) must beFalse - DateRange(rd1, (long_val-1)).contains(long_val) must beFalse - DateRange((long_val+1), rd2).contains(long_val) must beFalse + assert(!DateRange(rd1, "2011-10-24T20:03:01").contains(long_val)) + assert(!DateRange(rd1, (long_val - 1)).contains(long_val)) + assert(!DateRange((long_val + 1), rd2).contains(long_val)) } "roundtrip successfully" in { val start_str = "2011-10-24 20:03:00" - //string -> date -> 
string - RichDate(start_str).toString(DateOps.DATETIME_HMS_WITH_DASH) must_== start_str - //long -> date == date -> long -> date + // string -> date -> string + assert(RichDate(start_str).toString(DateOps.DATETIME_HMS_WITH_DASH) === start_str) + // long -> date == date -> long -> date val long_val = 1319511818135L val date = RichDate(long_val) val long2 = date.value.getTime val date2 = RichDate(long2) - date must_== date2 - long_val must_== long2 + assert(date === date2) + assert(long_val === long2) } "know the most recent time units" in { - //10-25 is a Tuesday, earliest in week is a monday - Weeks(1).floorOf("2011-10-25") must_==(RichDate("2011-10-24")) - Days(1).floorOf("2011-10-25 10:01") must_==(RichDate("2011-10-25 00:00")) - //Leaving off the time should give the same result: - Days(1).floorOf("2011-10-25 10:01") must_==(RichDate("2011-10-25")) - Hours(1).floorOf("2011-10-25 10:01") must_==(RichDate("2011-10-25 10:00")) + // 10-25 is a Tuesday, earliest in week is a monday + assert(Weeks(1).floorOf("2011-10-25") === RichDate("2011-10-24")) + assert(Weeks(1).floorOf("20111025") === RichDate("2011-10-24")) + assert(Days(1).floorOf("2011-10-25 10:01") === RichDate("2011-10-25 00:00")) + assert(Days(1).floorOf("201110251001") === RichDate("2011-10-25 00:00")) + // Leaving off the time should give the same result: + assert(Days(1).floorOf("201110251001") === RichDate("2011-10-25")) + assert(Days(1).floorOf("2011-10-25 10:01") === RichDate("2011-10-25")) + assert(Hours(1).floorOf("201110251001") === RichDate("2011-10-25 10:00")) + assert(Hours(1).floorOf("2011-10-25 10:01") === RichDate("2011-10-25 10:00")) } "correctly do arithmetic" in { - val d1 : RichDate = "2011-10-24" + val d1: RichDate = "2011-10-24" (-4 to 4).foreach { n => List(Hours, Minutes, Seconds, Millisecs).foreach { u => val d2 = d1 + u(n) - (d2 - d1) must_== u(n) + assert((d2 - d1) === u(n)) } } } "correctly calculate upperBound" in { - Seconds(1).floorOf(RichDate.upperBound("2010-10-01")) must_== 
Seconds(1).floorOf(RichDate("2010-10-01 23:59:59")) - Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14")) must_== Seconds(1).floorOf(RichDate("2010-10-01 14:59:59")) - Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14:15")) must_== Seconds(1).floorOf(RichDate("2010-10-01 14:15:59")) + assert( + Seconds(1).floorOf(RichDate.upperBound("20101001")) === Seconds(1).floorOf( + RichDate("2010-10-01 23:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010100114")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("201010011415")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:15:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010-10-01")) === Seconds(1).floorOf( + RichDate("2010-10-01 23:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:59:59") + ) + ) + assert( + Seconds(1).floorOf(RichDate.upperBound("2010-10-01 14:15")) === Seconds(1).floorOf( + RichDate("2010-10-01 14:15:59") + ) + ) + } + "Have an implicit Ordering" in { + implicitly[Ordering[RichDate]] + implicitly[Ordering[(String, RichDate)]] } } "A DateRange" should { "correctly iterate on each duration" in { - def rangeContainTest(d1 : DateRange, dur : Duration) = { - d1.each(dur).forall( (d1r : DateRange) => d1.contains(d1r) ) must beTrue - } + def rangeContainTest(d1: DateRange, dur: Duration) = + assert(d1.each(dur).forall((d1r: DateRange) => d1.contains(d1r))) rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Weeks(1)) rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Weeks(2)) rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Days(1)) - //Prime non one: + // Prime non one: rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Days(5)) - //Prime number of Minutes + // Prime number of Minutes rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Minutes(13)) 
rangeContainTest(DateRange("2010-10-01", "2010-10-13"), Hours(13)) - DateRange("2010-10-01", "2010-10-10").each(Days(1)).size must_== 10 - DateRange("2010-10-01 00:00", RichDate("2010-10-02") - Millisecs(1)).each(Hours(1)).size must_== 24 - DateRange("2010-10-01 00:00", RichDate("2010-10-02") + Millisecs(1)).each(Hours(1)).size must_== 25 - DateRange("2010-10-01",RichDate.upperBound("2010-10-20")).each(Days(1)).size must_== 20 - DateRange("2010-10-01",RichDate.upperBound("2010-10-01")).each(Hours(1)).size must_== 24 - DateRange("2010-10-31",RichDate.upperBound("2010-10-31")).each(Hours(1)).size must_== 24 - DateRange("2010-10-31",RichDate.upperBound("2010-10-31")).each(Days(1)).size must_== 1 - DateRange("2010-10-31 12:00",RichDate.upperBound("2010-10-31 13")).each(Minutes(1)).size must_== 120 + assert(DateRange("2010-10-01", "2010-10-10").each(Days(1)).size === 10) + assert(DateRange("201010010000", RichDate("2010-10-02") - Millisecs(1)).each(Hours(1)).size === 24) + assert(DateRange("2010-10-01 00:00", RichDate("2010-10-02") - Millisecs(1)).each(Hours(1)).size === 24) + assert(DateRange("2010-10-01 00:00", RichDate("2010-10-02") + Millisecs(1)).each(Hours(1)).size === 25) + assert(DateRange("2010-10-01", RichDate.upperBound("2010-10-20")).each(Days(1)).size === 20) + assert(DateRange("2010-10-01", RichDate.upperBound("2010-10-01")).each(Hours(1)).size === 24) + assert(DateRange("2010-10-31", RichDate.upperBound("2010-10-31")).each(Hours(1)).size === 24) + assert(DateRange("2010-10-31", RichDate.upperBound("2010-10-31")).each(Days(1)).size === 1) + assert( + DateRange("2010-10-31 12:00", RichDate.upperBound("2010-10-31 13")).each(Minutes(1)).size === 120 + ) } "have each partition disjoint and adjacent" in { - def eachIsDisjoint(d : DateRange, dur : Duration) { + def eachIsDisjoint(d: DateRange, dur: Duration): Unit = { val dl = d.each(dur) - dl.zip(dl.tail).forall { case (da, db) => + assert(dl.zip(dl.tail).forall { case (da, db) => da.isBefore(db.start) && 
db.isAfter(da.end) && ((da.end + Millisecs(1)) == db.start) - } must beTrue + }) } eachIsDisjoint(DateRange("2010-10-01", "2010-10-03"), Days(1)) eachIsDisjoint(DateRange("2010-10-01", "2010-10-03"), Weeks(1)) @@ -174,45 +233,56 @@ class DateTest extends Specification { eachIsDisjoint(DateRange("2010-10-01", "2010-10-03"), Hours(2)) eachIsDisjoint(DateRange("2010-10-01", "2010-10-03"), Minutes(1)) } + "reject an end that is before its start" in { + intercept[IllegalArgumentException](DateRange("2010-10-02", "2010-10-01")) + } + "correctly add time in either or both directions" in { + assert(DateRange("2010-10-01", "2010-10-02").extend(Days(3)).each(Days(1)).size === 5) + assert(DateRange("2010-10-01", "2010-10-02").prepend(Days(3)).each(Days(1)).size === 5) + assert(DateRange("2010-10-01", "2010-10-02").embiggen(Days(3)).each(Days(1)).size === 8) + assert( + DateRange("2010-10-01", "2010-10-10").extend(Days(1)).prepend(Days(1)) == + DateRange("2010-10-01", "2010-10-10").embiggen(Days(1)) + ) + } } "Time units" should { - def isSame(d1 : Duration, d2 : Duration) = { + def isSame(d1: Duration, d2: Duration) = (RichDate("2011-12-01") + d1) == (RichDate("2011-12-01") + d2) - } "have 1000 milliseconds in a sec" in { - isSame(Millisecs(1000), Seconds(1)) must beTrue - Seconds(1).toMillisecs must_== 1000L - Millisecs(1000).toSeconds must_== 1.0 - Seconds(2).toMillisecs must_== 2000L - Millisecs(2000).toSeconds must_== 2.0 + assert(isSame(Millisecs(1000), Seconds(1))) + assert(Seconds(1).toMillisecs === 1000L) + assert(Millisecs(1000).toSeconds === 1.0) + assert(Seconds(2).toMillisecs === 2000L) + assert(Millisecs(2000).toSeconds === 2.0) } "have 60 seconds in a minute" in { - isSame(Seconds(60), Minutes(1)) must beTrue - Minutes(1).toSeconds must_== 60.0 - Minutes(1).toMillisecs must_== 60 * 1000L - Minutes(2).toSeconds must_== 120.0 - Minutes(2).toMillisecs must_== 120 * 1000L - } + assert(isSame(Seconds(60), Minutes(1))) + assert(Minutes(1).toSeconds === 60.0) + 
assert(Minutes(1).toMillisecs === 60 * 1000L) + assert(Minutes(2).toSeconds === 120.0) + assert(Minutes(2).toMillisecs === 120 * 1000L) + } "have 60 minutes in a hour" in { - isSame(Minutes(60),Hours(1)) must beTrue - Hours(1).toSeconds must_== 60.0 * 60.0 - Hours(1).toMillisecs must_== 60 * 60 * 1000L - Hours(2).toSeconds must_== 2 * 60.0 * 60.0 - Hours(2).toMillisecs must_== 2 * 60 * 60 * 1000L + assert(isSame(Minutes(60), Hours(1))) + assert(Hours(1).toSeconds === 60.0 * 60.0) + assert(Hours(1).toMillisecs === 60 * 60 * 1000L) + assert(Hours(2).toSeconds === 2 * 60.0 * 60.0) + assert(Hours(2).toMillisecs === 2 * 60 * 60 * 1000L) } - "have 7 days in a week" in { isSame(Days(7), Weeks(1)) must beTrue } + "have 7 days in a week" in { assert(isSame(Days(7), Weeks(1))) } } "AbsoluteDurations" should { "behave as comparable" in { - (Hours(5) >= Hours(2)) must beTrue - (Minutes(60) >= Minutes(60)) must beTrue - (Hours(1) < Millisecs(3600001)) must beTrue + assert(Hours(5) >= Hours(2)) + assert(Minutes(60) >= Minutes(60)) + assert(Hours(1) < Millisecs(3600001)) } "add properly" in { - (Hours(2) + Hours(1)).compare(Hours(3)) must_== 0 + assert((Hours(2) + Hours(1)).compare(Hours(3)) === 0) } "have a well behaved max function" in { - AbsoluteDuration.max(Hours(1), Hours(2)).compare(Hours(2)) must_== 0 + assert(AbsoluteDuration.max(Hours(1), Hours(2)).compare(Hours(2)) === 0) } } "Globifiers" should { @@ -221,40 +291,89 @@ class DateTest extends Specification { val t2 = Globifier("/%1$tY/%1$tm/%1$td/") val testcases = - (t1.globify(DateRange("2011-12-01T14", "2011-12-04")), - List("/2011/12/01/14","/2011/12/01/15","/2011/12/01/16","/2011/12/01/17","/2011/12/01/18", - "/2011/12/01/19","/2011/12/01/20", "/2011/12/01/21","/2011/12/01/22","/2011/12/01/23", - "/2011/12/02/*","/2011/12/03/*","/2011/12/04/00")) :: - (t1.globify(DateRange("2011-12-01", "2011-12-01T23:59")), - List("/2011/12/01/*")) :: - (t1.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), - 
List("/2011/12/01/12")) :: - (t1.globify(DateRange("2011-12-01T12", "2011-12-01T14")), - List("/2011/12/01/12","/2011/12/01/13","/2011/12/01/14")) :: - (t2.globify(DateRange("2011-12-01T14", "2011-12-04")), - List("/2011/12/01/","/2011/12/02/","/2011/12/03/","/2011/12/04/")) :: - (t2.globify(DateRange("2011-12-01", "2011-12-01T23:59")), - List("/2011/12/01/")) :: - (t2.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), - List("/2011/12/01/")) :: - (t2.globify(DateRange("2011-12-01T12", "2012-01-02T14")), - List("/2011/12/*/","/2012/01/01/","/2012/01/02/")) :: - (t2.globify(DateRange("2011-11-01T12", "2011-12-02T14")), - List("/2011/11/*/","/2011/12/01/","/2011/12/02/")) :: - Nil + ( + t1.globify(DateRange("2011-12-01T14", "2011-12-04")), + List( + "/2011/12/01/14", + "/2011/12/01/15", + "/2011/12/01/16", + "/2011/12/01/17", + "/2011/12/01/18", + "/2011/12/01/19", + "/2011/12/01/20", + "/2011/12/01/21", + "/2011/12/01/22", + "/2011/12/01/23", + "/2011/12/02/*", + "/2011/12/03/*", + "/2011/12/04/00" + ) + ) :: + (t1.globify(DateRange("2011-12-01", "2011-12-01T23:59")), List("/2011/12/01/*")) :: + ( + t1.globify(DateRange("2014-06-30T00", "2014-07-01T00")), + List("/2014/06/30/*", "/2014/07/01/00") + ) :: + (t1.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), List("/2011/12/01/12")) :: + ( + t1.globify(DateRange("2011-12-01T12", "2011-12-01T14")), + List("/2011/12/01/12", "/2011/12/01/13", "/2011/12/01/14") + ) :: + ( + t2.globify(DateRange("2011-12-01T14", "2011-12-04")), + List("/2011/12/01/", "/2011/12/02/", "/2011/12/03/", "/2011/12/04/") + ) :: + (t2.globify(DateRange("2011-12-01", "2011-12-01T23:59")), List("/2011/12/01/")) :: + (t2.globify(DateRange("2011-12-01T12", "2011-12-01T12:59")), List("/2011/12/01/")) :: + ( + t2.globify(DateRange("2011-12-01T12", "2012-01-02T14")), + List("/2011/12/*/", "/2012/01/01/", "/2012/01/02/") + ) :: + ( + t2.globify(DateRange("2011-11-01T12", "2011-12-02T14")), + List("/2011/11/*/", "/2011/12/01/", 
"/2011/12/02/") + ) :: + Nil - testcases.foreach { tup => - tup._1 must_== tup._2 - } + testcases.foreach { case (l, r) => assert(l === r) } } - def eachElementDistinct(dates : List[String]) = dates.size == dates.toSet.size - def globMatchesDate(glob : String)(date : String) = { - java.util.regex.Pattern.matches(glob.replaceAll("\\*","[0-9]*"), date) + + "The forward and reverser should match" in { + val globifierOps = GlobifierOps() + + val hourlyTestCases = List( + DateRange("2011-12-01T14", "2011-12-04"), + DateRange("2011-12-01", "2011-12-01T23:59"), + DateRange("2014-06-30T00", "2014-07-01T00"), + DateRange("2011-12-01T12", "2011-12-01T12:59"), + DateRange("2011-12-01T12", "2011-12-01T14") + ) + + hourlyTestCases.foreach { dr => + val resultantDR = globifierOps.hourlyRtGlobifier(dr) + assert(globifierOps.normalizeHrDr(dr) === globifierOps.normalizeHrDr(resultantDR)) + } + + val dailyTestCases = List( + DateRange("2011-12-01T14", "2011-12-04"), + DateRange("2011-12-01", "2011-12-01T23:59"), + DateRange("2011-12-01T12", "2011-12-01T12:59"), + DateRange("2011-12-01T12", "2012-01-02T14"), + DateRange("2011-11-01T12", "2011-12-02T14") + ) + + dailyTestCases.foreach { dr => + val resultantDR = globifierOps.dailyRtGlobifier(dr) + assert(globifierOps.normalizeDayDr(dr) === globifierOps.normalizeDayDr(resultantDR)) + } } - def bruteForce(pattern : String, dr : DateRange, dur : Duration)(implicit tz : java.util.TimeZone) = { + + def eachElementDistinct(dates: List[String]) = dates.size == dates.toSet.size + def globMatchesDate(glob: String)(date: String) = + java.util.regex.Pattern.matches(glob.replaceAll("\\*", "[0-9]*"), date) + def bruteForce(pattern: String, dr: DateRange, dur: Duration)(implicit tz: java.util.TimeZone) = dr.each(dur) - .map { (dr : DateRange) => String.format(pattern, dr.start.toCalendar(tz)) } - } + .map((dr: DateRange) => String.format(pattern, dr.start.toCalendar(tz))) "handle random test cases" in { // This kind of implicit is not safe (what 
does the long mean?) @@ -263,16 +382,19 @@ class DateTest extends Specification { val t1 = Globifier(pattern) val r = new java.util.Random() - (0 until 100) foreach { step => + (0 until 100).foreach { step => val start = RichDate("2011-08-03").value.getTime + r.nextInt(Int.MaxValue) val dr = DateRange(start, start + r.nextInt(Int.MaxValue)) val splits = bruteForce(pattern, dr, Hours(1)) val globed = t1.globify(dr) - eachElementDistinct(globed) must beTrue - //See that each path is matched by exactly one glob: - splits.map { path => globed.filter { globMatchesDate(_)(path) }.size } - .forall { _ == 1 } must beTrue + assert(eachElementDistinct(globed)) + // See that each path is matched by exactly one glob: + assert( + splits + .map(path => globed.filter(globMatchesDate(_)(path)).size) + .forall(_ == 1) + ) } } } diff --git a/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala new file mode 100644 index 0000000000..0220a55e3d --- /dev/null +++ b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierOps.scala @@ -0,0 +1,106 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import java.util.TimeZone +import scala.util.{Failure, Success, Try} + +case class GlobifierOps(implicit tz: TimeZone, dp: DateParser) { + val yearMonthDayHourDurations = List(Years(1), Months(1), Days(1), Hours(1)) + val yearMonthDayHourPattern = "/%1$tY/%1$tm/%1$td/%1$tH" + private val hourlyGlobifier = Globifier(yearMonthDayHourPattern) + + def normalizeHrDr(a: DateRange) = + DateRange(Hours(1).floorOf(a.start), Hours(1).floorOf(a.end)) + + def hourlyRtGlobifier(inputDR: DateRange): DateRange = + rtGlobifier(hourlyGlobifier, yearMonthDayHourDurations)(inputDR) + + val yearMonthDayDurations = List(Years(1), Months(1), Days(1)) + val yearMonthDayPattern = "/%1$tY/%1$tm/%1$td" + private val dailyGlobifier = Globifier(yearMonthDayPattern) + + def normalizeDayDr(a: DateRange) = + DateRange(Days(1).floorOf(a.start), Days(1).floorOf(a.end)) + + def dailyRtGlobifier(inputDR: DateRange): DateRange = + rtGlobifier(dailyGlobifier, yearMonthDayDurations)(inputDR) + + def rtGlobifier(globifier: Globifier, durationList: List[Duration])(inputDr: DateRange): DateRange = { + val p = globifier.globify(inputDr) + + val drList = p + .map { pattern => + val (lists, _, _) = + pattern.split("/").tail.foldLeft((List[(Duration, Duration)](), durationList, true)) { + case ((durationLists, mappings, shouldContinue), current) => + val curMapping = mappings.head + if (shouldContinue) { + val tryDuration: Try[Duration] = Try(current.toInt).map { indx => + curMapping match { + case t if mappings.tail == Nil => t + case _ => Millisecs(0) + } + } + + val (duration, doContinue) = tryDuration match { + case Success(d) => (d, true) + case Failure(e) => + val dur: Duration = curMapping match { + case Years(_) => sys.error("Current is " + current + ", parsed as all years?") + case Months(_) => Years(1) + case Days(_) => Months(1) + case Hours(_) => Days(1) + } + (dur, false) + } + + val base: Duration = Try(current.toInt) + .map { indx => + curMapping match { + 
case Years(_) => Years(indx - 1970) + case Months(_) => Months(indx - 1) // months and days are 1 offsets not 0 + case Days(_) => Days(indx - 1) + case Hours(_) => Hours(indx) + } + } + .getOrElse(Hours(0)) + (durationLists :+ (base, duration), mappings.tail, doContinue) + } else { + (durationLists, mappings.tail, false) + } + } + val baseDate = lists.foldLeft(RichDate("1970-01-01T00")) { case (curDate, (base, _)) => + base.addTo(curDate) + } + val endDate = lists.foldLeft(baseDate) { case (curDate, (_, dur)) => + dur.addTo(curDate) + } + DateRange(baseDate, endDate - Millisecs(1)) + } + .sortBy(_.start) + + def combineDR(existing: DateRange, next: DateRange): DateRange = { + require( + existing.end == next.start - Millisecs(1), + "Not contigious range: \n" + existing + "\n" + next + "...From:\n" + p.mkString(",\n") + ) + DateRange(existing.start, next.end) + } + + drList.reduceLeft(combineDR) + } +} diff --git a/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala new file mode 100644 index 0000000000..f976504f5d --- /dev/null +++ b/scalding-date/src/test/scala/com/twitter/scalding/GlobifierProperties.scala @@ -0,0 +1,110 @@ +/* +Copyright 2012 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding + +import org.scalacheck.Arbitrary +import org.scalacheck.Properties +import org.scalacheck.Prop.forAll +import org.scalacheck.Gen.choose +import org.scalacheck.Prop._ + +import java.util.TimeZone + +object GlobifierProperties extends Properties("Globifier Properties") { + + implicit def dateParser: DateParser = DateParser.default + implicit def tz: TimeZone = TimeZone.getTimeZone("UTC") + + implicit val hourArb: Arbitrary[Hours] = + Arbitrary(choose(0, 10000).map(Hours(_))) + + implicit val dayArb: Arbitrary[Days] = + Arbitrary(choose(0, 100).map(Days(_))) + + implicit val yearArb: Arbitrary[Years] = + Arbitrary(choose(0, 100).map(Years(_))) + + implicit val richDateArb: Arbitrary[RichDate] = Arbitrary { + for (v <- choose(0L, 1L << 32)) yield RichDate(v) + } + + lazy val globifierOps = GlobifierOps() + + def testHrDr(dr: DateRange): Boolean = { + val resultantDR = globifierOps.hourlyRtGlobifier(dr) + val resultantWithNormalized = globifierOps.hourlyRtGlobifier(globifierOps.normalizeHrDr(dr)) + + val res = globifierOps.normalizeHrDr(dr) == globifierOps.normalizeHrDr(resultantDR) && + globifierOps.normalizeHrDr(dr) == globifierOps.normalizeHrDr(resultantWithNormalized) + + if (!res) { + println("Input dr: " + dr) + println("resulting dr: " + resultantDR) + println("resulting dr with pre-normalize: " + resultantWithNormalized) + } + res + } + + // Laws to ensure we can round trip through the hour patterned globifier + property("HR Globifier with hour deltas RT's") = forAll { (rndRD: RichDate, delta: Hours) => + val rd: RichDate = Hours(1).addTo(Hours(1).floorOf(rndRD)) + val dr = DateRange(rndRD, rd + delta) + testHrDr(dr) + } + property("HR Globifier with Day deltas RT's") = forAll { (rndRD: RichDate, delta: Days) => + val rd: RichDate = Days(1).addTo(Days(1).floorOf(rndRD)) + val dr = DateRange(rndRD, rd + delta) + testHrDr(dr) + } + + property("HR Globifier with Year deltas RT's") = forAll { (rndRD: RichDate, delta: Years) => + 
val rd: RichDate = Years(1).addTo(Years(1).floorOf(rndRD)) + val dr = DateRange(rndRD, rd + delta) + testHrDr(dr) + } + + def testDayDr(dr: DateRange): Boolean = { + val resultantDR = globifierOps.dailyRtGlobifier(dr) + val resultantWithNormalized = globifierOps.dailyRtGlobifier(globifierOps.normalizeDayDr(dr)) + + val res = globifierOps.normalizeDayDr(dr) == globifierOps.normalizeDayDr(resultantDR) && + globifierOps.normalizeDayDr(dr) == globifierOps.normalizeDayDr(resultantWithNormalized) + + if (!res) { + println("Input dr: " + dr) + println("resulting dr: " + resultantDR) + println("resulting dr with pre-normalize: " + resultantWithNormalized) + } + res + } + // Laws to ensure we can round trip through the day patterned globifier + property("Day Globifier with hour deltas RT's") = forAll { (rndRD: RichDate, delta: Hours) => + val rd: RichDate = Hours(1).addTo(Hours(1).floorOf(rndRD)) + val dr = DateRange(rndRD, rd + delta) + testDayDr(dr) + } + property("Day Globifier with Day deltas RT's") = forAll { (rndRD: RichDate, delta: Days) => + val rd: RichDate = Days(1).addTo(Days(1).floorOf(rndRD)) + val dr = DateRange(rndRD, rd + delta) + testDayDr(dr) + } + + property("Day Globifier with Year deltas RT's") = forAll { (rndRD: RichDate, delta: Years) => + val rd: RichDate = Years(1).addTo(Years(1).floorOf(rndRD)) + val dr = DateRange(rndRD, rd + delta) + testDayDr(dr) + } +} diff --git a/scalding-db/README.md b/scalding-db/README.md new file mode 100644 index 0000000000..c97ab895d4 --- /dev/null +++ b/scalding-db/README.md @@ -0,0 +1,82 @@ +## Scalding JDBC Macros + +Provides macros to interop between Scala case classes and relational database / SQL column definitions. + +For a case class T, the macro-generated `ColumnDefinitionProvider[T]` provides: +1. `ColumnDefinition`s for the corresponding DB table columns +2. 
`ResultSetExtractor[T]` for extracting records from `java.sql.ResultSet` into objects of type `T` + +Also provided are `TupleConverter`, `TupleSetter` and `cascading.tuple.Fields` for use with Cascading. +`DBTypeDescriptor[T]` is the top-level class that contains all of the above. + +### Illustration + +(in the REPL) + +Necessary imports: + + scalding> import com.twitter.scalding.db_ + scalding> import com.twitter.scalding.db.macros._ + +Case class representing your DB schema: + + scalding> case class ExampleDBRecord( + | card_id: Long, + | tweet_id: Long, + | created_at: Option[java.util.Date], + | deleted: Boolean = false) + defined class ExampleDBRecord + +Get the macro-generated converters: + + scalding> val dbTypeInfo = implicitly[DBTypeDescriptor[ExampleDBRecord]] + dbTypeInfo: com.twitter.scalding.db.DBTypeDescriptor[ExampleDBRecord] = $anon$6@7b07168 + + scalding> val columnDefn = dbTypeInfo.columnDefn + columnDefn: com.twitter.scalding.db.ColumnDefinitionProvider[ExampleDBRecord] = $anon$6$$anon$2@53328a4f + +Macro-generated SQL column definitions: + + scalding> columnDefn.columns + res0: Iterable[com.twitter.scalding.db.ColumnDefinition] = + List( + ColumnDefinition(BIGINT,ColumnName(card_id),NotNullable,None,None), + ColumnDefinition(BIGINT,ColumnName(tweet_id),NotNullable,None,None), + ColumnDefinition(DATETIME,ColumnName(created_at),Nullable,None,None), + ColumnDefinition(BOOLEAN,ColumnName(deleted),NotNullable,None,Some(false)) + ) + +Macro-generated Cascading fields: + + scalding> dbTypeInfo.fields + res1: cascading.tuple.Fields = 'card_id', 'tweet_id', 'created_at', 'deleted | long, long, Date, boolean + + +### Supported Mappings + +Scala type | SQL type +------------- | ------------- +`Int` | `INTEGER` +`Long` | `BIGINT` +`Short` | `SMALLINT` +`Double` | `DOUBLE` +`@varchar @size(20) String `| `VARCHAR(20)` +`@text String` | `TEXT` +`java.util.Date` | `DATETIME` +`@date java.util.Date` | `DATE` +`Boolean` | `BOOLEAN` + | (`BOOLEAN` is used if 
creating a new table at write time, but `BOOL` and `TINYINT` are also supported for reading existing columns) + +* Annotations are used for String types to clearly distinguish between TEXT and VARCHAR column types +* Scala `Option`s can be used to denote columns that are `NULLABLE` in the DB +* `java.lang.*` types are not supported. For e.g. `Integer` (`java.lang.Integer`) does not work + +## Nested case class + +Nested case classes can be used as a workaround for the 22-size limitation. It can also be used for logically grouping the table columns. Nested case classes are flattened in left to right order. For example: +```scala +case class Person(id: Long, name: String, location: Location) +case class Location(geo: GeoCode, doorNum: Int, street: String, city: String) +case class GeoCode(lat: Long, lng: Long) +``` +is flattened to a table schema with columns `id`, `name`, `lat`, `lng`, `doorNum`, `street`, `city`. diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala new file mode 100644 index 0000000000..adf80b11b6 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefiner.scala @@ -0,0 +1,121 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding.db + +sealed trait SqlType +case object BIGINT extends SqlType +case object INT extends SqlType +case object SMALLINT extends SqlType +case object TINYINT extends SqlType +case object BOOLEAN extends SqlType +case object VARCHAR extends SqlType +case object DATE extends SqlType +case object DATETIME extends SqlType +case object TEXT extends SqlType +case object BLOB extends SqlType +case object DOUBLE extends SqlType + +object IsNullable { + def apply(isNullable: Boolean): IsNullable = if (isNullable) Nullable else NotNullable +} + +sealed abstract class IsNullable(val toStr: String) +case object Nullable extends IsNullable("NULL") +case object NotNullable extends IsNullable("NOT NULL") + +trait ColumnDefiner { + // Some helper methods that we can use to generate column definitions + protected def bigint( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(BIGINT, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def int( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(INT, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def smallint( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(SMALLINT, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def tinyint( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(TINYINT, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def boolean( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(BOOLEAN, ColumnName(name), nullable, sizeOpt, defaultValue) + + 
protected def varchar( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(VARCHAR, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def date( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(DATE, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def datetime( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(DATETIME, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def text( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(TEXT, ColumnName(name), nullable, sizeOpt, defaultValue) + + protected def double( + name: String, + nullable: IsNullable = NotNullable, + sizeOpt: Option[Int] = None, + defaultValue: Option[String] = None + ) = + ColumnDefinition(DOUBLE, ColumnName(name), nullable, sizeOpt, defaultValue) +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala new file mode 100644 index 0000000000..20c738de3d --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/ColumnDefinition.scala @@ -0,0 +1,42 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.scalding.db + +import com.twitter.scalding.TupleConverter + +case class ColumnName(toStr: String) extends AnyVal +case class SqlTypeName(toStr: String) extends AnyVal + +case class ColumnDefinition( + jdbcType: SqlType, + name: ColumnName, + nullable: IsNullable, + sizeOpt: Option[Int], + defaultValue: Option[String] +) extends Serializable + +trait ColumnDefinitionProvider[T] extends Serializable { + def columns: Iterable[ColumnDefinition] + def resultSetExtractor: ResultSetExtractor[T] +} + +class JdbcValidationException(msg: String) extends RuntimeException(msg) + +trait ResultSetExtractor[T] { + def validate(rsmd: java.sql.ResultSetMetaData): scala.util.Try[Unit] + def toCaseClass(rs: java.sql.ResultSet, c: TupleConverter[T]): T +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala new file mode 100644 index 0000000000..1d37e15f25 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/DBColumnTransformer.scala @@ -0,0 +1,78 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding.db + +// String form of a Column definition to be understood by a db +case class Definition(toStr: String) extends AnyVal + +object DBColumnDefinition { + def apply(col: ColumnDefinition): DBColumnDefinition = DBColumnDefinition( + col.jdbcType, + col.name, + col.nullable, + col.sizeOpt, + col.defaultValue, + SqlTypeName(col.jdbcType.toString) + ) +} + +case class DBColumnDefinition( + jdbcType: SqlType, + name: ColumnName, + nullable: IsNullable, + sizeOpt: Option[Int], + defaultValue: Option[String], + sqlType: SqlTypeName +) + +object DBColumnTransformer { + def columnDefnToDefinition( + col: ColumnDefinition, + columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] + ): Definition = { + val preparedCol = columnMutator(DBColumnDefinition(col)) + val sizeStr = preparedCol.sizeOpt.map(siz => s"($siz)").getOrElse("") + val defStr = preparedCol.defaultValue.map(default => s" DEFAULT '$default' ").getOrElse(" ") + val sqlType = preparedCol.sqlType.toStr + + Definition(sqlType + sizeStr + defStr + preparedCol.nullable.toStr) + } + + private def defaultColumnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] = { + case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = Some(20)) + case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = Some(11)) + case t @ DBColumnDefinition(SMALLINT, _, _, None, _, _) => t.copy(sizeOpt = Some(6)) + case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = Some(6)) + case t @ DBColumnDefinition(VARCHAR, _, _, None, _, _) => t.copy(sizeOpt = Some(255)) + case t => t + } + + def mutateColumns( + columnMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition], + columns: Iterable[ColumnDefinition] + ): Iterable[DBColumnDefinition] = + columns.map(c => columnMutator.orElse(defaultColumnMutator)(DBColumnDefinition(c))) + + def columnDefnsToCreate( + columnMutator: PartialFunction[DBColumnDefinition, 
DBColumnDefinition], + columns: Iterable[ColumnDefinition] + ): Iterable[Definition] = + columns.map(c => columnDefnToDefinition(c, columnMutator.orElse(defaultColumnMutator))) + + def columnDefnsToCreate(columns: Iterable[ColumnDefinition]): Iterable[Definition] = + columns.map(c => columnDefnToDefinition(c, defaultColumnMutator)) +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala new file mode 100644 index 0000000000..140001c6a5 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/DBOptions.scala @@ -0,0 +1,56 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +package com.twitter.scalding.db + +// Schema name in database, used for vertica currently +case class SchemaName(toStr: String) extends AnyVal +// Table name in database +case class TableName(toStr: String) extends AnyVal +// Jdbc style connection url, e.g.: "jdbc:mysql://mysql01.company.com:3306/production" +case class ConnectUrl(toStr: String) extends AnyVal +// Username for the database +case class UserName(toStr: String) extends AnyVal +// Password for the database +case class Password(toStr: String) extends AnyVal { + override def toString: String = super.toString +} +// The adapter to use +case class Adapter(toStr: String) extends AnyVal +// Hadoop path string. Can be absolute path or complete URI. 
+case class HadoopUri(toStr: String) extends AnyVal +// Sql query string +case class SqlQuery(toStr: String) extends AnyVal +// java.nio.charset types are not serializable, so we define our own +case class StringEncoding(toStr: String) extends AnyVal + +/** + * Pass your DB credentials to this class in a preferred secure way + */ +case class ConnectionConfig( + connectUrl: ConnectUrl, + userName: UserName, + password: Password, + adapter: Adapter, + encoding: StringEncoding +) + +case class Database(toStr: String) extends AnyVal + +case class AvailableDatabases(m: Map[Database, ConnectionConfig] = Map()) { + def get(d: Database) = m.get(d) + def apply(d: Database) = m.apply(d) +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala new file mode 100644 index 0000000000..fc5a0066cf --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/DBTypeDescriptor.scala @@ -0,0 +1,27 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding.db +import com.twitter.scalding._ +import cascading.tuple.Fields + +trait DBTypeDescriptor[T] { + def columnDefn: ColumnDefinitionProvider[T] + def converter: TupleConverter[T] + def setter: TupleSetter[T] + def fields: Fields + def jdbcSetter: JdbcStatementSetter[T] +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/JdbcStatementSetter.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/JdbcStatementSetter.scala new file mode 100644 index 0000000000..12381cdeee --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/JdbcStatementSetter.scala @@ -0,0 +1,11 @@ +package com.twitter.scalding.db + +import java.sql.PreparedStatement +import scala.util.Try + +/** + * Case class to JDBC statement setter used for database writes + */ +trait JdbcStatementSetter[T] extends java.io.Serializable { self => + def apply(t: T, s: PreparedStatement): Try[PreparedStatement] +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala new file mode 100644 index 0000000000..c1d9904fa1 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/extensions/VerticaExtensions.scala @@ -0,0 +1,30 @@ +/* +Copyright 2015 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ + +package com.twitter.scalding.db.extensions + +import com.twitter.scalding.db._ + +object VerticaExtensions { + def verticaMutator: PartialFunction[DBColumnDefinition, DBColumnDefinition] = { + case t @ DBColumnDefinition(BIGINT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(INT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(SMALLINT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(BOOLEAN, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(TINYINT, _, _, None, _, _) => t.copy(sizeOpt = None) + case t @ DBColumnDefinition(DOUBLE, _, _, _, _, _) => t.copy(sqlType = SqlTypeName("DOUBLE PRECISION")) + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala new file mode 100644 index 0000000000..5b055eda11 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/DBMacro.scala @@ -0,0 +1,35 @@ +package com.twitter.scalding.db.macros + +import scala.language.experimental.macros +import com.twitter.scalding.db.macros.impl._ +import com.twitter.scalding.db.{ColumnDefinitionProvider, DBTypeDescriptor} + +// This is the sealed base trait for scala runtime annotiations used by the JDBC macros. +// These will read from these macros as a means to annotate fields to make up for the missing +// extra type information JDBC wants but is not in the jvm types. 
+sealed trait ScaldingDBAnnotation + +// This is the size in characters for a char field +// For integers its really for display purposes +@scala.annotation.meta.getter +final class size(val size: Int) extends annotation.StaticAnnotation with ScaldingDBAnnotation + +// JDBC TEXT type, this forces the String field in question to be a text type +@scala.annotation.meta.getter +final class text() extends annotation.StaticAnnotation with ScaldingDBAnnotation + +// JDBC VARCHAR type, this forces the String field in question to be a text type +@scala.annotation.meta.getter +final class varchar() extends annotation.StaticAnnotation with ScaldingDBAnnotation + +// JDBC DATE type, this toggles a java.util.Date field to be JDBC Date. +// It will default to DATETIME to preserve the full resolution of java.util.Date +@scala.annotation.meta.getter +final class date() extends annotation.StaticAnnotation with ScaldingDBAnnotation + +// This is the entry point to explicitly calling the JDBC macros. +// Most often the implicits will be used in the package however +object DBMacro { + def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = macro ColumnDefinitionProviderImpl[T] + def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala new file mode 100644 index 0000000000..4e1f5bdde9 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/ColumnDefinitionProviderImpl.scala @@ -0,0 +1,365 @@ +package com.twitter.scalding.db.macros.impl + +import scala.annotation.tailrec +import scala.reflect.macros.Context +import scala.util.{Failure, Success} + +import com.twitter.bijection.macros.impl.IsCaseClassImpl +import com.twitter.scalding.db.{ColumnDefinition, ColumnDefinitionProvider, ResultSetExtractor} +import 
com.twitter.scalding.db.macros.impl.handler._ + +// Simple wrapper to pass around the string name format of fields +private[impl] case class FieldName(toStr: String) { + override def toString = toStr +} + +object ColumnDefinitionProviderImpl { + + // Takes a type and its companion objects apply method + // based on the args it takes gives back out a field name to symbol + private[this] def getDefaultArgs(c: Context)(tpe: c.Type): Map[String, c.Expr[String]] = { + import c.universe._ + val classSym = tpe.typeSymbol + val moduleSym = classSym.companionSymbol + if (moduleSym == NoSymbol) { + c.abort( + c.enclosingPosition, + s"No companion for case class $tpe available. Possibly a nested class? These do not work with this macro." + ) + } + // pick the last apply method which (anecdotally) gives us the defaults + // set in the case class declaration, not the companion object + val applyList = moduleSym.typeSignature.declaration(newTermName("apply")).asTerm.alternatives + val apply = applyList.last.asMethod + // can handle only default parameters from the first parameter list + // because subsequent parameter lists might depend on previous parameters + apply.paramss.head + .map(_.asTerm) + .zipWithIndex + .flatMap { case (p, i) => + if (!p.isParamWithDefault) None + else { + val getterName = newTermName("apply$default$" + (i + 1)) + Some(p.name.toString -> c.Expr(q"$moduleSym.$getterName.toString")) + } + } + .toMap + } + + private[scalding] def getColumnFormats[T]( + c: Context + )(implicit T: c.WeakTypeTag[T]): List[ColumnFormat[c.type]] = { + import c.universe._ + + if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) + c.abort( + c.enclosingPosition, + s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. 
+ This will mean the macro is operating on a non-resolved type.""" + ) + + // Field To JDBCColumn + @tailrec + def matchField( + accessorTree: List[MethodSymbol], + oTpe: Type, + fieldName: FieldName, + defaultValOpt: Option[c.Expr[String]], + annotationInfo: List[(Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = + oTpe match { + // String handling + case tpe if tpe =:= typeOf[String] => + StringTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) + case tpe if tpe =:= typeOf[Array[Byte]] => + BlobTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) + case tpe if tpe =:= typeOf[Byte] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "TINYINT") + case tpe if tpe =:= typeOf[Short] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "SMALLINT") + case tpe if tpe =:= typeOf[Int] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "INT") + case tpe if tpe =:= typeOf[Long] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "BIGINT") + case tpe if tpe =:= typeOf[Double] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "DOUBLE") + case tpe if tpe =:= typeOf[Boolean] => + NumericTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable, "BOOLEAN") + case tpe if tpe =:= typeOf[java.util.Date] => + DateTypeHandler(c)(accessorTree, fieldName, defaultValOpt, annotationInfo, nullable) + case tpe if tpe.erasure =:= typeOf[Option[Any]] && nullable == true => + Failure( + new Exception( + s"Case class ${T.tpe} has field $fieldName which contains a nested option. This is not supported by this macro." 
+ ) + ) + + case tpe if tpe.erasure =:= typeOf[Option[Any]] && nullable == false => + if (defaultValOpt.isDefined) + Failure( + new Exception( + s"Case class ${T.tpe} has field $fieldName: ${oTpe.toString}, with a default value. Options cannot have default values" + ) + ) + else { + matchField( + accessorTree, + tpe.asInstanceOf[TypeRefApi].args.head, + fieldName, + None, + annotationInfo, + true + ) + } + case tpe if IsCaseClassImpl.isCaseClassType(c)(tpe) => expandMethod(accessorTree, tpe) + + // default + case _ => + Failure( + new Exception( + s"Case class ${T.tpe} has field $fieldName: ${oTpe.toString}, which is not supported for talking to JDBC" + ) + ) + } + + def expandMethod( + outerAccessorTree: List[MethodSymbol], + outerTpe: Type + ): scala.util.Try[List[ColumnFormat[c.type]]] = { + val defaultArgs = getDefaultArgs(c)(outerTpe) + + // Intializes the type info + outerTpe.declarations.foreach(_.typeSignature) + + // We have to build this up front as if the case class definition moves to another file + // the annotation moves from the value onto the getter method? 
+ val annotationData: Map[String, List[(Type, List[Tree])]] = outerTpe.declarations + .map { m => + val mappedAnnotations = m.annotations.map(t => (t.tpe, t.scalaArgs)) + m.name.toString.trim -> mappedAnnotations + } + .groupBy(_._1) + .map { case (k, l) => + (k, l.map(_._2).reduce(_ ++ _)) + } + .filter { case (_, v) => + v.nonEmpty + } + + outerTpe.declarations + .collect { case m: MethodSymbol if m.isCaseAccessor => m } + .map { m => + val fieldName = m.name.toString.trim + val defaultVal = defaultArgs.get(fieldName) + + val annotationInfo: List[(Type, Option[Int])] = annotationData + .getOrElse(m.name.toString.trim, Nil) + .collect { + case (tpe, List(Literal(Constant(siz: Int)))) + if tpe =:= typeOf[com.twitter.scalding.db.macros.size] => + (tpe, Some(siz)) + case (tpe, _) if tpe =:= typeOf[com.twitter.scalding.db.macros.size] => + c.abort( + c.enclosingPosition, + "Hit a size macro where we couldn't parse the value. Probably not a literal constant. Only literal constants are supported." + ) + case (tpe, _) if tpe <:< typeOf[com.twitter.scalding.db.macros.ScaldingDBAnnotation] => + (tpe, None) + } + + matchField( + outerAccessorTree :+ m, + m.returnType, + FieldName(fieldName), + defaultVal, + annotationInfo, + false + ) + } + .toList + // This algorithm returns the error from the first exception we run into. + .foldLeft(scala.util.Try[List[ColumnFormat[c.type]]](Nil)) { case (pTry, nxt) => + (pTry, nxt) match { + case (Success(l), Success(r)) => Success(l ::: r) + case (f @ Failure(_), _) => f + case (_, f @ Failure(_)) => f + } + } + } + + val formats = expandMethod(Nil, T.tpe) match { + case Success(s) => s + case Failure(e) => (c.abort(c.enclosingPosition, e.getMessage)) + } + + val duplicateFields = formats + .map(_.fieldName) + .groupBy(identity) + .filter(_._2.size > 1) + .keys + + if (duplicateFields.nonEmpty) { + c.abort( + c.enclosingPosition, + s""" + Duplicate field names found: ${duplicateFields.mkString(",")}. 
+ Please check your nested case classes. + """ + ) + } else { + formats + } + } + + def getColumnDefn[T](c: Context)(implicit T: c.WeakTypeTag[T]): List[c.Expr[ColumnDefinition]] = { + import c.universe._ + + val columnFormats = getColumnFormats[T](c) + + columnFormats.map { case cf: ColumnFormat[_] => + val nullableVal = + if (cf.nullable) + q"_root_.com.twitter.scalding.db.Nullable" + else + q"_root_.com.twitter.scalding.db.NotNullable" + val fieldTypeSelect = Select(q"_root_.com.twitter.scalding.db", newTermName(cf.fieldType)) + val res = q"""new _root_.com.twitter.scalding.db.ColumnDefinition( + $fieldTypeSelect, + _root_.com.twitter.scalding.db.ColumnName(${cf.fieldName.toStr}), + $nullableVal, + ${cf.sizeOpt}, + ${cf.defaultValue}) + """ + c.Expr[ColumnDefinition](res) + } + } + + def getExtractor[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[ResultSetExtractor[T]] = { + import c.universe._ + + val columnFormats = getColumnFormats[T](c) + + val rsmdTerm = newTermName(c.fresh("rsmd")) + // we validate two things from ResultSetMetadata + // 1. the column types match with actual DB schema + // 2. 
all non-nullable fields are indeed non-nullable in DB schema + val checks = columnFormats.zipWithIndex.map { case (cf: ColumnFormat[_], pos: Int) => + val fieldName = cf.fieldName.toStr + val typeNameTerm = newTermName(c.fresh(s"colTypeName_$pos")) + // MySQL uses names like `DATE`, `INTEGER` and `VARCHAR`; + // Vertica uses names like `Date`, `Integer` and `Varchar` + val typeName = q""" + val $typeNameTerm = $rsmdTerm.getColumnTypeName(${pos + 1}).toUpperCase(java.util.Locale.US) + """ + // certain types have synonyms, so we group them together here + // note: this is mysql specific + // http://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html + val typeValidation = cf.fieldType match { + case "VARCHAR" => q"""List("VARCHAR", "CHAR").contains($typeNameTerm)""" + case "BOOLEAN" | "TINYINT" => q"""List("BOOLEAN", "BOOL", "TINYINT").contains($typeNameTerm)""" + case "INT" => q"""List("INTEGER", "INT").contains($typeNameTerm)""" + // In Vertica, `INTEGER`, `INT`, `BIGINT`, `INT8`, `SMALLINT`, and `TINYINT` are all 64 bits + // https://www.vertica.com/docs/8.1.x/HTML/index.htm#Authoring/SQLReferenceManual/DataTypes/Numeric/INTEGER.htm + // In MySQL, `TINYINT`, `SMALLINT`, `MEDIUMINT`, `INT`, and `BIGINT` are all <= 64 bits + // https://dev.mysql.com/doc/refman/5.7/en/integer-types.html + // As the user has told us this field can store a `BIGINT`, we can safely accept any of these + // types from the database. + case "BIGINT" => + q"""List("INTEGER", "INT", "BIGINT", "INT8", "SMALLINT", + "TINYINT", "SMALLINT", "MEDIUMINT").contains($typeNameTerm)""" + case "DATETIME" => q"""List("DATE","DATETIME","TIMESTAMP").contains($typeNameTerm)""" + case f => q"""$f == $typeNameTerm""" + } + val typeAssert = q""" + if (!$typeValidation) { + throw new _root_.com.twitter.scalding.db.JdbcValidationException( + "Mismatched type for column '" + $fieldName + "'. 
Expected " + ${cf.fieldType} + + " but set to " + $typeNameTerm + " in DB.") + } + """ + val nullableTerm = newTermName(c.fresh(s"isNullable_$pos")) + val nullableValidation = q""" + val $nullableTerm = $rsmdTerm.isNullable(${pos + 1}) + if ($nullableTerm == _root_.java.sql.ResultSetMetaData.columnNoNulls && ${cf.nullable}) { + throw new _root_.com.twitter.scalding.db.JdbcValidationException( + "Column '" + $fieldName + "' is not nullable in DB.") + } + """ + q""" + $typeName + $typeAssert + $nullableValidation + """ + } + + val rsTerm = newTermName(c.fresh("rs")) + val formats = columnFormats.map { + case cf: ColumnFormat[_] => { + val fieldName = cf.fieldName.toStr + // java boxed types needed below to populate cascading's Tuple + val (box: Option[Tree], primitiveGetter: Tree) = cf.fieldType match { + case "VARCHAR" | "TEXT" => + (None, q"""$rsTerm.getString($fieldName)""") + case "BOOLEAN" => + (Some(q"""_root_.java.lang.Boolean.valueOf"""), q"""$rsTerm.getBoolean($fieldName)""") + case "TINYINT" => + (Some(q"""_root_.java.lang.Byte.valueOf"""), q"""$rsTerm.getByte($fieldName)""") + case "DATE" | "DATETIME" => + ( + None, + q"""Option($rsTerm.getTimestamp($fieldName)).map { ts => new java.util.Date(ts.getTime) }.orNull""" + ) + // dates set to null are populated as None by tuple converter + // if the corresponding case class field is an Option[Date] + case "DOUBLE" => + (Some(q"""_root_.java.lang.Double.valueOf"""), q"""$rsTerm.getDouble($fieldName)""") + case "BIGINT" => + (Some(q"""_root_.java.lang.Long.valueOf"""), q"""$rsTerm.getLong($fieldName)""") + case "INT" | "SMALLINT" => + (Some(q"""_root_.java.lang.Integer.valueOf"""), q"""$rsTerm.getInt($fieldName)""") + case "BLOB" => + ( + None, + q"""Option($rsTerm.getBlob($fieldName)).map ( blob => blob.getBytes(1,blob.length().toInt)).orNull """ + ) + case f => + (None, q"""sys.error("Invalid format " + $f + " for " + $fieldName)""") + } + // note: UNSIGNED BIGINT is currently unsupported + val valueTerm = 
newTermName(c.fresh("colValue")) + val boxed = box.map(b => q"""$b($valueTerm)""").getOrElse(q"""$valueTerm""") + // primitiveGetter needs to be invoked before we can use wasNull + // to check if the column value that was read is null or not + q""" + { val $valueTerm = $primitiveGetter; if ($rsTerm.wasNull) null else $boxed } + """ + } + } + val tcTerm = newTermName(c.fresh("conv")) + val res = q""" + new _root_.com.twitter.scalding.db.ResultSetExtractor[$T] { + def validate($rsmdTerm: _root_.java.sql.ResultSetMetaData): _root_.scala.util.Try[Unit] = _root_.scala.util.Try { ..$checks } + def toCaseClass($rsTerm: java.sql.ResultSet, $tcTerm: _root_.com.twitter.scalding.TupleConverter[$T]): $T = + $tcTerm(new _root_.cascading.tuple.TupleEntry(new _root_.cascading.tuple.Tuple(..$formats))) + } + """ + // ResultSet -> TupleEntry -> case class + c.Expr[ResultSetExtractor[T]](res) + } + + def apply[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[ColumnDefinitionProvider[T]] = { + import c.universe._ + + val columns = getColumnDefn[T](c) + val resultSetExtractor = getExtractor[T](c) + + val res = q""" + new _root_.com.twitter.scalding.db.ColumnDefinitionProvider[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { + override val columns = List(..$columns) + override val resultSetExtractor = $resultSetExtractor + } + """ + c.Expr[ColumnDefinitionProvider[T]](res) + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala new file mode 100644 index 0000000000..e0ccfdd710 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/DBTypeDescriptorImpl.scala @@ -0,0 +1,38 @@ +package com.twitter.scalding.db.macros.impl + +import scala.reflect.macros.Context + +import com.twitter.bijection.macros.impl.IsCaseClassImpl +import com.twitter.scalding.macros.impl.{FieldsProviderImpl, TupleConverterImpl, 
TupleSetterImpl} +import com.twitter.scalding.db.DBTypeDescriptor + +object DBTypeDescriptorImpl { + + def apply[T](c: Context)(implicit T: c.WeakTypeTag[T]): c.Expr[DBTypeDescriptor[T]] = { + import c.universe._ + + if (!IsCaseClassImpl.isCaseClassType(c)(T.tpe)) + c.abort( + c.enclosingPosition, + s"""We cannot enforce ${T.tpe} is a case class, either it is not a case class or this macro call is possibly enclosed in a class. + This will mean the macro is operating on a non-resolved type.""" + ) + + val columnDefn = ColumnDefinitionProviderImpl[T](c) + val converter = TupleConverterImpl.caseClassTupleConverterWithUnknownImpl[T](c) + val setter = TupleSetterImpl.caseClassTupleSetterWithUnknownImpl[T](c) + val jdbcSetter = JdbcStatementSetterImpl.caseClassJdbcSetterCommonImpl[T](c, true) + val fields = FieldsProviderImpl.toFieldsWithUnknownNoPrefixImpl[T](c) + + val res = q""" + new _root_.com.twitter.scalding.db.DBTypeDescriptor[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { + override val columnDefn = $columnDefn + override val converter = $converter + override val setter = $setter + override val fields = $fields + override val jdbcSetter = $jdbcSetter + } + """ + c.Expr[DBTypeDescriptor[T]](res) + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala new file mode 100644 index 0000000000..0752f9ab71 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcFieldSetter.scala @@ -0,0 +1,57 @@ +/* + Copyright 2015 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.db.macros.impl + +import scala.reflect.macros.Context +import scala.util.Try + +import com.twitter.scalding.macros.impl.CaseClassFieldSetter + +/** + * Helper class for setting case class fields in java.sql.Statement + */ +private[macros] object JdbcFieldSetter extends CaseClassFieldSetter { + + override def absent(c: Context)(idx: Int, container: c.TermName): c.Tree = { + import c.universe._ + q"""$container.setObject($idx + 1, null)""" + } + + override def default(c: Context)(idx: Int, container: c.TermName, fieldValue: c.Tree): c.Tree = { + import c.universe._ + q"""$container.setObject($idx + 1, $fieldValue)""" + } + + override def from( + c: Context + )(fieldType: c.Type, idx: Int, container: c.TermName, fieldValue: c.Tree): Try[c.Tree] = Try { + import c.universe._ + + // jdbc Statement indexes are one-based, hence +1 here + def simpleType(accessor: Tree) = q"""$accessor(${idx + 1}, $fieldValue)""" + + fieldType match { + case tpe if tpe =:= typeOf[String] => simpleType(q"$container.setString") + case tpe if tpe =:= typeOf[Boolean] => simpleType(q"$container.setBoolean") + case tpe if tpe =:= typeOf[Short] => simpleType(q"$container.setShort") + case tpe if tpe =:= typeOf[Int] => simpleType(q"$container.setInt") + case tpe if tpe =:= typeOf[Long] => simpleType(q"$container.setLong") + case tpe if tpe =:= typeOf[Float] => simpleType(q"$container.setFloat") + case tpe if tpe =:= typeOf[Double] => simpleType(q"$container.setDouble") + case _ => sys.error(s"Unsupported primitive type $fieldType") + } + } +} diff --git 
a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala new file mode 100644 index 0000000000..2d5d963c0b --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/JdbcStatementSetterImpl.scala @@ -0,0 +1,45 @@ +/* + Copyright 2015 Twitter, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ +package com.twitter.scalding.db.macros.impl + +import scala.reflect.macros.Context + +import com.twitter.scalding.macros.impl.CaseClassBasedSetterImpl +import com.twitter.scalding.db.JdbcStatementSetter + +/** + * Generates JDBC PreparedStatement data from case class + */ +private[macros] object JdbcStatementSetterImpl { + + def caseClassJdbcSetterCommonImpl[T](c: Context, allowUnknownTypes: Boolean)(implicit + T: c.WeakTypeTag[T] + ): c.Expr[JdbcStatementSetter[T]] = { + import c.universe._ + + val stmtTerm = newTermName(c.fresh("stmt")) + val (_, setterTerm) = CaseClassBasedSetterImpl(c)(stmtTerm, allowUnknownTypes, JdbcFieldSetter) + val res = q""" + new _root_.com.twitter.scalding.db.JdbcStatementSetter[$T] with _root_.com.twitter.bijection.macros.MacroGenerated { + override def apply(t: $T, $stmtTerm: _root_.java.sql.PreparedStatement) = _root_.scala.util.Try { + $setterTerm + $stmtTerm + } + } + """ + c.Expr[JdbcStatementSetter[T]](res) + } +} diff --git 
a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala new file mode 100644 index 0000000000..ce02769e97 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/AnnotationHelper.scala @@ -0,0 +1,74 @@ +package com.twitter.scalding.db.macros.impl.handler + +import scala.reflect.macros.Context +import scala.util.{Failure, Success} + +import com.twitter.scalding.db.macros.impl.FieldName + +private[handler] sealed trait SizeAnno +private[handler] final case class WithSize(v: Int) extends SizeAnno +private[handler] case object WithoutSize extends SizeAnno + +private[handler] sealed trait DateAnno +private[handler] case object WithDate extends DateAnno +private[handler] case object WithoutDate extends DateAnno + +private[handler] sealed trait VarcharAnno +private[handler] case object WithVarchar extends VarcharAnno +private[handler] case object WithoutVarchar extends VarcharAnno + +private[handler] sealed trait TextAnno +private[handler] case object WithText extends TextAnno +private[handler] case object WithoutText extends TextAnno + +private[handler] abstract class AnnotationHelper { + val ctx: Context + val cfieldName: FieldName + val cannotationInfo: List[(ctx.universe.Type, Option[Int])] + import ctx.universe._ + + def sizeAnnotation: scala.util.Try[(AnnotationHelper, SizeAnno)] = + consume[SizeAnno](typeOf[com.twitter.scalding.db.macros.size])( + _.flatten.map(o => WithSize(o)).getOrElse(WithoutSize) + ) + + def textAnnotation: scala.util.Try[(AnnotationHelper, TextAnno)] = + consume(typeOf[com.twitter.scalding.db.macros.text])(_.map(_ => WithText).getOrElse(WithoutText)) + + def varcharAnnotation: scala.util.Try[(AnnotationHelper, VarcharAnno)] = + consume(typeOf[com.twitter.scalding.db.macros.varchar])(_.map(_ => WithVarchar).getOrElse(WithoutVarchar)) + + def dateAnnotation: 
scala.util.Try[(AnnotationHelper, DateAnno)] = + consume(typeOf[com.twitter.scalding.db.macros.date])(_.map(_ => WithDate).getOrElse(WithoutDate)) + + def consume[T]( + t: ctx.universe.Type + )(fn: Option[Option[Int]] => T): scala.util.Try[(AnnotationHelper, T)] = { + val (matchedAnnotations, remainingAnnotations) = cannotationInfo.partition { case (tpe, _) => + tpe =:= t + } + + val newHelper = new { + val ctx: this.ctx.type = this.ctx + val cfieldName = this.cfieldName + val cannotationInfo: List[(this.ctx.universe.Type, Option[Int])] = remainingAnnotations + } with AnnotationHelper + + matchedAnnotations match { + case h :: Nil => Success((newHelper, fn(Some(h._2)))) + case h :: t => Failure(new Exception(s"Error more than one annotation when looking for $t")) + case Nil => Success((newHelper, fn(None))) + } + } + + def validateFinished: scala.util.Try[Unit] = + if (cannotationInfo.isEmpty) { + Success(()) + } else { + val msg = s""" + Finished consuming annotations for field ${cfieldName.toStr}, but have remaining annotations: + ${cannotationInfo.map(_._1).mkString("\n")} + """ + Failure(new Exception(msg)) + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala new file mode 100644 index 0000000000..4d4e5afb8c --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/BlobTypeHandler.scala @@ -0,0 +1,23 @@ +package com.twitter.scalding.db.macros.impl.handler + +import com.twitter.scalding.db.macros.impl.FieldName +import scala.reflect.macros.Context +import scala.util.{Failure, Success} + +object BlobTypeHandler { + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] 
= + if (defaultValue.nonEmpty || annotationInfo.nonEmpty) + Failure( + new Exception( + s"Default values and annotation info are not supported: defaultValue = $defaultValue annotationInfo = $annotationInfo" + ) + ) + else + Success(List(ColumnFormat(c)(accessorTree, "BLOB", None))) +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala new file mode 100644 index 0000000000..fed2e42935 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/ColumnFormat.scala @@ -0,0 +1,35 @@ +package com.twitter.scalding.db.macros.impl.handler + +import scala.reflect.macros.Context + +import com.twitter.scalding.db.macros.impl.FieldName + +object ColumnFormat { + def apply(c: Context)(fAccessor: List[c.universe.MethodSymbol], fType: String, size: Option[Int])(implicit + fName: FieldName, + isNullable: Boolean, + defaultV: Option[c.Expr[String]] + ): ColumnFormat[c.type] = + new ColumnFormat[c.type](c) { + val fieldAccessor = fAccessor + val fieldType = fType + val fieldName = fName + val nullable = isNullable + val sizeOpt = size + val defaultValue = defaultV + } +} + +/** + * Contains data format information for a column as defined in the case class. + * + * Used by the ColumnDefinitionProvider macro to generate column definitions and JDBC ResultSet extractor. 
+ */ +abstract class ColumnFormat[C <: Context](val ctx: C) { + def fieldAccessor: List[ctx.universe.MethodSymbol] + def fieldType: String + def fieldName: FieldName + def nullable: Boolean + def sizeOpt: Option[Int] + def defaultValue: Option[ctx.Expr[String]] +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala new file mode 100644 index 0000000000..26430c9670 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/DateTypeHandler.scala @@ -0,0 +1,34 @@ +package com.twitter.scalding.db.macros.impl.handler + +import scala.reflect.macros.Context +import scala.util.Success + +import com.twitter.scalding.db.macros.impl.FieldName + +object DateTypeHandler { + + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = { + + val helper = new { + val ctx: c.type = c + val cfieldName = fieldName + val cannotationInfo = annotationInfo + } with AnnotationHelper + + val extracted = for { + (nextHelper, dateAnno) <- helper.dateAnnotation + _ <- nextHelper.validateFinished + } yield dateAnno + + extracted.flatMap { + case WithDate => Success(List(ColumnFormat(c)(accessorTree, "DATE", None))) + case WithoutDate => Success(List(ColumnFormat(c)(accessorTree, "DATETIME", None))) + } + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala new file mode 100644 index 0000000000..492e2fad92 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/NumericTypeHandler.scala @@ -0,0 +1,35 @@ +package 
com.twitter.scalding.db.macros.impl.handler + +import scala.reflect.macros.Context +import scala.util.{Failure, Success} + +import com.twitter.scalding.db.macros.impl.FieldName + +object NumericTypeHandler { + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean, + numericType: String + ): scala.util.Try[List[ColumnFormat[c.type]]] = { + + val helper = new { + val ctx: c.type = c + val cfieldName = fieldName + val cannotationInfo = annotationInfo + } with AnnotationHelper + + val extracted = for { + (nextHelper, sizeAnno) <- helper.sizeAnnotation + _ <- nextHelper.validateFinished + } yield sizeAnno + + extracted.flatMap { + case WithSize(s) if s > 0 => Success(List(ColumnFormat(c)(accessorTree, numericType, Some(s)))) + case WithSize(s) => Failure(new Exception(s"Int field $fieldName, has a size defined that is <= 0.")) + case WithoutSize => Success(List(ColumnFormat(c)(accessorTree, numericType, None))) + } + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala new file mode 100644 index 0000000000..a1777e35b3 --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/macros/impl/handler/StringTypeHandler.scala @@ -0,0 +1,60 @@ +package com.twitter.scalding.db.macros.impl.handler + +import scala.reflect.macros.Context +import scala.util.{Failure, Success} + +import com.twitter.scalding.db.macros.impl.FieldName + +object StringTypeHandler { + def apply[T](c: Context)(implicit + accessorTree: List[c.universe.MethodSymbol], + fieldName: FieldName, + defaultValue: Option[c.Expr[String]], + annotationInfo: List[(c.universe.Type, Option[Int])], + nullable: Boolean + ): scala.util.Try[List[ColumnFormat[c.type]]] = { + + val helper = new { 
+ val ctx: c.type = c + val cfieldName = fieldName + val cannotationInfo = annotationInfo + } with AnnotationHelper + + val extracted = for { + (nextHelper, sizeAnno) <- helper.sizeAnnotation + (nextHelper, varcharAnno) <- nextHelper.varcharAnnotation + (nextHelper, textAnno) <- nextHelper.textAnnotation + _ <- nextHelper.validateFinished + } yield (sizeAnno, varcharAnno, textAnno) + + extracted.flatMap { + case (_, WithVarchar, WithText) => + Failure( + new Exception(s"String field $fieldName, has mutually exclusive annotations @text and @varchar") + ) + case (WithoutSize, WithVarchar, WithoutText) => + Failure( + new Exception( + s"String field $fieldName, is forced varchar but has no size annotation. size is required in the presence of varchar." + ) + ) + case (WithoutSize, WithoutVarchar, WithoutText) => + Failure( + new Exception(s"String field $fieldName, at least one of size, varchar, text must be present.") + ) + case (WithSize(siz), _, _) if siz <= 0 => + Failure( + new Exception( + s"String field $fieldName, has a size $siz which is <= 0. Doesn't make sense for a string." + ) + ) + case (WithSize(siz), WithoutVarchar, WithoutText) if siz <= 255 => + Success(List(ColumnFormat(c)(accessorTree, "VARCHAR", Some(siz)))) + case (WithSize(siz), WithoutVarchar, WithoutText) if siz > 255 => + Success(List(ColumnFormat(c)(accessorTree, "TEXT", None))) + case (WithSize(siz), WithVarchar, WithoutText) => + Success(List(ColumnFormat(c)(accessorTree, "VARCHAR", Some(siz)))) + case (_, WithoutVarchar, WithText) => Success(List(ColumnFormat(c)(accessorTree, "TEXT", None))) + } + } +} diff --git a/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala b/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala new file mode 100644 index 0000000000..6a87b12d6c --- /dev/null +++ b/scalding-db/src/main/scala/com/twitter/scalding/db/package.scala @@ -0,0 +1,29 @@ +/* +Copyright 2015 Twitter, Inc. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.db + +import scala.language.experimental.{macros => sMacros} + +import com.twitter.scalding.db.macros.impl.{ColumnDefinitionProviderImpl, DBTypeDescriptorImpl} + +// The implicits in the jdbc.macro's package +// These are to allow us to auto provide our Type Classes without the user possibly knowing +// all of the various ways we could build it. +package object macros { + implicit def toColumnDefinitionProvider[T]: ColumnDefinitionProvider[T] = + macro ColumnDefinitionProviderImpl[T] + implicit def toDBTypeDescriptor[T]: DBTypeDescriptor[T] = macro DBTypeDescriptorImpl[T] +} diff --git a/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala b/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala new file mode 100644 index 0000000000..15968a11b8 --- /dev/null +++ b/scalding-db/src/test/scala/com/twitter/scalding/db/DBOptionsTest.scala @@ -0,0 +1,11 @@ +package com.twitter.scalding.db + +import org.scalacheck.Properties +import org.scalacheck.Prop._ + +object DBOptionsTest extends Properties("DBOptions") { + property("password") = forAll { x: String => + ("Password toString should not be equal to x" |: Password(x).toString != x) && + ("Password toStr should be equal to x" |: Password(x).toStr == x) + } +} diff --git a/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala b/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala 
new file mode 100644 index 0000000000..bae1a42110 --- /dev/null +++ b/scalding-db/src/test/scala/com/twitter/scalding/db/macros/MacrosUnitTests.scala @@ -0,0 +1,558 @@ +package com.twitter.scalding.db.macros + +import org.mockito.Mockito.when +import org.scalatest.{Matchers, WordSpec} +import org.scalatest.Inside._ +import org.scalatest.exceptions.TestFailedException +import org.scalatest.mock.MockitoSugar +import cascading.tuple.{Fields, Tuple, TupleEntry} +import com.twitter.bijection.macros.MacroGenerated +import com.twitter.scalding.db._ +import java.sql.{Blob, ResultSet, ResultSetMetaData} +import java.util.Date +import javax.sql.rowset.serial.SerialBlob + +object User { + // these defaults should not get picked up in ColumnDefinition + def apply(): User = User(0, "username", Some(0), "female") + def apply(date_id: Int): User = User(date_id, "username", Some(0), "female") + def apply(date_id: Int, username: String): User = User(date_id, username, Some(0), "female") + def apply(date_id: Int, username: String, age: Option[Int]): User = User(date_id, username, age, "female") +} + +case class User( + date_id: Int, + @size(64) user_name: String, + age: Option[Int], + @size(22) gender: String = "male" +) + +case class Demographics(age: Option[Int], @size(22) gender: String = "male") + +case class User2(date_id: Int, @size(64) user_name: String, demographics: Demographics) + +case class BadUser1(user_name: String, age: Int = 13) +case class BadUser2(@size(-1) user_name: String, age: Int) +case class BadUser3(@size(0) age: Int) +case class BadUser5(user_name: Option[String] = Some("bob"), age: Int) + +case class BadUser6(user_names: List[String]) +object Consts { + val cInt: Int = 13 +} +case class BadUser7(@size(Consts.cInt) age: Int) +case class BadUser8(age: Option[Option[Int]]) +case class BadUser9(@size(15) @text age: Option[Option[Int]]) +case class BadUser10(@size(2) @size(4) age: Option[Option[Int]]) + +case class ExhaustiveJdbcCaseClass( + bigInt: Long, // 8 
bytes + smallerAgainInt: Int, // 4 bytes + @size( + 5 + ) normalIntWithSize: Int, // Sizes on numerics seem to just be for display. Not sure if its worth allowing. + evenSmallerInt: Short, // 2 bytes + numberFun: Double, + booleanFlag: Boolean, // 1 byte -- tinyint + @size(20) smallString: String, // Should goto varchar + @size(200) smallishString: String, // Should goto varchar + @size(2048) largeString: String, // Should goto TEXT + @text forceTextString: String, // Force smaller to text, stored out of the table. So row query speed possibly faster + @size( + 2051 + ) @varchar forcedVarChar: String, // Forced inline to table -- only some sql version support > 255 for varchar + myDateWithTime: Date, // Default goes to MySQL DateTime/Timestamp so its not lossy + @date myDateWithoutTime: Date, + optiLong: Option[Long], // Nullable long + byteArr: Array[Byte], + tinyInt: Byte +) + +private final case class VerticaCaseClass( + verticaLong: Long, + @date verticaDate: Date, + @varchar @size(size = 1) verticaVarchar1: String +) + +case class CaseClassWithDate(id: Long, myDateWithTime: Date, @date myDateWithoutTime: Date) + +case class CaseClassWithOptions( + id: Option[Int], + @size(20) name: Option[String], + date_id: Option[Date], + boolean_value: Option[Boolean], + short_value: Option[Short], + long_value: Option[Long], + double_value: Option[Double] +) + +case class InnerWithBadNesting(age: Int, id: Long) + +case class OuterWithBadNesting( + id: Int, // duplicate in nested case class + @text name: String, + details: InnerWithBadNesting +) + +class JdbcMacroUnitTests extends WordSpec with Matchers with MockitoSugar { + + val dummy = new ColumnDefinitionProvider[Nothing] { + override val columns = Nil + override val resultSetExtractor = null + } + + def isColumnDefinitionAvailable[T](implicit + proof: ColumnDefinitionProvider[T] = dummy.asInstanceOf[ColumnDefinitionProvider[T]] + ): Unit = { + proof shouldBe a[MacroGenerated] + proof.columns.isEmpty shouldBe false + } + 
+ def isJDBCTypeInfoAvailable[T](implicit + proof: DBTypeDescriptor[T] = dummy.asInstanceOf[DBTypeDescriptor[T]] + ): Unit = { + proof shouldBe a[MacroGenerated] + proof.columnDefn.columns.isEmpty shouldBe false + } + + "String field missing annotation" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser1] + } + + "String field size annotation not in range" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser2] + } + + "Int field size annotation not in range" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser3] + } + + "Option field with default" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser5] + } + + "Unknown field type" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser6] + } + + "Annotation for size doesn't use a constant" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser7] + } + + "Nested options should be blocked" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser8] + } + + "Extra annotation not supported on current field " in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser9] + } + + "Two annotations of the same type " in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[BadUser10] + } + + "Produces the ColumnDefinition" should { + + isColumnDefinitionAvailable[User] + + // verify defaults are from case class declaration, not companion object + val expectedColumns = List( + ColumnDefinition(INT, ColumnName("date_id"), NotNullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None), + ColumnDefinition(INT, ColumnName("age"), Nullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male")) + ) + + val typeDesc = DBMacro.toDBTypeDescriptor[User] + val columnDef = typeDesc.columnDefn 
+ assert(columnDef.columns.toList === expectedColumns) + + val expectedFields = new Fields("date_id", "user_name", "age", "gender") + assert(typeDesc.fields.equalsFields(expectedFields)) + + val rsmd = mock[ResultSetMetaData] + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(2)).thenReturn("VARCHAR") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(3)).thenReturn("INT") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(4)).thenReturn("VARCHAR") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullableUnknown) + + assert(columnDef.resultSetExtractor.validate(rsmd).isSuccess) + + val rs = mock[ResultSet] + when(rs.getInt("date_id")).thenReturn(123) + when(rs.getString("user_name")).thenReturn("alice") + when(rs.getInt("age")).thenReturn(26) + when(rs.getString("gender")).thenReturn("F") + + assert( + columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == User(123, "alice", Some(26), "F") + ) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "Produces the ColumnDefinition for nested case class " should { + + isColumnDefinitionAvailable[User2] + + val expectedColumns = List( + ColumnDefinition(INT, ColumnName("date_id"), NotNullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None), + ColumnDefinition(INT, ColumnName("age"), Nullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male")) + ) + + val typeDesc = DBMacro.toDBTypeDescriptor[User2] + val columnDef = typeDesc.columnDefn + assert(columnDef.columns.toList === expectedColumns) + + val expectedFields = new Fields("date_id", "user_name", "age", "gender") + assert(typeDesc.fields.equalsFields(expectedFields)) + + val rs = mock[ResultSet] + 
when(rs.getInt("date_id")).thenReturn(123) + when(rs.getString("user_name")).thenReturn("alice") + when(rs.getInt("age")).thenReturn(26) + when(rs.getString("gender")).thenReturn("F") + + assert( + columnDef.resultSetExtractor + .toCaseClass(rs, typeDesc.converter) == User2(123, "alice", Demographics(Some(26), "F")) + ) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "Produces the DBTypeDescriptor" should { + // explicitly just call this to get a compiler error + DBMacro.toDBTypeDescriptor[User] + // ensure the implicit fires + isJDBCTypeInfoAvailable[User] + + val expectedColumns = List( + ColumnDefinition(INT, ColumnName("date_id"), NotNullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("user_name"), NotNullable, Some(64), None), + ColumnDefinition(INT, ColumnName("age"), Nullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("gender"), NotNullable, Some(22), Some("male")) + ) + + assert(DBMacro.toDBTypeDescriptor[User].columnDefn.columns.toList === expectedColumns) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "interoperates with Vertica, which uses different type names" should { + val typeDescriptor = DBMacro.toDBTypeDescriptor[VerticaCaseClass] + val expectedColumns = List( + ColumnDefinition(BIGINT, ColumnName("verticaLong"), NotNullable, None, None), + ColumnDefinition(DATE, ColumnName("verticaDate"), NotNullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("verticaVarchar1"), NotNullable, Some(1), None) + ) + assert(typeDescriptor.columnDefn.columns.toList === expectedColumns) + + // Vertica uses `Integer` + val int64TypeNames = + List("Integer", "INTEGER", "INT", "BIGINT", "INT8", "SMALLINT", "TINYINT", "SMALLINT", "MEDIUMINT") + // Vertica uses `Date` + val dateTypeNames = List("Date", "DATE") + // Vertica uses `Varchar` + val varcharTypeNames = List("Varchar", "VARCHAR") + + int64TypeNames.foreach { int64TypeName => + dateTypeNames.foreach { dateTypeName => + 
varcharTypeNames.foreach { varcharTypeName => + val resultSetMetaData = mock[ResultSetMetaData] + when(resultSetMetaData.getColumnTypeName(1)).thenReturn(int64TypeName) + when(resultSetMetaData.isNullable(1)).thenReturn(ResultSetMetaData.columnNoNulls) + when(resultSetMetaData.getColumnTypeName(2)).thenReturn(dateTypeName) + when(resultSetMetaData.isNullable(2)).thenReturn(ResultSetMetaData.columnNoNulls) + when(resultSetMetaData.getColumnTypeName(3)).thenReturn(varcharTypeName) + when(resultSetMetaData.isNullable(3)).thenReturn(ResultSetMetaData.columnNoNulls) + + val validationResult = + typeDescriptor.columnDefn.resultSetExtractor.validate(resultSetMetaData) + + assert(validationResult.isSuccess, validationResult) + } + } + } + } + + "Big Jdbc Test" should { + + isColumnDefinitionAvailable[ExhaustiveJdbcCaseClass] + + // explicitly just call this to get a compiler error + DBMacro.toDBTypeDescriptor[ExhaustiveJdbcCaseClass] + // ensure the implicit fires + isJDBCTypeInfoAvailable[ExhaustiveJdbcCaseClass] + + val expectedColumns = List( + ColumnDefinition(BIGINT, ColumnName("bigInt"), NotNullable, None, None), + ColumnDefinition(INT, ColumnName("smallerAgainInt"), NotNullable, None, None), + ColumnDefinition(INT, ColumnName("normalIntWithSize"), NotNullable, Some(5), None), + ColumnDefinition(SMALLINT, ColumnName("evenSmallerInt"), NotNullable, None, None), + ColumnDefinition(DOUBLE, ColumnName("numberFun"), NotNullable, None, None), + ColumnDefinition(BOOLEAN, ColumnName("booleanFlag"), NotNullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("smallString"), NotNullable, Some(20), None), + ColumnDefinition(VARCHAR, ColumnName("smallishString"), NotNullable, Some(200), None), + ColumnDefinition(TEXT, ColumnName("largeString"), NotNullable, None, None), + ColumnDefinition(TEXT, ColumnName("forceTextString"), NotNullable, None, None), + ColumnDefinition(VARCHAR, ColumnName("forcedVarChar"), NotNullable, Some(2051), None), + ColumnDefinition(DATETIME, 
ColumnName("myDateWithTime"), NotNullable, None, None), + ColumnDefinition(DATE, ColumnName("myDateWithoutTime"), NotNullable, None, None), + ColumnDefinition(BIGINT, ColumnName("optiLong"), Nullable, None, None), + ColumnDefinition(BLOB, ColumnName("byteArr"), NotNullable, None, None), + ColumnDefinition(TINYINT, ColumnName("tinyInt"), NotNullable, None, None) + ) + + val typeDesc = DBMacro.toDBTypeDescriptor[ExhaustiveJdbcCaseClass] + val columnDef = typeDesc.columnDefn + assert(columnDef.columns.toList === expectedColumns) + + val rsmd = mock[ResultSetMetaData] + when(rsmd.getColumnTypeName(1)).thenReturn("BIGINT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(2)).thenReturn("INT") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(3)).thenReturn("INTEGER") // synonym of INT + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(4)).thenReturn("SMALLINT") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(5)).thenReturn("DOUBLE") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(6)).thenReturn("TINYINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(7)).thenReturn("VARCHAR") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(8)).thenReturn("CHAR") // synonym of VARCHAR + when(rsmd.isNullable(8)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(9)).thenReturn("TEXT") + when(rsmd.isNullable(9)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(10)).thenReturn("TEXT") + when(rsmd.isNullable(10)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(11)).thenReturn("VARCHAR") + when(rsmd.isNullable(11)).thenReturn(ResultSetMetaData.columnNoNulls) + 
when(rsmd.getColumnTypeName(12)).thenReturn("DATETIME") + when(rsmd.isNullable(12)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(13)).thenReturn("DATE") + when(rsmd.isNullable(13)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(14)).thenReturn("BIGINT") + when(rsmd.isNullable(14)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(15)).thenReturn("BLOB") + when(rsmd.isNullable(15)).thenReturn(ResultSetMetaData.columnNoNulls) + when(rsmd.getColumnTypeName(16)).thenReturn("TINYINT") + when(rsmd.isNullable(16)).thenReturn(ResultSetMetaData.columnNoNulls) + + assert(columnDef.resultSetExtractor.validate(rsmd).isSuccess) + + val byteArrStr: String = "byteArr" + val byteArr: Array[Byte] = byteArrStr.getBytes + val blob: Blob = new SerialBlob(byteArr) + val rs = mock[ResultSet] + when(rs.getLong("bigInt")).thenReturn(12345678L) + when(rs.getInt("smallerAgainInt")).thenReturn(123) + when(rs.getInt("normalIntWithSize")).thenReturn(12) + when(rs.getInt("evenSmallerInt")).thenReturn(1) + when(rs.getDouble("numberFun")).thenReturn(1.1) + when(rs.getBoolean("booleanFlag")).thenReturn(true) + when(rs.getString("smallString")).thenReturn("small_string") + when(rs.getString("smallishString")).thenReturn("smallish_string") + when(rs.getString("largeString")).thenReturn("large_string") + when(rs.getString("forceTextString")).thenReturn("force_text_string") + when(rs.getString("forcedVarChar")).thenReturn("forced_var_char") + when(rs.getTimestamp("myDateWithTime")).thenReturn(new java.sql.Timestamp(1111L)) + when(rs.getTimestamp("myDateWithoutTime")).thenReturn(new java.sql.Timestamp(1112L)) + when(rs.getLong("optiLong")).thenReturn(1113L) + when(rs.getBlob("byteArr")).thenReturn(blob) + when(rs.getByte("tinyInt")).thenReturn(12.toByte) + + val actual: ExhaustiveJdbcCaseClass = columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) + + inside(actual) { + case ExhaustiveJdbcCaseClass( + bigInt, + 
smallerAgainInt, + normalIntWithSize, + evenSmallerInt, + numberFun, + booleanFlag, + smallString, + smallishString, + largeString, + forceTextString, + forcedVarChar, + myDateWithTime, + myDateWithoutTime, + optiLong, + bArr, + tinyInt + ) => + bigInt should be(12345678L) + smallerAgainInt should be(123) + normalIntWithSize should be(12) + evenSmallerInt should be(1) + numberFun should be(1.1) + booleanFlag should be(true) + smallString should be("small_string") + smallishString should be("smallish_string") + largeString should be("large_string") + forceTextString should be("force_text_string") + forcedVarChar should be("forced_var_char") + myDateWithTime should be(new Date(1111L)) + myDateWithoutTime should be(new Date(1112L)) + optiLong.get should be(1113L) + bArr shouldEqual byteArr + tinyInt shouldEqual 12.toByte + } + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "TupleConverter for Date" should { + val typeDesc = DBMacro.toDBTypeDescriptor[CaseClassWithDate] + val converter = typeDesc.converter + val date1 = new Date(100L) + val date2 = new Date(200L) + val t = Tuple.size(3) + t.setLong(0, 99L) + t.set(1, date1) + t.set(2, date2) + assert(CaseClassWithDate(99L, date1, date2) == converter(new TupleEntry(t))) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "ResultSetExtractor validation for nullable columns" should { + + val typeDesc = DBMacro.toDBTypeDescriptor[CaseClassWithOptions] + val columnDef = typeDesc.columnDefn + + val rsmd = mock[ResultSetMetaData] + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(2)).thenReturn("VARCHAR") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(3)).thenReturn("DATETIME") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(4)).thenReturn("BOOLEAN") + 
when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(5)).thenReturn("SMALLINT") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(6)).thenReturn("BIGINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(7)).thenReturn("DOUBLE") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNullable) + + assert(columnDef.resultSetExtractor.validate(rsmd).isSuccess) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "ResultSetExtractor when nullable values are not null" should { + val typeDesc = DBMacro.toDBTypeDescriptor[CaseClassWithOptions] + val columnDef = typeDesc.columnDefn + + val rs = mock[ResultSet] + when(rs.getInt("id")).thenReturn(26) + when(rs.wasNull).thenReturn(false) + when(rs.getString("name")).thenReturn("alice") + when(rs.wasNull).thenReturn(false) + when(rs.getTimestamp("date_id")).thenReturn(new java.sql.Timestamp(1111L)) + when(rs.wasNull).thenReturn(false) + when(rs.getBoolean("boolean_value")).thenReturn(true) + when(rs.wasNull).thenReturn(false) + when(rs.getInt("short_value")).thenReturn(2) + when(rs.wasNull).thenReturn(false) + when(rs.getLong("long_value")).thenReturn(2000L) + when(rs.wasNull).thenReturn(false) + when(rs.getDouble("double_value")).thenReturn(2.2) + when(rs.wasNull).thenReturn(false) + assert( + columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == + CaseClassWithOptions( + Some(26), + Some("alice"), + Some(new Date(1111L)), + Some(true), + Some(2), + Some(2000L), + Some(2.2) + ) + ) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "ResultSetExtractor when null values" should { + val typeDesc = DBMacro.toDBTypeDescriptor[CaseClassWithOptions] + val columnDef = typeDesc.columnDefn + + val rs = mock[ResultSet] + when(rs.getInt("id")).thenReturn(0) // jdbc returns 0 for null numeric values + 
when(rs.wasNull).thenReturn(true) + when(rs.getString("name")).thenReturn(null) + when(rs.wasNull).thenReturn(true) + when(rs.getString("date_id")).thenReturn(null) + when(rs.getBoolean("boolean_value")).thenReturn(false) // jdbc returns false for null boolean values + when(rs.wasNull).thenReturn(true) + when(rs.getInt("short_value")).thenReturn(0) + when(rs.wasNull).thenReturn(true) + when(rs.getLong("long_value")).thenReturn(0L) + when(rs.wasNull).thenReturn(true) + when(rs.getDouble("double_value")).thenReturn(0) + when(rs.wasNull).thenReturn(true) + assert( + columnDef.resultSetExtractor.toCaseClass(rs, typeDesc.converter) == + CaseClassWithOptions(None, None, None, None, None, None, None) + ) + () // Need this till: https://github.com/scalatest/scalatest/issues/1107 + } + + "ResultSetExtractor for DB schema type mismatch" in { + val typeDesc = DBMacro.toDBTypeDescriptor[CaseClassWithOptions] + val columnDef = typeDesc.columnDefn + + val rsmd = mock[ResultSetMetaData] + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(2)).thenReturn("TINYINT") // mismatch + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(3)).thenReturn("DATETIME") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(4)).thenReturn("BOOLEAN") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(5)).thenReturn("SMALLINT") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(6)).thenReturn("BIGINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(7)).thenReturn("DOUBLE") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNullable) + + assert(columnDef.resultSetExtractor.validate(rsmd).isFailure) + } + + "ResultSetExtractor for DB schema 
nullable mismatch" in { + val typeDesc = DBMacro.toDBTypeDescriptor[CaseClassWithOptions] + val columnDef = typeDesc.columnDefn + + val rsmd = mock[ResultSetMetaData] + when(rsmd.getColumnTypeName(1)).thenReturn("INT") + when(rsmd.isNullable(1)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(2)).thenReturn("VARCHAR") + when(rsmd.isNullable(2)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(3)).thenReturn("DATETIME") + when(rsmd.isNullable(3)).thenReturn(ResultSetMetaData.columnNoNulls) // mismatch + when(rsmd.getColumnTypeName(4)).thenReturn("BOOLEAN") + when(rsmd.isNullable(4)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(5)).thenReturn("SMALLINT") + when(rsmd.isNullable(5)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(6)).thenReturn("BIGINT") + when(rsmd.isNullable(6)).thenReturn(ResultSetMetaData.columnNullable) + when(rsmd.getColumnTypeName(7)).thenReturn("DOUBLE") + when(rsmd.isNullable(7)).thenReturn(ResultSetMetaData.columnNullable) + + assert(columnDef.resultSetExtractor.validate(rsmd).isFailure) + } + + "Duplicate nested fields should be blocked" in { + a[TestFailedException] should be thrownBy isColumnDefinitionAvailable[OuterWithBadNesting] + } +} diff --git a/scalding-estimators-test/src/test/resources/hipster.txt b/scalding-estimators-test/src/test/resources/hipster.txt new file mode 100644 index 0000000000..a7056d8dd8 --- /dev/null +++ b/scalding-estimators-test/src/test/resources/hipster.txt @@ -0,0 +1,7 @@ +Direct trade American Apparel squid umami tote bag. Lo-fi XOXO gluten-free meh literally, typewriter readymade wolf salvia whatever drinking vinegar organic. Four loko literally bicycle rights drinking vinegar Cosby sweater hella stumptown. Dreamcatcher iPhone 90's organic chambray cardigan, wolf fixie gluten-free Brooklyn four loko. Mumblecore ennui twee, 8-bit food truck sustainable tote bag Williamsburg mixtape biodiesel. 
Semiotics Helvetica put a bird on it, roof party fashion axe organic post-ironic readymade Wes Anderson Pinterest keffiyeh. Craft beer meggings sartorial, butcher Marfa kitsch art party mustache Brooklyn vinyl. + +Wolf flannel before they sold out vinyl, selfies four loko Bushwick Banksy Odd Future. Chillwave banh mi iPhone, Truffaut shabby chic craft beer keytar DIY. Scenester selvage deep v YOLO paleo blog photo booth fap. Sustainable wolf mixtape small batch skateboard, pop-up brunch asymmetrical seitan butcher Thundercats disrupt twee Etsy. You probably haven't heard of them freegan skateboard before they sold out, mlkshk pour-over Echo Park keytar retro farm-to-table. Tattooed sustainable beard, Helvetica Wes Anderson pickled vinyl yr pop-up Vice. Wolf bespoke lomo photo booth ethnic cliche. + +Dreamcatcher Portland put a bird on it, disrupt roof party stumptown aesthetic. Bitters pug drinking vinegar Vice, VHS mixtape mustache Williamsburg put a bird on it YOLO High Life crucifix butcher kale chips. Cornhole keffiyeh bespoke, raw denim hella semiotics iPhone beard literally kogi +1. Umami tousled gastropub, vinyl pork belly drinking vinegar typewriter cliche Cosby sweater. Kale chips DIY quinoa gastropub, paleo ethnic kogi hashtag brunch meh. Twee aesthetic tote bag sustainable drinking vinegar skateboard. 90's cred bespoke, Blue Bottle raw denim flannel fixie master cleanse Helvetica. + +Raw denim Austin ugh Neutra. Irony squid tattooed 90's small batch umami. Bicycle rights distillery authentic, sustainable plaid wolf gastropub freegan you probably haven't heard of them Truffaut forage next level. Chillwave Etsy Bushwick Banksy, Schlitz XOXO crucifix sustainable scenester organic fap try-hard disrupt Cosby sweater chia. Skateboard ennui distillery, letterpress +1 post-ironic locavore kitsch aesthetic single-origin coffee wolf Pinterest cray readymade Pitchfork. Portland Bushwick Thundercats, occupy banh mi American Apparel mlkshk. 
VHS tofu Intelligentsia, cred Kickstarter single-origin coffee wolf wayfarers umami iPhone +1 Tonx. diff --git a/scalding-estimators-test/src/test/resources/scores.tsv b/scalding-estimators-test/src/test/resources/scores.tsv new file mode 100644 index 0000000000..880f33e2be --- /dev/null +++ b/scalding-estimators-test/src/test/resources/scores.tsv @@ -0,0 +1,13 @@ +iphone 0.5 +mixtape 0.2 +helvetica 0.1 +gastropub 0.1 +raw 0.05 +sustainable 0.01 +stumptown 0.75 +postironic 0.3 +ironic 0.9 +pintrest 0.05 +selfies 0.2 +dreamcatcher 0.65 +twitter 0.0 diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala new file mode 100644 index 0000000000..ea8c10da1e --- /dev/null +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/estimation/memory/MemoryEstimatorTest.scala @@ -0,0 +1,250 @@ +package com.twitter.scalding.estimation.memory + +import com.twitter.scalding.Config +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryService, Task} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import com.twitter.scalding.reducer_estimation._ +import org.apache.hadoop.mapred.JobConf +import org.scalatest.{Matchers, WordSpec} +import scala.collection.JavaConverters._ +import scala.util.{Success, Try} + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCompanionCascadingExtensions + +class MemoryEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatformTest { + "Single-step job with memory estimator" should { + "without history don't override memory settings" in { + val customConfig = Config.empty + + (Config.MemoryEstimators -> classOf[EmptySmoothedMemoryEstimator].getName) + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = 
flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + + conf.get(Config.MapMemory) shouldBe None + conf.get(Config.MapJavaOpts) shouldBe None + conf.get(Config.ReduceMemory) shouldBe None + conf.get(Config.ReduceJavaOpts) shouldBe None + } + .run() + } + + "run with correct number of memory" in { + val customConfig = Config.empty + + (Config.MemoryEstimators -> classOf[SmoothedMemoryEstimatorWithData].getName) + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + + conf.get(Config.MapMemory) shouldBe Some("1536") + conf.get(Config.MapJavaOpts) shouldBe Some(" -Xmx1228m") + conf.get(Config.ReduceMemory) shouldBe Some("1536") + conf.get(Config.ReduceJavaOpts) shouldBe Some(" -Xmx1228m") + } + .run() + } + + "respect cap when estimated memory is above the configured max" in { + val customConfig = Config.empty + + (Config.MemoryEstimators -> classOf[SmoothedMemoryEstimatorWithMoreThanMaxCap].getName) + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + + conf.get(Config.MapMemory) shouldBe Some("8192") + conf.get(Config.MapJavaOpts) shouldBe Some(" -Xmx6553m") + conf.get(Config.ReduceMemory) shouldBe Some("8192") + conf.get(Config.ReduceJavaOpts) shouldBe Some(" -Xmx6553m") + } + .run() + } + + "respect cap when estimated memory is below the configured min" in { + val customConfig = Config.empty + + (Config.MemoryEstimators -> classOf[SmoothedMemoryEstimatorWithLessThanMinCap].getName) + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = 
Config.fromHadoop(steps.head.getConfig) + + conf.get(Config.MapMemory) shouldBe Some("1024") + conf.get(Config.MapJavaOpts) shouldBe Some(" -Xmx819m") + conf.get(Config.ReduceMemory) shouldBe Some("1024") + conf.get(Config.ReduceJavaOpts) shouldBe Some(" -Xmx819m") + } + .run() + } + + "not set memory when error fetching history" in { + val customConfig = Config.empty + + (Config.MemoryEstimators -> classOf[ErrorHistoryBasedMemoryEstimator].getName) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + + conf.get(Config.MapMemory) shouldBe None + conf.get(Config.MapJavaOpts) shouldBe None + conf.get(Config.ReduceMemory) shouldBe None + conf.get(Config.ReduceJavaOpts) shouldBe None + + } + .run() + } + } + + "Multi-step job with memory estimator" should { + "run with correct number of memory in each step" in { + val customConfig = Config.empty + + (Config.MemoryEstimators -> classOf[SmoothedMemoryEstimatorWithData].getName) + + HadoopPlatformJobTest(new HipJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + + val mapsMemory = steps.map(_.getConfig.getInt(Config.MapMemory, 0)).toList + val mapsJavaOpts = steps.map(_.getConfig.get(Config.MapJavaOpts, "")).toList + + mapsMemory shouldBe List(1536, 0, 1024) + mapsJavaOpts shouldBe List(" -Xmx1228m", "", " -Xmx819m") + + val reducersMemory = steps.map(_.getConfig.getInt(Config.ReduceMemory, 0)).toList + val reducersJavaOpts = steps.map(_.getConfig.get(Config.ReduceJavaOpts, "")).toList + + reducersMemory shouldBe List(1536, 0, 1024) + reducersJavaOpts shouldBe List(" -Xmx1228m", "", " -Xmx819m") + } + .run() + } + } +} + +object EmptyHistoryService extends HistoryService { + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + 
Success(Seq.empty) +} + +class CustomHistoryService(val history: JobConf => Seq[(String, Long)]) extends HistoryService { + import Utils._ + + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + if (info.step.getStepNum == 1) { + makeHistory(info.step.getConfig, history) + } else if (info.step.getStepNum == 2) { + Success(Nil) + } else { + makeHistory(info.step.getConfig, _ => Seq("MAP" -> 512.megabyte, "REDUCE" -> 512.megabyte)) + } + + def makeHistory(conf: JobConf, history: JobConf => Seq[(String, Long)]): Success[Seq[FlowStepHistory]] = + Success(history(conf).map { case (taskType, memory) => + val task = Task( + details = Map(Task.TaskType -> taskType), + counters = Map(SmoothedHistoryMemoryEstimator.CommittedHeapBytes -> memory) + ) + val tasks = Seq(task) + FlowStepHistory( + keys = null, + submitTimeMillis = 0, + launchTimeMillis = 0L, + finishTimeMillis = 0L, + totalMaps = 0L, + totalReduces = 0L, + finishedMaps = 0L, + finishedReduces = 0L, + failedMaps = 0L, + failedReduces = 0L, + mapFileBytesRead = 0L, + mapFileBytesWritten = 0L, + mapOutputBytes = 0L, + reduceFileBytesRead = 0L, + hdfsBytesRead = 0L, + hdfsBytesWritten = 0L, + mapperTimeMillis = 0L, + reducerTimeMillis = 0L, + reduceShuffleBytes = 0L, + cost = 1.1, + tasks = tasks + ) + }) +} + +class EmptySmoothedMemoryEstimator extends SmoothedHistoryMemoryEstimator { + override def historyService: HistoryService = EmptyHistoryService +} + +class SmoothedMemoryEstimatorWithData extends SmoothedHistoryMemoryEstimator { + import Utils._ + + override def historyService: HistoryService = new CustomHistoryService(_ => + Seq( + "MAP" -> 800.megabytes, + "REDUCE" -> 800.megabytes, + "MAP" -> 1024.megabytes, + "REDUCE" -> 1024.megabytes, + "MAP" -> 1300.megabytes, + "REDUCE" -> 1300.megabytes, + "MAP" -> 723.megabytes, + "REDUCE" -> 723.megabytes + ) + ) +} + +class SmoothedMemoryEstimatorWithMoreThanMaxCap extends SmoothedHistoryMemoryEstimator { + import 
Utils._ + + override def historyService: HistoryService = new CustomHistoryService(conf => + Seq( + "MAP" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte), + "REDUCE" -> (MemoryEstimatorConfig.getMaxContainerMemory(conf).megabyte + 1.gigabyte) + ) + ) +} + +class SmoothedMemoryEstimatorWithLessThanMinCap extends SmoothedHistoryMemoryEstimator { + import Utils._ + + override def historyService: HistoryService = new CustomHistoryService(conf => + Seq( + "MAP" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte), + "REDUCE" -> (MemoryEstimatorConfig.getMinContainerMemory(conf).megabyte - 500.megabyte) + ) + ) +} + +class ErrorHistoryBasedMemoryEstimator extends SmoothedHistoryMemoryEstimator { + override val historyService = ErrorHistoryService +} + +object Utils { + implicit class StorageUnit(val wrapped: Long) extends AnyVal { + def fromMegabytes(megabytes: Long): Long = megabytes * 1024 * 1024 + def fromGigabytes(gigabytes: Long): Long = gigabytes * 1024 * 1024 * 1024 + + def megabyte: Long = megabytes + def megabytes: Long = fromMegabytes(wrapped) + def gigabyte: Long = gigabytes + def gigabytes: Long = fromGigabytes(wrapped) + + def inMegabytes: Long = wrapped / (1024L * 1024) + } + + implicit def doubleToLong(value: Double): StorageUnit = new StorageUnit(value.toLong) +} diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala new file mode 100644 index 0000000000..ae1b25a601 --- /dev/null +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RatioBasedEstimatorTest.scala @@ -0,0 +1,243 @@ +package com.twitter.scalding.reducer_estimation + +import com.twitter.scalding._ +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStrategyInfo, HistoryService, Task} +import 
com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import org.scalatest.{Matchers, WordSpec} +import scala.collection.JavaConverters._ +import scala.util.{Failure, Success, Try} + +class SimpleJobWithNoSetReducers(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + + override def config = super.config ++ customConfig.toMap.toMap + + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .map(_.toLowerCase -> 1) + .group + .sum + .write(counts) +} + +object EmptyHistoryService extends HistoryService { + def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = Success(Nil) +} + +object ErrorHistoryService extends HistoryService { + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + Failure(new RuntimeException("Failed to fetch job history")) +} + +object HistoryServiceWithData { + import ReducerHistoryEstimator._ + + // we only care about these two input size fields for RatioBasedEstimator + def makeHistory(inputHdfsBytesRead: Long, mapOutputBytes: Long): FlowStepHistory = + makeHistory(inputHdfsBytesRead, mapOutputBytes, Seq()) + + def makeHistory( + inputHdfsBytesRead: Long, + mapOutputBytes: Long, + taskRuntimes: Seq[Long] + ): FlowStepHistory = { + val random = new scala.util.Random(123) + val tasks = taskRuntimes.map { time => + val startTime = random.nextLong + Task( + details = Map( + Task.TaskType -> "REDUCE", + Status -> "SUCCEEDED", + StartTime -> startTime, + FinishTime -> (startTime + time) + ), + Map.empty + ) + } + + FlowStepHistory( + keys = null, + submitTimeMillis = 0, + launchTimeMillis = 0L, + finishTimeMillis = 0L, + totalMaps = 0L, + totalReduces = 0L, + finishedMaps = 0L, + finishedReduces = 0L, + failedMaps = 0L, + failedReduces = 0L, + mapFileBytesRead = 0L, + mapFileBytesWritten = 0L, + mapOutputBytes = mapOutputBytes, + reduceFileBytesRead = 0L, + hdfsBytesRead = inputHdfsBytesRead, + hdfsBytesWritten = 0L, + 
mapperTimeMillis = 0L, + reducerTimeMillis = 0L, + reduceShuffleBytes = 0L, + cost = 1.1, + tasks = tasks + ) + } + + def inputSize = HipJob.InSrcFileSize +} + +object ValidHistoryService extends HistoryService { + import HistoryServiceWithData._ + + def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + // past reducer ratio 0.5 + Success( + Seq( + makeHistory(10, 1), // below threshold, ignored + makeHistory(inputSize, inputSize / 2), + makeHistory(inputSize, inputSize / 2), + makeHistory(inputSize, inputSize / 2) + ) + ) +} + +object SmallDataExplosionHistoryService extends HistoryService { + import HistoryServiceWithData._ + + def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = { + // huge ratio, but data is still small overall + + val outSize = inputSize * 1000 + + Success( + Seq(makeHistory(inputSize, outSize), makeHistory(inputSize, outSize), makeHistory(inputSize, outSize)) + ) + } +} + +object InvalidHistoryService extends HistoryService { + import HistoryServiceWithData._ + + def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + // all entries below the 10% threshold for past input size + Success(Seq(makeHistory(10, 1), makeHistory(10, 1), makeHistory(10, 1))) +} + +class EmptyHistoryBasedEstimator extends RatioBasedEstimator { + override val historyService = EmptyHistoryService +} + +class ErrorHistoryBasedEstimator extends RatioBasedEstimator { + override val historyService = ErrorHistoryService +} + +class ValidHistoryBasedEstimator extends RatioBasedEstimator { + override val historyService = ValidHistoryService +} + +class SmallDataExplosionHistoryBasedEstimator extends RatioBasedEstimator { + override val historyService = SmallDataExplosionHistoryService +} + +class InvalidHistoryBasedEstimator extends RatioBasedEstimator { + override val historyService = InvalidHistoryService +} + +class RatioBasedReducerEstimatorTest extends WordSpec with 
Matchers with HadoopSharedPlatformTest { + + "Single-step job with ratio-based reducer estimator" should { + "not set reducers when no history is found" in { + val customConfig = Config.empty.addReducerEstimator(classOf[EmptyHistoryBasedEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> "1k") + + (RatioBasedEstimator.inputRatioThresholdKey -> 0.10f.toString) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = steps.head.getConfig + conf.getNumReduceTasks should equal(1) // default + } + .run() + } + + "not set reducers when error fetching history" in { + val customConfig = Config.empty.addReducerEstimator(classOf[ErrorHistoryBasedEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> "1k") + + (RatioBasedEstimator.inputRatioThresholdKey -> 0.10f.toString) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = steps.head.getConfig + conf.getNumReduceTasks should equal(1) // default + } + .run() + } + + "set reducers correctly when there is valid history" in { + val customConfig = Config.empty + .addReducerEstimator(classOf[ValidHistoryBasedEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> "1k") + + (RatioBasedEstimator.inputRatioThresholdKey -> 0.10f.toString) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + // base estimate from input size reducer = 3 + // reducer ratio from history = 0.5 + // final estimate = ceil(3 * 0.5) = 2 + val conf = steps.head.getConfig + conf.getNumReduceTasks should equal(2) + } + .run() + } + + /* + * If the InputSizeReducerEstimator decides that less than 1 reducer is necessary, it 
+ * rounds up to 1. If the RatioBasedEstimator relies on this, it will use the rounded-up + * value to calculate the number of reducers. In the case of data explosion on a small dataset, + * you end up with a very large number of reducers because this rounding error is multiplied. + * This regression test ensures that this is no longer the case. + * + * see https://github.com/twitter/scalding/issues/1541 for more details. + */ + "handle mapper output explosion over small data correctly" in { + val customConfig = Config.empty + .addReducerEstimator(classOf[SmallDataExplosionHistoryBasedEstimator]) + + // set the bytes per reducer to to 500x input size, so that we estimate needing 2 reducers, + // even though there's a very large explosion in input data size, the data is still pretty small + (InputSizeReducerEstimator.BytesPerReducer -> (HistoryServiceWithData.inputSize * 500).toString) + + (RatioBasedEstimator.inputRatioThresholdKey -> 0.10f.toString) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = steps.head.getConfig + conf.getNumReduceTasks should equal(2) // used to pick 1000 with the rounding error + } + .run() + } + + "not set reducers when there is no valid history" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InvalidHistoryBasedEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> "1k") + + (RatioBasedEstimator.inputRatioThresholdKey -> 0.10f.toString) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = steps.head.getConfig + conf.getNumReduceTasks should equal(1) // default + } + .run() + } + } +} diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala 
b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala new file mode 100644 index 0000000000..e758033a73 --- /dev/null +++ b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/ReducerEstimatorTest.scala @@ -0,0 +1,314 @@ +package com.twitter.scalding.reducer_estimation + +import cascading.flow.FlowException +import com.twitter.scalding._ +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import java.io.FileNotFoundException +import org.scalatest.{Matchers, WordSpec} +import scala.collection.JavaConverters._ + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCompanionCascadingExtensions + +object HipJob { + val InSrcFileSize = 2496L + val inPath = getClass.getResource("/hipster.txt") // file size is 2496 bytes + val inSrc = TextLine(inPath.toString) + val InScoresFileSize = 174L + val inScores = + TypedTsv[(String, Double)](getClass.getResource("/scores.tsv").toString) // file size is 174 bytes + val out = TypedTsv[Double]("output") + val counts = TypedTsv[(String, Int)]("counts.tsv") + val size = TypedTsv[Long]("size.tsv") + val correct = Map("hello" -> 1, "goodbye" -> 1, "world" -> 2) +} + +class HipJob(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + override def config = super.config ++ customConfig.toMap.toMap + + def tokenize(text: String): TraversableOnce[String] = + text.toLowerCase + .replaceAll("[^a-zA-Z0-9\\s]", "") + .split("\\s+") + + val wordCounts = TypedPipe + .from(inSrc) + .flatMap(tokenize) + .map(_ -> 1) + .group + .sum + + val scores = TypedPipe.from(inScores).group + + wordCounts + .leftJoin(scores) + .mapValues { case (count, score) => (count, score.getOrElse(0.0)) } + // force another M/R step - should use reducer estimation + .toTypedPipe + .map { case (word, (count, score)) => (count, score) } + .group + .sum + // force another M/R step - this should force 1 reducer 
because it is essentially a groupAll + .toTypedPipe + .values + .sum + .write(out) + +} + +class SimpleJob(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + + override def config = super.config ++ customConfig.toMap.toMap + + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .map(_.toLowerCase -> 1) + .group + // force the number of reducers to two, to test with/without estimation + .withReducers(2) + .sum + .write(counts) +} + +class SimpleGlobJob(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + + val inSrcGlob = inPath.toString.replace("hipster", "*") + val inSrc = TextLine(inSrcGlob) + + override def config = super.config ++ customConfig.toMap.toMap + + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .map(_.toLowerCase -> 1) + .group + // force the number of reducers to two, to test with/without estimation + .withReducers(2) + .sum + .write(counts) +} + +class SimpleMemoryJob(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + + val inSrc = IterableSource( + List( + "Direct trade American Apparel squid umami tote bag. Lo-fi XOXO gluten-free meh literally, typewriter readymade wolf salvia whatever drinking vinegar organic. Four loko literally bicycle rights drinking vinegar Cosby sweater hella stumptown. Dreamcatcher iPhone 90's organic chambray cardigan, wolf fixie gluten-free Brooklyn four loko. Mumblecore ennui twee, 8-bit food truck sustainable tote bag Williamsburg mixtape biodiesel. Semiotics Helvetica put a bird on it, roof party fashion axe organic post-ironic readymade Wes Anderson Pinterest keffiyeh. Craft beer meggings sartorial, butcher Marfa kitsch art party mustache Brooklyn vinyl.", + "Wolf flannel before they sold out vinyl, selfies four loko Bushwick Banksy Odd Future. Chillwave banh mi iPhone, Truffaut shabby chic craft beer keytar DIY. Scenester selvage deep v YOLO paleo blog photo booth fap. 
Sustainable wolf mixtape small batch skateboard, pop-up brunch asymmetrical seitan butcher Thundercats disrupt twee Etsy. You probably haven't heard of them freegan skateboard before they sold out, mlkshk pour-over Echo Park keytar retro farm-to-table. Tattooed sustainable beard, Helvetica Wes Anderson pickled vinyl yr pop-up Vice. Wolf bespoke lomo photo booth ethnic cliche." + ) + ) + + override def config = super.config ++ customConfig.toMap.toMap + + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .map(_.toLowerCase -> 1) + .group + // force the number of reducers to two, to test with/without estimation + .withReducers(2) + .sum + .write(counts) +} + +class SimpleFileNotFoundJob(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + + val inSrc = TextLine("file.txt") + + override def config = super.config ++ customConfig.toMap.toMap + + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .map(_.toLowerCase -> 1) + .group + // force the number of reducers to two, to test with/without estimation + .withReducers(2) + .sum + .write(counts) +} + +class GroupAllJob(args: Args, customConfig: Config) extends Job(args) { + + import HipJob._ + override def config = super.config ++ customConfig.toMap.toMap + + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .groupAll + .size + .values + .write(size) +} + +class SimpleMapOnlyJob(args: Args, customConfig: Config) extends Job(args) { + import HipJob._ + + override def config = super.config ++ customConfig.toMap.toMap + + // simple job with no reduce phase + TypedPipe + .from(inSrc) + .flatMap(_.split("[^\\w]+")) + .write(TypedTsv[String]("mapped_output")) +} + +class ReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatformTest { + import HipJob._ + + "Single-step job with reducer estimator" should { + "run with correct number of reducers" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + 
(InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + conf.getNumReducers should contain(2) + conf.get(ReducerEstimatorConfig.originalNumReducers) should be(None) + } + .run() + } + + "run with correct number of reducers when we have a glob pattern in path" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + (Config.ReducerEstimatorOverride -> "true") + + HadoopPlatformJobTest(new SimpleGlobJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + conf.getNumReducers should contain(3) + conf.get(ReducerEstimatorConfig.originalNumReducers) should contain("2") + } + .run() + } + + "run with correct number of reducers when overriding set values" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + (Config.ReducerEstimatorOverride -> "true") + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + conf.getNumReducers should contain(3) + conf.get(ReducerEstimatorConfig.originalNumReducers) should contain("2") + } + .run() + } + + "respect cap when estimated reducers is above the configured max" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (Config.ReducerEstimatorOverride -> "true") + + // 1 reducer per byte, should give us a large number + 
(InputSizeReducerEstimator.BytesPerReducer -> 1.toString) + + (ReducerEstimatorConfig.maxEstimatedReducersKey -> 10.toString) + + HadoopPlatformJobTest(new SimpleJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + conf.get(ReducerEstimatorConfig.estimatedNumReducers) should contain("2496") + conf.get(ReducerEstimatorConfig.cappedEstimatedNumReducersKey) should contain("10") + conf.getNumReducers should contain(10) + } + .run() + } + + "ignore memory source in input size estimation" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + (Config.ReducerEstimatorOverride -> "true") + + HadoopPlatformJobTest(new SimpleMemoryJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + conf.getNumReducers should contain(2) + conf.get(ReducerEstimatorConfig.originalNumReducers) should contain("2") + } + .run() + } + + "throw FileNotFoundException during estimation" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + (Config.ReducerEstimatorOverride -> "true") + + HadoopPlatformJobTest(new SimpleFileNotFoundJob(_, customConfig), cluster) + .runExpectFailure { case error: FlowException => + error.getCause.getClass should be(classOf[FileNotFoundException]) + } + } + } + + "Group-all job with reducer estimator" should { + "run with correct number of reducers (i.e. 
1)" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + HadoopPlatformJobTest(new GroupAllJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + conf.getNumReducers should contain(1) + } + .run() + } + } + + "Multi-step job with reducer estimator" should { + "run with correct number of reducers in each step" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + HadoopPlatformJobTest(new HipJob(_, customConfig), cluster) + .sink[Double](out)(_.head shouldBe 2.86 +- 0.0001) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + val reducers = steps.map(_.getConfig.getInt(Config.HadoopNumReducers, 0)).toList + reducers shouldBe List(3, 1, 1) + } + .run() + } + } + + "Map-only job with reducer estimator" should { + "not set num reducers" in { + val customConfig = Config.empty.addReducerEstimator(classOf[InputSizeReducerEstimator]) + + (InputSizeReducerEstimator.BytesPerReducer -> (1L << 10).toString) + + HadoopPlatformJobTest(new SimpleMapOnlyJob(_, customConfig), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + + val conf = Config.fromHadoop(steps.head.getConfig) + val numReducers = conf.getNumReducers + assert(!numReducers.isDefined || numReducers.get == 0, "Reducers should be 0") + } + .run() + } + } +} diff --git a/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala new file mode 100644 index 0000000000..1d0a60f282 --- /dev/null +++ 
b/scalding-estimators-test/src/test/scala/com/twitter/scalding/reducer_estimation/RuntimeReducerEstimatorTest.scala @@ -0,0 +1,210 @@ +package com.twitter.scalding.reducer_estimation + +import com.twitter.scalding._ +import RuntimeReducerEstimator.{EstimationScheme, IgnoreInputSize, RuntimePerReducer} +import com.twitter.scalding.estimation.{Estimator, FlowStepHistory, FlowStrategyInfo, HistoryService} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import org.scalatest.{Matchers, WordSpec} +import scala.collection.JavaConverters._ +import scala.util.{Success, Try} + +object HistoryService1 extends HistoryService { + import HistoryServiceWithData._ + + def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + Success( + Seq( + makeHistory(inputSize * 2, 0, List(10, 1000, 3000)), + makeHistory(inputSize / 2, 0, List(10, 200, 400)), + makeHistory(inputSize * 4, 0, List(10, 2400, 3000)) + ) + ) +} + +class Estimator1 extends RuntimeReducerEstimator { + override val historyService = HistoryService1 +} + +class EmptyRuntimeEstimator extends RatioBasedEstimator { + override val historyService = EmptyHistoryService +} + +class ErrorRuntimeEstimator extends RatioBasedEstimator { + override val historyService = ErrorHistoryService +} + +class DummyEstimator extends Estimator[Int] { + override def estimate(info: FlowStrategyInfo): Option[Int] = Some(42) +} + +class RuntimeReducerEstimatorTest extends WordSpec with Matchers with HadoopSharedPlatformTest { + + "Single-step job with runtime-based reducer estimator" should { + "set reducers correctly with median estimation scheme" in { + val config = Config.empty + .addReducerEstimator(classOf[Estimator1]) + .+(RuntimePerReducer -> "25") + // + (EstimationScheme -> "median") + // + (IgnoreInputSize -> false) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, config), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala 
+ assert(steps.length == 1) + + val conf = steps.head.getConfig + // So our histories are (taking median runtimes): + // + // 2 * inputSize bytes, 3 reducers * 1000 ms for each reducer + // inputSize / 2 bytes, 3 reducers * 200 ms for each reducer + // inputSize * 4 bytes, 3 reducers * 2400 ms for each reducer + // + // If we scale by input size, we get: + // + // (1500 / inputSize) ms per byte + // (1200 / inputSize) ms per byte + // (1800 / inputSize) ms per byte + // + // The median of these is (1500 / inputSize) ms per byte, + // so we anticipate that processing (inputSize bytes) + // will take 1500 ms total. + // To do this in 25 ms, we need 60 reducers. + assert(conf.getNumReduceTasks == 60) + } + .run() + } + + "set reducers correctly with mean estimation scheme" in { + val config = Config.empty + .addReducerEstimator(classOf[Estimator1]) + .+(RuntimePerReducer -> "25") + .+(EstimationScheme -> "mean") + // + (IgnoreInputSize -> false) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, config), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + assert(steps.length == 1) + + val conf = steps.head.getConfig + // So our histories are (taking mean runtimes): + // + // 2 * inputSize bytes, 3 reducers * 1336.67 ms for each reducer + // inputSize / 2 bytes, 3 reducer * 203.33 ms for each reducer + // inputSize * 4 bytes, 3 reducer * 1803.33 ms for each reducer + // + // If we scale by input size, we get: + // + // (2005 / inputSize) ms per byte + // (1220 / inputSize) ms per byte + // (1352.5 / inputSize) ms per byte + // + // The mean of these is (1525.8 / inputSize) ms per byte, + // so we anticipate that processing (inputSize bytes) + // will take 1525.8 ms total. + // + // To do this in 25 ms, we need 61.03 reducers, which rounds up to 62. 
+ assert(conf.getNumReduceTasks == 62) + } + .run() + } + + "set reducers correctly with mean estimation scheme ignoring input size" in { + val config = Config.empty + .addReducerEstimator(classOf[Estimator1]) + .+(RuntimePerReducer -> "25") + .+(EstimationScheme -> "mean") + .+(IgnoreInputSize -> "true") + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, config), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + assert(steps.length == 1) + + val conf = steps.head.getConfig + // So our histories are (taking mean runtimes): + // + // 2 * inputSize bytes, 3 reducers * 1337 ms for each reducer + // inputSize / 2 bytes, 3 reducers * 203 ms for each reducer + // inputSize * 4 bytes, 3 reducers * 1803 ms for each reducer + // + // We don't scale by input size. + // + // The mean of these is 3342 ms, so we anticipate + // that the job will take 3342 ms total. + // + // To do this in 25 ms, we need 134 reducers. + assert(conf.getNumReduceTasks == 134) + } + .run() + } + + "set reducers correctly with median estimation scheme ignoring input size" in { + val config = Config.empty + .addReducerEstimator(classOf[Estimator1]) + .+(RuntimePerReducer -> "25") + .+(IgnoreInputSize -> "true") + // + (EstimationScheme -> "median") + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, config), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + assert(steps.length == 1) + + val conf = steps.head.getConfig + // So our histories are (taking median runtimes): + // + // 2 * inputSize bytes, 3 reducers * 1000 ms for each reducer + // inputSize / 2 bytes, 3 reducers * 200 ms for each reducer + // inputSize * 4 bytes, 3 reducers * 2400 ms for each reducer + // + // We don't scale by input size. + // + // The median of these is 3000 ms, so we anticipate + // that the job will take 3000 ms total. + // + // To do this in 25 ms, we need 120 reducers. 
+ assert(conf.getNumReduceTasks == 120) + } + .run() + } + + "not set reducers when history service is empty" in { + val config = Config.empty + .addReducerEstimator(classOf[EmptyRuntimeEstimator]) + .addReducerEstimator(classOf[DummyEstimator]) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, config), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + assert(steps.length == 1) + + val conf = steps.head.getConfig + + // EmptyRuntimeEstimator should have returned None, + // so it should have fallen back to DummyEstimator, + // which returns 42. + assert(conf.getNumReduceTasks == 42) + } + } + + "not set reducers when history service fails" in { + val config = Config.empty + .addReducerEstimator(classOf[ErrorRuntimeEstimator]) + .addReducerEstimator(classOf[DummyEstimator]) + + HadoopPlatformJobTest(new SimpleJobWithNoSetReducers(_, config), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + assert(steps.length == 1) + + val conf = steps.head.getConfig + + // ErrorRuntimeEstimator should have returned None, + // so it should have fallen back to DummyEstimator, + // which returns 42. 
+ assert(conf.getNumReduceTasks == 42) + } + } + } +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala new file mode 100644 index 0000000000..74120040a1 --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatform.scala @@ -0,0 +1,69 @@ +package com.twitter.scalding.platform + +import com.twitter.scalding._ +import com.twitter.scalding.source.TypedText + +import java.io.{BufferedWriter, File, FileWriter} + +import org.slf4j.LoggerFactory + +import scala.util.Try + +trait HadoopPlatform[P, R, T <: HadoopPlatform[P, R, T]] { + private val LOG = LoggerFactory.getLogger(getClass) + + val cons: (P) => R + val cluster: LocalCluster + + val dataToCreate: Seq[(String, Seq[String])] + val sourceWriters: Seq[P => R] + val sourceReaders: Seq[Mode => Unit] + + def arg(key: String, value: String): T + + def data(data: (String, Seq[String])): T + + def source[K: TypeDescriptor](location: String, data: Seq[K]): T = + source(TypedText.tsv[K](location), data) + + def source[K](out: TypedSink[K], data: Seq[K]): T + + def sink[K: TypeDescriptor](location: String)(toExpect: Seq[K] => Unit): T = + sink(TypedText.tsv[K](location))(toExpect) + + def sink[K](in: Mappable[K])(toExpect: Seq[K] => Unit): T + + def run(): Unit + + def runExpectFailure[K](fn: Throwable => K): K = + fn(Try(run()).failed.get) + + def init(cons: P => R): R + + def execute(unit: R): Unit + + protected def createSources(): Unit = { + dataToCreate.foreach { case (location, lines) => + val tmpFile = File.createTempFile("hadoop_platform", "job_test") + tmpFile.deleteOnExit() + if (lines.nonEmpty) { + val os = new BufferedWriter(new FileWriter(tmpFile)) + os.write(lines.head) + lines.tail.foreach { str => + os.newLine() + os.write(str) + } + os.close() + } + cluster.putFile(tmpFile, location) + tmpFile.delete() + } + + 
sourceWriters.foreach(cons => execute(init(cons))) + } + + protected def checkSinks(): Unit = { + LOG.debug("Executing sinks") + sourceReaders.foreach(_(cluster.mode)) + } +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala new file mode 100644 index 0000000000..67ce152286 --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformExecutionTest.scala @@ -0,0 +1,54 @@ +package com.twitter.scalding.platform + +import cascading.flow.Flow +import com.twitter.scalding._ +import org.apache.hadoop.mapred.JobConf +import scala.util.{Failure, Success} + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCompanionCascadingExtensions + +case class HadoopPlatformExecutionTest( + cons: (Config) => Execution[_], + cluster: LocalCluster, + parameters: Map[String, String] = Map.empty, + dataToCreate: Seq[(String, Seq[String])] = Vector(), + sourceWriters: Seq[Config => Execution[_]] = Vector.empty, + sourceReaders: Seq[Mode => Unit] = Vector.empty, + flowCheckers: Seq[Flow[JobConf] => Unit] = Vector.empty +) extends HadoopPlatform[Config, Execution[_], HadoopPlatformExecutionTest] { + + def config: Config = + Config.defaultFrom(cluster.mode) ++ Config.from(parameters) + + override def arg(key: String, value: String): HadoopPlatformExecutionTest = + copy(parameters = parameters + (key -> value)) + + override def data(data: (String, Seq[String])): HadoopPlatformExecutionTest = + copy(dataToCreate = dataToCreate :+ data) + + override def source[K](out: TypedSink[K], data: Seq[K]): HadoopPlatformExecutionTest = + copy(sourceWriters = sourceWriters :+ { config: Config => + TypedPipe.from(data).writeExecution(out) + }) + + override def sink[K](in: Mappable[K])(toExpect: (Seq[K]) => Unit): HadoopPlatformExecutionTest = + copy(sourceReaders = sourceReaders 
:+ { m: Mode => toExpect(in.toIterator(config, m).toSeq) }) + + override def run(): Unit = { + System.setProperty("cascading.update.skip", "true") + val execution: Execution[Any] = init(cons) + cluster.addClassSourceToClassPath(cons.getClass) + cluster.addClassSourceToClassPath(execution.getClass) + createSources() + execute(execution) + checkSinks() + } + + override def init(cons: (Config) => Execution[_]): Execution[_] = cons(config) + + override def execute(unit: Execution[_]): Unit = + unit.waitFor(config, cluster.mode) match { + case Success(_) => () + case Failure(e) => throw e + } +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala new file mode 100644 index 0000000000..1a5cce1d27 --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopPlatformJobTest.scala @@ -0,0 +1,86 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.platform + +import cascading.flow.Flow +import com.twitter.scalding._ + +import org.apache.hadoop.mapred.JobConf + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions.ConfigCompanionCascadingExtensions + +/** + * This class is used to construct unit tests in scalding which use Hadoop's MiniCluster to more fully + * simulate and test the logic which is deployed in a job. 
+ */ +case class HadoopPlatformJobTest( + cons: (Args) => Job, + cluster: LocalCluster, + argsMap: Map[String, List[String]] = Map.empty, + dataToCreate: Seq[(String, Seq[String])] = Vector(), + sourceWriters: Seq[Args => Job] = Vector.empty, + sourceReaders: Seq[Mode => Unit] = Vector.empty, + flowCheckers: Seq[Flow[JobConf] => Unit] = Vector.empty +) extends HadoopPlatform[Args, Job, HadoopPlatformJobTest] { + + override def arg(key: String, value: String): HadoopPlatformJobTest = + copy(argsMap = argsMap + (key -> List(value))) + + override def data(data: (String, Seq[String])): HadoopPlatformJobTest = + copy(dataToCreate = dataToCreate :+ data) + + override def source[T](out: TypedSink[T], data: Seq[T]): HadoopPlatformJobTest = + copy(sourceWriters = sourceWriters :+ { args: Args => + new Job(args) { + TypedPipe.from(List("")).flatMap(_ => data).write(out) + } + }) + + override def sink[T](in: Mappable[T])(toExpect: (Seq[T]) => Unit): HadoopPlatformJobTest = + copy(sourceReaders = sourceReaders :+ { m: Mode => + toExpect(in.toIterator(Config.defaultFrom(m), m).toSeq) + }) + + def inspectCompletedFlow(checker: Flow[JobConf] => Unit): HadoopPlatformJobTest = + copy(flowCheckers = flowCheckers :+ checker) + + override def run(): Unit = { + System.setProperty("cascading.update.skip", "true") + val job = init(cons) + cluster.addClassSourceToClassPath(cons.getClass) + cluster.addClassSourceToClassPath(job.getClass) + createSources() + execute(job) + checkSinks() + flowCheckers.foreach { checker => + job.completedFlow.collect { case f: Flow[JobConf @unchecked] => + checker(f) + } + } + } + + override def init(cons: Args => Job): Job = cons(Mode.putMode(cluster.mode, new Args(argsMap))) + + @annotation.tailrec + override final def execute(job: Job): Unit = { + job.run() + job.clear() + job.next match { // linter:ignore:UseOptionForeachNotPatMatch + case Some(nextJob) => execute(nextJob) + case None => () + } + } +} diff --git 
a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala new file mode 100644 index 0000000000..3f5cae845b --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/HadoopSharedPlatformTest.scala @@ -0,0 +1,48 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.platform + +import org.scalatest.{BeforeAndAfterAll, Suite} + +trait HadoopSharedPlatformTest extends BeforeAndAfterAll { this: Suite => + org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("org.mortbay").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("org.apache.hadoop.metrics2.util").setLevel(org.apache.log4j.Level.ERROR) + + val cluster = LocalCluster() + + def initialize() = cluster.initialize() + + override def beforeAll(): Unit = { + cluster.synchronized { + initialize() + } + super.beforeAll() + } + + // TODO is there a way to buffer such that we see test results AFTER afterEach? Otherwise the results + // get lost in the logging + override def afterAll(): Unit = + try super.afterAll() + finally { + // Necessary because afterAll can be called from a different thread and we want to make sure that the state + // is visible. 
Note that this assumes there is no contention for LocalCluster (which LocalCluster ensures), + // otherwise there could be deadlock. + cluster.synchronized { + cluster.shutdown() + } + } +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala new file mode 100644 index 0000000000..6fd5e5ead8 --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/LocalCluster.scala @@ -0,0 +1,195 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.platform + +import com.twitter.scalding._ + +import java.io.{File, RandomAccessFile} +import java.nio.channels.FileLock + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.mapreduce.filecache.DistributedCache +import org.apache.hadoop.fs.{FileUtil, Path} +import org.apache.hadoop.hdfs.MiniDFSCluster +import org.apache.hadoop.mapred.{JobConf, MiniMRCluster} +import org.slf4j.LoggerFactory +import org.slf4j.impl.Log4jLoggerAdapter + +object LocalCluster { + private final val HADOOP_CLASSPATH_DIR = new Path("/tmp/hadoop-classpath-lib") + private final val MUTEX = new RandomAccessFile("NOTICE", "rw").getChannel + + def apply() = new LocalCluster() +} + +class LocalCluster(mutex: Boolean = true) { + org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("org.mortbay").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("BlockStateChange").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("SecurityLogger").setLevel(org.apache.log4j.Level.ERROR) + + private val LOG = LoggerFactory.getLogger(getClass) + + private var hadoop: Option[(MiniDFSCluster, MiniMRCluster, JobConf)] = None + private def getHadoop = hadoop.getOrElse(throw new Exception("Hadoop has not been initialized")) + + private def dfs = getHadoop._1 + private def cluster = getHadoop._2 + private def jobConf = getHadoop._3 + private def fileSystem = dfs.getFileSystem + + private var classpath = Set[File]() + private var lock: Option[FileLock] = None + + // The Mini{DFS,MR}Cluster does not make it easy or clean to have two different processes + // running without colliding. Thus we implement our own mutex. Mkdir should be atomic so + // there should be no race. Just to be careful, however, we make sure that the file + // is what we expected, or else we fail. 
+ private[this] def acquireMutex(): Unit = { + LOG.debug("Attempting to acquire mutex") + lock = Some(LocalCluster.MUTEX.lock()) + LOG.debug("Mutex file acquired") + } + + private[this] def releaseMutex(): Unit = { + LOG.debug("Releasing mutex") + lock.foreach(_.release()) + LOG.debug("Mutex released") + lock = None + } + + /** + * Start up the local cluster instance. + * + * @param inConf + * override default configuration + */ + def initialize(inConf: Config = Config.empty): this.type = { + if (mutex) { + acquireMutex() + } + + if (Option(System.getProperty("hadoop.log.dir")).isEmpty) { + System.setProperty("hadoop.log.dir", "build/test/logs") + } + new File(System.getProperty("hadoop.log.dir")).mkdirs() + + val conf = new Configuration + val dfs = new MiniDFSCluster(conf, 4, true, null) + val fileSystem = dfs.getFileSystem + val cluster = new MiniMRCluster(4, fileSystem.getUri.toString, 1, null, null, new JobConf(conf)) + val mrJobConf = cluster.createJobConf() + mrJobConf.setInt("mapred.submit.replication", 2) + mrJobConf.set("mapred.map.max.attempts", "2") + mrJobConf.set("mapred.reduce.max.attempts", "2") + mrJobConf.set("mapred.child.java.opts", "-Xmx512m") + mrJobConf.setInt("mapred.job.reuse.jvm.num.tasks", -1) + mrJobConf.setInt("mapreduce.client.completion.pollinterval", 20) + mrJobConf.setInt("mapreduce.client.progressmonitor.pollinterval", 20) + mrJobConf.setInt("ipc.ping.interval", 500) + mrJobConf.setInt("dfs.client.socket-timeout", 50) + mrJobConf.set("mapreduce.job.ubertask.enable", "true") + mrJobConf.setInt("mapreduce.job.ubertask.maxmaps", 500) + mrJobConf.setInt("mapreduce.job.ubertask.maxreduces", 500) + mrJobConf.setInt("ipc.client.connection.maxidletime", 50) + + mrJobConf.setMapSpeculativeExecution(false) + mrJobConf.setReduceSpeculativeExecution(false) + mrJobConf.set("mapreduce.user.classpath.first", "true") + + LOG.debug("Creating directory to store jars on classpath: " + LocalCluster.HADOOP_CLASSPATH_DIR) + 
fileSystem.mkdirs(LocalCluster.HADOOP_CLASSPATH_DIR) + + // merge in input configuration + inConf.toMap.foreach { case (k, v) => mrJobConf.set(k, v) } + + hadoop = Some(dfs, cluster, mrJobConf) + + // TODO I desperately want there to be a better way to do this. I'd love to be able to run ./sbt assembly and depend + // on that, but I couldn't figure out how to make that work. + val baseClassPath = List( + getClass, + classOf[JobConf], + classOf[Option[_]], + classOf[LoggerFactory], + classOf[Log4jLoggerAdapter], + classOf[org.apache.hadoop.net.StaticMapping], + classOf[org.apache.hadoop.yarn.server.MiniYARNCluster], + classOf[com.twitter.scalding.Args], + classOf[org.apache.log4j.LogManager], + classOf[com.twitter.scalding.RichDate], + classOf[cascading.tuple.TupleException], + classOf[com.twitter.chill.Externalizer[_]], + classOf[com.twitter.chill.algebird.AveragedValueSerializer], + classOf[com.twitter.algebird.Semigroup[_]], + classOf[com.twitter.chill.KryoInstantiator], + classOf[org.jgrapht.ext.EdgeNameProvider[_]], + classOf[org.apache.commons.lang.StringUtils], + classOf[cascading.scheme.local.TextDelimited], + classOf[org.apache.commons.logging.LogFactory], + classOf[org.apache.commons.codec.binary.Base64], + classOf[com.twitter.scalding.IntegralComparator], + classOf[org.apache.commons.collections.Predicate], + classOf[com.esotericsoftware.kryo.KryoSerializable], + classOf[com.twitter.chill.hadoop.KryoSerialization], + classOf[com.twitter.maple.tap.TupleMemoryInputFormat], + classOf[org.apache.commons.configuration.Configuration] + ).foreach(addClassSourceToClassPath(_)) + this + } + + def addClassSourceToClassPath[T](clazz: Class[T]): Unit = + addFileToHadoopClassPath(getFileForClass(clazz)) + + def addFileToHadoopClassPath(resourceDir: File): Boolean = + if (classpath.contains(resourceDir)) { + LOG.debug("Already on Hadoop classpath: " + resourceDir) + false + } else { + LOG.debug("Not yet on Hadoop classpath: " + resourceDir) + val localJarFile = if 
(resourceDir.isDirectory) MakeJar(resourceDir) else resourceDir + val hdfsJarPath = new Path(LocalCluster.HADOOP_CLASSPATH_DIR, localJarFile.getName) + fileSystem.copyFromLocalFile(new Path("file://%s".format(localJarFile.getAbsolutePath)), hdfsJarPath) + DistributedCache.addFileToClassPath(hdfsJarPath, jobConf, fileSystem) + LOG.debug("Added to Hadoop classpath: " + localJarFile) + classpath += resourceDir + true + } + + private def getFileForClass[T](clazz: Class[T]): File = + new File(clazz.getProtectionDomain.getCodeSource.getLocation.toURI) + + def mode: Mode = Hdfs(true, jobConf) + + def putFile(file: File, location: String): Boolean = { + val hdfsLocation = new Path(location) + val exists = fileSystem.exists(hdfsLocation) + if (!exists) FileUtil.copy(file, fileSystem, hdfsLocation, false, jobConf) + exists + } + + // TODO is there a way to know if we need to wait on anything to shut down, etc? + def shutdown(): Unit = { + hadoop.foreach { case (dfs, mr, _) => + dfs.shutdown() + mr.shutdown() + } + hadoop = None + if (mutex) { + releaseMutex() + } + } +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala new file mode 100644 index 0000000000..b8b90c3e28 --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/MakeJar.scala @@ -0,0 +1,88 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ */ +package com.twitter.scalding.platform + +import java.io.{BufferedInputStream, File, FileInputStream, FileOutputStream} +import java.util.jar.{Attributes, JarEntry, JarOutputStream, Manifest => JarManifest} + +import org.slf4j.LoggerFactory + +object MakeJar { + private val LOG = LoggerFactory.getLogger(getClass) + + def apply(classDir: File, jarName: Option[String] = None): File = { + val syntheticJar = new File( + System.getProperty("java.io.tmpdir"), + jarName.getOrElse(classDir.getAbsolutePath.replace("/", "_") + ".jar") + ) + LOG.debug("Creating synthetic jar: " + syntheticJar.getAbsolutePath) + val manifest = new JarManifest + manifest.getMainAttributes.put(Attributes.Name.MANIFEST_VERSION, "1.0") + val target = new JarOutputStream(new FileOutputStream(syntheticJar), manifest) + add(classDir, classDir, target) + target.close() + new File(syntheticJar.getAbsolutePath) + } + + private[this] def add(parent: File, source: File, target: JarOutputStream): Unit = { + val name = getRelativeFileBetween(parent, source).getOrElse(new File("")).getPath.replace("\\", "/") + if (source.isDirectory) { + if (!name.isEmpty) { + val entry = new JarEntry(if (!name.endsWith("/")) name + "/" else name) + entry.setTime(source.lastModified()) + target.putNextEntry(entry) + target.closeEntry() + } + source.listFiles.foreach(add(parent, _, target)) + } else { + val entry = new JarEntry(name) + entry.setTime(source.lastModified) + target.putNextEntry(entry) + val in = new BufferedInputStream(new FileInputStream(source)) + val buffer = new Array[Byte](1024) + var count = in.read(buffer) + while (count > -1) { + target.write(buffer, 0, count) + count = in.read(buffer) + } + target.closeEntry + in.close() + } + } + + // Note that this assumes that parent and source are in absolute form if that's what we want + @annotation.tailrec + private[this] def getRelativeFileBetween( + parent: File, + source: File, + result: List[String] = List.empty + ): Option[File] = + Option(source) match 
{ // linter:disable:UseOptionFlatMapNotPatMatch // need as is for tailrec + case Some(src) => { + if (parent == src) { + result.foldLeft(None: Option[File]) { (cum, part) => + Some(cum match { + case Some(p) => new File(p, part) + case None => new File(part) + }) + } + } else { + getRelativeFileBetween(parent, src.getParentFile, src.getName :: result) + } + } + case None => None + } +} diff --git a/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala new file mode 100644 index 0000000000..5d607712a2 --- /dev/null +++ b/scalding-hadoop-test/src/main/scala/com/twitter/scalding/platform/Scalatest.scala @@ -0,0 +1,52 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.platform + +import org.scalatest.{BeforeAndAfterEach, Suite} + +/** + * This is a mixin fixture for scalatest which makes it easy to use a LocalCluster and will manage the + * lifecycle of one appropriately. 
+ */ +trait HadoopPlatformTest extends BeforeAndAfterEach { this: Suite => + org.apache.log4j.Logger.getLogger("org.apache.hadoop").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("org.mortbay").setLevel(org.apache.log4j.Level.ERROR) + org.apache.log4j.Logger.getLogger("org.apache.hadoop.metrics2.util").setLevel(org.apache.log4j.Level.ERROR) + + val cluster = LocalCluster() + + def initialize() = cluster.initialize() + + override def beforeEach(): Unit = { + cluster.synchronized { + initialize() + } + super.beforeEach() + } + + // TODO is there a way to buffer such that we see test results AFTER afterEach? Otherwise the results + // get lost in the logging + override def afterEach(): Unit = + try super.afterEach() + finally { + // Necessary because afterAll can be called from a different thread and we want to make sure that the state + // is visible. Note that this assumes there is no contention for LocalCluster (which LocalCluster ensures), + // otherwise there could be deadlock. 
+ cluster.synchronized { + cluster.shutdown() + } + } +} diff --git a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala new file mode 100644 index 0000000000..3f8669b1e8 --- /dev/null +++ b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformExecutionTest.scala @@ -0,0 +1,104 @@ +package com.twitter.scalding.platform + +import com.twitter.scalding.{Config, Execution, TypedPipe, TypedTsv} +import org.scalatest.{Matchers, WordSpec} +import scala.io.Source + +import com.twitter.scalding.typed.cascading_backend.CascadingExtensions._ + +object InAndOutExecution extends Function[Config, Execution[Unit]] { + override def apply(config: Config): Execution[Unit] = + TypedPipe + .from(TypedTsv[String]("input")) + .writeExecution(TypedTsv[String]("output")) +} + +object OneDistributedCacheExecution extends Function[Config, Execution[Unit]] { + val one: (String, Seq[String]) = ("one", Seq("a", "d")) + val input = Seq("a", "b", "c", "d") + val output = Seq("a", "d") + + override def apply(v1: Config): Execution[Unit] = + Execution.withCachedFile("one") { theOne => + lazy val symbols = Source + .fromFile(theOne.file) + .getLines() + .toSeq + + TypedPipe + .from(TypedTsv[String]("input")) + .filter { symbol => + symbols.contains(symbol) + } + .writeExecution(TypedTsv[String]("output")) + } +} + +object MultipleDistributedCacheExecution extends Function[Config, Execution[Unit]] { + val first: (String, Seq[String]) = ("first", Seq("a", "d")) + val second: (String, Seq[String]) = ("second", Seq("c")) + val input = Seq("a", "b", "c", "d") + val output = Seq("a", "c", "d") + + override def apply(v1: Config): Execution[Unit] = + Execution.withCachedFile("first") { theFirst => + Execution.withCachedFile("second") { theSecond => + lazy val firstSymbols = + Source + .fromFile(theFirst.file) + .getLines() + .toSeq + + lazy val 
secondSymbols = + Source + .fromFile(theSecond.file) + .getLines() + .toSeq + + lazy val symbols = firstSymbols ++ secondSymbols + + TypedPipe + .from(TypedTsv[String]("input")) + .filter { symbol => + symbols.contains(symbol) + } + .writeExecution(TypedTsv[String]("output")) + } + } +} + +class PlatformExecutionTest extends WordSpec with Matchers with HadoopSharedPlatformTest { + "An InAndOutTest" should { + val inAndOut = Seq("a", "b", "c") + + "reading then writing shouldn't change the data" in { + HadoopPlatformExecutionTest(InAndOutExecution, cluster) + .source("input", inAndOut) + .sink[String]("output")(_.toSet shouldBe inAndOut.toSet) + .run() + } + } + + "An DistributedCacheTest" should { + "have access to file on hadoop" in { + import OneDistributedCacheExecution._ + + HadoopPlatformExecutionTest(OneDistributedCacheExecution, cluster) + .data(one) + .source("input", input) + .sink[String]("output")(_ shouldBe output) + .run() + } + + "have access to multiple files on hadoop" in { + import MultipleDistributedCacheExecution._ + + HadoopPlatformExecutionTest(MultipleDistributedCacheExecution, cluster) + .data(first) + .data(second) + .source("input", input) + .sink[String]("output")(_ shouldBe output) + .run() + } + } +} diff --git a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala new file mode 100644 index 0000000000..9be7e73571 --- /dev/null +++ b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/PlatformTest.scala @@ -0,0 +1,791 @@ +/* +Copyright 2014 Twitter, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +package com.twitter.scalding.platform + +import java.util.{Iterator => JIterator} + +import cascading.flow.FlowException +import cascading.pipe.joiner.{InnerJoin, JoinerClosure} +import cascading.tap.Tap +import cascading.tuple.{Fields, Tuple} +import com.twitter.scalding._ +import com.twitter.scalding.serialization.OrderedSerialization +import com.twitter.scalding.source.{FixedTypedText, NullSink, TypedText} +import org.scalacheck.{Arbitrary, Gen} +import org.scalatest.{Matchers, WordSpec} + +import scala.collection.JavaConverters._ +import scala.language.experimental.macros + +class InAndOutJob(args: Args) extends Job(args) { + Tsv("input").read.write(Tsv("output")) +} + +object TinyJoinAndMergeJob { + val peopleInput = TypedTsv[Int]("input1") + val peopleData = List(1, 2, 3, 4) + + val messageInput = TypedTsv[Int]("input2") + val messageData = List(1, 2, 3) + + val output = TypedTsv[(Int, Int)]("output") + val outputData = List((1, 2), (2, 2), (3, 2), (4, 1)) +} + +class TinyJoinAndMergeJob(args: Args) extends Job(args) { + import TinyJoinAndMergeJob._ + + val people = peopleInput.read.mapTo(0 -> 'id) { v: Int => v } + + val messages = messageInput.read + .mapTo(0 -> 'id) { v: Int => v } + .joinWithTiny('id -> 'id, people) + + (messages ++ people).groupBy('id)(_.size('count)).write(output) +} + +object TsvNoCacheJob { + val dataInput = TypedTsv[String]("fakeInput") + val data = List("-0.2f -0.3f -0.5f", "-0.1f", "-0.5f") + + val throwAwayOutput = Tsv("output1") + val typedThrowAwayOutput = TypedTsv[Float]("output1") + val realOuput = Tsv("output2") + val 
typedRealOutput = TypedTsv[Float]("output2") + val outputData = List(-0.5f, -0.2f, -0.3f, -0.1f).sorted +} +class TsvNoCacheJob(args: Args) extends Job(args) { + import TsvNoCacheJob._ + dataInput.read + .flatMap(new cascading.tuple.Fields(Integer.valueOf(0)) -> 'word) { line: String => line.split("\\s") } + .groupBy('word)(group => group.size) + .mapTo('word -> 'num)((w: String) => w.toFloat) + .write(throwAwayOutput) + .groupAll(_.sortBy('num)) + .write(realOuput) +} + +object IterableSourceDistinctJob { + val data = List("a", "b", "c") +} + +class IterableSourceDistinctJob(args: Args) extends Job(args) { + import IterableSourceDistinctJob._ + + TypedPipe.from(data ++ data ++ data).distinct.write(TypedTsv("output")) +} + +class IterableSourceDistinctIdentityJob(args: Args) extends Job(args) { + import IterableSourceDistinctJob._ + + TypedPipe.from(data ++ data ++ data).distinctBy(identity).write(TypedTsv("output")) +} + +class NormalDistinctJob(args: Args) extends Job(args) { + TypedPipe.from(TypedTsv[String]("input")).distinct.write(TypedTsv("output")) +} + +object MultipleGroupByJobData { + val data: List[String] = { + val rnd = new scala.util.Random(22) + (0 until 20).map(_ => rnd.nextLong.toString).toList + }.distinct +} + +class MultipleGroupByJob(args: Args) extends Job(args) { + import com.twitter.scalding.serialization._ + import MultipleGroupByJobData._ + implicit val stringOrdSer: OrderedSerialization[String] = new StringOrderedSerialization() + implicit val stringTup2OrdSer: OrderedSerialization[(String, String)] = + new OrderedSerialization2(stringOrdSer, stringOrdSer) + val otherStream = TypedPipe.from(data).map(k => (k, k)).group + + TypedPipe + .from(data) + .map(k => (k, 1L)) + .group(stringOrdSer) + .sum + .map { case (k, _) => + ((k, k), 1L) + } + .sumByKey(stringTup2OrdSer, implicitly) + .map(_._1._1) + .map { t => + (t.toString, t) + } + .group + .leftJoin(otherStream) + .map(_._1) + .write(TypedTsv("output")) + +} + +class 
TypedPipeHashJoinWithForceToDiskJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val y = TypedPipe.from[(Int, String)](List((1, "first"))) + + // trivial transform and forceToDisk on the rhs + val yMap = y.map(p => (p._1, p._2.toUpperCase)).forceToDisk + + x.hashJoin(yMap) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (Int, String))]("output")) +} + +class TypedPipeHashJoinWithForceToDiskFilterJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val y = TypedPipe.from[(Int, String)](List((1, "first"))) + + // trivial transform and forceToDisk followed by filter on rhs + val yFilter = y.map(p => (p._1, p._2.toUpperCase)).forceToDisk.filter(p => p._1 == 1) + + x.hashJoin(yFilter) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (Int, String))]("output")) +} + +class TypedPipeHashJoinWithForceToDiskWithComplete(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val y = TypedPipe.from[(Int, String)](List((1, "first"))) + + // trivial transform and forceToDisk followed by WithComplete on rhs + val yComplete = y.map(p => (p._1, p._2.toUpperCase)).forceToDisk.onComplete(() => println("step complete")) + + x.hashJoin(yComplete) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (Int, String))]("output")) +} + +class TypedPipeHashJoinWithForceToDiskMapJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, false) + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val y = TypedPipe.from[(Int, String)](List((1, "first"))) + + // trivial transform and forceToDisk followed by map on rhs + val yMap = y.map(p => (p._1, p._2.toUpperCase)).forceToDisk.map(p => (p._1, p._2.toLowerCase)) + + x.hashJoin(yMap) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (Int, String))]("output")) +} + 
+class TypedPipeHashJoinWithForceToDiskMapWithAutoForceJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val y = TypedPipe.from[(Int, String)](List((1, "first"))) + + // trivial transform and forceToDisk followed by map on rhs + val yMap = y.map(p => (p._1, p._2.toUpperCase)).forceToDisk.map(p => (p._1, p._2.toLowerCase)) + + x.hashJoin(yMap) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (Int, String))]("output")) +} + +class TypedPipeHashJoinWithGroupByJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.fromPipe[(String, Int)](Tsv("input1", ('x1, 'y1)), Fields.ALL) + val y = Tsv("input2", ('x2, 'y2)) + + val yGroup = y.groupBy('x2)(p => p) + val yTypedPipe = TypedPipe.fromPipe[(String, Int)](yGroup, Fields.ALL) + + x.hashJoin(yTypedPipe) + .withDescription("hashJoin") + .write(TypedTsv[(String, (Int, Int))]("output")) +} + +class TypedPipeHashJoinWithCoGroupJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val in0 = Tsv("input0").read.mapTo((0, 1) -> ('x0, 'a)) { input: (Int, Int) => input } + val in1 = Tsv("input1").read.mapTo((0, 1) -> ('x1, 'b)) { input: (Int, Int) => input } + + val coGroupPipe = in0.coGroupBy('x0) { + _.coGroup('x1, in1, OuterJoinMode) + } + + val coGroupTypedPipe = TypedPipe.fromPipe[(Int, Int, Int)](coGroupPipe, Fields.ALL) + val coGroupTuplePipe = coGroupTypedPipe.map { case (a, b, c) => (a, (b, c)) } + x.hashJoin(coGroupTuplePipe) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (Int, (Int, Int)))]("output")) +} + +class TypedPipeHashJoinWithEveryJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.fromPipe[(Int, String)](Tsv("input1", ('x1, 'y1)), Fields.ALL) + val y = Tsv("input2", ('x2, 'y2)).groupBy('x2) { + _.foldLeft('y2 -> 'y2)(0)((b: Int, a: Int) => 
b + a) + } + + val yTypedPipe = TypedPipe.fromPipe[(Int, Int)](y, Fields.ALL) + x.hashJoin(yTypedPipe) + .withDescription("hashJoin") + .write(TypedTsv[(Int, (String, Int))]("output")) +} + +class TypedPipeForceToDiskWithDescriptionJob(args: Args) extends Job(args) { + val writeWords = + TypedPipe + .from[String](List("word1 word2", "word1", "word2")) + .withDescription("write words to disk") + .flatMap(_.split("\\s+")) + .forceToDisk + writeWords + .groupBy(_.length) + .withDescription("output frequency by length") + .size + .write(TypedTsv[(Int, Long)]("output")) +} + +class GroupedLimitJobWithSteps(args: Args) extends Job(args) { + val writeWords = + TypedPipe + .from[String](List("word1 word2", "word1", "word2")) + .flatMap(_.split("\\s+")) + .map(k => k -> 1L) + .sumByKey + .limit(3) + + writeWords + .groupBy(_._1) + .head + .keys + .write(TypedTsv[String]("output1")) + + writeWords + .groupBy(_._1) + .head + .keys + .write(TypedTsv[String]("output2")) +} + +object OrderedSerializationTest { + implicit val genASGK: Arbitrary[NestedCaseClass] = Arbitrary { + for { + ts <- Arbitrary.arbitrary[Long] + b <- Gen.nonEmptyListOf(Gen.alphaNumChar).map(_.mkString) + } yield NestedCaseClass(RichDate(ts), (b, b)) + } + + def sample[T: Arbitrary]: T = Arbitrary.arbitrary[T].sample.get + val data = sample[List[NestedCaseClass]].take(1000) +} + +case class NestedCaseClass(day: RichDate, key: (String, String)) + +// Need to define this in a separate companion object to work around Scala 2.12 compile issues +object OrderedSerializationImplicitDefs { + implicit def primitiveOrderedBufferSupplier[T]: OrderedSerialization[T] = + macro com.twitter.scalding.serialization.macros.impl.OrderedSerializationProviderImpl[T] +} + +class ComplexJob(input: List[NestedCaseClass], args: Args) extends Job(args) { + import OrderedSerializationImplicitDefs._ + + val ds1 = TypedPipe.from(input).map(_ -> 1L).group.sorted.mapValueStream(_.map(_ * 2)).toTypedPipe.group + + val ds2 = 
TypedPipe.from(input).map(_ -> 1L).distinct.group + + ds2.keys + .map(s => s.toString) + .write(TypedTsv[String](args("output1"))) + + ds2 + .join(ds1) + .values + .map(_.toString) + .write(TypedTsv[String](args("output2"))) +} + +class ComplexJob2(input: List[NestedCaseClass], args: Args) extends Job(args) { + import OrderedSerializationImplicitDefs._ + + val ds1 = TypedPipe.from(input).map(_ -> (1L, "asfg")) + + val ds2 = TypedPipe.from(input).map(_ -> (1L, "sdf")) + + val execution = ds1.join(ds2).groupAll.size.values.toIterableExecution + val r = Config.tryFrom(config).get + execution.waitFor(r, mode).get + + ds1.map(_.toString).write(TypedTsv[String](args("output1"))) + ds2.map(_.toString).write(TypedTsv[String](args("output2"))) +} + +class CheckFlowProcessJoiner(uniqueID: UniqueID) extends InnerJoin { + override def getIterator(joinerClosure: JoinerClosure): JIterator[Tuple] = { + println("CheckFlowProcessJoiner.getItertor") + + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueID) + if (flowProcess == null) { + throw new NullPointerException("No active FlowProcess was available.") + } + + super.getIterator(joinerClosure) + } +} + +class CheckForFlowProcessInFieldsJob(args: Args) extends Job(args) { + val uniqueID = UniqueID.getIDFor(flowDef) + val stat = Stat("joins") + + val inA = Tsv("inputA", ('a, 'b)) + val inB = Tsv("inputB", ('x, 'y)) + + val p = inA.joinWithSmaller('a -> 'x, inB).map(('b, 'y) -> 'z) { args: (String, String) => + stat.inc() + + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueID) + if (flowProcess == null) { + throw new NullPointerException("No active FlowProcess was available.") + } + + s"${args._1},${args._2}" + } + + p.write(Tsv("output", ('b, 'y))) +} + +class CheckForFlowProcessInTypedJob(args: Args) extends Job(args) { + val uniqueID = UniqueID.getIDFor(flowDef) + val stat = Stat("joins") + + val inA = TypedPipe.from(TypedTsv[(String, String)]("inputA")) + val inB = TypedPipe.from(TypedTsv[(String, 
String)]("inputB")) + + inA.group + .join(inB.group) + .forceToReducers + .mapGroup { (key, valuesIter) => + stat.inc() + + val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueID) + if (flowProcess == null) { + throw new NullPointerException("No active FlowProcess was available.") + } + + valuesIter.map { case (a, b) => s"$a:$b" } + } + .toTypedPipe + .write(TypedTsv[(String, String)]("output")) +} + +case class BypassValidationSource(path: String) extends FixedTypedText[Int](TypedText.TAB, path) { + override def validateTaps(mode: Mode): Unit = () + override def createTap(readOrWrite: AccessMode)(implicit mode: Mode): Tap[_, _, _] = + (mode, readOrWrite) match { + case (hdfsMode: Hdfs, Read) => new InvalidSourceTap(Seq(path)) + case _ => super.createTap(readOrWrite) + } +} + +class ReadPathJob(args: Args) extends Job(args) { + TypedPipe + .from(new BypassValidationSource(args.required("input"))) + .write(NullSink) +} + +object PlatformTest { + def setAutoForceRight(mode: Mode, autoForce: Boolean): Unit = + mode match { + case h: HadoopMode => + val config = h.jobConf + config.setBoolean(Config.HashJoinAutoForceRight, autoForce) + case _ => () + } +} + +class TestTypedEmptySource + extends FileSource + with TextSourceScheme + with Mappable[(Long, String)] + with SuccessFileSource { + override def hdfsPaths: Iterable[String] = Iterable.empty + override def localPaths: Iterable[String] = Iterable.empty + override def converter[U >: (Long, String)] = + TupleConverter.asSuperConverter[(Long, String), U](implicitly[TupleConverter[(Long, String)]]) +} + +// Tests the scenario where you have no data present in the directory pointed to by a source typically +// due to the directory being empty (but for a _SUCCESS file) +// We test out that this shouldn't result in a Cascading planner error during {@link Job.buildFlow} +class EmptyDataJob(args: Args) extends Job(args) { + TypedPipe + .from(new TestTypedEmptySource) + .map { case (offset, line) => line } + 
.write(TypedTsv[String]("output")) +} + +// Keeping all of the specifications in the same tests puts the result output all together at the end. +// This is useful given that the Hadoop MiniMRCluster and MiniDFSCluster spew a ton of logging. +class PlatformTest extends WordSpec with Matchers with HadoopSharedPlatformTest { + + "An InAndOutTest" should { + val inAndOut = Seq("a", "b", "c") + + "reading then writing shouldn't change the data" in { + HadoopPlatformJobTest(new InAndOutJob(_), cluster) + .source("input", inAndOut) + .sink[String]("output")(_.toSet shouldBe (inAndOut.toSet)) + .run() + } + } + + "A TinyJoinAndMergeJob" should { + import TinyJoinAndMergeJob._ + + "merge and joinWithTiny shouldn't duplicate data" in { + HadoopPlatformJobTest(new TinyJoinAndMergeJob(_), cluster) + .source(peopleInput, peopleData) + .source(messageInput, messageData) + .sink(output)(_.toSet shouldBe (outputData.toSet)) + .run() + } + } + + "A TsvNoCacheJob" should { + import TsvNoCacheJob._ + + "Writing to a tsv in a flow shouldn't effect the output" in { + HadoopPlatformJobTest(new TsvNoCacheJob(_), cluster) + .source(dataInput, data) + .sink(typedThrowAwayOutput)(_.toSet should have size 4) + .sink(typedRealOutput) { + _.map { f: Float => (f * 10).toInt }.toList shouldBe (outputData.map { f: Float => + (f * 10).toInt + }.toList) + } + .run() + } + } + + "A multiple group by job" should { + import MultipleGroupByJobData._ + + "do some ops and not stamp on each other ordered serializations" in { + HadoopPlatformJobTest(new MultipleGroupByJob(_), cluster) + .source[String]("input", data) + .sink[String]("output")(_.toSet shouldBe data.map(_.toString).toSet) + .run() + } + + } + + "A TypedPipeForceToDiskWithDescriptionPipe" should { + "have a custom step name from withDescription" in { + HadoopPlatformJobTest(new TypedPipeForceToDiskWithDescriptionJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + val firstStep = 
steps.filter(_.getName.startsWith("(1/2")) + val secondStep = steps.filter(_.getName.startsWith("(2/2")) + val lab1 = firstStep.map(_.getConfig.get(Config.StepDescriptions)) + lab1 should have size 1 + lab1(0) should include("write words to disk") + val lab2 = secondStep.map(_.getConfig.get(Config.StepDescriptions)) + lab2 should have size 1 + lab2(0) should include("output frequency by length") + } + .run() + } + } + + "A limit" should { + "not fan out into consumers" in { + // This covers a case where limit was being swept into a typed pipe factory + // so each consumer was re-running the limit independently + // which makes it usage unstable too. + HadoopPlatformJobTest(new GroupedLimitJobWithSteps(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 4 + } + .run() + } + } + + // also tests HashJoin behavior to verify that we don't introduce a forceToDisk as the RHS pipe is source Pipe + "A TypedPipeJoinWithDescriptionPipe" should { + "have a custom step name from withDescription and no extra forceToDisk steps on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeJoinWithDescriptionJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 1 + val firstStep = steps.headOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + firstStep should include("leftJoin") + firstStep should include("hashJoin") + steps.map(_.getConfig.get(Config.StepDescriptions)).foreach(s => info(s)) + } + .run() + } + } + + // expect two jobs - one for the map prior to the Checkpoint and one for the hashJoin + "A TypedPipeHashJoinWithForceToDiskJob" should { + "have a custom step name from withDescription and only one user provided forceToDisk on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithForceToDiskJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 2 + val 
secondStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + secondStep should include("hashJoin") + } + .run() + } + } + + // expect 3 jobs - one extra compared to previous as there's a new forceToDisk added + "A TypedPipeHashJoinWithForceToDiskFilterJob" should { + "have a custom step name from withDescription and an extra forceToDisk due to a filter operation on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithForceToDiskFilterJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 3 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + + // expect two jobs - one for the map prior to the Checkpoint and one for the rest + "A TypedPipeHashJoinWithForceToDiskWithComplete" should { + "have a custom step name from withDescription and no extra forceToDisk due to with complete operation on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithForceToDiskWithComplete(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 2 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + + // expect two jobs - one for the map prior to the Checkpoint and one for the rest + "A TypedPipeHashJoinWithForceToDiskMapJob" should { + "have a custom step name from withDescription and no extra forceToDisk due to map (autoForce = false) on forceToDisk operation on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithForceToDiskMapJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 2 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + 
+ // expect one extra job from the above - we end up performing a forceToDisk after the map + "A TypedPipeHashJoinWithForceToDiskMapWithAutoForceJob" should { + "have a custom step name from withDescription and an extra forceToDisk due to map (autoForce = true) on forceToDisk operation on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithForceToDiskMapWithAutoForceJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 3 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + + "A TypedPipeHashJoinWithGroupByJob" should { + "have a custom step name from withDescription and no extra forceToDisk after groupBy on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithGroupByJob(_), cluster) + .source(TypedTsv[(String, Int)]("input1"), Seq(("first", 45))) + .source( + TypedTsv[(String, Int)]("input2"), + Seq(("first", 1), ("first", 2), ("first", 3), ("second", 1), ("second", 2)) + ) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 2 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + + "A TypedPipeHashJoinWithCoGroupJob" should { + "have a custom step name from withDescription and no extra forceToDisk after coGroup + map on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithCoGroupJob(_), cluster) + .source(TypedTsv[(Int, Int)]("input0"), List((0, 1), (1, 1), (2, 1), (3, 2))) + .source(TypedTsv[(Int, Int)]("input1"), List((0, 1), (2, 5), (3, 2))) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 2 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + + "A 
TypedPipeHashJoinWithEveryJob" should { + "have a custom step name from withDescription and no extra forceToDisk after an Every on hashJoin's rhs" in { + HadoopPlatformJobTest(new TypedPipeHashJoinWithEveryJob(_), cluster) + .source(TypedTsv[(Int, String)]("input1"), Seq((1, "foo"))) + .source(TypedTsv[(Int, Int)]("input2"), Seq((1, 30), (1, 10), (1, 20), (2, 20))) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + steps should have size 2 + val lastStep = steps.lastOption.map(_.getConfig.get(Config.StepDescriptions)).getOrElse("") + lastStep should include("hashJoin") + } + .run() + } + } + + "A TypedPipeWithDescriptionPipe" should { + "have a custom step name from withDescription" in { + HadoopPlatformJobTest(new TypedPipeWithDescriptionJob(_), cluster) + .inspectCompletedFlow { flow => + val steps = flow.getFlowSteps.asScala + val descs = List( + "map stage - assign words to 1", + "reduce stage - sum", + "write", + // should see the .group and the .write show up as line numbers + "com.twitter.scalding.platform.TypedPipeWithDescriptionJob.(TestJobsWithDescriptions.scala:31)", + "com.twitter.scalding.platform.TypedPipeWithDescriptionJob.(TestJobsWithDescriptions.scala:35)" + ) + + val foundDescs = steps.map(_.getConfig.get(Config.StepDescriptions)) + descs.foreach { d => + assert(foundDescs.size == 1) + assert(foundDescs(0).contains(d)) + } + // steps.map(_.getConfig.get(Config.StepDescriptions)).foreach(s => info(s)) + } + .run() + } + } + + "A IterableSource" should { + import IterableSourceDistinctJob._ + + "distinct properly from normal data" in { + HadoopPlatformJobTest(new NormalDistinctJob(_), cluster) + .source[String]("input", data ++ data ++ data) + .sink[String]("output")(_.toList shouldBe data) + .run() + } + + "distinctBy(identity) properly from a list in memory" in { + HadoopPlatformJobTest(new IterableSourceDistinctIdentityJob(_), cluster) + .sink[String]("output")(_.toList shouldBe data) + .run() + } + + "distinct properly 
from a list" in { + HadoopPlatformJobTest(new IterableSourceDistinctJob(_), cluster) + .sink[String]("output")(_.toList shouldBe data) + .run() + } + } + + // If we support empty sources again in the future, update this test + "An EmptyData source" should { + "read from empty source and write to output without errors" in { + val e = intercept[FlowException] { + HadoopPlatformJobTest(new EmptyDataJob(_), cluster) + .run() + } + assert(e.getCause.getClass === classOf[InvalidSourceException]) + } + } + + import OrderedSerializationTest._ + "An Ordered Serialization" should { + "A test job with a fork and join, had previously not had boxed serializations on all branches" in { + val fn = (arg: Args) => new ComplexJob(data, arg) + HadoopPlatformJobTest(fn, cluster) + .arg("output1", "output1") + .arg("output2", "output2") + // Here we are just testing that we hit no exceptions in the course of this run + // the previous issue would have caused OOM or other exceptions. If we get to the end + // then we are good. + .sink[String](TypedTsv[String]("output2"))(x => ()) + .sink[String](TypedTsv[String]("output1"))(x => ()) + .run() + } + + "A test job with that joins then groupAll's should have its boxes setup correctly." in { + val fn = (arg: Args) => new ComplexJob2(data, arg) + HadoopPlatformJobTest(fn, cluster) + .arg("output1", "output1") + .arg("output2", "output2") + // Here we are just testing that we hit no exceptions in the course of this run + // the previous issue would have caused OOM or other exceptions. If we get to the end + // then we are good. 
+ .sink[String](TypedTsv[String]("output2"))(x => ()) + .sink[String](TypedTsv[String]("output1"))(x => ()) + .run() + } + } + + "Methods called from a Joiner" should { + "have access to a FlowProcess from a join in the Fields-based API" in { + HadoopPlatformJobTest(new CheckForFlowProcessInFieldsJob(_), cluster) + .source(TypedTsv[(String, String)]("inputA"), Seq(("1", "alpha"), ("2", "beta"))) + .source(TypedTsv[(String, String)]("inputB"), Seq(("1", "first"), ("2", "second"))) + .sink(TypedTsv[(String, String)]("output")) { _ => + // The job will fail with an exception if the FlowProcess is unavailable. + } + .inspectCompletedFlow { flow => + flow.getFlowStats.getCounterValue(Stats.ScaldingGroup, "joins") shouldBe 2 + } + .run() + } + + "have access to a FlowProcess from a join in the Typed API" in { + HadoopPlatformJobTest(new CheckForFlowProcessInTypedJob(_), cluster) + .source(TypedTsv[(String, String)]("inputA"), Seq(("1", "alpha"), ("2", "beta"))) + .source(TypedTsv[(String, String)]("inputB"), Seq(("1", "first"), ("2", "second"))) + .sink[(String, String)](TypedTsv[(String, String)]("output")) { _ => + // The job will fail with an exception if the FlowProcess is unavailable. 
+ } + .inspectCompletedFlow { flow => + flow.getFlowStats.getCounterValue(Stats.ScaldingGroup, "joins") shouldBe 2 + } + .run() + } + } + + "An InvalidSourceTap that gets past validation" should { + "throw an InvalidSourceException" in { + val result: FlowException = intercept[FlowException] { + HadoopPlatformJobTest(new ReadPathJob(_), cluster) + .arg("input", "/sploop/boop/doopity/doo/") + .run() + } + + assert(Option(result.getCause).exists(_.isInstanceOf[InvalidSourceException])) + } + } +} diff --git a/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala new file mode 100644 index 0000000000..7a7198e5fb --- /dev/null +++ b/scalding-hadoop-test/src/test/scala/com/twitter/scalding/platform/TestJobsWithDescriptions.scala @@ -0,0 +1,36 @@ +package com.twitter.scalding.platform + +import com.twitter.scalding._ + +/* + * These jobs are used in PlatformTests that test correct line numbers in descriptions. + * Placing them in a separate file means we don't have to update the tests that care about + * line numbers when PlatformTest.scala changes for unrelated reasons. 
+ */ + +class TypedPipeJoinWithDescriptionJob(args: Args) extends Job(args) { + PlatformTest.setAutoForceRight(mode, true) + + val x = TypedPipe.from[(Int, Int)](List((1, 1))) + val y = TypedPipe.from[(Int, String)](List((1, "first"))) + val z = TypedPipe.from[(Int, Boolean)](List((2, true))).group + + x.hashJoin(y) // this triggers an implicit that somehow pushes the line number to the next one + .withDescription("hashJoin") + .leftJoin(z) + .withDescription("leftJoin") + .values + .write(TypedTsv[((Int, String), Option[Boolean])]("output")) +} + +class TypedPipeWithDescriptionJob(args: Args) extends Job(args) { + TypedPipe + .from[String](List("word1", "word1", "word2")) + .withDescription("map stage - assign words to 1") + .map(w => (w, 1L)) + .group + .withDescription("reduce stage - sum") + .sum + .withDescription("write") + .write(TypedTsv[(String, Long)]("output")) +} diff --git a/scalding-hraven/README.md b/scalding-hraven/README.md new file mode 100644 index 0000000000..4849889f3a --- /dev/null +++ b/scalding-hraven/README.md @@ -0,0 +1,7 @@ +# hRaven Extensions +This module includes additions to Scalding that make use of [hRaven](https://github.com/twitter/hraven) for querying job history. + +## Reducer Estimation +Reducer estimators can include the `HRavenHistory` trait to get additional functionality for querying hRaven for past jobs. + +For example, `RatioBasedReducerEstimator`, also in this module, uses hRaven job history to better estimate reducers based on the ratio of mapper-reducer input data. 
diff --git a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala new file mode 100644 index 0000000000..534df99b02 --- /dev/null +++ b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryService.scala @@ -0,0 +1,273 @@ +package com.twitter.scalding.hraven.estimation + +import cascading.flow.FlowStep +import com.twitter.hraven.JobDescFactory.{JOBTRACKER_KEY, RESOURCE_MANAGER_KEY} +import com.twitter.hraven.rest.client.HRavenRestClient +import com.twitter.hraven.{Constants, CounterMap, Flow, HadoopVersion, JobDetails, TaskDetails} +import com.twitter.scalding.estimation.{FlowStepHistory, FlowStepKeys, FlowStrategyInfo, HistoryService, Task} +import java.io.IOException +import org.apache.hadoop.mapred.JobConf +import org.slf4j.LoggerFactory +import scala.collection.JavaConverters._ +import scala.util.{Failure, Success, Try} + +object HRavenClient { + import HRavenHistoryService.jobConfToRichConfig + + val apiHostnameKey = "hraven.api.hostname" + val clientConnectTimeoutKey = "hraven.client.connect.timeout" + val clientReadTimeoutKey = "hraven.client.read.timeout" + + private final val clientConnectTimeoutDefault = 30000 + private final val clientReadTimeoutDefault = 30000 + + def apply(conf: JobConf): Try[HRavenRestClient] = + conf + .getFirstKey(apiHostnameKey) + .map( + new HRavenRestClient( + _, + conf.getInt(clientConnectTimeoutKey, clientConnectTimeoutDefault), + conf.getInt(clientReadTimeoutKey, clientReadTimeoutDefault) + ) + ) +} + +object HRavenHistoryService { + private val LOG = LoggerFactory.getLogger(this.getClass) + + case class MissingFieldsException(fields: Seq[String]) extends Exception + + /** + * Add some helper methods to JobConf + */ + case class RichConfig(conf: JobConf) { + + val OldMaxFetch = "hraven.reducer.estimator.max.flow.history" + val MaxFetch = 
"hraven.estimator.max.flow.history" + val MaxFetchDefault = 8 + + def maxFetch: Int = { + val max = conf.getInt(MaxFetch, -1) + if (max == -1) { + conf.getInt(OldMaxFetch, MaxFetchDefault) + } else { + max + } + } + + /** + * Try fields in order until one returns a value. Logs a warning if nothing was found. + */ + def getFirstKey(fields: String*): Try[String] = + fields + .collectFirst { + case f if conf.get(f) != null => Success(conf.get(f)) + } + .getOrElse { + LOG.warn("Missing required config param: " + fields.mkString(" or ")) + Failure(MissingFieldsException(fields)) + } + } + + implicit def jobConfToRichConfig(conf: JobConf): RichConfig = RichConfig(conf) +} + +/** + * History Service that gives ability to query hRaven for info about past runs. + */ +trait HRavenHistoryService extends HistoryService { + import HRavenHistoryService.jobConfToRichConfig + + private val LOG = LoggerFactory.getLogger(this.getClass) + private val RequiredJobConfigs = Seq("cascading.flow.step.num") + private val MapOutputBytesKey = "MAP_OUTPUT_BYTES" + + protected val detailFields: List[String] + protected val counterFields: List[String] + + protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] + protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] + + def hRavenClient(conf: JobConf): Try[HRavenRestClient] = HRavenClient(conf) + + /** + * Fetch flows until it finds one that was successful (using "HdfsBytesRead > 0" as a marker for successful + * jobs since it seems that this is only set on completion of jobs) + * + * TODO: query hRaven for successful jobs (first need to add ability to filter results in hRaven REST API) + */ + private def fetchSuccessfulFlows( + client: HRavenRestClient, + cluster: String, + user: String, + batch: String, + signature: String, + stepNum: Int, + max: Int, + nFetch: Int + ): Try[Seq[Flow]] = + Try( + client + .fetchFlowsWithConfig(cluster, user, batch, signature, nFetch, RequiredJobConfigs: _*) + ) + .flatMap 
{ flows => + Try { + // Ugly mutable code to add task info to flows + flows.asScala.foreach { flow => + flow.getJobs.asScala + .filter { step => + step.getConfiguration.get("cascading.flow.step.num").toInt == stepNum + } + .foreach { job => + // client.fetchTaskDetails might throw IOException + val tasks = if (counterFields.isEmpty) { + client.fetchTaskDetails(flow.getCluster, job.getJobId, detailFields.asJava) + } else { + client.fetchTaskDetails( + flow.getCluster, + job.getJobId, + detailFields.asJava, + counterFields.asJava + ) + } + job.addTasks(tasks) + } + } + + val successfulFlows = flows.asScala.filter(_.getHdfsBytesRead > 0).take(max) + if (successfulFlows.isEmpty) { + LOG.warn("Unable to find any successful flows in the last " + nFetch + " jobs.") + } + successfulFlows + } + } + .recoverWith { case e: IOException => + LOG.error("Error making API request to hRaven. HRavenHistoryService will be disabled.") + Failure(e) + } + + /** + * Fetch info from hRaven for the last time the given JobStep ran. Finds the last successful complete flow + * and selects the corresponding step from it. + * + * @param step + * FlowStep to get info for + * @return + * Details about the previous successful run. 
+ */ + def fetchPastJobDetails(step: FlowStep[JobConf], max: Int): Try[Seq[JobDetails]] = { + val conf = step.getConfig + val stepNum = step.getStepNum + + def findMatchingJobStep(pastFlow: Flow) = + pastFlow.getJobs.asScala + .find { step => + try { + step.getConfiguration.get("cascading.flow.step.num").toInt == stepNum + } catch { + case _: NumberFormatException => false + } + } + .orElse { + LOG.warn("No matching job step in the retrieved hRaven flow.") + None + } + + def lookupClusterName(client: HRavenRestClient): Try[String] = { + // regex for case matching URL to get hostname out + val hostRegex = """(.*):\d+""".r + + // first try resource manager (for Hadoop v2), then fallback to job tracker + conf.getFirstKey(RESOURCE_MANAGER_KEY, JOBTRACKER_KEY).flatMap { + // extract hostname from hostname:port + case hostRegex(host) => + // convert hostname -> cluster name (e.g. dw2@smf1) + Try(client.getCluster(host)) + } + } + + val flowsTry = for { + // connect to hRaven REST API + client <- hRavenClient(conf) + + // lookup cluster name used by hRaven + cluster <- lookupClusterName(client) + + // get identifying info for this job + user <- conf.getFirstKey("hraven.history.user.name", "user.name") + batch <- conf.getFirstKey("batch.desc") + signature <- conf.getFirstKey("scalding.flow.class.signature") + + // query hRaven for matching flows + flows <- fetchSuccessfulFlows(client, cluster, user, batch, signature, stepNum, max, conf.maxFetch) + + } yield flows + + // Find the FlowStep in the hRaven flow that corresponds to the current step + // *Note*: when hRaven says "Job" it means "FlowStep" + flowsTry.map(flows => flows.flatMap(findMatchingJobStep)) + } + + override def fetchHistory(info: FlowStrategyInfo, maxHistory: Int): Try[Seq[FlowStepHistory]] = + fetchPastJobDetails(info.step, maxHistory).map { history => + for { + step <- history // linter:disable:MergeMaps + keys = FlowStepKeys( + step.getJobName, + step.getUser, + step.getPriority, + step.getStatus, + 
step.getVersion, + "" + ) + // update HRavenHistoryService.TaskDetailFields when consuming additional task fields from hraven below + tasks = step.getTasks.asScala.flatMap { taskDetails => + details(taskDetails).zip(counters(taskDetails.getCounters)).map { case (details, counters) => + Task(details, counters) + } + } + } yield toFlowStepHistory(keys, step, tasks) + } + + private def toFlowStepHistory(keys: FlowStepKeys, step: JobDetails, tasks: Seq[Task]) = + FlowStepHistory( + keys = keys, + submitTimeMillis = step.getSubmitTime, + launchTimeMillis = step.getLaunchTime, + finishTimeMillis = step.getFinishTime, + totalMaps = step.getTotalMaps, + totalReduces = step.getTotalReduces, + finishedMaps = step.getFinishedMaps, + finishedReduces = step.getFinishedReduces, + failedMaps = step.getFailedMaps, + failedReduces = step.getFailedReduces, + mapFileBytesRead = step.getMapFileBytesRead, + mapFileBytesWritten = step.getMapFileBytesWritten, + mapOutputBytes = mapOutputBytes(step), + reduceFileBytesRead = step.getReduceFileBytesRead, + hdfsBytesRead = step.getHdfsBytesRead, + hdfsBytesWritten = step.getHdfsBytesWritten, + mapperTimeMillis = step.getMapSlotMillis, + reducerTimeMillis = step.getReduceSlotMillis, + reduceShuffleBytes = step.getReduceShuffleBytes, + cost = 0, + tasks = tasks + ) + + private def mapOutputBytes(step: JobDetails): Long = + if (step.getHadoopVersion == HadoopVersion.TWO) { + getCounterValueAsLong(step.getMapCounters, Constants.TASK_COUNTER_HADOOP2, MapOutputBytesKey) + } else { + getCounterValueAsLong(step.getMapCounters, Constants.TASK_COUNTER, MapOutputBytesKey) + } + + private def getCounterValueAsLong( + counters: CounterMap, + counterGroupName: String, + counterName: String + ): Long = { + val counter = counters.getCounter(counterGroupName, counterName) + if (counter != null) counter.getValue else 0L + } +} diff --git a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala 
b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala new file mode 100644 index 0000000000..781a7df08f --- /dev/null +++ b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/estimation/memory/HRavenMemoryService.scala @@ -0,0 +1,50 @@ +package com.twitter.scalding.hraven.estimation.memory + +import com.twitter.hraven.{CounterMap, TaskDetails} +import com.twitter.scalding.estimation.Task +import com.twitter.scalding.estimation.memory.SmoothedHistoryMemoryEstimator +import com.twitter.scalding.hraven.estimation.HRavenHistoryService + +trait HRavenMemoryHistoryService extends HRavenHistoryService { + import SmoothedHistoryMemoryEstimator._ + + private val TaskCounterGroup = "org.apache.hadoop.mapreduce.TaskCounter" + + override protected val detailFields: List[String] = List(Task.TaskType) + override protected val counterFields: List[String] = List( + "org.apache.hadoop.mapreduce.TaskCounter.COMMITTED_HEAP_BYTES", + "org.apache.hadoop.mapreduce.TaskCounter.PHYSICAL_MEMORY_BYTES", + "org.apache.hadoop.mapreduce.TaskCounter.GC_TIME_MILLIS", + "org.apache.hadoop.mapreduce.TaskCounter.CPU_MILLISECONDS" + ) + + override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = + if (taskDetails.getType.nonEmpty) { + Some(Map(Task.TaskType -> taskDetails.getType)) + } else { + None + } + + override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = + // sometimes get groups with only partial data + if (taskCounters.getGroups.isEmpty || taskCounters.getGroup(TaskCounterGroup).size() < 4) { + None + } else { + val group = taskCounters.getGroup(TaskCounterGroup) + + Some( + Map( + CommittedHeapBytes -> group.get(CommittedHeapBytes).getValue, + CpuMs -> group.get(CpuMs).getValue, + PhysicalMemoryBytes -> group.get(PhysicalMemoryBytes).getValue, + GCTimeMs -> group.get(GCTimeMs).getValue + ) + ) + } +} + +object HRavenMemoryHistoryService extends HRavenMemoryHistoryService + 
+class HRavenSmoothedMemoryEstimator extends SmoothedHistoryMemoryEstimator { + override def historyService = HRavenMemoryHistoryService +} diff --git a/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala new file mode 100644 index 0000000000..9e72ca57e8 --- /dev/null +++ b/scalding-hraven/src/main/scala/com/twitter/scalding/hraven/reducer_estimation/HRavenBasedEstimator.scala @@ -0,0 +1,37 @@ +package com.twitter.scalding.hraven.reducer_estimation + +import com.twitter.hraven.{CounterMap, TaskDetails} +import com.twitter.scalding.estimation.Task +import com.twitter.scalding.hraven.estimation.HRavenHistoryService +import com.twitter.scalding.reducer_estimation.{RatioBasedEstimator, RuntimeReducerEstimator} + +trait HRavenReducerHistoryService extends HRavenHistoryService { + override protected val counterFields: List[String] = List() + override protected val detailFields: List[String] = List(Task.TaskType, "status", "startTime", "finishTime") + + override protected def counters(taskCounters: CounterMap): Option[Map[String, Long]] = Some(Map.empty) + + override protected def details(taskDetails: TaskDetails): Option[Map[String, Any]] = + if (taskDetails.getType.nonEmpty) { + Some( + Map( + Task.TaskType -> taskDetails.getType, + "status" -> taskDetails.getStatus, + "startTime" -> taskDetails.getStartTime, + "finishTime" -> taskDetails.getFinishTime + ) + ) + } else { + None + } +} + +object HRavenReducerHistoryService extends HRavenReducerHistoryService + +class HRavenRatioBasedEstimator extends RatioBasedEstimator { + override val historyService = HRavenReducerHistoryService +} + +class HRavenRuntimeBasedEstimator extends RuntimeReducerEstimator { + override val historyService = HRavenReducerHistoryService +} diff --git a/scalding-hraven/src/test/resources/flowResponse.json 
b/scalding-hraven/src/test/resources/flowResponse.json new file mode 100644 index 0000000000..1a90f3740a --- /dev/null +++ b/scalding-hraven/src/test/resources/flowResponse.json @@ -0,0 +1,3303 @@ +[ { + "flowKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492636627835, + "encodedRunId" : 9223370544218147972 + }, + "flowName" : "somegoodjob", + "userName" : "testuser", + "jobCount" : 3, + "totalMaps" : 368, + "totalReduces" : 136, + "mapFileBytesRead" : 111415465732, + "mapFileBytesWritten" : 222939745150, + "reduceFileBytesRead" : 159412950342, + "hdfsBytesRead" : 261872610033, + "hdfsBytesWritten" : 101710652089, + "mapSlotMillis" : 26673895, + "reduceSlotMillis" : 16718566, + "megabyteMillis" : 140725068288, + "cost" : 0.15269647166666667, + "reduceShuffleBytes" : 111314400214, + "duration" : 1278809, + "wallClockTime" : 1292275, + "cluster" : "test@cluster", + "appId" : "somegoodjob", + "runId" : 1492636627835, + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "submitTime" : 1492636719034, + "launchTime" : 1492636732500, + "finishTime" : 1492638011309, + "queue" : "testuser", + "counters" : { + "Scalding Custom" : { + "test_user" : 749 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 47654313, + "Tuples_Read" : 183443467 + }, + "evictions" : { + "MapsideReduce" : 12588706 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 66501607, + "SPILLED_RECORDS" : 212145628, + "VIRTUAL_MEMORY_BYTES" : 2050469523456, + "LOCALIZED_NANOS" : 523231295546, + "SPLIT_RAW_BYTES" : 772454, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 379508682777, + "REDUCE_SHUFFLE_BYTES" : 111314400214, + "PHYSICAL_MEMORY_BYTES" : 446650638336, + "COMBINE_OUTPUT_RECORDS" : 0, + 
"LOCALIZED_FILES_CACHED" : 508618, + "CPU_MILLISECONDS" : 32777850, + "LOCALIZED_FILES_MISSED" : 65239, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 111314400214, + "MERGED_MAP_OUTPUTS" : 19834, + "MAP_INPUT_RECORDS" : 183443467, + "LOCALIZED_BYTES_MISSED" : 76410918436, + "GC_TIME_MILLIS" : 3666842, + "REDUCE_INPUT_GROUPS" : 49999258, + "SHUFFLED_MAPS" : 19834, + "REDUCE_OUTPUT_RECORDS" : 47654313, + "LOCALIZED_BYTES_CACHED" : 464339205620, + "MAP_OUTPUT_RECORDS" : 66501607, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 457269559296 + }, + "hits" : { + "MapsideReduce" : 1545 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 382535497232, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 1890, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 261872610033, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 270828416074, + "HDFS_WRITE_OPS" : 292, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 101710652089 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 114155920, + "Tuples_Read" : 249945074, + "Read_Duration" : 15736600, + "Write_Duration" : 5645584, + "Process_Begin_Time" : 752289113507899, + "Process_End_Time" : 752289136140472 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 151396662, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16467158 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 383, + "VCORES_MILLIS_REDUCES" : 14628747, + "TOTAL_LAUNCHED_REDUCES" : 137, + "NUM_KILLED_MAPS" : 15, + "OTHER_LOCAL_MAPS" : 193, + "DATA_LOCAL_MAPS" : 14, + "NUM_KILLED_REDUCES" : 1, + "MB_MILLIS_MAPS" : 83649341440, + "SLOTS_MILLIS_REDUCES" : 16718566, + "VCORES_MILLIS_MAPS" : 23339660, + "MB_MILLIS_REDUCES" : 52429429248, + "SLOTS_MILLIS_MAPS" : 26673895, + "RACK_LOCAL_MAPS" : 178, + 
"MILLIS_REDUCES" : 14628747, + "MILLIS_MAPS" : 23339660 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + } + }, + "mapCounters" : { + "Scalding Custom" : { + "test_user" : 749 + }, + "hits" : { + "MapsideReduce" : 1545 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 222939745150, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 1472, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 261872610033, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 111415465732, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 183443467 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 66501607, + "Tuples_Read" : 183443467, + "Read_Duration" : 12255275, + "Write_Duration" : 1936172, + "Process_Begin_Time" : 549290456528594, + "Process_End_Time" : 549290471470429 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 151396662, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16467158 + }, + "evictions" : { + "MapsideReduce" : 12588706 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 50274, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 111314400214, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 114990966, + "VIRTUAL_MEMORY_BYTES" : 1488733134848, + "MAP_INPUT_RECORDS" : 183443467, + "LOCALIZED_NANOS" : 404720635591, + "SPLIT_RAW_BYTES" : 772454, + "LOCALIZED_BYTES_MISSED" : 58694615852, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 379508682777, + "PHYSICAL_MEMORY_BYTES" : 384967135232, + "GC_TIME_MILLIS" : 3588158, + "LOCALIZED_FILES_CACHED" 
: 366240, + "LOCALIZED_BYTES_CACHED" : 333801489356, + "MAP_OUTPUT_RECORDS" : 66501607, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 21975550, + "COMMITTED_HEAP_BYTES" : 388319395840 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 159595752082, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 418, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 159412950342, + "HDFS_WRITE_OPS" : 292, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 101710652089 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 47654313 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 47654313, + "Tuples_Read" : 66501607, + "Read_Duration" : 3481325, + "Write_Duration" : 3709412, + "Process_Begin_Time" : 202998656979305, + "Process_End_Time" : 202998664670043 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 14954, + "MERGED_MAP_OUTPUTS" : 19834, + "REDUCE_INPUT_RECORDS" : 66501607, + "SPILLED_RECORDS" : 97154662, + "VIRTUAL_MEMORY_BYTES" : 561736388608, + "LOCALIZED_NANOS" : 117077260352, + "LOCALIZED_BYTES_MISSED" : 17711987894, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 111314400214, + "PHYSICAL_MEMORY_BYTES" : 61683503104, + "GC_TIME_MILLIS" : 78684, + "REDUCE_INPUT_GROUPS" : 49999258, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 19834, + "LOCALIZED_FILES_CACHED" : 138988, + "REDUCE_OUTPUT_RECORDS" : 47654313, + "LOCALIZED_BYTES_CACHED" : 127341550620, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 10802300, + 
"COMMITTED_HEAP_BYTES" : 68950163456 + } + }, + "jobs" : [ { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492636627835, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6609466, + "jobIdString" : "job_1470171371859_6609466" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6609466, + "jobIdString" : "job_1470171371859_6609466" + }, + "encodedRunId" : 9223370544218147972 + }, + "jobId" : "job_1470171371859_6609466", + "jobName" : "[757B3B58BB294F3F90AF5F83B8580C9E/A5F2F3029B964D4687A37F67670F8243] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(2/3)", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492636719034, + "launchTime" : 1492636732500, + "finishTime" : 1492637099729, + "totalMaps" : 121, + "totalReduces" : 64, + "finishedMaps" : 121, + "finishedReduces" : 64, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 37795618712, + "mapFileBytesWritten" : 75485125090, + "reduceFileBytesRead" : 37635771874, + "hdfsBytesRead" : 94267826741, + "hdfsBytesWritten" : 37796931657, + "mapSlotMillis" : 13579749, + "reduceSlotMillis" : 7677084, + "reduceShuffleBytes" : 37660545208, + "megabyteMillis" : 68165602304, + "cost" : 0.07396441222222222, + "counters" : { + "Scalding Custom" : { + "test_user" : 749 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16439297, + "Tuples_Read" : 75698331 + }, + "evictions" : { + "MapsideReduce" : 12588706 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 16467158, + "SPILLED_RECORDS" : 
49363208, + "VIRTUAL_MEMORY_BYTES" : 753780228096, + "LOCALIZED_NANOS" : 211444954699, + "SPLIT_RAW_BYTES" : 321497, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 129012446460, + "REDUCE_SHUFFLE_BYTES" : 37660545208, + "PHYSICAL_MEMORY_BYTES" : 222611759104, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 185293, + "CPU_MILLISECONDS" : 14149680, + "LOCALIZED_FILES_MISSED" : 25261, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 37660545208, + "MERGED_MAP_OUTPUTS" : 7744, + "MAP_INPUT_RECORDS" : 75698331, + "LOCALIZED_BYTES_MISSED" : 29807994052, + "GC_TIME_MILLIS" : 3528882, + "REDUCE_INPUT_GROUPS" : 16439297, + "SHUFFLED_MAPS" : 7744, + "REDUCE_OUTPUT_RECORDS" : 16439297, + "LOCALIZED_BYTES_CACHED" : 168574833664, + "MAP_OUTPUT_RECORDS" : 16467158, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 219938238464 + }, + "hits" : { + "MapsideReduce" : 1545 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 113206590143, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 676, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 94267826741, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 75431390586, + "HDFS_WRITE_OPS" : 128, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 37796931657 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 32906455, + "Tuples_Read" : 92165489, + "Read_Duration" : 7976477, + "Write_Duration" : 1878377, + "Process_Begin_Time" : 276137816396411, + "Process_End_Time" : 276137827048673 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 75698331, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16467158 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 132, + "VCORES_MILLIS_REDUCES" : 6717449, + "TOTAL_LAUNCHED_REDUCES" : 64, + "NUM_KILLED_MAPS" : 11, + 
"OTHER_LOCAL_MAPS" : 75, + "DATA_LOCAL_MAPS" : 1, + "MB_MILLIS_MAPS" : 42586095104, + "SLOTS_MILLIS_REDUCES" : 7677084, + "VCORES_MILLIS_MAPS" : 11882281, + "MB_MILLIS_REDUCES" : 24075337216, + "SLOTS_MILLIS_MAPS" : 13579749, + "RACK_LOCAL_MAPS" : 58, + "MILLIS_REDUCES" : 6717449, + "MILLIS_MAPS" : 11882281 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + } + }, + "mapCounters" : { + "Scalding Custom" : { + "test_user" : 749 + }, + "hits" : { + "MapsideReduce" : 1545 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 75485125090, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 484, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 94267826741, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 37795618712, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 75698331 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16467158, + "Tuples_Read" : 75698331, + "Read_Duration" : 6583101, + "Write_Duration" : 1230218, + "Process_Begin_Time" : 180609047914354, + "Process_End_Time" : 180609056298876 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 75698331, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16467158 + }, + "evictions" : { + "MapsideReduce" : 12588706 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 17197, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 37660545208, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 32909712, + "VIRTUAL_MEMORY_BYTES" : 489125904384, + "MAP_INPUT_RECORDS" : 75698331, + "LOCALIZED_NANOS" : 
150246438277, + "SPLIT_RAW_BYTES" : 321497, + "LOCALIZED_BYTES_MISSED" : 20416784175, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 129012446460, + "PHYSICAL_MEMORY_BYTES" : 193147375616, + "GC_TIME_MILLIS" : 3495443, + "LOCALIZED_FILES_CACHED" : 119775, + "LOCALIZED_BYTES_CACHED" : 108638492318, + "MAP_OUTPUT_RECORDS" : 16467158, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 10839880, + "COMMITTED_HEAP_BYTES" : 187491102720 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 37721465053, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 192, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 37635771874, + "HDFS_WRITE_OPS" : 128, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 37796931657 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16439297 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16439297, + "Tuples_Read" : 16467158, + "Read_Duration" : 1393376, + "Write_Duration" : 648159, + "Process_Begin_Time" : 95528768482057, + "Process_End_Time" : 95528770749797 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 8060, + "MERGED_MAP_OUTPUTS" : 7744, + "REDUCE_INPUT_RECORDS" : 16467158, + "SPILLED_RECORDS" : 16453496, + "VIRTUAL_MEMORY_BYTES" : 264654323712, + "LOCALIZED_NANOS" : 60713180069, + "LOCALIZED_BYTES_MISSED" : 9389689314, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 37660545208, + "PHYSICAL_MEMORY_BYTES" : 29464383488, + "GC_TIME_MILLIS" : 33439, + "REDUCE_INPUT_GROUPS" : 16439297, + 
"COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 7744, + "LOCALIZED_FILES_CACHED" : 64388, + "REDUCE_OUTPUT_RECORDS" : 16439297, + "LOCALIZED_BYTES_CACHED" : 58870952798, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 3309800, + "COMMITTED_HEAP_BYTES" : 32447135744 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "2" + }, + "submitDate" : 1492636719034, + "launchDate" : 1492636732500, + "finishDate" : 1492637099729, + "runTime" : 367229 + }, { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492636627835, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6609558, + "jobIdString" : "job_1470171371859_6609558" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6609558, + "jobIdString" : "job_1470171371859_6609558" + }, + "encodedRunId" : 9223370544218147972 + }, + "jobId" : "job_1470171371859_6609558", + "jobName" : "[757B3B58BB294F3F90AF5F83B8580C9E/CE20226849374E27A88A88828634B0A1] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(1/3)", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492637189747, + "launchTime" : 1492637201324, + "finishTime" : 1492637387288, + "totalMaps" : 185, + "totalReduces" : 62, + "finishedMaps" : 185, + "finishedReduces" : 62, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 38087140565, + "mapFileBytesWritten" : 76339257791, + "reduceFileBytesRead" : 63401214784, + "hdfsBytesRead" : 132064823998, + "hdfsBytesWritten" : 35539895434, + "mapSlotMillis" : 9895558, + "reduceSlotMillis" : 4403484, + "reduceShuffleBytes" : 38141261549, + "megabyteMillis" : 45603508736, + "cost" : 0.049482973888888886, + "counters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + 
"VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 139823447713, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 926, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 132064823998, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 101488355349, + "HDFS_WRITE_OPS" : 124, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 35539895434 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 15607508, + "Tuples_Read" : 92137628 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 50034449, + "Tuples_Read" : 126564569, + "Read_Duration" : 5795589, + "Write_Duration" : 930624, + "Process_Begin_Time" : 368681399803019, + "Process_End_Time" : 368681406897953 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 75698331, + "Records Skipped" : 0 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 188, + "VCORES_MILLIS_REDUCES" : 3853049, + "TOTAL_LAUNCHED_REDUCES" : 63, + "NUM_KILLED_MAPS" : 3, + "OTHER_LOCAL_MAPS" : 77, + "DATA_LOCAL_MAPS" : 11, + "NUM_KILLED_REDUCES" : 1, + "MB_MILLIS_MAPS" : 31032472576, + "SLOTS_MILLIS_REDUCES" : 4403484, + "VCORES_MILLIS_MAPS" : 8658614, + "MB_MILLIS_REDUCES" : 13809327616, + "SLOTS_MILLIS_MAPS" : 9895558, + "RACK_LOCAL_MAPS" : 100, + "MILLIS_REDUCES" : 3853049, + "MILLIS_MAPS" : 8658614 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 34426941, + "SPILLED_RECORDS" : 105895650, + "VIRTUAL_MEMORY_BYTES" : 1004495097856, + "LOCALIZED_NANOS" : 246352867221, + "SPLIT_RAW_BYTES" : 
387097, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 129571356432, + "REDUCE_SHUFFLE_BYTES" : 38141261549, + "PHYSICAL_MEMORY_BYTES" : 170776510464, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 248625, + "CPU_MILLISECONDS" : 11651100, + "LOCALIZED_FILES_MISSED" : 32113, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38141261549, + "MERGED_MAP_OUTPUTS" : 11470, + "MAP_INPUT_RECORDS" : 92137628, + "LOCALIZED_BYTES_MISSED" : 37651200580, + "GC_TIME_MILLIS" : 100047, + "REDUCE_INPUT_GROUPS" : 33559951, + "SHUFFLED_MAPS" : 11470, + "REDUCE_OUTPUT_RECORDS" : 15607508, + "LOCALIZED_BYTES_CACHED" : 226859897415, + "MAP_OUTPUT_RECORDS" : 34426941, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 181911969792 + } + }, + "mapCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 76339257791, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 740, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 132064823998, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 38087140565, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 92137628 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 34426941, + "Tuples_Read" : 92137628, + "Read_Duration" : 4571858, + "Write_Duration" : 430227, + "Process_Begin_Time" : 276137886793246, + "Process_End_Time" : 276137891951274 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 75698331, + "Records Skipped" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 25224, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38141261549, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 50866238, + 
"VIRTUAL_MEMORY_BYTES" : 748630978560, + "MAP_INPUT_RECORDS" : 92137628, + "LOCALIZED_NANOS" : 193724866170, + "SPLIT_RAW_BYTES" : 387097, + "LOCALIZED_BYTES_MISSED" : 29337513318, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 129571356432, + "PHYSICAL_MEMORY_BYTES" : 142990761984, + "GC_TIME_MILLIS" : 68431, + "LOCALIZED_FILES_CACHED" : 184196, + "LOCALIZED_BYTES_CACHED" : 167978930317, + "MAP_OUTPUT_RECORDS" : 34426941, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 8247540, + "COMMITTED_HEAP_BYTES" : 150478807040 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 63484189922, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 186, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 63401214784, + "HDFS_WRITE_OPS" : 124, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 35539895434 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 15607508 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 15607508, + "Tuples_Read" : 34426941, + "Read_Duration" : 1223731, + "Write_Duration" : 500397, + "Process_Begin_Time" : 92543513009773, + "Process_End_Time" : 92543514946679 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 6885, + "MERGED_MAP_OUTPUTS" : 11470, + "REDUCE_INPUT_RECORDS" : 34426941, + "SPILLED_RECORDS" : 55029412, + "VIRTUAL_MEMORY_BYTES" : 255864119296, + "LOCALIZED_NANOS" : 51994187544, + "LOCALIZED_BYTES_MISSED" : 8312094452, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 38141261549, + "PHYSICAL_MEMORY_BYTES" : 
27785748480, + "GC_TIME_MILLIS" : 31616, + "REDUCE_INPUT_GROUPS" : 33559951, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 11470, + "LOCALIZED_FILES_CACHED" : 63299, + "REDUCE_OUTPUT_RECORDS" : 15607508, + "LOCALIZED_BYTES_CACHED" : 57815578550, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 3403560, + "COMMITTED_HEAP_BYTES" : 31433162752 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "1" + }, + "submitDate" : 1492637189747, + "launchDate" : 1492637201324, + "finishDate" : 1492637387288, + "runTime" : 185964 + }, { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492636627835, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6609614, + "jobIdString" : "job_1470171371859_6609614" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6609614, + "jobIdString" : "job_1470171371859_6609614" + }, + "encodedRunId" : 9223370544218147972 + }, + "jobId" : "job_1470171371859_6609614", + "jobName" : "[757B3B58BB294F3F90AF5F83B8580C9E/8B29C94927F245C5A2D7CA117AFC2565] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(3/3) .../unhydrated/2017/04/19/19", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492637423760, + "launchTime" : 1492637430152, + "finishTime" : 1492638011309, + "totalMaps" : 62, + "totalReduces" : 10, + "finishedMaps" : 62, + "finishedReduces" : 10, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 35532706455, + "mapFileBytesWritten" : 71115362269, + "reduceFileBytesRead" : 58375963684, + "hdfsBytesRead" : 35539959294, + "hdfsBytesWritten" : 28373824998, + "mapSlotMillis" : 3198588, + "reduceSlotMillis" : 4637998, + "reduceShuffleBytes" : 35512593457, + "megabyteMillis" : 26955957248, + "cost" : 
0.029249085555555554, + "counters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 129505459376, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 288, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 35539959294, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 93908670139, + "HDFS_WRITE_OPS" : 40, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 28373824998 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 15607508, + "Tuples_Read" : 15607508 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 31215016, + "Tuples_Read" : 31215016, + "Read_Duration" : 1964534, + "Write_Duration" : 2836583, + "Process_Begin_Time" : 107469897308469, + "Process_End_Time" : 107469902193846 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 63, + "VCORES_MILLIS_REDUCES" : 4058249, + "TOTAL_LAUNCHED_REDUCES" : 10, + "NUM_KILLED_MAPS" : 1, + "OTHER_LOCAL_MAPS" : 41, + "DATA_LOCAL_MAPS" : 2, + "MB_MILLIS_MAPS" : 10030773760, + "SLOTS_MILLIS_REDUCES" : 4637998, + "VCORES_MILLIS_MAPS" : 2798765, + "MB_MILLIS_REDUCES" : 14544764416, + "SLOTS_MILLIS_MAPS" : 3198588, + "RACK_LOCAL_MAPS" : 20, + "MILLIS_REDUCES" : 4058249, + "MILLIS_MAPS" : 2798765 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 15607508, + "SPILLED_RECORDS" : 56886770, + "VIRTUAL_MEMORY_BYTES" : 292194197504, + "LOCALIZED_NANOS" : 65433473626, + "SPLIT_RAW_BYTES" : 63860, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 120924879885, + 
"REDUCE_SHUFFLE_BYTES" : 35512593457, + "PHYSICAL_MEMORY_BYTES" : 53262368768, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 74700, + "CPU_MILLISECONDS" : 6977070, + "LOCALIZED_FILES_MISSED" : 7865, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 35512593457, + "MERGED_MAP_OUTPUTS" : 620, + "MAP_INPUT_RECORDS" : 15607508, + "LOCALIZED_BYTES_MISSED" : 8951723804, + "GC_TIME_MILLIS" : 37913, + "REDUCE_INPUT_GROUPS" : 10, + "SHUFFLED_MAPS" : 620, + "REDUCE_OUTPUT_RECORDS" : 15607508, + "LOCALIZED_BYTES_CACHED" : 68904474541, + "MAP_OUTPUT_RECORDS" : 15607508, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 55419351040 + } + }, + "mapCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 71115362269, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 248, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 35539959294, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 35532706455, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 15607508 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 15607508, + "Tuples_Read" : 15607508, + "Read_Duration" : 1100316, + "Write_Duration" : 275727, + "Process_Begin_Time" : 92543521820994, + "Process_End_Time" : 92543523220279 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 7853, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 35512593457, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 31215016, + "VIRTUAL_MEMORY_BYTES" : 250976251904, + "MAP_INPUT_RECORDS" : 15607508, + "LOCALIZED_NANOS" : 60749331144, + "SPLIT_RAW_BYTES" : 63860, + "LOCALIZED_BYTES_MISSED" : 8940318359, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 120924879885, + 
"PHYSICAL_MEMORY_BYTES" : 48828997632, + "GC_TIME_MILLIS" : 24284, + "LOCALIZED_FILES_CACHED" : 62269, + "LOCALIZED_BYTES_CACHED" : 57184066721, + "MAP_OUTPUT_RECORDS" : 15607508, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 2888130, + "COMMITTED_HEAP_BYTES" : 50349486080 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 58390097107, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 40, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 58375963684, + "HDFS_WRITE_OPS" : 40, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 28373824998 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 15607508 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 15607508, + "Tuples_Read" : 15607508, + "Read_Duration" : 864218, + "Write_Duration" : 2560856, + "Process_Begin_Time" : 14926375487475, + "Process_End_Time" : 14926378973567 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 9, + "MERGED_MAP_OUTPUTS" : 620, + "REDUCE_INPUT_RECORDS" : 15607508, + "SPILLED_RECORDS" : 25671754, + "VIRTUAL_MEMORY_BYTES" : 41217945600, + "LOCALIZED_NANOS" : 4369892739, + "LOCALIZED_BYTES_MISSED" : 10204128, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 35512593457, + "PHYSICAL_MEMORY_BYTES" : 4433371136, + "GC_TIME_MILLIS" : 13629, + "REDUCE_INPUT_GROUPS" : 10, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 620, + "LOCALIZED_FILES_CACHED" : 11301, + "REDUCE_OUTPUT_RECORDS" : 15607508, + "LOCALIZED_BYTES_CACHED" : 10655019272, + 
"COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 4088940, + "COMMITTED_HEAP_BYTES" : 5069864960 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "3" + }, + "submitDate" : 1492637423760, + "launchDate" : 1492637430152, + "finishDate" : 1492638011309, + "runTime" : 581157 + } ] +}, { + "flowKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492632715606, + "encodedRunId" : 9223370544222060201 + }, + "flowName" : "somegoodjob", + "userName" : "testuser", + "jobCount" : 3, + "totalMaps" : 385, + "totalReduces" : 137, + "mapFileBytesRead" : 113172163234, + "mapFileBytesWritten" : 226544957183, + "reduceFileBytesRead" : 163250996657, + "hdfsBytesRead" : 266627900618, + "hdfsBytesWritten" : 103607443641, + "mapSlotMillis" : 29331332, + "reduceSlotMillis" : 17119024, + "megabyteMillis" : 150597505536, + "cost" : 0.16340875166666663, + "reduceShuffleBytes" : 113144899105, + "duration" : 1355698, + "wallClockTime" : 1368561, + "cluster" : "test@cluster", + "appId" : "somegoodjob", + "runId" : 1492632715606, + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "submitTime" : 1492632797525, + "launchTime" : 1492632810388, + "finishTime" : 1492634166086, + "queue" : "testuser", + "counters" : { + "Scalding Custom" : { + "test_user" : 810 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 48800236, + "Tuples_Read" : 195996831 + }, + "evictions" : { + "MapsideReduce" : 12771518 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 72741055, + "SPILLED_RECORDS" : 231774506, + "VIRTUAL_MEMORY_BYTES" : 2122960666624, + "LOCALIZED_NANOS" : 682258492993, + "SPLIT_RAW_BYTES" : 815480, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 387016458073, + 
"REDUCE_SHUFFLE_BYTES" : 113144899105, + "PHYSICAL_MEMORY_BYTES" : 464332615680, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 526479, + "CPU_MILLISECONDS" : 36129990, + "LOCALIZED_FILES_MISSED" : 67753, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 113144899105, + "MERGED_MAP_OUTPUTS" : 21045, + "MAP_INPUT_RECORDS" : 195996831, + "LOCALIZED_BYTES_MISSED" : 79988792697, + "GC_TIME_MILLIS" : 3804321, + "REDUCE_INPUT_GROUPS" : 55895910, + "SHUFFLED_MAPS" : 21045, + "REDUCE_OUTPUT_RECORDS" : 48800236, + "LOCALIZED_BYTES_CACHED" : 479959047536, + "MAP_OUTPUT_RECORDS" : 72741055, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 474627133440 + }, + "hits" : { + "MapsideReduce" : 1603 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 389979938079, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 1961, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 266627900618, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 276423159891, + "HDFS_WRITE_OPS" : 294, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 103607443641 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 121541291, + "Tuples_Read" : 268737886, + "Read_Duration" : 17360974, + "Write_Duration" : 6210507, + "Process_Begin_Time" : 779154533017338, + "Process_End_Time" : 779154558057047 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 163208962, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16803506 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 403, + "VCORES_MILLIS_REDUCES" : 14979147, + "TOTAL_LAUNCHED_REDUCES" : 141, + "NUM_KILLED_MAPS" : 17, + "OTHER_LOCAL_MAPS" : 205, + "DATA_LOCAL_MAPS" : 19, + "NUM_KILLED_REDUCES" : 4, + "MB_MILLIS_MAPS" : 91983058944, + "SLOTS_MILLIS_REDUCES" : 17119024, + "VCORES_MILLIS_MAPS" : 
25664916, + "NUM_FAILED_MAPS" : 1, + "MB_MILLIS_REDUCES" : 53685262848, + "SLOTS_MILLIS_MAPS" : 29331332, + "RACK_LOCAL_MAPS" : 180, + "MILLIS_REDUCES" : 14979147, + "MILLIS_MAPS" : 25664916 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + } + }, + "mapCounters" : { + "Scalding Custom" : { + "test_user" : 810 + }, + "hits" : { + "MapsideReduce" : 1603 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 226544957183, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 1540, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 266627900618, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 113172163234, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 195996831 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 72741055, + "Tuples_Read" : 195996831, + "Read_Duration" : 13299362, + "Write_Duration" : 2170007, + "Process_Begin_Time" : 574663776002832, + "Process_End_Time" : 574663792332928 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 163208962, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16803506 + }, + "evictions" : { + "MapsideReduce" : 12771518 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 51827, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 113144899105, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 122300442, + "VIRTUAL_MEMORY_BYTES" : 1557522055168, + "MAP_INPUT_RECORDS" : 195996831, + "LOCALIZED_NANOS" : 526011367517, + "SPLIT_RAW_BYTES" : 815480, + "LOCALIZED_BYTES_MISSED" : 61496078904, + "FAILED_SHUFFLE" : 0, + 
"MAP_OUTPUT_BYTES" : 387016458073, + "PHYSICAL_MEMORY_BYTES" : 402146570240, + "GC_TIME_MILLIS" : 3714743, + "LOCALIZED_FILES_CACHED" : 383930, + "LOCALIZED_BYTES_CACHED" : 349131284361, + "MAP_OUTPUT_RECORDS" : 72741055, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 23949430, + "COMMITTED_HEAP_BYTES" : 405168017408 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 163434980896, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 421, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 163250996657, + "HDFS_WRITE_OPS" : 294, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 103607443641 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 48800236 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 48800236, + "Tuples_Read" : 72741055, + "Read_Duration" : 4061612, + "Write_Duration" : 4040500, + "Process_Begin_Time" : 204490757014506, + "Process_End_Time" : 204490765724119 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 15914, + "MERGED_MAP_OUTPUTS" : 21045, + "REDUCE_INPUT_RECORDS" : 72741055, + "SPILLED_RECORDS" : 109474064, + "VIRTUAL_MEMORY_BYTES" : 565438611456, + "LOCALIZED_NANOS" : 154690352172, + "LOCALIZED_BYTES_MISSED" : 18484109247, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 113144899105, + "PHYSICAL_MEMORY_BYTES" : 62186045440, + "GC_TIME_MILLIS" : 89578, + "REDUCE_INPUT_GROUPS" : 55895910, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 21045, + "LOCALIZED_FILES_CACHED" : 139160, + 
"REDUCE_OUTPUT_RECORDS" : 48800236, + "LOCALIZED_BYTES_CACHED" : 127635845787, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 12180560, + "COMMITTED_HEAP_BYTES" : 69459116032 + } + }, + "jobs" : [ { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492632715606, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6608374, + "jobIdString" : "job_1470171371859_6608374" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6608374, + "jobIdString" : "job_1470171371859_6608374" + }, + "encodedRunId" : 9223370544222060201 + }, + "jobId" : "job_1470171371859_6608374", + "jobName" : "[C6D9BD40285E426AA4186394C126B551/B89F9F1DAE1B4A98B710A4D32EC55873] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(2/3)", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492632797525, + "launchTime" : 1492632810388, + "finishTime" : 1492633163993, + "totalMaps" : 129, + "totalReduces" : 64, + "finishedMaps" : 129, + "finishedReduces" : 64, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 38269393803, + "mapFileBytesWritten" : 76454763405, + "reduceFileBytesRead" : 39043379028, + "hdfsBytesRead" : 96004737363, + "hdfsBytesWritten" : 38303868081, + "mapSlotMillis" : 14364674, + "reduceSlotMillis" : 6616707, + "reduceShuffleBytes" : 38146657114, + "megabyteMillis" : 67245979136, + "cost" : 0.07296655722222221, + "counters" : { + "Scalding Custom" : { + "test_user" : 810 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16775502, + "Tuples_Read" : 81604481 + }, + "evictions" : 
{ + "MapsideReduce" : 12771518 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 16803506, + "SPILLED_RECORDS" : 50771480, + "VIRTUAL_MEMORY_BYTES" : 786154287104, + "LOCALIZED_NANOS" : 261150864082, + "SPLIT_RAW_BYTES" : 342753, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 131133636940, + "REDUCE_SHUFFLE_BYTES" : 38146657114, + "PHYSICAL_MEMORY_BYTES" : 232995028992, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 194946, + "CPU_MILLISECONDS" : 15268200, + "LOCALIZED_FILES_MISSED" : 24664, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38146657114, + "MERGED_MAP_OUTPUTS" : 8256, + "MAP_INPUT_RECORDS" : 81604481, + "LOCALIZED_BYTES_MISSED" : 29607543994, + "GC_TIME_MILLIS" : 3649536, + "REDUCE_INPUT_GROUPS" : 16775502, + "SHUFFLED_MAPS" : 8256, + "REDUCE_OUTPUT_RECORDS" : 16775502, + "LOCALIZED_BYTES_CACHED" : 177307666049, + "MAP_OUTPUT_RECORDS" : 16803506, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 229451812864 + }, + "hits" : { + "MapsideReduce" : 1603 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 115583763634, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 708, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 96004737363, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 77312772831, + "HDFS_WRITE_OPS" : 128, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 38303868081 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 33579008, + "Tuples_Read" : 98407987, + "Read_Duration" : 8487639, + "Write_Duration" : 2047170, + "Process_Begin_Time" : 288078153269647, + "Process_End_Time" : 288078164714486 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 81604481, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16803506 + }, + "org.apache.hadoop.mapreduce.JobCounter" : 
{ + "TOTAL_LAUNCHED_MAPS" : 141, + "VCORES_MILLIS_REDUCES" : 5789619, + "TOTAL_LAUNCHED_REDUCES" : 65, + "NUM_KILLED_MAPS" : 12, + "OTHER_LOCAL_MAPS" : 77, + "DATA_LOCAL_MAPS" : 1, + "NUM_KILLED_REDUCES" : 1, + "MB_MILLIS_MAPS" : 45047618560, + "SLOTS_MILLIS_REDUCES" : 6616707, + "VCORES_MILLIS_MAPS" : 12569090, + "MB_MILLIS_REDUCES" : 20749994496, + "SLOTS_MILLIS_MAPS" : 14364674, + "RACK_LOCAL_MAPS" : 64, + "MILLIS_REDUCES" : 5789619, + "MILLIS_MAPS" : 12569090 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + } + }, + "mapCounters" : { + "Scalding Custom" : { + "test_user" : 810 + }, + "hits" : { + "MapsideReduce" : 1603 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 76454763405, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 516, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 96004737363, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 38269393803, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 81604481 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16803506, + "Tuples_Read" : 81604481, + "Read_Duration" : 6942029, + "Write_Duration" : 1399751, + "Process_Begin_Time" : 192549636233632, + "Process_End_Time" : 192549645230601 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 81604481, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 16803506 + }, + "evictions" : { + "MapsideReduce" : 12771518 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 16732, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38146657114, + 
"MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 33575024, + "VIRTUAL_MEMORY_BYTES" : 521721344000, + "MAP_INPUT_RECORDS" : 81604481, + "LOCALIZED_NANOS" : 180981945921, + "SPLIT_RAW_BYTES" : 342753, + "LOCALIZED_BYTES_MISSED" : 20331938692, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 131133636940, + "PHYSICAL_MEMORY_BYTES" : 203313250304, + "GC_TIME_MILLIS" : 3612486, + "LOCALIZED_FILES_CACHED" : 129296, + "LOCALIZED_BYTES_CACHED" : 117255771650, + "MAP_OUTPUT_RECORDS" : 16803506, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 11626970, + "COMMITTED_HEAP_BYTES" : 197002711040 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 39129000229, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 192, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 39043379028, + "HDFS_WRITE_OPS" : 128, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 38303868081 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16775502 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16775502, + "Tuples_Read" : 16803506, + "Read_Duration" : 1545610, + "Write_Duration" : 647419, + "Process_Begin_Time" : 95528517036015, + "Process_End_Time" : 95528519483885 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 7927, + "MERGED_MAP_OUTPUTS" : 8256, + "REDUCE_INPUT_RECORDS" : 16803506, + "SPILLED_RECORDS" : 17196456, + "VIRTUAL_MEMORY_BYTES" : 264432943104, + "LOCALIZED_NANOS" : 79671609404, + "LOCALIZED_BYTES_MISSED" : 9269815365, + "FAILED_SHUFFLE" : 0, + 
"REDUCE_SHUFFLE_BYTES" : 38146657114, + "PHYSICAL_MEMORY_BYTES" : 29681778688, + "GC_TIME_MILLIS" : 37050, + "REDUCE_INPUT_GROUPS" : 16775502, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 8256, + "LOCALIZED_FILES_CACHED" : 64521, + "REDUCE_OUTPUT_RECORDS" : 16775502, + "LOCALIZED_BYTES_CACHED" : 58990754107, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 3641230, + "COMMITTED_HEAP_BYTES" : 32449101824 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "2" + }, + "submitDate" : 1492632797525, + "launchDate" : 1492632810388, + "finishDate" : 1492633163993, + "runTime" : 353605 + }, { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492632715606, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6608570, + "jobIdString" : "job_1470171371859_6608570" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6608570, + "jobIdString" : "job_1470171371859_6608570" + }, + "encodedRunId" : 9223370544222060201 + }, + "jobId" : "job_1470171371859_6608570", + "jobName" : "[C6D9BD40285E426AA4186394C126B551/953E3620701A4D9183064DE40D3509F2] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(1/3)", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492633250502, + "launchTime" : 1492633263806, + "finishTime" : 1492633490918, + "totalMaps" : 193, + "totalReduces" : 63, + "finishedMaps" : 193, + "finishedReduces" : 63, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 38605064146, + "mapFileBytesWritten" : 77444144858, + "reduceFileBytesRead" : 64369198442, + "hdfsBytesRead" : 134308670528, + "hdfsBytesWritten" : 36314427837, + "mapSlotMillis" : 11418369, + "reduceSlotMillis" : 5361488, + "reduceShuffleBytes" : 38721089102, + 
"megabyteMillis" : 53551882752, + "cost" : 0.05810751166666666, + "counters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 141897586642, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 961, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 134308670528, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 102974262588, + "HDFS_WRITE_OPS" : 126, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 36314427837 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16012367, + "Tuples_Read" : 98379983 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 55937549, + "Tuples_Read" : 138305165, + "Read_Duration" : 6707040, + "Write_Duration" : 1074482, + "Process_Begin_Time" : 382114128351773, + "Process_End_Time" : 382114136600916 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 81604481, + "Records Skipped" : 0 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 196, + "VCORES_MILLIS_REDUCES" : 4691302, + "TOTAL_LAUNCHED_REDUCES" : 66, + "NUM_KILLED_MAPS" : 3, + "OTHER_LOCAL_MAPS" : 86, + "DATA_LOCAL_MAPS" : 15, + "NUM_KILLED_REDUCES" : 3, + "MB_MILLIS_MAPS" : 35808005632, + "SLOTS_MILLIS_REDUCES" : 5361488, + "VCORES_MILLIS_MAPS" : 9991073, + "MB_MILLIS_REDUCES" : 16813626368, + "SLOTS_MILLIS_MAPS" : 11418369, + "RACK_LOCAL_MAPS" : 95, + "MILLIS_REDUCES" : 4691302, + "MILLIS_MAPS" : 9991073 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 39925182, 
+ "SPILLED_RECORDS" : 122551413, + "VIRTUAL_MEMORY_BYTES" : 1040169828352, + "LOCALIZED_NANOS" : 328012266597, + "SPLIT_RAW_BYTES" : 407837, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 131829322449, + "REDUCE_SHUFFLE_BYTES" : 38721089102, + "PHYSICAL_MEMORY_BYTES" : 177282752512, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 260456, + "CPU_MILLISECONDS" : 13393960, + "LOCALIZED_FILES_MISSED" : 30470, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38721089102, + "MERGED_MAP_OUTPUTS" : 12159, + "MAP_INPUT_RECORDS" : 98379983, + "LOCALIZED_BYTES_MISSED" : 35659339426, + "GC_TIME_MILLIS" : 114044, + "REDUCE_INPUT_GROUPS" : 39120398, + "SHUFFLED_MAPS" : 12159, + "REDUCE_OUTPUT_RECORDS" : 16012367, + "LOCALIZED_BYTES_CACHED" : 238450653585, + "MAP_OUTPUT_RECORDS" : 39925182, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 188948316160 + } + }, + "mapCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 77444144858, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 772, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 134308670528, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 38605064146, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 98379983 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 39925182, + "Tuples_Read" : 98379983, + "Read_Duration" : 5140255, + "Write_Duration" : 483616, + "Process_Begin_Time" : 288078225244627, + "Process_End_Time" : 288078231048777 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 81604481, + "Records Skipped" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 23153, 
+ "MAP_OUTPUT_MATERIALIZED_BYTES" : 38721089102, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 56700684, + "VIRTUAL_MEMORY_BYTES" : 780523053056, + "MAP_INPUT_RECORDS" : 98379983, + "LOCALIZED_NANOS" : 259273008863, + "SPLIT_RAW_BYTES" : 407837, + "LOCALIZED_BYTES_MISSED" : 27242131551, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 131829322449, + "PHYSICAL_MEMORY_BYTES" : 149129138176, + "GC_TIME_MILLIS" : 75962, + "LOCALIZED_FILES_CACHED" : 195323, + "LOCALIZED_BYTES_CACHED" : 178606685961, + "MAP_OUTPUT_RECORDS" : 39925182, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 9260920, + "COMMITTED_HEAP_BYTES" : 157008166912 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 64453441784, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 189, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 64369198442, + "HDFS_WRITE_OPS" : 126, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 36314427837 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16012367 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16012367, + "Tuples_Read" : 39925182, + "Read_Duration" : 1566785, + "Write_Duration" : 590866, + "Process_Begin_Time" : 94035903107146, + "Process_End_Time" : 94035905552139 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 7313, + "MERGED_MAP_OUTPUTS" : 12159, + "REDUCE_INPUT_RECORDS" : 39925182, + "SPILLED_RECORDS" : 65850729, + "VIRTUAL_MEMORY_BYTES" : 259646775296, + "LOCALIZED_NANOS" : 68072310662, + "LOCALIZED_BYTES_MISSED" 
: 8415594516, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 38721089102, + "PHYSICAL_MEMORY_BYTES" : 28153614336, + "GC_TIME_MILLIS" : 38082, + "REDUCE_INPUT_GROUPS" : 39120398, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 12159, + "LOCALIZED_FILES_CACHED" : 64003, + "REDUCE_OUTPUT_RECORDS" : 16012367, + "LOCALIZED_BYTES_CACHED" : 58778579076, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 4133040, + "COMMITTED_HEAP_BYTES" : 31940149248 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "1" + }, + "submitDate" : 1492633250502, + "launchDate" : 1492633263806, + "finishDate" : 1492633490918, + "runTime" : 227112 + }, { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492632715606, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6608659, + "jobIdString" : "job_1470171371859_6608659" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6608659, + "jobIdString" : "job_1470171371859_6608659" + }, + "encodedRunId" : 9223370544222060201 + }, + "jobId" : "job_1470171371859_6608659", + "jobName" : "[C6D9BD40285E426AA4186394C126B551/F612986BE34B4779BBC6323D3B3011F1] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(3/3) .../unhydrated/2017/04/19/18", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492633529277, + "launchTime" : 1492633543389, + "finishTime" : 1492634166086, + "totalMaps" : 63, + "totalReduces" : 10, + "finishedMaps" : 63, + "finishedReduces" : 10, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 36297705285, + "mapFileBytesWritten" : 72646048920, + "reduceFileBytesRead" : 59838419187, + "hdfsBytesRead" : 36314492727, + "hdfsBytesWritten" : 28989147723, + "mapSlotMillis" : 3548289, + 
"reduceSlotMillis" : 5140829, + "reduceShuffleBytes" : 36277152889, + "megabyteMillis" : 29799643648, + "cost" : 0.032334682777777776, + "counters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 132498587803, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 292, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 36314492727, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 96136124472, + "HDFS_WRITE_OPS" : 40, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 28989147723 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16012367, + "Tuples_Read" : 16012367 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 32024734, + "Tuples_Read" : 32024734, + "Read_Duration" : 2166295, + "Write_Duration" : 3088855, + "Process_Begin_Time" : 108962251395918, + "Process_End_Time" : 108962256741645 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 66, + "VCORES_MILLIS_REDUCES" : 4498226, + "TOTAL_LAUNCHED_REDUCES" : 10, + "NUM_KILLED_MAPS" : 2, + "OTHER_LOCAL_MAPS" : 42, + "DATA_LOCAL_MAPS" : 3, + "MB_MILLIS_MAPS" : 11127434752, + "SLOTS_MILLIS_REDUCES" : 5140829, + "VCORES_MILLIS_MAPS" : 3104753, + "NUM_FAILED_MAPS" : 1, + "MB_MILLIS_REDUCES" : 16121641984, + "SLOTS_MILLIS_MAPS" : 3548289, + "RACK_LOCAL_MAPS" : 21, + "MILLIS_REDUCES" : 4498226, + "MILLIS_MAPS" : 3104753 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 16012367, + "SPILLED_RECORDS" : 58451613, + "VIRTUAL_MEMORY_BYTES" : 296636551168, + 
"LOCALIZED_NANOS" : 93095362314, + "SPLIT_RAW_BYTES" : 64890, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 124053498684, + "REDUCE_SHUFFLE_BYTES" : 36277152889, + "PHYSICAL_MEMORY_BYTES" : 54054834176, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 71077, + "CPU_MILLISECONDS" : 7467830, + "LOCALIZED_FILES_MISSED" : 12619, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 36277152889, + "MERGED_MAP_OUTPUTS" : 630, + "MAP_INPUT_RECORDS" : 16012367, + "LOCALIZED_BYTES_MISSED" : 14721909277, + "GC_TIME_MILLIS" : 40741, + "REDUCE_INPUT_GROUPS" : 10, + "SHUFFLED_MAPS" : 630, + "REDUCE_OUTPUT_RECORDS" : 16012367, + "LOCALIZED_BYTES_CACHED" : 64200727902, + "MAP_OUTPUT_RECORDS" : 16012367, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 56227004416 + } + }, + "mapCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 72646048920, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 252, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 36314492727, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 36297705285, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 16012367 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16012367, + "Tuples_Read" : 16012367, + "Read_Duration" : 1217078, + "Write_Duration" : 286640, + "Process_Begin_Time" : 94035914524573, + "Process_End_Time" : 94035916053550 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 11942, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 36277152889, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 32024734, + "VIRTUAL_MEMORY_BYTES" : 255277658112, + "MAP_INPUT_RECORDS" : 16012367, + "LOCALIZED_NANOS" : 85756412733, + 
"SPLIT_RAW_BYTES" : 64890, + "LOCALIZED_BYTES_MISSED" : 13922008661, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 124053498684, + "PHYSICAL_MEMORY_BYTES" : 49704181760, + "GC_TIME_MILLIS" : 26295, + "LOCALIZED_FILES_CACHED" : 59311, + "LOCALIZED_BYTES_CACHED" : 53268826750, + "MAP_OUTPUT_RECORDS" : 16012367, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 3061540, + "COMMITTED_HEAP_BYTES" : 51157139456 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 59852538883, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 40, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 59838419187, + "HDFS_WRITE_OPS" : 40, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 28989147723 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 16012367 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 16012367, + "Tuples_Read" : 16012367, + "Read_Duration" : 949217, + "Write_Duration" : 2802215, + "Process_Begin_Time" : 14926336871345, + "Process_End_Time" : 14926340688095 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 674, + "MERGED_MAP_OUTPUTS" : 630, + "REDUCE_INPUT_RECORDS" : 16012367, + "SPILLED_RECORDS" : 26426879, + "VIRTUAL_MEMORY_BYTES" : 41358893056, + "LOCALIZED_NANOS" : 6946432106, + "LOCALIZED_BYTES_MISSED" : 798699366, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 36277152889, + "PHYSICAL_MEMORY_BYTES" : 4350652416, + "GC_TIME_MILLIS" : 14446, + "REDUCE_INPUT_GROUPS" : 10, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 630, + 
"LOCALIZED_FILES_CACHED" : 10636, + "REDUCE_OUTPUT_RECORDS" : 16012367, + "LOCALIZED_BYTES_CACHED" : 9866512604, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 4406290, + "COMMITTED_HEAP_BYTES" : 5069864960 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "3" + }, + "submitDate" : 1492633529277, + "launchDate" : 1492633543389, + "finishDate" : 1492634166086, + "runTime" : 622697 + } ] +}, { + "flowKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492629120238, + "encodedRunId" : 9223370544225655569 + }, + "flowName" : "somegoodjob", + "userName" : "testuser", + "jobCount" : 3, + "totalMaps" : 415, + "totalReduces" : 141, + "mapFileBytesRead" : 119652359797, + "mapFileBytesWritten" : 239507162599, + "reduceFileBytesRead" : 184321388670, + "hdfsBytesRead" : 282024326201, + "hdfsBytesWritten" : 109915817955, + "mapSlotMillis" : 29834574, + "reduceSlotMillis" : 17818367, + "megabyteMillis" : 154376586752, + "cost" : 0.1675093172222222, + "reduceShuffleBytes" : 119607357030, + "duration" : 1386911, + "wallClockTime" : 1422000, + "cluster" : "test@cluster", + "appId" : "somegoodjob", + "runId" : 1492629120238, + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "submitTime" : 1492629217932, + "launchTime" : 1492629253021, + "finishTime" : 1492630639932, + "queue" : "testuser", + "counters" : { + "Scalding Custom" : { + "test_user" : 805 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 51711261, + "Tuples_Read" : 206828887 + }, + "evictions" : { + "MapsideReduce" : 12711135 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 75745072, + "SPILLED_RECORDS" : 247366146, + "VIRTUAL_MEMORY_BYTES" : 2261545836544, + "LOCALIZED_NANOS" 
: 621652509094, + "SPLIT_RAW_BYTES" : 888563, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 410957748527, + "REDUCE_SHUFFLE_BYTES" : 119607357030, + "PHYSICAL_MEMORY_BYTES" : 514274934784, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 573353, + "CPU_MILLISECONDS" : 36940780, + "LOCALIZED_FILES_MISSED" : 59363, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 119607357030, + "MERGED_MAP_OUTPUTS" : 23560, + "MAP_INPUT_RECORDS" : 206828887, + "LOCALIZED_BYTES_MISSED" : 69180413795, + "GC_TIME_MILLIS" : 3829080, + "REDUCE_INPUT_GROUPS" : 57952423, + "SHUFFLED_MAPS" : 23560, + "REDUCE_OUTPUT_RECORDS" : 51711261, + "LOCALIZED_BYTES_CACHED" : 527031421131, + "MAP_OUTPUT_RECORDS" : 75745072, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 524035977216 + }, + "hits" : { + "MapsideReduce" : 1935 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 424018044192, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 2093, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 282024326201, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 303973748467, + "HDFS_WRITE_OPS" : 302, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 109915817955 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 127456333, + "Tuples_Read" : 282573959, + "Read_Duration" : 17267767, + "Write_Duration" : 6329319, + "Process_Begin_Time" : 829902072807036, + "Process_End_Time" : 829902097833076 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 172118504, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 17739761 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 432, + "VCORES_MILLIS_REDUCES" : 15591072, + "TOTAL_LAUNCHED_REDUCES" : 145, + "NUM_KILLED_MAPS" : 17, + "OTHER_LOCAL_MAPS" : 204, + "DATA_LOCAL_MAPS" : 15, + 
"NUM_KILLED_REDUCES" : 4, + "MB_MILLIS_MAPS" : 93561226752, + "SLOTS_MILLIS_REDUCES" : 17818367, + "VCORES_MILLIS_MAPS" : 26105253, + "MB_MILLIS_REDUCES" : 55878402048, + "SLOTS_MILLIS_MAPS" : 29834574, + "RACK_LOCAL_MAPS" : 213, + "MILLIS_REDUCES" : 15591072, + "MILLIS_MAPS" : 26105253 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + } + }, + "mapCounters" : { + "Scalding Custom" : { + "test_user" : 805 + }, + "hits" : { + "MapsideReduce" : 1935 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 239507162599, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 1660, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 282024326201, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 119652359797, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 206828887 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 75745072, + "Tuples_Read" : 206828887, + "Read_Duration" : 13387512, + "Write_Duration" : 2169585, + "Process_Begin_Time" : 619441287516435, + "Process_End_Time" : 619441303935634 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 172118504, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 17739761 + }, + "evictions" : { + "MapsideReduce" : 12711135 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 45265, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 119607357030, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 128166590, + "VIRTUAL_MEMORY_BYTES" : 1678871752704, + "MAP_INPUT_RECORDS" : 206828887, + "LOCALIZED_NANOS" : 439998797901, 
+ "SPLIT_RAW_BYTES" : 888563, + "LOCALIZED_BYTES_MISSED" : 52866562203, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 410957748527, + "PHYSICAL_MEMORY_BYTES" : 450766368768, + "GC_TIME_MILLIS" : 3740200, + "LOCALIZED_FILES_CACHED" : 424448, + "LOCALIZED_BYTES_CACHED" : 389758256396, + "MAP_OUTPUT_RECORDS" : 75745072, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 24525290, + "COMMITTED_HEAP_BYTES" : 452546949120 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 184510881593, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 433, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 184321388670, + "HDFS_WRITE_OPS" : 302, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 109915817955 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 51711261 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 51711261, + "Tuples_Read" : 75745072, + "Read_Duration" : 3880255, + "Write_Duration" : 4159734, + "Process_Begin_Time" : 210460785290601, + "Process_End_Time" : 210460793897442 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 14087, + "MERGED_MAP_OUTPUTS" : 23560, + "REDUCE_INPUT_RECORDS" : 75745072, + "SPILLED_RECORDS" : 119199556, + "VIRTUAL_MEMORY_BYTES" : 582674083840, + "LOCALIZED_NANOS" : 175613465264, + "LOCALIZED_BYTES_MISSED" : 16309415297, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 119607357030, + "PHYSICAL_MEMORY_BYTES" : 63508566016, + "GC_TIME_MILLIS" : 88880, + "REDUCE_INPUT_GROUPS" : 57952423, + 
"COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 23560, + "LOCALIZED_FILES_CACHED" : 145515, + "REDUCE_OUTPUT_RECORDS" : 51711261, + "LOCALIZED_BYTES_CACHED" : 134076999091, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 12415490, + "COMMITTED_HEAP_BYTES" : 71489028096 + } + }, + "jobs" : [ { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492629120238, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6607397, + "jobIdString" : "job_1470171371859_6607397" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6607397, + "jobIdString" : "job_1470171371859_6607397" + }, + "encodedRunId" : 9223370544225655569 + }, + "jobId" : "job_1470171371859_6607397", + "jobName" : "[8BEFB46B10BB4992B069A0452C1A7209/6CEF6CE735EA4F22A1D10798AB978AC9] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(2/3)", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492629217932, + "launchTime" : 1492629253021, + "finishTime" : 1492629545471, + "totalMaps" : 142, + "totalReduces" : 64, + "finishedMaps" : 142, + "finishedReduces" : 64, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 40340004073, + "mapFileBytesWritten" : 80595282050, + "reduceFileBytesRead" : 52002327599, + "hdfsBytesRead" : 101477835921, + "hdfsBytesWritten" : 40416854940, + "mapSlotMillis" : 13609547, + "reduceSlotMillis" : 6905340, + "reduceShuffleBytes" : 40209861419, + "megabyteMillis" : 65532563968, + "cost" : 0.07110738277777777, + "counters" : { + "Scalding Custom" : { + "test_user" : 805 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + 
"cascading.flow.StepCounters" : { + "Tuples_Written" : 17709505, + "Tuples_Read" : 86059252 + }, + "evictions" : { + "MapsideReduce" : 12711135 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 17739761, + "SPILLED_RECORDS" : 58676037, + "VIRTUAL_MEMORY_BYTES" : 838904201216, + "LOCALIZED_NANOS" : 198603202103, + "SPLIT_RAW_BYTES" : 376726, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 138812514335, + "REDUCE_SHUFFLE_BYTES" : 40209861419, + "PHYSICAL_MEMORY_BYTES" : 268000686080, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 213660, + "CPU_MILLISECONDS" : 15391480, + "LOCALIZED_FILES_MISSED" : 20666, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 40209861419, + "MERGED_MAP_OUTPUTS" : 9088, + "MAP_INPUT_RECORDS" : 86059252, + "LOCALIZED_BYTES_MISSED" : 24468451612, + "GC_TIME_MILLIS" : 3670350, + "REDUCE_INPUT_GROUPS" : 17709505, + "SHUFFLED_MAPS" : 9088, + "REDUCE_OUTPUT_RECORDS" : 17709505, + "LOCALIZED_BYTES_CACHED" : 196312461278, + "MAP_OUTPUT_RECORDS" : 17739761, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 263016857600 + }, + "hits" : { + "MapsideReduce" : 1935 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 132683302440, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 760, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 101477835921, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 92342331672, + "HDFS_WRITE_OPS" : 128, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 40416854940 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 35449266, + "Tuples_Read" : 103799013, + "Read_Duration" : 8173153, + "Write_Duration" : 1968986, + "Process_Begin_Time" : 307481645061346, + "Process_End_Time" : 307481656087835 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 86059252, + 
"Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 17739761 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 153, + "VCORES_MILLIS_REDUCES" : 6042173, + "TOTAL_LAUNCHED_REDUCES" : 64, + "NUM_KILLED_MAPS" : 11, + "OTHER_LOCAL_MAPS" : 79, + "DATA_LOCAL_MAPS" : 3, + "MB_MILLIS_MAPS" : 42679540736, + "SLOTS_MILLIS_REDUCES" : 6905340, + "VCORES_MILLIS_MAPS" : 11908354, + "MB_MILLIS_REDUCES" : 21655148032, + "SLOTS_MILLIS_MAPS" : 13609547, + "RACK_LOCAL_MAPS" : 71, + "MILLIS_REDUCES" : 6042173, + "MILLIS_MAPS" : 11908354 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + } + }, + "mapCounters" : { + "Scalding Custom" : { + "test_user" : 805 + }, + "hits" : { + "MapsideReduce" : 1935 + }, + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 80595282050, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 568, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 101477835921, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 40340004073, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 86059252 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 17739761, + "Tuples_Read" : 86059252, + "Read_Duration" : 6780865, + "Write_Duration" : 1351893, + "Process_Begin_Time" : 211953357561478, + "Process_End_Time" : 211953366347172 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 86059252, + "Records Skipped" : 0 + }, + "misses" : { + "MapsideReduce" : 17739761 + }, + "evictions" : { + "MapsideReduce" : 12711135 + }, + 
"org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 14551, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 40209861419, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 35450896, + "VIRTUAL_MEMORY_BYTES" : 574310744064, + "MAP_INPUT_RECORDS" : 86059252, + "LOCALIZED_NANOS" : 135576295314, + "SPLIT_RAW_BYTES" : 376726, + "LOCALIZED_BYTES_MISSED" : 17303870269, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 138812514335, + "PHYSICAL_MEMORY_BYTES" : 238844108800, + "GC_TIME_MILLIS" : 3634307, + "LOCALIZED_FILES_CACHED" : 146193, + "LOCALIZED_BYTES_CACHED" : 134149432257, + "MAP_OUTPUT_RECORDS" : 17739761, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 11684060, + "COMMITTED_HEAP_BYTES" : 230569721856 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 52088020390, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 192, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 52002327599, + "HDFS_WRITE_OPS" : 128, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 40416854940 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 17709505 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 17709505, + "Tuples_Read" : 17739761, + "Read_Duration" : 1392288, + "Write_Duration" : 617093, + "Process_Begin_Time" : 95528287499868, + "Process_End_Time" : 95528289740663 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 6111, + "MERGED_MAP_OUTPUTS" : 9088, + "REDUCE_INPUT_RECORDS" : 17739761, + "SPILLED_RECORDS" : 23225141, + 
"VIRTUAL_MEMORY_BYTES" : 264593457152, + "LOCALIZED_NANOS" : 57979043535, + "LOCALIZED_BYTES_MISSED" : 7163002919, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 40209861419, + "PHYSICAL_MEMORY_BYTES" : 29156577280, + "GC_TIME_MILLIS" : 36043, + "REDUCE_INPUT_GROUPS" : 17709505, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 9088, + "LOCALIZED_FILES_CACHED" : 66337, + "REDUCE_OUTPUT_RECORDS" : 17709505, + "LOCALIZED_BYTES_CACHED" : 61097640473, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 3707420, + "COMMITTED_HEAP_BYTES" : 32447135744 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "2" + }, + "submitDate" : 1492629217932, + "launchDate" : 1492629253021, + "finishDate" : 1492629545471, + "runTime" : 292450 + }, { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492629120238, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6607542, + "jobIdString" : "job_1470171371859_6607542" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6607542, + "jobIdString" : "job_1470171371859_6607542" + }, + "encodedRunId" : 9223370544225655569 + }, + "jobId" : "job_1470171371859_6607542", + "jobName" : "[8BEFB46B10BB4992B069A0452C1A7209/6A94935E7180499FAC11473213CE76EF] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(1/3)", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492629664906, + "launchTime" : 1492629678327, + "finishTime" : 1492629922728, + "totalMaps" : 206, + "totalReduces" : 67, + "finishedMaps" : 206, + "finishedReduces" : 67, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 40706072815, + "mapFileBytesWritten" : 81646497859, + "reduceFileBytesRead" : 67871106082, + "hdfsBytesRead" : 141894757029, + 
"hdfsBytesWritten" : 38651664308, + "mapSlotMillis" : 12455721, + "reduceSlotMillis" : 5467194, + "reduceShuffleBytes" : 40814059608, + "megabyteMillis" : 57207329280, + "cost" : 0.062073925, + "counters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 149607275269, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 1025, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 141894757029, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 108577178897, + "HDFS_WRITE_OPS" : 134, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 38651664308 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 17000878, + "Tuples_Read" : 103768757 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 58005311, + "Tuples_Read" : 144773190, + "Read_Duration" : 6765949, + "Write_Duration" : 1103794, + "Process_Begin_Time" : 407487916987017, + "Process_End_Time" : 407487925303041 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + "Records Read" : 86059252, + "Records Skipped" : 0 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 210, + "VCORES_MILLIS_REDUCES" : 4783795, + "TOTAL_LAUNCHED_REDUCES" : 71, + "NUM_KILLED_MAPS" : 4, + "OTHER_LOCAL_MAPS" : 80, + "DATA_LOCAL_MAPS" : 12, + "NUM_KILLED_REDUCES" : 4, + "MB_MILLIS_MAPS" : 39061141504, + "SLOTS_MILLIS_REDUCES" : 5467194, + "VCORES_MILLIS_MAPS" : 10898756, + "MB_MILLIS_REDUCES" : 17145121280, + "SLOTS_MILLIS_MAPS" : 12455721, + "RACK_LOCAL_MAPS" : 118, + "MILLIS_REDUCES" : 4783795, + "MILLIS_MAPS" : 10898756 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" 
: 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 41004433, + "SPILLED_RECORDS" : 126273593, + "VIRTUAL_MEMORY_BYTES" : 1110279864320, + "LOCALIZED_NANOS" : 345506465153, + "SPLIT_RAW_BYTES" : 442894, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 139514063958, + "REDUCE_SHUFFLE_BYTES" : 40814059608, + "PHYSICAL_MEMORY_BYTES" : 189141204992, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 280089, + "CPU_MILLISECONDS" : 13531660, + "LOCALIZED_FILES_MISSED" : 30081, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 40814059608, + "MERGED_MAP_OUTPUTS" : 13802, + "MAP_INPUT_RECORDS" : 103768757, + "LOCALIZED_BYTES_MISSED" : 34340180070, + "GC_TIME_MILLIS" : 115317, + "REDUCE_INPUT_GROUPS" : 40242908, + "SHUFFLED_MAPS" : 13802, + "REDUCE_OUTPUT_RECORDS" : 17000878, + "LOCALIZED_BYTES_CACHED" : 257901926946, + "MAP_OUTPUT_RECORDS" : 41004433, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 201552314368 + } + }, + "mapCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 81646497859, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 824, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 141894757029, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 40706072815, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 103768757 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 41004433, + "Tuples_Read" : 103768757, + "Read_Duration" : 5291658, + "Write_Duration" : 508458, + "Process_Begin_Time" : 307481720391658, + "Process_End_Time" : 307481726373180 + }, + "LzoBlocks of com.twitter.tweetypie.thriftscala.TweetEvent" : { + "Errors" : 0, + 
"Records Read" : 86059252, + "Records Skipped" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 22237, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 40814059608, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 58713938, + "VIRTUAL_MEMORY_BYTES" : 833510277120, + "MAP_INPUT_RECORDS" : 103768757, + "LOCALIZED_NANOS" : 232395288843, + "SPLIT_RAW_BYTES" : 442894, + "LOCALIZED_BYTES_MISSED" : 25571400633, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 139514063958, + "PHYSICAL_MEMORY_BYTES" : 159147757568, + "GC_TIME_MILLIS" : 77909, + "LOCALIZED_FILES_CACHED" : 210955, + "LOCALIZED_BYTES_CACHED" : 194143118995, + "MAP_OUTPUT_RECORDS" : 41004433, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 9496830, + "COMMITTED_HEAP_BYTES" : 167580286976 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 67960777410, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 201, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 67871106082, + "HDFS_WRITE_OPS" : 134, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 38651664308 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 17000878 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 17000878, + "Tuples_Read" : 41004433, + "Read_Duration" : 1474291, + "Write_Duration" : 595336, + "Process_Begin_Time" : 100006196595359, + "Process_End_Time" : 100006198929861 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 7840, + "MERGED_MAP_OUTPUTS" : 13802, + "REDUCE_INPUT_RECORDS" : 
41004433, + "SPILLED_RECORDS" : 67559655, + "VIRTUAL_MEMORY_BYTES" : 276769587200, + "LOCALIZED_NANOS" : 112735693806, + "LOCALIZED_BYTES_MISSED" : 8767128243, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 40814059608, + "PHYSICAL_MEMORY_BYTES" : 29993447424, + "GC_TIME_MILLIS" : 37408, + "REDUCE_INPUT_GROUPS" : 40242908, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 13802, + "LOCALIZED_FILES_CACHED" : 68004, + "REDUCE_OUTPUT_RECORDS" : 17000878, + "LOCALIZED_BYTES_CACHED" : 62693419403, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 4034830, + "COMMITTED_HEAP_BYTES" : 33972027392 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "1" + }, + "submitDate" : 1492629664906, + "launchDate" : 1492629678327, + "finishDate" : 1492629922728, + "runTime" : 244401 + }, { + "jobKey" : { + "cluster" : "test@cluster", + "userName" : "testuser", + "appId" : "somegoodjob", + "runId" : 1492629120238, + "jobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6607618, + "jobIdString" : "job_1470171371859_6607618" + }, + "qualifiedJobId" : { + "cluster" : "test@cluster", + "jobEpoch" : 1470171371859, + "jobSequence" : 6607618, + "jobIdString" : "job_1470171371859_6607618" + }, + "encodedRunId" : 9223370544225655569 + }, + "jobId" : "job_1470171371859_6607618", + "jobName" : "[8BEFB46B10BB4992B069A0452C1A7209/B2D7B4CDD6864E76BE5F19F61600A47B] com.twitter.testuser.forward.testuserUnhydratedBatchJob/(3/3) .../unhydrated/2017/04/19/17", + "user" : "testuser", + "priority" : "", + "status" : "SUCCEEDED", + "version" : "02CFBD0A94AD5E297C2E4D6665B3B6F0", + "hadoopVersion" : "TWO", + "queue" : "testuser", + "submitTime" : 1492629953696, + "launchTime" : 1492629971471, + "finishTime" : 1492630639932, + "totalMaps" : 67, + "totalReduces" : 10, + "finishedMaps" : 67, + "finishedReduces" : 10, + "failedMaps" : 0, + "failedReduces" : 0, + "mapFileBytesRead" : 38606282909, + "mapFileBytesWritten" : 77265382690, + 
"reduceFileBytesRead" : 64447954989, + "hdfsBytesRead" : 38651733251, + "hdfsBytesWritten" : 30847298707, + "mapSlotMillis" : 3769306, + "reduceSlotMillis" : 5445833, + "reduceShuffleBytes" : 38583436003, + "megabyteMillis" : 31636693504, + "cost" : 0.03432800944444444, + "counters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 141727466483, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 308, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 38651733251, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 103054237898, + "HDFS_WRITE_OPS" : 40, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 30847298707 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 17000878, + "Tuples_Read" : 17000878 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 34001756, + "Tuples_Read" : 34001756, + "Read_Duration" : 2328665, + "Write_Duration" : 3256539, + "Process_Begin_Time" : 114932510758673, + "Process_End_Time" : 114932516442200 + }, + "org.apache.hadoop.mapreduce.JobCounter" : { + "TOTAL_LAUNCHED_MAPS" : 69, + "VCORES_MILLIS_REDUCES" : 4765104, + "TOTAL_LAUNCHED_REDUCES" : 10, + "NUM_KILLED_MAPS" : 2, + "OTHER_LOCAL_MAPS" : 45, + "MB_MILLIS_MAPS" : 11820544512, + "SLOTS_MILLIS_REDUCES" : 5445833, + "VCORES_MILLIS_MAPS" : 3298143, + "MB_MILLIS_REDUCES" : 17078132736, + "SLOTS_MILLIS_MAPS" : 3769306, + "RACK_LOCAL_MAPS" : 24, + "MILLIS_REDUCES" : 4765104, + "MILLIS_MAPS" : 3298143 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "REDUCE_INPUT_RECORDS" : 
17000878, + "SPILLED_RECORDS" : 62416516, + "VIRTUAL_MEMORY_BYTES" : 312361771008, + "LOCALIZED_NANOS" : 77542841838, + "SPLIT_RAW_BYTES" : 68943, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 132631170234, + "REDUCE_SHUFFLE_BYTES" : 38583436003, + "PHYSICAL_MEMORY_BYTES" : 57133043712, + "COMBINE_OUTPUT_RECORDS" : 0, + "LOCALIZED_FILES_CACHED" : 79604, + "CPU_MILLISECONDS" : 8017640, + "LOCALIZED_FILES_MISSED" : 8616, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38583436003, + "MERGED_MAP_OUTPUTS" : 670, + "MAP_INPUT_RECORDS" : 17000878, + "LOCALIZED_BYTES_MISSED" : 10371782113, + "GC_TIME_MILLIS" : 43413, + "REDUCE_INPUT_GROUPS" : 10, + "SHUFFLED_MAPS" : 670, + "REDUCE_OUTPUT_RECORDS" : 17000878, + "LOCALIZED_BYTES_CACHED" : 72817032907, + "MAP_OUTPUT_RECORDS" : 17000878, + "COMBINE_INPUT_RECORDS" : 0, + "COMMITTED_HEAP_BYTES" : 59466805248 + } + }, + "mapCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 77265382690, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 268, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 38651733251, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 38606282909, + "HDFS_WRITE_OPS" : 0, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 0 + }, + "org.apache.hadoop.mapreduce.lib.input.FileInputFormatCounter" : { + "BYTES_READ" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Read" : 17000878 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 17000878, + "Tuples_Read" : 17000878, + "Read_Duration" : 1314989, + "Write_Duration" : 309234, + "Process_Begin_Time" : 100006209563299, + "Process_End_Time" : 100006211215282 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 8477, + "MAP_OUTPUT_MATERIALIZED_BYTES" : 38583436003, + "MERGED_MAP_OUTPUTS" : 0, + "SPILLED_RECORDS" : 34001756, + "VIRTUAL_MEMORY_BYTES" : 271050731520, + 
"MAP_INPUT_RECORDS" : 17000878, + "LOCALIZED_NANOS" : 72027213744, + "SPLIT_RAW_BYTES" : 68943, + "LOCALIZED_BYTES_MISSED" : 9991291301, + "FAILED_SHUFFLE" : 0, + "MAP_OUTPUT_BYTES" : 132631170234, + "PHYSICAL_MEMORY_BYTES" : 52774502400, + "GC_TIME_MILLIS" : 27984, + "LOCALIZED_FILES_CACHED" : 67300, + "LOCALIZED_BYTES_CACHED" : 61465705144, + "MAP_OUTPUT_RECORDS" : 17000878, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 3344400, + "COMMITTED_HEAP_BYTES" : 54396940288 + } + }, + "reduceCounters" : { + "org.apache.hadoop.mapreduce.FileSystemCounter" : { + "VIEWFS_LARGE_READ_OPS" : 0, + "FILE_BYTES_WRITTEN" : 64462083793, + "FILE_LARGE_READ_OPS" : 0, + "FILE_WRITE_OPS" : 0, + "VIEWFS_BYTES_READ" : 0, + "VIEWFS_READ_OPS" : 0, + "HDFS_READ_OPS" : 40, + "VIEWFS_WRITE_OPS" : 0, + "HDFS_BYTES_READ" : 0, + "HDFS_LARGE_READ_OPS" : 0, + "FILE_READ_OPS" : 0, + "FILE_BYTES_READ" : 64447954989, + "HDFS_WRITE_OPS" : 40, + "VIEWFS_BYTES_WRITTEN" : 0, + "HDFS_BYTES_WRITTEN" : 30847298707 + }, + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormatCounter" : { + "BYTES_WRITTEN" : 0 + }, + "cascading.flow.StepCounters" : { + "Tuples_Written" : 17000878 + }, + "cascading.flow.SliceCounters" : { + "Tuples_Written" : 17000878, + "Tuples_Read" : 17000878, + "Read_Duration" : 1013676, + "Write_Duration" : 2947305, + "Process_Begin_Time" : 14926301195374, + "Process_End_Time" : 14926305226918 + }, + "Shuffle Errors" : { + "CONNECTION" : 0, + "WRONG_LENGTH" : 0, + "BAD_ID" : 0, + "WRONG_MAP" : 0, + "WRONG_REDUCE" : 0, + "IO_ERROR" : 0 + }, + "org.apache.hadoop.mapreduce.TaskCounter" : { + "LOCALIZED_FILES_MISSED" : 136, + "MERGED_MAP_OUTPUTS" : 670, + "REDUCE_INPUT_RECORDS" : 17000878, + "SPILLED_RECORDS" : 28414760, + "VIRTUAL_MEMORY_BYTES" : 41311039488, + "LOCALIZED_NANOS" : 4898727923, + "LOCALIZED_BYTES_MISSED" : 379284135, + "FAILED_SHUFFLE" : 0, + "REDUCE_SHUFFLE_BYTES" : 38583436003, + "PHYSICAL_MEMORY_BYTES" : 4358541312, + "GC_TIME_MILLIS" : 15429, + 
"REDUCE_INPUT_GROUPS" : 10, + "COMBINE_OUTPUT_RECORDS" : 0, + "SHUFFLED_MAPS" : 670, + "LOCALIZED_FILES_CACHED" : 11174, + "REDUCE_OUTPUT_RECORDS" : 17000878, + "LOCALIZED_BYTES_CACHED" : 10285939215, + "COMBINE_INPUT_RECORDS" : 0, + "CPU_MILLISECONDS" : 4673240, + "COMMITTED_HEAP_BYTES" : 5069864960 + } + }, + "tasks" : [ ], + "configuration" : { + "cascading.flow.step.num" : "3" + }, + "submitDate" : 1492629953696, + "launchDate" : 1492629971471, + "finishDate" : 1492630639932, + "runTime" : 668461 + } ] +} ] \ No newline at end of file diff --git a/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6607542.json b/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6607542.json new file mode 100644 index 0000000000..43e7e28c3d --- /dev/null +++ b/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6607542.json @@ -0,0 +1,65 @@ +[ { + "taskType" : "", + "counters" : { + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 751378432, + "GC_TIME_MILLIS" : 310, + "CPU_MILLISECONDS" : 38570, + "COMMITTED_HEAP_BYTES" : 810524672 + } + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 751378432, + "GC_TIME_MILLIS" : 310, + "CPU_MILLISECONDS" : 38570, + "COMMITTED_HEAP_BYTES" : 810524672 + } + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 759648256, + "GC_TIME_MILLIS" : 313, + "CPU_MILLISECONDS" : 38620, + "COMMITTED_HEAP_BYTES" : 810520576 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 449499136, + "GC_TIME_MILLIS" : 444, + "CPU_MILLISECONDS" : 53720, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 449499136, + 
"GC_TIME_MILLIS" : 444, + "CPU_MILLISECONDS" : 53720, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 465207296, + "GC_TIME_MILLIS" : 529, + "CPU_MILLISECONDS" : 57210, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +} ] diff --git a/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6608570.json b/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6608570.json new file mode 100644 index 0000000000..f320d64c08 --- /dev/null +++ b/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6608570.json @@ -0,0 +1,65 @@ +[ { + "taskType" : "", + "counters" : { + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 768618496, + "GC_TIME_MILLIS" : 371, + "CPU_MILLISECONDS" : 45260, + "COMMITTED_HEAP_BYTES" : 814776320 + } + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 768618496, + "GC_TIME_MILLIS" : 371, + "CPU_MILLISECONDS" : 45260, + "COMMITTED_HEAP_BYTES" : 814776320 + } + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 758517760, + "GC_TIME_MILLIS" : 355, + "CPU_MILLISECONDS" : 43950, + "COMMITTED_HEAP_BYTES" : 814280704 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 433074176, + "GC_TIME_MILLIS" : 671, + "CPU_MILLISECONDS" : 74270, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 421924864, + "GC_TIME_MILLIS" : 596, + "CPU_MILLISECONDS" : 64390, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + 
"PHYSICAL_MEMORY_BYTES" : 421924864, + "GC_TIME_MILLIS" : 596, + "CPU_MILLISECONDS" : 64390, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +} ] diff --git a/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6609558.json b/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6609558.json new file mode 100644 index 0000000000..de75bf940b --- /dev/null +++ b/scalding-hraven/src/test/resources/jobResponse_job_1470171371859_6609558.json @@ -0,0 +1,75 @@ +[ { + "taskType" : "", + "counters" : { + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 759197696, + "GC_TIME_MILLIS" : 458, + "CPU_MILLISECONDS" : 74500, + "COMMITTED_HEAP_BYTES" : 812834816 + } + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 759197696, + "GC_TIME_MILLIS" : 458, + "CPU_MILLISECONDS" : 74500, + "COMMITTED_HEAP_BYTES" : 812834816 + } + } +}, { + "taskType" : "MAP", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 760983552, + "GC_TIME_MILLIS" : 370, + "CPU_MILLISECONDS" : 51290, + "COMMITTED_HEAP_BYTES" : 811675648 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 457273344, + "GC_TIME_MILLIS" : 457, + "CPU_MILLISECONDS" : 51310, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 444268544, + "GC_TIME_MILLIS" : 490, + "CPU_MILLISECONDS" : 54550, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + "org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 444268544, + "GC_TIME_MILLIS" : 490, + "CPU_MILLISECONDS" : 54550, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +}, { + "taskType" : "REDUCE", + "counters" : { + 
"org.apache.hadoop.mapreduce.TaskCounter" : { + "PHYSICAL_MEMORY_BYTES" : 442941440, + "GC_TIME_MILLIS" : 520, + "CPU_MILLISECONDS" : 57050, + "COMMITTED_HEAP_BYTES" : 506986496 + } + } +} ] diff --git a/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala b/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala new file mode 100644 index 0000000000..23bdf9ca07 --- /dev/null +++ b/scalding-hraven/src/test/scala/com/twitter/scalding/hraven/estimation/HRavenHistoryServiceTest.scala @@ -0,0 +1,155 @@ +package com.twitter.scalding.hraven.estimation + +import cascading.flow.FlowStep +import com.twitter.hraven.JobDescFactory.RESOURCE_MANAGER_KEY +import com.twitter.hraven.rest.client.HRavenRestClient +import com.twitter.hraven.util.JSONUtil +import com.twitter.hraven.{Flow, TaskDetails} +import com.twitter.scalding.estimation.FlowStrategyInfo +import com.twitter.scalding.hraven.estimation.memory.HRavenMemoryHistoryService +import com.twitter.scalding.hraven.reducer_estimation.HRavenReducerHistoryService +import java.util +import org.apache.hadoop.mapred.JobConf +import org.codehaus.jackson.`type`.TypeReference +import org.mockito.Matchers._ +import org.mockito.Mockito._ +import org.scalatest.{Matchers, WordSpec} +import scala.collection.JavaConverters._ +import scala.util.Try + +class HRavenHistoryServiceTest extends WordSpec with Matchers { + "A HRaven history service" should { + "work as HRaven memory history service" in { + val tasks = List(7, 6, 6) + + val historyService = new HRavenMemoryHistoryService { + override def hRavenClient(conf: JobConf): Try[HRavenRestClient] = + HRavenMockedClient(super.hRavenClient(conf), detailFields, counterFields) + } + + val history = historyService.fetchHistory(TestFlowStrategyInfo.dummy(), HRavenMockedClient.nFetch) + + if (history.isFailure) { + history.get + } else { + history.foreach(_.foreach { step => + tasks should 
contain(step.tasks.size) + + step.tasks.foreach { task => + assert(task.details.nonEmpty) + assert(task.counters.nonEmpty) + } + }) + } + } + + "work as HRaven reducer history service" in { + val tasks = List(7, 6, 6) + + val historyService = new HRavenReducerHistoryService { + override def hRavenClient(conf: JobConf): Try[HRavenRestClient] = + HRavenMockedClient(super.hRavenClient(conf), detailFields, counterFields) + } + + val history = historyService.fetchHistory(TestFlowStrategyInfo.dummy(), HRavenMockedClient.nFetch) + + if (history.isFailure) { + history.get + } else { + history.foreach(_.foreach { step => + tasks should contain(step.tasks.size) + + step.tasks.foreach { task => + assert(task.details.nonEmpty) + assert(task.counters.isEmpty) + } + }) + } + } + } +} + +object TestFlowStrategyInfo { + def dummy(stepNum: Int = 1): FlowStrategyInfo = { + val mockedConf = spy(new JobConf()) + + HRavenMockedClient.configure(mockedConf) + + val mockedStep = mock(classOf[FlowStep[JobConf]]) + val mockedInfo = mock(classOf[FlowStrategyInfo]) + + when(mockedStep.getConfig).thenReturn(mockedConf) + when(mockedStep.getStepNum).thenReturn(stepNum) + when(mockedInfo.step).thenReturn(mockedStep) + + mockedInfo + } +} + +object HRavenMockedClient { + val cluster = "test@cluster" + val user = "testuser" + val batch = "somegoodjob" + val signature = "02CFBD0A94AD5E297C2E4D6665B3B6F0" + val nFetch = 3 + + val jobs = List("job_1470171371859_6609558", "job_1470171371859_6608570", "job_1470171371859_6607542") + + val RequiredJobConfigs = Seq("cascading.flow.step.num") + + def apply( + hRaven: Try[HRavenRestClient], + detailFields: List[String], + counterFields: List[String] + ): Try[HRavenRestClient] = + hRaven.map { hRaven => + val client = spy(hRaven) + + doReturn(HRavenMockedClient.cluster) + .when(client) + .getCluster(anyString()) + + doReturn(flowsResponse) + .when(client) + .fetchFlowsWithConfig(anyString(), anyString(), anyString(), anyString(), anyInt(), anyVararg()) + + 
for (jobId <- jobs) { + val response = jobResponse(jobId) + + doReturn(response) + .when(client) + .fetchTaskDetails(cluster, jobId, detailFields.asJava, counterFields.asJava) + + doReturn(response) + .when(client) + .fetchTaskDetails(cluster, jobId, detailFields.asJava) + } + + client + } + + def configure(conf: JobConf): Unit = { + conf.set(HRavenClient.apiHostnameKey, "test") + conf.set(RESOURCE_MANAGER_KEY, "test.com:5053") + conf.set("hraven.history.user.name", HRavenMockedClient.user) + conf.set("batch.desc", HRavenMockedClient.batch) + conf.set("scalding.flow.class.signature", HRavenMockedClient.signature) + conf.set("hraven.estimator.max.flow.histor", HRavenMockedClient.nFetch.toString) + } + + def flowsResponse: util.List[Flow] = + JSONUtil + .readJson( + getClass.getResourceAsStream("../../../../../flowResponse.json"), + new TypeReference[util.List[Flow]] {} + ) + .asInstanceOf[util.List[Flow]] + + def jobResponse(jobId: String): util.List[TaskDetails] = + JSONUtil + .readJson( + getClass.getResourceAsStream(s"../../../../../jobResponse_$jobId.json"), + new TypeReference[util.List[TaskDetails]] {} + ) + .asInstanceOf[util.List[TaskDetails]] +} diff --git a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala b/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala deleted file mode 100644 index f0cbf98ad7..0000000000 --- a/scalding-jdbc/src/main/scala/com/twitter/scalding/jdbc/JDBCSource.scala +++ /dev/null @@ -1,175 +0,0 @@ -/* -Copyright 2012 Twitter, Inc. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package com.twitter.scalding.jdbc - -import com.twitter.scalding.{ AccessMode, Hdfs, Mode, Source, TestTapFactory } -import cascading.jdbc.JDBCScheme -import cascading.jdbc.JDBCTap -import cascading.jdbc.TableDesc -import cascading.scheme.Scheme -import cascading.tap.Tap -import cascading.tuple.Fields - -/** - * Extend this source to let scalding read from or write to a database. - * In order for this to work you need to specify the table name, column definitions and DB credentials. - * If you write to a DB, the fields in the final pipe have to correspond to the column names in the DB table. - * Example usage: - * case object YourTableSource extends JDBCSource { - * override val tableName = "tableName" - * override val columns = List( - * varchar("col1", 64), - * date("col2"), - * tinyint("col3"), - * double("col4"), - * ) - * override def currentConfig = ConnectionSpec("www.github.com", "username", "password", "mysql") - * } - * - * @author Argyris Zymnis - * @author Oscar Boykin - * @author Kevin Lin - */ -abstract class JDBCSource extends Source { - - // Override the following three members when you extend this class - val tableName : String - val columns : Iterable[ColumnDefinition] - protected def currentConfig : ConnectionSpec - - // Must be a subset of column names. - // If updateBy column names are given, a SQL UPDATE statement will be generated - // if the values in those columns for the given Tuple are all not {@code null}. - // Otherwise an INSERT statement will be generated. - val updateBy : Iterable[String] = Nil - - // The body of a WHERE clause. If present will filter the full table by this condition. - val filterCondition: Option[String] = None - - // Override this if your table is really large - def maxConcurrentReads = 1 - - // How many rows to insert/update into this table in a batch? 
- def batchSize = 1000 - - protected def driverFor(adapter: String): String = - Map("mysql" -> "com.mysql.jdbc.Driver", - "hsqldb" -> "org.hsqldb.jdbcDriver") - .apply(adapter) - - def fields : Fields = new Fields(columnNames.toSeq :_*) - - protected def columnNames : Array[String] = columns.map{ _.name }.toArray - protected def columnDefinitions : Array[String] = columns.map{ _.definition }.toArray - protected def tableDesc = new TableDesc(tableName, columnNames, columnDefinitions, null, null) - - protected def nullStr(nullable : Boolean) = if(nullable) "NULL" else "NOT NULL" - - protected def mkColumnDef(name : String, typeName : String, - nullable : Boolean, sizeOp : Option[Int] = None, defOp : Option[String]) = { - val sizeStr = sizeOp.map { "(" + _.toString + ")" }.getOrElse("") - val defStr = defOp.map { " DEFAULT '" + _.toString + "' " }.getOrElse(" ") - ColumnDefinition(name, typeName + sizeStr + defStr + nullStr(nullable)) - } - - // Some helper methods that we can use to generate column definitions - protected def bigint(name : String, size : Int = 20, nullable : Boolean = false) = { - mkColumnDef(name, "BIGINT", nullable, Some(size), None) - } - - protected def int(name : String, size : Int = 11, defaultValue : Int = 0, nullable : Boolean = false) = { - mkColumnDef(name, "INT", nullable, Some(size), Some(defaultValue.toString)) - } - - protected def smallint(name : String, size : Int = 6, defaultValue : Int = 0, nullable : Boolean = false) = { - mkColumnDef(name, "SMALLINT", nullable, Some(size), Some(defaultValue.toString)) - } - - // NOTE: tinyint(1) actually gets converted to a java Boolean - protected def tinyint(name : String, size : Int = 8, nullable : Boolean = false) = { - mkColumnDef(name, "TINYINT", nullable, Some(size), None) - } - - protected def varchar(name : String, size : Int = 255, nullable : Boolean = false) = { - mkColumnDef(name, "VARCHAR", nullable, Some(size), None) - } - - protected def date(name : String, nullable : Boolean = 
false) = { - mkColumnDef(name, "DATE", nullable, None, None) - } - - protected def datetime(name : String, nullable : Boolean = false) = { - mkColumnDef(name, "DATETIME", nullable, None, None) - } - - protected def text(name : String, nullable : Boolean = false) = { - mkColumnDef(name, "TEXT", nullable, None, None) - } - - protected def double(name : String, nullable : Boolean = false) = { - mkColumnDef(name, "DOUBLE", nullable, None, None) - } - - protected def column(name : String, definition : String) = ColumnDefinition(name, definition) - - protected def createJDBCTap = { - try { - val ConnectionSpec(url, uName, passwd, adapter) = currentConfig - val tap = new JDBCTap(url, uName, passwd, driverFor(adapter), tableDesc, getJDBCScheme) - tap.setConcurrentReads(maxConcurrentReads) - tap.setBatchSize(batchSize) - tap - } catch { - case e: NullPointerException => { - sys.error("Could not find DB credential information.") - } - } - } - - protected def getJDBCScheme = new JDBCScheme( - null, // inputFormatClass - null, // outputFormatClass - columnNames.toArray, - null, // orderBy - filterCondition.getOrElse(null), - updateBy.toArray - ) - - override def createTap(readOrWrite : AccessMode)(implicit mode : Mode) : Tap[_,_,_] = { - mode match { - case Hdfs(_,_) => createJDBCTap.asInstanceOf[Tap[_,_,_]] - // TODO: support Local mode here, and better testing. - case _ => TestTapFactory(this, fields).createTap(readOrWrite) - } - } - - // Generate SQL statement to create the DB table if not existing. - def toSqlCreateString : String = { - def addBackTicks(str : String) = "`" + str + "`" - val allCols = columns - .map { cd => addBackTicks(cd.name) + " " + cd.definition } - .mkString(",\n") - - "CREATE TABLE " + addBackTicks(tableName) + " (\n" + allCols + ",\n PRIMARY KEY HERE!!!!" 
- } -} - -case class ColumnDefinition(name : String, definition : String) -/** -* Pass your DB credentials to this class in a preferred secure way -*/ -case class ConnectionSpec(connectUrl : String, userName : String, password : String, adapter : String) diff --git a/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala b/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala index f495b29322..cf05472bc0 100644 --- a/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala +++ b/scalding-json/src/main/scala/com/twitter/scalding/JsonLine.scala @@ -12,58 +12,69 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding import java.io.Serializable -import java.lang.reflect.{Type, ParameterizedType} +import java.lang.reflect.{ParameterizedType, Type} import cascading.pipe.Pipe import cascading.tap.SinkMode -import cascading.tuple.{Tuple, TupleEntry, Fields} +import cascading.tuple.{Fields, Tuple, TupleEntry} import com.fasterxml.jackson.core.`type`.TypeReference import com.fasterxml.jackson.module.scala._ import com.fasterxml.jackson.databind.ObjectMapper /** -* This Source writes out the TupleEntry as a simple JSON object, using the field -* names as keys and the string representation of the values. -* -* TODO: it would be nice to have a way to add read/write transformations to pipes -* that doesn't require extending the sources and overriding methods. -*/ -case class JsonLine(p: String, fields: Fields = Fields.ALL, - override val sinkMode: SinkMode = SinkMode.REPLACE) - extends FixedPathSource(p) with TextLineScheme { + * This Source writes out the TupleEntry as a simple JSON object, using the field names as keys and the string + * representation of the values. 
+ * + * TODO: it would be nice to have a way to add read/write transformations to pipes that doesn't require + * extending the sources and overriding methods. + * + * @param failOnEmptyLines + * When set to false, it just skips empty lines instead of failing the jobs. Defaults to true for backwards + * compatibility. + */ +case class JsonLine( + p: String, + fields: Fields = Fields.ALL, + override val sinkMode: SinkMode = SinkMode.REPLACE, + override val transformInTest: Boolean = false, + failOnEmptyLines: Boolean = true +) extends FixedPathSource(p) + with TextLineScheme { import Dsl._ import JsonLine._ - override def transformForWrite(pipe : Pipe) = pipe.mapTo(fields -> 'json) { - t: TupleEntry => mapper.writeValueAsString(TupleConverter.ToMap(t)) + override def transformForWrite(pipe: Pipe) = pipe.mapTo(fields -> 'json) { t: TupleEntry => + mapper.writeValueAsString(TupleConverter.ToMap(t)) } - override def transformForRead(pipe : Pipe) = pipe.mapTo('line -> fields) { + override def transformForRead(pipe: Pipe) = { @scala.annotation.tailrec - def nestedRetrieval(node:Option[Map[String, AnyRef]], path:List[String]):AnyRef = { + def nestedRetrieval(node: Option[Map[String, AnyRef]], path: List[String]): AnyRef = (path, node) match { - case (_, None) => null - case (h::Nil, Some(fs)) => fs.get(h).orNull - case (h::tail, Some(fs)) => fs.get(h).orNull match { - case fs:Map[String,AnyRef] => nestedRetrieval(Option(fs), tail) - case _ => null - } + case (_, None) => null + case (h :: Nil, Some(fs)) => fs.get(h).orNull + case (h :: tail, Some(fs)) => + fs.get(h).orNull match { + case fs: Map[String @unchecked, AnyRef @unchecked] => nestedRetrieval(Option(fs), tail) + case _ => null + } case (Nil, _) => null } - } - val splitFields = (0 until fields.size).map { i:Int => fields.get(i).toString.split('.').toList } + val splitFields = (0 until fields.size).map { i: Int => fields.get(i).toString.split('.').toList } - line : String => - val fs: Map[String, AnyRef] = 
mapper.readValue(line, mapTypeReference) - val values = splitFields.map { nestedRetrieval(Option(fs), _) } - new cascading.tuple.Tuple(values : _*) + pipe.collectTo[String, Tuple]('line -> fields) { + case line: String if failOnEmptyLines || line.trim.nonEmpty => + val fs: Map[String, AnyRef] = mapper.readValue(line, mapTypeReference) + val values = splitFields.map(nestedRetrieval(Option(fs), _)) + new cascading.tuple.Tuple(values: _*) + } } override def toString = "JsonLine(" + p + ", " + fields.toString + ")" @@ -73,24 +84,27 @@ case class JsonLine(p: String, fields: Fields = Fields.ALL, * TODO: at the next binary incompatible version remove the AbstractFunction2/scala.Serializable jank which * was added to get mima to not report binary errors */ -object JsonLine extends scala.runtime.AbstractFunction3[String,Fields,SinkMode,JsonLine] with Serializable with scala.Serializable { +object JsonLine + extends scala.runtime.AbstractFunction5[String, Fields, SinkMode, Boolean, Boolean, JsonLine] + with Serializable + with scala.Serializable { val mapTypeReference = typeReference[Map[String, AnyRef]] - private [this] def typeReference[T: Manifest] = new TypeReference[T] { + private[this] def typeReference[T: Manifest] = new TypeReference[T] { override def getType = typeFromManifest(manifest[T]) } - private [this] def typeFromManifest(m: Manifest[_]): Type = { - if (m.typeArguments.isEmpty) { m.erasure } - else new ParameterizedType { - def getRawType = m.erasure + private[this] def typeFromManifest(m: Manifest[_]): Type = + if (m.typeArguments.isEmpty) { m.runtimeClass } + else + new ParameterizedType { + def getRawType = m.runtimeClass - def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray + def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray - def getOwnerType = null - } - } + def getOwnerType = null + } val mapper = new ObjectMapper() mapper.registerModule(DefaultScalaModule) diff --git 
a/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala b/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala new file mode 100644 index 0000000000..546cc0e527 --- /dev/null +++ b/scalding-json/src/main/scala/com/twitter/scalding/TypedJson.scala @@ -0,0 +1,74 @@ +package com.twitter.scalding + +import com.twitter.bijection.{AbstractInjection, Injection} +import com.twitter.bijection.Inversion._ +import com.twitter.elephantbird.cascading2.scheme.LzoTextLine + +import org.json4s._ +import org.json4s.native.Serialization._ +import org.json4s.{native, NoTypeHints} + +import scala.collection.JavaConverters._ +import scala.util.Try + +import cascading.pipe.Pipe + +/** + * This type uses the structural type of a case class, but not it's name, to describe the Json using json4s. + * This is intended to be used for intermediate output from a REPL session. The intended use is to save adhoc + * data between sessions. The fully qualified class name of classes defined in a REPL is not stable between + * REPL sessions. + * + * We believe using a fixed schema, such as thrift or Avro is a much safer way to do long term productionized + * data pipelines to minimize risks of incompatible changes to schema that render old data unreadable. 
+ */ + +object TypedJson { + private implicit val formats = native.Serialization.formats(NoTypeHints) + private def caseClass2Json[A <: AnyRef](implicit tt: Manifest[A], fmt: Formats): Injection[A, String] = + new AbstractInjection[A, String] { + override def apply(a: A): String = write(a) + + override def invert(b: String): Try[A] = attempt(b)(read[A]) + } + + def apply[T <: AnyRef: Manifest](p: String) = new TypedJson(p) +} + +class TypedJson[T <: AnyRef: Manifest](p: String) + extends FixedPathSource(p) + with TextSourceScheme + with SingleMappable[T] + with TypedSink[T] { + import Dsl._ + import TypedJson._ + + private[this] val fieldSym = 'jsonString + + @transient private[this] lazy val inj = caseClass2Json[T] + + override def transformForWrite(pipe: Pipe) = + pipe.mapTo(0 -> fieldSym)(inj.apply(_: T)) + + override def transformForRead(pipe: Pipe) = + pipe.mapTo('line -> fieldSym)((jsonStr: String) => inj.invert(jsonStr).get) + + override def setter[U <: T] = TupleSetter.asSubSetter[T, U](TupleSetter.singleSetter[T]) + + override def toIterator(implicit config: Config, mode: Mode): Iterator[T] = { + val tap = createTap(Read)(mode) + CascadingMode + .cast(mode) + .openForRead(config, tap) + .asScala + .map { te => + inj.invert(te.selectTuple('line).getObject(0).asInstanceOf[String]).get + } + } +} + +case class TypedJsonLzo[T <: AnyRef: Manifest](p: String) extends TypedJson[T](p) { + override def hdfsScheme = HadoopSchemeInstance( + new LzoTextLine().asInstanceOf[cascading.scheme.Scheme[_, _, _, _, _]] + ) +} diff --git a/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala b/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala index 019b413950..212a39b380 100644 --- a/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala +++ b/scalding-json/src/test/scala/com/twitter/scalding/JsonLineTest.scala @@ -12,67 +12,83 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ + */ package com.twitter.scalding.json -import org.specs._ -import com.twitter.scalding.{JsonLine => StandardJsonLine, _} - -import cascading.tuple.Fields +import cascading.flow.FlowException import cascading.tap.SinkMode +import cascading.tuple.Fields +import com.twitter.scalding.{JsonLine => StandardJsonLine, _} +import org.scalatest.WordSpec object JsonLine { - def apply(p: String, fields: Fields = Fields.ALL) = new JsonLine(p, fields) -} -class JsonLine(p: String, fields: Fields) extends StandardJsonLine(p, fields, SinkMode.REPLACE) { - // We want to test the actual tranformation here. - override val transformInTest = true + def apply(p: String, fields: Fields = Fields.ALL, failOnEmptyLines: Boolean = true) = + new JsonLine(p, fields, failOnEmptyLines) } -class JsonLineJob(args : Args) extends Job(args) { +class JsonLine(p: String, fields: Fields, failOnEmptyLines: Boolean) + extends StandardJsonLine( + p, + fields, + SinkMode.REPLACE, + // We want to test the actual transformation here. 
+ transformInTest = true, + failOnEmptyLines = failOnEmptyLines + ) + +class JsonLineJob(args: Args) extends Job(args) { try { Tsv("input0", ('query, 'queryStats)).read.write(JsonLine("output0")) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } -class JsonLineRestrictedFieldsJob(args : Args) extends Job(args) { +class JsonLineRestrictedFieldsJob(args: Args) extends Job(args) { try { Tsv("input0", ('query, 'queryStats)).read.write(JsonLine("output0", Tuple1('query))) } catch { - case e : Exception => e.printStackTrace() + case e: Exception => e.printStackTrace() } } -class JsonLineInputJob(args : Args) extends Job(args) { +class JsonLineInputJob(args: Args) extends Job(args) { try { JsonLine("input0", ('foo, 'bar)).read .project('foo, 'bar) .write(Tsv("output0")) } catch { - case e : Exception => e.printStackTrace + case e: Exception => e.printStackTrace } } -class JsonLineNestedInputJob(args : Args) extends Job(args) { +class JsonLineInputJobSkipEmptyLines(args: Args) extends Job(args) { try { - JsonLine("input0", (Symbol("foo.too"), 'bar)).read - .rename((Symbol("foo.too") -> ('foo))) + JsonLine("input0", ('foo, 'bar), failOnEmptyLines = false).read .project('foo, 'bar) .write(Tsv("output0")) } catch { - case e : Exception => e.printStackTrace + case e: Exception => e.printStackTrace } } +class JsonLineNestedInputJob(args: Args) extends Job(args) { + try { + JsonLine("input0", (Symbol("foo.too"), 'bar)).read + .rename((Symbol("foo.too") -> 'foo)) + .project('foo, 'bar) + .write(Tsv("output0")) + + } catch { + case e: Exception => e.printStackTrace + } +} -class JsonLineTest extends Specification { - noDetailedDiffs() - import Dsl._ +class JsonLineTest extends WordSpec { + import com.twitter.scalding.Dsl._ "A JsonLine sink" should { JobTest(new JsonLineJob(_)) @@ -80,60 +96,80 @@ class JsonLineTest extends Specification { .sink[String](JsonLine("output0")) { buf => val json = buf.head "not stringify lists or 
numbers and not escape single quotes" in { - json must be_==("""{"query":"doctor's mask","queryStats":[42.1,17.1]}""") + assert(json === """{"query":"doctor's mask","queryStats":[42.1,17.1]}""") } } .run - .finish + .finish() JobTest(new JsonLineRestrictedFieldsJob(_)) .source(Tsv("input0", ('query, 'queryStats)), List(("doctor's mask", List(42.1f, 17.1f)))) .sink[String](JsonLine("output0", Tuple1('query))) { buf => val json = buf.head "only sink requested fields" in { - json must be_==("""{"query":"doctor's mask"}""") + assert(json === """{"query":"doctor's mask"}""") } } .run - .finish + .finish() val json = """{"foo": 3, "bar": "baz"}\n""" JobTest(new JsonLineInputJob(_)) .source(JsonLine("input0", ('foo, 'bar)), List((0, json))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "read json line input" in { - outBuf.toList must be_==(List((3, "baz"))) - } + .sink[(Int, String)](Tsv("output0")) { outBuf => + "read json line input" in { + assert(outBuf.toList === List((3, "baz"))) + } } .run - .finish + .finish() val json2 = """{"foo": 7 }\n""" JobTest(new JsonLineInputJob(_)) .source(JsonLine("input0", ('foo, 'bar)), List((0, json), (1, json2))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "handle missing fields" in { - outBuf.toList must be_==(List((3, "baz"), (7, null))) - } + .sink[(Int, String)](Tsv("output0")) { outBuf => + "handle missing fields" in { + assert(outBuf.toList === List((3, "baz"), (7, null))) + } } .run - .finish + .finish() val json3 = """{"foo": {"too": 9}}\n""" JobTest(new JsonLineNestedInputJob(_)) .source(JsonLine("input0", (Symbol("foo.too"), 'bar)), List((0, json), (1, json3))) - .sink[(Int, String)](Tsv("output0")) { - outBuf => - "handle nested fields" in { - outBuf.toList must be_==(List((0, "baz"), (9, null))) + .sink[(Int, String)](Tsv("output0")) { outBuf => + "handle nested fields" in { + assert(outBuf.toList === List((0, "baz"), (9, null))) + } + } + .run + .finish() + + "fail on empty lines by default" in { + 
intercept[FlowException] { + JobTest(new JsonLineInputJob(_)) + .source(JsonLine("input0", ('foo, 'bar)), List((0, json), (1, json2), (2, ""), (3, " "))) + .sink[(Int, String)](Tsv("output0")) { outBuf => + outBuf.toList + } + .run + .finish() + } + } + + JobTest(new JsonLineInputJobSkipEmptyLines(_)) + .source(JsonLine("input0", ('foo, 'bar)), List((0, json), (1, json2), (2, ""), (3, " "))) + .sink[(Int, String)](Tsv("output0")) { outBuf => + "handle empty lines when `failOnEmptyLines` is set to false" in { + assert(outBuf.toList.size === 2) + } } .run - .finish + .finish() } - } +} diff --git a/scalding-parquet-fixtures/src/test/resources/test.thrift b/scalding-parquet-fixtures/src/test/resources/test.thrift new file mode 100644 index 0000000000..e96749a05d --- /dev/null +++ b/scalding-parquet-fixtures/src/test/resources/test.thrift @@ -0,0 +1,12 @@ +namespace java com.twitter.scalding.parquet.thrift_java.test +#@namespace scala com.twitter.scalding.parquet.thrift_scala.test + +struct Name { + 1: required string first_name, + 2: optional string last_name +} + +struct Address { + 1: string street, + 2: required string zip +} diff --git a/scalding-parquet-scrooge-fixtures/src/test/resources/binary.thrift b/scalding-parquet-scrooge-fixtures/src/test/resources/binary.thrift new file mode 100644 index 0000000000..2f1efb8767 --- /dev/null +++ b/scalding-parquet-scrooge-fixtures/src/test/resources/binary.thrift @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +namespace java com.twitter.scalding.parquet.scrooge.thrift_java.test.binary +#@namespace scala com.twitter.scalding.parquet.scrooge.thrift_scala.test.binary + +struct StringAndBinary { + 1: required string s; + 2: required binary b; +} diff --git a/scalding-parquet-scrooge-fixtures/src/test/resources/compat.thrift b/scalding-parquet-scrooge-fixtures/src/test/resources/compat.thrift new file mode 100644 index 0000000000..2a04fd3792 --- /dev/null +++ b/scalding-parquet-scrooge-fixtures/src/test/resources/compat.thrift @@ -0,0 +1,265 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +namespace java com.twitter.scalding.parquet.scrooge.thrift_java.test.compat +#@namespace scala com.twitter.scalding.parquet.scrooge.thrift_scala.test.compat + +struct StructV1 { + 1: required string name +} +struct StructV2 { + 1: required string name, + 2: optional string age +} +struct StructV3 { + 1: required string name, + 2: optional string age, + 3: optional string gender +} + +struct StructV4WithExtracStructField { + 1: required string name, + 2: optional string age, + 3: optional string gender, + 4: optional StructV3 addedStruct +} + +struct RenameStructV1 { + 1: required string nameChanged +} + +enum NumberEnum { + ONE = 1, + TWO = 2, + THREE = 3 +} + +enum NumberEnumWithMoreValue { + ONE = 1, + TWO = 2, + THREE = 3, + FOUR = 4 +} + +struct StructWithEnum { + 1: required NumberEnum num +} + +struct StructWithMoreEnum { + 1: required NumberEnumWithMoreValue num +} + +struct TypeChangeStructV1{ + 1: required i16 name +} + +struct OptionalStructV1{ + 1: optional string name +} + +struct DefaultStructV1{ + 1: string name +} + +struct AddRequiredStructV1{ + 1: required string name, + 2: required string anotherName +} + +struct MapStructV1{ + 1: required map map1 +} + +struct MapValueStructV1{ + 1: required map map1 +} + +struct MapStructV2{ + 1: required map map1 +} + +struct MapValueStructV2{ + 1: required map map1 +} + +struct MapAddRequiredStructV1{ + 1: required map map1 +} + +struct MapWithStructValue { + 1: required map reqMap +} + +struct MapWithPrimMapValue { + 1: required map> reqMap +} + +struct MapWithStructMapValue { + 1: required map> reqMap +} + +struct SetStructV1{ + 1: required set set1 +} + +struct SetStructV2{ + 1: required set set1 +} + +struct ListStructV1{ + 1: required list list1 +} + +struct ListStructV2{ + 1: required list list1 +} + +struct AString { + 1: required string s +} + +struct ALong { + 1: required i64 l +} + +struct ABool { + 1: required bool b +} + +union UnionV1 { + 1: AString aString, + 2: ALong aLong +} + +union 
UnionV2 { + 1: AString aString, + 2: ALong aLong, + 3: ABool aNewBool +} + +struct StructWithUnionV1 { + 1: required string name, + 2: required UnionV1 aUnion +} + +struct StructWithUnionV2 { + 1: required string name, + 2: required UnionV2 aUnion +} + +struct AStructThatLooksLikeUnionV2 { + 1: optional AString aString, + 2: optional ALong aLong, + 3: optional ABool aNewBool +} + +struct StructWithAStructThatLooksLikeUnionV2 { + 1: required string name, + 2: required AStructThatLooksLikeUnionV2 aNotQuiteUnion +} + +union UnionOfStructs { + 1: StructV3 structV3, + 2: StructV4WithExtracStructField structV4, + 3: ABool aNewBool +} + +struct StructWithUnionOfStructs { + 1: required string name, + 2: required UnionOfStructs aUnion +} + +struct StructWithOptionalUnionOfStructs { + 1: required string name, + 2: optional UnionOfStructs aUnion +} + +struct StructWithRequiredUnionOfStructs { + 1: required string name, + 2: required UnionOfStructs aUnion +} + +struct OptionalInsideRequired { + 1: required string name, + 2: required StructWithOptionalUnionOfStructs aStruct +} + +struct RequiredInsideOptional { + 1: required string name, + 2: optional StructWithRequiredUnionOfStructs aStruct +} + +union UnionStructUnion { + 1: StructV3 structV3 + 2: StructWithUnionOfStructs structWithUnionOfStructs + 3: ALong aLong +} + +union NestedUnion { + 1: StructV3 structV3 + 2: UnionOfStructs unionOfStructs + 3: ALong aLong +} + +union NestedNestedUnion { + 1: NestedUnion nestedUnion + 2: UnionV2 unionV2 +} + +struct StructWithNestedUnion { + 1: optional UnionOfStructs optUnionOfStructs + 2: required UnionOfStructs reqUnionOfStructs + 3: UnionOfStructs unspecifiedUnionOfStructs + + 4: optional NestedUnion optNestedUnion + 5: required NestedUnion reqNestedUnion + 6: NestedUnion unspecifiedNestedUnion + + 7: optional StructWithUnionV2 optStructWithUnionV2 + 8: required StructWithUnionV2 reqStructWithUnionV2 + 9: StructWithUnionV2 unspecifiedStructWithUnionV2 + + 10: optional 
UnionStructUnion optUnionStructUnion + 11: required UnionStructUnion reqUnionStructUnion + 12: UnionStructUnion unspecifiedUnionStructUnion +} + +struct MapWithUnionKey { + 1: optional map optMapWithUnionKey + 2: required map reqMapWithUnionKey +} + +struct MapWithUnionValue { + 1: optional map optMapWithUnionValue + 2: required map reqMapWithUnionValue +} + +struct ListOfUnions { + 1: optional list optListUnion + 2: required list reqListUnion +} + +struct EmptyStruct { + +} + +struct NestedEmptyStruct { + 1: required EmptyStruct required_empty + 2: optional EmptyStruct optional_empty +} diff --git a/scalding-parquet-scrooge-fixtures/src/test/resources/test.thrift b/scalding-parquet-scrooge-fixtures/src/test/resources/test.thrift new file mode 100644 index 0000000000..d76cd848b1 --- /dev/null +++ b/scalding-parquet-scrooge-fixtures/src/test/resources/test.thrift @@ -0,0 +1,319 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +namespace java com.twitter.scalding.parquet.scrooge.thrift_java.test +#@namespace scala com.twitter.scalding.parquet.scrooge.thrift_scala.test + +struct TestListsInMap { + 1: string name, + 2: map,list> names, +} + +struct Name { + 1: required string first_name, + 2: optional string last_name +} + +struct Address { + 1: string street, + 2: required string zip +} + +struct AddressWithStreetWithDefaultRequirement { + 1: string street, + 2: required string zip +} + +struct Phone { + 1: string mobile + 2: string work +} + +struct TestPerson { + 1: required Name name, + 2: optional i32 age, + 3: Address address, + 4: string info +} + + +struct RequiredMapFixture { + 1: optional string name, + 2: required map mavalue +} + +struct RequiredListFixture { + 1: optional string info, + 2: required list names +} + +struct RequiredSetFixture { + 1: optional string info, + 2: required set names +} + +struct RequiredPrimitiveFixture { + 1: required bool test_bool, + 2: required byte test_byte, + 3: required i16 test_i16, + 4: required i32 test_i32, + 5: required i64 test_i64, + 6: required double test_double, + 7: required string test_string, + 8: optional string info_string +} + + +struct StructWithReorderedOptionalFields { + 3: optional i32 fieldThree, + 2: optional i32 fieldTwo, + 1: optional i32 fieldOne, +} + +struct StructWithIndexStartsFrom4 { + 6: required Phone phone +} + +struct StructWithExtraField { + 3: required Phone extraPhone, + 6: required Phone phone +} + + +struct TestPersonWithRequiredPhone { + 1: required Name name, + 2: optional i32 age, + 3: required Address address, + 4: optional string info, + 5: required Phone phone +} + +struct TestPersonWithAllInformation { + 1: required Name name, + 2: optional i32 age, + 3: required Address address, + 4: optional Address working_address, + 5: optional string info, + 6: required map phone_map, + 7: optional set interests, + 8: optional list key_words +} + +struct TestMapComplex{ + 1: required map 
phone_address_map +} + +struct TestMapBinary{ + 1: required map string_binary_map +} + +struct TestMapPrimitiveKey { + 1: required map short_map, + 2: required map int_map, + 3: required map byt_map, + 4: required map bool_map, + 5: required map long_map, + 6: required map double_map, + 7: required map string_map; +} + +struct TestOptionalMap { + 1: optional map short_map, + 2: optional map int_map, + 3: optional map byt_map, + 4: optional map bool_map, + 5: optional map long_map, + 6: optional map double_map, + 7: optional map string_map +} + +struct TestListPrimitive { + 1: required list short_list, + 2: required list int_list, + 3: required list long_list, + 4: required list byte_list, + 5: required list string_list, + 6: required list bool_list, + 7: required list double_list, +} + +struct TestSetPrimitive { + 1: required set short_list, + 2: required set int_list, + 3: required set long_list, + 4: required set byte_list, + 5: required set string_list, + 6: required set bool_list, + 7: required set double_list +} + +struct TestMapPrimitiveValue { + 1: required map short_map, + 2: required map int_map, + 3: required map byt_map, + 4: required map bool_map, + 5: required map long_map, + 6: required map double_map, + 7: required map string_map +} + +union TestUnion { + 1: TestPerson first_person + 2: TestMapComplex second_map +} + +enum Operation { + ADD = 1, + SUBTRACT = 2, + MULTIPLY = 3, + DIVIDE = 4 +} + +struct TestFieldOfEnum{ + 1: required Operation op + 2: optional Operation op2 +} + +struct StringAndBinary { + 1: required string s + 2: required binary b +} + +#fixture fox nested structures +struct NestedList { + 1: required list> rll + 2: required list>> rlll + 3: optional list> oll + 4: optional list>> olll + 5: list> ll + 6: list>> lll +} + +struct ListNestMap { + 1: required list> rlm + 2: required list>> rllm + 3: optional list> olm + 4: optional list>> ollm + 5: list> lm + 6: list>> llm +} + +struct ListNestSet { + 1: required list> rls + 2: required 
list>> rlls + 3: optional list> ols + 4: optional list>> olls + 5: list> ls + 6: list>> lls +} + +struct ListNestEnum { + 1: required list rle +} + +struct MapNestMap { + 1: required map, map> rmm + 2: required map, Address>, map> rmmm + 3: optional map, map> omm + 4: optional map, Address>, map> ommm + 5: map, map> mm + 6: map, Address>, map> mmm +} + +struct MapNestList { + 1: required map, list
> rml + 2: required map>, list>> rmll + 3: optional map, list
> oml + 4: optional map>, list>> omll + 5: map, list
> ml + 6: map>, list>> mll +} + +struct MapNestSet { + 1: required map, set
> rms + 2: required map>, set>> rmss + 3: optional map, set
> oms + 4: optional map>, set>> omss + 5: map, set
> ms + 6: map>, set>> mss +} + +struct SetNestSet { + 1: required set> rss + 2: required set>> rsss + 3: optional set> oss + 4: optional set>> osss + 5: set> ss + 6: set>> sss +} + +struct SetNestList { + 1: required set> rsl + 2: required set>> rssl + 3: optional set> osl + 4: optional set>> ossl + 5: set> sl + 6: set>> ssl +} + +struct SetNestMap { + 1: required set> rsm + 2: required set>> rssm + 3: required set>>>> rssllm + 4: optional set> osm + 5: optional set>> ossm + 6: optional set>>>> ossllm + 7: set> sm + 8: set>> ssm + 9: set>>>> ssllm +} + +struct AString { + 1: required string s +} + +struct ALong { + 1: required i64 l +} + +struct ABool { + 1: required bool b +} + +union UnionV2 { + 1: AString aString, + 2: ALong aLong, + 3: ABool aNewBool +} + +struct StructWithUnionV2 { + 1: required string name, + 2: required UnionV2 aUnion +} + +struct AStructThatLooksLikeUnionV2 { + 1: optional AString aString, + 2: optional ALong aLong, + 3: optional ABool aNewBool +} + +struct StructWithAStructThatLooksLikeUnionV2 { + 1: required string name, + 2: required AStructThatLooksLikeUnionV2 aNotQuiteUnion +} diff --git a/scalding-parquet-scrooge/README.md b/scalding-parquet-scrooge/README.md new file mode 100644 index 0000000000..1dc0007f6c --- /dev/null +++ b/scalding-parquet-scrooge/README.md @@ -0,0 +1,3 @@ +# Parquet-Scrooge support for Scalding + +This module has sources for reading scrooge-generated thrift structs. See the scalding-parquet module for reading apache-thrift (TBase) generated thrift structs. 
diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeInputFormat.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeInputFormat.java new file mode 100644 index 0000000000..e58a8f731b --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeInputFormat.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package com.twitter.scalding.parquet.scrooge; + +import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat; + +/** + * Use this class to read Scrooge records from parquet file + * @param Type of Scrooge records to read + */ +public class ParquetScroogeInputFormat extends ParquetThriftInputFormat { + public ParquetScroogeInputFormat() { + super(ScroogeReadSupport.class); + } +} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeOutputFormat.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeOutputFormat.java new file mode 100644 index 0000000000..daa2f69545 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeOutputFormat.java @@ -0,0 +1,39 @@ +/** + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.twitter.scalding.parquet.scrooge; + +import com.twitter.scrooge.ThriftStruct; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.hadoop.ParquetOutputFormat; + +/** + * Use this class to write Scrooge records to parquet + * @param Type of Scrooge records to write + */ +public class ParquetScroogeOutputFormat extends ParquetOutputFormat { + + public static void setScroogeClass(Configuration configuration, Class thriftClass) { + ScroogeWriteSupport.setScroogeClass(configuration, thriftClass); + } + + public static Class getScroogeClass(Configuration configuration) { + return ScroogeWriteSupport.getScroogeClass(configuration); + } + + public ParquetScroogeOutputFormat() { + super(new ScroogeWriteSupport()); + } +} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeScheme.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeScheme.java new file mode 100644 index 0000000000..3b41696ae6 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeScheme.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package com.twitter.scalding.parquet.scrooge; + +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.RecordReader; + +import com.twitter.scalding.parquet.ParquetValueScheme; +import com.twitter.scalding.parquet.ScaldingDeprecatedParquetInputFormat; +import com.twitter.scrooge.ThriftStruct; + +import cascading.flow.FlowProcess; +import cascading.tap.Tap; +import org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.parquet.hadoop.ParquetOutputFormat; +import org.apache.parquet.hadoop.mapred.DeprecatedParquetOutputFormat; +import org.apache.parquet.hadoop.thrift.ThriftReadSupport; + +public class ParquetScroogeScheme extends ParquetValueScheme { + + private static final long serialVersionUID = -8332274507341448397L; + + public ParquetScroogeScheme(Class klass) { + this(new ParquetValueScheme.Config().withRecordClass(klass)); + } + + public ParquetScroogeScheme(FilterPredicate filterPredicate, Class klass) { + this(new ParquetValueScheme.Config().withFilterPredicate(filterPredicate).withRecordClass(klass)); + } + + public ParquetScroogeScheme(ParquetValueScheme.Config config) { + super(config); + } + + @Override + public void sinkConfInit(FlowProcess fp, + Tap tap, JobConf jobConf) { + DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf); + ParquetOutputFormat.setWriteSupportClass(jobConf, ScroogeWriteSupport.class); + ScroogeWriteSupport.setScroogeClass(jobConf, this.config.getKlass()); + } + + @Override + public void sourceConfInit(FlowProcess fp, + Tap tap, JobConf jobConf) { + super.sourceConfInit(fp, tap, jobConf); + jobConf.setInputFormat(ScaldingDeprecatedParquetInputFormat.class); + ParquetInputFormat.setReadSupportClass(jobConf, ScroogeReadSupport.class); + ThriftReadSupport.setRecordConverterClass(jobConf, ScroogeRecordConverter.class); + } +} diff --git 
a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java new file mode 100644 index 0000000000..12854bbd41 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupport.java @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package com.twitter.scalding.parquet.scrooge; + +import com.twitter.scrooge.ThriftStruct; +import org.apache.hadoop.conf.Configuration; +import org.apache.parquet.hadoop.api.InitContext; +import org.apache.parquet.hadoop.api.ReadSupport; +import org.apache.parquet.hadoop.thrift.ThriftReadSupport; +import org.apache.parquet.io.InvalidRecordException; +import org.apache.parquet.io.ParquetDecodingException; +import org.apache.parquet.io.api.RecordMaterializer; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.MessageTypeParser; +import org.apache.parquet.schema.Type; +import org.apache.parquet.thrift.ThriftMetaData; +import org.apache.parquet.thrift.ThriftRecordConverter; +import org.apache.parquet.thrift.ThriftSchemaConverter; +import org.apache.parquet.thrift.projection.FieldProjectionFilter; +import org.apache.parquet.thrift.projection.ThriftProjectionException; +import org.apache.parquet.thrift.struct.ThriftType; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Read support for Scrooge + * + * @author Tianshuo Deng + */ +public class ScroogeReadSupport extends ThriftReadSupport { + + /** + * used from hadoop + * the configuration must contain a "parquet.thrift.read.class" setting + */ + public ScroogeReadSupport() { + } + + @Override + protected MessageType getProjectedSchema(FieldProjectionFilter fieldProjectionFilter) { + ThriftType.StructType thriftStruct = new ScroogeStructConverter().convert(thriftClass); + return new ThriftSchemaConverter(fieldProjectionFilter).convert(thriftStruct); + } + + /** + * Method overridden from ThriftReadSupport to call + * {@link #getSchemaForRead(MessageType, MessageType)} instead of + * {@link ReadSupport#getSchemaForRead(MessageType, MessageType)} + *

+ * The changes are done to fix use cases https://github.com/apache/parquet-mr/pull/558 + * Once that is merged, this overridden method can be removed along with + * {@link #getSchemaForRead(MessageType, MessageType)} + * {@link #getSchemaForRead(MessageType, String)} + * {@link #assertAreCompatible(Type, Type)} + * {@link #assertGroupsAreCompatible(GroupType, GroupType)} + * {@link #getThriftClassFromMultipleFiles(Map, Configuration)} + * + * @param context the initialisation context + * @return the readContext that defines how to read the file + */ + @Override + public ReadSupport.ReadContext init(InitContext context) { + final Configuration configuration = context.getConfiguration(); + final MessageType fileMessageType = context.getFileSchema(); + MessageType requestedProjection = fileMessageType; + String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA); + + FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration); + + if (partialSchemaString != null && projectionFilter != null) { + throw new ThriftProjectionException( + String.format("You cannot provide both a partial schema and field projection filter." 
+ + "Only one of (%s, %s, %s) should be set.", + PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY)); + } else if (partialSchemaString != null) { + requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString); + } else if (projectionFilter != null) { + try { + if (thriftClass == null) { + thriftClass = getThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration); + } + requestedProjection = getProjectedSchema(projectionFilter); + } catch (ClassNotFoundException e) { + throw new ThriftProjectionException("can not find thriftClass from configuration", e); + } + } + + MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection); + return new ReadContext(schemaForRead); + } + + /** + * attempts to validate and construct a {@link MessageType} from a read projection schema + * + * @param fileMessageType the typed schema of the source + * @param partialReadSchemaString the requested projection schema + * @return the typed schema that should be used to read + */ + public static MessageType getSchemaForRead(MessageType fileMessageType, String partialReadSchemaString) { + if (partialReadSchemaString == null) + return fileMessageType; + MessageType requestedMessageType = MessageTypeParser.parseMessageType(partialReadSchemaString); + return getSchemaForRead(fileMessageType, requestedMessageType); + } + + /** + * Updated method from ReadSupport which checks if the projection's compatible instead of a + * stricter check to see if the file's schema contains the projection + * + * @param fileMessageType + * @param projectedMessageType + * @return + */ + public static MessageType getSchemaForRead(MessageType fileMessageType, MessageType projectedMessageType) { + assertGroupsAreCompatible(fileMessageType, projectedMessageType); + return projectedMessageType; + } + + /** + * Getting thrift class from file metadata + */ + public static Class getThriftClassFromMultipleFiles(Map> fileMetadata, 
Configuration conf) throws ClassNotFoundException { + String className = conf.get(THRIFT_READ_CLASS_KEY, null); + if (className == null) { + Set names = ThriftMetaData.getThriftClassNames(fileMetadata); + if (names == null || names.size() != 1) { + throw new ParquetDecodingException("Could not read file as the Thrift class is not provided and could not be resolved from the file: " + names); + } + className = names.iterator().next(); + } + return (Class) Class.forName(className); + } + + /** + * Validates that the requested group type projection is compatible. + * This allows the projection schema to have extra optional fields. + * + * @param fileType the typed schema of the source + * @param projection requested projection schema + */ + public static void assertGroupsAreCompatible(GroupType fileType, GroupType projection) { + List fields = projection.getFields(); + for (Type otherType : fields) { + if (fileType.containsField(otherType.getName())) { + Type thisType = fileType.getType(otherType.getName()); + assertAreCompatible(thisType, otherType); + if (!otherType.isPrimitive()) { + assertGroupsAreCompatible(thisType.asGroupType(), otherType.asGroupType()); + } + } else if (otherType.getRepetition() == Type.Repetition.REQUIRED) { + throw new InvalidRecordException(otherType.getName() + " not found in " + fileType); + } + } + } + + /** + * Validates that the requested projection is compatible. + * This makes it possible to project a required field using optional since it is less + * restrictive. 
+ * + * @param fileType the typed schema of the source + * @param projection requested projection schema + */ + public static void assertAreCompatible(Type fileType, Type projection) { + if (!fileType.getName().equals(projection.getName()) + || (fileType.getRepetition() != projection.getRepetition() && !fileType.getRepetition().isMoreRestrictiveThan(projection.getRepetition()))) { + throw new InvalidRecordException(projection + " found: expected " + fileType); + } + } + + /** + * Overriding to fall back to get descriptor from the {@link #thriftClass} if thrift metadata is + * not present + * + * @return + */ + @Override + public RecordMaterializer prepareForRead(Configuration configuration, + Map keyValueMetaData, MessageType fileSchema, + ReadSupport.ReadContext readContext) { + ThriftMetaData thriftMetaData = ThriftMetaData.fromExtraMetaData(keyValueMetaData); + try { + if (thriftClass == null) { + thriftClass = getThriftClass(keyValueMetaData, configuration); + } + + ThriftType.StructType descriptor = null; + if (thriftMetaData != null) { + descriptor = thriftMetaData.getDescriptor(); + } else { + ScroogeStructConverter schemaConverter = new ScroogeStructConverter(); + descriptor = schemaConverter.convert(thriftClass); + } + + ThriftRecordConverter converter = new ScroogeRecordConverter( + thriftClass, + readContext.getRequestedSchema(), + descriptor); + return converter; + } catch (Exception t) { + throw new RuntimeException("Unable to create Thrift Converter for Thrift metadata " + thriftMetaData, t); + } + } + + /** + * Getting thrift class from extra metadata + */ + public static Class getThriftClass(Map fileMetadata, Configuration conf) throws ClassNotFoundException { + String className = conf.get(THRIFT_READ_CLASS_KEY, null); + if (className == null) { + final ThriftMetaData metaData = ThriftMetaData.fromExtraMetaData(fileMetadata); + if (metaData == null) { + throw new ParquetDecodingException("Could not read file as the Thrift class is not provided and 
could not be resolved from the file"); + } + return (Class) metaData.getThriftClass(); + } else { + return (Class) Class.forName(className); + } + } + +} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeRecordConverter.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeRecordConverter.java new file mode 100644 index 0000000000..20f1e31fe4 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeRecordConverter.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package com.twitter.scalding.parquet.scrooge; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TProtocol; + +import com.twitter.scrooge.ThriftStruct; +import com.twitter.scrooge.ThriftStructCodec; + +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.thrift.ThriftReader; +import org.apache.parquet.thrift.ThriftRecordConverter; +import org.apache.parquet.thrift.struct.ThriftType.StructType; + +public class ScroogeRecordConverter extends ThriftRecordConverter { + + + public ScroogeRecordConverter(final Class thriftClass, MessageType parquetSchema, StructType thriftType) { + super(new ThriftReader() { + @SuppressWarnings("unchecked") + ThriftStructCodec codec = (ThriftStructCodec) getCodec(thriftClass); + @Override + public T readOneRecord(TProtocol protocol) throws TException { + return codec.decode(protocol); + } + }, thriftClass.getSimpleName(), parquetSchema, thriftType); + } + + private static ThriftStructCodec getCodec(Class klass) { + Class companionClass; + try { + companionClass = Class.forName(klass.getName() + "$"); + Object companionObject = companionClass.getField("MODULE$").get(null); + return (ThriftStructCodec) companionObject; + } catch (Exception t) { + if (t instanceof InterruptedException) Thread.currentThread().interrupt(); + throw new RuntimeException("Unable to create ThriftStructCodec", t); + } + } +} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeSchemaConversionException.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeSchemaConversionException.java new file mode 100644 index 0000000000..6c6d5cb630 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeSchemaConversionException.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package com.twitter.scalding.parquet.scrooge; + +import org.apache.parquet.ParquetRuntimeException; + +/** + * Throw this exception when there is an error converting a Scrooge class to + * thrift schema + */ +class ScroogeSchemaConversionException extends ParquetRuntimeException { + public ScroogeSchemaConversionException(String message, Throwable cause) { + super(message, cause); + } + + public ScroogeSchemaConversionException(String message) { + super(message); + } +} + diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeStructConverter.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeStructConverter.java new file mode 100644 index 0000000000..009ce3dbe4 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeStructConverter.java @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package com.twitter.scalding.parquet.scrooge; + +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.lang.reflect.ParameterizedType; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +import scala.collection.JavaConversions; +import scala.collection.JavaConversions$; +import scala.collection.Seq; +import scala.reflect.Manifest; + +import com.twitter.scrooge.ThriftStructCodec; +import com.twitter.scrooge.ThriftStructFieldInfo; + +import org.apache.parquet.thrift.struct.ThriftField; +import org.apache.parquet.thrift.struct.ThriftType; +import org.apache.parquet.thrift.struct.ThriftType.StructType.StructOrUnionType; +import org.apache.parquet.thrift.struct.ThriftTypeID; +import static org.apache.parquet.thrift.struct.ThriftField.Requirement; +import static org.apache.parquet.thrift.struct.ThriftField.Requirement.DEFAULT; +import static org.apache.parquet.thrift.struct.ThriftField.Requirement.REQUIRED; +import static org.apache.parquet.thrift.struct.ThriftField.Requirement.OPTIONAL; + + +/** + * Class to convert a scrooge generated class to {@link ThriftType.StructType}. 
{@link ScroogeReadSupport } uses this + * class to get the requested schema + * + * @author Tianshuo Deng + */ +public class ScroogeStructConverter { + + /** + * convert a given scrooge generated class to {@link ThriftType.StructType} + */ + public ThriftType.StructType convert(Class scroogeClass) { + return convertStructFromClass(scroogeClass); + } + + private static String mapKeyName(String fieldName) { + return fieldName + "_map_key"; + } + + private static String mapValueName(String fieldName) { + return fieldName + "_map_value"; + } + + private static String listElemName(String fieldName) { + return fieldName + "_list_elem"; + } + + private static String setElemName(String fieldName) { + return fieldName + "_set_elem"; + } + + private Class getCompanionClass(Class klass) { + try { + return Class.forName(klass.getName() + "$"); + } catch (ClassNotFoundException e) { + throw new ScroogeSchemaConversionException("Can not find companion object for scrooge class " + klass, e); + } + } + + private ThriftType.StructType convertStructFromClass(Class klass) { + return convertCompanionClassToStruct(getCompanionClass(klass)); + } + + private ThriftType.StructType convertCompanionClassToStruct(Class companionClass) { + ThriftStructCodec companionObject; + try { + companionObject = (ThriftStructCodec) companionClass.getField("MODULE$").get(null); + } catch (NoSuchFieldException e) { + throw new ScroogeSchemaConversionException("Can not get ThriftStructCodec from companion object of " + companionClass.getName(), e); + } catch (IllegalAccessException e) { + throw new ScroogeSchemaConversionException("Can not get ThriftStructCodec from companion object of " + companionClass.getName(), e); + } + + List children = new LinkedList();//{@link ThriftType.StructType} uses foreach loop to iterate the children, yields O(n) time for linked list + Iterable scroogeFields = getFieldInfos(companionObject); + for (ThriftStructFieldInfo field : scroogeFields) { + 
children.add(toThriftField(field)); + } + + StructOrUnionType structOrUnionType = + isUnion(companionObject.getClass()) ? StructOrUnionType.UNION : StructOrUnionType.STRUCT; + + return new ThriftType.StructType(children, structOrUnionType); + } + + private Iterable getFieldInfos(ThriftStructCodec c) { + Class klass = c.getClass(); + if (isUnion(klass)) { + // Union needs special treatment since currently scrooge does not generates the fieldInfos + // field in the parent union class + return getFieldInfosForUnion(klass); + } else { + //each struct has a generated fieldInfos method to provide metadata to its fields + try { + Object r = klass.getMethod("fieldInfos").invoke(c); + return JavaConversions$.MODULE$.asJavaIterable((scala.collection.Iterable) r); + } catch (ClassCastException e) { + throw new ScroogeSchemaConversionException("can not get field Info from: " + c.toString(), e); + } catch (InvocationTargetException e) { + throw new ScroogeSchemaConversionException("can not get field Info from: " + c.toString(), e); + } catch (NoSuchMethodException e) { + throw new ScroogeSchemaConversionException("can not get field Info from: " + c.toString(), e); + } catch (IllegalAccessException e) { + throw new ScroogeSchemaConversionException("can not get field Info from: " + c.toString(), e); + } + } + } + + + private Iterable getFieldInfosForUnion(Class klass) { + ArrayList fields = new ArrayList(); + for (Field f : klass.getDeclaredFields()) { + if (f.getType().equals(Manifest.class)) { + Class unionClass = (Class) ((ParameterizedType) f.getGenericType()).getActualTypeArguments()[0]; + Class companionUnionClass = getCompanionClass(unionClass); + try { + Object companionUnionObj = companionUnionClass.getField("MODULE$").get(null); + ThriftStructFieldInfo info = (ThriftStructFieldInfo) companionUnionClass.getMethod("fieldInfo").invoke(companionUnionObj); + fields.add(info); + } catch (NoSuchFieldException e) { + throw new ScroogeSchemaConversionException("can not find 
fieldInfo for " + unionClass, e); + } catch (InvocationTargetException e) { + throw new ScroogeSchemaConversionException("can not find fieldInfo for " + unionClass, e); + } catch (NoSuchMethodException e) { + throw new ScroogeSchemaConversionException("can not find fieldInfo for " + unionClass, e); + } catch (IllegalAccessException e) { + throw new ScroogeSchemaConversionException("can not find fieldInfo for " + unionClass, e); + } + } + } + return fields; + } + + + /** + * Convert a field in scrooge to ThriftField in parquet + */ + public ThriftField toThriftField(ThriftStructFieldInfo scroogeField) { + Requirement requirement = getRequirementType(scroogeField); + String fieldName = scroogeField.tfield().name; + short fieldId = scroogeField.tfield().id; + byte thriftTypeByte = scroogeField.tfield().type; + ThriftTypeID typeId = ThriftTypeID.fromByte(thriftTypeByte); + ThriftType thriftType; + switch (typeId) { + case BOOL: + thriftType = new ThriftType.BoolType(); + break; + case BYTE: + thriftType = new ThriftType.ByteType(); + break; + case DOUBLE: + thriftType = new ThriftType.DoubleType(); + break; + case I16: + thriftType = new ThriftType.I16Type(); + break; + case I32: + thriftType = new ThriftType.I32Type(); + break; + case I64: + thriftType = new ThriftType.I64Type(); + break; + case STRING: + ThriftType.StringType stringType = new ThriftType.StringType(); + // There is no real binary type (see THRIFT-1920) in Thrift, + // binary data is represented by String type with an additional binary flag. 
+ if (!String.class.equals(scroogeField.manifest().runtimeClass())) { + stringType.setBinary(true); + } + thriftType = stringType; + break; + case STRUCT: + thriftType = convertStructTypeField(scroogeField); + break; + case MAP: + thriftType = convertMapTypeField(scroogeField, requirement); + break; + case SET: + thriftType = convertSetTypeField(scroogeField, requirement); + break; + case LIST: + thriftType = convertListTypeField(scroogeField, requirement); + break; + case ENUM: + thriftType = convertEnumTypeField(scroogeField); + break; + case STOP: + case VOID: + default: + throw new IllegalArgumentException("can't convert type " + typeId); + } + return new ThriftField(fieldName, fieldId, requirement, thriftType); + } + + private ThriftType convertSetTypeField(ThriftStructFieldInfo f, Requirement requirement) { + return convertSetTypeField(f.tfield().name, f.valueManifest().get(), requirement); + } + + private ThriftType convertSetTypeField(String fieldName, Manifest valueManifest, Requirement requirement) { + String elemName = setElemName(fieldName); + ThriftType elementType = convertClassToThriftType(elemName, requirement, valueManifest); + //Set only has one sub-field as element field, therefore using hard-coded 1 as fieldId, + //it's the same as the solution used in ElephantBird + ThriftField elementField = generateFieldWithoutId(elemName, requirement, elementType); + return new ThriftType.SetType(elementField); + } + + private ThriftType convertListTypeField(ThriftStructFieldInfo f, Requirement requirement) { + return convertListTypeField(f.tfield().name, f.valueManifest().get(), requirement); + } + + private ThriftType convertListTypeField(String fieldName, Manifest valueManifest, Requirement requirement) { + String elemName = listElemName(fieldName); + ThriftType elementType = convertClassToThriftType(elemName, requirement, valueManifest); + ThriftField elementField = generateFieldWithoutId(elemName, requirement, elementType); + return new 
ThriftType.ListType(elementField); + } + + private ThriftType convertMapTypeField(ThriftStructFieldInfo f, Requirement requirement) { + return convertMapTypeField(f.tfield().name, f.keyManifest().get(), f.valueManifest().get(), requirement); + } + + private ThriftType convertMapTypeField(String fieldName, Manifest keyManifest, Manifest valueManifest, Requirement requirement) { + + String keyName = mapKeyName(fieldName); + String valueName = mapValueName(fieldName); + ThriftType keyType = convertClassToThriftType(keyName, requirement, keyManifest); + ThriftField keyField = generateFieldWithoutId(keyName, requirement, keyType); + + ThriftType valueType = convertClassToThriftType(valueName, requirement, valueManifest); + ThriftField valueField = generateFieldWithoutId(valueName, requirement, valueType); + + return new ThriftType.MapType(keyField, valueField); + } + + /** + * Generate artificial field, this kind of fields do not have a field ID. + * To be consistent with the behavior in ElephantBird, here uses 1 as the field ID + */ + private ThriftField generateFieldWithoutId(String fieldName, Requirement requirement, ThriftType thriftType) { + return new ThriftField(fieldName, (short) 1, requirement, thriftType); + } + + /** + * In composite types, such as the type of the key in a map, since we use reflection to get the type class, this method + * does conversion based on the class provided. 
+ * + * @return converted ThriftType + */ + private ThriftType convertClassToThriftType(String name, Requirement requirement, Manifest typeManifest) { + Class typeClass = typeManifest.runtimeClass(); + if (typeManifest.runtimeClass() == boolean.class) { + return new ThriftType.BoolType(); + } else if (typeClass == byte.class) { + return new ThriftType.ByteType(); + } else if (typeClass == double.class) { + return new ThriftType.DoubleType(); + } else if (typeClass == short.class) { + return new ThriftType.I16Type(); + } else if (typeClass == int.class) { + return new ThriftType.I32Type(); + } else if (typeClass == long.class) { + return new ThriftType.I64Type(); + } else if (typeClass == String.class) { + return new ThriftType.StringType(); + } else if (typeClass == ByteBuffer.class) { + return new ThriftType.StringType(); + } else if (typeClass == scala.collection.Seq.class) { + Manifest a = typeManifest.typeArguments().apply(0); + return convertListTypeField(name, a, requirement); + } else if (typeClass == scala.collection.Set.class) { + Manifest setElementManifest = typeManifest.typeArguments().apply(0); + return convertSetTypeField(name, setElementManifest, requirement); + } else if (typeClass == scala.collection.Map.class) { + List> ms = JavaConversions.seqAsJavaList(typeManifest.typeArguments()); + Manifest keyManifest = ms.get(0); + Manifest valueManifest = ms.get(1); + return convertMapTypeField(name, keyManifest, valueManifest, requirement); + } else if (com.twitter.scrooge.ThriftEnum.class.isAssignableFrom(typeClass)) { + return convertEnumTypeField(typeClass, name); + } else { + return convertStructFromClass(typeClass); + } + } + + private ThriftType convertStructTypeField(ThriftStructFieldInfo f) { + return convertStructFromClass(f.manifest().runtimeClass()); + } + + /** + * When define an enum in scrooge, each enum value is a subclass of the enum class, the enum class could be Operation$ + */ + private List getEnumList(String enumName) throws 
ClassNotFoundException, IllegalAccessException, NoSuchFieldException, NoSuchMethodException, InvocationTargetException { + enumName += "$";//In scala generated code, the actual class is ended with $ + Class companionObjectClass = Class.forName(enumName); + Object cObject = companionObjectClass.getField("MODULE$").get(null); + Method listMethod = companionObjectClass.getMethod("list", new Class[]{}); + Object result = listMethod.invoke(cObject, null); + return JavaConversions.seqAsJavaList((Seq) result); + } + + public ThriftType convertEnumTypeField(ThriftStructFieldInfo f) { + return convertEnumTypeField(f.manifest().runtimeClass(), f.tfield().name); + } + + private ThriftType convertEnumTypeField(Class enumClass, String fieldName) { + List enumValues = new ArrayList(); + String enumName = enumClass.getName(); + try { + List enumCollection = getEnumList(enumName); + for (Object enumObj : enumCollection) { + ScroogeEnumDesc enumDesc = ScroogeEnumDesc.fromEnum(enumObj); + enumValues.add(new ThriftType.EnumValue(enumDesc.id, enumDesc.originalName)); + } + return new ThriftType.EnumType(enumValues); + } catch (RuntimeException e) { + throw new ScroogeSchemaConversionException("Can not convert enum field " + fieldName, e); + } catch (NoSuchMethodException e) { + throw new ScroogeSchemaConversionException("Can not convert enum field " + fieldName, e); + } catch (IllegalAccessException e) { + throw new ScroogeSchemaConversionException("Can not convert enum field " + fieldName, e); + } catch (NoSuchFieldException e) { + throw new ScroogeSchemaConversionException("Can not convert enum field " + fieldName, e); + } catch (InvocationTargetException e) { + throw new ScroogeSchemaConversionException("Can not convert enum field " + fieldName, e); + } catch (ClassNotFoundException e) { + throw new ScroogeSchemaConversionException("Can not convert enum field " + fieldName, e); + } + + } + + //In scrooge generated class, if a class is a union, then it must have a field called 
"Union" + private boolean isUnion(Class klass) { + for (Field f : klass.getDeclaredFields()) { + if (f.getName().equals("Union")) + return true; + } + return false; + } + + + private Requirement getRequirementType(ThriftStructFieldInfo f) { + if (f.isOptional() && !f.isRequired()) { + return OPTIONAL; + } else if (f.isRequired() && !f.isOptional()) { + return REQUIRED; + } else if (!f.isOptional() && !f.isRequired()) { + return DEFAULT; + } else { + throw new ScroogeSchemaConversionException("can not determine requirement type for : " + f.toString() + + ", isOptional=" + f.isOptional() + ", isRequired=" + f.isRequired()); + } + } + + private static class ScroogeEnumDesc { + private int id; + private String originalName; + + public static ScroogeEnumDesc fromEnum(Object rawScroogeEnum) throws NoSuchMethodException, InvocationTargetException, IllegalAccessException { + Class enumClass = rawScroogeEnum.getClass(); + Method valueMethod = enumClass.getMethod("value", new Class[]{}); + Method originalNameMethod = enumClass.getMethod("originalName", new Class[]{}); + ScroogeEnumDesc result = new ScroogeEnumDesc(); + result.id = (Integer) valueMethod.invoke(rawScroogeEnum, null); + result.originalName = (String) originalNameMethod.invoke(rawScroogeEnum, null); + return result; + } + } +} diff --git a/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeWriteSupport.java b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeWriteSupport.java new file mode 100644 index 0000000000..5d9f3ffb72 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/java/com/twitter/scalding/parquet/scrooge/ScroogeWriteSupport.java @@ -0,0 +1,65 @@ +/** + * Copyright 2012 Twitter, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.twitter.scalding.parquet.scrooge; + +import com.twitter.scrooge.ThriftStruct; + +import org.apache.hadoop.conf.Configuration; +import org.apache.thrift.TException; + +import org.apache.parquet.hadoop.thrift.AbstractThriftWriteSupport; +import org.apache.parquet.io.ParquetEncodingException; +import org.apache.parquet.thrift.struct.ThriftType.StructType; + +/** + * Write support for Scrooge + */ +public class ScroogeWriteSupport extends AbstractThriftWriteSupport { + public static void setScroogeClass(Configuration configuration, Class thriftClass) { + AbstractThriftWriteSupport.setGenericThriftClass(configuration, thriftClass); + } + + public static Class getScroogeClass(Configuration configuration) { + return (Class)AbstractThriftWriteSupport.getGenericThriftClass(configuration); + } + + /** + * used from hadoop + * the configuration must contain a "parquet.thrift.write.class" setting + * (see ScroogeWriteSupport#setScroogeClass) + */ + public ScroogeWriteSupport() { + } + + public ScroogeWriteSupport(Class thriftClass) { + super(thriftClass); + } + + @Override + protected StructType getThriftStruct() { + ScroogeStructConverter schemaConverter = new ScroogeStructConverter(); + return schemaConverter.convert(thriftClass); + } + + @Override + public void write(T record) { + try { + record.write(parquetWriteProtocol); + } catch (TException e) { + throw new ParquetEncodingException(e); + } + } +} diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala 
b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala new file mode 100644 index 0000000000..925482a47b --- /dev/null +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/Parquet346ScroogeScheme.scala @@ -0,0 +1,86 @@ +package com.twitter.scalding.parquet.scrooge + +import cascading.flow.FlowProcess +import cascading.tap.Tap +import com.twitter.scalding.parquet.ParquetValueScheme +import com.twitter.scalding.parquet.thrift.Parquet346StructTypeRepairer +import com.twitter.scrooge.{ThriftStruct, ThriftStructCodec} +import org.apache.hadoop.mapred.{JobConf, OutputCollector, RecordReader} +import org.apache.parquet.hadoop.thrift.ThriftReadSupport +import org.apache.parquet.schema.MessageType +import org.apache.parquet.thrift.struct.ThriftType.StructType +import org.apache.parquet.thrift.{ThriftReader, ThriftRecordConverter} +import org.apache.thrift.protocol.TProtocol + +import scala.util.control.NonFatal + +/** + * This file contains workarounds for PARQUET-346, everything in it should be removed once that bug is fixed + * in upstream parquet. + * + * The root issue is that ScroogeRecordConverter passes a schema based on the file metadata to + * ThriftRecordConverter that may be missing structOrUnionType metadata. This metadata is not actually needed, + * but parquet currently throws if it's missing. The (temporary) "fix" is to populate this metadata by setting + * all structOrUnionType fields to UNION. 
+ */ + +/** + * The same as ParquetScroogeScheme, but sets the record convert to Parquet346ScroogeRecordConverter + */ +class Parquet346ScroogeScheme[T <: ThriftStruct](config: ParquetValueScheme.Config[T]) + extends ParquetScroogeScheme[T](config) { + + override def sourceConfInit( + fp: FlowProcess[JobConf], + tap: Tap[JobConf, RecordReader[_, _], OutputCollector[_, _]], + jobConf: JobConf + ): Unit = { + + super.sourceConfInit(fp, tap, jobConf) + + // Use the fixed record converter instead of the one set in super + ThriftReadSupport.setRecordConverterClass(jobConf, classOf[Parquet346ScroogeRecordConverter[_]]) + } +} + +object Parquet346ScroogeRecordConverter { + + /** + * Same as the (private) getCodec in ScroogeRecordConverter + */ + def getCodec[T <: ThriftStruct](klass: Class[T]): ThriftStructCodec[T] = + try { + val companionClass = Class.forName(klass.getName + "$") + val companionObject: AnyRef = companionClass.getField("MODULE$").get(null) + companionObject.asInstanceOf[ThriftStructCodec[T]] + } catch { + case NonFatal(e) => throw new RuntimeException("Unable to create ThriftStructCodec", e) + } + +} + +/** + * Same as ScroogeRecordConverter with one important (subtle) difference. It passes a repaired schema + * (StructType) to ThriftRecordConverter's constructor. This is important because older files don't contain + * all the metadata needed for ThriftSchemaConverter to not throw, but we can put dummy data in there because + * it's not actually used. 
+ */ +class Parquet346ScroogeRecordConverter[T <: ThriftStruct]( + thriftClass: Class[T], + parquetSchema: MessageType, + thriftType: StructType +) extends ThriftRecordConverter[T]( + // this is a little confusing because it's all being passed to the super constructor + + // this thrift reader is the same as what's in ScroogeRecordConverter's constructor + new ThriftReader[T] { + val codec: ThriftStructCodec[T] = Parquet346ScroogeRecordConverter.getCodec(thriftClass) + def readOneRecord(protocol: TProtocol): T = codec.decode(protocol) + }, + thriftClass.getSimpleName, + parquetSchema, + + // this is the fix -- we add in the missing structOrUnionType metadata + // before passing it along + Parquet346StructTypeRepairer.repair(thriftType) + ) diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala new file mode 100644 index 0000000000..2d999705e3 --- /dev/null +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/ParquetScrooge.scala @@ -0,0 +1,33 @@ +package com.twitter.scalding.parquet.scrooge + +import cascading.scheme.Scheme +import com.twitter.scalding._ +import com.twitter.scalding.parquet.thrift.ParquetThriftBaseFileSource +import com.twitter.scalding.source.{DailySuffixSource, HourlySuffixSource} +import com.twitter.scrooge.ThriftStruct + +import scala.reflect.ClassTag + +trait ParquetScrooge[T <: ThriftStruct] extends ParquetThriftBaseFileSource[T] { + + override def hdfsScheme = { + // See docs in Parquet346ScroogeScheme + val scheme = new Parquet346ScroogeScheme[T](this.config) + HadoopSchemeInstance(scheme.asInstanceOf[Scheme[_, _, _, _, _]]) + } + +} + +class DailySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit + override val ct: ClassTag[T] +) extends DailySuffixSource(path, dateRange) + with ParquetScrooge[T] + +class 
HourlySuffixParquetScrooge[T <: ThriftStruct](path: String, dateRange: DateRange)(implicit + override val ct: ClassTag[T] +) extends HourlySuffixSource(path, dateRange) + with ParquetScrooge[T] + +class FixedPathParquetScrooge[T <: ThriftStruct](paths: String*)(implicit override val ct: ClassTag[T]) + extends FixedPathSource(paths: _*) + with ParquetScrooge[T] diff --git a/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala new file mode 100644 index 0000000000..7667f85f9c --- /dev/null +++ b/scalding-parquet-scrooge/src/main/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSource.scala @@ -0,0 +1,56 @@ +package com.twitter.scalding.parquet.scrooge + +import _root_.cascading.scheme.Scheme +import com.twitter.scalding._ +import com.twitter.scalding.parquet.thrift.ParquetThriftBase +import com.twitter.scalding.typed.{PartitionSchemed, PartitionUtil} +import com.twitter.scrooge.ThriftStruct + +import scala.reflect.ClassTag + +/** + * Scalding source to read or write partitioned Parquet scrooge data. + * + * For writing it expects a pair of `(P, T)`, where `P` is the data used for partitioning and `T` is the + * scrooge object. `P` must be either a String or a tuple of Strings. Below is an example. + * {{{ + * val data: TypedPipe[MyScroogeObject] = ??? + * data.map { obj => + * ( (obj.country, obj.city), obj) + * }.write(PartitionedParquetScroogeSource[(String, String), MyScroogeObject](path, "%s/%s")) + * }}} + * + * For reading it produces a pair `(P, T)` where `P` is the partition data, `T` is the corresponding scrooge + * object. Below is an example. 
+ * {{{ + * val in: TypedPipe[(String, String), MyScroogeObject] = + * TypedPipe.from( PartitionedParquetScroogeSource[(String, String), MyScroogeObject](path, "%s/%s") ) + * }}} + */ +case class PartitionedParquetScroogeSource[P, T <: ThriftStruct](path: String, template: String)(implicit + val ct: ClassTag[T], + val valueSetter: TupleSetter[T], + val valueConverter: TupleConverter[T], + val partitionSetter: TupleSetter[P], + val partitionConverter: TupleConverter[P] +) extends FixedPathSource(path) + with ParquetThriftBase[T] + with PartitionSchemed[P, T] + with Serializable { + + override val fields = PartitionUtil.toFields(0, implicitly[TupleSetter[T]].arity) + + assert( + fields.size == valueSetter.arity, + "The number of fields needs to be the same as the arity of the value setter" + ) + + // Create the underlying scheme and explicitly set the source, sink fields to be only the specified fields + override def hdfsScheme = { + val scroogeScheme = new Parquet346ScroogeScheme[T](this.config) + val scheme = HadoopSchemeInstance(scroogeScheme.asInstanceOf[Scheme[_, _, _, _, _]]) + scheme.setSinkFields(fields) + scheme.setSourceFields(fields) + scheme + } +} diff --git a/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeSchemeTest.java b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeSchemeTest.java new file mode 100644 index 0000000000..bd626832ea --- /dev/null +++ b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ParquetScroogeSchemeTest.java @@ -0,0 +1,237 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package com.twitter.scalding.parquet.scrooge; + +import cascading.flow.Flow; +import cascading.flow.FlowProcess; +import cascading.flow.hadoop.HadoopFlowConnector; +import cascading.operation.BaseOperation; +import cascading.operation.Function; +import cascading.operation.FunctionCall; +import cascading.pipe.Each; +import cascading.pipe.Pipe; +import cascading.scheme.Scheme; +import cascading.scheme.hadoop.TextLine; +import cascading.tap.Tap; +import cascading.tap.hadoop.Hfs; +import cascading.tuple.Fields; +import cascading.tuple.Tuple; +import cascading.tuple.TupleEntry; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.thrift.TBase; +import org.apache.thrift.protocol.TCompactProtocol; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.protocol.TProtocolFactory; +import org.apache.thrift.transport.TIOStreamTransport; +import org.junit.Test; +import org.apache.parquet.hadoop.thrift.ThriftToParquetFileWriter; +import org.apache.parquet.hadoop.util.ContextUtil; +import com.twitter.scalding.parquet.ParquetValueScheme.Config; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestPersonWithAllInformation; +import com.twitter.scalding.parquet.scrooge.thrift_java.test.Address; +import com.twitter.scalding.parquet.scrooge.thrift_java.test.Phone; +import 
com.twitter.scalding.parquet.scrooge.thrift_java.test.RequiredPrimitiveFixture; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Name; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Name$; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import scala.Option; + +import static org.junit.Assert.assertEquals; + +/** + * Write data in thrift, read in scrooge + * + * @author Tianshuo Deng + */ +public class ParquetScroogeSchemeTest { + + public static final String PARQUET_PATH = "target/test/TestParquetToThriftReadProjection/file.parquet"; + public static final String TXT_OUTPUT_PATH = "target/test/TestParquetToThriftReadProjection/output.txt"; + + @Test + public void testWritePrimitveThriftReadScrooge() throws Exception { + RequiredPrimitiveFixture toWrite = new RequiredPrimitiveFixture(true, (byte)2, (short)3, 4, (long)5, 6.0, "7"); + toWrite.setInfo_string("it's info"); + verifyScroogeRead(thriftRecords(toWrite), com.twitter.scalding.parquet.scrooge.thrift_scala.test.RequiredPrimitiveFixture.class, "RequiredPrimitiveFixture(true,2,3,4,5,6.0,7,Some(it's info))\n", "**"); + } + + @Test + public void testNestedReadingInScrooge() throws Exception { + Map phoneMap = new HashMap(); + phoneMap.put("key1", new com.twitter.scalding.parquet.scrooge.thrift_java.test.Phone("111", "222")); + com.twitter.scalding.parquet.scrooge.thrift_java.test.TestPersonWithAllInformation toWrite = new com.twitter.scalding.parquet.scrooge.thrift_java.test.TestPersonWithAllInformation(new com.twitter.scalding.parquet.scrooge.thrift_java.test.Name("first"), new Address("my_street", "my_zip"), phoneMap); + toWrite.setInfo("my_info"); + + String expected = "TestPersonWithAllInformation(Name(first,None),None,Address(my_street,my_zip),None,Some(my_info),Map(key1 -> Phone(111,222)),None,None)\n"; + 
verifyScroogeRead(thriftRecords(toWrite), TestPersonWithAllInformation.class, expected, "**"); + + String expectedProjected = "TestPersonWithAllInformation(Name(first,None),None,Address(my_street,my_zip),None,Some(my_info),Map(),None,None)\n"; + verifyScroogeRead(thriftRecords(toWrite), TestPersonWithAllInformation.class, expectedProjected, "address/*;info;name/first_name"); + } + + private static class ObjectToStringFunction extends BaseOperation implements Function { + @Override + public void operate(FlowProcess flowProcess, FunctionCall functionCall) { + Object record = functionCall.getArguments().getObject(0); + Tuple result = new Tuple(); + result.add(record.toString()); + functionCall.getOutputCollector().add(result); + } + } + + public void verifyScroogeRead(List recordsToWrite, Class readClass, String expectedStr, String projectionFilter) throws Exception { + Configuration conf = new Configuration(); + deleteIfExist(PARQUET_PATH); + deleteIfExist(TXT_OUTPUT_PATH); + final Path parquetFile = new Path(PARQUET_PATH); + writeParquetFile(recordsToWrite, conf, parquetFile); + + Scheme sourceScheme = new ParquetScroogeScheme(new Config().withRecordClass(readClass).withProjectionString(projectionFilter)); + Tap source = new Hfs(sourceScheme, PARQUET_PATH); + + Scheme sinkScheme = new TextLine(new Fields("first", "last")); + Tap sink = new Hfs(sinkScheme, TXT_OUTPUT_PATH); + + Pipe assembly = new Pipe("namecp"); + assembly = new Each(assembly, new ObjectToStringFunction()); + Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly); + + flow.complete(); + String result = FileUtils.readFileToString(new File(TXT_OUTPUT_PATH + "/part-00000")); + assertEquals(expectedStr, result); + } + + private void writeParquetFile(List recordsToWrite, Configuration conf, Path parquetFile) throws IOException, InterruptedException, org.apache.thrift.TException { + //create a test file + final TProtocolFactory protocolFactory = new TCompactProtocol.Factory(); + 
final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0); + Class writeClass = recordsToWrite.get(0).getClass(); + final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile, ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, writeClass); + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos)); + for (TBase recordToWrite : recordsToWrite) { + recordToWrite.write(protocol); + } + w.write(new BytesWritable(baos.toByteArray())); + w.close(); + } + + private List thriftRecords(TBase... records) { + List result = new ArrayList(); + for (TBase record : records) { + result.add(record); + } + return result; + } + + private void deleteIfExist(String path) throws IOException { + Path p = new Path(path); + Configuration conf = new Configuration(); + final FileSystem fs = p.getFileSystem(conf); + if (fs.exists(p)) { + fs.delete(p, true); + } + } + + final String txtInputPath = "src/test/resources/names.txt"; + final String parquetOutputPath = "target/test/ParquetScroogeScheme/names-parquet-out"; + final String txtOutputPath = "target/test/ParquetScroogeScheme/names-txt-out"; + + @Test + public void testWriteThenRead() throws Exception { + doWrite(); + doRead(); + } + + private void doWrite() throws Exception { + Path path = new Path(parquetOutputPath); + final FileSystem fs = path.getFileSystem(new Configuration()); + if (fs.exists(path)) fs.delete(path, true); + + Scheme sourceScheme = new TextLine( new Fields( "first", "last" ) ); + Tap source = new Hfs(sourceScheme, txtInputPath); + + Scheme sinkScheme = new ParquetScroogeScheme(Name.class); + Tap sink = new Hfs(sinkScheme, parquetOutputPath); + + Pipe assembly = new Pipe( "namecp" ); + assembly = new Each(assembly, new PackThriftFunction()); + Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly); + + flow.complete(); + } + + private void doRead() throws 
Exception { + Path path = new Path(txtOutputPath); + final FileSystem fs = path.getFileSystem(new Configuration()); + if (fs.exists(path)) fs.delete(path, true); + + Scheme sourceScheme = new ParquetScroogeScheme(Name.class); + Tap source = new Hfs(sourceScheme, parquetOutputPath); + + Scheme sinkScheme = new TextLine(new Fields("first", "last")); + Tap sink = new Hfs(sinkScheme, txtOutputPath); + + Pipe assembly = new Pipe( "namecp" ); + assembly = new Each(assembly, new UnpackThriftFunction()); + Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly); + + flow.complete(); + String result = FileUtils.readFileToString(new File(txtOutputPath+"/part-00000")); + assertEquals("0\tAlice\tPractice\n15\tBob\tHope\n24\tCharlie\tHorse\n", result); + } + + private static class PackThriftFunction extends BaseOperation implements Function { + @Override + public void operate(FlowProcess flowProcess, FunctionCall functionCall) { + TupleEntry arguments = functionCall.getArguments(); + Tuple result = new Tuple(); + + Name name = Name$.MODULE$.apply(arguments.getString(0), Option.apply(arguments.getString(1))); + + result.add(name); + functionCall.getOutputCollector().add(result); + } + } + + private static class UnpackThriftFunction extends BaseOperation implements Function { + @Override + public void operate(FlowProcess flowProcess, FunctionCall functionCall) { + TupleEntry arguments = functionCall.getArguments(); + Tuple result = new Tuple(); + + Name name = (Name) arguments.getObject(0); + result.add(name.firstName()); + result.add(name.lastName().get()); + functionCall.getOutputCollector().add(result); + } + } +} diff --git a/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeBinaryTest.java b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeBinaryTest.java new file mode 100644 index 0000000000..0686e79405 --- /dev/null +++ 
b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeBinaryTest.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package com.twitter.scalding.parquet.scrooge; + +import java.io.File; +import java.nio.ByteBuffer; +import java.util.UUID; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.StringAndBinary; +import org.apache.parquet.thrift.ThriftParquetReader; + +public class ScroogeBinaryTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + @Test + public void testScroogeBinaryEncoding() throws Exception { + StringAndBinary expected = new StringAndBinary.Immutable("test", + ByteBuffer.wrap(new byte[] {-123, 20, 33})); + + File temp = tempDir.newFile(UUID.randomUUID().toString()); + temp.deleteOnExit(); + temp.delete(); + + Path path = new Path(temp.getPath()); + + ParquetWriter writer = new ParquetWriter( + path, new Configuration(), 
new ScroogeWriteSupport(StringAndBinary.class)); + writer.write(expected); + writer.close(); + + // read using the parquet-thrift version to isolate the write path + ParquetReader reader = ThriftParquetReader. + build(path) + .withThriftClass(com.twitter.scalding.parquet.scrooge.thrift_java.test.binary.StringAndBinary.class) + .build(); + com.twitter.scalding.parquet.scrooge.thrift_java.test.binary.StringAndBinary record = reader.read(); + reader.close(); + + Assert.assertEquals("String should match after serialization round trip", + "test", record.s); + Assert.assertEquals("ByteBuffer should match after serialization round trip", + ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b); + } + + @Test + @SuppressWarnings("unchecked") + public void testScroogeBinaryDecoding() throws Exception { + StringAndBinary expected = new StringAndBinary.Immutable("test", + ByteBuffer.wrap(new byte[] {-123, 20, 33})); + + File temp = tempDir.newFile(UUID.randomUUID().toString()); + temp.deleteOnExit(); + temp.delete(); + + Path path = new Path(temp.getPath()); + + ParquetWriter writer = new ParquetWriter( + path, new Configuration(), new ScroogeWriteSupport(StringAndBinary.class)); + writer.write(expected); + writer.close(); + + Configuration conf = new Configuration(); + conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName()); + ParquetReader reader = ParquetReader. 
+ builder(new ScroogeReadSupport(), path) + .withConf(conf) + .build(); + StringAndBinary record = reader.read(); + reader.close(); + + Assert.assertEquals("String should match after serialization round trip", + "test", record.s()); + Assert.assertEquals("ByteBuffer should match after serialization round trip", + ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b()); + } +} diff --git a/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTest.java b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTest.java new file mode 100644 index 0000000000..0ed0bdeb03 --- /dev/null +++ b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTest.java @@ -0,0 +1,59 @@ +package com.twitter.scalding.parquet.scrooge; + +import cascading.tuple.Fields; +import cascading.tuple.Tuple; +import cascading.tuple.TupleEntry; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Address; +import com.twitter.scalding.parquet.tuple.TupleWriteSupport; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.thrift.ThriftReadSupport; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.thrift.ThriftSchemaConverter; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.util.UUID; + +public class ScroogeReadSupportTest { + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + @Test + public void testReadParquetWithoutThriftMetadata() throws Exception { + File temp = tempDir.newFile(UUID.randomUUID().toString()); + temp.deleteOnExit(); + temp.delete(); + Path path = new Path(temp.getPath()); + + Address expected = new Address.Immutable("street1", "zip1"); + // Corresponding 
tuple entry for the above object + TupleEntry entry = new TupleEntry(new Fields("street", "zip"), new Tuple(expected.street(), expected.zip())); + // Getting corresponding MessageType from the Address thrift + MessageType schema = new ThriftSchemaConverter().convert(new ScroogeStructConverter().convert(Address.class)); + + Configuration conf = new Configuration(); + conf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, schema.toString()); + + // Writing using TupleWriter, this does not add metadata to the parquet + ParquetWriter writer = new ParquetWriter(path, conf, new TupleWriteSupport()); + writer.write(entry); + writer.close(); + + conf.set(ThriftReadSupport.THRIFT_READ_CLASS_KEY, Address.class.getName()); + // Reading using ScroogeReadSupport + ParquetReader

reader = ParquetReader.
+ builder(new ScroogeReadSupport(), path) + .withConf(conf) + .build(); + Address record = reader.read(); + reader.close(); + + Assert.assertEquals(record, expected); + } +} diff --git a/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeStructConverterTest.java b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeStructConverterTest.java new file mode 100644 index 0000000000..26143e4141 --- /dev/null +++ b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/ScroogeStructConverterTest.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package com.twitter.scalding.parquet.scrooge; + +import org.apache.parquet.schema.MessageTypeParser; +import org.apache.thrift.TBase; +import org.junit.Test; + +import org.apache.parquet.schema.MessageType; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.AddressWithStreetWithDefaultRequirement; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.ListNestEnum; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.ListNestMap; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.ListNestSet; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.MapNestList; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.MapNestMap; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.MapNestSet; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.NestedList; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.SetNestList; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.SetNestMap; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.SetNestSet; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.StringAndBinary; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestFieldOfEnum; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestListPrimitive; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestMapBinary; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestMapComplex; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestMapPrimitiveKey; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestMapPrimitiveValue; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestOptionalMap; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestPersonWithAllInformation; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestSetPrimitive; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.TestUnion; +import 
org.apache.parquet.thrift.ThriftSchemaConverter; +import org.apache.parquet.thrift.struct.ThriftType; + +import static org.junit.Assert.assertEquals; + +/** + * Test convert scrooge schema to Parquet Schema + */ +public class ScroogeStructConverterTest { + + /** + * Convert ThriftStructs from a thrift class and a scrooge class, assert + * they are the same + * @param scroogeClass + */ + private void shouldConvertConsistentlyWithThriftStructConverter(Class scroogeClass) throws ClassNotFoundException { + Class> thriftClass = (Class>)Class.forName(scroogeClass.getName().replaceFirst("com.twitter.scalding.parquet.scrooge.thrift_scala.test", "com.twitter.scalding.parquet.scrooge.thrift_java.test")); + ThriftType.StructType structFromThriftSchemaConverter = new ThriftSchemaConverter().toStructType(thriftClass); + ThriftType.StructType structFromScroogeSchemaConverter = new ScroogeStructConverter().convert(scroogeClass); + + assertEquals(toParquetSchema(structFromThriftSchemaConverter), toParquetSchema(structFromScroogeSchemaConverter)); + } + + private MessageType toParquetSchema(ThriftType.StructType struct) { + ThriftSchemaConverter sc = new ThriftSchemaConverter(); + return sc.convert(struct); + } + + @Test + public void testConvertPrimitiveMapKey() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestMapPrimitiveKey.class); + } + + @Test + public void testBinary() throws Exception { + // Commenting out because this is comparing scala generated class with java generated class. The java class is + // incorrectly generated, so this is currently an invalid test. See testScroogeBinary() instead. 
+ // shouldConvertConsistentlyWithThriftStructConverter(StringAndBinary.class); + } + + @Test + public void testScroogeBinary() { + + MessageType expected = MessageTypeParser.parseMessageType( + "message ParquetSchema {\n" + + "required binary s (UTF8) = 1;\n" + + "required binary b = 2;\n" + + "}"); + assertEquals(expected, toParquetSchema(new ScroogeStructConverter().convert(StringAndBinary.class))); + + } + + @Test + public void testUnion() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestUnion.class); + } + + @Test + public void testConvertPrimitiveMapValue() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestMapPrimitiveValue.class); + } + + @Test + public void testConvertPrimitiveList() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestListPrimitive.class); + } + + @Test + public void testConvertPrimitiveSet() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestSetPrimitive.class); + } + + @Test + public void testConvertEnum() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestFieldOfEnum.class); + } + + @Test + public void testMapBinary() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestMapBinary.class); + } + + @Test + public void testMapComplex() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestMapComplex.class); + } + + @Test + public void testConvertStruct() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestPersonWithAllInformation.class); + } + + @Test + public void testDefaultFields() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(AddressWithStreetWithDefaultRequirement.class); + } + + @Test + public void testConvertOptionalPrimitiveMap() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(TestOptionalMap.class); + } + + @Test + public void testConvertNestedList() throws Exception { + 
shouldConvertConsistentlyWithThriftStructConverter(NestedList.class); + } + + @Test + public void testConvertListNestMap() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(ListNestMap.class); + } + + @Test + public void testConvertListNestEnum() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(ListNestEnum.class); + } + + @Test + public void testConvertMapNestList() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(MapNestList.class); + } + + @Test + public void testConvertMapNestMap() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(MapNestMap.class); + } + + @Test + public void testConvertMapNestSet() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(MapNestSet.class); + } + + @Test + public void testConvertListNestSet() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(ListNestSet.class); + } + + @Test + public void testConvertSetNestSet() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(SetNestSet.class); + } + + @Test + public void testConvertSetNestList() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(SetNestList.class); + } + + @Test + public void testConvertSetNestMap() throws Exception { + shouldConvertConsistentlyWithThriftStructConverter(SetNestMap.class); + } + +} diff --git a/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/TestCorruptScroogeRecords.java b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/TestCorruptScroogeRecords.java new file mode 100644 index 0000000000..f5439e84be --- /dev/null +++ b/scalding-parquet-scrooge/src/test/java/com/twitter/scalding/parquet/scrooge/TestCorruptScroogeRecords.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package com.twitter.scalding.parquet.scrooge; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; +import org.apache.thrift.protocol.TBinaryProtocol.Factory; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.transport.TIOStreamTransport; + +import org.apache.parquet.hadoop.thrift.TestCorruptThriftRecords; +import org.apache.parquet.hadoop.thrift.ThriftReadSupport; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.StructWithUnionV2; +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.StructWithUnionV2$; + +import static org.junit.Assert.assertEquals; + +public class TestCorruptScroogeRecords extends TestCorruptThriftRecords { + + @Override + public void setupJob(Job job, Path path) throws Exception { + job.setInputFormatClass(ParquetScroogeInputFormat.class); + ParquetScroogeInputFormat.setInputPaths(job, path); + ParquetScroogeInputFormat.setThriftClass(job.getConfiguration(), StructWithUnionV2.class); + + + ThriftReadSupport.setRecordConverterClass(job.getConfiguration(), ScroogeRecordConverter.class); 
+ + job.setMapperClass(ReadMapper.class); + job.setNumReduceTasks(0); + job.setOutputFormatClass(NullOutputFormat.class); + } + + @Override + protected void assertEqualsExcepted(List expected, List found) throws Exception { + List scroogeExpected = new ArrayList(); + for (org.apache.parquet.thrift.test.compat.StructWithUnionV2 tbase : expected) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + TProtocol out = new Factory().getProtocol(new TIOStreamTransport(baos)); + tbase.write(out); + TProtocol in = new Factory().getProtocol(new TIOStreamTransport(new ByteArrayInputStream(baos.toByteArray()))); + scroogeExpected.add(StructWithUnionV2$.MODULE$.decode(in)); + } + assertEquals(scroogeExpected, found); + } +} diff --git a/scalding-parquet-scrooge/src/test/resources/names.txt b/scalding-parquet-scrooge/src/test/resources/names.txt new file mode 100644 index 0000000000..cf0d55ed66 --- /dev/null +++ b/scalding-parquet-scrooge/src/test/resources/names.txt @@ -0,0 +1,3 @@ +Alice Practice +Bob Hope +Charlie Horse diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala new file mode 100644 index 0000000000..fd985242cf --- /dev/null +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ParquetScroogeTests.scala @@ -0,0 +1,99 @@ +package com.twitter.scalding.parquet.scrooge + +import com.twitter.scalding.parquet.{ + DeprecatedColumnProjectionString, + ParquetSourcesTestsBase, + StrictColumnProjectionString +} +import com.twitter.scrooge.ThriftStruct +import org.apache.thrift.protocol.TProtocol +import org.apache.parquet.filter2.predicate.FilterPredicate + +class ParquetScroogeTests extends ParquetSourcesTestsBase { + + "DailySuffixParquetScrooge" should { + val default = new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) + + testDefaultFilter(default) + + 
testReturnProvidedFilter(new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) + + testDefaultColumns(default) + + testReturnProvidedColumns( + new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override def withColumns: Set[String] = columnStrings + }, + DeprecatedColumnProjectionString(columnStrings) + ) + + testReturnProvidedColumns( + new DailySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override def withColumnProjections: Set[String] = columnStrings + }, + StrictColumnProjectionString(columnStrings) + ) + + } + + "HourlySuffixParquetScrooge" should { + val default = new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) + + testDefaultFilter(default) + + testReturnProvidedFilter(new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) + + testDefaultColumns(default) + + testReturnProvidedColumns( + new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override def withColumns: Set[String] = columnStrings + }, + DeprecatedColumnProjectionString(columnStrings) + ) + + testReturnProvidedColumns( + new HourlySuffixParquetScrooge[MockThriftStruct](path, dateRange) { + override def withColumnProjections: Set[String] = columnStrings + }, + StrictColumnProjectionString(columnStrings) + ) + + } + + "FixedPathParquetScrooge" should { + val default = new FixedPathParquetScrooge[MockThriftStruct](path, path, path) + + testDefaultFilter(default) + + testReturnProvidedFilter(new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { + override val withFilter: Option[FilterPredicate] = Some(filter1) + }) + + testDefaultColumns(default) + + testReturnProvidedColumns( + new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { + override def withColumns: Set[String] = columnStrings + }, + DeprecatedColumnProjectionString(columnStrings) + 
) + + testReturnProvidedColumns( + new FixedPathParquetScrooge[MockThriftStruct](path, path, path) { + override def withColumnProjections: Set[String] = columnStrings + }, + StrictColumnProjectionString(columnStrings) + ) + + } + +} + +class MockThriftStruct extends ThriftStruct { + override def write(oprot: TProtocol): Unit = () +} diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala new file mode 100644 index 0000000000..df106cb19a --- /dev/null +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PartitionedParquetScroogeSourceTests.scala @@ -0,0 +1,75 @@ +package com.twitter.scalding.parquet.scrooge + +import java.io.File + +import com.twitter.scalding._ +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Address +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.hadoop.ParquetReader + +import org.scalatest.{Matchers, WordSpec} + +object PartitionedParquetScroogeTestSources { + val path = "/a/path" + val partitionSource = PartitionedParquetScroogeSource[String, Address](path, "%s") +} + +class PartitionedParquetScroogeWriteJob(args: Args) extends Job(args) { + import PartitionedParquetScroogeTestSources._ + val input = Seq( + Address("123 Embarcadero", "94111"), + Address("123 E 79th St", "10075"), + Address("456 W 80th St", "10075") + ) + + TypedPipe + .from(input) + .map { case Address(street, zipcode) => (zipcode, Address(street, zipcode)) } + .write(partitionSource) +} + +class PartitionedParquetScroogeSourceTests extends WordSpec with Matchers { + import PartitionedParquetScroogeTestSources._ + + def validate(path: Path, expectedAddresses: Address*) = { + val conf: Configuration = new Configuration + conf.set("parquet.thrift.converter.class", 
classOf[ScroogeRecordConverter[Address]].getName) + val parquetReader: ParquetReader[Address] = + ParquetReader + .builder[Address](new ScroogeReadSupport[Address], path) + .withConf(conf) + .build() + + Stream.continually(parquetReader.read).takeWhile(_ != null).toArray shouldBe expectedAddresses + } + + "PartitionedParquetScroogeSource" should { + "write out partitioned scrooge objects" in { + var job: Job = null; + def buildJob(args: Args): Job = { + job = new PartitionedParquetScroogeWriteJob(args) + job + } + JobTest(buildJob(_)).runHadoop + .finish() + + val testMode = job.mode.asInstanceOf[HadoopTest] + + val directory = new File(testMode.getWritePathFor(partitionSource)) + + directory.listFiles().map(_.getName()).toSet shouldBe Set("94111", "10075") + + // check that the partitioning is done correctly by zipcode + validate( + new Path(directory.getPath + "/94111/part-00000-00000-m-00000.parquet"), + Address("123 Embarcadero", "94111") + ) + validate( + new Path(directory.getPath + "/10075/part-00000-00001-m-00000.parquet"), + Address("123 E 79th St", "10075"), + Address("456 W 80th St", "10075") + ) + } + } +} diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala new file mode 100644 index 0000000000..6de6010011 --- /dev/null +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/PlanningTests.scala @@ -0,0 +1,167 @@ +package com.twitter.scalding.parquet.scrooge + +import cascading.flow.FlowDef +import com.twitter.scalding._ +import com.twitter.scalding.source.NullSink +import com.twitter.scalding.typed.cascading_backend.CascadingBackend +import org.scalatest.FunSuite + +class PlanningTests extends FunSuite { + // How many steps would this be in Hadoop on Cascading + def steps[A](p: TypedPipe[A], opt: Boolean = true): Int = { + val mode = Hdfs.default + val fd = new FlowDef + 
val pipe = + if (opt) CascadingBackend.toPipe(p, NullSink.sinkFields)(fd, mode, NullSink.setter) + else CascadingBackend.toPipeUnoptimized(p, NullSink.sinkFields)(fd, mode, NullSink.setter) + NullSink.writeFrom(pipe)(fd, mode) + val ec = ExecutionContext.newContext(Config.defaultFrom(mode))(fd, mode) + val flow = ec.buildFlow.get.get + flow.getFlowSteps.size + } + + // test for https://github.com/twitter/scalding/issues/1837 + test("merging source plus mapped source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).map(_ => null.asInstanceOf[MockThriftStruct]) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("filtering works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + + val pipe = + TypedPipe.from(src1).filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("filtering and mapping works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + + val pipe = + TypedPipe.from(src1).filter(_ => true).map(_ => 1) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("mapping and filtering works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + + val pipe = + TypedPipe.from(src1).map(_ => 1).filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("merging source plus filter source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("merging source plus forceToDisk.filter source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new 
FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).forceToDisk.filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 2) + } + + test("merging source plus forceToDisk source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).forceToDisk + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 2) + } + + test("merging source plus onComplete.filter source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).onComplete(() => println("done")).filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("merging source plus onComplete source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).onComplete(() => println("done")) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("merging source plus withDescription.filter source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).withDescription("foo").filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("merging source plus debug.filter source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).debug.filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } 
+ + test("merging source plus filter and map source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe.from(src2).filter(_ => true).map(_ => null.asInstanceOf[MockThriftStruct]) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + + test("merging source plus map and filter source works") { + val src1 = new FixedPathParquetScrooge[MockThriftStruct]("src1") + val src2 = new FixedPathParquetScrooge[MockThriftStruct]("src2") + + val pipe = TypedPipe.from(src1) ++ + TypedPipe + .from(src2) + .map(_ => null.asInstanceOf[MockThriftStruct]) + .filter(_ => true) + + assert(steps(pipe) == 1) + assert(steps(pipe, false) == 1) + } + +} diff --git a/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala new file mode 100644 index 0000000000..34c5585a1c --- /dev/null +++ b/scalding-parquet-scrooge/src/test/scala/com/twitter/scalding/parquet/scrooge/ScroogeReadSupportTests.scala @@ -0,0 +1,130 @@ +package com.twitter.scalding.parquet.scrooge + +import com.twitter.scalding.parquet.scrooge.thrift_scala.test.Address +import com.twitter.scalding.parquet.tuple.macros.Macros._ +import com.twitter.scalding.parquet.tuple.{TypedParquet, TypedParquetSink} +import com.twitter.scalding.platform.{HadoopPlatformJobTest, HadoopSharedPlatformTest} +import com.twitter.scalding.typed.TypedPipe +import com.twitter.scalding.{Args, Job} +import org.apache.parquet.io.InvalidRecordException +import org.apache.parquet.schema.MessageTypeParser +import org.scalatest.{Matchers, WordSpec} + +class ScroogeReadSupportTests extends WordSpec with Matchers with HadoopSharedPlatformTest { + + "ScroogeReadSupport getSchemaForRead" should { + "project extra optional field" in { + val fileType = 
MessageTypeParser.parseMessageType(""" + |message SampleClass { + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType(""" + |message SampleProjection { + | required int32 x; + | optional int32 extra; + |} + """.stripMargin) + + val schema = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + schema shouldEqual requestedProjection + } + + "fail projecting extra required field" in { + val fileType = MessageTypeParser.parseMessageType(""" + |message SampleClass { + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType(""" + |message SampleProjection { + | required int32 x; + | required int32 extra; + |} + """.stripMargin) + + an[InvalidRecordException] should be thrownBy { + ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + } + } + + "project required field using optional" in { + val fileType = MessageTypeParser.parseMessageType(""" + |message SampleClass { + | required int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType(""" + |message SampleProjection { + | optional int32 x; + |} + """.stripMargin) + + val schema = ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + schema shouldEqual requestedProjection + } + + "fail projecting optional using required" in { + val fileType = MessageTypeParser.parseMessageType(""" + |message SampleClass { + | optional int32 x; + |} + """.stripMargin) + val requestedProjection = MessageTypeParser.parseMessageType(""" + |message SampleProjection { + | required int32 x; + |} + """.stripMargin) + + an[InvalidRecordException] should be thrownBy { + ScroogeReadSupport.getSchemaForRead(fileType, requestedProjection) + } + } + } + + "ScroogeReadSupport" should { + "write using typedparquet and read using parquet scrooge" in { + HadoopPlatformJobTest(new WriteToTypedParquetTupleJob(_), cluster) + .arg("output", "output1") + 
.sink[AddressCaseClass](TypedParquet[AddressCaseClass](Seq("output1"))) { in => + in should contain theSameElementsAs TypedParquetTestSources.caseClassValues + } + .run() + + HadoopPlatformJobTest(new ReadWithParquetScrooge(_), cluster) + .arg("input", "output1") + .arg("output", "output2") + .sink[Address](new FixedPathParquetScrooge[Address]("output2")) { out => + out should contain theSameElementsAs TypedParquetTestSources.thriftValues + } + .run() + } + } + +} + +object TypedParquetTestSources { + val thriftValues = Seq( + Address("123 Embarcadero", "94111"), + Address("123 E 79th St", "10075"), + Address("456 W 80th St", "10075") + ) + val caseClassValues = thriftValues.map(a => AddressCaseClass(a.street, a.zip)) +} + +case class AddressCaseClass(street: String, zip: String) + +class WriteToTypedParquetTupleJob(args: Args) extends Job(args) { + val outputPath = args.required("output") + val sink = TypedParquetSink[AddressCaseClass](outputPath) + TypedPipe.from(TypedParquetTestSources.caseClassValues).write(sink) +} + +class ReadWithParquetScrooge(args: Args) extends Job(args) { + val inputPath = args.required("input") + val outputPath = args.required("output") + + val input = new FixedPathParquetScrooge[Address](inputPath) + val sink = new FixedPathParquetScrooge[Address](outputPath) + TypedPipe.from(input).write(sink) +} diff --git a/scalding-parquet/README.md b/scalding-parquet/README.md index 5435375136..470bdc75b6 100644 --- a/scalding-parquet/README.md +++ b/scalding-parquet/README.md @@ -1,5 +1,32 @@ # Parquet support for Scalding The implementation is ported from code used by Twitter internally written by Sam Ritchie, Ian O'Connell, Oscar Boykin, Tianshuo Deng -## Use com.twitter.scalding.parquet.thrift for reading Thrift records -## Use com.twitter.tuple for reading Tuple records \ No newline at end of file +## Use com.twitter.scalding.parquet.thrift for reading apache Thrift (TBase) records +## Use com.twitter.scalding.parquet.scrooge for reading 
scrooge Thrift (ThriftStruct) records + Located in the scalding-parquet-scrooge module +## Use com.twitter.scalding.parquet.tuple for reading Tuple records +## Use com.twitter.scalding.parquet.tuple.TypedParquet for reading or writing case classes: +Can use macro in com.twitter.scalding.parquet.tuple.macros.Macros to generate parquet read/write support. Here's an example: +```scala + import com.twitter.scalding.parquet.tuple.macros.Macros._ + + case class SampleClass(x: Int, y: String) + + class WriteToTypedParquetTupleJob(args: Args) extends Job(args) { + val outputPath = args.required("output") + val sink = TypedParquetSink[SampleClass](outputPath) + + TypedPipe.from(List(SampleClass(0, "foo"), SampleClass(1, "bar"))).write(sink) + } + + class ReadWithFilterPredicateJob(args: Args) extends Job(args) { + val fp: FilterPredicate = FilterApi.eq(binaryColumn("y"), Binary.fromString("foo")) + + val inputPath = args.required("input") + val outputPath = args.required("output") + + val input = TypedParquet[SampleClass](inputPath, fp) + + TypedPipe.from(input).map(_.x).write(TypedTsv[Int](outputPath)) + } +``` \ No newline at end of file diff --git a/scalding-parquet/src/main/java/com/twitter/scalding/parquet/ParquetValueScheme.java b/scalding-parquet/src/main/java/com/twitter/scalding/parquet/ParquetValueScheme.java new file mode 100644 index 0000000000..2d71c44896 --- /dev/null +++ b/scalding-parquet/src/main/java/com/twitter/scalding/parquet/ParquetValueScheme.java @@ -0,0 +1,166 @@ +package com.twitter.scalding.parquet; + +import java.io.IOException; +import java.io.Serializable; + +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.RecordReader; + +import cascading.flow.FlowProcess; +import cascading.scheme.Scheme; +import cascading.scheme.SinkCall; +import cascading.scheme.SourceCall; +import cascading.tap.Tap; +import cascading.tuple.Tuple; +import cascading.tuple.TupleEntry; +import 
org.apache.parquet.filter2.predicate.FilterPredicate; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.parquet.hadoop.mapred.Container; +import org.apache.parquet.hadoop.thrift.ParquetThriftInputFormat; +import org.apache.parquet.hadoop.thrift.ThriftReadSupport; + +import static org.apache.parquet.Preconditions.checkNotNull; + +/** + * A Cascading Scheme that returns a simple Tuple with a single value, the "value" object + * coming out of the underlying InputFormat. + * + * This is an abstract class; implementations are expected to set up their Input/Output Formats + * correctly in the respective Init methods. + */ +public abstract class ParquetValueScheme extends Scheme{ + + public static final class Config implements Serializable { + private final FilterPredicate filterPredicate; + private final String deprecatedProjectionString; + private final String strictProjectionString; + private final Class klass; + + private Config(Class klass, FilterPredicate filterPredicate, String deprecatedProjectionString, String strictProjectionString) { + this.filterPredicate = filterPredicate; + this.deprecatedProjectionString = deprecatedProjectionString; + this.strictProjectionString = strictProjectionString; + this.klass = klass; + } + + public Config() { + filterPredicate = null; + deprecatedProjectionString = null; + strictProjectionString = null; + klass = null; + } + + public FilterPredicate getFilterPredicate() { + return filterPredicate; + } + + @Deprecated + public String getProjectionString() { + return deprecatedProjectionString; + } + + public String getStrictProjectionString() { + return strictProjectionString; + } + + public Class getKlass() { + return klass; + } + + public Config withFilterPredicate(FilterPredicate f) { + return new Config(this.klass, checkNotNull(f, "filterPredicate"), this.deprecatedProjectionString, this.strictProjectionString); + } + + @Deprecated + public Config withProjectionString(String p) { + return new 
Config(this.klass, this.filterPredicate, checkNotNull(p, "projectionString"), this.strictProjectionString); + } + + public Config withStrictProjectionString(String p) { + return new Config(this.klass, this.filterPredicate, this.deprecatedProjectionString, checkNotNull(p, "projectionString")); + } + + public Config withRecordClass(Class klass) { + return new Config(checkNotNull(klass, "recordClass"), this.filterPredicate, this.deprecatedProjectionString, this.strictProjectionString); + } + } + + private static final long serialVersionUID = 157560846420730043L; + protected final Config config; + + public ParquetValueScheme() { + this(new Config()); + } + + public ParquetValueScheme(FilterPredicate filterPredicate) { + this(new Config().withFilterPredicate(filterPredicate)); + } + + public ParquetValueScheme(Config config) { + this.config = config; + } + + @Deprecated + private void setProjectionPushdown(JobConf jobConf) { + if (this.config.deprecatedProjectionString != null) { + ThriftReadSupport.setProjectionPushdown(jobConf, this.config.deprecatedProjectionString); + } + } + + private void setStrictProjectionPushdown(JobConf jobConf) { + if (this.config.strictProjectionString != null) { + ThriftReadSupport.setStrictFieldProjectionFilter(jobConf, this.config.strictProjectionString); + } + } + + private void setPredicatePushdown(JobConf jobConf) { + if (this.config.filterPredicate != null) { + ParquetInputFormat.setFilterPredicate(jobConf, this.config.filterPredicate); + } + } + @Override + public void sourceConfInit(FlowProcess jobConfFlowProcess, Tap jobConfRecordReaderOutputCollectorTap, final JobConf jobConf) { + setPredicatePushdown(jobConf); + setProjectionPushdown(jobConf); + setStrictProjectionPushdown(jobConf); + setRecordClass(jobConf); + } + + private void setRecordClass(JobConf jobConf) { + if (config.klass != null) { + ParquetThriftInputFormat.setThriftClass(jobConf, config.klass); + } + } + + @SuppressWarnings("unchecked") + @Override + public boolean 
source(FlowProcess fp, SourceCall sc) + throws IOException { + Container value = (Container) sc.getInput().createValue(); + boolean hasNext = sc.getInput().next(null, value); + if (!hasNext) { return false; } + + // Skip nulls + if (value == null) { return true; } + + sc.getIncomingEntry().setTuple(new Tuple(value.get())); + return true; + } + + @SuppressWarnings("unchecked") + @Override + public void sink(FlowProcess fp, SinkCall sc) + throws IOException { + TupleEntry tuple = sc.getOutgoingEntry(); + + if (tuple.size() != 1) { + throw new RuntimeException("ParquetValueScheme expects tuples with an arity of exactly 1, but found " + tuple.getFields()); + } + + T value = (T) tuple.getObject(0); + OutputCollector output = sc.getOutput(); + output.collect(null, value); + } + +} diff --git a/scalding-parquet/src/main/java/com/twitter/scalding/parquet/ScaldingDeprecatedParquetInputFormat.java b/scalding-parquet/src/main/java/com/twitter/scalding/parquet/ScaldingDeprecatedParquetInputFormat.java new file mode 100644 index 0000000000..04f3824da2 --- /dev/null +++ b/scalding-parquet/src/main/java/com/twitter/scalding/parquet/ScaldingDeprecatedParquetInputFormat.java @@ -0,0 +1,200 @@ +package com.twitter.scalding.parquet; + +import static java.lang.Boolean.TRUE; +import static java.util.Arrays.asList; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.FileSplit; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.parquet.hadoop.Footer; +import org.apache.parquet.hadoop.ParquetInputFormat; +import org.apache.parquet.hadoop.ParquetInputSplit; +import org.apache.parquet.hadoop.ParquetRecordReader; +import org.apache.parquet.hadoop.mapred.Container; + +/** + * This class is a clone of 
org.apache.parquet.hadoop.mapred.DeprecatedParquetInputFormat + * from apache-parquet 1.12.0-RC1, to include the fix https://github.com/apache/parquet-mr/pull/844. + * + * The motivation is patching a bug while we wait for apache-parquet to be published 1.12.0 and for us to + * update the version used. This class should be removed and the latest + * org.apache.parquet.hadoop.mapred.DeprecatedParquetInputFormat should be used. + */ +public class ScaldingDeprecatedParquetInputFormat extends FileInputFormat> { + + protected ParquetInputFormat realInputFormat = new ParquetInputFormat(); + + @Override + public RecordReader> getRecordReader(InputSplit split, JobConf job, + Reporter reporter) throws IOException { + return new RecordReaderWrapper(split, job, reporter); + } + + @Override + public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { + if (isTaskSideMetaData(job)) { + return super.getSplits(job, numSplits); + } + + List