@@ -19,13 +19,12 @@ package com.spotify.ratatool.samplers
 import java.io.File
 import java.nio.ByteBuffer
 import java.nio.file.{Files, Path}
-
 import com.google.common.hash.Hasher
 import com.google.common.io.BaseEncoding
 import com.spotify.ratatool.Schemas
 import com.spotify.ratatool.avro.specific.TestRecord
 import com.spotify.ratatool.scalacheck._
-import com.spotify.ratatool.io.{AvroIO, FileStorage}
+import com.spotify.ratatool.io.{AvroIO, FileStorage, ParquetIO}
 import com.spotify.ratatool.samplers.util.{ByteHasher, HexEncoding, MurmurHash}
 import org.apache.avro.generic.GenericRecord
 import org.scalacheck.Prop.{all, forAll, proved}
@@ -38,6 +37,7 @@ import scala.language.postfixOps
 import org.scalatest.flatspec.AnyFlatSpec
 import org.scalatest.matchers.should.Matchers
 
+// scalastyle:off file.size.limit
 object BigSamplerTest extends Properties("BigSampler") {
 
   private val testSeed = Some(42)
@@ -453,14 +453,20 @@ sealed trait BigSamplerJobTestRoot
   val dir: Path = Files.createTempDirectory("ratatool-big-sampler-input")
   val file1 = new File(dir.toString, "part-00000.avro")
   val file2 = new File(dir.toString, "part-00001.avro")
+  val fileParquet1 = new File(dir.toString, "part-00000.parquet")
+  val fileParquet2 = new File(dir.toString, "part-00001.parquet")
 
   override protected def beforeAll(configMap: ConfigMap): Unit = {
     AvroIO.writeToFile(data1, schema, file1)
     AvroIO.writeToFile(data2, schema, file2)
+    ParquetIO.writeToFile(data1, schema, fileParquet1)
+    ParquetIO.writeToFile(data2, schema, fileParquet2)
 
     dir.toFile.deleteOnExit()
     file1.deleteOnExit()
     file2.deleteOnExit()
+    fileParquet1.deleteOnExit()
+    fileParquet2.deleteOnExit()
   }
 
   protected def withOutFile(testCode: (File) => Any) {
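
The fixture now writes the same generated records to both Avro and Parquet, so the job tests below can exercise either input format. For reference, here is a minimal sketch of what Avro-compatible Parquet write/read helpers can look like using the parquet-avro bindings (`AvroParquetWriter`/`AvroParquetReader`); the object name `ParquetIOSketch` is hypothetical, and ratatool's actual `ParquetIO` may be implemented differently:

```scala
// Hypothetical sketch of Parquet helpers over Avro GenericRecords, built on the
// parquet-avro bindings; ratatool's actual ParquetIO may differ.
import java.io.File
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.{AvroParquetReader, AvroParquetWriter}

object ParquetIOSketch {
  // Write Avro records to a Parquet file with the given schema.
  def writeToFile(data: Iterable[GenericRecord], schema: Schema, file: File): Unit = {
    val writer = AvroParquetWriter
      .builder[GenericRecord](new Path(file.getAbsolutePath))
      .withSchema(schema)
      .build()
    try data.foreach(writer.write) finally writer.close()
  }

  // Read all records back; AvroParquetReader.read() returns null at end of input.
  def readFromFile(path: String): List[GenericRecord] = {
    val reader = AvroParquetReader.builder[GenericRecord](new Path(path)).build()
    try Iterator.continually(reader.read()).takeWhile(_ != null).toList
    finally reader.close()
  }
}
```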
@@ -476,6 +482,11 @@ sealed trait BigSamplerJobTestRoot
     FileStorage(p).listFiles.foldLeft(0)((i, m) =>
       i + AvroIO.readFromFile[GenericRecord](m.resourceId().toString).count(f)
     )
+
+  protected def countParquetRecords(p: String, f: GenericRecord => Boolean = _ => true): Long =
+    FileStorage(p).listFiles.foldLeft(0)((i, m) =>
+      i + ParquetIO.readFromFile(m.resourceId().toString).count(f)
+    )
 }
 
 class BigSamplerBasicJobTest extends BigSamplerJobTestRoot {
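
`countParquetRecords` mirrors `countAvroRecords`: it expands the output glob through `FileStorage`, reads each matched file, and sums the records that satisfy the predicate (all of them, by default). A short usage sketch, inside a test body where `outDir` is in scope; the field name `record_id` is hypothetical:

```scala
// Usage sketch for countParquetRecords; "record_id" is a hypothetical field name.
val total: Long = countParquetRecords(s"$outDir/*.parquet")
val withId: Long = countParquetRecords(
  s"$outDir/*.parquet",
  r => r.get("record_id") != null // count only records with a populated id
)
```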
@@ -487,16 +498,31 @@ class BigSamplerBasicJobTest extends BigSamplerJobTestRoot {
     countAvroRecords(s"$outDir/*.avro").toDouble shouldBe totalElements * 0.5 +- 250
   }
 
+  it should "work for 50% for parquet" in withOutFile { outDir =>
+    BigSampler.run(Array(s"--input=$dir/*.parquet", s"--output=$outDir", "--sample=0.5"))
+    countParquetRecords(s"$outDir/*.parquet").toDouble shouldBe totalElements * 0.5 +- 250
+  }
+
   it should "work for 1%" in withOutFile { outDir =>
     BigSampler.run(Array(s"--input=$dir/*.avro", s"--output=$outDir", "--sample=0.01"))
     countAvroRecords(s"$outDir/*.avro").toDouble shouldBe totalElements * 0.01 +- 35
   }
 
+  it should "work for 1% for parquet" in withOutFile { outDir =>
+    BigSampler.run(Array(s"--input=$dir/*.parquet", s"--output=$outDir", "--sample=0.01"))
+    countParquetRecords(s"$outDir/*.parquet").toDouble shouldBe totalElements * 0.01 +- 35
+  }
+
   it should "work for 100%" in withOutFile { outDir =>
     BigSampler.run(Array(s"--input=$dir/*.avro", s"--output=$outDir", "--sample=1.0"))
     countAvroRecords(s"$outDir/*.avro") shouldBe totalElements
   }
 
+  it should "work for 100% for parquet" in withOutFile { outDir =>
+    BigSampler.run(Array(s"--input=$dir/*.parquet", s"--output=$outDir", "--sample=1.0"))
+    countParquetRecords(s"$outDir/*.parquet").toDouble shouldBe totalElements
+  }
+
   it should "work for 50% with hash field and seed" in withOutFile { outDir =>
     BigSampler.run(
       Array(
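
With `--seed` and `--fields`, inclusion is decided by hashing the named field rather than by an independent random draw per record, so the sample is reproducible across runs and every record sharing a field value is kept or dropped together; that is presumably why this test tolerates a wider band (+- 2000) than the uniform 50% test (+- 250). A minimal sketch of the hash-threshold idea using Guava's `Hashing.murmur3_128`; this illustrates the technique only, not BigSampler's exact implementation, and `acceptRecord` is a hypothetical helper:

```scala
// Sketch of deterministic hash-threshold sampling; illustrates the idea only,
// not BigSampler's exact implementation.
import com.google.common.hash.Hashing

def acceptRecord(fieldValue: Long, sampleRate: Double, seed: Int): Boolean = {
  // The same field value with the same seed always hashes to the same bucket.
  val hash = Hashing.murmur3_128(seed).newHasher().putLong(fieldValue).hash().asLong()
  // Map the signed 64-bit hash onto [0, 1) and keep records below the rate.
  val bucket = (hash.toDouble - Long.MinValue.toDouble) /
    (Long.MaxValue.toDouble - Long.MinValue.toDouble)
  bucket < sampleRate
}
```

Because duplicates of `required_fields.int_field` move in and out of the sample together, the realized fraction can drift further from 0.5 than under per-record sampling, hence the looser assertion.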
@@ -509,8 +535,41 @@ class BigSamplerBasicJobTest extends BigSamplerJobTestRoot {
     )
     countAvroRecords(s"$outDir/*.avro").toDouble shouldBe totalElements * 0.5 +- 2000
   }
+
+  it should "work for 50% with hash field and seed for parquet" in withOutFile { outDir =>
+    BigSampler.run(
+      Array(
+        s"--input=$dir/*.parquet",
+        s"--output=$outDir",
+        "--sample=0.5",
+        "--seed=42",
+        "--fields=required_fields.int_field"
+      )
+    )
+    countParquetRecords(s"$outDir/*.parquet").toDouble shouldBe totalElements * 0.5 +- 2000
+  }
+}
+
+class BigSamplerWildCardTest extends BigSamplerJobTestRoot {
+  override def data1Size: Int = 10000
+  override def data2Size: Int = 2500
+
+  override protected def beforeAll(configMap: ConfigMap): Unit = {
+    ParquetIO.writeToFile(data1, schema, fileParquet1)
+    ParquetIO.writeToFile(data2, schema, fileParquet2)
+
+    dir.toFile.deleteOnExit()
+    fileParquet1.deleteOnExit()
+    fileParquet2.deleteOnExit()
+  }
+
+  "BigSampler" should "work for wildcard without file extension" in withOutFile { outDir =>
+    BigSampler.run(Array(s"--input=$dir/part-*", s"--output=$outDir", "--sample=0.5"))
+    countParquetRecords(s"$outDir/*.parquet").toDouble shouldBe totalElements * 0.5 +- 250
+  }
 }
 
+
 class BigSamplerApproxDistJobTest extends BigSamplerJobTestRoot {
   override def data1Size: Int = 10000
   override def data2Size: Int = 2500