|
19 | 19 |
|
20 | 20 | import static org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions.RESOLVE_FILE; |
21 | 21 | import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.MoreObjects.firstNonNull; |
| 22 | +import static org.hamcrest.MatcherAssert.assertThat; |
| 23 | +import static org.hamcrest.Matchers.containsInAnyOrder; |
22 | 24 | import static org.hamcrest.Matchers.isA; |
23 | 25 | import static org.junit.Assert.assertEquals; |
24 | 26 | import static org.junit.Assert.assertFalse; |
25 | 27 | import static org.junit.Assert.assertTrue; |
26 | 28 |
|
| 29 | +import java.io.BufferedReader; |
27 | 30 | import java.io.File; |
28 | 31 | import java.io.FileNotFoundException; |
29 | 32 | import java.io.FileOutputStream; |
|
38 | 41 | import java.nio.file.Paths; |
39 | 42 | import java.nio.file.StandardCopyOption; |
40 | 43 | import java.nio.file.attribute.FileTime; |
| 44 | +import java.util.ArrayList; |
41 | 45 | import java.util.Arrays; |
| 46 | +import java.util.Collections; |
42 | 47 | import java.util.List; |
43 | 48 | import java.util.Objects; |
44 | 49 | import java.util.zip.GZIPOutputStream; |
45 | 50 | import org.apache.beam.sdk.coders.StringUtf8Coder; |
46 | 51 | import org.apache.beam.sdk.coders.VarIntCoder; |
47 | 52 | import org.apache.beam.sdk.io.fs.EmptyMatchTreatment; |
48 | 53 | import org.apache.beam.sdk.io.fs.MatchResult; |
| 54 | +import org.apache.beam.sdk.io.fs.MatchResult.Metadata; |
49 | 55 | import org.apache.beam.sdk.options.PipelineOptionsFactory; |
50 | 56 | import org.apache.beam.sdk.state.StateSpec; |
51 | 57 | import org.apache.beam.sdk.state.StateSpecs; |
52 | 58 | import org.apache.beam.sdk.state.ValueState; |
53 | 59 | import org.apache.beam.sdk.testing.NeedsRunner; |
54 | 60 | import org.apache.beam.sdk.testing.PAssert; |
55 | 61 | import org.apache.beam.sdk.testing.TestPipeline; |
| 62 | +import org.apache.beam.sdk.testing.UsesUnboundedPCollections; |
56 | 63 | import org.apache.beam.sdk.testing.UsesUnboundedSplittableParDo; |
57 | 64 | import org.apache.beam.sdk.transforms.Contextful; |
58 | 65 | import org.apache.beam.sdk.transforms.Create; |
59 | 66 | import org.apache.beam.sdk.transforms.DoFn; |
60 | 67 | import org.apache.beam.sdk.transforms.MapElements; |
| 68 | +import org.apache.beam.sdk.transforms.PTransform; |
61 | 69 | import org.apache.beam.sdk.transforms.ParDo; |
62 | 70 | import org.apache.beam.sdk.transforms.Requirements; |
63 | 71 | import org.apache.beam.sdk.transforms.SerializableFunctions; |
64 | 72 | import org.apache.beam.sdk.transforms.View; |
65 | 73 | import org.apache.beam.sdk.transforms.Watch; |
| 74 | +import org.apache.beam.sdk.transforms.windowing.AfterWatermark; |
| 75 | +import org.apache.beam.sdk.transforms.windowing.FixedWindows; |
66 | 76 | import org.apache.beam.sdk.transforms.windowing.GlobalWindow; |
67 | 77 | import org.apache.beam.sdk.transforms.windowing.PaneInfo; |
| 78 | +import org.apache.beam.sdk.transforms.windowing.Window; |
68 | 79 | import org.apache.beam.sdk.values.KV; |
69 | 80 | import org.apache.beam.sdk.values.PCollection; |
| 81 | +import org.apache.beam.sdk.values.PCollection.IsBounded; |
70 | 82 | import org.apache.beam.sdk.values.PCollectionView; |
71 | 83 | import org.apache.beam.sdk.values.TypeDescriptor; |
72 | 84 | import org.apache.beam.sdk.values.TypeDescriptors; |
| 85 | +import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.Lists; |
73 | 86 | import org.joda.time.Duration; |
74 | 87 | import org.junit.Rule; |
75 | 88 | import org.junit.Test; |
@@ -547,4 +560,130 @@ public void testFileIoDynamicNaming() throws IOException { |
547 | 560 | "Output file shard 0 exists after pipeline completes", |
548 | 561 | new File(outputFileName + "-0").exists()); |
549 | 562 | } |
| 563 | + |
| 564 | + @Test |
| 565 | + @Category({NeedsRunner.class, UsesUnboundedPCollections.class}) |
| 566 | + public void testWriteUnboundedWithCustomBatchSize() throws IOException { |
| 567 | + File root = tmpFolder.getRoot(); |
| 568 | + List<String> inputs = Arrays.asList("one", "two", "three", "four", "five", "six"); |
| 569 | + |
| 570 | + PTransform<PCollection<String>, PCollection<String>> transform = |
| 571 | + Window.<String>into(FixedWindows.of(Duration.standardSeconds(10))) |
| 572 | + .triggering(AfterWatermark.pastEndOfWindow()) |
| 573 | + .withAllowedLateness(Duration.ZERO) |
| 574 | + .discardingFiredPanes(); |
| 575 | + |
| 576 | + FileIO.Write<Void, String> write = |
| 577 | + FileIO.<String>write() |
| 578 | + .via(TextIO.sink()) |
| 579 | + .to(root.getAbsolutePath()) |
| 580 | + .withPrefix("output") |
| 581 | + .withSuffix(".txt") |
| 582 | + .withAutoSharding() |
| 583 | + .withBatchSize(3) |
| 584 | + .withBatchSizeBytes(1024 * 1024) // Set high to avoid triggering flushing by byte count. |
| 585 | + .withBatchMaxBufferingDuration( |
| 586 | + Duration.standardMinutes(1)); // Set high to avoid triggering flushing by duration. |
| 587 | + |
| 588 | + // Prepare timestamps for the elements. |
| 589 | + List<Long> timestamps = new ArrayList<>(); |
| 590 | + for (long i = 0; i < inputs.size(); i++) { |
| 591 | + timestamps.add(i + 1); |
| 592 | + } |
| 593 | + |
| 594 | + p.apply(Create.timestamped(inputs, timestamps).withCoder(StringUtf8Coder.of())) |
| 595 | + .setIsBoundedInternal(IsBounded.UNBOUNDED) |
| 596 | + .apply(transform) |
| 597 | + .apply(write); |
| 598 | + p.run().waitUntilFinish(); |
| 599 | + |
| 600 | + // Verify that the custom batch parameters are set. |
| 601 | + assertEquals(3, write.getBatchSize().intValue()); |
| 602 | + assertEquals(1024 * 1024, write.getBatchSizeBytes().intValue()); |
| 603 | + assertEquals(Duration.standardMinutes(1), write.getBatchMaxBufferingDuration()); |
| 604 | + |
| 605 | + // Verify file contents. |
| 606 | + checkFileContents(root, "output", inputs); |
| 607 | + |
| 608 | + // With auto-sharding, we can't assert on the exact number of output files, but because |
| 609 | + // batch size is 3 and there are 6 elements, we expect at least 2 files. |
| 610 | + final String pattern = new File(root, "output").getAbsolutePath() + "*"; |
| 611 | + List<Metadata> metadata = |
| 612 | + FileSystems.match(Collections.singletonList(pattern)).get(0).metadata(); |
| 613 | + assertTrue(metadata.size() >= 2); |
| 614 | + } |
| 615 | + |
| 616 | + @Test |
| 617 | + @Category({NeedsRunner.class, UsesUnboundedPCollections.class}) |
| 618 | + public void testWriteUnboundedWithCustomBatchSizeBytes() throws IOException { |
| 619 | + File root = tmpFolder.getRoot(); |
| 620 | + // The elements plus newline characters give a total of 4+4+6+5+5+4=28 bytes. |
| 621 | + List<String> inputs = Arrays.asList("one", "two", "three", "four", "five", "six"); |
| 622 | + // Assign timestamps so that all elements fall into the same 10s window. |
| 623 | + List<Long> timestamps = Arrays.asList(1L, 2L, 3L, 4L, 5L, 6L); |
| 624 | + |
| 625 | + FileIO.Write<Void, String> write = |
| 626 | + FileIO.<String>write() |
| 627 | + .via(TextIO.sink()) |
| 628 | + .to(root.getAbsolutePath()) |
| 629 | + .withPrefix("output") |
| 630 | + .withSuffix(".txt") |
| 631 | + .withAutoSharding() |
| 632 | + .withBatchSize(1000) // Set high to avoid flushing by record count. |
| 633 | + .withBatchSizeBytes(10) |
| 634 | + .withBatchMaxBufferingDuration( |
| 635 | + Duration.standardMinutes(1)); // Set high to avoid flushing by duration. |
| 636 | + |
| 637 | + p.apply(Create.timestamped(inputs, timestamps).withCoder(StringUtf8Coder.of())) |
| 638 | + .setIsBoundedInternal(IsBounded.UNBOUNDED) |
| 639 | + .apply( |
| 640 | + Window.<String>into(FixedWindows.of(Duration.standardSeconds(10))) |
| 641 | + .triggering(AfterWatermark.pastEndOfWindow()) |
| 642 | + .withAllowedLateness(Duration.ZERO) |
| 643 | + .discardingFiredPanes()) |
| 644 | + .apply(write); |
| 645 | + |
| 646 | + p.run().waitUntilFinish(); |
| 647 | + |
| 648 | + // Verify that the custom batch parameters are set. |
| 649 | + assertEquals(1000, write.getBatchSize().intValue()); |
| 650 | + assertEquals(10, write.getBatchSizeBytes().intValue()); |
| 651 | + assertEquals(Duration.standardMinutes(1), write.getBatchMaxBufferingDuration()); |
| 652 | + checkFileContents(root, "output", inputs); |
| 653 | + |
| 654 | + // With auto-sharding, we cannot assert on the exact number of output files. The BatchSizeBytes |
| 655 | + // acts as a threshold for flushing; once buffer size reaches 10 bytes, a flush is triggered, |
| 656 | + // but more items may be added before it completes. With 28 bytes total, we can only guarantee |
| 657 | + // at least 2 files are produced. |
| 658 | + final String pattern = new File(root, "output").getAbsolutePath() + "*"; |
| 659 | + List<Metadata> metadata = |
| 660 | + FileSystems.match(Collections.singletonList(pattern)).get(0).metadata(); |
| 661 | + assertTrue(metadata.size() >= 2); |
| 662 | + } |
| 663 | + |
| 664 | + static void checkFileContents(File rootDir, String prefix, List<String> inputs) |
| 665 | + throws IOException { |
| 666 | + List<File> outputFiles = Lists.newArrayList(); |
| 667 | + final String pattern = new File(rootDir, prefix).getAbsolutePath() + "*"; |
| 668 | + List<Metadata> metadata = |
| 669 | + FileSystems.match(Collections.singletonList(pattern)).get(0).metadata(); |
| 670 | + for (Metadata meta : metadata) { |
| 671 | + outputFiles.add(new File(meta.resourceId().toString())); |
| 672 | + } |
| 673 | + assertFalse("Should have produced at least 1 output file", outputFiles.isEmpty()); |
| 674 | + |
| 675 | + List<String> actual = Lists.newArrayList(); |
| 676 | + for (File outputFile : outputFiles) { |
| 677 | + List<String> actualShard = Lists.newArrayList(); |
| 678 | + try (BufferedReader reader = |
| 679 | + Files.newBufferedReader(outputFile.toPath(), StandardCharsets.UTF_8)) { |
| 680 | + String line; |
| 681 | + while ((line = reader.readLine()) != null) { |
| 682 | + actualShard.add(line); |
| 683 | + } |
| 684 | + } |
| 685 | + actual.addAll(actualShard); |
| 686 | + } |
| 687 | + assertThat(actual, containsInAnyOrder(inputs.toArray())); |
| 688 | + } |
550 | 689 | } |
0 commit comments