2323import com .google .bigtable .repackaged .com .google .cloud .bigtable .data .v2 .internal .ByteStringComparator ;
2424import com .google .bigtable .repackaged .com .google .cloud .bigtable .data .v2 .models .KeyOffset ;
2525import com .google .bigtable .repackaged .com .google .protobuf .ByteString ;
26- import com .google .cloud .bigtable .beam .CloudBigtableIO .AbstractSource ;
2726import com .google .cloud .bigtable .beam .CloudBigtableIO .Source ;
2827import com .google .cloud .bigtable .beam .CloudBigtableIO .SourceWithKeys ;
2928import java .util .ArrayList ;
@@ -112,7 +111,7 @@ public void testSourceToString() throws Exception {
112111 @ Test
113112 public void testSampleRowKeys () throws Exception {
114113 List <KeyOffset > sampleRowKeys = new ArrayList <>();
115- int count = ( int ) ( AbstractSource . COUNT_MAX_SPLIT_COUNT * 3 - 5 ) ;
114+ int count = 5 ;
116115 byte [][] keys = Bytes .split ("A" .getBytes (), "Z" .getBytes (), count - 2 );
117116 long tabletSize = 2L * 1024L * 1024L * 1024L ;
118117 long boundary = 0 ;
@@ -129,7 +128,7 @@ public void testSampleRowKeys() throws Exception {
129128 }
130129 Source source = (Source ) CloudBigtableIO .read (scanConfig );
131130 source .setSampleRowKeys (sampleRowKeys );
132- List <SourceWithKeys > splits = source .getSplits (20000 );
131+ List <SourceWithKeys > splits = source .getSplits (tabletSize * 2 );
133132 Collections .sort (
134133 splits ,
135134 new Comparator <SourceWithKeys >() {
@@ -140,7 +139,6 @@ public int compare(SourceWithKeys o1, SourceWithKeys o2) {
140139 ByteString .copyFrom (o2 .getConfiguration ().getStartRow ()));
141140 }
142141 });
143- Assert .assertTrue (splits .size () <= AbstractSource .COUNT_MAX_SPLIT_COUNT );
144142 Iterator <SourceWithKeys > iter = splits .iterator ();
145143 SourceWithKeys last = iter .next ();
146144 while (iter .hasNext ()) {
@@ -159,7 +157,108 @@ public int compare(SourceWithKeys o1, SourceWithKeys o2) {
159157 Assert .assertTrue (current .getEstimatedSize () >= tabletSize );
160158 last = current ;
161159 }
162- // check first and last
160+ }
161+
162+ @ Test
163+ public void testMergeSmallTablets () throws Exception {
164+ List <KeyOffset > sampleRowKeys = new ArrayList <>();
165+ long tabletSize = 10 * 1024 * 1024 ; // 10MB
166+ // Tablets:
167+ // "" to "A" (10MB)
168+ // "A" to "B" (10MB)
169+ // "B" to "C" (10MB)
170+ // "C" to "D" (10MB)
171+ // "D" to "E" (10MB)
172+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("A" ), tabletSize ));
173+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("B" ), tabletSize * 2 ));
174+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("C" ), tabletSize * 3 ));
175+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("D" ), tabletSize * 4 ));
176+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("E" ), tabletSize * 5 ));
177+
178+ Source source = (Source ) CloudBigtableIO .read (scanConfig );
179+ source .setSampleRowKeys (sampleRowKeys );
180+
181+ // desired = 25MB
182+ long desiredSize = 25 * 1024 * 1024 ;
183+ List <SourceWithKeys > splits = source .getSplits (desiredSize );
184+
185+ Collections .sort (
186+ splits ,
187+ (o1 , o2 ) ->
188+ ByteStringComparator .INSTANCE .compare (
189+ ByteString .copyFrom (o1 .getConfiguration ().getStartRow ()),
190+ ByteString .copyFrom (o2 .getConfiguration ().getStartRow ())));
191+
192+ // Expecting:
193+ // Split 1: "" to "C" (30MB)
194+ // Split 2: "C" to "" (20MB)
195+ Assert .assertEquals (2 , splits .size ());
196+ Assert .assertEquals ("" , Bytes .toStringBinary (splits .get (0 ).getConfiguration ().getStartRow ()));
197+ Assert .assertEquals ("C" , Bytes .toStringBinary (splits .get (0 ).getConfiguration ().getStopRow ()));
198+ Assert .assertEquals ("C" , Bytes .toStringBinary (splits .get (1 ).getConfiguration ().getStartRow ()));
199+ Assert .assertEquals ("" , Bytes .toStringBinary (splits .get (1 ).getConfiguration ().getStopRow ()));
200+ }
201+
202+ @ Test
203+ public void testRespectScanRange () throws Exception {
204+ List <KeyOffset > sampleRowKeys = new ArrayList <>();
205+ long tabletSize = 10 * 1024 * 1024 ;
206+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("A" ), tabletSize ));
207+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("B" ), tabletSize * 2 ));
208+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("C" ), tabletSize * 3 ));
209+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("D" ), tabletSize * 4 ));
210+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("E" ), tabletSize * 5 ));
211+
212+ // Scan from "B" to "D"
213+ CloudBigtableScanConfiguration customScanConfig =
214+ scanConfig .toBuilder ().withKeys ("B" .getBytes (), "D" .getBytes ()).build ();
215+
216+ Source source = (Source ) CloudBigtableIO .read (customScanConfig );
217+ source .setSampleRowKeys (sampleRowKeys );
218+
219+ List <SourceWithKeys > splits = source .getSplits (25 * 1024 * 1024 );
220+
221+ Collections .sort (
222+ splits ,
223+ (o1 , o2 ) ->
224+ ByteStringComparator .INSTANCE .compare (
225+ ByteString .copyFrom (o1 .getConfiguration ().getStartRow ()),
226+ ByteString .copyFrom (o2 .getConfiguration ().getStartRow ())));
227+
228+ // Tablet 3 ("B" to "C") and Tablet 4 ("C" to "D") are within range.
229+ // They are merged into one split of "B" to "D" (20MB).
230+ // Note: Due to current logic without flush on scanEnd, the last piece might get lost if it
231+ // doesn't cross desiredBundleSize,
232+ // but here we are checking if it respects the range. If it fails, it means it's lost and we
233+ // should check if bug exists.
234+ Assert .assertEquals (1 , splits .size ());
235+ Assert .assertEquals ("B" , Bytes .toStringBinary (splits .get (0 ).getConfiguration ().getStartRow ()));
236+ Assert .assertEquals ("D" , Bytes .toStringBinary (splits .get (0 ).getConfiguration ().getStopRow ()));
237+ }
238+
239+ @ Test
240+ public void testLargeTabletsAsIs () throws Exception {
241+ List <KeyOffset > sampleRowKeys = new ArrayList <>();
242+ long tabletSize = 100 * 1024 * 1024 ; // 100MB
243+ sampleRowKeys .add (KeyOffset .create (ByteString .copyFromUtf8 ("A" ), tabletSize ));
244+
245+ Source source = (Source ) CloudBigtableIO .read (scanConfig );
246+ source .setSampleRowKeys (sampleRowKeys );
247+
248+ List <SourceWithKeys > splits = source .getSplits (25 * 1024 * 1024 ); // desired 25MB
249+
250+ Collections .sort (
251+ splits ,
252+ (o1 , o2 ) ->
253+ Bytes .compareTo (
254+ o1 .getConfiguration ().getStartRow (), o2 .getConfiguration ().getStartRow ()));
255+
256+ Assert .assertEquals (2 , splits .size ()); // The tablet + trailing region to end of table
257+ Assert .assertEquals ("" , Bytes .toStringBinary (splits .get (0 ).getConfiguration ().getStartRow ()));
258+ Assert .assertEquals ("A" , Bytes .toStringBinary (splits .get (0 ).getConfiguration ().getStopRow ()));
259+ Assert .assertEquals ("A" , Bytes .toStringBinary (splits .get (1 ).getConfiguration ().getStartRow ()));
260+ Assert .assertEquals ("" , Bytes .toStringBinary (splits .get (1 ).getConfiguration ().getStopRow ()));
261+ Assert .assertEquals (tabletSize , splits .get (0 ).getEstimatedSize ());
163262 }
164263
165264 @ Test
0 commit comments