1- // VersionID format:
1+ // Hex VersionID format:
22// timestamp sequential_position rep_group_id other_information
33// where:
44// - timestamp 14 bytes epoch in ms (good until 5138)
55// - sequential_position 06 bytes position in the ms slot (1B ops)
66// - rep_group_id 07 bytes replication group identifier
77// - other_information arbitrary user input, such as a unique string
8+ //
9+ // Base62 VersionID format:
10+ // timestamp sequential_position rep_group_id instance_id version_id_format
11+ // where:
12+ // - timestamp 14 bytes epoch in ms
13+ // - sequential_position 06 bytes position in the ms slot
14+ // - rep_group_id 07 bytes replication group identifier
15+ // - instance_id 06 bytes unique instance identifier (optional)
16+ // - version_id_format 02 bytes version ID format marker + version
817
918import base62Integer from 'base62' ;
1019import baseX from 'base-x' ;
20+ import assert from 'assert' ;
21+ import { VersioningConstants } from './constants' ;
1122const BASE62 = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' ;
1223const base62String = baseX ( BASE62 ) ;
1324
1425// the lengths of the components in bytes
1526const LENGTH_TS = 14 ; // timestamp: epoch in ms
1627const LENGTH_SEQ = 6 ; // position in ms slot
1728const LENGTH_RG = 7 ; // replication group id
29+ const LENGTH_ID = 6 ; // instance id, can be empty
30+ const LENGTH_FT = 2 ; // version ID format, 1 byte + separator
1831
1932// empty string template for the variables in a versionId
2033const TEMPLATE_TS = new Array ( LENGTH_TS + 1 ) . join ( '0' ) ;
2134const TEMPLATE_SEQ = new Array ( LENGTH_SEQ + 1 ) . join ( '0' ) ;
22- const TEMPLATE_RG = new Array ( LENGTH_RG + 1 ) . join ( ' ' ) ;
35+ const TEMPLATE_RG = new Array ( LENGTH_RG + 1 ) . join ( '0' ) ;
36+ const TEMPLATE_ID = new Array ( LENGTH_ID + 1 ) . join ( '0' ) ;
2337
2438export const S3_VERSION_ID_ENCODING_TYPE = process . env . S3_VERSION_ID_ENCODING_TYPE ;
2539
26- // Counter that is increased after each call to generateUniqueVersionId
27- export let uidCounter = 0 ;
28- export const versionIdSeed = getVersionIdSeed ( ) ;
40+ const versionIDFormat = '1' ;
41+
/**
 * Tell whether version IDs use the legacy hex encoding.
 *
 * Hex is selected when S3_VERSION_ID_ENCODING_TYPE is exactly 'hex'. It is
 * also the fallback when the variable is unset or empty, so deployments
 * that never configured an encoding type keep producing hex version IDs.
 *
 * @returns true if hex encoding type is used, false otherwise
 */
export function isHexEncodingType(): boolean {
    // unset/empty is checked first: it is the default (legacy) situation
    return !S3_VERSION_ID_ENCODING_TYPE || S3_VERSION_ID_ENCODING_TYPE === 'hex';
}
2951
3052/**
3153 * Left-pad a string representation of a value with a given template.
@@ -91,23 +113,6 @@ function wait(span: number) {
91113 }
92114}
93115
94- export function getVersionIdSeed ( ) : string {
95- // The HOSTNAME environment variable is set by default by Kubernetes
96- // and populated with the pod name, containing a suffix with a unique id
97- // as a string.
98- // By default, we rely on the pid, to account for multiple workers in
99- // cluster mode. As a result, the unique id is either <pod-suffix>.<pid>
100- // or <pid>.
101- // If unique vID are needed in a multi cluster mode architecture (i.e.,
102- // multiple server instances, each with multiple workers), the
103- // HOSTNAME environment variable can be set.
104- return `${ process . env . HOSTNAME ?. split ( '-' ) . pop ( ) || '' } ${ process . pid } ` ;
105- }
106-
107- export function generateUniqueVersionId ( replicationGroupId : string ) : string {
108- return generateVersionId ( `${ versionIdSeed } .${ uidCounter ++ } ` , replicationGroupId ) ;
109- }
110-
111116/**
112117 * This function returns a "versionId" string indicating the current time as a
113118 * combination of the current time in millisecond, the position of the request
@@ -124,6 +129,21 @@ export function generateVersionId(info: string, replicationGroupId: string): str
124129 // replication group ID, like PARIS; will be trimmed if exceed LENGTH_RG
125130 const repGroupId = padRight ( replicationGroupId , TEMPLATE_RG ) ;
126131
132+ let otherInfo = '' ;
133+ let instanceIdPadded = '' ;
134+ let vidFormat = '' ;
135+
136+ if ( isHexEncodingType ( ) ) {
137+ // In HEX encoding, the full info data is used.
138+ otherInfo = info ;
139+ } else {
140+ // In base62, info is for the instance ID and is trimmed/padded.
141+ instanceIdPadded = padRight ( info , TEMPLATE_ID ) ;
142+ // versionID format, 2 bytes
143+ vidFormat = VersioningConstants . VersionId . FormatMarker + versionIDFormat ;
144+ assert ( vidFormat . length === LENGTH_FT , `versionID format must be ${ LENGTH_FT } bytes` ) ;
145+ }
146+
127147 // Need to wait for the millisecond slot got "flushed". We wait for
128148 // only a single millisecond when the module is restarted, which is
129149 // necessary for the correctness of the system. This is therefore cheap.
@@ -143,13 +163,6 @@ export function generateVersionId(info: string, replicationGroupId: string): str
143163 lastSeq = lastTimestamp === ts ? lastSeq + 1 : 0 ;
144164 lastTimestamp = ts ;
145165
146- // if S3_VERSION_ID_ENCODING_TYPE is "hex", info is used.
147- if ( S3_VERSION_ID_ENCODING_TYPE === 'hex' || ! S3_VERSION_ID_ENCODING_TYPE ) {
148- // info field stays as is
149- } else {
150- info = '' ;
151- }
152-
153166 // In the default cases, we reverse the chronological order of the
154167 // timestamps so that all versions of an object can be retrieved in the
155168 // reversed chronological order---newest versions first. This is because of
@@ -158,7 +171,9 @@ export function generateVersionId(info: string, replicationGroupId: string): str
158171 padLeft ( MAX_TS - lastTimestamp , TEMPLATE_TS ) +
159172 padLeft ( MAX_SEQ - lastSeq , TEMPLATE_SEQ ) +
160173 repGroupId +
161- info
174+ otherInfo +
175+ instanceIdPadded +
176+ vidFormat
162177 ) ;
163178}
164179
@@ -271,6 +286,30 @@ export function base62Decode(str: string): string | Error {
271286export const ENC_TYPE_HEX = 0 ; // legacy (large) encoding
272287export const ENC_TYPE_BASE62 = 1 ; // new (tiny) encoding
273288
/**
 * Tell whether a versionId ends with the format marker followed by the
 * given format version.
 *
 * Only the tail of the string is inspected: the marker is expected at
 * offset `versionId.length - LENGTH_FT`, and everything after it must be
 * equal to `version`. No scan of the whole string is performed.
 *
 * @param versionId - The versionId string to check.
 * @param version - The expected format version.
 * @returns true if the marker sits at the expected offset and is followed
 * by exactly `version`, false otherwise.
 */
function hasVersionIDFormat(versionId: string, version: string): boolean {
    const markerPos = versionId.length - LENGTH_FT;
    // charAt() yields '' for an out-of-range offset, so strings shorter
    // than LENGTH_FT simply fail the marker comparison.
    const markerFound =
        versionId.charAt(markerPos) === VersioningConstants.VersionId.FormatMarker;
    return markerFound && versionId.substring(markerPos + 1) === version;
}
308+
309+ const LEGACY_BASE62_DECODED_LENGTH = 27 ; // raw legacy base62 ID (no 'info' field); matches LENGTH_TS + LENGTH_SEQ + LENGTH_RG
310+ const BASE62_DECODED_LENGTH = 35 ; // raw current base62 ID; matches the legacy fields + LENGTH_ID + LENGTH_FT
311+ const BASE62_ENCODED_LENGTH = 32 ; // length of an encoded string that decode() treats as base62
312+
274313/**
275314 * Encode a versionId to obscure internal information contained
276315 * in a version ID.
@@ -279,8 +318,9 @@ export const ENC_TYPE_BASE62 = 1; // new (tiny) encoding
279318 * @return - the encoded versionId
280319 */
export function encode(str: string): string {
    // Two raw layouts are base62-encoded:
    //  - legacy base62 IDs (no 'info' field), always 27 characters long;
    //  - current base62 IDs, always 35 characters long and terminated by
    //    the format marker followed by the format version.
    // The current layout is recognized by BOTH its length and its marker.
    // Hex version IDs carry arbitrary user-provided info at their end, so
    // the marker alone could match by accident; such an ID would then be
    // base62-encoded even though decode() only accepts base62 payloads
    // that decode to 27 or 35 characters.
    const isLegacyBase62 = str.length === LEGACY_BASE62_DECODED_LENGTH;
    const isCurrentBase62 = str.length === BASE62_DECODED_LENGTH
        && hasVersionIDFormat(str, versionIDFormat);
    if (isLegacyBase62 || isCurrentBase62) {
        return base62Encode(str);
    }
    // anything else is a legacy hex version ID
    return hexEncode(str);
}
@@ -296,15 +336,19 @@ export function encode(str: string): string {
296336 */
export function decode(str: string): string | Error {
    const encodedLength = str.length;

    // shorter than any supported encoding: nothing can be decoded
    if (encodedLength < BASE62_ENCODED_LENGTH) {
        return new Error(`cannot decode str ${str.length}`);
    }

    // longer than the base62 encoding: legacy hex format
    if (encodedLength > BASE62_ENCODED_LENGTH) {
        return hexDecode(str);
    }

    // exactly the base62 encoded length
    const result: string | Error = base62Decode(str);
    if (typeof result !== 'string') {
        // decoding errors are handed back to the caller untouched
        return result;
    }

    // Legacy base62 version IDs (without 'info' field) decode to 27
    // characters, the current base62 format decodes to 35 characters;
    // any other decoded length is rejected.
    const lengthIsKnown = result.length === LEGACY_BASE62_DECODED_LENGTH
        || result.length === BASE62_DECODED_LENGTH;
    if (lengthIsKnown) {
        return result;
    }
    return new Error(`decoded ${str} is not length ` +
        `${LEGACY_BASE62_DECODED_LENGTH} or ${BASE62_DECODED_LENGTH}`);
}
0 commit comments