1- // VersionID format:
1+ // Hex VersionID format:
22// timestamp sequential_position rep_group_id other_information
33// where:
44// - timestamp 14 bytes epoch in ms (good until 5138)
55// - sequential_position 06 bytes position in the ms slot (1B ops)
66// - rep_group_id 07 bytes replication group identifier
77// - other_information arbitrary user input, such as a unique string
8+ //
9+ // Base62 VersionID format:
10+ // timestamp sequential_position rep_group_id instance_id version_id_format
11+ // where:
12+ // - timestamp 14 bytes epoch in ms
13+ // - sequential_position 06 bytes position in the ms slot
14+ // - rep_group_id 07 bytes replication group identifier
15+ // - instance_id 06 bytes unique instance identifier (optional)
16+ // - version_id_format 02 bytes version ID format marker + version
817
918import base62Integer from 'base62' ;
1019import baseX from 'base-x' ;
20+ import assert from 'assert' ;
21+ import { VersioningConstants } from './constants' ;
1122const BASE62 = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' ;
1223const base62String = baseX ( BASE62 ) ;
1324
// Lengths, in bytes, of the fixed-width components of a versionId.
const LENGTH_TS = 14; // timestamp: epoch in ms
const LENGTH_SEQ = 6; // position in the ms slot
const LENGTH_RG = 7; // replication group id
const LENGTH_ID = 6; // instance id, can be empty
const LENGTH_FT = 2; // version ID format: format marker + format version

// Padding templates for the fixed-width components: each one is a run of
// '0' characters exactly as long as the component it pads.
const TEMPLATE_TS = '0'.repeat(LENGTH_TS);
const TEMPLATE_SEQ = '0'.repeat(LENGTH_SEQ);
const TEMPLATE_RG = '0'.repeat(LENGTH_RG);
const TEMPLATE_ID = '0'.repeat(LENGTH_ID);
2337
24- // Counter that is increased after each call to generateUniqueVersionId
25- export let uidCounter = 0 ;
26- export const versionIdSeed = getVersionIdSeed ( ) ;
// Value of the S3_VERSION_ID_ENCODING_TYPE environment variable, read once
// when this module is loaded; undefined when the variable is not set.
export const S3_VERSION_ID_ENCODING_TYPE = process.env.S3_VERSION_ID_ENCODING_TYPE;

// Format version written after the format marker at the end of base62
// version IDs.
const versionIDFormat = '1';

/**
 * Tell whether the legacy hex encoding is selected for version IDs.
 *
 * Hex is selected when S3_VERSION_ID_ENCODING_TYPE is exactly 'hex', and
 * also when it is unset or empty.
 *
 * @returns true if the hex encoding type is in use, false otherwise
 */
export function isHexEncodingType(): boolean {
    return S3_VERSION_ID_ENCODING_TYPE === 'hex' || !S3_VERSION_ID_ENCODING_TYPE;
}
2751
2852/**
2953 * Left-pad a string representation of a value with a given template.
@@ -89,23 +113,6 @@ function wait(span: number) {
89113 }
90114}
91115
92- export function getVersionIdSeed ( ) : string {
93- // The HOSTNAME environment variable is set by default by Kubernetes
94- // and populated with the pod name, containing a suffix with a unique id
95- // as a string.
96- // By default, we rely on the pid, to account for multiple workers in
97- // cluster mode. As a result, the unique id is either <pod-suffix>.<pid>
98- // or <pid>.
99- // If unique vID are needed in a multi cluster mode architecture (i.e.,
100- // multiple server instances, each with multiple workers), the
101- // HOSTNAME environment variable can be set.
102- return `${ process . env . HOSTNAME ?. split ( '-' ) . pop ( ) || '' } ${ process . pid } ` ;
103- }
104-
105- export function generateUniqueVersionId ( replicationGroupId : string ) : string {
106- return generateVersionId ( `${ versionIdSeed } .${ uidCounter ++ } ` , replicationGroupId ) ;
107- }
108-
109116/**
110117 * This function returns a "versionId" string indicating the current time as a
111118 * combination of the current time in millisecond, the position of the request
@@ -122,6 +129,21 @@ export function generateVersionId(info: string, replicationGroupId: string): str
122129 // replication group ID, like PARIS; will be trimmed if exceed LENGTH_RG
123130 const repGroupId = padRight ( replicationGroupId , TEMPLATE_RG ) ;
124131
132+ let otherInfo = '' ;
133+ let instanceIdPadded = '' ;
134+ let vidFormat = '' ;
135+
136+ if ( isHexEncodingType ( ) ) {
137+ // In HEX encoding, the full info data is used.
138+ otherInfo = info ;
139+ } else {
140+ // In base62, info is for the instance ID and is trimmed/padded.
141+ instanceIdPadded = padRight ( info , TEMPLATE_ID ) ;
142+ // versionID format, 2 bytes
143+ vidFormat = VersioningConstants . VersionId . FormatMarker + versionIDFormat ;
144+ assert ( vidFormat . length === LENGTH_FT , `versionID format must be ${ LENGTH_FT } bytes` ) ;
145+ }
146+
125147 // Need to wait for the millisecond slot to get "flushed". We wait for
126148 // only a single millisecond when the module is restarted, which is
127149 // necessary for the correctness of the system. This is therefore cheap.
@@ -141,13 +163,6 @@ export function generateVersionId(info: string, replicationGroupId: string): str
141163 lastSeq = lastTimestamp === ts ? lastSeq + 1 : 0 ;
142164 lastTimestamp = ts ;
143165
144- // if S3_VERSION_ID_ENCODING_TYPE is "hex", info is used.
145- if ( process . env . S3_VERSION_ID_ENCODING_TYPE === 'hex' || ! process . env . S3_VERSION_ID_ENCODING_TYPE ) {
146- // info field stays as is
147- } else {
148- info = '' ; // eslint-disable-line
149- }
150-
151166 // In the default cases, we reverse the chronological order of the
152167 // timestamps so that all versions of an object can be retrieved in the
153168 // reversed chronological order---newest versions first. This is because of
@@ -156,7 +171,9 @@ export function generateVersionId(info: string, replicationGroupId: string): str
156171 padLeft ( MAX_TS - lastTimestamp , TEMPLATE_TS ) +
157172 padLeft ( MAX_SEQ - lastSeq , TEMPLATE_SEQ ) +
158173 repGroupId +
159- info
174+ otherInfo +
175+ instanceIdPadded +
176+ vidFormat
160177 ) ;
161178}
162179
@@ -269,6 +286,30 @@ export function base62Decode(str: string): string | Error {
269286export const ENC_TYPE_HEX = 0 ; // legacy (large) encoding
270287export const ENC_TYPE_BASE62 = 1 ; // new (tiny) encoding
271288
/**
 * Tell whether a versionId ends with the format marker followed by the given
 * format version.
 *
 * Only the tail of the string is inspected: the marker is expected at index
 * `versionId.length - LENGTH_FT` and the format version is everything after
 * it, so the check does not scan the whole versionId.
 *
 * @param versionId - the versionId string to inspect
 * @param version - the format version expected after the marker
 * @returns true when the marker sits at the expected position and the
 *   characters following it equal `version`, false otherwise
 */
function hasVersionIDFormat(versionId: string, version: string): boolean {
    const markerPos = versionId.length - LENGTH_FT;
    // charAt() yields '' for an out-of-range index, so a versionId shorter
    // than LENGTH_FT fails the marker comparison (assuming the marker itself
    // is not an empty string).
    const markerFound =
        versionId.charAt(markerPos) === VersioningConstants.VersionId.FormatMarker;
    return markerFound && versionId.substring(markerPos + 1) === version;
}
308+
// Length of a raw legacy base62 version ID: timestamp + sequential position
// + replication group id, without any 'info' field.
const LEGACY_BASE62_DECODED_LENGTH = 27;
// Length of a raw base62 version ID in the current format, which appends the
// instance id and the 2-byte format marker + version to the legacy layout.
const BASE62_DECODED_LENGTH = 35;
// Length of a base62 version ID once encoded.
const BASE62_ENCODED_LENGTH = 32;

/**
 * Encode a versionId to obscure internal information contained
 * in a version ID.
 *
 * Base62 is used for the two fixed-length layouts only: legacy IDs of
 * LEGACY_BASE62_DECODED_LENGTH characters, and IDs of BASE62_DECODED_LENGTH
 * characters that end with the format marker and current format version.
 * Every other input is hex-encoded.
 *
 * @param str - the raw versionId to encode
 * @return - the encoded versionId
 */
export function encode(str: string): string {
    const isLegacyBase62 = str.length === LEGACY_BASE62_DECODED_LENGTH;
    // The length check keeps a hex-format ID whose free-form info happens to
    // end with the format marker + version from being mistaken for base62.
    const isCurrentBase62 = str.length === BASE62_DECODED_LENGTH
        && hasVersionIDFormat(str, versionIDFormat);
    if (isLegacyBase62 || isCurrentBase62) {
        return base62Encode(str);
    }
    // legacy (hex) format
    return hexEncode(str);
}
@@ -294,15 +336,19 @@ export function encode(str: string): string {
294336 */
295337export function decode ( str : string ) : string | Error {
296338 // default format is exactly 32 characters when encoded
297- if ( str . length === 32 ) {
339+ if ( str . length === BASE62_ENCODED_LENGTH ) {
298340 const decoded : string | Error = base62Decode ( str ) ;
299- if ( typeof decoded === 'string' && decoded . length !== 27 ) {
300- return new Error ( `decoded ${ str } is not length 27` ) ;
341+ // Legacy base62 version IDs (without 'info' field) are always 27 characters long.
342+ // The new base62 format is always 35 characters long.
343+ if ( typeof decoded === 'string' &&
344+ ! [ LEGACY_BASE62_DECODED_LENGTH , BASE62_DECODED_LENGTH ] . includes ( decoded . length ) ) {
345+ return new Error ( `decoded ${ str } is not length ` +
346+ `${ LEGACY_BASE62_DECODED_LENGTH } or ${ BASE62_DECODED_LENGTH } ` ) ;
301347 }
302348 return decoded ;
303349 }
304350 // legacy format
305- if ( str . length > 32 ) {
351+ if ( str . length > BASE62_ENCODED_LENGTH ) {
306352 return hexDecode ( str ) ;
307353 }
308354 return new Error ( `cannot decode str ${ str . length } ` ) ;
0 commit comments