@@ -31,6 +31,7 @@ type HTMLDocumentMetadata = {
3131 twitter : Record < string , string [ ] > ;
3232 favicon ?: string ;
3333 documentFingerprint ?: string ;
34+ version ?: number ;
3435} ;
3536
3637/**
@@ -96,9 +97,80 @@ export class HTMLMetadata {
9697 metadata . documentFingerprint = dcLink . href ;
9798 }
9899
100+ const version = this . _getVersion ( metadata ) ;
101+
102+ if ( version !== null ) {
103+ metadata . version = version ;
104+ }
105+
99106 return metadata ;
100107 }
101108
/**
 * Determine the document version advertised by the page metadata.
 *
 * Three sources are consulted, in order: the Highwire `id` values, the
 * Highwire `public_url` values, and the `href` of the `rel="canonical"`
 * link. Each source must yield a version (a trailing `v1` … `v25`), and all
 * three must yield the same number.
 *
 * @param metadata - Parsed document metadata to inspect.
 * @returns The agreed version, or `null` if any source has no version or
 *   the sources disagree.
 */
private _getVersion(metadata: HTMLDocumentMetadata): number | null {
  const sources = [
    metadata.highwire.id ?? [],
    metadata.highwire.public_url ?? [],
    [metadata.link.find(link => link.rel === 'canonical')?.href],
  ];

  let agreed: number | null = null;

  for (const uris of sources) {
    const found = this._findVersion(uris);

    // Bail out when this source has no version, or when it contradicts
    // the version established by an earlier source.
    if (!found || (agreed !== null && agreed !== found)) {
      return null;
    }

    agreed = found;
  }

  return agreed;
}
140+
/**
 * Scan a list of URI strings and report the first version found.
 *
 * Entries are checked in array order; the first entry that yields a
 * version wins, and later entries are not examined.
 *
 * @param uris - Candidate URIs; `undefined` entries are permitted.
 * @returns The first version found, or `null` when no entry carries one.
 */
private _findVersion(uris: (string | undefined)[]): number | null {
  for (let i = 0; i < uris.length; i++) {
    const parsed = this._extractVersionFromUri(uris[i]);
    if (parsed === null) {
      continue;
    }
    return parsed;
  }
  return null;
}
155+
/**
 * Extract a version number from a URI that ends with `v1`, `v2`, ... `v25`.
 *
 * Surrounding whitespace is trimmed before matching. The `v` is matched
 * case-insensitively and must not be directly preceded by an ASCII letter,
 * so a word that merely ends in "v<digits>" (e.g. `rev2`) is not treated as
 * a version suffix. Leading zeros in the digits are accepted (`v02` is 2).
 *
 * @param uri - The URI to inspect, or `undefined` when the source is absent.
 * @returns The version when it lies between 1 and 25 inclusive, otherwise
 *   `null` (including when `uri` is `undefined` or has no version suffix).
 */
private _extractVersionFromUri(uri: string | undefined): number | null {
  if (uri === undefined) {
    return null;
  }

  // Negative lookbehind rejects a `v` that is the tail of a longer word;
  // `$` anchors the digits to the very end of the trimmed URI.
  const match = uri.trim().match(/(?<![a-zA-Z])v(\d+)$/i);
  if (!match) {
    return null;
  }

  // match[1] holds the digits captured by (\d+); radix 10 forces decimal parsing.
  const version = parseInt(match[1], 10);

  // Only versions in the range 1..25 are considered valid.
  return version >= 1 && version <= 25 ? version : null;
}
173+
102174 /**
103175 * Return an array of all the `content` values of `<meta>` tags on the page
104176 * where the value of the attribute begins with `<prefix>`.
0 commit comments