Skip to content

Commit 7e2d0c8

Browse files
jcastro-dotcmsoidacraerickgonzalez
authored
task(content analytics) #34661 : Replace languageId with languageIso in Analytics Payload and Database Schema (#34785)
## Summary Replaces `language_id` / `userlanguage` with `locale_id` across the Content Analytics pipeline to standardize language tracking using ISO locale codes instead of internal numeric IDs. ### Changes include: - ClickHouse Schema (`init.sql`): Adds a new column named `locale_id` to the `events` table; updates the `session_states`, `session_facts`, and `sessions_by_language_daily` tables to use `locale_id` with `LowCardinality(String)` type. Unknown/missing values now default to '' (undefined) instead of '0'. - Materialized Views: Updates `session_states_mv` and the `sessions_by_language_daily` rollup to reference `locale_id` throughout aggregation and grouping logic. - Cube Schemas: Renames the `languageId` dimension to `localeId` (with sql: `locale_id`) in both `Request.js` and `SessionsByLanguageDaily.js`. - Analytics Payload Validator: Updates `pageview.json` to accept `locale_id` (now a required field) instead of `language_id`. ## Test plan - Verify `pageview` events are ingested with the `locale_id` field populated correctly (e.g., `en-us`, `es-es`, `fr-fr`) - Confirm `session_facts` and `sessions_by_language_daily` tables reflect the renamed column after re-materialization - Validate that Cube queries on the `localeId` dimension return expected results Closes #34661 --------- Co-authored-by: Arcadio Quintero <oidacra@gmail.com> Co-authored-by: erickgonzalez <erick.gonzalez@dotcms.com>
1 parent 6ea7cd4 commit 7e2d0c8

11 files changed

Lines changed: 2140 additions & 2104 deletions

File tree

core-web/libs/portlets/dot-analytics/data-access/src/lib/types/cubequery.types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ const DimensionField = {
8484
TOP_ATTRIBUTED_CONTENT: 'topAttributedContent',
8585
DEVICE_CATEGORY: 'deviceCategory',
8686
BROWSER_FAMILY: 'browserFamily',
87-
LANGUAGE_ID: 'languageId'
87+
LOCALE_ID: 'localeId'
8888
} as const;
8989

9090
export type DimensionField = (typeof DimensionField)[keyof typeof DimensionField];

core-web/libs/portlets/dot-analytics/data-access/src/lib/types/entities.types.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ export interface SessionsByBrowserDailyEntity {
237237
* SessionsByLanguageDaily cube entity (one row per language).
238238
*/
239239
export interface SessionsByLanguageDailyEntity {
240-
'SessionsByLanguageDaily.languageId'?: string;
240+
'SessionsByLanguageDaily.localeId'?: string;
241241
'SessionsByLanguageDaily.engagedSessions'?: string;
242242
'SessionsByLanguageDaily.totalSessions'?: string;
243243
'SessionsByLanguageDaily.avgEngagedSessionTimeSeconds'?: string;

core-web/libs/sdk/analytics/src/lib/core/shared/models/data.model.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,8 +105,8 @@ export type DotCMSEventPageData = Pick<
105105
> & {
106106
/** Page title */
107107
title: string | undefined;
108-
/** Language identifier */
109-
language_id?: string;
108+
/** Locale identifier (e.g., es-es, en-us) */
109+
locale_id: string;
110110
/** Persona identifier */
111111
persona?: string;
112112
};

core-web/libs/sdk/analytics/src/lib/core/shared/utils/dot-analytics.utils.spec.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,7 +1044,7 @@ describe('Analytics Utils', () => {
10441044
}
10451045
},
10461046
properties: {
1047-
language_id: 'en-US',
1047+
locale_id: 'en-US',
10481048
persona: 'default',
10491049
url: 'https://example.com/page',
10501050
title: 'Test Page',
@@ -1071,8 +1071,7 @@ describe('Analytics Utils', () => {
10711071
doc_host: 'example.com',
10721072
doc_path: '/page',
10731073
title: 'Test Page',
1074-
language_id: undefined,
1075-
persona: undefined
1074+
locale_id: 'es-es'
10761075
},
10771076
local_time: expect.stringMatching(
10781077
/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}$/
@@ -1081,7 +1080,7 @@ describe('Analytics Utils', () => {
10811080
source: 'google'
10821081
},
10831082
custom: {
1084-
language_id: 'en-US',
1083+
locale_id: 'en-US',
10851084
persona: 'default',
10861085
utm: {
10871086
source: 'google'
@@ -1117,7 +1116,7 @@ describe('Analytics Utils', () => {
11171116
}
11181117
},
11191118
properties: {
1120-
language_id: 'en-US',
1119+
locale_id: 'en-US',
11211120
persona: 'default',
11221121
title: 'Test Page',
11231122
width: 1024,
@@ -1129,6 +1128,7 @@ describe('Analytics Utils', () => {
11291128

11301129
expect(result).not.toHaveProperty('utm');
11311130
expect(result.context.device).toBeDefined();
1131+
expect(result.page.locale_id).toBe('es-es');
11321132
});
11331133
});
11341134
});

core-web/libs/sdk/analytics/src/lib/core/shared/utils/dot-analytics.utils.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,9 @@ export const enrichPagePayloadOptimized = (
617617
}
618618
});
619619

620+
// TODO: Revisit once the page's locale_id can be detected outside of server-side rendering; until then, fall back to the lowercased device language from the payload context
621+
const locale_id = payload.context?.device?.language?.toLowerCase() ?? '';
622+
620623
const pageData: DotCMSEventPageData = {
621624
url: location.href,
622625
doc_encoding: staticData.doc_encoding,
@@ -625,7 +628,8 @@ export const enrichPagePayloadOptimized = (
625628
doc_search: location.search,
626629
doc_host: location.hostname,
627630
doc_path: location.pathname,
628-
title: (properties.title as string) ?? document?.title
631+
title: (properties.title as string) ?? document?.title,
632+
locale_id
629633
};
630634

631635
// Extract UTM parameters from the current URL (already transformed to DotCMS format)

docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/Request.js

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
cube('request', {
1414
sql: `SELECT
1515
event_type, user_agent, referer, url, doc_encoding, page_title,
16-
userlanguage, persona, doc_path, doc_host, doc_protocol, doc_hash,
16+
locale_id, persona, doc_path, doc_host, doc_protocol, doc_hash,
1717
doc_search, screen_resolution, user_language, viewport_height, viewport_width,
1818
utm_campaign, utm_medium, utm_source, utm_term, utm_content,
1919
context_site_auth, context_site_id, sessionid, context_user_id, request_id,
@@ -239,11 +239,11 @@ cube('request', {
239239
title: 'Site Auth',
240240
description: 'Authentication key generated for every Site in the Content Analytics App'
241241
},
242-
languageId: {
243-
sql: `userlanguage`,
242+
localeId: {
243+
sql: `locale_id`,
244244
type: `string`,
245-
title: 'Language ID',
246-
description: 'Content language identifier'
245+
title: 'Language ISO Code',
246+
description: 'The dotCMS Locale ID for the event.'
247247
},
248248
persona: {
249249
sql: `persona`,

docker/docker-compose-examples/analytics/setup/config/dev/cube/schema/SessionsByLanguageDaily.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77
* clickhouse_test_db.sessions_by_language_daily
88
*
99
* Grain:
10-
* One row per (customer_id, cluster_id, context_site_id, day, language_id).
10+
* One row per (customer_id, cluster_id, context_site_id, day, locale_id).
1111
*
1212
* Widget semantics:
1313
* - Engaged sessions count per language
1414
* - Engaged% within language = engaged_sessions / total_sessions
1515
* - Avg engaged time per language = total_duration_engaged_seconds / engaged_sessions
1616
*
1717
* Note:
18-
* - language_id is stored as String.
18+
* - locale_id is the dotCMS Locale ID.
1919
* - UI resolves the language name via REST Endpoint/API
2020
* */
2121
cube(`SessionsByLanguageDaily`, {
@@ -25,7 +25,7 @@
2525
customer_id,
2626
context_site_id,
2727
day,
28-
language_id,
28+
locale_id,
2929
total_sessions,
3030
engaged_sessions,
3131
total_duration_engaged_seconds,
@@ -99,11 +99,11 @@
9999
description: `Day grain used for filtering and trends. Use granularity: day.`
100100
},
101101

102-
languageId: {
103-
sql: `language_id`,
102+
localeId: {
103+
sql: `locale_id`,
104104
type: `string`,
105-
title: `Language Id`,
106-
description: `dotCMS language id (String). Display name resolved externally.`,
105+
title: `Language ISO Code`,
106+
description: `The dotCMS Locale ID for the event.`,
107107
},
108108

109109
updatedAt: {

docker/docker-compose-examples/analytics/setup/db/clickhouse/init-scripts/init.sql

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ CREATE TABLE IF NOT EXISTS clickhouse_test_db.events
132132
custom_48 String,
133133
custom_49 String,
134134
custom_50 String,
135+
locale_id LowCardinality(String) DEFAULT '',
135136

136137

137138
-- ######################################################
@@ -518,7 +519,7 @@ GROUP BY customer_id, cluster_id, context_user_id, context_site_id;
518519
│ - engaged (true/false) │
519520
│ - device_category │
520521
│ - browser_family │
521-
│ - language_id
522+
│ - locale_id
522523
│ │
523524
└────────────┬────────────────────────┬─────────┘
524525
│ │
@@ -656,8 +657,8 @@ CREATE TABLE clickhouse_test_db.session_states
656657

657658
/* Event counters (mergeable) */
658659
total_events_state AggregateFunction(count), -- total events in session
659-
pageviews_state AggregateFunction(countIf, UInt8), -- number of pageview events in the session
660-
conversions_state AggregateFunction(countIf, UInt8), -- number of conversion events in the session
660+
pageviews_state AggregateFunction(countIf, UInt8), -- total number of pageview events in the session
661+
conversions_state AggregateFunction(countIf, UInt8), -- total number of conversion events in the session
661662

662663
user_agent_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
663664

@@ -666,11 +667,11 @@ CREATE TABLE clickhouse_test_db.session_states
666667
-- last-seen device category label for the session (Desktop/Mobile/Tablet/Other)
667668
-- derived from UA fields; stored as state so that late events can update the final value deterministically.
668669

669-
browser_family_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
670670
-- last-seen browser family bucket (Chrome/Safari/Firefox/Edge/Other)
671+
browser_family_state AggregateFunction(argMax, String, DateTime64(3, 'UTC')),
671672

672-
language_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC'))
673-
-- last-seen dotCMS language id (as String), defaulting to '0' if unknown
673+
-- last-seen dotCMS Locale ID (as String), defaulting to '' (undefined) if unknown
674+
locale_id_state AggregateFunction(argMax, String, DateTime64(3, 'UTC'))
674675
)
675676
/* Why this engine is mandatory:
676677
-> You are storing aggregate states
@@ -925,7 +926,7 @@ WITH
925926
/* Prefer browser mapping, else Other */
926927
coalesce(nullIf(b_map.browser_family, ''), 'Other') AS browser_family,
927928

928-
nullIf(userlanguage, '') AS language_id
929+
nullIf(locale_id, '') AS locale_id
929930
SELECT
930931
e.customer_id,
931932
e.cluster_id,
@@ -949,7 +950,7 @@ SELECT
949950
/* "last seen" dimension states (tables-only values) */
950951
argMaxState(device_category, e.utc_time) AS device_category_state,
951952
argMaxState(browser_family, e.utc_time) AS browser_family_state,
952-
argMaxState(coalesce(language_id, '0'), e.utc_time) AS language_id_state
953+
argMaxState(coalesce(locale_id, ''), e.utc_time) AS locale_id_state
953954
FROM clickhouse_test_db.events AS e
954955
/* Device mapping via table */
955956
LEFT JOIN clickhouse_test_db.device_category_map AS d_dev
@@ -1015,16 +1016,16 @@ CREATE TABLE clickhouse_test_db.session_facts
10151016

10161017
/* Finalized counters */
10171018
total_events UInt32, -- total events in session
1018-
pageviews UInt32, -- pageview events
1019-
conversions UInt32, -- conversion events
1019+
pageviews UInt32, -- total pageview events
1020+
conversions UInt32, -- total conversion events
10201021

10211022
/* Engagement flag (GA4-style) */
10221023
engaged UInt8, -- 1 if engaged, else 0
10231024

10241025
/* Finalized dimensions */
10251026
device_category String, -- Desktop/Mobile/Tablet/Other
10261027
browser_family String, -- Chrome/Safari/Firefox/Edge/Other
1027-
language_id String, -- dotCMS language id as String ('0' unknown)
1028+
locale_id LowCardinality(String), -- dotCMS language Locale ID ('' means undefined)
10281029

10291030
/* Row version timestamp for ReplacingMergeTree */
10301031
updated_at DateTime('UTC')
@@ -1160,7 +1161,7 @@ SELECT
11601161
browser_family_base
11611162
) AS browser_family,
11621163

1163-
language_id,
1164+
locale_id,
11641165

11651166
now64(3, 'UTC') AS updated_at
11661167
FROM
@@ -1198,7 +1199,7 @@ FROM
11981199
/* UA for fallback matching */
11991200
lowerUTF8(argMaxMerge(user_agent_state)) AS ua_l,
12001201

1201-
argMaxMerge(language_id_state) AS language_id
1202+
coalesce(nullIf(argMaxMerge(locale_id_state), ''), '') AS locale_id
12021203
FROM clickhouse_test_db.session_states
12031204
GROUP BY (
12041205
customer_id,
@@ -1542,7 +1543,7 @@ CREATE TABLE clickhouse_test_db.sessions_by_language_daily
15421543
context_site_id String,
15431544
day Date,
15441545

1545-
language_id String, -- dotCMS language id as String ('0' unknown)
1546+
locale_id LowCardinality(String), -- dotCMS language Locale ID ('' means undefined)
15461547

15471548
total_sessions UInt64,
15481549
engaged_sessions UInt64,
@@ -1557,7 +1558,7 @@ CREATE TABLE clickhouse_test_db.sessions_by_language_daily
15571558
)*/
15581559
ENGINE = ReplacingMergeTree
15591560
PARTITION BY toYYYYMM(day)
1560-
ORDER BY (customer_id, cluster_id, context_site_id, day, language_id);
1561+
ORDER BY (customer_id, cluster_id, context_site_id, day, locale_id);
15611562

15621563
/*
15631564
Object: RMV
@@ -1577,7 +1578,7 @@ SELECT
15771578
cluster_id,
15781579
context_site_id,
15791580
toDate(session_start, 'UTC') AS day,
1580-
language_id,
1581+
locale_id,
15811582

15821583
count() AS total_sessions,
15831584
countIf(engaged = 1) AS engaged_sessions,
@@ -1591,4 +1592,4 @@ GROUP BY (
15911592
cluster_id,
15921593
context_site_id,
15931594
day,
1594-
language_id);
1595+
locale_id);

dotCMS/src/main/resources/analytics/validators/pageview.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
"type": "string",
1212
"required": false
1313
},
14-
"language_id": {
15-
"type": "string"
14+
"locale_id": {
15+
"type": "string",
16+
"required": true
1617
},
1718
"persona": {
1819
"type": "string"

dotcms-integration/src/test/java/com/dotcms/jitsu/validators/AnalyticsValidatorUtilTest.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -832,7 +832,7 @@ public void eventTypeIsRequired() {
832832
*/
833833
@Test
834834
public void dataIsRequired() {
835-
final int expectedErrorCount = 4;
835+
final int expectedErrorCount = 5;
836836
final String json =
837837
"{" +
838838
"\"context\": {" +
@@ -869,6 +869,7 @@ public void dataIsRequired() {
869869
expectedErrorCount, errorsField.size());
870870
assertTrue(errorsField.contains("events[0].data.page"));
871871
assertTrue(errorsField.contains("events[0].data.page.url"));
872+
assertTrue(errorsField.contains("events[0].data.page.locale_id"));
872873
assertTrue(errorsField.contains("events[0].data.page.doc_encoding"));
873874
assertTrue(errorsField.contains("events[0].local_time"));
874875

@@ -889,6 +890,7 @@ public void dataIsRequired() {
889890
assertEquals(expectedErrorCount, errorsField.size());
890891
assertTrue(errorsMessages.contains("Required field is missing: data.page"));
891892
assertTrue(errorsMessages.contains("Required field is missing: data.page.url"));
893+
assertTrue(errorsMessages.contains("Required field is missing: data.page.locale_id"));
892894
assertTrue(errorsMessages.contains("Required field is missing: local_time"));
893895
}
894896

@@ -920,6 +922,7 @@ public void rightPageView() {
920922
"\"page\": {" +
921923
"\"url\": \"http://www.google.com\"," +
922924
"\"title\": \"Google\"," +
925+
"\"locale_id\": \"en\"," +
923926
"\"doc_encoding\": \"UTF8\"" +
924927
"}" +
925928
"}" +
@@ -961,6 +964,7 @@ public void wrongLocalTimeDateFormat() {
961964
"\"page\": {" +
962965
"\"url\": \"http://www.google.com\"," +
963966
"\"title\": \"Google\"," +
967+
"\"locale_id\": \"en\"," +
964968
"\"doc_encoding\": \"UTF8\"" +
965969
"}" +
966970
"}" +
@@ -1006,6 +1010,7 @@ public void extraAttributesPageView() {
10061010
"\"page\": {" +
10071011
"\"url\": \"http://www.google.com\"," +
10081012
"\"title\": \"Google\"," +
1013+
"\"locale_id\": \"en\"," +
10091014
"\"doc_encoding\": \"UTF8\"," +
10101015
"\"extra_field\": \"extra\"" +
10111016
"}" +

0 commit comments

Comments
 (0)