diff --git a/charts/cluster/docs/Getting Started.md b/charts/cluster/docs/Getting Started.md index 73170e5cde..86187ee59c 100644 --- a/charts/cluster/docs/Getting Started.md +++ b/charts/cluster/docs/Getting Started.md @@ -24,7 +24,7 @@ helm upgrade --install cnpg \ ## Creating a cluster configuration Once you have the operator installed, the next step is to prepare the cluster configuration. Whether this will be managed -via a GitOps solution or directly via Helm is up to you. The following sections outlines the important steps in both cases. +via a GitOps solution or directly via Helm is up to you. The following sections outline the important steps in both cases. ### Choosing the database type @@ -88,15 +88,17 @@ There are several important cluster options. Here are the most important ones: `cluster.affinity.topologyKey` - The chart sets it to `topology.kubernetes.io/zone` by default which is useful if you are running a production cluster in a multi AZ cluster (highly recommended). If you are running a single AZ cluster, you may want to change that to `kubernetes.io/hostname` to ensure that cluster instances are not provisioned on the same node. -`cluster.postgresql` - Allows you to override PostgreSQL configuration parameters example: +`cluster.postgresql.parameters` - Allows you to override PostgreSQL configuration parameters, for example: ```yaml cluster: postgresql: - max_connections: "200" - shared_buffers: "2GB" + parameters: + max_connections: "200" + shared_buffers: "2GB" ``` -`cluster.initSQL` - Allows you to run custom SQL queries during the cluster initialization. This is useful for creating -extensions, schemas and databases. Note that these are as a superuser. +`cluster.initdb.postInitSQL` - Allows you to run custom SQL queries during cluster initialization. This is useful for creating +extensions, schemas, and databases. 
Use `cluster.initdb.postInitApplicationSQL` and `cluster.initdb.postInitTemplateSQL` when +you need application-database or template-database specific initialization. For a full list - refer to the Helm chart [configuration options](../README.md#Configuration-options). diff --git a/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationErrors.md b/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationErrors.md index 13f7eb9dd1..8cb8a29972 100644 --- a/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationErrors.md +++ b/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationErrors.md @@ -25,26 +25,30 @@ The `CNPGClusterLogicalReplicationErrors` alert indicates that a logical replica # Connect to the subscriber and check subscription status kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT - subname, - subenabled, - apply_error_count, - sync_error_count, - stats_reset -FROM pg_stat_subscription -WHERE apply_error_count > 0 OR sync_error_count > 0; + s.subname, + s.subenabled, + COALESCE(sss.apply_error_count, 0) AS apply_error_count, + COALESCE(sss.sync_error_count, 0) AS sync_error_count, + sss.stats_reset +FROM pg_subscription s +LEFT JOIN pg_stat_subscription_stats sss ON s.oid = sss.subid +WHERE COALESCE(sss.apply_error_count, 0) > 0 OR COALESCE(sss.sync_error_count, 0) > 0; " # Check the last error message kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT - subname, - last_msg_receipt_time, - latest_end_time, + s.subname, + ss.last_msg_receipt_time, + ss.latest_end_time, CASE - WHEN apply_error_count > 0 THEN 'Apply errors detected' - WHEN sync_error_count > 0 THEN 'Sync errors detected' + WHEN COALESCE(sss.apply_error_count, 0) > 0 THEN 'Apply errors detected' + WHEN COALESCE(sss.sync_error_count, 0) > 0 THEN 'Sync errors detected' + ELSE 'No errors detected' END as error_type -FROM pg_stat_subscription; +FROM pg_subscription s +LEFT JOIN pg_stat_subscription ss ON s.oid = ss.subid +LEFT JOIN 
pg_stat_subscription_stats sss ON s.oid = sss.subid; " ``` @@ -96,21 +100,21 @@ FROM pg_publication; kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT subname, - srconninfo, - srschema, - srslotname, - srsynccommit + subconninfo, + subslotname, + subsynccommit, + subpublications FROM pg_subscription; " # Check which tables are being replicated kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT - relid::regclass as table_name, - srsubstate as state -FROM pg_subscription_rel -JOIN pg_class ON relid = oid -WHERE srsubstate NOT IN ('r', 's'); -- Not ready or synchronizing + sr.srrelid::regclass as table_name, + sr.srsubstate as state +FROM pg_subscription_rel sr +JOIN pg_class c ON sr.srrelid = c.oid +WHERE sr.srsubstate NOT IN ('r', 's'); -- Not ready or synchronizing " ``` @@ -378,4 +382,4 @@ ALTER TABLE table_name ENABLE TRIGGER trigger_name; - You encounter frequent constraint violations - The schema cannot be synchronized - You need to skip transactions repeatedly - - Error rate is increasing despite fixes \ No newline at end of file + - Error rate is increasing despite fixes diff --git a/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationLagging.md b/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationLagging.md index 66356cdcb4..acf912e769 100644 --- a/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationLagging.md +++ b/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationLagging.md @@ -28,17 +28,19 @@ Connect to the subscriber and check the current state: ```bash kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT - subname, - enabled, - EXTRACT(EPOCH FROM (NOW() - last_msg_receipt_time)) as receipt_lag_seconds, - EXTRACT(EPOCH FROM (NOW() - latest_end_time)) as apply_lag_seconds, - pg_wal_lsn_diff(received_lsn, latest_end_lsn) as pending_bytes, + s.subname, + s.subenabled AS enabled, + EXTRACT(EPOCH FROM (NOW() - ss.last_msg_receipt_time)) AS receipt_lag_seconds, 
+ EXTRACT(EPOCH FROM (NOW() - ss.latest_end_time)) AS apply_lag_seconds, + COALESCE(pg_wal_lsn_diff(ss.received_lsn, ss.latest_end_lsn), 0) AS pending_bytes, CASE - WHEN EXTRACT(EPOCH FROM (NOW() - last_msg_receipt_time)) > 60 THEN 'High receipt lag' - WHEN EXTRACT(EPOCH FROM (NOW() - latest_end_time)) > 60 THEN 'High apply lag' - WHEN pg_wal_lsn_diff(received_lsn, latest_end_lsn) > 1024^3 THEN 'High LSN distance' + WHEN EXTRACT(EPOCH FROM (NOW() - ss.last_msg_receipt_time)) > 60 THEN 'High receipt lag' + WHEN EXTRACT(EPOCH FROM (NOW() - ss.latest_end_time)) > 60 THEN 'High apply lag' + WHEN COALESCE(pg_wal_lsn_diff(ss.received_lsn, ss.latest_end_lsn), 0) > 1024^3 THEN 'High LSN distance' + ELSE 'Healthy' END as primary_issue -FROM pg_stat_subscription; +FROM pg_subscription s +LEFT JOIN pg_stat_subscription ss ON s.oid = ss.subid; " ``` @@ -230,4 +232,4 @@ kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c "\dRs+" - Lag continues to increase despite optimization - Network issues persist between clusters - Resource utilization is at maximum but lag continues - - You experience frequent replication failures \ No newline at end of file + - You experience frequent replication failures diff --git a/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationStopped.md b/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationStopped.md index de2190e22b..3b30aca7d0 100644 --- a/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationStopped.md +++ b/charts/cluster/docs/runbooks/CNPGClusterLogicalReplicationStopped.md @@ -25,18 +25,18 @@ The `CNPGClusterLogicalReplicationStopped` alert indicates that a logical replic # Check all subscriptions and their status kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT - pg_subscription.subname, - pg_subscription.enabled, + s.subname, + s.subenabled AS enabled, CASE - WHEN pg_subscription.enabled = false THEN 'Explicitly disabled' - WHEN pid IS NULL AND buffered_lag_bytes > 0 THEN 'Stuck 
(no worker)' - WHEN pid IS NOT NULL THEN 'Active' + WHEN NOT s.subenabled THEN 'Explicitly disabled' + WHEN ss.pid IS NULL THEN 'Stuck (no worker)' + WHEN ss.pid IS NOT NULL THEN 'Active' ELSE 'Unknown' END as status, - pg_wal_lsn_diff(received_lsn, latest_end_lsn) as pending_bytes, - pid IS NOT NULL as has_worker -FROM pg_subscription -LEFT JOIN pg_stat_subscription ON pg_subscription.oid = pg_stat_subscription.subid; + COALESCE(pg_wal_lsn_diff(ss.received_lsn, ss.latest_end_lsn), 0) AS pending_bytes, + ss.pid IS NOT NULL AS has_worker +FROM pg_subscription s +LEFT JOIN pg_stat_subscription ss ON s.oid = ss.subid; " ``` @@ -63,10 +63,10 @@ WHERE application_name LIKE '%subscription%' OR backend_type = 'logical replicat kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " SELECT subname, - srconninfo, - srsynccommit, - srslotname, - srsyncstate as sync_state + subconninfo, + subsynccommit, + subslotname, + subpublications FROM pg_subscription; " ``` @@ -86,7 +86,7 @@ kubectl logs -n NAMESPACE $POD --tail=200 | grep -i "subscription\|replication\| ```bash # Extract connection info from subscription kubectl exec -it svc/SUBSCRIBER-CLUSTER-rw -n NAMESPACE -- psql -c " -SELECT srconninfo FROM pg_subscription WHERE subname = 'your_subscription_name'; +SELECT subconninfo FROM pg_subscription WHERE subname = 'your_subscription_name'; " | grep -o "host=[^ ]*" | cut -d= -f2 # Test connection @@ -333,4 +333,4 @@ kubectl exec -it svc/CLUSTER-rw -n NS -- psql -c "SELECT * FROM pg_stat_activity - Workers fail to start despite adequate resources - WAL retention issues prevent catch-up - Frequent disconnections occur - - Data cannot be resynchronized successfully \ No newline at end of file + - Data cannot be resynchronized successfully diff --git a/charts/cluster/templates/console-statefulset.yaml b/charts/cluster/templates/console-statefulset.yaml index dc9c22815c..07a39c0cee 100644 ---
a/charts/cluster/templates/console-statefulset.yaml +++ b/charts/cluster/templates/console-statefulset.yaml @@ -55,10 +55,10 @@ spec: apt install -y screen curl wget jq unzip gzip nano vim util-linux less htop cat < /root/.bashrc echo -e "\nHere are some examples for connecting and running queries on the cluster:" - echo ' nohup psql \$DB_SUPERUSER_URI"/DB_NAME" -c "SELECT 1;" 2>&1 > command.log &' + echo ' nohup psql "$DB_SUPERUSER_URI/" -c "SELECT 1;" > command.log 2>&1 &' echo -e "\nTo check up on the command, use:" echo " tail -f command.log" - echo -e "\nYou can also use 'screen' for an interactive session. See https://github.com/paradedb/charts/blob/dev/charts/paradedb/docs/long-running-tasks.md for examples." + echo -e "\nYou can also use 'screen' for an interactive session. See https://github.com/cloudnative-pg/charts/blob/main/charts/cluster/docs/Console.md for examples." echo -e "\n" EOF sleep infinity