stats: add an extra column 'sum_n' to count the number of ambiguous characters. #490

shenwei356 · shenwei356 · commit 1c96e2f71e34 · 2024-10-30T19:19:12.000+08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
         - Fix sequence ID parsing with the default regular expression (in this case, we actually use bytes.Index instead) for a rare case: "xxx\tyyy zzz" was wrongly parsed as "xxx\tyyy". [#486](https://github.com/shenwei356/seqkit/issues/486)
     - `seqkit grep/subseq`:
         - Fix negative regions longer than sequence length. [#479](https://github.com/shenwei356/seqkit/issues/479).
+    - `seqkit stats`:
+        - Add an extra column `sum_n` to count the number of ambiguous characters. [#490](https://github.com/shenwei356/seqkit/issues/490)
 - [SeqKit v2.8.2](https://github.com/shenwei356/seqkit/releases/tag/v2.8.2) - 2024-05-17
 [![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.8.2/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.8.2)
     - `seqkit amplicon`:
diff --git a/doc/docs/usage.md b/doc/docs/usage.md
@@ -698,6 +698,7 @@ Columns:
   16. Q30(%)    percentage of bases with the quality score greater than 30
   17. AvgQual   average quality
   18. GC(%)     percentage of GC content
+  19. sum_n     number of ambiguous letters (N, n, X, x)
   
 Attention:
   1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -788,13 +789,13 @@ Eexamples
 1. Extra information
 
         $ seqkit stats *.f{a,q}.gz -a
-        file               format  type  num_seqs    sum_len  min_len  avg_len  max_len   Q1   Q2   Q3  sum_gap  N50  N50_num  Q20(%)  Q30(%)  AvgQual  GC(%)
-        hairpin.fa.gz      FASTA   RNA     28,645  2,949,871       39      103    2,354   76   91  111        0  101      380       0       0        0  45.77
-        mature.fa.gz       FASTA   RNA     35,828    781,222       15     21.8       34   21   22   22        0   22       12       0       0        0   47.6
-        Illimina1.8.fq.gz  FASTQ   DNA     10,000  1,500,000      150      150      150  150  150  150        0  150        1   96.16   89.71    24.82  49.91
-        nanopore.fq.gz     FASTQ   DNA      4,000  1,798,723      153    449.7    6,006  271  318  391        0  395      585   40.79   12.63     9.48  46.66
-        reads_1.fq.gz      FASTQ   DNA      2,500    567,516      226      227      229  227  227  227        0  227        3   91.24   86.62    15.45  53.63
-        reads_2.fq.gz      FASTQ   DNA      2,500    560,002      223      224      225  224  224  224        0  224        2   91.06   87.66    14.62  54.77
+        file               format  type  num_seqs    sum_len  min_len  avg_len  max_len   Q1   Q2   Q3  sum_gap  N50  N50_num  Q20(%)  Q30(%)  AvgQual  GC(%)  sum_n
+        hairpin.fa.gz      FASTA   RNA     28,645  2,949,871       39      103    2,354   76   91  111        0  101      380       0       0        0  45.77    255
+        mature.fa.gz       FASTA   RNA     35,828    781,222       15     21.8       34   21   22   22        0   22       12       0       0        0   47.6      0
+        Illimina1.8.fq.gz  FASTQ   DNA     10,000  1,500,000      150      150      150  150  150  150        0  150        1   96.16   89.71    24.82  49.91     38
+        nanopore.fq.gz     FASTQ   DNA      4,000  1,798,723      153    449.7    6,006  271  318  391        0  395      585   40.79   12.63     9.48  46.66      0
+        reads_1.fq.gz      FASTQ   DNA      2,500    567,516      226      227      229  227  227  227        0  227        3   91.24   86.62    15.45  53.63     44
+        reads_2.fq.gz      FASTQ   DNA      2,500    560,002      223      224      225  224  224  224        0  224        2   91.06   87.66    14.62  54.77      2
 
 1. **Parallelize counting files, it's much faster for lots of small files, especially for files on SSD**
 
diff --git a/seqkit/cmd/stat.go b/seqkit/cmd/stat.go
@@ -76,6 +76,7 @@ Columns:
   16. Q30(%)    percentage of bases with the quality score greater than 30
   17. AvgQual   average quality
   18. GC(%)     percentage of GC content
+  19. sum_n     number of ambiguous letters (N, n, X, x)
   
 Attention:
   1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -109,6 +110,7 @@ Tips:
 		}
 		gapLettersBytes := []byte(gapLetters)
 		gcLettersBytes := []byte{'g', 'c', 'G', 'C'}
+		nLettersBytes := []byte{'X', 'x', 'N', 'n'}
 
 		skipFileCheck := getFlagBool(cmd, "skip-file-check")
 		all := getFlagBool(cmd, "all")
@@ -194,7 +196,7 @@ Tips:
 				"max_len",
 			}
 			if all {
-				colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "N50_num", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...)
+				colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "N50_num", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)", "sum_n"}...)
 			}
 
 			if hasNX {
@@ -242,7 +244,7 @@ Tips:
 							info.lenAvg,
 							info.lenMax)
 						if all {
-							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
+							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d",
 								info.Q1,
 								info.Q2,
 								info.Q3,
@@ -252,7 +254,9 @@ Tips:
 								info.q20,
 								info.q30,
 								info.avgQual,
-								info.gc)
+								info.gc,
+								info.nSum,
+							)
 						}
 						if hasNX {
 							for _, x = range info.nx {
@@ -283,7 +287,7 @@ Tips:
 							info.lenAvg,
 							info.lenMax)
 						if all {
-							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
+							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d",
 								info.Q1,
 								info.Q2,
 								info.Q3,
@@ -293,7 +297,9 @@ Tips:
 								info.q20,
 								info.q30,
 								info.avgQual,
-								info.gc)
+								info.gc,
+								info.nSum,
+							)
 						}
 						if hasNX {
 							for _, x = range info.nx {
@@ -332,7 +338,7 @@ Tips:
 							info.lenAvg,
 							info.lenMax)
 						if all {
-							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
+							fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d",
 								info.Q1,
 								info.Q2,
 								info.Q3,
@@ -342,7 +348,9 @@ Tips:
 								info.q20,
 								info.q30,
 								info.avgQual,
-								info.gc)
+								info.gc,
+								info.nSum,
+							)
 						}
 						if hasNX {
 							for _, x = range info.nx {
@@ -400,6 +408,7 @@ Tips:
 
 				var gapSum uint64
 				var gcSum uint64
+				var nSum uint64
 
 				lensStats := util.NewLengthStats()
 
@@ -478,6 +487,7 @@ Tips:
 
 						gapSum += uint64(byteutil.CountBytes(record.Seq.Seq, gapLettersBytes))
 						gcSum += uint64(byteutil.CountBytes(record.Seq.Seq, gcLettersBytes))
+						nSum += uint64(byteutil.CountBytes(record.Seq.Seq, nLettersBytes))
 					}
 				}
 
@@ -528,7 +538,7 @@ Tips:
 						file = stdinLabel
 					}
 					ch <- statInfo{file, seqFormat, t,
-						0, 0, 0, 0,
+						0, 0, 0, 0, 0,
 						0, 0, 0, 0,
 						0, 0, 0,
 						0, 0, 0, 0,
@@ -542,7 +552,7 @@ Tips:
 						file = stdinLabel
 					}
 					ch <- statInfo{file, seqFormat, t,
-						lensStats.Count(), lensStats.Sum(), gapSum, lensStats.Min(),
+						lensStats.Count(), lensStats.Sum(), gapSum, lensStats.Min(), nSum,
 						mathutil.Round(lensStats.Mean(), 1), lensStats.Max(), n50, l50,
 						q1, q2, q3,
 						mathutil.Round(float64(q20)/float64(lensStats.Sum())*100, 2),
@@ -601,6 +611,7 @@ Tips:
 				{Header: "Q30(%)", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "AvgQual", Align: stable.AlignRight, HumanizeNumbers: true},
 				{Header: "GC(%)", Align: stable.AlignRight, HumanizeNumbers: true},
+				{Header: "sum_n", Align: stable.AlignRight, HumanizeNumbers: true},
 				// {Header: "L50", AlignRight: true},
 			}...)
 		}
@@ -634,6 +645,7 @@ Tips:
 				row = append(row, info.q30)
 				row = append(row, info.avgQual)
 				row = append(row, info.gc)
+				row = append(row, info.nSum)
 			}
 			if hasNX {
 				for _, x = range info.nx {
@@ -656,6 +668,7 @@ type statInfo struct {
 	lenSum uint64
 	gapSum uint64
 	lenMin uint64
+	nSum   uint64
 
 	lenAvg float64
 	lenMax uint64