Skip to content

Commit 1c96e2f

Browse files
committed
stats: add an extra column 'sum_n' to count the number of ambiguous characters. #490
1 parent 2c28761 commit 1c96e2f

3 files changed

Lines changed: 32 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
- Fix sequence ID parsing with the default regular expression (in this case, we actually use bytes.Index instead) for a rare case: "xxx\tyyy zzz" was wrongly parsed as "xxx\tyyy". [#486](https://github.com/shenwei356/seqkit/issues/486)
55
- `seqkit grep/subseq`:
66
- Fix negative regions longer than sequence length. [#479](https://github.com/shenwei356/seqkit/issues/479).
7+
- `seqkit stats`:
8+
- Add an extra column `sum_n` to count the number of ambiguous characters. [#490](https://github.com/shenwei356/seqkit/issues/490)
79
- [SeqKit v2.8.2](https://github.com/shenwei356/seqkit/releases/tag/v2.8.2) - 2024-05-17
810
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.8.2/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.8.2)
911
- `seqkit amplicon`:

doc/docs/usage.md

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ Columns:
698698
16. Q30(%) percentage of bases with the quality score greater than 30
699699
17. AvgQual average quality
700700
18. GC(%) percentage of GC content
701+
19. sum_n number of ambitious letters (N, n, X, x)
701702
702703
Attention:
703704
1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -788,13 +789,13 @@ Eexamples
788789
1. Extra information
789790

790791
$ seqkit stats *.f{a,q}.gz -a
791-
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 N50_num Q20(%) Q30(%) AvgQual GC(%)
792-
hairpin.fa.gz FASTA RNA 28,645 2,949,871 39 103 2,354 76 91 111 0 101 380 0 0 0 45.77
793-
mature.fa.gz FASTA RNA 35,828 781,222 15 21.8 34 21 22 22 0 22 12 0 0 0 47.6
794-
Illimina1.8.fq.gz FASTQ DNA 10,000 1,500,000 150 150 150 150 150 150 0 150 1 96.16 89.71 24.82 49.91
795-
nanopore.fq.gz FASTQ DNA 4,000 1,798,723 153 449.7 6,006 271 318 391 0 395 585 40.79 12.63 9.48 46.66
796-
reads_1.fq.gz FASTQ DNA 2,500 567,516 226 227 229 227 227 227 0 227 3 91.24 86.62 15.45 53.63
797-
reads_2.fq.gz FASTQ DNA 2,500 560,002 223 224 225 224 224 224 0 224 2 91.06 87.66 14.62 54.77
792+
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 N50_num Q20(%) Q30(%) AvgQual GC(%) sum_n
793+
hairpin.fa.gz FASTA RNA 28,645 2,949,871 39 103 2,354 76 91 111 0 101 380 0 0 0 45.77 255
794+
mature.fa.gz FASTA RNA 35,828 781,222 15 21.8 34 21 22 22 0 22 12 0 0 0 47.6 0
795+
Illimina1.8.fq.gz FASTQ DNA 10,000 1,500,000 150 150 150 150 150 150 0 150 1 96.16 89.71 24.82 49.91 38
796+
nanopore.fq.gz FASTQ DNA 4,000 1,798,723 153 449.7 6,006 271 318 391 0 395 585 40.79 12.63 9.48 46.66 0
797+
reads_1.fq.gz FASTQ DNA 2,500 567,516 226 227 229 227 227 227 0 227 3 91.24 86.62 15.45 53.63 44
798+
reads_2.fq.gz FASTQ DNA 2,500 560,002 223 224 225 224 224 224 0 224 2 91.06 87.66 14.62 54.77 2
798799

799800
1. **Parallelize counting files, it's much faster for lots of small files, especially for files on SSD**
800801

seqkit/cmd/stat.go

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Columns:
7676
16. Q30(%) percentage of bases with the quality score greater than 30
7777
17. AvgQual average quality
7878
18. GC(%) percentage of GC content
79+
19. sum_n number of ambitious letters (N, n, X, x)
7980
8081
Attention:
8182
1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -109,6 +110,7 @@ Tips:
109110
}
110111
gapLettersBytes := []byte(gapLetters)
111112
gcLettersBytes := []byte{'g', 'c', 'G', 'C'}
113+
nLettersBytes := []byte{'X', 'x', 'N', 'n'}
112114

113115
skipFileCheck := getFlagBool(cmd, "skip-file-check")
114116
all := getFlagBool(cmd, "all")
@@ -194,7 +196,7 @@ Tips:
194196
"max_len",
195197
}
196198
if all {
197-
colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "N50_num", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...)
199+
colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "N50_num", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)", "sum_n"}...)
198200
}
199201

200202
if hasNX {
@@ -242,7 +244,7 @@ Tips:
242244
info.lenAvg,
243245
info.lenMax)
244246
if all {
245-
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
247+
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d",
246248
info.Q1,
247249
info.Q2,
248250
info.Q3,
@@ -252,7 +254,9 @@ Tips:
252254
info.q20,
253255
info.q30,
254256
info.avgQual,
255-
info.gc)
257+
info.gc,
258+
info.nSum,
259+
)
256260
}
257261
if hasNX {
258262
for _, x = range info.nx {
@@ -283,7 +287,7 @@ Tips:
283287
info.lenAvg,
284288
info.lenMax)
285289
if all {
286-
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
290+
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d",
287291
info.Q1,
288292
info.Q2,
289293
info.Q3,
@@ -293,7 +297,9 @@ Tips:
293297
info.q20,
294298
info.q30,
295299
info.avgQual,
296-
info.gc)
300+
info.gc,
301+
info.nSum,
302+
)
297303
}
298304
if hasNX {
299305
for _, x = range info.nx {
@@ -332,7 +338,7 @@ Tips:
332338
info.lenAvg,
333339
info.lenMax)
334340
if all {
335-
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
341+
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d",
336342
info.Q1,
337343
info.Q2,
338344
info.Q3,
@@ -342,7 +348,9 @@ Tips:
342348
info.q20,
343349
info.q30,
344350
info.avgQual,
345-
info.gc)
351+
info.gc,
352+
info.nSum,
353+
)
346354
}
347355
if hasNX {
348356
for _, x = range info.nx {
@@ -400,6 +408,7 @@ Tips:
400408

401409
var gapSum uint64
402410
var gcSum uint64
411+
var nSum uint64
403412

404413
lensStats := util.NewLengthStats()
405414

@@ -478,6 +487,7 @@ Tips:
478487

479488
gapSum += uint64(byteutil.CountBytes(record.Seq.Seq, gapLettersBytes))
480489
gcSum += uint64(byteutil.CountBytes(record.Seq.Seq, gcLettersBytes))
490+
nSum += uint64(byteutil.CountBytes(record.Seq.Seq, nLettersBytes))
481491
}
482492
}
483493

@@ -528,7 +538,7 @@ Tips:
528538
file = stdinLabel
529539
}
530540
ch <- statInfo{file, seqFormat, t,
531-
0, 0, 0, 0,
541+
0, 0, 0, 0, 0,
532542
0, 0, 0, 0,
533543
0, 0, 0,
534544
0, 0, 0, 0,
@@ -542,7 +552,7 @@ Tips:
542552
file = stdinLabel
543553
}
544554
ch <- statInfo{file, seqFormat, t,
545-
lensStats.Count(), lensStats.Sum(), gapSum, lensStats.Min(),
555+
lensStats.Count(), lensStats.Sum(), gapSum, lensStats.Min(), nSum,
546556
mathutil.Round(lensStats.Mean(), 1), lensStats.Max(), n50, l50,
547557
q1, q2, q3,
548558
mathutil.Round(float64(q20)/float64(lensStats.Sum())*100, 2),
@@ -601,6 +611,7 @@ Tips:
601611
{Header: "Q30(%)", Align: stable.AlignRight, HumanizeNumbers: true},
602612
{Header: "AvgQual", Align: stable.AlignRight, HumanizeNumbers: true},
603613
{Header: "GC(%)", Align: stable.AlignRight, HumanizeNumbers: true},
614+
{Header: "sum_n", Align: stable.AlignRight, HumanizeNumbers: true},
604615
// {Header: "L50", AlignRight: true},
605616
}...)
606617
}
@@ -634,6 +645,7 @@ Tips:
634645
row = append(row, info.q30)
635646
row = append(row, info.avgQual)
636647
row = append(row, info.gc)
648+
row = append(row, info.nSum)
637649
}
638650
if hasNX {
639651
for _, x = range info.nx {
@@ -656,6 +668,7 @@ type statInfo struct {
656668
lenSum uint64
657669
gapSum uint64
658670
lenMin uint64
671+
nSum uint64
659672

660673
lenAvg float64
661674
lenMax uint64

0 commit comments

Comments
 (0)