Skip to content

Commit 9c69b36

Browse files
committed
Added a ksw_global_end function.
This is as per ksw_global, but permits control over whether gaps at the start and/or end of query and target sequences are penalised. For example: GTTTTTTAC vs GTTTTTTAC || | || GT------C GTC------
1 parent f1beb2c commit 9c69b36

2 files changed

Lines changed: 229 additions & 0 deletions

File tree

ksw.c

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,117 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
530530
return score;
531531
}
532532

533+
534+
/**
535+
* Global alignment with the ability to control which start/end gaps
536+
* are penalised for. Otherwise identical to ksw_global.
537+
*
538+
* @param s_q Boolean: whether to count gaps at start of query
539+
* @param e_q Boolean: whether to count gaps at end of query
540+
* @param s_t Boolean: whether to count gaps at start of target
541+
* @param e_t Boolean: whether to count gaps at end of target
542+
*/
543+
int ksw_global_end(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat,
544+
int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_, int s_q, int e_q, int s_t, int e_t)
545+
{
546+
eh_t *eh;
547+
int8_t *qp; // query profile
548+
int i, j, k, gapoe = gapo + gape, score, n_col;
549+
uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
550+
if (n_cigar_) *n_cigar_ = 0;
551+
// allocate memory
552+
if (w == 0) w = qlen > tlen ? qlen : tlen;
553+
n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix
554+
z = malloc(n_col * tlen);
555+
qp = malloc(qlen * m);
556+
eh = calloc(qlen + 1, 8);
557+
// generate the query profile
558+
for (k = i = 0; k < m; ++k) {
559+
const int8_t *p = &mat[k * m];
560+
for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]];
561+
}
562+
// fill the first row
563+
eh[0].h = 0; eh[0].e = MINUS_INF;
564+
for (j = 1; j <= qlen && j <= w; ++j)
565+
eh[j].h = s_t ? -(gapo + gape * j) : 0, eh[j].e = MINUS_INF;
566+
for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band
567+
// DP loop
568+
int32_t beg, best_score = MINUS_INF, best_i = 0;
569+
for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop
570+
int32_t f = MINUS_INF, h1, end;
571+
int8_t *q = &qp[target[i] * qlen];
572+
uint8_t *zi = &z[i * n_col];
573+
beg = i > w? i - w : 0;
574+
end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence
575+
h1 = beg == 0? (s_q ? -(gapo + gape * (i + 1)) : 0) : MINUS_INF;
576+
for (j = beg; LIKELY(j < end); ++j) {
577+
// This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
578+
// 1) not checking h>0; 2) recording direction for backtracking
579+
eh_t *p = &eh[j];
580+
int32_t h = p->h, e = p->e;
581+
uint8_t d; // direction
582+
p->h = h1;
583+
h += q[j];
584+
d = h > e? 0 : 1;
585+
h = h > e? h : e;
586+
d = h > f? d : 2;
587+
h = h > f? h : f;
588+
h1 = h;
589+
h -= gapoe;
590+
e -= gape;
591+
d |= e > h? 1<<2 : 0;
592+
e = e > h? e : h;
593+
p->e = e;
594+
f -= gape;
595+
d |= f > h? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two
596+
f = f > h? f : h;
597+
zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell
598+
}
599+
eh[end].h = h1; eh[end].e = MINUS_INF;
600+
if (best_score < eh[end].h)
601+
best_score = eh[end].h, best_i = i;
602+
}
603+
score = eh[qlen].h;
604+
i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell
605+
int n_cigar = 0, m_cigar = 0;
606+
uint32_t *cigar = NULL;
607+
if (!e_t) {
608+
if (score < best_score) {
609+
score = best_score;
610+
if (best_i < i && n_cigar_ && cigar_)
611+
cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i - best_i);
612+
i = best_i;
613+
}
614+
}
615+
if (!e_q) {
616+
for (j = beg; j <= qlen; j++) {
617+
if (score < eh[j].h) {
618+
score = eh[j].h;
619+
if (j < k && n_cigar_ && cigar_)
620+
cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k-j);
621+
k = j;
622+
}
623+
}
624+
}
625+
if (n_cigar_ && cigar_) { // backtrack
626+
int which = 0;
627+
uint32_t tmp;
628+
while (i >= 0 && k >= 0) {
629+
which = z[i * n_col + (k - (i > w? i - w : 0))] >> (which<<1) & 3;
630+
if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k;
631+
else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i;
632+
else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k;
633+
}
634+
if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1);
635+
if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1);
636+
for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR
637+
tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp;
638+
*n_cigar_ = n_cigar, *cigar_ = cigar;
639+
}
640+
free(eh); free(qp); free(z);
641+
return score;
642+
}
643+
533644
/*******************************************
534645
* Main function (not compiled by default) *
535646
*******************************************/
@@ -631,3 +742,119 @@ int main(int argc, char *argv[])
631742
return 0;
632743
}
633744
#endif
745+
746+
//-----------------------------------------------------------------------------
747+
// JKB command line test harness
748+
#ifdef TEST_MAIN
749+
#include <string.h>
750+
#include <stdio.h>
751+
#include <assert.h>
752+
753+
uint8_t W128[128][128];
754+
755+
void init_W128(void) {
756+
int i;
757+
758+
memset(&W128[0][0], -1, 128*128*sizeof(W128[0][0]));
759+
for (i = 0; i < 8; i++)
760+
W128["ACGTacgt"[i]]["ACGTacgt"[i]] = 2;
761+
}
762+
763+
void init_W128_score(int mis, int mat) {
764+
int i, j;
765+
766+
for (i = 0; i < 128; i++)
767+
for (j = 0; j < 128; j++)
768+
W128[i][j] = mis;
769+
for (i = 0; i < 16; i++)
770+
W128["ACGTacgtacgtACGT"[i]]["ACGTacgtACGTacgt"[i]] = mat;
771+
772+
for (i = 0; i < 128; i++)
773+
W128['n'][i] = W128['n'][i] = W128[i]['N'] = W128[i]['n'] = 0;
774+
}
775+
776+
// From htslib/hts.h
777+
#define BAM_CMATCH 0
778+
#define BAM_CINS 1
779+
#define BAM_CDEL 2
780+
#define BAM_CREF_SKIP 3
781+
#define BAM_CSOFT_CLIP 4
782+
#define BAM_CHARD_CLIP 5
783+
#define BAM_CPAD 6
784+
#define BAM_CEQUAL 7
785+
#define BAM_CDIFF 8
786+
#define BAM_CBACK 9
787+
788+
#define BAM_CIGAR_STR "MIDNSHP=XB"
789+
#define BAM_CIGAR_SHIFT 4
790+
#define BAM_CIGAR_MASK 0xf
791+
792+
int main(int argc, char **argv) {
793+
char *A = argv[1];
794+
char *B = argv[2];
795+
int s1 = atoi(argv[3]);
796+
int s2 = atoi(argv[4]);
797+
int e1 = atoi(argv[5]);
798+
int e2 = atoi(argv[6]);
799+
int score;
800+
int G = atoi(argv[7]); // open
801+
int H = atoi(argv[8]); // extend
802+
int mis = atoi(argv[9]);
803+
int mat = atoi(argv[10]);
804+
int band = atoi(argv[11]);
805+
806+
init_W128_score(mis,mat);
807+
808+
uint32_t *cigar;
809+
int ncigar = 0;
810+
811+
int len_A = strlen(A);
812+
int len_B = strlen(B);
813+
//score = ksw_global(len_A, A, len_B, B, 128, (uint8_t *)W128, G, H, band, &ncigar, &cigar);
814+
score = ksw_global_end(len_A, A, len_B, B, 128, (uint8_t *)W128, G, H, band, &ncigar, &cigar, s1, e1, s2, e2);
815+
816+
printf("Score=%d: ", score);
817+
int i, j, k;
818+
for (i = 0; i < ncigar; i++) {
819+
printf("%d%c ", cigar[i]>>4, "MIDNSHP=XB"[cigar[i]&0xf]);
820+
}
821+
printf("\n");
822+
823+
i = j = k = 0;
824+
while (i < len_A || j < len_B) {
825+
int op, oplen = 0;
826+
if (!oplen) {
827+
if (k >= ncigar)
828+
break; // truncated through band?
829+
op = cigar[k] & BAM_CIGAR_MASK;
830+
oplen = cigar[k++] >> BAM_CIGAR_SHIFT;
831+
}
832+
833+
while (oplen--) {
834+
switch (op) {
835+
case BAM_CMATCH:
836+
printf("%c %c\n", A[i], B[j]);
837+
i++, j++;
838+
break;
839+
840+
case BAM_CDEL: // in B only
841+
printf("- %c\n", B[j]);
842+
j++;
843+
break;
844+
845+
case BAM_CINS: // in A only
846+
printf("%c -\n", A[i]);
847+
i++;
848+
break;
849+
850+
default:
851+
abort();
852+
}
853+
}
854+
}
855+
856+
free(cigar);
857+
858+
return 0;
859+
}
860+
#endif

ksw.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ extern "C" {
6464

6565
int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int h0, int *_qle, int *_tle);
6666
int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *_n_cigar, uint32_t **_cigar);
67+
int ksw_global_end(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat,
68+
int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_, int s_q, int e_q, int s_t, int e_t);
6769

6870
#ifdef __cplusplus
6971
}

0 commit comments

Comments
 (0)