@@ -530,6 +530,117 @@ int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target,
530530 return score ;
531531}
532532
533+
534+ /**
535+ * Global alignment with the ability to control which start/end gaps
536+ * are penalised for. Otherwise identical to ksw_global.
537+ *
538+ * @param s_q Boolean: whether to count gaps at start of query
539+ * @param e_q Boolean: whether to count gaps at end of query
540+ * @param s_t Boolean: whether to count gaps at start of target
541+ * @param e_t Boolean: whether to count gaps at end of target
542+ */
543+ int ksw_global_end (int qlen , const uint8_t * query , int tlen , const uint8_t * target , int m , const int8_t * mat ,
544+ int gapo , int gape , int w , int * n_cigar_ , uint32_t * * cigar_ , int s_q , int e_q , int s_t , int e_t )
545+ {
546+ eh_t * eh ;
547+ int8_t * qp ; // query profile
548+ int i , j , k , gapoe = gapo + gape , score , n_col ;
549+ uint8_t * z ; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex
550+ if (n_cigar_ ) * n_cigar_ = 0 ;
551+ // allocate memory
552+ if (w == 0 ) w = qlen > tlen ? qlen : tlen ;
553+ n_col = qlen < 2 * w + 1 ? qlen : 2 * w + 1 ; // maximum #columns of the backtrack matrix
554+ z = malloc (n_col * tlen );
555+ qp = malloc (qlen * m );
556+ eh = calloc (qlen + 1 , 8 );
557+ // generate the query profile
558+ for (k = i = 0 ; k < m ; ++ k ) {
559+ const int8_t * p = & mat [k * m ];
560+ for (j = 0 ; j < qlen ; ++ j ) qp [i ++ ] = p [query [j ]];
561+ }
562+ // fill the first row
563+ eh [0 ].h = 0 ; eh [0 ].e = MINUS_INF ;
564+ for (j = 1 ; j <= qlen && j <= w ; ++ j )
565+ eh [j ].h = s_t ? - (gapo + gape * j ) : 0 , eh [j ].e = MINUS_INF ;
566+ for (; j <= qlen ; ++ j ) eh [j ].h = eh [j ].e = MINUS_INF ; // everything is -inf outside the band
567+ // DP loop
568+ int32_t beg , best_score = MINUS_INF , best_i = 0 ;
569+ for (i = 0 ; LIKELY (i < tlen ); ++ i ) { // target sequence is in the outer loop
570+ int32_t f = MINUS_INF , h1 , end ;
571+ int8_t * q = & qp [target [i ] * qlen ];
572+ uint8_t * zi = & z [i * n_col ];
573+ beg = i > w ? i - w : 0 ;
574+ end = i + w + 1 < qlen ? i + w + 1 : qlen ; // only loop through [beg,end) of the query sequence
575+ h1 = beg == 0 ? (s_q ? - (gapo + gape * (i + 1 )) : 0 ) : MINUS_INF ;
576+ for (j = beg ; LIKELY (j < end ); ++ j ) {
577+ // This loop is organized in a similar way to ksw_extend() and ksw_sse2(), except:
578+ // 1) not checking h>0; 2) recording direction for backtracking
579+ eh_t * p = & eh [j ];
580+ int32_t h = p -> h , e = p -> e ;
581+ uint8_t d ; // direction
582+ p -> h = h1 ;
583+ h += q [j ];
584+ d = h > e ? 0 : 1 ;
585+ h = h > e ? h : e ;
586+ d = h > f ? d : 2 ;
587+ h = h > f ? h : f ;
588+ h1 = h ;
589+ h -= gapoe ;
590+ e -= gape ;
591+ d |= e > h ? 1 <<2 : 0 ;
592+ e = e > h ? e : h ;
593+ p -> e = e ;
594+ f -= gape ;
595+ d |= f > h ? 2 <<4 : 0 ; // if we want to halve the memory, use one bit only, instead of two
596+ f = f > h ? f : h ;
597+ zi [j - beg ] = d ; // z[i,j] keeps h for the current cell and e/f for the next cell
598+ }
599+ eh [end ].h = h1 ; eh [end ].e = MINUS_INF ;
600+ if (best_score < eh [end ].h )
601+ best_score = eh [end ].h , best_i = i ;
602+ }
603+ score = eh [qlen ].h ;
604+ i = tlen - 1 ; k = (i + w + 1 < qlen ? i + w + 1 : qlen ) - 1 ; // (i,k) points to the last cell
605+ int n_cigar = 0 , m_cigar = 0 ;
606+ uint32_t * cigar = NULL ;
607+ if (!e_t ) {
608+ if (score < best_score ) {
609+ score = best_score ;
610+ if (best_i < i && n_cigar_ && cigar_ )
611+ cigar = push_cigar (& n_cigar , & m_cigar , cigar , 2 , i - best_i );
612+ i = best_i ;
613+ }
614+ }
615+ if (!e_q ) {
616+ for (j = beg ; j <= qlen ; j ++ ) {
617+ if (score < eh [j ].h ) {
618+ score = eh [j ].h ;
619+ if (j < k && n_cigar_ && cigar_ )
620+ cigar = push_cigar (& n_cigar , & m_cigar , cigar , 1 , k - j );
621+ k = j ;
622+ }
623+ }
624+ }
625+ if (n_cigar_ && cigar_ ) { // backtrack
626+ int which = 0 ;
627+ uint32_t tmp ;
628+ while (i >= 0 && k >= 0 ) {
629+ which = z [i * n_col + (k - (i > w ? i - w : 0 ))] >> (which <<1 ) & 3 ;
630+ if (which == 0 ) cigar = push_cigar (& n_cigar , & m_cigar , cigar , 0 , 1 ), -- i , -- k ;
631+ else if (which == 1 ) cigar = push_cigar (& n_cigar , & m_cigar , cigar , 2 , 1 ), -- i ;
632+ else cigar = push_cigar (& n_cigar , & m_cigar , cigar , 1 , 1 ), -- k ;
633+ }
634+ if (i >= 0 ) cigar = push_cigar (& n_cigar , & m_cigar , cigar , 2 , i + 1 );
635+ if (k >= 0 ) cigar = push_cigar (& n_cigar , & m_cigar , cigar , 1 , k + 1 );
636+ for (i = 0 ; i < n_cigar >>1 ; ++ i ) // reverse CIGAR
637+ tmp = cigar [i ], cigar [i ] = cigar [n_cigar - 1 - i ], cigar [n_cigar - 1 - i ] = tmp ;
638+ * n_cigar_ = n_cigar , * cigar_ = cigar ;
639+ }
640+ free (eh ); free (qp ); free (z );
641+ return score ;
642+ }
643+
533644/*******************************************
534645 * Main function (not compiled by default) *
535646 *******************************************/
@@ -631,3 +742,119 @@ int main(int argc, char *argv[])
631742 return 0 ;
632743}
633744#endif
745+
746+ //-----------------------------------------------------------------------------
747+ // JKB command line test harness
748+ #ifdef TEST_MAIN
749+ #include <string.h>
750+ #include <stdio.h>
751+ #include <assert.h>
752+
753+ uint8_t W128 [128 ][128 ];
754+
755+ void init_W128 (void ) {
756+ int i ;
757+
758+ memset (& W128 [0 ][0 ], -1 , 128 * 128 * sizeof (W128 [0 ][0 ]));
759+ for (i = 0 ; i < 8 ; i ++ )
760+ W128 ["ACGTacgt" [i ]]["ACGTacgt" [i ]] = 2 ;
761+ }
762+
763+ void init_W128_score (int mis , int mat ) {
764+ int i , j ;
765+
766+ for (i = 0 ; i < 128 ; i ++ )
767+ for (j = 0 ; j < 128 ; j ++ )
768+ W128 [i ][j ] = mis ;
769+ for (i = 0 ; i < 16 ; i ++ )
770+ W128 ["ACGTacgtacgtACGT" [i ]]["ACGTacgtACGTacgt" [i ]] = mat ;
771+
772+ for (i = 0 ; i < 128 ; i ++ )
773+ W128 ['n' ][i ] = W128 ['n' ][i ] = W128 [i ]['N' ] = W128 [i ]['n' ] = 0 ;
774+ }
775+
776+ // From htslib/hts.h
777+ #define BAM_CMATCH 0
778+ #define BAM_CINS 1
779+ #define BAM_CDEL 2
780+ #define BAM_CREF_SKIP 3
781+ #define BAM_CSOFT_CLIP 4
782+ #define BAM_CHARD_CLIP 5
783+ #define BAM_CPAD 6
784+ #define BAM_CEQUAL 7
785+ #define BAM_CDIFF 8
786+ #define BAM_CBACK 9
787+
788+ #define BAM_CIGAR_STR "MIDNSHP=XB"
789+ #define BAM_CIGAR_SHIFT 4
790+ #define BAM_CIGAR_MASK 0xf
791+
792+ int main (int argc , char * * argv ) {
793+ char * A = argv [1 ];
794+ char * B = argv [2 ];
795+ int s1 = atoi (argv [3 ]);
796+ int s2 = atoi (argv [4 ]);
797+ int e1 = atoi (argv [5 ]);
798+ int e2 = atoi (argv [6 ]);
799+ int score ;
800+ int G = atoi (argv [7 ]); // open
801+ int H = atoi (argv [8 ]); // extend
802+ int mis = atoi (argv [9 ]);
803+ int mat = atoi (argv [10 ]);
804+ int band = atoi (argv [11 ]);
805+
806+ init_W128_score (mis ,mat );
807+
808+ uint32_t * cigar ;
809+ int ncigar = 0 ;
810+
811+ int len_A = strlen (A );
812+ int len_B = strlen (B );
813+ //score = ksw_global(len_A, A, len_B, B, 128, (uint8_t *)W128, G, H, band, &ncigar, &cigar);
814+ score = ksw_global_end (len_A , A , len_B , B , 128 , (uint8_t * )W128 , G , H , band , & ncigar , & cigar , s1 , e1 , s2 , e2 );
815+
816+ printf ("Score=%d: " , score );
817+ int i , j , k ;
818+ for (i = 0 ; i < ncigar ; i ++ ) {
819+ printf ("%d%c " , cigar [i ]>>4 , "MIDNSHP=XB" [cigar [i ]& 0xf ]);
820+ }
821+ printf ("\n" );
822+
823+ i = j = k = 0 ;
824+ while (i < len_A || j < len_B ) {
825+ int op , oplen = 0 ;
826+ if (!oplen ) {
827+ if (k >= ncigar )
828+ break ; // truncated through band?
829+ op = cigar [k ] & BAM_CIGAR_MASK ;
830+ oplen = cigar [k ++ ] >> BAM_CIGAR_SHIFT ;
831+ }
832+
833+ while (oplen -- ) {
834+ switch (op ) {
835+ case BAM_CMATCH :
836+ printf ("%c %c\n" , A [i ], B [j ]);
837+ i ++ , j ++ ;
838+ break ;
839+
840+ case BAM_CDEL : // in B only
841+ printf ("- %c\n" , B [j ]);
842+ j ++ ;
843+ break ;
844+
845+ case BAM_CINS : // in A only
846+ printf ("%c -\n" , A [i ]);
847+ i ++ ;
848+ break ;
849+
850+ default :
851+ abort ();
852+ }
853+ }
854+ }
855+
856+ free (cigar );
857+
858+ return 0 ;
859+ }
860+ #endif
0 commit comments