-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathbwt.h
113 lines (85 loc) · 4.4 KB
/
bwt.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#ifndef BWA_BWT_H
#define BWA_BWT_H
#include <stdint.h>
#include <stddef.h>
// Number of unique values in input domain (cardinality of set) - Nucleotides (4) or Amino Acids (20) plus wildcards
#define VALUE_DOMAIN 32
#define VALUE_DEFINED 22
#define VALUE_SCORING 484
// Requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80
#define OCC_INTV_SHIFT 7
#define OCC_INTERVAL (1LL<<OCC_INTV_SHIFT)
#define OCC_INTV_MASK (OCC_INTERVAL - 1)
#ifndef BWA_UBYTE
#define BWA_UBYTE
typedef unsigned char ubyte_t;
#endif
typedef uint64_t bwtint_t;
typedef struct {
bwtint_t primary; // S^{-1}(0), or the primary index of BWT
bwtint_t L2[VALUE_DOMAIN + 1]; // C(), cumulative count
bwtint_t seq_len; // sequence length
bwtint_t bwt_size; // size of BWT (in 32-bit integers)
uint32_t *bwt; // BWT
//uint32_t cnt_table[256];
// Suffix-Array related
int sa_intv;
bwtint_t n_sa;
bwtint_t *sa;
} bwt_t;
typedef struct {
bwtint_t x[3], info;
} bwtintv_t;
typedef struct { size_t n, m; bwtintv_t *a; } bwtintv_v;
/* For general OCC_INTERVAL, the following is correct:
#define bwt_bwt(b, k) ((b)->bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16])
#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4)
*/
// The following two lines are ONLY correct when OCC_INTERVAL==0x80
//#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)]) // Part of bwt_B0
//#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4)) // Now called getOccInterval
/* retrieve a character from the $-removed BWT string. Note that
* bwt_t::bwt is not exactly the BWT string and therefore this macro is
* called bwt_B0 instead of bwt_B */
//#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) // Now caled unpackBWTValue
//#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0)
#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[VALUE_DEFINED - 1 -(c)]+1, (ik).info = 0)
#ifdef __cplusplus
extern "C" {
#endif
uint32_t * getOccInterval(const bwt_t * passBWT, int64_t passSeqIdx);
ubyte_t unpackBWTValue(const bwt_t * passBWT, int64_t passSeqIdx);
void getOccPerWord(uint32_t passWord, bwtint_t retOcc[VALUE_DOMAIN], int64_t passCount);
void bwt_dump_bwt(const char *fn, const bwt_t *bwt);
void bwt_dump_sa(const char *fn, const bwt_t *bwt);
bwt_t *bwt_restore_bwt(const char *fn);
void bwt_restore_sa(const char *fn, bwt_t *bwt);
void bwt_destroy(bwt_t *bwt);
void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW
void bwt_bwtgen2(const char *fn_pac, const char *fn_bwt, int block_size); // from BWT-SW
void bwt_cal_sa(bwt_t *bwt, int intv);
void bwt_bwtupdate_core(bwt_t *bwt);
bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c);
void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[VALUE_DOMAIN]);
bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k);
// more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values
//void bwt_gen_cnt_table(bwt_t *bwt);
void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol);
void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[VALUE_DOMAIN], bwtint_t cntl[VALUE_DOMAIN]);
int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end);
int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0);
/**
* Extend bi-SA-interval _ik_
*/
void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[VALUE_DOMAIN], int is_back);
/**
* Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_.
* Return the end of the longest exact match starting from _x_.
*/
int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_smem1a(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, uint64_t max_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]);
int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_len, int max_intv, bwtintv_t *mem);
#ifdef __cplusplus
}
#endif
#endif