diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ab859c..c29f57c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Change Log +## [1.3.2] - 2017-02-07 +### Added +- Alignment command can now directly take a protein multi-FASTA and skip ORF detection (-p option) +- Prepare and alignment commands can now make use of a proxy server (HTTP/SOCKS) for contacting UniProt (-P option) + ## [1.3.1] - 2017-01-01 ### Added - UniProt report now includes max mapping quality for each protein diff --git a/align.c b/align.c index 36fa560..658034b 100644 --- a/align.c +++ b/align.c @@ -107,6 +107,7 @@ int command_align(int argc, char *argv[]) { FILE * reportPriStream = 0, * reportSecStream = 0; char * readsProName = 0, * indexProName = 0, * prefixName = 0; char * samName = 0, * reportPriName = 0, * reportSecName = 0; + const char * proxyAddress; memset(&aux, 0, sizeof(ktp_aux_t)); memset(pes, 0, VALUE_DOMAIN * sizeof(mem_pestat_t)); @@ -114,8 +115,9 @@ int command_align(int argc, char *argv[]) { aux.opt = opt = mem_opt_init(); memset(&opt0, 0, sizeof(mem_opt_t)); + proxyAddress = NULL; - while ((c = getopt(argc, argv, "1epabgnMCSPVYJjf:F:u:k:o:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:")) >= 0) { + while ((c = getopt(argc, argv, "1epabgnMCSVYJjf:F:u:k:o:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:y:K:X:H:P:")) >= 0) { if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; else if (c == 'u') opt->outputType = atoi(optarg); else if (c == 'f') opt->min_orf_len = atoi(optarg); @@ -129,9 +131,9 @@ int command_align(int argc, char *argv[]) { else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; - else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; + //else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; else if (c == 'a') opt->flag |= MEM_F_ALL; - else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE; + //else if (c == 'p') opt->flag |= MEM_F_PE | MEM_F_SMARTPE; else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; @@ -142,6 +144,7 @@ int command_align(int argc, char *argv[]) { else if (c == 'g') opt->proteinFlag |= ALIGN_FLAG_GEN_NT; else if (c == 'n') opt->proteinFlag |= ALIGN_FLAG_KEEP_PRO; else if (c == 'J') opt->proteinFlag &= ~ALIGN_FLAG_ADJUST_ORF; + else if (c == 'p') opt->proteinFlag |= ALIGN_FLAG_MANUAL_PRO; else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; else if (c == 'v') bwa_verbose = atoi(optarg); @@ -157,6 +160,7 @@ int command_align(int argc, char *argv[]) { else if (c == 'C') aux.copy_comment = 1; else if (c == 'K') fixed_chunk_size = atoi(optarg); else if (c == 'X') opt->mask_level = atof(optarg); + else if (c == 'P') proxyAddress = optarg; else if (c == 'h') { opt0.max_XA_hits = opt0.max_XA_hits_alt = 1; opt->max_XA_hits = opt->max_XA_hits_alt = strtol(optarg, &p, 10); @@ -344,8 +348,14 @@ int command_align(int argc, char *argv[]) { } - // Detect ORFs and write protein file - writeReadsProtein(argv[optind + 1], readsProName, opt); + if (opt->proteinFlag & ALIGN_FLAG_MANUAL_PRO) { + // Protein input given, skip ORF detection; copy the path through "%s" so a '%' in the file name is never parsed as a format directive (NOTE(review): confirm readsProName is sized for the full input path) + sprintf(readsProName, "%s", argv[optind + 1]); + } + else { + // Detect ORFs and write protein file + writeReadsProtein(argv[optind + 1], readsProName, opt); + } // Open ORFs sequence ko = kopen(readsProName, &fd); @@ -385,14 +395,14 @@ int command_align(int argc, char *argv[]) { // Generate UniProt report if requested if (prefixName != NULL) { - renderUniprotReport(opt->outputType, 1, reportPriStream); + 
renderUniprotReport(opt->outputType, 1, reportPriStream, proxyAddress); if (opt->flag & MEM_F_ALL) { - renderUniprotReport(opt->outputType, 0, reportSecStream); + renderUniprotReport(opt->outputType, 0, reportSecStream, proxyAddress); } } - // Delete protein file unless requested otherwise - if (!(opt->proteinFlag & ALIGN_FLAG_KEEP_PRO)) { + // Delete generated protein file unless requested otherwise + if (!(opt->proteinFlag & ALIGN_FLAG_KEEP_PRO) && !(opt->proteinFlag & ALIGN_FLAG_MANUAL_PRO)) { remove(readsProName); } @@ -425,6 +435,7 @@ int renderAlignUsage(const mem_opt_t * passOptions) { fprintf(stderr, "Usage: paladin align [options] \n\n"); fprintf(stderr, "Gene detection options:\n\n"); + fprintf(stderr, " -p disable ORF detection and treat input as protein sequence\n"); fprintf(stderr, " -b disable brute force ORF detection\n"); fprintf(stderr, " -J do not adjust minimum ORF length (constant value) for shorter read lengths\n"); fprintf(stderr, " -f INT minimum ORF length accepted (as constant value) [%d]\n", passOptions->min_orf_len); @@ -467,6 +478,7 @@ int renderAlignUsage(const mem_opt_t * passOptions) { fprintf(stderr, " -u INT report type generated when using reporting and a UniProt reference [%d]\n", passOptions->outputType); fprintf(stderr, " 0: Simple ID summary report\n"); fprintf(stderr, " 1: Detailed report (Contacts uniprot.org)\n\n"); + fprintf(stderr, " -P STR HTTP or SOCKS proxy address\n"); fprintf(stderr, " -g generate detected ORF nucleotide sequence FASTA\n"); fprintf(stderr, " -n keep protein sequence after alignment\n"); //fprintf(stderr, " -p smart pairing (ignoring in2.fq)\n"); diff --git a/bntseq.c b/bntseq.c index bf939ca..41d1521 100644 --- a/bntseq.c +++ b/bntseq.c @@ -1,30 +1,3 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). 
- - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -/* Contact: Toni Westbrook */ - #include #include #include diff --git a/bwamem.c b/bwamem.c index 8169190..59dccf0 100644 --- a/bwamem.c +++ b/bwamem.c @@ -90,7 +90,7 @@ mem_opt_t *mem_opt_init() return o; } -void filterCompetingAln(worker_t * passWorker, int passCount) { +void filterCompetingAln(worker_t * passWorker, int passCount, int passDisable) { int seqIdx, alnIdx, bestIdx; int currentSeq, readSeq; int seqTotal, bestTotal; @@ -101,6 +101,12 @@ void filterCompetingAln(worker_t * passWorker, int passCount) { // Iterate through each sequence and alignment for (seqIdx = 0 ; seqIdx < passCount - 1 ; seqIdx++) { + // If filtering disabled, simply mark sequence as best + if (passDisable) { + passWorker->regs[seqIdx].active = 1; + continue; + } + // Check if we're in a new sequence or in an alternate frame sscanf(passWorker->seqs[seqIdx].name, "%d:", &readSeq); if (readSeq != currentSeq) { @@ -113,12 +119,11 @@ void filterCompetingAln(worker_t * passWorker, int passCount) { bestIdx = seqIdx; } - // Aggregate all score totals for this sequence - seqTotal = 0; - for (alnIdx = 0 ; alnIdx < passWorker->regs[seqIdx].n ; alnIdx++) { - seqTotal += passWorker->regs[seqIdx].a[alnIdx].score; - } - + // Aggregate all score totals for this sequence + seqTotal = 0; + for (alnIdx = 0 ; alnIdx < passWorker->regs[seqIdx].n ; alnIdx++) { + seqTotal += passWorker->regs[seqIdx].a[alnIdx].score; + } // Check if current alignment is best so far if (seqTotal > bestTotal) { @@ -128,7 +133,7 @@ void filterCompetingAln(worker_t * passWorker, int passCount) { } // Filter final sequence - passWorker->regs[bestIdx].active = 1; + if (!passDisable) passWorker->regs[bestIdx].active = 1; } int getAlignmentType(worker_t * passWorker, int passEntry, int passAlignment) { @@ -1300,7 +1305,7 @@ void mem_process_seqs(const mem_opt_t *opt, const bwt_t *bwt, const bntseq_t *bn } // Filter competing alignments from multi-frame encoding during ORF detection process - filterCompetingAln(&w, n); + 
filterCompetingAln(&w, n, opt->proteinFlag & ALIGN_FLAG_MANUAL_PRO); kt_for(opt->n_threads, worker2, &w, (opt->flag&MEM_F_PE)? n>>1 : n); diff --git a/bwamem.h b/bwamem.h index 3e30bec..39c3579 100644 --- a/bwamem.h +++ b/bwamem.h @@ -130,7 +130,7 @@ typedef struct { extern "C" { #endif - void filterCompetingAln(worker_t * passWorker, int passCount); + void filterCompetingAln(worker_t * passWorker, int passCount, int passDisable); int getAlignmentType(worker_t * passWorker, int passEntry, int passAlignment); smem_i *smem_itr_init(const bwt_t *bwt); diff --git a/bwt.c b/bwt.c index 711e859..dc2a2ab 100644 --- a/bwt.c +++ b/bwt.c @@ -1,5 +1,3 @@ -/* Contact: Toni Westbrook */ - #include #include #include diff --git a/bwt.h b/bwt.h index bc952db..c31fe2a 100644 --- a/bwt.h +++ b/bwt.h @@ -1,5 +1,3 @@ -/* Contact: Toni Westbrook */ - #ifndef BWA_BWT_H #define BWA_BWT_H diff --git a/bwtindex.c b/bwtindex.c index c82e4a9..2107fc5 100644 --- a/bwtindex.c +++ b/bwtindex.c @@ -1,5 +1,3 @@ -/* Contact: Toni Westbrook */ - #include #include #include @@ -393,7 +391,7 @@ int command_index(int argc, char *argv[]) { int command_prepare(int argc, char *argv[]) { char c; char refArg[] = "-p0"; - const char * refName; + const char * refName, * proxyAddress; int refType, valid; // Fixed passthrough arguments @@ -404,10 +402,12 @@ int command_prepare(int argc, char *argv[]) { valid = 1; refType = -1; refName = NULL; + proxyAddress = NULL; - while ((c = getopt(argc, argv, "r:f:")) >= 0) { + while ((c = getopt(argc, argv, "r:f:P:")) >= 0) { if (c == 'r') refType = atoi(optarg); if (c == 'f') refName = optarg; + if (c == 'P') proxyAddress = optarg; if (c == '?') valid = 0; } @@ -420,7 +420,8 @@ int command_prepare(int argc, char *argv[]) { fprintf(stderr, " -r <#> Reference Database:\n"); fprintf(stderr, " 1: UniProtKB Reviewed (Swiss-Prot)\n"); fprintf(stderr, " 2: UniProtKB Clustered 90%% (UniRef90)\n\n"); - fprintf(stderr, " -f Skip download, use local copy of reference database (may 
be indexed)\n\n"); + fprintf(stderr, " -f Skip download, use local copy of reference database (may be indexed)\n"); + fprintf(stderr, " -P
HTTP or SOCKS proxy address\n\n"); fprintf(stderr, "Examples:\n\n"); fprintf(stderr, " paladin prepare -r2\n"); fprintf(stderr, " paladin prepare -r1 -f uniprot_sprot.fasta.gz\n"); @@ -432,7 +433,7 @@ int command_prepare(int argc, char *argv[]) { // We can generalize this in the future to include other reference types if (!refName) { - if ((refName = downloadUniprotReference(refType))[0] == 0) { + if ((refName = downloadUniprotReference(refType, proxyAddress))[0] == 0) { return 1; } } diff --git a/bwtindex.h b/bwtindex.h index dec939c..2b2d163 100644 --- a/bwtindex.h +++ b/bwtindex.h @@ -1,4 +1,3 @@ - #ifndef BWTINDEX_H_ #define BWTINDEX_H_ diff --git a/main.c b/main.c index 8c98a98..ae1f589 100644 --- a/main.c +++ b/main.c @@ -1,7 +1,7 @@ /* The MIT License - Copyright (c) 2015 by Anthony Westbrook, University of New Hampshire + Copyright (c) 2015 by Anthony Westbrook, University of New Hampshire Copyright (c) 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -51,7 +51,7 @@ UniProt-generated functional profile. This text file may be used for all downstream characterizations. 
- Contact: Toni Westbrook + Contact: Toni Westbrook For information regarding BWA, contact Heng Li */ @@ -135,7 +135,7 @@ int renderVersion() { fprintf(stderr, "Program: PALADIN (Protein Alignment and Detection Interface)\n"); fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); - fprintf(stderr, "Contact: Toni Westbrook (UNH) \n"); + fprintf(stderr, "Contact: Toni Westbrook (UNH) \n"); fprintf(stderr, "Based on: BWA by Heng Li \n\n"); return 1; diff --git a/main.h b/main.h index 3f45528..159718f 100644 --- a/main.h +++ b/main.h @@ -1,7 +1,7 @@ /* The MIT License - Copyright (c) 2015 by Anthony Westbrook, University of New Hampshire + Copyright (c) 2015 by Anthony Westbrook, University of New Hampshire Copyright (c) 2011 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -51,7 +51,7 @@ UniProt-generated functional profile. This text file may be used for all downstream characterizations. - Contact: Toni Westbrook + Contact: Toni Westbrook For information regarding BWA, contact Heng Li */ @@ -66,7 +66,7 @@ #define PACKAGE_VERSION STR(PACKAGE_VERSION_MAJOR) "." STR(PACKAGE_VERSION_MINOR) "." 
STR(PACKAGE_VERSION_REV) #define PACKAGE_VERSION_MAJOR 1 #define PACKAGE_VERSION_MINOR 3 -#define PACKAGE_VERSION_REV 1 +#define PACKAGE_VERSION_REV 2 #endif // Render usage and version details diff --git a/protein.c b/protein.c index be760be..1ccc77f 100644 --- a/protein.c +++ b/protein.c @@ -1,5 +1,3 @@ -/* Contact: Toni Westbrook */ - #include #include #include diff --git a/protein.h b/protein.h index 9fcffa8..75d00b8 100644 --- a/protein.h +++ b/protein.h @@ -1,5 +1,3 @@ -/* Contact: Toni Westbrook */ - #ifndef PROTEIN_H_ #define PROTEIN_H_ @@ -11,10 +9,11 @@ #define OUTPUT_TYPE_UNIPROT_SIMPLE 0 #define OUTPUT_TYPE_UNIPROT_FULL 1 -#define ALIGN_FLAG_BRUTE_ORF 0x0001 +#define ALIGN_FLAG_BRUTE_ORF 0x0001 #define ALIGN_FLAG_GEN_NT 0x0002 #define ALIGN_FLAG_KEEP_PRO 0x0004 #define ALIGN_FLAG_ADJUST_ORF 0x0008 +#define ALIGN_FLAG_MANUAL_PRO 0x0010 extern unsigned char codon_aa_hash[64]; diff --git a/uniprot.c b/uniprot.c index f0f11cf..e92fceb 100644 --- a/uniprot.c +++ b/uniprot.c @@ -35,7 +35,7 @@ int * getGlobalCount(int passPrimary) { } -void prepareUniprotReport(int passType, int passPrimary, UniprotList * passLists, CURLBuffer * passBuffer) { +void prepareUniprotReport(int passType, int passPrimary, UniprotList * passLists, CURLBuffer * passBuffer, const char * passProxy) { UniprotList * globalLists; // Aggregate and sort lists by value @@ -48,7 +48,7 @@ void prepareUniprotReport(int passType, int passPrimary, UniprotList * passLists // Report specific preparation if (passType == OUTPUT_TYPE_UNIPROT_FULL) { // Submit entries to UniProt and retrieve full information - retrieveUniprotOnline(passLists + UNIPROT_LIST_FULL, passBuffer); + retrieveUniprotOnline(passLists + UNIPROT_LIST_FULL, passBuffer, passProxy); joinOnlineLists(passLists + UNIPROT_LIST_FULL, passBuffer->buffer); } @@ -58,14 +58,14 @@ void prepareUniprotReport(int passType, int passPrimary, UniprotList * passLists qsort(passLists[UNIPROT_LIST_ORGANISM].entries, 
passLists[UNIPROT_LIST_ORGANISM].entryCount, sizeof(UniprotEntry), uniprotEntryCompareOrganism); } -void renderUniprotReport(int passType, int passPrimary, FILE * passStream) { +void renderUniprotReport(int passType, int passPrimary, FILE * passStream, const char * passProxy) { UniprotList * globalLists; UniprotList uniprotLists[3]; CURLBuffer tempBuffer; char commonHeader[] = "Count\tAbundance\tQuality (Avg)\tQuality (Max)"; // Prepare data - prepareUniprotReport(passType, passPrimary, uniprotLists, &tempBuffer); + prepareUniprotReport(passType, passPrimary, uniprotLists, &tempBuffer, passProxy); // Report no data globalLists = getGlobalLists(passPrimary); @@ -202,7 +202,7 @@ void cleanUniprotReferenceUniref(const char * passName, int passANN) { } // Download the requested UniProt reference (sprot/trembl/uniref90) -const char * downloadUniprotReference(int passReference) { +const char * downloadUniprotReference(int passReference, const char * passProxy) { CURL * curlHandle; CURLcode curlResult; FILE * fileHandle; @@ -213,7 +213,8 @@ const char * downloadUniprotReference(int passReference) { retFile = downloadNames[passReference]; curl_easy_setopt(curlHandle, CURLOPT_URL, downloadURLs[passReference]); - curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, fileHandle) ; + curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, fileHandle); + if (passProxy) curl_easy_setopt(curlHandle, CURLOPT_PROXY, passProxy); logMessage(__func__, LOG_LEVEL_MESSAGE, "Downloading %s...\n", downloadURLs[passReference]); curlResult = curl_easy_perform(curlHandle); @@ -230,7 +231,7 @@ const char * downloadUniprotReference(int passReference) { return retFile; } -void retrieveUniprotOnline(UniprotList * passList, CURLBuffer * retBuffer) { +void retrieveUniprotOnline(UniprotList * passList, CURLBuffer * retBuffer, const char * passProxy) { int entryIdx, queryIdx, parseIdx, errorIdx, queryCount; CURL * curlHandle; CURLcode curlResult; @@ -269,6 +270,7 @@ void retrieveUniprotOnline(UniprotList * 
passList, CURLBuffer * retBuffer) { curl_easy_setopt(curlHandle, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curlHandle, CURLOPT_WRITEFUNCTION, receiveUniprotOutput); curl_easy_setopt(curlHandle, CURLOPT_WRITEDATA, &tempBuffer); + if (passProxy) curl_easy_setopt(curlHandle, CURLOPT_PROXY, passProxy); resetCURLBuffer(&tempBuffer); curlResult = curl_easy_perform(curlHandle); diff --git a/uniprot.h b/uniprot.h index ebafb31..16d5163 100644 --- a/uniprot.h +++ b/uniprot.h @@ -45,7 +45,7 @@ extern int uniprotPriListCount; extern int uniprotSecListCount; // Rendering -void renderUniprotReport(int passType, int passPrimary, FILE * passStream); +void renderUniprotReport(int passType, int passPrimary, FILE * passStream, const char * passProxy); void renderUniprotEntries(UniprotList * passList, int passType, FILE * passStream); void renderNumberAligned(const mem_opt_t * passOptions); @@ -56,7 +56,7 @@ void cleanUniprotLists(UniprotList * passLists, int passPrimary); // Support UniprotList * getGlobalLists(int passPrimary); int * getGlobalCount(int passPrimary); -void prepareUniprotReport(int passType, int passPrimary, UniprotList * passLists, CURLBuffer * passBuffer); +void prepareUniprotReport(int passType, int passPrimary, UniprotList * passLists, CURLBuffer * passBuffer, const char * passProxy); void prepareUniprotLists(UniprotList * retLists, int passPrimary); void aggregateUniprotList(UniprotList * retList, int passListType, int passPrimary); void joinOnlineLists(UniprotList * retList, char * passUniprotOutput); @@ -71,8 +71,8 @@ int uniprotEntryCompareOnline (const void * passEntry1, const void * passEntry2) // UniProt Interoperability int cleanUniprotReference(int passReference, const char * passBase); void cleanUniprotReferenceUniref(const char * passName, int passANN); -const char * downloadUniprotReference(int passReference); -void retrieveUniprotOnline(UniprotList * passList, CURLBuffer * retBuffer); +const char * downloadUniprotReference(int passReference, 
const char * passProxy); +void retrieveUniprotOnline(UniprotList * passList, CURLBuffer * retBuffer, const char * passProxy); size_t receiveUniprotOutput(void * passString, size_t passSize, size_t passNum, void * retStream); void initCURLBuffer(CURLBuffer * passBuffer, int passCapacity); void resetCURLBuffer(CURLBuffer * passBuffer);