Skip to content

Commit

Permalink
updated page indexes
Browse files: browse the repository at this point in the history
  • Loading branch information
rvargas committed May 24, 2017
1 parent ce0c5ee commit e869099
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 28 deletions.
35 changes: 18 additions & 17 deletions cazy_parser/create_cazy_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,20 @@
import os, sys, urllib, re, string, time, string, argparse
from bs4 import BeautifulSoup

def logo():
def main():

print '''
___ __ ____ _ _ ____ __ ____ ____ ____ ____
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
___ __ ____ _ _ ____ __ ____ ____ ____ ____
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
A simple way to retrieve fasta sequences from CAZy Database (:
A simple way to retrieve fasta sequences from CAZy Database (:
This is the database creator script.
This is the database creator script.
'''
'''

def main():
logo()
parser = argparse.ArgumentParser(description='Generate a comma separated table with information gathered from the CAZy database; internet connection is required.')
args = parser.parse_args()

Expand Down Expand Up @@ -93,7 +92,7 @@ def main():
#==============================================================================#
# Family section
#==============================================================================#
soup = BeautifulSoup(urllib.urlopen(main_class_link))
soup = BeautifulSoup(urllib.urlopen(main_class_link), "lxml")
# soup = BeautifulSoup(urllib.urlopen(main_class_link), 'lxml')
family_table = soup.findAll(name='table')[0]
rows = family_table.findAll(name='td')
Expand All @@ -108,7 +107,7 @@ def main():
print '> %s' % family
#
main_link = 'http://www.cazy.org/%s.html' % family
family_soup = BeautifulSoup(urllib.urlopen(main_link))
family_soup = BeautifulSoup(urllib.urlopen(main_link), 'lxml')
# main_link_dic = {'http://www.cazy.org/%s_all.html#pagination_PRINC' % family: '',
# 'http://www.cazy.org/%s_characterized.html#pagination_PRINC' % family: 'characterized'}
#====================#
Expand All @@ -124,14 +123,15 @@ def main():

page_zero = main_link

soup = BeautifulSoup(urllib.urlopen(main_link))
soup = BeautifulSoup(urllib.urlopen(main_link), "lxml")

# Get page list for the family // 1, 2, 3, 4, 5, 7
page_index_list = soup.findAll(name = 'a', attrs={'class':'lien_pagination'})
# page_list = ['http://www.cazy.org/' + str(l['href']) for l in page_index_list] # deprecated

if bool(page_index_list):
first_page_idx = int(page_index_list[0]['href'].split('PRINC=')[-1].split('#')[0]) # be careful with this
last_page_idx = int(page_index_list[-2]['href'].split('PRINC=')[-1].split('#')[0]) # be careful with this

first_page_idx = int(re.findall('=(\d*)#', str(page_index_list[0]))[0]) # be careful with this
last_page_idx = int(re.findall('=(\d*)#', str(page_index_list[-2]))[0]) # be careful with this

# generate page_list
page_list = []
Expand All @@ -147,7 +147,7 @@ def main():
for link in page_list:
# print link
# tr = rows // # td = cells
soup = BeautifulSoup(urllib.urlopen(link))
soup = BeautifulSoup(urllib.urlopen(link), "lxml")
table = soup.find('table', attrs={'class':'listing'})
domain = ''

Expand Down Expand Up @@ -222,4 +222,5 @@ def main():

if __name__ == '__main__':
main()

# done.
19 changes: 8 additions & 11 deletions cazy_parser/extract_cazy_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,18 @@
import os, sys, itertools, urllib, argparse
#==============================================================================#

def logo():
def main(argv=sys.argv[1:]):
print '''
___ __ ____ _ _ ____ __ ____ ____ ____ ____
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
A simple way to retrieve fasta sequences from CAZy Database (:
___ __ ____ _ _ ____ __ ____ ____ ____ ____
/ __) / _\ (__ )( \/ )___( _ \ / _\ ( _ \/ ___)( __)( _ \\
( (__ / \ / _/ ) /(___)) __// \ ) /\___ \ ) _) ) /
\___)\_/\_/(____)(__/ (__) \_/\_/(__\_)(____/(____)(__\_)
This is the accession code retrieval script.
A simple way to retrieve fasta sequences from CAZy Database (:
'''
This is the accession code retrieval script.
def main(argv=sys.argv[1:]):
logo()
'''
#==============================================================================#
# Options
#==============================================================================#
Expand Down

0 comments on commit e869099

Please sign in to comment.