getftop.py
#!/usr/bin/env python
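# getftop.py -- crawl wallpaper listing pages on http://ftop.ru, derive the
# full-size image URL from each thumbnail, and download the images into
# localdir using one thread per image. Written for Python 2 (print statement,
# urllib/urllib2).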
import urllib
import urllib2
import urlparse
from bs4 import BeautifulSoup
from time import ctime
import threading
import progressbar
import psycopg2
site = "http://ftop.ru"
localdir = "/Volumes/exFAT/ftop.py/"   # destination directory for downloads

# Module-level state shared by the urlretrieve progress hook below.
pbar = None
downloaded = 0

def show_progress(count, block_size, total_size):
    # Report-hook signature expected by urllib.urlretrieve: called once per
    # block with the block count, block size and total size in bytes.
    global pbar, downloaded
    if pbar is None:
        pbar = progressbar.ProgressBar(maxval=total_size)
        pbar.start()
        downloaded = 0
    downloaded += block_size
    pbar.update(min(downloaded, total_size))
    if downloaded >= total_size:
        pbar.finish()
        pbar = None
def saveimage(imageurl, localname):
    # Fetch a single image and write it to localdir under localname.
    print "Begin download: " + localname
    urllib.urlretrieve(imageurl, localdir + localname)
    #urllib.urlretrieve(imageurl, localdir+localname, reporthook=show_progress)
    print "Finish download: " + localname
def dealpage(url):
    # Parse one listing page, work out the full-size URL for every thumbnail
    # on it, and download all the images concurrently (one thread per image).
    html = urllib2.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")
    imagedivlist = soup.find_all("div", attrs={"class": "col-md-4 col-sm-6"})
    threads = []
    nloops = range(len(imagedivlist))
    while len(imagedivlist) > 0:
        imagediv = imagedivlist.pop()
        imageinfohtmlurl = imagediv.find("a").get("href")
        miniimgurl = imagediv.find("img").get("src")
        # Name the local file after the detail page: "<id>.html" -> "<id>.jpg".
        filename = imageinfohtmlurl.split("/")[-1].replace(".html", ".jpg")
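        # The thumbnail URL carries a date directory and a hashed file name,
        # from which the full-size URL is rebuilt. With made-up values (the
        # exact path layout is an assumption), a thumbnail such as
        #   http://ftop.ru/mini/201601/abcdef.jpg
        # would be rewritten to
        #   http://ftop.ru/images/201601/ftop.ru_abcdef.jpg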
        _, _, _, _, date, hashname = miniimgurl.split("/")
        trueurl = site + "/images/" + date + "/ftop.ru_" + hashname
        #saveimage(trueurl, filename)
        t = threading.Thread(target=saveimage, args=(trueurl, filename))
        threads.append(t)
    # Start every download thread, then wait for all of them before moving on.
    for i in nloops:
        threads[i].start()
    for i in nloops:
        threads[i].join()
    print "finished " + url
#date=miniimgurl.split("/")[5]
#hashname=miniimgurl.split("/")[6]
#imginfohtml=soup.find("div",attrs={"class":"col-md-4 col-sm-6"}).find("a").get("href")
#miniimgurl=soup.find("div",attrs={"class":"col-md-4 col-sm-6"}).find("img").get("src")
def db_init():
    # Stub: open a PostgreSQL connection as user 'tux' and get a cursor.
    # (Defined but not yet called from the main block below.)
    cxn = psycopg2.connect(user='tux')
    cur = cxn.cursor()
    cur.execute()   # TODO: the SQL statement to run still has to be supplied
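    # A possible shape for the missing bookkeeping step, purely as a sketch;
    # the table name and columns below are assumptions, not from the original:
    #
    #     cur.execute("CREATE TABLE IF NOT EXISTS downloads "
    #                 "(filename text PRIMARY KEY, url text)")
    #     cxn.commit()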
if __name__ == "__main__":
    # Earlier single-image version, left commented out:
    # link_crawler(url, delay=0, num_retries=1, user_agent='BadCrawler')
    # html=urllib2.urlopen(url).read()
    # soup=BeautifulSoup(html,"html.parser")
    # originalImageurl=soup.find(attrs={'class':'res-origin'}).find("a").get("href")
    # newhtml=urllib2.urlopen("http://ftop.ru"+originalImageurl).read()
    # newsoup=BeautifulSoup(newhtml)
    # imageurl=newsoup.find("h2",attrs={"align":"center"}).find("a").get("href")
    # imagedownloadname=newsoup.find("h2",attrs={"align":"center"}).find("a").get("download")
    # urllib.urlretrieve(imageurl,localdir+imagedownloadname)
    # Crawl listing pages 2 through 99.
    for i in range(2, 100):
        dealpage(site + "/page/" + str(i))