import urllib.request
import os,re,time
from bs4 import BeautifulSoup as bs
from multiprocessing import Pool
import socket
socket.setdefaulttimeout(10)  # global timeout for every urllib/socket call

# Download target directory: ./pic under the current working directory.
# (local renamed from `abs`, which shadowed the builtin abs())
base_dir = os.path.abspath('.')
targetDir = os.path.join(base_dir, 'pic')
# makedirs(exist_ok=True) is race-free; no isdir pre-check needed.
os.makedirs(targetDir, exist_ok=True)
# Desktop browser User-Agent so douban serves the regular HTML layout.
headers = {
'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)'
' Gecko/20091201 Firefox/3.5.6'
}
def destFile(path, target_dir=None):
    """Map an image URL to a local file path.

    The final path segment of *path* (everything after the last '/')
    becomes the filename.

    Args:
        path: image URL (expected to contain at least one '/').
        target_dir: destination directory; defaults to the module-level
            ``targetDir`` for backward compatibility.

    Returns:
        Absolute local path for the downloaded file.
    """
    if target_dir is None:
        target_dir = targetDir
    # rsplit never raises (unlike rindex) and keeps the whole string
    # as the filename if no '/' is present.
    filename = path.rsplit('/', 1)[-1]
    return os.path.join(target_dir, filename)
def download(img, retries=3):
    """Download image URL *img* into the target directory.

    Retries on timeouts, but at most *retries* additional attempts —
    the original implementation recursed without a bound and could
    recurse forever (eventually hitting RecursionError) on a dead host.

    Args:
        img: direct URL of the image to fetch.
        retries: remaining retry attempts after a timeout.
    """
    try:
        print(img)
        req = urllib.request.Request(img, headers=headers)
        data = urllib.request.urlopen(req).read()
        with open(destFile(img), 'wb') as f:
            f.write(data)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout) and retries > 0:
            print('urllib超时重新抓取...')
            time.sleep(1)
            download(img, retries - 1)
        else:
            # Non-timeout URL errors were silently swallowed before;
            # surface them so failures are visible.
            print(e)
    except socket.timeout:
        if retries > 0:
            print('socket超时重新抓取...')
            time.sleep(1)
            download(img, retries - 1)
    except Exception as err:
        print(err)
def getImgByPage(url):
    """Fetch one gallery page and download every full-size image on it.

    Handles both layouts seen on douban: the album grid
    (``photolst clearfix`` / ``.photolst_photo``) and the celebrity
    poster grid (``poster-col4 clearfix`` / ``.cover``).

    Args:
        url: URL of a single album or celebrity-photos page.
    """
    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req).read()
    # Name the parser explicitly: bs(data) picks whatever is installed,
    # which both warns and can change parsing behavior between machines.
    soup = bs(data, 'html.parser')
    div = soup.find(class_="photolst clearfix")
    if div:
        photos = div.select(".photolst_photo")
    else:
        div = soup.find(class_="poster-col4 clearfix")
        if div is None:
            # Neither layout found (login wall, empty/changed page):
            # the original crashed with AttributeError here.
            return
        photos = div.select(".cover")
    for photo in photos:
        img = photo.find('img').get('src')
        # Thumbnail URLs point at /thumb/; the full-size image is /photo/.
        download(img.replace("/thumb/", "/photo/"))
def getPagesByType(type):
    """Crawl every page of the gallery at the global ``hostname``.

    Reads the first page to discover the total page count from the
    paginator, then downloads all pages (in parallel when > 1).

    Args:
        type: 'album' (18 photos per page) or 'celebrity' (40 photos
            per page); controls the pagination query string.
    """
    try:
        req = urllib.request.Request(hostname, headers=headers)
        data = urllib.request.urlopen(req).read()
        soup = bs(data, 'html.parser')
        # Print only the CJK characters of the title as a progress hint.
        print(''.join(re.findall(r'[\u4e00-\u9fa5]', soup.title.text)))
        paginator = soup.find(class_='paginator')
        total_page = 1
        if paginator:
            total_page = int(paginator.find(class_='thispage').get('data-total-page'))
        if total_page == 1:
            getImgByPage(hostname)
        else:
            if type == 'album':
                urls = [hostname + "?start=" + str(i * 18)
                        for i in range(total_page)]
            elif type == 'celebrity':
                urls = [hostname + "?type=C&sortby=vote&size=a&subtype=a&start=" + str(i * 40)
                        for i in range(total_page)]
            else:
                urls = []
            # Context manager cleans the pool up; the original leaked
            # the 8 worker processes (no close()/join()).
            with Pool(8) as pool:
                pool.map(getImgByPage, urls)
    except Exception as err:
        print(err)
if __name__ == "__main__":
    # Both URL formats are accepted:
    #   http://www.douban.com/photos/album/18445613/
    #   http://movie.douban.com/celebrity/1013763/photos/
    hostname = input("请输入豆瓣相册地址: \n")
    if hostname.find('http:') < 0:
        hostname = 'http://' + hostname
    # Strip any query string so pagination parameters append cleanly.
    if hostname.find('?') > 0:
        hostname = hostname[:hostname.find('?')]
    # BUG FIX: the original tested `hostname[:-1] != '/'` (everything
    # EXCEPT the last character), which is essentially always true, so a
    # second slash was appended to already well-formed URLs.
    if not hostname.endswith('/'):
        hostname = hostname + "/"
    if re.search(r'http://www\.douban\.com/photos/album/\d+/', hostname):
        getPagesByType("album")
    elif re.search(r'http://movie\.douban\.com/celebrity/\d+/photos/', hostname):
        getPagesByType("celebrity")
    else:
        print('请检查地址是否有误!')