A Douban Crawler in Python 3

Nothing to do on Chinese New Year's Eve, and I couldn't grab any red envelopes either, so I wrote some code for fun and to review the Python I've been learning lately.

The source code:

# -*- coding: utf-8 -*-
#---------------------------------------
# Program:  Douban photo album crawler
# Version:  1.3
# Author:   Cao Linjian
# Date:     2015-02-18
# Language: Python 3.3
# Usage:    feed it a celebrity photo page URL or an album URL
#---------------------------------------
import os
import re
import time
import socket
import urllib.request
import urllib.error
from multiprocessing import Pool
from bs4 import BeautifulSoup as bs  # third-party library for parsing HTML

socket.setdefaulttimeout(10)

base_dir = os.path.abspath('.')  # was named 'abs', which shadows the builtin
targetDir = os.path.join(base_dir, 'pic')
if not os.path.isdir(targetDir):
    os.mkdir(targetDir)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)'
                  ' Gecko/20091201 Firefox/3.5.6'
}

# Build the local file name for a downloaded image from its URL
def destFile(path):
    pos = path.rindex('/')
    return os.path.join(targetDir, path[pos + 1:])

def download(img):
    try:
        print(img)
        req = urllib.request.Request(img, headers=headers)
        data = urllib.request.urlopen(req).read()
        with open(destFile(img), 'wb') as f:
            f.write(data)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('urllib timed out, retrying...')
            time.sleep(1)
            download(img)
    except socket.timeout:
        print('socket timed out, retrying...')
        time.sleep(1)
        download(img)
    except Exception as err:
        print('download failed:')
        print(err)

# Collect the image URLs on the current page and download them
def getImgByPage(url):
    req = urllib.request.Request(url, headers=headers)
    data = urllib.request.urlopen(req).read()
    soup = bs(data, 'html.parser')
    div = soup.find(class_="photolst clearfix")  # album page layout
    if div:
        photolst = div.select(".photolst_photo")
    else:
        div = soup.find(class_="poster-col4 clearfix")  # celebrity page layout
        photolst = div.select(".cover")
    for photo in photolst:
        img = photo.find('img').get('src')
        img = img.replace("/thumb/", "/photo/")  # turn the thumbnail URL into the full-size one
        download(img)

# Read all album pages, dispatching on the address type
def getPagesByType(page_type):
    try:
        req = urllib.request.Request(hostname, headers=headers)
        data = urllib.request.urlopen(req).read()
        soup = bs(data, 'html.parser')
        # Keep only the Chinese characters of the title, to dodge
        # encoding errors on the Windows console
        s = ''.join(re.findall(r'[\u4e00-\u9fa5]', soup.title.text))
        print(s)
        paginator = soup.find(class_='paginator')
        total_page = 1
        if paginator:
            total_page = int(paginator.find(class_='thispage').get('data-total-page'))
        if total_page == 1:
            getImgByPage(hostname)
        else:
            urls = []
            for i in range(total_page):
                if page_type == 'album':
                    # album pages step by 18 photos
                    urls.append(hostname + "?start=" + str(i * 18))
                elif page_type == 'celebrity':
                    # celebrity photo pages step by 40
                    urls.append(hostname + "?type=C&sortby=vote&size=a&subtype=a&start=" + str(i * 40))
            pool = Pool(8)
            pool.map(getImgByPage, urls)
    except Exception as err:
        print('failed to read album pages:')
        print(err)

if __name__ == "__main__":
    '''
    hostname = "http://www.douban.com/photos/album/18445613/"
    hostname = "http://movie.douban.com/celebrity/1013763/photos/"
    Both formats work ╮( ̄▽ ̄")╭
    '''
    hostname = input("Enter a Douban album URL: \n")
    if hostname.find('http:') < 0:
        hostname = 'http://' + hostname
    if hostname.find('?') > 0:
        hostname = hostname[0:hostname.find('?')]
    if hostname[-1] != '/':  # the original tested hostname[:-1], a bug
        hostname = hostname + "/"
    if re.search(r'http://www\.douban\.com/photos/album/\d+/', hostname):  # album URL
        getPagesByType("album")
    elif re.search(r'http://movie\.douban\.com/celebrity/\d+/photos/', hostname):  # celebrity photos URL
        getPagesByType("celebrity")
    else:
        print('Please check the URL for mistakes!')
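
One caveat: download() retries by calling itself, so a connection that keeps timing out can recurse without bound. A bounded retry loop is one alternative; this is a sketch under my own assumptions, not part of the original script (retries=3 and the minimal headers dict here are illustrative):

import socket
import time
import urllib.error
import urllib.request

def download_with_retries(img, dest, retries=3):
    # Bounded retry loop instead of the recursive retry above;
    # retries=3 is an arbitrary illustrative choice.
    headers = {'User-Agent': 'Mozilla/5.0'}  # stand-in for the dict above
    for attempt in range(retries):
        try:
            req = urllib.request.Request(img, headers=headers)
            data = urllib.request.urlopen(req).read()
            with open(dest, 'wb') as f:
                f.write(data)
            return True
        except (urllib.error.URLError, socket.timeout):
            time.sleep(1)  # brief pause before the next attempt
    return False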

Run the script and enter an address in the form http://www.douban.com/photos/album/18445613/
or http://movie.douban.com/celebrity/1013763/photos/; the http:// prefix can be left off.
An address pointing into the middle of an album, such as http://www.douban.com/photos/album/20570238/?start=18, also works.
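
To check in advance whether the script will accept a given address, the normalization and matching steps can be pulled out into a few lines. This sketch mirrors the logic in the __main__ block above; the sample URLs are the ones from this post:

import re

def normalize(url):
    # Same normalization as the script: add http://, drop the query
    # string, and make sure there is a trailing slash.
    if url.find('http:') < 0:
        url = 'http://' + url
    if url.find('?') > 0:
        url = url[:url.find('?')]
    if url[-1] != '/':
        url += '/'
    return url

for raw in ["www.douban.com/photos/album/20570238/?start=18",
            "http://movie.douban.com/celebrity/1013763/photos/"]:
    url = normalize(raw)
    if re.search(r'http://www\.douban\.com/photos/album/\d+/', url):
        print(url, '-> album')
    elif re.search(r'http://movie\.douban\.com/celebrity/\d+/photos/', url):
        print(url, '-> celebrity')
    else:
        print(url, '-> not recognized')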