使用requests以及bs4库爬取豆瓣电影前100
import requests
import re
from bs4 import BeautifulSoup
def getHtmlText(url, ulist, timeout=10, headers=None):
    """Download one listing page and append its movies to ``ulist``.

    Each movie is appended as ``[title, link, rating, quote]``; a field
    that is missing on the page is stored as an empty string.

    Args:
        url: Address of the listing page to fetch.
        ulist: List that receives one four-item list per movie; it is
            extended in place.
        timeout: Seconds to wait for the server before giving up.
        headers: Optional HTTP request headers. When omitted, a
            browser-like ``User-Agent`` is sent.

    Raises:
        requests.RequestException: On connection problems, timeouts, or
            an HTTP error status.
    """
    if headers is None:
        # NOTE(review): sites commonly reject the default python-requests
        # User-Agent, so a browser-like value is sent unless the caller
        # overrides it -- confirm this is still needed for the target site.
        headers = {'User-Agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block forever.
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as an empty result.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    ulist.extend(_parse_movies(soup))


def _parse_movies(soup):
    """Return a ``[title, link, rating, quote]`` list per movie in ``soup``."""
    # Walk the relevant tags once, in document order, and attach each one to
    # the most recent ``div.hd``. Indexing four independent find_all()
    # results in parallel mis-pairs fields (or raises IndexError) as soon as
    # one movie lacks a quote.
    # NOTE(review): assumes a movie's title, rating and quote tags come after
    # its ``div.hd`` and before the next one -- confirm against the markup.
    movies = []
    current = None
    relevant = soup.find_all(
        ['div', 'span'],
        class_=['hd', 'title', 'rating_num', 'inq'],
    )
    for tag in relevant:
        classes = tag.get('class') or []
        if tag.name == 'div':
            if 'hd' in classes:
                anchor = tag.find('a')
                link = anchor.get('href', '') if anchor is not None else ''
                current = {
                    'title': None,
                    'link': link,
                    'rating': None,
                    'quote': None,
                }
                movies.append(current)
            continue
        if current is None:
            # Span seen before any movie header: nothing to attach it to.
            continue
        text = tag.get_text()
        if 'title' in classes:
            # Keep only the primary title; alternate titles (e.g. the
            # Hong Kong / Taiwan translations) contain a '/'.
            if current['title'] is None and '/' not in text:
                current['title'] = text
        elif 'rating_num' in classes:
            if current['rating'] is None:
                current['rating'] = text
        elif 'inq' in classes:
            if current['quote'] is None:
                current['quote'] = text
    return [
        [m['title'] or '', m['link'], m['rating'] or '', m['quote'] or '']
        for m in movies
    ]
def printText(ulist):
    """Print the name, rating, link and short review of every entry.

    Each entry of ``ulist`` is read by position: index 0 is the name,
    index 1 the link, index 2 the rating and index 3 the short review.
    """
    # TODO: save the scraped data and lay it out as a proper table instead
    # of printing it field by field.
    # (label, position-in-entry) pairs, in the order they are printed.
    layout = (('名称', 0), ('评分', 2), ('链接', 1), ('短评', 3))
    for entry in ulist:
        for label, position in layout:
            print1(label, entry[position])
def print1(name, info):
    """Print ``name`` and ``info`` on one line, separated by a colon."""
    line = f'{name}:{info}'
    print(line)
def main():
    """Fetch four listing pages and print the collected movies.

    The pages are requested with start offsets 0, 25, 50 and 75.
    """
    movies = []
    for offset in range(0, 100, 25):
        page_url = 'https://movie.douban.com/top250?start={}'.format(offset)
        getHtmlText(page_url, movies)
    printText(movies)
# Run the scraper only when this file is executed as a script, so that
# importing the module does not trigger network requests.
if __name__ == '__main__':
    main()
在看完网课后首次编写爬虫,哎,感觉好笨,参考了很多博客。因为经验不足出现很多简单的错误,比如 tuple index out of range、'list' object is not callable 等等。下次目标是爬取电影短评以及将数据合理地保存。