利用bs4和requests爬取豆瓣Top250排行榜电影信息

豆瓣Top250

1.工具

Python、requests、bs4、unicodecsv(代码中以 csv 为别名导入)

2.思路

a.导入第三方库
import requests
import bs4
import unicodecsv as csv
b.使用requests获取网页源码

image.png

# Browser-like User-Agent header sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 "
}
# Build the URL of each page: `start` is the offset passed in the query
# string, stepping by 25 over ten pages (0, 25, ..., 225).
for start in range(0, 250, 25):
    html = requests.get('https://movie.douban.com/top250?start=' + str(start),
                        headers=header, timeout=10)
    # Fail loudly on an HTTP error status instead of processing an error page.
    html.raise_for_status()
    # Choose the decoding before the body is first read as text.
    html.encoding = 'utf-8'
    # Printing the response object itself only shows "<Response [200]>";
    # use .text (or .content) to get the page source.
    print(html.text)
c.使用bs4提取有效信息
# Parse one downloaded page (`html` is the response fetched in the previous
# step). In the complete script further down, this code runs inside the page
# loop and `result` is created once before that loop.
soup = bs4.BeautifulSoup(html.text, 'html.parser')

result = []
# Each <div class="info"> element is handled as one movie entry.
for item in soup.find_all('div', 'info'):
    # Text of the first <span> inside the entry's first <div>
    # (presumably the primary title).
    title = item.div.span.string
    # Third child node of the first <p> under <div class="bd">; once newlines
    # and spaces are removed, its first four characters are taken as the year.
    yearline = item.find('div', 'bd').p.contents[2].string
    yearline = yearline.replace('\n', '')
    yearline = yearline.replace(' ', '')
    year = yearline[0:4]
    rating = item.find('span', {'class': 'rating_num'}).get_text()
    # One output row per movie: [title, rating, year].
    oneresult = [title, rating, year]
    result.append(oneresult)
print(result)
d.存储文件


    # Write every collected row to top_250.csv. The file is opened in binary
    # mode because `csv` here is unicodecsv, which writes encoded bytes.
    # The `with` statement closes the file on exit, so no explicit close()
    # is needed.
    with open('top_250.csv', 'wb') as f:
        w = csv.writer(f)
        w.writerows(result)

完整代码

#导入requests bs4
import requests
import bs4
import unicodecsv as csv
# Rows collected across all pages; each row is [title, rating, year].
result = []

# Browser-like User-Agent header sent with every request.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.93 Safari/537.36 "
}
# Build the URL of each page: `start` is the offset passed in the query
# string, stepping by 25 over ten pages (0, 25, ..., 225).
for start in range(0, 250, 25):
    html = requests.get('https://movie.douban.com/top250?start=' + str(start),
                        headers=header, timeout=10)
    # Fail loudly on an HTTP error status instead of parsing an error page.
    html.raise_for_status()
    # Choose the decoding before the body is first read as text.
    html.encoding = 'utf-8'
    # Printing the response object itself only shows "<Response [200]>";
    # use .text (or .content) to get the page source.
    print(html.text)
    soup = bs4.BeautifulSoup(html.text, 'html.parser')

    # Each <div class="info"> element is handled as one movie entry.
    for item in soup.find_all('div', 'info'):
        # Text of the first <span> inside the entry's first <div>
        # (presumably the primary title).
        title = item.div.span.string
        # Third child node of the first <p> under <div class="bd">; once
        # newlines and spaces are removed, its first four characters are
        # taken as the year.
        yearline = item.find('div', 'bd').p.contents[2].string
        yearline = yearline.replace('\n', '')
        yearline = yearline.replace(' ', '')
        year = yearline[0:4]
        rating = item.find('span', {'class': 'rating_num'}).get_text()
        result.append([title, rating, year])
    # Progress output: everything collected so far.
    print(result)

# Write the CSV once, after every page has been parsed, rather than rewriting
# the whole file on each page. Binary mode is used because `csv` here is
# unicodecsv, which writes encoded bytes; the `with` statement closes the file.
with open('top_250.csv', 'wb') as f:
    w = csv.writer(f)
    w.writerows(result)

<!--WeMedia start-->

付费内容

<!--WeMedia end-->

最后修改:2021 年 07 月 21 日
如果觉得我的文章对你有用,请随意赞赏