# Douban Top 250 爬虫
# 阅读原文时间:2023年07月08日 阅读:1
# Ref: https://fishc.com.cn/forum.php?mod=viewthread&tid=101887&extra=page%3D1%26filter%3Dtypeid%26typeid%3D722

import requests
from bs4 import BeautifulSoup
import openpyxl

def open_url(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the body as text.

    A Chrome-on-Windows ``User-Agent`` header is sent with the request.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The response body, decoded with the encoding detected from the
        content (``apparent_encoding``).

    Raises:
        requests.exceptions.HTTPError: If the server answers with an
            error status (via ``raise_for_status``).
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36'}
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # Decode with the encoding guessed from the bytes rather than the
    # one declared in the response headers.
    res.encoding = res.apparent_encoding
    return res.text

def parserHtml(html) -> list:
    """Extract the movie rows from the markup of one listing page.

    Four lists are collected independently and then paired by position:
    titles and links from ``div.hd``, the info text from ``div.bd`` and
    the rating from ``span.rating_num``.

    Args:
        html: Markup of a listing page, as text.

    Returns:
        A list of ``[title, rating, info, link]`` lists in the order the
        elements are found. If the collected lists differ in length, only
        the leading complete rows are kept. When parsing fails, a message
        is printed and an empty list is returned, so callers can always
        iterate over or ``extend`` with the result.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')

        # Title and link both come from the anchor inside each 'hd' div.
        titles = []
        hrefs = []
        for each in soup.find_all('div', class_='hd'):
            titles.append(each.a.span.text)
            hrefs.append(each.a['href'])

        # Info text: the second and third lines of the first <p> in each
        # 'bd' div, stripped and concatenated.
        messages_movie = []
        for each in soup.find_all('div', class_='bd'):
            try:
                lines = each.p.text.split('\n')
                messages_movie.append(lines[1].strip() + lines[2].strip())
            except (AttributeError, IndexError):
                # A 'bd' div without a <p>, or with too few lines, is not
                # a movie entry; skip it.
                continue

        # Rating
        messages_star = [each.text
                         for each in soup.find_all('span', class_='rating_num')]

        return [
            [title, star, message, href]
            for title, star, message, href
            in zip(titles, messages_star, messages_movie, hrefs)
        ]
    except Exception:
        # Best effort: report the failure but keep the return type stable
        # instead of handing None back to the caller.
        print('解析错误')
        return []

# def sava_excel(result):
#     try:
#         with open(r'./Python_Excel_小甲鱼/Top_DouBan_250.txt', 'w', encoding='utf-8') as f:
#             for each in result:
#                 f.write(each)
#         f.close()
#     except:
#         print('存储错误')

def save_excel(result, filename='Top_DouBan_250.xlsx'):
    """Write the scraped rows to an Excel workbook.

    Args:
        result: Iterable of rows; each row is appended to the active
            worksheet below the header row.
        filename: Path of the workbook to write. Defaults to the name the
            script has always used.

    Errors raised while building or saving the workbook are reported on
    stdout instead of being propagated. ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer swallowed.
    """
    try:
        wb = openpyxl.Workbook()
        ws = wb.active

        # Header row: movie title, rating, movie info, movie link.
        ws['A1'] = '电影名称'
        ws['B1'] = '评分'
        ws['C1'] = '电影信息'
        ws['D1'] = '电影链接'

        for each in result:
            ws.append(each)
        wb.save(filename)
    except Exception:
        print('保存Excel错误')

def get_depth(html):
    """Read the page count from a listing page's pager.

    Locates the ``span`` with class ``next`` and returns the text of the
    node two siblings before it.

    Args:
        html: Markup of a listing page, as text.

    Returns:
        The text of that node (a string, not an int), or ``None`` after
        printing a message when it cannot be found.
    """
    try:
        soup = BeautifulSoup(html, 'html.parser')
        next_span = soup.find('span', class_='next')
        # NOTE(review): two hops back -- presumably the first sibling is a
        # whitespace text node and the second is the last page link;
        # confirm against the live markup.
        return next_span.previous_sibling.previous_sibling.text
    except Exception:
        # Typically AttributeError when the pager is missing. Stay best
        # effort, but let KeyboardInterrupt/SystemExit through.
        print('获取页数错误')
        return None

def main():
    """Scrape every listing page of the Douban Top 250 and save to Excel.

    The landing page is fetched first to learn how many pages exist. Each
    page is then downloaded (25 entries per page, selected with the
    ``start`` query parameter), parsed, and the combined rows are passed
    to ``save_excel``.
    """
    host = r'https://movie.douban.com/top250'
    html = open_url(host)
    depth = get_depth(html)

    try:
        page_count = int(depth)
    except (TypeError, ValueError):
        # get_depth returns None when the pager cannot be read, and its
        # text may not be numeric; fall back to the first page instead of
        # crashing.
        page_count = 1

    result = []

    for i in range(page_count):
        url = '{}/?start={}&filter='.format(host, 25 * i)
        html = open_url(url)
        # parserHtml may return None after a parse failure; treat that
        # page as empty rather than failing on extend(None).
        result.extend(parserHtml(html) or [])
    save_excel(result)

if __name__ == '__main__':
    main()