https://www.imdb.com/search/title/?title_type=feature
https://www.imdb.com/search/title/?title_type=feature

这个下面的代码有问题,可以获取数据,但是total_movies = 100000 # 要抓取的总电影数目,设置成 100000 或者100个运行之后,可以获取25不同电影信息,然后从25开始一直复制1-25,1-25,1-25,1-25一样的数据到100个,或者 100000


import requests
from bs4 import BeautifulSoup
import csv
import time
import random
# 定义要抓取的数据列
data_columns = ['Title', 'Year', 'Duration', 'Rating', 'Vote Count', 'Image URL', 'Description']
# 定义User-Agent列表
user_agents = [
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Linux; Android 10; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:91.0) Gecko/20100101 Firefox/91.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
]
# 获取电影详细信息的函数
def fetch_movie_details(movie):
title = movie.find('h3', class_='ipc-title__text').text if movie.find('h3', class_='ipc-title__text') else 'N/A'
metadata = movie.find_all('span', class_='sc-b189961a-8 kLaxqf dli-title-metadata-item')
year = metadata[0].text if len(metadata) > 0 else 'N/A'
duration = metadata[1].text if len(metadata) > 1 else 'N/A'
rating = movie.find('span',
class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text if movie.find(
'span',
class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating') else 'N/A'
vote_count = movie.find('span', class_='ipc-rating-star--voteCount').text.strip('()') if movie.find('span',
class_='ipc-rating-star--voteCount') else 'N/A'
image_url = movie.find('img', class_='ipc-image')['src'] if movie.find('img', class_='ipc-image') else 'N/A'
description = movie.find('div', class_='ipc-html-content-inner-div').text.strip() if movie.find('div',
class_='ipc-html-content-inner-div') else 'N/A'
return [title, year, duration, rating, vote_count, image_url, description]
# 主函数
def fetch_movies_data(base_url, total_movies):
movies_data = []
headers = {'User-Agent': random.choice(user_agents)}
for start in range(1, total_movies, 50):
url = f"{base_url}&start={start}&ref_=adv_nxt"
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status() # 检查请求是否成功
soup = BeautifulSoup(response.text, 'html.parser')
movies = soup.find_all('li', class_='ipc-metadata-list-summary-item')
for movie in movies:
movie_details = fetch_movie_details(movie)
movies_data.append(movie_details)
print(f"Successfully fetched data for movies {start} to {start + 49}")
time.sleep(random.uniform(2, 4)) # 随机延迟
except requests.exceptions.RequestException as e:
print(f"Request failed: {e}, URL: {url}")
time.sleep(random.uniform(5, 10)) # 增加延迟后重试
return movies_data
# 写入CSV文件
def save_to_csv(filename, data, columns):
with open(filename, mode='w', newline='', encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(columns)
writer.writerows(data)
# 定义主程序入口
if __name__ == '__main__':
base_url = "<https://www.imdb.com/search/title/?title_type=feature>"
total_movies = 100000 # 要抓取的总电影数目
output_file = 'imdb_movies.csv'
print("开始抓取IMDb电影数据...")
movie_data = fetch_movies_data(base_url, total_movies)
save_to_csv(output_file, movie_data, data_columns)
print(f"数据抓取完成,已存储到 {output_file}")