import time

import pandas as pd
import requests  # fetch the web page data
from bs4 import BeautifulSoup  # Beautiful Soup parses the response
from tqdm import trange  # progress bar reporting crawl speed
def get_bilibili_url(start, end, oid="141367679"):
    """Build one danmaku-history API URL per day from ``start`` to ``end``.

    Args:
        start: First day of the range (inclusive), in any form accepted by
            ``pandas.date_range``, e.g. ``'9/24/2020'``.
        end: Last day of the range (inclusive), same forms as ``start``.
        oid: Value substituted into the ``oid`` query parameter. Defaults to
            the id this script originally hard-coded. (Presumably it selects
            the video whose danmaku is requested -- confirm against the API.)

    Returns:
        A list of URL strings in chronological order, one per calendar day.
        The list is empty when ``end`` is earlier than ``start``.
    """
    base = "https://api.bilibili.com/x/v2/dm/history?type=1"
    # The API takes the day as YYYY-MM-DD in the ``date`` query parameter.
    dates = pd.date_range(start, end).strftime('%Y-%m-%d')
    return [f"{base}&oid={oid}&date={date}" for date in dates]
def get_bilibili_danmu(url_list, output_path="bilibili_danmu.txt", delay=3,
                       timeout=10):
    """Fetch every URL and write the text of each ``<d>`` element to a file.

    Each URL is requested in order, with a progress bar. The response is
    parsed and the text of every ``<d>`` tag is written to ``output_path``,
    one entry per line. The output file is overwritten on every call.

    Args:
        url_list: Sequence of URLs to request, e.g. the list returned by
            ``get_bilibili_url``.
        output_path: Destination text file. Defaults to the file name the
            script originally hard-coded.
        delay: Seconds to pause between consecutive requests.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises, so a stalled connection cannot hang the crawl forever.

    Raises:
        requests.RequestException: If a request fails or times out.
        OSError: If the output file cannot be opened or written.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36",
        # Replace with your own cookie; copy it from the request Headers
        # shown in the browser's developer tools.
        "cookie": "你自己的",
    }

    last_index = len(url_list) - 1
    # Explicit UTF-8 keeps non-ASCII danmaku writable regardless of the
    # platform's default locale; ``with`` closes the file even on error.
    with open(output_path, 'w', encoding='utf-8') as file:
        for i in trange(len(url_list)):
            response = requests.get(url_list[i], headers=headers,
                                    timeout=timeout)
            response.encoding = 'utf-8'
            # Name the parser explicitly so results do not depend on which
            # optional parser libraries happen to be installed.
            soup = BeautifulSoup(response.text, "html.parser")
            file.writelines(f"{tag.text}\n" for tag in soup.find_all("d"))
            # Throttle between requests; no need to wait after the last one.
            if i != last_index:
                time.sleep(delay)
if __name__ == "__main__":
    # First and last day (both inclusive) of the danmaku crawl window.
    start, end = '9/24/2020', '9/26/2020'
    url_list = get_bilibili_url(start, end)
    get_bilibili_danmu(url_list)
    print("弹幕爬取完成")