Python 爬取 嗶站視頻彈幕 并實現詞云圖可視化

嗨嘍,大家好呀~這里是愛看美女的茜茜吶

環境介紹:

  • python 3.8 解釋器

  • pycharm 編輯器

第三方模塊:

  • requests >>> pip install requests

  • protobuf >>> pip install protobuf

如何安裝python第三方模塊:

  1. win + R 輸入 cmd 點擊確定, 輸入安裝命令 pip install 模塊名 (pip install requests) 回車

  2. 在pycharm中點擊Terminal(終端) 輸入安裝命令

代碼展示

import requests
import dm_pb2
from google.protobuf import text_format
import re
from datetime import datetime
import csv


# Start a fresh output file that contains only the column-header row.
header_row = ["彈幕所在位置", "彈幕內(nèi)容", "彈幕發(fā)布時(shí)間"]
with open("danmu.csv", mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(header_row)
# HTTP headers sent with the danmaku request below.
# NOTE(review): the cookie value contains SESSDATA / bili_jct entries, which look
# like logged-in session credentials — consider loading the cookie from the
# environment instead of keeping it in source. TODO confirm and rotate.
headers = {
    'cookie': "buvid3=355AA300-6A61-04E5-A05C-E891D886F69632716infoc; b_nut=1675085932; i-wanna-go-back=-1; _uuid=387EA3810-FBF5-E92C-827E-2510B578C5B9A33232infoc; buvid4=15C69C98-F6A7-EC6A-872F-E69C1840DD6D33724-023013021-1pW1w45e5fZS9RtebDiGZw%3D%3D; nostalgia_conf=-1; rpdid=|(kmJY|k))lY0J'uY~l|)lmY|; b_ut=5; is-2022-channel=1; buvid_fp_plain=undefined; CURRENT_BLACKGAP=0; LIVE_BUVID=AUTO3216755179681630; header_theme_version=CLOSE; CURRENT_PID=17897430-d93d-11ed-a1f4-675e4c96ff79; FEED_LIVE_VERSION=V8; CURRENT_QUALITY=80; fingerprint=58d6d808ef27a6225c943be7ca980284; buvid_fp=58d6d808ef27a6225c943be7ca980284; enable_web_push=DISABLE; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MDIzODAyNjYsImlhdCI6MTcwMjEyMTAwNiwicGx0IjotMX0.hHZgEl37y35RHgNUEbXnT3y_rtg_w3d1O46vW5TreIQ; bili_ticket_expires=1702380206; SESSDATA=0f019744%2C1717673066%2Ca41c0%2Ac2CjArLmPZFHNFg3B5H60pjRwiqJSLXDG8l2Pb_74Q11o8NmBWyKegdnFb6ivxUL255pwSVjRoaXFXVmFoRlFXY3VCRTAybEpud2ltaXFkRzZXQ25uZ3h0VGxrdGg3bWcxQ2hJN3d4VEZQRjRRTnd5cUx2TmJfUUdlWVZocVRfb281QnJHSklrTkJ3IIEC; bili_jct=f2a37b8a7351e9987d90f80d72dab593; DedeUserID=422789639; DedeUserID__ckMd5=fc4901c78719b545; b_lsid=125EDCFE_18C4E7B181A; home_feed_column=5; browser_resolution=1920-963; sid=6qcgbo4l; PVID=2",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
# Danmaku segment endpoint; oid, pid, segment_index and the w_rid / wts pair are
# fixed in the query string.
# NOTE(review): w_rid / wts look like a time-stamped request signature, so this
# URL presumably stops working after a while — confirm.
url = 'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=323723441&pid=715024588&segment_index=1&pull_mode=1&ps=0&pe=120000&web_location=1315873&w_rid=8138667fe7c9a9d9aa23f488f69e5c2d&wts=1702124018'
# 1. Send the request.  The timeout keeps the script from hanging forever on a
#    stalled connection, and an HTTP error is raised here instead of surfacing
#    later as an opaque protobuf decoding failure.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()

# 2. Decode the binary protobuf payload into a segment message.
my_seg = dm_pb2.DmSegMobileReply()
data = response.content
my_seg.ParseFromString(data)

# 3. Append one CSV row per danmaku.  The file is opened once for the whole
#    batch instead of once per row.
with open("danmu.csv", mode='a', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    for i in my_seg.elems:
        # Read the fields directly from the message rather than regex-matching
        # its text dump: the dump wraps the content in quotes and escape
        # sequences, and a field missing from the dump used to raise
        # IndexError.  The attribute names are the field names that appear in
        # that text dump.
        content = i.content
        if not content:
            # Nothing to record for an element without text.
            continue
        # NOTE(review): an unset progress presumably reads as 0 (start of the
        # video) — confirm against the dm_pb2 schema.
        progress = i.progress
        # progress is in milliseconds; render it as mm:ss.
        minutes, seconds = divmod(int(progress) // 1000, 60)
        current_time = f'{minutes:02d}:{seconds:02d}'
        # ctime is a Unix timestamp in seconds; format it in local time.
        date_time = datetime.fromtimestamp(int(i.ctime)).strftime('%Y-%m-%d %H:%M:%S')
        print(current_time, content, date_time)
        csv_writer.writerow([current_time, content, date_time])
from datetime import datetime
import re
import requests
import dm_pb2
from google.protobuf import text_format
import csv


# Start a fresh output file that contains only the column-header row.
header_row = ['彈幕時(shí)間', '彈幕出現(xiàn)位置', '彈幕內(nèi)容']
with open('danmu.csv', mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(header_row)

def time_str_to_milliseconds(time_str):
    """Convert an ``hh:mm:ss`` time string to a number of milliseconds.

    The string must contain exactly three colon-separated integer parts;
    anything else raises ``ValueError``.
    """
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    total_seconds = hours * 3600 + minutes * 60 + seconds
    return total_seconds * 1000

# start_time = "00:23:58"
# end_time = "00:26:03"
# # Convert to milliseconds
# start_ms = time_str_to_milliseconds(start_time)
# end_ms = time_str_to_milliseconds(end_time)


def get_data(url, timeout=10):
    """Fetch one danmaku segment and return the HTTP response.

    Args:
        url: Full URL of the segment endpoint to request.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the script indefinitely.

    Returns:
        The ``requests`` response object; its ``content`` holds the raw body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # NOTE(review): the cookie value contains SESSDATA / bili_jct entries, which
    # look like logged-in session credentials — consider loading the cookie from
    # the environment instead of keeping it in source. TODO confirm and rotate.
    headers = {
        'Cookie': "buvid3=5CB78B54-F1B3-FCE6-F1AD-C0831287EFD881020infoc; b_nut=1699856581; i-wanna-go-back=-1; b_ut=7; _uuid=F625CC83-C9D9-101035-7C36-D3BDFD6BE10CF80953infoc; enable_web_push=DISABLE; home_feed_column=5; DedeUserID=422789639; DedeUserID__ckMd5=fc4901c78719b545; header_theme_version=CLOSE; CURRENT_FNVAL=4048; buvid4=A6C069B5-4DB6-437A-1160-A2D1E031AFF772289-023083014-j%2BEVJ7V9TtLMVIMXjUkPKw%3D%3D; fingerprint=b3a2765a971ea2692a81ff8b1844fae5; buvid_fp_plain=undefined; buvid_fp=b3a2765a971ea2692a81ff8b1844fae5; rpdid=|(kmJYmkk~k)0J'uYmm)lY~k~; PVID=1; SESSDATA=1a664f71%2C1717565740%2C48bce%2Ac1CjCHJjBfBSiCSW6Dfm5CAL39PzQZEKS9eUW3s5GUBHFuBSQ-KUhgo1bPfAdpSv22A1oSVnhWOUkwbnprSnY4MEVnd1dkNXBFYTVQWk1fYkJkeUZjZmFsRjJSSDB0MndxRmFZRUJTQjRjd0xwMkY2ZWtZal9sTWV6azZZclRTQ0dVNmFzZW14N1FnIIEC; bili_jct=365ff75a8dd1510cb2cdd93895923f7e; sid=4ggq2j9r; bp_video_offset_422789639=872607904249675833; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MDIyNzI5NzcsImlhdCI6MTcwMjAxMzcxNywicGx0IjotMX0.Mn0QVb_HBWG4wdx-IaVgx9UB4CkJW8P5QVS4LDqQGvA; bili_ticket_expires=1702272917; browser_resolution=1562-1010; innersign=0; b_lsid=A5D8EDDF_18C4D46CC84",
        'Referer': "https://www.bilibili.com/bangumi/play/ep327584",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error rather than handing a non-protobuf error
    # body to the parser.
    resp.raise_for_status()
    return resp

def _format_video_position(progress_ms):
    """Format a playback offset given in milliseconds as ``hh:mm:ss``."""
    minutes, seconds = divmod(progress_ms // 1000, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def parse_data(resp):
    """Decode one danmaku segment response, print it and append it to danmu.csv.

    Each element's fields are read directly from the decoded message.  The
    previous implementation regex-matched the message's text dump and only
    accepted an element when every expected field was present in one fixed
    order and the content contained no quote character; anything else was
    silently dropped.  The attribute names used here are the field names that
    appeared in that text dump.

    Args:
        resp: HTTP response whose ``content`` is the binary protobuf payload
            of a ``DmSegMobileReply`` message.

    Returns:
        None.  One row ``[date_time, video_position, content]`` is appended to
        ``danmu.csv`` for every element that has text.
    """
    my_seg = dm_pb2.DmSegMobileReply()
    my_seg.ParseFromString(resp.content)

    # Open the output once for the whole segment instead of once per row.
    with open('danmu.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for elem in my_seg.elems:
            content = elem.content
            if not content:
                # As before, elements without any text are not recorded.
                continue

            # progress is in milliseconds.  The hour part is computed rather
            # than hard-coded to "00:", so positions beyond 59:59 stay correct;
            # for shorter offsets the output is unchanged.
            video_position = _format_video_position(int(elem.progress))

            # ctime is a Unix timestamp in seconds; format it in local time.
            date_time = datetime.fromtimestamp(int(elem.ctime)).strftime('%Y-%m-%d %H:%M:%S')

            print("彈幕出現(xiàn)位置:", video_position)
            print("彈幕時(shí)間:", date_time)
            print("彈幕內(nèi)容:", content)
            print("----------")
            csv_writer.writerow([date_time, video_position, content])


# Segment URLs to download, in playback order.
url_list = [
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=1&pull_mode=1&ps=0&pe=120000&web_location=1315873&w_rid=3078e56400ad93df33859b09b8464f6b&wts=1702103538',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=1&pull_mode=1&ps=120000&pe=360000&web_location=1315873&w_rid=db9e8a1b66eacfb77d7e92762ac3fc4b&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=2&web_location=1315873&w_rid=9fe6b7defe3bcd611f6ec7bbd8a57553&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=3&web_location=1315873&w_rid=59a05c03d41c295ad57e0cd23db695eb&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=4&web_location=1315873&w_rid=48a794c85798922aac2ce4a5ad779544&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=5&web_location=1315873&w_rid=62fa8d41489f2b58f2a8577e3e654ef0&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=6&web_location=1315873&w_rid=0d9313ee507d135bce658616e694fb39&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=7&web_location=1315873&w_rid=151cf518a34b72ceeb35fec82b30cd43&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=8&web_location=1315873&w_rid=394bda938a8a775152f1ee7641d0d4bb&wts=1702103541'
]
# Download each segment and hand the response straight to the parser.
for segment_url in url_list:
    parse_data(get_data(segment_url))

詞云圖

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the scraped danmaku.  Every column is read as plain text and pandas'
# default NA detection is switched off, so comments such as "NA" or "null" are
# not converted to NaN (a float), which would make str.join raise TypeError.
df = pd.read_csv('danmu.csv', dtype=str, keep_default_na=False)
text = " ".join(df['彈幕內(nèi)容'])

# Build the word cloud from the combined text.
# NOTE(review): font_path is a Windows-specific location — adjust it when
# running on another platform.
wordcloud = WordCloud(width=800, height=800,
                      font_path=r'C:/Windows/Fonts/simhei.ttf',
                      background_color='white',
                      min_font_size=10).generate(text)

# Show the word cloud without axes or padding.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

尾語

感謝你觀看我的文章吶~本次航班到這里就結束啦

希望本篇文章有對你帶來幫助,有學習到一點知識~

躲起來的星星也在努力發光,你也要努力加油(讓我們一起努力叭)。

©著作權歸作者所有,轉載或內容合作請聯系作者
  • 序言:七十年代末枢里,一起剝皮案震驚了整個(gè)濱河市孽鸡,隨后出現(xiàn)的幾起案子,更是在濱河造成了極大的恐慌栏豺,老刑警劉巖彬碱,帶你破解...
    沈念sama閱讀 206,013評(píng)論 6 481
  • 序言:濱河連續(xù)發(fā)生了三起死亡事件,死亡現(xiàn)場(chǎng)離奇詭異奥洼,居然都是意外死亡巷疼,警方通過(guò)查閱死者的電腦和手機(jī),發(fā)現(xiàn)死者居然都...
    沈念sama閱讀 88,205評(píng)論 2 382
  • 文/潘曉璐 我一進(jìn)店門(mén)灵奖,熙熙樓的掌柜王于貴愁眉苦臉地迎上來(lái)嚼沿,“玉大人估盘,你說(shuō)我怎么就攤上這事÷饩。” “怎么了遣妥?”我有些...
    開(kāi)封第一講書(shū)人閱讀 152,370評(píng)論 0 342
  • 文/不壞的土叔 我叫張陵,是天一觀的道長(zhǎng)攀细。 經(jīng)常有香客問(wèn)我箫踩,道長(zhǎng),這世上最難降的妖魔是什么谭贪? 我笑而不...
    開(kāi)封第一講書(shū)人閱讀 55,168評(píng)論 1 278
  • 正文 為了忘掉前任班套,我火速辦了婚禮,結(jié)果婚禮上故河,老公的妹妹穿的比我還像新娘吱韭。我一直安慰自己,他們只是感情好鱼的,可當(dāng)我...
    茶點(diǎn)故事閱讀 64,153評(píng)論 5 371
  • 文/花漫 我一把揭開(kāi)白布理盆。 她就那樣靜靜地躺著,像睡著了一般凑阶。 火紅的嫁衣襯著肌膚如雪猿规。 梳的紋絲不亂的頭發(fā)上,一...
    開(kāi)封第一講書(shū)人閱讀 48,954評(píng)論 1 283
  • 那天宙橱,我揣著相機(jī)與錄音姨俩,去河邊找鬼。 笑死师郑,一個(gè)胖子當(dāng)著我的面吹牛环葵,可吹牛的內(nèi)容都是我干的。 我是一名探鬼主播宝冕,決...
    沈念sama閱讀 38,271評(píng)論 3 399
  • 文/蒼蘭香墨 我猛地睜開(kāi)眼张遭,長(zhǎng)吁一口氣:“原來(lái)是場(chǎng)噩夢(mèng)啊……” “哼!你這毒婦竟也來(lái)了地梨?” 一聲冷哼從身側(cè)響起菊卷,我...
    開(kāi)封第一講書(shū)人閱讀 36,916評(píng)論 0 259
  • 序言:老撾萬(wàn)榮一對(duì)情侶失蹤,失蹤者是張志新(化名)和其女友劉穎宝剖,沒(méi)想到半個(gè)月后洁闰,有當(dāng)?shù)厝嗽跇?shù)林里發(fā)現(xiàn)了一具尸體,經(jīng)...
    沈念sama閱讀 43,382評(píng)論 1 300
  • 正文 獨(dú)居荒郊野嶺守林人離奇死亡万细,尸身上長(zhǎng)有42處帶血的膿包…… 初始之章·張勛 以下內(nèi)容為張勛視角 年9月15日...
    茶點(diǎn)故事閱讀 35,877評(píng)論 2 323
  • 正文 我和宋清朗相戀三年扑眉,在試婚紗的時(shí)候發(fā)現(xiàn)自己被綠了。 大學(xué)時(shí)的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片。...
    茶點(diǎn)故事閱讀 37,989評(píng)論 1 333
  • 序言:一個(gè)原本活蹦亂跳的男人離奇死亡襟雷,死狀恐怖刃滓,靈堂內(nèi)的尸體忽然破棺而出,到底是詐尸還是另有隱情耸弄,我是刑警寧澤咧虎,帶...
    沈念sama閱讀 33,624評(píng)論 4 322
  • 正文 年R本政府宣布,位于F島的核電站计呈,受9級(jí)特大地震影響砰诵,放射性物質(zhì)發(fā)生泄漏。R本人自食惡果不足惜捌显,卻給世界環(huán)境...
    茶點(diǎn)故事閱讀 39,209評(píng)論 3 307
  • 文/蒙蒙 一茁彭、第九天 我趴在偏房一處隱蔽的房頂上張望。 院中可真熱鬧扶歪,春花似錦理肺、人聲如沸。這莊子的主人今日做“春日...
    開(kāi)封第一講書(shū)人閱讀 30,199評(píng)論 0 19
  • 文/蒼蘭香墨 我抬頭看了看天上的太陽(yáng)。三九已至炫欺,卻和暖如春乎完,著一層夾襖步出監(jiān)牢的瞬間,已是汗流浹背品洛。 一陣腳步聲響...
    開(kāi)封第一講書(shū)人閱讀 31,418評(píng)論 1 260
  • 我被黑心中介騙來(lái)泰國(guó)打工树姨, 沒(méi)想到剛下飛機(jī)就差點(diǎn)兒被人妖公主榨干…… 1. 我叫王不留,地道東北人桥状。 一個(gè)月前我還...
    沈念sama閱讀 45,401評(píng)論 2 352
  • 正文 我出身青樓帽揪,卻偏偏與公主長(zhǎng)得像,于是被迫代替她去往敵國(guó)和親岛宦。 傳聞我的和親對(duì)象是個(gè)殘疾皇子台丛,可洞房花燭夜當(dāng)晚...
    茶點(diǎn)故事閱讀 42,700評(píng)論 2 345

推薦閱讀更多精彩內(nèi)容