Pythonで豆瓣の最新映画レビューを取得し、WordCloudで可視化する方法

概要

この記事では、Pythonを使用して豆瓣の最新映画レビューを取得し、データをクリーニングした後、WordCloudで可視化する方法を解説します。具体的には、「異形：夺命舰 Alien: Romulus」のレビューを例に取り上げます。

技術的な要点

1. WordCloudのインストール

pip install wordcloud

2. WordCloudの基本的な使用方法

class wordcloud.WordCloud(font_path=None, width=400, height=200, margin=2, ranks_only=None, prefer_horizontal=0.9, mask=None, scale=1, color_func=None, max_words=200, min_font_size=4, stopwords=None, random_state=None, background_color='black', max_font_size=None, font_step=1, mode='RGB', relative_scaling=0.5, regexp=None, collocations=True, colormap=None, normalize_plurals=True)

主なパラメータ

font_path: 使用するフォントのパス（例: 'simhei.ttf'）
width, height: 出力されるキャンバスの幅と高さ（デフォルトは400x200ピクセル）
prefer_horizontal: 単語が水平方向に配置される頻度（デフォルトは0.9）
mask: 画像マスクを使用して形状を指定
scale: キャンバスのサイズをスケーリング（例: 1.5倍）
min_font_size, max_font_size: 表示される最小・最大フォントサイズ
stopwords: 除外する単語の集合（set）。Noneの場合は組み込みのSTOPWORDSが使用されます
background_color: 背景色（デフォルトは黒）

WordCloudの主要なメソッド

fit_words(frequencies): 単語の頻度に基づいてWordCloudを生成
generate(text): テキストからWordCloudを生成
generate_from_frequencies(frequencies): 単語の頻度からWordCloudを生成
generate_from_text(text): テキストからWordCloudを生成
process_text(text): テキストを分割し、ストップワードを除去
recolor(): 現在の出力を再着色
to_array(): numpy配列に変換
to_file(filename): ファイルに保存

3. WordCloudの応用例

from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# Read the whole input text. The context manager closes the file handle
# promptly instead of leaving it open until garbage collection.
with open('test.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Load the mask image and convert it to a numpy array once; the same array
# is reused for the mask and for the color generator below.
bg_pic = Image.open('alice.png')
mask_array = np.array(bg_pic)
wc = WordCloud(
    background_color='white',
    mask=mask_array,
    font_path="simhei.ttf",
    max_words=2000,
    max_font_size=150,
    random_state=30,
    scale=1.5,
)
wc.generate_from_text(text)

# Color generator derived from the mask image. It is only constructed here;
# pass it to wc.recolor(color_func=image_colors) to actually apply the colors.
image_colors = ImageColorGenerator(mask_array)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
print('表示成功!')

# Save the rendered word cloud to an image file.
wc.to_file('test2.jpg')

4. ストップワードの設定

from os import path
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Read the input text; the context manager guarantees the file is closed.
with open('test.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Mask image as a numpy array (already an array, so no further conversion
# is needed when it is passed to WordCloud / ImageColorGenerator).
alice_coloring = np.array(Image.open('alice.png'))

# Start from the built-in stop words and add the extra words to exclude.
stopwords = set(STOPWORDS)
stopwords.update(("的", "了"))

wc = WordCloud(
    background_color='white',
    mask=alice_coloring,
    font_path="simhei.ttf",
    max_words=2000,
    stopwords=stopwords,
    max_font_size=40,
    random_state=42,
)
wc.generate(text)

# Color generator derived from the mask image. It is only constructed here;
# pass it to wc.recolor(color_func=image_colors) to actually apply the colors.
image_colors = ImageColorGenerator(alice_coloring)
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

# Save the rendered word cloud to an image file.
wc.to_file('test2.jpg')

5. 単語頻度の使用

import jieba.analyse
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Load the text to analyse.
with open('./test.txt', 'r', encoding='utf-8') as f:
    lyric = f.read()

# With withWeight=True, textrank yields (word, weight) pairs; keep the top 50
# and turn them into a word -> weight mapping.
result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
keywords = {word: weight for word, weight in result}
print(keywords)

プログラム設計の手順

1. ページからのデータ取得

ページ内にあるidが'nowplaying'のdivタグを検索し、その中に含まれるclassが'list-item'のliタグをすべて取得します。

2. データのクリーニング

正規表現を使用して句読点や記号（単語文字と空白以外の文字）を削除
ストップワードのフィルタリング
結巴分词を使用して中国語のテキストを分割

import re
import jieba.analyse
from wordcloud import WordCloud, STOPWORDS

# Remove every character that is neither a word character nor whitespace
# (punctuation, symbols, emoji). In Python 3 str patterns, \w also matches
# CJK characters, so the Chinese text itself is preserved.
# `comments` is the raw review text collected in the previous step.
pattern = re.compile(r'[^\w\s]')
cleaned_comments = pattern.sub('', comments)

# Extract the top 150 keywords; withWeight=True yields (word, weight) pairs.
# This must run before the stop-word filter, which operates on its output.
result = jieba.analyse.textrank(cleaned_comments, topK=150, withWeight=True)
keywords = {word: weight for word, weight in result}

# Merge the built-in stop words with the project's own list (one per line).
stopwords = set(STOPWORDS)
with open('./StopWords.txt', encoding="utf-8") as f:
    stopwords.update(word.strip() for word in f)

# Drop any extracted keyword that is a stop word.
keywords = {word: score for word, score in keywords.items() if word not in stopwords}

3. WordCloudでの表示

# Configure the cloud, then fill it from the keyword -> weight mapping.
# generate_from_frequencies returns the same WordCloud instance.
wordcloud = WordCloud(
    font_path="simhei.ttf",
    mask=np.array(bg_pic),
    background_color="white",
    max_font_size=80,
    stopwords=stopwords,
)
wordcloud = wordcloud.generate_from_frequencies(keywords)

# Render the image without axes.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

print('表示成功!')

実装コード

import warnings
import jieba
import jieba.analyse
import re
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup as bs
from wordcloud import WordCloud, STOPWORDS

# Suppress all warnings for the rest of the process.
warnings.filterwarnings("ignore")
# Default matplotlib figure size as (width, height).
plt.rcParams['figure.figsize'] = (10.0, 5.0)

def getNowPlayingMovieList(city='guangzhou', timeout=10):
    """Fetch the movies listed on Douban's "now playing" page.

    Args:
        city: City segment of the page URL. Defaults to 'guangzhou', the
            value that was previously hard-coded.
        timeout: Seconds to wait for the server before the request is
            aborted. Without a timeout a stalled connection blocks forever.

    Returns:
        A list of dicts with the keys 'id' (the item's ``data-subject``
        attribute) and 'name' (the ``alt`` text of the item's first image).
        An empty list is returned when the request fails or when the page
        has no ``div`` with id 'nowplaying'.
    """
    url = f'https://movie.douban.com/nowplaying/{city}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
    }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print(f"HTTPエラー: {errh}")
        return []
    except requests.exceptions.RequestException as err:
        print(f"リクエストエラー: {err}")
        return []

    soup = bs(resp.text, 'html.parser')
    nowplaying_movie = soup.find('div', id='nowplaying')
    if nowplaying_movie is None:
        return []

    nowplaying_list = []
    for item in nowplaying_movie.find_all('li', class_='list-item'):
        movie_id = item.get('data-subject')
        poster = item.find('img')
        # Skip malformed entries instead of raising KeyError/TypeError when
        # the attribute or the image tag is missing.
        if not movie_id or poster is None:
            continue
        nowplaying_list.append({'id': movie_id, 'name': poster.get('alt', '')})
    return nowplaying_list

def getCommentsById(movieId, pageNum, timeout=10):
    """Fetch one page of short comments for a Douban movie.

    Args:
        movieId: Douban subject id, inserted into the comments URL.
        pageNum: 1-based page number; each page covers 20 comments.
            Values <= 0 yield an empty list without any network access.
        timeout: Seconds to wait for the server before the request is
            aborted. Without a timeout a stalled connection blocks forever.

    Returns:
        A list with the stripped text of the first ``p`` element found in
        each ``div`` of class 'comment'. An empty list is returned when the
        page number is invalid or the request fails.
    """
    if pageNum <= 0:
        return []

    start = (pageNum - 1) * 20
    url = f'https://movie.douban.com/subject/{movieId}/comments?start={start}&limit=20'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'
    }
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print(f"HTTPエラー: {errh}")
        return []
    except requests.exceptions.RequestException as err:
        print(f"リクエストエラー: {err}")
        return []

    soup = bs(resp.text, 'html.parser')
    eachCommentList = []
    for item in soup.find_all('div', class_='comment'):
        # Look the paragraph up once and reuse it.
        paragraph = item.find('p')
        if paragraph is not None:
            eachCommentList.append(paragraph.text.strip())
    return eachCommentList

def main():
    """Scrape reviews of the first now-playing movie and show a word cloud.

    Steps: fetch the now-playing list, download the first 10 comment pages
    of the first movie, strip punctuation, extract weighted keywords with
    TextRank, drop stop words, and render the result with WordCloud.
    Prints a message and returns early when any stage produces no data.
    """
    NowPlayingMovie_list = getNowPlayingMovieList()
    if not NowPlayingMovie_list:
        print("映画リストが取得できませんでした")
        return

    movie_id = NowPlayingMovie_list[0]['id']
    commentList = []
    for page in range(1, 11):
        commentList.extend(getCommentsById(movie_id, page))

    if not commentList:
        print("レビューが取得できませんでした")
        return

    comments = " ".join(commentList)
    # Remove every character that is neither a word character nor
    # whitespace. In Python 3 str patterns, \w also matches CJK characters,
    # so the Chinese text is kept and only punctuation/symbols are dropped.
    pattern = re.compile(r'[^\w\s]')
    cleaned_comments = pattern.sub('', comments)

    # withWeight=True yields (word, weight) pairs.
    result = jieba.analyse.textrank(cleaned_comments, topK=150, withWeight=True)
    keywords = {word: weight for word, weight in result}

    stopwords = set(STOPWORDS)
    try:
        with open('./StopWords.txt', encoding="utf-8") as f:
            stopwords.update(word.strip() for word in f)
    except FileNotFoundError:
        # Fall back to the built-in stop words rather than aborting the run.
        print("StopWords.txtが見つからないため、標準のストップワードのみ使用します")

    keywords = {word: score for word, score in keywords.items() if word not in stopwords}
    if not keywords:
        # generate_from_frequencies raises on an empty mapping.
        print("キーワードを抽出できませんでした")
        return

    # NOTE: the stopwords argument has no effect with
    # generate_from_frequencies; the filtering above is what removes them.
    wordcloud = WordCloud(
        font_path="simhei.ttf",
        background_color="white",
        max_font_size=80,
        stopwords=stopwords,
    ).generate_from_frequencies(keywords)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    print('表示成功!')

# Run the scraping/visualisation pipeline only when executed as a script.
if __name__ == "__main__":
    main()

タグ: Python WordCloud 豆瓣网络爬虫数据清洗

6月14日 20:28 投稿

異端開発室

Pythonで豆瓣の最新映画レビューを取得し、WordCloudで可視化する方法

概要

技術的な要点

1. WordCloudのインストール

2. WordCloudの基本的な使用方法

主なパラメータ

WordCloudの主要なメソッド

3. WordCloudの応用例

4. ストップワードの設定

5. 単語頻度の使用

プログラム設計の手順

1. ページからのデータ取得

2. データのクリーニング

3. WordCloudでの表示

実装コード

ホットタグ