Pythonによるテキストファイル処理の実践例

Pythonを用いたテキストファイル(.txt)の操作に関する実用的なコード例を紹介します。以下のスクリプトは、日常業務や学習で頻出するテキスト処理タスクに対応しています。ファイルはすべて相対パスで開くため、入力ファイルをスクリプトと同じディレクトリに置き、そのディレクトリをカレントディレクトリとして実行することを前提とします。

1. テキスト抽出処理

1.1 キーワードを含む行の抽出

センサーデータのフィルタリング
# Copy the first four comma-separated fields of every sensor.txt record that
# has a field exactly equal to " earpa001" (leading space included) into
# earpa001.txt.
with open("sensor.txt", "r") as src:
    with open("earpa001.txt", "w") as dst:
        for record in src:
            fields = record.strip().split(",")
            if " earpa001" not in fields:
                continue
            selected = [fields[0], fields[1], fields[2], fields[3]]
            dst.write(",".join(selected) + "\n")
画像URLの抽出
# Write the value of the first src="..." attribute of every line of
# webpage.txt that mentions ".JPG" to images.txt, one URL per line.
with open('webpage.txt') as fi, open('images.txt', 'w') as fo:
    for line in fi:
        if '.JPG' not in line:
            continue
        pieces = line.split('src="')
        if len(pieces) < 2:
            # ".JPG" appears on the line but not inside a src="..." attribute;
            # skip it instead of failing with IndexError.
            continue
        url = pieces[1].split('"')[0]
        fo.write(url + '\n')
大学名の抽出
# Write the value of the first alt="..." attribute found on each line of
# data.txt to univ.txt, one name per line.
with open("data.txt") as fi, open("univ.txt", "w") as fo:
    for line in fi:
        # Split on the exact token that is indexed below. Testing only for
        # 'alt=' let lines such as alt='...' through and then raised IndexError.
        pieces = line.split('alt="')
        if len(pieces) < 2:
            continue
        name = pieces[1].split('"')[0]
        fo.write(name + '\n')
章タイトルの抽出
# Copy every line whose stripped text starts with "第" and contains "章"
# (a chapter heading) to the chapter-list file, keeping the line unchanged.
# The encoding is given explicitly so the Chinese text does not depend on the
# platform default; the per-chapter word-frequency snippet reads the same
# novel as UTF-8.
with open("八十天环游地球.txt", encoding="utf-8") as fi, \
        open("八十天环游地球-章节.txt", "w", encoding="utf-8") as fo:
    for line in fi:
        stripped = line.strip()
        if stripped.startswith("第") and "章" in stripped:
            fo.write(line)
最頻出語を含む文の抽出
from collections import Counter

import jieba

# Find the most frequent word of two or more characters in data3.txt and
# write every sentence containing it to out.txt, one sentence per line.
with open('data3.txt', 'r', encoding='utf-8') as fi, open('out.txt', 'w', encoding='utf-8') as fo:
    text = fi.read()
    freq = Counter(w for w in jieba.lcut(text) if len(w) >= 2)

    # An empty or very short input yields no candidate words; max() would
    # raise ValueError on the empty table, so nothing is written in that case.
    if freq:
        # On a tie, max() keeps the word that was seen first.
        top_word = max(freq, key=freq.get)

        # Commas are turned into '。' first so clauses count as sentences.
        # NOTE(review): only the ASCII comma is handled here; confirm whether
        # the full-width comma should be treated the same way.
        for sent in text.replace(',', '。').split('。'):
            if top_word in sent:
                fo.write(sent.strip() + '\n')

1.2 条件範囲内のデータ抽出

# Output a candidate only when every subject score is 60 or higher.
# The last ten whitespace-separated fields of a record are the scores and the
# first two fields are what gets written to candidate.txt.
with open("candidate0.txt") as fi, open("candidate.txt", "w") as fo:
    for line in fi:
        parts = line.split()
        if not parts:
            # A blank line has no scores; all() over nothing is True, which
            # used to emit an empty output line for it.
            continue
        scores = [int(s) for s in parts[-10:]]
        if all(score >= 60 for score in scores):
            fo.write(' '.join(parts[:2]) + '\n')

1.3 キーワード後の内容抽出

『論語』原文の抽出
# Copy the lines that follow a "【原文】" marker, up to the next "【注释】"
# marker, into the output file with leading whitespace removed.
# Lines containing either marker are never written themselves.
with open("论语.txt") as src, open("论语-原文.txt", "w") as dst:
    in_original = False
    for row in src:
        if "【注释】" in row:
            # Checked first, so it wins when both markers share a line.
            in_original = False
            continue
        if "【原文】" in row:
            in_original = True
            continue
        if in_original:
            dst.write(row.lstrip())

1.4 極値の抽出

学生の氏名と最終成績の抽出
# For each "name:info" record in data.txt, write "name:<last comma-separated
# field of info>" to studs.txt.
with open('data.txt') as src, open('studs.txt', 'w') as dst:
    for record in src:
        name, info = record.strip().split(':')
        last_field = info.rsplit(',', 1)[-1]
        print(f"{name}:{last_field}", file=dst)
最高得点者の特定
# Print "name:score" for the student with the highest final score, where the
# score is the last comma-separated field after the colon of each record.
students = []
with open('data.txt') as fi:
    for line in fi:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) carry no record and would
            # otherwise break the "name:info" unpacking below.
            continue
        name, info = line.split(':')
        score = info.split(',')[-1]
        students.append((name, int(score)))

# max() raises ValueError on an empty list, so report only when at least one
# record was read. On a tie the earliest record wins.
if students:
    top_student = max(students, key=lambda x: x[1])
    print(f"{top_student[0]}:{top_student[1]}")
クラスごとの平均点計算
from collections import defaultdict

# Group the scores in data.txt by class and print each class average with two
# decimals. Each record must look like "name:class,score".
class_scores = defaultdict(list)
with open('data.txt') as src:
    for record in src:
        _name, info = record.strip().split(':')
        cls, score = info.split(',')
        class_scores[cls].append(int(score))

for cls in class_scores:
    marks = class_scores[cls]
    print(f"{cls}:{sum(marks) / len(marks):.2f}")
個人成績の統計
# Read "subject score" pairs from standard input until an empty line, then
# write the best subject, the worst subject and the average to PY202.txt.
courses = {}
while True:
    data = input("科目名と得点を入力(空白で終了): ").strip()
    if not data:
        break
    # Entering the same subject again overwrites its earlier score.
    subj, score = data.split()
    courses[subj] = int(score)

# With no entries, max()/min() raise ValueError and the average would divide
# by zero, so the report is produced only when something was entered.
if courses:
    max_subj = max(courses, key=courses.get)
    min_subj = min(courses, key=courses.get)
    avg = sum(courses.values()) / len(courses)

    with open("PY202.txt", "w") as f:
        f.write(f"最高分課程是{max_subj} {courses[max_subj]}, 最低分課程是{min_subj} {courses[min_subj]}, 平均分は{avg:.2f}")

2. テキストのクリーニング

2.1 括弧付き注釈番号の削除

# Remove the parenthesised note numbers "(1)" through "(22)" from every line
# of the extracted text and write the result to the purified file.
note_marks = [f"({n})" for n in range(1, 23)]

with open("论语-原文.txt") as src, open("论语-提纯原文.txt", "w") as dst:
    for row in src:
        # Markers are removed in ascending numeric order, one after another.
        for mark in note_marks:
            row = row.replace(mark, "")
        dst.write(row)

2.2 中国語の句読点の除去

# Characters to delete from the text. The set contains "\n", so line breaks
# are removed along with the punctuation marks.
punctuation = ',。?、‘’"";:、)\n——(!'
strip_table = str.maketrans('', '', punctuation)

# Read data.txt in one go and write it back without the listed characters.
with open('data.txt', 'r', encoding='utf-8') as fi, open('clean.txt', 'w', encoding='utf-8') as fo:
    fo.write(fi.read().translate(strip_table))

3. 集計・統計処理

3.1 全体集計

画像URLの総数カウント
# Count the lines of webpage.txt that contain ".JPG" (each matching line
# counts once, however many times the text occurs on it).
with open('webpage.txt') as page:
    count = sum('.JPG' in row for row in page)
print(count)
ユニーク文字数のカウント
# Print how many distinct characters data1.txt contains. Every character in
# the file is counted, including the newline character.
with open('data1.txt') as f:
    unique_chars = set(f.read())
print(len(unique_chars))

3.2 グループ別集計

レコード数の集計(グループ化)
from collections import defaultdict

# Count the records of earpa001.txt per "<second-to-last>-<last>" field pair
# and write "key,count" lines to earpa001_count.txt, largest count first.
counter = defaultdict(int)
with open("earpa001.txt") as src:
    for record in src:
        fields = record.strip().split(",")
        counter["-".join((fields[-2], fields[-1]))] += 1

ranked = list(counter.items())
# The sort is stable, so groups with equal counts keep first-seen order.
ranked.sort(key=lambda pair: pair[1], reverse=True)

with open("earpa001_count.txt", "w") as dst:
    for group, total in ranked:
        print(group, total, sep=",", file=dst)
特定語を含む行のカウント
# List every name in univ.txt that contains "大学" (but not "大学生") or
# "学院", then print how many fall into each group. A name that qualifies as
# "大学" is counted there even if it also contains "学院".
college_list = []
univ_count = 0
college_only_count = 0

with open("univ.txt") as src:
    for raw in src:
        name = raw.strip()
        if "大学" in name and "大学生" not in name:
            univ_count += 1
        elif "学院" in name:
            college_only_count += 1
        else:
            continue
        college_list.append(name)

for item in college_list:
    print(item)
print(f"包含大学的名称数量是{univ_count}")
print(f"包含学院的名称数量是{college_only_count}")
上位10語の語彙頻度分析
import jieba

# Print the ten most frequent words of three or more characters in clean.txt
# as comma-separated "word:count" pairs.
with open('clean.txt', encoding='utf-8') as f:
    text = f.read()

freq = {}
for token in jieba.lcut(text):
    if len(token) < 3:
        continue
    freq[token] = freq.get(token, 0) + 1

# Stable sort: words with equal counts stay in first-seen order.
ranked_words = sorted(freq, key=freq.get, reverse=True)
print(','.join(f"{word}:{freq[word]}" for word in ranked_words[:10]))
章ごとの最頻出語抽出
import jieba

# For each chapter of the novel, print "<title> <most frequent word> <count>",
# considering only words of two or more characters.
with open("八十天环游地球.txt", encoding='utf-8') as f:
    lines = f.readlines()

# A chapter starts at any line that begins with "第" and contains "章".
chapter_starts = [
    pos for pos, row in enumerate(lines)
    if row.startswith("第") and "章" in row
]
# Each chapter runs to the next heading; the last one runs to the end of file.
chapter_ends = chapter_starts[1:] + [len(lines)]

for start, end in zip(chapter_starts, chapter_ends):
    # The title is the first whitespace-separated token of the heading line.
    title = lines[start].split()[0]

    freq = {}
    for token in jieba.lcut(''.join(lines[start:end])):
        if len(token) >= 2:
            freq[token] = freq.get(token, 0) + 1

    if not freq:
        continue
    top_word = max(freq, key=freq.get)
    print(f"{title} {top_word} {freq[top_word]}")
総合得点上位10名の抽出
# Rank the records of score.txt by total score (the sum of every field after
# the first two) and write the ten best records to candidate0.txt.
candidates = []
with open("score.txt") as f:
    for line in f:
        parts = line.split()
        if not parts:
            # Blank lines are not records; they used to be ranked with a
            # total of 0 and could end up in the output.
            continue
        total = sum(int(s) for s in parts[2:])
        # The last line of the file may lack a newline. Normalise it so that
        # records are not fused together once sorting reorders them.
        candidates.append((total, line.rstrip("\n") + "\n"))

# Highest total first; equal totals are ordered by the record text, descending.
candidates.sort(reverse=True)
with open("candidate0.txt", "w") as fo:
    fo.writelines(record for _, record in candidates[:10])
操作時間の集計と上位3件表示
# Sum the execution times listed in out.txt and report the three operations
# with the largest percentage. Each record is "operation,time,percent".
total_time = 0
op_times = {}

with open('out.txt') as log:
    for record in log:
        op, exec_time, percent = record.strip().split(',')
        total_time += float(exec_time)
        # A repeated operation name keeps only its latest percentage.
        op_times[op] = float(percent)

print(f'the total execute time is {total_time}')

slowest_ops = sorted(op_times, key=op_times.get, reverse=True)[:3]
for rank, op in enumerate(slowest_ops, start=1):
    print(f'the top {rank} percentage time is {op_times[op]}, spent in "{op}" operation')

タグ: Python テキスト処理 ファイル操作 jieba データ抽出

7月3日 20:11 投稿