PyTorch 入門：環境設定からテンソル演算まで

開発環境の構築

Anaconda を利用して仮想環境を構築するのが一般的です。以下のコマンドで環境を作成し、有効化します。

conda create -n dl_env python=3.9
conda activate dl_env

必要なパッケージ（PyTorch、pandas など）をインストールした後、PyCharm や VS Code などの IDE で Python インタープリタとしてこの仮想環境を選択すれば準備が整います。

テンソルのインデックス操作

配列やテンソルからのデータ抽出にはスライシングを用います。変数 data を例に挙げて説明します。

単一要素：data[0, 1]（0 行 1 列目）
行全体：data[0, :]（0 行目全体）
列全体：data[:, 0]（0 列目全体）
部分領域：data[0:2, 1:]（0 行目から 1 行目、1 列目以降）
ステップ指定：data[0:5:2, 0:4:2]（行・列ともにステップ 2、つまり 1 つおきに取得。行は 0, 2, 4、列は 0, 2）

PyTorch によるテンソル操作

基本的なテンソルの生成、形状変更、算術演算および結合操作の例を示します。

import torch

# Build a 1-D tensor holding 0..29 and inspect its metadata.
seq = torch.arange(0, 30)
print(seq)
print(seq.shape)  # size of each dimension
print(seq.numel())  # total number of elements

# Re-arrange the same 30 elements as 5 rows x 6 columns.
matrix = seq.reshape((5, 6))
print(matrix)

# Constant-filled 3 x 3 x 3 tensors: all zeros, then all ones.
for fill in (torch.zeros, torch.ones):
    print(fill((3, 3, 3)))

# Build a tensor directly from nested Python lists.
vals = torch.tensor(
    [
        [5, 2, 8],
        [1, 9, 3],
        [7, 4, 6],
    ]
)
print(vals)
print(vals.size())

# Element-wise arithmetic on two float vectors of equal length.
vec_a = torch.tensor([1.0, 2.0, 4.0, 8.0])
vec_b = torch.tensor([2.0] * 4)

# Each entry pairs the printed label with the element-wise operation.
elementwise_ops = (
    ('加算:', torch.add),
    ('減算:', torch.sub),
    ('乗算:', torch.mul),
    ('除算:', torch.div),
    ('冪乗:', torch.pow),
)
for label, op in elementwise_ops:
    print(label, op(vec_a, vec_b))

# Concatenation of two 3 x 4 float tensors.
base = torch.arange(12, dtype=torch.float32).reshape(3, 4)
offset = torch.tensor(
    [
        [2.0, 1, 4, 3],
        [1, 2, 3, 4],
        [4, 3, 2, 1],
    ]
)
pair = (base, offset)
# dim=0 stacks the rows (6 x 4); dim=1 places them side by side (3 x 8).
combined_rows = torch.cat(pair, dim=0)
combined_cols = torch.cat(pair, dim=1)

print('行結合結果')
print(combined_rows)
print('列結合結果')
print(combined_cols)

# Element-wise equality yields a boolean tensor of the same shape.
print(torch.eq(base, offset))

# Sum over every element, returned as a 0-dim tensor.
print(torch.sum(base))

# Broadcasting: combine a (3, 1) column with a (1, 2) row.
dim_a = torch.arange(3).unsqueeze(1)
dim_b = torch.arange(2).unsqueeze(0)
print('a:', dim_a)
print('b:', dim_b)
# Shapes are compatible when, per dimension, the sizes match or one of them
# is 1; the size-1 dimensions are expanded, so the sum here is (3, 2).
print(torch.add(dim_a, dim_b))

# Memory management
# Check whether assigning the result of an expression yields a new object.
original_id = id(vals)
vals = vals + vec_a[:3]  # slice vec_a to 3 elements so it lines up with the 3 columns of vals
# `vals` now names the freshly created sum, so this is expected to print False.
print(original_id == id(vals))

# In-place update
buffer = torch.zeros_like(vals)
print('buffer id:', id(buffer))
# Slice assignment writes into the existing `buffer` object; the name is never
# rebound, so both prints show the same id.
buffer[:] = vals + 1
print('buffer id:', id(buffer))

# Round trip: tensor -> NumPy array -> tensor.
numpy_arr = base.numpy()
back_to_tensor = torch.tensor(numpy_arr)  # torch.tensor copies the array data
# Show the Python type at each stage of the round trip.
for converted in (numpy_arr, back_to_tensor):
    print(type(converted))

データ前処理

Pandas を利用して CSV データの読み込み、欠損値処理、カテゴリカル変数のエンコーディングを行います。

import os
import pandas as pd

def prepare_dataset(base_dir='./'):
    """Write the small sample property CSV used by the preprocessing demo.

    The file is created at ``<base_dir>/dataset/property_data.csv``. The
    ``dataset`` directory is created when missing, and an existing file is
    overwritten.

    Args:
        base_dir: Directory under which ``dataset/`` is created. Defaults to
            the current working directory.

    Returns:
        The path of the CSV file that was written.
    """
    dataset_dir = os.path.join(base_dir, 'dataset')
    os.makedirs(dataset_dir, exist_ok=True)
    file_path = os.path.join(dataset_dir, 'property_data.csv')

    # Header row followed by four samples; 'NA' marks a missing value.
    rows = (
        'Rooms,Access,Value\n',
        'NA,Pave,127500\n',
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    )
    # An explicit encoding keeps the output independent of the platform locale.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(rows)
    return file_path

def load_and_clean(file_path='./dataset/property_data.csv'):
    """Load the property CSV, impute missing numbers and one-hot encode.

    The first two columns are treated as features and the third column as
    the target. Missing numeric feature values are replaced by the column
    mean; the remaining (categorical) features are one-hot encoded, with
    missing values getting an indicator column of their own. The raw frame
    and both intermediate feature frames are printed along the way.

    Args:
        file_path: Path of the CSV file to read. Defaults to the file
            written by ``prepare_dataset()``.

    Returns:
        A ``(features, target)`` tuple: the encoded feature ``DataFrame``
        (float columns) and the target ``Series``.
    """
    df = pd.read_csv(file_path)
    print(df)

    # Split features (columns 0-1) from the target (column 2).
    features, target = df.iloc[:, 0:2], df.iloc[:, 2]

    # Fill missing numeric values with the column mean; numeric_only skips
    # the string column, which has no mean.
    features = features.fillna(features.mean(numeric_only=True))
    print(features)

    # One-hot encode categorical columns; dummy_na=True treats NaN as a
    # category of its own.
    features = pd.get_dummies(features, dummy_na=True, dtype=float)
    print(features)

    # Return the results so callers can use them instead of only seeing prints.
    return features, target

# Create the sample CSV first, then load and preprocess it.
prepare_dataset()
load_and_clean()

線形代数演算

PyTorch を用いた転置、総和、内積、行列積、ノルム計算などの操作です。

import torch

# Build a 4 x 5 matrix and show its transpose.
mat_A = torch.arange(20).reshape((4, 5))
print(mat_A)
print('転置:', mat_A.t())
# A matrix is symmetric when it equals its own transpose, which requires a
# square matrix (this 4 x 5 example is not one).

# Higher-rank tensor: 2 blocks, each with 3 rows and 4 columns.
tensor_X = torch.arange(24).reshape((2, 3, 4))
print(tensor_X)

# clone() allocates new memory, so mat_B is an independent copy of mat_A.
mat_B = torch.clone(mat_A)
doubled = mat_A + mat_B
print(mat_A, '\n', doubled)

# Reductions along chosen axes of a 2 x 5 x 4 float tensor.
mat_C = torch.arange(40, dtype=torch.float32).reshape((2, 5, 4))
print('元の行列:', mat_C)

total_sum = torch.sum(mat_C)  # every element collapsed to a 0-dim tensor
sum_axis0 = torch.sum(mat_C, dim=0)  # drops axis 0 -> shape (5, 4)
sum_axis1 = torch.sum(mat_C, dim=1)  # drops axis 1 -> shape (2, 4)

print('全要素和:', total_sum)
print('軸 0 和（5*4 行列）:', sum_axis0)
print('軸 1 和（2*4 行列）:', sum_axis1)

# Reduce over two axes at once, leaving only the last axis (length 4).
sum_axis01 = torch.sum(mat_C, dim=(0, 1))
print(sum_axis01)

# Mean over all elements, then along axis 0 only.
print(torch.mean(mat_C))
print(torch.mean(mat_C, dim=0))

# Keep the reduced axis as size 1 so the result stays 3-D: (2, 1, 4).
sum_keep = torch.sum(mat_C, dim=1, keepdim=True)
print(sum_keep)

# Dot product computed two ways: the dedicated op, and multiply-then-sum.
vec_x = torch.arange(4, dtype=torch.float32)
vec_y = torch.ones_like(vec_x)
print(vec_x.dot(vec_y))
print((vec_x * vec_y).sum())

# Matrix product: (5 x 4) times (4 x 3) gives a (5 x 3) result.
mat_D = torch.arange(20, dtype=torch.float32).reshape((5, 4))
mat_E = torch.ones((4, 3))
print(mat_D.mm(mat_E))

# Vector norms. torch.norm is deprecated, so use torch.linalg.norm instead.
vec_u = torch.tensor([3.0, -4.0])
l2_norm = torch.linalg.norm(vec_u)  # L2 norm: sqrt(3^2 + (-4)^2) = 5
l1_norm = torch.linalg.norm(vec_u, ord=1)  # L1 norm: |3| + |-4| = 7
print(l2_norm)
print(l1_norm)

タグ: PyTorch Anaconda Pandas tensor-operations data-preprocessing

6月30日 17:19 投稿

異端開発室