import numpy as np
import pandas as pd
import numpy as np

from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller

import matplotlib.pyplot as plt
plt.style.use('ggplot') #グラフスタイル
plt.rcParams['figure.figsize'] = [10, 5] #グラフサイズ
plt.rcParams['font.size'] = 10 #フォントサイズ

import japanize_matplotlib #グラフ内で日本語利用

# 月次データの読み込み
df_m = pd.read_csv(
    'MMM_ts_manga5_monthly.csv', 
    index_col='month', 
    parse_dates=True
)
df_m.index = df_m.index.to_period('M')

print(df_m)

               sales  traditional   internet  promotional
month                                                    
2016-01   4030283000    426400000  666900000    650475000
2016-02   5334600000    465450000  657450000    434325000
2016-03   3356769000   1119000000          0    404475000
2016-04   5004399000    926500000  613725000    197400000
2016-05  11650794000   1523800000  699675000    157162500
...              ...          ...        ...          ...
2023-08   5942020000   1521500000  584475000    193537500
2023-09   3889616000    931600000  469875000    326812500
2023-10   3118571000            0  551700000    182812500
2023-11   3107643000            0  410325000    338625000
2023-12   3942322000    656100000  326250000    437775000

[96 rows x 4 columns]

# 年と月の変数の作成
df_m['Year'] = df_m.index.year
df_m['Month'] = df_m.index.month

# 年別月間売上の集計
df_yearly_sales = df_m.groupby('Year')['sales'].sum()

# 年別月間売上と年単位の売上の推移のグラフを上下に並べて表示
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 8))

# 年単位の売上の推移のグラフ
bar = axes[0].bar(
    df_yearly_sales.index, 
    df_yearly_sales.values, 
    color='skyblue'
)
for rect in bar:
    height = rect.get_height()
    axes[0].annotate(f'{height/1e8:.0f}億円',
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),  # 3 points vertical offset
                textcoords='offset points',
                ha='center', va='bottom')
axes[0].set_title('年単位の売上の推移')
axes[0].set_xlabel('年')
axes[0].set_ylabel('売上')
axes[0].set_ylim(0, 1e11)
axes[0].set_xticks(df_yearly_sales.index)
axes[0].tick_params(axis='x', rotation=45)

# 年別月間売上のグラフ
years = df_m['Year'].unique()
colors = plt.cm.viridis(np.linspace(0, 1, len(years)))

for i, year in enumerate(years):
    df_year = df_m[df_m['Year'] == year]
    axes[1].plot(
        df_year['Month'], 
        df_year['sales'], 
        label=str(year), 
        color=colors[i]
    )
axes[1].set_title('年別月間売上')
axes[1].set_xlabel('月')
axes[1].set_ylabel('売上')
axes[1].legend()
axes[1].set_xticks(range(1, 13))
axes[1].set_ylim(0, 1.6e10)
axes[1].set_xticklabels([
    '1月', '2月', '3月', '4月', '5月', '6月', 
    '7月', '8月', '9月', '10月', '11月', '12月'
])

plt.tight_layout()
plt.show()

# 2023年のデータをデータフレームから抽出
df_2023 = df_m.loc['2023']

# 4つのサブプロットを作成
total_rows = 4
fig, axes = plt.subplots(nrows=total_rows, ncols=1, figsize=(8, 5))

# 各列のプロット情報
graph_info = [
    ('2023年 売上（sales）', 'sales', '売上金額', 1.6e10),
    ('2023年 マスコミ四媒体広告費（ほぼテレビCM）', 'traditional', 'コスト', 1.6e9),
    ('2023年 インターネット広告費', 'internet', 'コスト', 1.6e9),
    ('2023年 プロモーションメディア広告費', 'promotional', 'コスト', 1.6e9)
]

# プロットをループで作成
for ax, (title, column, ylabel, ylim) in zip(axes, graph_info):
    ax.set_title(title)
    ax.plot(df_2023['Month'], df_2023[column])
    ax.set_ylabel(ylabel)
    ax.set_xticks(range(1, 13))
    ax.set_ylim(0, ylim)
    ax.set_xticklabels(['1月', '2月', '3月', '4月', '5月', '6月', '7月', '8月', '9月', '10月', '11月', '12月'])

# レイアウトを調整して表示
plt.tight_layout()
plt.show()

# 週次データの読み込み
df = pd.read_csv(
    'MMM_ts_manga5.csv', 
    index_col='week', 
    parse_dates=True
)

print(df)

                 sales  traditional   internet  promotional
week                                                       
2015-12-27   649315000    656100000          0            0
2016-01-03   958266000            0  178050000    188587500
2016-01-10   755973000            0  150450000    162037500
2016-01-17   551603000            0          0    126900000
2016-01-24   520183000            0  175500000            0
...                ...          ...        ...          ...
2023-11-26   839617000            0  154950000            0
2023-12-03   829314000            0          0    249187500
2023-12-10   852512000            0  148200000            0
2023-12-17   963276000    656100000          0            0
2023-12-24  1297220000            0  178050000    188587500

[418 rows x 4 columns]

# 直近5年間（52.25週×5年）のデータに絞り
# 目的変数yと説明変数Xに分解

# 対象データ期間の設定
data_term = int(52.25 * 5)

# 説明変数Xと目的変数yに分解
X = df.drop(columns=['sales'])[-data_term:]
y = df['sales'][-data_term:]

from statsmodels.tsa.seasonal import STL
import matplotlib.pyplot as plt

# STL分解の実行
# y: 元の時系列データ (売上データ)
# period: 頻度（ここでは52週=1年間を表す）
stl = STL(y, period=52)
result = stl.fit()

# 分解された成分の取得
# trend: トレンド成分 (全体的な長期的傾向)
# seasonal: 季節成分 (周期的な変動)
# resid: 残差成分 (トレンドと季節成分を除いた部分)
trend = result.trend
seasonal = result.seasonal
resid = result.resid

# 結果の可視化
plt.figure(figsize=(8, 5))

# 元の時系列データをプロット
plt.subplot(411)
plt.plot(y.index, y)
plt.title('元の時系列データ - 売上(sales)')
plt.ylim(bottom=0, top=5e9)

# トレンド成分をプロット
plt.subplot(412)
plt.plot(y.index, trend)
plt.title('トレンド成分')
plt.ylim(bottom=-2e9, top=3e9)

# 季節成分をプロット
plt.subplot(413)
plt.plot(y.index, seasonal)
plt.title('季節成分')
plt.ylim(bottom=-2e9, top=3e9)

# 残差成分をプロット
plt.subplot(414)
plt.plot(y.index, resid)
plt.title('残差成分')
plt.ylim(bottom=-2e9, top=3e9)

# レイアウト調整して表示
plt.tight_layout()
plt.show()

# STL分解後の残差(resid)に対して拡張ディッキー・フラー検定(Augmented Dickey-Fuller test)を適用
# ADF検定は単位根の存在を確認するためのもので、時系列データが定常かどうかを判定します
result = adfuller(resid)

# ADF検定のテスト統計量を出力
print('テスト統計量:', result[0])

# 検定の有意性を判断するためのp値を出力
print('p値:', result[1])

# 信頼水準（1%、5%、10%）ごとの臨界値を出力
print('臨界値:', result[4])

テスト統計量: -11.58222731294389
p値: 2.924170854469547e-21
臨界値: {'1%': -3.4557539868570775, '5%': -2.8727214497041422, '10%': -2.572728476331361}

# 定常性検定の実行
# 各説明変数について拡張ディッキー・フラー検定（ADFテスト）を実施し、結果を出力します

# 説明変数ごとに検定を実行
for column in X.columns:
    # 各列のデータに対してADF検定を実施
    result = adfuller(X[column])

    # 結果をコンソールに出力
    print(column)  # 列名（変数名）を出力
    print('- Test Statistic:', result[0])  # テスト統計量を出力
    print('- p-value:', result[1])  # p値を出力
    print('- Critical Values:', result[4])  # 信頼水準における臨界値を出力

traditional
- Test Statistic: -10.159352311769545
- p-value: 7.579888911588827e-18
- Critical Values: {'1%': -3.4558530692911504, '5%': -2.872764881778665, '10%': -2.572751643088207}
internet
- Test Statistic: -15.681906957312055
- p-value: 1.4691906894548407e-28
- Critical Values: {'1%': -3.4557539868570775, '5%': -2.8727214497041422, '10%': -2.572728476331361}
promotional
- Test Statistic: -5.29778728915373
- p-value: 5.533738398557567e-06
- Critical Values: {'1%': -3.457105309726321, '5%': -2.873313676101283, '10%': -2.5730443824681606}

# 航空機の乗客数（Passengers）データの読み込み
df = pd.read_csv(
    'AirPassengers.csv', 
    index_col='Month',
    parse_dates=True
)

# STL分解の実行
stl = STL(df['Passengers'], period=12)
result = stl.fit()

# 分解された成分の取得
trend = result.trend
seasonal = result.seasonal
resid = result.resid

# トレンド成分を除去したデータの作成
passengers_detrended = df['Passengers'] - trend

# トレンドおよび季節成分を除去したデータの作成
passengers_detrended_deseasonalized = df['Passengers'] - trend - seasonal

# 結果の可視化
plt.figure(figsize=(8, 5))

# 共通のy軸範囲を設定
y_min = min(
    df['Passengers'].min(), 
    passengers_detrended.min(), 
    passengers_detrended_deseasonalized.min()
)
y_max = max(
    df['Passengers'].max(),
    passengers_detrended.max(), 
    passengers_detrended_deseasonalized.max()
)

plt.subplot(3,1,1)
plt.plot(df.index, df['Passengers'])
plt.title('元の時系列データ - 乗客数（Passengers）')
plt.ylim(y_min, y_max)  # スケールを統一

plt.subplot(3,1,2)
plt.plot(df.index, passengers_detrended)
plt.title('トレンド成分を調整（除去）した乗客数（Passengers）')
plt.ylim(y_min, y_max)  # スケールを統一

plt.subplot(3,1,3)
plt.plot(df.index, passengers_detrended_deseasonalized)
plt.title('トレンドおよび季節成分を調整（除去）した乗客数（Passengers）')
plt.ylim(y_min, y_max)  # スケールを統一

plt.tight_layout()
plt.show()

必要なモジュールの読み込み¶

時系列データのグラフ化¶

時系列データの基礎分析¶

航空機の乗客数（Passengers）データの例¶