統計データ解析

第5講サンプルコード

Published

November 1, 2024

準備

以下で利用する共通パッケージを読み込む．

library(conflicted) # 関数名の衝突を警告
conflicts_prefer(   # 優先的に使う関数を指定
  dplyr::filter(),
  dplyr::select(),
  dplyr::lag(),
  )
library(tidyverse)  
library(broom)      # 解析結果を tibble 形式に集約
library(ggfortify)  # 診断プロットの描画
library(gt)         # 表の作成
library(gtsummary)  # 分析結果の表の作成

回帰式を用いた予測

問題

東京の気候データを用いて以下の実験を試みなさい．

8月のデータで回帰式を推定する
上記のモデルで9月のデータを予測する

特定の月のデータを取り出すには，例えば以下のようにすればよい

tw_data <- read_csv("data/tokyo_weather.csv")
tw_train <- tw_data |> filter(month == 8) # 単一の数字と比較
tw_test  <- tw_data |> filter(month %in% c(9,10)) # 集合と比

解答例

東京の気候データによる分析において，信頼区間と予測区間の計算を行う．

推定用データと予測用データを準備し，推定用データを用いて回帰式の推定を行う．

tw_data <- read_csv("data/tokyo_weather.csv")
tw_train <- tw_data |> filter(month %in% 8) # 推定用データ
tw_test  <- tw_data |> filter(month %in% 9) # 予測用データ
tw_model <- temp ~ solar + press + cloud    # モデルの定義 
tw_lm <- lm(tw_model, data = tw_train)      # モデルの推定

信頼区間の計算を行う．

tw_train_conf <-
    augment(tw_lm, newdata = tw_train, # 推定用データに
            interval = "confidence")   # あてはめ値と信頼区間を付加
tw_test_conf <- 
    augment(tw_lm, newdata = tw_test,  # 予測用データ(新規データ)に
            interval = "confidence")   # あてはめ値と信頼区間を付加

予測区間の計算を行う．

tw_train_pred <- 
    augment(tw_lm, newdata = tw_train, # 推定用データに
            interval = "prediction")   # あてはめ値と予測区間を付加
tw_test_pred <- 
    augment(tw_lm, newdata = tw_test,  # 予測用データ(新規データ)に
            interval = "prediction")   # あてはめ値と予測区間を付加

8月のデータで推定したモデルで8月をあてはめた信頼区間を描画する．

tw_train_conf |>
    ggplot(aes(x = day, y = temp)) +
    geom_point(colour = "red", shape = 16) +
    geom_point(aes(y = .fitted), colour = "blue") +
    geom_errorbar(aes(ymin = .lower, ymax = .upper), colour = "royalblue") +
    ylim(c(20,34)) + # 以降比較のため4つのグラフの値域を揃える
    labs(x = "August", y = "Temperature", title = "Confidence Interval")

8月のデータで推定したモデルで8月をあてはめた予測区間を描画する．

tw_train_pred |>
    ggplot(aes(x = day, y = temp)) +
    geom_point(colour = "red", shape = 16) +
    geom_point(aes(y = .fitted), colour = "blue") +
    geom_errorbar(aes(ymin = .lower, ymax = .upper), colour = "steelblue") +
    ylim(c(20,34)) + 
    labs(x = "August", y = "Temperature", title = "Prediction Interval")

8月のモデルで9月をあてはめた信頼区間を描画する．

tw_test_conf |>
    ggplot(aes(x = day, y = temp)) +
    geom_point(colour = "red", shape = 16) +
    geom_point(aes(y = .fitted), colour = "blue") +
    geom_errorbar(aes(ymin = .lower, ymax = .upper), colour = "royalblue") +
    ylim(c(20,34)) + 
    labs(x = "September", y = "Temperature", title = "Confidence Interval")

8月のモデルで9月をあてはめた予測区間を描画する．

tw_test_pred |>
    ggplot(aes(x = day, y = temp)) +
    geom_point(colour = "red", shape = 16) +
    geom_point(aes(y = .fitted), colour = "blue") +
    geom_errorbar(aes(ymin = .lower, ymax = .upper), colour = "steelblue") +
    ylim(c(20,34)) + 
    labs(x = "September", y = "Temperature", title = "Prediction Interval")

人工データによる例

以下はあくまで例なので自由に数値実験を設計して下さい．

人工データを生成する．

#' モデル: y = -1 + 2 * x1
set.seed(1515) # 乱数のシード値(適宜変更せよ)
n <- 50 # データ数の設定
x_obs <- tibble(x0 = 1,
                x1 = runif(n), # 説明変数1
                x2 = runif(n)) # 説明変数2
beta <- c(-1, 2, 0) # (切片, x1の係数, x2の係数) 実質x2は使われていない
sigma <-  1/2 # 誤差の標準偏差
epsilon <- rnorm(nrow(x_obs), sd = sigma) # 誤差項
toy_data <- x_obs |> # 目的変数の観測値を追加
    mutate(y = as.vector(as.matrix(x_obs) %*% beta) + epsilon)

モデルの推定と評価を行う．

toy_lm1 <- lm(y ~ x1, data = toy_data)      # x1のみの正しいモデル
toy_lm2 <- lm(y ~ x1 + x2, data = toy_data) # x1とx2の冗長なモデル
toy_lm3 <- lm(y ~ x2, data = toy_data)      # x2のみの誤ったモデル
summary(toy_lm1)


Call:
lm(formula = y ~ x1, data = toy_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.81627 -0.33791 -0.05144  0.29177  1.07659 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.9059     0.1171  -7.736 5.54e-10 ***
x1            1.9957     0.2155   9.259 2.96e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.466 on 48 degrees of freedom
Multiple R-squared:  0.6411,    Adjusted R-squared:  0.6336 
F-statistic: 85.74 on 1 and 48 DF,  p-value: 2.956e-12

summary(toy_lm2)


Call:
lm(formula = y ~ x1 + x2, data = toy_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.86076 -0.34121 -0.05143  0.29323  1.05813 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -0.9463     0.1473  -6.422 6.16e-08 ***
x1            1.9764     0.2213   8.929 1.09e-11 ***
x2            0.1043     0.2273   0.459    0.649    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4699 on 47 degrees of freedom
Multiple R-squared:  0.6427,    Adjusted R-squared:  0.6275 
F-statistic: 42.27 on 2 and 47 DF,  p-value: 3.139e-11

summary(toy_lm3)


Call:
lm(formula = y ~ x2, data = toy_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.77397 -0.47455  0.02159  0.51110  1.75063 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)  -0.2398     0.2020  -1.187    0.241
x2            0.4892     0.3627   1.349    0.184

Residual standard error: 0.7635 on 48 degrees of freedom
Multiple R-squared:  0.03653,   Adjusted R-squared:  0.01645 
F-statistic:  1.82 on 1 and 48 DF,  p-value: 0.1837

表としてまとめるには，例えば以下のようにすればよい．

my_gts <- function(x){
    tbl_regression(x, intercept = TRUE) |>
        add_significance_stars(
            pattern = "{estimate} ({conf.low}, {conf.high}){stars}",
            hide_ci = TRUE, hide_se = TRUE
        ) |>
        modify_header(estimate = "**Beta (95% CI)**") |>
        modify_footnote(estimate = "CI = Confidence Interval",
                        abbreviation = TRUE) |>
        add_glance_table(include = c(r.squared,
                                     adj.r.squared,
                                     statistic,
                                     p.value)) }
tbl_merge(
    tbls = list(
        my_gts(toy_lm1),
        my_gts(toy_lm2),
        my_gts(toy_lm3)),
    tab_spanner = c("モデル1","モデル2","モデル3")) |>
    modify_table_body( ~ .x |>
                           arrange(factor(variable,
                                          levels = c("(Intercept)","x1","x2"))))

Characteristic	モデル1	モデル2	モデル3
Characteristic	Beta (95% CI) ^1,2	Beta (95% CI) ^1,2	Beta (95% CI) ^1,2
(Intercept)	-0.91 (-1.1, -0.67)***	-0.95 (-1.2, -0.65)***	-0.24 (-0.65, 0.17)
x1	2.0 (1.6, 2.4)***	2.0 (1.5, 2.4)***
x2		0.10 (-0.35, 0.56)	0.49 (-0.24, 1.2)
R²	0.641	0.643	0.037
Adjusted R²	0.634	0.627	0.016
Statistic	85.7	42.3	1.82
p-value	<0.001	<0.001	0.2
¹ p<0.05; p<0.01; p<0.001
² CI = Confidence Interval

新規データに対する予測の仕組みを確認する．

x_new <- tibble(x0 = 1,
                x1 = runif(n),
                x2 = runif(n,-10,10)) 
#' 新規データに対する目的変数の真値 (誤差なし)
y_tilde <- as.vector(as.matrix(x_new) %*% beta)
#' 各モデルでの予測
y_hat1 <- predict(toy_lm1, newdata = x_new) # x_lm1による予測値
y_hat2 <- predict(toy_lm2, newdata = x_new) # x_lm2による予測値
y_hat3 <- predict(toy_lm3, newdata = x_new) # x_lm3による予測値

散布図を用いて，それぞれのモデルによる予測値と真値の関係を可視化する．

tibble(obs = y_tilde,
       model1 = y_hat1, model2 = y_hat2, model3 = y_hat3) |>
    pivot_longer(!obs) |>
    ggplot(aes(x = value, y = obs)) +
    geom_point(aes(colour = name)) +
    geom_abline(slope = 1, intercept = 0, colour = "gray") +
    labs(x = "fitted value", y = "observed value") +
    theme(legend.title = element_blank())

正しいモデルでは正確に予測が行われているが，不要な項が含まれると予測誤差が大きくなることがわかる．

雑音のないデータ \tilde{y} と予測値の相関係数(2乗すると決定係数に相当)を用いて数値的な評価をする．

cor(y_tilde, y_hat1)^2

[1] 1

cor(y_tilde, y_hat2)^2

[1] 0.5402325

cor(y_tilde, y_hat3)^2

[1] 0.0001686474

非線形性を含むモデルとカテゴリカル変数の扱い

問題

東京の気候データ(9-11月)を用いて，気温を回帰する以下のモデルを検討しなさい．

日射量，気圧，湿度の線形回帰モデル
湿度の対数を考えた線形回帰モデル
最初のモデルにそれぞれの交互作用を加えたモデル

東京の気候データ(1年分)を用いて，気温を回帰する以下のモデルを検討しなさい．

降水の有無を表すカテゴリカル変数を用いたモデル (雨が降ると気温が変化することを検証する)
上記に月をカテゴリカル変数として加えたモデル (月毎の気温の差を考慮する)

解答例

9月から11月のデータを用いた分析を行うために，データを整理する．

tw_subset <- tw_data |> filter(month %in% 9:11)

日射量，気圧，湿度を用いた回帰モデルの検討を行う．まず，解析の対象となる変数の散布図を描いておく．

tw_subset |>
    select(temp, solar, press, humid) |>
    GGally::ggpairs(columnLabels = c("気温","日射量","気圧","湿度"))

モデルを比較する．

#' 基本となる線形回帰モデル
summary(lm(temp ~ solar + press + humid, data = tw_subset))


Call:
lm(formula = temp ~ solar + press + humid, data = tw_subset)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.7787 -2.4153  0.0586  2.5814  7.1178 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 177.64597   74.23672   2.393   0.0189 *  
solar         0.78130    0.08492   9.200 1.75e-14 ***
press        -0.18425    0.07274  -2.533   0.0131 *  
humid         0.26864    0.03205   8.383 8.23e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.761 on 87 degrees of freedom
Multiple R-squared:  0.5976,    Adjusted R-squared:  0.5838 
F-statistic: 43.07 on 3 and 87 DF,  p-value: < 2.2e-16

#' 湿度の対数を考えた線形回帰モデル
summary(lm(temp ~ solar + press + log(humid), data = tw_subset))


Call:
lm(formula = temp ~ solar + press + log(humid), data = tw_subset)

Residuals:
   Min     1Q Median     3Q    Max 
-9.868 -2.478 -0.062  2.814  6.635 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 152.12678   76.72547   1.983  0.05055 .  
solar         0.73316    0.08461   8.665 2.18e-13 ***
press        -0.21081    0.07366  -2.862  0.00527 ** 
log(humid)   16.98674    2.12197   8.005 4.83e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.837 on 87 degrees of freedom
Multiple R-squared:  0.5812,    Adjusted R-squared:  0.5667 
F-statistic: 40.24 on 3 and 87 DF,  p-value: < 2.2e-16

#' 最初のモデルにそれぞれの交互作用を加えたモデル (書き方はいろいろある)
summary(lm(temp ~ (solar + press + humid)^2, data = tw_subset))


Call:
lm(formula = temp ~ (solar + press + humid)^2, data = tw_subset)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.8008 -2.5456 -0.0834  2.5669  6.9689 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.052e+02  5.209e+02  -0.394    0.695
solar        1.333e+01  1.841e+01   0.724    0.471
press        1.979e-01  5.142e-01   0.385    0.701
humid        3.847e+00  5.796e+00   0.664    0.509
solar:press -1.271e-02  1.807e-02  -0.704    0.484
solar:humid  4.133e-03  5.894e-03   0.701    0.485
press:humid -3.586e-03  5.732e-03  -0.626    0.533

Residual standard error: 3.793 on 84 degrees of freedom
Multiple R-squared:  0.6047,    Adjusted R-squared:  0.5765 
F-statistic: 21.42 on 6 and 84 DF,  p-value: 4.2e-15

#' 更に3つの変数の積を加えたモデル
summary(lm(temp ~ solar * press * humid, data = tw_subset))


Call:
lm(formula = temp ~ solar * press * humid, data = tw_subset)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.7795 -2.6021 -0.0871  2.5201  6.9460 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)
(Intercept)       -4.786e+01  9.223e+02  -0.052    0.959
solar             -2.966e+00  8.078e+01  -0.037    0.971
press              4.228e-02  9.118e-01   0.046    0.963
humid              1.733e+00  1.175e+01   0.148    0.883
solar:press        3.413e-03  7.989e-02   0.043    0.966
solar:humid        2.331e-01  1.105e+00   0.211    0.833
press:humid       -1.494e-03  1.162e-02  -0.129    0.898
solar:press:humid -2.266e-04  1.094e-03  -0.207    0.836

Residual standard error: 3.815 on 83 degrees of freedom
Multiple R-squared:  0.6049,    Adjusted R-squared:  0.5716 
F-statistic: 18.15 on 7 and 83 DF,  p-value: 2.007e-14

基本的なモデルと最後のモデルの視覚的な評価 (診断プロット) を行う．

autoplot(lm(temp ~ solar + press + humid, data = tw_subset))

autoplot(lm(temp ~ solar * press * humid, data = tw_subset))

雨の有無(カテゴリカル変数)と気温の関係を分析する．雨の有無および月(整数値)をダミー化(因子化)¹する．

¹ 論理値は因子として扱われるが，明示的に因子としたい場合は factor(rain > 0) としても良い．因子の属性を与える関数としては base::as.factor() と forcats::as_factor() があるが，前者は文字列を辞書順に並べ変えてから，後者はデータフレームでの出現順に因子化する．回帰の計算においては特に問題ないが，因子化の順序はグラフなど凡例の順番に影響する．

tw_data_fact <- tw_data |>
    mutate(rain = rain > 0,            # 降雨の有無
           month = as_factor(month)) # 月

通年でのモデルを検討する．

summary(lm(temp ~ rain, data = tw_data_fact))


Call:
lm(formula = temp ~ rain, data = tw_data_fact)

Residuals:
    Min      1Q  Median      3Q     Max 
-17.593  -6.793  -0.060   7.340  14.507 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  17.2932     0.4995  34.620   <2e-16 ***
rainTRUE      1.4668     0.9543   1.537    0.125    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.131 on 363 degrees of freedom
Multiple R-squared:  0.006466,  Adjusted R-squared:  0.003729 
F-statistic: 2.362 on 1 and 363 DF,  p-value: 0.1252

通年では雨と気温の関係は積極的に支持されない．

月を説明変数として加えると月毎の気温の差を考慮した回帰式が推定される．

summary(lm(temp ~ rain + month, data = tw_data_fact))


Call:
lm(formula = temp ~ rain + month, data = tw_data_fact)

Residuals:
   Min     1Q Median     3Q    Max 
-6.794 -1.622 -0.117  1.497  8.528 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5.8726     0.4460  13.168  < 2e-16 ***
rainTRUE     -0.7213     0.3001  -2.404   0.0168 *  
month2        1.5083     0.6409   2.354   0.0191 *  
month3        7.3318     0.6246  11.738  < 2e-16 ***
month4       10.6265     0.6287  16.903  < 2e-16 ***
month5       13.3712     0.6253  21.385  < 2e-16 ***
month6       17.7021     0.6353  27.864  < 2e-16 ***
month7       22.9302     0.6241  36.743  < 2e-16 ***
month8       23.6067     0.6253  37.754  < 2e-16 ***
month9       21.0853     0.6300  33.469  < 2e-16 ***
month10      13.1445     0.6235  21.083  < 2e-16 ***
month11       8.6210     0.6288  13.710  < 2e-16 ***
month12       3.6180     0.6237   5.801 1.47e-08 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.454 on 352 degrees of freedom
Multiple R-squared:  0.9122,    Adjusted R-squared:  0.9092 
F-statistic: 304.9 on 12 and 352 DF,  p-value: < 2.2e-16

月毎に比較した結果，雨の日の方が気温が低いことが示唆される．

診断プロットは以下のようになる．

autoplot(lm(temp ~ rain + month, data = tw_data_fact))