26 GLM
GLM.jl adds support for generalized linear models and includes convenient functionality for statistical modeling, such as dummy variable coding.

using GLM, StatsBase, CategoricalArrays, DataFrames

26.1 Data
x1, x2 = randn(200), randn(200)
([-0.6801322947126893, -0.20078598406767398, -1.0537649731208625, 0.22067417688653157, -0.1551638336446484, -0.8452113183565811, -1.539383623250148, -1.4830396834917876, -0.5161779252210892, -0.06557934768322994 … -0.8859405196546517, 0.5834529913849991, 0.6056695309981021, -2.270925211265909, -0.5701225871092417, -0.9702610179837404, 0.36937481975829634, -1.7209328949754172, -0.0015845851072174104, -1.0373347107784006], [-0.9046720511242147, 0.0013020782583027864, -0.57638810919079, 0.9977003196246026, 1.1727506264514511, 0.9711341455362745, 0.7529730189675592, 0.046788577042474126, 1.192493348277341, -0.9666790665117393 … -0.25456437424594636, -1.0816035596917075, 1.6689828912229465, 1.9427961127870692, 0.9207322700917318, 1.9083989435112387, -0.3657866677786876, -0.4759472691691634, 0.9388324239077094, -1.2750732314649833])
x = hcat(x1, x2)
200×2 Matrix{Float64}:
-0.680132 -0.904672
-0.200786 0.00130208
-1.05376 -0.576388
0.220674 0.9977
-0.155164 1.17275
-0.845211 0.971134
-1.53938 0.752973
-1.48304 0.0467886
-0.516178 1.19249
-0.0655793 -0.966679
⋮
0.583453 -1.0816
0.60567 1.66898
-2.27093 1.9428
-0.570123 0.920732
-0.970261 1.9084
0.369375 -0.365787
-1.72093 -0.475947
-0.00158459 0.938832
-1.03733 -1.27507
w = randn(2)
2-element Vector{Float64}:
-0.68458690711774
-1.092307385975893
y = x * w + randn(200)
200-element Vector{Float64}:
2.4830943091272504
-0.3163488721279369
1.5993840711823684
-0.32273331930682314
-0.8468678559914746
0.4710275274397834
2.9854838304585467
0.6238129867521336
1.4219679724703098
1.2843304851986987
⋮
-0.3567224451461515
-2.3593894352082163
0.9406136017036868
-0.10727012326206642
-0.716659772001473
2.1545399231205113
3.638188429413554
-2.1797634966094668
1.6989993431780037
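These draws are random, so the numbers above will differ from run to run. To make the example reproducible, one could seed the global RNG before the randn and sample calls; a minimal sketch (the seed value is arbitrary):

using Random
Random.seed!(1234)   # arbitrary seed; makes subsequent randn/sample calls reproducible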
26.2 GLM
lm1 = fit(GeneralizedLinearModel, x, y, Normal())
GLM.GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Distributions.Normal{Float64}, GLM.IdentityLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}:
Coefficients:
─────────────────────────────────────────────────────────────────
Coef. Std. Error z Pr(>|z|) Lower 95% Upper 95%
─────────────────────────────────────────────────────────────────
x1 -0.819765 0.0735793 -11.14 <1e-28 -0.963977 -0.675552
x2 -1.08001 0.0666791 -16.20 <1e-58 -1.2107 -0.949318
─────────────────────────────────────────────────────────────────
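The estimates sit close to the true weights w used to simulate y. The usual accessors work on the fitted object, and note that fitting on a raw matrix does not add an intercept column automatically. A minimal sketch (lm1_int is a hypothetical name, not from the text above):

coef(lm1)        # estimated coefficients, roughly recovering w
stderror(lm1)    # the standard errors shown in the table above
predict(lm1)     # fitted values for the design matrix x

# To include an intercept when fitting on a matrix, prepend a column of ones:
lm1_int = fit(GeneralizedLinearModel, hcat(ones(200), x), y, Normal())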
xc1 = categorical(sample(["a", "b", "c"], 200))
200-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"a"
"b"
"a"
"b"
"a"
"c"
"b"
"c"
"a"
"b"
⋮
"a"
"c"
"c"
"a"
"c"
"b"
"b"
"b"
"c"
datb = DataFrame(xc1 = xc1, x1 = x1, y = y);
mod2 = fit(GeneralizedLinearModel,
           @formula(y ~ xc1 + x1), datb, Normal())
StatsModels.TableRegressionModel{GeneralizedLinearModel{GLM.GlmResp{Vector{Float64}, Normal{Float64}, IdentityLink}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
y ~ 1 + xc1 + x1
Coefficients:
────────────────────────────────────────────────────────────────────────────
Coef. Std. Error z Pr(>|z|) Lower 95% Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept) 0.273156 0.169113 1.62 0.1063 -0.0582997 0.604612
xc1: b -0.0519433 0.247147 -0.21 0.8335 -0.536343 0.432456
xc1: c -0.41881 0.241815 -1.73 0.0833 -0.89276 0.0551392
x1 -0.837553 0.112263 -7.46 <1e-13 -1.05758 -0.617522
────────────────────────────────────────────────────────────────────────────
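The default dummy coding can be overridden through the contrasts keyword that StatsModels provides (and that GLM.jl's fit forwards), for instance to make "b" the reference level instead of "a". A sketch, with mod2b a hypothetical name:

mod2b = fit(GeneralizedLinearModel, @formula(y ~ xc1 + x1), datb, Normal();
            contrasts = Dict(:xc1 => DummyCoding(base = "b")))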
mod2a = lm(Term(:y) ~ sum(Term.(Symbol.(names(datb[:, Not(:y)])))), datb)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
y ~ 1 + xc1 + x1
Coefficients:
────────────────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept) 0.273156 0.169113 1.62 0.1079 -0.0603591 0.606671
xc1: b -0.0519433 0.247147 -0.21 0.8338 -0.539352 0.435466
xc1: c -0.41881 0.241815 -1.73 0.0849 -0.895705 0.0580839
x1 -0.837553 0.112263 -7.46 <1e-11 -1.05895 -0.616155
────────────────────────────────────────────────────────────────────────────
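lm fits the same model as a Normal GLM with the identity link, so mod2 and mod2a share their point estimates; only the inference differs (z statistics above, t statistics here). A quick check:

coef(mod2) ≈ coef(mod2a)   # true: identical point estimates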
= "y" outcome
"y"
lm(Term(Symbol(outcome)) ~ sum(Term.(Symbol.(names(datb[:, Not(:y)])))), datb)
StatsModels.TableRegressionModel{LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, LinearAlgebra.CholeskyPivoted{Float64, Matrix{Float64}, Vector{Int64}}}}, Matrix{Float64}}
y ~ 1 + xc1 + x1
Coefficients:
────────────────────────────────────────────────────────────────────────────
Coef. Std. Error t Pr(>|t|) Lower 95% Upper 95%
────────────────────────────────────────────────────────────────────────────
(Intercept) 0.273156 0.169113 1.62 0.1079 -0.0603591 0.606671
xc1: b -0.0519433 0.247147 -0.21 0.8338 -0.539352 0.435466
xc1: c -0.41881 0.241815 -1.73 0.0849 -0.895705 0.0580839
x1 -0.837553 0.112263 -7.46 <1e-11 -1.05895 -0.616155
────────────────────────────────────────────────────────────────────────────
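The same formula can also be assembled with StatsModels' term helper (reexported by GLM), which converts strings as well as symbols to terms, so the explicit Symbol conversion is not needed. A sketch, with predictors and f hypothetical names:

predictors = names(datb, Not(:y))        # ["xc1", "x1"]
f = term(outcome) ~ sum(term.(predictors))
lm(f, datb)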