> library(plspm)
> data(mobile)
# Let's take a look at our data
> head(mobile)
[head() output omitted: the first six rows of the 24 manifest variables
ima1-ima5, exp1-exp3, qua1-qua7, val1-val2, sat1-sat3, comp, loy1-loy3,
with ratings scaled to the 0-100 range]
# Let us now create a training and a testing set.
> index <- 1:nrow(mobile)
> set.seed(123)
> testindex <- sample(index, trunc(length(index)/3))
> testset <- mobile[testindex, ]
> trainset <- mobile[-testindex, ]
# Now we will create the response variable for the training set,
# as the average of the 5 image variables.
> h1 <- nrow(trainset)
> trainimean <- numeric(h1)
> for(i in 1:h1) {
+   trainimean[i] <- (trainset$ima1[i] + trainset$ima2[i] + trainset$ima3[i] +
+                     trainset$ima4[i] + trainset$ima5[i])/5
+ }
# Let us now remove, from the training set, the columns that correspond
# to the five indicators of the latent variable image.
> trainset1 <- trainset[-c(1:5)]
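# Aside: the same per-row average can be computed without an explicit loop.
# A minimal equivalent sketch using base R's rowMeans (the analogous
# expression works for the testing set below):
> trainimean <- rowMeans(trainset[, c("ima1", "ima2", "ima3", "ima4", "ima5")])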
# Now we will create the response variable for the testing set,
# as the average of the 5 image variables.
> h1 <- nrow(testset)
> testimean <- numeric(h1)
> for(i in 1:h1) {
+   testimean[i] <- (testset$ima1[i] + testset$ima2[i] + testset$ima3[i] +
+                    testset$ima4[i] + testset$ima5[i])/5
+ }
# Let us now remove, from the testing set, the columns that correspond
# to the five indicators of the latent variable image.
> testset1 <- testset[-c(1:5)]
# As a benchmark, let us compute a model based on linear regression.
> modelm <- lm(trainimean ~ ., data = trainset1)
# Let us take a look at the coefficients of the linear regression
# and their significance.
> summary(modelm)
...
Coefficients:
             Estimate Std. Error t value Pr(>|t|)
(Intercept)  6.519889   4.633504   1.407 0.161503
exp1        -0.043918   0.039415  -1.114 0.266991
exp2         0.142802   0.038630   3.697 0.000308 ***
exp3         0.051840   0.033601   1.543 0.125032
qua1         0.063068   0.069035   0.914 0.362438
qua2        -0.003949   0.039747  -0.099 0.920984
qua3        -0.016342   0.048095  -0.340 0.734508
qua4         0.012258   0.056228   0.218 0.827728
qua5         0.126600   0.055636   2.275 0.024322 *
qua6         0.153209   0.050627   3.026 0.002924 **
qua7        -0.025878   0.049054  -0.528 0.598610
val1        -0.011279   0.038093  -0.296 0.767586
val2         0.087909   0.051981   1.691 0.092922 .
sat1         0.167089   0.070963   2.355 0.019866 *
sat2        -0.010561   0.047871  -0.221 0.825695
sat3         0.129468   0.057663   2.245 0.026244 *
comp         0.078032   0.035151   2.220 0.027954 *
loy1         0.011654   0.026656   0.437 0.662602
loy2        -0.014807   0.020774  -0.713 0.477100
loy3         0.006408   0.042559   0.151 0.880524
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 8.283 on 147 degrees of freedom
Multiple R-squared:  0.6737,	Adjusted R-squared:  0.6315
F-statistic: 15.97 on 19 and 147 DF,  p-value: < 2.2e-16
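# Aside: a compact way to pull out the predictors significant at the 0.05
# level from the fit above (a minimal sketch; "coefs" is a name introduced
# here for illustration):
> coefs <- summary(modelm)$coefficients
> coefs[coefs[, "Pr(>|t|)"] < 0.05, ]   # exp2, qua5, qua6, sat1, sat3, comp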
# Now let's compute the predictions and mean square prediction error
# for the linear regression.
> predlm <- predict(modelm, testset1)
> sumlm <- 0
> tt <- length(predlm)
> for(i in 1:tt) {
+   sumlm <- sumlm + (predlm[i] - testimean[i])^2
+ }
> sumlm
      72 
10123.05 
> sumglm <- sumlm/tt
# This is the mean square prediction error for the linear regression.
> sumglm
      72 
121.9644 
# (The "72" printed above the values is a row name inherited from the
# named vector predlm.)
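# Aside: each of these error loops can be replaced by a single vectorized
# expression. A minimal sketch with a hypothetical helper ("mse" is not
# part of the original session):
> mse <- function(pred, actual) mean((pred - actual)^2)
> mse(predlm, testimean)   # same value as sumglm above (121.9644)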
> library(pls)
# Let's create the model based on Principal Components Regression.
> modelpcr <- pcr(trainimean ~ ., data = trainset1, validation = "CV")
# Let's create the model based on Partial Least Squares Regression.
> modelplsr <- plsr(trainimean ~ ., data = trainset1, validation = "CV")
# Let's compute the predictions for the principal component and
# partial least squares models.
> predpcr <- predict(modelpcr, testset1)
> predpls <- predict(modelplsr, testset1)
# Now, let's compute the mean square errors for the above predictions.
> sumpcr <- 0
> for(i in 1:tt) {
+   sumpcr <- sumpcr + (predpcr[i] - testimean[i])^2
+ }
> sumpcr
[1] 6942.203
> sumg2 <- sumpcr/tt
# This is the mean square error for the principal components regression.
> sumg2
[1] 83.641
> sumpls <- 0
> for(i in 1:tt) {
+   sumpls <- sumpls + (predpls[i] - testimean[i])^2
+ }
> sumpls
[1] 7090.831
> sumg3 <- sumpls/tt
# This is the mean square error for the partial least squares regression.
> sumg3
[1] 85.4317
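# Caveat: predict() on a pcr/plsr fit returns a three-dimensional array with
# predictions for every number of components, so indexing it as predpcr[i]
# above picks out the one-component predictions. A minimal sketch of choosing
# the number of components explicitly (the "onesigma" heuristic is one of
# several options in the pls package; using it here is an assumption):
> ncomp.pls <- selectNcomp(modelplsr, method = "onesigma")    # CV-based choice
> predpls2 <- predict(modelplsr, testset1, ncomp = ncomp.pls)
> mean((predpls2 - testimean)^2)   # MSE at the selected number of components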
# Finally, let's compute a model and predictions based on ridge regression.
> library(ridge)
> ridge1_reg <- linearRidge(formula = trainimean ~ ., data = trainset1, lambda = "automatic")
> nn <- predict(ridge1_reg, testset1)
> sumri <- 0
> for(i in 1:tt) {
+   sumri <- sumri + (nn[i] - testimean[i])^2
+ }
> sumri
     72 
8944.07 
> sumg4 <- sumri/tt
# This is the mean square error for the ridge regression.
> sumg4
      72 
107.7599 
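# Aside: as a cross-check on the automatic lambda, ridge regression could
# also be fit with the glmnet package, choosing the penalty by
# cross-validation. A minimal sketch, assuming glmnet is installed (none of
# these objects appear in the original session):
> library(glmnet)
> xtr <- as.matrix(trainset1)
> cvfit <- cv.glmnet(xtr, trainimean, alpha = 0)   # alpha = 0 gives ridge
> predgn <- predict(cvfit, newx = as.matrix(testset1), s = "lambda.min")
> mean((predgn - testimean)^2)   # mean square error, for comparison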
# For a final benchmark, let us compute the base error: the mean squared
# difference between the actual test-set image ratings and their overall
# mean. In other words, if we had no other knowledge, we would use this
# mean value as our prediction.
> mean(testimean)
[1] 71.24627
> sumb <- 0
> for(i in 1:tt) {
+   sumb <- sumb + (testimean[i] - mean(testimean))^2
+ }
> sumb
[1] 15056.23
> sumgb <- sumb/tt
# As we see, the base error is much larger than all the previous errors.
> sumgb
[1] 181.4004
# Overall, we see that principal components regression and partial least
# squares regression achieve the smallest mean square prediction errors
# on the test set.
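# To make the comparison explicit, the errors computed above can be gathered
# and sorted in one step (a recap sketch; "errs" is a name introduced here;
# unname() strips the row names inherited by sumglm and sumg4):
> errs <- sort(c(lm = unname(sumglm), pcr = sumg2, pls = sumg3,
+                ridge = unname(sumg4), base = sumgb))
> errs   # pcr 83.64 < pls 85.43 < ridge 107.76 < lm 121.96 < base 181.40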