Steph Locke
2017-04-20
We have the results from our survey.
basedata<-setDT(read_excel(myFile))
results<-basedata[Age<100&Age>Experience,]
Our sample has 30 records but with basic data cleansing we will be working with 28.
DT::datatable(results)
ggplot(results)+ggopts+
aes(x = Age, y=..count..)+
geom_histogram(bins = 10, fill="white", colour="white")
vegalite %>%
add_data(results) %>%
encode_x("Experience","quantitative") %>%
encode_y("*", "quantitative", aggregate="count") %>%
bin_x(maxbins=10) %>% mark_bar()
vegalite %>%
add_data(results) %>%
encode_x("Field","ordinal") %>%
encode_y("*", "quantitative", aggregate="count") %>%
mark_bar()
vegalite %>%
add_data(results) %>%
encode_x("Experience","quantitative") %>%
encode_y("Age", "quantitative") %>%
encode_color("Field", "nominal") %>%
mark_point()
vegalite %>%
add_data(training) %>%
encode_x("Experience","quantitative") %>%
encode_y("Age", "quantitative") %>%
encode_color("Field", "nominal") %>%
mark_point()
vegalite %>%
add_data(holdout) %>%
encode_x("Experience","quantitative") %>%
encode_y("Age", "quantitative") %>%
encode_color("Field", "nominal") %>%
mark_point()
We could take some measure of central tendency to predict the age of attendees.
averages<-training[,.(Mean=floor(mean(Age))
,Median=floor(median(Age))
,Mode=Mode(Age)
)]
knitr::kable(averages)
Mean | Median | Mode |
---|---|---|
36 | 37 | 37 |
holdout[,colnames(averages):=averages]
holdout.m<-melt(holdout, measure.vars = c("Age",colnames(averages)))
vegalite %>%
add_data(holdout.m) %>%
encode_x("Name", "ordinal") %>%
encode_y("value", "quantitative") %>%
encode_color("variable", "nominal")%>%
mark_point()
holdout.lse<-melt(holdout, measure.vars = colnames(averages))
holdout.lse[,Error:=(Age-value)^2]
knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)])
variable | LSE |
---|---|
Median | 357 |
Mode | 357 |
Mean | 362 |
training[,expLMres:=expLM$fitted]
ggplot(training, aes(x=Experience, y=Age))+
geom_point()+
geom_line(aes(y=expLMres),colour="blue")+
theme_minimal()
holdout[,expLMres:=predict(expLM,holdout)]
holdout.m<-melt(holdout, measure.vars = c("Age","expLMres"))
vegalite %>%
add_data(holdout.m) %>%
encode_x("Name", "ordinal") %>%
encode_y("value", "quantitative") %>%
encode_color("variable", "nominal")%>%
mark_point()
holdout.lse<-melt(holdout, measure.vars = c("expLMres",colnames(averages)))
holdout.lse[,Error:=(Age-value)^2]
knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)])
variable | LSE |
---|---|
expLMres | 87.33876 |
Median | 357.00000 |
Mode | 357.00000 |
Mean | 362.00000 |
fieldLM<-lm(Age~Experience + Field, training)
summary(fieldLM)
##
## Call:
## lm(formula = Age ~ Experience + Field, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.855 -4.950 -2.061 4.405 15.789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.4085 3.8575 6.846 3.93e-06 ***
## Experience 0.5889 0.2702 2.179 0.0446 *
## FieldBI 7.7361 5.0842 1.522 0.1476
## FieldDBA 6.1196 4.7355 1.292 0.2146
## FieldOther 3.0252 5.3909 0.561 0.5825
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 16 degrees of freedom
## Multiple R-squared: 0.3976, Adjusted R-squared: 0.2469
## F-statistic: 2.64 on 4 and 16 DF, p-value: 0.07254
training[,fieldLMres:=fieldLM$fitted]
ggplot(training, aes(x=Experience, y=Age, group=Field, colour=Field))+
geom_point()+
geom_line(aes(y=fieldLMres, group=Field),colour="blue")+
facet_wrap(~Field)+
theme_minimal()
holdout[,fieldLMres:=predict(fieldLM,holdout)]
holdout.m<-melt(holdout, measure.vars = c("Age","fieldLMres"))
vegalite %>%
add_data(holdout.m) %>%
encode_x("Name", "ordinal") %>%
encode_y("value", "quantitative") %>%
encode_color("variable", "nominal")%>%
mark_point()
holdout.lse<-melt(holdout, measure.vars = c("fieldLMres","expLMres",colnames(averages)))
holdout.lse[,Error:=(Age-value)^2]
knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)])
variable | LSE |
---|---|
expLMres | 87.33876 |
fieldLMres | 95.72245 |
Median | 357.00000 |
Mode | 357.00000 |
Mean | 362.00000 |