# Attach tidyverse unconditionally: the pipe and the dplyr/ggplot2 verbs used
# further down are needed even when the data files are already cached on disk.
library(tidyverse)

# Download each data file only if it is not already present locally.
data_dir <- "./datat"
base_url <- "http://courses.markuskainu.fi/utur2018/datasetit"
data_files <- c("ess_subset.csv", "ess_metadata.csv")

# download.file() does not create missing directories, so make sure it exists.
dir.create(data_dir, showWarnings = FALSE, recursive = TRUE)

for (data_file in data_files) {
  local_path <- file.path(data_dir, data_file)
  # Check every file on its own so that a missing metadata file is fetched
  # even when the subset file is already there.
  if (!file.exists(local_path)) {
    download.file(paste0(base_url, "/", data_file), destfile = local_path)
  }
}

# d: contents of ess_subset.csv, m: contents of ess_metadata.csv
d <- readr::read_csv(file.path(data_dir, "ess_subset.csv"))
m <- readr::read_csv(file.path(data_dir, "ess_metadata.csv"))
Luottamus toisiin ihmisiin tai ns. yleinen luottamus on paljon tutkittu aihe, enkä suosittele tarttumaan siihen tätä syvemmin. Otin sen ja koetun terveyden tähän tarkasteluun, koska ne ovat selkeitä muuttujia. Tarkastelen siis maatason eroja näissä muuttujissa.
Alla kolmen avainmuuttujan tapausten määrät raakadatassa (siis ennen kuin puuttuvat tiedot on puhdistettu).
# View(m)
# ppltrst: 0 = "You can't be too careful", 10 = "Most people can be trusted"
# health: 1-5, 1 = "very good", 5 = "very bad"
# Tabulate the raw values first so that the codes standing for missing
# answers become visible before they are recoded.
count(d, ppltrst)
## # A tibble: 14 x 2
## ppltrst n
## <int> <int>
## 1 0 76
## 2 1 54
## 3 2 115
## 4 3 263
## 5 4 303
## 6 5 799
## 7 6 635
## 8 7 1426
## 9 8 1531
## 10 9 473
## 11 10 214
## 12 77 2
## 13 88 9
## 14 99 1
# Frequency table of the raw health variable
count(d, health)
## # A tibble: 7 x 2
## health n
## <int> <int>
## 1 1 1732
## 2 2 2602
## 3 3 1251
## 4 4 257
## 5 5 51
## 6 7 3
## 7 8 5
# Number of rows per country code
count(d, cntry)
## # A tibble: 4 x 2
## cntry n
## <chr> <int>
## 1 FI 1925
## 2 IS 880
## 3 NO 1545
## 4 SE 1551
# Recode the various missing-data codes to NA.
# Codes 7-9 (health) and 77-99 (ppltrst) lie outside the substantive scales
# documented above (1-5 and 0-10 respectively).
health_missing_codes <- 7:9
ppltrst_missing_codes <- 77:99
d <- mutate(
  d,
  health = ifelse(health %in% health_missing_codes, NA, health),
  ppltrst = ifelse(ppltrst %in% ppltrst_missing_codes, NA, ppltrst)
)
# Tabulate again after the recoding
count(d, ppltrst)
## # A tibble: 12 x 2
## ppltrst n
## <int> <int>
## 1 0 76
## 2 1 54
## 3 2 115
## 4 3 263
## 5 4 303
## 6 5 799
## 7 6 635
## 8 7 1426
## 9 8 1531
## 10 9 473
## 11 10 214
## 12 NA 12
# Frequency table of health after the recoding
count(d, health)
## # A tibble: 6 x 2
## health n
## <int> <int>
## 1 1 1732
## 2 2 2602
## 3 3 1251
## 4 4 257
## 5 5 51
## 6 NA 8
# Rows per country code (the recoding above does not touch cntry)
count(d, cntry)
## # A tibble: 4 x 2
## cntry n
## <chr> <int>
## 1 FI 1925
## 2 IS 880
## 3 NO 1545
## 4 SE 1551
Lasketaan maatason tunnusluvut ja järjestetään taulukot nousevaan järjestykseen keskiarvon mukaan.
# Summary statistics by country: mean and median of ppltrst (NAs dropped),
# sorted by ascending mean.
ppltrst_grouped <- group_by(d, cntry)
ppltrst_stats <- summarise(
  ppltrst_grouped,
  ppltrst_mean = mean(ppltrst, na.rm = TRUE),
  ppltrst_median = median(ppltrst, na.rm = TRUE)
)
summary_ppltrst <- arrange(ppltrst_stats, ppltrst_mean)
summary_ppltrst
## # A tibble: 4 x 3
## cntry ppltrst_mean ppltrst_median
## <chr> <dbl> <dbl>
## 1 SE 6.22 7.
## 2 IS 6.40 7.
## 3 FI 6.76 7.
## 4 NO 6.80 7.
# Mean and median of health by country (NAs dropped), sorted by ascending mean.
health_grouped <- group_by(d, cntry)
health_stats <- summarise(
  health_grouped,
  health_mean = mean(health, na.rm = TRUE),
  health_median = median(health, na.rm = TRUE)
)
summary_health <- arrange(health_stats, health_mean)
summary_health
## # A tibble: 4 x 3
## cntry health_mean health_median
## <chr> <dbl> <dbl>
## 1 IS 1.95 2.
## 2 NO 1.97 2.
## 3 SE 2.00 2.
## 4 FI 2.14 2.
# Respondent counts per ppltrst score, one panel per country.
# Panels follow the country order of summary_ppltrst (ascending mean).
ppltrst_count_data <- mutate(
  d,
  cntry = factor(cntry, levels = summary_ppltrst$cntry)
)
ggplot(ppltrst_count_data, aes(x = ppltrst)) +
  geom_bar(position = "dodge") +
  facet_wrap(~cntry) +
  scale_x_continuous(breaks = 0:10) +
  labs(title = "ppltrst", subtitle = "Vastaajien määrät") +
  theme_minimal()
#
# Respondent counts per health category, one panel per country.
# Panels follow the country order of summary_health (ascending mean).
health_count_data <- mutate(
  d,
  cntry = factor(cntry, levels = summary_health$cntry)
)
ggplot(health_count_data, aes(x = health)) +
  geom_bar(position = "dodge") +
  facet_wrap(~cntry) +
  labs(title = "health", subtitle = "Vastaajien määrät") +
  theme_minimal()
# Same layout as the count plot, but the bar height is the `prop` statistic
# computed by geom_bar() instead of the raw count.
ppltrst_prop_data <- mutate(
  d,
  cntry = factor(cntry, levels = summary_ppltrst$cntry)
)
ggplot(ppltrst_prop_data, aes(x = ppltrst, y = ..prop..)) +
  geom_bar(position = "dodge") +
  facet_wrap(~cntry) +
  scale_x_continuous(breaks = 0:10) +
  labs(title = "ppltrst", subtitle = "suhteelliset osuudet") +
  theme_minimal()
#
# Health categories per country with the `prop` statistic of geom_bar() on the
# y axis; panels follow the country order of summary_health.
health_prop_data <- mutate(
  d,
  cntry = factor(cntry, levels = summary_health$cntry)
)
ggplot(health_prop_data, aes(x = health, y = ..prop..)) +
  geom_bar(position = "dodge") +
  facet_wrap(~cntry) +
  labs(title = "health", subtitle = "suhteelliset osuudet") +
  theme_minimal()
# Load European boundaries with the eurostat package
geo <- eurostat::get_eurostat_geospatial(output_class = "df")

# Trust: country-level mean of ppltrst, joined onto the boundary rows.
# right_join() keeps only the boundary rows whose NUTS_ID matches one of the
# country codes in the survey data.
karttaan <- d %>%
  group_by(cntry) %>%
  summarise(varx = mean(ppltrst, na.rm = TRUE))
kartta <- right_join(geo, karttaan, by = c("NUTS_ID" = "cntry"))

ggplot(data = kartta, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = varx), color = "dim grey", size = .1) +
  # theme_minimal() is a complete theme, so a preceding theme_light() would be
  # replaced entirely; only the effective theme is kept.
  theme_minimal() +
  # Argument name spelled out in full instead of relying on partial matching.
  coord_map(projection = "orthographic") +
  labs(title = "Yleinen luottamus", fill = "keskiarvo") +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank()
  )
# Self-rated health: country-level mean of health, joined onto the boundary
# rows in `geo` by country code.
karttaan <- d %>%
  group_by(cntry) %>%
  summarise(varx = mean(health, na.rm = TRUE))
kartta <- right_join(geo, karttaan, by = c("NUTS_ID" = "cntry"))

ggplot(data = kartta, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = varx), color = "dim grey", size = .1) +
  # theme_minimal() is a complete theme, so a preceding theme_light() would be
  # replaced entirely; only the effective theme is kept.
  theme_minimal() +
  # Argument name spelled out in full instead of relying on partial matching.
  coord_map(projection = "orthographic") +
  labs(title = "Koettu terveus", fill = "keskiarvo") +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank()
  )
Survey-datan analyysin ensisijainen paketti on survey. Laajemmin R:n survey-menetelmiin voit tutustua Task View'ssa Official Statistics & Survey Methodology. dplyr-tyyppisen syntaksin survey-analyyseihin saat srvyr-paketista. Katso tästä pakettien vertailua: srvyr compared to the survey package
# Build the survey design object using the dweight column as weights.
# The argument must be spelled `weights`: the earlier misspelling `weigth` was
# silently ignored, so every estimate was computed without weights.
# NOTE: the rendered tables that follow were produced before this fix (they
# equal the unweighted country means and raw counts) and must be regenerated
# by re-knitting the document.
ess_survey <- d %>% as_survey_design(weights = dweight)
# Overall survey means of ppltrst and health with confidence intervals,
# reshaped to key/value rows and rendered as a table.
overall_means <- summarize(
  ess_survey,
  ppltrst = survey_mean(ppltrst, vartype = "ci", na.rm = TRUE),
  health = survey_mean(health, vartype = "ci", na.rm = TRUE)
)
kable(gather(overall_means))
key | value |
---|---|
ppltrst | 6.574461 |
ppltrst_low | 6.523327 |
ppltrst_upp | 6.625595 |
health | 2.031563 |
health_low | 2.009335 |
health_upp | 2.053791 |
# By country: survey means with confidence intervals, sorted by ascending
# mean health and rendered as a table.
country_means <- summarize(
  group_by(ess_survey, cntry),
  ppltrst = survey_mean(ppltrst, vartype = "ci", na.rm = TRUE),
  health = survey_mean(health, vartype = "ci", na.rm = TRUE)
)
kable(arrange(country_means, health))
cntry | ppltrst | ppltrst_low | ppltrst_upp | health | health_low | health_upp |
---|---|---|---|---|---|---|
IS | 6.395904 | 6.253294 | 6.538515 | 1.947727 | 1.889138 | 2.006316 |
NO | 6.801556 | 6.709527 | 6.893586 | 1.970874 | 1.926231 | 2.015517 |
SE | 6.216969 | 6.112074 | 6.321864 | 1.999353 | 1.955377 | 2.043329 |
FI | 6.760915 | 6.675247 | 6.846583 | 2.144566 | 2.107866 | 2.181266 |
Osuudet ja määrät
# Proportion and total for each health category within each country,
# rendered as a table.
health_by_country <- group_by(ess_survey, cntry, health)
health_shares <- summarize(
  health_by_country,
  proportion = survey_mean(na.rm = TRUE),
  total = survey_total(na.rm = TRUE)
)
kable(health_shares)
cntry | health | proportion | proportion_se | total | total_se |
---|---|---|---|---|---|
FI | 1 | 0.2199688 | 0.0094468 | 423 | 19.817789 |
FI | 2 | 0.4654186 | 0.0113756 | 895 | 27.556936 |
FI | 3 | 0.2698908 | 0.0101236 | 519 | 21.758528 |
FI | 4 | 0.0395216 | 0.0044433 | 76 | 8.662211 |
FI | 5 | 0.0052002 | 0.0016403 | 10 | 3.159865 |
IS | 1 | 0.3556818 | 0.0161390 | 313 | 17.217669 |
IS | 2 | 0.4000000 | 0.0165159 | 352 | 18.195026 |
IS | 3 | 0.1909091 | 0.0132497 | 168 | 12.776727 |
IS | 4 | 0.0477273 | 0.0071872 | 42 | 6.458184 |
IS | 5 | 0.0056818 | 0.0025340 | 5 | 2.235310 |
NO | 1 | 0.3333333 | 0.0119941 | 515 | 21.682571 |
NO | 2 | 0.4349515 | 0.0126135 | 672 | 24.404390 |
NO | 3 | 0.1689320 | 0.0095334 | 261 | 15.795515 |
NO | 4 | 0.0530744 | 0.0057039 | 82 | 8.993011 |
NO | 5 | 0.0097087 | 0.0024948 | 15 | 3.868386 |
SE | 1 | 0.3113269 | 0.0117811 | 481 | 21.020652 |
SE | 2 | 0.4420712 | 0.0126360 | 683 | 24.577425 |
SE | 3 | 0.1961165 | 0.0101024 | 303 | 16.955545 |
SE | 4 | 0.0368932 | 0.0047960 | 57 | 7.513919 |
SE | 5 | 0.0135922 | 0.0029461 | 21 | 4.574802 |
Sessioninfo on hyvä tulostaa aina dokumentin loppuun, jotta lukija voi tarkistaa, millaisessa ympäristössä analyysit on tehty!
# Print the session settings (R version, system, locale, date) and the
# versions of the packages in use, so the run can be reproduced.
devtools::session_info()
## setting value
## version R version 3.4.3 (2017-11-30)
## system x86_64, linux-gnu
## ui X11
## language (EN)
## collate fi_FI.UTF-8
## tz Europe/Mariehamn
## date 2018-04-13
##
## package * version date source
## assertthat 0.2.0 2017-04-11 CRAN (R 3.4.0)
## backports 1.1.2 2017-12-13 cran (@1.1.2)
## base * 3.4.3 2017-12-01 local
## bindr 0.1.1 2018-03-13 CRAN (R 3.4.3)
## bindrcpp * 0.2 2017-06-17 CRAN (R 3.4.0)
## broom 0.4.3 2017-11-20 CRAN (R 3.4.2)
## cellranger 1.1.0 2016-07-27 CRAN (R 3.4.0)
## class 7.3-14 2015-08-30 CRAN (R 3.4.0)
## classInt 0.1-24 2017-04-16 CRAN (R 3.4.0)
## cli 1.0.0 2017-11-05 CRAN (R 3.4.2)
## codetools 0.2-15 2016-10-05 CRAN (R 3.3.1)
## colorspace 1.3-2 2016-12-14 CRAN (R 3.4.0)
## compiler 3.4.3 2017-12-01 local
## crayon 1.3.4 2017-09-16 CRAN (R 3.4.2)
## datasets * 3.4.3 2017-12-01 local
## devtools 1.13.5 2018-02-18 CRAN (R 3.4.3)
## digest 0.6.15 2018-01-28 cran (@0.6.15)
## dplyr * 0.7.4 2017-09-28 CRAN (R 3.4.2)
## e1071 1.6-8 2017-02-02 CRAN (R 3.4.0)
## eurostat * 3.1.5 2017-08-09 CRAN (R 3.4.1)
## evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
## forcats * 0.3.0 2018-02-19 CRAN (R 3.4.3)
## foreign 0.8-69 2017-06-21 CRAN (R 3.4.0)
## fs * 1.2.2 2018-03-21 CRAN (R 3.4.3)
## ggplot2 * 2.2.1.9000 2018-03-26 Github (tidyverse/ggplot2@3c9c504)
## glue 1.2.0 2017-10-29 CRAN (R 3.4.2)
## graphics * 3.4.3 2017-12-01 local
## grDevices * 3.4.3 2017-12-01 local
## grid * 3.4.3 2017-12-01 local
## gtable 0.2.0 2016-02-26 CRAN (R 3.4.0)
## haven 1.1.1 2018-01-18 CRAN (R 3.4.3)
## highr 0.6 2016-05-09 CRAN (R 3.4.0)
## hms 0.4.2 2018-03-10 CRAN (R 3.4.3)
## htmltools 0.3.6 2017-04-28 CRAN (R 3.4.0)
## httr 1.3.1 2017-08-20 CRAN (R 3.4.1)
## jsonlite 1.5 2017-06-01 CRAN (R 3.4.0)
## knitr * 1.20 2018-02-20 CRAN (R 3.4.3)
## labeling 0.3 2014-08-23 CRAN (R 3.4.0)
## lattice 0.20-35 2017-03-25 CRAN (R 3.3.3)
## lazyeval 0.2.1 2017-10-29 CRAN (R 3.4.2)
## lubridate 1.7.3 2018-02-27 CRAN (R 3.4.3)
## magrittr 1.5 2014-11-22 CRAN (R 3.4.0)
## mapproj 1.2-5 2017-06-08 CRAN (R 3.4.0)
## maps 3.3.0 2018-04-03 CRAN (R 3.4.3)
## Matrix * 1.2-12 2017-11-16 CRAN (R 3.4.2)
## memoise 1.1.0 2017-04-21 CRAN (R 3.4.0)
## methods * 3.4.3 2017-12-01 local
## mnormt 1.5-5 2016-10-15 CRAN (R 3.4.0)
## modelr 0.1.1 2017-07-24 CRAN (R 3.4.1)
## munsell 0.4.3 2016-02-13 CRAN (R 3.4.0)
## nlme 3.1-131.1 2018-02-16 CRAN (R 3.4.3)
## parallel 3.4.3 2017-12-01 local
## pillar 1.2.1 2018-02-27 CRAN (R 3.4.3)
## pkgconfig 2.0.1 2017-03-21 CRAN (R 3.4.0)
## plyr 1.8.4 2016-06-08 CRAN (R 3.4.0)
## psych 1.7.8 2017-09-09 CRAN (R 3.4.2)
## purrr * 0.2.4 2017-10-18 CRAN (R 3.4.2)
## R6 2.2.2 2017-06-17 CRAN (R 3.4.0)
## RColorBrewer 1.1-2 2014-12-07 CRAN (R 3.4.0)
## Rcpp 0.12.15 2018-01-20 CRAN (R 3.4.3)
## readr * 1.1.1 2017-05-16 CRAN (R 3.4.3)
## readxl 1.0.0 2017-04-18 CRAN (R 3.4.0)
## reshape2 1.4.3 2017-12-11 cran (@1.4.3)
## rlang 0.2.0.9001 2018-03-26 Github (r-lib/rlang@49d7a34)
## rmarkdown 1.9 2018-03-01 CRAN (R 3.4.3)
## rprojroot 1.3-2 2018-01-03 cran (@1.3-2)
## rstudioapi 0.7 2017-09-07 CRAN (R 3.4.2)
## rvest 0.3.2 2016-06-17 CRAN (R 3.4.0)
## scales 0.5.0.9000 2018-02-02 Github (hadley/scales@d767915)
## sp 1.2-7 2018-01-19 CRAN (R 3.4.3)
## splines 3.4.3 2017-12-01 local
## srvyr * 0.3.1 2018-03-10 CRAN (R 3.4.3)
## stats * 3.4.3 2017-12-01 local
## stringi 1.1.6 2017-11-17 CRAN (R 3.4.2)
## stringr * 1.3.0 2018-02-19 CRAN (R 3.4.3)
## survey * 3.33-2 2018-03-13 CRAN (R 3.4.3)
## survival * 2.41-3 2017-04-04 CRAN (R 3.4.0)
## tibble * 1.4.2 2018-01-22 cran (@1.4.2)
## tidyr * 0.8.0 2018-01-29 CRAN (R 3.4.3)
## tidyverse * 1.2.1 2017-11-14 CRAN (R 3.4.2)
## tools 3.4.3 2017-12-01 local
## utf8 1.1.3 2018-01-03 cran (@1.1.3)
## utils * 3.4.3 2017-12-01 local
## withr 2.1.2 2018-03-26 Github (jimhester/withr@79d7b0d)
## xml2 1.2.0 2018-01-24 cran (@1.2.0)
## yaml 2.1.18 2018-03-08 CRAN (R 3.4.3)