From c71d91133293bed8a7ceca5db3cc528e8f16909a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Holger=20D=C3=B6ring?= Date: Thu, 15 Jun 2017 16:42:53 +0200 Subject: [PATCH] tidyverse tools, style guide, countrycode package * use `tidyverse` tools, esp. `read_csv()`, `write_csv()` * use tidyverse style guide, esp. " ' " --> ' " ' * use `countrycode` package, instead of `country.csv` --- import/ches/ches-party-info.R | 11 +++++------ import/ches/ches.R | 2 +- import/clea/clea-national-vote.R | 22 +++++++++++----------- import/clea/clea.R | 12 ++++++------ import/ees14/ees14.R | 15 +++++++-------- import/epac/epac.R | 20 ++++++++++---------- import/huber/huber.R | 29 ++++++++--------------------- import/janda/janda.R | 14 +++++++------- import/marpor/marpor-share.R | 7 +++---- import/marpor/marpor.R | 14 ++++++-------- import/parlgov/parlgov.R | 24 ++++++++++++------------ import/ray/ray.R | 12 ++++++------ import/readme.md | 4 ++-- import/thomas/thomas.R | 17 ++++++++--------- 14 files changed, 92 insertions(+), 111 deletions(-) diff --git a/import/ches/ches-party-info.R b/import/ches/ches-party-info.R index 3fac993..08b3ce0 100644 --- a/import/ches/ches-party-info.R +++ b/import/ches/ches-party-info.R @@ -1,12 +1,11 @@ -library("dplyr") -library("stringr") +library(tidyverse) url <- "http://www.chesdata.eu/1999-2014/1999-2014_CHES_dataset_means.csv" -file_name <- paste0("source__", str_replace(url, ".+/", "")) +file_name <- "source__1999-2014_CHES_dataset_means.csv" if( ! file.exists(file_name)) { - download.file(url, file_name, mode="wb") + download.file(url, file_name, mode = "wb") } -trend_raw <- read.csv(file_name, fileEncoding="utf-8", as.is=TRUE) +trend_raw <- read_csv(file_name) trend <- trend_raw %>% group_by(party_id) %>% @@ -19,4 +18,4 @@ trend <- trend_raw %>% distinct(party_id, .keep_all = TRUE) %>% select(country, party_id, party, cmp_id, electionyear, vote, year_first, year_last) -write.csv(trend, "ches-party-info.csv", na="", row.names = FALSE, fileEncoding="utf-8") +write_csv(trend, "ches-party-info.csv", na = "") diff --git a/import/ches/ches.R b/import/ches/ches.R index 39ef57a..a0f4087 100644 --- a/import/ches/ches.R +++ b/import/ches/ches.R @@ -1,4 +1,4 @@ -library("dplyr") +library(dplyr) party <- read.csv("ches-parties.csv", fileEncoding="utf-8", as.is=TRUE) country <- read.csv("ches-country.csv", fileEncoding="utf-8", as.is=TRUE) diff --git a/import/clea/clea-national-vote.R b/import/clea/clea-national-vote.R index 974c18a..dbe5021 100644 --- a/import/clea/clea-national-vote.R +++ b/import/clea/clea-national-vote.R @@ -1,26 +1,26 @@ library(tidyverse) library(stringr) -clea_version <- '20170530' +clea_version <- "20170530" max_share <- 2.0 -path <- str_interp('source__clea/clea_${clea_version}') +path <- str_interp("source__clea/clea_${clea_version}") # Stata exported RDS file to save disk space -clea_rdata <- str_interp('${path}/clea_${clea_version}.Rds') +clea_rdata <- str_interp("${path}/clea_${clea_version}.Rds") if( ! file.exists(clea_rdata)) { library(haven) - clea <- haven::read_dta(str_interp('${path}/clea_${clea_version}_stata.zip')) + clea <- haven::read_dta(str_interp("${path}/clea_${clea_version}_stata.zip")) saveRDS(clea, file = clea_rdata, ascii = TRUE) } # read CLEA data only once -if( ! exists('clea_raw')) { - clea_raw <- readRDS(file=str_interp('${path}/clea_${clea_version}.Rds')) +if( ! exists("clea_raw")) { + clea_raw <- readRDS(file=str_interp("${path}/clea_${clea_version}.Rds")) } clea <- clea_raw %>% filter(pv1 > 0) %>% - mutate(ctr_n = recode(ctr_n, UK='United Kingdom', US='United States of America'), + mutate(ctr_n = recode(ctr_n, UK="United Kingdom", US="United States of America"), pv1 = as.numeric(pv1), mn = if_else(ctr == 840, 0, as.numeric(mn))) # unify US election months @@ -63,16 +63,16 @@ elec_out <- pa_name %>% group_by() %>% arrange(ctr_n, yr, mn, -pv1_share) -write.csv(elec_out, 'source__clea/clea_national_vote.csv', - na = '', fileEncoding = 'utf-8', row.names = FALSE) +write.csv(elec_out, "source__clea/clea_national_vote.csv", + na = "", fileEncoding = "utf-8", row.names = FALSE) ## Party information for Party Facts data import # filter none, others, alliances, independents -# higher threshold because of votes not in 'pv1' parties +# higher threshold because of votes not in "pv1" parties party_out <- party %>% mutate(ctr_pty = ctr*1000000 + pty) %>% filter(pty > 0, pty < 4000, pv1_share_max >= max_share) -write_csv(party_out, 'clea-national-vote.csv', na = '') +write_csv(party_out, "clea-national-vote.csv", na = "") diff --git a/import/clea/clea.R b/import/clea/clea.R index 663c331..3e2b05e 100644 --- a/import/clea/clea.R +++ b/import/clea/clea.R @@ -2,25 +2,25 @@ library(tidyverse) library(stringr) library(countrycode) -clea_version <- '20170530' +clea_version <- "20170530" path <- str_interp("source__clea/clea_${clea_version}/clea_${clea_version}_appendix_II.csv") party_raw <- read_csv(path) # add CLEA data variable names to party information and clean-up data for import party <- party_raw -names(party) <- c('ctr_n', 'pty', 'abbr', 'name', 'name_english', 'information') +names(party) <- c("ctr_n", "pty", "abbr", "name", "name_english", "information") # add time and size information and select larger parties vote <- read_csv("clea-national-vote.csv") party <- party %>% - mutate(pty = as.integer(pty)) %>% + mutate(pty = as.integer(str_extract_all(pty, "\\d+"))) %>% inner_join(vote) # add Party Facts country codes party <- party %>% - mutate(country = countrycode(ctr_n, 'country.name', 'iso3c', - custom_match = c(Kosovo='XKX', Zambia='ZMB'))) + mutate(country = countrycode(ctr_n, "country.name", "iso3c", + custom_match = c(Kosovo="XKX", Zambia="ZMB"))) if(any(is.na(party$country))) { warning("Country name clean-up needed") } @@ -28,4 +28,4 @@ if(any(is.na(party$country))) { # clean-up CLEA data for import party[nchar(party$abbr) > 25 & ! is.na(party$abbr), "abbr"] <- NA -write_csv(party, 'clea.csv', na = '') +write_csv(party, "clea.csv", na = "") diff --git a/import/ees14/ees14.R b/import/ees14/ees14.R index 71234ba..5372316 100644 --- a/import/ees14/ees14.R +++ b/import/ees14/ees14.R @@ -1,11 +1,10 @@ -library('dplyr') +library(tidyverse) +library(countrycode) -ees <- read.csv('parties-ees-ches-ess.csv', fileEncoding = 'utf-8', as.is=TRUE) +ees_raw <- read_csv("parties-ees-ches-ess.csv") -country <- read.csv('../country.csv', fileEncoding = 'utf-8', as.is=TRUE) -country <- country %>% select(iso2, country_iso3 = iso3) +ees <- ees_raw %>% + mutate(country_iso3 = countrycode(country, "iso2c", "iso3c", + custom_match = c(UK="GBR"))) -ees[ees$country == 'UK', 'country'] <- 'GB' -ees <- ees %>% left_join(country, by = c('country' = 'iso2')) - -write.csv(ees, 'ees14.csv', na='', fileEncoding = 'utf-8', row.names = FALSE) +write_csv(ees, "ees14.csv", na = "") diff --git a/import/epac/epac.R b/import/epac/epac.R index 48b1e16..6cc9526 100644 --- a/import/epac/epac.R +++ b/import/epac/epac.R @@ -1,14 +1,14 @@ -library("tidyverse") -library("readxl") -library("countrycode") +library(tidyverse) +library(readxl) +library(countrycode) epac_raw <- read_excel("epac-parties-2016.xlsx") write_csv(epac_raw, "epac-parties-2016.csv", na = "") # add Party Facts country codes epac <- epac_raw %>% - mutate(country = countrycode(country_name, 'country.name', 'iso3c', - custom_match = c(Kosovo='XKX')), + mutate(country = countrycode(country_name, "country.name", "iso3c", + custom_match = c(Kosovo="XKX")), seat = round(seat, 1)) if(any(is.na(epac$country))) { warning("Country name clean-up needed") @@ -16,12 +16,12 @@ if(any(is.na(epac$country))) { epac_2014_raw <- read_csv("import-2014/epac.csv") -epac_2014_add <- epac_2014_raw %>% - rename(country = country_name_short, country_name = country, party_id = id, - party_accr = accronym, party_name_en = party_name_english) %>% - mutate(round = 2011, pec = NA, elecyear = NA) %>% +epac_2014_add <- epac_2014_raw %>% + rename(country = country_name_short, country_name = country, party_id = id, + party_accr = accronym, party_name_en = party_name_english) %>% + mutate(round = 2011, pec = NA, elecyear = NA) %>% filter( ! party_id %in% epac$party_id) epac <- epac %>% bind_rows(epac_2014_add) %>% arrange(party_id) -write.csv(epac, "epac.csv", na='', fileEncoding = "utf-8", row.names = FALSE) +write.csv(epac, "epac.csv", na = "", fileEncoding = "utf-8", row.names = FALSE) diff --git a/import/huber/huber.R b/import/huber/huber.R index ac06f79..b732e57 100644 --- a/import/huber/huber.R +++ b/import/huber/huber.R @@ -1,24 +1,11 @@ -library('dplyr') +library(tidyverse) +library(countrycode) -# reading huber data and renaming 'id' to 'party_id' -party_raw <- read.csv('huber_inglehart_1995.csv', as.is=TRUE) -party <- party_raw %>% rename(party_id=id) +huber_raw <- read_csv("huber_inglehart_1995.csv") -# reading contry data and convert 'country_name' to upper-case characters -country_raw <- read.csv('../country.csv', fileEncoding = 'utf-8', as.is=TRUE) -country <- country_raw %>% - rename(country_name_short = name_short) %>% - mutate(country = toupper(name)) +huber <- huber_raw %>% + rename(huber_id=id) %>% + mutate(country_name_short = countrycode(country, "country.name", "iso3c", + custom_match = c(`NORTHERN IRELAND`="NIR"))) -# merging country and huber data to get 'country_name_short' -party <- party %>% left_join(country %>% select(country_name_short, country), by='country') - -# adding missing country abbreviations -country_update <- list('BRITAIN'='GBR', 'SOUTH KOREA'='KOR', 'USA'='USA') -for (to_update in names(country_update)) { - party[party$country == to_update, 'country_name_short'] <- country_update[[to_update]] -} -if(any(is.na(party$country_name_short))) warning("Not all observations have country keys") - -# creating the csv file -write.csv(party, 'huber.csv', na='', fileEncoding='utf-8', row.names = FALSE) +write_csv(huber, "huber.csv", na = "") diff --git a/import/janda/janda.R b/import/janda/janda.R index cea0f39..b52e483 100644 --- a/import/janda/janda.R +++ b/import/janda/janda.R @@ -1,17 +1,17 @@ -library("dplyr") +library(tidyverse) -janda <- read.csv('janda-parties.csv', fileEncoding='utf-8', as.is=TRUE) -country <- read.csv('janda-country.csv', fileEncoding='utf-8', as.is=TRUE) +janda <- read_csv("janda-parties.csv") +country <- read_csv("janda-country.csv") # Extract country id from party id janda <- janda %>% mutate(country_id = substr(janda_id, 1, nchar(janda_id) - 1) %>% as.integer, - country_id = ifelse(janda_id >= 10, country_id, 0)) # add US country id '0' + country_id = ifelse(janda_id >= 10, country_id, 0)) # add US country id "0" # Merge parties and country list -janda <- janda %>% +janda <- janda %>% left_join(country, by = c("country_id" = "id")) %>% select(-country_id, country_short = short) %>% - filter(country_short != '') + filter(country_short != "") -write.csv(janda, "janda.csv", na='', fileEncoding = "utf-8", row.names = FALSE) +write_csv(janda, "janda.csv", na = "") diff --git a/import/marpor/marpor-share.R b/import/marpor/marpor-share.R index 1edcb9d..48aba7f 100644 --- a/import/marpor/marpor-share.R +++ b/import/marpor/marpor-share.R @@ -1,10 +1,9 @@ -library(dplyr) -library(readr) +library(tidyverse) marpor <- read_csv("source__MPDataset_MPDS2016b.csv") pa_share <- marpor %>% - select(party, country, countryname, date, pervote) %>% + select(party, country, countryname, date, pervote) %>% mutate(year = date %/% 100) %>% group_by(party) %>% mutate(pervote_max = max(pervote, na.rm = TRUE)) %>% @@ -14,4 +13,4 @@ pa_share <- marpor %>% arrange(party) %>% select(party, pervote_max_year = year, pervote_max) -write.csv(pa_share, "marpor-share.csv", na="", row.names = FALSE, fileEncoding = "utf-8") +write_csv(pa_share, "marpor-share.csv", na = "") diff --git a/import/marpor/marpor.R b/import/marpor/marpor.R index d0177cb..0dfffec 100644 --- a/import/marpor/marpor.R +++ b/import/marpor/marpor.R @@ -1,17 +1,15 @@ -library(dplyr) -library(readr) +library(tidyverse) library(countrycode) marpor_raw <- read_csv("marpor-2016.csv") -marpor <- marpor_raw %>% select(-country) - marpor_share <- read_csv("marpor-share.csv") -marpor <- marpor %>% left_join(marpor_share) + +marpor <- marpor_raw %>% select(-country) %>% left_join(marpor_share) # add Party Facts country codes marpor <- marpor %>% - mutate(country = countrycode(countryname, 'country.name', 'iso3c', - custom_match = c(`Northern Ireland`='NIR'))) + mutate(country = countrycode(countryname, "country.name", "iso3c", + custom_match = c(`Northern Ireland`="NIR"))) if(any(is.na(marpor$country))) { warning("Country name clean-up needed") } @@ -19,4 +17,4 @@ if(any(is.na(marpor$country))) { # replace party short longer than 25 chars marpor[nchar(marpor$abbrev) > 25 & ! is.na(marpor$abbrev), "abbrev"] <- NA -write.csv(marpor, "marpor.csv", na="", row.names = FALSE, fileEncoding="utf-8") +write_csv(marpor, "marpor.csv", na = "") diff --git a/import/parlgov/parlgov.R b/import/parlgov/parlgov.R index 79d5f06..31ef217 100644 --- a/import/parlgov/parlgov.R +++ b/import/parlgov/parlgov.R @@ -1,8 +1,8 @@ -library('tidyverse') -library('dbplyr') +library(tidyverse) +library(dbplyr) -url <- 'http://www.parlgov.org/static/data/parlgov-development.db' -db_file <- 'source__parlgov.db' +url <- "http://www.parlgov.org/static/data/parlgov-development.db" +db_file <- "source__parlgov.db" if( ! file.exists(db_file)) { download.file(url, db_file, mode = "wb") } @@ -10,9 +10,9 @@ if( ! file.exists(db_file)) { con <- DBI::dbConnect(RSQLite::SQLite(), db_file) tbl_parlgov <- function(table) tbl(con, table) %>% as_tibble() -party <- tbl_parlgov('view_party') -elec <- tbl_parlgov('view_election') -party_raw <- tbl_parlgov('party') +party <- tbl_parlgov("view_party") +elec <- tbl_parlgov("view_election") +party_raw <- tbl_parlgov("party") # calculate first and last year each party elec_year <- elec %>% @@ -23,7 +23,7 @@ elec_year <- elec %>% # calculate max vote share each party elec_share <- elec %>% - filter(election_type == 'parliament') %>% + filter(election_type == "parliament") %>% group_by(party_id) %>% mutate(vote_share_max = max(vote_share, na.rm = TRUE)) %>% filter(vote_share == vote_share_max) %>% @@ -33,15 +33,15 @@ elec_share <- elec %>% vote_share_max_year = substr(election_date, 1, 4)) %>% select(party_id, vote_share_max_year, vote_share_max) -parlgov_url <- 'http://www.parlgov.org/explore/%s/party/%d/' +parlgov_url <- "http://www.parlgov.org/explore/%s/party/%d/" parlgov <- party %>% select(party_id, country_name_short:family_name, -country_name, -party_name_ascii, -family_name) %>% - filter(family_name_short != 'none') %>% + filter(family_name_short != "none") %>% mutate(url = sprintf(parlgov_url, tolower(country_name_short), party_id)) %>% left_join(elec_year) %>% left_join(elec_share) %>% - left_join(party_raw %>% select(id, wikipedia), by = c('party_id' = 'id')) %>% + left_join(party_raw %>% select(id, wikipedia), by = c("party_id" = "id")) %>% arrange(country_name_short, party_name) # create import file and remove downloaded source files -write_csv(parlgov, 'parlgov.csv', na = '') +write_csv(parlgov, "parlgov.csv", na = "") diff --git a/import/ray/ray.R b/import/ray/ray.R index a46d65a..732a0b4 100644 --- a/import/ray/ray.R +++ b/import/ray/ray.R @@ -1,17 +1,17 @@ -library("tidyverse") -library("stringr") -library("haven") -library("countrycode") +library(tidyverse) +library(stringr) +library(haven) +library(countrycode) # get local copy of data file url <- "http://www.lsu.edu/faculty/lray2/data/1996survey/1996survey.sav?export=sav" data_file_local <- "source__1996survey.sav" if( ! file.exists(data_file_local)) { - download.file(url, data_file_local, mode="wb") + download.file(url, data_file_local, mode = "wb") } # read, select and clean data -ray_raw <- read_spss("source__1996survey.sav") +ray_raw <- haven::read_spss(data_file_local) ray <- ray_raw %>% select(NATID:CMPCODE) %>% # select party information mutate_at(vars(PARTY, ENAME, NAME), str_trim) %>% # trim white space diff --git a/import/readme.md b/import/readme.md index 6d6262f..4349482 100644 --- a/import/readme.md +++ b/import/readme.md @@ -22,8 +22,8 @@ Party Facts import requires ISO3 country codes. `country.csv` includes the respe ```r # Example country recoding from Marpor import marpor <- marpor %>% - mutate(country = countrycode(countryname, 'country.name', 'iso3c', - custom_match = c(`Northern Ireland`='NIR'))) + mutate(country = countrycode(countryname, "country.name", "iso3c", + custom_match = c(`Northern Ireland`="NIR"))) if(any(is.na(marpor$country))) { warning("Country name clean-up needed") } diff --git a/import/thomas/thomas.R b/import/thomas/thomas.R index 5368cad..909e0bd 100644 --- a/import/thomas/thomas.R +++ b/import/thomas/thomas.R @@ -1,9 +1,9 @@ -library("dplyr") +library(tidyverse) +library(countrycode) -thomas_raw <- read.csv("thomas-parties.csv", fileEncoding = "utf-8", as.is = TRUE) -country_raw <- read.csv("../country.csv", fileEncoding = "utf-8", as.is = TRUE) +thomas_raw <- read_csv("thomas-parties.csv") -# remove 'status quo' information +# remove "status quo" information thomas <- thomas_raw %>% filter(party != "Status quo") # replace UK and RUS with proper country names @@ -13,14 +13,13 @@ thomas$country[thomas$country == "USSR"] <- "Russia" # aggregate parties first and last year thomas <- thomas %>% group_by(country, party) %>% - summarise(year_first = min(time), year_last = max(time)) %>% + summarise(year_first = min(time), year_last = max(time)) %>% ungroup() # get country name iso shortcut -country <- country_raw %>% select(country_short = name_short, country = name) thomas <- thomas %>% - left_join(country) %>% - arrange(country, party) %>% + mutate(country_short = countrycode(country, "country.name", "iso3c")) %>% + arrange(country, party) %>% mutate(id = row_number()) -write.csv(thomas, "thomas.csv", na = "", row.names = FALSE, fileEncoding = "utf-8") +write_csv(thomas, "thomas.csv", na = "")