Clean variable labels and fix spelling according to a wordlist

clean_variables(
  x,
  sep = "_",
  wordlists = NULL,
  spelling_vars = 3,
  sort_by = NULL,
  protect = FALSE,
  classes = NULL,
  warn_spelling = FALSE
)

Arguments

x

a data.frame

sep

The separator used between words. Defaults to the underscore (_).

wordlists

a data frame or named list of data frames with at least two columns defining the word list to be used. If this is a data frame, a third column must be present to split the wordlists by column in x (see spelling_vars).

spelling_vars

character or integer. If wordlists is a data frame, then this column of wordlists defines the columns in x corresponding to each section of the wordlists data frame. This defaults to 3, indicating that the third column is to be used.

sort_by

a character indicating the column to be used for sorting the values in each data frame. If the incoming variables are factors, this determines how the resulting factors will be sorted.

protect

a logical or numeric vector defining the columns to protect from any manipulation. Note: columns in protect will override any columns in either force_Date or guess_dates.

classes

a vector of class definitions for each of the columns. If this is not provided, the classes will be read from the columns themselves. Practically, this is used in clean_data() to mark columns as protected.

warn_spelling

if TRUE, errors and warnings from clean_spelling() will be aggregated and presented for each column that issues them. The default value is FALSE, which means that all errors and warnings will be ignored.

See also

clean_variable_labels() to standardise text, clean_variable_spelling() to correct spelling with a wordlist.

Examples

## make toy data
toy_data <- messy_data(20)
# location data with mis-spellings, French, and English.
messy_locations <- c("hopsital", "h\u00f4pital", "hospital", "m\u00e9dical", "clinic", "feild", "field")
toy_data$location <- sample(messy_locations, 20, replace = TRUE)
## show data
toy_data
#> 'ID Date of Onset. DisCharge.. GENDER_ Épi.Case_définition #> 1 meaujd 2018-01-08 18/01/2018 Female confirmed #> 2 xuwqoq 2018-01-08 18/01/2018 Female Not.a.Case #> 3 uwchlo 2018-01-11 21/01/2018 FEMALE suspected #> 4 slmlvf 2018-01-09 19/01/2018 FEMALE suspected #> 5 hnsedx 2018-01-02 12/01/2018 Female suspected #> 6 olsggh 2018-01-03 13/01/2018 male probable #> 7 llhdyf 2018-01-05 15/01/2018 female confirmed #> 8 rfzkxl 2018-01-02 12/01/2018 Male Confirmed #> 9 uoqobq 2018-01-07 17/01/2018 female suspected #> 10 cvyrfh 2018-01-07 17/01/2018 female suspected #> 11 ucjywm 2018-01-03 13/01/2018 male suspected #> 12 puxjtn 2018-01-07 17/01/2018 Female suspected #> 13 wtzflo 2018-01-02 12/01/2018 Female Not.a.Case #> 14 affkrl 2018-01-04 14/01/2018 Male PROBABLE #> 15 pfxvpc 2018-01-03 13/01/2018 male Confirmed #> 16 whwufd 2018-01-11 21/01/2018 female PROBABLE #> 17 tidkky 2018-01-02 12/01/2018 male Not.a.Case #> 18 ylhnok 2018-01-11 21/01/2018 MALE suspected #> 19 dxbwut 2018-01-04 14/01/2018 Male PROBABLE #> 20 pwntga 2018-01-06 16/01/2018 female PROBABLE #> messy/dates lat lon location #> 1 that's 24/12/1989! 15.054111 47.29901 clinic #> 2 2018_10_17 12.850681 49.89783 médical #> 3 that's 24/12/1989! 12.025176 49.11427 hôpital #> 4 2018 10 19 12.203133 48.84696 field #> 5 <NA> 11.798453 48.39087 feild #> 6 <NA> 12.580328 49.05366 hospital #> 7 2018 10 19 13.971013 47.44452 médical #> 8 01-12-2001 15.857938 47.37059 field #> 9 <NA> 12.040089 48.30336 field #> 10 <NA> 15.590397 47.96197 hospital #> 11 male 13.330148 49.76273 clinic #> 12 01-12-2001 13.650994 48.61463 hopsital #> 13 that's 24/12/1989! 11.866555 46.33941 hôpital #> 14 that's 24/12/1989! 10.898856 46.95154 hospital #> 15 female 11.107387 47.07064 hôpital #> 16 female 12.849640 49.54101 clinic #> 17 2018 10 19 13.375853 48.17929 clinic #> 18 01-12-2001 15.498215 46.65506 feild #> 19 2018-10-18 11.902546 47.25110 médical #> 20 male 9.579338 47.01058 médical
# clean labels
clean_variables(toy_data) # by default, it's the same as clean_variable_labels
#> 'ID Date of Onset. DisCharge.. GENDER_ Épi.Case_définition #> 1 meaujd 2018-01-08 18_01_2018 female confirmed #> 2 xuwqoq 2018-01-08 18_01_2018 female not_a_case #> 3 uwchlo 2018-01-11 21_01_2018 female suspected #> 4 slmlvf 2018-01-09 19_01_2018 female suspected #> 5 hnsedx 2018-01-02 12_01_2018 female suspected #> 6 olsggh 2018-01-03 13_01_2018 male probable #> 7 llhdyf 2018-01-05 15_01_2018 female confirmed #> 8 rfzkxl 2018-01-02 12_01_2018 male confirmed #> 9 uoqobq 2018-01-07 17_01_2018 female suspected #> 10 cvyrfh 2018-01-07 17_01_2018 female suspected #> 11 ucjywm 2018-01-03 13_01_2018 male suspected #> 12 puxjtn 2018-01-07 17_01_2018 female suspected #> 13 wtzflo 2018-01-02 12_01_2018 female not_a_case #> 14 affkrl 2018-01-04 14_01_2018 male probable #> 15 pfxvpc 2018-01-03 13_01_2018 male confirmed #> 16 whwufd 2018-01-11 21_01_2018 female probable #> 17 tidkky 2018-01-02 12_01_2018 male not_a_case #> 18 ylhnok 2018-01-11 21_01_2018 male suspected #> 19 dxbwut 2018-01-04 14_01_2018 male probable #> 20 pwntga 2018-01-06 16_01_2018 female probable #> messy/dates lat lon location #> 1 that_s_24_12_1989 15.054111 47.29901 clinic #> 2 2018_10_17 12.850681 49.89783 medical #> 3 that_s_24_12_1989 12.025176 49.11427 hopital #> 4 2018_10_19 12.203133 48.84696 field #> 5 <NA> 11.798453 48.39087 feild #> 6 <NA> 12.580328 49.05366 hospital #> 7 2018_10_19 13.971013 47.44452 medical #> 8 01_12_2001 15.857938 47.37059 field #> 9 <NA> 12.040089 48.30336 field #> 10 <NA> 15.590397 47.96197 hospital #> 11 male 13.330148 49.76273 clinic #> 12 01_12_2001 13.650994 48.61463 hopsital #> 13 that_s_24_12_1989 11.866555 46.33941 hopital #> 14 that_s_24_12_1989 10.898856 46.95154 hospital #> 15 female 11.107387 47.07064 hopital #> 16 female 12.849640 49.54101 clinic #> 17 2018_10_19 13.375853 48.17929 clinic #> 18 01_12_2001 15.498215 46.65506 feild #> 19 2018_10_18 11.902546 47.25110 medical #> 20 male 9.579338 47.01058 medical
# add a wordlist
wordlist <- data.frame(
  from = c("hopsital", "hopital", "medical", "feild"),
  to = c("hospital", "hospital", "clinic", "field"),
  variable = rep("location", 4),
  stringsAsFactors = FALSE
)
clean_variables(toy_data,
  wordlists = wordlist,
  spelling_vars = "variable"
)
#> 'ID Date of Onset. DisCharge.. GENDER_ Épi.Case_définition #> 1 meaujd 2018-01-08 18_01_2018 female confirmed #> 2 xuwqoq 2018-01-08 18_01_2018 female not_a_case #> 3 uwchlo 2018-01-11 21_01_2018 female suspected #> 4 slmlvf 2018-01-09 19_01_2018 female suspected #> 5 hnsedx 2018-01-02 12_01_2018 female suspected #> 6 olsggh 2018-01-03 13_01_2018 male probable #> 7 llhdyf 2018-01-05 15_01_2018 female confirmed #> 8 rfzkxl 2018-01-02 12_01_2018 male confirmed #> 9 uoqobq 2018-01-07 17_01_2018 female suspected #> 10 cvyrfh 2018-01-07 17_01_2018 female suspected #> 11 ucjywm 2018-01-03 13_01_2018 male suspected #> 12 puxjtn 2018-01-07 17_01_2018 female suspected #> 13 wtzflo 2018-01-02 12_01_2018 female not_a_case #> 14 affkrl 2018-01-04 14_01_2018 male probable #> 15 pfxvpc 2018-01-03 13_01_2018 male confirmed #> 16 whwufd 2018-01-11 21_01_2018 female probable #> 17 tidkky 2018-01-02 12_01_2018 male not_a_case #> 18 ylhnok 2018-01-11 21_01_2018 male suspected #> 19 dxbwut 2018-01-04 14_01_2018 male probable #> 20 pwntga 2018-01-06 16_01_2018 female probable #> messy/dates lat lon location #> 1 that_s_24_12_1989 15.054111 47.29901 clinic #> 2 2018_10_17 12.850681 49.89783 clinic #> 3 that_s_24_12_1989 12.025176 49.11427 hospital #> 4 2018_10_19 12.203133 48.84696 field #> 5 <NA> 11.798453 48.39087 field #> 6 <NA> 12.580328 49.05366 hospital #> 7 2018_10_19 13.971013 47.44452 clinic #> 8 01_12_2001 15.857938 47.37059 field #> 9 <NA> 12.040089 48.30336 field #> 10 <NA> 15.590397 47.96197 hospital #> 11 male 13.330148 49.76273 clinic #> 12 01_12_2001 13.650994 48.61463 hospital #> 13 that_s_24_12_1989 11.866555 46.33941 hospital #> 14 that_s_24_12_1989 10.898856 46.95154 hospital #> 15 female 11.107387 47.07064 hospital #> 16 female 12.849640 49.54101 clinic #> 17 2018_10_19 13.375853 48.17929 clinic #> 18 01_12_2001 15.498215 46.65506 field #> 19 2018_10_18 11.902546 47.25110 clinic #> 20 male 9.579338 47.01058 clinic