|
1 | 1 | #' Edit dataset to fit plotting |
2 | 2 | #' |
3 | 3 | #' @details |
4 | | -#' This function edits the data files (from openESM). Technically, the openesm::get_data function should read the files. However, because the function fails to connect to zenodo, some datasets have been manually saved in the package. To make the plotting smooth, this function cleans the files further for analysis. Load additional datasets, go to the dedicated zenodo webpage from openESM, copy the link for the dataset, and add the dataset manually in the app. If the format is correct, the analysis should still work. Make sure to NOT pick the raw data. |
5 | | -#' @param df The data frame. Note that the df should be in wide format. |
6 | | -#' @param id_col Provide the index for which column the participant ID is in. If there is no participant ID (the data frame is for only one participant), write "none". |
7 | | -#' @param day_col Index number for the column containing the days. |
| 4 | +#' This function edits data files (from openESM). Technically, |
| 5 | +#' the openesm::get_data function should read the files. However, |
| 6 | +#' because the function fails to connect to zenodo, some datasets have been |
| 7 | +#' manually saved in the package. To make the plotting smooth, this function |
| 8 | +#' cleans the files further for analysis. To load additional datasets, go to the |
| 9 | +#' dedicated zenodo webpage from openESM, copy the link for the dataset, and |
| 10 | +#' paste it in the app (in the data tab under the option "paste link"). You will |
| 11 | +#' also need to include some additional information from the dedicated openESM |
| 12 | +#' page. If the format is correct, the analysis should still work. Make sure to |
| 13 | +#' NOT pick the raw data. You can also add your own df as long as it is a csv |
| 14 | +#' and the columns have the correct names. |
| 15 | +#' The function also adds a column indicating whether a beep has been missed. |
| 16 | +#' @param df The data frame. |
| 17 | +#' @param id_col Name of the id column. |
| 18 | +#' @param day_col Name of day column. |
8 | 19 | #' @param exp_day Expected days of the study. |
9 | | -#' @param beep_col Index number of the column containing the beeps for the day. |
| 20 | +#' @param beep_col Name of beep column. |
10 | 21 | #' @param exp_beep Expected beeps per day. |
11 | | -#' @param variables A vector containing the names of all columns to be analysed. Each variable must be within quotation marks. |
| 22 | +#' @param variables A vector containing the names of all columns to be analysed. |
| 23 | +#' Each variable must be within quotation marks. |
12 | 24 | #' @import dplyr |
13 | | -#' @returns Creates a data file in the data folder of the package. |
| 25 | +#' @return Data frame with all possible days and beeps and with a column indicating |
| 26 | +#' whether a beep was missed. |
14 | 27 | #' @export |
15 | 28 | #' @examples |
16 | 29 | #' \dontrun{ |
17 | 30 | #' |
18 | 31 | #' "menghini_2023_orig" <- readr::read_tsv("https://zenodo.org/records/17347538/files/0022_menghini_ts.tsv?download=1") |
19 | 32 | #' names <- colnames(menghini_2023_orig)[c(9:17,22:28)] |
20 | | -#' menghini <- edit_df(df = menghini_2023_orig, id_col = 1, day_col = 3, exp_day = 3, beep_col = 5, exp_beep = 7, variables = names) |
21 | | -#' menghini_2023 <- menghini[[1]] |
22 | | -#' usethis::use_data(menghini_2023) |
23 | | -#' missing_menghini_2023 <- menghini[[2]] |
24 | | -#' usethis::use_data(missing_menghini_2023) |
| 33 | +#' |
| 34 | +#' menghini_2023 <- clean_df( |
| 35 | +#' df = menghini_2023_orig, |
| 36 | +#' id_col = "id", |
| 37 | +#' day_col = "day", |
| 38 | +#' exp_day = 3, |
| 39 | +#' beep_col = "beep", |
| 40 | +#' exp_beep = 7, |
| 41 | +#' variables = names |
| 42 | +#' ) |
25 | 43 | #' } |
26 | 44 |
|
27 | | -edit_df <- function(df, id_col, day_col, exp_day, beep_col, exp_beep, variables){ |
28 | | - |
29 | | - time <- numeric() |
30 | | - |
31 | | - missing <- data.frame( |
32 | | - id = c(), |
33 | | - beeps = numeric(), |
34 | | - days = numeric() |
35 | | - ) |
36 | | - |
37 | | - participants <- unique(df[[id_col]]) |
38 | | - |
39 | | - for(i in 1:length(participants)){ |
40 | | - sub_df <- df[df[[id_col]]==participants[i],] |
41 | | - |
42 | | - expected_pairs <- expand.grid( |
43 | | - beeps = 1:exp_beep, |
44 | | - days = 1:exp_day |
45 | | - ) |
46 | | - |
47 | | - actual_pairs <- data.frame( |
48 | | - beeps = sub_df[[beep_col]], |
49 | | - days = sub_df[[day_col]] |
50 | | - ) |
51 | | - miss <- anti_join(expected_pairs, actual_pairs, by = c("beeps", "days")) |
52 | 45 |
|
53 | | - add <- data.frame( |
54 | | - id = rep(participants[i], times=nrow(miss)), |
55 | | - days = miss$days, |
56 | | - beeps = miss$beeps |
| 46 | +clean_df <- function(df, |
| 47 | + id_col = "id", |
| 48 | + day_col = "day", |
| 49 | + exp_day, |
| 50 | + beep_col = "beep", |
| 51 | + exp_beep, |
| 52 | + variables){ |
| 53 | + |
| 54 | + # Remove any rows where id, day, or beep is missing a value |
| 55 | + df <- df %>% |
| 56 | + select(any_of(c(id_col, day_col, beep_col, variables))) %>% |
| 57 | + filter( |
| 58 | + !is.na(.data[[id_col]]), |
| 59 | + !is.na(.data[[day_col]]), |
| 60 | + !is.na(.data[[beep_col]]) |
57 | 61 | ) |
58 | 62 |
|
59 | | - missing <- rbind(missing, add) |
| 63 | + # Only select defined variables |
| 64 | + df <- df %>% |
| 65 | + select(any_of(c(id_col, day_col, beep_col, variables))) |
60 | 66 |
|
61 | | - time <- c(time,1:nrow(sub_df)) |
62 | 67 |
|
63 | | - } |
| 68 | + # Change variable names in case the original used different ones |
| 69 | + df_std <- df %>% |
| 70 | + rename( |
| 71 | + id = all_of(id_col), |
| 72 | + day = all_of(day_col), |
| 73 | + beep = all_of(beep_col) |
| 74 | + ) |
64 | 75 |
|
65 | 76 |
|
66 | | - new_df <- data.frame( |
67 | | - ID = df[,id_col], |
68 | | - time = time, |
69 | | - day = df[,day_col], |
70 | | - beep = df[,beep_col] |
71 | | - ) |
| 77 | + # Add rows for missing day x beep combinations, filled with NA |
| 78 | + new_df <- df_std %>% |
| 79 | + complete( |
| 80 | + id, |
| 81 | + day = 1:exp_day, |
| 82 | + beep = 1:exp_beep |
| 83 | + ) %>% |
| 84 | + left_join(df_std, by = c("id", "day", "beep", variables)) |
72 | 85 |
|
73 | | - new_var <- paste0("var_", variables) |
74 | 86 |
|
75 | | - for(i in 1:length(new_var)){ |
76 | | - new_df[new_var[i]] <- df[[variables[i]]] |
77 | | - } |
| 87 | + # Create a variable that indicates whether a row has an NA in any analysed variable |
| 88 | + new_df <- new_df %>% |
| 89 | + mutate( |
| 90 | + missing = if_any(all_of(variables), is.na) |
| 91 | + ) |
78 | 92 |
|
79 | 93 |
|
80 | | - dfs <- list(new_df, missing) |
81 | | - return(dfs) |
| 94 | + # Order data by participant, then day, then beep |
| 95 | + new_df <- new_df %>% |
| 96 | + arrange(id, day, beep) %>% |
| 97 | + relocate(missing, .after = beep) |
82 | 98 |
|
| 99 | + return(new_df) |
83 | 100 | } |
84 | | - |
|
0 commit comments