Goal
We validate a psychology survey dataset using validate and demonstrate detection, selection/isolation, and correction for:
rationally invalid values (out-of-range)
statistically invalid values (odd response patterns)
duplicated observations
unexplained missing data
Setup
Code
# Install if needed
pkgs <- c ("validate" , "psych" , "dplyr" )
missing <- pkgs[! vapply (pkgs, requireNamespace, logical (1 ), quietly = TRUE )]
if (length (missing) > 0 ) install.packages (missing)
library (validate)
library (psych)
library (dplyr)
rows_violating <- function (cn) {
v <- values (cn)
which (apply (v, 1 , function (x) any (x == FALSE , na.rm = TRUE )))
}
Data
psych::bfi contains Big Five Inventory items (1-6) plus age, gender, education.
Code
items <- c (paste0 ("A" , 1 : 5 ), paste0 ("C" , 1 : 5 ), paste0 ("E" , 1 : 5 ),
paste0 ("N" , 1 : 5 ), paste0 ("O" , 1 : 5 ))
bfi_raw <- psych:: bfi
bfi_dirty <- bfi_raw
# Inject a few issues so the checks are demonstrable
bfi_dirty[1 , "A1" ] <- 9
bfi_dirty[2 , "C3" ] <- 0
bfi_dirty[3 , "age" ] <- - 5
bfi_dirty[4 , "gender" ] <- 3
bfi_dirty[5 , items] <- 1
bfi_dirty[6 , items[1 : 10 ]] <- NA
bfi_dirty <- rbind (bfi_dirty, bfi_dirty[1 , ])
1) Rationally invalid values
Detection with range rules.
Code
# Row-level indicator: all items are within 1..6 or missing
bfi_dirty$ items_in_range <- apply (bfi_dirty[items], 1 , function (x) {
all (x %in% 1 : 6 | is.na (x))
})
rules_range <- validator (
items_in_range == TRUE ,
age >= 16 ,
age <= 100 ,
gender %in% c (1 , 2 ) | is.na (gender),
education %in% 1 : 5 | is.na (education)
)
range_cn <- confront (bfi_dirty, rules_range)
summary (range_cn)
V1
2801
2798
3
0
FALSE
FALSE
items_in_range == TRUE
V2
2801
2713
88
0
FALSE
FALSE
age - 16 >= -1e-08
V3
2801
2801
0
0
FALSE
FALSE
age - 100 <= 1e-08
V4
2801
2800
1
0
FALSE
FALSE
gender %vin% c(1, 2) | is.na(gender)
V5
2801
2801
0
0
FALSE
FALSE
education %vin% 1:5 | is.na(education)
Code
bad_range <- rows_violating (range_cn)
head (bfi_dirty[bad_range, c ("age" , "gender" , "education" , "A1" , "C3" )])
61617
16
1
NA
9
3
61618
18
2
NA
2
0
61620
-5
2
NA
5
4
61621
17
3
NA
4
3
61670
14
2
NA
2
4
61780
14
2
NA
5
6
Correction: set invalid values to NA.
Code
clip_to_na <- function (x, lo, hi) {
x[x < lo | x > hi] <- NA
x
}
bfi_fixed <- bfi_dirty %>%
mutate (
across (all_of (items), ~ clip_to_na (.x, 1 , 6 )),
age = clip_to_na (age, 16 , 100 ),
gender = if_else (gender %in% c (1 , 2 ), gender, NA_real_ ),
education = if_else (education %in% 1 : 5 , education, NA_real_ )
)
2) Statistically invalid values (odd response patterns)
Look for rows with no variation in items, suggesting flat response pattern
Code
bfi_fixed$ row_sd <- apply (bfi_fixed[items], 1 , sd, na.rm = TRUE )
bfi_fixed$ n_missing <- rowSums (is.na (bfi_fixed[items]))
rules_stat <- validator (
row_sd > 0 ,
n_missing <= 5
)
stat_cn <- confront (bfi_fixed, rules_stat)
summary (stat_cn)
V1
2801
2795
6
0
FALSE
FALSE
row_sd > 0
V2
2801
2794
7
0
FALSE
FALSE
n_missing - 5 <= 1e-08
Code
bad_stat <- rows_violating (stat_cn)
head (bfi_fixed[bad_stat, c ("row_sd" , "n_missing" )])
61622
0.000000
0
61623
1.804756
10
62783
0.000000
0
63030
1.779513
15
63991
0.000000
15
64642
0.000000
0
Correction: remove cases with implausible patterns.
Code
bfi_fixed <- bfi_fixed %>%
filter (row_sd > 0 , n_missing <= 5 )
3) Duplicated observations
Detection using exact duplicate rows on items + demographics.
Code
dup_key <- c (items, "age" , "gender" , "education" )
bfi_fixed$ dup_row <- duplicated (bfi_fixed[dup_key])
rules_dup <- validator (! dup_row)
dup_cn <- confront (bfi_fixed, rules_dup)
summary (dup_cn)
Code
bfi_fixed <- bfi_fixed %>%
filter (! dup_row)
4) Unexplained missing data
Detection of missing demographics.
Code
missing_rules <- validator (
! is.na (age),
! is.na (gender)
)
missing_cn <- confront (bfi_fixed, missing_rules)
summary (missing_cn)
V1
2788
2702
86
0
FALSE
FALSE
!is.na(age)
V2
2788
2787
1
0
FALSE
FALSE
!is.na(gender)
Code
bad_missing <- rows_violating (missing_cn)
head (bfi_fixed[bad_missing, c ("age" , "gender" )])
61620
NA
2
61621
17
NA
61670
NA
2
61780
NA
2
62151
NA
2
62360
NA
2
Correction: remove cases with missing key demographics and impute remaining item missings.
Code
bfi_fixed <- bfi_fixed %>%
filter (! is.na (age), ! is.na (gender)) %>%
mutate (across (all_of (items), ~ replace (.x, is.na (.x), median (.x, na.rm = TRUE ))))
Final consistency check
Code
# Recompute indicators after corrections
bfi_fixed$ items_in_range <- apply (bfi_fixed[items], 1 , function (x) {
all (x %in% 1 : 6 | is.na (x))
})
bfi_fixed$ row_sd <- apply (bfi_fixed[items], 1 , sd, na.rm = TRUE )
bfi_fixed$ n_missing <- rowSums (is.na (bfi_fixed[items]))
bfi_fixed$ dup_row <- duplicated (bfi_fixed[dup_key])
final_rules <- validator (
items_in_range == TRUE ,
age >= 16 , age <= 100 ,
gender %in% c (1 , 2 ),
education %in% 1 : 5 | is.na (education),
row_sd > 0 ,
n_missing <= 5 ,
! dup_row
)
final_cn <- confront (bfi_fixed, final_rules)
summary (final_cn)
V1
2701
2701
0
0
FALSE
FALSE
items_in_range == TRUE
V2
2701
2701
0
0
FALSE
FALSE
age - 16 >= -1e-08
V3
2701
2701
0
0
FALSE
FALSE
age - 100 <= 1e-08
V4
2701
2701
0
0
FALSE
FALSE
gender %vin% c(1, 2)
V5
2701
2701
0
0
FALSE
FALSE
education %vin% 1:5 | is.na(education)
V6
2701
2701
0
0
FALSE
FALSE
row_sd > 0
V7
2701
2701
0
0
FALSE
FALSE
n_missing - 5 <= 1e-08
This compact workflow uses detection (rules + confront), isolation (violating), and correction (recoding to NA, removal of implausible cases, and simple imputation) to achieve consistent data.