Compact Data Validation with validate (Psychology Survey Example)

Goal

We validate a psychology survey dataset using validate and demonstrate detection, selection/isolation, and correction for:

rationally invalid values (out-of-range)
statistically invalid values (odd response patterns)
duplicated observations
unexplained missing data

Setup

Code

# Install if needed
pkgs <- c("validate", "psych", "dplyr")
missing <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
if (length(missing) > 0) install.packages(missing)

library(validate)
library(psych)
library(dplyr)

rows_violating <- function(cn) {
  v <- values(cn)
  which(apply(v, 1, function(x) any(x == FALSE, na.rm = TRUE)))
}

Data

psych::bfi contains Big Five Inventory items (1-6) plus age, gender, education.

Code

items <- c(paste0("A", 1:5), paste0("C", 1:5), paste0("E", 1:5),
           paste0("N", 1:5), paste0("O", 1:5))

bfi_raw <- psych::bfi
bfi_dirty <- bfi_raw

# Inject a few issues so the checks are demonstrable
bfi_dirty[1, "A1"] <- 9
bfi_dirty[2, "C3"] <- 0
bfi_dirty[3, "age"] <- -5
bfi_dirty[4, "gender"] <- 3
bfi_dirty[5, items] <- 1
bfi_dirty[6, items[1:10]] <- NA
bfi_dirty <- rbind(bfi_dirty, bfi_dirty[1, ])

1) Rationally invalid values

Detection with range rules.

Code

# Row-level indicator: all items are within 1..6 or missing
bfi_dirty$items_in_range <- apply(bfi_dirty[items], 1, function(x) {
  all(x %in% 1:6 | is.na(x))
})

rules_range <- validator(
  items_in_range == TRUE,
  age >= 16,
  age <= 100,
  gender %in% c(1, 2) | is.na(gender),
  education %in% 1:5 | is.na(education)
)

range_cn <- confront(bfi_dirty, rules_range)
summary(range_cn)

name	items	passes	fails	error	warning	expression
V1	2801	2798	3	FALSE	FALSE	items_in_range == TRUE
V2	2801	2713	88	FALSE	FALSE	age - 16 >= -1e-08
V3	2801	2801	0	FALSE	FALSE	age - 100 <= 1e-08
V4	2801	2800	1	FALSE	FALSE	gender %vin% c(1, 2) \| is.na(gender)
V5	2801	2801	0	FALSE	FALSE	education %vin% 1:5 \| is.na(education)

Code

bad_range <- rows_violating(range_cn)
head(bfi_dirty[bad_range, c("age", "gender", "education", "A1", "C3")])

	age	gender	education	A1	C3
61617	16	1	NA	9	3
61618	18	2	NA	2	0
61620	-5	2	NA	5	4
61621	17	3	NA	4	3
61670	14	2	NA	2	4
61780	14	2	NA	5	6

Correction: set invalid values to NA.

Code

clip_to_na <- function(x, lo, hi) {
  x[x < lo | x > hi] <- NA
  x
}

bfi_fixed <- bfi_dirty %>%
  mutate(
    across(all_of(items), ~ clip_to_na(.x, 1, 6)),
    age = clip_to_na(age, 16, 100),
    gender = if_else(gender %in% c(1, 2), gender, NA_real_),
    education = if_else(education %in% 1:5, education, NA_real_)
  )

2) Statistically invalid values (odd response patterns)

Look for rows with no variation in items, suggesting flat response pattern

Code

bfi_fixed$row_sd <- apply(bfi_fixed[items], 1, sd, na.rm = TRUE)
bfi_fixed$n_missing <- rowSums(is.na(bfi_fixed[items]))

rules_stat <- validator(
  row_sd > 0,
  n_missing <= 5
)

stat_cn <- confront(bfi_fixed, rules_stat)
summary(stat_cn)

name	items	passes	fails	nNA	error	warning	expression
V1	2801	2795	6	0	FALSE	FALSE	row_sd > 0
V2	2801	2794	7	0	FALSE	FALSE	n_missing - 5 <= 1e-08

Code

bad_stat <- rows_violating(stat_cn)
head(bfi_fixed[bad_stat, c("row_sd", "n_missing")])

	row_sd	n_missing
61622	0.000000	0
61623	1.804756	10
62783	0.000000	0
63030	1.779513	15
63991	0.000000	15
64642	0.000000	0

Correction: remove cases with implausible patterns.

Code

bfi_fixed <- bfi_fixed %>%
  filter(row_sd > 0, n_missing <= 5)

3) Duplicated observations

Detection using exact duplicate rows on items + demographics.

Code

dup_key <- c(items, "age", "gender", "education")
bfi_fixed$dup_row <- duplicated(bfi_fixed[dup_key])

rules_dup <- validator(!dup_row)

dup_cn <- confront(bfi_fixed, rules_dup)
summary(dup_cn)

error	warning

Code

bfi_fixed <- bfi_fixed %>%
  filter(!dup_row)

4) Unexplained missing data

Detection of missing demographics.

Code

missing_rules <- validator(
  !is.na(age),
  !is.na(gender)
)

missing_cn <- confront(bfi_fixed, missing_rules)
summary(missing_cn)

name	items	passes	fails	nNA	error	warning	expression
V1	2788	2702	86	0	FALSE	FALSE	!is.na(age)
V2	2788	2787	1	0	FALSE	FALSE	!is.na(gender)

Code

bad_missing <- rows_violating(missing_cn)
head(bfi_fixed[bad_missing, c("age", "gender")])

	age	gender
61620	NA	2
61621	17	NA
61670	NA	2
61780	NA	2
62151	NA	2
62360	NA	2

Correction: remove cases with missing key demographics and impute remaining item missings.

Code

bfi_fixed <- bfi_fixed %>%
  filter(!is.na(age), !is.na(gender)) %>%
  mutate(across(all_of(items), ~ replace(.x, is.na(.x), median(.x, na.rm = TRUE))))

Final consistency check

Code

# Recompute indicators after corrections
bfi_fixed$items_in_range <- apply(bfi_fixed[items], 1, function(x) {
  all(x %in% 1:6 | is.na(x))
})
bfi_fixed$row_sd <- apply(bfi_fixed[items], 1, sd, na.rm = TRUE)
bfi_fixed$n_missing <- rowSums(is.na(bfi_fixed[items]))
bfi_fixed$dup_row <- duplicated(bfi_fixed[dup_key])

final_rules <- validator(
  items_in_range == TRUE,
  age >= 16, age <= 100,
  gender %in% c(1, 2),
  education %in% 1:5 | is.na(education),
  row_sd > 0,
  n_missing <= 5,
  !dup_row
)

final_cn <- confront(bfi_fixed, final_rules)
summary(final_cn)

name	items	passes	error	warning	expression
V1	2701	2701	FALSE	FALSE	items_in_range == TRUE
V2	2701	2701	FALSE	FALSE	age - 16 >= -1e-08
V3	2701	2701	FALSE	FALSE	age - 100 <= 1e-08
V4	2701	2701	FALSE	FALSE	gender %vin% c(1, 2)
V5	2701	2701	FALSE	FALSE	education %vin% 1:5 \| is.na(education)
V6	2701	2701	FALSE	FALSE	row_sd > 0
V7	2701	2701	FALSE	FALSE	n_missing - 5 <= 1e-08

This compact workflow uses detection (rules + confront), isolation (violating), and correction (recoding to NA, removal of implausible cases, and simple imputation) to achieve consistent data.