The hardware and bandwidth for this mirror are donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.
📘 Website • Articles • Reference
FakeDataR makes safe, synthetic stand‑ins for real datasets.
It mirrors types, factor levels, and NA/blank rates, and it handles sensitive
columns (IDs, emails, phones) via "fake" or "drop" strategies. You can also
generate fake tables from a database schema without reading any rows.
# install.packages("devtools")
devtools::install_github("zobaer09/FakeDataR")
Generate a fake dataset that matches the structure of a real one:
library(FakeDataR)
fake_mtc <- generate_fake_data(mtcars, n = 200, seed = 1)
head(fake_mtc)
#> mpg cyl disp hp drat wt qsec vs
#> 1 16.63945 5.070033 335.2440 282.43325 4.623352 3.588993 17.57882 0.2396067
#> 2 19.14491 4.874581 145.2945 314.84395 2.834732 4.191491 20.72770 0.6477649
#> 3 23.86205 6.067187 453.7102 93.73714 4.867064 3.012021 22.34145 0.9756708
#> 4 31.74288 5.075802 431.0475 264.19953 4.376889 5.247958 20.15496 0.3779988
#> 5 15.13953 4.724673 449.4281 328.11103 3.352964 1.975893 20.39140 0.4641441
#> 6 31.51216 6.074305 361.2276 327.86627 4.229320 1.665920 21.62005 0.8122963
#> am gear carb
#> 1 0.13853856 3.123219 7.102635
#> 2 0.04752457 3.710643 7.770379
#> 3 0.03391887 4.154076 7.068414
#> 4 0.91608902 4.070063 4.064007
#> 5 0.84020039 4.208546 2.343565
#> 6 0.17887142 3.972298 1.576061
Validate that classes and basic patterns were preserved:
validate_fake(mtcars, fake_mtc)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 mpg numeric numeric TRUE 0 0
#> 2 cyl numeric numeric TRUE 0 0
#> 3 disp numeric numeric TRUE 0 0
#> 4 hp numeric numeric TRUE 0 0
#> 5 drat numeric numeric TRUE 0 0
#> 6 wt numeric numeric TRUE 0 0
#> 7 qsec numeric numeric TRUE 0 0
#> 8 vs numeric numeric TRUE 0 0
#> 9 am numeric numeric TRUE 0 0
#> 10 gear numeric numeric TRUE 0 0
#> 11 carb numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE NA NA NA
#> 3 TRUE NA NA NA
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> 6 TRUE NA NA NA
#> 7 TRUE NA NA NA
#> 8 TRUE NA NA NA
#> 9 TRUE NA NA NA
#> 10 TRUE NA NA NA
#> 11 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 TRUE
#> 7 TRUE
#> 8 TRUE
#> 9 TRUE
#> 10 TRUE
#> 11 TRUE
Preserve factor levels & numeric ranges (example with palmerpenguins):
if (requireNamespace("palmerpenguins", quietly = TRUE)) {
  peng <- na.omit(palmerpenguins::penguins[, c("species","island","bill_length_mm","sex")])
  fake_peng <- generate_fake_data(peng, n = 400, seed = 11, category_mode = "preserve")
  validate_fake(peng, fake_peng)
}
#> column class_original class_fake class_match na_prop_original
#> 1 species factor factor TRUE 0
#> 2 island factor factor TRUE 0
#> 3 bill_length_mm numeric numeric TRUE 0
#> 4 sex factor factor TRUE 0
#> na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1 0 TRUE 0 0 TRUE
#> 2 0 TRUE 0 0 TRUE
#> 3 0 TRUE NA NA NA
#> 4 0 TRUE 0 0 TRUE
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
#> 4 NA
Detect and handle PII by name. Use strategy "fake" (default) or "drop":
df <- data.frame(
  id = 1:50,
  email = sprintf("user%03d@corp.com", 1:50),
  phone = sprintf("(415) 555-%04d", 1:50),
  spend = runif(50, 10, 500)
)
# Keep the columns but replace values with synthetic ones
fake_keep <- generate_fake_data(
  df, n = 80, seed = 12,
  sensitive_detect = TRUE, sensitive_strategy = "fake"
)
intersect(df$email, fake_keep$email) # should be character(0)
#> character(0)
# Drop sensitive columns entirely
fake_drop <- generate_fake_data(
  df, n = 80, seed = 13,
  sensitive_detect = TRUE, sensitive_strategy = "drop"
)
names(fake_drop) # no id/email/phone
#> [1] "spend"
Create fake rows using only the schema (types, nullability, etc.). This works even when you cannot access the underlying data.
library(DBI); library(RSQLite)
con <- dbConnect(RSQLite::SQLite(), ":memory:")
dbExecute(con, "
CREATE TABLE employees (
id INTEGER,
email TEXT,
phone TEXT,
is_active BOOLEAN,
hired_at TIMESTAMP,
salary NUMERIC,
dept TEXT
)
")
sch <- schema_from_db(con, "employees")
fake <- generate_fake_from_schema(sch, n = 50, seed = 14)
str(fake$hired_at) # POSIXct
dbDisconnect(con)
Write fake data to common formats (CSV, RDS, Parquet) and optionally produce an LLM bundle containing a schema and README.
# CSV / RDS
export_fake(fake_mtc, file.path(tempdir(), "fake_mtc.csv"))
export_fake(fake_mtc, file.path(tempdir(), "fake_mtc.rds"))
# Parquet (requires the 'arrow' package)
# install.packages("arrow")
export_fake(fake_mtc, file.path(tempdir(), "fake_mtc.parquet"))
# End-to-end bundle
b <- llm_bundle(
  mtcars, n = 200, seed = 10, level = "high",
  formats = c("csv","rds"),
  path = tempdir(), filename = "mtcars_fake",
  write_prompt = TRUE, zip = TRUE
)
b$zip_path
All generators respect seed. Given the same inputs and a
fixed package version, results are reproducible:
a1 <- generate_fake_data(CO2, n = 123, seed = 42)
a2 <- generate_fake_data(CO2, n = 123, seed = 42)
identical(a1, a2)
#> [1] TRUE
Issues and pull requests are welcome! If you find a bug or have a feature request, please open an issue on GitHub.
This package is distributed under the terms of the license included
in the repository (see LICENSE).
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.