This vignette demonstrates how to detect input languages and translate only non-English text, integrating seamlessly with tidyverse workflows. This approach is particularly useful for cleaning multilingual datasets and processing mixed-language content efficiently.
Language detection and conditional translation allow you to identify the language of each text entry, translate only the entries that are not already in your target language, and keep the original and translated values side by side for downstream analysis.
Let’s start with simple language detection:
# Load the packages used throughout this vignette
library(polyglotr)
library(tibble)
library(dplyr)
library(purrr)

# Sample texts in different languages
sample_texts <- c(
  "Hello, how are you today?",      # English
  "Bonjour, comment allez-vous?",   # French
  "Hola, ¿cómo estás hoy?",         # Spanish
  "Guten Tag, wie geht es Ihnen?",  # German
  "Ciao, come stai oggi?"           # Italian
)
# Detect languages
detected_languages <- sapply(sample_texts, language_detect)
print("Detected languages:")
print(detected_languages)
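Detection backends differ in what they return (lower-case ISO 639-1 codes such as "en" versus full language names), so it can pay to normalise the output before comparing it. The helper below is an illustrative sketch, not part of the package API, and its lookup table is an assumption you can extend as needed:
# Hypothetical helper: normalise detector output to lower-case ISO 639-1 codes
normalize_lang_code <- function(lang) {
  lang <- tolower(trimws(lang))
  # Map a few full names to codes (illustrative assumption, extend as needed)
  name_map <- c(english = "en", french = "fr", spanish = "es",
                german = "de", italian = "it")
  unname(ifelse(lang %in% names(name_map), name_map[lang], lang))
}
normalize_lang_code(detected_languages)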
Create a function that only translates non-English text:
translate_if_not_english <- function(text, target_language = "en") {
  # Detect language of the input text
  detected_lang <- language_detect(text)

  # Check if the detected language is English; anchoring the pattern with
  # "^en" avoids false matches on the "en" inside codes or names like "french"
  is_english <- grepl("^en", detected_lang, ignore.case = TRUE)

  if (is_english) {
    # Return original text if already English
    return(list(
      original = text,
      translated = text,
      was_translated = FALSE,
      detected_language = detected_lang
    ))
  } else {
    # Translate to English if not English
    translated_text <- google_translate(
      text,
      target_language = target_language,
      source_language = "auto"
    )
    return(list(
      original = text,
      translated = translated_text,
      was_translated = TRUE,
      detected_language = detected_lang
    ))
  }
}
# Test the function
test_text_fr <- "Bonjour, j'aimerais acheter un billet."
result <- translate_if_not_english(test_text_fr)
print("Conditional translation result:")
print(paste("Original:", result$original))
print(paste("Translated:", result$translated))
print(paste("Was translated:", result$was_translated))
print(paste("Detected language:", result$detected_language))
Here’s a practical example with a tibble containing mixed-language rows:
# Create a dataset with mixed languages (typical of user-generated content)
mixed_data <- tibble(
  id = 1:8,
  user_feedback = c(
    "Great product, very satisfied!",                # English
    "Excelente producto, muy satisfecho!",           # Spanish
    "Produit fantastique, je le recommande!",        # French
    "This service exceeded my expectations.",        # English
    "Der Service war wirklich hervorragend.",        # German
    "Servizio eccellente, davvero impressionante!",  # Italian
    "The delivery was fast and reliable.",           # English
    "La livraison était rapide et fiable."           # French
  ),
  rating = c(5, 5, 4, 5, 4, 5, 4, 4),
  category = rep(c("product", "service"), 4)
)
print("Original mixed-language dataset:")
print(mixed_data)
Now let’s detect languages and conditionally translate:
# Function to process each text entry
process_feedback <- function(text) {
  result <- translate_if_not_english(text)
  tibble(
    original_text = result$original,
    english_text = result$translated,
    was_translated = result$was_translated,
    detected_language = result$detected_language
  )
}
# Apply to all feedback entries
processed_results <- purrr::map_dfr(mixed_data$user_feedback, process_feedback)
# Combine with original data
enhanced_data <- bind_cols(mixed_data, processed_results)
print("Enhanced dataset with language detection and translation:")
print(enhanced_data)
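A quick dplyr summary shows how the detected languages break down and how many entries needed translation:
# Count entries per detected language and translation status
enhanced_data %>%
  count(detected_language, was_translated)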
For more sophisticated data processing workflows:
library(stringr)

# Enhanced processing function with more details
enhanced_language_processing <- function(df, text_column) {
  df %>%
    mutate(
      # Detect language for each text entry, falling back to "unknown" on error
      detected_lang = map_chr(
        !!rlang::sym(text_column),
        ~ tryCatch(language_detect(.x), error = function(e) "unknown")
      ),
      # Determine if translation is needed (anchor the pattern to the start
      # so only English codes such as "en" match)
      needs_translation = !str_detect(detected_lang, "^en"),
      # Translate only non-English text, keeping the original on failure
      english_text = map2_chr(
        !!rlang::sym(text_column), needs_translation,
        ~ if (.y) {
          tryCatch(
            google_translate(.x, target_language = "en"),
            error = function(e) .x
          )
        } else {
          .x
        }
      ),
      # Add translation status
      translation_status = case_when(
        detected_lang == "unknown" ~ "detection_failed",
        !needs_translation ~ "already_english",
        english_text != !!rlang::sym(text_column) ~ "translated",
        TRUE ~ "translation_failed"
      )
    )
}
# Apply enhanced processing
result_data <- enhanced_language_processing(mixed_data, "user_feedback")
print("Advanced processing results:")
print(result_data %>% select(id, detected_lang, needs_translation, translation_status))
Process large datasets efficiently by filtering and batching:
# Create larger sample dataset
large_dataset <- tibble(
  id = 1:20,
  content = c(
    # Mix of English and non-English content
    "Amazing service quality",            # EN
    "Fantástico servicio al cliente",     # ES
    "Service client exceptionnel",        # FR
    "Great user experience",              # EN
    "Esperienza utente eccellente",       # IT
    "Ausgezeichnete Benutzerführung",     # DE
    "Fast shipping and delivery",         # EN
    "Livraison rapide et efficace",       # FR
    "Excellent product quality",          # EN
    "Qualità del prodotto superiore",     # IT
    "Easy to use interface",              # EN
    "Interfaz muy fácil de usar",         # ES
    "Highly recommend this product",      # EN
    "Je recommande vivement ce produit",  # FR
    "Outstanding customer support",       # EN
    "Soporte al cliente sobresaliente",   # ES
    "Very satisfied with purchase",       # EN
    "Sehr zufrieden mit dem Kauf",        # DE
    "Will definitely buy again",          # EN
    "Sicuramente acquisterò di nuovo"     # IT
  ),
  timestamp = Sys.time() + sample(-1000:1000, 20),
  priority = sample(c("high", "medium", "low"), 20, replace = TRUE)
)
# Efficient batch processing workflow
batch_process_languages <- function(df, text_col, batch_size = 5) {
  # First, detect languages for all entries (failures default to "en",
  # which leaves the text untranslated)
  df_with_detection <- df %>%
    mutate(
      row_id = row_number(),
      detected_lang = map_chr(
        !!rlang::sym(text_col),
        ~ tryCatch(language_detect(.x), error = function(e) "en")
      ),
      is_english = str_detect(detected_lang, "^en")
    )

  # Separate English and non-English content
  english_content <- df_with_detection %>% filter(is_english)
  non_english_content <- df_with_detection %>% filter(!is_english)

  # Translate non-English content, tagging each row with a batch id
  if (nrow(non_english_content) > 0) {
    non_english_content <- non_english_content %>%
      mutate(
        batch_id = ceiling(row_number() / batch_size),
        english_text = map_chr(
          !!rlang::sym(text_col),
          ~ tryCatch(
            google_translate(.x, target_language = "en"),
            error = function(e) .x
          )
        )
      )
  } else {
    non_english_content <- non_english_content %>%
      mutate(batch_id = integer(0), english_text = character(0))
  }

  # For English content, keep original text
  english_content <- english_content %>%
    mutate(
      batch_id = NA_integer_,
      english_text = !!rlang::sym(text_col)
    )

  # Combine results and restore the original row order
  bind_rows(english_content, non_english_content) %>%
    arrange(row_id) %>%
    select(-row_id)
}
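Note that batch_process_languages() tags rows with a batch_id but still translates row by row. If your API enforces rate limits, one hypothetical refinement is to pause between batches; the helper below assumes a fixed one-second delay is acceptable for your quota:
# Hypothetical throttled variant: translate in batches with a pause between them
translate_in_batches <- function(texts, batch_size = 5, pause_sec = 1) {
  batches <- split(texts, ceiling(seq_along(texts) / batch_size))
  translated <- purrr::map(batches, function(batch) {
    out <- purrr::map_chr(batch, ~ tryCatch(
      google_translate(.x, target_language = "en"),
      error = function(e) .x
    ))
    Sys.sleep(pause_sec)  # be polite to the translation API between batches
    out
  })
  unlist(translated, use.names = FALSE)
}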
# Apply batch processing
processed_large <- batch_process_languages(large_dataset, "content", batch_size = 3)
# Summary statistics
summary_stats <- processed_large %>%
  summarise(
    total_entries = n(),
    english_entries = sum(is_english),
    translated_entries = sum(!is_english),
    translation_rate = mean(!is_english),
    unique_languages = n_distinct(detected_lang)
  )
print("Processing summary:")
print(summary_stats)
print("Sample of processed data:")
print(processed_large %>%
select(id, detected_lang, is_english, content, english_text) %>%
head(10))
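A per-language count gives a quick sense of the dataset's composition:
# How many entries were detected per language?
processed_large %>%
  count(detected_lang, sort = TRUE)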
Language detection and conditional translation provide powerful tools for cleaning and standardizing multilingual datasets. By integrating with tidyverse workflows, you can efficiently process mixed-language content, enabling consistent analysis and insights across diverse linguistic data sources.