## ----include = FALSE----------------------------------------------------------
# Document-wide knitr chunk defaults: collapsed output, "#>" as the output
# prefix, and no chunk evaluation (eval = FALSE).
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", eval = FALSE)

## ----setup--------------------------------------------------------------------
# library(polyglotr)
# library(dplyr)
# library(tibble)
# library(purrr)
# 

## ----basic_detection----------------------------------------------------------
# # Sample texts in different languages
# sample_texts <- c(
#   "Hello, how are you today?",           # English
#   "Bonjour, comment allez-vous?",        # French
#   "Hola, ¿cómo estás hoy?",             # Spanish
#   "Guten Tag, wie geht es Ihnen?",      # German
#   "Ciao, come stai oggi?"               # Italian
# )
# 
# # Detect languages
# detected_languages <- sapply(sample_texts, language_detect)
# print("Detected languages:")
# print(detected_languages)

## ----conditional_translation--------------------------------------------------
# translate_if_not_english <- function(text, target_language = "en") {
#   # Detect language of the input text
#   detected_lang <- language_detect(text)
# 
#   # Check if the detected language is English
#   is_english <- grepl("en", detected_lang, ignore.case = TRUE)
# 
#   if (is_english) {
#     # Return original text if already English
#     return(list(
#       original = text,
#       translated = text,
#       was_translated = FALSE,
#       detected_language = detected_lang
#     ))
#   } else {
#     # Translate to target_language ("en" by default) if not English
#     translated_text <- google_translate(text, target_language = target_language, source_language = "auto")
#     return(list(
#       original = text,
#       translated = translated_text,
#       was_translated = TRUE,
#       detected_language = detected_lang
#     ))
#   }
# }
# 
# # Test the function
# test_text_fr <- "Bonjour, j'aimerais acheter un billet."
# result <- translate_if_not_english(test_text_fr)
# 
# print("Conditional translation result:")
# print(paste("Original:", result$original))
# print(paste("Translated:", result$translated))
# print(paste("Was translated:", result$was_translated))
# print(paste("Detected language:", result$detected_language))

## ----mixed_language_tibble----------------------------------------------------
# # Create a dataset with mixed languages (typical of user-generated content)
# mixed_data <- tibble(
#   id = 1:8,
#   user_feedback = c(
#     "Great product, very satisfied!",                    # English
#     "Excelente producto, muy satisfecho!",               # Spanish
#     "Produit fantastique, je le recommande!",            # French
#     "This service exceeded my expectations.",            # English
#     "Der Service war wirklich hervorragend.",            # German
#     "Servizio eccellente, davvero impressionante!",     # Italian
#     "The delivery was fast and reliable.",               # English
#     "La livraison était rapide et fiable."               # French
#   ),
#   rating = c(5, 5, 4, 5, 4, 5, 4, 4),
#   category = rep(c("product", "service"), 4)
# )
# 
# print("Original mixed-language dataset:")
# print(mixed_data)

## ----detect_and_translate-----------------------------------------------------
# # Function to process each text entry
# process_feedback <- function(text) {
#   result <- translate_if_not_english(text)
#   return(tibble(
#     original_text = result$original,
#     english_text = result$translated,
#     was_translated = result$was_translated,
#     detected_language = result$detected_language
#   ))
# }
# 
# # Apply to all feedback entries
# processed_results <- purrr::map_dfr(mixed_data$user_feedback, process_feedback)
# 
# # Combine with original data
# enhanced_data <- bind_cols(mixed_data, processed_results)
# 
# print("Enhanced dataset with language detection and translation:")
# print(enhanced_data)

## ----advanced_tidyverse-------------------------------------------------------
# library(stringr)
# 
# # Data-frame version: adds detected_lang, needs_translation, english_text,
# # and translation_status columns for the given text column
# enhanced_language_processing <- function(df, text_column) {
#   df %>%
#     mutate(
#       # Detect language for each text entry
#       detected_lang = map_chr(!!rlang::sym(text_column),
#                              ~ tryCatch(language_detect(.x), error = function(e) "unknown")),
# 
#       # Determine if translation is needed
#       needs_translation = !str_detect(detected_lang, "en"),
# 
#       # Translate only non-English text
#       english_text = map2_chr(!!rlang::sym(text_column), needs_translation,
#                              ~ if (.y) {
#                                tryCatch(google_translate(.x, target_language = "en"),
#                                        error = function(e) .x)
#                              } else {
#                                .x
#                              }),
# 
#       # Classify the outcome per row (a status label; no confidence score)
#       translation_status = case_when(
#         detected_lang == "unknown" ~ "detection_failed",
#         !needs_translation ~ "already_english",
#         english_text != !!rlang::sym(text_column) ~ "translated",
#         TRUE ~ "translation_failed"
#       )
#     )
# }
# 
# # Apply enhanced processing
# result_data <- enhanced_language_processing(mixed_data, "user_feedback")
# 
# print("Advanced processing results:")
# print(result_data %>% select(id, detected_lang, needs_translation, translation_status))

## ----batch_filtering----------------------------------------------------------
# # Create a larger sample dataset
# large_dataset <- tibble(
#   id = 1:20,
#   content = c(
#     # Mix of English and non-English content
#     "Amazing service quality",                           # EN
#     "Fantástico servicio al cliente",                   # ES
#     "Service client exceptionnel",                      # FR
#     "Great user experience",                            # EN
#     "Esperienza utente eccellente",                     # IT
#     "Ausgezeichnete Benutzerführung",                  # DE
#     "Fast shipping and delivery",                       # EN
#     "Livraison rapide et efficace",                    # FR
#     "Excellent product quality",                        # EN
#     "Qualità del prodotto superiore",                  # IT
#     "Easy to use interface",                           # EN
#     "Interfaz muy fácil de usar",                      # ES
#     "Highly recommend this product",                    # EN
#     "Je recommande vivement ce produit",               # FR
#     "Outstanding customer support",                     # EN
#     "Soporte al cliente sobresaliente",                # ES
#     "Very satisfied with purchase",                     # EN
#     "Sehr zufrieden mit dem Kauf",                     # DE
#     "Will definitely buy again",                       # EN
#     "Sicuramente acquisterò di nuovo"                  # IT
#   ),
#   timestamp = Sys.time() + sample(-1000:1000, 20),
#   priority = sample(c("high", "medium", "low"), 20, replace = TRUE)
# )
# 
# # Efficient batch processing workflow
# batch_process_languages <- function(df, text_col, batch_size = 5) {
#   # First, detect languages for all entries; if detection errors, fall back
#   # to "en" so the row is treated as English and left untranslated
#   df_with_detection <- df %>%
#     mutate(
#       row_id = row_number(),
#       detected_lang = map_chr(!!rlang::sym(text_col),
#                              ~ tryCatch(language_detect(.x), error = function(e) "en")),
#       is_english = str_detect(detected_lang, "en")
#     )
# 
#   # Separate English and non-English content
#   english_content <- df_with_detection %>% filter(is_english)
#   non_english_content <- df_with_detection %>% filter(!is_english)
# 
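#   # Translate non-English content one row at a time; batch_id only labels
#   # consecutive groups of batch_size rows (google_translate() calls are
#   # not grouped)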
#   if (nrow(non_english_content) > 0) {
#     non_english_content <- non_english_content %>%
#       mutate(
#         batch_id = ceiling(row_number() / batch_size),
#         english_text = map_chr(!!rlang::sym(text_col),
#                               ~ tryCatch(google_translate(.x, target_language = "en"),
#                                         error = function(e) .x))
#       )
#   } else {
#     non_english_content <- non_english_content %>%
#       mutate(batch_id = integer(0), english_text = character(0))
#   }
# 
#   # For English content, keep original text
#   english_content <- english_content %>%
#     mutate(
#       batch_id = NA_integer_,
#       english_text = !!rlang::sym(text_col)
#     )
# 
#   # Combine results
#   result <- bind_rows(english_content, non_english_content) %>%
#     arrange(row_id) %>%
#     select(-row_id)
# 
#   return(result)
# }
# 
# # Apply batch processing
# processed_large <- batch_process_languages(large_dataset, "content", batch_size = 3)
# 
# # Summary statistics
# summary_stats <- processed_large %>%
#   summarise(
#     total_entries = n(),
#     english_entries = sum(is_english),
#     translated_entries = sum(!is_english),
#     translation_rate = mean(!is_english),
#     unique_languages = n_distinct(detected_lang)
#   )
# 
# print("Processing summary:")
# print(summary_stats)
# 
# print("Sample of processed data:")
# print(processed_large %>%
#       select(id, detected_lang, is_english, content, english_text) %>%
#       head(10))