How to Extract Structured Data with LLMs in R
Introduction
LLMs excel at extracting structured information from unstructured text. Instead of writing complex regex patterns, you can describe what you want and get clean, structured output.
This tutorial uses the ellmer package. You can use any provider: Claude, OpenAI, or local models with Ollama.
Use cases:
- Extract names, dates, and addresses from text
- Parse product information from descriptions
- Convert free-text survey responses to categories
- Extract entities from documents
- Clean and standardize messy data
Getting Started
library(ellmer)
library(tidyverse)
Basic Extraction
Define a schema
Tell the LLM what structure you expect:
person_schema <- type_object(
name = type_string("Person's full name"),
age = type_integer("Person's age in years"),
email = type_string("Email address if mentioned")
)
Extract from text
chat <- chat_claude()
result <- chat$extract_data(
"John Smith is 35 years old. You can reach him at john@email.com",
type = person_schema
)
Access the results
result$name # "John Smith"
result$age # 35
result$email # "john@email.com"
Handle missing data
result <- chat$extract_data(
"Sarah is 28 years old.",
type = person_schema
)
result
# $name: "Sarah"
# $age: 28
# $email: NULL # Not mentioned in text
Type Definitions
Available types
# String
type_string("description of the field")
# Integer
type_integer("description")
# Number (float)
type_number("description")
# Boolean
type_boolean("description")
# Enum (predefined options)
type_enum(
values = c("positive", "negative", "neutral"),
description = "Sentiment classification"
)
# Array (list of items)
type_array(
items = type_string("individual item description"),
description = "A list of items"
)
# Object (nested structure)
type_object(
field1 = type_string("..."),
field2 = type_integer("...")
)
Practical Examples
Extract product information
product_schema <- type_object(
name = type_string("Product name"),
price = type_number("Price in dollars"),
currency = type_string("Currency code"),
features = type_array(
items = type_string("A product feature")
)
)
text <- "The iPhone 15 Pro costs $999. Features include titanium design,
A17 chip, 48MP camera, and USB-C port."
chat <- chat_claude()
product <- chat$extract_data(text, type = product_schema)
product
# $name: "iPhone 15 Pro"
# $price: 999
# $currency: "USD"
# $features: ["titanium design", "A17 chip", "48MP camera", "USB-C port"]
Classify sentiment
sentiment_schema <- type_object(
sentiment = type_enum(
values = c("positive", "negative", "neutral"),
description = "Overall sentiment"
),
confidence = type_number("Confidence score from 0 to 1"),
key_phrases = type_array(
items = type_string("Key phrase indicating sentiment")
)
)
review <- "Absolutely love this product! Best purchase I've made this year.
The quality is outstanding and shipping was super fast."
chat <- chat_claude()
result <- chat$extract_data(review, type = sentiment_schema)
result
# $sentiment: "positive"
# $confidence: 0.95
# $key_phrases: ["Absolutely love", "Best purchase", "outstanding", "super fast"]
Parse contact information
contact_schema <- type_object(
name = type_string("Full name"),
phone = type_string("Phone number"),
email = type_string("Email address"),
address = type_object(
street = type_string("Street address"),
city = type_string("City"),
state = type_string("State"),
zip = type_string("ZIP code")
)
)
text <- "Contact Jane Doe at (555) 123-4567 or jane.doe@company.com.
Her office is at 123 Main Street, San Francisco, CA 94102."
chat <- chat_claude()
contact <- chat$extract_data(text, type = contact_schema)
Extract dates and events
event_schema <- type_object(
event_name = type_string("Name of the event"),
date = type_string("Date in YYYY-MM-DD format"),
location = type_string("Event location"),
description = type_string("Brief description")
)
text <- "Join us for the R Users Meetup on March 15th, 2024 at the
Downtown Conference Center. We'll discuss data visualization techniques."
chat <- chat_claude()
event <- chat$extract_data(text, type = event_schema)
event
# $event_name: "R Users Meetup"
# $date: "2024-03-15"
# $location: "Downtown Conference Center"
# $description: "Discussion about data visualization techniques"
Extracting Multiple Items
Extract array of objects
person_schema <- type_object(
name = type_string("Person's name"),
role = type_string("Person's role or title")
)
people_schema <- type_array(
items = person_schema,
description = "List of people mentioned"
)
text <- "The meeting included CEO John Smith, CTO Sarah Johnson,
and CFO Michael Brown. They discussed Q4 results."
chat <- chat_claude()
people <- chat$extract_data(text, type = people_schema)
people
# [[1]] $name: "John Smith", $role: "CEO"
# [[2]] $name: "Sarah Johnson", $role: "CTO"
# [[3]] $name: "Michael Brown", $role: "CFO"
Convert to data frame
# Extract as tibble
people_df <- tibble(
name = map_chr(people, "name"),
role = map_chr(people, "role")
)
people_df
Batch Processing
Process multiple texts
library(purrr)
reviews <- c(
"Great product, love it!",
"Terrible quality, very disappointed",
"It's okay, nothing special",
"Best purchase ever, highly recommend"
)
sentiment_schema <- type_enum(
values = c("positive", "negative", "neutral"),
description = "Sentiment"
)
#' Classify the sentiment of one piece of text.
#'
#' Opens a new Claude chat for every call, pauses for half a second as a crude
#' rate limit, then requests structured output using the `sentiment_schema`
#' object from the enclosing script.
#'
#' @param text Text to classify.
#' @return Whatever `extract_data()` returns for `sentiment_schema`.
extract_sentiment <- function(text) {
  client <- chat_claude()
  # Pause before each request so batch runs stay under the provider's limit.
  Sys.sleep(0.5)
  client$extract_data(text, type = sentiment_schema)
}
sentiments <- map_chr(reviews, extract_sentiment)
tibble(
review = reviews,
sentiment = sentiments
)
Process data frame column
df <- tibble(
id = 1:3,
description = c(
"John Smith, age 30, engineer",
"Jane Doe, age 25, designer",
"Bob Brown, age 45, manager"
)
)
person_schema <- type_object(
name = type_string("Name"),
age = type_integer("Age"),
job = type_string("Job title")
)
df_extracted <- df |>
mutate(
extracted = map(description, \(text) {
chat <- chat_claude()
Sys.sleep(0.5)
chat$extract_data(text, type = person_schema)
}),
name = map_chr(extracted, "name"),
age = map_int(extracted, "age"),
job = map_chr(extracted, "job")
) |>
select(-extracted)
df_extracted
Advanced Patterns
Extraction with instructions
chat <- chat_claude(
system_prompt = "Extract information exactly as specified.
If information is unclear, make your best inference.
Use NULL for genuinely missing data."
)
result <- chat$extract_data(text, type = schema)
Validate extracted data
#' Extract structured data and warn when it fails validation.
#'
#' @param text Text to extract from.
#' @param schema Type specification passed as `type` to `extract_data()`.
#' @param validation_fn Function called with the extraction result. The result
#'   counts as valid only when it returns a single, non-missing, truthy value.
#' @return The extraction result, returned whether or not validation passed.
extract_and_validate <- function(text, schema, validation_fn) {
  chat <- chat_claude()
  result <- chat$extract_data(text, type = schema)

  # Validators easily yield NA, NULL or logical(0) when an extracted field is
  # NULL (e.g. `p$age > 0`). Treat those as "not valid" instead of letting a
  # bare `if` abort and lose the extraction result.
  verdict <- validation_fn(result)
  is_valid <- length(verdict) == 1L && isTRUE(as.logical(verdict))

  if (!is_valid) {
    warning("Extraction may be incomplete or invalid", call. = FALSE)
  }
  result
}
# Example validation
#' Check that an extracted person has the required fields.
#'
#' @param person Extraction result (a list) to check.
#' @return `TRUE` when both `name` and `age` are non-NULL, otherwise `FALSE`.
validate_person <- function(person) {
  missing_field <- is.null(person$name) || is.null(person$age)
  !missing_field
}
result <- extract_and_validate(
"Some text",
person_schema,
validate_person
)
Combine extraction with classification
ticket_schema <- type_object(
category = type_enum(
values = c("billing", "technical", "account", "other"),
description = "Support ticket category"
),
priority = type_enum(
values = c("low", "medium", "high", "urgent"),
description = "Priority level"
),
summary = type_string("One-sentence summary"),
entities = type_object(
account_id = type_string("Account ID if mentioned"),
error_code = type_string("Error code if mentioned")
)
)
ticket <- "Hi, I can't log into my account #12345. Getting error E401.
This is urgent as I need to complete a transaction today!"
chat <- chat_claude()
parsed_ticket <- chat$extract_data(ticket, type = ticket_schema)
Error Handling
#' Extract structured data without letting a failure stop a batch.
#'
#' Opens a new Claude chat and runs the extraction. Any error is downgraded to
#' a warning and `NULL` is returned, so callers can drop failures afterwards
#' (e.g. with `compact()`).
#'
#' @param text Text to extract from.
#' @param schema Type specification passed as `type` to `extract_data()`.
#' @return The extraction result, or `NULL` if the extraction raised an error.
safe_extract <- function(text, schema) {
  tryCatch(
    {
      chat <- chat_claude()
      chat$extract_data(text, type = schema)
    },
    error = function(e) {
      # call. = FALSE keeps the anonymous handler call out of the warning;
      # conditionMessage() is the supported accessor for the error text.
      warning("Extraction failed: ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}
# Use with map for batch processing
results <- map(texts, \(t) safe_extract(t, schema))
# Filter out failures
valid_results <- compact(results) # Remove NULLs
Local LLM Extraction
Use Ollama for free, private extraction:
# Works the same way with local models
chat <- chat_ollama(model = "llama3.2")
result <- chat$extract_data(
"John Smith, 35 years old, john@email.com",
type = person_schema
)
Note: Local models may be less accurate for complex schemas. Test thoroughly.
Common Mistakes
1. Schema too complex
# Too many nested levels can confuse the model
# Break into simpler extractions if needed
2. Ambiguous field descriptions
# Bad
type_string("date")
# Good
type_string("Event date in YYYY-MM-DD format")
3. Not handling NULL values
# Always check for NULLs
result$field %||% "default_value"
# Or use map with default
map_chr(results, "field", .default = NA_character_)
4. Forgetting rate limits in batches
# Always add delays
map(texts, \(t) {
Sys.sleep(0.5) # Important!
extract(t)
})
Summary
| Task | Code |
|---|---|
| Define string field | type_string("description") |
| Define number field | type_number("description") |
| Define enum field | type_enum(values = c(...)) |
| Define array | type_array(items = type_*()) |
| Define object | type_object(field = type_*()) |
| Extract data | chat$extract_data(text, type) |
- Define schemas with type_*() functions
- Use clear field descriptions
- Handle NULL values for missing data
- Add delays when batch processing
- Validate extracted data when reliability is important