Complete Workflows
End-to-end scenarios combining multiple tools
Learn how to combine tools for complete data discovery workflows.
Workflow 1: Find → Preview → Download
Complete journey from search to data access:
# Discovery workflow for Vienna population data
# 1. Search for datasets
results = search_datasets(
    query="Bevölkerung Wien Bezirk",
    themes=["SOCI"],
    formats=["CSV"],
    publishers=["Stadt Wien"],
    boost_quality=True,
    limit=10
)
# 2. Get dataset details
dataset_id = results['results'][0]['id']
dataset = get_dataset(dataset_id=dataset_id)
# 3. Get download URL
distributions = get_dataset_distributions(dataset_id=dataset_id)
csv_dist = next((d for d in distributions if d.get('format', {}).get('id') == 'CSV'), None)
download_url = csv_dist['access_url']  # assumes a CSV distribution exists; see step 3 below for the guarded version
# 4. Preview schema and data
schema = preview_schema(url=download_url, format="CSV")
preview = preview_data(url=download_url, max_rows=20, format="CSV")
# 5. Validate and proceed
expected_columns = ["Jahr", "Bezirk", "Einwohner"]
actual_columns = [c['name'] for c in schema['columns']]
if all(col in actual_columns for col in expected_columns):
    print("Success: Schema validated - ready to download")
    # Proceed with download_url
Search for datasets
Find relevant datasets using filters:
# Find population datasets from Vienna
results = search_datasets(
    query="Bevölkerung Wien Bezirk",
    themes=["SOCI"],
    formats=["CSV"],
    publishers=["Stadt Wien"],
    boost_quality=True,
    limit=10
)
# Display results count
print(f"Found {results['count']} datasets")
# Review top 3 results
for i, dataset in enumerate(results['results'][:3], 1):
    title = dataset.get('title', {}).get('de', 'No title')
    print(f"{i}. {title}")
Select and examine dataset
Get complete metadata:
# Pick the first result
dataset_id = results['results'][0]['id']
# Get complete metadata
dataset = get_dataset(dataset_id=dataset_id)
# Review key information
print(f"Title: {dataset['title']['de']}")
print(f"Description: {dataset['description']['de'][:200]}...")
print(f"Modified: {dataset['modified']}")
print(f"Publisher: {dataset['publisher']['name']}")
print(f"License: {dataset.get('license', 'Not specified')}")Get download URLs
Retrieve distribution information:
# Retrieve all distributions
distributions = get_dataset_distributions(dataset_id=dataset_id)
# Find CSV distribution
csv_dist = next(
    (d for d in distributions
     if d.get('format', {}).get('id') == 'CSV'),
    None
)
if csv_dist:
    download_url = csv_dist['access_url']
    file_size = csv_dist.get('byte_size', 'Unknown')
    print(f"Download URL: {download_url}")
    print(f"File size: {file_size} bytes")
Preview before download
Check schema and sample data:
# Check schema first
schema = preview_schema(url=download_url, format="CSV")
print("Columns:")
for col in schema['columns']:
    print(f" - {col['name']}: {col['type']}")
# Preview sample data
preview = preview_data(url=download_url, max_rows=20, format="CSV")
print(f"\nShowing {preview['row_count']} sample rows:")
for row in preview['data'][:5]:
    print(row)
Verify and download
Validate structure before proceeding:
# Verify structure matches expectations
expected_columns = ["Jahr", "Bezirk", "Einwohner"]
actual_columns = [c['name'] for c in schema['columns']]
if all(col in actual_columns for col in expected_columns):
    print("Success: Schema validated")
    # Proceed with full download using download_url
else:
    print("Error: Schema mismatch, check dataset")
Workflow 2: Semantic search → Related datasets
Discover related data through intelligent search:
# Semantic discovery for air quality data
# 1. Semantic search
results = semantic_search_datasets(
    natural_query="Luftqualität Messungen österreichische Städte",
    limit=5
)
# 2. Analyze quality
dataset_id = results['results'][0]['id']
quality = analyze_dataset_quality(dataset_id=dataset_id)
# 3. Find related datasets
related = find_related_datasets(
    dataset_id=dataset_id,
    limit=5,
    min_score=30
)
# 4. Compare datasets
original = get_dataset(dataset_id=dataset_id)
related_dataset = get_dataset(dataset_id=related['results'][0]['id'])
original_themes = {c['id'] for c in original.get('categories', [])}
related_themes = {c['id'] for c in related_dataset.get('categories', [])}
shared_themes = original_themes & related_themes
print(f"Original: {original['title']['de']}")
print(f"Related: {related_dataset['title']['de']}")
print(f"Shared themes: {', '.join(shared_themes)}")
print(f"Quality score: {quality['score']}/100")Natural language query
Use semantic search:
# Use semantic search for natural language
results = semantic_search_datasets(
    natural_query="Luftqualität Messungen österreichische Städte",
    limit=5
)
print(f"Found {len(results['results'])} highly relevant datasets")
# Display semantic search results
for i, dataset in enumerate(results['results'], 1):
    title = dataset.get('title', {}).get('de', 'No title')
    categories = [c.get('id') for c in dataset.get('categories', [])]
    print(f"{i}. {title}")
    print(f" Categories: {', '.join(categories)}")
Analyze quality
Get quality metrics:
# Pick most relevant dataset
dataset_id = results['results'][0]['id']
# Get quality analysis
quality = analyze_dataset_quality(dataset_id=dataset_id)
print(f"Quality Score: {quality['score']}/100")
print(f"Completeness: {quality['completeness']}")
print("\nQuality Components:")
for component, value in quality['components'].items():
    status = "✓" if value else "✗"
    print(f" {status} {component}")
Find related datasets
Discover similar datasets:
# Discover similar datasets
related = find_related_datasets(
    dataset_id=dataset_id,
    limit=5,
    min_score=30
)
print(f"\nFound {len(related['results'])} related datasets:")
for dataset in related['results']:
    title = dataset.get('title', {}).get('de', 'No title')
    score = dataset.get('match_score', 0)
    themes = [t.get('id') for t in dataset.get('themes', [])]
    print(f"Score {score}: {title}")
    print(f" Shared themes: {', '.join(themes)}")
Compare datasets
Analyze similarities:
# Compare original and related dataset
original = get_dataset(dataset_id=dataset_id)
related_id = related['results'][0]['id']
related_dataset = get_dataset(dataset_id=related_id)
# Compare key attributes
print("\nComparison:")
print(f"Original Publisher: {original['publisher']['name']}")
print(f"Related Publisher: {related_dataset['publisher']['name']}")
original_themes = {c['id'] for c in original.get('categories', [])}
related_themes = {c['id'] for c in related_dataset.get('categories', [])}
shared_themes = original_themes & related_themes
print(f"Shared themes: {', '.join(shared_themes)}")Workflow 3: Quality-filtered search
Workflow 3: Quality-filtered search
Find high-quality datasets for production use:
# Quality-focused discovery for economic data
# 1. Quality-aware search
results = search_datasets(
    query="Arbeitsmarkt Beschäftigung",
    themes=["ECON"],
    formats=["CSV", "JSON"],
    boost_quality=True,
    sort_by="modified_desc",
    limit=20
)
# 2. Filter by quality metrics
high_quality = []
for dataset in results['results'][:10]:
    quality = analyze_dataset_quality(dataset_id=dataset['id'])
    if quality['score'] > 70:
        critical_ok = all([
            quality['components']['has_title'],
            quality['components']['has_description'],
            quality['components']['has_publisher'],
            quality['components']['has_license']
        ])
        if critical_ok:
            high_quality.append({
                'id': dataset['id'],
                'title': dataset.get('title', {}).get('de'),
                'score': quality['score']
            })
# 3. Verify data accessibility
accessible = []
for dataset in high_quality:
    distributions = get_dataset_distributions(dataset_id=dataset['id'])
    has_csv = any(d.get('format', {}).get('id') == 'CSV' and d.get('access_url') for d in distributions)
    if has_csv:
        csv_url = next(d['access_url'] for d in distributions if d.get('format', {}).get('id') == 'CSV')
        try:
            schema = preview_schema(url=csv_url, format="CSV")
            accessible.append({**dataset, 'url': csv_url, 'columns': len(schema['columns'])})
        except Exception:
            pass
print(f"Found {len(accessible)} high-quality, accessible datasets")
Quality-aware search
Search with quality boost enabled:
# Search with quality boost
results = search_datasets(
    query="Arbeitsmarkt Beschäftigung",
    themes=["ECON"],
    formats=["CSV", "JSON"],
    boost_quality=True,
    sort_by="modified_desc",
    limit=20
)
print(f"Found {results['count']} datasets")Filter by quality metrics
Analyze each dataset for quality:
# Analyze each dataset
high_quality_datasets = []
for dataset in results['results'][:10]:
    dataset_id = dataset['id']
    # Get quality analysis
    quality = analyze_dataset_quality(dataset_id=dataset_id)
    # Filter: score > 70 and critical components present
    if quality['score'] > 70:
        critical_ok = all([
            quality['components']['has_title'],
            quality['components']['has_description'],
            quality['components']['has_publisher'],
            quality['components']['has_license']
        ])
        if critical_ok:
            high_quality_datasets.append({
                'id': dataset_id,
                'title': dataset.get('title', {}).get('de', 'No title'),
                'score': quality['score'],
                'modified': dataset.get('modified')
            })
print(f"Found {len(high_quality_datasets)} high-quality datasets")
Verify data accessibility
Ensure data is accessible:
# Ensure data is accessible
accessible_datasets = []
for dataset in high_quality_datasets:
    # Get distributions
    distributions = get_dataset_distributions(dataset_id=dataset['id'])
    # Check for valid download URLs
    has_csv = any(
        d.get('format', {}).get('id') == 'CSV' and d.get('access_url')
        for d in distributions
    )
    if has_csv:
        # Verify URL is accessible
        csv_url = next(
            d['access_url'] for d in distributions
            if d.get('format', {}).get('id') == 'CSV'
        )
        try:
            schema = preview_schema(url=csv_url, format="CSV")
            accessible_datasets.append({
                **dataset,
                'url': csv_url,
                'columns': len(schema['columns'])
            })
            print(f"Success: {dataset['title']}")
            print(f" Score: {dataset['score']}, Columns: {len(schema['columns'])}")
        except Exception:
            print(f"Error: {dataset['title']} - URL not accessible")
print(f"\n{len(accessible_datasets)} datasets ready for use")
Workflow 4: Multi-language search
Handle German and English queries:
# Multi-language discovery
# 1. German query
de_results = semantic_search_datasets(
    natural_query="Gesundheitsdaten aus Wien",
    limit=10
)
# 2. English query (same intent)
en_results = semantic_search_datasets(
    natural_query="health data from Vienna",
    limit=10
)
# 3. Compare results
de_ids = {d['id'] for d in de_results['results']}
en_ids = {d['id'] for d in en_results['results']}
overlap = de_ids & en_ids
overlap_pct = (len(overlap) / len(de_ids)) * 100
print(f"German query: {len(de_results['results'])} results")
print(f"English query: {len(en_results['results'])} results")
print(f"Overlap: {overlap_pct:.0f}%")
if overlap_pct > 70:
    print("Success: Language detection effective")
German query
Search using German terms:
# German semantic search
de_results = semantic_search_datasets(
    natural_query="Gesundheitsdaten aus Wien",
    limit=10
)
print("German query results:")
for dataset in de_results['results'][:3]:
    print(f"- {dataset['title']['de']}")
English query equivalent
Same search intent in English:
# English semantic search
en_results = semantic_search_datasets(
    natural_query="health data from Vienna",
    limit=10
)
print("\nEnglish query results:")
for dataset in en_results['results'][:3]:
    print(f"- {dataset['title'].get('de', 'No title')}")
Compare results
Analyze language detection:
# Extract dataset IDs
de_ids = {d['id'] for d in de_results['results']}
en_ids = {d['id'] for d in en_results['results']}
# Find overlap
overlap = de_ids & en_ids
print(f"\nOverlapping datasets: {len(overlap)}/{len(de_ids)}")
# Check language detection effectiveness
overlap_pct = (len(overlap) / len(de_ids)) * 100
if overlap_pct > 70:
    print(f"Success: Language detection effective ({overlap_pct:.0f}% overlap)")
Workflow 5: Catalogue exploration
Explore all data from a specific publisher:
# Catalogue exploration
# 1. List catalogues
catalogues = list_catalogues(limit=50)
vienna = next(c for c in catalogues if 'wien' in c.get('title', {}).get('de', '').lower())
# 2. Get catalogue metadata
catalogue = get_catalogue(catalogue_id=vienna['id'])
# 3. Browse catalogue datasets
datasets = search_datasets(
    query="",
    catalogues=[vienna['id']],
    sort_by="modified_desc",
    limit=20
)
# 4. Analyze category distribution
category_counts = {}
for dataset in datasets['results']:
    for category in dataset.get('categories', []):
        cat_id = category['id']
        category_counts[cat_id] = category_counts.get(cat_id, 0) + 1
print(f"Catalogue: {catalogue['title']['de']}")
print(f"Total datasets: {catalogue['count']}")
print("\nCategory distribution:")
for cat, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
    print(f" {cat}: {count} datasets")
List catalogues
Find available data catalogues:
# Get all catalogues
catalogues = list_catalogues(limit=50)
# Find Vienna catalogue
vienna = next(
    c for c in catalogues
    if 'wien' in c.get('title', {}).get('de', '').lower()
)
print(f"Found: {vienna['title']['de']}")
print(f"Catalogue ID: {vienna['id']}")
print(f"Dataset count: {vienna['count']}")
Get catalogue metadata
Retrieve detailed catalogue information:
# Get detailed catalogue information
catalogue = get_catalogue(catalogue_id=vienna['id'])
print(f"\nCatalogue: {catalogue['title']['de']}")
print(f"Description: {catalogue.get('description', {}).get('de', 'N/A')[:200]}")
print(f"Datasets: {catalogue['count']}")
print(f"Modified: {catalogue['modified']}")Browse catalogue datasets
Search within specific catalogue:
# Search within catalogue
datasets = search_datasets(
    query="",
    catalogues=[vienna['id']],
    sort_by="modified_desc",
    limit=20
)
print(f"\nRecent datasets from {vienna['title']['de']}:")
for i, dataset in enumerate(datasets['results'][:10], 1):
    title = dataset['title']['de']
    modified = dataset['modified']
    categories = [c['id'] for c in dataset.get('categories', [])]
    print(f"{i}. {title}")
    print(f" Modified: {modified}, Categories: {', '.join(categories)}")
Category distribution
Analyze which categories the catalogue covers:
# Analyze category coverage
category_counts = {}
for dataset in datasets['results']:
    for category in dataset.get('categories', []):
        cat_id = category['id']
        category_counts[cat_id] = category_counts.get(cat_id, 0) + 1
print("\nCategory distribution:")
for cat, count in sorted(
    category_counts.items(),
    key=lambda x: x[1],
    reverse=True
):
    print(f" {cat}: {count} datasets")
Best practices
Error handling
Wrap workflows in try-except blocks:
try:
    results = search_datasets(query="Bevölkerung", limit=10)
    # Process results...
except ToolError as e:
    # ToolError is the tool-layer error type; import it from your MCP client
    print(f"Error: {e}")
    # Handle gracefully
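For transient failures such as timeouts or rate limits, retrying with exponential backoff is often enough. A hedged sketch; retry_search is a hypothetical helper, not part of the tool API, and which errors are worth retrying depends on your tool layer:
# Retry with exponential backoff (sketch; retry_search is hypothetical)
import time

def retry_search(query, attempts=3, base_delay=1.0):
    for attempt in range(attempts):
        try:
            return search_datasets(query=query, limit=10)
        except ToolError:
            if attempt == attempts - 1:
                raise
            time.sleep(base_delay * (2 ** attempt))

results = retry_search("Bevölkerung")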
Pagination
For large result sets, paginate:
limit = 20
offset = 0
all_results = []
while True:
    page = search_datasets(
        query="Umwelt",
        limit=limit,
        offset=offset
    )
    all_results.extend(page['results'])
    # Check if more pages exist
    if offset + limit >= page['count']:
        break
    offset += limit
print(f"Retrieved {len(all_results)} total datasets")
Caching results
Cache expensive operations:
# Cache quality analysis results
quality_cache = {}
def get_quality_cached(dataset_id):
    if dataset_id not in quality_cache:
        quality_cache[dataset_id] = analyze_dataset_quality(
            dataset_id=dataset_id
        )
    return quality_cache[dataset_id]
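Since dataset IDs are plain strings, functools.lru_cache from the standard library achieves the same effect without a hand-rolled dictionary:
# Same idea with the standard library (sketch)
from functools import lru_cache

@lru_cache(maxsize=None)
def get_quality_cached(dataset_id):
    return analyze_dataset_quality(dataset_id=dataset_id)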
Next steps
- API Reference - Complete tool documentation
- Installation - Setup guide
- First Query - Getting started tutorial