Complete configuration reference for RatCrawler
Essential settings for getting started with RatCrawler
delay: Request delay between pages (seconds)
max_depth: Maximum crawl depth
max_pages: Maximum pages to crawl
user_agent: Custom user agent string
stay_on_domain: Restrict crawling to the seed domain
allowed_domains: List of allowed domains
db_path: Database file path
pub struct CrawlConfig {
    pub user_agent: String,        // Default: "RatCrawler/1.0"
    pub timeout_secs: u64,         // Default: 30
    pub max_redirects: usize,      // Default: 5
    pub max_depth: usize,          // Default: 3
    pub max_pages: usize,          // Default: 100
    pub delay_ms: u64,             // Default: 1000
    pub respect_robots_txt: bool,  // Default: true
}

pub struct BacklinkConfig {
    pub user_agent: String,   // Default: "RatCrawler-Backlinks/1.0"
    pub timeout_secs: u64,    // Default: 60
    pub max_redirects: usize, // Default: 5
}
# Python Configuration Dictionary
config = {
    # Basic crawling settings
    'delay': 1.5,                  # Seconds between requests
    'max_depth': 3,                # Maximum link depth
    'max_pages': 100,              # Maximum pages to crawl
    'user_agent': 'MyCrawler/1.0', # Custom user agent

    # Domain restrictions
    'stay_on_domain': True,        # Stay within seed domain
    'allowed_domains': [],         # Additional allowed domains

    # Database settings
    'db_path': 'crawler.db',       # Database file path
    'recrawl_days': 7,             # Days between recrawls

    # Analysis settings
    'analyze_backlinks': True,     # Enable backlink analysis

    # Export settings
    'export_json': True,           # Export results as JSON
    'export_csv': False,           # Export data as CSV
}
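Most runs override only a few of these keys. The sketch below shows one way to overlay a partial configuration onto the documented defaults; the default values are taken from the CrawlConfig struct above (delay_ms 1000 written as 1.0 seconds), the stay_on_domain and db_path defaults are assumptions, and the merge helper itself is illustrative rather than part of RatCrawler's API.

# Defaults drawn from CrawlConfig above; entries marked "assumption"
# are not documented and only serve this sketch.
DEFAULTS = {
    'delay': 1.0,                    # delay_ms: 1000
    'max_depth': 3,
    'max_pages': 100,
    'user_agent': 'RatCrawler/1.0',
    'stay_on_domain': True,          # assumption
    'db_path': 'crawler.db',         # assumption
}

def with_defaults(overrides: dict) -> dict:
    """Fill unspecified keys from DEFAULTS, keeping explicit overrides."""
    return {**DEFAULTS, **overrides}

cfg = with_defaults({'delay': 1.5, 'max_pages': 250})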
Fine-tune RatCrawler for specific use cases
# Production-ready configuration
production_config = {
    # Performance optimized
    'delay': 2.0,
    'max_pages': 10000,
    'timeout': 30,
    'max_redirects': 5,

    # Content filtering
    'allowed_domains': ['example.com', 'blog.example.com'],
    'exclude_patterns': [
        r'\.pdf$',
        r'\.jpg$',
        r'/admin/',
        r'/private/',
    ],

    # Quality control
    'min_word_count': 50,
    'min_content_length': 500,
    'duplicate_threshold': 0.85,

    # Database optimization
    'db_path': '/data/crawler/production.db',
    'batch_size': 50,
    'vacuum_interval': 1000,

    # Monitoring
    'enable_metrics': True,
    'log_level': 'INFO',
    'stats_interval': 100,

    # Export & backup
    'export_json': True,
    'export_csv': False,
    'backup_interval': 3600,  # 1 hour
}
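The exclude_patterns entries are regular expressions, presumably matched against candidate URLs before they are fetched. The snippet below is a small sketch of that kind of filtering, not RatCrawler's internal implementation:

import re

# Compile the production exclude patterns once, then test candidate URLs.
patterns = [re.compile(p) for p in production_config['exclude_patterns']]

def is_excluded(url: str) -> bool:
    """Return True if any exclude pattern matches the URL."""
    return any(p.search(url) for p in patterns)

print(is_excluded('https://example.com/admin/users'))   # True  (matches /admin/)
print(is_excluded('https://example.com/report.pdf'))    # True  (matches \.pdf$)
print(is_excluded('https://blog.example.com/post-42'))  # False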
Configure RatCrawler using environment variables
MONGODB_URI (optional): MongoDB connection string for additional storage
MAX_CRAWL_DEPTH (default: 3): Maximum depth for crawling
MAX_PAGES_PER_DOMAIN (default: 100): Maximum pages to crawl per domain
USER_AGENT (optional): Custom user agent string
RUST_LOG (optional): Set logging level (debug, info, warn, error)
DATABASE_URL (optional): Database connection URL
MAX_WORKERS (default: 4): Maximum number of worker threads
CACHE_SIZE (default: 10000): URL cache size in memory
# RatCrawler Environment Configuration
# Copy this to a .env file in your project root
# Database Configuration
MONGODB_URI=mongodb://localhost:27017/ratcrawler
DATABASE_URL=sqlite:///ratcrawler.db
# Crawling Limits
MAX_CRAWL_DEPTH=3
MAX_PAGES_PER_DOMAIN=100
MAX_WORKERS=4
# Performance Settings
REQUEST_TIMEOUT=30
CRAWL_DELAY=1.5
CACHE_SIZE=10000
# User Agent
USER_AGENT=RatCrawler/1.0 (+https://github.com/yourusername/ratcrawler)
# Logging
RUST_LOG=info
LOG_LEVEL=INFO
# Export Settings
EXPORT_JSON=true
EXPORT_CSV=false
BACKUP_INTERVAL=3600
# Security
ALLOWED_DOMAINS=example.com,blog.example.com
BLOCKED_DOMAINS=spam.com,bad.com
# API Keys (if using external services)
GOOGLE_TRENDS_API_KEY=your_api_key_here
NEWS_API_KEY=your_api_key_here
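A .env file is not read by the operating system on its own; the process environment has to be populated first. If you are scripting around RatCrawler in Python, the values can be loaded with python-dotenv (used here purely as an example loader, not something RatCrawler itself requires) and converted to the right types, falling back to the documented defaults:

import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # read .env from the current working directory

max_depth = int(os.getenv('MAX_CRAWL_DEPTH', '3'))
max_pages = int(os.getenv('MAX_PAGES_PER_DOMAIN', '100'))
workers = int(os.getenv('MAX_WORKERS', '4'))
delay = float(os.getenv('CRAWL_DELAY', '1.5'))
allowed = [d for d in os.getenv('ALLOWED_DOMAINS', '').split(',') if d]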
Ready-to-use configurations for different scenarios
seo_config = {
    'delay': 2.0,
    'max_pages': 500,
    'analyze_backlinks': True,
    'export_json': True,
    'recrawl_days': 30,
    'user_agent': 'SEO-Analyzer/1.0',
}
For comprehensive SEO audits and backlink analysis
monitor_config = {
    'delay': 1.0,
    'max_pages': 1000,
    'stay_on_domain': True,
    'recrawl_days': 1,
    'export_json': True,
}
For monitoring website changes and content updates
research_config = {
    'delay': 3.0,
    'max_pages': 10000,
    'max_depth': 5,
    'export_csv': True,
    'user_agent': 'ResearchBot/1.0',
}
For academic research and large-scale data collection
fast_config = {
    'delay': 0.5,
    'max_pages': 100,
    'max_depth': 2,
    'timeout': 10,
    'stay_on_domain': True,
}
For quick scans and time-sensitive data collection
enterprise_config = {
    'delay': 1.0,
    'max_pages': 50000,
    'batch_size': 100,
    'enable_metrics': True,
    'backup_interval': 1800,
}
For large-scale enterprise crawling with monitoring
minimal_config = {
    'delay': 1.0,
    'max_pages': 10,
    'max_depth': 1,
    'export_json': False,
}
For testing and development with minimal resource usage
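These presets are ordinary dictionaries, so a run-specific configuration can start from one of them and override individual keys. The db_path value below is only an illustration:

# Start from the SEO preset and override a couple of settings for this run.
audit_config = {**seo_config, 'max_pages': 1000, 'db_path': 'seo_audit.db'}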