Learn how to use RatCrawler with practical examples
Get up and running with RatCrawler in minutes
#!/usr/bin/env python3
from crawler import EnhancedProductionCrawler
# Configure crawler
config = {
    'delay': 1.0,
    'max_depth': 2,
    'max_pages': 50,
    'db_path': 'my_crawler.db'
}
# Initialize and run
crawler = EnhancedProductionCrawler(config)
seed_urls = ['https://example.com', 'https://python.org']
results = crawler.comprehensive_crawl(seed_urls)
print(f"ā
Crawled {results['pages_crawled']} pages")
from backlinkprocessor import BacklinkProcessor
# Analyze backlinks
processor = BacklinkProcessor(delay=1.0)
processor.crawl_backlinks(['https://example.com'])
# Get results
pagerank = processor.calculate_pagerank()
processor.calculate_domain_authority()
print(f"Found {len(processor.backlinks)} backlinks")
use ratcrawler::*;
use tokio;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = CrawlConfig {
        user_agent: "MyCrawler/1.0".to_string(),
        max_pages: 100,
        delay_ms: 1000,
        ..Default::default()
    };

    let mut crawler = WebsiteCrawler::new(&config);
    let mut db = WebsiteCrawlerDatabase::new("crawl.db")?;

    let urls = vec!["https://example.com".to_string()];
    let result = crawler.crawl(urls, &mut db).await?;

    println!("Crawled {} pages successfully!", result.pages_crawled);
    Ok(())
}
# Build the project
cargo build --release
# Crawl a website
./target/release/rat-crawler crawl https://example.com
# Analyze backlinks
./target/release/rat-crawler backlinks https://example.com
# Integrated analysis
./target/release/rat-crawler integrated https://example.com
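The same CLI can also be driven from a script. A hedged sketch that shells out to the binary built above (the binary path and subcommand names come from the commands shown; the run_cli wrapper is illustrative):

#!/usr/bin/env python3
import subprocess

def run_cli(subcommand, url, binary="./target/release/rat-crawler"):
    """Run a rat-crawler subcommand and return its captured stdout."""
    completed = subprocess.run(
        [binary, subcommand, url],
        capture_output=True,
        text=True,
        check=True,
    )
    return completed.stdout

print(run_cli("crawl", "https://example.com"))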
Detailed guides for common use cases
Extract and analyze content from websites
from crawler import EnhancedProductionCrawler
from backlinkprocessor import BacklinkProcessor
def analyze_website(url):
    # Configure crawler
    config = {
        'delay': 1.5,
        'max_depth': 3,
        'max_pages': 100,
        'analyze_backlinks': True,
        'export_json': True
    }

    # Initialize components
    crawler = EnhancedProductionCrawler(config)
    backlink_processor = BacklinkProcessor()

    # Comprehensive analysis
    results = crawler.comprehensive_crawl([url])

    # Extract insights
    print(f"Pages crawled: {results['pages_crawled']}")
    print(f"Backlinks found: {results['backlinks_found']}")
    print(f"Avg word count: {results['avg_word_count']:.0f}")

    return results
# Usage
results = analyze_website('https://example.com')
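To compare several sites, the same helper can be looped and the summary fields written to a CSV. A minimal sketch using only the standard library and the result keys printed above:

import csv

sites = ['https://example.com', 'https://python.org']
with open('site_comparison.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['url', 'pages_crawled', 'backlinks_found', 'avg_word_count'])
    writer.writeheader()
    for site in sites:
        summary = analyze_website(site)
        writer.writerow({
            'url': site,
            'pages_crawled': summary['pages_crawled'],
            'backlinks_found': summary['backlinks_found'],
            'avg_word_count': summary['avg_word_count'],
        })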
use ratcrawler::*;
use tokio;
async fn analyze_website(url: &str) -> Result<(), Box<dyn std::error::Error>> {
    // Configure crawler
    let config = CrawlConfig {
        user_agent: "WebsiteAnalyzer/1.0".to_string(),
        max_pages: 100,
        max_depth: 3,
        respect_robots_txt: true,
        ..Default::default()
    };

    // Initialize crawler
    let mut crawler = WebsiteCrawler::new(&config);
    let mut database = WebsiteCrawlerDatabase::new("analysis.db")?;

    // Crawl website
    let result = crawler.crawl(vec![url.to_string()], &mut database).await?;

    // Analyze backlinks
    let backlink_config = BacklinkConfig {
        user_agent: "BacklinkAnalyzer/1.0".to_string(),
        timeout_secs: 30,
        max_redirects: 5,
    };
    let processor = BacklinkProcessor::new(
        backlink_config.user_agent,
        backlink_config.timeout_secs,
        backlink_config.max_redirects,
    );
    let analysis = processor.analyze_backlinks(url).await?;

    println!("Analysis complete:");
    println!("  Pages: {}", result.pages_crawled);
    println!("  Backlinks: {}", analysis.total_backlinks);

    Ok(())
}
Analyze SEO metrics and backlink profiles
from backlinkprocessor import BacklinkProcessor
import networkx as nx
def seo_analysis(target_url):
    processor = BacklinkProcessor(delay=1.0)

    # Discover backlinks
    processor.crawl_backlinks([target_url], max_depth=2)

    # Build link graph
    processor.build_link_graph()

    # Calculate SEO metrics
    pagerank_scores = processor.calculate_pagerank()
    processor.calculate_domain_authority()

    # Detect spam links
    spam_links = processor.detect_link_spam()

    # Generate comprehensive report
    report = processor.generate_backlink_report(target_url)

    print("SEO Analysis Results:")
    print(f"  Total backlinks: {report['total_backlinks']}")
    print(f"  Unique domains: {report['unique_referring_domains']}")
    print(f"  Domain authority: {report['domain_authority']:.1f}")
    print(f"  Spam links detected: {len(spam_links)}")

    return report
# Analyze your website
report = seo_analysis('https://yourwebsite.com')
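To track these metrics over time, each report can be appended to a history file. A small sketch (standard library only; default=str is a fallback in case some report values are not directly JSON-serializable):

import json
from datetime import datetime

def save_report(report, path='seo_history.jsonl'):
    """Append one timestamped report as a JSON line."""
    entry = {'timestamp': datetime.now().isoformat(), **report}
    with open(path, 'a') as f:
        f.write(json.dumps(entry, default=str) + '\n')

save_report(report)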
Set up automated crawling with scheduling
#!/usr/bin/env python3
import schedule
import time
from datetime import datetime
from crawler import EnhancedProductionCrawler
def scheduled_crawl():
    """Perform scheduled comprehensive crawl"""
    print(f"Starting scheduled crawl at {datetime.now()}")

    config = {
        'delay': 1.5,
        'max_depth': 3,
        'max_pages': 200,
        'db_path': 'scheduled_crawler.db',
        'analyze_backlinks': True,
        'export_json': True
    }

    crawler = EnhancedProductionCrawler(config)
    seed_urls = [
        'https://example.com',
        'https://python.org',
        'https://github.com'
    ]

    try:
        results = crawler.comprehensive_crawl(seed_urls)
        print(f"Crawl completed: {results['pages_crawled']} pages")
    except Exception as e:
        print(f"Crawl failed: {e}")

def main():
    """Main scheduler function"""
    print("Starting RatCrawler Scheduler")

    # Schedule daily crawl at 2 AM
    schedule.every().day.at("02:00").do(scheduled_crawl)

    # Schedule weekly comprehensive analysis on Sundays
    schedule.every().sunday.at("03:00").do(lambda: print("Weekly analysis would run here"))

    print("Scheduler active. Waiting for scheduled tasks...")
    print("Press Ctrl+C to stop")

    try:
        while True:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
    except KeyboardInterrupt:
        print("\nScheduler stopped by user")

if __name__ == "__main__":
    main()
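The weekly placeholder above could be wired to the seo_analysis() helper from the SEO guide earlier on this page. A hedged sketch (the target URL is a placeholder, and seo_analysis must be importable from wherever you keep it):

def weekly_seo_report():
    """Run the weekly backlink analysis and log a short summary."""
    try:
        report = seo_analysis('https://yourwebsite.com')
        print(f"Weekly report: {report['total_backlinks']} backlinks, "
              f"domain authority {report['domain_authority']:.1f}")
    except Exception as e:
        print(f"Weekly analysis failed: {e}")

# Replace the placeholder lambda in main() with:
schedule.every().sunday.at("03:00").do(weekly_seo_report)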
Practical applications of RatCrawler
Monitor website SEO metrics, track backlinks, and analyze competitor strategies.
Collect and analyze content from news sites, blogs, and industry publications.
Research market trends, competitor analysis, and industry insights.
Identify security vulnerabilities and monitor for malicious content.
Collect data for academic studies, web science research, and network analysis.
Collect diverse web content for training machine learning models (see the corpus-export sketch below).
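For the training-data use case, crawled text can be exported as a plain-text corpus. A hedged sketch: it assumes the crawler exposes its storage as crawler.database with a get_page_content(url) method returning a dict containing 'content_text', and that results includes a 'sample_pages' list; those names are taken from the advanced ML example below rather than from a documented public API.

def export_corpus(crawler, results, path='corpus.txt'):
    """Write the text of each sampled page to one corpus file."""
    with open(path, 'w', encoding='utf-8') as f:
        for page in results.get('sample_pages', []):
            page_data = crawler.database.get_page_content(page['url'])
            if page_data and page_data.get('content_text'):
                f.write(page_data['content_text'] + '\n\n')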
Complex implementations and custom integrations
import torch
import transformers
from crawler import EnhancedProductionCrawler
from sklearn.feature_extraction.text import TfidfVectorizer
class MLAnalysisCrawler(EnhancedProductionCrawler):
    def __init__(self, config):
        super().__init__(config)
        self.vectorizer = TfidfVectorizer(max_features=1000)
        self.sentiment_analyzer = transformers.pipeline('sentiment-analysis')

    def analyze_content_quality(self, page_data):
        """Analyze content using ML models"""
        text = page_data.get('content_text', '')
        if not text.strip():
            return {}

        # TF-IDF analysis
        tfidf_matrix = self.vectorizer.fit_transform([text])
        content_score = tfidf_matrix.sum() / len(text.split())

        # Sentiment analysis (truncate input to stay within the model's limit)
        sentiment = self.sentiment_analyzer(text[:512])[0]

        # Readability metrics
        sentences = len(text.split('.'))
        words = len(text.split())
        avg_sentence_length = words / sentences if sentences > 0 else 0

        return {
            'content_score': float(content_score),
            'sentiment': sentiment['label'],
            'sentiment_confidence': sentiment['score'],
            'avg_sentence_length': avg_sentence_length,
            # Simplified Flesch-style score: sentences/words stands in for the
            # syllables/words term of the full Flesch reading-ease formula
            'readability_score': 206.835 - 1.015 * avg_sentence_length - 84.6 * (sentences / words)
        }

    def comprehensive_crawl_with_ml(self, seed_urls):
        """Enhanced crawl with ML analysis"""
        results = self.comprehensive_crawl(seed_urls)

        # Add ML analysis to each page
        for page in results.get('sample_pages', []):
            page_data = self.database.get_page_content(page['url'])
            if page_data:
                ml_analysis = self.analyze_content_quality(page_data)
                page.update(ml_analysis)

        return results
# Usage
config = {
    'delay': 1.0,
    'max_pages': 50,
    'db_path': 'ml_crawler.db'
}
ml_crawler = MLAnalysisCrawler(config)
results = ml_crawler.comprehensive_crawl_with_ml(['https://example.com'])
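The ML fields added to each sampled page can then be inspected. A short sketch using the field names defined in analyze_content_quality above:

for page in results.get('sample_pages', []):
    if 'sentiment' in page:
        print(f"{page['url']}: {page['sentiment']} "
              f"({page['sentiment_confidence']:.2f}), "
              f"readability {page['readability_score']:.1f}")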
#!/usr/bin/env python3
import multiprocessing as mp
import queue
import time
from crawler import EnhancedProductionCrawler
class DistributedCrawler:
    def __init__(self, num_workers=4):
        self.num_workers = num_workers
        self.url_queue = mp.Queue()
        self.result_queue = mp.Queue()
        self.workers = []

    def worker_process(self, worker_id, config):
        """Individual crawler worker"""
        crawler = EnhancedProductionCrawler(config)

        while True:
            try:
                url = self.url_queue.get(timeout=5)
                print(f"Worker {worker_id}: Processing {url}")

                # Crawl single page
                page_data = crawler.crawl_page_content(url)
                if page_data:
                    self.result_queue.put(page_data)
            except queue.Empty:
                break
            except Exception as e:
                print(f"Worker {worker_id} error: {e}")

    def distribute_crawl(self, urls, config):
        """Distribute crawling across multiple processes"""
        # Start worker processes
        for i in range(self.num_workers):
            worker_config = config.copy()
            worker_config['db_path'] = f'worker_{i}.db'

            p = mp.Process(target=self.worker_process, args=(i, worker_config))
            p.start()
            self.workers.append(p)

        # Add URLs to queue
        for url in urls:
            self.url_queue.put(url)

        # Collect results
        results = []
        for _ in range(len(urls)):
            try:
                result = self.result_queue.get(timeout=30)
                results.append(result)
            except queue.Empty:
                break

        # Stop workers
        for p in self.workers:
            p.terminate()
            p.join()

        return results
# Usage (the __main__ guard is required so spawned child processes can safely re-import this module)
if __name__ == "__main__":
    config = {
        'delay': 1.0,
        'max_depth': 1,  # Single page per worker
        'stay_on_domain': True
    }

    distributed_crawler = DistributedCrawler(num_workers=4)
    urls = [f'https://example.com/page{i}' for i in range(20)]
    results = distributed_crawler.distribute_crawl(urls, config)

    print(f"Distributed crawl completed: {len(results)} pages")