🚀 Examples & Tutorials

Learn how to use RatCrawler with practical examples

⚡ Quick Start Guide

Get up and running with RatCrawler in minutes

Python Quick Start

1. Basic Web Crawling

#!/usr/bin/env python3
from crawler import EnhancedProductionCrawler

# Configure crawler
config = {
    'delay': 1.0,
    'max_depth': 2,
    'max_pages': 50,
    'db_path': 'my_crawler.db'
}

# Initialize and run
crawler = EnhancedProductionCrawler(config)
seed_urls = ['https://example.com', 'https://python.org']

results = crawler.comprehensive_crawl(seed_urls)
print(f"āœ… Crawled {results['pages_crawled']} pages")

2. Backlink Analysis

from backlinkprocessor import BacklinkProcessor

# Analyze backlinks
processor = BacklinkProcessor(delay=1.0)
processor.crawl_backlinks(['https://example.com'])

# Get results
pagerank = processor.calculate_pagerank()
processor.calculate_domain_authority()

print(f"Found {len(processor.backlinks)} backlinks")

Rust Quick Start

1. Async Web Crawling

use ratcrawler::*;
use tokio;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let config = CrawlConfig {
        user_agent: "MyCrawler/1.0".to_string(),
        max_pages: 100,
        delay_ms: 1000,
        ..Default::default()
    };

    let mut crawler = WebsiteCrawler::new(&config);
    let mut db = WebsiteCrawlerDatabase::new("crawl.db")?;

    let urls = vec!["https://example.com".to_string()];
    let result = crawler.crawl(urls, &mut db).await?;

    println!("Crawled {} pages successfully!", result.pages_crawled);
    Ok(())
}

2. Command Line Usage

# Build the project
cargo build --release

# Crawl a website
./target/release/rat-crawler crawl https://example.com

# Analyze backlinks
./target/release/rat-crawler backlinks https://example.com

# Integrated analysis
./target/release/rat-crawler integrated https://example.com

📖 Step-by-Step Tutorials

Detailed guides for common use cases

Website Content Analysis

Extract and analyze content from websites

šŸ Python Implementation

from crawler import EnhancedProductionCrawler

def analyze_website(url):
    # Configure crawler
    config = {
        'delay': 1.5,
        'max_depth': 3,
        'max_pages': 100,
        'analyze_backlinks': True,
        'export_json': True
    }

    # Initialize crawler (backlink analysis is enabled via the config above)
    crawler = EnhancedProductionCrawler(config)

    # Comprehensive analysis
    results = crawler.comprehensive_crawl([url])

    # Extract insights
    print(f"Pages crawled: {results['pages_crawled']}")
    print(f"Backlinks found: {results['backlinks_found']}")
    print(f"Avg word count: {results['avg_word_count']:.0f}")

    return results

# Usage
results = analyze_website('https://example.com')

🦀 Rust Implementation

use ratcrawler::*;
use tokio;

async fn analyze_website(url: &str) -> Result<(), Box<dyn std::error::Error>> {
    // Configure crawler
    let config = CrawlConfig {
        user_agent: "WebsiteAnalyzer/1.0".to_string(),
        max_pages: 100,
        max_depth: 3,
        respect_robots_txt: true,
        ..Default::default()
    };

    // Initialize crawler
    let mut crawler = WebsiteCrawler::new(&config);
    let mut database = WebsiteCrawlerDatabase::new("analysis.db")?;

    // Crawl website
    let result = crawler.crawl(vec![url.to_string()], &mut database).await?;

    // Analyze backlinks
    let backlink_config = BacklinkConfig {
        user_agent: "BacklinkAnalyzer/1.0".to_string(),
        timeout_secs: 30,
        max_redirects: 5,
    };

    let processor = BacklinkProcessor::new(
        backlink_config.user_agent,
        backlink_config.timeout_secs,
        backlink_config.max_redirects,
    );

    let analysis = processor.analyze_backlinks(url).await?;

    println!("Analysis complete:");
    println!("  Pages: {}", result.pages_crawled);
    println!("  Backlinks: {}", analysis.total_backlinks);

    Ok(())
}

SEO & Backlink Analysis

Analyze SEO metrics and backlink profiles

from backlinkprocessor import BacklinkProcessor

def seo_analysis(target_url):
    processor = BacklinkProcessor(delay=1.0)

    # Discover backlinks
    processor.crawl_backlinks([target_url], max_depth=2)

    # Build link graph
    processor.build_link_graph()

    # Calculate SEO metrics
    pagerank_scores = processor.calculate_pagerank()
    processor.calculate_domain_authority()

    # Detect spam links
    spam_links = processor.detect_link_spam()

    # Generate comprehensive report
    report = processor.generate_backlink_report(target_url)

    print("SEO Analysis Results:")
    print(f"  Total backlinks: {report['total_backlinks']}")
    print(f"  Unique domains: {report['unique_referring_domains']}")
    print(f"  Domain authority: {report['domain_authority']:.1f}")
    print(f"  Spam links detected: {len(spam_links)}")

    return report

# Analyze your website
report = seo_analysis('https://yourwebsite.com')

Automated Scheduled Crawling

Set up automated crawling with scheduling

#!/usr/bin/env python3
import schedule
import time
from datetime import datetime
from crawler import EnhancedProductionCrawler

def scheduled_crawl():
    """Perform scheduled comprehensive crawl"""
    print(f"šŸ•·ļø Starting scheduled crawl at {datetime.now()}")

    config = {
        'delay': 1.5,
        'max_depth': 3,
        'max_pages': 200,
        'db_path': 'scheduled_crawler.db',
        'analyze_backlinks': True,
        'export_json': True
    }

    crawler = EnhancedProductionCrawler(config)
    seed_urls = [
        'https://example.com',
        'https://python.org',
        'https://github.com'
    ]

    try:
        results = crawler.comprehensive_crawl(seed_urls)
        print(f"āœ… Crawl completed: {results['pages_crawled']} pages")
    except Exception as e:
        print(f"āŒ Crawl failed: {e}")

def main():
    """Main scheduler function"""
    print("šŸš€ Starting RatCrawler Scheduler")

    # Schedule daily crawl at 2 AM
    schedule.every().day.at("02:00").do(scheduled_crawl)

    # Schedule weekly comprehensive analysis on Sundays
    schedule.every().sunday.at("03:00").do(lambda: print("📊 Weekly analysis would run here"))

    print("ā° Scheduler active. Waiting for scheduled tasks...")
    print("Press Ctrl+C to stop")

    try:
        while True:
            schedule.run_pending()
            time.sleep(60)  # Check every minute
    except KeyboardInterrupt:
        print("\nšŸ›‘ Scheduler stopped by user")

if __name__ == "__main__":
    main()

💼 Real-World Use Cases

Practical applications of RatCrawler

SEO Monitoring

Monitor website SEO metrics, track backlinks, and analyze competitor strategies.

  • Track PageRank changes
  • Monitor domain authority
  • Analyze competitor backlinks
  • Detect new linking domains
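
For example, PageRank and domain authority changes can be tracked by saving a metrics snapshot on each run and comparing it with the previous one. The sketch below is illustrative: it reuses the report keys from the SEO analysis tutorial above, and snapshot_seo_metrics / seo_history.json are example names.

import json
from datetime import date
from backlinkprocessor import BacklinkProcessor

def snapshot_seo_metrics(target_url, history_file='seo_history.json'):
    """Record today's backlink metrics and compare them with the last run."""
    processor = BacklinkProcessor(delay=1.0)
    processor.crawl_backlinks([target_url], max_depth=2)
    processor.build_link_graph()
    processor.calculate_pagerank()
    processor.calculate_domain_authority()

    report = processor.generate_backlink_report(target_url)
    snapshot = {
        'date': date.today().isoformat(),
        'total_backlinks': report['total_backlinks'],
        'unique_referring_domains': report['unique_referring_domains'],
        'domain_authority': report['domain_authority']
    }

    # Load previous snapshots (if any) and report the changes
    try:
        with open(history_file) as f:
            history = json.load(f)
    except FileNotFoundError:
        history = []

    if history:
        previous = history[-1]
        print(f"Backlinks: {previous['total_backlinks']} -> {snapshot['total_backlinks']}")
        print(f"Domain authority: {previous['domain_authority']:.1f} -> {snapshot['domain_authority']:.1f}")

    history.append(snapshot)
    with open(history_file, 'w') as f:
        json.dump(history, f, indent=2)

    return snapshot

# Run daily, e.g. from the scheduler example above
snapshot_seo_metrics('https://yourwebsite.com')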

Content Aggregation

Collect and analyze content from news sites, blogs, and industry publications.

  • Automated content discovery
  • News trend analysis
  • Content quality assessment
  • Topic clustering
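
As a minimal sketch of automated content discovery, the example below crawls a few publications and counts keyword mentions in the stored text. It assumes the sample_pages and database.get_page_content interfaces used in the advanced ML example later on this page; the URLs and keywords are placeholders.

from collections import Counter
from crawler import EnhancedProductionCrawler

def aggregate_content(feed_urls, keywords):
    """Crawl a set of publications and count keyword mentions in the stored text."""
    config = {
        'delay': 1.5,
        'max_depth': 2,
        'max_pages': 100,
        'db_path': 'aggregation.db'
    }
    crawler = EnhancedProductionCrawler(config)
    results = crawler.comprehensive_crawl(feed_urls)

    # Count keyword mentions across the crawled pages
    mentions = Counter()
    for page in results.get('sample_pages', []):
        page_data = crawler.database.get_page_content(page['url'])
        text = (page_data or {}).get('content_text', '').lower()
        for keyword in keywords:
            mentions[keyword] += text.count(keyword.lower())

    print(f"Aggregated {results['pages_crawled']} pages")
    for keyword, count in mentions.most_common():
        print(f"  {keyword}: {count} mentions")
    return mentions

# Track coverage of a few topics across two publications
aggregate_content(
    ['https://news.example.com', 'https://blog.example.com'],
    ['python', 'rust', 'web crawling']
)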

Market Research

Research market trends, analyze competitors, and gather industry insights.

  • Competitor website analysis
  • Industry trend monitoring
  • Product feature tracking
  • Pricing strategy analysis
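
A simple way to start competitor analysis is to crawl each competitor domain in isolation and compare basic content metrics. The sketch below is a rough illustration; the domains and database names are placeholders.

from crawler import EnhancedProductionCrawler

def compare_competitors(domains):
    """Crawl each competitor site separately and tabulate basic content metrics."""
    summary = []
    for domain in domains:
        config = {
            'delay': 1.5,
            'max_depth': 2,
            'max_pages': 50,
            'stay_on_domain': True,
            'db_path': f"market_{domain.split('//')[-1].replace('/', '_')}.db"
        }
        crawler = EnhancedProductionCrawler(config)
        results = crawler.comprehensive_crawl([domain])
        summary.append({
            'domain': domain,
            'pages': results['pages_crawled'],
            'avg_words': results['avg_word_count']
        })

    # Print a small comparison table, largest average page size first
    print(f"{'Domain':<40} {'Pages':>6} {'Avg words':>10}")
    for row in sorted(summary, key=lambda r: r['avg_words'], reverse=True):
        print(f"{row['domain']:<40} {row['pages']:>6} {row['avg_words']:>10.0f}")
    return summary

compare_competitors([
    'https://competitor-a.example.com',
    'https://competitor-b.example.com'
])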

Security Research

Identify security vulnerabilities and monitor for malicious content.

  • Malware detection
  • Phishing site identification
  • Security vulnerability scanning
  • Dark web monitoring
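
The sketch below shows only the crawling side of such a workflow: it scans a watchlist and flags pages containing a few phishing-style phrases. The keyword heuristic is a toy stand-in; real malware or phishing detection needs curated threat-intelligence feeds and proper authorization for any scanning you do.

from crawler import EnhancedProductionCrawler

# Toy indicator list for illustration only
SUSPICIOUS_PHRASES = ['verify your account', 'password expired', 'urgent payment']

def flag_suspicious_pages(watchlist_urls):
    """Crawl a watchlist and flag pages containing simple phishing-style phrases."""
    config = {
        'delay': 2.0,
        'max_depth': 1,
        'max_pages': 50,
        'db_path': 'security_watch.db'
    }
    crawler = EnhancedProductionCrawler(config)
    results = crawler.comprehensive_crawl(watchlist_urls)

    flagged = []
    for page in results.get('sample_pages', []):
        page_data = crawler.database.get_page_content(page['url'])
        text = (page_data or {}).get('content_text', '').lower()
        hits = [phrase for phrase in SUSPICIOUS_PHRASES if phrase in text]
        if hits:
            flagged.append((page['url'], hits))

    for url, hits in flagged:
        print(f"⚠️ {url}: {', '.join(hits)}")
    return flagged

flag_suspicious_pages(['https://watchlist.example.com'])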

Academic Research

Collect data for academic studies, web science research, and network analysis.

  • Web graph analysis
  • Link network studies
  • Content analysis research
  • Digital humanities projects
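
A minimal web-graph study might look like the sketch below. It assumes build_link_graph() produces a directed networkx graph stored on the processor; the link_graph attribute name is a guess, so adjust it to match your BacklinkProcessor version.

import networkx as nx
from backlinkprocessor import BacklinkProcessor

def study_link_network(seed_urls):
    """Build a link graph from discovered backlinks and compute basic network statistics."""
    processor = BacklinkProcessor(delay=1.0)
    processor.crawl_backlinks(seed_urls, max_depth=2)
    processor.build_link_graph()

    # NOTE: link_graph is an assumed attribute name for the graph built by
    # build_link_graph(); rename it to match the actual implementation.
    graph = processor.link_graph

    print(f"Nodes: {graph.number_of_nodes()}, Edges: {graph.number_of_edges()}")
    print(f"Density: {nx.density(graph):.4f}")

    # The in-degree distribution is a common starting point for web-graph studies
    in_degrees = sorted((degree for _, degree in graph.in_degree()), reverse=True)
    print(f"Top in-degrees: {in_degrees[:10]}")
    return graph

study_link_network(['https://example.com'])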

AI Training Data

Collect diverse web content for training machine learning models.

  • Text corpus collection
  • Multi-domain datasets
  • Language model training
  • Content classification data
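
A corpus-collection sketch, assuming the sample_pages and database.get_page_content interfaces used in the other examples: it writes one JSON document per crawled page to a JSONL file, skipping thin pages. For a full corpus you would read every stored page from the crawl database rather than only the sampled ones.

import json
from crawler import EnhancedProductionCrawler

def build_text_corpus(seed_urls, output_path='corpus.jsonl', min_words=100):
    """Crawl seed sites and write extracted page text to a JSONL corpus file."""
    config = {
        'delay': 1.0,
        'max_depth': 2,
        'max_pages': 200,
        'db_path': 'corpus.db'
    }
    crawler = EnhancedProductionCrawler(config)
    results = crawler.comprehensive_crawl(seed_urls)

    kept = 0
    with open(output_path, 'w', encoding='utf-8') as out:
        for page in results.get('sample_pages', []):
            page_data = crawler.database.get_page_content(page['url'])
            text = (page_data or {}).get('content_text', '')
            if len(text.split()) < min_words:
                continue  # skip thin pages that add little training signal
            out.write(json.dumps({'url': page['url'], 'text': text}) + '\n')
            kept += 1

    print(f"Wrote {kept} documents to {output_path}")
    return output_path

build_text_corpus(['https://example.com', 'https://python.org'])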

🔬 Advanced Examples

Complex implementations and custom integrations

Custom Crawler with ML Analysis

import torch
import transformers
from crawler import EnhancedProductionCrawler
from sklearn.feature_extraction.text import TfidfVectorizer

class MLAnalysisCrawler(EnhancedProductionCrawler):
    def __init__(self, config):
        super().__init__(config)
        self.vectorizer = TfidfVectorizer(max_features=1000)
        self.sentiment_analyzer = transformers.pipeline('sentiment-analysis')

    def analyze_content_quality(self, page_data):
        """Analyze content using ML models"""
        if not page_data.get('content_text'):
            return {}

        # TF-IDF analysis
        tfidf_matrix = self.vectorizer.fit_transform([page_data['content_text']])
        content_score = tfidf_matrix.sum() / len(page_data['content_text'].split())

        # Sentiment analysis
        sentiment = self.sentiment_analyzer(page_data['content_text'][:512])[0]

        # Readability metrics (rough Flesch-style approximation; syllable
        # counts are not computed, so sentences/words stands in for them)
        sentences = max(len(page_data['content_text'].split('.')), 1)
        words = max(len(page_data['content_text'].split()), 1)
        avg_sentence_length = words / sentences

        return {
            'content_score': float(content_score),
            'sentiment': sentiment['label'],
            'sentiment_confidence': sentiment['score'],
            'avg_sentence_length': avg_sentence_length,
            'readability_score': 206.835 - 1.015 * (words / sentences) - 84.6 * (sentences / words)
        }

    def comprehensive_crawl_with_ml(self, seed_urls):
        """Enhanced crawl with ML analysis"""
        results = self.comprehensive_crawl(seed_urls)

        # Add ML analysis to each page
        for page in results.get('sample_pages', []):
            page_data = self.database.get_page_content(page['url'])
            if page_data:
                ml_analysis = self.analyze_content_quality(page_data)
                page.update(ml_analysis)

        return results

# Usage
config = {
    'delay': 1.0,
    'max_pages': 50,
    'db_path': 'ml_crawler.db'
}

ml_crawler = MLAnalysisCrawler(config)
results = ml_crawler.comprehensive_crawl_with_ml(['https://example.com'])

Distributed Crawling Setup

#!/usr/bin/env python3
import multiprocessing as mp
import queue
import time
from crawler import EnhancedProductionCrawler

class DistributedCrawler:
    def __init__(self, num_workers=4):
        self.num_workers = num_workers
        self.url_queue = mp.Queue()
        self.result_queue = mp.Queue()
        self.workers = []

    def worker_process(self, worker_id, config):
        """Individual crawler worker"""
        crawler = EnhancedProductionCrawler(config)

        while True:
            try:
                url = self.url_queue.get(timeout=5)
                print(f"Worker {worker_id}: Processing {url}")

                # Crawl single page
                page_data = crawler.crawl_page_content(url)
                if page_data:
                    self.result_queue.put(page_data)

            except queue.Empty:
                break
            except Exception as e:
                print(f"Worker {worker_id} error: {e}")

    def distribute_crawl(self, urls, config):
        """Distribute crawling across multiple processes"""
        # Start worker processes
        for i in range(self.num_workers):
            worker_config = config.copy()
            worker_config['db_path'] = f'worker_{i}.db'

            p = mp.Process(target=self.worker_process, args=(i, worker_config))
            p.start()
            self.workers.append(p)

        # Add URLs to queue
        for url in urls:
            self.url_queue.put(url)

        # Collect results
        results = []
        for _ in range(len(urls)):
            try:
                result = self.result_queue.get(timeout=30)
                results.append(result)
            except queue.Empty:
                break

        # Stop workers
        for p in self.workers:
            p.terminate()
            p.join()

        return results

# Usage
config = {
    'delay': 1.0,
    'max_depth': 1,  # Single page per worker
    'stay_on_domain': True
}

distributed_crawler = DistributedCrawler(num_workers=4)
urls = [f'https://example.com/page{i}' for i in range(20)]

results = distributed_crawler.distribute_crawl(urls, config)
print(f"Distributed crawl completed: {len(results)} pages")