#!/usr/bin/env python3
"""
Data Harvester Template v6.1
Template for creating data harvesting scripts
Usage: python ai/harvest_data.py
"""

import os
import json
import datetime
import hashlib
from pathlib import Path

# Project root: two levels up from this file (the parent of the directory
# that contains this script).
ROOT = Path(__file__).parent.parent
# Timestamp of this run in UTC with second precision, formatted as
# "YYYY-MM-DDTHH:MM:SSZ". Built from an aware datetime because
# datetime.utcnow() is deprecated since Python 3.12.
NOW = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

def load_json(path):
    """Read the UTF-8 encoded file at *path* and return its parsed JSON content."""
    with open(path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def save_json(path, data):
    """Write *data* to *path* as pretty-printed UTF-8 JSON.

    Args:
        path: Destination file, given as a ``pathlib.Path`` or a plain string.
        data: Any JSON-serializable object.

    Missing parent directories are created first. Output is indented by two
    spaces, non-ASCII characters are written verbatim (not ``\\u`` escaped),
    and a confirmation line is printed once the file has been written.
    """
    # Coerce so that string paths work too, matching what load_json() accepts.
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"✅ Saved: {path}")

def generate_id(text):
    """Return the hexadecimal MD5 digest of *text* as a deterministic identifier."""
    hasher = hashlib.md5()
    hasher.update(text.encode())
    return hasher.hexdigest()

class DataHarvester:
    """Base class for data harvesting.

    Subclasses are expected to override ``harvest``, ``validate_data`` and
    ``transform``. The remaining methods combine new items with the contents
    of an existing JSON file under ``ROOT`` and maintain the
    ``ai/catalog.json``, ``ai/manifest.json`` and ``ai/health.json`` files.
    """

    def __init__(self):
        # Harvested items, and human-readable validation error messages.
        self.data = []
        self.errors = []

    def validate_data(self, item):
        """Validate a single data item.

        Returns True when every required field is present and truthy. On the
        first missing or empty field a message is appended to ``self.errors``
        and False is returned. The base class requires no fields, so it
        accepts every item.
        """
        # Override in subclass
        required_fields = []
        for field in required_fields:
            if field not in item or not item[field]:
                self.errors.append(f"Missing required field: {field}")
                return False
        return True

    def deduplicate(self, existing, new):
        """Return the items of *new* whose ``'id'`` has not been seen before.

        An id counts as seen when it occurs in *existing* or earlier in
        *new*, so an id repeated inside one batch is kept only once (the
        first occurrence wins). Items without an ``'id'`` key are dropped
        because they cannot be compared. Input order is preserved.
        """
        # Override in subclass based on unique identifier
        seen_ids = {item['id'] for item in existing if 'id' in item}

        deduplicated = []
        for item in new:
            if 'id' not in item or item['id'] in seen_ids:
                continue
            # Remember the id so a later duplicate in the same batch is skipped.
            seen_ids.add(item['id'])
            deduplicated.append(item)

        return deduplicated

    def harvest(self, source):
        """Harvest data from *source*; the base implementation returns ``[]``."""
        # Override in subclass
        print(f"Harvesting from: {source}")
        # Implement source-specific harvesting logic
        return []

    def transform(self, raw_data):
        """Transform raw data to federation format.

        The base implementation returns a new list holding the same items,
        unchanged.
        """
        # Override in subclass
        transformed = []
        for item in raw_data:
            # Transform each item
            transformed.append(item)
        return transformed

    def merge(self, target_file, new_data):
        """Merge *new_data* with the items already stored in *target_file*.

        Args:
            target_file: Path of the JSON file, relative to ``ROOT``.
            new_data: Newly harvested items.

        Returns:
            The existing items followed by the de-duplicated new items.
            Nothing is written to disk.

        The file may hold either a list of items or a dict with a ``'data'``
        key. A missing, unreadable or differently shaped file counts as
        having no existing items.
        """
        target_path = ROOT / target_file

        existing_data = []
        try:
            existing = load_json(target_path)
        except FileNotFoundError:
            # Nothing has been stored yet; start from an empty list.
            pass
        except (OSError, ValueError) as exc:
            # ValueError covers malformed JSON and undecodable bytes. Keep the
            # best-effort behaviour, but make the problem visible.
            print(f"⚠️  Warning: could not read {target_path}: {exc}")
        else:
            if isinstance(existing, dict) and 'data' in existing:
                existing_data = existing['data']
            elif isinstance(existing, list):
                existing_data = existing

        # Deduplicate
        unique_new = self.deduplicate(existing_data, new_data)

        # Merge
        merged = existing_data + unique_new

        print(f"📊 Existing: {len(existing_data)}, New: {len(unique_new)}, Total: {len(merged)}")

        return merged

    def update_catalog(self, dataset_name, dataset_path):
        """Add a dataset entry to ``ai/catalog.json`` unless its URL is listed.

        Args:
            dataset_name: Display name; also the seed for the entry's identifier.
            dataset_path: Site-relative path of the dataset, with or without a
                leading slash.

        A missing or unreadable catalog is replaced by a fresh skeleton. The
        file is only rewritten when a new entry is added.
        """
        catalog_path = ROOT / 'ai' / 'catalog.json'

        try:
            catalog = load_json(catalog_path)
        except (OSError, ValueError):
            catalog = {
                "@context": "https://schema.org",
                "@type": "DataCatalog",
                "name": "AI Website Systems Dataset Catalog",
                "url": "https://www.aiwebsitesystems.com/ai/catalog.json",
                "version": "6.1",
                "dataset": []
            }

        # A catalog loaded from disk may lack the list; create it on demand.
        datasets = catalog.setdefault('dataset', [])

        # Check if dataset already exists
        url_path = dataset_path if dataset_path.startswith('/') else '/' + dataset_path
        dataset_url = f"https://www.aiwebsitesystems.com{url_path}"
        already_listed = any(ds.get('url', '') == dataset_url for ds in datasets)

        if not already_listed:
            datasets.append({
                "@type": "Dataset",
                "name": dataset_name,
                "description": f"Harvested dataset: {dataset_name}",
                "url": dataset_url,
                "dateModified": NOW,
                "identifier": generate_id(dataset_name),
                "encodingFormat": "application/json"
            })

            catalog['dateModified'] = NOW
            save_json(catalog_path, catalog)
            print(f"📚 Added {dataset_name} to catalog")

    def update_manifest(self, dataset_name, dataset_path):
        """Add a dataset entry to ``ai/manifest.json`` unless its name is listed.

        Unlike the catalog, the manifest is never created here: when it is
        missing or unreadable a warning is printed and nothing is written.
        """
        manifest_path = ROOT / 'ai' / 'manifest.json'

        try:
            manifest = load_json(manifest_path)
        except FileNotFoundError:
            print("⚠️  Warning: manifest.json not found")
            return
        except (OSError, ValueError) as exc:
            # The file exists but cannot be read or parsed; say so rather
            # than claiming it is missing.
            print(f"⚠️  Warning: manifest.json could not be read: {exc}")
            return

        datasets = manifest.setdefault('datasets', [])

        # Check if dataset already exists
        existing_names = [ds.get('name', '') for ds in datasets]

        if dataset_name not in existing_names:
            datasets.append({
                "name": dataset_name,
                "description": f"Harvested dataset: {dataset_name}",
                "url": dataset_path,
                "format": "application/json"
            })

            manifest['updated_utc'] = NOW
            save_json(manifest_path, manifest)
            print(f"📋 Added {dataset_name} to manifest")

    def update_health(self):
        """Refresh ``ai/health.json`` with the dataset count and a timestamp.

        A missing or unreadable health file is replaced by a default
        document. The dataset count is read from ``ai/catalog.json`` on a
        best-effort basis: when the catalog cannot be read or has an
        unexpected shape, the count is left untouched. The file is always
        rewritten with a new ``updated_utc``.
        """
        health_path = ROOT / 'ai' / 'health.json'

        try:
            health = load_json(health_path)
        except (OSError, ValueError):
            health = {
                "site": "https://www.aiwebsitesystems.com",
                "status": "ok",
                "metrics": {}
            }

        # Update dataset count
        try:
            catalog = load_json(ROOT / 'ai' / 'catalog.json')
            dataset_count = len(catalog.get('dataset', []))
        except (OSError, ValueError, TypeError, AttributeError):
            # Unreadable catalog, or a catalog that is not the expected dict
            # holding a sized 'dataset' value: skip the metric.
            dataset_count = None

        if dataset_count is not None:
            # An existing health file may lack a usable 'metrics' mapping;
            # create one instead of silently dropping the count.
            metrics = health.get('metrics')
            if not isinstance(metrics, dict):
                metrics = health['metrics'] = {}
            metrics['datasetCount'] = dataset_count

        health['updated_utc'] = NOW
        save_json(health_path, health)
        print("💚 Updated health metrics")


# Example: Business Listing Harvester
class BusinessListingHarvester(DataHarvester):
    """Harvest business listings."""

    def validate_data(self, item):
        """Validate a business listing.

        A listing needs truthy ``'name'`` and ``'category'`` values. On the
        first missing or empty field a message is appended to
        ``self.errors`` and False is returned.
        """
        required = ['name', 'category']
        for field in required:
            if field not in item or not item[field]:
                self.errors.append(f"Missing {field} in {item.get('name', 'unknown')}")
                return False
        return True

    def harvest_from_json(self, json_data):
        """Build standard-format listings from an iterable of raw dicts.

        Items that fail ``validate_data`` are skipped (the reason is recorded
        in ``self.errors``). Each accepted item yields a dict whose ``'id'``
        is derived from its name and URL, and whose ``'added_utc'`` is the
        module-level run timestamp.

        Returns:
            The list of converted listings, in input order.
        """
        harvested = []

        for item in json_data:
            if not self.validate_data(item):
                continue

            # A JSON null arrives as None, which cannot be concatenated with
            # the name below; treat it like a missing URL.
            url = item.get('url') or ''

            # Transform to standard format
            listing = {
                "id": generate_id(item.get('name', '') + url),
                "name": item.get('name', ''),
                "category": item.get('category', 'Uncategorized'),
                "description": item.get('description', ''),
                "url": url,
                "karma_score": item.get('karma_score', 0.0),
                "status": item.get('status', 'active'),
                "added_utc": NOW
            }
            harvested.append(listing)

        return harvested


# Example usage
if __name__ == '__main__':
    # Running the template directly only prints usage guidance.
    rule = "=" * 50
    banner_lines = (
        "\n" + rule,
        "  Data Harvester Template v6.1",
        rule + "\n",
        "This is a template for creating data harvesting scripts.",
        "\nTo use:",
        "1. Create a subclass of DataHarvester",
        "2. Override harvest(), validate_data(), and transform() methods",
        "3. Call the harvesting workflow\n",
        "Example: BusinessListingHarvester",
        "  - Validates business listing data",
        "  - Transforms to federation format",
        "  - Merges with existing listings",
        "  - Updates catalog, manifest, and health\n",
        "✅ Template loaded. Create your harvester!\n",
    )
    for banner_line in banner_lines:
        print(banner_line)
