#!/usr/bin/env python3
"""
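Fetch publication records as JSON from the Politecnico di Torino IRIS export
API and store them in PostgreSQL.

Each invocation performs a single fetch-and-store pass. The script is meant to
be run periodically (every 24 hours); this file contains no scheduling logic,
so the schedule has to be provided externally.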
"""

import os
import requests
import json
import logging
from datetime import datetime

import psycopg2
from psycopg2 import sql
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the
# os.getenv() lookups for the DB_* settings further down can see them.
load_dotenv()

# Root logging setup: INFO and above, timestamped "time - level - message" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Endpoint that serves the JSON export consumed by fetch_data().
URL = "https://www.swas.polito.it/dotnet/WMHandler/IrisEsportaJson.ashx?rp=rp21802"

# PostgreSQL connection settings. Each value is read from the environment and
# falls back to a built-in default when the variable is not set.
# NOTE(review): the fallback user/password are hard-coded in source; prefer
# supplying DB_USER / DB_PASSWORD through the environment in real deployments.
DB_CONFIG = dict(
    host=os.environ.get('DB_HOST', 'db'),
    port=os.environ.get('DB_PORT', '5432'),
    user=os.environ.get('DB_USER', 'wuyuxuan'),
    password=os.environ.get('DB_PASSWORD', '1234567890'),
    database=os.environ.get('DB_NAME', 'postgres'),
)


def get_db_connection():
    """
    Open a new PostgreSQL connection using the settings in DB_CONFIG.

    Connection failures are logged rather than raised.

    Returns:
        The open psycopg2 connection, or None if psycopg2 reported an error
        while connecting.
    """
    settings = DB_CONFIG

    try:
        connection = psycopg2.connect(
            host=settings['host'],
            port=settings['port'],
            user=settings['user'],
            password=settings['password'],
            database=settings['database'],
        )
    except psycopg2.Error as e:
        logger.error(f"Failed to connect to database: {e}")
        return None

    # Only reached when connect() succeeded.
    logger.info(f"Successfully connected to database: {settings['host']}:{settings['port']}/{settings['database']}")
    return connection


def test_db_connection():
    """
    Check that the database is reachable by running a trivial query.

    Opens a connection via get_db_connection(), runs ``SELECT version()``,
    logs the reported server version and closes the connection again.

    Returns:
        bool: True if the connection and the query both succeeded,
        False if the connection could not be opened or the query failed.
    """
    conn = get_db_connection()
    if not conn:
        return False

    try:
        # The cursor context manager closes the cursor even if execute() fails.
        with conn.cursor() as cursor:
            cursor.execute("SELECT version();")
            version = cursor.fetchone()
        logger.info(f"PostgreSQL version: {version[0]}")
        return True
    except psycopg2.Error as e:
        logger.error(f"Database test query failed: {e}")
        return False
    finally:
        # Always release the connection; previously it leaked when the
        # test query raised.
        conn.close()

def fetch_data():
    """
    Download the JSON export from URL and return the decoded payload.

    The HTTP request and the JSON decoding are guarded separately so that a
    malformed body is reported as a parse failure rather than as a generic
    request failure. All failures are logged, not raised.

    Returns:
        The decoded JSON value (whatever ``response.json()`` yields), or None
        if the request timed out, failed, returned an error status, or the
        body was not valid JSON.
    """
    logger.info(f"Fetching data from: {URL}")

    try:
        response = requests.get(URL, timeout=60)
        response.raise_for_status()
    except requests.exceptions.Timeout:
        # Timeout is a RequestException subclass, so it must be caught first.
        logger.error("Request timed out")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Request failed: {e}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        # json.JSONDecodeError and requests' own JSONDecodeError are both
        # ValueError subclasses, so this covers old and new Requests versions.
        logger.error(f"Failed to parse JSON response: {e}")
        return None

    logger.info(f"Successfully fetched data at {datetime.now().isoformat()}")
    logger.info(f"Data type: {type(data)}, Length: {len(data) if isinstance(data, (list, dict)) else 'N/A'}")

    return data


def parse_author_name(full_name):
    """
    Split an author's full name into given name and surname.

    Two layouts are recognised:
      * 'SURNAME, NAME' - everything before the first comma is the surname.
      * 'NAME SURNAME'  - the last whitespace-separated word is the surname.

    Args:
        full_name: The name string to split; may be empty or None.

    Returns:
        tuple: (name, surname). Both are '' for empty input; a single word
        without a comma is returned as the name with an empty surname.
    """
    if not full_name:
        return ('', '')

    cleaned = full_name.strip()

    # Comma form: split on the first comma only, later commas stay in the name.
    if ',' in cleaned:
        family, _, given = cleaned.partition(',')
        return (given.strip(), family.strip())

    # Space form: fewer than two words means there is no surname to peel off.
    words = cleaned.split()
    if len(words) < 2:
        return (cleaned, '')

    *given_words, family = words
    return (' '.join(given_words), family)


def insert_publication(cursor, record):
    """
    Upsert one publication, keyed by its handle, skipping unchanged records.

    A record is considered unchanged when both the stored
    source_last_modified and the record's lastModified are present and the
    record's value is not greater than the stored one.
    NOTE(review): this assumes the two timestamps are mutually comparable
    (same type/format) - confirm against the column type and API payload.

    Args:
        cursor: Database cursor used for all statements.
        record: One publication record (dict) from the API payload.

    Returns:
        tuple: (publication_id, needs_author_update). The flag is True when
        the row was inserted or updated and False when it was skipped.
    """
    lookup_values = record.get('lookupValues') or {}
    collection = record.get('collection')

    handle = record.get('handle', '')
    modified_at = record.get('lastModified')

    # Column values shared by INSERT and UPDATE, in the order:
    # title, publication_year, venue, type, state, doi, url,
    # citation_raw, abstract, source_last_modified
    details = (
        lookup_values.get('title') or record.get('name', ''),
        lookup_values.get('year', ''),
        lookup_values.get('jtitle') or lookup_values.get('book') or '',
        collection.get('name', '') if collection else '',
        'archived' if record.get('archived') else 'in_progress',
        lookup_values.get('doi', ''),
        f"https://iris.polito.it/handle/{handle}" if handle else '',
        lookup_values.get('citation', ''),
        lookup_values.get('summary', ''),
        modified_at,
    )

    # Look up the stored row (if any) together with its last-modified marker.
    cursor.execute(
        "SELECT id, source_last_modified FROM publication_schema.publications WHERE handle = %s",
        (handle,)
    )
    row = cursor.fetchone()

    if not row:
        # Unknown handle: create the publication.
        cursor.execute("""
            INSERT INTO publication_schema.publications 
            (handle, title, publication_year, venue, type, state, doi, url, 
             citation_raw, abstract, source_last_modified, reveiwed)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, false)
            RETURNING id
        """, (handle, *details))
        new_id = cursor.fetchone()[0]
        logger.info(f"Inserted new publication: {handle}")
        return (new_id, True)

    existing_id, stored_modified = row

    # Missing timestamps on either side force an update.
    unchanged = stored_modified and modified_at and modified_at <= stored_modified
    if not unchanged:
        cursor.execute("""
            UPDATE publication_schema.publications 
            SET title = %s, publication_year = %s, venue = %s, type = %s, 
                state = %s, doi = %s, url = %s, citation_raw = %s, 
                abstract = %s, source_last_modified = %s
            WHERE handle = %s
            RETURNING id
        """, (*details, handle))
        existing_id = cursor.fetchone()[0]
        logger.info(f"Updated publication: {handle}")
        return (existing_id, True)

    logger.debug(f"Skipped (no changes): {handle}")
    return (existing_id, False)


def insert_author(cursor, full_name, is_internal=False, authority=None):
    """
    Return the ID of the author matching full_name, creating it if needed.

    Authors are matched on the exact (name, surname) pair produced by
    parse_author_name(). For an internal author with an authority ID, an
    existing row gets its iris_profile filled in only when it is currently
    NULL or empty; a new row is created with the profile already set. New
    authors are inserted with active = false.

    Args:
        cursor: Database cursor used for all statements.
        full_name: Author's full name as it appears in the contributor list.
        is_internal: Whether this is an internal author.
        authority: Authority ID used to build the iris_profile URL.

    Returns:
        The author's ID as returned by the database.
    """
    name, surname = parse_author_name(full_name)

    # Only internal authors with a known authority get a profile URL.
    profile_url = (
        f"https://iris.polito.it/cris/rp/{authority}"
        if is_internal and authority
        else None
    )

    cursor.execute(
        "SELECT id FROM innuce_schema.authors WHERE name = %s AND surname = %s",
        (name, surname)
    )
    match = cursor.fetchone()

    if match:
        found_id = match[0]
        if profile_url is not None:
            # Back-fill the profile without overwriting an existing value.
            cursor.execute("""
                UPDATE innuce_schema.authors 
                SET iris_profile = %s 
                WHERE id = %s AND (iris_profile IS NULL OR iris_profile = '')
            """, (profile_url, found_id))
        return found_id

    # No match: create the author.
    cursor.execute("""
        INSERT INTO innuce_schema.authors 
        (name, surname, iris_profile, active)
        VALUES (%s, %s, %s, false)
        RETURNING id
    """, (name, surname, profile_url))
    created_id = cursor.fetchone()[0]
    logger.info(f"Inserted author: {name} {surname}")

    return created_id


def insert_publication_author(cursor, publication_id, author_id, full_name, author_rank, is_internal):
    """
    Link an author to a publication unless the link is already recorded.

    A link counts as existing when a row with the same publication_id and
    full_name is found; in that case nothing is written (rank and flags of
    the existing row are left untouched). New links are stored with
    is_corresponding = false.

    Args:
        cursor: Database cursor used for all statements.
        publication_id: ID of the publication.
        author_id: ID of the author, stored in the team_member_id column.
        full_name: Author's full name as listed on the publication.
        author_rank: Position of the author in the author list.
        is_internal: Whether this is an internal author.
    """
    link_key = (publication_id, full_name)
    cursor.execute("""
        SELECT id FROM publication_schema.publication_authors 
        WHERE publication_id = %s AND full_name = %s
    """, link_key)

    if cursor.fetchone():
        # Relationship is already present; nothing to do.
        return

    link_row = (publication_id, author_id, full_name, author_rank, is_internal)
    cursor.execute("""
        INSERT INTO publication_schema.publication_authors 
        (publication_id, team_member_id, full_name, author_rank, is_corresponding, is_internal)
        VALUES (%s, %s, %s, %s, false, %s)
    """, link_row)


def _store_record_authors(cursor, record, pub_id):
    """Insert every contributor of one record and link them to publication pub_id."""
    # Upper-cased internal author name -> authority ID, used to flag
    # contributors as internal. A null author name is treated as ''.
    internal_authors = {
        (ia.get('author') or '').upper(): ia.get('authority')
        for ia in record.get('internalAuthors') or []
    }

    lookup = record.get('lookupValues') or {}
    contributors_str = lookup.get('contributors') or ''

    # Contributors arrive as one ';'-separated string. The rank is the position
    # in that raw list, so an empty segment still consumes a rank.
    for rank, contributor in enumerate(contributors_str.split(';'), start=1):
        contributor = contributor.strip()
        if not contributor:
            continue

        contributor_upper = contributor.upper()
        is_internal = contributor_upper in internal_authors
        authority = internal_authors.get(contributor_upper)

        author_id = insert_author(cursor, contributor, is_internal, authority)
        insert_publication_author(
            cursor, pub_id, author_id, contributor, rank, is_internal
        )


def process_and_store_data(data):
    """
    Store the fetched records in the database in a single transaction.

    Each record is passed to insert_publication(); records it reports as
    unchanged are skipped, all others also get their authors stored. The
    transaction is committed only after every record was processed and is
    rolled back on a psycopg2 error. The connection is always closed.

    Args:
        data: Decoded JSON payload; must contain a 'records' entry.

    Returns:
        bool: True if all records were processed and committed, False if the
        payload has no records, the connection failed, or a database error
        occurred. Non-database exceptions propagate to the caller.
    """
    if not data or 'records' not in data:
        logger.error("No records found in data")
        return False

    conn = get_db_connection()
    if not conn:
        return False

    try:
        # The cursor is created inside the with-statement, so a failure in
        # conn.cursor() can no longer trigger a close() on an unbound name.
        with conn.cursor() as cursor:
            records = data.get('records') or []
            logger.info(f"Processing {len(records)} records...")

            # A single counter for inserted + updated rows: insert_publication()
            # does not say which of the two happened, and only the sum is logged.
            changed_count = 0
            skipped_count = 0

            for record in records:
                pub_id, needs_update = insert_publication(cursor, record)

                if not needs_update:
                    # Unchanged publication - its authors are left as they are.
                    skipped_count += 1
                    continue

                changed_count += 1
                _store_record_authors(cursor, record, pub_id)

        conn.commit()
        logger.info(f"Processing complete: {changed_count} changed, {skipped_count} skipped (no changes)")
        return True

    except psycopg2.Error as e:
        logger.error(f"Database error: {e}")
        conn.rollback()
        return False
    finally:
        conn.close()


def main():
    """
    Run one scraper pass: check the database, fetch the data, store it.

    Returns:
        The fetched data (also when storing it failed), or None if the
        database check failed or nothing could be fetched.
    """
    banner = "=" * 50

    logger.info(banner)
    logger.info("Starting web scraper...")
    logger.info(f"Current time: {datetime.now().isoformat()}")

    # Do not bother fetching when the database is unreachable.
    if not test_db_connection():
        logger.error("Database connection failed, aborting")
        return None

    scraped_data = fetch_data()

    if scraped_data is None:
        logger.warning("No data was fetched")
    else:
        logger.info("Data successfully fetched")

        stored = process_and_store_data(scraped_data)
        if not stored:
            logger.error("Failed to store data in database")
        else:
            logger.info("Data successfully stored in database")

    logger.info("Scraper execution completed")
    logger.info(banner)

    return scraped_data


# Run a scraper pass only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
