"""
PDF Parser utility to extract text and metadata from academic papers.
"""
from typing import Any, Dict, List

import pdfplumber
import PyPDF2


class PDFParser:
    """Extract text, metadata and a rough section outline from a PDF paper.

    Typical use is ``PDFParser(path).extract_all()`` followed, optionally, by
    ``extract_sections()`` and ``get_summary_info()``, which both work on the
    state that ``extract_all()`` stored on the instance.
    """

    # Lower-case phrases whose presence in a short line marks it as a heading.
    _SECTION_KEYWORDS = (
        'abstract', 'introduction', 'background', 'related work',
        'methodology', 'methods', 'approach', 'implementation',
        'results', 'evaluation', 'discussion', 'conclusion',
        'references', 'acknowledgments', 'acknowledgements',
    )

    # A line must be strictly shorter than this to be considered a heading.
    _MAX_HEADER_LENGTH = 50

    # (key stored in ``self.metadata``, key looked up in the PDF metadata).
    _METADATA_FIELDS = (
        ('title', '/Title'),
        ('author', '/Author'),
        ('subject', '/Subject'),
        ('creator', '/Creator'),
        ('producer', '/Producer'),
        ('creation_date', '/CreationDate'),
    )

    def __init__(self, pdf_path: str):
        """
        Initialize PDF parser.

        No file access happens here; the PDF is only opened by the
        extraction methods.

        Args:
            pdf_path: Path to the PDF file
        """
        self.pdf_path = pdf_path
        self.text_content = ""
        self.metadata = {}
        self.page_count = 0

    def extract_all(self) -> Dict[str, Any]:
        """
        Extract all content and metadata from PDF.

        Populates ``page_count``, ``text_content`` and ``metadata`` on the
        instance and returns them in a single dictionary.

        Returns:
            Dictionary with the keys ``text``, ``metadata``, ``page_count``,
            ``title`` and ``author``. ``title`` and ``author`` fall back to
            'Untitled' / 'Unknown' when the metadata entry is missing or
            empty.
        """
        self._extract_basic_info()
        self._extract_text()
        self._extract_metadata()

        return {
            'text': self.text_content,
            'metadata': self.metadata,
            'page_count': self.page_count,
            # `or` rather than a .get() default: _extract_metadata stores ''
            # for absent entries, so the key exists but the value is empty.
            'title': self.metadata.get('title') or 'Untitled',
            'author': self.metadata.get('author') or 'Unknown',
        }

    def _extract_basic_info(self):
        """Store the number of pages of the PDF in ``self.page_count``."""
        with open(self.pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            self.page_count = len(pdf_reader.pages)

    def _extract_text(self):
        """Extract text content from PDF using pdfplumber for better accuracy.

        Pages for which no text is returned are skipped; the remaining page
        texts are joined with a blank line and stored in
        ``self.text_content``.
        """
        with pdfplumber.open(self.pdf_path) as pdf:
            page_texts = [page.extract_text() for page in pdf.pages]

        self.text_content = "\n\n".join(text for text in page_texts if text)

    def _extract_metadata(self):
        """Extract PDF metadata into ``self.metadata``.

        Entries absent from the PDF are stored as ''. When the reader
        reports no metadata at all, ``self.metadata`` is left unchanged.
        """
        with open(self.pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = pdf_reader.metadata

            if metadata:
                self.metadata = {
                    name: metadata.get(pdf_key, '')
                    for name, pdf_key in self._METADATA_FIELDS
                }

    @classmethod
    def _is_section_header(cls, stripped_line: str) -> bool:
        """Return True when a stripped line looks like a section heading.

        A line qualifies when its lower-cased form is shorter than
        ``_MAX_HEADER_LENGTH`` and contains one of ``_SECTION_KEYWORDS``.
        """
        lowered = stripped_line.lower()
        if len(lowered) >= cls._MAX_HEADER_LENGTH:
            return False
        return any(keyword in lowered for keyword in cls._SECTION_KEYWORDS)

    def extract_sections(self) -> List[Dict[str, str]]:
        """
        Attempt to identify sections in the academic paper.

        Works on ``self.text_content``, so text must have been extracted
        first; with no text the result is an empty list. Detection is a
        keyword heuristic (see ``_is_section_header``), so short body lines
        that mention a keyword are also treated as headings. Blank lines are
        dropped, and text before the first detected heading is not part of
        any section.

        Returns:
            List of sections with titles and content, in document order.
            ``title`` is the stripped heading line; ``content`` is the
            following non-blank lines joined with newlines.
        """
        sections: List[Dict[str, str]] = []
        current_title = None
        current_lines: List[str] = []

        def flush():
            # Close the section in progress, if a heading has been seen.
            if current_title is not None:
                sections.append({
                    'title': current_title,
                    'content': '\n'.join(current_lines),
                })

        for raw_line in self.text_content.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue

            if self._is_section_header(stripped):
                flush()
                current_title = stripped
                current_lines = []
            else:
                # Keep the line unstripped so original indentation survives.
                current_lines.append(raw_line)

        flush()
        return sections

    def get_summary_info(self) -> str:
        """
        Get a brief summary of the PDF for context.

        Returns:
            Four-line summary string with title, author, page count and text
            length. Title and author show 'N/A' when missing or empty.
        """
        title = self.metadata.get('title') or 'N/A'
        author = self.metadata.get('author') or 'N/A'
        return "\n".join([
            f"Title: {title}",
            f"Author: {author}",
            f"Pages: {self.page_count}",
            f"Text Length: {len(self.text_content)} characters",
        ])
