#!/usr/bin/env python

import pdfplumber
import csv
import re
from decimal import Decimal, ROUND_HALF_UP

def clean_amount(amount):
    """Normalise a numeric string to a comma-grouped, two-decimal amount.

    Any thousands separators in *amount* are discarded, the value is
    rounded half-up to two decimal places, and the result is rendered
    with comma grouping, e.g. ``"1234.5"`` becomes ``"1,234.50"``.
    """
    without_commas = "".join(amount.split(","))
    two_places = Decimal("0.01")
    value = Decimal(without_commas).quantize(two_places, rounding=ROUND_HALF_UP)
    return format(value, ",.2f")

def parse_date(date_str):
    """Convert a ``DD-MM-YY`` date string into ``DD-MM-YYYY`` form.

    Raises ``ValueError`` (from ``strptime``) when *date_str* does not
    match the two-digit-year layout.
    """
    import datetime as _dt

    parsed = _dt.datetime.strptime(date_str, '%d-%m-%y')
    return parsed.strftime('%d-%m-%Y')

def clean_particulars(particulars):
    """Reduce a raw particulars string to readable letters-only text.

    Steps:
      1. Drop every character that is not an ASCII letter, a colon or
         whitespace (this removes digits and punctuation alike).
      2. Collapse each run of whitespace into a single space.
      3. Restore the truncated fragments ``earing:``, ``ansfer:`` and
         ``ward `` to ``Clearing:``, ``Transfer:`` and ``Inward ``.
         (Presumably the fragments arise when the caller's fixed-column
         slice cuts into the first word -- verify against the caller.)

    A fragment is only repaired when it is not already the tail of a
    longer word, so complete words such as ``Clearing:``, ``Transfer:``,
    ``Inward`` or ``Outward`` pass through unchanged instead of being
    turned into ``ClClearing:``, ``TrTransfer:``, ``InInward`` or
    ``OutInward``.

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    # One pass is enough: digits are outside the allowed set, so a separate
    # digit-stripping step would be redundant.
    cleaned = re.sub(r'[^a-zA-Z:\s]+', '', particulars)
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # The negative lookbehind rejects matches preceded by a letter, i.e.
    # fragments that are really the end of an intact word.
    cleaned = re.sub(r'(?<![A-Za-z])earing:', 'Clearing:', cleaned)
    cleaned = re.sub(r'(?<![A-Za-z])ansfer:', 'Transfer:', cleaned)
    cleaned = re.sub(r'(?<![A-Za-z])ward ', 'Inward ', cleaned)
    return cleaned.strip()

def is_withdrawal(particulars):
    """Report whether *particulars* contains any withdrawal marker.

    The check is a plain, case-sensitive containment test against a
    fixed list of markers; the first hit decides the answer.
    """
    markers = ('CHQ RETN', 'TM', 'Transfer:', 'RETN', 'Inward')
    for marker in markers:
        if marker in particulars:
            return True
    return False

# A transaction line starts with a DD-MM-YY date.
_DATE_LINE_RE = re.compile(r'^\d{2}-\d{2}-\d{2}')
# Amounts look like 1,234.56 (comma-grouped thousands, two decimals).
_AMOUNT_RE = re.compile(r'\d{1,3}(?:,\d{3})*\.\d{2}')


def _build_row(date, particulars):
    """Return the CSV row for one accumulated transaction, or None.

    None is returned when no transaction is pending (no date or no text)
    or when the text contains no amount. The first amount found is
    placed in the withdrawal or the deposit column, the other column
    being '0.00'.
    """
    if not (particulars and date):
        return None
    amounts = _AMOUNT_RE.findall(particulars)
    if not amounts:
        return None

    amount = clean_amount(amounts[0])
    # NOTE(review): classification runs on the raw text, before
    # clean_particulars() restores fragments such as "ansfer:" or "ward ";
    # confirm that truncated rows are classified as intended.
    if is_withdrawal(particulars):
        withdrawal, deposit = amount, '0.00'
    else:
        withdrawal, deposit = '0.00', amount
    return [date, clean_particulars(particulars), withdrawal, deposit]


def extract_data_from_pdf(pdf_path, csv_path):
    """Extract statement transactions from a PDF and write them to a CSV.

    Each page's text is read line by line. A line beginning with a
    DD-MM-YY date opens a new transaction; every other line is appended
    to the transaction currently being collected. "Page Total:" lines and
    dashed separator lines are skipped. Pending text is flushed at the
    end of every page, so a transaction is never carried across pages.

    Args:
        pdf_path: Path of the PDF statement to read.
        csv_path: Path of the CSV file to create (overwritten if present).

    Raises:
        ValueError: If a line starts with a date-shaped prefix that
            parse_date() cannot parse.
    """
    data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() can yield None for a page without text in some
            # pdfplumber versions; treat that as an empty page, not a crash.
            text = page.extract_text() or ""
            particulars = ""
            date = None
            for line in text.split('\n'):
                # Strip the ",.Cr" and ",." artefacts. The longer marker must
                # go first, otherwise removing ",." leaves a stray "Cr".
                line = line.replace(",.Cr", "").replace(",.", "")
                if "Page Total:" in line or "----------" in line:
                    continue  # Skip page total and separator lines

                if _DATE_LINE_RE.match(line):
                    # A new dated line closes the transaction collected so far.
                    row = _build_row(date, particulars)
                    if row is not None:
                        data.append(row)

                    date = parse_date(line[:8])
                    # NOTE(review): columns 8-10 are discarded here; confirm the
                    # particulars really begin at column 11 in the source PDF.
                    particulars = line[11:].strip()
                else:
                    # Continuation (or pre-date header) text.
                    particulars += " " + line.strip()

            # Flush the last transaction on the page.
            row = _build_row(date, particulars)
            if row is not None:
                data.append(row)

    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['DATE', 'PARTICULARS', 'WITHDRAWALS', 'DEPOSITS'])
        writer.writerows(data)

if __name__ == "__main__":
    import sys

    # Usage: script.py [PDF_PATH [CSV_PATH]]
    # With no arguments the original hard-coded locations are used, so the
    # previous behaviour is unchanged; arguments override them positionally.
    default_pdf_path = r"C:\Users\Admin\Desktop\Antony\st ant trs.pdf"
    default_csv_path = r"C:\Users\Admin\Desktop\Antony\ExtractedData.csv"

    cli_args = sys.argv[1:]
    pdf_path = cli_args[0] if len(cli_args) > 0 else default_pdf_path
    csv_path = cli_args[1] if len(cli_args) > 1 else default_csv_path
    extract_data_from_pdf(pdf_path, csv_path)
