﻿unit uExtractText;
{$I ..\..\..\Lib\PDFiumVcl.inc}

interface

uses
{$IFDEF XE2+}
  Winapi.Windows,
  Winapi.Messages,
  System.SysUtils,
  System.Variants,
  System.Classes,
  Vcl.Graphics,
  Vcl.Controls,
  Vcl.Forms,
  Vcl.Dialogs,
  Vcl.XPMan,
  Vcl.Buttons,
  Vcl.StdCtrls,
  Vcl.ComCtrls,
  Vcl.ExtCtrls,
  System.UITypes,
  Winapi.ShellAPI,
{$ELSE}
  Windows,
  Messages,
  SysUtils,
  Variants,
  Classes,
  Graphics,
  Controls,
  Forms,
  Dialogs,
  XPMan,
  Buttons,
  StdCtrls,
  ComCtrls,
  ExtCtrls,
  ShellAPI,
{$ENDIF}
  PDFium;

type
  // Main form of the text-extraction demo: lets the user pick a PDF and an
  // output text file, choose a page range and formatting options, and then
  // writes the extracted page text to the output file as UTF-8.
  // The component fields below are expected to be bound to the form resource
  // included via {$R *.dfm}; do not rename them without updating the .dfm.
  TFormMain= class(TForm)
    // Main container
    PanelMain: TPanel;

    // Input group: source PDF path and destination text file path,
    // each with a browse button that opens the matching file dialog
    GroupBoxInput: TGroupBox;
    LabelPdfFile: TLabel;
    EditPdfFile: TEdit;
    SpeedButtonPdfFile: TSpeedButton;
    LabelOutputFile: TLabel;
    EditOutputFile: TEdit;
    SpeedButtonOutputFile: TSpeedButton;

    // Options group: page selection (all pages or an explicit from/to range)
    // and text post-processing switches read by ButtonExtractClick
    GroupBoxOptions: TGroupBox;
    LabelPageRange: TLabel;
    RadioButtonAllPages: TRadioButton;
    RadioButtonPageRange: TRadioButton;
    LabelFromPage: TLabel;
    LabelToPage: TLabel;
    LabelPage1: TLabel;
    LabelPage2: TLabel;
    EditFromPage: TEdit;
    EditToPage: TEdit;
    CheckBoxPreserveParagraphs: TCheckBox; // use character-position based line/paragraph detection
    CheckBoxRemoveNulChars: TCheckBox; // strip #0 characters from the plain page text
    CheckBoxPageSeparator: TCheckBox; // write three CRLFs between pages

    // Progress group: status line, page counter, progress bar and the
    // extract / cancel / open-output buttons
    GroupBoxProgress: TGroupBox;
    LabelStatus: TLabel;
    LabelProgress: TLabel;
    ProgressBar: TProgressBar;
    ButtonExtract: TButton;
    ButtonCancel: TButton;
    ButtonOpenOutput: TButton;

    // Log area: receives time-stamped lines from LogMessage
    MemoLog: TMemo;

    // Components: the PDFium document component and the two file dialogs
    Pdf: TPdf;
    OpenDialog: TOpenDialog;
    SaveDialog: TSaveDialog;

    // Event handlers
    procedure SpeedButtonPdfFileClick(Sender: TObject);
    procedure SpeedButtonOutputFileClick(Sender: TObject);
    procedure ButtonExtractClick(Sender: TObject);
    procedure ButtonCancelClick(Sender: TObject);
    procedure ButtonOpenOutputClick(Sender: TObject);
    procedure EditPdfFileChange(Sender: TObject);
    procedure EditOutputFileChange(Sender: TObject);
    procedure RadioButtonAllPagesClick(Sender: TObject);
    procedure RadioButtonPageRangeClick(Sender: TObject);
  private
    { Private declarations }
    // Set by ButtonCancelClick and polled once per page by the extraction loop
    FCancelRequested: Boolean;
    // Copy of the output path; assigned by the output-file handlers.
    // NOTE(review): only written, never read, within this unit.
    FOutputFileName: string;
    // Appends a time-stamped line to MemoLog and scrolls it to the bottom
    procedure LogMessage(const Msg: string);
    // Enables/disables controls depending on whether an extraction is running
    procedure UpdateUI(Extracting: Boolean);
    // Shows 'Status: <Status>' in LabelStatus
    procedure SetStatus(const Status: string);
    // Resolves the page range to extract; returns False (after showing an
    // error dialog) when the user-entered range is invalid
    function GetPageRange(out StartPage, EndPage: Integer): Boolean;
  public
    { Public declarations }
  end;

var
  FormMain: TFormMain;

implementation

{$R *.dfm}

// Browse button for the source PDF: shows OpenDialog pre-filled with the
// current path and, on confirmation, stores the chosen file in EditPdfFile.
// When no output path has been entered yet, a default one is proposed by
// swapping the chosen file's extension for '.txt'.
procedure TFormMain.SpeedButtonPdfFileClick(Sender: TObject);
begin
  OpenDialog.FileName:= EditPdfFile.Text;
  if not OpenDialog.Execute
  then
    Exit; // dialog was dismissed, leave everything untouched

  EditPdfFile.Text:= OpenDialog.FileName;

  // Keep an output path that is already present
  if EditOutputFile.Text<> ''
  then
    Exit;

  FOutputFileName:= ChangeFileExt(OpenDialog.FileName, '.txt');
  EditOutputFile.Text:= FOutputFileName;
end;

// Browse button for the output file: shows SaveDialog pre-filled with the
// current output path and, on confirmation, copies the chosen name into both
// EditOutputFile and FOutputFileName.
procedure TFormMain.SpeedButtonOutputFileClick(Sender: TObject);
begin
  SaveDialog.FileName:= EditOutputFile.Text;
  if not SaveDialog.Execute
  then
    Exit; // dialog was dismissed, keep the previous path

  EditOutputFile.Text:= SaveDialog.FileName;
  FOutputFileName:= SaveDialog.FileName;
end;

// Runs the extraction: validates the input/output paths, loads the PDF,
// resolves the page range, then writes the text of every page in the range
// to the output file as UTF-8 (no BOM is written).
//
// - With CheckBoxPreserveParagraphs checked, line and paragraph breaks are
//   derived from the vertical position of each character.
// - Otherwise the page text is taken from Pdf.Text, optionally with NUL
//   characters removed (CheckBoxRemoveNulChars).
// - A failure on a single page is logged and recorded in the output file as
//   '[Page N Error: ...]'; extraction then continues with the next page.
// - The loop polls FCancelRequested once per page; the flag is set by
//   ButtonCancelClick while messages are pumped via Application.ProcessMessages.
//
// The UI is always restored and the PDF deactivated in the outer finally.
procedure TFormMain.ButtonExtractClick(Sender: TObject);
const
  NewLine= #13#10;
var
  I, StartPage, EndPage: Integer;
  TotalPages: Integer;
  ErrorText: UTF8String;
  CleanText: UTF8String;
  PageSeparator: UTF8String;
  PageText: WString;
  FileStream: TFileStream;
  StartTime: TDateTime;
  ProcessedPages: Integer;

  // Returns RawText encoded as UTF-8 with every NUL (#0) character removed.
  // All other characters, including spaces and line breaks, are kept.
  function CleanAndFormatText(const RawText: WString): UTF8String;
  var
    I, Kept: Integer;
    CleanWText: WString;
  begin
    // Allocate once and compact in place instead of growing the string one
    // character at a time.
    SetLength(CleanWText, Length(RawText));
    Kept:= 0;
    for I:= 1 to Length(RawText) do
    begin
      if Ord(RawText[I])<> 0
      then
      begin
        Inc(Kept);
        CleanWText[Kept]:= RawText[I];
      end;
    end;
    SetLength(CleanWText, Kept);
    Result:= Utf8Encode(CleanWText);
  end;

  // Builds the text of the current page from individual characters, inserting
  // a CRLF when the vertical distance between two consecutive characters
  // exceeds 1.2x the reference line height, and an additional CRLF (blank
  // line) when it exceeds 2.5x. NUL characters are always skipped.
  // The reference line height is the smallest non-zero vertical distance
  // between consecutive characters on the page.
  function ExtractTextWithParagraphs: UTF8String;
  var
    CharIndex: Integer;
    CurrentChar: WideChar;
    CurrentY, PrevY: Double;
    LineHeight: Double;
    ResultText: WString;
    LineBuffer: WString;
    YGap: Double;
    HavePrevY: Boolean;
  begin
    ResultText:= '';
    LineBuffer:= '';

    // First pass: determine the reference line height.
    // It starts at a huge value; when no vertical gap exists on the page it
    // stays there and the second pass simply emits no line breaks.
    LineHeight:= 999999;
    PrevY:= 0;
    // An explicit flag is used instead of a "PrevY < 0" sentinel so that
    // characters with a negative Y origin are compared like any other.
    HavePrevY:= False;
    for CharIndex:= 0 to Pdf.CharacterCount- 1 do
    begin
      CurrentY:= Pdf.CharacterOrigin[CharIndex].Y;
      if HavePrevY
      then
      begin
        YGap:= Abs(CurrentY- PrevY);
        if (YGap> 0)and (YGap< LineHeight)
        then
          LineHeight:= YGap;
      end;
      PrevY:= CurrentY;
      HavePrevY:= True;
    end;

    // Second pass: build text with line and paragraph detection
    HavePrevY:= False;
    for CharIndex:= 0 to Pdf.CharacterCount- 1 do
    begin
      CurrentChar:= Pdf.Character[CharIndex];
      CurrentY:= Pdf.CharacterOrigin[CharIndex].Y;

      // Skip NUL characters (they do not update the previous position either)
      if Ord(CurrentChar)= 0
      then
        Continue;

      // Check for line break based on Y position change
      if HavePrevY
      then
      begin
        YGap:= Abs(CurrentY- PrevY);

        if YGap> LineHeight* 1.2
        then
        begin
          // Flush the current line to the result
          if LineBuffer<> ''
          then
          begin
            ResultText:= ResultText+ LineBuffer+ #13#10;
            LineBuffer:= '';
          end;

          // A much larger gap is treated as a paragraph break: add a blank line
          if YGap> LineHeight* 2.5
          then
            ResultText:= ResultText+ #13#10;
        end;
      end;

      // Add character to current line
      LineBuffer:= LineBuffer+ CurrentChar;
      PrevY:= CurrentY;
      HavePrevY:= True;
    end;

    // Add final line
    if LineBuffer<> ''
    then
      ResultText:= ResultText+ LineBuffer;

    Result:= Utf8Encode(ResultText);
  end;

begin
  // Initialize
  FCancelRequested:= False;
  ProcessedPages:= 0;
  StartTime:= Now;
  // Loop-invariant: encode the page separator once
  PageSeparator:= Utf8Encode(NewLine+ NewLine+ NewLine);

  // Validate inputs
  if not FileExists(EditPdfFile.Text)
  then
  begin
    MessageDlg('Please select a valid PDF file!', mtError, [mbOK], 0);
    Exit;
  end;

  // Trim so that a whitespace-only name is rejected, matching UpdateUI
  if Trim(EditOutputFile.Text)= ''
  then
  begin
    MessageDlg('Please specify an output file!', mtError, [mbOK], 0);
    Exit;
  end;

  // The output file is opened with fmCreate, which would truncate the source
  // PDF if both paths refer to the same file.
  if SameFileName(ExpandFileName(EditOutputFile.Text), ExpandFileName(EditPdfFile.Text))
  then
  begin
    MessageDlg('Output file must be different from the PDF file!', mtError, [mbOK], 0);
    Exit;
  end;

  // Update UI for extraction mode
  UpdateUI(True);
  SetStatus('Loading PDF...');
  LogMessage('Starting PDF text extraction: '+ ExtractFileName(EditPdfFile.Text));

  ProgressBar.Position:= 0;
  Screen.Cursor:= crHourGlass;

  try
    // Load PDF with error handling
    try
      Pdf.FileName:= EditPdfFile.Text;
      Pdf.PageNumber:= 0;
      Pdf.Active:= True;
    except
      on E: Exception do
      begin
        MessageDlg('Failed to load PDF: '+ E.Message, mtError, [mbOK], 0);
        LogMessage('Failed to load PDF: '+ E.Message);
        Exit;
      end;
    end;

    // Check if PDF was loaded successfully
    if not Pdf.Active
    then
    begin
      MessageDlg('Unable to activate PDF document!', mtError, [mbOK], 0);
      LogMessage('Unable to activate PDF document');
      Exit;
    end;

    LogMessage('Successfully loaded PDF with '+ IntToStr(Pdf.PageCount)+ ' pages');

    // Get page range
    if not GetPageRange(StartPage, EndPage)
    then
      Exit;

    // Show the document's page count in the "to" field, but only in
    // all-pages mode; in range mode the user's own value must be preserved.
    if RadioButtonAllPages.Checked
    then
      EditToPage.Text:= IntToStr(Pdf.PageCount);

    TotalPages:= EndPage- StartPage+ 1;
    ProgressBar.Max:= TotalPages;

    LogMessage('Extraction range: Page '+ IntToStr(StartPage)+ ' to Page '+ IntToStr(EndPage));

    // Create output file with error handling
    try
      FileStream:= TFileStream.Create(EditOutputFile.Text, fmCreate);
    except
      on E: Exception do
      begin
        MessageDlg('Failed to create output file: '+ E.Message, mtError, [mbOK], 0);
        LogMessage('Failed to create output file: '+ E.Message);
        Exit;
      end;
    end;

    LogMessage('Starting text extraction...');
    SetStatus('Extracting text...');

    try
      for I:= StartPage to EndPage do
      begin
        // Check for cancel request
        if FCancelRequested
        then
        begin
          LogMessage('Extraction cancelled by user');
          Break;
        end;

        try
          Pdf.PageNumber:= I;
          SetStatus('Processing page '+ IntToStr(I)+ '...');

          // Extract text based on options
          if CheckBoxPreserveParagraphs.Checked
          then
            CleanText:= ExtractTextWithParagraphs
          else
          begin
            PageText:= Pdf.Text;
            if CheckBoxRemoveNulChars.Checked
            then
              CleanText:= CleanAndFormatText(PageText)
            else
              CleanText:= Utf8Encode(PageText);
          end;

          if Length(CleanText)> 0
          then
          begin
            FileStream.WriteBuffer(CleanText[1], Length(CleanText));

            // Add page separator if enabled and not the last page
            if CheckBoxPageSeparator.Checked and (I< EndPage)
            then
              FileStream.WriteBuffer(PageSeparator[1], Length(PageSeparator));
          end;

          Inc(ProcessedPages);
          ProgressBar.Position:= ProcessedPages;
          LabelProgress.Caption:= 'Progress: '+ IntToStr(ProcessedPages)+ '/'+ IntToStr(TotalPages)+ ' pages';

          LogMessage('Processed page '+ IntToStr(I));
          // Pump messages so the UI repaints and a Cancel click is seen
          Application.ProcessMessages;

        except
          on E: Exception do
          begin
            // Log error but continue with the next page
            LogMessage('Error processing page '+ IntToStr(I)+ ': '+ E.Message);
            ErrorText:= Utf8Encode(Format('[Page %d Error: %s]%s', [I, E.Message, NewLine]));
            if Length(ErrorText)> 0
            then
              FileStream.WriteBuffer(ErrorText[1], Length(ErrorText));
          end;
        end;
      end;

      // Show completion message
      if FCancelRequested
      then
      begin
        SetStatus('Cancelled');
        LogMessage('Extraction cancelled, processed '+ IntToStr(ProcessedPages)+ ' pages');
        MessageDlg('Extraction cancelled!', mtWarning, [mbOK], 0);
      end
      else
      begin
        SetStatus('Completed');
        LogMessage('Text extraction completed! Processed '+ IntToStr(ProcessedPages)+ ' pages in '+ FormatDateTime('nn:ss', Now- StartTime));
        MessageDlg('Text extraction completed!'+ #13#10#13#10+ 'Output file: '+ EditOutputFile.Text+ #13#10+ 'Pages processed: '+ IntToStr(ProcessedPages)+
          #13#10+ 'Time elapsed: '+ FormatDateTime('nn:ss', Now- StartTime), mtInformation, [mbOK], 0);
      end;

    finally
      FileStream.Free;
    end;
  finally
    // Always restore the UI and release the document, whatever happened above
    Screen.Cursor:= crDefault;
    Pdf.Active:= False;
    UpdateUI(False);
    ProgressBar.Position:= 0;
    LabelProgress.Caption:= 'Progress:';
  end;
end;

// Helper methods
// Appends Msg to the log memo, prefixed with the current time (hh:nn:ss),
// scrolls the memo to its last line and pumps pending messages so the new
// line becomes visible during long-running work.
procedure TFormMain.LogMessage(const Msg: string);
var
  TimeStamp: string;
begin
  TimeStamp:= FormatDateTime('hh:nn:ss', Now);
  MemoLog.Lines.Add(TimeStamp+ ' - '+ Msg);
  MemoLog.Perform(WM_VSCROLL, SB_BOTTOM, 0);
  Application.ProcessMessages;
end;

// Switches the controls between "idle" and "extracting" mode.
// While extracting, only the Cancel button is enabled. When idle, the inputs
// and options are enabled, Extract additionally requires an existing PDF file
// and a non-blank output path, and Open Output requires the output file to exist.
procedure TFormMain.UpdateUI(Extracting: Boolean);
var
  Idle: Boolean;
begin
  Idle:= not Extracting;

  // Inputs and options follow the idle state directly
  EditPdfFile.Enabled:= Idle;
  EditOutputFile.Enabled:= Idle;
  SpeedButtonPdfFile.Enabled:= Idle;
  SpeedButtonOutputFile.Enabled:= Idle;
  GroupBoxOptions.Enabled:= Idle;

  // Action buttons
  ButtonCancel.Enabled:= Extracting;
  ButtonExtract.Enabled:= Idle and FileExists(EditPdfFile.Text)and (Trim(EditOutputFile.Text)<> '');
  ButtonOpenOutput.Enabled:= Idle and FileExists(EditOutputFile.Text);
end;

// Displays 'Status: <Status>' in the status label and pumps pending messages
// so the caption is repainted immediately.
procedure TFormMain.SetStatus(const Status: string);
var
  Caption: string;
begin
  Caption:= 'Status: '+ Status;
  LabelStatus.Caption:= Caption;
  Application.ProcessMessages;
end;

// Determines the page range to extract.
//   StartPage, EndPage - receive the 1-based inclusive range on success;
//                        both are set to 0 when the function returns False.
//   Result             - True when a usable range was produced.
// In all-pages mode the range is 1..Pdf.PageCount. In range mode the two edit
// fields must contain integers with 1 <= StartPage <= EndPage <= Pdf.PageCount;
// otherwise an error dialog is shown and False is returned.
// Only malformed numbers are reported as a format error; any other exception
// (for example from the Pdf component) propagates to the caller.
function TFormMain.GetPageRange(out StartPage, EndPage: Integer): Boolean;
begin
  // Give the out parameters defined values on every path
  StartPage:= 0;
  EndPage:= 0;

  if RadioButtonAllPages.Checked
  then
  begin
    StartPage:= 1;
    EndPage:= Pdf.PageCount;
    Result:= True;
    Exit;
  end;

  // Surrounding blanks are tolerated; anything else non-numeric is rejected
  if (not TryStrToInt(Trim(EditFromPage.Text), StartPage))or (not TryStrToInt(Trim(EditToPage.Text), EndPage))
  then
  begin
    StartPage:= 0;
    EndPage:= 0;
    MessageDlg('Invalid page range format! Please enter numbers.', mtError, [mbOK], 0);
    Result:= False;
    Exit;
  end;

  Result:= (StartPage>= 1)and (EndPage>= StartPage)and (EndPage<= Pdf.PageCount);
  if not Result
  then
  begin
    StartPage:= 0;
    EndPage:= 0;
    MessageDlg('Invalid page range! Please enter a valid page range.', mtError, [mbOK], 0);
  end;
end;

// Event handlers
// "All pages" selected: the from/to page fields are not used, so disable both.
procedure TFormMain.RadioButtonAllPagesClick(Sender: TObject);
begin
  EditToPage.Enabled:= False;
  EditFromPage.Enabled:= False;
end;

// "Page range" selected: enable the from/to page fields for user input.
procedure TFormMain.RadioButtonPageRangeClick(Sender: TObject);
begin
  EditToPage.Enabled:= True;
  EditFromPage.Enabled:= True;
end;

// Cancel button: raises the cancel flag that the extraction loop polls once
// per page, then reports the pending cancellation in the status line and log.
procedure TFormMain.ButtonCancelClick(Sender: TObject);
const
  CancellingStatus= 'Cancelling...';
  CancelLogLine= 'User requested to cancel extraction';
begin
  FCancelRequested:= True;
  SetStatus(CancellingStatus);
  LogMessage(CancelLogLine);
end;

// Opens the output file with the application registered for it in the shell.
// Shows a warning when the file does not exist. The launch is logged as
// successful only when ShellExecute reports success (a return value above 32);
// otherwise the failure is logged and reported to the user.
procedure TFormMain.ButtonOpenOutputClick(Sender: TObject);
var
  OutputPath: string;
begin
  OutputPath:= EditOutputFile.Text;

  if not FileExists(OutputPath)
  then
  begin
    MessageDlg('Output file does not exist!', mtWarning, [mbOK], 0);
    Exit;
  end;

  // ShellExecute signals failure through a return value of 32 or less
  if ShellExecute(Handle, 'open', PChar(OutputPath), nil, nil, SW_SHOWNORMAL)> 32
  then
    LogMessage('Opened output file: '+ OutputPath)
  else
  begin
    LogMessage('Failed to open output file: '+ OutputPath);
    MessageDlg('Unable to open output file!', mtError, [mbOK], 0);
  end;
end;

// Reacts to edits of the PDF path: refreshes the button states and, when the
// path names an existing file while the output field is still blank, proposes
// an output path by replacing the PDF's extension with '.txt'.
procedure TFormMain.EditPdfFileChange(Sender: TObject);
begin
  UpdateUI(False);

  if not FileExists(EditPdfFile.Text)
  then
    Exit;

  // Never overwrite an output path that is already present
  if Trim(EditOutputFile.Text)<> ''
  then
    Exit;

  FOutputFileName:= ChangeFileExt(EditPdfFile.Text, '.txt');
  EditOutputFile.Text:= FOutputFileName;
end;

// Reacts to edits of the output path: keeps FOutputFileName in sync with the
// edit field and refreshes the button states.
procedure TFormMain.EditOutputFileChange(Sender: TObject);
begin
  FOutputFileName:= EditOutputFile.Text;
  UpdateUI(False);
end;

end.
