Malek Boubakri Malek Boubakri - 2 months ago 11
Java Question

Java: How to extract text by a selected area from a PDF file using iText?

I am working on a program that extracts text from a specific area of a PDF file. I am using Java and the iText library.
enter image description here
Right now, I can extract data by typing in the area coordinates, using this code:

import java.io.IOException;
import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter;
import com.itextpdf.text.pdf.parser.RenderFilter;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;

/**
 * Extracts the text found inside a rectangular region of every page of a PDF file.
 *
 * <p>Created by Malek Boubakri on 03/06/2015 at 15:45.
 */
public class ExtractPageContentArea {

    /**
     * Prints to standard output, one page after the other, the text located inside the
     * given region of each page of the document.
     *
     * @param x      x coordinate of the lower-left corner of the region
     * @param y      y coordinate of the lower-left corner of the region
     * @param width  width of the region
     * @param height height of the region
     * @param pdf    path of the PDF file to read
     * @throws IOException if the file cannot be opened or parsed
     */
    public void parsePdf(float x, float y, float width, float height, String pdf) throws IOException {
        PdfReader reader = new PdfReader(pdf);
        try {
            // iText's Rectangle is built from two corners (llx, lly, urx, ury), not from an
            // origin plus a size, so the upper-right corner has to be computed here.
            Rectangle rect = new Rectangle(x, y, x + width, y + height);
            RenderFilter filter = new RegionTextRenderFilter(rect);
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                // A fresh strategy per page: the strategy accumulates the text it is fed.
                TextExtractionStrategy strategy =
                        new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                System.out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy));
            }
        } finally {
            // Release the file even when extraction fails half-way through.
            reader.close();
        }
    }
}


And this code can draw a rectangle and save the needed coordinates:

import java.awt.BorderLayout;
import java.awt.Graphics;
import java.awt.Rectangle;
import java.awt.event.MouseEvent;
import java.awt.event.MouseListener;
import java.awt.event.MouseMotionListener;
import java.util.ArrayList;

import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.SwingConstants;

/**
 * Small Swing demo window that lets the user drag rectangles with the mouse.
 *
 * <p>The frame listens to its own mouse and mouse-motion events, shows the pointer
 * position and the start/end corners of the current drag in labels, and paints every
 * rectangle drawn so far plus the one currently being dragged.
 */
public class MouseTracker extends JFrame implements MouseListener, MouseMotionListener {

    private static final long serialVersionUID = 1L;

    /** Label in the center showing the latest mouse event and its position. */
    private final JLabel mousePosition;

    /** Corners of the rectangle being dragged: (x1, y1) is the press point, (x2, y2) the current point. */
    int x1, x2, y1, y2;

    /** Width and height of the most recent selection, as shown in the {@code cords} label. */
    int w, h;

    private final JLabel recStart;
    private final JLabel recStop;
    private final JLabel cords;

    /** Every rectangle completed so far, in the order they were drawn. */
    private final ArrayList< Rectangle > rectangles = new ArrayList< Rectangle >();

    /** True until the first drag event after a press; no rubber band is painted while true. */
    private boolean isNewRect = true;

    /** Sets up the GUI and registers the mouse event handlers. */
    public MouseTracker() {
        super( "Rectangle Drawer" );

        this.mousePosition = new JLabel();
        this.mousePosition.setHorizontalAlignment( SwingConstants.CENTER );
        getContentPane().add( this.mousePosition, BorderLayout.CENTER );

        JLabel text1 = new JLabel();
        text1.setText( "At the center the mouse pointer's coordinates will be displayed." );
        getContentPane().add( text1, BorderLayout.SOUTH );

        this.recStart = new JLabel();
        getContentPane().add( this.recStart, BorderLayout.WEST );

        this.recStop = new JLabel();
        getContentPane().add( this.recStop, BorderLayout.EAST );

        this.cords = new JLabel();
        getContentPane().add( this.cords, BorderLayout.NORTH );

        addMouseListener( this ); // listens for own mouse and
        addMouseMotionListener( this ); // mouse-motion events

        setSize( 800, 600 );
        setVisible( true );
    }

    /** Handles a click, i.e. the mouse being released immediately after a press. */
    @Override
    public void mouseClicked( final MouseEvent event ) {
        this.mousePosition.setText( "Clicked at [" + event.getX() + ", " + event.getY() + "]" );
        repaint();
    }

    /** Records the press point as the first corner of a new rectangle. */
    @Override
    public void mousePressed( final MouseEvent event ) {
        this.x1 = event.getX();
        this.y1 = event.getY();

        this.mousePosition.setText( "Pressed at [" + this.x1 + ", " + this.y1 + "]" );
        this.recStart.setText( "Start: [" + this.x1 + ", " + this.y1 + "]" );

        repaint();
    }

    /** Completes the current rectangle, stores it and resets the drag state. */
    @Override
    public void mouseReleased( final MouseEvent event ) {
        this.x2 = event.getX();
        this.y2 = event.getY();

        this.mousePosition.setText( "Released at [" + this.x2 + ", " + this.y2 + "]" );
        this.recStop.setText( "End: [" + this.x2 + ", " + this.y2 + "]" );

        Rectangle rectangle = getRectangleFromPoints();
        updateSize( rectangle );

        // A plain click (or a perfectly straight drag) produces a rectangle without area;
        // storing those would only clutter the list with invisible entries.
        if ( !rectangle.isEmpty() ) {
            this.rectangles.add( rectangle );
        }

        this.x1 = this.y1 = this.x2 = this.y2 = 0;
        this.isNewRect = true;

        repaint();
    }

    /**
     * Builds the rectangle spanned by the two drag corners, whatever the drag direction.
     *
     * @return a rectangle whose origin is the top-left-most corner and whose size is non-negative
     */
    private Rectangle getRectangleFromPoints() {
        int left = Math.min( this.x1, this.x2 );
        int top = Math.min( this.y1, this.y2 );
        return new Rectangle( left, top, Math.abs( this.x1 - this.x2 ), Math.abs( this.y1 - this.y2 ) );
    }

    /** Stores the size of the given rectangle in {@code w}/{@code h} and shows it in the label. */
    private void updateSize( final Rectangle rectangle ) {
        this.w = rectangle.width;
        this.h = rectangle.height;
        this.cords.setText( "w = " + this.w + ", h = " + this.h );
    }

    /** Handles the mouse entering the window area. */
    @Override
    public void mouseEntered( final MouseEvent event ) {
        this.mousePosition.setText( "Mouse entered at [" + event.getX() + ", " + event.getY() + "]" );
        repaint();
    }

    /** Handles the mouse leaving the window area. */
    @Override
    public void mouseExited( final MouseEvent event ) {
        this.mousePosition.setText( "Mouse outside window" );
        repaint();
    }

    /** Tracks the second corner of the rectangle while the user drags with a button pressed. */
    @Override
    public void mouseDragged( final MouseEvent event ) {
        this.x2 = event.getX();
        this.y2 = event.getY();

        this.mousePosition.setText( "Dragged at [" + this.x2 + ", " + this.y2 + "]" );

        this.isNewRect = false;
        updateSize( getRectangleFromPoints() );

        repaint();
    }

    /** Handles the mouse moving without any button pressed. */
    @Override
    public void mouseMoved( final MouseEvent event ) {
        this.mousePosition.setText( "Moved at [" + event.getX() + ", " + event.getY() + "]" );
        repaint();
    }

    /**
     * Paints the frame, the corner captions, the rubber band of the drag in progress and
     * all stored rectangles. Painting only reads state; labels are updated by the event
     * handlers so that painting never triggers another repaint.
     */
    @Override
    public void paint( final Graphics g ) {
        super.paint( g ); // clear the frame surface
        g.drawString( "Start Rec Here", this.x1, this.y1 );
        g.drawString( "End Rec Here", this.x2, this.y2 );

        if ( !this.isNewRect ) {
            Rectangle newRectangle = getRectangleFromPoints();
            g.drawRect( newRectangle.x, newRectangle.y, newRectangle.width, newRectangle.height );
        }

        for ( Rectangle rectangle : this.rectangles ) {
            g.drawRect( rectangle.x, rectangle.y, rectangle.width, rectangle.height );
        }
    }

    /** Starts the demo; the window is created on the event dispatch thread. */
    public static void main( final String args[] ) {
        javax.swing.SwingUtilities.invokeLater( () -> {
            MouseTracker application = new MouseTracker();
            application.setDefaultCloseOperation( JFrame.EXIT_ON_CLOSE );
        } );
    }

}


I want to use those coordinates to specify the area in the PDF file, but I don't really know how to merge the two pieces of code, how to put the drawing surface above the document, or how to map the rectangle coordinates to the text coordinates.


  • How to draw above another panel?

  • Should I convert the PDF to an image and put it behind the drawing surface to do that?

  • If I should, can anyone please suggest a good, free OCR library?



Please comment if anything is unclear!
Can anyone point me in the right direction? I'm really lost.

Waiting for your help, and thanks. (Sorry for my bad English.)

Answer

You have a very interesting question and a challenging project. This "answer" may provide some useful ideas, but it is not a finished solution.

You could use the so-called glass pane to draw on top of other components.

The most important thing that I think you need to decide on is which libraries are optimal for your project. The iText library is very good and provides all sorts of pdf functionality, like the text extraction you show in your question.

But, as far as I know, there is no support for pdf viewing in iText. You could use a library like ICEpdf for this (see this example). It would be very nice if ICEpdf could support text extraction as well, so you could use one library instead of making ICEpdf work with iText or OCR (and handling issues like zooming the pdf in ICEpdf and compensating for that when you're getting the text).

I'm not sure whether you can extract text with ICEpdf, so iText is currently still used for that in the example code below:

// File ExtractSelectionFromPdf.java

import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;

import java.awt.Container;
import java.awt.Point;
import java.io.IOException;
import javax.swing.*;

/**
 * Entry point of the demo: shows a PDF in a viewer, lets the user drag a selection on a
 * glass pane laid over it, and prints the text iText extracts from the selected region
 * of the current page.
 */
public class ExtractSelectionFromPdf {
    /** Path of the PDF file that is both displayed and parsed. */
    private static final String filePath = "[file path to a pdf file]";

    private PdfViewer pdfViewer;

    public static void main(final String[] arguments) {
        SwingUtilities.invokeLater(() -> new ExtractSelectionFromPdf().launchGUI());
    }

    /** Builds the frame, opens the document and installs the selection glass pane. */
    private void launchGUI() {
        final JFrame frame = new JFrame("Extract selected text from a pdf");
        frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
        final Container contentPane = frame.getContentPane();

        pdfViewer = new PdfViewer();
        contentPane.add(pdfViewer);

        pdfViewer.openDocument(filePath);

        final CustomGlassPane customGlassPane = new CustomGlassPane(this, contentPane);
        frame.setGlassPane(customGlassPane);
        customGlassPane.setVisible(true);

        frame.setBounds(60, 10, 1800, 1000);
        frame.setVisible(true);
    }

    /**
     * Extracts and prints the text inside the selection spanned by the two given points.
     *
     * <p>The points may be given in any order (the user may drag up or to the left); the
     * selection is normalised before it is handed to {@link #parsePdf}. Nothing happens
     * when either point is {@code null}.
     *
     * @param topLeft     corner where the drag started
     * @param bottomRight corner where the drag ended
     */
    public void handleSelection(final Point topLeft, final Point bottomRight) {
        if (topLeft == null || bottomRight == null) {
            return;
        }

        final int left = Math.min(topLeft.x, bottomRight.x);
        final int top = Math.min(topLeft.y, bottomRight.y);
        final int width = Math.abs(bottomRight.x - topLeft.x);
        // The height is a difference of y coordinates (the original subtracted topLeft.x).
        final int height = Math.abs(bottomRight.y - topLeft.y);
        final String text = parsePdf(left, top, width, height, filePath);
        System.out.println("text: " + text);
    }

    /**
     * Extracts the text located inside the given region of the page currently shown by
     * the viewer.
     *
     * @param x           x coordinate of the region's origin
     * @param y           y coordinate of the region's origin
     * @param width       width of the region
     * @param height      height of the region
     * @param pdfFilePath path of the PDF file to parse
     * @return the extracted text, or {@code null} when the file could not be read
     */
    public String parsePdf(final int x, final int y, final int width, final int height, 
                           final String pdfFilePath) {
        String text = null;
        PdfReader pdfReader = null;

        try {
            pdfReader = new PdfReader(pdfFilePath);
            // The viewer's page number is used with an offset of one for iText.
            final int pageNumber = pdfViewer.getCurrentPageNumber() + 1;
            System.out.println("Page number: " + pageNumber);
            // iText's Rectangle takes two corners (llx, lly, urx, ury), not origin + size.
            // NOTE(review): the values arriving here are glass-pane pixels with the origin at
            // the top-left and y growing downwards, whereas the PDF region is presumably
            // expected in page space with y growing upwards. A conversion (y flip against the
            // page height, viewer offset and zoom) is still missing -- confirm and add.
            final Rectangle selection = new Rectangle(x, y, x + width, y + height);
            final RenderFilter renderFilter = new RegionTextRenderFilter(selection);
            final LocationTextExtractionStrategy delegate 
                    = new LocationTextExtractionStrategy();
            final TextExtractionStrategy extractionStrategy 
                    = new FilteredTextRenderListener(delegate, renderFilter);
            text = PdfTextExtractor.getTextFromPage(pdfReader, pageNumber, 
                                                    extractionStrategy);
        } catch (final IOException e) {
            e.printStackTrace();
        } finally {
            // Close the reader on the failure path as well.
            if (pdfReader != null) {
                pdfReader.close();
            }
        }

        return text;
    }
}


// File PdfViewer.java

import java.util.ResourceBundle;
import javax.swing.*;
import org.icepdf.ri.common.*;
import org.icepdf.ri.common.views.DocumentViewController;
import org.icepdf.ri.util.PropertiesManager;

/**
 * Panel that embeds an ICEpdf viewer, wrapped in a scroll pane whose scroll bars are
 * always visible.
 */
public class PdfViewer extends JPanel {
    /** ICEpdf controller through which documents are opened and the current page is queried. */
    private final SwingController controller;

    /** Creates the embedded viewer with the default zoom level property set to "1". */
    public PdfViewer() {
        controller = new SwingController();
        controller.setIsEmbeddedComponent(true);

        final String bundleName = PropertiesManager.DEFAULT_MESSAGE_BUNDLE;
        final ResourceBundle messageBundle = ResourceBundle.getBundle(bundleName);
        // Fully qualified: java.util.Properties is not covered by this file's imports.
        final java.util.Properties systemProperties = System.getProperties();
        final PropertiesManager properties = new PropertiesManager(systemProperties,
                                                                   messageBundle);

        properties.set(PropertiesManager.PROPERTY_DEFAULT_ZOOM_LEVEL, "1");

        final SwingViewBuilder factory = new SwingViewBuilder(controller, properties);

        final DocumentViewController viewController 
                = controller.getDocumentViewController();
        viewController.setAnnotationCallback(new MyAnnotationCallback(viewController));

        final JScrollPane scrollPane = new JScrollPane(factory.buildViewerPanel());
        final int horizontalPolicy = ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS;
        final int verticalPolicy = ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS;
        scrollPane.setHorizontalScrollBarPolicy(horizontalPolicy);
        scrollPane.setVerticalScrollBarPolicy(verticalPolicy);
        add(scrollPane);
    }

    /**
     * Opens the given document in the viewer.
     *
     * @param filePath path of the PDF file to display
     */
    public void openDocument(final String filePath) {
        controller.openDocument(filePath);
    }

    /**
     * Returns the page number reported by the ICEpdf controller.
     *
     * @return the controller's current page number
     */
    public int getCurrentPageNumber() {
        return controller.getCurrentPageNumber();
    }
}


// File CustomGlassPane.java

import java.awt.*;
import javax.swing.JComponent;

/**
 * Glass pane that paints the outline of the current selection on top of the content
 * pane. Mouse handling is delegated to a {@link MouseEventsListener}.
 */
public class CustomGlassPane extends JComponent {
    /** Corner where the current selection started, or {@code null} when there is none. */
    private Point topLeftPoint;
    /** Corner where the current selection ends, or {@code null} when there is none. */
    private Point bottomRightPoint;

    /**
     * Creates the glass pane and registers the listener that tracks the selection.
     *
     * @param extractSelectionFromPdf receiver of completed selections
     * @param contentPane             container lying under this glass pane
     */
    public CustomGlassPane(final ExtractSelectionFromPdf extractSelectionFromPdf, 
                           final Container contentPane) {
        final MouseEventsListener listener 
                = new MouseEventsListener(extractSelectionFromPdf, this, contentPane);
        addMouseListener(listener);
        addMouseMotionListener(listener);
    }

    /**
     * Sets the selection to paint; pass {@code null} for either point to clear it.
     * The caller is responsible for requesting a repaint.
     *
     * @param topLeftPoint     corner where the selection started
     * @param bottomRightPoint corner where the selection ends
     */
    public void setSelection(final Point topLeftPoint, final Point bottomRightPoint) {
        this.topLeftPoint = topLeftPoint;
        this.bottomRightPoint = bottomRightPoint;
    }

    /** Paints the outline of the current selection, if there is one. */
    @Override
    protected void paintComponent(final Graphics graphics) {
        if (topLeftPoint != null && bottomRightPoint != null) {
            // The user may drag up or to the left, in which case the "bottom right" point
            // lies above or left of the start point. Normalise the corners so that the
            // width and height handed to drawRect are never negative.
            final int left = Math.min(topLeftPoint.x, bottomRightPoint.x);
            final int top = Math.min(topLeftPoint.y, bottomRightPoint.y);
            final int width = Math.abs(bottomRightPoint.x - topLeftPoint.x);
            final int height = Math.abs(bottomRightPoint.y - topLeftPoint.y);

            graphics.setColor(Color.BLACK);
            graphics.drawRect(left, top, width, height);
        }
    }
}


// File MouseEventsListener.java

import java.awt.*;
import java.awt.event.MouseEvent;
import javax.swing.SwingUtilities;
import javax.swing.event.MouseInputAdapter;

/**
 * Mouse listener installed on the glass pane. It records the corners of the selection
 * being dragged, forwards every mouse event to the component lying under the glass
 * pane, keeps the painted selection up to date and triggers the text extraction when
 * the mouse button is released.
 */
public class MouseEventsListener extends MouseInputAdapter {
    private final ExtractSelectionFromPdf extractSelectionFromPdf;
    private final CustomGlassPane customGlassPane;
    private final Container contentPane;
    /** Point of the last press, or {@code null} when no selection is in progress. */
    private Point topLeftPoint;
    /** Latest drag/release point, or {@code null} when no selection is in progress. */
    private Point bottomRightPoint;

    /**
     * @param extractSelectionFromPdf receiver of completed selections
     * @param customGlassPane         glass pane this listener is attached to
     * @param contentPane             container under the glass pane that receives the
     *                                forwarded events
     */
    public MouseEventsListener(final ExtractSelectionFromPdf extractSelectionFromPdf,
                               final CustomGlassPane customGlassPane,
                               final Container contentPane) {
        this.extractSelectionFromPdf = extractSelectionFromPdf;
        this.customGlassPane = customGlassPane;
        this.contentPane = contentPane;
    }

    /** Remembers the press point as the start of a selection. */
    @Override
    public void mousePressed(final MouseEvent mouseEvent) {
        topLeftPoint = mouseEvent.getPoint();

        redispatchMouseEvent(mouseEvent);
    }

    /** Updates the end of the selection; repaints only when a start point is known. */
    @Override
    public void mouseDragged(final MouseEvent mouseEvent) {
        bottomRightPoint = mouseEvent.getPoint();

        redispatchMouseEvent(mouseEvent, topLeftPoint != null, false);
    }

    /** Finishes the selection: extracts its text and clears the painted outline. */
    @Override
    public void mouseReleased(final MouseEvent mouseEvent) {
        bottomRightPoint = mouseEvent.getPoint();

        redispatchMouseEvent(mouseEvent, true, true);
    }

    @Override
    public void mouseMoved(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseClicked(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseEntered(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseExited(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    /** Forwards the event without repainting or extracting. */
    private void redispatchMouseEvent(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent, false, false);
    }

    /**
     * Forwards the event to the deepest component under the pointer and optionally
     * refreshes the glass pane.
     *
     * @param mouseEvent event received by the glass pane
     * @param repaint    whether to push the current selection to the glass pane and repaint it
     * @param extract    whether to hand the selection over for text extraction and reset it
     *                   (only considered when {@code repaint} is true)
     */
    private void redispatchMouseEvent(final MouseEvent mouseEvent, 
                                      final boolean repaint,
                                      final boolean extract) {
        final Point glassPanePoint = mouseEvent.getPoint();
        final Point containerPoint = SwingUtilities.convertPoint(customGlassPane, 
                                                                 glassPanePoint, 
                                                                 contentPane);

        if (containerPoint.y >= 0) {
            final Component component
                    = SwingUtilities.getDeepestComponentAt(contentPane,
                                                           containerPoint.x,
                                                           containerPoint.y);

            if (component != null) {
                final Point componentPoint 
                        = SwingUtilities.convertPoint(customGlassPane, 
                                                      glassPanePoint,
                                                      component);

                // Forward events to the component under the glass pane.
                component.dispatchEvent(new MouseEvent(component,
                                                       mouseEvent.getID(),
                                                       mouseEvent.getWhen(),
                                                       mouseEvent.getModifiers(),
                                                       componentPoint.x,
                                                       componentPoint.y,
                                                       mouseEvent.getClickCount(),
                                                       mouseEvent.isPopupTrigger()));
            }
        }

        // Update the glass pane if requested.
        if (repaint) {
            if (extract) {
                // A release can arrive without a recorded press point; handleSelection
                // dereferences both points, so only call it with a complete selection.
                if (topLeftPoint != null && bottomRightPoint != null) {
                    extractSelectionFromPdf.handleSelection(topLeftPoint, bottomRightPoint);
                }

                topLeftPoint = null;
                bottomRightPoint = null;
            }

            customGlassPane.setSelection(topLeftPoint, bottomRightPoint);
            customGlassPane.repaint();
        }
    }
}

The glass pane part of the code above was inspired by the GlassPaneDemo example.

Known remaining issues in the code above:

  • for some reason the scroll down button of the pdf viewer has to be clicked once before the Page Up/Down and Arrow Up/Down keys work.
  • currently the text that is actually extracted seems to be below the selected rectangle.
Comments