formatting

This commit is contained in:
Anthony Stirling
2023-12-30 19:11:27 +00:00
parent 7b43fca6fc
commit 5f771b7851
155 changed files with 5539 additions and 4767 deletions

View File

@@ -24,91 +24,79 @@ import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.text.TextPosition;
/**
* Class to extract tabular data from a PDF. Works by making a first pass of the page to group all
* nearby text items together, and then inferring a 2D grid from these regions. Each table cell is
* then extracted using a PDFTextStripperByArea object.
*
* Class to extract tabular data from a PDF.
* Works by making a first pass of the page to group all nearby text items
* together, and then inferring a 2D grid from these regions. Each table cell
* is then extracted using a PDFTextStripperByArea object.
* <p>Works best when headers are included in the detected region, to ensure representative text in
* every column.
*
* Works best when
* headers are included in the detected region, to ensure representative text
* in every column.
*
* Based upon DrawPrintTextLocations PDFBox example
* <p>Based upon DrawPrintTextLocations PDFBox example
* (https://svn.apache.org/viewvc/pdfbox/trunk/examples/src/main/java/org/apache/pdfbox/examples/util/DrawPrintTextLocations.java)
*
* @author Beldaz
*/
public class PDFTableStripper extends PDFTextStripper
{
public class PDFTableStripper extends PDFTextStripper {
/**
* This will print the documents data, for each table cell.
*
* @param args The command line arguments.
*
* @throws IOException If there is an error parsing the document.
*/
/*
* Used in methods derived from DrawPrintTextLocations
*/
private AffineTransform flipAT;
private AffineTransform rotateAT;
/**
* Regions updated by calls to writeString
*/
/** Regions updated by calls to writeString */
private Set<Rectangle2D> boxes;
// Border to allow when finding intersections
private double dx = 1.0; // This value works for me, feel free to tweak (or add setter)
private double dy = 0.000; // Rows of text tend to overlap, so need to extend
/**
* Region in which to find table (otherwise whole page)
*/
/** Region in which to find table (otherwise whole page) */
private Rectangle2D regionArea;
/**
* Number of rows in inferred table
*/
private int nRows=0;
/** Number of rows in inferred table */
private int nRows = 0;
/**
* Number of columns in inferred table
*/
private int nCols=0;
/** Number of columns in inferred table */
private int nCols = 0;
/**
* This is the object that does the text extraction
*/
/** This is the object that does the text extraction */
private PDFTextStripperByArea regionStripper;
/**
* 1D intervals - used for calculateTableRegions()
* @author Beldaz
*
* @author Beldaz
*/
public static class Interval {
double start;
double end;
public Interval(double start, double end) {
this.start=start; this.end = end;
this.start = start;
this.end = end;
}
public void add(Interval col) {
if(col.start<start)
start = col.start;
if(col.end>end)
end = col.end;
if (col.start < start) start = col.start;
if (col.end > end) end = col.end;
}
public static void addTo(Interval x, LinkedList<Interval> columns) {
int p = 0;
Iterator<Interval> it = columns.iterator();
// Find where x should go
while(it.hasNext()) {
while (it.hasNext()) {
Interval col = it.next();
if(x.end>=col.start) {
if(x.start<=col.end) { // overlaps
if (x.end >= col.start) {
if (x.start <= col.end) { // overlaps
x.add(col);
it.remove();
}
@@ -116,30 +104,26 @@ public class PDFTableStripper extends PDFTextStripper
}
++p;
}
while(it.hasNext()) {
while (it.hasNext()) {
Interval col = it.next();
if(x.start>col.end)
break;
if (x.start > col.end) break;
x.add(col);
it.remove();
}
columns.add(p, x);
}
}
/**
* Instantiate a new PDFTableStripper object.
*
* @param document
* @throws IOException If there is an error loading the properties.
*/
public PDFTableStripper() throws IOException
{
public PDFTableStripper() throws IOException {
super.setShouldSeparateByBeads(false);
regionStripper = new PDFTextStripperByArea();
regionStripper.setSortByPosition( true );
regionStripper.setSortByPosition(true);
}
/**
@@ -147,18 +131,15 @@ public class PDFTableStripper extends PDFTextStripper
*
* @param rect The rectangle area to retrieve the text from.
*/
public void setRegion(Rectangle2D rect )
{
public void setRegion(Rectangle2D rect) {
regionArea = rect;
}
public int getRows()
{
public int getRows() {
return nRows;
}
public int getColumns()
{
public int getColumns() {
return nCols;
}
@@ -167,13 +148,11 @@ public class PDFTableStripper extends PDFTextStripper
*
* @return The text that was identified in that region.
*/
public String getText(int row, int col)
{
return regionStripper.getTextForRegion("el"+col+"x"+row);
public String getText(int row, int col) {
return regionStripper.getTextForRegion("el" + col + "x" + row);
}
public void extractTable(PDPage pdPage) throws IOException
{
public void extractTable(PDPage pdPage) throws IOException {
setStartPage(getCurrentPageNo());
setEndPage(getCurrentPageNo());
@@ -186,11 +165,9 @@ public class PDFTableStripper extends PDFTextStripper
// page may be rotated
rotateAT = new AffineTransform();
int rotation = pdPage.getRotation();
if (rotation != 0)
{
if (rotation != 0) {
PDRectangle mediaBox = pdPage.getMediaBox();
switch (rotation)
{
switch (rotation) {
case 90:
rotateAT.translate(mediaBox.getHeight(), 0);
break;
@@ -213,11 +190,12 @@ public class PDFTableStripper extends PDFTextStripper
Rectangle2D[][] regions = calculateTableRegions();
// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + " regions");
for(int i=0; i<nCols; ++i) {
for(int j=0; j<nRows; ++j) {
// System.err.println("Drawing " + nCols + "x" + nRows + "="+ nRows*nCols + "
// regions");
for (int i = 0; i < nCols; ++i) {
for (int j = 0; j < nRows; ++j) {
final Rectangle2D region = regions[i][j];
regionStripper.addRegion("el"+i+"x"+j, region);
regionStripper.addRegion("el" + i + "x" + j, region);
}
}
@@ -227,8 +205,8 @@ public class PDFTableStripper extends PDFTextStripper
/**
* Infer a rectangular grid of regions from the boxes field.
*
* @return 2D array of table regions (as Rectangle2D objects). Note that
* some of these regions may have no content.
* @return 2D array of table regions (as Rectangle2D objects). Note that some of these regions
* may have no content.
*/
private Rectangle2D[][] calculateTableRegions() {
@@ -238,7 +216,7 @@ public class PDFTableStripper extends PDFTextStripper
LinkedList<Interval> columns = new LinkedList<Interval>();
LinkedList<Interval> rows = new LinkedList<Interval>();
for(Rectangle2D box: boxes) {
for (Rectangle2D box : boxes) {
Interval x = new Interval(box.getMinX(), box.getMaxX());
Interval y = new Interval(box.getMinY(), box.getMaxY());
@@ -249,12 +227,17 @@ public class PDFTableStripper extends PDFTextStripper
nRows = rows.size();
nCols = columns.size();
Rectangle2D[][] regions = new Rectangle2D[nCols][nRows];
int i=0;
int i = 0;
// Label regions from top left, rather than the transformed orientation
for(Interval column: columns) {
int j=0;
for(Interval row: rows) {
regions[nCols-i-1][nRows-j-1] = new Rectangle2D.Double(column.start, row.start, column.end - column.start, row.end - row.start);
for (Interval column : columns) {
int j = 0;
for (Interval row : rows) {
regions[nCols - i - 1][nRows - j - 1] =
new Rectangle2D.Double(
column.start,
row.start,
column.end - column.start,
row.end - row.start);
++j;
}
++i;
@@ -264,18 +247,15 @@ public class PDFTableStripper extends PDFTextStripper
}
/**
* Register each character's bounding box, updating boxes field to maintain
* a list of all distinct groups of characters.
* Register each character's bounding box, updating boxes field to maintain a list of all
* distinct groups of characters.
*
* Overrides the default functionality of PDFTextStripper.
* Most of this is taken from DrawPrintTextLocations.java, with extra steps
* at end of main loop
* <p>Overrides the default functionality of PDFTextStripper. Most of this is taken from
* DrawPrintTextLocations.java, with extra steps at end of main loop
*/
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException
{
for (TextPosition text : textPositions)
{
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
for (TextPosition text : textPositions) {
// glyph space -> user space
// note: text.getTextMatrix() is *not* the Text Matrix, it's the Text Rendering Matrix
AffineTransform at = text.getTextMatrix().createAffineTransform();
@@ -283,37 +263,35 @@ public class PDFTableStripper extends PDFTextStripper
BoundingBox bbox = font.getBoundingBox();
// advance width, bbox height (glyph space)
float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
Rectangle2D.Float rect = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());
float xadvance =
font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
Rectangle2D.Float rect =
new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());
if (font instanceof PDType3Font)
{
if (font instanceof PDType3Font) {
// bbox and font matrix are unscaled
at.concatenate(font.getFontMatrix().createAffineTransform());
}
else
{
} else {
// bbox and font matrix are already scaled to 1000
at.scale(1/1000f, 1/1000f);
at.scale(1 / 1000f, 1 / 1000f);
}
Shape s = at.createTransformedShape(rect);
s = flipAT.createTransformedShape(s);
s = rotateAT.createTransformedShape(s);
//
// Merge character's bounding box with boxes field
//
Rectangle2D bounds = s.getBounds2D();
// Pad sides to detect almost touching boxes
Rectangle2D hitbox = bounds.getBounds2D();
hitbox.add(bounds.getMinX() - dx , bounds.getMinY() - dy);
hitbox.add(bounds.getMaxX() + dx , bounds.getMaxY() + dy);
hitbox.add(bounds.getMinX() - dx, bounds.getMinY() - dy);
hitbox.add(bounds.getMaxX() + dx, bounds.getMaxY() + dy);
// Find all overlapping boxes
List<Rectangle2D> intersectList = new ArrayList<Rectangle2D>();
for(Rectangle2D box: boxes) {
if(box.intersects(hitbox)) {
for (Rectangle2D box : boxes) {
if (box.intersects(hitbox)) {
intersectList.add(box);
}
}
@@ -321,38 +299,30 @@ public class PDFTableStripper extends PDFTextStripper
// Combine all touching boxes and update
// (NOTE: Potentially this could leave some overlapping boxes un-merged,
// but it's sufficient for now and get's fixed up in calculateTableRegions)
for(Rectangle2D box: intersectList) {
for (Rectangle2D box : intersectList) {
bounds.add(box);
boxes.remove(box);
}
boxes.add(bounds);
}
}
/**
* This method does nothing in this derived class, because beads and regions are incompatible. Beads are
* ignored when stripping by area.
* This method does nothing in this derived class, because beads and regions are incompatible.
* Beads are ignored when stripping by area.
*
* @param aShouldSeparateByBeads The new grouping of beads.
*/
@Override
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
{
}
public final void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) {}
/**
* Adapted from PDFTextStripperByArea
* {@inheritDoc}
*/
/** Adapted from PDFTextStripperByArea {@inheritDoc} */
@Override
protected void processTextPosition( TextPosition text )
{
if(regionArea!=null && !regionArea.contains( text.getX(), text.getY() ) ) {
protected void processTextPosition(TextPosition text) {
if (regionArea != null && !regionArea.contains(text.getX(), text.getY())) {
// skip character
} else {
super.processTextPosition( text );
super.processTextPosition(text);
}
}
}
}