# Description

This pull request includes several changes aimed at improving the code structure and removing redundant code. The most significant changes involve reordering methods, removing unnecessary annotations, and refactoring constructors to use dependency injection.

Autowired dependencies now come in via the constructor (which also doesn't need the `@Autowired` annotation, as it's done by default for configuration).

## Checklist

- [ ] I have read the [Contribution Guidelines](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/CONTRIBUTING.md)
- [ ] I have performed a self-review of my own code
- [ ] I have attached images of the change if it is UI based
- [ ] I have commented my code, particularly in hard-to-understand areas
- [ ] If my code has heavily changed functionality I have updated relevant docs on [Stirling-PDFs doc repo](https://github.com/Stirling-Tools/Stirling-Tools.github.io/blob/main/docs/)
- [ ] My changes generate no new warnings
- [ ] I have read the section [Add New Translation Tags](https://github.com/Stirling-Tools/Stirling-PDF/blob/main/HowToAddNewLanguage.md#add-new-translation-tags) (for new translation tags only)
107 lines
3.7 KiB
Java
107 lines
3.7 KiB
Java
package stirling.software.SPDF.pdf;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
import org.apache.pdfbox.text.TextPosition;
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
import stirling.software.SPDF.model.PDFText;
|
|
|
|
@Slf4j
|
|
public class TextFinder extends PDFTextStripper {
|
|
|
|
private final String searchText;
|
|
private final boolean useRegex;
|
|
private final boolean wholeWordSearch;
|
|
private final List<PDFText> textOccurrences = new ArrayList<>();
|
|
|
|
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch)
|
|
throws IOException {
|
|
this.searchText = searchText.toLowerCase();
|
|
this.useRegex = useRegex;
|
|
this.wholeWordSearch = wholeWordSearch;
|
|
setSortByPosition(true);
|
|
}
|
|
|
|
private List<MatchInfo> findOccurrencesInText(String searchText, String content) {
|
|
List<MatchInfo> matches = new ArrayList<>();
|
|
|
|
Pattern pattern;
|
|
|
|
if (useRegex) {
|
|
// Use regex-based search
|
|
pattern =
|
|
wholeWordSearch
|
|
? Pattern.compile("\\b" + searchText + "\\b")
|
|
: Pattern.compile(searchText);
|
|
} else {
|
|
// Use normal text search
|
|
pattern =
|
|
wholeWordSearch
|
|
? Pattern.compile("\\b" + Pattern.quote(searchText) + "\\b")
|
|
: Pattern.compile(Pattern.quote(searchText));
|
|
}
|
|
|
|
Matcher matcher = pattern.matcher(content);
|
|
while (matcher.find()) {
|
|
matches.add(new MatchInfo(matcher.start(), matcher.end() - matcher.start()));
|
|
}
|
|
return matches;
|
|
}
|
|
|
|
@Override
|
|
protected void writeString(String text, List<TextPosition> textPositions) {
|
|
for (MatchInfo match : findOccurrencesInText(searchText, text.toLowerCase())) {
|
|
int index = match.startIndex;
|
|
if (index + match.matchLength <= textPositions.size()) {
|
|
// Initial values based on the first character
|
|
TextPosition first = textPositions.get(index);
|
|
float minX = first.getX();
|
|
float minY = first.getY();
|
|
float maxX = first.getX() + first.getWidth();
|
|
float maxY = first.getY() + first.getHeight();
|
|
|
|
// Loop over the rest of the characters and adjust bounding box values
|
|
for (int i = index; i < index + match.matchLength; i++) {
|
|
TextPosition position = textPositions.get(i);
|
|
minX = Math.min(minX, position.getX());
|
|
minY = Math.min(minY, position.getY());
|
|
maxX = Math.max(maxX, position.getX() + position.getWidth());
|
|
maxY = Math.max(maxY, position.getY() + position.getHeight());
|
|
}
|
|
|
|
textOccurrences.add(
|
|
new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
|
|
}
|
|
}
|
|
}
|
|
|
|
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
|
|
this.getText(document);
|
|
log.debug(
|
|
"Found "
|
|
+ textOccurrences.size()
|
|
+ " occurrences of '"
|
|
+ searchText
|
|
+ "' in the document.");
|
|
|
|
return textOccurrences;
|
|
}
|
|
|
|
private class MatchInfo {
|
|
int startIndex;
|
|
int matchLength;
|
|
|
|
MatchInfo(int startIndex, int matchLength) {
|
|
this.startIndex = startIndex;
|
|
this.matchLength = matchLength;
|
|
}
|
|
}
|
|
}
|