在做全文检索的处理中,我们可能需要从非文本文件中抽取文本内容。
PDF 文件可以用 Apache PDFBox 来抽取文本。
例:
/**
 * Loads the PDF at {@code c:/test.pdf}, extracts its text with
 * {@code PDFTextStripper} and prints the text to standard output.
 *
 * <p>An {@code IOException} raised while loading or extracting is reported via
 * {@code printStackTrace()}. The document is closed in a {@code finally} block
 * so it is released even when extraction fails.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String pdfFile = "c:/test.pdf";
    PDDocument doc = null;
    try {
        doc = PDDocument.load(pdfFile);
        PDFTextStripper stripper = new PDFTextStripper();
        System.out.print(stripper.getText(doc));
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close here rather than inside the try body: otherwise a failure in
        // getText() would skip the close and leak the document.
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
Office 系列文档可以用 Apache POI 来抽取文本。
例:
FileInputStream fis = new FileInputStream(inputFile); POIFSFileSystem fileSystem = new POIFSFileSystem(fis); // Firstly, get an extractor for the Workbook POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem); // Then a List of extractors for any embedded Excel, Word, PowerPoint // or Visio objects embedded into it. POITextExtractor[] embeddedExtractors = ExtractorFactory.getEmbededDocsTextExtractors(oleTextExtractor); for (POITextExtractor textExtractor : embeddedExtractors) { // If the embedded object was an Excel spreadsheet. if (textExtractor instanceof ExcelExtractor) { ExcelExtractor excelExtractor = (ExcelExtractor) textExtractor; System.out.println(excelExtractor.getText()); } // A Word Document else if (textExtractor instanceof WordExtractor) { WordExtractor wordExtractor = (WordExtractor) textExtractor; String[] paragraphText = wordExtractor.getParagraphText(); for (String paragraph : paragraphText) { System.out.println(paragraph); } // Display the document's header and footer text System.out.println("Footer text: " + wordExtractor.getFooterText()); System.out.println("Header text: " + wordExtractor.getHeaderText()); } // PowerPoint Presentation. else if (textExtractor instanceof PowerPointExtractor) { PowerPointExtractor powerPointExtractor = (PowerPointExtractor) textExtractor; System.out.println("Text: " + powerPointExtractor.getText()); System.out.println("Notes: " + powerPointExtractor.getNotes()); } // Visio Drawing else if (textExtractor instanceof VisioTextExtractor) { VisioTextExtractor visioTextExtractor = (VisioTextExtractor) textExtractor; System.out.println("Text: " + visioTextExtractor.getText()); } }