From 01f386732d86efa4cf26ffd06ba828c3a679869b Mon Sep 17 00:00:00 2001 From: manalejandro Date: Mon, 16 Jul 2018 21:28:08 +0200 Subject: [PATCH] tesseract context trouble --- .../manalejandro/arjion/controllers/MainController.java | 7 +++++-- src/main/resources/application.properties | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/manalejandro/arjion/controllers/MainController.java b/src/main/java/com/manalejandro/arjion/controllers/MainController.java index a13b92b..7751dde 100644 --- a/src/main/java/com/manalejandro/arjion/controllers/MainController.java +++ b/src/main/java/com/manalejandro/arjion/controllers/MainController.java @@ -54,6 +54,9 @@ public class MainController { @Value("${arjion.tesseractpath}") private String tesseractpath; + @Value("${arjion.tesseractdatapath}") + private String tesseractdatapath; + @Autowired public MainController(MainService mainService) { this.mainService = mainService; @@ -95,18 +98,18 @@ public class MainController { PDFParserConfig pdfConfig = new PDFParserConfig(); TesseractOCRConfig tesseractConfig = new TesseractOCRConfig(); tesseractConfig.setTesseractPath(tesseractpath); + tesseractConfig.setTessdataPath(tesseractdatapath); tesseractConfig.setLanguage("spa+eng"); pdfConfig.setExtractInlineImages(true); ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, tesseractConfig); parseContext.set(PDFParserConfig.class, pdfConfig); - parseContext.set(Parser.class, parser); // Usa -1 para no tener límite de 100000 chars ContentHandler handler = new BodyContentHandler(-1); // Castea los bytes al Stream de Tika TikaInputStream stream = TikaInputStream.get(bytes); // Parsea el contenido - parser.parse(stream, handler, metadata, new ParseContext()); + parser.parse(stream, handler, metadata, parseContext); // Identifica el idioma del archivo LanguageIdentifier identifier = new LanguageIdentifier(handler.toString()); // Almacena en elasticsearch diff --git 
a/src/main/resources/application.properties b/src/main/resources/application.properties index c33280b..d57275f 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -7,7 +7,8 @@ elasticsearch.nodename=arjion arjion.indexName=documentos arjion.documentType=documento arjion.uploadpath=/upload/ -arjion.tesseractpath=/usr/bin/tesseract +arjion.tesseractpath=/usr/bin +arjion.tesseractdatapath=/usr/share/tesseract-ocr spring.main.allow-bean-definition-overriding=true spring.thymeleaf.enabled=true spring.thymeleaf.prefix=classpath:/templates/