Fix Tesseract context trouble: pass the configured ParseContext to the parser and set the tessdata path

This commit is contained in:
manalejandro 2018-07-16 21:28:08 +02:00
parent d99f758e41
commit 01f386732d
2 changed files with 7 additions and 3 deletions

View File

@ -54,6 +54,9 @@ public class MainController {
@Value("${arjion.tesseractpath}")
private String tesseractpath;
@Value("${arjion.tesseractdatapath}")
private String tesseractdatapath;
@Autowired
public MainController(MainService mainService) {
this.mainService = mainService;
@ -95,18 +98,18 @@ public class MainController {
PDFParserConfig pdfConfig = new PDFParserConfig();
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
tesseractConfig.setTesseractPath(tesseractpath);
tesseractConfig.setTessdataPath(tesseractdatapath);
tesseractConfig.setLanguage("spa+eng");
pdfConfig.setExtractInlineImages(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
parseContext.set(PDFParserConfig.class, pdfConfig);
parseContext.set(Parser.class, parser);
// Usa -1 para no tener límite de 100000 chars
ContentHandler handler = new BodyContentHandler(-1);
// Castea los bytes al Stream de Tika
TikaInputStream stream = TikaInputStream.get(bytes);
// Parsea el contenido
parser.parse(stream, handler, metadata, new ParseContext());
parser.parse(stream, handler, metadata, parseContext);
// Identifica el idioma del archivo
LanguageIdentifier identifier = new LanguageIdentifier(handler.toString());
// Almacena en elasticsearch

View File

@ -7,7 +7,8 @@ elasticsearch.nodename=arjion
arjion.indexName=documentos
arjion.documentType=documento
arjion.uploadpath=/upload/
arjion.tesseractpath=/usr/bin/tesseract
arjion.tesseractpath=/usr/bin
arjion.tesseractdatapath=/usr/share/tesseract-ocr
spring.main.allow-bean-definition-overriding=true
spring.thymeleaf.enabled=true
spring.thymeleaf.prefix=classpath:/templates/