tesseract context trouble
This commit is contained in:
parent
d99f758e41
commit
01f386732d
@@ -54,6 +54,9 @@ public class MainController {
|
||||
@Value("${arjion.tesseractpath}")
|
||||
private String tesseractpath;
|
||||
|
||||
@Value("${arjion.tesseractdatapath}")
|
||||
private String tesseractdatapath;
|
||||
|
||||
@Autowired
|
||||
public MainController(MainService mainService) {
|
||||
this.mainService = mainService;
|
||||
@@ -95,18 +98,18 @@ public class MainController {
|
||||
PDFParserConfig pdfConfig = new PDFParserConfig();
|
||||
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
|
||||
tesseractConfig.setTesseractPath(tesseractpath);
|
||||
tesseractConfig.setTessdataPath(tesseractdatapath);
|
||||
tesseractConfig.setLanguage("spa+eng");
|
||||
pdfConfig.setExtractInlineImages(true);
|
||||
ParseContext parseContext = new ParseContext();
|
||||
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
|
||||
parseContext.set(PDFParserConfig.class, pdfConfig);
|
||||
parseContext.set(Parser.class, parser);
|
||||
// Usa -1 para no tener límite de 100000 chars
|
||||
ContentHandler handler = new BodyContentHandler(-1);
|
||||
// Castea los bytes al Stream de Tika
|
||||
TikaInputStream stream = TikaInputStream.get(bytes);
|
||||
// Parsea el contenido
|
||||
- parser.parse(stream, handler, metadata, new ParseContext());
|
||||
+ parser.parse(stream, handler, metadata, parseContext);
|
||||
// Identifica el idioma del archivo
|
||||
LanguageIdentifier identifier = new LanguageIdentifier(handler.toString());
|
||||
// Almacena en elasticsearch
|
||||
|
@@ -7,7 +7,8 @@ elasticsearch.nodename=arjion
|
||||
arjion.indexName=documentos
|
||||
arjion.documentType=documento
|
||||
arjion.uploadpath=/upload/
|
||||
- arjion.tesseractpath=/usr/bin/tesseract
|
||||
+ arjion.tesseractpath=/usr/bin
|
||||
+ arjion.tesseractdatapath=/usr/share/tesseract-ocr
|
||||
spring.main.allow-bean-definition-overriding=true
|
||||
spring.thymeleaf.enabled=true
|
||||
spring.thymeleaf.prefix=classpath:/templates/
|
||||
|
Loading…
Reference in New Issue
Block a user