Compare commits

..

12 Commits

Author SHA1 Message Date
manalejandro
bcacdd1cef search service 2018-07-23 02:02:07 +02:00
manalejandro
aa8af0bfd4 v0.3.0 2018-07-21 15:59:42 +02:00
manalejandro
3f5718522f Dockerfile 2018-07-17 00:51:30 +02:00
manalejandro
72c57f5d31 Dockerfile 2018-07-17 00:08:29 +02:00
manalejandro
c7af93d069 Dockerfile 2018-07-16 23:40:08 +02:00
manalejandro
01f386732d tesseract context trouble 2018-07-16 21:28:08 +02:00
manalejandro
d99f758e41 tesseract language 2018-07-16 15:57:10 +02:00
manalejandro
0d837b5a42 tesseract language 2018-07-15 21:59:21 +02:00
manalejandro
81bb011a34 tesseract ocr 2018-07-15 20:02:24 +02:00
manalejandro
2597cbeee5 some changes 2018-07-15 18:03:58 +02:00
manalejandro
d4f77698c7 jai-imageio-jpeg2000 dependency 2018-07-15 15:21:48 +02:00
manalejandro
93f951ca22 download filename 2018-07-15 04:30:38 +02:00
15 changed files with 269 additions and 47 deletions

15
Dockerfile Normal file
View File

@ -0,0 +1,15 @@
# Builds a single image that runs Elasticsearch 6.x and the arjion Spring Boot
# application side by side, with Tesseract OCR (Spanish data included) installed.
FROM debian:stable-slim
RUN apt-get update
RUN apt-get -y upgrade
# /upload matches arjion.uploadpath in application.properties.
# NOTE(review): the man1/man8 directories are presumably created so that package
# post-install scripts (e.g. the JDK's) do not fail on the slim base image - confirm.
RUN mkdir /upload /usr/share/man/man1 /usr/share/man/man8
# Tooling needed to add an HTTPS APT repository, plus the JDK used to build and run the app.
RUN apt-get -y install --no-install-recommends apt apt-transport-https apt-utils readline-common curl gnupg software-properties-common dirmngr openjdk-8-jdk procps
# Register the Elastic 6.x APT repository and import the key used to verify it
# (presumably the Elastic signing key - verify the fingerprint before trusting it).
RUN echo "deb https://artifacts.elastic.co/packages/6.x/apt stable main" > /etc/apt/sources.list.d/elastic-6.x.list
RUN apt-key adv --recv-keys D27D666CD88E42B4
# Refresh the package index so the newly added repository is visible.
RUN apt-get update
RUN apt-get -y install --no-install-recommends maven tesseract-ocr tesseract-ocr-spa elasticsearch git
# Uncomment cluster.name and set it to "elasticsearch".
RUN sed -i "s/#cluster.name: my-application/cluster.name: elasticsearch/" /etc/elasticsearch/elasticsearch.yml
# Fetch the application sources and build them once at image build time.
RUN git clone https://gitlab.com/manalejandro/arjion
RUN mvn clean install -f /arjion/pom.xml
# The entrypoint starts the Elasticsearch service first and then the application.
RUN echo "/etc/init.d/elasticsearch start && mvn spring-boot:run -f /arjion/pom.xml" > entrypoint.sh
# EXPOSE only declares the container port; the host mapping is chosen at run time
# with `docker run -p 8080:8080`.
EXPOSE 8080
ENTRYPOINT ["bash", "entrypoint.sh"]

View File

@ -4,6 +4,11 @@
### Proof of Concept with [SpringBoot 2.1.0](https://start.spring.io/), [ElasticSearch](https://www.elastic.co/) and [Apache Tika](https://tika.apache.org/)
## Docker image
$ docker build -t debian:arjion --rm https://gitlab.com/manalejandro/arjion/raw/master/Dockerfile
$ docker run -ti -p 8080:8080 debian:arjion
## License
MIT

25
pom.xml
View File

@ -52,6 +52,31 @@
<version>1.18</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.18</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-jpeg2000</artifactId>
<version>1.3.0</version>
</dependency>
<dependency>
<groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId>
<version>2.0</version>
</dependency>
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
<version>1.4.0</version>
</dependency>
<dependency>
<groupId>org.webjars</groupId>
<artifactId>bootstrap</artifactId>

View File

@ -3,14 +3,17 @@ package com.manalejandro.arjion.controllers;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URLEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import javax.servlet.http.HttpServletResponse;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.manalejandro.arjion.model.Archivo;
import com.manalejandro.arjion.model.Documento;
import com.manalejandro.arjion.services.MainService;
@ -24,12 +27,14 @@ import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BodyContentHandler;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.ByteArrayResource;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
@ -49,6 +54,12 @@ public class MainController {
@Value("${arjion.uploadpath}")
private String uploadpath;
@Value("${arjion.tesseractpath}")
private String tesseractpath;
@Value("${arjion.tesseractdatapath}")
private String tesseractdatapath;
@Autowired
public MainController(MainService mainService) {
this.mainService = mainService;
@ -75,8 +86,7 @@ public class MainController {
documentoVO.setCount(mainService.count());
documentoVO.setDocumentos(mainService.findAllDocumento());
if (archivos.length > 0) {
documentoVO.setArchivos(new ArrayList<Archivo>());
// Recupera la conficuración de Tika
// Recupera la configuración de Tika
TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
// Itera los archivos recibidos
for (int i = 0; i < archivos.length; i++) {
@ -87,26 +97,41 @@ public class MainController {
Path path = Paths.get(uploadpath + filename);
// Instancias necesarias
Metadata metadata = new Metadata();
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
Parser parser = new AutoDetectParser(tikaConfig);
PDFParserConfig pdfConfig = new PDFParserConfig();
TesseractOCRConfig tesseractConfig = new TesseractOCRConfig();
tesseractConfig.setTesseractPath(tesseractpath);
tesseractConfig.setTessdataPath(tesseractdatapath);
tesseractConfig.setLanguage("spa+eng");
pdfConfig.setExtractInlineImages(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractConfig);
parseContext.set(PDFParserConfig.class, pdfConfig);
// Usa -1 para no tener límite de 100000 chars
ContentHandler handler = new BodyContentHandler(-1);
// Castea los bytes al Stream de Tika
TikaInputStream stream = TikaInputStream.get(bytes);
// Parsea el contenido
parser.parse(stream, handler, metadata, new ParseContext());
parser.parse(stream, handler, metadata, parseContext);
// Identifica el idioma del archivo
LanguageIdentifier identifier = new LanguageIdentifier(handler.toString());
// Almacena en elasticsearch
String[] names = metadata.names();
Map<String, String> meta = new HashMap<String, String>();
for (int j = 0; j < names.length; j++) {
meta.put(names[j], metadata.get(names[j]));
}
ObjectMapper mapper = new ObjectMapper();
if (!mainService.save(new Documento(filename, Long.valueOf(archivos[i].getSize()).intValue(),
metadata.toString(), handler.toString(), identifier.getLanguage()))) {
mapper.valueToTree(meta), handler.toString(), identifier.getLanguage()))) {
return "exists";
} else {
// Guarda el archivo en el directorio configurado en las properties
Files.write(path, bytes);
}
// Añade los parámetros al VO para mostrar en la vista
documentoVO.getArchivos().add(new Archivo(filename, String.valueOf(archivos[i].getSize()),
metadata.toString(), handler.toString(), identifier.getLanguage()));
documentoVO.getArchivos().add(new Archivo(filename, Long.valueOf(archivos[i].getSize()).intValue(), meta,
handler.toString(), identifier.getLanguage()));
}
}
model.addAttribute("documentoVO", documentoVO);
@ -116,7 +141,10 @@ public class MainController {
@GetMapping(path = "/detail")
public String detail(final Model model, @RequestParam(value = "nombre", required = true) String nombre) {
DetailVO detailVO = new DetailVO();
detailVO.setDocumento(mainService.findOne(nombre));
ObjectMapper mapper = new ObjectMapper();
Documento doc = mainService.findOne(nombre);
detailVO.setArchivo(new Archivo(doc.getNombre(), doc.getTamano(),
mapper.convertValue(doc.getMetadata(), Map.class), doc.getContenido(), doc.getLenguaje()));
model.addAttribute("detailVO", detailVO);
return "detail";
}
@ -130,9 +158,8 @@ public class MainController {
ByteArrayResource resource = new ByteArrayResource(Files.readAllBytes(path));
String type = file.toURL().openConnection().guessContentTypeFromName(filename);
HttpHeaders responseHeaders = new HttpHeaders();
responseHeaders.add("content-disposition", "attachment; filename=" + filename);
responseHeaders.add("Content-Disposition", "attachment; filename=" + URLEncoder.encode(filename, "UTF-8"));
responseHeaders.add("Content-Type", type);
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders)
.contentType(MediaType.parseMediaType("application/octet-stream")).body(resource);
return ResponseEntity.ok().contentLength(file.length()).headers(responseHeaders).body(resource);
}
}

View File

@ -1,14 +1,16 @@
package com.manalejandro.arjion.model;
import java.util.Map;
public class Archivo {
private String nombre;
private String tamano;
private String metadata;
private Integer tamano;
private Map metadata;
private String contenido;
private String lenguaje;
public Archivo(String nombre, String tamano, String metadata, String contenido, String lenguaje) {
public Archivo(String nombre, Integer tamano, Map metadata, String contenido, String lenguaje) {
this.nombre = nombre;
this.tamano = tamano;
this.metadata = metadata;
@ -26,14 +28,14 @@ public class Archivo {
/**
* @return the tamano
*/
public String getTamano() {
public Integer getTamano() {
return tamano;
}
/**
* @return the metadata
*/
public String getMetadata() {
public Map getMetadata() {
return metadata;
}
@ -54,14 +56,14 @@ public class Archivo {
/**
* @param tamano the tamano to set
*/
public void setTamano(String tamano) {
public void setTamano(Integer tamano) {
this.tamano = tamano;
}
/**
* @param metadata the metadata to set
*/
public void setMetadata(String metadata) {
public void setMetadata(Map metadata) {
this.metadata = metadata;
}

View File

@ -0,0 +1,52 @@
package com.manalejandro.arjion.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Result holder for a search: the matching documents, a single corrected-phrase
 * suggestion and a list of autocomplete candidates.
 */
public class Consulta {

    /** Documents matched by the query; starts out as an empty list. */
    private List<Documento> documentos = new ArrayList<>();

    /** Suggested replacement phrase for the query text. */
    private String suggest;

    /** Autocomplete candidates; starts out as an empty list. */
    private List<String> autocomplete = new ArrayList<>();

    /**
     * @return the documentos
     */
    public List<Documento> getDocumentos() {
        return this.documentos;
    }

    /**
     * @param documentos the documentos to set
     */
    public void setDocumentos(List<Documento> documentos) {
        this.documentos = documentos;
    }

    /**
     * @return the suggest
     */
    public String getSuggest() {
        return this.suggest;
    }

    /**
     * @param suggest the suggest to set
     */
    public void setSuggest(String suggest) {
        this.suggest = suggest;
    }

    /**
     * @return the autocomplete
     */
    public List<String> getAutocomplete() {
        return this.autocomplete;
    }

    /**
     * @param autocomplete the autocomplete to set
     */
    public void setAutocomplete(List<String> autocomplete) {
        this.autocomplete = autocomplete;
    }
}

View File

@ -1,13 +1,14 @@
package com.manalejandro.arjion.model;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.JsonNode;
import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Mapping;
import org.springframework.data.elasticsearch.annotations.Setting;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
@Document(indexName = "#{@indexName}", type = "#{@documentType}")
@Setting(settingPath = "/elasticsearch/settings.json")
@Mapping(mappingPath = "/elasticsearch/mapping.json")
@ -15,15 +16,14 @@ public class Documento {
@Id
public String nombre;
public Integer tamano;
public String metadata;
public JsonNode metadata;
public String contenido;
public String lenguaje;
@JsonCreator
public Documento(@JsonProperty("nombre") String nombre, @JsonProperty("tamano") Integer tamano,
@JsonProperty("metadata") String metadata, @JsonProperty("contenido") String contenido,
@JsonProperty("metadata") JsonNode metadata, @JsonProperty("contenido") String contenido,
@JsonProperty("lenguaje") String lenguaje) {
super();
this.nombre = nombre;
this.tamano = tamano;
this.metadata = metadata;
@ -65,14 +65,14 @@ public class Documento {
* @return the metadata
*/
@JsonProperty("metadata")
public String getMetadata() {
public JsonNode getMetadata() {
return metadata;
}
/**
* @param metadata the metadata to set
*/
public void setMetadata(String metadata) {
public void setMetadata(JsonNode metadata) {
this.metadata = metadata;
}

View File

@ -2,8 +2,11 @@ package com.manalejandro.arjion.services;
import java.util.List;
import com.manalejandro.arjion.model.Consulta;
import com.manalejandro.arjion.model.Documento;
import org.springframework.data.domain.Pageable;
public interface MainService {
public boolean save(Documento doc);
@ -13,4 +16,8 @@ public interface MainService {
public List<Documento> findAllDocumento();
public Documento findOne(String nombre);
public Integer maxTamano();
public Consulta search(String busqueda, String[] tipo, Integer tamano, Pageable pageable);
}

View File

@ -1,22 +1,46 @@
package com.manalejandro.arjion.services;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.manalejandro.arjion.model.Consulta;
import com.manalejandro.arjion.model.Documento;
import com.manalejandro.arjion.repositories.MainRepository;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
import org.elasticsearch.search.suggest.SuggestBuilder;
import org.elasticsearch.search.suggest.SuggestBuilders;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationContext;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service;
@Service
public class MainServiceImpl implements MainService {
private final ApplicationContext appContext;
private final MainRepository mainRepository;
@Value("#{@indexName}")
private String index;
@Value("#{@documentType}")
private String document;
@Autowired
public MainServiceImpl(MainRepository mainRepository) {
public MainServiceImpl(MainRepository mainRepository, ApplicationContext appContext) {
this.mainRepository = mainRepository;
this.appContext = appContext;
}
@Override
@ -48,4 +72,56 @@ public class MainServiceImpl implements MainService {
// Looks up a single document by its id (the document name).
// Throws java.util.NoSuchElementException when no document with that id exists;
// this is the same exception type the bare Optional.get() raised, but the
// message now names the missing id.
public Documento findOne(String nombre) {
    return mainRepository.findById(nombre)
            .orElseThrow(() -> new java.util.NoSuchElementException("Documento not found: " + nombre));
}
/**
 * Returns the {@code tamano} of the largest stored document.
 *
 * @return the {@code tamano} of the first document when sorted by {@code tamano}
 *         descending, or {@code 0} when the repository returns no documents
 */
@Override
public Integer maxTamano() {
    // Sorted descending, so the first element (if any) carries the maximum.
    java.util.Iterator<Documento> ordenados = mainRepository.findAll(new Sort(Sort.Direction.DESC, "tamano"))
            .iterator();
    // Both branches are Integer on purpose: mixing Integer and int here would
    // unbox and could throw on a null tamano.
    return ordenados.hasNext() ? ordenados.next().getTamano() : Integer.valueOf(0);
}
/**
 * Searches the index and collects hits, a phrase suggestion and autocomplete
 * candidates into a {@link Consulta}.
 *
 * <p>Query construction:
 * <ul>
 * <li>when {@code busqueda} carries text (not {@code null}, not the literal
 * {@code "null"}, not empty) it must match {@code nombre} and should match
 * {@code contenido}; the two suggesters are only requested in this case;</li>
 * <li>a non-empty {@code tipo} adds a terms filter on {@code tipo};</li>
 * <li>a non-negative {@code tamano} adds an inclusive upper bound on {@code tamano}.</li>
 * </ul>
 *
 * @param busqueda free text to search for; may be {@code null} or empty
 * @param tipo     values for the {@code tipo} terms filter; may be {@code null} or empty
 * @param tamano   inclusive maximum {@code tamano}; ignored when {@code null} or negative
 * @param pageable supplies the page size and the offset of the first hit
 * @return the populated result; hits whose source cannot be deserialized are skipped
 */
@Override
public Consulta search(String busqueda, String[] tipo, Integer tamano, Pageable pageable) {
    java.util.logging.Logger log = java.util.logging.Logger.getLogger(MainServiceImpl.class.getName());
    Client client = appContext.getBean("client", Client.class);
    boolean hasText = busqueda != null && !"null".equals(busqueda) && !busqueda.isEmpty();

    BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
    if (hasText) {
        boolQueryBuilder.must(QueryBuilders.matchQuery("nombre", busqueda));
        boolQueryBuilder.should(QueryBuilders.matchQuery("contenido", busqueda));
    }
    if (tipo != null && tipo.length > 0) {
        boolQueryBuilder.filter(QueryBuilders.termsQuery("tipo", tipo));
    }
    if (tamano != null && tamano >= 0) {
        boolQueryBuilder.must(QueryBuilders.rangeQuery("tamano").to(tamano).includeUpper(true));
    }
    // The aggregation is still sent with the request although its result is not read here.
    AggregationBuilder aggregation = AggregationBuilders.terms("by_xarchivo").field("x_archivo").size(10000);
    log.fine(() -> String.valueOf(boolQueryBuilder));

    // "from" is a document offset, not a page index, hence getOffset().
    org.elasticsearch.action.search.SearchRequestBuilder request = client.prepareSearch(index)
            .setQuery(boolQueryBuilder).addAggregation(aggregation).setSize(pageable.getPageSize())
            .setFrom(Math.toIntExact(pageable.getOffset()));
    if (hasText) {
        // Suggesters need real text; they are skipped for a null/empty search.
        request.suggest(new SuggestBuilder()
                .addSuggestion("suggest", SuggestBuilders.completionSuggestion("nombre").text(busqueda).size(10))
                .addSuggestion("phrase", SuggestBuilders.phraseSuggestion("nombre").text(busqueda).size(1)
                        .realWordErrorLikelihood((float) 0.95).maxErrors((float) 0.5).gramSize(2)));
    }
    SearchResponse response = request.execute().actionGet();

    Consulta consulta = new Consulta();

    // First option of the first "phrase" entry, or "" when there is none.
    String phrase = "";
    List<? extends Entry<? extends Option>> phraseEntries = suggestionEntries(response.getSuggest(), "phrase");
    if (!phraseEntries.isEmpty()) {
        List<? extends Option> phraseOptions = phraseEntries.get(0).getOptions();
        if (!phraseOptions.isEmpty()) {
            phrase = phraseOptions.get(0).getText().string();
        }
    }
    consulta.setSuggest(phrase);

    if (hasText) {
        // Strip everything but letters, digits and spaces so each term is safe inside the regex.
        String[] terms = busqueda.replaceAll("[^\\p{Alnum}\\p{IsAlphabetic} ]", "").split(" ");
        for (Entry<? extends Option> entry : suggestionEntries(response.getSuggest(), "suggest")) {
            for (Option option : entry.getOptions()) {
                String suggestText = option.getText().string().trim();
                // NOTE(review): one autocomplete item is added per search term, so a
                // multi-word query lists each suggestion several times - confirm intended.
                for (String item : terms) {
                    if (item.length() > 0) {
                        consulta.getAutocomplete().add(
                                suggestText.replaceAll("(?i)((?!<)" + item + "(?![^<>]*>))", "<strong>$1</strong>"));
                    }
                }
            }
        }
    }

    // Deserialize each hit from its JSON source; calling toString() on the hits
    // array would only yield the array's identity string, never JSON.
    ObjectMapper mapper = new ObjectMapper();
    List<Documento> documentos = new ArrayList<Documento>();
    for (org.elasticsearch.search.SearchHit hit : response.getHits()) {
        String source = hit.getSourceAsString();
        if (source == null) {
            continue;
        }
        try {
            documentos.add(mapper.readValue(source, Documento.class));
        } catch (IOException e) {
            // Best effort: skip the unreadable hit and keep the rest.
            log.log(java.util.logging.Level.WARNING, "Could not deserialize hit " + hit.getId(), e);
        }
    }
    consulta.setDocumentos(documentos);
    return consulta;
}

/** Returns the entries of the named suggestion, or an empty list when it is absent. */
private static List<? extends Entry<? extends Option>> suggestionEntries(
        org.elasticsearch.search.suggest.Suggest suggest, String name) {
    if (suggest == null) {
        return java.util.Collections.emptyList();
    }
    org.elasticsearch.search.suggest.Suggest.Suggestion<? extends Entry<? extends Option>> suggestion = suggest
            .getSuggestion(name);
    if (suggestion == null) {
        return java.util.Collections.emptyList();
    }
    return suggestion.getEntries();
}
}

View File

@ -1,22 +1,22 @@
package com.manalejandro.arjion.vo;
import com.manalejandro.arjion.model.Documento;
import com.manalejandro.arjion.model.Archivo;
public class DetailVO {
private Documento documento;
private Archivo archivo;
/**
* @return the documento
* @return the archivo
*/
public Documento getDocumento() {
return documento;
public Archivo getArchivo() {
return archivo;
}
/**
* @param documento the documento to set
* @param archivo the archivo to set
*/
public void setDocumento(Documento documento) {
this.documento = documento;
public void setArchivo(Archivo archivo) {
this.archivo = archivo;
}
}

View File

@ -1,5 +1,6 @@
package com.manalejandro.arjion.vo;
import java.util.ArrayList;
import java.util.List;
import com.manalejandro.arjion.model.Archivo;
@ -7,9 +8,9 @@ import com.manalejandro.arjion.model.Documento;
public class DocumentoVO {
private List<Archivo> archivos;
private List<Archivo> archivos = new ArrayList<Archivo>();
private long count;
private List<Documento> documentos;
private List<Documento> documentos = new ArrayList<Documento>();
/**
* @return the archivos

View File

@ -7,6 +7,8 @@ elasticsearch.nodename=arjion
arjion.indexName=documentos
arjion.documentType=documento
arjion.uploadpath=/upload/
arjion.tesseractpath=/usr/bin
arjion.tesseractdatapath=/usr/share/tesseract-ocr
spring.main.allow-bean-definition-overriding=true
spring.thymeleaf.enabled=true
spring.thymeleaf.prefix=classpath:/templates/

View File

@ -1,5 +1,15 @@
{
"documento": {
"dynamic_templates": [
{
"metadata_as_keywords": {
"path_match": "metadata.*",
"mapping": {
"type": "keyword"
}
}
}
],
"properties": {
"@timestamp": {
"type": "date",
@ -15,7 +25,7 @@
"type": "long"
},
"metadata": {
"type": "text"
"type": "object"
},
"contenido": {
"type": "text"

View File

@ -16,20 +16,20 @@
<a th:href="@{/}">
<h1 class="text-primary">Arjion</h1>
</a>
<h3 class="text-warning">[[${detailVO.documento.nombre}]]</h3>
<h3 class="text-warning">[[${detailVO.archivo.nombre}]]</h3>
</header>
<section class="col-md-12">
<hr>
</section>
<section>
<span class="col-md-1 text-muted">Tamaño</span>
<span class="col-md-11 text-muted">[[${detailVO.documento.tamano}]] bytes</span>
<span class="col-md-11 text-muted">[[${detailVO.archivo.tamano}]] bytes</span>
<span class="col-md-1 text-muted">Lenguaje</span>
<span class="col-md-11 text-muted">[[${detailVO.documento.lenguaje}]]</span>
<span class="col-md-11 text-muted">[[${detailVO.archivo.lenguaje}]]</span>
<span class="col-md-1 text-success">Metadatos</span>
<span class="col-md-11 text-success">[[${detailVO.documento.metadata}]]</span>
<span class="col-md-11 text-success"><ul><li th:each="meta : ${detailVO.archivo.metadata}"><span th:text="${meta.key}"></span>: <span th:text="${meta.value}"></span></li></ul></span>
<span class="col-md-1 text-warning">Contenido</span>
<pre class="col-md-11 text-warning">[[${detailVO.documento.contenido}]]</pre>
<pre class="col-md-11 text-warning">[[${detailVO.archivo.contenido}]]</pre>
</div>
</section>
<section class="col-md-12">
@ -37,7 +37,7 @@
</section>
<footer class="col-md-12 text-center">
<span class="col-md-12">
<button class="btn btn-primary" th:onclick="'window.location.pathname=\'' + @{/} + '\''">Volver</button>
<a class="btn btn-primary" th:href="@{/}">Volver</a>
</span>
<span>2018</span>
</footer>

View File

@ -49,7 +49,7 @@
<span class="col-md-1 text-muted">Lenguaje</span>
<span class="col-md-11 text-muted">[[${arc.lenguaje}]]</span>
<span class="col-md-1 text-success">Metadatos</span>
<span class="col-md-11 text-success">[[${arc.metadata}]]</span>
<span class="col-md-11 text-success"><ul><li th:each="meta : ${arc.metadata}"><span th:text="${meta.key}"></span>: <span th:text="${meta.value}"></span></li></ul></span>
<span class="col-md-1 text-warning">Contenido</span>
<pre class="col-md-11 text-warning">[[${arc.contenido}]]</pre>
<span class="col-md-12">