Compare commits

..

1 Commits
dl4j ... master

Author SHA1 Message Date
manalejandro
bcacdd1cef search service 2018-07-23 02:02:07 +02:00
8 changed files with 178 additions and 74 deletions

87
pom.xml
View File

@ -10,19 +10,19 @@
<packaging>war</packaging> <packaging>war</packaging>
<name>arjion</name> <name>arjion</name>
<description>Demo project of Apache Tika for Spring Boot and ML</description> <description>Demo project of Apache Tika for Spring Boot</description>
<parent> <parent>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId> <artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.3.RELEASE</version> <version>2.1.0.BUILD-SNAPSHOT</version>
<relativePath /> <!-- lookup parent from repository --> <relativePath /> <!-- lookup parent from repository -->
</parent> </parent>
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>11</java.version> <java.version>1.8</java.version>
</properties> </properties>
<dependencies> <dependencies>
@ -38,11 +38,6 @@
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId> <artifactId>spring-boot-starter-web</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>x-pack-transport</artifactId>
<version>6.4.2</version>
</dependency>
<dependency> <dependency>
<groupId>org.springframework.boot</groupId> <groupId>org.springframework.boot</groupId>
@ -50,37 +45,37 @@
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-app --> <!-- https://mvnrepository.com/artifact/org.apache.tika/tika-app -->
<dependency> <dependency>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-app</artifactId> <artifactId>tika-app</artifactId>
<version>1.20</version> <version>1.18</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.tika</groupId> <groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId> <artifactId>tika-parsers</artifactId>
<version>1.20</version> <version>1.18</version>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 --> <!-- https://mvnrepository.com/artifact/com.github.jai-imageio/jai-imageio-jpeg2000 -->
<dependency> <dependency>
<groupId>com.github.jai-imageio</groupId> <groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-jpeg2000</artifactId> <artifactId>jai-imageio-jpeg2000</artifactId>
<version>1.3.0</version> <version>1.3.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.levigo.jbig2</groupId> <groupId>com.levigo.jbig2</groupId>
<artifactId>levigo-jbig2-imageio</artifactId> <artifactId>levigo-jbig2-imageio</artifactId>
<version>2.0</version> <version>2.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.jai-imageio</groupId> <groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId> <artifactId>jai-imageio-core</artifactId>
<version>1.4.0</version> <version>1.4.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.webjars</groupId> <groupId>org.webjars</groupId>
@ -93,17 +88,6 @@
<version>3.1.1-1</version> <version>3.1.1-1</version>
</dependency> </dependency>
<dependency>
<groupId>org.nd4j</groupId>
<artifactId>nd4j-native-platform</artifactId>
<version>1.0.0-beta3</version>
</dependency>
<dependency>
<groupId>org.deeplearning4j</groupId>
<artifactId>deeplearning4j-core</artifactId>
<version>1.0.0-beta3</version>
</dependency>
</dependencies> </dependencies>
<build> <build>
@ -132,17 +116,6 @@
<enabled>false</enabled> <enabled>false</enabled>
</snapshots> </snapshots>
</repository> </repository>
<!-- add the elastic repo -->
<repository>
<id>elastic</id>
<url>https://artifacts.elastic.co/maven</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories> </repositories>
<pluginRepositories> <pluginRepositories>

View File

@ -10,6 +10,7 @@ import java.nio.file.Paths;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpServletResponse;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -87,17 +88,13 @@ public class MainController {
if (archivos.length > 0) { if (archivos.length > 0) {
// Recupera la configuración de Tika // Recupera la configuración de Tika
TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
// Si no existe el directorio lo creamos
File fup = new File(uploadpath);
if(!fup.exists() || !fup.isDirectory()) {
fup.mkdir();
}
// Itera los archivos recibidos // Itera los archivos recibidos
for (int i = 0; i < archivos.length; i++) { for (int i = 0; i < archivos.length; i++) {
byte[] bytes = archivos[i].getBytes(); byte[] bytes = archivos[i].getBytes();
// Normaliza el título de los archivos // Normaliza el título de los archivos
String normalized = Normalizer.normalize(archivos[i].getOriginalFilename(), Normalizer.Form.NFD), String normalized = Normalizer.normalize(archivos[i].getOriginalFilename(), Normalizer.Form.NFD),
filename = normalized.replaceAll("\\p{InCombiningDiacriticalMarks}+", ""); filename = normalized.replaceAll("\\p{InCombiningDiacriticalMarks}+", "");
Path path = Paths.get(uploadpath + filename);
// Instancias necesarias // Instancias necesarias
Metadata metadata = new Metadata(); Metadata metadata = new Metadata();
Parser parser = new AutoDetectParser(tikaConfig); Parser parser = new AutoDetectParser(tikaConfig);
@ -130,11 +127,10 @@ public class MainController {
return "exists"; return "exists";
} else { } else {
// Guarda el archivo en el directorio configurado en las properties // Guarda el archivo en el directorio configurado en las properties
Path path = Paths.get(uploadpath + filename);
Files.write(path, bytes); Files.write(path, bytes);
} }
// Añade los parámetros al VO para mostrar en la vista // Añade los parámetros al VO para mostrar en la vista
documentoVO.getArchivos().add(new Archivo(filename, String.valueOf(archivos[i].getSize()), meta, documentoVO.getArchivos().add(new Archivo(filename, Long.valueOf(archivos[i].getSize()).intValue(), meta,
handler.toString(), identifier.getLanguage())); handler.toString(), identifier.getLanguage()));
} }
} }
@ -147,7 +143,7 @@ public class MainController {
DetailVO detailVO = new DetailVO(); DetailVO detailVO = new DetailVO();
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
Documento doc = mainService.findOne(nombre); Documento doc = mainService.findOne(nombre);
detailVO.setArchivo(new Archivo(doc.getNombre(), doc.getTamano().toString(), detailVO.setArchivo(new Archivo(doc.getNombre(), doc.getTamano(),
mapper.convertValue(doc.getMetadata(), Map.class), doc.getContenido(), doc.getLenguaje())); mapper.convertValue(doc.getMetadata(), Map.class), doc.getContenido(), doc.getLenguaje()));
model.addAttribute("detailVO", detailVO); model.addAttribute("detailVO", detailVO);
return "detail"; return "detail";

View File

@ -5,12 +5,12 @@ import java.util.Map;
public class Archivo { public class Archivo {
private String nombre; private String nombre;
private String tamano; private Integer tamano;
private Map metadata; private Map metadata;
private String contenido; private String contenido;
private String lenguaje; private String lenguaje;
public Archivo(String nombre, String tamano, Map metadata, String contenido, String lenguaje) { public Archivo(String nombre, Integer tamano, Map metadata, String contenido, String lenguaje) {
this.nombre = nombre; this.nombre = nombre;
this.tamano = tamano; this.tamano = tamano;
this.metadata = metadata; this.metadata = metadata;
@ -28,7 +28,7 @@ public class Archivo {
/** /**
* @return the tamano * @return the tamano
*/ */
public String getTamano() { public Integer getTamano() {
return tamano; return tamano;
} }
@ -56,7 +56,7 @@ public class Archivo {
/** /**
* @param tamano the tamano to set * @param tamano the tamano to set
*/ */
public void setTamano(String tamano) { public void setTamano(Integer tamano) {
this.tamano = tamano; this.tamano = tamano;
} }

View File

@ -0,0 +1,52 @@
package com.manalejandro.arjion.model;
import java.util.ArrayList;
import java.util.List;
/**
 * Result holder for a search request: the matching documents, a single
 * alternative-query suggestion and a list of autocomplete candidates.
 */
public class Consulta {

    /** Documents matched by the query; starts out as an empty list. */
    private List<Documento> documentos = new ArrayList<>();

    /** Suggested replacement query text. */
    private String suggest;

    /** Autocomplete candidates; starts out as an empty list. */
    private List<String> autocomplete = new ArrayList<>();

    /**
     * @return the documentos
     */
    public List<Documento> getDocumentos() {
        return this.documentos;
    }

    /**
     * @param documentos the documentos to set
     */
    public void setDocumentos(List<Documento> documentos) {
        this.documentos = documentos;
    }

    /**
     * @return the suggest
     */
    public String getSuggest() {
        return this.suggest;
    }

    /**
     * @param suggest the suggest to set
     */
    public void setSuggest(String suggest) {
        this.suggest = suggest;
    }

    /**
     * @return the autocomplete
     */
    public List<String> getAutocomplete() {
        return this.autocomplete;
    }

    /**
     * @param autocomplete the autocomplete to set
     */
    public void setAutocomplete(List<String> autocomplete) {
        this.autocomplete = autocomplete;
    }
}

View File

@ -2,8 +2,11 @@ package com.manalejandro.arjion.services;
import java.util.List; import java.util.List;
import com.manalejandro.arjion.model.Consulta;
import com.manalejandro.arjion.model.Documento; import com.manalejandro.arjion.model.Documento;
import org.springframework.data.domain.Pageable;
public interface MainService { public interface MainService {
public boolean save(Documento doc); public boolean save(Documento doc);
@ -13,4 +16,8 @@ public interface MainService {
public List<Documento> findAllDocumento(); public List<Documento> findAllDocumento();
public Documento findOne(String nombre); public Documento findOne(String nombre);
public Integer maxTamano();
public Consulta search(String busqueda, String[] tipo, Integer tamano, Pageable pageable);
} }

View File

@ -1,22 +1,46 @@
package com.manalejandro.arjion.services; package com.manalejandro.arjion.services;
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.manalejandro.arjion.model.Consulta;
import com.manalejandro.arjion.model.Documento; import com.manalejandro.arjion.model.Documento;
import com.manalejandro.arjion.repositories.MainRepository; import com.manalejandro.arjion.repositories.MainRepository;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.aggregations.AggregationBuilder;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
import org.elasticsearch.search.suggest.SuggestBuilder;
import org.elasticsearch.search.suggest.SuggestBuilders;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.ApplicationContext;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Sort;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
@Service @Service
public class MainServiceImpl implements MainService { public class MainServiceImpl implements MainService {
private final ApplicationContext appContext;
private final MainRepository mainRepository; private final MainRepository mainRepository;
@Value("#{@indexName}")
private String index;
@Value("#{@documentType}")
private String document;
@Autowired @Autowired
public MainServiceImpl(MainRepository mainRepository) { public MainServiceImpl(MainRepository mainRepository, ApplicationContext appContext) {
this.mainRepository = mainRepository; this.mainRepository = mainRepository;
this.appContext = appContext;
} }
@Override @Override
@ -48,4 +72,56 @@ public class MainServiceImpl implements MainService {
public Documento findOne(String nombre) { public Documento findOne(String nombre) {
return mainRepository.findById(nombre).get(); return mainRepository.findById(nombre).get();
} }
/**
 * Returns the {@code tamano} of the largest stored document.
 *
 * <p>Documents are read sorted by {@code tamano} in descending order and the
 * first one is used.
 *
 * @return the largest {@code tamano}, or {@code 0} when no document is stored
 */
@Override
public Integer maxTamano() {
    java.util.Iterator<Documento> largestFirst = mainRepository
            .findAll(new Sort(Sort.Direction.DESC, "tamano")).iterator();
    // An empty index used to surface as NoSuchElementException from next().
    // Integer.valueOf keeps both ternary branches boxed, so a null tamano is not unboxed here.
    return largestFirst.hasNext() ? largestFirst.next().getTamano() : Integer.valueOf(0);
}
/**
 * Searches the configured index and collects the matching documents, a
 * phrase ("did you mean") suggestion and highlighted autocomplete candidates.
 *
 * <p>The text query must match {@code nombre} and may additionally match
 * {@code contenido}; {@code tipo} is applied as a terms filter and
 * {@code tamano} as an inclusive upper bound.
 *
 * @param busqueda free-text query; {@code null}, empty or the literal
 *                 {@code "null"} means "no text", in which case no text
 *                 clause and no suggesters are sent
 * @param tipo     values for the {@code tipo} terms filter; ignored when
 *                 {@code null} or empty
 * @param tamano   inclusive upper bound for {@code tamano}; ignored when
 *                 {@code null} or negative
 * @param pageable supplies the page size and the offset of the first hit
 * @return the populated {@link Consulta}; never {@code null}
 */
@Override
public Consulta search(String busqueda, String[] tipo, Integer tamano, Pageable pageable) {
    Client client = (Client) appContext.getBean("client");
    boolean hasText = busqueda != null && !"null".equals(busqueda) && !busqueda.isEmpty();

    BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
    if (hasText) {
        boolQueryBuilder.must(QueryBuilders.matchQuery("nombre", busqueda));
        boolQueryBuilder.should(QueryBuilders.matchQuery("contenido", busqueda));
    }
    if (tipo != null && tipo.length > 0) {
        boolQueryBuilder.filter(QueryBuilders.termsQuery("tipo", tipo));
    }
    if (tamano != null && tamano >= 0) {
        boolQueryBuilder.must(QueryBuilders.rangeQuery("tamano").to(tamano).includeUpper(true));
    }
    AggregationBuilder aggregation = AggregationBuilders.terms("by_xarchivo").field("x_archivo").size(10000);

    // "from" is a document offset, not a page index, so use the pageable's offset
    // (page number * page size) instead of the bare page number.
    org.elasticsearch.action.search.SearchRequestBuilder request = client.prepareSearch(index)
            .setQuery(boolQueryBuilder)
            .addAggregation(aggregation)
            .setSize(pageable.getPageSize())
            .setFrom(Math.toIntExact(pageable.getOffset()));
    if (hasText) {
        // Suggesters need text to work on; skipping them when there is none also
        // keeps a null query from being dereferenced further down.
        request.suggest(new SuggestBuilder()
                .addSuggestion("suggest", SuggestBuilders.completionSuggestion("nombre").text(busqueda).size(10))
                .addSuggestion("phrase", SuggestBuilders.phraseSuggestion("nombre").text(busqueda).size(1)
                        .realWordErrorLikelihood((float) 0.95).maxErrors((float) 0.5).gramSize(2)));
    }
    SearchResponse response = request.execute().actionGet();

    Consulta consulta = new Consulta();
    org.elasticsearch.search.suggest.Suggest suggestions = response.getSuggest();

    // Phrase suggestion: first option of the first entry, or "" when there is none.
    String didYouMean = "";
    if (suggestions != null) {
        org.elasticsearch.search.suggest.Suggest.Suggestion<? extends Entry<? extends Option>> phrase =
                suggestions.getSuggestion("phrase");
        if (phrase != null && !phrase.getEntries().isEmpty()
                && !phrase.getEntries().get(0).getOptions().isEmpty()) {
            didYouMean = phrase.getEntries().get(0).getOptions().get(0).getText().string();
        }
    }
    consulta.setSuggest(didYouMean);

    // Autocomplete: one entry per (suggestion option, query word) pair, with the
    // word wrapped in <strong> tags.
    if (hasText && suggestions != null) {
        org.elasticsearch.search.suggest.Suggest.Suggestion<? extends Entry<? extends Option>> completion =
                suggestions.getSuggestion("suggest");
        if (completion != null) {
            // Only letters, digits and spaces survive, so each word is safe to embed in the regex below.
            String[] words = busqueda.replaceAll("[^\\p{Alnum}\\p{IsAlphabetic} ]", "").split(" ");
            for (Entry<? extends Option> entry : completion.getEntries()) {
                for (Option option : entry.getOptions()) {
                    String suggestText = option.getText().string().trim();
                    for (String item : words) {
                        if (item.length() > 0) {
                            // NOTE(review): suggestText comes from the index and is not HTML-escaped
                            // here; confirm the view escapes it or that stored names are trusted.
                            consulta.getAutocomplete().add(suggestText
                                    .replaceAll("(?i)((?!<)" + item + "(?![^<>]*>))", "<strong>$1</strong>"));
                        }
                    }
                }
            }
        }
    }

    // Map each hit's _source JSON to a Documento. The previous code parsed the
    // toString() of the SearchHit[] array, which is not JSON, so the list stayed empty.
    // NOTE(review): assumes Documento is Jackson-deserializable from the stored _source - confirm.
    ObjectMapper mapper = new ObjectMapper();
    mapper.configure(com.fasterxml.jackson.databind.DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
    List<Documento> documentos = new ArrayList<>();
    for (org.elasticsearch.search.SearchHit hit : response.getHits().getHits()) {
        String source = hit.getSourceAsString();
        if (source == null) {
            continue;
        }
        try {
            documentos.add(mapper.readValue(source, Documento.class));
        } catch (IOException e) {
            // Best effort, as before: a hit that cannot be mapped is reported and skipped.
            java.util.logging.Logger.getLogger(MainServiceImpl.class.getName())
                    .log(java.util.logging.Level.WARNING, "Could not map search hit " + hit.getId(), e);
        }
    }
    consulta.setDocumentos(documentos);
    return consulta;
}
} }

View File

@ -6,7 +6,7 @@ elasticsearch.port=9300
elasticsearch.nodename=arjion elasticsearch.nodename=arjion
arjion.indexName=documentos arjion.indexName=documentos
arjion.documentType=documento arjion.documentType=documento
arjion.uploadpath=./upload/ arjion.uploadpath=/upload/
arjion.tesseractpath=/usr/bin arjion.tesseractpath=/usr/bin
arjion.tesseractdatapath=/usr/share/tesseract-ocr arjion.tesseractdatapath=/usr/share/tesseract-ocr
spring.main.allow-bean-definition-overriding=true spring.main.allow-bean-definition-overriding=true

View File

@ -2,10 +2,10 @@
"documento": { "documento": {
"dynamic_templates": [ "dynamic_templates": [
{ {
"metadata_as_dynamic": { "metadata_as_keywords": {
"path_match": "metadata.*", "path_match": "metadata.*",
"mapping": { "mapping": {
"type": "{dynamic_type}" "type": "keyword"
} }
} }
} }