diff --git a/gxflowfulltextsearch/pom.xml b/gxflowfulltextsearch/pom.xml new file mode 100644 index 000000000..ea5c5856d --- /dev/null +++ b/gxflowfulltextsearch/pom.xml @@ -0,0 +1,63 @@ + + + 4.0.0 + + + com.genexus + parent + ${revision}${changelist} + + + gxflowfulltextsearch + GXflow FullText Search + + + + org.apache.commons + commons-collections4 + ${commons.collections4.version} + + + commons-logging + commons-logging + ${commons.logging.version} + + + org.apache.pdfbox + pdfbox + ${pdfbox.version} + + + org.apache.lucene + lucene-core + ${lucene.version} + + + org.apache.poi + poi + ${poi.version} + + + org.apache.poi + poi-ooxml + ${poi.version} + + + org.apache.logging.log4j + log4j-core + ${log4j.version} + + + + + GXflowFullTextSearch + + + org.apache.maven.plugins + maven-compiler-plugin + 3.8.0 + + + + + diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/AnalyzerManager.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/AnalyzerManager.java new file mode 100644 index 000000000..7e121a34c --- /dev/null +++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/AnalyzerManager.java @@ -0,0 +1,26 @@ +package com.genexus.CA.search; + +import java.util.HashMap; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; + +public class AnalyzerManager { + private static final HashMap hash = new HashMap(); + + public static Analyzer getAnalyzer(String lang) { + Analyzer analyzer = null; + if (hash.containsKey(lang)) { + analyzer = (Analyzer)hash.get(lang); + } else { + if (lang.equals("spa")) { + analyzer = new StandardAnalyzer(); + } else { + analyzer = new StandardAnalyzer(); + } + + hash.put(lang, analyzer); + } + + return (Analyzer)analyzer; + } +} diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/IndexManager.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/IndexManager.java new file mode 100644 index 000000000..6b24e00e9 --- /dev/null +++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/IndexManager.java @@ -0,0 +1,27 @@ +package com.genexus.CA.search; + +import java.util.HashMap; + +public class IndexManager { + private static final HashMap hash = new HashMap(); + + public static void addContent(String dir, String uri, String lang, String title, String summary, byte fromFile, String body, String filePath) { + getIndexer(dir).addContent(uri, lang, title, summary, fromFile, body, filePath); + } + + public static void deleteContent(String dir, String uri) { + getIndexer(dir).deleteContent(uri); + } + + private static synchronized Indexer getIndexer(String dir) { + Indexer indexer = null; + if (hash.containsKey(dir)) { + indexer = (Indexer)hash.get(dir); + } else { + indexer = new Indexer(dir); + hash.put(dir, indexer); + } + + return indexer; + } +} diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Indexer.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Indexer.java new file mode 100644 index 000000000..f04416b02 --- /dev/null +++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Indexer.java @@ -0,0 +1,206 @@ +package com.genexus.CA.search; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.pdfbox.text.PDFTextStripperByArea; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; + +public final class Indexer { + private String indexDirectory = "."; + private static final int IDX = 1; + private static final int DLT = 2; + + private static final Logger logger = LogManager.getLogger("Indexer.class"); + + protected Indexer(String directory) { + this.indexDirectory = directory; + if (!this.indexExists(directory)) { + try { + this.indexDirectory = directory; + IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true); + writer.close(); + } catch (Exception var3) { + logger.error(var3.getMessage(), var3); + } + } + + } + + protected void addContent(String uri, String lang, String title, String summary, byte fromFile, String body, String filePath) { + Document doc = null; + doc = new Document(); + String content = ""; + if (fromFile == 1) { + try { + if (this.isMicrosoftExtension(filePath)) { + FileInputStream file = new FileInputStream(filePath); + XWPFDocument reader = new XWPFDocument(file); + List data = reader.getParagraphs(); + + XWPFParagraph p; + for(Iterator var14 = data.iterator(); var14.hasNext(); content = content + p.getText()) { + p = var14.next(); + } + } else if (this.isPdfExtension(filePath)) { + PDDocument document = Loader.loadPDF(new File(filePath)); + new PDFTextStripperByArea(); + PDFTextStripper tStripper = new PDFTextStripper(); + content = content + tStripper.getText(document); + } + } catch (IOException var16) { + var16.printStackTrace(); + } + } + + if (this.documentExists(uri, lang)) { + this.indexOperation(2, lang, (Document) null, uri.toLowerCase()); + } + + doc.add(new Field("uri", uri, Store.YES, Index.UN_TOKENIZED)); + doc.add(new Field("content", content, Store.YES, Index.TOKENIZED)); + + try { + this.indexOperation(1, lang, doc, (String)null); + } catch (Exception var15) { + logger.error(var15.getMessage(), var15); + } + + } + + protected void deleteContent(String uri) { + try { + this.indexOperation(2, (String)null, (Document)null, uri.toLowerCase()); + } catch (Exception var3) { + logger.error(var3.getMessage(), var3); + } + + } + + protected synchronized void indexOperation(int op, String lang, Document doc, String uri) { + switch(op) { + case 1: + try { + IndexWriter writer = new IndexWriter(this.getIndexDirectory(), AnalyzerManager.getAnalyzer(lang), false); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + } catch (Exception var9) { + logger.error(var9.getMessage(), var9); + } + break; + case 2: + try { + Term term = null; + int docId = 0; + if (lang == null) { + term = new Term("uri", uri); + } else { + docId = this.getDocumentId(uri, lang); + } + + IndexReader reader = IndexReader.open(this.getIndexDirectory()); + if (lang == null) { + reader.deleteDocuments(term); + } else if (docId != -1) { + reader.deleteDocument(docId); + } + + reader.close(); + } catch (Exception var8) { + logger.error(var8.getMessage(), var8); + } + } + + } + + public String getIndexDirectory() { + return this.indexDirectory; + } + + private boolean indexExists(String dir) { + try { + new IndexSearcher(dir); + return true; + } catch (IOException var3) { + return false; + } + } + + private boolean documentExists(String uri, String lang) { + boolean value = false; + + Hits hits = getHits(uri, lang); + if (hits.length() > 0) { + value = true; + } + + return value; + } + + private int getDocumentId(String uri, String lang) { + int value = -1; + + try { + Hits hits = this.getHits(uri, lang); + if (hits.length() > 0) { + value = hits.id(0); + } + } catch (IOException var7) { + logger.error(var7.getMessage(), var7); + } + + return value; + } + + private boolean isMicrosoftExtension(String filePath) { + return filePath.endsWith(".doc") || filePath.endsWith(".docx") || filePath.endsWith(".xls") || filePath.endsWith(".xlsx") || filePath.endsWith(".ppt") || filePath.endsWith(".pptx"); + } + + private Hits getHits(String uri, String lang) { + IndexSearcher searcher = null; + Hits hits = null; + try { + searcher = new IndexSearcher(this.indexDirectory); + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term("uri", uri)), Occur.MUST); + query.add(new TermQuery(new Term("language", lang)), Occur.MUST); + hits = searcher.search(query); + searcher.close(); + } catch (IOException e) { + logger.error(e.getMessage(), e); + } + + return hits; + } + + private boolean isPdfExtension(String filePath) { + return filePath.endsWith(".pdf"); + } + + private boolean isTxtExtension(String filePath) { + return filePath.endsWith(".txt") || filePath.endsWith(".html"); + } +} diff --git a/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Searcher.java b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Searcher.java new file mode 100644 index 000000000..2c2549303 --- /dev/null +++ b/gxflowfulltextsearch/src/main/java/com/genexus/CA/search/Searcher.java @@ -0,0 +1,53 @@ +package com.genexus.CA.search; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.MultiFieldQueryParser; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; + +public class Searcher { + private static final Logger logger = LogManager.getLogger("Searcher.class"); + + public static String search(String dir, String lang, String query, int maxResults, int from) { + StringBuilder buff = new StringBuilder(); + + try { + IndexSearcher searcher = new IndexSearcher(dir); + String[] fields = new String[]{"title", "content"}; + Occur[] clauses = new Occur[]{Occur.SHOULD, Occur.SHOULD}; + Query q = MultiFieldQueryParser.parse(query, fields, clauses, AnalyzerManager.getAnalyzer(lang)); + if (!lang.equals("IND")) { + Query q2 = new TermQuery(new Term("language", lang)); + BooleanQuery bq = new BooleanQuery(); + bq.add((Query) q, Occur.MUST); + bq.add(q2, Occur.MUST); + q = bq; + } + + Hits hits = searcher.search((Query) q); + String time = ""; + int max = hits.length(); + buff.append(""); + buff.append(""); + + for (int i = 0; i < max; ++i) { + buff.append(""); + Document doc = hits.doc(i); + buff.append("").append(doc.getField("uri").stringValue()).append(""); + buff.append(""); + } + } catch (Exception var15) { + logger.error(var15.getMessage(), var15); + } + + buff.append(""); + return buff.toString(); + } +} diff --git a/gxsearch/pom.xml b/gxsearch/pom.xml index 51abb16ea..2c4ba8012 100644 --- a/gxsearch/pom.xml +++ b/gxsearch/pom.xml @@ -32,17 +32,17 @@ org.apache.lucene lucene-core - 2.2.0 + ${lucene.version} org.apache.lucene lucene-highlighter - 2.2.0 + ${lucene.version} org.apache.lucene lucene-spellchecker - 2.2.0 + ${lucene.version} com.github.jtidy diff --git a/java/pom.xml b/java/pom.xml index e3c4e5750..c897b8cfc 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -37,7 +37,7 @@ org.apache.commons commons-collections4 - 4.1 + ${commons.collections4.version} org.apache.logging.log4j @@ -110,7 +110,7 @@ org.apache.pdfbox pdfbox - 3.0.3 + ${pdfbox.version} org.jsoup diff --git a/pom.xml b/pom.xml index 07596c207..c8e66393f 100644 --- a/pom.xml +++ b/pom.xml @@ -22,6 +22,10 @@ 3.0.4 UTF-8 5.4.1 + 3.0.3 + 2.2.0 + 4.1 + 1.2 2.16.2 4.13.2 2.35.6 @@ -127,6 +131,7 @@ gxftps gamutils gamtotp + gxflowfulltextsearch