View Javadoc
1   /*
2    * Copyright 2012-2017 CodeLibs Project and the Others.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific language
14   * governing permissions and limitations under the License.
15   */
16  package org.codelibs.fess.helper;
17  
18  import java.io.UnsupportedEncodingException;
19  import java.net.URLEncoder;
20  import java.util.ArrayList;
21  import java.util.Collections;
22  import java.util.Date;
23  import java.util.HashMap;
24  import java.util.LinkedHashMap;
25  import java.util.List;
26  import java.util.Map;
27  
28  import org.codelibs.core.lang.StringUtil;
29  import org.codelibs.core.security.MessageDigestUtil;
30  import org.codelibs.fess.Constants;
31  import org.codelibs.fess.app.service.CrawlingInfoService;
32  import org.codelibs.fess.es.client.FessEsClient;
33  import org.codelibs.fess.es.config.exentity.CrawlingConfig;
34  import org.codelibs.fess.es.config.exentity.CrawlingInfo;
35  import org.codelibs.fess.es.config.exentity.CrawlingInfoParam;
36  import org.codelibs.fess.exception.FessSystemException;
37  import org.codelibs.fess.mylasta.direction.FessConfig;
38  import org.codelibs.fess.util.ComponentUtil;
39  import org.elasticsearch.index.query.QueryBuilders;
40  import org.elasticsearch.search.aggregations.AggregationBuilders;
41  import org.elasticsearch.search.aggregations.bucket.terms.Terms;
42  import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
43  import org.elasticsearch.search.aggregations.bucket.terms.Terms.Order;
44  import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
45  import org.slf4j.Logger;
46  import org.slf4j.LoggerFactory;
47  
48  public class CrawlingInfoHelper {
49      private static final Logger logger = LoggerFactory.getLogger(CrawlingInfoHelper.class);
50  
51      public static final String FACET_COUNT_KEY = "count";
52  
53      protected Map<String, String> infoMap;
54  
55      protected Long documentExpires;
56  
57      protected int maxSessionIdsInList;
58  
59      protected int urlIdPrefixLength = 445;
60  
61      protected CrawlingInfoService getCrawlingInfoService() {
62          return ComponentUtil.getComponent(CrawlingInfoService.class);
63      }
64  
65      public String getCanonicalSessionId(final String sessionId) {
66          final int idx = sessionId.indexOf('-');
67          if (idx >= 0) {
68              return sessionId.substring(0, idx);
69          }
70          return sessionId;
71      }
72  
73      public synchronized void store(final String sessionId, final boolean create) {
74          CrawlingInfo crawlingInfo = create ? null : getCrawlingInfoService().getLast(sessionId);
75          if (crawlingInfo == null) {
76              crawlingInfo = new CrawlingInfo(sessionId);
77              try {
78                  getCrawlingInfoService().store(crawlingInfo);
79              } catch (final Exception e) {
80                  throw new FessSystemException("No crawling session.", e);
81              }
82          }
83  
84          if (infoMap != null) {
85              final List<CrawlingInfoParam> crawlingInfoParamList = new ArrayList<>();
86              for (final Map.Entry<String, String> entry : infoMap.entrySet()) {
87                  final CrawlingInfoParam crawlingInfoParam = new CrawlingInfoParam();
88                  crawlingInfoParam.setCrawlingInfoId(crawlingInfo.getId());
89                  crawlingInfoParam.setKey(entry.getKey());
90                  crawlingInfoParam.setValue(entry.getValue());
91                  crawlingInfoParamList.add(crawlingInfoParam);
92              }
93              getCrawlingInfoService().storeInfo(crawlingInfoParamList);
94          }
95  
96          infoMap = null;
97      }
98  
99      public synchronized void putToInfoMap(final String key, final String value) {
100         if (infoMap == null) {
101             infoMap = Collections.synchronizedMap(new LinkedHashMap<String, String>());
102         }
103         logger.debug("infoMap: {}={} => {}", key, value, infoMap);
104         infoMap.put(key, value);
105     }
106 
107     public void updateParams(final String sessionId, final String name, final int dayForCleanup) {
108         final CrawlingInfo crawlingInfo = getCrawlingInfoService().getLast(sessionId);
109         if (crawlingInfo == null) {
110             logger.warn("No crawling session: " + sessionId);
111             return;
112         }
113         if (StringUtil.isNotBlank(name)) {
114             crawlingInfo.setName(name);
115         } else {
116             crawlingInfo.setName(Constants.CRAWLING_INFO_SYSTEM_NAME);
117         }
118         if (dayForCleanup >= 0) {
119             final long expires = getExpiredTime(dayForCleanup);
120             crawlingInfo.setExpiredTime(expires);
121             documentExpires = expires;
122         }
123         try {
124             getCrawlingInfoService().store(crawlingInfo);
125         } catch (final Exception e) {
126             throw new FessSystemException("No crawling session.", e);
127         }
128 
129     }
130 
131     public Date getDocumentExpires(final CrawlingConfig config) {
132         if (config != null) {
133             final Integer timeToLive = config.getTimeToLive();
134             if (timeToLive != null) {
135                 // timeToLive minutes
136                 final long now = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
137                 return new Date(now + timeToLive.longValue() * 1000 * 60);
138             }
139         }
140         return documentExpires != null ? new Date(documentExpires) : null;
141     }
142 
143     protected long getExpiredTime(final int days) {
144         final long now = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
145         return now + days * Constants.ONE_DAY_IN_MILLIS;
146     }
147 
148     public Map<String, String> getInfoMap(final String sessionId) {
149         final List<CrawlingInfoParam> crawlingInfoParamList = getCrawlingInfoService().getLastCrawlingInfoParamList(sessionId);
150         final Map<String, String> map = new HashMap<>();
151         for (final CrawlingInfoParam crawlingInfoParam : crawlingInfoParamList) {
152             map.put(crawlingInfoParam.getKey(), crawlingInfoParam.getValue());
153         }
154         return map;
155     }
156 
157     public String generateId(final Map<String, Object> dataMap) {
158         final FessConfig fessConfig = ComponentUtil.getFessConfig();
159         final String url = (String) dataMap.get(fessConfig.getIndexFieldUrl());
160         @SuppressWarnings("unchecked")
161         final List<String> roleTypeList = (List<String>) dataMap.get(fessConfig.getIndexFieldRole());
162         return generateId(url, roleTypeList);
163     }
164 
165     public List<Map<String, String>> getSessionIdList(final FessEsClient fessEsClient) {
166         final FessConfig fessConfig = ComponentUtil.getFessConfig();
167         return fessEsClient.search(
168                 fessConfig.getIndexDocumentSearchIndex(),
169                 fessConfig.getIndexDocumentType(),
170                 queryRequestBuilder -> {
171                     queryRequestBuilder.setQuery(QueryBuilders.matchAllQuery());
172                     final TermsAggregationBuilder termsBuilder =
173                             AggregationBuilders.terms(fessConfig.getIndexFieldSegment()).field(fessConfig.getIndexFieldSegment())
174                                     .size(maxSessionIdsInList).order(Order.term(false));
175                     queryRequestBuilder.addAggregation(termsBuilder);
176                     queryRequestBuilder.setPreference(Constants.SEARCH_PREFERENCE_PRIMARY);
177                     return true;
178                 }, (queryRequestBuilder, execTime, searchResponse) -> {
179                     final List<Map<String, String>> sessionIdList = new ArrayList<>();
180                     searchResponse.ifPresent(response -> {
181                         final Terms terms = response.getAggregations().get(fessConfig.getIndexFieldSegment());
182                         for (final Bucket bucket : terms.getBuckets()) {
183                             final Map<String, String> map = new HashMap<>(2);
184                             map.put(fessConfig.getIndexFieldSegment(), bucket.getKey().toString());
185                             map.put(FACET_COUNT_KEY, Long.toString(bucket.getDocCount()));
186                             sessionIdList.add(map);
187                         }
188                     });
189                     return sessionIdList;
190                 });
191     }
192 
193     protected String generateId(final String url, final List<String> roleTypeList) {
194         final StringBuilder buf = new StringBuilder(1000);
195         buf.append(url);
196         if (roleTypeList != null && !roleTypeList.isEmpty()) {
197             Collections.sort(roleTypeList);
198             buf.append(";role=");
199             buf.append(String.join(",", roleTypeList));
200         }
201         final String urlId = buf.toString().trim();
202         final StringBuilder encodedBuf = new StringBuilder(urlId.length() + 100);
203         for (int i = 0; i < urlId.length(); i++) {
204             final char c = urlId.charAt(i);
205             if (c >= 'a' && c <= 'z' //
206                     || c >= 'A' && c <= 'Z' //
207                     || c >= '0' && c <= '9' //
208                     || c == '.' //
209                     || c == '-' //
210                     || c == '*' //
211                     || c == '_' //
212                     || c == ':' //
213                     || c == '+' //
214                     || c == '%' //
215                     || c == '=' //
216                     || c == '&' //
217                     || c == '?' //
218                     || c == '#' //
219                     || c == '[' //
220                     || c == ']' //
221                     || c == '@' //
222                     || c == '~' //
223                     || c == '!' //
224                     || c == '$' //
225                     || c == '\'' //
226                     || c == '(' //
227                     || c == ')' //
228                     || c == ',' //
229                     || c == ';' //
230             ) {
231                 encodedBuf.append(c);
232             } else {
233                 try {
234                     encodedBuf.append(URLEncoder.encode(String.valueOf(c), Constants.UTF_8));
235                 } catch (final UnsupportedEncodingException e) {
236                     // NOP
237                 }
238             }
239         }
240 
241         final String id = encodedBuf.toString();
242         if (id.length() <= urlIdPrefixLength) {
243             return id;
244         }
245         return id.substring(0, urlIdPrefixLength) + MessageDigestUtil.digest("SHA-256", id.substring(urlIdPrefixLength));
246     }
247 
248     public void setMaxSessionIdsInList(final int maxSessionIdsInList) {
249         this.maxSessionIdsInList = maxSessionIdsInList;
250     }
251 
252     public void setUrlIdPrefixLength(final int urlIdPrefixLength) {
253         this.urlIdPrefixLength = urlIdPrefixLength;
254     }
255 
256 }