View Javadoc
1   /*
2    * Copyright 2012-2017 CodeLibs Project and the Others.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
13   * either express or implied. See the License for the specific language
14   * governing permissions and limitations under the License.
15   */
16  package org.codelibs.fess.helper;
17  
18  import java.io.UnsupportedEncodingException;
19  import java.net.URLEncoder;
20  import java.util.ArrayList;
21  import java.util.Base64;
22  import java.util.Collections;
23  import java.util.Date;
24  import java.util.HashMap;
25  import java.util.LinkedHashMap;
26  import java.util.List;
27  import java.util.Map;
28  
29  import org.codelibs.core.lang.StringUtil;
30  import org.codelibs.core.security.MessageDigestUtil;
31  import org.codelibs.fess.Constants;
32  import org.codelibs.fess.app.service.CrawlingInfoService;
33  import org.codelibs.fess.es.client.FessEsClient;
34  import org.codelibs.fess.es.config.exentity.CrawlingConfig;
35  import org.codelibs.fess.es.config.exentity.CrawlingInfo;
36  import org.codelibs.fess.es.config.exentity.CrawlingInfoParam;
37  import org.codelibs.fess.exception.FessSystemException;
38  import org.codelibs.fess.mylasta.direction.FessConfig;
39  import org.codelibs.fess.util.ComponentUtil;
40  import org.elasticsearch.index.query.QueryBuilders;
41  import org.elasticsearch.search.aggregations.AggregationBuilders;
42  import org.elasticsearch.search.aggregations.BucketOrder;
43  import org.elasticsearch.search.aggregations.bucket.terms.Terms;
44  import org.elasticsearch.search.aggregations.bucket.terms.Terms.Bucket;
45  import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregationBuilder;
46  import org.slf4j.Logger;
47  import org.slf4j.LoggerFactory;
48  
49  public class CrawlingInfoHelper {
50      private static final Logger logger = LoggerFactory.getLogger(CrawlingInfoHelper.class);
51  
52      public static final String FACET_COUNT_KEY = "count";
53  
54      protected Map<String, String> infoMap;
55  
56      protected Long documentExpires;
57  
58      protected int maxSessionIdsInList;
59  
60      protected int urlIdPrefixLength = 445;
61  
62      protected CrawlingInfoService getCrawlingInfoService() {
63          return ComponentUtil.getComponent(CrawlingInfoService.class);
64      }
65  
66      public String getCanonicalSessionId(final String sessionId) {
67          final int idx = sessionId.indexOf('-');
68          if (idx >= 0) {
69              return sessionId.substring(0, idx);
70          }
71          return sessionId;
72      }
73  
74      public synchronized void store(final String sessionId, final boolean create) {
75          CrawlingInfo crawlingInfo = create ? null : getCrawlingInfoService().getLast(sessionId);
76          if (crawlingInfo == null) {
77              crawlingInfo = new CrawlingInfo(sessionId);
78              try {
79                  getCrawlingInfoService().store(crawlingInfo);
80              } catch (final Exception e) {
81                  throw new FessSystemException("No crawling session.", e);
82              }
83          }
84  
85          if (infoMap != null) {
86              final List<CrawlingInfoParam> crawlingInfoParamList = new ArrayList<>();
87              for (final Map.Entry<String, String> entry : infoMap.entrySet()) {
88                  final CrawlingInfoParam crawlingInfoParam = new CrawlingInfoParam();
89                  crawlingInfoParam.setCrawlingInfoId(crawlingInfo.getId());
90                  crawlingInfoParam.setKey(entry.getKey());
91                  crawlingInfoParam.setValue(entry.getValue());
92                  crawlingInfoParamList.add(crawlingInfoParam);
93              }
94              getCrawlingInfoService().storeInfo(crawlingInfoParamList);
95          }
96  
97          infoMap = null;
98      }
99  
100     public synchronized void putToInfoMap(final String key, final String value) {
101         if (infoMap == null) {
102             infoMap = Collections.synchronizedMap(new LinkedHashMap<String, String>());
103         }
104         logger.debug("infoMap: {}={} => {}", key, value, infoMap);
105         infoMap.put(key, value);
106     }
107 
108     public void updateParams(final String sessionId, final String name, final int dayForCleanup) {
109         final CrawlingInfo crawlingInfo = getCrawlingInfoService().getLast(sessionId);
110         if (crawlingInfo == null) {
111             logger.warn("No crawling session: " + sessionId);
112             return;
113         }
114         if (StringUtil.isNotBlank(name)) {
115             crawlingInfo.setName(name);
116         } else {
117             crawlingInfo.setName(Constants.CRAWLING_INFO_SYSTEM_NAME);
118         }
119         if (dayForCleanup >= 0) {
120             final long expires = getExpiredTime(dayForCleanup);
121             crawlingInfo.setExpiredTime(expires);
122             documentExpires = expires;
123         }
124         try {
125             getCrawlingInfoService().store(crawlingInfo);
126         } catch (final Exception e) {
127             throw new FessSystemException("No crawling session.", e);
128         }
129 
130     }
131 
132     public Date getDocumentExpires(final CrawlingConfig config) {
133         if (config != null) {
134             final Integer timeToLive = config.getTimeToLive();
135             if (timeToLive != null) {
136                 // timeToLive minutes
137                 final long now = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
138                 return new Date(now + timeToLive.longValue() * 1000 * 60);
139             }
140         }
141         return documentExpires != null ? new Date(documentExpires) : null;
142     }
143 
144     protected long getExpiredTime(final int days) {
145         final long now = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
146         return now + days * Constants.ONE_DAY_IN_MILLIS;
147     }
148 
149     public Map<String, String> getInfoMap(final String sessionId) {
150         final List<CrawlingInfoParam> crawlingInfoParamList = getCrawlingInfoService().getLastCrawlingInfoParamList(sessionId);
151         final Map<String, String> map = new HashMap<>();
152         for (final CrawlingInfoParam crawlingInfoParam : crawlingInfoParamList) {
153             map.put(crawlingInfoParam.getKey(), crawlingInfoParam.getValue());
154         }
155         return map;
156     }
157 
158     public String generateId(final Map<String, Object> dataMap) {
159         final FessConfig fessConfig = ComponentUtil.getFessConfig();
160         final String url = (String) dataMap.get(fessConfig.getIndexFieldUrl());
161         @SuppressWarnings("unchecked")
162         final List<String> roleTypeList = (List<String>) dataMap.get(fessConfig.getIndexFieldRole());
163         return generateId(url, roleTypeList);
164     }
165 
166     public List<Map<String, String>> getSessionIdList(final FessEsClient fessEsClient) {
167         final FessConfig fessConfig = ComponentUtil.getFessConfig();
168         return fessEsClient.search(
169                 fessConfig.getIndexDocumentSearchIndex(),
170                 fessConfig.getIndexDocumentType(),
171                 queryRequestBuilder -> {
172                     queryRequestBuilder.setQuery(QueryBuilders.matchAllQuery());
173                     final TermsAggregationBuilder termsBuilder =
174                             AggregationBuilders.terms(fessConfig.getIndexFieldSegment()).field(fessConfig.getIndexFieldSegment())
175                                     .size(maxSessionIdsInList).order(BucketOrder.key(false));
176                     queryRequestBuilder.addAggregation(termsBuilder);
177                     queryRequestBuilder.setPreference(Constants.SEARCH_PREFERENCE_LOCAL);
178                     return true;
179                 }, (queryRequestBuilder, execTime, searchResponse) -> {
180                     final List<Map<String, String>> sessionIdList = new ArrayList<>();
181                     searchResponse.ifPresent(response -> {
182                         final Terms terms = response.getAggregations().get(fessConfig.getIndexFieldSegment());
183                         for (final Bucket bucket : terms.getBuckets()) {
184                             final Map<String, String> map = new HashMap<>(2);
185                             map.put(fessConfig.getIndexFieldSegment(), bucket.getKey().toString());
186                             map.put(FACET_COUNT_KEY, Long.toString(bucket.getDocCount()));
187                             sessionIdList.add(map);
188                         }
189                     });
190                     return sessionIdList;
191                 });
192     }
193 
194     protected String generateId(final String url, final List<String> roleTypeList) {
195         final StringBuilder buf = new StringBuilder(1000);
196         buf.append(url);
197         if (roleTypeList != null && !roleTypeList.isEmpty()) {
198             Collections.sort(roleTypeList);
199             buf.append(";role=");
200             buf.append(String.join(",", roleTypeList));
201         }
202         final String urlId = buf.toString().trim();
203         final StringBuilder encodedBuf = new StringBuilder(urlId.length() + 100);
204         for (int i = 0; i < urlId.length(); i++) {
205             final char c = urlId.charAt(i);
206             if (c >= 'a' && c <= 'z' //
207                     || c >= 'A' && c <= 'Z' //
208                     || c >= '0' && c <= '9' //
209                     || c == '.' //
210                     || c == '-' //
211                     || c == '*' //
212                     || c == '_' //
213                     || c == ':' //
214                     || c == '+' //
215                     || c == '%' //
216                     || c == '=' //
217                     || c == '&' //
218                     || c == '?' //
219                     || c == '#' //
220                     || c == '[' //
221                     || c == ']' //
222                     || c == '@' //
223                     || c == '~' //
224                     || c == '!' //
225                     || c == '$' //
226                     || c == '\'' //
227                     || c == '(' //
228                     || c == ')' //
229                     || c == ',' //
230                     || c == ';' //
231             ) {
232                 encodedBuf.append(c);
233             } else {
234                 try {
235                     final String target = String.valueOf(c);
236                     final String converted = URLEncoder.encode(target, Constants.UTF_8);
237                     if (target.equals(converted)) {
238                         encodedBuf.append(Base64.getUrlEncoder().encodeToString(target.getBytes(Constants.CHARSET_UTF_8)));
239                     } else {
240                         encodedBuf.append(converted);
241                     }
242                 } catch (final UnsupportedEncodingException e) {
243                     // NOP
244                 }
245             }
246         }
247 
248         final String id = encodedBuf.toString();
249         if (id.getBytes(Constants.CHARSET_UTF_8).length <= urlIdPrefixLength) {
250             return id;
251         }
252         final String longId = id.substring(0, urlIdPrefixLength) + MessageDigestUtil.digest("SHA-256", id.substring(urlIdPrefixLength));
253         if (longId.getBytes(Constants.CHARSET_UTF_8).length <= urlIdPrefixLength + 64) {
254             return longId;
255         }
256         return longId.substring(0, urlIdPrefixLength + 64);
257     }
258 
259     public void setMaxSessionIdsInList(final int maxSessionIdsInList) {
260         this.maxSessionIdsInList = maxSessionIdsInList;
261     }
262 
263     public void setUrlIdPrefixLength(final int urlIdPrefixLength) {
264         this.urlIdPrefixLength = urlIdPrefixLength;
265     }
266 
267 }