View Javadoc
/*
 * Copyright 2012-2020 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
16  package org.codelibs.fess.crawler.transformer;
17  
18  import java.net.URLDecoder;
19  import java.util.Arrays;
20  import java.util.Collection;
21  import java.util.Collections;
22  import java.util.HashMap;
23  import java.util.Map;
24  
25  import org.apache.commons.lang3.StringUtils;
26  import org.apache.logging.log4j.Logger;
27  import org.codelibs.core.collection.LruHashMap;
28  import org.codelibs.core.lang.StringUtil;
29  import org.codelibs.fess.Constants;
30  import org.codelibs.fess.crawler.entity.AccessResult;
31  import org.codelibs.fess.crawler.entity.AccessResultData;
32  import org.codelibs.fess.crawler.entity.UrlQueue;
33  import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
34  import org.codelibs.fess.mylasta.direction.FessConfig;
35  import org.codelibs.fess.util.ComponentUtil;
36  import org.codelibs.fess.util.GroovyUtil;
37  
38  public interface FessTransformer {
39  
40      Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
41  
42      FessConfig getFessConfig();
43  
44      Logger getLogger();
45  
46      default String getHost(final String u) {
47          if (StringUtil.isBlank(u)) {
48              return StringUtil.EMPTY; // empty
49          }
50  
51          String url = u;
52          final String originalUrl = url;
53  
54          int idx = url.indexOf("://");
55          if (idx >= 0) {
56              url = url.substring(idx + 3);
57          }
58  
59          idx = url.indexOf('/');
60          if (idx >= 0) {
61              url = url.substring(0, idx);
62          }
63  
64          if (url.equals(originalUrl)) {
65              return getFessConfig().getCrawlerDocumentUnknownHostname();
66          }
67  
68          return url;
69      }
70  
71      default String getSite(final String u, final String encoding) {
72          if (StringUtil.isBlank(u)) {
73              return StringUtil.EMPTY; // empty
74          }
75  
76          String url = u;
77          int idx = url.indexOf("://");
78          if (idx >= 0) {
79              url = url.substring(idx + 3);
80          }
81  
82          idx = url.indexOf('?');
83          if (idx >= 0) {
84              url = url.substring(0, idx);
85          }
86  
87          if (encoding != null) {
88              String enc;
89              if (StringUtil.isNotBlank(getFessConfig().getCrawlerDocumentSiteEncoding())) {
90                  if (getFessConfig().isCrawlerDocumentUseSiteEncodingOnEnglish()) {
91                      if ("ISO-8859-1".equalsIgnoreCase(encoding) || "US-ASCII".equalsIgnoreCase(encoding)) {
92                          enc = getFessConfig().getCrawlerDocumentSiteEncoding();
93                      } else {
94                          enc = encoding;
95                      }
96                  } else {
97                      enc = getFessConfig().getCrawlerDocumentSiteEncoding();
98                  }
99              } else {
100                 enc = encoding;
101             }
102 
103             try {
104                 url = URLDecoder.decode(url, enc);
105             } catch (final Exception e) {}
106         }
107 
108         return abbreviateSite(url);
109     }
110 
111     default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
112         final FessConfig fessConfig = ComponentUtil.getFessConfig();
113         if (fessConfig.getIndexFieldUrl().equals(key) || !dataMap.containsKey(key) || !getFessConfig().isCrawlerDocumentAppendData()) {
114             dataMap.put(key, value);
115         } else {
116             final Object oldValue = dataMap.get(key);
117             final Object[] oldValues;
118             if (oldValue instanceof Object[]) {
119                 oldValues = (Object[]) oldValue;
120             } else if (oldValue instanceof Collection<?>) {
121                 oldValues = ((Collection<?>) oldValue).toArray();
122             } else {
123                 oldValues = new Object[] { oldValue };
124             }
125             if (value.getClass().isArray()) {
126                 final Object[] newValues = (Object[]) value;
127                 final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length);
128                 for (int i = 0; i < newValues.length; i++) {
129                     values[values.length - 1 + i] = newValues[i];
130                 }
131                 dataMap.put(key, values);
132             } else {
133                 final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1);
134                 values[values.length - 1] = value;
135                 dataMap.put(key, values);
136             }
137         }
138     }
139 
140     default void putResultDataWithTemplate(final Map<String, Object> dataMap, final String key, final Object value, final String template) {
141         Object target = value;
142         if (template != null) {
143             final Map<String, Object> contextMap = new HashMap<>();
144             contextMap.put("doc", dataMap);
145             final Map<String, Object> paramMap = new HashMap<>(dataMap.size() + 2);
146             paramMap.putAll(dataMap);
147             paramMap.put("value", target);
148             paramMap.put("context", contextMap);
149             target = evaluateValue(template, paramMap);
150         }
151         if (key != null && target != null) {
152             putResultDataBody(dataMap, key, target);
153         }
154     }
155 
156     default Object evaluateValue(final String template, final Map<String, Object> paramMap) {
157         if (StringUtil.isEmpty(template)) {
158             return StringUtil.EMPTY;
159         }
160 
161         return GroovyUtil.evaluate(template, paramMap);
162     }
163 
164     default int getMaxSiteLength() {
165         return getFessConfig().getCrawlerDocumentMaxSiteLengthAsInteger();
166     }
167 
168     default String abbreviateSite(final String value) {
169         final int maxSiteLength = getMaxSiteLength();
170         if (maxSiteLength > -1) {
171             return StringUtils.abbreviate(value, maxSiteLength);
172         } else {
173             return value;
174         }
175     }
176 
177     default String getFileName(final String url, final String encoding) {
178         if (StringUtil.isBlank(url)) {
179             return StringUtil.EMPTY;
180         }
181 
182         String u = url;
183         int idx = u.lastIndexOf('?');
184         if (idx >= 0) {
185             u = u.substring(0, idx);
186         }
187 
188         idx = u.lastIndexOf('#');
189         if (idx >= 0) {
190             u = u.substring(0, idx);
191         }
192         u = decodeUrlAsName(u, u.startsWith("file:"));
193         idx = u.lastIndexOf('/');
194         if (idx >= 0) {
195             if (u.length() > idx + 1) {
196                 u = u.substring(idx + 1);
197             } else {
198                 u = StringUtil.EMPTY;
199             }
200         }
201         return u;
202     }
203 
204     default String decodeUrlAsName(final String url, final boolean escapePlus) {
205         if (url == null) {
206             return null;
207         }
208 
209         final FessConfig fessConfig = getFessConfig();
210         String enc = Constants.UTF_8;
211         if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
212             final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
213             if (urlQueue != null) {
214                 final String parentUrl = urlQueue.getParentUrl();
215                 if (StringUtil.isNotEmpty(parentUrl)) {
216                     final String sessionId = urlQueue.getSessionId();
217                     final String pageEnc = getParentEncoding(parentUrl, sessionId);
218                     if (pageEnc != null) {
219                         enc = pageEnc;
220                     } else if (urlQueue.getEncoding() != null) {
221                         enc = urlQueue.getEncoding();
222                     }
223                 }
224             }
225         } else {
226             enc = fessConfig.getCrawlerDocumentFileNameEncoding();
227         }
228 
229         final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
230         try {
231             return URLDecoder.decode(escapedUrl, enc);
232         } catch (final Exception e) {
233             return url;
234         }
235     }
236 
237     default String getParentEncoding(final String parentUrl, final String sessionId) {
238         final String key = sessionId + ":" + parentUrl;
239         String enc = parentEncodingMap.get(key);
240         if (enc != null) {
241             return enc;
242         }
243 
244         final AccessResult<?> accessResult = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
245         if (accessResult != null) {
246             final AccessResultData<?> accessResultData = accessResult.getAccessResultData();
247             if (accessResultData != null && accessResultData.getEncoding() != null) {
248                 enc = accessResultData.getEncoding();
249                 parentEncodingMap.put(key, enc);
250                 return enc;
251             }
252         }
253         return null;
254     }
255 }