//$HeadURL: svn+ssh://jwilden@svn.wald.intevation.org/deegree/base/branches/2.5_testing/src/org/deegree/ogcwebservices/csw/manager/CatalogueHarvester.java $
/*----------------------------------------------------------------------------
 This file is part of deegree, http://deegree.org/
 Copyright (C) 2001-2009 by:
 Department of Geography, University of Bonn
 and
 lat/lon GmbH

 This library is free software; you can redistribute it and/or modify it under
 the terms of the GNU Lesser General Public License as published by the Free
 Software Foundation; either version 2.1 of the License, or (at your option)
 any later version.
 This library is distributed in the hope that it will be useful, but WITHOUT
 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
 details.
 You should have received a copy of the GNU Lesser General Public License
 along with this library; if not, write to the Free Software Foundation, Inc.,
 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

 Contact information:

 lat/lon GmbH
 Aennchenstr. 19, 53177 Bonn
 Germany
 http://lat-lon.de/

 Department of Geography, University of Bonn
 Prof. Dr. Klaus Greve
 Postfach 1147, 53001 Bonn
 Germany
 http://www.geographie.uni-bonn.de/deegree/

 e-mail: info@deegree.org
----------------------------------------------------------------------------*/
package org.deegree.ogcwebservices.csw.manager;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.deegree.enterprise.WebUtils;
import org.deegree.framework.log.ILogger;
import org.deegree.framework.log.LoggerFactory;
import org.deegree.framework.util.CharsetUtils;
import org.deegree.framework.util.FileUtils;
import org.deegree.framework.util.StringTools;
import org.deegree.framework.util.TimeTools;
import org.deegree.framework.xml.XMLException;
import org.deegree.framework.xml.XMLFragment;
import org.deegree.framework.xml.XMLParsingException;
import org.deegree.framework.xml.XMLTools;
import org.deegree.io.DBPoolException;
import org.deegree.ogcwebservices.OGCWebServiceException;
import org.deegree.ogcwebservices.csw.configuration.CatalogueConfigurationDocument;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.Record;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.ResourceType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

/**
 * Harvester implementation for harvesting other catalogue services. Only dataset, series
 * (datasetcollection), application and service metadata types will be harvested.
 *
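 * <p>
 * Minimal usage sketch (hypothetical driver code; within deegree the harvester is normally
 * registered and scheduled by the CSW manager rather than invoked directly):
 *
 * <pre>
 * CatalogueHarvester harvester = CatalogueHarvester.getInstance( "2.0.2" );
 * harvester.run(); // performs one harvest iteration over all registered sources
 * </pre>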
 *
 * @version $Revision: 19475 $
 * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
 * @author last edited by: $Author: lbuesching $
 *
 * @version 1.0. $Revision: 19475 $, $Date: 2009-09-02 14:51:48 +0200 (Mi, 02 Sep 2009) $
 *
 * @since 2.0
 */
public class CatalogueHarvester extends AbstractHarvester {

    static final ILogger LOG = LoggerFactory.getLogger( CatalogueHarvester.class );

    private static CatalogueHarvester ch = null;

    private enum HarvestOperation {
        /** insert a record that has not been harvested before */
        insert,
        /** update a record that has been harvested before but has changed */
        update,
        /** delete a record that is not provided by the source anymore */
        delete,
        /** nothing to do, e.g. the record is not a known metadata format */
        nothing
    }

    /**
     * @param version
     *            the version of the CSW
     */
    private CatalogueHarvester( String version ) {
        super( version );
    }

    /**
     * returns the singleton instance of this class, creating it on first access
     *
     * @param version
     *            the version of the CSW
     *
     * @return instance of CatalogueHarvester
     */
    public static CatalogueHarvester getInstance( String version ) {
        if ( ch == null ) {
            ch = new CatalogueHarvester( version );
        }
        return ch;
    }

    @Override
    public void run() {
        LOG.logDebug( "starting harvest iteration for CatalogueHarvester." );
        try {
            HarvestRepository repository = HarvestRepository.getInstance();

            List<URI> sources = repository.getSources();
            for ( Iterator<URI> iter = sources.iterator(); iter.hasNext(); ) {
                URI source = iter.next();
                try {
                    // determine if source shall be harvested
                    if ( shallHarvest( source, ResourceType.catalogue ) ) {
                        // mark source as currently being harvested
                        inProgress.add( source );
                        HarvestProcessor processor = new HarvestProcessor( this, source );
                        processor.start();
                    }
                } catch ( Exception e ) {
                    e.printStackTrace();
                    LOG.logError( Messages.format( "CatalogueHarvester.exception1", source ), e );
                    informResponseHandlers( source, e );
                }
            }
        } catch ( Exception e ) {
            LOG.logError( Messages.getString( "CatalogueHarvester.exception2" ), e );
        }

    }

    /**
     * inner class for processing asynchronous harvesting of a catalogue
     *
     * @version $Revision: 19475 $
     * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
     * @author last edited by: $Author: lbuesching $
     *
     * @version 1.0. $Revision: 19475 $, $Date: 2009-09-02 14:51:48 +0200 (Mi, 02 Sep 2009) $
     *
     * @since 2.0
     */
    protected class HarvestProcessor extends AbstractHarvestProcessor {

        private Map<String, Record> records = new HashMap<String, Record>( 10000 );

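        // version negotiated from the source's capabilities document; stays at the
        // 2.0.0 default if the capabilities cannot be read or parsed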
        private String sourceVersion = "2.0.0";

        /**
         * @param owner
         *            harvester that started this process
         * @param source
         *            URI of the catalogue to be harvested
         */
        HarvestProcessor( AbstractHarvester owner, URI source ) {
            super( owner, source );
            try {
                String capaRequest = source + "?REQUEST=GetCapabilities&service=CSW";
                CatalogueConfigurationDocument capa = new CatalogueConfigurationDocument();
                capa.load( new URL( capaRequest ) );
                List<String> versions = Arrays.asList( capa.getServiceIdentification().getServiceTypeVersions() );
                Collections.sort( versions );
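                // CSW version strings share the fixed "x.y.z" pattern, so a plain
                // lexicographic sort leaves the highest supported version last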
                sourceVersion = versions.get( versions.size() - 1 );
            } catch ( IOException e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception5", source ), e );
            } catch ( SAXException e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception6", source ), e );
            } catch ( XMLParsingException e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception7", source ), e );
            }
        }

        @Override
        public void run() {

            String[] typeNames = new String[] { "csw:dataset", "csw:datasetcollection", "csw:application",
                                               "csw:service" };
            records.clear();
            try {
                HarvestRepository repository = HarvestRepository.getInstance();
                XMLFragment metaData = null;
                Date harvestingTimestamp = repository.getNextHarvestingTimestamp( source );

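                // a CSW 2.0.2 source is harvested with a single untyped GetRecords
                // sequence, while older versions require one pass per type name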
                if ( "2.0.2".equals( sourceVersion ) ) {
                    runHarvest( "", metaData, repository );
                } else {
                    for ( int i = 0; i < typeNames.length; i++ ) {
                        runHarvest( typeNames[i], metaData, repository );
                    }
                }

                // delete all records from the target catalogue and from the harvest
                // cache that are not provided by the source anymore
                deleteRecordsNotHostedAnymore( source );

                // update timestamps only if the transaction has been performed
                // successfully
                writeLastHarvestingTimestamp( source, harvestingTimestamp );
                writeNextHarvestingTimestamp( source, harvestingTimestamp );
                // inform handlers assigned to the harvest request about the successfully
                // harvested CSW. Even if harvesting a few records has failed, the harvest
                // process will be declared successful if it can be finished regularly
                informResponseHandlers( source );
                if ( repository.getHarvestInterval( source ) <= 0 ) {
                    repository.dropRequest( source );
                }
            } catch ( Exception e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception4", source ), e );
                try {
                    e.printStackTrace();
                    owner.informResponseHandlers( source, e );
                } catch ( Exception ee ) {
                    ee.printStackTrace();
                }
            } finally {
                inProgress.remove( source );
            }

        }

        /**
         * harvests all metadata records of the passed type from the source catalogue
         *
         * @param typeName
         * @param metaData
         * @param repository
         * @throws XMLException
         * @throws IOException
         * @throws SAXException
         * @throws XMLParsingException
         */
        private void runHarvest( String typeName, XMLFragment metaData, HarvestRepository repository )
                                throws XMLException, IOException, SAXException, XMLParsingException {
            int index = 1;
            int hits = getNoOfMetadataRecord( source, typeName );
            LOG.logInfo( hits + " metadatasets to harvest ..." );
            for ( int j = 0; j < hits; j++ ) {
                try {
                    // read index'th metadata set from CSW
                    metaData = getNextMetadataRecord( source, index, typeName );
                    if ( metaData != null ) {
                        // read the record from the harvest database if the dataset has
                        // been harvested before, or create a new one
                        Record record = createOrGetRecord( source, metaData );
                        records.put( record.getFileIdentifier(), record );
                        String trans = null;
                        try {
                            // determine the harvest operation to perform
                            // insert: dataset has not been harvested before
                            // update: dataset has been harvested before but has changed
                            // nothing: e.g. dataset is not a known metadata format
                            HarvestOperation ho = getHarvestOperation( record, metaData );
                            if ( ho == HarvestOperation.insert ) {
                                trans = createInsertRequest( metaData );
                            } else if ( ho == HarvestOperation.update ) {
                                trans = createUpdateRequest( getID( metaData ),
                                                             getIdentifierXPathForUpdate( metaData ), metaData );
                            }
                            // perform harvesting for the current dataset; insert it or
                            // update the existing dataset in this CSW
                            if ( ho != HarvestOperation.nothing ) {
                                performTransaction( trans );
                                repository.storeRecord( record );
                            } else {
                                LOG.logInfo( "nothing to Harvest" );
                            }
                        } catch ( Throwable e ) {
                            LOG.logError( Messages.format( "CatalogueHarvester.exception3", index, getID( metaData ),
                                                           source ), e );
                            try {
                                // inform handlers assigned to the harvest request about
                                // the failure to harvest one specific dataset.
                                // notice: if harvesting one dataset fails, the complete
                                // harvest process does not fail; it goes on with the
                                // next record
                                owner.informResponseHandlers( source, e );
                            } catch ( Exception ee ) {
                                ee.printStackTrace();
                            }
                            // remove the fileIdentifier of the current dataset from the
                            // list of inserted or updated datasets. After processing all
                            // available metadata records this list will be used to adjust
                            // the list of datasets assigned to a specific CSW in the
                            // harvest-metadata db schema
                            records.remove( record.getFileIdentifier() );
                        }
                    } else {
                        LOG.logInfo( "harvesting will be stopped at index: " + index + " because metadata == null" );
                    }
                    LOG.logDebug( "metadata at index " + index + " == null: " + ( metaData == null ) );
                } catch ( Throwable e ) {
                    LOG.logError( Messages.format( "CatalogueHarvester.exception3", index, "not available", source ), e );
                    try {
                        e.printStackTrace();
                        // inform handlers assigned to the harvest request about the
                        // failure to harvest one specific dataset.
                        // notice: if harvesting one dataset fails, the complete harvest
                        // process does not fail; it goes on with the next record
                        owner.informResponseHandlers( source, e );
                    } catch ( Exception ee ) {
                        ee.printStackTrace();
                    }
                }
                index++;
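                // nudge the garbage collector periodically; harvesting builds a large
                // number of short-lived DOM trees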
                if ( index % 1000 == 0 ) {
                    System.gc();
                }

            }
        }

        /**
         * returns the XPath to the metadata record's dateStamp
         *
         * @param metaData
         * @return the XPath to the metadata record's dateStamp
         */
        private String getDateStampXPath( XMLFragment metaData ) {
            String xpath = null;
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
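                // the XPath is schema-specific: the root element's namespace URI
                // (with "http://" stripped) keys a "dateStamp_..." entry in the
                // harvester's message properties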
                xpath = Messages.getString( "dateStamp_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the identifier of a metadata record to enable its update and deletion
         *
         * @param metaData
         * @return the identifier of a metadata record to enable its update and deletion
         * @throws XMLParsingException
         */
        private String getID( XMLFragment metaData )
                                throws XMLParsingException {
            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
            return fileIdentifier;
        }

        @Override
        protected String createConstraint( String identifier, String xPath )
                                throws IOException {

            // read template from file
            URL url = Templates.getTemplate( "Constraints_" + version );
            String constraints = FileUtils.readTextFile( url ).toString();

            constraints = StringTools.replace( constraints, "$identifier$", identifier, false );
            return StringTools.replace( constraints, "$xPath$", xPath, false );
        }

        /**
         * checks for each record stored in the harvester cache whether it is still
         * provided by the harvested catalogue; if not, the record will be removed from
         * the cache and from the harvesting catalogue.
         *
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         * @throws SAXException
         * @throws OGCWebServiceException
         */
        private void deleteRecordsNotHostedAnymore( URI source )
                                throws DBPoolException, SQLException, IOException, OGCWebServiceException, SAXException {
            HarvestRepository repository = HarvestRepository.getInstance();
            List<String> cache = repository.getAllRecords( source );
            int id = repository.getSourceID( source );
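            // every fileIdentifier still cached but not seen during this harvest run
            // belongs to a record the source does not provide anymore: drop it from
            // the repository and delete it from the local catalogue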
            for ( int i = 0; i < cache.size(); i++ ) {
                String fid = cache.get( i );
                Record record = records.remove( fid );
                if ( record == null ) {
                    repository.dropRecord( repository.new Record( id, null, fid, source ) );
                    String trans = createDeleteRequest( fid );
                    performTransaction( trans );
                }
            }
        }

        /**
         * tries to read a record from the harvest repository. If it is not already
         * stored in the repository, a new record will be created
         *
         * @param source
         * @param metaData
         * @return record from harvest repository
         * @throws XMLParsingException
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         */
        private Record createOrGetRecord( URI source, XMLFragment metaData )
                                throws XMLParsingException, IOException, DBPoolException, SQLException {

            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );

            HarvestRepository repository = HarvestRepository.getInstance();
            Record record = repository.getRecordByID( source, fileIdentifier );
            if ( record == null ) {
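                // unknown fileIdentifier: create a transient record carrying the
                // metadata's dateStamp; sourceId -1 marks it as not yet stored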
                xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                record = repository.new Record( -1, date, fileIdentifier, source );
            }

            return record;
        }

        /**
         * determines which operation shall be performed on a metadata record read from
         * a remote catalogue
         *
         * @param record
         * @param metaData
         * @return type of harvest operation to perform
         * @throws XMLParsingException
         */
        private HarvestOperation getHarvestOperation( Record record, XMLFragment metaData )
                                throws XMLParsingException {

            HarvestOperation ho = HarvestOperation.nothing;
            if ( record.getSourceId() < 0 ) {
                ho = HarvestOperation.insert;
            } else {
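                // the record is already known: update only if its dateStamp differs
                // from the one cached at the last harvest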
                String xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                if ( !date.equals( record.getDatestamp() ) ) {
                    ho = HarvestOperation.update;
                }
            }
            return ho;
        }

        /**
         * reads the next metadata record from the source catalogue
         *
         * @param source
         * @param index
         * @param type
         * @return metadata record, or <code>null</code> if no record is available at
         *         the passed index
         * @throws IOException
         * @throws HttpException
         * @throws SAXException
         * @throws XMLException
         * @throws XMLParsingException
         */
        private XMLFragment getNextMetadataRecord( URI source, int index, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read template from file
            URL url = Templates.getTemplate( "GetRecords_" + sourceVersion );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$index$", Integer.toString( index ), false );
            getRecords = StringTools.replace( getRecords, "$type$", type, false );
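            // $index$ presumably maps to the request's startPosition, so the source
            // is paged through one record per request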

            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            int timeout = 30000;
            try {
                timeout = Integer.parseInt( Messages.getString( "harvest.source.timeout" ) );
            } catch ( Exception e ) {
                LOG.logInfo( "cannot read timeout from messages.properties because: " + e.getMessage()
                             + "; using 30 sec as default" );
            }
            client.getHttpConnectionManager().getParams().setSoTimeout( timeout );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );

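            // unwrap the single harvested record from its SearchResults container;
            // a missing child element means no record was returned for this index,
            // which is signalled to the caller by returning null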
            Node node = XMLTools.getNode( xml.getRootElement(), Messages.getString( "SearchResult.child_"
                                                                                    + sourceVersion ), nsc );
            if ( node != null ) {
                xml.setRootElement( (Element) node );
            } else {
                xml = null;
            }

            return xml;
        }

        private int getNoOfMetadataRecord( URI source, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

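            // ask the source how many records of the passed type it provides; only
            // the numberOfRecordsMatched value is evaluated from the response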
            // read template from file
            URL url = Templates.getTemplate( "GetNoOfRecords_" + sourceVersion );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$type$", type, false );
            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            client.getHttpConnectionManager().getParams().setSoTimeout( 30000 );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );

            return XMLTools.getNodeAsInt( xml.getRootElement(), Messages.getString( "NumberOfRecordsMatched_"
                                                                                    + sourceVersion ), nsc, 0 );
        }

    }

}