//$HeadURL: svn+ssh://rbezema@svn.wald.intevation.org/deegree/base/branches/2.2_testing/src/org/deegree/ogcwebservices/csw/manager/CatalogueHarvester.java $
/*----------------    FILE HEADER  ------------------------------------------

 This file is part of deegree.
 Copyright (C) 2001-2008 by:
 EXSE, Department of Geography, University of Bonn
 http://www.giub.uni-bonn.de/deegree/
 lat/lon GmbH
 http://www.lat-lon.de

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

 Contact:

 Andreas Poth
 lat/lon GmbH
 Aennchenstr. 19
 53115 Bonn
 Germany
 E-Mail: poth@lat-lon.de

 Prof. Dr. Klaus Greve
 Department of Geography
 University of Bonn
 Meckenheimer Allee 166
 53115 Bonn
 Germany
 E-Mail: greve@giub.uni-bonn.de


 ---------------------------------------------------------------------------*/
package org.deegree.ogcwebservices.csw.manager;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.sql.SQLException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.deegree.enterprise.WebUtils;
import org.deegree.framework.log.ILogger;
import org.deegree.framework.log.LoggerFactory;
import org.deegree.framework.util.CharsetUtils;
import org.deegree.framework.util.FileUtils;
import org.deegree.framework.util.StringTools;
import org.deegree.framework.util.TimeTools;
import org.deegree.framework.xml.XMLException;
import org.deegree.framework.xml.XMLFragment;
import org.deegree.framework.xml.XMLParsingException;
import org.deegree.framework.xml.XMLTools;
import org.deegree.io.DBPoolException;
import org.deegree.ogcwebservices.OGCWebServiceException;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.Record;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.ResourceType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

/**
 * Harvester implementation for harvesting other catalogue services. Only dataset, series
 * (datasetcollection), application and service metadata types will be harvested.
 *
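 * <p>
 * Minimal usage sketch (hypothetical; in a deegree CSW the harvester is normally scheduled by the
 * service's harvesting machinery rather than invoked directly):
 *
 * <pre>
 * CatalogueHarvester harvester = CatalogueHarvester.getInstance();
 * harvester.run(); // performs one harvest iteration over all registered catalogue sources
 * </pre>
 *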
 * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
 * @author last edited by: $Author: apoth $
 *
 * @version 1.0. $Revision: 9345 $, $Date: 2007-12-27 17:22:25 +0100 (Do, 27 Dez 2007) $
 *
 * @since 2.0
 */
public class CatalogueHarvester extends AbstractHarvester {

    private static final ILogger LOG = LoggerFactory.getLogger( CatalogueHarvester.class );

    private static CatalogueHarvester ch = null;

    private enum HarvestOperation {
        insert, update, delete, nothing
    }

    /**
     * Singleton accessor.
     *
     * @return instance of CatalogueHarvester
     */
    public static CatalogueHarvester getInstance() {
        if ( ch == null ) {
            ch = new CatalogueHarvester();
        }
        return ch;
    }

    @Override
    public void run() {
        LOG.logDebug( "starting harvest iteration for CatalogueHarvester." );
        try {
            HarvestRepository repository = HarvestRepository.getInstance();

            List<URI> sources = repository.getSources();
            for ( URI source : sources ) {
                try {
                    // determine if the source shall be harvested
                    if ( shallHarvest( source, ResourceType.catalogue ) ) {
                        // mark the source as currently being harvested
                        inProgress.add( source );
                        HarvestProcessor processor = new HarvestProcessor( this, source );
                        processor.start();
                    }
                } catch ( Exception e ) {
                    LOG.logError( Messages.format( "CatalogueHarvester.exception1", source ), e );
                    informResponseHandlers( source, e );
                }
            }
        } catch ( Exception e ) {
            LOG.logError( Messages.getString( "CatalogueHarvester.exception2" ), e );
        }

    }

    /**
     * Inner class for processing asynchronous harvesting of a catalogue.
     *
     * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
     * @author last edited by: $Author: apoth $
     *
     * @version 1.0. $Revision: 9345 $, $Date: 2007-12-27 17:22:25 +0100 (Do, 27 Dez 2007) $
     *
     * @since 2.0
     */
    protected class HarvestProcessor extends AbstractHarvestProcessor {

        private Map<String, Record> records = new HashMap<String, Record>( 10000 );

        /**
         * @param owner
         *            harvester that owns this processor
         * @param source
         *            catalogue to be harvested
         */
        HarvestProcessor( AbstractHarvester owner, URI source ) {
            super( owner, source );
        }

        @Override
        public void run() {

            String[] typeNames = new String[] { "csw:dataset", "csw:datasetcollection", "csw:application",
                                               "csw:service" };
            records.clear();
            try {
                HarvestRepository repository = HarvestRepository.getInstance();
                XMLFragment metaData = null;
                Date harvestingTimestamp = repository.getNextHarvestingTimestamp( source );
                for ( int i = 0; i < typeNames.length; i++ ) {
                    int index = 1;
                    int hits = getNoOfMetadataRecord( source, typeNames[i] );
                    LOG.logInfo( hits + " metadata sets to harvest ..." );
                    for ( int j = 0; j < hits; j++ ) {

                        try {
                            // read the index'th metadata set from the CSW
                            metaData = getNextMetadataRecord( source, index, typeNames[i] );
                            if ( metaData != null ) {
                                // read the record from the harvest database if the dataset has been
                                // harvested before, or create a new one
                                Record record = createOrGetRecord( source, metaData );
                                records.put( record.getFileIdentifier(), record );
                                String trans = null;
                                try {
                                    // determine the harvest operation to perform
                                    // insert: dataset has not been harvested before
                                    // update: dataset has been harvested before but has changed
                                    // nothing: e.g. dataset is not a known metadata format
                                    HarvestOperation ho = getHarvestOperation( record, metaData );
                                    if ( ho == HarvestOperation.insert ) {
                                        trans = createInsertRequest( metaData );
                                    } else if ( ho == HarvestOperation.update ) {
                                        trans = createUpdateRequest( getID( metaData ), getIdentifierXPath( metaData ),
                                                                     metaData );
                                    }
                                    // perform harvesting for the current dataset; insert it or update
                                    // the existing dataset in this CSW
                                    if ( ho != HarvestOperation.nothing ) {
                                        performTransaction( trans );
                                        repository.storeRecord( record );
                                    } else {
                                        LOG.logInfo( "nothing to harvest" );
                                    }
                                } catch ( Throwable e ) {
                                    LOG.logError( Messages.format( "CatalogueHarvester.exception3", index,
                                                                   getID( metaData ), source ), e );
                                    try {
                                        // inform the handlers assigned to the harvest request about
                                        // the failure to harvest one specific dataset.
                                        // notice: if harvesting one dataset fails, the complete
                                        // harvest process does not fail; it goes on with the next
                                        // record
                                        owner.informResponseHandlers( source, e );
                                    } catch ( Exception ee ) {
                                        ee.printStackTrace();
                                    }
                                    // remove the fileIdentifier of the current dataset from the list
                                    // of inserted or updated datasets. After processing all available
                                    // metadata records this list will be used to adjust the list of
                                    // datasets assigned to a specific CSW in the harvest-metadata db
                                    // schema
                                    records.remove( record.getFileIdentifier() );
                                }
                            } else {
                                LOG.logInfo( "harvesting will be stopped at index: " + index
                                             + " because metadata == null" );
                            }
                            LOG.logDebug( index + " metadata " + ( metaData == null ) );
                        } catch ( Throwable e ) {
                            LOG.logError( Messages.format( "CatalogueHarvester.exception3", index, "not available",
                                                           source ), e );
                            try {
                                // inform the handlers assigned to the harvest request about the
                                // failure to harvest one specific dataset.
                                // notice: if harvesting one dataset fails, the complete harvest
                                // process does not fail; it goes on with the next record
                                owner.informResponseHandlers( source, e );
                            } catch ( Exception ee ) {
                                ee.printStackTrace();
                            }
                        }
                        index++;
                        if ( index % 1000 == 0 ) {
                            System.gc();
                        }

                    }

                }

                // delete all records from the target catalogue and from the harvest cache
                // that are not hosted by the source anymore
                deleteRecordsNotHostedAnymore( source );

                // update the timestamps only if the transaction has been performed
                // successfully
                writeLastHarvestingTimestamp( source, harvestingTimestamp );
                writeNextHarvestingTimestamp( source, harvestingTimestamp );
                // inform the handlers assigned to the harvest request about the successfully
                // harvested CSW. Even if harvesting a few records has failed, the harvest
                // process will be declared successful if it can be finished regularly
                informResponseHandlers( source );
                if ( repository.getHarvestInterval( source ) <= 0 ) {
                    repository.dropRequest( source );
                }
            } catch ( Exception e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception4", source ), e );
                try {
                    owner.informResponseHandlers( source, e );
                } catch ( Exception ee ) {
                    ee.printStackTrace();
                }
            } finally {
                inProgress.remove( source );
            }

        }

        /**
         * returns the XPath to the metadata record's identifier
         *
         * @param metaData
         * @return the XPath to the metadata record's identifier
         */
        private String getIdentifierXPath( XMLFragment metaData ) {
            // default is ISO 19115
            String xpath = "iso19115:fileIdentifier/smXML:CharacterString";
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
                xpath = Messages.getString( "Identifier_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the XPath to the metadata record's dateStamp
         *
         * @param metaData
         * @return the XPath to the metadata record's dateStamp
         */
        private String getDateStampXPath( XMLFragment metaData ) {
            String xpath = null;
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
                xpath = Messages.getString( "dateStamp_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the identifier of a metadata record to enable its update and deletion
         *
         * @param metaData
         * @return the identifier of a metadata record to enable its update and deletion
         * @throws XMLParsingException
         */
        private String getID( XMLFragment metaData )
                                throws XMLParsingException {
            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
            return fileIdentifier;
        }

        @Override
        protected String createConstraint( String identifier, String xPath )
                                throws IOException {

            // read template from file
            // TODO
            // read different templates depending on metadata format
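            // For illustration only: the bundled template is assumed to contain an OGC filter
            // skeleton with the two placeholders substituted below, roughly of this shape
            // (hypothetical; the authoritative content is iso09_constraints_template.xml):
            //   <csw:Constraint><ogc:Filter><ogc:PropertyIsEqualTo>
            //     <ogc:PropertyName>$xPath$</ogc:PropertyName>
            //     <ogc:Literal>$identifier$</ogc:Literal>
            //   </ogc:PropertyIsEqualTo></ogc:Filter></csw:Constraint>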
            URL url = CatalogueHarvester.class.getResource( "iso09_constraints_template.xml" );
            String constraints = FileUtils.readTextFile( url ).toString();

            constraints = StringTools.replace( constraints, "$identifier$", identifier, false );
            return StringTools.replace( constraints, "$xPath$", xPath, false );
        }

        /**
         * checks whether a record stored in the harvester cache is not provided by the harvested
         * catalogue anymore; if so, the record will be removed from the cache and from the
         * harvesting catalogue.
         *
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         * @throws SAXException
         * @throws OGCWebServiceException
         *
         */
        private void deleteRecordsNotHostedAnymore( URI source )
                                throws DBPoolException, SQLException, IOException, OGCWebServiceException, SAXException {
            HarvestRepository repository = HarvestRepository.getInstance();
            List<String> cache = repository.getAllRecords( source );
            int id = repository.getSourceID( source );
            for ( int i = 0; i < cache.size(); i++ ) {
                String fid = cache.get( i );
                Record record = records.remove( fid );
                if ( record == null ) {
                    repository.dropRecord( repository.new Record( id, null, fid, source ) );
                    String trans = createDeleteRequest( fid, "./iso19115:fileIdentifier/smXML:CharacterString" );
                    performTransaction( trans );
                }
            }
        }

        /**
         * tries to read a record from the harvest repository. If it is not already stored in the
         * repository, a new record will be created.
         *
         * @param metaData
         * @return record from the harvest repository
         * @throws XMLParsingException
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         */
        private Record createOrGetRecord( URI source, XMLFragment metaData )
                                throws XMLParsingException, IOException, DBPoolException, SQLException {

            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );

            HarvestRepository repository = HarvestRepository.getInstance();
            Record record = repository.getRecordByID( source, fileIdentifier );
            if ( record == null ) {
                xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                record = repository.new Record( -1, date, fileIdentifier, source );
            }

            return record;
        }

        /**
         * determines which operation shall be performed on a metadata record read from a remote
         * catalogue
         *
         * @param record
         * @param metaData
         * @return type of harvest operation to perform
         * @throws XMLParsingException
         */
        private HarvestOperation getHarvestOperation( Record record, XMLFragment metaData )
                                throws XMLParsingException {

            HarvestOperation ho = HarvestOperation.nothing;
            if ( record.getSourceId() < 0 ) {
                ho = HarvestOperation.insert;
            } else {
                String xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                if ( !date.equals( record.getDatestamp() ) ) {
                    ho = HarvestOperation.update;
                }
            }
            return ho;
        }

        /**
         * reads the next metadata record from the source catalogue
         *
         * @param source
         * @param index
         * @param type
         * @return metadata record
         * @throws IOException
         * @throws HttpException
         * @throws SAXException
         * @throws XMLException
         * @throws XMLParsingException
         */
        private XMLFragment getNextMetadataRecord( URI source, int index, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read template from file
            // TODO
            // read different templates depending on metadata format
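            // For illustration only: the bundled template is assumed to be a CSW GetRecords
            // request in which "$index$" marks the startPosition of the single record to fetch
            // and "$type$" the queried type name, roughly (hypothetical shape):
            //   <csw:GetRecords resultType="results" startPosition="$index$" maxRecords="1" ...>
            //     <csw:Query typeNames="$type$">...</csw:Query>
            //   </csw:GetRecords>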
            URL url = CatalogueHarvester.class.getResource( "iso09_getrecords_template.xml" );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$index$", Integer.toString( index ), false );
            getRecords = StringTools.replace( getRecords, "$type$", type, false );

            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            int timeout = 30000;
            try {
                timeout = Integer.parseInt( Messages.getString( "harvest.source.timeout" ) );
            } catch ( Exception e ) {
                LOG.logInfo( "cannot read timeout from messages.properties because: " + e.getMessage()
                             + "; using 30 sec as default" );
            }
            client.getHttpConnectionManager().getParams().setSoTimeout( timeout );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );
            // release the connection once the response has been consumed
            post.releaseConnection();

            Node node = XMLTools.getNode( xml.getRootElement(), "csw:SearchResults/child::*[1]", nsc );
            if ( node != null ) {
                xml.setRootElement( (Element) node );
            } else {
                xml = null;
            }

            return xml;
        }

        /**
         * returns the number of metadata records matching the passed type at the source catalogue
         *
         * @param source
         * @param type
         * @return number of matching metadata records
         * @throws IOException
         * @throws XMLException
         * @throws SAXException
         * @throws XMLParsingException
         */
        private int getNoOfMetadataRecord( URI source, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read template from file
            // TODO
            // read different templates depending on metadata format
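            // For illustration only: this template is assumed to be a CSW GetRecords request with
            // resultType="hits", so that only the numberOfRecordsMatched attribute is evaluated
            // below and no actual records are transferred (hypothetical shape):
            //   <csw:GetRecords resultType="hits" ...><csw:Query typeNames="$type$"/></csw:GetRecords>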
            URL url = CatalogueHarvester.class.getResource( "iso09_get_no_of_records_template.xml" );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$type$", type, false );

            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            client.getHttpConnectionManager().getParams().setSoTimeout( 30000 );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );
            // release the connection once the response has been consumed
            post.releaseConnection();

            return XMLTools.getNodeAsInt( xml.getRootElement(), "csw:SearchResults/@numberOfRecordsMatched", nsc, 0 );

        }

    }

}