//$HeadURL: svn+ssh://rbezema@svn.wald.intevation.org/deegree/base/branches/2.2_testing/src/org/deegree/ogcwebservices/csw/manager/CatalogueHarvester.java $
/*----------------    FILE HEADER  ------------------------------------------

 This file is part of deegree.
 Copyright (C) 2001-2008 by:
 EXSE, Department of Geography, University of Bonn
 http://www.giub.uni-bonn.de/deegree/
 lat/lon GmbH
 http://www.lat-lon.de

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

 Contact:

 Andreas Poth
 lat/lon GmbH
 Aennchenstr. 19
 53115 Bonn
 Germany
 E-Mail: poth@lat-lon.de

 Prof. Dr. Klaus Greve
 Department of Geography
 University of Bonn
 Meckenheimer Allee 166
 53115 Bonn
 Germany
 E-Mail: greve@giub.uni-bonn.de

 ---------------------------------------------------------------------------*/
package org.deegree.ogcwebservices.csw.manager;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.sql.SQLException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.deegree.enterprise.WebUtils;
import org.deegree.framework.log.ILogger;
import org.deegree.framework.log.LoggerFactory;
import org.deegree.framework.util.CharsetUtils;
import org.deegree.framework.util.FileUtils;
import org.deegree.framework.util.StringTools;
import org.deegree.framework.util.TimeTools;
import org.deegree.framework.xml.XMLException;
import org.deegree.framework.xml.XMLFragment;
import org.deegree.framework.xml.XMLParsingException;
import org.deegree.framework.xml.XMLTools;
import org.deegree.io.DBPoolException;
import org.deegree.ogcwebservices.OGCWebServiceException;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.Record;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.ResourceType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

/**
 * Harvester implementation for harvesting other catalogue services. Only the dataset, series
 * (datasetcollection), application and service metadata types will be harvested.
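 * <p>
 * A minimal usage sketch (illustrative only; how and when the harvester is scheduled depends on
 * the {@link AbstractHarvester} base class and is not shown here):
 *
 * <pre>
 * // obtain the singleton instance; the sources to harvest are read
 * // from the HarvestRepository when a harvest iteration runs
 * CatalogueHarvester harvester = CatalogueHarvester.getInstance();
 * harvester.run(); // performs one harvest iteration over all registered sources
 * </pre>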
 *
 * @version $Revision: 9345 $
 * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
 * @author last edited by: $Author: apoth $
 *
 * @version 1.0. $Revision: 9345 $, $Date: 2007-12-27 17:22:25 +0100 (Do, 27 Dez 2007) $
 *
 * @since 2.0
 */
public class CatalogueHarvester extends AbstractHarvester {

    private static final ILogger LOG = LoggerFactory.getLogger( CatalogueHarvester.class );

    private static CatalogueHarvester ch = null;

    private enum HarvestOperation {
        insert, update, delete, nothing
    };

    /**
     * singleton
     *
     * @return instance of CatalogueHarvester
     */
    public static CatalogueHarvester getInstance() {
        if ( ch == null ) {
            ch = new CatalogueHarvester();
        }
        return ch;
    }

    @Override
    public void run() {
        LOG.logDebug( "starting harvest iteration for CatalogueHarvester." );
        try {
            HarvestRepository repository = HarvestRepository.getInstance();

            List<URI> sources = repository.getSources();
            for ( Iterator<URI> iter = sources.iterator(); iter.hasNext(); ) {
                URI source = iter.next();
                try {
                    // determine if the source shall be harvested
                    if ( shallHarvest( source, ResourceType.catalogue ) ) {
                        // mark the source as currently being harvested
                        inProgress.add( source );
                        HarvestProcessor processor = new HarvestProcessor( this, source );
                        processor.start();
                    }
                } catch ( Exception e ) {
                    e.printStackTrace();
                    LOG.logError( Messages.format( "CatalogueHarvester.exception1", source ), e );
                    informResponseHandlers( source, e );
                }
            }
        } catch ( Exception e ) {
            LOG.logError( Messages.getString( "CatalogueHarvester.exception2" ), e );
        }

    }

    /**
     * inner class for asynchronously processing the harvesting of a catalogue
     *
     * @version $Revision: 9345 $
     * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
     * @author last edited by: $Author: apoth $
     *
     * @version 1.0. $Revision: 9345 $, $Date: 2007-12-27 17:22:25 +0100 (Do, 27 Dez 2007) $
     *
     * @since 2.0
     */
    protected class HarvestProcessor extends AbstractHarvestProcessor {

        private Map<String, Record> records = new HashMap<String, Record>( 10000 );

        /**
         * @param owner
         *            harvester that owns this process
         * @param source
         *            catalogue to be harvested
         */
        HarvestProcessor( AbstractHarvester owner, URI source ) {
            super( owner, source );
        }

        @Override
        public void run() {

            String[] typeNames = new String[] { "csw:dataset", "csw:datasetcollection", "csw:application",
                                               "csw:service" };
            records.clear();
            try {
                HarvestRepository repository = HarvestRepository.getInstance();
                XMLFragment metaData = null;
                Date harvestingTimestamp = repository.getNextHarvestingTimestamp( source );
                for ( int i = 0; i < typeNames.length; i++ ) {
                    int index = 1;
                    int hits = getNoOfMetadataRecord( source, typeNames[i] );
                    LOG.logInfo( hits + " metadata sets to harvest ..." );
                    for ( int j = 0; j < hits; j++ ) {

                        try {
                            // read the index'th metadata set from the CSW
                            metaData = getNextMetadataRecord( source, index, typeNames[i] );
                            if ( metaData != null ) {
                                // read the record from the harvest database if the dataset has
                                // been harvested before, or create a new one
                                Record record = createOrGetRecord( source, metaData );
                                records.put( record.getFileIdentifier(), record );
                                String trans = null;
                                try {
                                    // determine the harvest operation to perform:
                                    // insert: dataset has not been harvested before
                                    // update: dataset has been harvested before but has changed
                                    // nothing: e.g. dataset is not a known metadata format
                                    HarvestOperation ho = getHarvestOperation( record, metaData );
                                    if ( ho == HarvestOperation.insert ) {
                                        trans = createInsertRequest( metaData );
                                    } else if ( ho == HarvestOperation.update ) {
                                        trans = createUpdateRequest( getID( metaData ), getIdentifierXPath( metaData ),
                                                                     metaData );
                                    }
                                    // perform harvesting for the current dataset; insert it or
                                    // update the existing dataset in this CSW
                                    if ( ho != HarvestOperation.nothing ) {
                                        performTransaction( trans );
                                        repository.storeRecord( record );
                                    } else {
                                        LOG.logInfo( "nothing to harvest" );
                                    }
                                } catch ( Throwable e ) {
                                    LOG.logError( Messages.format( "CatalogueHarvester.exception3", index,
                                                                   getID( metaData ), source ), e );
                                    try {
                                        e.printStackTrace();
                                        // inform the handlers assigned to the harvest request
                                        // about the failure to harvest one specific dataset.
                                        // note: if harvesting one dataset fails, the complete
                                        // harvest process does not fail; it goes on with the
                                        // next record
                                        owner.informResponseHandlers( source, e );
                                    } catch ( Exception ee ) {
                                        ee.printStackTrace();
                                    }
                                    // remove the fileIdentifier of the current dataset from the
                                    // list of inserted or updated datasets. After all available
                                    // metadata records have been processed, this list will be
                                    // used to adjust the list of datasets assigned to a specific
                                    // CSW in the harvest-metadata db schema
                                    records.remove( record.getFileIdentifier() );
                                }
                            } else {
                                LOG.logInfo( "harvesting will be stopped at index: " + index
                                             + " because metadata == null" );
                            }
                            LOG.logDebug( index + " metadata " + ( metaData == null ) );
                        } catch ( Throwable e ) {
                            LOG.logError( Messages.format( "CatalogueHarvester.exception3", index, "not available",
                                                           source ), e );
                            try {
                                e.printStackTrace();
                                // inform the handlers assigned to the harvest request about the
                                // failure to harvest one specific dataset.
                                // note: if harvesting one dataset fails, the complete harvest
                                // process does not fail; it goes on with the next record
                                owner.informResponseHandlers( source, e );
                            } catch ( Exception ee ) {
                                ee.printStackTrace();
                            }
                        }
                        index++;
                        // hint the VM to free memory after every 1000 records
                        if ( index % 1000 == 0 ) {
                            System.gc();
                        }

                    }

                }

                // delete all records that are not hosted by the harvested catalogue anymore
                // from the target catalogue and from the harvest cache
                deleteRecordsNoHostedAnymore( source );

                // update the timestamps only if the transactions have been performed
                // successfully
                writeLastHarvestingTimestamp( source, harvestingTimestamp );
                writeNextHarvestingTimestamp( source, harvestingTimestamp );
                // inform the handlers assigned to the harvest request about the successfully
                // harvested CSW. Even if harvesting a few records has failed, the harvest
                // process will be declared successful if it can be finished regularly
                informResponseHandlers( source );
                if ( repository.getHarvestInterval( source ) <= 0 ) {
                    repository.dropRequest( source );
                }
            } catch ( Exception e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception4", source ), e );
                try {
                    e.printStackTrace();
                    owner.informResponseHandlers( source, e );
                } catch ( Exception ee ) {
                    ee.printStackTrace();
                }
            } finally {
                inProgress.remove( source );
            }

        }
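
        /*
         * The two XPath lookups below resolve message keys of the form "Identifier_" + namespace
         * and "dateStamp_" + namespace, where the leading "http://" is stripped from the root
         * element's namespace URI. A worked example with an assumed namespace:
         *
         *   root element namespace:  http://www.isotc211.org/2005/gmd
         *   resulting message keys:  Identifier_www.isotc211.org/2005/gmd
         *                            dateStamp_www.isotc211.org/2005/gmd
         */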

        /**
         * returns the XPath of the metadata record's identifier
         *
         * @param metaData
         * @return the XPath of the metadata record's identifier
         */
        private String getIdentifierXPath( XMLFragment metaData ) {
            // default is ISO 19115
            String xpath = "iso19115:fileIdentifier/smXML:CharacterString";
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
                xpath = Messages.getString( "Identifier_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the XPath of the metadata record's dateStamp
         *
         * @param metaData
         * @return the XPath of the metadata record's dateStamp
         */
        private String getDateStampXPath( XMLFragment metaData ) {
            String xpath = null;
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
                xpath = Messages.getString( "dateStamp_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the identifier of a metadata record to enable its update and deletion
         *
         * @param metaData
         * @return the identifier of a metadata record to enable its update and deletion
         * @throws XMLParsingException
         */
        private String getID( XMLFragment metaData )
                                throws XMLParsingException {
            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
            return fileIdentifier;
        }
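
        /*
         * A sketch of what the constraints template might contain (an assumption for
         * illustration -- the actual iso09_constraints_template.xml shipped with deegree may
         * differ). createConstraint() below only relies on the $identifier$ and $xPath$
         * placeholders being present:
         *
         * <csw:Constraint version="1.0.0">
         *   <ogc:Filter>
         *     <ogc:PropertyIsEqualTo>
         *       <ogc:PropertyName>$xPath$</ogc:PropertyName>
         *       <ogc:Literal>$identifier$</ogc:Literal>
         *     </ogc:PropertyIsEqualTo>
         *   </ogc:Filter>
         * </csw:Constraint>
         */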

        @Override
        protected String createConstraint( String identifier, String xPath )
                                throws IOException {

            // read the template from file
            // TODO
            // read different templates depending on the metadata format
            URL url = CatalogueHarvester.class.getResource( "iso09_constraints_template.xml" );
            String constraints = FileUtils.readTextFile( url ).toString();

            constraints = StringTools.replace( constraints, "$identifier$", identifier, false );
            return StringTools.replace( constraints, "$xPath$", xPath, false );
        }

        /**
         * checks for each record stored in the harvester cache whether it is still provided by
         * the harvested catalogue; if not, the record will be removed from the cache and from
         * the harvesting catalogue.
         *
         * @param source
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         * @throws SAXException
         * @throws OGCWebServiceException
         *
         */
        private void deleteRecordsNoHostedAnymore( URI source )
                                throws DBPoolException, SQLException, IOException, OGCWebServiceException,
                                SAXException {
            HarvestRepository repository = HarvestRepository.getInstance();
            List<String> cache = repository.getAllRecords( source );
            int id = repository.getSourceID( source );
            for ( int i = 0; i < cache.size(); i++ ) {
                String fid = cache.get( i );
                Record record = records.remove( fid );
                if ( record == null ) {
                    repository.dropRecord( repository.new Record( id, null, fid, source ) );
                    String trans = createDeleteRequest( fid, "./iso19115:fileIdentifier/smXML:CharacterString" );
                    performTransaction( trans );
                }
            }
        }

        /**
         * tries to read a record from the harvest repository. If the record is not already
         * stored in the repository, a new record will be created
         *
         * @param source
         * @param metaData
         * @return record from the harvest repository
         * @throws XMLParsingException
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         */
        private Record createOrGetRecord( URI source, XMLFragment metaData )
                                throws XMLParsingException, IOException, DBPoolException, SQLException {

            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );

            HarvestRepository repository = HarvestRepository.getInstance();
            Record record = repository.getRecordByID( source, fileIdentifier );
            if ( record == null ) {
                xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                record = repository.new Record( -1, date, fileIdentifier, source );
            }

            return record;
        }
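
        /*
         * How getHarvestOperation() below decides, with illustrative values (the concrete dates
         * are assumptions):
         *
         *   cached record has a negative sourceId (never harvested)   -> insert
         *   cached dateStamp 2007-11-09, remote dateStamp 2007-12-01  -> update
         *   cached and remote dateStamp are equal                     -> nothing
         */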

        /**
         * determines which operation shall be performed on a metadata record read from a remote
         * catalogue
         *
         * @param record
         * @param metaData
         * @return type of harvest operation to perform
         * @throws XMLParsingException
         */
        private HarvestOperation getHarvestOperation( Record record, XMLFragment metaData )
                                throws XMLParsingException {

            HarvestOperation ho = HarvestOperation.nothing;
            if ( record.getSourceId() < 0 ) {
                ho = HarvestOperation.insert;
            } else {
                String xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                if ( !date.equals( record.getDatestamp() ) ) {
                    ho = HarvestOperation.update;
                }
            }
            return ho;
        }
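
        /*
         * Sketch of the GetRecords request created from iso09_getrecords_template.xml by
         * getNextMetadataRecord() below (an assumption for illustration -- the actual template
         * shipped with deegree may differ; the code only guarantees that the $index$ and $type$
         * placeholders are replaced):
         *
         * <csw:GetRecords service="CSW" version="2.0.0" startPosition="$index$" maxRecords="1"
         *                 resultType="RESULTS">
         *   <csw:Query typeNames="$type$">
         *     ...
         *   </csw:Query>
         * </csw:GetRecords>
         *
         * The socket timeout used when sending the request can be configured through the
         * message key "harvest.source.timeout" (milliseconds); 30 seconds are used as fallback.
         */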

        /**
         * reads the index'th metadata record of the passed type from the source catalogue
         *
         * @param source
         * @param index
         *            position of the record to read (the first record has index 1)
         * @param type
         *            metadata type to read, e.g. csw:dataset
         * @return metadata record, or <code>null</code> if no (more) records are available
         * @throws IOException
         * @throws HttpException
         * @throws SAXException
         * @throws XMLException
         * @throws XMLParsingException
         */
        private XMLFragment getNextMetadataRecord( URI source, int index, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read the template from file
            // TODO
            // read different templates depending on the metadata format
            URL url = CatalogueHarvester.class.getResource( "iso09_getrecords_template.xml" );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$index$", Integer.toString( index ), false );
            getRecords = StringTools.replace( getRecords, "$type$", type, false );

            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            int timeout = 30000;
            try {
                timeout = Integer.parseInt( Messages.getString( "harvest.source.timeout" ) );
            } catch ( Exception e ) {
                LOG.logInfo( "cannot read timeout from messages.properties because: " + e.getMessage()
                             + "; using 30 sec as default" );
            }
            client.getHttpConnectionManager().getParams().setSoTimeout( timeout );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );

            // use the first record contained in the search results as new root element;
            // if the response does not contain any (more) records, null is returned
            Node node = XMLTools.getNode( xml.getRootElement(), "csw:SearchResults/child::*[1]", nsc );
            if ( node != null ) {
                xml.setRootElement( (Element) node );
            } else {
                xml = null;
            }

            return xml;
        }

        /**
         * returns the number of metadata records of the passed type available from the source
         * catalogue
         *
         * @param source
         * @param type
         * @return number of matching records (0 if it cannot be determined)
         * @throws IOException
         * @throws XMLException
         * @throws SAXException
         * @throws XMLParsingException
         */
        private int getNoOfMetadataRecord( URI source, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read the template from file
            // TODO
            // read different templates depending on the metadata format
            URL url = CatalogueHarvester.class.getResource( "iso09_get_no_of_records_template.xml" );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$type$", type, false );

            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            client.getHttpConnectionManager().getParams().setSoTimeout( 30000 );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );

            return XMLTools.getNodeAsInt( xml.getRootElement(), "csw:SearchResults/@numberOfRecordsMatched", nsc, 0 );

        }

    }

}