//$HeadURL: svn+ssh://rbezema@svn.wald.intevation.org/deegree/base/branches/2.2_testing/src/org/deegree/ogcwebservices/csw/manager/CatalogueHarvester.java $
/*----------------    FILE HEADER  ------------------------------------------

 This file is part of deegree.
 Copyright (C) 2001-2008 by:
 EXSE, Department of Geography, University of Bonn
 http://www.giub.uni-bonn.de/deegree/
 lat/lon GmbH
 http://www.lat-lon.de

 This library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.

 This library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public
 License along with this library; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 Contact:

 Andreas Poth
 lat/lon GmbH
 Aennchenstr. 19
 53115 Bonn
 Germany
 E-Mail: poth@lat-lon.de

 Prof. Dr. Klaus Greve
 Department of Geography
 University of Bonn
 Meckenheimer Allee 166
 53115 Bonn
 Germany
 E-Mail: greve@giub.uni-bonn.de


 ---------------------------------------------------------------------------*/
package org.deegree.ogcwebservices.csw.manager;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URL;
import java.sql.SQLException;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.methods.StringRequestEntity;
import org.deegree.enterprise.WebUtils;
import org.deegree.framework.log.ILogger;
import org.deegree.framework.log.LoggerFactory;
import org.deegree.framework.util.CharsetUtils;
import org.deegree.framework.util.FileUtils;
import org.deegree.framework.util.StringTools;
import org.deegree.framework.util.TimeTools;
import org.deegree.framework.xml.XMLException;
import org.deegree.framework.xml.XMLFragment;
import org.deegree.framework.xml.XMLParsingException;
import org.deegree.framework.xml.XMLTools;
import org.deegree.io.DBPoolException;
import org.deegree.ogcwebservices.OGCWebServiceException;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.Record;
import org.deegree.ogcwebservices.csw.manager.HarvestRepository.ResourceType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

/**
 * Harvester implementation for harvesting other catalogue services. Only the dataset, series
 * (datasetcollection), application and service metadata types will be harvested.
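 * <p>
 * A minimal usage sketch (for illustration only; within the deegree CSW the harvester is
 * normally not invoked by hand):
 * 
 * <pre>
 * CatalogueHarvester harvester = CatalogueHarvester.getInstance();
 * harvester.run(); // performs one harvest iteration over all registered catalogue sources
 * </pre>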
 * 
 * @version $Revision: 9345 $
 * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
 * @author last edited by: $Author: apoth $
 * 
 * @version 1.0. $Revision: 9345 $, $Date: 2007-12-27 17:22:25 +0100 (Do, 27 Dez 2007) $
 * 
 * @since 2.0
 */
public class CatalogueHarvester extends AbstractHarvester {

    private static final ILogger LOG = LoggerFactory.getLogger( CatalogueHarvester.class );

    private static CatalogueHarvester ch = null;

    private enum HarvestOperation {
        insert, update, delete, nothing
    };

    /**
     * singleton accessor
     * 
     * @return instance of CatalogueHarvester
     */
    public static CatalogueHarvester getInstance() {
        if ( ch == null ) {
            ch = new CatalogueHarvester();
        }
        return ch;
    }

    @Override
    public void run() {
        LOG.logDebug( "starting harvest iteration for CatalogueHarvester." );
        try {
            HarvestRepository repository = HarvestRepository.getInstance();

            List<URI> sources = repository.getSources();
            for ( Iterator<URI> iter = sources.iterator(); iter.hasNext(); ) {
                URI source = iter.next();
                try {
                    // determine if source shall be harvested
                    if ( shallHarvest( source, ResourceType.catalogue ) ) {
                        // mark source as currently being harvested
                        inProgress.add( source );
                        HarvestProcessor processor = new HarvestProcessor( this, source );
                        processor.start();
                    }
                } catch ( Exception e ) {
                    e.printStackTrace();
                    LOG.logError( Messages.format( "CatalogueHarvester.exception1", source ), e );
                    informResponseHandlers( source, e );
                }
            }
        } catch ( Exception e ) {
            LOG.logError( Messages.getString( "CatalogueHarvester.exception2" ), e );
        }

    }

    /**
     * inner class for processing asynchronous harvesting of a catalogue
     * 
     * @version $Revision: 9345 $
     * @author <a href="mailto:poth@lat-lon.de">Andreas Poth</a>
     * @author last edited by: $Author: apoth $
     * 
     * @version 1.0. $Revision: 9345 $, $Date: 2007-12-27 17:22:25 +0100 (Do, 27 Dez 2007) $
     * 
     * @since 2.0
     */
    protected class HarvestProcessor extends AbstractHarvestProcessor {

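        // collects the records inserted or updated during the current harvest run, keyed by
        // fileIdentifier; after all records have been processed, deleteRecordsNoHostedAnymore
        // uses it to find cached records that the remote catalogue no longer provides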
        private Map<String, Record> records = new HashMap<String, Record>( 10000 );

        /**
         * 
         * @param owner
         *            harvester that owns and controls this processor
         * @param source
         *            address of the catalogue (CSW) to harvest
         */
        HarvestProcessor( AbstractHarvester owner, URI source ) {
            super( owner, source );
        }

        @Override
        public void run() {

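            // metadata type names requested from the remote catalogue; one GetRecords pass is
            // performed per type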
            String[] typeNames = new String[] { "csw:dataset", "csw:datasetcollection", "csw:application",
                                                "csw:service" };
            records.clear();
            try {
                HarvestRepository repository = HarvestRepository.getInstance();
                XMLFragment metaData = null;
                Date harvestingTimestamp = repository.getNextHarvestingTimestamp( source );
                for ( int i = 0; i < typeNames.length; i++ ) {
                    int index = 1;
                    int hits = getNoOfMetadataRecord( source, typeNames[i] );
                    LOG.logInfo( hits + " metadatasets to harvest ..." );
                    for ( int j = 0; j < hits; j++ ) {

                        try {
                            // read index'th metadata set from CSW
                            metaData = getNextMetadataRecord( source, index, typeNames[i] );
                            if ( metaData != null ) {
                                // read record from harvest database if dataset has been harvested
                                // before, or create a new one
                                Record record = createOrGetRecord( source, metaData );
                                records.put( record.getFileIdentifier(), record );
                                String trans = null;
                                try {
                                    // determine harvest operation to perform
                                    // insert: dataset has not been harvested before
                                    // update: dataset has been harvested before but has changed
                                    // nothing: e.g. dataset is not a known metadata format
                                    HarvestOperation ho = getHarvestOperation( record, metaData );
                                    if ( ho == HarvestOperation.insert ) {
                                        trans = createInsertRequest( metaData );
                                    } else if ( ho == HarvestOperation.update ) {
                                        trans = createUpdateRequest( getID( metaData ), getIdentifierXPath( metaData ),
                                                                     metaData );
                                    }
                                    // perform harvesting for current dataset; insert it or update
                                    // the existing dataset in this CSW
                                    if ( ho != HarvestOperation.nothing ) {
                                        performTransaction( trans );
                                        repository.storeRecord( record );
                                    } else {
                                        LOG.logInfo( "nothing to Harvest" );
                                    }
                                } catch ( Throwable e ) {
                                    LOG.logError( Messages.format( "CatalogueHarvester.exception3", index,
                                                                   getID( metaData ), source ), e );
                                    try {
                                        e.printStackTrace();
                                        // inform handlers assigned to the harvest request about the
                                        // failure to harvest one specific dataset.
                                        // notice: if harvesting one dataset fails, the complete harvest
                                        // process does not fail; the process goes on with the next record
                                        owner.informResponseHandlers( source, e );
                                    } catch ( Exception ee ) {
                                        ee.printStackTrace();
                                    }
                                    // remove fileIdentifier of the current dataset from the list of
                                    // inserted or updated datasets. After processing all available
                                    // metadata records this list will be used to adjust the list of
                                    // datasets assigned to a specific CSW in the harvest-metadata db
                                    // schema
                                    records.remove( record.getFileIdentifier() );
                                }
                            } else {
                                LOG.logInfo( "harvesting will be stopped at index: " + index
                                             + " because metadata == null" );
                            }
                            LOG.logDebug( index + " metadata " + ( metaData == null ) );
                        } catch ( Throwable e ) {
                            LOG.logError( Messages.format( "CatalogueHarvester.exception3", index, "not available",
                                                           source ), e );
                            try {
                                e.printStackTrace();
                                // inform handlers assigned to the harvest request about the failure to
                                // harvest one specific dataset.
                                // notice: if harvesting one dataset fails, the complete harvest process
                                // does not fail; the process goes on with the next record
                                owner.informResponseHandlers( source, e );
                            } catch ( Exception ee ) {
                                ee.printStackTrace();
                            }
                        }
                        index++;
                        if ( index % 1000 == 0 ) {
                            // suggest garbage collection after every 1000 processed records to keep
                            // memory usage low during large harvests
                            System.gc();
                        }

                    }

                }

                // delete all records from the target catalogue and from the harvest cache
                // that are not provided by the harvested catalogue anymore
                deleteRecordsNoHostedAnymore( source );

                // update timestamps just if the transaction has been performed
                // successfully
                writeLastHarvestingTimestamp( source, harvestingTimestamp );
                writeNextHarvestingTimestamp( source, harvestingTimestamp );
                // inform handlers assigned to the harvest request about the successfully harvested
                // CSW. Even if harvesting a few records has failed, the harvest process will be
                // declared successful if it can be finished regularly
                informResponseHandlers( source );
                if ( repository.getHarvestInterval( source ) <= 0 ) {
                    repository.dropRequest( source );
                }
            } catch ( Exception e ) {
                LOG.logError( Messages.format( "CatalogueHarvester.exception4", source ), e );
                try {
                    e.printStackTrace();
                    owner.informResponseHandlers( source, e );
                } catch ( Exception ee ) {
                    ee.printStackTrace();
                }
            } finally {
                inProgress.remove( source );
            }

        }

        /**
         * returns the XPath of the metadata record's identifier
         * 
         * @param metaData
         *            metadata record
         * @return the XPath of the metadata record's identifier
         */
        private String getIdentifierXPath( XMLFragment metaData ) {
            // default is ISO 19115
            String xpath = "iso19115:fileIdentifier/smXML:CharacterString";
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
                // schema specific XPath, looked up via the root element's namespace
                xpath = Messages.getString( "Identifier_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the XPath of the metadata record's dateStamp
         * 
         * @param metaData
         *            metadata record
         * @return the XPath of the metadata record's dateStamp
         */
        private String getDateStampXPath( XMLFragment metaData ) {
            String xpath = null;
            if ( metaData != null ) {
                String nspace = metaData.getRootElement().getNamespaceURI();
                nspace = StringTools.replace( nspace, "http://", "", true );
                xpath = Messages.getString( "dateStamp_" + nspace );
            }
            return xpath;
        }

        /**
         * returns the identifier of a metadata record to enable its update and deletion
         * 
         * @param metaData
         * @return the identifier of a metadata record to enable its update and deletion
         * @throws XMLParsingException
         */
        private String getID( XMLFragment metaData )
                                throws XMLParsingException {
            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
            return fileIdentifier;
        }

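        /**
         * creates a constraint fragment from the template iso09_constraints_template.xml by
         * replacing the placeholders $identifier$ and $xPath$ with the passed values
         * 
         * @param identifier
         *            fileIdentifier of the record to address
         * @param xPath
         *            XPath of the identifier property within the metadata format
         * @return constraint fragment as string
         * @throws IOException
         *             if the template can not be read
         */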
        @Override
        protected String createConstraint( String identifier, String xPath )
                                throws IOException {

            // read template from file
            // TODO
            // read different templates depending on metadata format
            URL url = CatalogueHarvester.class.getResource( "iso09_constraints_template.xml" );
            String constraints = FileUtils.readTextFile( url ).toString();

            constraints = StringTools.replace( constraints, "$identifier$", identifier, false );
            return StringTools.replace( constraints, "$xPath$", xPath, false );
        }

        /**
         * checks for each record stored in the harvester cache whether it is still provided by
         * the harvested catalogue; if not, the record will be removed from the cache and from
         * the harvesting catalogue.
         * 
         * @param source
         *            catalogue that has been harvested
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         * @throws SAXException
         * @throws OGCWebServiceException
         * 
         */
        private void deleteRecordsNoHostedAnymore( URI source )
                                throws DBPoolException, SQLException, IOException, OGCWebServiceException, SAXException {
            HarvestRepository repository = HarvestRepository.getInstance();
            List<String> cache = repository.getAllRecords( source );
            int id = repository.getSourceID( source );
            for ( int i = 0; i < cache.size(); i++ ) {
                String fid = cache.get( i );
                Record record = records.remove( fid );
                if ( record == null ) {
                    repository.dropRecord( repository.new Record( id, null, fid, source ) );
                    String trans = createDeleteRequest( fid, "./iso19115:fileIdentifier/smXML:CharacterString" );
                    performTransaction( trans );
                }
            }
        }

        /**
         * tries to read a record from the harvest repository. If the record is not already
         * stored in the repository, a new record will be created
         * 
         * @param source
         *            catalogue the record has been read from
         * @param metaData
         *            metadata record
         * @return record from the harvest repository
         * @throws XMLParsingException
         * @throws IOException
         * @throws SQLException
         * @throws DBPoolException
         */
        private Record createOrGetRecord( URI source, XMLFragment metaData )
                                throws XMLParsingException, IOException, DBPoolException, SQLException {

            String xpath = getIdentifierXPath( metaData );
            String fileIdentifier = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );

            HarvestRepository repository = HarvestRepository.getInstance();
            Record record = repository.getRecordByID( source, fileIdentifier );
            if ( record == null ) {
                xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                record = repository.new Record( -1, date, fileIdentifier, source );
            }

            return record;
        }

        /**
         * determines what operation shall be performed on a metadata record read from a remote
         * catalogue
         * 
         * @param record
         *            record from the harvest repository (or a newly created one)
         * @param metaData
         *            metadata record read from the remote catalogue
         * @return type of harvest operation to perform
         * @throws XMLParsingException
         */
        private HarvestOperation getHarvestOperation( Record record, XMLFragment metaData )
                                throws XMLParsingException {

            HarvestOperation ho = HarvestOperation.nothing;
            if ( record.getSourceId() < 0 ) {
                ho = HarvestOperation.insert;
            } else {
                String xpath = getDateStampXPath( metaData );
                String s = XMLTools.getRequiredNodeAsString( metaData.getRootElement(), xpath, nsc );
                Date date = TimeTools.createCalendar( s ).getTime();
                if ( !date.equals( record.getDatestamp() ) ) {
                    ho = HarvestOperation.update;
                }
            }
            return ho;
        }

        /**
         * reads the metadata record at the passed position (index) and of the passed type from
         * the source catalogue
         * 
         * @param source
         *            catalogue to read from
         * @param index
         *            position of the record to read (1-based)
         * @param type
         *            metadata type to read (e.g. csw:dataset)
         * @return metadata record, or <code>null</code> if no record could be read
         * @throws IOException
         * @throws HttpException
         * @throws SAXException
         * @throws XMLException
         * @throws XMLParsingException
         */
        private XMLFragment getNextMetadataRecord( URI source, int index, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read template from file
            // TODO
            // read different templates depending on metadata format
            URL url = CatalogueHarvester.class.getResource( "iso09_getrecords_template.xml" );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$index$", Integer.toString( index ), false );
            getRecords = StringTools.replace( getRecords, "$type$", type, false );
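
            // at this point getRecords is assumed to hold a complete CSW GetRecords request
            // asking for a single record of the passed type at position $index$; the exact
            // request depends on the contents of iso09_getrecords_template.xml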
            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            int timeout = 30000;
            try {
                timeout = Integer.parseInt( Messages.getString( "harvest.source.timeout" ) );
            } catch ( Exception e ) {
                LOG.logInfo( "cannot read timeout from messages.properties because: " + e.getMessage()
                             + "; using 30 sec as default" );
            }
            client.getHttpConnectionManager().getParams().setSoTimeout( timeout );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );

            Node node = XMLTools.getNode( xml.getRootElement(), "csw:SearchResults/child::*[1]", nsc );
            if ( node != null ) {
                xml.setRootElement( (Element) node );
            } else {
                xml = null;
            }

            return xml;
        }

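        /**
         * determines the number of metadata records of the passed type offered by the source
         * catalogue by sending the GetRecords request built from
         * iso09_get_no_of_records_template.xml and evaluating
         * csw:SearchResults/@numberOfRecordsMatched of the response
         * 
         * @param source
         *            catalogue to ask
         * @param type
         *            metadata type (e.g. csw:dataset)
         * @return number of matching metadata records
         * @throws IOException
         * @throws XMLException
         * @throws SAXException
         * @throws XMLParsingException
         */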
        private int getNoOfMetadataRecord( URI source, String type )
                                throws IOException, XMLException, SAXException, XMLParsingException {

            // read template from file
            // TODO
            // read different templates depending on metadata format
            URL url = CatalogueHarvester.class.getResource( "iso09_get_no_of_records_template.xml" );
            String getRecords = FileUtils.readTextFile( url ).toString();
            getRecords = StringTools.replace( getRecords, "$type$", type, false );

            StringRequestEntity re = new StringRequestEntity( getRecords, "text/xml", CharsetUtils.getSystemCharset() );
            PostMethod post = new PostMethod( source.toASCIIString() );
            post.setRequestEntity( re );
            HttpClient client = new HttpClient();
            client.getHttpConnectionManager().getParams().setSoTimeout( 30000 );
            client = WebUtils.enableProxyUsage( client, source.toURL() );
            client.executeMethod( post );
            InputStream is = post.getResponseBodyAsStream();
            XMLFragment xml = new XMLFragment();
            xml.load( is, source.toURL().toExternalForm() );

            return XMLTools.getNodeAsInt( xml.getRootElement(), "csw:SearchResults/@numberOfRecordsMatched", nsc, 0 );

        }

    }

}