001    //$HeadURL: https://svn.wald.intevation.org/svn/deegree/base/branches/2.3_testing/src/org/deegree/io/csv/CSVReader.java $
002    /*----------------------------------------------------------------------------
003     This file is part of deegree, http://deegree.org/
004     Copyright (C) 2001-2009 by:
005       Department of Geography, University of Bonn
006     and
007       lat/lon GmbH
008    
009     This library is free software; you can redistribute it and/or modify it under
010     the terms of the GNU Lesser General Public License as published by the Free
011     Software Foundation; either version 2.1 of the License, or (at your option)
012     any later version.
013     This library is distributed in the hope that it will be useful, but WITHOUT
014     ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
015     FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
016     details.
017     You should have received a copy of the GNU Lesser General Public License
018     along with this library; if not, write to the Free Software Foundation, Inc.,
019     59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
020    
021     Contact information:
022    
023     lat/lon GmbH
024     Aennchenstr. 19, 53177 Bonn
025     Germany
026     http://lat-lon.de/
027    
028     Department of Geography, University of Bonn
029     Prof. Dr. Klaus Greve
030     Postfach 1147, 53001 Bonn
031     Germany
032     http://www.geographie.uni-bonn.de/deegree/
033    
034     e-mail: info@deegree.org
035    ----------------------------------------------------------------------------*/
036    
037    package org.deegree.io.csv;
038    
039    import static java.io.StreamTokenizer.TT_EOF;
040    import static java.lang.Double.parseDouble;
041    import static java.util.Collections.unmodifiableList;
042    import static org.deegree.datatypes.Types.VARCHAR;
043    import static org.deegree.framework.log.LoggerFactory.getLogger;
044    import static org.deegree.io.mapinfoapi.MapInfoReader.whitespaceChars;
045    import static org.deegree.io.mapinfoapi.MapInfoReader.wordChars;
046    import static org.deegree.model.feature.FeatureFactory.createFeature;
047    import static org.deegree.model.feature.FeatureFactory.createFeatureCollection;
048    import static org.deegree.model.feature.FeatureFactory.createFeatureProperty;
049    import static org.deegree.model.feature.FeatureFactory.createFeatureType;
050    import static org.deegree.model.feature.FeatureFactory.createGeometryPropertyType;
051    import static org.deegree.model.feature.FeatureFactory.createSimplePropertyType;
052    import static org.deegree.model.spatialschema.GeometryFactory.createPoint;
053    import static org.deegree.model.spatialschema.WKTAdapter.wrap;
054    
055    import java.io.BufferedReader;
056    import java.io.File;
057    import java.io.FileReader;
058    import java.io.IOException;
059    import java.io.StreamTokenizer;
060    import java.io.StringReader;
061    import java.net.URI;
062    import java.net.URISyntaxException;
063    import java.util.ArrayList;
064    import java.util.LinkedList;
065    import java.util.List;
066    
067    import org.deegree.datatypes.QualifiedName;
068    import org.deegree.framework.log.ILogger;
069    import org.deegree.model.feature.FeatureCollection;
070    import org.deegree.model.feature.FeatureProperty;
071    import org.deegree.model.feature.schema.FeatureType;
072    import org.deegree.model.feature.schema.PropertyType;
073    import org.deegree.model.spatialschema.Geometry;
074    import org.deegree.model.spatialschema.GeometryException;
075    
076    /**
077     * <code>CSVReader</code>
078     *
079     * @author <a href="mailto:schmitz@lat-lon.de">Andreas Schmitz</a>
080     * @author last edited by: $Author: mschneider $
081     *
082     * @version $Revision: 18195 $, $Date: 2009-06-18 17:55:39 +0200 (Do, 18. Jun 2009) $
083     */
084    public class CSVReader {
085    
086        private static final ILogger LOG = getLogger( CSVReader.class );
087    
088        private File fileName;
089    
090        private int xcol = 0, ycol = 1, wkt = -1;
091    
092        private static URI APPNS;
093    
094        private List<String[]> header;
095    
096        private boolean ignoreFirstLine, parseGeometryProperty = true;
097    
098        static {
099            try {
100                APPNS = new URI( "http://www.deegree.org/app" );
101            } catch ( URISyntaxException e ) {
102                // yes, cannot happen
103            }
104        }
105    
106        /**
107         * @param name
108         * @param ignoreFirstLine
109         * @throws IOException
110         */
111        public CSVReader( String name, boolean ignoreFirstLine ) throws IOException {
112            this.ignoreFirstLine = ignoreFirstLine;
113    
114            fileName = new File( name ).getAbsoluteFile();
115    
116            header = new ArrayList<String[]>( 3 );
117    
118            BufferedReader in = new BufferedReader( new FileReader( name ) );
119            String str = in.readLine();
120            char separat = determineSeparator( str );
121            do {
122                List<String> lst = parseLine( str, separat );
123                header.add( lst.toArray( new String[lst.size()] ) );
124            } while ( ( ( str = in.readLine() ) != null ) && header.size() < 3 );
125            in.close();
126        }
127    
128        /**
129         * @return max. the first three lines of the file (if there are three)
130         */
131        public List<String[]> getHeader() {
132            return unmodifiableList( header );
133        }
134    
135        /**
136         * By default, a geometry property will be parsed. Set this to false to get "simple property only" features.
137         *
138         * @param parseGeometryProperty
139         */
140        public void setParseGeometryProperty( boolean parseGeometryProperty ) {
141            this.parseGeometryProperty = parseGeometryProperty;
142        }
143    
144        private static char determineSeparator( String s ) {
145            // determine most likely separator
146            int ccount = countChars( s, ',' );
147            int scount = countChars( s, ';' );
148            int tcount = countChars( s, '\t' );
149            if ( ccount >= scount && ccount >= tcount ) {
150                return ',';
151            }
152            if ( tcount >= ccount && tcount >= scount ) {
153                return '\t';
154            }
155            if ( scount >= ccount && scount >= tcount ) {
156                return ';';
157            }
158            return ',';
159        }
160    
161        private static List<String> parseLine( String line, char separator )
162                                throws IOException {
163            String seps = ",;\t";
164            for ( int i = 0; i < seps.length(); ++i ) {
165                if ( line.startsWith( "" + seps.charAt( i ) ) ) {
166                    line = "\"\"" + line;
167                }
168                String dseps = "" + seps.charAt( i ) + seps.charAt( i );
169                while ( line.indexOf( dseps ) != -1 ) {
170                    line = line.replace( dseps, seps.charAt( i ) + "\"\"" + seps.charAt( i ) );
171                }
172            }
173            StreamTokenizer tok = getCSVFromStringTokenizer( line, separator );
174    
175            LinkedList<String> list = new LinkedList<String>();
176    
177            tok.nextToken();
178            if ( tok.ttype == TT_EOF ) {
179                return list;
180            }
181            while ( tok.ttype != TT_EOF ) {
182                list.add( tok.sval );
183                tok.nextToken();
184            }
185    
186            return list;
187        }
188    
189        /**
190         * Also sets wkt to -1.
191         *
192         * @param x
193         * @param y
194         */
195        public void setPointColumns( int x, int y ) {
196            xcol = x;
197            ycol = y;
198            wkt = -1;
199        }
200    
201        /**
202         * @param wkt
203         *            if -1, x/y will be used instead
204         */
205        public void setWKTColumn( int wkt ) {
206            this.wkt = wkt;
207        }
208    
209        /**
210         * @param input
211         * @param separator
212         * @return a tokenizer with a stringreader as data input
213         */
214        public static StreamTokenizer getCSVFromStringTokenizer( String input, char separator ) {
215            StreamTokenizer tok = new StreamTokenizer( new StringReader( input ) );
216    
217            tok.resetSyntax();
218            tok.eolIsSignificant( true );
219            tok.lowerCaseMode( true );
220            tok.slashSlashComments( false );
221            tok.slashStarComments( false );
222            tok.wordChars( 'a', 'z' );
223            tok.wordChars( 'A', 'Z' );
224            tok.wordChars( '\u00a0', '\u00ff' );
225            tok.wordChars( '0', '9' );
226            wordChars( tok, ',', '\t', ';' );
227            wordChars( tok, '.', '-', '_', ' ', '+', '/', '\\', '(', ')', '^' );
228            tok.quoteChar( '"' );
229            whitespaceChars( tok, '\n', '\r', '\f' );
230    
231            // reset separator
232            whitespaceChars( tok, separator );
233    
234            return tok;
235        }
236    
237        private static int countChars( String s, char c ) {
238            int count = 0;
239            for ( int i = 0; i < s.length(); ++i ) {
240                if ( s.charAt( i ) == c ) {
241                    ++count;
242                }
243            }
244            return count;
245        }
246    
247        /**
248         * @return a new feature collection
249         * @throws IOException
250         */
251        public FeatureCollection parseFeatureCollection()
252                                throws IOException {
253            FeatureCollection fc = createFeatureCollection( "uniquemy_", 512 );
254            QualifiedName geomName = new QualifiedName( "app:geometry", APPNS );
255            QualifiedName featureName = new QualifiedName( "app:feature", APPNS );
256    
257            int counter = 0;
258    
259            BufferedReader in = new BufferedReader( new FileReader( fileName ) );
260            String str = in.readLine();
261            List<String> colNames = null;
262    
263            char separator = determineSeparator( str );
264            if ( ignoreFirstLine ) {
265                colNames = parseLine( str, separator );
266                str = in.readLine();
267            }
268            outer: do {
269                LOG.logDebug( "Trying to parse line ", str );
270                List<String> vals = parseLine( str, separator );
271    
272                double x = 0, y = 0;
273                Geometry wktGeom = null;
274                LinkedList<FeatureProperty> fps = new LinkedList<FeatureProperty>();
275                LinkedList<PropertyType> fpt = new LinkedList<PropertyType>();
276    
277                for ( int i = 0; i < vals.size(); ++i ) {
278    
279                    if ( parseGeometryProperty && wkt == -1 && i == xcol ) {
280                        try {
281                            x = parseDouble( vals.get( i ) );
282                        } catch ( NumberFormatException nfe ) {
283                            // puh, CSV is an easy format? I think not...
284                            try {
285                                x = parseDouble( vals.get( i ).replace( ",", "." ) );
286                            } catch ( NumberFormatException nfe2 ) {
287                                LOG.logWarning( "Skipping line " + str );
288                                continue outer;
289                            }
290                        }
291                        continue;
292                    }
293                    if ( parseGeometryProperty && wkt == -1 && i == ycol ) {
294                        if ( vals.get( i ).equals( "" ) ) {
295                            y = 0; // this seems to be a sensible (Java-like) default
296                        } else {
297                            try {
298                                y = parseDouble( vals.get( i ) );
299                            } catch ( NumberFormatException nfe ) {
300                                // puh, CSV is an easy format? I think not...
301                                try {
302                                    y = parseDouble( vals.get( i ).replace( ",", "." ) );
303                                } catch ( NumberFormatException nfe2 ) {
304                                    LOG.logWarning( "Skipping line " + str );
305                                    continue outer;
306                                }
307                            }
308                        }
309                        continue;
310                    }
311                    if ( parseGeometryProperty && wkt != -1 && i == wkt ) {
312                        try {
313                            wktGeom = wrap( vals.get( i ), null );
314                        } catch ( GeometryException e ) {
315                            LOG.logError( "Invalid WKT geometry", e );
316                        }
317                        if ( wktGeom == null ) {
318                            LOG.logError( "Could not parse WKT geometry: " + vals.get( i ) );
319                        }
320                        continue;
321                    }
322    
323                    String n;
324                    if ( ignoreFirstLine ) {
325                        String coln = colNames.get( i );
326                        n = "app:" + ( coln.trim().equals( "" ) ? "property" + i : coln );
327                    } else {
328                        n = "app:property" + i;
329                    }
330                    n = n.replace( ' ', '_' );
331                    QualifiedName name = new QualifiedName( n, APPNS );
332                    fps.add( createFeatureProperty( name, vals.get( i ) ) );
333                    fpt.add( createSimplePropertyType( name, VARCHAR, true ) );
334                }
335    
336                if ( parseGeometryProperty ) {
337                    if ( wkt != -1 && wktGeom != null ) {
338                        fps.add( createFeatureProperty( geomName, wktGeom ) );
339                    } else {
340                        fps.add( createFeatureProperty( geomName, createPoint( x, y, null ) ) );
341                    }
342                    fpt.add( createGeometryPropertyType( geomName, null, 1, 1 ) );
343                }
344    
345                FeatureType tp = createFeatureType( featureName, false, fpt.toArray( new PropertyType[fpt.size()] ) );
346                fc.add( createFeature( ++counter + "", tp, fps ) );
347            } while ( ( ( str = in.readLine() ) != null ) );
348    
349            in.close();
350    
351            // makes sense (?)
352            if ( fc.size() > 0 ) {
353                fc.setFeatureType( fc.getFeature( 0 ).getFeatureType() );
354            }
355    
356            return fc;
357        }
358    }