001/*
002 * Copyright 2006 - 2013
003 *     Stefan Balev     <stefan.balev@graphstream-project.org>
004 *     Julien Baudry    <julien.baudry@graphstream-project.org>
005 *     Antoine Dutot    <antoine.dutot@graphstream-project.org>
006 *     Yoann Pigné      <yoann.pigne@graphstream-project.org>
007 *     Guilhelm Savin   <guilhelm.savin@graphstream-project.org>
008 * 
009 * This file is part of GraphStream <http://graphstream-project.org>.
010 * 
011 * GraphStream is a library whose purpose is to handle static or dynamic
012 * graph, create them from scratch, file or any source and display them.
013 * 
014 * This program is free software distributed under the terms of two licenses, the
015 * CeCILL-C license that fits European law, and the GNU Lesser General Public
016 * License. You can  use, modify and/ or redistribute the software under the terms
017 * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following
018 * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by
019 * the Free Software Foundation, either version 3 of the License, or (at your
020 * option) any later version.
021 * 
022 * This program is distributed in the hope that it will be useful, but WITHOUT ANY
023 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
024 * PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more details.
025 * 
026 * You should have received a copy of the GNU Lesser General Public License
027 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
028 * 
029 * The fact that you are presently reading this means that you have had
030 * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.
031 */
032package org.graphstream.algorithm.generator;
033
034import java.io.BufferedReader;
035import java.io.IOException;
036import java.io.InputStream;
037import java.io.InputStreamReader;
038import java.net.HttpURLConnection;
039import java.net.URI;
040import java.net.URISyntaxException;
041import java.net.URLConnection;
042import java.util.HashSet;
043import java.util.LinkedList;
044import java.util.concurrent.locks.ReentrantLock;
045import java.util.regex.Matcher;
046import java.util.regex.Pattern;
047
048import org.graphstream.graph.Edge;
049import org.graphstream.graph.Node;
050
051/**
052 * Generate a graph using the web. Some urls are given to start and the
053 * generator will extract links on these pages. Each url is a node and there is
054 * an edge between two urls when one has a link to the other. Links are
055 * extracted using the "href" attribute of html elements.
056 * 
057 */
058public class URLGenerator extends BaseGenerator {
059
060        public static enum Mode {
061                HOST, PATH, FULL
062        }
063
064        private static String REGEX = "href=\"([^\"]*)\"";
065
066        protected HashSet<String> urls;
067        protected LinkedList<String> stepUrls;
068        protected HashSet<String> newUrls;
069        protected Pattern hrefPattern;
070        protected Mode mode;
071        protected int threads = 2;
072        protected String nodeWeight = "weight";
073        protected String edgeWeight = "weight";
074        protected LinkedList<URLFilter> filters;
075        protected double step;
076        protected boolean printProgress;
077        protected int depthLimit;
078        protected final ReentrantLock lock;
079
080        public URLGenerator(String... startFrom) {
081                urls = new HashSet<String>();
082                stepUrls = new LinkedList<String>();
083                newUrls = new HashSet<String>();
084                hrefPattern = Pattern.compile(REGEX);
085                mode = Mode.HOST;
086                filters = new LinkedList<URLFilter>();
087                directed = false;
088                step = 0;
089                printProgress = false;
090                depthLimit = 0;
091                lock = new ReentrantLock();
092
093                declineMatchingURL("^(javascript:|mailto:|#).*");
094                declineMatchingURL(".*[.](avi|tar|gz|zip|mp3|mpg|jpg|jpeg|png|ogg|flv|ico|svg)$");
095
096                setUseInternalGraph(true);
097
098                if (startFrom != null) {
099                        for (int i = 0; i < startFrom.length; i++) {
100                                stepUrls.add(startFrom[i]);
101                        }
102                }
103        }
104
105        /*
106         * (non-Javadoc)
107         * 
108         * @see org.graphstream.algorithm.generator.Generator#begin()
109         */
110        public void begin() {
111        }
112
113        /*
114         * (non-Javadoc)
115         * 
116         * @see org.graphstream.algorithm.generator.Generator#nextEvents()
117         */
118        public boolean nextEvents() {
119                sendStepBegins(sourceId, step);
120                sendGraphAttributeChanged(sourceId, "urls.parsed", null, urls.size());
121                sendGraphAttributeChanged(sourceId, "urls.remaining", null,
122                                stepUrls.size());
123
124                if (printProgress)
125                        progress();
126
127                for (String url : stepUrls) {
128                        try {
129                                addNodeURL(url);
130                        } catch (URISyntaxException e) {
131                                e.printStackTrace();
132                        }
133                }
134
135                urls.addAll(stepUrls);
136                newUrls.clear();
137
138                if (threads > 1)
139                        nextEventsThreaded();
140                else {
141                        for (String url : stepUrls) {
142                                try {
143                                        parseUrl(url);
144                                } catch (IOException e) {
145                                        System.err.printf("Failed to parse \"%s\" : %s\n", url,
146                                                        e.getMessage());
147                                }
148                        }
149                }
150
151                stepUrls.clear();
152                stepUrls.addAll(newUrls);
153                step++;
154                
155                return newUrls.size() > 0;
156        }
157
158        /**
159         * Add an url to process.
160         * 
161         * @param url
162         *            a new url
163         */
164        public void addURL(String url) {
165                stepUrls.add(url);
166        }
167
168        /**
169         * Create directed edges.
170         * 
171         * @param on
172         *            true to create directed edges
173         */
174        public void setDirected(boolean on) {
175                setDirectedEdges(on, false);
176        }
177
178        /**
179         * Set the attribute key used to store weight of nodes. Whenever a node is
180         * reached, its weight is increased by one.
181         * 
182         * @param attribute
183         *            attribute key of the weight of nodes
184         */
185        public void setNodeWeightAttribute(String attribute) {
186                this.nodeWeight = attribute;
187        }
188
189        /**
190         * Set the attribute key used to store weight of edges. Whenever an edge is
191         * reached, its weight is increased by one.
192         * 
193         * @param attribute
194         *            attribute key of the weight of edges
195         */
196        public void setEdgeWeightAttribute(String attribute) {
197                this.edgeWeight = attribute;
198        }
199
200        /**
201         * Set the way that url are converted to node id. When mode is Mode.FULL,
202         * then the id is the raw url. With Mode.PATH, the query of the url is
203         * truncated so the url http://host/path?what=xxx will be converted as
204         * http://host/path. With Mode.HOST, the url is converted to the host name
205         * so the url http://host/path will be converted as http://host.
206         * 
207         * @param mode
208         *            mode specifying how to convert url to have node id
209         */
210        public void setMode(Mode mode) {
211                this.mode = mode;
212        }
213
214        /**
215         * Set the amount of threads used to parse urls. Threads are created in the
216         * {@link #nextEvents()} step. At the end of this method, all working thread
217         * have stop.
218         * 
219         * @param count
220         *            amount of threads
221         */
222        public void setThreadCount(int count) {
223                this.threads = count;
224        }
225
226        /**
227         * Set the maximum steps before stop. If 0 or less, limit is disabled.
228         * 
229         * @param depthLimit
230         */
231        public void setDepthLimit(int depthLimit) {
232                this.depthLimit = depthLimit;
233        }
234
235        public void enableProgression(boolean on) {
236                printProgress = on;
237        }
238
239        /**
240         * Can be used to filter url. Url not matching this regex will be discarded.
241         * 
242         * @param regex
243         */
244        public void acceptOnlyMatchingURL(final String regex) {
245                URLFilter f = new URLFilter() {
246                        public boolean accept(String url) {
247                                if (url.matches(regex))
248                                        return true;
249
250                                return false;
251                        }
252                };
253
254                filters.add(f);
255        }
256
257        /**
258         * Can be used to filter url. Url matching this regex will be discarded.
259         * 
260         * @param regex
261         */
262        public void declineMatchingURL(final String regex) {
263                URLFilter f = new URLFilter() {
264                        public boolean accept(String url) {
265                                if (!url.matches(regex))
266                                        return true;
267
268                                return false;
269                        }
270                };
271
272                filters.add(f);
273        }
274
275        /**
276         * Can be used to filter url according to the host. Note that several calls
277         * to this method may lead to discard all url. All hosts should be gived in
278         * a single call.
279         * 
280         * @param hosts
281         *            list of accepted hosts
282         */
283        public void addHostFilter(String... hosts) {
284                if (hosts != null) {
285                        StringBuilder b = new StringBuilder(
286                                        "^(\\w+:)?(//)?([\\w-\\d]+[.])?(");
287                        b.append(hosts[0]);
288
289                        for (int i = 1; i < hosts.length; i++)
290                                b.append("|").append(hosts[i]);
291
292                        b.append(").*");
293
294                        acceptOnlyMatchingURL(b.toString());
295                }
296        }
297
298        protected void nextEventsThreaded() {
299                int t = Math.min(threads, stepUrls.size());
300                int byThreads = stepUrls.size() / t;
301
302                LinkedList<Worker> workers = new LinkedList<Worker>();
303                LinkedList<Thread> workersThreads = new LinkedList<Thread>();
304
305                for (int i = 0; i < t; i++) {
306                        int start = i * byThreads;
307                        int stop = (i + 1) * byThreads;
308
309                        if (i == t - 1)
310                                stop += stepUrls.size() % t;
311
312                        Worker w = new Worker(start, stop, stepUrls);
313                        Thread u = new Thread(w);
314
315                        u.start();
316
317                        workers.add(w);
318                        workersThreads.add(u);
319                }
320
321                for (int i = 0; i < t; i++) {
322                        try {
323                                workersThreads.get(i).join();
324                        } catch (InterruptedException e) {
325                                e.printStackTrace();
326                        }
327                }
328        }
329
330        protected boolean isValid(String url) {
331                for (int i = 0; i < filters.size(); i++) {
332                        if (!filters.get(i).accept(url))
333                                return false;
334                }
335
336                return true;
337        }
338
339        /**
340         * Parse an url and add all extracted links in a specified set.
341         * 
342         * @param url
343         *            the url to parse
344         * @param newUrls
345         *            the set where extracted links will be added
346         * @throws IOException
347         */
348        protected void parseUrl(String url) throws IOException {
349                URI uri;
350                URLConnection conn;
351                InputStream stream;
352                BufferedReader reader;
353                HashSet<String> localUrls = new HashSet<String>();
354
355                if (!isValid(url))
356                        return;
357
358                try {
359                        uri = new URI(url);
360                } catch (URISyntaxException e1) {
361                        throw new IOException(e1);
362                }
363
364                if (uri.getHost() == null) {
365                        System.err.printf("skip invalid uri : '%s'\n", url);
366                        return;
367                }
368
369                if (!uri.isAbsolute()) {
370                        System.err.printf("skip non-absolute uri : '%s'\n", url);
371                        return;
372                }
373
374                conn = uri.toURL().openConnection();
375                conn.setConnectTimeout(1000);
376                conn.setReadTimeout(1000);
377                conn.connect();
378
379                if (conn.getContentType() == null
380                                || !conn.getContentType().startsWith("text/html"))
381                        return;
382
383                stream = conn.getInputStream();
384                reader = new BufferedReader(new InputStreamReader(stream));
385
386                while (reader.ready()) {
387                        String line = reader.readLine();
388
389                        if (line == null)
390                                continue;
391
392                        Matcher m = hrefPattern.matcher(line);
393
394                        while (m.find()) {
395                                String href = m.group(1);
396
397                                if (href == null || href.length() == 0)
398                                        continue;
399
400                                href = href.trim();
401
402                                if (href.charAt(0) == '/')
403                                        href = String.format("%s://%s%s", uri.getScheme(),
404                                                        uri.getHost(), href);
405
406                                if (href.charAt(0) == '.')
407                                        href = String.format("%s%s", url, href);
408
409                                if (!isValid(href))
410                                        continue;
411
412                                try {
413                                        if (depthLimit == 0 || step < depthLimit) {
414                                                synchronizedOperation(href, null);
415                                                synchronizedOperation(url, href);
416                                        } else {
417                                                if (urls.contains(href))
418                                                        synchronizedOperation(url, href);
419                                        }
420                                } catch (URISyntaxException e) {
421                                        throw new IOException(e);
422                                }
423
424                                if (!urls.contains(href)
425                                                && (depthLimit == 0 || step < depthLimit))
426                                        localUrls.add(href);
427                        }
428                }
429
430                lock.lock();
431
432                try {
433                        newUrls.addAll(localUrls);
434                } finally {
435                        lock.unlock();
436                }
437
438                localUrls.clear();
439                localUrls = null;
440
441                try {
442                        if (conn.getDoOutput())
443                                conn.getOutputStream().close();
444                        reader.close();
445                        stream.close();
446                } catch (IOException e) {
447                        // Do not throw this exception
448                }
449
450                if (conn instanceof HttpURLConnection)
451                        ((HttpURLConnection) conn).disconnect();
452        }
453
454        protected String getNodeId(String url) throws URISyntaxException {
455                String nodeId = url;
456                URI uri = new URI(url);
457
458                switch (mode) {
459                case HOST:
460                        nodeId = String.format("%s://%s", uri.getScheme(), uri.getHost());
461                        break;
462                case PATH:
463                        nodeId = String.format("%s://%s%s", uri.getScheme(), uri.getHost(),
464                                        uri.getPath());
465                        break;
466                case FULL:
467                        nodeId = String.format("%s://%s%s%s", uri.getScheme(), uri
468                                        .getHost(), uri.getPath(), uri.getQuery() == null ? ""
469                                        : uri.getQuery());
470                        break;
471                }
472
473                return nodeId;
474        }
475
476        protected String getNodeLabel(String url) throws URISyntaxException {
477                return url;
478        }
479
480        protected String getEdgeId(String nodeId1, String nodeId2) {
481                if (directed || nodeId1.compareTo(nodeId2) < 0)
482                        return String.format("%s > %s", nodeId1, nodeId2);
483
484                return String.format("%s > %s", nodeId2, nodeId1);
485        }
486
487        protected synchronized void synchronizedOperation(String url1, String url2)
488                        throws URISyntaxException {
489                if (url2 == null)
490                        addNodeURL(url1);
491                else
492                        connect(url1, url2);
493        }
494
495        protected void addNodeURL(String url) throws URISyntaxException {
496                String nodeId = getNodeId(url);
497
498                // urls.add(url);
499
500                if (internalGraph.getNode(nodeId) == null) {
501                        addNode(nodeId);
502                        sendNodeAttributeAdded(sourceId, nodeId, "label", getNodeLabel(url));
503                        // System.out.printf("> new url '%s' --> '%s'\n", url, nodeId);
504                }
505
506                Node n = internalGraph.getNode(nodeId);
507                double w;
508
509                if (n.hasNumber(nodeWeight))
510                        w = n.getNumber(nodeWeight);
511                else
512                        w = 0;
513
514                n.setAttribute(nodeWeight, w + 1);
515                sendNodeAttributeChanged(sourceId, nodeId, nodeWeight, null, w + 1);
516        }
517
518        protected void connect(String url1, String url2) throws URISyntaxException {
519                String src, trg, eid;
520
521                src = getNodeId(url1);
522                trg = getNodeId(url2);
523
524                if (internalGraph.getNode(src) == null)
525                        addNode(src);
526
527                if (internalGraph.getNode(trg) == null)
528                        addNode(trg);
529
530                if (!src.equals(trg)) {
531                        eid = getEdgeId(src, trg);
532
533                        if (internalGraph.getEdge(eid) == null)
534                                addEdge(eid, src, trg);
535
536                        Edge e = internalGraph.getEdge(eid);
537                        double w;
538
539                        if (e.hasNumber(edgeWeight))
540                                w = e.getNumber(edgeWeight);
541                        else
542                                w = 0;
543
544                        e.setAttribute(edgeWeight, w + 1);
545                        sendEdgeAttributeChanged(sourceId, eid, edgeWeight, null, w + 1);
546                }
547        }
548
549        protected void progress() {
550                System.out.printf("\033[s\033[K%d urls parsed, %d remaining\033[u",
551                                urls.size(), stepUrls.size());
552        }
553
554        /*
555         * Private class used to distribute url parsing.
556         */
557        private class Worker implements Runnable {
558                int start, stop;
559                LinkedList<String> urls;
560
561                public Worker(int start, int stop, LinkedList<String> urls) {
562                        this.start = start;
563                        this.stop = stop;
564                        this.urls = urls;
565                }
566
567                public void run() {
568                        for (int i = start; i < stop; i++) {
569                                try {
570                                        parseUrl(urls.get(i));
571                                } catch (IOException e) {
572                                        System.err.printf("Failed to parse \"%s\" : %s\n",
573                                                        urls.get(i), e.getMessage());
574                                }
575                        }
576                }
577        }
578
579        /**
580         * Defines url filter.
581         */
582        public static interface URLFilter {
583                /**
584                 * Called by the generator to know if the specified url can be accepted
585                 * by this filter. If a filter return false, then the url is discarded.
586                 * 
587                 * @param url
588                 *            the url to check if it can be accepted
589                 * @return true if the url is accepted
590                 */
591                boolean accept(String url);
592        }
593}