From 15d1daf898cd1cc9007e00d90d9baf0ee88ce78b Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 15 Apr 2024 16:12:54 +0200 Subject: [PATCH] feat: tool and scripts to interactively explore webgraphs - the class GraphExplorer allows to explore webgraphs using the JShell - the class Graph holds all webgraph-related data as memory-mapped data: the graph, its transpose and the map to translate between vertex labels and IDs. It provides methods to access successors and predecessors, etc. - the script graph_explore_download_webgraph.sh downloads all files required for exploring a graph - the script graph_explore_build_vertex_map.sh builds a map of vertex labels to vertex ID and verifies that all graph files required for graph exploration are downloaded. - utility methods - get a common subset (intersection) or the union of the successors or predecessors of a list of vertices - class CountingMergedIntIterator to count occurrences of integers given a list of int iterators as input - print list of vertices - load and save vertex lists from/to files - count top-level domains in lists of vertices - JShell script to load a graph - tutorial / quick start graph exploration --- README.md | 16 +- graph-exploration-README.md | 285 +++++++++++++ .../webgraph/CountingMergedIntIterator.java | 135 ++++++ .../commoncrawl/webgraph/explore/Graph.java | 403 ++++++++++++++++++ .../webgraph/explore/GraphExplorer.java | 295 +++++++++++++ .../graph_explore_build_vertex_map.sh | 144 +++++++ .../graph_explore_download_webgraph.sh | 120 ++++++ .../graph_explore_load_graph.jsh | 32 ++ .../process_webgraph_degrees.sh | 54 +++ .../TestCountingMergedIntIterator.java | 77 ++++ 10 files changed, 1559 insertions(+), 2 deletions(-) create mode 100644 graph-exploration-README.md create mode 100644 src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java create mode 100644 src/main/java/org/commoncrawl/webgraph/explore/Graph.java create mode 100644 
src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java create mode 100755 src/script/webgraph_ranking/graph_explore_build_vertex_map.sh create mode 100755 src/script/webgraph_ranking/graph_explore_download_webgraph.sh create mode 100644 src/script/webgraph_ranking/graph_explore_load_graph.jsh create mode 100755 src/script/webgraph_ranking/process_webgraph_degrees.sh create mode 100644 src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java diff --git a/README.md b/README.md index 7b42ede..2c76089 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,19 @@ java -cp target/cc-webgraph-0.1-SNAPSHOT-jar-with-dependencies.jar < The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/) and [LAW](https://law.di.unimi.it/software.php) packages required to compute [PageRank](https://en.wikipedia.org/wiki/PageRank) and [Harmonic Centrality](https://en.wikipedia.org/wiki/Centrality#Harmonic_centrality). -Note that the webgraphs are usually multiple Gigabytes in size and require a sufficient Java heap size ([Java option](https://docs.oracle.com/en/java/javase/14/docs/specs/man/java.html#extra-options-for-java) `-Xmx`) for processing. + +### Javadocs + +The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser. + + +## Memory and Disk Requirements + +Note that the webgraphs are usually multiple Gigabytes in size and require for processing +- a sufficient Java heap size ([Java option](https://docs.oracle.com/en/java/javase/21/docs/specs/man/java.html#extra-options-for-java) `-Xmx`) +- enough disk space to store the graphs and temporary data. + +The exact requirements depend on the graph size and the task – graph exploration or ranking, etc. ## Construction and Ranking of Host- and Domain-Level Web Graphs @@ -49,7 +61,7 @@ The shell script is easily adapted to your needs. 
Please refer to the [LAW datas The Common Crawl webgraph data sets are announced on the [Common Crawl web site](https://commoncrawl.org/tag/webgraph/). -Instructions how to explore the webgraphs are given in the [cc-notebooks project](//github.com/commoncrawl/cc-notebooks/tree/master/cc-webgraph-statistics). +For instructions how to explore the webgraphs using the JShell please see the tutorial [Interactive Graph Exploration](./graph-exploration-README.md). For an older approach using [Jython](https://www.jython.org/) and [pyWebGraph](https://github.com/mapio/py-web-graph), see the [cc-notebooks project](//github.com/commoncrawl/cc-notebooks/tree/master/cc-webgraph-statistics). ## Credits diff --git a/graph-exploration-README.md b/graph-exploration-README.md new file mode 100644 index 0000000..31501ba --- /dev/null +++ b/graph-exploration-README.md @@ -0,0 +1,285 @@ +# Interactive Graph Exploration + +A tutorial how to interactively explore the Common Crawl webgraphs – or other graphs using the webgraph format – using the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) and the [GraphExplorer](src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java) class. + + +## Quick Start + +1. change into the "cc-webgraph" project directory, [build the cc-webgraph jar](README.md#compiling-and-packaging-java-tools) and remember the project directory using an environment variable + + ``` + $> cd .../cc-webgraph + + $> mvn clean package + + $> CC_WEBGRAPH=$PWD + ``` + +2. select a web graph you want to explore, choose a download directory and download the web graph + + ``` + $> GRAPH=cc-main-2024-feb-apr-may-domain + + $> mkdir .../my-webgraphs/$GRAPH + $> cd .../my-webgraphs/$GRAPH + ``` + + About 15 GiB disk are needed to hold all files of a domain-level webgraph. + + ``` + $> $CC_WEBGRAPH/src/script/webgraph_ranking/graph_explore_download_webgraph.sh $GRAPH + ``` + +3. Build the map from vertex label to vertex ID and vice versa. 
This allows to look up a reverse domain name (e.g. "org.commoncrawl") and get the corresponding vertex ID. + + ``` + $> $CC_WEBGRAPH/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh $GRAPH $GRAPH-vertices.txt.gz + ``` + +4. Launch the [JShell](https://docs.oracle.com/en/java/javase/21/jshell/index.html) + + ``` + $> jshell --class-path $CC_WEBGRAPH/target/cc-webgraph-*-jar-with-dependencies.jar + | Welcome to JShell -- Version 21.0.3 + | For an introduction type: /help intro + + jshell> + ``` + + Now you may play around with the JShell or load the GraphExplorer class and your graph: + + ``` + jshell> import org.commoncrawl.webgraph.explore.GraphExplorer + + jshell> GraphExplorer e = new GraphExplorer("cc-main-2024-feb-apr-may-domain") + 2024-06-23 13:38:51:084 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph + 2024-06-23 13:38:51:193 +0200 [main] INFO Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph + 2024-06-23 13:38:51:279 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) + 2024-06-23 13:38:52:356 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph + e ==> org.commoncrawl.webgraph.explore.GraphExplorer@4cc0edeb + ``` + + But for now exit the JShell + ``` + jshell> /exit + | Goodbye + ``` + + To make the loading easier, you may use the load script [graph_explore_load_graph.jsh](src/script/webgraph_ranking/graph_explore_load_graph.jsh) and pass the graph name as a Java property to the JShell via command-line option `-R-Dgraph=$GRAPH` + + ``` + $> jshell --class-path $CC_WEBGRAPH/target/cc-webgraph-*-jar-with-dependencies.jar \ + -R-Dgraph=$GRAPH \ + $CC_WEBRAPH/src/script/webgraph_ranking/graph_explore_load_graph.jsh + Loading graph cc-main-2024-feb-apr-may-domain + 2024-06-23 13:30:14:134 +0200 [main] INFO Graph - Loading graph cc-main-2024-feb-apr-may-domain.graph + 2024-06-23 13:30:14:340 +0200 [main] INFO 
Graph - Loading transpose of the graph cc-main-2024-feb-apr-may-domain-t.graph + 2024-06-23 13:30:14:439 +0200 [main] INFO Graph - Loading vertex map cc-main-2024-feb-apr-may-domain.iepm (ImmutableExternalPrefixMap) + 2024-06-23 13:30:15:595 +0200 [main] INFO Graph - Loaded graph cc-main-2024-feb-apr-may-domain.graph + + Graph cc-main-2024-feb-apr-may-domain loaded into GraphExplorer *e* + Type "e." and press to list the public methods of the class GraphExplorer + ... or "g." for the graph loaded for exploration + + ... or use one of the predefined methods: + void cn(String) + void cn(long) + void pwn() + void ls() + void ls(long) + void ls(String) + void sl() + void sl(long) + void sl(String) + + | Welcome to JShell -- Version 21.0.3 + | For an introduction type: /help intro + + jshell> + ``` + + The predefined methods are those provided by [pyWebGraph](https://github.com/mapio/py-web-graph). + + ``` + jshell> cn("org.commoncrawl") + #111997321 org.commoncrawl + + jshell> pwn() + #111997321 org.commoncrawl + + jshell> ls() // list successors (vertices linked from the domain commoncrawl.org or one of its subdomains) + + jshell> sl() // list predecessors (vertices connected via incoming links) + ``` + + +## Using the Java Classes + +The Java classes "GraphExplorer" and "Graph" bundle a set of methods which help exploring the graphs: +- load the webgraph, its transpose and the vertex map +- access the vertices and their successors or predecessors +- utilities to import or export a list of vertices or counts from or into a file + +The methods are bundled in the classes of the Java package `org.commoncrawl.webgraph.explore`. To get an overview over all provided methods, inspect the source code or see the section [Javadocs](README.md#javadocs) in the main README for how to read the Javadocs. Here only few examples are presented. 
+ +We start again with launching the JShell and loading a webgraph: + +``` +$> jshell --class-path $CC_WEBGRAPH/target/cc-webgraph-*-jar-with-dependencies.jar \ + -R-Dgraph=$GRAPH \ + $CC_WEBRAPH/src/script/webgraph_ranking/graph_explore_load_graph.jsh +jshell> +``` + +Two classes are already instantiated – the *GraphExplorer* `e` and the *Graph* `g`, the former holds a reference to the latter: + +``` +jshell> /vars +| String graph = "cc-main-2024-feb-apr-may-domain" +| GraphExplorer e = org.commoncrawl.webgraph.explore.GraphExplorer@7dc7cbad +| Graph g = org.commoncrawl.webgraph.explore.Graph@4f933fd1 + +jshell> e.getGraph() +$45 ==> org.commoncrawl.webgraph.explore.Graph@4f933fd1 +``` + +First, the vertices in the webgraphs are represented by numbers. So, we need to translage between vertex label and ID: + +``` +jshell> g.vertexLabelToId("org.wikipedia") +$46 ==> 115107569 + +jshell> g.vertexIdToLabel(115107569) +$47 ==> "org.wikipedia" +``` + +One important note: Common Crawl's webgraphs list the host or domain names in [reverse domain name notation](https://en.wikipedia.org/wiki/Reverse_domain_name_notation). The vertex lists are sorted by the reversed names in lexicographic order and then numbered continuously. This gives a close-to-perfect compression of the webgraphs itself. Most of the arcs are close in terms of locality because subdomains or sites of the same region (by country-code top-level domain) are listed in one continous block. Cf. the paper [The WebGraph Framework I: Compression Techniques](https://vigna.di.unimi.it/ftp/papers/WebGraphI.pdf) by Paolo Boldi and Sebastiano Vigna. + +Now, let's look how many other domains are linked from Wikipedia? + +``` +jshell> g.outdegree("org.wikipedia") +$46 ==> 2106338 +``` + +Another note: Common Crawl's webgraphs are based on sample crawls of the web. Same as the crawls, also the webgraphs are not complete and the Wikipedia may in reality link to far more domains. 
But 2 million linked domains is already not a small sample. + +The Graph class also gives you access to the successors of a vertex, as array or stream of integers, but also as stream of strings (vertex labels): + +``` +jshell> g.successors("org.wikipedia").length +$48 ==> 2106338 + +jshell> g.successorIntStream("org.wikipedia").count() +$49 ==> 2106338 + +jshell> g.successorStream("org.wikipedia").limit(10).forEach(System.out::println) +abb.global +abb.nic +abbott.cardiovascular +abbott.globalpointofcare +abbott.molecular +abbott.pk +abc.www +abudhabi.gov +abudhabi.mediaoffice +abudhabi.tamm +``` + +Using Java streams it's easy to translate between the both representations: + +``` +jshell> g.successorIntStream("org.wikipedia").limit(5).mapToObj(i -> g.vertexIdToLabel(i)).forEach(System.out::println) +abb.global +abb.nic +abbott.cardiovascular +abbott.globalpointofcare +abbott.molecular +``` + +Successors represent outgoing links to other domains. We can do the same for predecsors, that is incoming links from other domains: + +``` +jshell> g.indegree("org.wikipedia") +$50 ==> 2752391 + +jshell> g.predecessorIntStream("org.wikipedia").count() +$51 ==> 2752391 + +jshell> g.predecessorStream("org.wikipedia").limit(5).forEach(System.out::println) +abogado.fabiobalbuena +abogado.jacksonville +abogado.jaskot +abogado.super +ac.789bet +``` + +Technically, webgraphs only store successor lists. But the Graph class holds also two graphs: the "original" one and its transpose. In the transposed graph "successors" are "predecessors", and "outdegree" means "indegree". 
Some methods on a deeper level take one of the two webgraphs as argument, here it makes a difference whether you pass `g.graph` or `g.graphT`, here to a method which translates vertex IDs to labels and extracts the top-level domain: + +``` +jshell> g.successorTopLevelDomainStream(g.graph, g.vertexLabelToId("org.wikipedia")).limit(5).forEach(System.out::println) +abb +abb +abbott +abbott +abbott + +jshell> g.successorTopLevelDomainStream(g.graphT, g.vertexLabelToId("org.wikipedia")).limit(5).forEach(System.out::println) +abogado +abogado +abogado +abogado +ac +``` + +The top-level domains repeat, and you may want to count the occurrences and create a frequency list. There is a predefined method to perform this: + +``` +jshell> g.successorTopLevelDomainCounts("org.wikipedia").filter(e -> e.getKey().startsWith("abb")).forEach(e -> System.out.printf("%8d\t%s\n", e.getValue(), e.getKey())) + 4 abbott + 2 abb + +jshell> g.successorTopLevelDomainCounts("org.wikipedia").limit(10).forEach(e -> System.out.printf("%8d\t%s\n", e.getValue(), e.getKey())) + 706707 com + 213406 org + 117042 de + 86684 net + 65906 ru + 55914 fr + 53628 uk + 52828 it + 51622 jp + 33729 br +``` + +The same can be done for predecessors using the method "Graph::predecessorTopLevelDomainCounts". + +Dealing with large successor or predecessor lists can be painful and viewing them in a terminal window is practically impossible. We've already discussed how to compress the list to top-level domain counts. Alternatively, you could select the labels by prefix... + +``` +jshell> g.successorStream("org.wikipedia", "za.org.").limit(10).forEach(System.out::println) +za.org.61mech +za.org.aadp +za.org.aag +za.org.abc +za.org.acaparty +za.org.acbio +za.org.accord +za.org.acd +za.org.acdp +za.org.acjr +``` + +... but even then the list may be huge. 
Then the best option is to write the stream output (vertex labels or top-level domain frequencies) into a file and view it later using a file viewer or use any other tool for further processing: + +``` +jshell> e.saveVerticesToFile(g.successors("org.wikipedia"), "org-wikipedia-successors.txt") + +jshell> e.saveCountsToFile(g.successorTopLevelDomainCounts("org.wikipedia"), "org-wikipedia-successors-tld-counts.txt") +``` + +## Final Remarks + +We hope these few examples will support either to have fun exploring the graphs or to develop your own pipeline to extract insights from the graphs. + +Finally, thanks to the authors of the [WebGraph framework](https://webgraph.di.unimi.it/) and of [pyWebGraph](https://github.com/mapio/py-web-graph) for their work on these powerful tools and for any inspiration taken into these examples. diff --git a/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java b/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java new file mode 100644 index 0000000..328e9cd --- /dev/null +++ b/src/main/java/org/commoncrawl/webgraph/CountingMergedIntIterator.java @@ -0,0 +1,135 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright (C) 2024 Common Crawl and contributors + */ +package org.commoncrawl.webgraph; + +import java.util.PriorityQueue; + +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.webgraph.LazyIntIterator; +import it.unimi.dsi.webgraph.LazyIntIterators; + +/** + * An iterator counting the integers returned by multiple + * {@link LazyIntIterator}s. The input iterators must return integers in a + * monotonically non-decreasing order. The resulting iterator returns the + * unified input integers in strictly non-decreasing order. The method + * {@link getCount()} is used to access the count of the integer returned last + * by {@link nextInt()}. The count equals the number of times any of the + * iterators returned the current integer value. 
See also + * {@link it.unimi.dsi.webgraph.MergedIntIterator}. + */ +public class CountingMergedIntIterator implements IntIterator { + + protected class QueuedIterator implements Comparable { + LazyIntIterator iter; + int value; + + public QueuedIterator(LazyIntIterator iterator) { + iter = iterator; + value = iterator.nextInt(); + } + + @Override + public int compareTo(QueuedIterator o) { + if (value < o.value) { + return -1; + } + if (value > o.value) { + return 1; + } + return 0; + } + } + + public static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt(); + + private final PriorityQueue iters = new PriorityQueue<>(); + private int currentCount = 0; + + /** + * @param iterators input iterators + */ + public CountingMergedIntIterator(LazyIntIterator... iterators) { + for (final LazyIntIterator iter : iterators) { + final QueuedIterator qiter = new QueuedIterator(iter); + if (qiter.value != LAZY_INT_ITERATOR_EMPTY_VALUE) { + iters.add(qiter); + } + } + } + + /** + * {@inheritDoc} + */ + @Override + public boolean hasNext() { + return iters.size() > 0; + } + + /** + * {@inheritDoc} + * + * @deprecated Please use {@link nextInt()} instead. 
+ */ + @Deprecated + @Override + public Integer next() { + return Integer.valueOf(nextInt()); + } + + /** + * {@inheritDoc} + */ + @Override + public int nextInt() { + QueuedIterator qiter = iters.peek(); + final int value = qiter.value; + int count = 1; + while (true) { + iters.remove(); + int val; + while ((val = qiter.iter.nextInt()) == value) { + count++; + } + if (val != LAZY_INT_ITERATOR_EMPTY_VALUE) { + qiter.value = val; + iters.add(qiter); + } + if (iters.isEmpty()) { + break; + } + qiter = iters.peek(); + if (qiter.value == value) { + count++; + } else { + break; + } + } + currentCount = count; + return value; + } + + /** + * @return the count how often the last integer (returned by {@link nextInt()}) + * was seen in the input iterators + */ + public int getCount() { + return currentCount; + } + + /** + * {@inheritDoc} + */ + @Override + public int skip(int n) { + int i = 0; + while (i < n && hasNext()) { + nextInt(); + i++; + } + return i; + } + +} diff --git a/src/main/java/org/commoncrawl/webgraph/explore/Graph.java b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java new file mode 100644 index 0000000..6a96ce2 --- /dev/null +++ b/src/main/java/org/commoncrawl/webgraph/explore/Graph.java @@ -0,0 +1,403 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright (C) 2024 Common Crawl and contributors + */ +package org.commoncrawl.webgraph.explore; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.AbstractMap.SimpleEntry; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.PrimitiveIterator; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import org.commoncrawl.webgraph.CountingMergedIntIterator; +import org.commoncrawl.webgraph.HostToDomainGraph; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
crawlercommons.domains.EffectiveTldFinder; +import it.unimi.dsi.fastutil.io.BinIO; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.lang.MutableString; +import it.unimi.dsi.sux4j.mph.GOV4Function; +import it.unimi.dsi.util.FrontCodedStringList; +import it.unimi.dsi.util.ImmutableExternalPrefixMap; +import it.unimi.dsi.util.Interval; +import it.unimi.dsi.util.ShiftAddXorSignedStringMap; +import it.unimi.dsi.webgraph.ImmutableGraph; +import it.unimi.dsi.webgraph.LazyIntIterator; +import it.unimi.dsi.webgraph.LazyIntIterators; + +/** + * Holds webgraph-related data structures and access methods for graph + * exploration. + */ +public class Graph { + + private static Logger LOG = LoggerFactory.getLogger(Graph.class); + + /** The base name of the graph */ + public String name; + /** The graph */ + public ImmutableGraph graph; + /** The transpose of the graph */ + public ImmutableGraph graphT; + + /* Maps to translate between vertex label an ID */ + protected ImmutableExternalPrefixMap vertexMap; + protected FrontCodedStringList vertexMapFcl; + protected ShiftAddXorSignedStringMap vertexMapSmph; + protected GOV4Function vertexMapMph; + + private static int LAZY_INT_ITERATOR_EMPTY_VALUE = LazyIntIterators.EMPTY_ITERATOR.nextInt(); + + public Graph(String name) throws Exception { + this.name = name; + try { + LOG.info("Loading graph {}.graph", name); + graph = ImmutableGraph.loadMapped(name); + LOG.info("Loading transpose of the graph {}-t.graph", name); + graphT = ImmutableGraph.loadMapped(name + "-t"); + if (Files.exists(Paths.get(name + ".iepm"))) { + LOG.info("Loading vertex map {}.iepm (ImmutableExternalPrefixMap)", name); + vertexMap = (ImmutableExternalPrefixMap) BinIO.loadObject(name + ".iepm"); + } else if (Files.exists(Paths.get(name + ".fcl"))) { + LOG.info("Loading vertex map {}.fcl (FrontCodedStringList, maps vertex IDs to labels)", name); + vertexMapFcl = (FrontCodedStringList) BinIO.loadObject(name + ".fcl"); + if 
(Files.exists(Paths.get(name + ".smph"))) { + LOG.info("Loading vertex map {}.smph (string map perfect hash, maps vertex labels to IDs)", name); + vertexMapSmph = (ShiftAddXorSignedStringMap) BinIO.loadObject(name + ".smph"); + } else if (Files.exists(Paths.get(name + ".mph"))) { + LOG.info("Loading vertex map {}.mph (minimal perfect hash, maps vertex labels to IDs)", name); + vertexMapMph = (GOV4Function) BinIO.loadObject(name + ".mph"); + LOG.warn( + "Using a minimal perfect hash as vertex map does not allow to verify that a vertex label exists. " + + "Non-existant labels are mapped to quasi-random IDs."); + } else { + LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); + } + } else { + LOG.error("No vertex mapping found, cannot translate from vertex names to IDs."); + } + } catch (IOException | ClassNotFoundException e) { + LOG.error("Failed to load graph {}:", name, e); + throw e; + } + LOG.info("Loaded graph {}.graph", name); + } + + public String vertexIdToLabel(long id) { + if (vertexMap != null) { + return vertexMap.list().get((int) id).toString(); + } else { + return vertexMapFcl.get((int) id).toString(); + } + } + + public long vertexLabelToId(String label) { + if (vertexMap != null) { + return vertexMap.getLong(label); + } else if (vertexMapSmph != null) { + return vertexMapSmph.getLong(label); + } else if (vertexMapMph != null) { + return vertexMapMph.getLong(label); + } else { + throw new RuntimeException("No vertex map loaded."); + } + } + + public int outdegree(long vertexId) { + return graph.outdegree((int) vertexId); + } + + public int outdegree(String vertexLabel) { + return graph.outdegree((int) vertexLabelToId(vertexLabel)); + } + + public int indegree(long vertexId) { + return graphT.outdegree((int) vertexId); + } + + public int indegree(String vertexLabel) { + return graphT.outdegree((int) vertexLabelToId(vertexLabel)); + } + + public int[] successors(long vertexId) { + return graph.successorArray((int) vertexId); 
+ } + + public int[] successors(String vertexLabel) { + return graph.successorArray((int) vertexLabelToId(vertexLabel)); + } + + public Stream successorStream(String vertexLabel) { + return successorStream(graph, vertexLabelToId(vertexLabel)); + } + + public IntStream successorIntStream(String vertexLabel) { + return successorIntStream(graph, vertexLabelToId(vertexLabel)); + } + + public Stream successorStream(String vertexLabel, String prefix) { + return successorStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); + } + + public IntStream successorIntStream(String vertexLabel, String prefix) { + return successorIntStream(graph, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); + } + + public Stream> successorTopLevelDomainCounts(String vertexLabel) { + return successorTopLevelDomainCounts(graph, vertexLabelToId(vertexLabel)); + } + + public Stream successorStream(ImmutableGraph graph, long vertexId) { + return successorIntStream(graph, vertexId).mapToObj(i -> vertexIdToLabel(i)); + } + + public IntStream successorIntStream(ImmutableGraph graph, long vertexId) { + return Arrays.stream(graph.successorArray((int) vertexId)); + } + + private Stream successorStream(ImmutableGraph graph, long vertexId, Interval interval) { + return successorIntStream(graph, vertexId, interval).mapToObj(i -> vertexIdToLabel(i)); + } + + public IntStream successorIntStream(ImmutableGraph graph, long vertexId, Interval interval) { + return Arrays.stream(graph.successorArray((int) vertexId)).filter(x -> (interval.compareTo(x) == 0)); + } + + public Stream successorTopLevelDomainStream(ImmutableGraph graph, long vertexId) { + return Arrays.stream(graph.successorArray((int) vertexId)).mapToObj(i -> getTopLevelDomain(vertexIdToLabel(i))); + } + + public Stream> successorTopLevelDomainCounts(ImmutableGraph graph, long vertexId) { + if (vertexMap != null) { + /* + * speed up if we have a prefix map, utilizing the fact that vertex labels are + * 
lexicographically sorted by reversed domain name + */ + List> res = new LinkedList<>(); + LazyIntIterator iter = graph.successors((int) vertexId); + int curr = iter.nextInt(); + while (curr != LAZY_INT_ITERATOR_EMPTY_VALUE) { + final MutableString currLabel = vertexMap.list().get(curr); + final int pos = currLabel.indexOf('.'); + final MutableString tldPrefix; + final String tld; + if (pos > -1 && (pos + 1) < currLabel.length()) { + tldPrefix = currLabel.substring(0, pos + 1); + tld = tldPrefix.substring(0, pos).toString(); + } else { + tldPrefix = currLabel; + tld = currLabel.toString(); + } + long count = 1; + final Interval interval = vertexMap.getInterval(tldPrefix); + int next; + while ((next = iter.nextInt()) != LAZY_INT_ITERATOR_EMPTY_VALUE) { + if (next > interval.right) { + break; + } + count++; + } + curr = next; + res.add(new SimpleEntry<>(tld, count)); + } + return res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue())); + } + return GraphExplorer.frequencies(successorTopLevelDomainStream(graph, vertexId)); + } + + public Stream> topLevelDomainCounts(IntStream vertexIds) { + if (vertexMap != null) { + List> res = new LinkedList<>(); + PrimitiveIterator.OfInt iter = vertexIds.iterator(); + if (iter.hasNext()) { + int curr = iter.nextInt();; + do { + final MutableString currLabel = vertexMap.list().get(curr); + final int pos = currLabel.indexOf('.'); + final MutableString tldPrefix; + final String tld; + if (pos > -1 && (pos + 1) < currLabel.length()) { + tldPrefix = currLabel.substring(0, pos + 1); + tld = tldPrefix.substring(0, pos).toString(); + } else { + tldPrefix = currLabel; + tld = currLabel.toString(); + } + long count = 1; + final Interval interval = vertexMap.getInterval(tldPrefix); + int next = -1; + while (iter.hasNext()) { + next = iter.nextInt(); + if (next > interval.right) { + break; + } + count++; + } + curr = next; + res.add(new SimpleEntry<>(tld, count)); + } while (curr > -1); + } + return 
res.stream().sorted(Collections.reverseOrder(Map.Entry.comparingByValue())); + } + return GraphExplorer.frequencies(vertexIds.mapToObj(i -> Graph.getTopLevelDomain(vertexIdToLabel(i)))); + } + + public int[] predecessors(long vertexId) { + return graphT.successorArray((int) vertexId); + } + + public int[] predecessors(String vertexLabel) { + return graphT.successorArray((int) vertexLabelToId(vertexLabel)); + } + + public Stream predecessorStream(String vertexLabel) { + return successorStream(graphT, vertexLabelToId(vertexLabel)); + } + + public IntStream predecessorIntStream(String vertexLabel) { + return successorIntStream(graphT, vertexLabelToId(vertexLabel)); + } + + public Stream predecessorStream(String vertexLabel, String prefix) { + return successorStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); + } + + public IntStream predecessorIntStream(String vertexLabel, String prefix) { + return successorIntStream(graphT, vertexLabelToId(vertexLabel), vertexMap.getInterval(prefix)); + } + + public Stream> predecessorTopLevelDomainCounts(String vertexLabel) { + return successorTopLevelDomainCounts(graphT, vertexLabelToId(vertexLabel)); + } + + public long[] sharedPredecessors(long[] vertices) { + return sharedPredecessors(vertices, vertices.length, vertices.length); + } + + public long[] sharedPredecessors(long[] vertices, int minShared, int maxShared) { + return sharedSuccessors(graphT, vertices, minShared, maxShared); + } + + public long[] sharedSuccessors(long[] vertices) { + return sharedSuccessors(vertices, vertices.length, vertices.length); + } + + public long[] sharedSuccessors(long[] vertices, int minShared, int maxShared) { + return sharedSuccessors(graph, vertices, minShared, maxShared); + } + + /** + * Get shared successors (children) of all {@code vertices} in a {@code graph}. 
+ * The parameters {@code minShared} and {@code maxShared} allow to select the + * intersection, the union or a subset with a specific overlap (shared + * successors). If vertex a has the successors d, e, vertex + * b has d, f and vertex c has d, e, g, then + *
+ * <ul>
+ * <li>{@code minShared} = {@code maxShared} = {@code vertices.length} returns
+ * the intersection (d)</li>
+ * <li>{@code minShared} = 1 and {@code maxShared} = {@code vertices.length}
+ * returns the union (d, e, f)</li>
+ * <li>{@code minShared} = {@code maxShared} = 2 returns all successors shared
+ * by exactly two of the {@code vertices} (e)</li>
+ * </ul>
+ * + * @param graph the graph used to access the successors of a vertex (the + * transpose of the graph will give the predecessors of the + * vertex) + * @param vertices list of vertex IDs + * @param minShared the minimum number of shared links to successors + * @param maxShared the minimum number of shared links to successors + * @return shared successors + */ + public long[] sharedSuccessors(ImmutableGraph graph, long[] vertices, int minShared, int maxShared) { + LazyIntIterator[] iters = new LazyIntIterator[vertices.length]; + for (int i = 0; i < vertices.length; i++) { + iters[i] = graph.successors((int) vertices[i]); + } + CountingMergedIntIterator iter = new CountingMergedIntIterator(iters); + LongArrayList res = new LongArrayList(); + int id; + while (iter.hasNext()) { + id = iter.nextInt(); + if (iter.getCount() >= minShared && iter.getCount() <= maxShared) { + res.add(id); + } + } + res.trim(); + return res.elements(); + } + + public static String getTopLevelDomain(String reversedDomainName) { + int dot = reversedDomainName.indexOf('.'); + if (dot < reversedDomainName.length()) { + return reversedDomainName.substring(0, dot); + } + return reversedDomainName; + } + + /** + * Get the registered domain for a host name based on the ICANN section of the + * public suffix list. + * + * @see EffectiveTldFinder + * + * @param hostName host name, e.g. www.example.org.uk + * @param strict if true return null instead of hostName if no + * valid public suffix is detected + * @return the domain name below the public suffix, e.g. + * example.org.uk + */ + public static String getRegisteredDomain(String hostName, boolean strict) { + return EffectiveTldFinder.getAssignedDomain(hostName, strict, true); + } + + /** + * Get the registered domain for a host name, both in + * reverse + * domain name notation. + * + * @see #getRegisteredDomain(String, boolean) + * + * @param reversedHostName host name in reverse domain name notation, e.g. 
+	 *                         uk.org.example.www
+	 * @param strict           if true return null instead of
+	 *                         reversedHostName if no valid public
+	 *                         suffix is detected
+	 * @return the domain name below the public suffix, e.g.
+	 *         uk.org.example (in reverse domain name notation)
+	 */
+	public static String getRegisteredDomainReversed(String reversedHostName, boolean strict) {
+		String hostName = reverseDomainName(reversedHostName);
+		String domainName = getRegisteredDomain(hostName, strict);
+		if (strict && domainName == null) {
+			return null;
+		} else if (hostName.equals(domainName)) {
+			return reversedHostName;
+		}
+		return reverseDomainName(domainName);
+	}
+
+	/**
+	 * Reverse or "unreverse" a host/domain name: com.example.www is
+	 * reversed to www.example.com and vice versa.
+	 *
+	 * @param reversedDomainName domain name
+	 * @return domain name with reverse
+	 *         domain name notation (un)applied
+	 */
+	private static String reverseDomainName(String reversedDomainName) {
+		return HostToDomainGraph.reverseHost(reversedDomainName);
+	}
+}
diff --git a/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
new file mode 100644
index 0000000..4df9dcc
--- /dev/null
+++ b/src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
@@ -0,0 +1,295 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ * Copyright (C) 2024 Common Crawl and contributors
+ */
+package org.commoncrawl.webgraph.explore;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Map.Entry;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.LongStream;
+import java.util.stream.Stream;
+
+import org.commoncrawl.webgraph.CountingMergedIntIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import
it.unimi.dsi.webgraph.LazyIntIterator; + +/** + * Utility class for graph exploration: load and hold all required web graph + * data structures, provided methods to interactively explore the graph. + */ +public class GraphExplorer { + + private static Logger LOG = LoggerFactory.getLogger(GraphExplorer.class); + + public class Vertex { + private long id; + private String label; + + public Vertex(String label) { + this.label = label; + id = g.vertexLabelToId(label); + } + + public Vertex(long id) { + this.id = id; + label = g.vertexIdToLabel(id); + } + + @Override + public String toString() { + return "#" + id + "\t" + label; + } + + public int outdegree() { + return g.outdegree((int) id); + } + + public int indegree() { + return g.indegree((int) id); + } + + public int[] successors() { + return g.graph.successorArray((int) id); + } + + public int[] predecessors() { + return g.graphT.successorArray((int) id); + } + } + + private Graph g = null; + private Vertex v = null; + + public GraphExplorer(String name) throws Exception { + g = new Graph(name); + } + + public Graph getGraph() { + return g; + } + + public Vertex getVertex(String vertexLabel) { + return new Vertex(vertexLabel); + } + + public Vertex getVertex(long vertexId) { + return new Vertex(vertexId); + } + + public void setVertex(String vertexLabel) { + v = getVertex(vertexLabel); + } + + public void setVertex(long vertexId) { + v = getVertex(vertexId); + } + + /* Reimplementation of commands provided by pywebgraph (cn, pwn, ls, sl) */ + + /** + * Change the current working node / vertex. + * + * @param vertexLabel vertex label (node name) + */ + public void cn(String vertexLabel) { + setVertex(vertexLabel); + pwn(); + } + + /** + * Change the current working node / vertex. + * + * @param vertexId vertex ID + */ + public void cn(long vertexId) { + setVertex(vertexId); + pwn(); + } + + /** + * Print the current working node / vertex. 
+ */ + public void pwn() { + if (v == null) { + throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); + } + print(v.toString()); + } + + /** + * Print the successors (outgoing links) of the current working node / vertex. + */ + public void ls() { + if (v == null) { + throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); + } + ls(v.id); + } + + /** + * Print the successors (outgoing links) of a vertex. + * + * @param vertexId vertex ID + */ + public void ls(long vertexId) { + printVertices(g.graph.successors((int) vertexId)); + } + + /** + * Print the successors (outgoing links) of a vertex. + * + * @param vertexLabel vertex label / vertex name + */ + public void ls(String vertexLabel) { + ls(g.vertexLabelToId(vertexLabel)); + } + + /** + * Print the predecessors (incoming links) of the current working node / vertex. + */ + public void sl() { + if (v == null) { + throw new NullPointerException("Current working node not set, use cn(...) to define the working node."); + } + sl(v.id); + } + + /** + * Print the predecessors (incoming links) of a vertex. + * + * @param vertexId vertex ID + */ + public void sl(long vertexId) { + printVertices(g.graphT.successors((int) vertexId)); + } + + /** + * Print the predecessors (incoming links) of a vertex. 
+ * + * @param vertexLabel vertex label / vertex name + */ + public void sl(String vertexLabel) { + sl(g.vertexLabelToId(vertexLabel)); + } + + /* Utilities */ + + public long[] loadVerticesFromFile(String fileName) { + try (Stream in = Files.lines(Paths.get(fileName), StandardCharsets.UTF_8)) { + return in.mapToLong(label -> g.vertexLabelToId(label)).filter(id -> id > -1).toArray(); + } catch (IOException e) { + LOG.error("Failed to load vertices from file {}", fileName, e); + } + return new long[0]; + } + + public void saveVerticesToFile(long[] vertexIDs, String fileName) { + try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, + StandardCharsets.UTF_8)) { + Arrays.stream(vertexIDs).forEach(id -> out.println(g.vertexIdToLabel(id))); + } catch (IOException e) { + LOG.error("Failed to write vertices to file {}", fileName, e); + } + } + + public void saveVerticesToFile(int[] vertexIDs, String fileName) { + saveVerticesToFile(Arrays.stream(vertexIDs), fileName); + } + + public void saveVerticesToFile(IntStream vertexIDs, String fileName) { + try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, + StandardCharsets.UTF_8)) { + vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id))); + } catch (IOException e) { + LOG.error("Failed to write vertices to file {}", fileName, e); + } + } + + public void saveVerticesToFile(LongStream vertexIDs, String fileName) { + try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, + StandardCharsets.UTF_8)) { + vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id))); + } catch (IOException e) { + LOG.error("Failed to write vertices to file {}", fileName, e); + } + } + + public void saveToFile(Stream strings, String fileName) { + try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, + StandardCharsets.UTF_8)) { + strings.forEach(out::println); + } catch (IOException e) { + 
LOG.error("Failed to write strings to file {}", fileName, e); + } + } + + public void saveCountsToFile(Stream> counts, String fileName) { + try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false, + StandardCharsets.UTF_8)) { + counts.forEach(c -> { + out.print(c.getValue()); + out.print('\t'); + out.print(c.getKey()); + out.print('\n'); + }); + } catch (IOException e) { + LOG.error("Failed to write counts to file {}", fileName, e); + } + } + + private void print(String s) { + System.out.println(s); + } + + public void printVertices(LazyIntIterator it) { + int next = it.nextInt(); + int i = 0; + while (next != CountingMergedIntIterator.LAZY_INT_ITERATOR_EMPTY_VALUE) { + print(String.format("%d: %s", i, (new Vertex(next)).toString())); + next = it.nextInt(); + i++; + } + } + + public void printVertices(long[] vertexIDs) { + int i = 0; + for (long id : vertexIDs) { + print(String.format("%d: %s", i, (new Vertex(id)).toString())); + i++; + } + } + + public void printVertices(int[] vertexIDs) { + int i = 0; + for (long id : vertexIDs) { + print(String.format("%d: %s", i, (new Vertex(id)).toString())); + i++; + } + } + + /** + * Count strings in a stream. Sort the resulting string-count pairs by + * decreasing count (frequency) and secondarily by string in lexicographic + * order. 
+ * + * @param strings stream of strings + * @return stream of pairs {@code } + */ + public static Stream> frequencies(Stream strings) { + final Comparator> comp = Comparator.comparingLong((Entry e) -> e.getValue()) + .reversed().thenComparing(Comparator.comparing((Entry e) -> e.getKey())); + return strings.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet().stream() + .sorted(comp); + } +} diff --git a/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh b/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh new file mode 100755 index 0000000..aa9e482 --- /dev/null +++ b/src/script/webgraph_ranking/graph_explore_build_vertex_map.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# Build node indexes to interactively explore a Common Crawl webgraph. +# The webgraph files are expected to be placed in the current directory. + +NAME="$1" +VERTICES="$2" +if ! shift 2; then + echo "$(basename $0) " + echo + echo "Build node indexes to interactively explore a Common Crawl webgraph." + echo "The webgraph files are expected to be placed in the current directory." + echo + echo " basename of the graph (without the .graph suffix)" + echo " vertices file name (including the file suffix)" + echo " or directory containing the vertices files" + echo + exit 1 +fi + +export LC_ALL=C + +BIN="$(dirname $0)" +WG="$BIN/run_webgraph.sh" + +declare -A suffix_name_map +suffix_name_map=( + graph "webgraph / BVGraph" + properties "webgraph properties" + offsets "webgraph offsets" + iepm "immutable external prefix map" + mph "minimal perfect hash" + fcl "front coded list" + smph "string map perfect hash" +) + +function list_webgraph_files() { + name="$1"; shift + ok=true + for suffix in "$@"; do + if [ -e $name.$suffix ]; then + printf " .%-10s : %-20s (%s)\n" "$suffix" \ + "${suffix_name_map[$suffix]}" "$name.$suffix" + else + echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})" + ok=false + fi + done + if ! 
$ok; then + exit 1 + fi +} + +function index_status() { + echo + echo "Prepared webgraph $NAME for look-ups by node label." + echo "The following files (by file suffix) will be used:" + + echo "Webgraph:" + list_webgraph_files $NAME graph properties offsets + echo "Webgraph (transpose):" + list_webgraph_files $NAME-t graph properties offsets + + echo "Mapping vertex labels to vertex IDs:" + if [ -e $NAME.iepm ]; then + list_webgraph_files $NAME iepm + else + list_webgraph_files $NAME mph fcl smph + fi +} + + +# check for graph files (.graph and .properties), also for the +# transpose of the graph ($NAME-t.$suffix) +echo "Verifying webgraph files:" +list_webgraph_files $NAME graph properties +echo "Verifying webgraph files (transpose of the graph):" +list_webgraph_files $NAME-t graph properties + +# check for the vertices file +if ! [ -e $VERTICES ]; then + echo "Vertices file not found" + exit 1 +fi + + +# generate offsets +if ! [ -e $NAME.offsets ]; then + "$WG" it.unimi.dsi.webgraph.BVGraph -O -L $NAME + echo "webgraph offsets file created" +fi +if ! 
[ -e $NAME-t.offsets ]; then + "$WG" it.unimi.dsi.webgraph.BVGraph -O -L $NAME-t + echo "webgraph offsets file created (transpose of the graph)" +fi + + +# building `iepm` "immutable external prefix map" +# (https://dsiutils.di.unimi.it/docs/it/unimi/dsi/util/ImmutableExternalPrefixMap.html) +# mapping back and forth node names to numbers +if [ -e $NAME.iepm ]; then + index_status + exit 0 +fi +CAT_VERTICES="zcat $VERTICES" +if [ -d $VERTICES ]; then + # host-level webgraph, multiple vertex files + CAT_VERTICES="zcat $VERTICES/*.txt.gz" +fi +if (set -eo pipefail; + eval $CAT_VERTICES \ + | cut -f2 \ + | "$WG" it.unimi.dsi.util.ImmutableExternalPrefixMap -b4Ki $NAME.iepm); then + echo "immutable external prefix map successfully built: $NAME.iepm" + index_status + exit 0 +fi +# Note: building the `iepm` may fail for older versions of the domain +# graph (before the graphs of May, June/July and August 2022) because +# the nodes were not properly lexicographically sorted while folding +# host names to domain names. If this is the case, continue to create +# instead mappings which do not depend on proper sorting. + +# build +# - the `mph` (minimal perfect hash) file mapping from node label +# (reversed domain name) to node ID +# - a front coded list to map node IDs to node labels +if ! [ -e $NAME.mph ] || ! [ -e $NAME.fcl ]; then + zcat $VERTICES \ + | cut -f2 \ + | tee >("$WG" it.unimi.dsi.sux4j.mph.GOV4Function $NAME.mph) \ + | "$WG" it.unimi.dsi.util.FrontCodedStringList -u -r 32 $NAME.fcl +fi + +# build the `smph` file (string map perfect hash) required to +# determine whether a node label is present in the `mph` file +if ! 
[ -e $NAME.smph ]; then + zcat $VERTICES \ + | cut -f2 \ + | "$WG" it.unimi.dsi.util.ShiftAddXorSignedStringMap $NAME.mph $NAME.smph +fi + + +index_status diff --git a/src/script/webgraph_ranking/graph_explore_download_webgraph.sh b/src/script/webgraph_ranking/graph_explore_download_webgraph.sh new file mode 100755 index 0000000..fdc11f5 --- /dev/null +++ b/src/script/webgraph_ranking/graph_explore_download_webgraph.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +NAME="$1" +if ! shift 1; then + echo "$(basename $0) " + echo + echo "Download all files required to interactively explore a Common Crawl webgraph." + echo "The downloaded files are placed in the current directory." + echo "Wget or curl are required for downloading" + echo + echo " webgraph base name without file suffix, eg. cc-main-2023-mar-may-oct-domain" + echo + exit 1 +fi + +export LC_ALL=C + +BIN="$(dirname $0)" + +USING_CURL=false +USING_WGET=false +if command -v curl &>/dev/null; then + USING_CURL=true +elif command -v wget &>/dev/null; then + USING_WGET=true +else + echo "Either curl or wget are required for downloading" >&2 + exit 1 +fi + +declare -A suffix_name_map +suffix_name_map=( + graph "webgraph / BVGraph" + properties "webgraph properties" + offsets "webgraph offsets" + stats "webgraph statistics" + txt.gz "text file (vertex labels)" +) + +function list_webgraph_files() { + name="$1"; shift + ok=true + for suffix in "$@"; do + if [ -e $name.$suffix ]; then + printf " .%-10s : %-20s (%s)\n" "$suffix" \ + "${suffix_name_map[$suffix]}" "$name.$suffix" + elif [ -d "$name" ] && [[ "$suffix" =~ ^\*. ]]; then + ls "$name"/* | sed 's/^/\t/' + else + echo -e "Missing $name.$suffix (${suffix_name_map[$suffix]})" + ok=false + fi + done + if ! 
$ok; then + exit 1 + fi +} + +function download_file() { + FILE="$1" + if [ -e "$FILE" ]; then + return # already done + fi + URL="https://data.commoncrawl.org/projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/$FILE" + echo "Downloading $URL" + + if $USING_CURL; then + + curl --silent --remote-time -o "$FILE" --time-cond "$FILE" --continue-at - "$URL" + + elif $USING_WGET; then + + if [ "$(dirname "$FILE")" == "." ]; then + wget --continue --timestamping "$URL" + else + wget --continue --timestamping --directory-prefix="$(dirname "$FILE")" "$URL" + fi + + fi +} + +function download_files() { + name="$1"; shift + for suffix in "$@"; do + download_file "$name.$suffix" + done +} + + +BASE_NAME="${NAME%-domain}" +BASE_NAME="${BASE_NAME%-host}" +GRAPH_AGGR_LEVEL="${NAME##*-}" + + +download_files "$NAME" graph properties stats +download_files "$NAME-t" graph properties + +if [ $GRAPH_AGGR_LEVEL == "domain" ]; then + download_files "$NAME-vertices" txt.gz +else + download_files "$NAME-vertices" paths.gz + zcat "$NAME-vertices".paths.gz \ + | while read path; do + file=${path#projects/hyperlinkgraph/$BASE_NAME/$GRAPH_AGGR_LEVEL/} + mkdir -p $(dirname "$file") + download_file "$file" + done +fi + +echo "Downloaded files" +echo "- webgraph" +list_webgraph_files $NAME graph properties stats +echo "- webgraph (transpose)" +list_webgraph_files $NAME-t graph properties +echo "- webgraph vertices" +if [ $GRAPH_AGGR_LEVEL == "domain" ]; then + list_webgraph_files $NAME-vertices txt.gz +else + list_webgraph_files vertices "*.txt.gz" +fi diff --git a/src/script/webgraph_ranking/graph_explore_load_graph.jsh b/src/script/webgraph_ranking/graph_explore_load_graph.jsh new file mode 100644 index 0000000..e884f76 --- /dev/null +++ b/src/script/webgraph_ranking/graph_explore_load_graph.jsh @@ -0,0 +1,32 @@ +/open PRINTING + +String graph = System.getProperty("graph") +println("Loading graph " + graph) + +import org.commoncrawl.webgraph.explore.Graph +import 
org.commoncrawl.webgraph.explore.GraphExplorer +import it.unimi.dsi.webgraph.ImmutableGraph + +GraphExplorer e = new GraphExplorer(graph) +Graph g = e.getGraph() + +println() +println("Graph " + graph + " loaded into GraphExplorer *e*") +println("Type \"e.\" and press to list the public methods of the class GraphExplorer") +println("... or \"g.\" for the graph loaded for exploration") + +/* Define commands provided by pywebgraph (cn, pwn, ls, sl) */ +void cn(String vertexLabel) { e.cn(vertexLabel); } +void cn(long vertexID) { e.cn(vertexID); } +void pwn() { e.pwn(); } +void ls() { e.ls(); } +void ls(long vertexId) { e.ls(vertexId); } +void ls(String vertexLabel) { e.ls(vertexLabel); } +void sl() { e.sl(); } +void sl(long vertexId) { e.sl(vertexId); } +void sl(String vertexLabel) { e.sl(vertexLabel); } + +println() +println("... or use one of the predefined methods:") +/methods cn pwn ls sl +println() \ No newline at end of file diff --git a/src/script/webgraph_ranking/process_webgraph_degrees.sh b/src/script/webgraph_ranking/process_webgraph_degrees.sh new file mode 100755 index 0000000..bd1dcb7 --- /dev/null +++ b/src/script/webgraph_ranking/process_webgraph_degrees.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +set -eo pipefail + +NAME="$1" +TYPE="${2:-domain}" + +if [ -z "$NAME" ]; then + echo "Usage: $(basename $0) []" + echo -e "\tgraph-name\tbase name of the webgraph (without the file suffix .graph)" + echo -e "\ttype\ttype (level) of the graph aggregation: domain (default) or host" + exit 1 +fi + +WG=$(dirname $0)/run_webgraph.sh + +if [ -e $NAME.outdegrees ] && [ -e $NAME.indegrees ]; then + : # out/indegrees already done +else + $WG it.unimi.dsi.webgraph.Stats --save-degrees "$NAME" +fi + + +if [ "$TYPE" == "domain" ]; then + zcat $NAME-vertices.txt.gz +else + zcat vertices/*.txt.gz +fi \ + | cut -f2- \ + | paste $NAME.outdegrees $NAME.indegrees - \ + | gzip >$NAME-outdegrees-indegrees.txt.gz + + +HEADER="outdegree\tindegree\tname" +if [ "$TYPE" == "domain" ]; then + 
HEADER="outdegree\tindegree\tname\tnumsubdomains" +fi + +(echo -e "$HEADER"; + set +o pipefail; + zcat $NAME-outdegrees-indegrees.txt.gz \ + | perl -aF'\t' -lne 'print if $F[0] > 1000' \ + | sort -k1,1nr \ + | head -10000) \ + | gzip >$NAME-outdegrees-indegrees-topout.txt.gz + +(echo -e "$HEADER"; + set +o pipefail; + zcat $NAME-outdegrees-indegrees.txt.gz \ + | perl -aF'\t' -lne 'print if $F[1] > 1000' \ + | sort -k2,2nr \ + | head -10000) \ + | gzip >$NAME-outdegrees-indegrees-topin.txt.gz + diff --git a/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java b/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java new file mode 100644 index 0000000..3cd1ebf --- /dev/null +++ b/src/test/java/org/commoncrawl/webgraph/TestCountingMergedIntIterator.java @@ -0,0 +1,77 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * Copyright (C) 2022 Common Crawl and contributors + */ +package org.commoncrawl.webgraph; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; + +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import it.unimi.dsi.webgraph.LazyIntIterator; +import it.unimi.dsi.webgraph.LazyIntIterators; + +public class TestCountingMergedIntIterator { + + protected static Logger LOG = LoggerFactory.getLogger(TestCountingMergedIntIterator.class); + + @Test + void testSimple() { + CountingMergedIntIterator iter = new CountingMergedIntIterator(LazyIntIterators.EMPTY_ITERATOR); + assertFalse(iter.hasNext()); + + int[][][] testArrays = { // + {{0, 1}}, // + {{0}, {1}}, // + {{1}, {0}}, // + {{1}, {0}, {}}, // + {{1}, {0}, {}, {0}, {0}}, // + {{1}, {0}, {}, {0}, {0, 1}}, // + // tests for input arrays with repeating numbers + {{1, 1}, {0, 0}, {}, {0, 0}, {0, 0}}, // + {{1, 1}, {0, 0}, {}, {0}, {0, 1}} // + }; + + for (int[][] tArrays : 
testArrays) { + LazyIntIterator[] tIters = new LazyIntIterator[tArrays.length]; + int totalCountExpected = 0; + for (int i = 0; i < tArrays.length; i++) { + tIters[i] = LazyIntIterators.wrap(tArrays[i]); + totalCountExpected += tArrays[i].length; + } + int totalCount = 0; + iter = new CountingMergedIntIterator(tIters); + assertTrue(iter.hasNext()); + + assertEquals(0, iter.nextInt()); + assertTrue(iter.getCount() > 0); + totalCount += iter.getCount(); + assertTrue(iter.hasNext()); + assertEquals(1, iter.nextInt()); + assertTrue(iter.getCount() > 0); + totalCount += iter.getCount(); + assertFalse(iter.hasNext()); + assertEquals(totalCountExpected, totalCount, + "expected total count for input " + Arrays.deepToString(tArrays) + " is " + totalCountExpected); + } + + // test skip(n) + for (int n = 0; n <= 5; n++) { + for (int[][] tArrays : testArrays) { + LazyIntIterator[] tIters = new LazyIntIterator[tArrays.length]; + for (int i = 0; i < tArrays.length; i++) { + tIters[i] = LazyIntIterators.wrap(tArrays[i]); + } + iter = new CountingMergedIntIterator(tIters); + assertEquals(Math.min(n, 2), iter.skip(n)); + } + } + } + +}