Skip to content

Commit

Permalink
feat: tool and scripts to interactively explore webgraph
Browse files Browse the repository at this point in the history
Add more utility methods to save data in files, to map host names
to registered domains and to translate from/to reverse domain name
notation.
  • Loading branch information
sebastian-nagel committed Jul 4, 2024
1 parent 81424f2 commit 12bedb2
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
57 changes: 57 additions & 0 deletions src/main/java/org/commoncrawl/webgraph/explore/Graph.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
import java.util.stream.Stream;

import org.commoncrawl.webgraph.CountingMergedIntIterator;
import org.commoncrawl.webgraph.HostToDomainGraph;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.domains.EffectiveTldFinder;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.lang.MutableString;
Expand Down Expand Up @@ -343,4 +345,59 @@ public static String getTopLevelDomain(String reversedDomainName) {
}
return reversedDomainName;
}

/**
 * Determine the registered domain of a host name, using the ICANN section of
 * the <a href="https://www.publicsuffix.org/">public suffix list</a>.
 *
 * @param hostName host name, e.g. <code>www.example.org.uk</code>
 * @param strict   if true return null instead of <code>hostName</code> if no
 *                 valid public suffix is detected
 * @return the domain name below the public suffix, e.g.
 *         <code>example.org.uk</code>
 * @see EffectiveTldFinder
 */
public static String getRegisteredDomain(String hostName, boolean strict) {
    // Restrict the lookup to the ICANN section, i.e. skip private suffixes.
    final boolean excludePrivate = true;
    final String registeredDomain = EffectiveTldFinder.getAssignedDomain(hostName, strict, excludePrivate);
    return registeredDomain;
}

/**
 * Get the registered domain for a host name, with both the input and the
 * result in
 * <a href="https://en.wikipedia.org/wiki/Reverse_domain_name_notation">reverse
 * domain name notation</a>.
 *
 * @param reversedHostName host name in reverse domain name notation, e.g.
 *                         <code>uk.org.example.www</code>
 * @param strict           if true return null instead of
 *                         <code>reversedHostName</code> if no valid public
 *                         suffix is detected
 * @return the domain name below the public suffix, e.g.
 *         <code>uk.org.example</code> (in reverse domain name notation), or
 *         null if no registered domain could be determined
 * @see #getRegisteredDomain(String, boolean)
 */
public static String getRegisteredDomainReversed(String reversedHostName, boolean strict) {
    String hostName = reverseDomainName(reversedHostName);
    String domainName = getRegisteredDomain(hostName, strict);
    if (domainName == null) {
        // No registered domain found. Checked independently of "strict" so
        // that a null result is never handed on to reverseDomainName.
        return null;
    }
    if (hostName.equals(domainName)) {
        // The host name already is the registered domain: reuse the input
        // rather than reversing the name a second time.
        return reversedHostName;
    }
    return reverseDomainName(domainName);
}

/**
 * Convert a host or domain name to or from reverse domain name notation:
 * <code>com.example.www</code> becomes <code>www.example.com</code> and vice
 * versa.
 *
 * @param reversedDomainName host or domain name, in either notation
 * @return the name with <a href=
 *         "https://en.wikipedia.org/wiki/Reverse_domain_name_notation">reverse
 *         domain name notation</a> (un)applied
 */
private static String reverseDomainName(String reversedDomainName) {
    final String flipped = HostToDomainGraph.reverseHost(reversedDomainName);
    return flipped;
}
}
19 changes: 19 additions & 0 deletions src/main/java/org/commoncrawl/webgraph/explore/GraphExplorer.java
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
import java.util.stream.Stream;

import org.commoncrawl.webgraph.CountingMergedIntIterator;
Expand Down Expand Up @@ -215,6 +216,24 @@ public void saveVerticesToFile(IntStream vertexIDs, String fileName) {
}
}

/**
 * Write the labels of the given vertices to a file, one label per line,
 * encoded as UTF-8. Failures are logged, not thrown.
 *
 * @param vertexIDs stream of vertex IDs, consumed by this method; each ID is
 *                  translated to its label before it is written
 * @param fileName  path of the output file
 */
public void saveVerticesToFile(LongStream vertexIDs, String fileName) {
    try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
            StandardCharsets.UTF_8)) {
        vertexIDs.forEach(id -> out.println(g.vertexIdToLabel(id)));
        // PrintStream does not throw IOException while writing, it only sets
        // an internal error flag. Check the flag (this also flushes) so that
        // a failed write is not silently ignored.
        if (out.checkError()) {
            LOG.error("Failed to write vertices to file {}", fileName);
        }
    } catch (IOException e) {
        // Raised when the file cannot be opened for writing.
        LOG.error("Failed to write vertices to file {}", fileName, e);
    }
}

/**
 * Write the given strings to a file, one string per line, encoded as UTF-8.
 * Failures are logged, not thrown.
 *
 * @param strings  stream of strings to write, consumed by this method
 * @param fileName path of the output file
 */
public void saveToFile(Stream<String> strings, String fileName) {
    try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
            StandardCharsets.UTF_8)) {
        strings.forEach(out::println);
        // PrintStream does not throw IOException while writing, it only sets
        // an internal error flag. Check the flag (this also flushes) so that
        // a failed write is not silently ignored.
        if (out.checkError()) {
            LOG.error("Failed to write strings to file {}", fileName);
        }
    } catch (IOException e) {
        // Raised when the file cannot be opened for writing.
        LOG.error("Failed to write strings to file {}", fileName, e);
    }
}

public void saveCountsToFile(Stream<Entry<String, Long>> counts, String fileName) {
try (PrintStream out = new PrintStream(Files.newOutputStream(Paths.get(fileName)), false,
StandardCharsets.UTF_8)) {
Expand Down

0 comments on commit 12bedb2

Please sign in to comment.