Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
---
 src/python/location-importer.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/python/location-importer.in b/src/python/location-importer.in
index f5ae4a9..77952f2 100644
--- a/src/python/location-importer.in
+++ b/src/python/location-importer.in
@@ -360,7 +360,7 @@ class CLI(object):
 				ON COMMIT DROP;
 			CREATE UNIQUE INDEX _autnums_number ON _autnums(number);
 
-			CREATE TEMPORARY TABLE _organizations(handle text, name text)
+			CREATE TEMPORARY TABLE _organizations(handle text, name text NOT NULL)
 				ON COMMIT DROP;
 			CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);
 		""")
@@ -373,7 +373,7 @@ class CLI(object):
 		self.db.execute("""
 			INSERT INTO autnums(number, name)
 				SELECT _autnums.number, _organizations.name FROM _autnums
-					LEFT JOIN _organizations ON _autnums.organization = _organizations.handle
+					JOIN _organizations ON _autnums.organization = _organizations.handle
 				ON CONFLICT (number) DO UPDATE SET name = excluded.name;
 		""")
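As an aside (not part of the patch, toy data made up for illustration), the effect of the LEFT JOIN -> JOIN change can be sketched with a small self-contained sqlite3 session: an autnum whose organisation handle has no match is now skipped entirely instead of being inserted with a NULL name.

import sqlite3

# Toy tables mirroring the temporary tables above (sample data invented)
con = sqlite3.connect(":memory:")
con.executescript("""
	CREATE TABLE _autnums(number INTEGER, organization TEXT);
	CREATE TABLE _organizations(handle TEXT, name TEXT NOT NULL);
	INSERT INTO _autnums VALUES (64496, 'ORG-A'), (64497, 'ORG-MISSING');
	INSERT INTO _organizations VALUES ('ORG-A', 'Example Org');
""")

# The inner JOIN drops AS64497, which has no matching organisation
rows = con.execute("""
	SELECT _autnums.number, _organizations.name FROM _autnums
		JOIN _organizations ON _autnums.organization = _organizations.handle
""").fetchall()

print(rows)  # [(64496, 'Example Org')]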
The extended feeds do not contain enough detail for us, so we need to import inetnums from the RIRs where possible. Filtering out private networks is necessary, as RIR data may contain 0.0.0.0/0 or similar entries for administrative purposes or due to misfilings.
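As a rough sketch (the sample networks are made up), Python's ipaddress module can flag such entries:

import ipaddress

# is_private covers the RFC 1918 ranges, TEST-NET space and other
# non-globally-routable networks
for entry in ("10.0.0.0/8", "192.0.2.0/24", "8.8.8.0/24"):
	network = ipaddress.ip_network(entry, strict=False)
	if network.is_private:
		print("Skipping non-globally routable network: %s" % network)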
Special thanks go to Michael for spending numerous hours on this, setting up a testing environment and providing helpful advice while debugging.
Partially fixes: #12458
Cc: Michael Tremer <michael.tremer@ipfire.org>
Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
---
 src/python/importer.py          | 14 ++++----
 src/python/location-importer.in | 63 +++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 7 deletions(-)
diff --git a/src/python/importer.py b/src/python/importer.py
index de20f37..f19db4b 100644
--- a/src/python/importer.py
+++ b/src/python/importer.py
@@ -30,8 +30,8 @@ WHOIS_SOURCES = (
 	"https://ftp.afrinic.net/pub/pub/dbase/afrinic.db.gz",
 
 	# Asia Pacific Network Information Centre
-	#"https://ftp.apnic.net/apnic/whois/apnic.db.inet6num.gz",
-	#"https://ftp.apnic.net/apnic/whois/apnic.db.inetnum.gz",
+	"https://ftp.apnic.net/apnic/whois/apnic.db.inet6num.gz",
+	"https://ftp.apnic.net/apnic/whois/apnic.db.inetnum.gz",
 	#"https://ftp.apnic.net/apnic/whois/apnic.db.route6.gz",
 	#"https://ftp.apnic.net/apnic/whois/apnic.db.route.gz",
 	"https://ftp.apnic.net/apnic/whois/apnic.db.aut-num.gz",
@@ -45,8 +45,8 @@ WHOIS_SOURCES = (
 	# XXX ???
 
 	# Réseaux IP Européens
-	#"https://ftp.ripe.net/ripe/dbase/split/ripe.db.inet6num.gz",
-	#"https://ftp.ripe.net/ripe/dbase/split/ripe.db.inetnum.gz",
+	"https://ftp.ripe.net/ripe/dbase/split/ripe.db.inet6num.gz",
+	"https://ftp.ripe.net/ripe/dbase/split/ripe.db.inetnum.gz",
 	#"https://ftp.ripe.net/ripe/dbase/split/ripe.db.route6.gz",
 	#"https://ftp.ripe.net/ripe/dbase/split/ripe.db.route.gz",
 	"https://ftp.ripe.net/ripe/dbase/split/ripe.db.aut-num.gz",
@@ -55,10 +55,10 @@ WHOIS_SOURCES = (
 
 EXTENDED_SOURCES = (
 	# African Network Information Centre
-	"https://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-extended-latest",
+	#"https://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-extended-latest",
 
 	# Asia Pacific Network Information Centre
-	"https://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-extended-latest",
+	#"https://ftp.apnic.net/apnic/stats/apnic/delegated-apnic-extended-latest",
 
 	# American Registry for Internet Numbers
 	"https://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest",
@@ -67,7 +67,7 @@ EXTENDED_SOURCES = (
 	"http://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest",
 
 	# Réseaux IP Européens
-	"https://ftp.ripe.net/pub/stats/ripencc/delegated-ripencc-extended-latest",
+	#"https://ftp.ripe.net/pub/stats/ripencc/delegated-ripencc-extended-latest",
 )
 
 class Downloader(object):
diff --git a/src/python/location-importer.in b/src/python/location-importer.in
index 77952f2..e3a07a0 100644
--- a/src/python/location-importer.in
+++ b/src/python/location-importer.in
@@ -393,6 +393,10 @@ class CLI(object):
 		if line.startswith("aut-num:"):
 			return self._parse_autnum_block(block)
 
+		# inetnum
+		if line.startswith("inet6num:") or line.startswith("inetnum:"):
+			return self._parse_inetnum_block(block)
+
 		# organisation
 		elif line.startswith("organisation:"):
 			return self._parse_org_block(block)
@@ -422,6 +426,65 @@ class CLI(object):
 			autnum.get("asn"), autnum.get("org"),
 		)
 
+	def _parse_inetnum_block(self, block):
+		logging.debug("Parsing inetnum block:")
+
+		inetnum = {}
+		for line in block:
+			logging.debug(line)
+
+			# Split line
+			key, val = split_line(line)
+
+			if key == "inetnum":
+				start_address, delim, end_address = val.partition("-")
+
+				# Strip any excess space
+				start_address, end_address = start_address.rstrip(), end_address.strip()
+
+				# Convert to IP address
+				try:
+					start_address = ipaddress.ip_address(start_address)
+					end_address = ipaddress.ip_address(end_address)
+				except ValueError:
+					logging.warning("Could not parse line: %s" % line)
+					return
+
+				# Set prefix to default
+				prefix = 32
+
+				# Count number of addresses in this subnet
+				num_addresses = int(end_address) - int(start_address)
+				if num_addresses:
+					prefix -= math.log(num_addresses, 2)
+
+				inetnum["inetnum"] = "%s/%.0f" % (start_address, prefix)
+
+			elif key == "inet6num":
+				inetnum[key] = val
+
+			elif key == "country":
+				if val == "UNITED STATES":
+					val = "US"
+
+				inetnum[key] = val.upper()
+
+		# Skip empty objects
+		if not inetnum:
+			return
+
+		network = ipaddress.ip_network(inetnum.get("inet6num") or inetnum.get("inetnum"), strict=False)
+
+		# Bail out in case we have processed a non-public IP network
+		if network.is_private:
+			logging.warning("Skipping non-globally routable network: %s" % network)
+			return
+
+		self.db.execute("INSERT INTO networks(network, country) \
+			VALUES(%s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
+			"%s" % network, inetnum.get("country"),
+		)
+
 	def _parse_org_block(self, block):
 		org = {}
 		for line in block:
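For clarity, here is a minimal, standalone sketch of the range-to-prefix arithmetic that _parse_inetnum_block() performs (the helper name range_to_network is hypothetical):

import ipaddress
import math

def range_to_network(start, end):
	# Derive a CIDR prefix from the size of an inetnum range
	start = ipaddress.ip_address(start)
	end = ipaddress.ip_address(end)

	prefix = 32  # inetnum objects are IPv4 only

	num_addresses = int(end) - int(start)
	if num_addresses:
		prefix -= math.log(num_addresses, 2)

	# "%.0f" rounds the 24.006 computed for a 255-address delta to 24
	return ipaddress.ip_network("%s/%.0f" % (start, prefix), strict=False)

# 192.0.2.0 - 192.0.2.255 covers 256 addresses, i.e. a /24
print(range_to_network("192.0.2.0", "192.0.2.255"))  # 192.0.2.0/24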
When importing inetnums, we may import various small networks that are not relevant for us, as long as their country code matches that of their parent network.

Therefore, we delete all such entries to keep the database smaller without losing any information. The second version of this patch introduces an SQL statement that PostgreSQL can parallelise across all available CPUs, whereas the DELETE statement of the first version took ages to complete.

However, cleaning up this data still takes about 26 hours (!) on our location02 testing machine, which makes daily updates of the location database impossible as far as we currently know.
real	1521m30.620s
user	38m45.521s
sys	9m6.027s
Special thanks go to Michael for spending numerous hours on this, setting up a testing environment, doing PostgreSQL magic and providing helpful advice while debugging.
Partially fixes: #12458
Cc: Michael Tremer <michael.tremer@ipfire.org>
Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
---
 src/python/location-importer.in | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/python/location-importer.in b/src/python/location-importer.in
index e3a07a0..1467923 100644
--- a/src/python/location-importer.in
+++ b/src/python/location-importer.in
@@ -374,7 +374,27 @@ class CLI(object):
 			INSERT INTO autnums(number, name)
 				SELECT _autnums.number, _organizations.name FROM _autnums
 					JOIN _organizations ON _autnums.organization = _organizations.handle
-				ON CONFLICT (number) DO UPDATE SET name = excluded.name;
+				ON CONFLICT (number) DO UPDATE SET name = excluded.name
+		""")
+
+		self.db.execute("""
+			--- Purge any redundant entries
+			CREATE TEMPORARY TABLE _garbage ON COMMIT DROP
+				AS
+					SELECT network FROM networks candidates
+						WHERE EXISTS (
+							SELECT FROM networks
+								WHERE
+									networks.network << candidates.network
+								AND
+									networks.country = candidates.country
+						);
+
+			CREATE UNIQUE INDEX _garbage_search ON _garbage USING BTREE(network);
+
+			DELETE FROM networks WHERE EXISTS (
+				SELECT FROM _garbage WHERE networks.network = _garbage.network
+			);
 		""")
 
 		# Download all extended sources
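For illustration only, the purge can be modelled in memory with the ipaddress module (sample data made up); PostgreSQL's << containment operator corresponds to a strict subnet_of() check:

import ipaddress

networks = {
	ipaddress.ip_network("10.0.0.0/8"): "DE",   # parent network
	ipaddress.ip_network("10.1.0.0/16"): "DE",  # redundant: same country as parent
	ipaddress.ip_network("10.2.0.0/16"): "AT",  # kept: different country
}

# Collect every network that is strictly contained in another network
# carrying the same country code
garbage = {
	net for net, country in networks.items()
	if any(
		net != other and net.subnet_of(other) and country == networks[other]
		for other in networks
	)
}

for net in garbage:
	del networks[net]

print(sorted(networks))  # [IPv4Network('10.0.0.0/8'), IPv4Network('10.2.0.0/16')]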
Hello,
I had to revert this patch and the previous one.
For some reason, it is impossible to generate a new database, even though I made more processor cores and memory available to the system.
Please review this and try to find another approach that has less impact on the database system and allows us to run a database update within about an hour.
Best,
-Michael
On 20 Sep 2020, at 20:21, Peter Müller <peter.mueller@ipfire.org> wrote:
> When importing inetnums, we may import various small networks that are not relevant for us, as long as their country code matches that of their parent network.
>
> Therefore, we delete all such entries to keep the database smaller without losing any information. The second version of this patch introduces an SQL statement that PostgreSQL can parallelise across all available CPUs, whereas the DELETE statement of the first version took ages to complete.
>
> However, cleaning up this data still takes about 26 hours (!) on our location02 testing machine, which makes daily updates of the location database impossible as far as we currently know.
>
> real	1521m30.620s
> user	38m45.521s
> sys	9m6.027s
>
> Special thanks go to Michael for spending numerous hours on this, setting up a testing environment, doing PostgreSQL magic and providing helpful advice while debugging.
>
> Partially fixes: #12458
>
> Cc: Michael Tremer <michael.tremer@ipfire.org>
> Signed-off-by: Peter Müller <peter.mueller@ipfire.org>
> ---
>  src/python/location-importer.in | 22 +++++++++++++++++++++-
>  1 file changed, 21 insertions(+), 1 deletion(-)
> diff --git a/src/python/location-importer.in b/src/python/location-importer.in
> index e3a07a0..1467923 100644
> --- a/src/python/location-importer.in
> +++ b/src/python/location-importer.in
> @@ -374,7 +374,27 @@ class CLI(object):
>  			INSERT INTO autnums(number, name)
>  				SELECT _autnums.number, _organizations.name FROM _autnums
>  					JOIN _organizations ON _autnums.organization = _organizations.handle
> -				ON CONFLICT (number) DO UPDATE SET name = excluded.name;
> +				ON CONFLICT (number) DO UPDATE SET name = excluded.name
> +		""")
> +
> +		self.db.execute("""
> +			--- Purge any redundant entries
> +			CREATE TEMPORARY TABLE _garbage ON COMMIT DROP
> +				AS
> +					SELECT network FROM networks candidates
> +						WHERE EXISTS (
> +							SELECT FROM networks
> +								WHERE
> +									networks.network << candidates.network
> +								AND
> +									networks.country = candidates.country
> +						);
> +
> +			CREATE UNIQUE INDEX _garbage_search ON _garbage USING BTREE(network);
> +
> +			DELETE FROM networks WHERE EXISTS (
> +				SELECT FROM _garbage WHERE networks.network = _garbage.network
> +			);
>  		""")
> 
>  		# Download all extended sources
> --
> 2.26.2