Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 40 +++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 9faf23b..5bd5da3 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -182,6 +182,11 @@ class CLI(object):
 			CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
 			CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);
 
+			-- geofeeds
+			CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
+			CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
+				ON network_geofeeds(network);
+
 			-- overrides
 			CREATE TABLE IF NOT EXISTS autnum_overrides(
 				number bigint NOT NULL,
@@ -799,6 +804,16 @@ class CLI(object):
 					inetnum[key].append(val)
 
+			# Parse the geofeed attribute
+			elif key == "geofeed":
+				inetnum["geofeed"] = val
+
+			# Parse geofeed when used as a remark
+			elif key == "remark":
+				m = re.match(r"^(?:geofeed|Geofeed)\s+(https://.*)", val)
+				if m:
+					inetnum["geofeed"] = m.group(1)
+
 		# Skip empty objects
 		if not inetnum or not "country" in inetnum:
 			return
@@ -810,7 +825,6 @@ class CLI(object):
 		# them into the database, if _check_parsed_network() succeeded
 		for single_network in inetnum.get("inet6num") or inetnum.get("inetnum"):
 			if self._check_parsed_network(single_network):
-
 				# Skip objects with unknown country codes if they are valid to avoid log spam...
 				if validcountries and invalidcountries:
 					log.warning("Skipping network with bogus countr(y|ies) %s (original countries: %s): %s" % \
@@ -823,6 +837,30 @@ class CLI(object):
 					"%s" % single_network, inetnum.get("country")[0], inetnum.get("country"), source_key,
 				)
+				# Update any geofeed information
+				geofeed = inetnum.get("geofeed", None)
+
+				# Store/update any geofeeds
+				if geofeed:
+					self.db.execute("""
+						INSERT INTO
+							network_geofeeds(
+								network,
+								url
+							)
+						VALUES(
+							%s, %s
+						)
+						ON CONFLICT (network) DO
+							UPDATE SET url = excluded.url""",
+						"%s" % single_network, geofeed,
+					)
+
+				# Delete any previous geofeeds
+				else:
+					self.db.execute("DELETE FROM network_geofeeds WHERE network = %s",
+						"%s" % single_network)
+
 	def _parse_org_block(self, block, source_key):
 		org = {}
 		for line in block:
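For illustration, this is how the remark-based extraction above behaves; the sample values are made up, and the pattern is the permissive one from this patch (it is tightened in a later patch of this series):

import re

# Hypothetical remark values as they might appear in an RIR database dump
for val in (
	"Geofeed https://example.com/geofeed.csv",
	"geofeed https://example.com/geofeed.csv",
	"Please contact noc@example.com",
):
	m = re.match(r"^(?:geofeed|Geofeed)\s+(https://.*)", val)
	if m:
		print("found geofeed URL:", m.group(1))

This prints the URL for the first two samples; the plain contact remark is ignored.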
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 143 +++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 5bd5da3..ddec376 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -18,6 +18,7 @@
 ###############################################################################
 
 import argparse
+import concurrent.futures
 import ipaddress
 import json
 import logging
@@ -95,6 +96,11 @@ class CLI(object):
 		update_announcements.add_argument("server", nargs=1,
 			help=_("Route Server to connect to"), metavar=_("SERVER"))
 
+		# Update geofeeds
+		update_geofeeds = subparsers.add_parser("update-geofeeds",
+			help=_("Update Geofeeds"))
+		update_geofeeds.set_defaults(func=self.handle_update_geofeeds)
+
 		# Update overrides
 		update_overrides = subparsers.add_parser("update-overrides",
 			help=_("Update overrides"),
@@ -183,6 +189,25 @@ class CLI(object):
 			CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);
 			-- geofeeds
+			CREATE TABLE IF NOT EXISTS geofeeds(
+				id serial primary key,
+				url text,
+				status integer default null,
+				updated_at timestamp without time zone default null
+			);
+			CREATE UNIQUE INDEX IF NOT EXISTS geofeeds_unique
+				ON geofeeds(url);
+			CREATE TABLE IF NOT EXISTS geofeed_networks(
+				geofeed_id integer references geofeeds(id) on delete cascade,
+				network inet,
+				country text,
+				region text,
+				city text
+			);
+			CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
+				ON geofeed_networks(geofeed_id);
+			CREATE INDEX IF NOT EXISTS geofeed_networks_search
+				ON geofeed_networks(network);
 			CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
 			CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
 				ON network_geofeeds(network);
@@ -1253,6 +1278,124 @@ class CLI(object):
 			# Otherwise return the line
 			yield line
 
+	def handle_update_geofeeds(self, ns):
+		# Fetch all Geofeeds that require an update
+		geofeeds = self.db.query("""
+			SELECT
+				id,
+				url
+			FROM
+				geofeeds
+			WHERE
+				updated_at IS NULL
+			OR
+				updated_at <= CURRENT_TIMESTAMP - INTERVAL '1 week'
+			ORDER BY
+				id
+		""")
+
+		with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+			results = executor.map(self._fetch_geofeed, geofeeds)
+
+			for result in results:
+				print(result)
+
+	def _fetch_geofeed(self, geofeed):
+		log.debug("Fetching Geofeed %s" % geofeed.url)
+
+		with self.db.transaction():
+			# Open the URL
+			try:
+				req = urllib.request.Request(geofeed.url, headers={
+					"User-Agent" : "location/%s" % location.__version__,
+
+					# We expect some plain text file in CSV format
+					"Accept" : "text/csv, text/plain",
+				})
+
+				# XXX set proxy
+
+				# Send the request
+				with urllib.request.urlopen(req, timeout=10) as f:
+					# Remove any previous data
+					self.db.execute("DELETE FROM geofeed_networks \
+						WHERE geofeed_id = %s", geofeed.id)
+
+					# Read the output line by line
+					for line in f:
+						line = line.decode()
+
+						# Strip any newline
+						line = line.rstrip()
+
+						# Skip empty lines
+						if not line:
+							continue
+
+						# Try to parse the line
+						try:
+							fields = line.split(",", 5)
+						except ValueError:
+							log.debug("Could not parse line: %s" % line)
+							continue
+
+						# Check if we have enough fields
+						if len(fields) < 4:
+							log.debug("Not enough fields in line: %s" % line)
+							continue
+
+						# Fetch all fields
+						network, country, region, city, = fields[:4]
+
+						# Try to parse the network
+						try:
+							network = ipaddress.ip_network(network, strict=False)
+						except ValueError:
+							log.debug("Could not parse network: %s" % network)
+							continue
+
+						# XXX Check the country code
+
+						# Write this into the database
+						self.db.execute("""
+							INSERT INTO
+								geofeed_networks (
+									geofeed_id,
+									network,
+									country,
+									region,
+									city
+								)
+							VALUES (%s, %s, %s, %s, %s)""",
+							geofeed.id,
+							"%s" % network,
+							country,
+							region,
+							city,
+						)
+
+			# Catch any HTTP errors
+			except urllib.request.HTTPError as e:
+				self.db.execute("UPDATE geofeeds SET status = %s \
+					WHERE id = %s", e.code, geofeed.id)
+
+			# Catch any other errors
+			except urllib.request.URLError as e:
+				log.error("Could not fetch URL %s: %s" % (geofeed.url, e))
+
+			# Mark the geofeed as updated
+			else:
+				self.db.execute("""
+					UPDATE
+						geofeeds
+					SET
+						updated_at = CURRENT_TIMESTAMP,
+						status = NULL
+					WHERE
+						id = %s""",
+					geofeed.id,
+				)
+
 	def handle_update_overrides(self, ns):
 		with self.db.transaction():
 			# Only drop manually created overrides, as we can be reasonably sure to have them,
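For reference, the loop above expects RFC 8805-style CSV lines; a minimal sketch of parsing a single (hypothetical) entry the same way:

import ipaddress

# A hypothetical geofeed entry: prefix,country,region,city[,postal code]
line = "192.0.2.0/24,US,US-CA,Mountain View,94043"

fields = line.split(",", 5)
network, country, region, city = fields[:4]

network = ipaddress.ip_network(network, strict=False)
print(network, country, region, city)
# 192.0.2.0/24 US US-CA Mountain View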
Any exceptions will only be raised in the main thread when they are fetched from the executor. We do not need to print any other return values here.
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index ddec376..014f3b6 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -1297,8 +1297,9 @@ class CLI(object):
 		with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
 			results = executor.map(self._fetch_geofeed, geofeeds)
 
+			# Fetch all results to raise any exceptions
 			for result in results:
-				print(result)
+				pass
 	def _fetch_geofeed(self, geofeed):
 		log.debug("Fetching Geofeed %s" % geofeed.url)
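A minimal sketch of the behaviour the commit message describes: exceptions raised inside worker threads are stored by executor.map() and only re-raised in the main thread once the corresponding result is consumed. The failing task here is made up:

import concurrent.futures

def fetch(n):
	# A made-up task that fails for one input
	if n == 2:
		raise ValueError("simulated fetch failure")
	return n

with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
	results = executor.map(fetch, [1, 2, 3])

	# Nothing has been raised so far; the exception only surfaces
	# here, in the main thread, when the failing result is reached
	try:
		for result in results:
			pass
	except ValueError as e:
		print("raised in main thread:", e)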
Geofeeds are kept in a separate table so that each URL is only fetched once. This table needs to be kept in sync, which is done before any feeds are updated.
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 014f3b6..12035f1 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -1279,6 +1279,38 @@ class CLI(object):
 			yield line
 
 	def handle_update_geofeeds(self, ns):
+		# Sync geofeeds
+		with self.db.transaction():
+			# Delete all geofeeds which are no longer linked
+			self.db.execute("""
+				DELETE FROM
+					geofeeds
+				WHERE
+					NOT EXISTS (
+						SELECT
+							1
+						FROM
+							network_geofeeds
+						WHERE
+							geofeeds.url = network_geofeeds.url
+					)""",
+			)
+
+			# Copy all geofeeds
+			self.db.execute("""
+				INSERT INTO
+					geofeeds(
+						url
+					)
+				SELECT
+					url
+				FROM
+					network_geofeeds
+				ON CONFLICT (url)
+					DO NOTHING
+				""",
+			)
+
 		# Fetch all Geofeeds that require an update
 		geofeeds = self.db.query("""
 			SELECT
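A self-contained sketch of the sync semantics, using an in-memory SQLite database (3.24 or later, for ON CONFLICT) as a stand-in for PostgreSQL; the table contents are made up:

import sqlite3

db = sqlite3.connect(":memory:")
db.executescript("""
	CREATE TABLE geofeeds(id integer PRIMARY KEY, url text UNIQUE, status integer, updated_at text);
	CREATE TABLE network_geofeeds(network text, url text);
""")

# One feed that is no longer referenced, one that still is
db.execute("INSERT INTO geofeeds(url, status) VALUES('https://old.example.com/feed.csv', 200)")
db.execute("INSERT INTO geofeeds(url, status) VALUES('https://a.example.com/feed.csv', 200)")
db.execute("INSERT INTO network_geofeeds VALUES('192.0.2.0/24', 'https://a.example.com/feed.csv')")
db.execute("INSERT INTO network_geofeeds VALUES('198.51.100.0/24', 'https://b.example.com/feed.csv')")

# Step 1: drop feeds that no network references any more
db.execute("""
	DELETE FROM geofeeds WHERE NOT EXISTS (
		SELECT 1 FROM network_geofeeds WHERE geofeeds.url = network_geofeeds.url
	)""")

# Step 2: add newly referenced URLs; rows that survive keep status/updated_at
# (the WHERE true is only needed to satisfy SQLite's upsert grammar)
db.execute("""
	INSERT INTO geofeeds(url)
	SELECT url FROM network_geofeeds WHERE true
	ON CONFLICT(url) DO NOTHING""")

print(db.execute("SELECT url, status FROM geofeeds ORDER BY url").fetchall())
# [('https://a.example.com/feed.csv', 200), ('https://b.example.com/feed.csv', None)]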
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 12035f1..f8d2dc8 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -291,6 +291,8 @@ class CLI(object):
 					SELECT network FROM networks
 					UNION
 					SELECT network FROM network_overrides
+					UNION
+					SELECT network FROM geofeed_networks
 				),
 				ordered_networks AS (
@@ -333,6 +335,29 @@ class CLI(object):
 						SELECT country FROM autnum_overrides overrides
 							WHERE networks.autnum = overrides.number
 					),
+					(
+						SELECT
+							geofeed_networks.country AS country
+						FROM
+							network_geofeeds
+
+						-- Join the data from the geofeeds
+						LEFT JOIN
+							geofeeds ON network_geofeeds.url = geofeeds.url
+						LEFT JOIN
+							geofeed_networks ON geofeeds.id = geofeed_networks.geofeed_id
+
+						-- Check whether we have a geofeed for this network
+						WHERE
+							networks.network <<= network_geofeeds.network
+						AND
+							networks.network <<= geofeed_networks.network
+
+						-- Filter for the best result
+						ORDER BY
+							masklen(geofeed_networks.network) DESC
+						LIMIT 1
+					),
 					networks.country
 				) AS country,
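The ORDER BY masklen(...) DESC LIMIT 1 above picks the most specific covering geofeed network. A small sketch of the same longest-prefix idea in plain Python, with made-up networks:

import ipaddress

target = ipaddress.ip_network("192.0.2.192/26")

# Hypothetical geofeed networks that both cover the target
candidates = [
	ipaddress.ip_network("192.0.2.0/24"),    # the whole allocation
	ipaddress.ip_network("192.0.2.128/25"),  # a more specific entry
]

covering = [n for n in candidates if target.subnet_of(n)]
best = max(covering, key=lambda n: n.prefixlen)  # mirrors ORDER BY masklen(...) DESC LIMIT 1

print(best)
# 192.0.2.128/25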
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index f8d2dc8..5759e0c 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -207,7 +207,7 @@ class CLI(object):
 			CREATE INDEX IF NOT EXISTS geofeed_networks_geofeed_id
 				ON geofeed_networks(geofeed_id);
 			CREATE INDEX IF NOT EXISTS geofeed_networks_search
-				ON geofeed_networks(network);
+				ON geofeed_networks USING GIST(network inet_ops);
 			CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
 			CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
 				ON network_geofeeds(network);
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 5759e0c..7e2136e 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -211,6 +211,10 @@ class CLI(object):
 			CREATE TABLE IF NOT EXISTS network_geofeeds(network inet, url text);
 			CREATE UNIQUE INDEX IF NOT EXISTS network_geofeeds_unique
 				ON network_geofeeds(network);
+			CREATE INDEX IF NOT EXISTS network_geofeeds_search
+				ON network_geofeeds USING GIST(network inet_ops);
+			CREATE INDEX IF NOT EXISTS network_geofeeds_url
+				ON network_geofeeds(url);
 			-- overrides
 			CREATE TABLE IF NOT EXISTS autnum_overrides(
Only RIPE has a dedicated geofeed field. For others, this data needs to be read from the remarks section.
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 7e2136e..0e2764e 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -863,8 +863,8 @@ class CLI(object):
 				inetnum["geofeed"] = val
 
 			# Parse geofeed when used as a remark
-			elif key == "remark":
-				m = re.match(r"^(?:geofeed|Geofeed)\s+(https://.*)", val)
+			elif key == "remarks":
+				m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
 				if m:
 					inetnum["geofeed"] = m.group(1)
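With the tightened pattern, only the capitalised spelling is accepted; a quick check with hypothetical values:

import re

for val in ("Geofeed https://example.com/feed.csv",
		"geofeed https://example.com/feed.csv"):
	m = re.match(r"^(?:Geofeed)\s+(https://.*)", val)
	print(val.split()[0], "->", "match" if m else "no match")

# Geofeed -> match
# geofeed -> no match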
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index 0e2764e..d0384b5 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -894,6 +894,11 @@ class CLI(object):
 				# Update any geofeed information
 				geofeed = inetnum.get("geofeed", None)
 
+				# Make sure that this is a HTTPS URL
+				if geofeed and not geofeed.startswith("https://"):
+					log.warning("Geofeed URL is not using HTTPS: %s" % geofeed)
+					geofeed = None
+
 				# Store/update any geofeeds
 				if geofeed:
 					self.db.execute("""
Signed-off-by: Michael Tremer <michael.tremer@ipfire.org>
---
 src/scripts/location-importer.in | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/scripts/location-importer.in b/src/scripts/location-importer.in
index d0384b5..d8f9cbf 100644
--- a/src/scripts/location-importer.in
+++ b/src/scripts/location-importer.in
@@ -1421,7 +1421,15 @@ class CLI(object):
 							log.debug("Could not parse network: %s" % network)
 							continue
 
-						# XXX Check the country code
+						# Strip any excess whitespace from country codes
+						if country:
+							country = country.strip()
+
+						# Check the country code
+						if not location.country_code_is_valid(country):
+							log.warning("Invalid country code in Geofeed %s: %s" \
+								% (geofeed.url, country))
+							continue
 
 						# Write this into the database
 						self.db.execute("""
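For reference, a hypothetical stand-in for the validity check used above; the real implementation lives in the location module, so this is only a sketch assuming that a valid code is two uppercase ASCII letters (ISO 3166-1 alpha-2):

def country_code_is_valid(country):
	# Assumption: ISO 3166-1 alpha-2, i.e. exactly two uppercase ASCII letters
	return len(country) == 2 and country.isascii() and country.isalpha() and country.isupper()

print(country_code_is_valid("DE"))   # True
print(country_code_is_valid("usa"))  # False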
Hello Michael,
above all, thank you very much for the patchset and all the work behind it.
Unfortunately, as briefly discussed on the phone already, I have some general concerns regarding geofeeds:
(a) In contrast to RIRs, I do not see geofeed providers as a trustworthy source. While the former are not trustworthy in terms of the data they provide (since no vetting or QA of database changes is usually conducted, and it does not look to me like this is going to change soon), at least their infrastructure is: It seems reasonable to me to trust, for example, RIPE's FTP server to serve the same database files regardless of the client requesting them. For some of them, we could even verify that through file signature validation, assuming that live GPG-signing at scale is too costly.
Geofeed URLs, in contrast, can lead anywhere, and I would not be surprised at all to see dubious ISPs serving different geofeeds to different clients. Given that our IP address ranges are public and static, and libloc reveals itself through the User-Agent HTTP header, it would be quite easy to serve us a geofeed that tampers with the data while playing innocent to other clients.
In addition, many of the 215 geofeed URLs that are currently live (attached) point to services such as Google Docs or GitHub, neither of which strikes me as a reliable source in terms of persistence. Generally, we have the full problem of URL/domain rot again. :-(
One could argue that these points (to a certain extent) hold true for RIRs as well. However, if we cannot trust them, it's curtains for libloc either way. :-) Some random ISPs trying to make us consume geolocation data from arbitrary URLs, on the other hand, pose a greater risk than benefit to the quality of the location database.
Which brings me directly to the next point...
(b) Presuming we still agree on not being more precise than /24 or /48, all the information geofeeds provide could (should?) have been in the RIR databases as well.
The only exception is ARIN, but since we do not get their raw database, we won't be able to consume any geofeed URLs in it. So, in the area where we lack accurate geolocation information the most, geofeeds won't help us. And for all the other RIRs (LACNIC included, for which we already process an additional geolocation database feed), the geofeeds ideally should not contain any information that is new to us.
Earlier today, I created a location database text dump on location02 with and without the geofeed patchset applied. The diff can be retrieved from https://people.ipfire.org/~pmueller/location-database-geofeed-diff.tar.gz, and is rather massive, partly because CIDRs smaller than /24 and /48, respectively, are not yet ignored by the geofeed processing routines.
I have yet to assess the diff closely, but from a superficial analysis, it appears that geofeeds introduce a lot of changes that could have been in the respective RIR databases as well. The fact that they are not there does not inspire confidence.
Apologies for this rather disappointing feedback, and best regards,
Peter Müller
Hello Peter,
On 28 Oct 2022, at 21:29, Peter Müller <peter.mueller@ipfire.org> wrote:

> Hello Michael,
>
> above all, thank you very much for the patchset and all the work behind it.
>
> Unfortunately, as briefly discussed on the phone already, I have some general concerns regarding geofeeds:
>
> (a) In contrast to RIRs, I do not see geofeed providers as a trustworthy source. While the former are not trustworthy in terms of the data they provide (since no vetting or QA of database changes is usually conducted, and it does not look to me like this is going to change soon), at least their infrastructure is: It seems reasonable to me to trust, for example, RIPE's FTP server to serve the same database files regardless of the client requesting them. For some of them, we could even verify that through file signature validation, assuming that live GPG-signing at scale is too costly.
>
> Geofeed URLs, in contrast, can lead anywhere, and I would not be surprised at all to see dubious ISPs serving different geofeeds to different clients. Given that our IP address ranges are public and static, and libloc reveals itself through the User-Agent HTTP header, it would be quite easy to serve us a geofeed that tampers with the data while playing innocent to other clients.
>
> In addition, many of the 215 geofeed URLs that are currently live (attached) point to services such as Google Docs or GitHub, neither of which strikes me as a reliable source in terms of persistence. Generally, we have the full problem of URL/domain rot again. :-(
>
> One could argue that these points (to a certain extent) hold true for RIRs as well. However, if we cannot trust them, it's curtains for libloc either way. :-) Some random ISPs trying to make us consume geolocation data from arbitrary URLs, on the other hand, pose a greater risk than benefit to the quality of the location database.
I see your point, but I disagree.
The RIR databases are self-assessed, too. People can put whatever they want in there, and it is not checked by anyone.
The only thing in favour of your argument is that there is a better paper trail of changes than with geofeeds. Geofeeds can be changed at will, or even randomly generated. But I believe that in both cases we have no chance of verifying the data.
Malicious players will fake their location even in the RIR databases.
What I would suggest as a minimum is to select at least a couple of “trusted” or very large sources that we maintain manually. There are a couple of cloud providers which use Geofeeds and we would quite likely improve the quality of the data for them.
> Which brings me directly to the next point...
>
> (b) Presuming we still agree on not being more precise than /24 or /48, all the information geofeeds provide could (should?) have been in the RIR databases as well.
>
> The only exception is ARIN, but since we do not get their raw database, we won't be able to consume any geofeed URLs in it. So, in the area where we lack accurate geolocation information the most, geofeeds won't help us. And for all the other RIRs (LACNIC included, for which we already process an additional geolocation database feed), the geofeeds ideally should not contain any information that is new to us.
Why should we not process anything smaller than those prefixes? It wouldn’t hurt us at all.
> Earlier today, I created a location database text dump on location02 with and without the geofeed patchset applied. The diff can be retrieved from https://people.ipfire.org/~pmueller/location-database-geofeed-diff.tar.gz, and is rather massive, partly because CIDRs smaller than /24 and /48, respectively, are not yet ignored by the geofeed processing routines.
>
> I have yet to assess the diff closely, but from a superficial analysis, it appears that geofeeds introduce a lot of changes that could have been in the respective RIR databases as well. The fact that they are not there does not inspire confidence.
>
> Apologies for this rather disappointing feedback, and best regards,
> Peter Müller
>
> <20221028_live_geofeeds.txt>
Well, I don’t think this is disappointing. Technically I suspect that you are happy with the code.
We now just need to figure out where to use it and where to not use it.
Best,
-Michael