From mboxrd@z Thu Jan 1 00:00:00 1970 From: Michael Tremer To: location@lists.ipfire.org Subject: Re: [PATCH] location-importer: Replace ARIN AS names source with one that offers human-readable names Date: Wed, 13 Dec 2023 11:45:29 +0000 Message-ID: <9FB33B4E-2CD7-4804-83F9-BDA2622FB346@ipfire.org> In-Reply-To: MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="===============2240467331246103136==" List-Id: --===============2240467331246103136== Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Hello Peter, > On 10 Dec 2023, at 19:37, Peter M=C3=BCller wr= ote: >=20 > This patch replaces our previous source for AS names in ARIN's realms > with another file provided by ARIN that contains human-readable names > for organizations ASNs have been allocated to. >=20 > Please note that a >=20 > TRUNCATE autnums; >=20 > is necessary on machines previously running the old version of > location-importer, in order to make use of this changed data source. It would be great if we could avoid any manual interaction in the future... > Signed-off-by: Peter M=C3=BCller > --- > src/scripts/location-importer.in | 47 ++++++++++++++++++-------------- > 1 file changed, 27 insertions(+), 20 deletions(-) >=20 > diff --git a/src/scripts/location-importer.in b/src/scripts/location-import= er.in > index 28a4f6c..96b3a20 100644 > --- a/src/scripts/location-importer.in > +++ b/src/scripts/location-importer.in > @@ -3,7 +3,7 @@ > # = # > # libloc - A library to determine the location of someone on the Internet = # > # = # > -# Copyright (C) 2020-2022 IPFire Development Team = # > +# Copyright (C) 2020-2023 IPFire Development Team = # > # = # > # This library is free software; you can redistribute it and/or = # > # modify it under the terms of the GNU Lesser General Public = # > @@ -19,6 +19,7 @@ >=20 > import argparse > import concurrent.futures > +import csv > import http.client > import ipaddress > import json > @@ -1033,36 +1034,42 @@ class CLI(object): > def _import_as_names_from_arin(self): > downloader =3D location.importer.Downloader() >=20 > - # XXX: Download AS names file from ARIN (note that these names appear to = be quite > - # technical, not intended for human consumption, as description fields in > - # organisation handles for other RIRs are - however, this is what we have= got, > - # and in some cases, it might be still better than nothing) > - for line in downloader.request_lines("https://ftp.arin.net/info/asn.txt"): > - # Valid lines start with a space, followed by the number of the Autonomou= s System ... > - if not line.startswith(" "): > + # Download AS names file from ARIN and load it into CSV parser > + for line in downloader.request_lines("https://ftp.arin.net/pub/resource_r= egistry_service/asns.csv"): > + > + # Valid lines start with a " ... > + if not line.startswith("\""): > continue I would prefer trying to parse first, because it could still be valid CSV eve= n without the quotes. > # Split line and check if there is a valid ASN in it... > - asn, name =3D line.split()[0:2] > + for row in csv.reader([line]): > + orgname =3D row[0] > + orghandle =3D row[1] > + firstasn =3D row[3] > + lastasn =3D row[4] This works, but since I like to give lessons how to write code in a more read= able Python fashion, I would have written this as follow: orgname, orghandle, _, firstasn, lastasn =3D row In case you really cannot rely on the number of fields, wrap this around a tr= y/except block that catches any lines that are shorter. We should expect that= any additional fields are being added at any time. > try: > - asn =3D int(asn) > + firstasn =3D int(firstasn.strip("\"")) > + lastasn =3D int(lastasn.strip("\"")) If you have to strip the quotes, you probably are using the CSV parser with a= n incorrect configuration. You might need to register a new dialect like so: csv.register_dialect(=E2=80=9Carin", delimiter=3D=E2=80=9C,", quoting=3Dcsv= .QUOTE_ALL, quotechar=3D"\=E2=80=9D") You can then initialise the reader like so: reader =3D csv.DictReader(f, dialect=3D=E2=80=9Carin=E2=80=9D) If you are feeling really fancy and want to avoid parsing fields in a complic= ated way, you can even populate the fieldnames parameter. > except ValueError: > - log.debug("Skipping ARIN AS names line not containing an integer for ASN") > + log.debug("Skipping ARIN AS names line not containing valid integers for = ASN") > continue >=20 > # Filter invalid ASNs... > - if not self._check_parsed_asn(asn): > + if not self._check_parsed_asn(firstasn): > continue >=20 > - # Skip any AS name that appears to be a placeholder for a different RIR o= r entity... > - if re.match(r"^(ASN-BLK|)(AFCONC|AFRINIC|APNIC|ASNBLK|LACNIC|RIPE|IANA)(?= :\d?$|\-)", name): > + if firstasn > lastasn: > + continue > + > + # Filter any bulk AS assignments, since these are present for other RIRs = where > + # we get better data from elsewhere. > + if not firstasn =3D=3D lastasn: > continue >=20 > - # Bail out in case the AS name contains anything we do not expect here... > - if re.search(r"[^a-zA-Z0-9-_]", name): > - log.debug("Skipping ARIN AS name for %s containing invalid characters: %s= " % \ > - (asn, name)) > + # Skip any AS name that appears to be a placeholder for a different RIR o= r entity... > + if re.match(r"^(AFRINIC|APNIC|LACNIC|RIPE)$", orghandle.strip("\"")): > + continue This looks all okay. > # Things look good here, run INSERT statement and skip this one if we alrea= dy have > # a (better?) name for this Autonomous System... > @@ -1073,8 +1080,8 @@ class CLI(object): > source > ) VALUES (%s, %s, %s) > ON CONFLICT (number) DO NOTHING""", > - asn, > - name, > + firstasn, > + orgname.strip("\""), With the fixed CSV parser, you will not need to strip here. This also has the= downside of removing any quotes in the middle of the string. Not sure whethe= r that is a likely case, but nevertheless, we should use the data that we rec= eive unchanged unless we have a strong reason to change anything here. Best, -Michael > "ARIN", > ) >=20 > --=20 > 2.35.3 --===============2240467331246103136==--