From mboxrd@z Thu Jan 1 00:00:00 1970 From: Michael Tremer To: location@lists.ipfire.org Subject: Re: [PATCH] location-importer.in: Conduct sanity checks per DROP list Date: Fri, 19 Aug 2022 09:31:37 +0100 Message-ID: In-Reply-To: MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="===============4199211431275149429==" List-Id: --===============4199211431275149429== Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Hello Peter, > On 18 Aug 2022, at 22:42, Peter M=C3=BCller wr= ote: >=20 > Previously, the lack of distinction between different DROP lists caused > only the last one to be persisted. >=20 > Signed-off-by: Peter M=C3=BCller > --- > src/scripts/location-importer.in | 58 +++++++++++++++++++++----------- > 1 file changed, 39 insertions(+), 19 deletions(-) >=20 > diff --git a/src/scripts/location-importer.in b/src/scripts/location-import= er.in > index 8d47497..e4a9ab0 100644 > --- a/src/scripts/location-importer.in > +++ b/src/scripts/location-importer.in > @@ -1427,18 +1427,21 @@ class CLI(object): > def _update_overrides_for_spamhaus_drop(self): > downloader =3D location.importer.Downloader() >=20 > - ip_urls =3D [ > - "https://www.spamhaus.org/drop/drop.txt", > - "https://www.spamhaus.org/drop/edrop.txt", > - "https://www.spamhaus.org/drop/dropv6.txt" > + ip_lists =3D [ > + ("Spamhaus DROP list", "https://www.spamhaus.org/drop/drop.txt"), > + ("Spamhaus EDROP list", "https://www.spamhaus.org/drop/edrop.txt"), > + ("Spamhaus DROPv6 list", "https://www.spamhaus.org/drop/dropv6.txt") > ] Would it not be better to have this as a dict? Or if you prefer something to iterate over, a tuple? Would it not be better/easier if we would use a handle like =E2=80=9CSPAMHAUS= -DROP=E2=80=9D, =E2=80=9CSPAMHAUS-EDROP=E2=80=9D, =E2=80=A6 instead of a stri= ng with spaces? It does not matter that much to me, but since this is a machi= ne-readable string, should it not be less human formatted? >=20 > - asn_urls =3D [ > - "https://www.spamhaus.org/drop/asndrop.txt" > + asn_lists =3D [ > + ("Spamhaus ASN-DROP list", "https://www.spamhaus.org/drop/asndrop.txt= ") > ] >=20 > - for url in ip_urls: > - # Fetch IP list > + for list in ip_lists: > + name =3D list[0] > + url =3D list[1] It would be more Pythonic if you wrote it like this: for name, url in ip_lists: ... > + > + # Fetch IP list from given URL > f =3D downloader.retrieve(url) >=20 > # Split into lines > @@ -1448,11 +1451,11 @@ class CLI(object): > # downloads. > if len(fcontent) > 10: > self.db.execute(""" > - DELETE FROM autnum_overrides WHERE source =3D 'Spamhaus ASN-DROP list= '; > - DELETE FROM network_overrides WHERE source =3D 'Spamhaus DROP lists'; > - """) > + DELETE FROM network_overrides WHERE source =3D '%s'; No need for the =E2=80=98=E2=80=99 around the %s. > + """ % name, > + ) > else: > - log.error("Spamhaus DROP URL %s returned likely bogus file, ignored" %= url) > + log.error("%s (%s) returned likely bogus file, ignored" % (name, url)) > continue >=20 > # Iterate through every line, filter comments and add remaining networks= to > @@ -1475,8 +1478,8 @@ class CLI(object): >=20 > # Sanitize parsed networks... > if not self._check_parsed_network(network): > - log.warning("Skipping bogus network found in Spamhaus DROP URL %s: %= s" % \ > - (url, network)) > + log.warning("Skipping bogus network found in %s (%s): %s" % \ > + (name, url, network)) > continue >=20 > # Conduct SQL statement... > @@ -1488,14 +1491,31 @@ class CLI(object): > ) VALUES (%s, %s, %s) > ON CONFLICT (network) DO UPDATE SET is_drop =3D True""", > "%s" % network, > - "Spamhaus DROP lists", > + name, > True > ) >=20 > - for url in asn_urls: > + for list in asn_lists: > + name =3D list[0] > + url =3D list[1] > + See above. > # Fetch URL > f =3D downloader.retrieve(url) >=20 > + # Split into lines > + fcontent =3D f.readlines() > + > + # Conduct a very basic sanity check to rule out CDN issues causing bogu= s DROP > + # downloads. > + if len(fcontent) > 10: > + self.db.execute(""" > + DELETE FROM autnum_overrides WHERE source =3D '%s'; > + """ % name, > + ) > + else: > + log.error("%s (%s) returned likely bogus file, ignored" % (name, url)) > + continue Is there a reason why this DELETE statement is not part of the transaction? I= f something goes wrong later on, you would have lost the previous data. Would it also not be programmatically better to start the import and then cou= nt how many lines you are importing? The =E2=80=9Cf.readlines()=E2=80=9D call is something that I would consider a= bit ugly. > + > # Iterate through every line, filter comments and add remaining ASNs to > # the override table in case they are valid... > with self.db.transaction(): > @@ -1518,8 +1538,8 @@ class CLI(object): >=20 > # Filter invalid ASNs... > if not self._check_parsed_asn(asn): > - log.warning("Skipping bogus ASN found in Spamhaus DROP URL %s: %s" %= \ > - (url, asn)) > + log.warning("Skipping bogus ASN found in %s (%s): %s" % \ > + (name, url, asn)) > continue >=20 > # Conduct SQL statement... > @@ -1531,7 +1551,7 @@ class CLI(object): > ) VALUES (%s, %s, %s) > ON CONFLICT (number) DO UPDATE SET is_drop =3D True""", > "%s" % asn, > - "Spamhaus ASN-DROP list", > + name, > True > ) >=20 > --=20 > 2.35.3 --===============4199211431275149429==--