From mboxrd@z Thu Jan 1 00:00:00 1970 From: Peter =?utf-8?q?M=C3=BCller?= To: location@lists.ipfire.org Subject: Re: [PATCH] location-importer.in: Conduct sanity checks per DROP list Date: Mon, 26 Sep 2022 18:26:17 +0000 Message-ID: <87682839-dc30-ac0e-6bb1-ec705bdc793e@ipfire.org> In-Reply-To: MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="===============4995535731229145047==" List-Id: --===============4995535731229145047== Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Hello Michael, better late than never: Thanks for your reply and the comments. > Hello Peter, >=20 >> On 18 Aug 2022, at 22:42, Peter M=C3=BCller w= rote: >> >> Previously, the lack of distinction between different DROP lists caused >> only the last one to be persisted. >> >> Signed-off-by: Peter M=C3=BCller >> --- >> src/scripts/location-importer.in | 58 +++++++++++++++++++++----------- >> 1 file changed, 39 insertions(+), 19 deletions(-) >> >> diff --git a/src/scripts/location-importer.in b/src/scripts/location-impor= ter.in >> index 8d47497..e4a9ab0 100644 >> --- a/src/scripts/location-importer.in >> +++ b/src/scripts/location-importer.in >> @@ -1427,18 +1427,21 @@ class CLI(object): >> def _update_overrides_for_spamhaus_drop(self): >> downloader =3D location.importer.Downloader() >> >> - ip_urls =3D [ >> - "https://www.spamhaus.org/drop/drop.txt", >> - "https://www.spamhaus.org/drop/edrop.txt", >> - "https://www.spamhaus.org/drop/dropv6.txt" >> + ip_lists =3D [ >> + ("Spamhaus DROP list", "https://www.spamhaus.org/drop/drop.txt"), >> + ("Spamhaus EDROP list", "https://www.spamhaus.org/drop/edrop.txt"), >> + ("Spamhaus DROPv6 list", "https://www.spamhaus.org/drop/dropv6.txt") >> ] >=20 > Would it not be better to have this as a dict? >=20 > Or if you prefer something to iterate over, a tuple? >=20 > Would it not be better/easier if we would use a handle like =E2=80=9CSPAMHA= US-DROP=E2=80=9D, =E2=80=9CSPAMHAUS-EDROP=E2=80=9D, =E2=80=A6 instead of a st= ring with spaces? It does not matter that much to me, but since this is a mac= hine-readable string, should it not be less human formatted? Hm, good point. I have changed that in the upcoming second version of the pat= ch. >=20 >> >> - asn_urls =3D [ >> - "https://www.spamhaus.org/drop/asndrop.txt" >> + asn_lists =3D [ >> + ("Spamhaus ASN-DROP list", "https://www.spamhaus.org/drop/asndrop.tx= t") >> ] >> >> - for url in ip_urls: >> - # Fetch IP list >> + for list in ip_lists: >> + name =3D list[0] >> + url =3D list[1] >=20 > It would be more Pythonic if you wrote it like this: >=20 > for name, url in ip_lists: > ... >=20 >> + >> + # Fetch IP list from given URL >> f =3D downloader.retrieve(url) >> >> # Split into lines >> @@ -1448,11 +1451,11 @@ class CLI(object): >> # downloads. >> if len(fcontent) > 10: >> self.db.execute(""" >> - DELETE FROM autnum_overrides WHERE source =3D 'Spamhaus ASN-DROP lis= t'; >> - DELETE FROM network_overrides WHERE source =3D 'Spamhaus DROP lists'; >> - """) >> + DELETE FROM network_overrides WHERE source =3D '%s'; >=20 > No need for the =E2=80=98=E2=80=99 around the %s. Yes there is, the SQL command results in an exception otherwise. Second version of the patch is coming right up. Thanks, and best regards, Peter M=C3=BCller >=20 >> + """ % name, >> + ) >> else: >> - log.error("Spamhaus DROP URL %s returned likely bogus file, ignored" = % url) >> + log.error("%s (%s) returned likely bogus file, ignored" % (name, url)) >> continue >> >> # Iterate through every line, filter comments and add remaining network= s to >> @@ -1475,8 +1478,8 @@ class CLI(object): >> >> # Sanitize parsed networks... >> if not self._check_parsed_network(network): >> - log.warning("Skipping bogus network found in Spamhaus DROP URL %s: = %s" % \ >> - (url, network)) >> + log.warning("Skipping bogus network found in %s (%s): %s" % \ >> + (name, url, network)) >> continue >> >> # Conduct SQL statement... >> @@ -1488,14 +1491,31 @@ class CLI(object): >> ) VALUES (%s, %s, %s) >> ON CONFLICT (network) DO UPDATE SET is_drop =3D True""", >> "%s" % network, >> - "Spamhaus DROP lists", >> + name, >> True >> ) >> >> - for url in asn_urls: >> + for list in asn_lists: >> + name =3D list[0] >> + url =3D list[1] >> + >=20 > See above. >=20 >> # Fetch URL >> f =3D downloader.retrieve(url) >> >> + # Split into lines >> + fcontent =3D f.readlines() >> + >> + # Conduct a very basic sanity check to rule out CDN issues causing bog= us DROP >> + # downloads. >> + if len(fcontent) > 10: >> + self.db.execute(""" >> + DELETE FROM autnum_overrides WHERE source =3D '%s'; >> + """ % name, >> + ) >> + else: >> + log.error("%s (%s) returned likely bogus file, ignored" % (name, url)) >> + continue >=20 > Is there a reason why this DELETE statement is not part of the transaction?= If something goes wrong later on, you would have lost the previous data. >=20 > Would it also not be programmatically better to start the import and then c= ount how many lines you are importing? >=20 > The =E2=80=9Cf.readlines()=E2=80=9D call is something that I would consider= a bit ugly. >=20 >> + >> # Iterate through every line, filter comments and add remaining ASNs to >> # the override table in case they are valid... >> with self.db.transaction(): >> @@ -1518,8 +1538,8 @@ class CLI(object): >> >> # Filter invalid ASNs... >> if not self._check_parsed_asn(asn): >> - log.warning("Skipping bogus ASN found in Spamhaus DROP URL %s: %s" = % \ >> - (url, asn)) >> + log.warning("Skipping bogus ASN found in %s (%s): %s" % \ >> + (name, url, asn)) >> continue >> >> # Conduct SQL statement... >> @@ -1531,7 +1551,7 @@ class CLI(object): >> ) VALUES (%s, %s, %s) >> ON CONFLICT (number) DO UPDATE SET is_drop =3D True""", >> "%s" % asn, >> - "Spamhaus ASN-DROP list", >> + name, >> True >> ) >> >> --=20 >> 2.35.3 >=20 --===============4995535731229145047==--