public inbox for location@lists.ipfire.org
 help / color / mirror / Atom feed
From: "Peter Müller" <peter.mueller@ipfire.org>
To: location@lists.ipfire.org
Subject: Re: [PATCH 1/2] location-importer.in: keep track of sources for networks, ASNs, and organisations
Date: Fri, 21 May 2021 12:00:50 +0200	[thread overview]
Message-ID: <97efc87c-44fd-836e-3ad9-e70a37a197a1@ipfire.org> (raw)
In-Reply-To: <20210515115705.9794-1-peter.mueller@ipfire.org>

[-- Attachment #1: Type: text/plain, Size: 10916 bytes --]

Hello *,

As this patch would break existing SQL tables as well, I will rework it and hand in a
second version.

Thanks, and best regards,
Peter Müller

> This allows us to trace back concrete changes or anomalies to their RIR
> source, without having to parse everything again. Further, it enables
> adding 3rd party sources such as IP feeds from Amazon, without losing
> track of the changes introduced by them.
> 
> Depending on the individual systems, it might be necessary to DROP the
> tables for autnums and networks first.
> 
> Signed-off-by: Peter Müller <peter.mueller(a)ipfire.org>
> ---
>  src/python/location-importer.in | 93 +++++++++++++++++----------------
>  1 file changed, 49 insertions(+), 44 deletions(-)
> 
> diff --git a/src/python/location-importer.in b/src/python/location-importer.in
> index e5f55af..fd2bde1 100644
> --- a/src/python/location-importer.in
> +++ b/src/python/location-importer.in
> @@ -155,7 +155,7 @@ class CLI(object):
>  				CREATE INDEX IF NOT EXISTS announcements_search ON announcements USING GIST(network inet_ops);
>  
>  				-- autnums
> -				CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL);
> +				CREATE TABLE IF NOT EXISTS autnums(number bigint, name text NOT NULL, source text NOT NULL);
>  				CREATE UNIQUE INDEX IF NOT EXISTS autnums_number ON autnums(number);
>  
>  				-- countries
> @@ -164,7 +164,7 @@ class CLI(object):
>  				CREATE UNIQUE INDEX IF NOT EXISTS countries_country_code ON countries(country_code);
>  
>  				-- networks
> -				CREATE TABLE IF NOT EXISTS networks(network inet, country text);
> +				CREATE TABLE IF NOT EXISTS networks(network inet, country text, source text NOT NULL);
>  				CREATE UNIQUE INDEX IF NOT EXISTS networks_network ON networks(network);
>  				CREATE INDEX IF NOT EXISTS networks_family ON networks USING BTREE(family(network));
>  				CREATE INDEX IF NOT EXISTS networks_search ON networks USING GIST(network inet_ops);
> @@ -369,15 +369,15 @@ class CLI(object):
>  		with self.db.transaction():
>  			# Create some temporary tables to store parsed data
>  			self.db.execute("""
> -				CREATE TEMPORARY TABLE _autnums(number integer, organization text)
> +				CREATE TEMPORARY TABLE _autnums(number integer, organization text, source text NOT NULL)
>  					ON COMMIT DROP;
>  				CREATE UNIQUE INDEX _autnums_number ON _autnums(number);
>  
> -				CREATE TEMPORARY TABLE _organizations(handle text, name text NOT NULL)
> +				CREATE TEMPORARY TABLE _organizations(handle text, name text NOT NULL, source text NOT NULL)
>  					ON COMMIT DROP;
>  				CREATE UNIQUE INDEX _organizations_handle ON _organizations(handle);
>  
> -				CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL)
> +				CREATE TEMPORARY TABLE _rirdata(network inet NOT NULL, country text NOT NULL, source text NOT NULL)
>  					ON COMMIT DROP;
>  				CREATE INDEX _rirdata_search ON _rirdata USING BTREE(family(network), masklen(network));
>  				CREATE UNIQUE INDEX _rirdata_network ON _rirdata(network);
> @@ -395,10 +395,11 @@ class CLI(object):
>  			for row in rows:
>  				validcountries.append(row.country_code)
>  
> -			for source in location.importer.WHOIS_SOURCES:
> -				with downloader.request(source, return_blocks=True) as f:
> -					for block in f:
> -						self._parse_block(block, validcountries)
> +			for source_key in location.importer.WHOIS_SOURCES:
> +				for single_url in location.importer.WHOIS_SOURCES[source_key]:
> +					with downloader.request(single_url, return_blocks=True) as f:
> +						for block in f:
> +							self._parse_block(block, source_key, validcountries)
>  
>  			# Process all parsed networks from every RIR we happen to have access to,
>  			# insert the largest network chunks into the networks table immediately...
> @@ -407,8 +408,8 @@ class CLI(object):
>  			for family in (row.family for row in families):
>  				smallest = self.db.get("SELECT MIN(masklen(network)) AS prefix FROM _rirdata WHERE family(network) = %s", family)
>  
> -				self.db.execute("INSERT INTO networks(network, country) \
> -					SELECT network, country FROM _rirdata WHERE masklen(network) = %s AND family(network) = %s", smallest.prefix, family)
> +				self.db.execute("INSERT INTO networks(network, country, source) \
> +					SELECT network, country, source FROM _rirdata WHERE masklen(network) = %s AND family(network) = %s", smallest.prefix, family)
>  
>  				# ... determine any other prefixes for this network family, ...
>  				prefixes = self.db.query("SELECT DISTINCT masklen(network) AS prefix FROM _rirdata \
> @@ -421,7 +422,8 @@ class CLI(object):
>  						WITH candidates AS (
>  							SELECT
>  								_rirdata.network,
> -								_rirdata.country
> +								_rirdata.country,
> +								_rirdata.source
>  							FROM
>  								_rirdata
>  							WHERE
> @@ -434,6 +436,7 @@ class CLI(object):
>  								DISTINCT ON (c.network)
>  								c.network,
>  								c.country,
> +								c.source,
>  								masklen(networks.network),
>  								networks.country AS parent_country
>  							FROM
> @@ -447,10 +450,11 @@ class CLI(object):
>  								masklen(networks.network) DESC NULLS LAST
>  						)
>  						INSERT INTO
> -							networks(network, country)
> +							networks(network, country, source)
>  						SELECT
>  							network,
> -							country
> +							country,
> +							source
>  						FROM
>  							filtered
>  						WHERE
> @@ -462,19 +466,20 @@ class CLI(object):
>  					)
>  
>  			self.db.execute("""
> -				INSERT INTO autnums(number, name)
> -					SELECT _autnums.number, _organizations.name FROM _autnums
> +				INSERT INTO autnums(number, name, source)
> +					SELECT _autnums.number, _organizations.name, _organizations.source FROM _autnums
>  						JOIN _organizations ON _autnums.organization = _organizations.handle
>  				ON CONFLICT (number) DO UPDATE SET name = excluded.name;
>  			""")
>  
>  		# Download all extended sources
> -		for source in location.importer.EXTENDED_SOURCES:
> -			with self.db.transaction():
> -				# Download data
> -				with downloader.request(source) as f:
> -					for line in f:
> -						self._parse_line(line, validcountries)
> +		for source_key in location.importer.EXTENDED_SOURCES:
> +			for single_url in location.importer.EXTENDED_SOURCES[source_key]:
> +				with self.db.transaction():
> +					# Download data
> +					with downloader.request(single_url) as f:
> +						for line in f:
> +							self._parse_line(line, source_key, validcountries)
>  
>  	def _check_parsed_network(self, network):
>  		"""
> @@ -539,23 +544,23 @@ class CLI(object):
>  		# be suitable for libloc consumption...
>  		return True
>  
> -	def _parse_block(self, block, validcountries = None):
> +	def _parse_block(self, block, source_key, validcountries = None):
>  		# Get first line to find out what type of block this is
>  		line = block[0]
>  
>  		# aut-num
>  		if line.startswith("aut-num:"):
> -			return self._parse_autnum_block(block)
> +			return self._parse_autnum_block(block, source_key)
>  
>  		# inetnum
>  		if line.startswith("inet6num:") or line.startswith("inetnum:"):
> -			return self._parse_inetnum_block(block, validcountries)
> +			return self._parse_inetnum_block(block, source_key, validcountries)
>  
>  		# organisation
>  		elif line.startswith("organisation:"):
> -			return self._parse_org_block(block)
> +			return self._parse_org_block(block, source_key)
>  
> -	def _parse_autnum_block(self, block):
> +	def _parse_autnum_block(self, block, source_key):
>  		autnum = {}
>  		for line in block:
>  			# Split line
> @@ -574,13 +579,13 @@ class CLI(object):
>  			return
>  
>  		# Insert into database
> -		self.db.execute("INSERT INTO _autnums(number, organization) \
> -			VALUES(%s, %s) ON CONFLICT (number) DO UPDATE SET \
> +		self.db.execute("INSERT INTO _autnums(number, organization, source) \
> +			VALUES(%s, %s, %s) ON CONFLICT (number) DO UPDATE SET \
>  				organization = excluded.organization",
> -			autnum.get("asn"), autnum.get("org"),
> +			autnum.get("asn"), autnum.get("org"), source_key,
>  		)
>  
> -	def _parse_inetnum_block(self, block, validcountries = None):
> +	def _parse_inetnum_block(self, block, source_key, validcountries = None):
>  		log.debug("Parsing inetnum block:")
>  
>  		inetnum = {}
> @@ -636,12 +641,12 @@ class CLI(object):
>  					break
>  
>  				# Everything is fine here, run INSERT statement...
> -				self.db.execute("INSERT INTO _rirdata(network, country) \
> -					VALUES(%s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
> -					"%s" % single_network, inetnum.get("country"),
> +				self.db.execute("INSERT INTO _rirdata(network, country, source) \
> +					VALUES(%s, %s, %s) ON CONFLICT (network) DO UPDATE SET country = excluded.country",
> +					"%s" % single_network, inetnum.get("country"), source_key,
>  				)
>  
> -	def _parse_org_block(self, block):
> +	def _parse_org_block(self, block, source_key):
>  		org = {}
>  		for line in block:
>  			# Split line
> @@ -656,13 +661,13 @@ class CLI(object):
>  		if not org:
>  			return
>  
> -		self.db.execute("INSERT INTO _organizations(handle, name) \
> -			VALUES(%s, %s) ON CONFLICT (handle) DO \
> +		self.db.execute("INSERT INTO _organizations(handle, name, source) \
> +			VALUES(%s, %s, %s) ON CONFLICT (handle) DO \
>  			UPDATE SET name = excluded.name",
> -			org.get("organisation"), org.get("org-name"),
> +			org.get("organisation"), org.get("org-name"), source_key,
>  		)
>  
> -	def _parse_line(self, line, validcountries = None):
> +	def _parse_line(self, line, source_key, validcountries = None):
>  		# Skip version line
>  		if line.startswith("2"):
>  			return
> @@ -689,9 +694,9 @@ class CLI(object):
>  			return
>  
>  		if type in ("ipv6", "ipv4"):
> -			return self._parse_ip_line(country_code, type, line)
> +			return self._parse_ip_line(country_code, type, line, source_key)
>  
> -	def _parse_ip_line(self, country, type, line):
> +	def _parse_ip_line(self, country, type, line, source_key):
>  		try:
>  			address, prefix, date, status, organization = line.split("|")
>  		except ValueError:
> @@ -729,10 +734,10 @@ class CLI(object):
>  		if not self._check_parsed_network(network):
>  			return
>  
> -		self.db.execute("INSERT INTO networks(network, country) \
> -			VALUES(%s, %s) ON CONFLICT (network) DO \
> +		self.db.execute("INSERT INTO networks(network, country, source) \
> +			VALUES(%s, %s, %s) ON CONFLICT (network) DO \
>  			UPDATE SET country = excluded.country",
> -			"%s" % network, country,
> +			"%s" % network, country, source_key,
>  		)
>  
>  	def handle_update_announcements(self, ns):
> 

      parent reply	other threads:[~2021-05-21 10:00 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-05-15 11:57 Peter Müller
2021-05-15 11:57 ` [PATCH 2/2] importer.py: add source information for RIR data feeds Peter Müller
2021-05-21 10:00 ` Peter Müller [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=97efc87c-44fd-836e-3ad9-e70a37a197a1@ipfire.org \
    --to=peter.mueller@ipfire.org \
    --cc=location@lists.ipfire.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox