From 0781559030d3226221e264eb969a523a1736c8d1 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:09:37 +0000 Subject: [PATCH 01/42] Update automation.sh Added compute_metrics.py to the pipeline and removed distribution.py (Bitcoin Tor nodes are now redistributed directly in parse.py) --- bitcoin/automation.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitcoin/automation.sh b/bitcoin/automation.sh index d010661..995e2f2 100644 --- a/bitcoin/automation.sh +++ b/bitcoin/automation.sh @@ -10,12 +10,12 @@ python3 cleanup_dead_nodes.py python3 collect_geodata.py #python3 collect_osdata.py # not in use python3 parse.py -python3 distribution.py python3 plot.py +python3 compute_metrics.py # The following 2 lines create a folder and move all png and csv files to it mkdir output/"$(date +%Y-%m-%d)" -mv -t output/"$(date +%Y-%m-%d)" output/*.png output/*.csv +mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip_type}_*.csv output/response_length.json 2>/dev/null || true sleep 7d # will repeat the whole process every X days From 900c1c3108b8577cf1552335e4d23e58fed40cb1 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:17:31 +0000 Subject: [PATCH 02/42] Implement Tor node redistribution in parse.py instead of distribution.py --- bitcoin/parse.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/bitcoin/parse.py b/bitcoin/parse.py index c008f90..7fa39c2 100644 --- a/bitcoin/parse.py +++ b/bitcoin/parse.py @@ -327,6 +327,46 @@ def version(reachable_nodes, mode): versions_df.to_csv(f'./output/{name.lower()}_{ledger}.csv', index_label = name) +def redistribute_tor_nodes(name, ledger, df, mode): + """ + Redistributes Tor node count proportionally across non-Tor rows. + :param name: lowercase version of the mode ('countries' or 'organizations') used in file naming. + :param ledger: the ledger name. + :param df: the dataframe in which the Tor nodes must be reditributed. + :param mode: the mode name (e.g., 'Countries', 'Organizations'). + """ + date = datetime.today().strftime('%Y-%m-%d') + tor_row = df[df[mode] == 'Tor'] + if tor_row.empty: + logging.info(f"No Tor nodes found in {ledger}.") + return + + number_of_tor_nodes = tor_row[date].values[0] # extract the number of Tor nodes for the given date + number_of_total_nodes_without_tor = df[df[f'{mode}'] != 'Tor'][f'{date}'].sum() # sum of node counts excluding the Tor row + number_of_total_nodes = number_of_total_nodes_without_tor + number_of_tor_nodes + df['Distribution'] = df.apply(lambda row: round((row[f'{date}'] / number_of_total_nodes_without_tor) * number_of_tor_nodes) if row[f'{mode}'] != 'Tor' else 0, axis=1) # create a new column 'Distribution' that distributes the Tor nodes proportionally to non-Tor rows + df[date] = df[date] + df['Distribution'] + df_without_tor = df[df[f'{mode}'] != 'Tor'] # filter out the Tor row + df_without_tor[[mode, date]].to_csv(f'./output/{name}_{ledger}_without_tor.csv', index=False) # save the updated DataFrame to a new CSV + + +def without_tor(): + """ + Loads a CSV file and calls the redistribute_tor_nodes function. + """ + ledger = 'bitcoin' + modes = ['Countries', 'Organizations'] + for mode in modes: + logging.info(f'parse.py: Removing Tor from {ledger} {mode}') + name = mode.lower() + filename = pathlib.Path(f'./output/{name}_{ledger}.csv') + if not filename.is_file(): + logging.warning(f"File not found: {filename}") + return None + df = pd.read_csv(filename) + redistribute_tor_nodes(name, ledger, df, mode) + + def cluster_organizations(ledger): """ Clusters organizations in CSV files. @@ -389,6 +429,8 @@ def main(): geography(reachable_nodes, ledger, mode) if 'Organizations' in MODES: cluster_organizations(ledger) + if 'bitcoin' in LEDGERS: + without_tor() if __name__ == '__main__': main() From 2ce349c74ae9316a68f3e8684f7e35dea4435763 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:18:04 +0000 Subject: [PATCH 03/42] Delete bitcoin/distribution.py Functions now implemented in parse.py --- bitcoin/distribution.py | 54 ----------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 bitcoin/distribution.py diff --git a/bitcoin/distribution.py b/bitcoin/distribution.py deleted file mode 100644 index 0504a24..0000000 --- a/bitcoin/distribution.py +++ /dev/null @@ -1,54 +0,0 @@ -# This script is only used for the Bitcoin ledger. If the Bitcoin ledger is not selected in the config.yaml file, this script does nothing. - -import network_decentralization.helper as hlp -import logging -import pandas as pd -from pathlib import Path - -def redistribute_tor_nodes(name, ledger, df, mode): - """ - Redistributes Tor node count proportionally across non-Tor rows. - :param name: lowercase version of the mode ('countries' or 'organizations') used in file naming. - :param ledger: the ledger name. - :param df: the dataframe in which the Tor nodes must be reditributed. - :param mode: Countries or Organizations. - """ - tor_row = df[df[mode] == 'Tor'] - if tor_row.empty: - logging.info(f"No Tor nodes found in {ledger}.") - return - - number_of_tor_nodes = tor_row[date].values[0] # extract the number of Tor nodes for the given date - number_of_total_nodes_without_tor = df[df[f'{mode}'] != 'Tor'][f'{date}'].sum() # sum of node counts excluding the Tor row - number_of_total_nodes = number_of_total_nodes_without_tor + number_of_tor_nodes - df['Distribution'] = df.apply(lambda row: round((row[f'{date}'] / number_of_total_nodes_without_tor) * number_of_tor_nodes) if row[f'{mode}'] != 'Tor' else 0, axis=1) # create a new column 'Distribution' that distributes the Tor nodes proportionally to non-Tor rows - df[date] = df[date] + df['Distribution'] - df_without_tor = df[df[f'{mode}'] != 'Tor'] # filter out the Tor row - df_without_tor[[mode, date]].to_csv(f'./output/{name}_{ledger}_without_tor.csv', index=False) # save the updated DataFrame to a new CSV - - -def without_tor(): - """ - Loads a CSV file and calls the redistribute_tor_nodes function. - """ - ledger = 'bitcoin' - for mode in MODES: - logging.info(f'distribution.py: Removing Tor from {ledger} {mode}') - name = mode.lower() - filename = Path(f'./output/{name}_{ledger}.csv') - if not filename.is_file(): - logging.warning(f"File not found: {filename}") - return None - df = pd.read_csv(filename) - redistribute_tor_nodes(name, ledger, df, mode) - -LEDGERS = hlp.get_ledgers() -MODES = hlp.get_mode() -date = hlp.get_date() - -def main(): - if 'bitcoin' in LEDGERS: - without_tor() - -if __name__ == '__main__': - main() From d981f6635bd0f7bf627e1da672e99b5a9badffe8 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:19:46 +0000 Subject: [PATCH 04/42] Add compute_metrics.py --- bitcoin/compute_metrics.py | 183 +++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 bitcoin/compute_metrics.py diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py new file mode 100644 index 0000000..06ef2aa --- /dev/null +++ b/bitcoin/compute_metrics.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Script to compute network decentralization metrics from CSV files in the output directory. +Processes both organization and country CSV files and outputs metrics in CSV format. +""" + +import csv +import pathlib +import sys +from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi +from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient +from network_decentralization.metrics.entropy import compute_entropy +from network_decentralization.metrics.concentration_ratio import compute_concentration_ratio + + +def read_csv_data(csv_path): + """ + Read CSV file and extract date and distribution values. + CSV format: Header row is "EntityType,YYYY-MM-DD" followed by data rows "entity_name,count" + + :param csv_path: Path to the CSV file + :return: Tuple of (date, sorted_distribution_list) + """ + with open(csv_path, 'r', encoding='utf-8') as f: + reader = csv.reader(f) + + # Read header + header = next(reader) + date = header[1] # Extract date from header + + # Read data rows and extract counts + distribution = [] + for row in reader: + if len(row) >= 2: + try: + count = int(row[1]) + distribution.append(count) + except (ValueError, IndexError): + continue + + # Sort in descending order for metric calculations + distribution.sort(reverse=True) + + return date, distribution + + +def get_ledger_name(csv_path): + """ + Extract ledger name from CSV filename. + Expected format: organizations_.csv or countries_.csv + + :param csv_path: Path to the CSV file + :return: Ledger name (e.g., 'bitcoin', 'bitcoin_cash') + """ + filename = csv_path.stem # Get filename without extension + parts = filename.split('_') + # Remove 'organizations' or 'countries' prefix + return '_'.join(parts[1:]) + + +def compute_metrics(distribution): + """ + Compute all metrics for a given distribution. + + :param distribution: Sorted list of entity counts (descending order) + :return: Dictionary with all computed metrics + """ + total = sum(distribution) + + if total == 0: + return { + 'hhi': None, + 'nakamoto': None, + 'entropy': None, + 'max_power_ratio': None + } + + metrics = { + 'hhi': compute_hhi(distribution), + 'nakamoto': compute_nakamoto_coefficient(distribution), + 'entropy': compute_entropy(distribution, alpha=1), # Shannon entropy + 'max_power_ratio': max(distribution) / total if distribution else 0 + } + + return metrics + + +def process_csv_files(output_dir, file_pattern, is_country=False): + """ + Process all CSV files matching a pattern and output metrics. + Appends results to existing files or creates new ones. + For bitcoin, uses the _without_tor versions if they exist. + + :param output_dir: Path to the output directory + :param file_pattern: Glob pattern for CSV files (e.g., 'organizations_*.csv') + :param is_country: Boolean to indicate if processing country files + """ + csv_files = sorted(output_dir.glob(file_pattern)) + + for csv_path in csv_files: + # Skip _without_tor files in the glob - we'll handle them explicitly for bitcoin + if '_without_tor' in csv_path.name: + continue + + try: + ledger = get_ledger_name(csv_path) + + # For bitcoin, check if _without_tor version exists and use that instead + if ledger == 'bitcoin': + file_type = 'countries' if is_country else 'organizations' + without_tor_path = output_dir / f"{file_type}_bitcoin_without_tor.csv" + if without_tor_path.exists(): + csv_path = without_tor_path + + date, distribution = read_csv_data(csv_path) + metrics = compute_metrics(distribution) + + # Determine output filename + if is_country: + output_filename = f"output_countries_{ledger}.csv" + output_path = output_dir / output_filename + file_exists = output_path.exists() + + # Write header and data (append if exists) + with open(output_path, 'a', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + if not file_exists: + writer.writerow(['ledger', 'date', 'clustering', 'entropy', 'hhi', 'nakamoto_coefficient', 'max_power_ratio']) + writer.writerow([ + ledger, + date, + 'False', + f"{metrics['entropy']:.15g}", + f"{metrics['hhi']:.16g}", + metrics['nakamoto'], + f"{metrics['max_power_ratio']:.16g}" + ]) + print(f"Appended to: {output_filename}", file=sys.stderr) + else: + output_filename = f"output_organizations_{ledger}.csv" + output_path = output_dir / output_filename + file_exists = output_path.exists() + + # Write header and data (append if exists) + with open(output_path, 'a', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + if not file_exists: + writer.writerow(['ledger', 'date', 'clustering', 'hhi', 'nakamoto_coefficient', 'max_power_ratio']) + writer.writerow([ + ledger, + date, + 'False', + f"{metrics['hhi']:.16g}", + metrics['nakamoto'], + f"{metrics['max_power_ratio']:.16g}" + ]) + print(f"Appended to: {output_filename}", file=sys.stderr) + + except Exception as e: + print(f"Error processing {csv_path.name}: {e}", file=sys.stderr) + continue + + +def main(): + """ + Main entry point for the script. + Processes organization and country CSV files from the output directory. + """ + output_dir = pathlib.Path(__file__).parent / 'output' + + if not output_dir.exists(): + print(f"Error: Output directory not found at {output_dir}", file=sys.stderr) + sys.exit(1) + + # Process organization files + process_csv_files(output_dir, 'organizations_*.csv', is_country=False) + + # Process country files + process_csv_files(output_dir, 'countries_*.csv', is_country=True) + + +if __name__ == '__main__': + main() From 046ac9b91e500851cf599ec8e5752b2efa4c8200 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:21:23 +0000 Subject: [PATCH 05/42] Create concentration_ratio.py --- .../metrics/concentration_ratio.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 bitcoin/network_decentralization/metrics/concentration_ratio.py diff --git a/bitcoin/network_decentralization/metrics/concentration_ratio.py b/bitcoin/network_decentralization/metrics/concentration_ratio.py new file mode 100644 index 0000000..8893cb2 --- /dev/null +++ b/bitcoin/network_decentralization/metrics/concentration_ratio.py @@ -0,0 +1,9 @@ +def compute_concentration_ratio(block_distribution, topn): + """ + Calculates the n-concentration ratio of a distribution of balances + :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :param topn: the number of top block producers to consider + :returns: float that represents the ratio of blocks produced by the top n block producers (0 if there weren't any) + """ + total_blocks = sum(block_distribution) + return sum(block_distribution[:topn]) / total_blocks if total_blocks else 0 From 33ab5ec8f911e3415fa069711d44aaf9fca4feec Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:21:50 +0000 Subject: [PATCH 06/42] Upload metrics files --- .../metrics/entropy.py | 48 +++++++++++++++++++ .../metrics/herfindahl_hirschman_index.py | 20 ++++++++ .../metrics/nakamoto_coefficient.py | 10 ++++ .../metrics/tau_index.py | 18 +++++++ .../metrics/total_entities.py | 7 +++ 5 files changed, 103 insertions(+) create mode 100644 bitcoin/network_decentralization/metrics/entropy.py create mode 100644 bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py create mode 100644 bitcoin/network_decentralization/metrics/nakamoto_coefficient.py create mode 100644 bitcoin/network_decentralization/metrics/tau_index.py create mode 100644 bitcoin/network_decentralization/metrics/total_entities.py diff --git a/bitcoin/network_decentralization/metrics/entropy.py b/bitcoin/network_decentralization/metrics/entropy.py new file mode 100644 index 0000000..56e1b84 --- /dev/null +++ b/bitcoin/network_decentralization/metrics/entropy.py @@ -0,0 +1,48 @@ +from math import log +from network_decentralization.metrics.total_entities import compute_total_entities + + +def compute_entropy(block_distribution, alpha): + """ + Calculates the entropy of a distribution of blocks to entities + Pi is the relative frequency of each entity. + Renyi entropy: 1/(1-alpha) * log2 (sum (Pi**alpha)) + Shannon entropy (alpha=1): −sum P(Si) log2 (Pi) + Min entropy (alpha=-1): -log max Pi + :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :param alpha: the entropy parameter (depending on its value the corresponding entropy measure is used) + :returns: a float that represents the entropy of the data or None if the data is empty + """ + all_blocks = sum(block_distribution) + if all_blocks == 0: + return None + if alpha == 1: + entropy = 0 + for value in block_distribution: + rel_freq = value / all_blocks + if rel_freq > 0: + entropy -= rel_freq * log(rel_freq, 2) + else: + if alpha == -1: + entropy = - log(max(block_distribution)/all_blocks, 2) + else: + sum_freqs = 0 + for entry in block_distribution: + sum_freqs += pow(entry/all_blocks, alpha) + entropy = log(sum_freqs, 2) / (1 - alpha) + + return entropy + + +def compute_max_entropy(num_entities, alpha): + return compute_entropy([1 for i in range(num_entities)], alpha) + + +def compute_entropy_percentage(block_distribution, alpha): + if sum(block_distribution) == 0: + return None + try: + total_entities = compute_total_entities(block_distribution) + return compute_entropy(block_distribution, alpha) / compute_max_entropy(total_entities, alpha) + except ZeroDivisionError: + return 0 diff --git a/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py b/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py new file mode 100644 index 0000000..de22b87 --- /dev/null +++ b/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py @@ -0,0 +1,20 @@ +def compute_hhi(block_distribution): + """ + Calculates the Herfindahl-Hirschman index of a distribution of blocks to entities + From investopedia: The HHI is calculated by squaring the market share of each firm competing in a market and then + summing the resulting numbers. It can range from close to 0 to 10,000, with lower values indicating a less + concentrated market. The U.S. Department of Justice considers a market with an HHI of less than 1,500 to be a + competitive marketplace, an HHI of 1,500 to 2,500 to be a moderately concentrated marketplace, + and an HHI of 2,500 or greater to be a highly concentrated marketplace. + :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :return: float between 0 and 10,000 that represents the HHI of the given distribution or None if the data is empty + """ + total_blocks = sum(block_distribution) + if total_blocks == 0: + return None + + hhi = 0 + for num_blocks in block_distribution: + hhi += pow(100 * num_blocks / total_blocks, 2) + + return hhi diff --git a/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py b/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py new file mode 100644 index 0000000..6f38992 --- /dev/null +++ b/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py @@ -0,0 +1,10 @@ +from network_decentralization.metrics.tau_index import compute_tau_index + + +def compute_nakamoto_coefficient(block_distribution): + """ + Calculates the Nakamoto coefficient of a distribution of blocks to entities + :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :returns: int that represents the Nakamoto coefficient of the given distribution, or None if the data is empty + """ + return compute_tau_index(block_distribution, 0.5) diff --git a/bitcoin/network_decentralization/metrics/tau_index.py b/bitcoin/network_decentralization/metrics/tau_index.py new file mode 100644 index 0000000..a87ae23 --- /dev/null +++ b/bitcoin/network_decentralization/metrics/tau_index.py @@ -0,0 +1,18 @@ +def compute_tau_index(block_distribution, threshold): + """ + Calculates the tau-decentralization index of a distribution of blocks + :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power + ratio that is captured by the index (e.g. 0.66 for 66%) + :returns: int that corresponds to the tau index of the given distribution, or None if there were no blocks + """ + total_blocks = sum(block_distribution) + if total_blocks == 0: + return None + tau_index, power_ratio_covered = 0, 0 + for block_amount in block_distribution: + if power_ratio_covered >= threshold: + break + tau_index += 1 + power_ratio_covered += block_amount / total_blocks + return tau_index diff --git a/bitcoin/network_decentralization/metrics/total_entities.py b/bitcoin/network_decentralization/metrics/total_entities.py new file mode 100644 index 0000000..3ebaabe --- /dev/null +++ b/bitcoin/network_decentralization/metrics/total_entities.py @@ -0,0 +1,7 @@ +def compute_total_entities(block_distribution): + """ + Computes the number of entities that have produced blocks in the given timeframe. + :param block_distribution: list of integers, each being the blocks that an entity has produced + :returns: an integer that represents the number of entities that have produced blocks + """ + return len([v for v in block_distribution if v > 0]) From c780365cf2485e2881924bc0ea861345f2c86b04 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 3 Mar 2026 16:36:13 +0000 Subject: [PATCH 07/42] Update README.md --- bitcoin/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/bitcoin/README.md b/bitcoin/README.md index 683d8ad..9ebe4a9 100644 --- a/bitcoin/README.md +++ b/bitcoin/README.md @@ -13,7 +13,8 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin 1. **Network Crawling:** `crawl.py` tries to discover all reachable nodes participating in the network. Based on the [Bitnodes](https://github.com/ayeowch/bitnodes.git) project. 2. **Data Collection:** Scripts collect data about nodes like IP addresses and client versions. 3. **Data Parsing:** `parse.py` formats raw logs into structured files. -4. **Visualisation:** `plot.py` generates several graphs. +4. **Metrics Computation:** `compute_metrics.py` calculates decentralisation metrics. +5. **Visualisation:** `plot.py` generates several graphs. --- @@ -27,12 +28,12 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin - **`parse.py`** Processes raw data (e.g., logs from crawling) into structured formats (JSON, CSV) for easier analysis and plotting. +- **`compute_metrics.py`** + Computes network decentralisation metrics (HHI, Nakamoto coefficient, entropy, max power ratio) from CSV files. + - **`analyze.py`** (Not in use) Analyses datasets to extract decentralisation metrics. -- **`distribution.py`** - Distributes Tor nodes among others proportionally, by country or organisation. - - **`plot.py`** Generates data visualisations. @@ -92,6 +93,7 @@ The scripts generate: - Parsed node datasets (CSV, JSON) - Geolocation-enriched data - Plots and charts in PNG +- Computed metrics in `output_organizations_*.csv` and `output_countries_*.csv` files --- @@ -106,9 +108,9 @@ bitcoin/ ├── collect_geodata.py ├── collect_osdata.py ├── crawl.py -├── distribution.py ├── parse.py ├── plot.py +├── compute_metrics.py │ ├── config.yaml ├── requirements.txt From 4820164f949d1661a0656eb5a779a7a7fe673edf Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 4 Mar 2026 16:40:48 +0000 Subject: [PATCH 08/42] Reorder metrics computation and visualisation sections in workflow --- bitcoin/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitcoin/README.md b/bitcoin/README.md index 9ebe4a9..f34e25f 100644 --- a/bitcoin/README.md +++ b/bitcoin/README.md @@ -13,8 +13,8 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin 1. **Network Crawling:** `crawl.py` tries to discover all reachable nodes participating in the network. Based on the [Bitnodes](https://github.com/ayeowch/bitnodes.git) project. 2. **Data Collection:** Scripts collect data about nodes like IP addresses and client versions. 3. **Data Parsing:** `parse.py` formats raw logs into structured files. -4. **Metrics Computation:** `compute_metrics.py` calculates decentralisation metrics. -5. **Visualisation:** `plot.py` generates several graphs. +4. **Visualisation:** `plot.py` generates several graphs. +5. **Metrics Computation:** `compute_metrics.py` calculates decentralisation metrics. --- From c508167a7b1e4ea3a713d3be5c60fc6a19b6e776 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:13:25 +0000 Subject: [PATCH 09/42] Delete unused script --- bitcoin/analyze.py | 60 ---------------------------------------------- 1 file changed, 60 deletions(-) delete mode 100644 bitcoin/analyze.py diff --git a/bitcoin/analyze.py b/bitcoin/analyze.py deleted file mode 100644 index c30c268..0000000 --- a/bitcoin/analyze.py +++ /dev/null @@ -1,60 +0,0 @@ -import csv -import network_decentralization.helper as hlp -import networkx as nx -import logging - -logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO) - - -LEDGERS = ['bitcoin', 'bitcoin_cash', 'dogecoin', 'litecoin', 'zcash'] -LEDGERS = ['bitcoin_cash', 'dogecoin', 'litecoin', 'zcash'] - - -network_edge_dir = hlp.get_output_directory() / 'network_edges' - -for ledger in LEDGERS: - logging.info(f'Analyzing {ledger}') - - output_dir = hlp.get_output_directory() - edges = [] - nodes = set() - try: - with open(output_dir / 'network_edges' / f'{ledger}.csv') as f: - csv_reader = csv.reader(f) - next(csv_reader) - for source, dest in csv_reader: - nodes.add(source) - nodes.add(dest) - if source != dest: - edges.append((source, dest)) - except FileNotFoundError: - continue - - G = nx.DiGraph() - G.add_edges_from(edges) - - all_nodes = list(G.nodes()) - logging.info(f'\t Nodes: {len(nodes):,} - Edges: {len(edges):,}') - logging.info(f'\t Isolated nodes (no in/out edges): {len(nodes)-len(all_nodes):,}') - - degrees = G.degree() - avg_degree = sum([i[1] for i in degrees]) / len(degrees) - logging.info(f'\t Average node degree: {avg_degree:,}') - - is_strongly_connected = nx.is_strongly_connected(G) - logging.info(f'\t Is strongly connected: {is_strongly_connected}') - - if is_strongly_connected: - diameter = nx.diameter(G) - logging.info(f'\t Diameter (largest component): {diameter:,}') - else: - diameter = {} - for node in all_nodes: - shortest_paths = nx.shortest_path(G, source=node) - longest_shortest_path = max(shortest_paths.items(), key=lambda x: len(x[1]))[1] - if longest_shortest_path == [node]: - diameter[node] = -1 - else: - diameter[node] = len(longest_shortest_path) - logging.info(f'\t Diameter of known graph: {max(diameter.values())}') - logging.info(f'\t Nodes without outgoing edges: {len([i for i in diameter if diameter[i] == -1])}') From 3f12547abc0584dd0e4bf1d390ef6d33be67cc66 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:14:57 +0000 Subject: [PATCH 10/42] Delete unused script --- bitcoin/collect_osdata.py | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 bitcoin/collect_osdata.py diff --git a/bitcoin/collect_osdata.py b/bitcoin/collect_osdata.py deleted file mode 100644 index 43e2cdb..0000000 --- a/bitcoin/collect_osdata.py +++ /dev/null @@ -1,30 +0,0 @@ -from network_decentralization.collect import collect_osdata -import network_decentralization.helper as hlp -import time -import logging - -logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO) - - -def main(): - ledgers = hlp.get_ledgers() - timings = {} - for ledger in ledgers: - start = time.time() - collect_osdata(ledger, time.strftime('%Y-%m-%d')) - total_time = time.time() - start - timings[ledger] = total_time - - print(2*'----------------\n') - for ledger in hlp.get_ledgers(): - total_time = timings[ledger] - days = int(total_time / 86400) - hours = int((total_time - days*86400) / 3600) - mins = int((total_time - hours*3600 - days*86400) / 60) - secs = int(total_time - mins*60 - hours*3600 - days*86400) - print(f'\tcollect_osdata.py: {ledger} total time: {hours:02} hours, {mins:02} mins, {secs:02} secs') - print(2*'----------------\n') - - -if __name__ == '__main__': - main() From 68d7b752f81ae6d2f36a0716f75ad7bcbe91c767 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:16:21 +0000 Subject: [PATCH 11/42] Remove unused analyze.py and collect_osdata.py Removed unused scripts from README and directory structure. --- bitcoin/README.md | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/bitcoin/README.md b/bitcoin/README.md index f34e25f..18c6c10 100644 --- a/bitcoin/README.md +++ b/bitcoin/README.md @@ -31,21 +31,16 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin - **`compute_metrics.py`** Computes network decentralisation metrics (HHI, Nakamoto coefficient, entropy, max power ratio) from CSV files. -- **`analyze.py`** (Not in use) - Analyses datasets to extract decentralisation metrics. - - **`plot.py`** Generates data visualisations. - **`collect_geodata.py`** Uses third-party APIs to enrich nodes with geolocation info (country, city, organisation). -- **`collect_osdata.py`** (Not in use) - Identifies the operating system running on nodes. - - **`cleanup_dead_nodes.py`** Scans stored node datasets to remove offline or unreachable nodes. + ### Automation & Configuration - **`automation.sh`** @@ -102,11 +97,9 @@ The scripts generate: ``` bitcoin/ │ -├── analyze.py ├── automation.sh ├── cleanup_dead_nodes.py ├── collect_geodata.py -├── collect_osdata.py ├── crawl.py ├── parse.py ├── plot.py From 117b95a88edff519f9cecbbbe365604f224ccd79 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:50:29 +0000 Subject: [PATCH 12/42] Metrics now layer-agnostic --- .../metrics/concentration_ratio.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bitcoin/network_decentralization/metrics/concentration_ratio.py b/bitcoin/network_decentralization/metrics/concentration_ratio.py index 8893cb2..4c85cb5 100644 --- a/bitcoin/network_decentralization/metrics/concentration_ratio.py +++ b/bitcoin/network_decentralization/metrics/concentration_ratio.py @@ -1,9 +1,9 @@ -def compute_concentration_ratio(block_distribution, topn): +def compute_concentration_ratio(distribution, topn): """ - Calculates the n-concentration ratio of a distribution of balances - :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order - :param topn: the number of top block producers to consider - :returns: float that represents the ratio of blocks produced by the top n block producers (0 if there weren't any) + Calculates the n-concentration ratio of a distribution + :param distribution: list of non-negative counts per entity, sorted in descending order + :param topn: the number of top entities to consider + :returns: float that represents the ratio of total count held by the top n entities (0 if total is 0) """ - total_blocks = sum(block_distribution) - return sum(block_distribution[:topn]) / total_blocks if total_blocks else 0 + total = sum(distribution) + return sum(distribution[:topn]) / total if total else 0 From aa2dec56eae348ea0126ab9d11a856412e5b8dae Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:50:59 +0000 Subject: [PATCH 13/42] Metrics now layer-agnostic --- .../metrics/entropy.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/bitcoin/network_decentralization/metrics/entropy.py b/bitcoin/network_decentralization/metrics/entropy.py index 56e1b84..e4352b7 100644 --- a/bitcoin/network_decentralization/metrics/entropy.py +++ b/bitcoin/network_decentralization/metrics/entropy.py @@ -2,33 +2,33 @@ from network_decentralization.metrics.total_entities import compute_total_entities -def compute_entropy(block_distribution, alpha): +def compute_entropy(distribution, alpha): """ - Calculates the entropy of a distribution of blocks to entities + Calculates the entropy of an entity distribution. Pi is the relative frequency of each entity. Renyi entropy: 1/(1-alpha) * log2 (sum (Pi**alpha)) Shannon entropy (alpha=1): −sum P(Si) log2 (Pi) Min entropy (alpha=-1): -log max Pi - :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :param distribution: list of non-negative counts per entity, sorted in descending order :param alpha: the entropy parameter (depending on its value the corresponding entropy measure is used) :returns: a float that represents the entropy of the data or None if the data is empty """ - all_blocks = sum(block_distribution) - if all_blocks == 0: + total = sum(distribution) + if total == 0: return None if alpha == 1: entropy = 0 - for value in block_distribution: - rel_freq = value / all_blocks + for value in distribution: + rel_freq = value / total if rel_freq > 0: entropy -= rel_freq * log(rel_freq, 2) else: if alpha == -1: - entropy = - log(max(block_distribution)/all_blocks, 2) + entropy = -log(max(distribution) / total, 2) else: sum_freqs = 0 - for entry in block_distribution: - sum_freqs += pow(entry/all_blocks, alpha) + for entry in distribution: + sum_freqs += pow(entry / total, alpha) entropy = log(sum_freqs, 2) / (1 - alpha) return entropy @@ -38,11 +38,11 @@ def compute_max_entropy(num_entities, alpha): return compute_entropy([1 for i in range(num_entities)], alpha) -def compute_entropy_percentage(block_distribution, alpha): - if sum(block_distribution) == 0: +def compute_entropy_percentage(distribution, alpha): + if sum(distribution) == 0: return None try: - total_entities = compute_total_entities(block_distribution) - return compute_entropy(block_distribution, alpha) / compute_max_entropy(total_entities, alpha) + total_entities = compute_total_entities(distribution) + return compute_entropy(distribution, alpha) / compute_max_entropy(total_entities, alpha) except ZeroDivisionError: return 0 From 9e6e6807f9f374a8ff3851850c15dfc61bcd0e48 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:51:16 +0000 Subject: [PATCH 14/42] Metrics now layer-agnostic --- .../metrics/herfindahl_hirschman_index.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py b/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py index de22b87..fe6fc41 100644 --- a/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py +++ b/bitcoin/network_decentralization/metrics/herfindahl_hirschman_index.py @@ -1,20 +1,20 @@ -def compute_hhi(block_distribution): +def compute_hhi(distribution): """ - Calculates the Herfindahl-Hirschman index of a distribution of blocks to entities + Calculates the Herfindahl-Hirschman index of an entity distribution. From investopedia: The HHI is calculated by squaring the market share of each firm competing in a market and then summing the resulting numbers. It can range from close to 0 to 10,000, with lower values indicating a less concentrated market. The U.S. Department of Justice considers a market with an HHI of less than 1,500 to be a competitive marketplace, an HHI of 1,500 to 2,500 to be a moderately concentrated marketplace, and an HHI of 2,500 or greater to be a highly concentrated marketplace. - :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + :param distribution: list of non-negative counts per entity, sorted in descending order :return: float between 0 and 10,000 that represents the HHI of the given distribution or None if the data is empty """ - total_blocks = sum(block_distribution) - if total_blocks == 0: + total = sum(distribution) + if total == 0: return None hhi = 0 - for num_blocks in block_distribution: - hhi += pow(100 * num_blocks / total_blocks, 2) + for count in distribution: + hhi += pow(100 * count / total, 2) return hhi From b5bd2ecf76adfe325044b2112311449349fd181e Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:51:33 +0000 Subject: [PATCH 15/42] Metrics now layer-agnostic --- .../metrics/nakamoto_coefficient.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py b/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py index 6f38992..e0bd938 100644 --- a/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py +++ b/bitcoin/network_decentralization/metrics/nakamoto_coefficient.py @@ -1,10 +1,10 @@ from network_decentralization.metrics.tau_index import compute_tau_index -def compute_nakamoto_coefficient(block_distribution): +def compute_nakamoto_coefficient(distribution): """ - Calculates the Nakamoto coefficient of a distribution of blocks to entities - :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + Calculates the Nakamoto coefficient of an entity distribution. + :param distribution: list of non-negative counts per entity, sorted in descending order :returns: int that represents the Nakamoto coefficient of the given distribution, or None if the data is empty """ - return compute_tau_index(block_distribution, 0.5) + return compute_tau_index(distribution, 0.5) From d524a6d104f6e32455655d37f127bf7b5c034c70 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:51:53 +0000 Subject: [PATCH 16/42] Metrics now layer-agnostic --- .../metrics/tau_index.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bitcoin/network_decentralization/metrics/tau_index.py b/bitcoin/network_decentralization/metrics/tau_index.py index a87ae23..17edd5d 100644 --- a/bitcoin/network_decentralization/metrics/tau_index.py +++ b/bitcoin/network_decentralization/metrics/tau_index.py @@ -1,18 +1,18 @@ -def compute_tau_index(block_distribution, threshold): +def compute_tau_index(distribution, threshold): """ - Calculates the tau-decentralization index of a distribution of blocks - :param block_distribution: a list of integers, each being the blocks that an entity has produced, sorted in descending order + Calculates the tau-decentralization index of an entity distribution. + :param distribution: list of non-negative counts per entity, sorted in descending order :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power ratio that is captured by the index (e.g. 0.66 for 66%) - :returns: int that corresponds to the tau index of the given distribution, or None if there were no blocks + :returns: int that corresponds to the tau index of the given distribution, or None if total is 0 """ - total_blocks = sum(block_distribution) - if total_blocks == 0: + total = sum(distribution) + if total == 0: return None tau_index, power_ratio_covered = 0, 0 - for block_amount in block_distribution: + for amount in distribution: if power_ratio_covered >= threshold: break tau_index += 1 - power_ratio_covered += block_amount / total_blocks + power_ratio_covered += amount / total return tau_index From 32011d9ec3ab141daecd1eca03147ff340e47ded Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 6 Mar 2026 14:52:16 +0000 Subject: [PATCH 17/42] Metrics now layer-agnostic --- .../network_decentralization/metrics/total_entities.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bitcoin/network_decentralization/metrics/total_entities.py b/bitcoin/network_decentralization/metrics/total_entities.py index 3ebaabe..0a2e82b 100644 --- a/bitcoin/network_decentralization/metrics/total_entities.py +++ b/bitcoin/network_decentralization/metrics/total_entities.py @@ -1,7 +1,7 @@ -def compute_total_entities(block_distribution): +def compute_total_entities(distribution): """ - Computes the number of entities that have produced blocks in the given timeframe. - :param block_distribution: list of integers, each being the blocks that an entity has produced - :returns: an integer that represents the number of entities that have produced blocks + Computes the number of entities with a positive count in the given distribution. + :param distribution: list of non-negative counts per entity + :returns: number of entities with count > 0 """ - return len([v for v in block_distribution if v > 0]) + return len([v for v in distribution if v > 0]) From bc1db11ef88858be3d22b1a445f9790b11a19ec9 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:03:16 +0000 Subject: [PATCH 18/42] Unused metrics --- .../metrics/tau_index.py | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 bitcoin/network_decentralization/metrics/tau_index.py diff --git a/bitcoin/network_decentralization/metrics/tau_index.py b/bitcoin/network_decentralization/metrics/tau_index.py deleted file mode 100644 index 17edd5d..0000000 --- a/bitcoin/network_decentralization/metrics/tau_index.py +++ /dev/null @@ -1,18 +0,0 @@ -def compute_tau_index(distribution, threshold): - """ - Calculates the tau-decentralization index of an entity distribution. - :param distribution: list of non-negative counts per entity, sorted in descending order - :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power - ratio that is captured by the index (e.g. 0.66 for 66%) - :returns: int that corresponds to the tau index of the given distribution, or None if total is 0 - """ - total = sum(distribution) - if total == 0: - return None - tau_index, power_ratio_covered = 0, 0 - for amount in distribution: - if power_ratio_covered >= threshold: - break - tau_index += 1 - power_ratio_covered += amount / total - return tau_index From 517fdf8d4f8b257b1073d537073f719fabd33e7f Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:03:27 +0000 Subject: [PATCH 19/42] Unused metrics --- bitcoin/network_decentralization/metrics/total_entities.py | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 bitcoin/network_decentralization/metrics/total_entities.py diff --git a/bitcoin/network_decentralization/metrics/total_entities.py b/bitcoin/network_decentralization/metrics/total_entities.py deleted file mode 100644 index 0a2e82b..0000000 --- a/bitcoin/network_decentralization/metrics/total_entities.py +++ /dev/null @@ -1,7 +0,0 @@ -def compute_total_entities(distribution): - """ - Computes the number of entities with a positive count in the given distribution. - :param distribution: list of non-negative counts per entity - :returns: number of entities with count > 0 - """ - return len([v for v in distribution if v > 0]) From f9d98bc59757e57d0dbe3d0f86a2f7d17ba34536 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:04:23 +0000 Subject: [PATCH 20/42] Add network and geo metrics to config.yaml --- bitcoin/config.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bitcoin/config.yaml b/bitcoin/config.yaml index fcf6110..42f8f52 100644 --- a/bitcoin/config.yaml +++ b/bitcoin/config.yaml @@ -23,3 +23,17 @@ last_time_active: 1 # The first path will be used to write newly created dbs and the output of runs output_directories: - ./output + +# Metrics for network analysis (organizations) +network_metrics: + - HHI + - Nakamoto +# - Entropy + - Max Power Ratio + +# Metrics for geographic analysis (countries) +geo_metrics: + - HHI + - Nakamoto + - Entropy + - Max Power Ratio From 594013deb1aff4c3497541f860ac312298035ecd Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:06:58 +0000 Subject: [PATCH 21/42] Refactor script to accept metric names as parameters --- bitcoin/compute_metrics.py | 132 ++++++++++++++++++++----------------- 1 file changed, 71 insertions(+), 61 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 06ef2aa..292c71f 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -7,6 +7,7 @@ import csv import pathlib import sys +from network_decentralization.helper import get_config_data, get_metrics_network, get_metrics_geo from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient from network_decentralization.metrics.entropy import compute_entropy @@ -58,34 +59,41 @@ def get_ledger_name(csv_path): return '_'.join(parts[1:]) -def compute_metrics(distribution): +def compute_metrics(distribution, metric_names): """ - Compute all metrics for a given distribution. + Compute specified metrics for a given distribution. :param distribution: Sorted list of entity counts (descending order) - :return: Dictionary with all computed metrics + :param metric_names: List of metric names to compute (e.g., ['HHI', 'Nakamoto', 'Entropy']) + :return: Dictionary with computed metric values """ total = sum(distribution) + metrics = {} + + # Mapping of metric display names to computation functions + metric_map = { + 'HHI': ('hhi', compute_hhi), + 'Nakamoto': ('nakamoto', compute_nakamoto_coefficient), + 'Entropy': ('entropy', lambda d: compute_entropy(d, alpha=1)), + 'Max Power Ratio': ('max_power_ratio', lambda d: max(d) / total if d else 0) + } if total == 0: - return { - 'hhi': None, - 'nakamoto': None, - 'entropy': None, - 'max_power_ratio': None - } + return {metric_map[name][0]: None for name in metric_names if name in metric_map} - metrics = { - 'hhi': compute_hhi(distribution), - 'nakamoto': compute_nakamoto_coefficient(distribution), - 'entropy': compute_entropy(distribution, alpha=1), # Shannon entropy - 'max_power_ratio': max(distribution) / total if distribution else 0 - } + for metric_name in metric_names: + if metric_name in metric_map: + key, func = metric_map[metric_name] + try: + metrics[key] = func(distribution) + except Exception as e: + print(f"Error computing {metric_name}: {e}", file=sys.stderr) + metrics[key] = None return metrics -def process_csv_files(output_dir, file_pattern, is_country=False): +def process_csv_files(output_dir, file_pattern, is_country, metric_names): """ Process all CSV files matching a pattern and output metrics. Appends results to existing files or creates new ones. @@ -94,6 +102,7 @@ def process_csv_files(output_dir, file_pattern, is_country=False): :param output_dir: Path to the output directory :param file_pattern: Glob pattern for CSV files (e.g., 'organizations_*.csv') :param is_country: Boolean to indicate if processing country files + :param metric_names: List of metric names to compute and output """ csv_files = sorted(output_dir.glob(file_pattern)) @@ -113,48 +122,46 @@ def process_csv_files(output_dir, file_pattern, is_country=False): csv_path = without_tor_path date, distribution = read_csv_data(csv_path) - metrics = compute_metrics(distribution) + metrics = compute_metrics(distribution, metric_names) + + # Determine output filename and metric column mapping + file_type = 'countries' if is_country else 'organizations' + output_filename = f"output_{file_type}_{ledger}.csv" + output_path = output_dir / output_filename + file_exists = output_path.exists() + + # Map display names to internal keys for column ordering + metric_key_map = { + 'HHI': 'hhi', + 'Nakamoto': 'nakamoto', + 'Entropy': 'entropy', + 'Max Power Ratio': 'max_power_ratio' + } - # Determine output filename - if is_country: - output_filename = f"output_countries_{ledger}.csv" - output_path = output_dir / output_filename - file_exists = output_path.exists() + # Build header from metric names + header = ['ledger', 'date', 'clustering'] + metric_names + + # Write header and data (append if exists) + with open(output_path, 'a', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + if not file_exists: + writer.writerow(header) - # Write header and data (append if exists) - with open(output_path, 'a', newline='', encoding='utf-8') as f: - writer = csv.writer(f) - if not file_exists: - writer.writerow(['ledger', 'date', 'clustering', 'entropy', 'hhi', 'nakamoto_coefficient', 'max_power_ratio']) - writer.writerow([ - ledger, - date, - 'False', - f"{metrics['entropy']:.15g}", - f"{metrics['hhi']:.16g}", - metrics['nakamoto'], - f"{metrics['max_power_ratio']:.16g}" - ]) - print(f"Appended to: {output_filename}", file=sys.stderr) - else: - output_filename = f"output_organizations_{ledger}.csv" - output_path = output_dir / output_filename - file_exists = output_path.exists() + # Build row with metric values in the same order as header + row = [ledger, date, 'False'] + for metric_display_name in metric_names: + metric_key = metric_key_map.get(metric_display_name) + value = metrics.get(metric_key) if metric_key else None + if value is None: + row.append('') + elif isinstance(value, float): + row.append(f"{value:.16g}") + else: + row.append(str(value)) - # Write header and data (append if exists) - with open(output_path, 'a', newline='', encoding='utf-8') as f: - writer = csv.writer(f) - if not file_exists: - writer.writerow(['ledger', 'date', 'clustering', 'hhi', 'nakamoto_coefficient', 'max_power_ratio']) - writer.writerow([ - ledger, - date, - 'False', - f"{metrics['hhi']:.16g}", - metrics['nakamoto'], - f"{metrics['max_power_ratio']:.16g}" - ]) - print(f"Appended to: {output_filename}", file=sys.stderr) + writer.writerow(row) + + print(f"Appended to: {output_filename}", file=sys.stderr) except Exception as e: print(f"Error processing {csv_path.name}: {e}", file=sys.stderr) @@ -164,19 +171,22 @@ def process_csv_files(output_dir, file_pattern, is_country=False): def main(): """ Main entry point for the script. - Processes organization and country CSV files from the output directory. + Loads metric names from config and processes organization and country CSV files. """ + # Load metric names from config using helper functions + network_metrics = get_metrics_network() + geo_metrics = get_metrics_geo() + output_dir = pathlib.Path(__file__).parent / 'output' if not output_dir.exists(): print(f"Error: Output directory not found at {output_dir}", file=sys.stderr) sys.exit(1) + # Process organization files with network metrics + process_csv_files(output_dir, 'organizations_*.csv', is_country=False, metric_names=network_metrics) - # Process organization files - process_csv_files(output_dir, 'organizations_*.csv', is_country=False) - - # Process country files - process_csv_files(output_dir, 'countries_*.csv', is_country=True) + # Process country files with geo metrics + process_csv_files(output_dir, 'countries_*.csv', is_country=True, metric_names=geo_metrics) if __name__ == '__main__': From 3fa89d8a1604000346acc31e221b674473d9a1d7 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:07:41 +0000 Subject: [PATCH 22/42] Add functions to retrieve network and geo metrics from config file --- bitcoin/network_decentralization/helper.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/bitcoin/network_decentralization/helper.py b/bitcoin/network_decentralization/helper.py index 7a6651e..bd49263 100644 --- a/bitcoin/network_decentralization/helper.py +++ b/bitcoin/network_decentralization/helper.py @@ -64,6 +64,22 @@ def get_concurrency(): return get_config_data()['execution_parameters']['concurrency'] +def get_metrics_network(): + """ + Retrieves the list of metrics to compute for network analysis (organizations) + :returns: a list of metric names to compute + """ + return get_config_data().get('network_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Max Power Ratio']) + + +def get_metrics_geo(): + """ + Retrieves the list of metrics to compute for geographic analysis (countries) + :returns: a list of metric names to compute + """ + return get_config_data().get('geo_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Max Power Ratio']) + + def get_output_directory(ledger=None, dead=False): """ Reads the config file and retrieves the output directory From 639e404485024a726b0496511f4ba30ac08595ad Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:15:27 +0000 Subject: [PATCH 23/42] Add metrics directory and related Python files --- bitcoin/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bitcoin/README.md b/bitcoin/README.md index 18c6c10..a7a02e5 100644 --- a/bitcoin/README.md +++ b/bitcoin/README.md @@ -116,7 +116,12 @@ bitcoin/ │ ├── collect.py │ ├── constants.py │ ├── helper.py -│ └── protocol.py +│ ├── protocol.py +│ └── metrics/ +│ ├── concentration_ratio.py +│ ├── entropy.py +│ ├── herfindahl_hirschman_index.py +│ └── nakamoto_coefficient.py │ └── seed_info/ ├── bitcoin.json From 3a35644a14c65689eb452f82245adef56acbabc8 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:19:39 +0000 Subject: [PATCH 24/42] Refactor ledger name extraction and processing logic --- bitcoin/compute_metrics.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 292c71f..75b739c 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -54,6 +54,7 @@ def get_ledger_name(csv_path): :return: Ledger name (e.g., 'bitcoin', 'bitcoin_cash') """ filename = csv_path.stem # Get filename without extension + filename = filename.replace('_without_tor', '') # Normalize bitcoin without_tor variant parts = filename.split('_') # Remove 'organizations' or 'countries' prefix return '_'.join(parts[1:]) @@ -104,23 +105,19 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): :param is_country: Boolean to indicate if processing country files :param metric_names: List of metric names to compute and output """ + # For bitcoin, prefer the _without_tor variant if it exists; skip the regular bitcoin file in that case + file_type = 'countries' if is_country else 'organizations' + without_tor_path = output_dir / f"{file_type}_bitcoin_without_tor.csv" + skip_regular_bitcoin = without_tor_path.exists() + csv_files = sorted(output_dir.glob(file_pattern)) for csv_path in csv_files: - # Skip _without_tor files in the glob - we'll handle them explicitly for bitcoin - if '_without_tor' in csv_path.name: + if csv_path.name == f"{file_type}_bitcoin.csv" and skip_regular_bitcoin: continue try: ledger = get_ledger_name(csv_path) - - # For bitcoin, check if _without_tor version exists and use that instead - if ledger == 'bitcoin': - file_type = 'countries' if is_country else 'organizations' - without_tor_path = output_dir / f"{file_type}_bitcoin_without_tor.csv" - if without_tor_path.exists(): - csv_path = without_tor_path - date, distribution = read_csv_data(csv_path) metrics = compute_metrics(distribution, metric_names) From 1061ea4ae4f90f98990c64635cc264c636d98530 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:04:37 +0000 Subject: [PATCH 25/42] Refactor compute_metrics to use compute_concentration_ratio --- bitcoin/compute_metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 75b739c..ab97205 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -7,7 +7,7 @@ import csv import pathlib import sys -from network_decentralization.helper import get_config_data, get_metrics_network, get_metrics_geo +from network_decentralization.helper import get_metrics_network, get_metrics_geo from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient from network_decentralization.metrics.entropy import compute_entropy @@ -76,7 +76,7 @@ def compute_metrics(distribution, metric_names): 'HHI': ('hhi', compute_hhi), 'Nakamoto': ('nakamoto', compute_nakamoto_coefficient), 'Entropy': ('entropy', lambda d: compute_entropy(d, alpha=1)), - 'Max Power Ratio': ('max_power_ratio', lambda d: max(d) / total if d else 0) + 'Max Power Ratio': ('max_power_ratio', lambda d: compute_concentration_ratio(d, topn=1)) } if total == 0: From 4ffcd558b3281daecd5c4ee3ce66f11968b2523e Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:19:08 +0000 Subject: [PATCH 26/42] Concentration ratio's top-N is now a parameter --- bitcoin/compute_metrics.py | 76 +++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index ab97205..887c538 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -7,7 +7,7 @@ import csv import pathlib import sys -from network_decentralization.helper import get_metrics_network, get_metrics_geo +from network_decentralization.helper import get_metrics_network, get_metrics_geo, get_concentration_ratio_topn from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient from network_decentralization.metrics.entropy import compute_entropy @@ -60,29 +60,37 @@ def get_ledger_name(csv_path): return '_'.join(parts[1:]) -def compute_metrics(distribution, metric_names): +def compute_metrics(distribution, metric_names, concentration_ratio_topn): """ Compute specified metrics for a given distribution. :param distribution: Sorted list of entity counts (descending order) :param metric_names: List of metric names to compute (e.g., ['HHI', 'Nakamoto', 'Entropy']) + :param concentration_ratio_topn: list of top-N parameters for concentration ratio metrics :return: Dictionary with computed metric values """ - total = sum(distribution) metrics = {} - + # Mapping of metric display names to computation functions + # Concentration Ratio is handled separately because one metric name expands to multiple outputs (one per configured top-N value). metric_map = { 'HHI': ('hhi', compute_hhi), 'Nakamoto': ('nakamoto', compute_nakamoto_coefficient), 'Entropy': ('entropy', lambda d: compute_entropy(d, alpha=1)), - 'Max Power Ratio': ('max_power_ratio', lambda d: compute_concentration_ratio(d, topn=1)) } - if total == 0: - return {metric_map[name][0]: None for name in metric_names if name in metric_map} - for metric_name in metric_names: + # Keep legacy 'Max Power Ratio' as an alias so older configs still work. + if metric_name in ('Concentration Ratio', 'Max Power Ratio'): + for topn in concentration_ratio_topn: + key = f"concentration_ratio_top_{topn}" + try: + metrics[key] = compute_concentration_ratio(distribution, topn=topn) + except Exception as e: + print(f"Error computing {metric_name} (topn={topn}): {e}", file=sys.stderr) + metrics[key] = None + continue + if metric_name in metric_map: key, func = metric_map[metric_name] try: @@ -94,7 +102,7 @@ def compute_metrics(distribution, metric_names): return metrics -def process_csv_files(output_dir, file_pattern, is_country, metric_names): +def process_csv_files(output_dir, file_pattern, is_country, metric_names, concentration_ratio_topn): """ Process all CSV files matching a pattern and output metrics. Appends results to existing files or creates new ones. @@ -104,6 +112,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): :param file_pattern: Glob pattern for CSV files (e.g., 'organizations_*.csv') :param is_country: Boolean to indicate if processing country files :param metric_names: List of metric names to compute and output + :param concentration_ratio_topn: list of top-N parameters for concentration ratio metrics """ # For bitcoin, prefer the _without_tor variant if it exists; skip the regular bitcoin file in that case file_type = 'countries' if is_country else 'organizations' @@ -119,7 +128,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): try: ledger = get_ledger_name(csv_path) date, distribution = read_csv_data(csv_path) - metrics = compute_metrics(distribution, metric_names) + metrics = compute_metrics(distribution, metric_names, concentration_ratio_topn) # Determine output filename and metric column mapping file_type = 'countries' if is_country else 'organizations' @@ -127,16 +136,21 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): output_path = output_dir / output_filename file_exists = output_path.exists() - # Map display names to internal keys for column ordering - metric_key_map = { - 'HHI': 'hhi', - 'Nakamoto': 'nakamoto', - 'Entropy': 'entropy', - 'Max Power Ratio': 'max_power_ratio' - } - - # Build header from metric names - header = ['ledger', 'date', 'clustering'] + metric_names + # Build output metric columns from selected metrics. + metric_columns = [] + for metric_name in metric_names: + # Keep legacy 'Max Power Ratio' as an alias so older configs still work. + if metric_name in ('Concentration Ratio', 'Max Power Ratio'): + for topn in concentration_ratio_topn: + metric_columns.append((f"Concentration Ratio (Top {topn})", f"concentration_ratio_top_{topn}")) + elif metric_name == 'HHI': + metric_columns.append(('HHI', 'hhi')) + elif metric_name == 'Nakamoto': + metric_columns.append(('Nakamoto', 'nakamoto')) + elif metric_name == 'Entropy': + metric_columns.append(('Entropy', 'entropy')) + + header = ['ledger', 'date', 'clustering'] + [column[0] for column in metric_columns] # Write header and data (append if exists) with open(output_path, 'a', newline='', encoding='utf-8') as f: @@ -146,9 +160,8 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): # Build row with metric values in the same order as header row = [ledger, date, 'False'] - for metric_display_name in metric_names: - metric_key = metric_key_map.get(metric_display_name) - value = metrics.get(metric_key) if metric_key else None + for _, metric_key in metric_columns: + value = metrics.get(metric_key) if value is None: row.append('') elif isinstance(value, float): @@ -173,6 +186,7 @@ def main(): # Load metric names from config using helper functions network_metrics = get_metrics_network() geo_metrics = get_metrics_geo() + concentration_ratio_topn = get_concentration_ratio_topn() output_dir = pathlib.Path(__file__).parent / 'output' @@ -180,10 +194,22 @@ def main(): print(f"Error: Output directory not found at {output_dir}", file=sys.stderr) sys.exit(1) # Process organization files with network metrics - process_csv_files(output_dir, 'organizations_*.csv', is_country=False, metric_names=network_metrics) + process_csv_files( + output_dir, + 'organizations_*.csv', + is_country=False, + metric_names=network_metrics, + concentration_ratio_topn=concentration_ratio_topn, + ) # Process country files with geo metrics - process_csv_files(output_dir, 'countries_*.csv', is_country=True, metric_names=geo_metrics) + process_csv_files( + output_dir, + 'countries_*.csv', + is_country=True, + metric_names=geo_metrics, + concentration_ratio_topn=concentration_ratio_topn, + ) if __name__ == '__main__': From d69344ec4266a8beaf389edabc42836794727aef Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:19:53 +0000 Subject: [PATCH 27/42] Concentration ratio's top-N is now a parameter --- bitcoin/network_decentralization/helper.py | 27 ++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/bitcoin/network_decentralization/helper.py b/bitcoin/network_decentralization/helper.py index bd49263..e1178d6 100644 --- a/bitcoin/network_decentralization/helper.py +++ b/bitcoin/network_decentralization/helper.py @@ -69,7 +69,7 @@ def get_metrics_network(): Retrieves the list of metrics to compute for network analysis (organizations) :returns: a list of metric names to compute """ - return get_config_data().get('network_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Max Power Ratio']) + return get_config_data().get('network_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Concentration Ratio']) def get_metrics_geo(): @@ -77,7 +77,30 @@ def get_metrics_geo(): Retrieves the list of metrics to compute for geographic analysis (countries) :returns: a list of metric names to compute """ - return get_config_data().get('geo_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Max Power Ratio']) + return get_config_data().get('geo_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Concentration Ratio']) + + +def get_concentration_ratio_topn(): + """ + Retrieves top-N values used by concentration-ratio based metrics. + :returns: list of unique positive integers (defaults to [1, 3]) + """ + params = get_config_data().get('metrics_parameters', {}) + raw_topn = params.get('concentration_ratio_topn', [1, 3]) + + if not isinstance(raw_topn, list): + raw_topn = [raw_topn] + + values = [] + for value in raw_topn: + try: + parsed = int(value) + if parsed > 0 and parsed not in values: + values.append(parsed) + except (TypeError, ValueError): + continue + + return values if values else [1, 3] def get_output_directory(ledger=None, dead=False): From 61a96a563cdf2f1517b0f24c033241a46a5332f3 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:20:19 +0000 Subject: [PATCH 28/42] Concentration ratio's top-N is now a parameter --- bitcoin/config.yaml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bitcoin/config.yaml b/bitcoin/config.yaml index 42f8f52..dd3b8be 100644 --- a/bitcoin/config.yaml +++ b/bitcoin/config.yaml @@ -29,11 +29,17 @@ network_metrics: - HHI - Nakamoto # - Entropy - - Max Power Ratio + - Concentration Ratio # Metrics for geographic analysis (countries) geo_metrics: - HHI - Nakamoto - Entropy - - Max Power Ratio + - Concentration Ratio + +# Parameters for metric computation +metrics_parameters: + concentration_ratio_topn: + - 1 + - 3 From 7860933bd79efe5c9904a33816e07c48e4bf5a25 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:49:44 +0000 Subject: [PATCH 29/42] Add function to get ledgers without Tor parameter --- bitcoin/network_decentralization/helper.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/bitcoin/network_decentralization/helper.py b/bitcoin/network_decentralization/helper.py index e1178d6..0b80686 100644 --- a/bitcoin/network_decentralization/helper.py +++ b/bitcoin/network_decentralization/helper.py @@ -103,6 +103,24 @@ def get_concentration_ratio_topn(): return values if values else [1, 3] +def get_without_tor_ledgers(): + """ + Retrieves the target ledgers for generating *_without_tor CSV files. + :returns: list of ledger names, or None when not configured + """ + params = get_config_data().get('parse_parameters', {}) + raw_ledgers = params.get('without_tor_ledgers') + + if raw_ledgers is None: + return None + + if not isinstance(raw_ledgers, list): + raw_ledgers = [raw_ledgers] + + ledgers = [ledger.strip() for ledger in raw_ledgers if isinstance(ledger, str) and ledger.strip()] + return list(dict.fromkeys(ledgers)) or None + + def get_output_directory(ledger=None, dead=False): """ Reads the config file and retrieves the output directory From 74672c158a00054d27d42570060618fef98d2052 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:50:10 +0000 Subject: [PATCH 30/42] Add parse_parameters for ledgers without Tor --- bitcoin/config.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bitcoin/config.yaml b/bitcoin/config.yaml index dd3b8be..9490e31 100644 --- a/bitcoin/config.yaml +++ b/bitcoin/config.yaml @@ -43,3 +43,8 @@ metrics_parameters: concentration_ratio_topn: - 1 - 3 + +# Parameters for parsing/output generation +parse_parameters: + without_tor_ledgers: + - bitcoin From 2a5d15f43536ee1d58c419242e3df01158c5be4c Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:51:09 +0000 Subject: [PATCH 31/42] Ledgers without Tor are now parameters --- bitcoin/parse.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bitcoin/parse.py b/bitcoin/parse.py index 7fa39c2..4793020 100644 --- a/bitcoin/parse.py +++ b/bitcoin/parse.py @@ -350,16 +350,16 @@ def redistribute_tor_nodes(name, ledger, df, mode): df_without_tor[[mode, date]].to_csv(f'./output/{name}_{ledger}_without_tor.csv', index=False) # save the updated DataFrame to a new CSV -def without_tor(): +def create_without_tor_files(ledger): """ - Loads a CSV file and calls the redistribute_tor_nodes function. + Loads CSV files for the given ledger and redistributes Tor nodes. + :param ledger: the ledger to process for *_without_tor output files """ - ledger = 'bitcoin' modes = ['Countries', 'Organizations'] for mode in modes: logging.info(f'parse.py: Removing Tor from {ledger} {mode}') name = mode.lower() - filename = pathlib.Path(f'./output/{name}_{ledger}.csv') + filename = Path(f'./output/{name}_{ledger}.csv') if not filename.is_file(): logging.warning(f"File not found: {filename}") return None @@ -420,6 +420,7 @@ def cluster_organizations(ledger): def main(): logging.info('Start parsing') + without_tor_ledgers = set(hlp.get_without_tor_ledgers() or []) reachable_nodes = {} for ledger in LEDGERS: @@ -429,8 +430,8 @@ def main(): geography(reachable_nodes, ledger, mode) if 'Organizations' in MODES: cluster_organizations(ledger) - if 'bitcoin' in LEDGERS: - without_tor() + if ledger in without_tor_ledgers: + create_without_tor_files(ledger) if __name__ == '__main__': main() From b641976bd16f6990f8905cc3391d5361e9073fde Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:54:11 +0000 Subject: [PATCH 32/42] Rename parameter --- bitcoin/parse.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bitcoin/parse.py b/bitcoin/parse.py index 4793020..90f86fd 100644 --- a/bitcoin/parse.py +++ b/bitcoin/parse.py @@ -327,10 +327,10 @@ def version(reachable_nodes, mode): versions_df.to_csv(f'./output/{name.lower()}_{ledger}.csv', index_label = name) -def redistribute_tor_nodes(name, ledger, df, mode): +def redistribute_tor_nodes(mode_lower, ledger, df, mode): """ Redistributes Tor node count proportionally across non-Tor rows. - :param name: lowercase version of the mode ('countries' or 'organizations') used in file naming. + :param mode_lower: lowercase version of mode ('countries' or 'organizations') used in file naming. :param ledger: the ledger name. :param df: the dataframe in which the Tor nodes must be reditributed. :param mode: the mode name (e.g., 'Countries', 'Organizations'). @@ -347,7 +347,7 @@ def redistribute_tor_nodes(name, ledger, df, mode): df['Distribution'] = df.apply(lambda row: round((row[f'{date}'] / number_of_total_nodes_without_tor) * number_of_tor_nodes) if row[f'{mode}'] != 'Tor' else 0, axis=1) # create a new column 'Distribution' that distributes the Tor nodes proportionally to non-Tor rows df[date] = df[date] + df['Distribution'] df_without_tor = df[df[f'{mode}'] != 'Tor'] # filter out the Tor row - df_without_tor[[mode, date]].to_csv(f'./output/{name}_{ledger}_without_tor.csv', index=False) # save the updated DataFrame to a new CSV + df_without_tor[[mode, date]].to_csv(f'./output/{mode_lower}_{ledger}_without_tor.csv', index=False) # save the updated DataFrame to a new CSV def create_without_tor_files(ledger): @@ -358,13 +358,13 @@ def create_without_tor_files(ledger): modes = ['Countries', 'Organizations'] for mode in modes: logging.info(f'parse.py: Removing Tor from {ledger} {mode}') - name = mode.lower() - filename = Path(f'./output/{name}_{ledger}.csv') + mode_lower = mode.lower() + filename = Path(f'./output/{mode_lower}_{ledger}.csv') if not filename.is_file(): logging.warning(f"File not found: {filename}") return None df = pd.read_csv(filename) - redistribute_tor_nodes(name, ledger, df, mode) + redistribute_tor_nodes(mode_lower, ledger, df, mode) def cluster_organizations(ledger): From 40590a591bf3f56a7f053ad302c6d1176f2d2c7f Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 10 Mar 2026 15:10:51 +0000 Subject: [PATCH 33/42] Update automation.sh --- bitcoin/automation.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bitcoin/automation.sh b/bitcoin/automation.sh index 995e2f2..371b4da 100644 --- a/bitcoin/automation.sh +++ b/bitcoin/automation.sh @@ -8,14 +8,13 @@ do python3 crawl.py # comment this line if new data must not be gathered python3 cleanup_dead_nodes.py python3 collect_geodata.py -#python3 collect_osdata.py # not in use python3 parse.py python3 plot.py python3 compute_metrics.py # The following 2 lines create a folder and move all png and csv files to it mkdir output/"$(date +%Y-%m-%d)" -mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip_type}_*.csv output/response_length.json 2>/dev/null || true +mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip}_*.csv output/response_length.json 2>/dev/null || true sleep 7d # will repeat the whole process every X days From a995b9de68a377c2477a8d9ca9c267163a49d498 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 13 Mar 2026 12:31:22 +0000 Subject: [PATCH 34/42] Revert "Unused metrics" This reverts commit 517fdf8d4f8b257b1073d537073f719fabd33e7f. --- bitcoin/network_decentralization/metrics/total_entities.py | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 bitcoin/network_decentralization/metrics/total_entities.py diff --git a/bitcoin/network_decentralization/metrics/total_entities.py b/bitcoin/network_decentralization/metrics/total_entities.py new file mode 100644 index 0000000..0a2e82b --- /dev/null +++ b/bitcoin/network_decentralization/metrics/total_entities.py @@ -0,0 +1,7 @@ +def compute_total_entities(distribution): + """ + Computes the number of entities with a positive count in the given distribution. + :param distribution: list of non-negative counts per entity + :returns: number of entities with count > 0 + """ + return len([v for v in distribution if v > 0]) From f73591e27c9fa214fad019f1cdb1e84f0df5b6f4 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Fri, 13 Mar 2026 12:31:37 +0000 Subject: [PATCH 35/42] Revert "Unused metrics" This reverts commit bc1db11ef88858be3d22b1a445f9790b11a19ec9. --- .../metrics/tau_index.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 bitcoin/network_decentralization/metrics/tau_index.py diff --git a/bitcoin/network_decentralization/metrics/tau_index.py b/bitcoin/network_decentralization/metrics/tau_index.py new file mode 100644 index 0000000..17edd5d --- /dev/null +++ b/bitcoin/network_decentralization/metrics/tau_index.py @@ -0,0 +1,18 @@ +def compute_tau_index(distribution, threshold): + """ + Calculates the tau-decentralization index of an entity distribution. + :param distribution: list of non-negative counts per entity, sorted in descending order + :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power + ratio that is captured by the index (e.g. 0.66 for 66%) + :returns: int that corresponds to the tau index of the given distribution, or None if total is 0 + """ + total = sum(distribution) + if total == 0: + return None + tau_index, power_ratio_covered = 0, 0 + for amount in distribution: + if power_ratio_covered >= threshold: + break + tau_index += 1 + power_ratio_covered += amount / total + return tau_index From a3a1d908cc003d33ea6bea53ba6628cbbbb9c040 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Mon, 16 Mar 2026 11:41:49 +0000 Subject: [PATCH 36/42] Update automation.sh --- bitcoin/automation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitcoin/automation.sh b/bitcoin/automation.sh index 371b4da..2c58ebc 100644 --- a/bitcoin/automation.sh +++ b/bitcoin/automation.sh @@ -14,7 +14,7 @@ python3 compute_metrics.py # The following 2 lines create a folder and move all png and csv files to it mkdir output/"$(date +%Y-%m-%d)" -mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip}_*.csv output/response_length.json 2>/dev/null || true +mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip,discovery,peerstore}*.csv output/response_length.json output/*.png 2>/dev/null || true sleep 7d # will repeat the whole process every X days From 2bccfdd0bce5fbf51a52cbc68a78b655c78dcbb5 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:25:26 +0000 Subject: [PATCH 37/42] Remove 'max power ratio' occurrences --- bitcoin/README.md | 2 +- bitcoin/compute_metrics.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/bitcoin/README.md b/bitcoin/README.md index a7a02e5..e319f18 100644 --- a/bitcoin/README.md +++ b/bitcoin/README.md @@ -29,7 +29,7 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin Processes raw data (e.g., logs from crawling) into structured formats (JSON, CSV) for easier analysis and plotting. - **`compute_metrics.py`** - Computes network decentralisation metrics (HHI, Nakamoto coefficient, entropy, max power ratio) from CSV files. + Computes network decentralisation metrics (HHI, Nakamoto coefficient, entropy, concentration ratios) from CSV files. - **`plot.py`** Generates data visualisations. diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 887c538..6be5cb0 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -80,8 +80,7 @@ def compute_metrics(distribution, metric_names, concentration_ratio_topn): } for metric_name in metric_names: - # Keep legacy 'Max Power Ratio' as an alias so older configs still work. - if metric_name in ('Concentration Ratio', 'Max Power Ratio'): + if metric_name == 'Concentration Ratio': for topn in concentration_ratio_topn: key = f"concentration_ratio_top_{topn}" try: @@ -139,8 +138,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen # Build output metric columns from selected metrics. metric_columns = [] for metric_name in metric_names: - # Keep legacy 'Max Power Ratio' as an alias so older configs still work. - if metric_name in ('Concentration Ratio', 'Max Power Ratio'): + if metric_name == 'Concentration Ratio': for topn in concentration_ratio_topn: metric_columns.append((f"Concentration Ratio (Top {topn})", f"concentration_ratio_top_{topn}")) elif metric_name == 'HHI': From 8e6431bcbb654c5954c9e46564f8129f28f9f7b6 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:35:14 +0000 Subject: [PATCH 38/42] Update compute_metrics.py --- bitcoin/compute_metrics.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 6be5cb0..92629b3 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -141,12 +141,8 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen if metric_name == 'Concentration Ratio': for topn in concentration_ratio_topn: metric_columns.append((f"Concentration Ratio (Top {topn})", f"concentration_ratio_top_{topn}")) - elif metric_name == 'HHI': - metric_columns.append(('HHI', 'hhi')) - elif metric_name == 'Nakamoto': - metric_columns.append(('Nakamoto', 'nakamoto')) - elif metric_name == 'Entropy': - metric_columns.append(('Entropy', 'entropy')) + else: + metric_columns.append((metric_name, metric_name.lower())) header = ['ledger', 'date', 'clustering'] + [column[0] for column in metric_columns] From f23044569252826ef4a7ed8f9d1205543e63e0dd Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:13:21 +0000 Subject: [PATCH 39/42] Entropy function's alpha now configurable --- bitcoin/compute_metrics.py | 29 ++++++++++++++++++---- bitcoin/config.yaml | 2 ++ bitcoin/network_decentralization/helper.py | 23 +++++++++++++++++ 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 92629b3..455b92e 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -7,7 +7,7 @@ import csv import pathlib import sys -from network_decentralization.helper import get_metrics_network, get_metrics_geo, get_concentration_ratio_topn +from network_decentralization.helper import get_metrics_network, get_metrics_geo, get_concentration_ratio_topn, get_entropy_alphas from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient from network_decentralization.metrics.entropy import compute_entropy @@ -60,13 +60,14 @@ def get_ledger_name(csv_path): return '_'.join(parts[1:]) -def compute_metrics(distribution, metric_names, concentration_ratio_topn): +def compute_metrics(distribution, metric_names, concentration_ratio_topn, entropy_alphas): """ Compute specified metrics for a given distribution. :param distribution: Sorted list of entity counts (descending order) :param metric_names: List of metric names to compute (e.g., ['HHI', 'Nakamoto', 'Entropy']) :param concentration_ratio_topn: list of top-N parameters for concentration ratio metrics + :param entropy_alphas: alpha parameters used for entropy metric computation :return: Dictionary with computed metric values """ metrics = {} @@ -76,7 +77,6 @@ def compute_metrics(distribution, metric_names, concentration_ratio_topn): metric_map = { 'HHI': ('hhi', compute_hhi), 'Nakamoto': ('nakamoto', compute_nakamoto_coefficient), - 'Entropy': ('entropy', lambda d: compute_entropy(d, alpha=1)), } for metric_name in metric_names: @@ -90,6 +90,17 @@ def compute_metrics(distribution, metric_names, concentration_ratio_topn): metrics[key] = None continue + if metric_name == 'Entropy': + for alpha in entropy_alphas: + alpha_str = f"{alpha:g}" + key = f"entropy_alpha_{alpha_str}" + try: + metrics[key] = compute_entropy(distribution, alpha=alpha) + except Exception as e: + print(f"Error computing {metric_name} (alpha={alpha_str}): {e}", file=sys.stderr) + metrics[key] = None + continue + if metric_name in metric_map: key, func = metric_map[metric_name] try: @@ -101,7 +112,7 @@ def compute_metrics(distribution, metric_names, concentration_ratio_topn): return metrics -def process_csv_files(output_dir, file_pattern, is_country, metric_names, concentration_ratio_topn): +def process_csv_files(output_dir, file_pattern, is_country, metric_names, concentration_ratio_topn, entropy_alphas): """ Process all CSV files matching a pattern and output metrics. Appends results to existing files or creates new ones. @@ -112,6 +123,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen :param is_country: Boolean to indicate if processing country files :param metric_names: List of metric names to compute and output :param concentration_ratio_topn: list of top-N parameters for concentration ratio metrics + :param entropy_alphas: alpha parameters used for entropy metric computation """ # For bitcoin, prefer the _without_tor variant if it exists; skip the regular bitcoin file in that case file_type = 'countries' if is_country else 'organizations' @@ -127,7 +139,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen try: ledger = get_ledger_name(csv_path) date, distribution = read_csv_data(csv_path) - metrics = compute_metrics(distribution, metric_names, concentration_ratio_topn) + metrics = compute_metrics(distribution, metric_names, concentration_ratio_topn, entropy_alphas) # Determine output filename and metric column mapping file_type = 'countries' if is_country else 'organizations' @@ -141,6 +153,10 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen if metric_name == 'Concentration Ratio': for topn in concentration_ratio_topn: metric_columns.append((f"Concentration Ratio (Top {topn})", f"concentration_ratio_top_{topn}")) + elif metric_name == 'Entropy': + for alpha in entropy_alphas: + alpha_str = f"{alpha:g}" + metric_columns.append((f"Entropy (alpha={alpha_str})", f"entropy_alpha_{alpha_str}")) else: metric_columns.append((metric_name, metric_name.lower())) @@ -181,6 +197,7 @@ def main(): network_metrics = get_metrics_network() geo_metrics = get_metrics_geo() concentration_ratio_topn = get_concentration_ratio_topn() + entropy_alphas = get_entropy_alphas() output_dir = pathlib.Path(__file__).parent / 'output' @@ -194,6 +211,7 @@ def main(): is_country=False, metric_names=network_metrics, concentration_ratio_topn=concentration_ratio_topn, + entropy_alphas=entropy_alphas, ) # Process country files with geo metrics @@ -203,6 +221,7 @@ def main(): is_country=True, metric_names=geo_metrics, concentration_ratio_topn=concentration_ratio_topn, + entropy_alphas=entropy_alphas, ) diff --git a/bitcoin/config.yaml b/bitcoin/config.yaml index 9490e31..ec7c83e 100644 --- a/bitcoin/config.yaml +++ b/bitcoin/config.yaml @@ -43,6 +43,8 @@ metrics_parameters: concentration_ratio_topn: - 1 - 3 + entropy_alpha: + - 1 # Parameters for parsing/output generation parse_parameters: diff --git a/bitcoin/network_decentralization/helper.py b/bitcoin/network_decentralization/helper.py index 0b80686..8544a81 100644 --- a/bitcoin/network_decentralization/helper.py +++ b/bitcoin/network_decentralization/helper.py @@ -103,6 +103,29 @@ def get_concentration_ratio_topn(): return values if values else [1, 3] +def get_entropy_alphas(): + """ + Retrieves alpha values used by entropy metrics. + :returns: list of unique floats (defaults to [1]) + """ + params = get_config_data().get('metrics_parameters', {}) + raw_alphas = params.get('entropy_alpha', [1]) + + if not isinstance(raw_alphas, list): + raw_alphas = [raw_alphas] + + values = [] + for value in raw_alphas: + try: + parsed = float(value) + if parsed not in values: + values.append(parsed) + except (TypeError, ValueError): + continue + + return values if values else [1] + + def get_without_tor_ledgers(): """ Retrieves the target ledgers for generating *_without_tor CSV files. From 0061c14faf724a1808d964e125c44dfc9095490d Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 18 Mar 2026 15:17:49 +0000 Subject: [PATCH 40/42] Metrics now loaded as a dictionary from config.yaml --- bitcoin/compute_metrics.py | 145 +++++++++++---------- bitcoin/config.yaml | 29 ++--- bitcoin/network_decentralization/helper.py | 101 +++++++------- 3 files changed, 145 insertions(+), 130 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 455b92e..108d31a 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -7,7 +7,9 @@ import csv import pathlib import sys -from network_decentralization.helper import get_metrics_network, get_metrics_geo, get_concentration_ratio_topn, get_entropy_alphas +from ast import literal_eval + +from network_decentralization.helper import get_metrics_network, get_metrics_geo from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient from network_decentralization.metrics.entropy import compute_entropy @@ -60,59 +62,89 @@ def get_ledger_name(csv_path): return '_'.join(parts[1:]) -def compute_metrics(distribution, metric_names, concentration_ratio_topn, entropy_alphas): +def normalize_metric_name(metric_name): + """Normalizes metric names from config into registry keys.""" + if metric_name is None: + return '' + return str(metric_name).strip().lower().replace('-', '_').replace(' ', '_') + + +def parse_metric_spec(metric_spec): + """Parses metric token strings like 'entropy=1' into (token, name, parameter).""" + token = str(metric_spec).strip() + if not token: + return None + + if '=' not in token: + return token, normalize_metric_name(token), None + + raw_name, raw_parameter = token.split('=', 1) + normalized_name = normalize_metric_name(raw_name) + parameter_text = raw_parameter.strip() + parameter_value = parse_metric_parameter(parameter_text) + return token, normalized_name, parameter_value + + +def parse_metric_parameter(parameter_text): + """Parses metric parameter values from config strings into Python values.""" + if parameter_text is None: + return None + + text = str(parameter_text).strip() + if not text: + return None + + try: + return literal_eval(text) + except (ValueError, SyntaxError): + return text + + +def build_metric_columns(metric_specs): + """ + Builds ordered metric specs from configured metric tokens. + :param metric_specs: list of metric tokens (e.g., ['hhi', 'entropy=1']) + :returns: list of tuples (metric_token, metric_name, parameter_value) + """ + columns = [] + for metric_spec in metric_specs: + parsed = parse_metric_spec(metric_spec) + if parsed is None: + continue + + metric_token, metric_name, parameter_value = parsed + columns.append((metric_token, metric_name, parameter_value)) + + return columns + + +def compute_metrics(distribution, metric_columns): """ Compute specified metrics for a given distribution. :param distribution: Sorted list of entity counts (descending order) - :param metric_names: List of metric names to compute (e.g., ['HHI', 'Nakamoto', 'Entropy']) - :param concentration_ratio_topn: list of top-N parameters for concentration ratio metrics - :param entropy_alphas: alpha parameters used for entropy metric computation + :param metric_columns: list of tuples (metric_token, metric_name, parameter_value) :return: Dictionary with computed metric values """ metrics = {} - # Mapping of metric display names to computation functions - # Concentration Ratio is handled separately because one metric name expands to multiple outputs (one per configured top-N value). - metric_map = { - 'HHI': ('hhi', compute_hhi), - 'Nakamoto': ('nakamoto', compute_nakamoto_coefficient), - } - - for metric_name in metric_names: - if metric_name == 'Concentration Ratio': - for topn in concentration_ratio_topn: - key = f"concentration_ratio_top_{topn}" - try: - metrics[key] = compute_concentration_ratio(distribution, topn=topn) - except Exception as e: - print(f"Error computing {metric_name} (topn={topn}): {e}", file=sys.stderr) - metrics[key] = None - continue + for metric_token, metric_name, parameter_value in metric_columns: + function_name = f"compute_{metric_name}" - if metric_name == 'Entropy': - for alpha in entropy_alphas: - alpha_str = f"{alpha:g}" - key = f"entropy_alpha_{alpha_str}" - try: - metrics[key] = compute_entropy(distribution, alpha=alpha) - except Exception as e: - print(f"Error computing {metric_name} (alpha={alpha_str}): {e}", file=sys.stderr) - metrics[key] = None - continue - - if metric_name in metric_map: - key, func = metric_map[metric_name] - try: - metrics[key] = func(distribution) - except Exception as e: - print(f"Error computing {metric_name}: {e}", file=sys.stderr) - metrics[key] = None + try: + function = eval(function_name) + if parameter_value is None: + metrics[metric_token] = function(distribution) + else: + metrics[metric_token] = function(distribution, parameter_value) + except Exception as e: + print(f"Error computing {metric_token}: {e}", file=sys.stderr) + metrics[metric_token] = None return metrics -def process_csv_files(output_dir, file_pattern, is_country, metric_names, concentration_ratio_topn, entropy_alphas): +def process_csv_files(output_dir, file_pattern, is_country, metric_names): """ Process all CSV files matching a pattern and output metrics. Appends results to existing files or creates new ones. @@ -122,9 +154,9 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen :param file_pattern: Glob pattern for CSV files (e.g., 'organizations_*.csv') :param is_country: Boolean to indicate if processing country files :param metric_names: List of metric names to compute and output - :param concentration_ratio_topn: list of top-N parameters for concentration ratio metrics - :param entropy_alphas: alpha parameters used for entropy metric computation """ + metric_columns = build_metric_columns(metric_names) + # For bitcoin, prefer the _without_tor variant if it exists; skip the regular bitcoin file in that case file_type = 'countries' if is_country else 'organizations' without_tor_path = output_dir / f"{file_type}_bitcoin_without_tor.csv" @@ -139,7 +171,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen try: ledger = get_ledger_name(csv_path) date, distribution = read_csv_data(csv_path) - metrics = compute_metrics(distribution, metric_names, concentration_ratio_topn, entropy_alphas) + metrics = compute_metrics(distribution, metric_columns) # Determine output filename and metric column mapping file_type = 'countries' if is_country else 'organizations' @@ -147,20 +179,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen output_path = output_dir / output_filename file_exists = output_path.exists() - # Build output metric columns from selected metrics. - metric_columns = [] - for metric_name in metric_names: - if metric_name == 'Concentration Ratio': - for topn in concentration_ratio_topn: - metric_columns.append((f"Concentration Ratio (Top {topn})", f"concentration_ratio_top_{topn}")) - elif metric_name == 'Entropy': - for alpha in entropy_alphas: - alpha_str = f"{alpha:g}" - metric_columns.append((f"Entropy (alpha={alpha_str})", f"entropy_alpha_{alpha_str}")) - else: - metric_columns.append((metric_name, metric_name.lower())) - - header = ['ledger', 'date', 'clustering'] + [column[0] for column in metric_columns] + header = ['ledger', 'date', 'clustering'] + [metric_token for metric_token, _, _ in metric_columns] # Write header and data (append if exists) with open(output_path, 'a', newline='', encoding='utf-8') as f: @@ -170,8 +189,8 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names, concen # Build row with metric values in the same order as header row = [ledger, date, 'False'] - for _, metric_key in metric_columns: - value = metrics.get(metric_key) + for metric_token, _, _ in metric_columns: + value = metrics.get(metric_token) if value is None: row.append('') elif isinstance(value, float): @@ -196,8 +215,6 @@ def main(): # Load metric names from config using helper functions network_metrics = get_metrics_network() geo_metrics = get_metrics_geo() - concentration_ratio_topn = get_concentration_ratio_topn() - entropy_alphas = get_entropy_alphas() output_dir = pathlib.Path(__file__).parent / 'output' @@ -210,8 +227,6 @@ def main(): 'organizations_*.csv', is_country=False, metric_names=network_metrics, - concentration_ratio_topn=concentration_ratio_topn, - entropy_alphas=entropy_alphas, ) # Process country files with geo metrics @@ -220,8 +235,6 @@ def main(): 'countries_*.csv', is_country=True, metric_names=geo_metrics, - concentration_ratio_topn=concentration_ratio_topn, - entropy_alphas=entropy_alphas, ) diff --git a/bitcoin/config.yaml b/bitcoin/config.yaml index ec7c83e..4082e88 100644 --- a/bitcoin/config.yaml +++ b/bitcoin/config.yaml @@ -10,9 +10,6 @@ mode: - Countries - Organizations -# Used by distribution.py to know which column to distribute -date: '2025-03-28' - execution_parameters: concurrency: 100 @@ -26,27 +23,23 @@ output_directories: # Metrics for network analysis (organizations) network_metrics: - - HHI - - Nakamoto -# - Entropy - - Concentration Ratio + hhi: + nakamoto_coefficient: + concentration_ratio: + - 1 + - 3 # Metrics for geographic analysis (countries) geo_metrics: - - HHI - - Nakamoto - - Entropy - - Concentration Ratio - -# Parameters for metric computation -metrics_parameters: - concentration_ratio_topn: + hhi: + nakamoto_coefficient: + entropy: - 1 - - 3 - entropy_alpha: + concentration_ratio: - 1 + - 3 # Parameters for parsing/output generation parse_parameters: without_tor_ledgers: - - bitcoin + - bitcoin \ No newline at end of file diff --git a/bitcoin/network_decentralization/helper.py b/bitcoin/network_decentralization/helper.py index 8544a81..dd7d9c3 100644 --- a/bitcoin/network_decentralization/helper.py +++ b/bitcoin/network_decentralization/helper.py @@ -42,13 +42,6 @@ def get_mode(): """ return get_config_data()['mode'] -def get_date(): - """ - Retrieves data regarding the date to use - :returns: the date to be used by distribution.py - """ - return get_config_data()['date'] - def get_active(): """ Retrieves data regarding the packets to clean up @@ -66,64 +59,80 @@ def get_concurrency(): def get_metrics_network(): """ - Retrieves the list of metrics to compute for network analysis (organizations) - :returns: a list of metric names to compute + Retrieves the list of metrics to compute for network analysis (organizations). + Supports either a list (e.g., ['hhi', 'nakamoto']) or a dictionary + (e.g., {'concentration_ratio': [1, 3]}), which is expanded to tokens like + 'concentration_ratio=1' and 'concentration_ratio=3'. + :returns: a list of metric tokens to compute """ - return get_config_data().get('network_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Concentration Ratio']) + default = { + 'hhi': None, + 'nakamoto': None, + 'entropy': None, + 'concentration_ratio': None, + } + return _expand_metric_config(get_config_data().get('network_metrics', default), default) def get_metrics_geo(): """ - Retrieves the list of metrics to compute for geographic analysis (countries) - :returns: a list of metric names to compute + Retrieves the list of metrics to compute for geographic analysis (countries). + Supports either a list (e.g., ['hhi', 'nakamoto']) or a dictionary + (e.g., {'concentration_ratio': [1, 3]}), which is expanded to tokens like + 'concentration_ratio=1' and 'concentration_ratio=3'. + :returns: a list of metric tokens to compute """ - return get_config_data().get('geo_metrics', ['HHI', 'Nakamoto', 'Entropy', 'Concentration Ratio']) + default = { + 'hhi': None, + 'nakamoto': None, + 'entropy': None, + 'concentration_ratio': None, + } + return _expand_metric_config(get_config_data().get('geo_metrics', default), default) -def get_concentration_ratio_topn(): +def _expand_metric_config(raw_metrics, default_metrics): """ - Retrieves top-N values used by concentration-ratio based metrics. - :returns: list of unique positive integers (defaults to [1, 3]) + Expands metric configuration into a flat list of metric tokens. + Example: {'entropy': [1, 2]} -> ['entropy=1', 'entropy=2'] """ - params = get_config_data().get('metrics_parameters', {}) - raw_topn = params.get('concentration_ratio_topn', [1, 3]) + metrics = raw_metrics if raw_metrics is not None else default_metrics - if not isinstance(raw_topn, list): - raw_topn = [raw_topn] + if isinstance(metrics, list): + return [str(metric).strip() for metric in metrics if str(metric).strip()] - values = [] - for value in raw_topn: - try: - parsed = int(value) - if parsed > 0 and parsed not in values: - values.append(parsed) - except (TypeError, ValueError): - continue + if not isinstance(metrics, dict): + return [] - return values if values else [1, 3] + expanded = [] + for metric_name, parameter_values in metrics.items(): + name = str(metric_name).strip() + if not name: + continue + if parameter_values is None: + expanded.append(name) + continue -def get_entropy_alphas(): - """ - Retrieves alpha values used by entropy metrics. - :returns: list of unique floats (defaults to [1]) - """ - params = get_config_data().get('metrics_parameters', {}) - raw_alphas = params.get('entropy_alpha', [1]) + if isinstance(parameter_values, list): + values = parameter_values + else: + values = [parameter_values] - if not isinstance(raw_alphas, list): - raw_alphas = [raw_alphas] + unique_values = [] + for value in values: + rendered = None if value is None else str(value).strip() + if rendered is not None and rendered not in unique_values: + unique_values.append(rendered) - values = [] - for value in raw_alphas: - try: - parsed = float(value) - if parsed not in values: - values.append(parsed) - except (TypeError, ValueError): + if not unique_values: + expanded.append(name) continue - return values if values else [1] + for rendered in unique_values: + expanded.append(f"{name}={rendered}") + + return expanded def get_without_tor_ledgers(): From f9e77f5bbbc9861469383cec6425323ceed352c8 Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 18 Mar 2026 15:26:23 +0000 Subject: [PATCH 41/42] Update requirements.txt --- bitcoin/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitcoin/requirements.txt b/bitcoin/requirements.txt index 44084dd..20224ba 100644 --- a/bitcoin/requirements.txt +++ b/bitcoin/requirements.txt @@ -5,6 +5,6 @@ dnspython>=2.6.1 PySocks>=1.7.1 python3-nmap>=1.6.0 pandas>=2.2.3 +numpy>=1.26 networkx>=3.1 -scipy>=1.13 matplotlib>=3.9 From 0d6d67d4dd5b0d96c1c0795270f2cccd1ecee2cd Mon Sep 17 00:00:00 2001 From: LauraAntunes1 <190111637+LauraAntunes1@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:31:24 +0000 Subject: [PATCH 42/42] No default values for metrics and without_tor not only for bitcoin --- bitcoin/compute_metrics.py | 21 ++++++++++++-------- bitcoin/network_decentralization/helper.py | 23 +++++++--------------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py index 108d31a..8c76f74 100644 --- a/bitcoin/compute_metrics.py +++ b/bitcoin/compute_metrics.py @@ -9,7 +9,7 @@ import sys from ast import literal_eval -from network_decentralization.helper import get_metrics_network, get_metrics_geo +from network_decentralization.helper import get_metrics_network, get_metrics_geo, get_without_tor_ledgers from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient from network_decentralization.metrics.entropy import compute_entropy @@ -148,7 +148,7 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): """ Process all CSV files matching a pattern and output metrics. Appends results to existing files or creates new ones. - For bitcoin, uses the _without_tor versions if they exist. + Uses _without_tor versions when configured in parse_parameters.without_tor_ledgers. :param output_dir: Path to the output directory :param file_pattern: Glob pattern for CSV files (e.g., 'organizations_*.csv') @@ -156,20 +156,25 @@ def process_csv_files(output_dir, file_pattern, is_country, metric_names): :param metric_names: List of metric names to compute and output """ metric_columns = build_metric_columns(metric_names) + without_tor_ledgers = set(get_without_tor_ledgers() or []) - # For bitcoin, prefer the _without_tor variant if it exists; skip the regular bitcoin file in that case + # Prefer configured _without_tor variants and skip the corresponding regular file when both exist. file_type = 'countries' if is_country else 'organizations' - without_tor_path = output_dir / f"{file_type}_bitcoin_without_tor.csv" - skip_regular_bitcoin = without_tor_path.exists() csv_files = sorted(output_dir.glob(file_pattern)) for csv_path in csv_files: - if csv_path.name == f"{file_type}_bitcoin.csv" and skip_regular_bitcoin: - continue - try: ledger = get_ledger_name(csv_path) + + regular_path = output_dir / f"{file_type}_{ledger}.csv" + without_tor_path = output_dir / f"{file_type}_{ledger}_without_tor.csv" + is_regular_file = csv_path.name == regular_path.name + has_without_tor_variant = without_tor_path.exists() + + if is_regular_file and ledger in without_tor_ledgers and has_without_tor_variant: + continue + date, distribution = read_csv_data(csv_path) metrics = compute_metrics(distribution, metric_columns) diff --git a/bitcoin/network_decentralization/helper.py b/bitcoin/network_decentralization/helper.py index dd7d9c3..e2dc9e8 100644 --- a/bitcoin/network_decentralization/helper.py +++ b/bitcoin/network_decentralization/helper.py @@ -65,13 +65,7 @@ def get_metrics_network(): 'concentration_ratio=1' and 'concentration_ratio=3'. :returns: a list of metric tokens to compute """ - default = { - 'hhi': None, - 'nakamoto': None, - 'entropy': None, - 'concentration_ratio': None, - } - return _expand_metric_config(get_config_data().get('network_metrics', default), default) + return _expand_metric_config(get_config_data().get('network_metrics')) def get_metrics_geo(): @@ -82,21 +76,18 @@ def get_metrics_geo(): 'concentration_ratio=1' and 'concentration_ratio=3'. :returns: a list of metric tokens to compute """ - default = { - 'hhi': None, - 'nakamoto': None, - 'entropy': None, - 'concentration_ratio': None, - } - return _expand_metric_config(get_config_data().get('geo_metrics', default), default) + return _expand_metric_config(get_config_data().get('geo_metrics')) -def _expand_metric_config(raw_metrics, default_metrics): +def _expand_metric_config(raw_metrics): """ Expands metric configuration into a flat list of metric tokens. Example: {'entropy': [1, 2]} -> ['entropy=1', 'entropy=2'] """ - metrics = raw_metrics if raw_metrics is not None else default_metrics + metrics = raw_metrics + + if metrics is None: + return [] if isinstance(metrics, list): return [str(metric).strip() for metric in metrics if str(metric).strip()]