diff --git a/bitcoin/README.md b/bitcoin/README.md index 683d8ad..e319f18 100644 --- a/bitcoin/README.md +++ b/bitcoin/README.md @@ -14,6 +14,7 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin 2. **Data Collection:** Scripts collect data about nodes like IP addresses and client versions. 3. **Data Parsing:** `parse.py` formats raw logs into structured files. 4. **Visualisation:** `plot.py` generates several graphs. +5. **Metrics Computation:** `compute_metrics.py` calculates decentralisation metrics. --- @@ -27,11 +28,8 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin - **`parse.py`** Processes raw data (e.g., logs from crawling) into structured formats (JSON, CSV) for easier analysis and plotting. -- **`analyze.py`** (Not in use) - Analyses datasets to extract decentralisation metrics. - -- **`distribution.py`** - Distributes Tor nodes among others proportionally, by country or organisation. +- **`compute_metrics.py`** + Computes network decentralisation metrics (HHI, Nakamoto coefficient, entropy, concentration ratios) from CSV files. - **`plot.py`** Generates data visualisations. @@ -39,12 +37,10 @@ This component of the project analyses the decentralisation of Bitcoin, Bitcoin - **`collect_geodata.py`** Uses third-party APIs to enrich nodes with geolocation info (country, city, organisation). -- **`collect_osdata.py`** (Not in use) - Identifies the operating system running on nodes. - - **`cleanup_dead_nodes.py`** Scans stored node datasets to remove offline or unreachable nodes. 
+ ### Automation & Configuration - **`automation.sh`** @@ -92,6 +88,7 @@ The scripts generate: - Parsed node datasets (CSV, JSON) - Geolocation-enriched data - Plots and charts in PNG +- Computed metrics in `output_organizations_*.csv` and `output_countries_*.csv` files --- @@ -100,15 +97,13 @@ The scripts generate: ``` bitcoin/ │ -├── analyze.py ├── automation.sh ├── cleanup_dead_nodes.py ├── collect_geodata.py -├── collect_osdata.py ├── crawl.py -├── distribution.py ├── parse.py ├── plot.py +├── compute_metrics.py │ ├── config.yaml ├── requirements.txt @@ -121,7 +116,12 @@ bitcoin/ │ ├── collect.py │ ├── constants.py │ ├── helper.py -│ └── protocol.py +│ ├── protocol.py +│ └── metrics/ +│ ├── concentration_ratio.py +│ ├── entropy.py +│ ├── herfindahl_hirschman_index.py +│ └── nakamoto_coefficient.py │ └── seed_info/ ├── bitcoin.json diff --git a/bitcoin/analyze.py b/bitcoin/analyze.py deleted file mode 100644 index c30c268..0000000 --- a/bitcoin/analyze.py +++ /dev/null @@ -1,60 +0,0 @@ -import csv -import network_decentralization.helper as hlp -import networkx as nx -import logging - -logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO) - - -LEDGERS = ['bitcoin', 'bitcoin_cash', 'dogecoin', 'litecoin', 'zcash'] -LEDGERS = ['bitcoin_cash', 'dogecoin', 'litecoin', 'zcash'] - - -network_edge_dir = hlp.get_output_directory() / 'network_edges' - -for ledger in LEDGERS: - logging.info(f'Analyzing {ledger}') - - output_dir = hlp.get_output_directory() - edges = [] - nodes = set() - try: - with open(output_dir / 'network_edges' / f'{ledger}.csv') as f: - csv_reader = csv.reader(f) - next(csv_reader) - for source, dest in csv_reader: - nodes.add(source) - nodes.add(dest) - if source != dest: - edges.append((source, dest)) - except FileNotFoundError: - continue - - G = nx.DiGraph() - G.add_edges_from(edges) - - all_nodes = list(G.nodes()) - logging.info(f'\t Nodes: {len(nodes):,} - Edges: {len(edges):,}') - 
logging.info(f'\t Isolated nodes (no in/out edges): {len(nodes)-len(all_nodes):,}') - - degrees = G.degree() - avg_degree = sum([i[1] for i in degrees]) / len(degrees) - logging.info(f'\t Average node degree: {avg_degree:,}') - - is_strongly_connected = nx.is_strongly_connected(G) - logging.info(f'\t Is strongly connected: {is_strongly_connected}') - - if is_strongly_connected: - diameter = nx.diameter(G) - logging.info(f'\t Diameter (largest component): {diameter:,}') - else: - diameter = {} - for node in all_nodes: - shortest_paths = nx.shortest_path(G, source=node) - longest_shortest_path = max(shortest_paths.items(), key=lambda x: len(x[1]))[1] - if longest_shortest_path == [node]: - diameter[node] = -1 - else: - diameter[node] = len(longest_shortest_path) - logging.info(f'\t Diameter of known graph: {max(diameter.values())}') - logging.info(f'\t Nodes without outgoing edges: {len([i for i in diameter if diameter[i] == -1])}') diff --git a/bitcoin/automation.sh b/bitcoin/automation.sh index d010661..2c58ebc 100644 --- a/bitcoin/automation.sh +++ b/bitcoin/automation.sh @@ -8,14 +8,13 @@ do python3 crawl.py # comment this line if new data must not be gathered python3 cleanup_dead_nodes.py python3 collect_geodata.py -#python3 collect_osdata.py # not in use python3 parse.py -python3 distribution.py python3 plot.py +python3 compute_metrics.py # The following 2 lines create a folder and move all png and csv files to it mkdir output/"$(date +%Y-%m-%d)" -mv -t output/"$(date +%Y-%m-%d)" output/*.png output/*.csv +mv -t output/"$(date +%Y-%m-%d)" output/{clients,countries,protocols,organizations,ip,discovery,peerstore}*.csv output/response_length.json output/*.png 2>/dev/null || true sleep 7d # will repeat the whole process every X days diff --git a/bitcoin/collect_osdata.py b/bitcoin/collect_osdata.py deleted file mode 100644 index 43e2cdb..0000000 --- a/bitcoin/collect_osdata.py +++ /dev/null @@ -1,30 +0,0 @@ -from network_decentralization.collect import 
collect_osdata -import network_decentralization.helper as hlp -import time -import logging - -logging.basicConfig(format='[%(asctime)s] %(message)s', datefmt='%Y/%m/%d %I:%M:%S %p', level=logging.INFO) - - -def main(): - ledgers = hlp.get_ledgers() - timings = {} - for ledger in ledgers: - start = time.time() - collect_osdata(ledger, time.strftime('%Y-%m-%d')) - total_time = time.time() - start - timings[ledger] = total_time - - print(2*'----------------\n') - for ledger in hlp.get_ledgers(): - total_time = timings[ledger] - days = int(total_time / 86400) - hours = int((total_time - days*86400) / 3600) - mins = int((total_time - hours*3600 - days*86400) / 60) - secs = int(total_time - mins*60 - hours*3600 - days*86400) - print(f'\tcollect_osdata.py: {ledger} total time: {hours:02} hours, {mins:02} mins, {secs:02} secs') - print(2*'----------------\n') - - -if __name__ == '__main__': - main() diff --git a/bitcoin/compute_metrics.py b/bitcoin/compute_metrics.py new file mode 100644 index 0000000..8c76f74 --- /dev/null +++ b/bitcoin/compute_metrics.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +""" +Script to compute network decentralization metrics from CSV files in the output directory. +Processes both organization and country CSV files and outputs metrics in CSV format. +""" + +import csv +import pathlib +import sys +from ast import literal_eval + +from network_decentralization.helper import get_metrics_network, get_metrics_geo, get_without_tor_ledgers +from network_decentralization.metrics.herfindahl_hirschman_index import compute_hhi +from network_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient +from network_decentralization.metrics.entropy import compute_entropy +from network_decentralization.metrics.concentration_ratio import compute_concentration_ratio + + +def read_csv_data(csv_path): + """ + Read CSV file and extract date and distribution values. 
def get_ledger_name(csv_path):
    """
    Derive the ledger name from the filename of an input CSV.
    Expected format: organizations_<ledger>.csv or countries_<ledger>.csv, where the
    filename may additionally carry a '_without_tor' marker that is dropped.

    :param csv_path: Path to the CSV file
    :return: Ledger name (e.g., 'bitcoin', 'bitcoin_cash')
    """
    # Work on the extension-less name; removing the marker maps both file variants to one ledger
    stem = csv_path.stem.replace('_without_tor', '')
    # The first underscore-separated token is the file-type prefix; the rest is the ledger name
    _file_type, *ledger_tokens = stem.split('_')
    return '_'.join(ledger_tokens)
def compute_metrics(distribution, metric_columns):
    """
    Compute specified metrics for a given distribution.

    Each metric name is resolved to a module-level callable named ``compute_<metric_name>``.
    The lookup is a plain dictionary access on the module globals, so a metric name taken
    from the config can never be evaluated as an arbitrary Python expression.
    A metric that is unknown or whose computation fails is reported on stderr and stored
    as None, so one bad metric does not abort the others.

    :param distribution: Sorted list of entity counts (descending order)
    :param metric_columns: list of tuples (metric_token, metric_name, parameter_value)
    :return: Dictionary mapping each metric_token to its computed value (None on failure)
    """
    metrics = {}

    for metric_token, metric_name, parameter_value in metric_columns:
        function_name = f"compute_{metric_name}"

        try:
            # Name lookup only (no eval): anything that is not an existing callable is rejected
            function = globals().get(function_name)
            if not callable(function):
                raise ValueError(f"unknown metric function '{function_name}'")

            if parameter_value is None:
                metrics[metric_token] = function(distribution)
            else:
                metrics[metric_token] = function(distribution, parameter_value)
        except Exception as e:
            # Deliberate best-effort: keep going with the remaining metrics
            print(f"Error computing {metric_token}: {e}", file=sys.stderr)
            metrics[metric_token] = None

    return metrics
+ + :param output_dir: Path to the output directory + :param file_pattern: Glob pattern for CSV files (e.g., 'organizations_*.csv') + :param is_country: Boolean to indicate if processing country files + :param metric_names: List of metric names to compute and output + """ + metric_columns = build_metric_columns(metric_names) + without_tor_ledgers = set(get_without_tor_ledgers() or []) + + # Prefer configured _without_tor variants and skip the corresponding regular file when both exist. + file_type = 'countries' if is_country else 'organizations' + + csv_files = sorted(output_dir.glob(file_pattern)) + + for csv_path in csv_files: + try: + ledger = get_ledger_name(csv_path) + + regular_path = output_dir / f"{file_type}_{ledger}.csv" + without_tor_path = output_dir / f"{file_type}_{ledger}_without_tor.csv" + is_regular_file = csv_path.name == regular_path.name + has_without_tor_variant = without_tor_path.exists() + + if is_regular_file and ledger in without_tor_ledgers and has_without_tor_variant: + continue + + date, distribution = read_csv_data(csv_path) + metrics = compute_metrics(distribution, metric_columns) + + # Determine output filename and metric column mapping + file_type = 'countries' if is_country else 'organizations' + output_filename = f"output_{file_type}_{ledger}.csv" + output_path = output_dir / output_filename + file_exists = output_path.exists() + + header = ['ledger', 'date', 'clustering'] + [metric_token for metric_token, _, _ in metric_columns] + + # Write header and data (append if exists) + with open(output_path, 'a', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + if not file_exists: + writer.writerow(header) + + # Build row with metric values in the same order as header + row = [ledger, date, 'False'] + for metric_token, _, _ in metric_columns: + value = metrics.get(metric_token) + if value is None: + row.append('') + elif isinstance(value, float): + row.append(f"{value:.16g}") + else: + row.append(str(value)) + + 
def main():
    """
    Script entry point.
    Loads the configured metric tokens, then computes metrics for every organization CSV
    (network metrics) and every country CSV (geo metrics) found in the 'output' directory
    located next to this script. Exits with status 1 when that directory does not exist.
    """
    # Metric tokens are read from the config first, in the same order as the runs below
    jobs = (
        ('organizations_*.csv', False, get_metrics_network()),
        ('countries_*.csv', True, get_metrics_geo()),
    )

    output_dir = pathlib.Path(__file__).parent / 'output'
    if not output_dir.exists():
        print(f"Error: Output directory not found at {output_dir}", file=sys.stderr)
        sys.exit(1)

    # Organizations are processed before countries
    for file_pattern, is_country, metric_names in jobs:
        process_csv_files(
            output_dir,
            file_pattern,
            is_country=is_country,
            metric_names=metric_names,
        )
a/bitcoin/distribution.py +++ /dev/null @@ -1,54 +0,0 @@ -# This script is only used for the Bitcoin ledger. If the Bitcoin ledger is not selected in the config.yaml file, this script does nothing. - -import network_decentralization.helper as hlp -import logging -import pandas as pd -from pathlib import Path - -def redistribute_tor_nodes(name, ledger, df, mode): - """ - Redistributes Tor node count proportionally across non-Tor rows. - :param name: lowercase version of the mode ('countries' or 'organizations') used in file naming. - :param ledger: the ledger name. - :param df: the dataframe in which the Tor nodes must be reditributed. - :param mode: Countries or Organizations. - """ - tor_row = df[df[mode] == 'Tor'] - if tor_row.empty: - logging.info(f"No Tor nodes found in {ledger}.") - return - - number_of_tor_nodes = tor_row[date].values[0] # extract the number of Tor nodes for the given date - number_of_total_nodes_without_tor = df[df[f'{mode}'] != 'Tor'][f'{date}'].sum() # sum of node counts excluding the Tor row - number_of_total_nodes = number_of_total_nodes_without_tor + number_of_tor_nodes - df['Distribution'] = df.apply(lambda row: round((row[f'{date}'] / number_of_total_nodes_without_tor) * number_of_tor_nodes) if row[f'{mode}'] != 'Tor' else 0, axis=1) # create a new column 'Distribution' that distributes the Tor nodes proportionally to non-Tor rows - df[date] = df[date] + df['Distribution'] - df_without_tor = df[df[f'{mode}'] != 'Tor'] # filter out the Tor row - df_without_tor[[mode, date]].to_csv(f'./output/{name}_{ledger}_without_tor.csv', index=False) # save the updated DataFrame to a new CSV - - -def without_tor(): - """ - Loads a CSV file and calls the redistribute_tor_nodes function. 
def get_metrics_network():
    """
    Returns the metric tokens configured under 'network_metrics' (organizations analysis).
    The configured value is handed to _expand_metric_config, so it may be either a list of
    tokens (e.g., ['hhi', 'nakamoto_coefficient']) or a dictionary mapping a metric name to
    its parameter values (e.g., {'concentration_ratio': [1, 3]}), which is flattened to
    'concentration_ratio=1' and 'concentration_ratio=3'.
    :returns: a list of metric tokens to compute (empty when the key is not configured)
    """
    config = get_config_data()
    configured_metrics = config.get('network_metrics')
    return _expand_metric_config(configured_metrics)
+ :returns: a list of metric tokens to compute + """ + return _expand_metric_config(get_config_data().get('geo_metrics')) + + +def _expand_metric_config(raw_metrics): + """ + Expands metric configuration into a flat list of metric tokens. + Example: {'entropy': [1, 2]} -> ['entropy=1', 'entropy=2'] + """ + metrics = raw_metrics + + if metrics is None: + return [] + + if isinstance(metrics, list): + return [str(metric).strip() for metric in metrics if str(metric).strip()] + + if not isinstance(metrics, dict): + return [] + + expanded = [] + for metric_name, parameter_values in metrics.items(): + name = str(metric_name).strip() + if not name: + continue + + if parameter_values is None: + expanded.append(name) + continue + + if isinstance(parameter_values, list): + values = parameter_values + else: + values = [parameter_values] + + unique_values = [] + for value in values: + rendered = None if value is None else str(value).strip() + if rendered is not None and rendered not in unique_values: + unique_values.append(rendered) + + if not unique_values: + expanded.append(name) + continue + + for rendered in unique_values: + expanded.append(f"{name}={rendered}") + + return expanded + + +def get_without_tor_ledgers(): + """ + Retrieves the target ledgers for generating *_without_tor CSV files. 
def compute_entropy(distribution, alpha):
    """
    Calculates the entropy of an entity distribution.
    Pi is the relative frequency of each entity; entities with a zero count are not part of
    the distribution's support and are ignored by every entropy variant.
    Renyi entropy: 1/(1-alpha) * log2 (sum (Pi**alpha))
    Shannon entropy (alpha=1): −sum P(Si) log2 (Pi)
    Min entropy (alpha=-1): -log max Pi
    :param distribution: list of non-negative counts per entity, sorted in descending order
    :param alpha: the entropy parameter (depending on its value the corresponding entropy measure is used)
    :returns: a float that represents the entropy of the data or None if the data is empty
    """
    total = sum(distribution)
    if total == 0:
        return None

    # Zero-count entities are dropped up front: in the Renyi sum they would otherwise count
    # as 1 each for alpha=0 (0**0 == 1) and raise ZeroDivisionError for a negative alpha.
    rel_freqs = [value / total for value in distribution if value > 0]

    if alpha == 1:
        # Shannon entropy
        entropy = 0
        for rel_freq in rel_freqs:
            entropy -= rel_freq * log(rel_freq, 2)
    elif alpha == -1:
        # Min entropy (alpha=-1 is this module's convention for selecting it)
        entropy = -log(max(rel_freqs), 2)
    else:
        # General Renyi entropy
        sum_freqs = 0
        for rel_freq in rel_freqs:
            sum_freqs += pow(rel_freq, alpha)
        entropy = log(sum_freqs, 2) / (1 - alpha)

    return entropy
def compute_tau_index(distribution, threshold):
    """
    Calculates the tau-decentralization index of an entity distribution, i.e. the minimum
    number of top entities whose combined share reaches the given threshold.
    :param distribution: list of non-negative counts per entity, sorted in descending order
    :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power
    ratio that is captured by the index (e.g. 0.66 for 66%)
    :returns: int that corresponds to the tau index of the given distribution, or None if total is 0
    """
    total = sum(distribution)
    if total == 0:
        return None

    tau_index, amount_covered = 0, 0
    for amount in distribution:
        # Accumulate the raw amounts and divide once per check: summing per-entity ratios
        # compounds float rounding errors (eight times 0.1 adds up to 0.7999999999999999,
        # which misses a 0.8 threshold and counts one entity too many).
        if amount_covered / total >= threshold:
            break
        tau_index += 1
        amount_covered += amount
    return tau_index
def create_without_tor_files(ledger):
    """
    Loads the per-mode CSV files of the given ledger and redistributes Tor nodes.
    For each of the 'Countries' and 'Organizations' modes, reads
    ./output/<mode>_<ledger>.csv and passes it to redistribute_tor_nodes, which writes the
    corresponding *_without_tor file. A mode whose input file is missing is skipped with a
    warning, so the other mode is still processed.
    :param ledger: the ledger to process for *_without_tor output files
    """
    for mode in ('Countries', 'Organizations'):
        logging.info(f'parse.py: Removing Tor from {ledger} {mode}')
        mode_lower = mode.lower()
        filename = Path(f'./output/{mode_lower}_{ledger}.csv')
        if not filename.is_file():
            # Skip only this mode: returning here would also drop the remaining mode
            # whenever the first file is absent (e.g. that mode was not generated).
            logging.warning(f"File not found: {filename}")
            continue
        df = pd.read_csv(filename)
        redistribute_tor_nodes(mode_lower, ledger, df, mode)
@@ -380,6 +420,7 @@ def cluster_organizations(ledger): def main(): logging.info('Start parsing') + without_tor_ledgers = set(hlp.get_without_tor_ledgers() or []) reachable_nodes = {} for ledger in LEDGERS: @@ -389,6 +430,8 @@ def main(): geography(reachable_nodes, ledger, mode) if 'Organizations' in MODES: cluster_organizations(ledger) + if ledger in without_tor_ledgers: + create_without_tor_files(ledger) if __name__ == '__main__': main() diff --git a/bitcoin/requirements.txt b/bitcoin/requirements.txt index 44084dd..20224ba 100644 --- a/bitcoin/requirements.txt +++ b/bitcoin/requirements.txt @@ -5,6 +5,6 @@ dnspython>=2.6.1 PySocks>=1.7.1 python3-nmap>=1.6.0 pandas>=2.2.3 +numpy>=1.26 networkx>=3.1 -scipy>=1.13 matplotlib>=3.9