Genomic Barcode Whitelisting Analysis Script

Abraheem ‘Abe’ Khouqeer

2024-12-16

Description:

This is a rendered ipynb (python) script that provides a comprehensive workflow for processing genomic barcode data, primarily for whitelisting and analyzing diversity in sequencing libraries. It includes the following key steps:

  1. Virtual Environment Setup: Establishes a Python virtual environment and installs required dependencies.
  2. File Preparation: Extracts FASTQ data from compressed files and runs quality control checks.
  3. Barcode Processing: Defines functions to extract, count, and filter barcodes from sequencing reads.
  4. Clustering Barcodes: Groups similar barcodes using UMI tools to generate a whitelist of valid barcodes.
  5. Hamming Distance Filtering: Collapses barcodes across replicates based on Hamming distance thresholds.
  6. Output Generation: Saves intermediate and final results as CSV files for further downstream analysis.

This script is designed for flexibility and can process multiple FASTQ files. Each function is detailed below, along with examples of its usage. To use this script effectively, you will need to customize it to match your specific directories and environment configurations.

# Make a virtual environment
python3 -m venv Juypterenv
source Juypterenv/bin/activate
pip install jinja2 MarkupSafe
pip install umi_tools networkx pandas numpy matplotlib
pip install ipykernel
python -m ipykernel install --user --name=Juypterenv --display-name "Python (Juypterenv)"
# import libs
from umi_tools import network as nk
import umi_tools as umi
import sys
import pandas as pd
import numpy as np, matplotlib.pyplot as plt, networkx as nx, pickle, json, gzip
import subprocess
import os
import csv
from collections import Counter
import re
import gzip, shutil
# Extract all gzip fastq files
dir_name = '' # directory where the files are stored

def gz_extract(directory):
    extension = '.gz'
    os.chdir(directory)
    for item in os.listdir(directory): # loop through items in dir
      if item.endswith(extension): # check for '.gz' extension
          gz_name = os.path.abspath(item) # get full path of files
          file_name = (os.path.basename(gz_name)).rsplit('.',1)[0] #get file name for file within
          with gzip.open(gz_name,'rb') as f_in, open(file_name,'wb') as f_out:
              shutil.copyfileobj(f_in, f_out)
          os.remove(gz_name) # delete zipped file
        
gz_extract(dir_name)
os.getcwd()
# Run QC on Fastq files and check'em Not working 
data_directory = '' # directory where the files are stored
subprocess.run([f'{data_directory}/*.fastq '], shell=True) # Run FastQC on all fastq files in the directory
# Define functions to wrangle Fastq files to include one sequence per line/row

def extract_and_save_sequences(file_path, output_directory, N_READS=1):
    '''
    Extract sequences from a FASTQ file, filter barcodes using regex, count unique barcodes,
    and save them to a CSV file.

    Parameters:
    - file_path (str): The path to the FASTQ file.
    - output_directory (str): The directory where the CSV file will be saved.
    - N_READS (int): The minimum count of reads for a barcode to be included (default is 1).
    '''
    sequences = []
    barcodes = []

    barcode_pattern = '[ATCG]{4}TG[ATCG]{4}CA[ATCG]{4}AC[ATCG]{4}GA[ATCG]{4}GT[ATCG]{4}AG[ATCG]{4}' # Define barcode pattern (can be used with any barcode pattern, be it LARRY or others)

    with open(file_path, 'r') as f:
        lines = f.readlines()

        # Extract the original file name without the extension
        file_name = file_path.split('/')[-1].split('.')[0]

        # Loop through every 4 lines (header, sequence, separator, quality)
        for i in range(0, len(lines), 4):
            sequence = lines[i + 1].strip()  # Extract the sequence line
            match = re.search(barcode_pattern, sequence)

            if match:
                barcode = match.group(0)
                sequences.append(sequence)
                barcodes.append(barcode)

    # Count unique barcodes
    barcode_counts = Counter(barcodes)

    # Filter by the minimum count (N_READS)
    filtered_barcodes = {barcode: count for barcode, count in barcode_counts.items() if count >= N_READS}

    # Create a CSV file with barcodes and counts
    output_file_name = f'Library_{file_name}_barcodes.csv'
    output_file_path = os.path.join(output_directory, output_file_name)

    with open(output_file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Barcode', 'Count'])  # Header
        csv_writer.writerows(filtered_barcodes.items())

    print(f'Filtered and counted barcodes saved to: {output_file_path}')



def process_all_fastq_files(input_directory, output_directory, N_READS=1):
    '''
    Process all FASTQ files in the specified directory.

    Parameters:
    - input_directory (str): The path to the directory containing FASTQ files.
    - output_directory (str): The directory where the CSV files will be saved.
    - N_READS (int): The minimum count of reads for a barcode to be included (default is 1).
    '''
    for filename in os.listdir(input_directory):
        if filename.endswith('.fastq'):
            file_path = os.path.join(input_directory, filename)
            print(f'Processing file: {file_path}')
            extract_and_save_sequences(file_path, output_directory, N_READS)
# Execute functions on fastq directory
data_directory = '' # directory where the files are stored
input_directory = data_directory
output_directory = data_directory
process_all_fastq_files(input_directory, output_directory, N_READS=5)
# Collapse barcodes and generate a whitelist for each fastq file in the directory

def process_csv_files(input_directory, output_directory, threshold=5):
    '''
    Process all CSV files in the specified directory.

    Parameters:
    - input_directory (str): The path to the directory containing CSV files.
    - output_directory (str): The directory where the output CSV files will be saved.
    - threshold (int): The clustering threshold (default is 5).
    '''
    for filename in os.listdir(input_directory):
        if filename.endswith('.csv'):
            csv_path = os.path.join(input_directory, filename)
            print(f'Processing CSV file: {csv_path}')

            # Read CSV file and extract barcodes and counts
            cbFinal = dict()
            with open(csv_path, 'r') as csvfile:
                reader = csv.reader(csvfile)
                next(reader)  # Skip header
                for row in reader:
                    barcode, count = row
                    barcode = barcode.encode('utf-8')
                    count = int(count)

                    if barcode not in cbFinal:
                        cbFinal[barcode] = 0

                    cbFinal[barcode] += count

            # Perform clustering
            uc = nk.UMIClusterer()
            CBclusters = uc(cbFinal, threshold=threshold)

            # Create the final dictionary
            cbFinalClustered = dict()
            for cluster in CBclusters:
                cbFinalClustered[cluster[0]] = sum(cbFinal[barcode] for barcode in cluster)

            # Save the final clustered dictionary to a CSV file with the original CSV file name and 'whitelist'
            csv_file_name = os.path.splitext(filename)[0]  # Extract CSV file name without extension
            output_file_name = f'{csv_file_name}_whitelist.csv'
            output_file_path = os.path.join(output_directory, output_file_name)

            with open(output_file_path, 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerow(['Barcode', 'Count'])  # Header
                for barcode, count in cbFinalClustered.items():
                    barcode_str = barcode.decode('utf-8')  # Decode barcode from bytes to string
                    csv_writer.writerow([barcode_str, count])

            print(f'Final clustered whitelist saved to: {output_file_path}')



process_csv_files(input_directory, output_directory, threshold = 5)
# Filtering between the replicates to get reproducible whitelists via Hamming distance.
# These are for the duplicates of each sample:

def hamming_distance(bc1, bc2):
    '''
    Calculate the Hamming distance between two sequences.

    Parameters:
    - bc1 (str): First sequence.
    - bc2 (str): Second sequence.

    Returns:
    - int: The Hamming distance between bc1 and bc2.
    '''
    # Ensure that the sequences are of equal length
    if len(bc1) != len(bc2):
        raise ValueError('Sequences must be of equal length for Hamming distance calculation.')

    # Calculate Hamming distance
    distance = np.sum([x1 != x2 for x1, x2 in zip(bc1, bc2)])

    return distance

def collapse_and_save(input_csv1, input_csv2, output_csv, HD):
    '''
    Collapse barcodes between datasets and within a dataset based on Hamming distance.

    Parameters:
    - input_csv1 (str): Path to the first input CSV file.
    - input_csv2 (str): Path to the second input CSV file.
    - output_csv (str): Path to the output CSV file.
    - HD (int): Hamming distance threshold.

    Returns:
    - pd.DataFrame: DataFrame containing the final collapsed barcodes.
    '''
    # Load input CSV files
    df1 = pd.read_csv(input_csv1)
    df2 = pd.read_csv(input_csv2)

    # Extract barcodes from DataFrames
    all_bcs_rep1 = sorted(df1['Barcode'])
    all_bcs_rep2 = sorted(df2['Barcode'])

    # Step 1: Collapse between datasets
    bc_map = {}
    for i, bc1 in enumerate(all_bcs_rep1):
        if i > 0 and i % 500 == 0:
            print(f'Mapped {i} out of {len(all_bcs_rep1)} barcodes')

        mapped = False
        for bc2 in all_bcs_rep2:
            if hamming_distance(bc1, bc2) <= HD:
                mapped = True
                bc_map[bc1] = bc2
                break

        if not mapped:
            bc_map[bc1] = bc1

    print(f'\nCollapsed between datasets: {len(bc_map)} barcodes')

    # Step 2: Collapse within a dataset
    for i, bc1 in enumerate(bc_map):
        if i > 0 and i % 500 == 0:
            print(f'Mapped {i} out of {len(bc_map)} barcodes')

        mapped = False
        for bc2 in bc_map:
            if hamming_distance(bc1, bc2) <= HD:
                mapped = True
                bc_map[bc1] = bc2
                break

        if not mapped:
            bc_map[bc1] = bc1

    print(f'\nFinal collapsed barcodes: {len(bc_map)} barcodes')

    # Filter df1 by rows that contain barcodes from bc_map
    df_filtered = df1[df1['Barcode'].isin(bc_map.keys())]

    # Save the filtered DataFrame to an output CSV file
    df_filtered.to_csv(output_csv, sep='\t', index=False)

    return df_filtered

# # Example usage:
# input_csv1 = os.path.join(data_directory, 'Library_Test_R2_barcodes_whitelist.csv')
# input_csv2 = os.path.join(data_directory, 'Library_Test_R1_barcodes_whitelist.csv')
# output_csv = os.path.join(data_directory, 'whitelist_merged.csv')
# HD = 1 # Set your Hamming distance threshold

# result_dataframe = collapse_and_save(input_csv1, input_csv2, output_csv, HD)
def collapse_and_save_directory(input_directory, output_directory, HD):
    '''
    Collapse barcodes between datasets and within a dataset based on Hamming distance for all pairs of CSV files in a directory.

    Parameters:
    - input_directory (str): Path to the input directory containing pairs of CSV files.
    - output_directory (str): Path to the output directory for saving collapsed CSV files.
    - HD (int): Hamming distance threshold.

    Returns:
    - List of pd.DataFrame: List of DataFrames containing the final collapsed barcodes for each pair.
    '''
    result_dataframes = []

    # List all files in the input directory
    files = os.listdir(input_directory)

    # Iterate through each file in the directory
    for file in files:
        # Check if the file has 'R1' in its name
        if file.endswith('.csv') and 'duplicate_1' and 'whitelist' in file:
            # Generate the corresponding file name for 'R2'
            file_R2 = file.replace('duplicate_1', 'duplicate_2')

            # Form the full paths for both R1 and R2 files
            input_csv1 = os.path.join(input_directory, file)
            input_csv2 = os.path.join(input_directory, file_R2)

            # Extract the library name (assuming it is common between R1 and R2)
            library_name = file.replace('_duplicate_1_', '_').replace('_duplicate_2_', '_').split('.csv')[0]
            print(library_name)
            # Form the output CSV file path
            output_csv = os.path.join(output_directory, f'{library_name}_merged.csv')

            # Perform barcode collapsing for the current pair
            collapse_and_save(input_csv1, input_csv2, output_csv, HD)

            # # Append the resulting DataFrame to the list
            # result_dataframes.append(df_filtered)

            # # Save the individual CSV file for the current pair
            # individual_output_csv = os.path.join(output_directory, f'whitelist_{library_name}_individual.csv')
            # df_filtered.to_csv(individual_output_csv, sep='\t', index=False)

    # return result_dataframes

# Example usage:
input_directory = data_directory
output_directory = data_directory
HD = 5  # Set your Hamming distance threshold

result_dataframes = collapse_and_save_directory(input_directory, output_directory, HD)