Development of a Pipeline for the Benchmarking of Next-Generation Sequencing Quality Control Tools¶
Efim Shliamin¶
Key Features:¶
Trimming Combinations (trimmingCombinations): A list of different combinations for sequence processing, including quality trimming, adapter trimming, length filtering, complexity filtering, and various hybrids of these methods.
Trimming Channel (trimmingChannel): A channel that pairs trimming parameters with file pairs, ready to be processed by downstream processes.
Fastp Process: This process uses fastp, a tool for fast and high-quality preprocessing of sequencing data. It takes paired-end reads and the trimming parameters as input and generates trimmed sequence files based on the specified parameters. The process dynamically constructs the fastp command based on the trimming combination provided, applying the relevant trimming options.
SPAdes Assembly: This process runs the SPAdes genome assembler on the trimmed reads to construct genome assemblies. It outputs the assembled scaffolds for each trimming parameter set.
QUAST Quality Assessment: This process performs quality assessment on the assembled genomes using QUAST. It generates reports for each assembly, providing insights into the quality and statistics of the assembled sequences.
Workflow: The workflow orchestrates the execution of these processes in a data-driven manner:
- fastp_process is executed first, receiving combinations of trimming parameters and file pairs from trimmingChannel. The trimmed reads are then grouped by their trimming parameters and passed to the spades_process.
- spades_process assembles the genomes from the trimmed reads and outputs the assembled scaffolds, which are collected and passed to the quast_process.
- quast_process assesses the quality of each assembled genome and generates reports.
- Highlights of the Workflow:
- Dynamic Command Construction: The script dynamically constructs commands for fastp based on the trimming strategies, making the workflow highly flexible and adaptable to various preprocessing needs.
- Data-driven Execution: The workflow's execution is driven by the available data and specified parameters, ensuring efficient and parallel processing of multiple samples and trimming strategies.
- Scalability: By leveraging Nextflow, the workflow can be easily scaled and executed across different computational environments, from local machines to high-throughput computing clusters and cloud environments.
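In practice, a local run could look like nextflow run benchmark.nf --input_dir raw_data -resume (the script name benchmark.nf is a placeholder, since no file name is given here): --input_dir overrides the params.input_dir default defined below, and Nextflow's -resume flag reuses cached task results across repeated runs.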
Selection of trimming methods & the naming conventions:¶
The naming conventions used throughout the script are consistent and logical, making the code readable and understandable. Each name gives clear insight into what the variable, parameter, or process is intended for, which is crucial in complex bioinformatics workflows where clarity and precision are paramount. The naming strategy in this code aids in understanding the flow and purpose of different parts of the script, aligning well with best practices in coding for clarity, maintainability, and collaboration.
- Trimming Combinations:
- The trimming combinations (e.g., QualityTrim_Q25, AdapterTrim_Q25, etc.) use a systematic approach that combines the type of trimming operation with specific parameters, separated by underscores (_).
- Type of Trimming: The first part of each name (e.g., QualityTrim, AdapterTrim) specifies the type of trimming operation, indicating what aspect of the sequence data is being trimmed (quality, adapter, etc.). On the first run of the pipeline we use 5 basic trimming methods with moderate, middle-of-the-road parameters. Each trimming operation addresses a different type of potential issue in sequencing data:
- Quality Trimming: Removes regions with low sequencing quality, which can improve the accuracy of downstream analyses
- Adapter Trimming: Removes adapter sequences that can interfere with read alignment and other analyses
- Length Filtering: Ensures that only reads of a certain length are kept, which can be crucial for certain types of analysis like assembly
- Complexity Filtering: Removes low complexity sequences that could be repetitive or non-informative
- Sliding Window Trimming: Applies quality trimming in a sliding window, offering a balance between quality control and retaining as much data as possible
- Parameters: Following the underscore, the parameters are specified, often with a prefix (e.g., Q25 for a quality threshold of 25, 75 for length filtering at 75 bases, 4nt for a window size of 4 nucleotides, q25 for a mean quality threshold of 25 inside the window).
- Hybrids: Names containing the word Hybrid, such as QualityAdapterHybrid_Q25, denote a combination of two trimming operations (here, QualityTrim_Q25 and AdapterTrim_Q25), providing a clear indication of the processes involved. The number of unique pairs of two different basic trimming options out of the 5 described above (combinations without repetition, i.e. excluding pairing a method with itself) is exactly C(5, 2) = 10; the sketch after this list illustrates the count and the naming scheme. Different sequencing platforms and sample preparations can introduce various artifacts or biases, and testing different combinations keeps the pipeline adaptable to a wide range of data types and qualities.
- In total, we test 16 trimming settings on the first run: the 15 methods above plus a 16th called NoTrimming, which, as its name suggests, applies no trimming at all. This naming consistency pays off during execution: each piece of data can be traced through the pipeline from initial input through intermediate steps to final output. The trimming-method names are preserved at every step, which facilitates reproducing results, since each step's function and input/output relationships are clearly defined. It also lets researchers or future users of the pipeline easily understand, modify, or extend the workflow.
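To illustrate both conventions, here is a small standalone Python sketch (not part of the pipeline itself) that counts the pairwise hybrids and parses a method name into its operation type and parameters:
from itertools import combinations
basic_methods = ["QualityTrim_Q25", "AdapterTrim_Q25", "LengthFilter_75", "ComplexityFilter", "SlidingWindow_4nt_q25"]
# Combinations without repetition: C(5, 2) = 10 pairwise hybrids
print(len(list(combinations(basic_methods, 2))))  # 10
# A name splits on '_' into the operation type followed by its parameters
operation, *params = "QualitySlidingHybrid_Q25_4nt_q25".split("_")
print(operation, params)  # QualitySlidingHybrid ['Q25', '4nt', 'q25']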
Demonstration of naming consistency in data streams after the first run of the program pipeline:
- Channel and Process Definitions:
- Channel: Used to create a stream of data. The naming within the channel operations (trimmingChannel, fastpCollectChannel, quastInputChannel) is indicative of their role in the workflow, specifying both the type of data they handle and their position or function within the pipeline.
- Process: Each process is named according to the primary tool or operation it performs (fastp_process, spades_process, quast_process), making it straightforward to understand their role within the workflow.
- Input and Output:
- input and output declarations within processes are named to reflect the data they handle, with tuples and paths indicating the nature and format of the data (e.g., tuple val(trimParams), file(reads) for input and path("${reads[0].simpleName}_${trimParams}.fastq") for output).
- The Pipeline:
// Define the input directory parameter
params.input_dir = 'raw_data'
// Define a list of different trimming combinations for sequence processing
def trimmingCombinations = [
"QualityTrim_Q25",
"AdapterTrim_Q25",
"LengthFilter_75",
"ComplexityFilter",
"SlidingWindow_4nt_q25",
"QualitySlidingHybrid_Q25_4nt_q25", // QualityTrim_Q25 + SlidingWindow_4nt_q25
"QualityAdapterHybrid_Q25", // QualityTrim_Q25 + AdapterTrim_Q25
"LengthComplexityHybrid_75", // LengthFilter_75 + ComplexityFilter
"SlidingComplexityHybrid_4nt_q25", // SlidingWindow_4nt_q25 + ComplexityFilter
"AdapterSlidingHybrid_Q25_4nt_q25", // AdapterTrim_Q25 + SlidingWindow_4nt_q25
"QualityLengthHybrid_Q25_75", // QualityTrim_Q25 + LengthFilter_75
"QualityComplexityHybrid_Q25", // QualityTrim_Q25 + ComplexityFilter
"AdapterLengthHybrid_Q25_75", // AdapterTrim_Q25 + LengthFilter_75
"AdapterComplexityHybrid_Q25", // AdapterTrim_Q25 + ComplexityFilter
"LengthSlidingHybrid_75_4nt_Q25", // LengthFilter_75 + SlidingWindow_4nt_q25
"NoTrimming"
]
/* Create a channel with trimming combinations
and pair them with file pairs from the input directory
*/
Channel
.from(trimmingCombinations)
.combine(Channel.fromFilePairs("${params.input_dir}/*_{1,2}.fastq"))
.map { trimParams, sampleId, files ->
println("Trim Params: ${trimParams}, Files: ${files}")
tuple(trimParams, files)
}
.set { trimmingChannel }
process fastp_process {
label 'fastp'
tag "${trimParams}_${reads[0].getName()}"
input:
tuple val(trimParams), file(reads)
output:
tuple val(trimParams), \
path("${reads[0].simpleName}_${trimParams}.fastq"), \
path("${reads[1].simpleName}_${trimParams}.fastq")
script:
String trimParamsStr = trimParams.toString().trim()
def fastpCmd = "fastp --in1 ${reads[0]} --in2 ${reads[1]}"
String outputFileName1 = "${reads[0].simpleName}_${trimParams}.fastq"
String outputFileName2 = "${reads[1].simpleName}_${trimParams}.fastq"
fastpCmd += " --out1 ${outputFileName1} --out2 ${outputFileName2}"
String htmlReport = "${reads[0].simpleName}_${trimParams}.html"
String jsonReport = "${reads[0].simpleName}_${trimParams}.json"
fastpCmd += " --html ${htmlReport} --json ${jsonReport}"
def parts = trimParamsStr.tokenize('_')
def trimActionMap = [
QualityTrim: { String q ->
"--qualified_quality_phred ${q.replace('Q', '')}"
},
AdapterTrim: { String q ->
"--detect_adapter_for_pe --qualified_quality_phred " +
"${q.replace('Q', '')}"
},
LengthFilter: { String length ->
"--length_required ${length}"
},
ComplexityFilter: { _ ->
"--low_complexity_filter"
},
SlidingWindow: { String windowSize, String meanQuality ->
"--cut_right --cut_window_size ${windowSize.replace('nt', '')} " +
"--cut_mean_quality ${meanQuality.replace('q', '')}"
},
QualitySlidingHybrid: { String q, String windowSize, String meanQuality ->
"--cut_right --qualified_quality_phred ${q.replace('Q', '')} " +
"--cut_window_size ${windowSize.replace('nt', '')} " +
"--cut_mean_quality ${meanQuality.replace('q', '')}"
},
QualityAdapterHybrid: { String q ->
"--detect_adapter_for_pe --qualified_quality_phred " +
"${q.replace('Q', '')}"
},
LengthComplexityHybrid: { String length ->
"--length_required ${length} --low_complexity_filter"
},
SlidingComplexityHybrid: { String windowSize, String meanQuality ->
"--cut_right --cut_window_size ${windowSize.replace('nt', '')} " +
"--cut_mean_quality ${meanQuality.replace('q', '')} " +
"--low_complexity_filter"
},
AdapterSlidingHybrid: { String q, String windowSize, String meanQuality ->
"--detect_adapter_for_pe --cut_right --cut_window_size " +
"${windowSize.replace('nt', '')} --cut_mean_quality " +
"${meanQuality.replace('q', '')} --qualified_quality_phred " +
"${q.replace('Q', '')}"
},
QualityLengthHybrid: { String q, String length ->
"--qualified_quality_phred ${q.replace('Q', '')} " +
"--length_required ${length}"
},
QualityComplexityHybrid: { String q ->
"--qualified_quality_phred ${q.replace('Q', '')} --low_complexity_filter"
},
AdapterLengthHybrid: { String q, String length ->
"--detect_adapter_for_pe --qualified_quality_phred " +
"${q.replace('Q', '')} --length_required ${length}"
},
AdapterComplexityHybrid: { String q ->
"--detect_adapter_for_pe --qualified_quality_phred " +
"${q.replace('Q', '')} --low_complexity_filter"
},
LengthSlidingHybrid: {
String length, String windowSize, String meanQuality ->
"--length_required ${length} --cut_right --cut_window_size " +
"${windowSize.replace('nt', '')} --cut_mean_quality " +
"${meanQuality.replace('Q', '')}"
},
NoTrimming: { _ ->
"--disable_adapter_trimming --disable_quality_filtering " +
"--disable_length_filtering"
}
]
// Pick the first matching strategy; find() stops at the first hit,
// whereas inside each() a `return` only skips to the next iteration.
def matched = trimActionMap.find { key, action -> trimParamsStr.startsWith(key) }
if (matched) {
    def args = parts.drop(1)
    def action = matched.value
    // Groovy spreads a List across multi-parameter closures automatically;
    // pass the lone element (or null) directly to single-parameter closures.
    fastpCmd += " " + (action.maximumNumberOfParameters > 1 ? action(args) : action(args ? args[0] : null))
}
"""
set -euo pipefail
echo "Trimming parameters: ${trimParams}"
if [ ${reads.size()} -ne 2 ]; then
echo "Error: Expected two reads files, but got ${reads.size()}"
exit 1
fi
${fastpCmd}
echo "fastp process for ${trimParams}_"\
"${reads[0].getName()} completed successfully."
"""
}
process spades_process {
label 'spades'
tag "${trimParams}"
input:
tuple val(trimParams), path(read1), path(read2)
output:
path "scaffolds_${trimParams}.fasta"
cpus 4
memory '8 GB'
script:
"""
set -euo pipefail
echo "Read1: ${read1}"
echo "Read2: ${read2}"
echo "Starting SPAdes genome assembly for ${trimParams}..."
spades.py --isolate -1 ${read1} -2 ${read2} -o output_${trimParams}
mv output_${trimParams}/scaffolds.fasta scaffolds_${trimParams}.fasta
echo "SPAdes genome assembly for ${trimParams} completed successfully."
"""
}
process quast_process {
label 'quast'
input:
path genomes
output:
path "reports"
cpus 4
memory '8 GB'
script:
"""
set -euo pipefail
mkdir -p reports
for genome in ${genomes}
do
genome_name=\$(basename \$genome .fasta)
echo "Starting QUAST quality assessment for \$genome_name..."
quast.py -o reports/output_\${genome_name} \${genome}
mv reports/output_\${genome_name}/report.html \
reports/report_\${genome_name}.html
echo "QUAST quality assessment for \$genome_name completed."
done
"""
}
workflow {
fastp_process(trimmingChannel)
.groupTuple()
.set { fastpCollectChannel }
spades_process(fastpCollectChannel)
.collect()
.set { quastInputChannel }
quast_process(quastInputChannel)
}
The code below demonstrates the data generated by the pipeline¶
To begin with, we define a function extract_required_metrics(report_path) that pulls the required metrics from the initial reports and builds a DataFrame and a convenient table from them:
import pandas as pd
import os
# Function to extract required metrics from a report file
def extract_required_metrics(report_path):
try:
# Reading the report file
df = pd.read_csv(report_path, sep='\t', header=None, names=['Metric', 'Value'])
# Dictionary to store extracted metrics
metrics = {}
# List of key metrics to extract
required_metrics = [
('Total length', 'Total length'),
('GC', 'GC (%)'),
('Largest contig', 'Largest Contig'),
('N50', 'N50'),
('N90', 'N90'),
('L50', 'L50'),
('L90', 'L90'),
('# N\'s per 100 kbp', '# N\'s per 100 kbp')
]
# Extracting and saving the values of required metrics
for metric_pattern, metric_name in required_metrics:
metric_value = df[df['Metric'].str.contains(metric_pattern, case=False, regex=False)]['Value'].values
if metric_value.size > 0:
metrics[metric_name] = metric_value[0]
else:
metrics[metric_name] = 'N/A' # Marking as 'N/A' if the metric is missing
return metrics
except Exception as e:
print(f"Error processing {report_path}: {e}")
return None
# Path to the directory containing report files
reports_dir = './reports'
# List of report files
report_files = [file for file in os.listdir(reports_dir) if file.endswith('.tsv')]
# Extracting and saving data from all report files
all_metrics_data = {}
for report_file in report_files:
# Forming the full path to the report file
report_path = os.path.join(reports_dir, report_file)
# Determining the trimming parameter name from the report file name
trimming_param = report_file.replace('report_scaffolds_', '').replace('.tsv', '')
# Extracting metrics
metrics = extract_required_metrics(report_path)
if metrics:
all_metrics_data[trimming_param] = metrics
# Converting the data into a DataFrame
all_metrics_df = pd.DataFrame.from_dict(all_metrics_data, orient='index').reset_index()
all_metrics_df.rename(columns={'index': 'Trimming Parameters'}, inplace=True)
# List of trimming parameters in the desired order
sorting_order = [
"NoTrimming",
"QualityTrim_Q25",
"AdapterTrim_Q25",
"LengthFilter_75",
"ComplexityFilter",
"SlidingWindow_4nt_q25",
"QualitySlidingHybrid_Q25_4nt_q25",
"QualityAdapterHybrid_Q25",
"LengthComplexityHybrid_75",
"SlidingComplexityHybrid_4nt_q25",
"AdapterSlidingHybrid_Q25_4nt_q25",
"QualityLengthHybrid_Q25_75",
"QualityComplexityHybrid_Q25",
"AdapterLengthHybrid_Q25_75",
"AdapterComplexityHybrid_Q25",
"LengthSlidingHybrid_75_4nt_Q25",
]
# Sorting the DataFrame according to the specified order
sorted_metrics_df = all_metrics_df.set_index('Trimming Parameters').reindex(sorting_order).reset_index()
sorted_metrics_df
| | Trimming Parameters | Total length | GC (%) | Largest Contig | N50 | N90 | L50 | L90 | # N's per 100 kbp |
|---|---|---|---|---|---|---|---|---|---|
| 0 | NoTrimming | 4404338 | 65.49 | 209132 | 74608 | 21687 | 18 | 57 | 9.20 |
| 1 | QualityTrim_Q25 | 4347132 | 65.39 | 206803 | 64262 | 17217 | 20 | 68 | 7.43 |
| 2 | AdapterTrim_Q25 | 4347132 | 65.39 | 206803 | 64262 | 17217 | 20 | 68 | 7.43 |
| 3 | LengthFilter_75 | 4371727 | 65.42 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 4 | ComplexityFilter | 4370526 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 5 | SlidingWindow_4nt_q25 | 4338271 | 65.45 | 173819 | 56417 | 16144 | 26 | 81 | 7.86 |
| 6 | QualitySlidingHybrid_Q25_4nt_q25 | 4338271 | 65.45 | 173819 | 56417 | 16144 | 26 | 81 | 7.86 |
| 7 | QualityAdapterHybrid_Q25 | 4347132 | 65.39 | 206803 | 64262 | 17217 | 20 | 68 | 7.43 |
| 8 | LengthComplexityHybrid_75 | 4371727 | 65.42 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 9 | SlidingComplexityHybrid_4nt_q25 | 4338271 | 65.45 | 173819 | 56417 | 16144 | 26 | 81 | 7.86 |
| 10 | AdapterSlidingHybrid_Q25_4nt_q25 | 4338271 | 65.45 | 173819 | 56417 | 16144 | 26 | 81 | 7.86 |
| 11 | QualityLengthHybrid_Q25_75 | 4348087 | 65.39 | 206803 | 64262 | 17217 | 20 | 69 | 7.42 |
| 12 | QualityComplexityHybrid_Q25 | 4348353 | 65.39 | 206803 | 64262 | 17387 | 20 | 67 | 7.43 |
| 13 | AdapterLengthHybrid_Q25_75 | 4348087 | 65.39 | 206803 | 64262 | 17217 | 20 | 69 | 7.42 |
| 14 | AdapterComplexityHybrid_Q25 | 4348353 | 65.39 | 206803 | 64262 | 17387 | 20 | 67 | 7.43 |
| 15 | LengthSlidingHybrid_75_4nt_Q25 | 4315732 | 65.40 | 173819 | 54049 | 16002 | 27 | 84 | 9.06 |
After that, we would like to visualize the same data, but in a more expressive way.¶
To do this, we first define the base values from the "NoTrimming" row, treating it as the control condition against which all other values are compared.
Next comes the normalization itself: we initialize an empty rows_list to store the normalized rows. A loop over each row of the original DataFrame computes the normalized deviation of each metric from the "NoTrimming" base values, i.e. (value - base) / base. If the base value of a metric is 0, the normalized deviation is set to 0 to avoid division by zero.
Finally, a new DataFrame norm_deviations is created from rows_list, where each dictionary is one row of normalized values.
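For instance, taking the N50 values from the table above (74608 for NoTrimming and 64262 for QualityTrim_Q25), the normalized deviation is (64262 - 74608) / 74608 ≈ -0.139, i.e. a roughly 13.9% drop relative to the untrimmed baseline. A one-line check:
base_n50, trimmed_n50 = 74608, 64262  # NoTrimming vs QualityTrim_Q25, from the table above
print(round((trimmed_n50 - base_n50) / base_n50, 3))  # -0.139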
Data visualisation:¶
Using seaborn.heatmap(), we create a heatmap to visualize the normalized deviations of genome assembly metrics relative to the "NoTrimming" method.
In the heatmap, the rows correspond to the different trimming methods (Trimming Parameters) and the columns correspond to the genome assembly metrics.
The colors of the heatmap cells reflect the magnitude of the normalized deviations, centered at 0, where shades of red indicate positive deviation and shades of blue indicate negative deviation.
import seaborn as sns
import matplotlib.pyplot as plt
# Prepare the data
# Make sure the data is numerical.
for column in sorted_metrics_df.columns[1:]: # Skip the first column with the names of trimming methods
sorted_metrics_df[column] = pd.to_numeric(sorted_metrics_df[column], errors='coerce')
# Find the base values for "NoTrimming"
base_values = sorted_metrics_df[sorted_metrics_df['Trimming Parameters'] == 'NoTrimming'].iloc[0, 1:]
# Initialize a list to store the normalized rows
rows_list = []
# Loop to fill the list of rows
for index, row in sorted_metrics_df.iterrows():
norm_row = {'Trimming Parameters': row['Trimming Parameters']}
for metric in sorted_metrics_df.columns[1:]:
if base_values[metric] != 0:
norm_row[metric] = (row[metric] - base_values[metric]) / base_values[metric]
else:
norm_row[metric] = 0
rows_list.append(norm_row)
# Creating a DataFrame from the list of row dictionaries
norm_deviations = pd.DataFrame(rows_list, columns=sorted_metrics_df.columns)
# Visualization of normalized deviations using a heat map
plt.figure(figsize=(12, 8))
sns.heatmap(norm_deviations.iloc[:, 1:], annot=True, cmap='coolwarm', center=0, yticklabels=norm_deviations['Trimming Parameters'])
plt.title('Normalized Deviation of Genome Assembly Metrics from No Trimming')
plt.xlabel('Assembly Metrics')
plt.ylabel('Trimming Methods')
plt.xticks(rotation=45) # Rotate labels for better readability
plt.show()
Analyzing the Normalized Deviation of Genome Assembly Metrics from No Trimming:¶
First of all, we observe that Total length and GC (%) remain almost unchanged across the different trimming methods. This indicates that the methods produce consistent results for these key parameters, a good sign for the reproducibility and reliability of the genome assembly process. It also suggests that trimming does not significantly alter the overall genome profile, at least in terms of total length and GC content: the methods remove low-quality or undesirable portions of the data without discarding significant genomic regions. The consistency of these metrics may also reflect the high quality of the original sequencing data; if the raw reads contain few low-quality regions, trimming will have minimal impact on the assembly's overall length and GC content.
The second thing that catches the eye is the red color in the heat map in all data where the Sliding Window trimming method was used: SlidingWindow_4nt_q25, QualitySlidingHybrid_Q25_4nt_q25, SlidingComplexityHybrid_4nt_q25, AdapterSlidingHybrid_Q25_4nt_q25, LengthSlidingHybrid_75_4nt_Q25.
Any combination involving this method shows a rather large decrease in N50 and N90 relative to no trimming, indicating reduced assembly continuity and accuracy in every chosen combination. Most likely the method is too aggressive at these parameters, removing significant portions of the sequenced data, possibly including DNA regions important for proper genome assembly, which shortens the final contigs.
In addition, all of the above combinations increase the L50 and L90 values. This means that more contigs are now required to reach 50% or 90% of the total genome assembly length, which may be the result of increased assembly fragmentation. This may indicate a decrease in assembly continuity and is likely due to the removal of significant data and the probable removal of overlaps required to assemble contigs into larger assemblies.
What conclusions can be drawn from this?
- If we notice an increase in L50 and L90 along with a decrease in N50 and N90 metrics, it is a clear sign that the trimming method leads to a decrease in assembly continuity and quality.
- In such a case, it is worth reconsidering the trimming parameters or choosing another method that has less impact on important parts of the data while maintaining or improving assembly continuity. For example, LengthFilter_75 (filtering reads by length), ComplexityFilter (filtering by sequence complexity, which fastp defines as the percentage of bases that differ from the next base) and their hybrid LengthComplexityHybrid_75 performed best at reducing the number of N's per 100 kbp relative to no filtering (a 25% improvement). This translates into improved assembly quality, higher continuity, and less uncertainty in the sequence.
- It is also important to consider the trade-off between reducing errors and preserving assembly continuity; improving one aspect can degrade the other. For example, 14 of the 15 combinations (all except LengthSlidingHybrid_75_4nt_Q25) reduced the frequency of undetermined nucleotides per 100,000 base pairs, but at the cost of a lower N50 and an even lower N90 (with correspondingly higher L50 and L90). Only LengthFilter_75 again performed better than the others, showing the smallest reduction in N50.
- A rational next step is an additional plot demonstrating the hierarchy of trimming methods by the ratio of the frequency of undetermined nucleotides to N50. To do this, we build a bar plot with 16 bars, one per trimming method, showing the ratio of # N's per 100 kbp to N50, sorted in descending order.
# Calculate the ratio and add it as a new column, multiplying the result by 10,000 for readability
sorted_metrics_df['N_Ratio_x10k'] = (sorted_metrics_df['# N\'s per 100 kbp'] / sorted_metrics_df['N50']) * 10000
# Sort the DataFrame by the new column 'N_Ratio_x10k'
sorted_df = sorted_metrics_df.sort_values(by='N_Ratio_x10k', ascending=False)
# Visualisation on one common graph
plt.figure(figsize=(12, 8))
barplot = sns.barplot(x='Trimming Parameters', y='N_Ratio_x10k', data=sorted_df)
plt.title('N_Ratio (# N\'s per 100 kbp / N50) for Different Trimming Methods (Values x10,000)')
plt.xlabel('Trimming Methods')
plt.ylabel('N_Ratio (x10,000)')
# Change the colour of the 'NoTrimming' label to red
for label in barplot.get_xticklabels():
if label.get_text() == 'NoTrimming':
label.set_color('red')
plt.xticks(rotation=45, ha='right') # Tilt labels for better readability
plt.tight_layout()
plt.show()
Analyzing the bar plot "N_Ratio (# N's per 100 kbp / N50) for Different Trimming Methods (Values x10,000)":¶
Above all, consider the trade-off between reduced assembly continuity (a lower N50) and fewer undetermined nucleotides ('N') per 100,000 base pairs, the latter indicating improved assembly quality with fewer gaps and uncertain sites in the genomic sequence. On this measure, every trimming method outperformed the no-trimming baseline except the combinations involving the Sliding Window method; from this point of view, there is no justification for using those combinations. At the other extreme, LengthFilter_75, ComplexityFilter and their hybrid LengthComplexityHybrid_75 show the best result on this metric.
What conclusions can be drawn from this?
- In such a situation, it would be useful to examine in more detail the two extremes: the worst trimming method (Sliding Window) and the best trimming method with different parameters (Length Filter) in terms of this compromise. Then compare them in detail with the situation where no trimming is used (NoTrimming).
- We run the pipeline again with the same raw sequencing data, but with a variety of parameters for the two trimming methods selected above. The following 16 parameter sets for Sliding Window and 6 for Length Filter were selected for the new run (the sketch after this list shows how the grid can be generated programmatically):
- SlidingWindow_4nt_q15
- SlidingWindow_4nt_q20
- SlidingWindow_4nt_q25
- SlidingWindow_4nt_q30
- SlidingWindow_7nt_q15
- SlidingWindow_7nt_q20
- SlidingWindow_7nt_q25
- SlidingWindow_7nt_q30
- SlidingWindow_10nt_q15
- SlidingWindow_10nt_q20
- SlidingWindow_10nt_q25
- SlidingWindow_10nt_q30
- SlidingWindow_20nt_q15
- SlidingWindow_20nt_q20
- SlidingWindow_20nt_q25
- SlidingWindow_20nt_q30
- LengthFilter_50
- LengthFilter_75
- LengthFilter_100
- LengthFilter_150
- LengthFilter_200
- LengthFilter_250
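Because the names follow a strict convention, this grid can be generated rather than typed by hand; a minimal Python sketch mirroring the naming scheme:
sliding_names = [f"SlidingWindow_{w}nt_q{q}" for w in (4, 7, 10, 20) for q in (15, 20, 25, 30)]
length_names = [f"LengthFilter_{n}" for n in (50, 75, 100, 150, 200, 250)]
print(len(sliding_names), len(length_names))  # 16 6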
# New path to the directory with reports
second_reports_dir = './reports_2'
# List of report files in a new directory
second_report_files = [file for file in os.listdir(second_reports_dir) if file.endswith('.tsv')]
# Extracting and saving data from all report files
second_all_metrics_data = {}
for report_file in second_report_files:
# Forming the full path to the report file
second_report_path = os.path.join(second_reports_dir, report_file)
# Determining the trimming parameter name from the second report file name
second_trimming_param = report_file.replace('output_scaffolds_', '').replace('.tsv', '')
# Extracting metrics
second_metrics = extract_required_metrics(second_report_path)
if second_metrics:
second_all_metrics_data[second_trimming_param] = second_metrics
# Converting the data into a DataFrame
second_all_metrics_df = pd.DataFrame.from_dict(second_all_metrics_data, orient='index').reset_index()
second_all_metrics_df.rename(columns={'index': 'Trimming Parameters'}, inplace=True)
# List of trimming parameters in the desired order
second_sorting_order = [
"NoTrimming",
"SlidingWindow_4nt_q15",
"SlidingWindow_4nt_q20",
"SlidingWindow_4nt_q25",
"SlidingWindow_4nt_q30",
"SlidingWindow_7nt_q15",
"SlidingWindow_7nt_q20",
"SlidingWindow_7nt_q25",
"SlidingWindow_7nt_q30",
"SlidingWindow_10nt_q15",
"SlidingWindow_10nt_q20",
"SlidingWindow_10nt_q25",
"SlidingWindow_10nt_q30",
"SlidingWindow_20nt_q15",
"SlidingWindow_20nt_q20",
"SlidingWindow_20nt_q25",
"SlidingWindow_20nt_q30",
"LengthFilter_50",
"LengthFilter_75",
"LengthFilter_100",
"LengthFilter_150",
"LengthFilter_200",
"LengthFilter_250",
]
# Sorting the DataFrame according to the specified order
second_sorted_metrics_df = second_all_metrics_df.set_index('Trimming Parameters').reindex(second_sorting_order).reset_index()
second_sorted_metrics_df
| | Trimming Parameters | Total length | GC (%) | Largest Contig | N50 | N90 | L50 | L90 | # N's per 100 kbp |
|---|---|---|---|---|---|---|---|---|---|
| 0 | NoTrimming | 4404338 | 65.49 | 209132 | 74608 | 21687 | 18 | 57 | 9.20 |
| 1 | SlidingWindow_4nt_q15 | 4371714 | 65.47 | 209097 | 66370 | 19392 | 19 | 61 | 7.14 |
| 2 | SlidingWindow_4nt_q20 | 4355072 | 65.47 | 196013 | 66370 | 16891 | 19 | 66 | 6.92 |
| 3 | SlidingWindow_4nt_q25 | 4338271 | 65.45 | 173819 | 56417 | 16144 | 26 | 81 | 7.86 |
| 4 | SlidingWindow_4nt_q30 | 4324613 | 65.42 | 156069 | 45172 | 12240 | 31 | 100 | 4.64 |
| 5 | SlidingWindow_7nt_q15 | 4386621 | 65.47 | 209097 | 70975 | 20667 | 18 | 59 | 11.52 |
| 6 | SlidingWindow_7nt_q20 | 4371385 | 65.48 | 196013 | 65416 | 16973 | 20 | 65 | 11.75 |
| 7 | SlidingWindow_7nt_q25 | 4352562 | 65.46 | 196013 | 64262 | 16735 | 22 | 70 | 7.39 |
| 8 | SlidingWindow_7nt_q30 | 4335609 | 65.44 | 161559 | 53760 | 15315 | 27 | 84 | 10.41 |
| 9 | SlidingWindow_10nt_q15 | 4389356 | 65.48 | 209097 | 74525 | 20679 | 18 | 57 | 11.51 |
| 10 | SlidingWindow_10nt_q20 | 4426558 | 65.49 | 141613 | 49888 | 14028 | 29 | 91 | 20.93 |
| 11 | SlidingWindow_10nt_q25 | 4359481 | 65.47 | 196013 | 65247 | 16891 | 20 | 66 | 7.15 |
| 12 | SlidingWindow_10nt_q30 | 4337191 | 65.45 | 141533 | 53760 | 16002 | 27 | 83 | 8.33 |
| 13 | SlidingWindow_20nt_q15 | 4390654 | 65.47 | 209097 | 70975 | 20679 | 18 | 58 | 9.21 |
| 14 | SlidingWindow_20nt_q20 | 4377137 | 65.47 | 196013 | 66370 | 19418 | 19 | 61 | 6.91 |
| 15 | SlidingWindow_20nt_q25 | 4369502 | 65.47 | 196013 | 65416 | 17217 | 21 | 66 | 11.75 |
| 16 | SlidingWindow_20nt_q30 | 4347271 | 65.46 | 161570 | 64262 | 16677 | 23 | 74 | 10.16 |
| 17 | LengthFilter_50 | 4370507 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 18 | LengthFilter_75 | 4371727 | 65.42 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 19 | LengthFilter_100 | 4371227 | 65.42 | 209075 | 64713 | 17421 | 20 | 65 | 6.94 |
| 20 | LengthFilter_150 | 4371488 | 65.42 | 205948 | 64262 | 17416 | 20 | 66 | 6.94 |
| 21 | LengthFilter_200 | 4368193 | 65.42 | 205900 | 64262 | 17416 | 20 | 66 | 7.18 |
| 22 | LengthFilter_250 | 4431464 | 65.42 | 146203 | 41434 | 14197 | 34 | 102 | 16.41 |
Let's visualize the data again in a more expressive way, using a heat map:¶
# Prepare the data
# Make sure the data is numerical.
for column in second_sorted_metrics_df.columns[1:]: # Skip the first column with the names of trimming methods
second_sorted_metrics_df[column] = pd.to_numeric(second_sorted_metrics_df[column], errors='coerce')
base_values = second_sorted_metrics_df[second_sorted_metrics_df['Trimming Parameters'] == 'NoTrimming'].iloc[0, 1:]
# Initialize a list to store the normalized rows
second_rows_list = []
# Loop to fill the list of rows
for index, row in second_sorted_metrics_df.iterrows():
second_norm_row = {'Trimming Parameters': row['Trimming Parameters']}
for metric in second_sorted_metrics_df.columns[1:]:
if base_values[metric] != 0:
second_norm_row[metric] = (row[metric] - base_values[metric]) / base_values[metric]
else:
second_norm_row[metric] = 0
second_rows_list.append(second_norm_row)
# Creating a DataFrame from the list of row dictionaries
second_norm_deviations = pd.DataFrame(second_rows_list, columns=second_sorted_metrics_df.columns)
# Visualisation of normalized deviations using a heat map
plt.figure(figsize=(12, 8))
sns.heatmap(second_norm_deviations.iloc[:, 1:], annot=True, cmap='coolwarm', center=0, yticklabels=second_norm_deviations['Trimming Parameters'])
plt.title('Normalised deviations of genome assembly metrics from the "NoTrimming" method')
plt.xlabel('Assembly Metrics after the second pipeline launch')
plt.ylabel('Trimming Methods after the second pipeline launch')
plt.xticks(rotation=45) # Rotate labels for better readability
plt.show()
Analyzing the Normalized Deviation of Genome Assembly Metrics from No Trimming after the second pipeline launch:¶
We can observe the same pattern as in the previous heatmap: as # N's per 100 kbp decreases, N50 (and N90) also decrease, while L50 (and L90) increase. However, there are also cases where trimming is so poorly tuned that # N's per 100 kbp increases while the other parameters remain relatively unchanged (SlidingWindow_7nt_q15 and SlidingWindow_7nt_q20), not to mention such sharp drops in assembly quality as with SlidingWindow_10nt_q20 and LengthFilter_250.
Let's dive deeper into the parameter study and again construct an additional plot demonstrating the hierarchy of trimming methods by the ratio of the frequency of undetermined nucleotides to N50.
# Calculate the ratio and add it as a new column, multiplying the result by 10,000 for readability
second_sorted_metrics_df['N_Ratio_x10k'] = (second_sorted_metrics_df['# N\'s per 100 kbp'] / second_sorted_metrics_df['N50']) * 10000
# Sort the DataFrame by the new column 'N_Ratio_x10k'
second_sorted_df = second_sorted_metrics_df.sort_values(by='N_Ratio_x10k', ascending=False)
# Visualisation on one common graph
plt.figure(figsize=(12, 8))
barplot = sns.barplot(x='Trimming Parameters', y='N_Ratio_x10k', data=second_sorted_df)
plt.title('N_Ratio (# N\'s per 100 kbp / N50) for Different Trimming Methods (Values x10,000) after the second pipeline launch')
plt.xlabel('Trimming Methods')
plt.ylabel('N_Ratio (x10,000)')
# Change the colour of the 'NoTrimming' label to red
for label in barplot.get_xticklabels():
if label.get_text() == 'NoTrimming':
label.set_color('red')
plt.xticks(rotation=45, ha='right') # Tilt labels for better readability
plt.tight_layout()
plt.show()
Analyzing the bar plot "N_Ratio (# N's per 100 kbp / N50) for Different Trimming Methods (Values x10,000)" after the second pipeline launch:¶
Now we observe an interesting picture that broadens our understanding of data trimming. The method we previously called unjustified (Sliding Window), after a change of parameters and a second pipeline run, does justify itself in some cases, i.e. yields better assembly quality than no trimming at all (SlidingWindow_7nt_q25, SlidingWindow_10nt_q25, SlidingWindow_4nt_q15). Most strikingly, in some cases (SlidingWindow_4nt_q20, SlidingWindow_20nt_q20, SlidingWindow_4nt_q30) it even outperforms the method we previously recognized as the best (LengthFilter_75). On the other hand, the previously best method (Length Filter) can be unreasonable to use under certain of its parameters (LengthFilter_250).
What conclusions can be drawn from this?
- First, we have experimentally established that not only the choice of data trimming method is important, but also the parameters of this trimming method.
- Second, the pipeline, combined with sensible processing and visualization of its results, gives us an increasingly precise understanding of which trimming method and which parameters to use to improve the quality of the genomic assembly. In effect, we have built software that yields new information on every run until the best solution is found.
- As a next step, it would be interesting to see how assembly quality within each of these two trimming methods varies as a function of the parameters being changed. To do this, we will visualize our data using scatter plots.
Now we take the Length Filter trimming method and use 2D scatter plots to demonstrate how its N50 and # N's per 100 kbp change as a function of its parameter (one plot per metric). To visualize this, we first filter the data to the rows corresponding to the different Length Filter parameters, then create the two 2D scatter plots: one for N50 and one for # N's per 100 kbp.
# Data filtering for the "Length Filter" trimming method
length_filter_data = second_sorted_metrics_df[second_sorted_metrics_df['Trimming Parameters'].str.contains('LengthFilter')]
# Extracting Length Filter parameters from 'Trimming Parameters' and sorting by them for line continuity
length_filter_params_and_data = length_filter_data['Trimming Parameters'].str.extract(r'LengthFilter_(\d+)')[0].astype(int)
length_filter_sorted = length_filter_params_and_data.sort_values().index
length_filter_params = length_filter_params_and_data.loc[length_filter_sorted].astype(int)
# Creating a figure and a set of subplots
fig, axs = plt.subplots(1, 2, figsize=(20, 6)) # 1 row, 2 columns
# Plotting N50 with lines connecting points on the first subplot
axs[0].plot(length_filter_params, length_filter_data.loc[length_filter_sorted, 'N50'], marker='o', linestyle='-', color='blue', label='N50')
axs[0].set_title('Variation of N50 depending on the Length Filter parameter')
axs[0].set_xlabel('Length Filter parameter')
axs[0].set_ylabel('N50 value')
axs[0].grid(True)
axs[0].legend()
# Plotting # N's per 100 kbp with lines connecting points on the second subplot
axs[1].plot(length_filter_params, length_filter_data.loc[length_filter_sorted, "# N's per 100 kbp"], marker='o', linestyle='-', color='red', label="# N's per 100 kbp")
axs[1].set_title("Change of # N's per 100 kbp depending on Length Filter parameter")
axs[1].set_xlabel('Length Filter parameter')
axs[1].set_ylabel("# N's per 100 kbp")
axs[1].grid(True)
axs[1].legend()
plt.show()
What conclusions can be drawn from this?
- Now the inverse relationship between N50 and # N's per 100 kbp is visible (quantified in the sketch after this list).
- Thanks to this demonstration, we can see how the value of the Length Filter method parameter affects the quality of genome assembly.
- If we run the pipeline again, applying the Length Filter method with parameter values around 50-75 and below, we will likely find an even better setting and further enhance the quality of the genome assembly.
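To put a number on the inverse relationship noted above, one can compute the correlation between the two metrics over the Length Filter rows; a minimal sketch reusing length_filter_data from the plotting code above (a value near -1 would confirm the inverse trend):
# Pearson correlation between N50 and # N's per 100 kbp across the LengthFilter_* rows
print(round(length_filter_data['N50'].corr(length_filter_data["# N's per 100 kbp"]), 2))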
Now we take the Sliding Window trimming method and use a 3D scatter plot to demonstrate how its N_Ratio_x10k changes as a function of its parameters. To visualize this, we first filter the data to the rows corresponding to the different Sliding Window parameters, then create the 3D scatter plot.
The code below relies on the fact that the Sliding Window names encode two parameters (window size and quality threshold) in a recognizable pattern (e.g., SlidingWindow_4nt_q20):
import plotly.graph_objects as go
# Filter data for the "Sliding Window" trimming method
sliding_window_data = second_sorted_metrics_df[second_sorted_metrics_df['Trimming Parameters'].str.startswith('SlidingWindow')]
# Extract and prepare the Sliding Window parameters for plotting
window_sizes = sliding_window_data['Trimming Parameters'].str.extract(r'(\d+)nt')[0].astype(int)
quality_thresholds = sliding_window_data['Trimming Parameters'].str.extract(r'q(\d+)')[0].astype(int)
# Create a 3D scatter plot for N50 using Plotly
fig = go.Figure(data=[go.Scatter3d(
x=window_sizes,
y=quality_thresholds,
z=sliding_window_data['N_Ratio_x10k'],
mode='markers',
marker=dict(size=5, color='blue', opacity=0.8)
)])
fig.update_layout(title='3D Scatter Plot of N_Ratio_x10k for Sliding Window Trimming',
scene=dict(xaxis_title='Window Size',
yaxis_title='Quality Threshold',
zaxis_title='N_Ratio_x10k'))
# Display the plot inline in Jupyter Notebook
fig.show(renderer="iframe")
What conclusions can be drawn from this?
- Now, by opening this interactive 3D Scatter Plot (in the browser), we can observe each point in space along with its X, Y, and Z values. The values on the X and Y axes represent the numerical parameters of the Sliding Window trimming method (window size and mean quality), while the value on the Z axis corresponds to N_Ratio_x10k. The lower this value is, the better the compromise between # N's per 100 kbp and N50, and consequently, the higher the quality of the genome assembly.
- We can identify the lowest point on this 3D scatter plot (or locate it programmatically, as in the sketch after this list). Its X and Y values represent the optimal window size and mean quality for this trimming method in terms of their impact on the subsequent genome assembly.
- It is quite likely that if we initiate a third run of the software pipeline with window size and mean quality values near this point, we can further enhance the trimming quality.
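A minimal sketch for locating that optimum directly in the DataFrame, rather than reading it off the plot:
# Row with the smallest N_Ratio_x10k among the Sliding Window runs, i.e. the best trade-off found so far
best = sliding_window_data.loc[sliding_window_data['N_Ratio_x10k'].idxmin()]
print(best['Trimming Parameters'], best['N_Ratio_x10k'])
Accordingly, a third pipeline run was performed with parameters near these optima (window sizes of 3-5 nt at q29-q30, plus Length Filter values of 30-55); the code below loads and tabulates its reports.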
def extract_correct_metrics(report_file_path):
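    # Assumes a QUAST report laid out with metrics as columns (a single row of values),
    # unlike the row-wise reports parsed by extract_required_metrics above.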
try:
df_correct = pd.read_csv(report_file_path, sep='\t', header=0)
correct_metrics = {
'Total length': df_correct.loc[0, 'Total length'],
'GC (%)': df_correct.loc[0, 'GC (%)'],
'Largest Contig': df_correct.loc[0, 'Largest contig'],
'N50': df_correct.loc[0, 'N50'],
'N90': df_correct.loc[0, 'N90'],
'L50': df_correct.loc[0, 'L50'],
'L90': df_correct.loc[0, 'L90'],
"# N's per 100 kbp": df_correct.loc[0, "# N's per 100 kbp"]
}
return correct_metrics
except Exception as e:
print(f"Error processing {report_file_path}: {e}")
return None
# New path to the directory with reports
third_reports_dir = './reports_3'
# List of report files in a new directory
third_report_files = [file for file in os.listdir(third_reports_dir) if file.endswith('.tsv')]
# Extracting and saving data from all report files
third_all_metrics_data = {}
for report_file in third_report_files:
# Forming the full path to the report file
third_report_path = os.path.join(third_reports_dir, report_file)
# Determining the trimming parameter name from the third report file name
third_trimming_param = report_file.replace('output_scaffolds_', '').replace('.tsv', '')
# Extracting metrics
third_metrics = extract_correct_metrics(third_report_path)
if third_metrics:
third_all_metrics_data[third_trimming_param] = third_metrics
# Converting the data into a DataFrame
third_all_metrics_df = pd.DataFrame.from_dict(third_all_metrics_data, orient='index').reset_index()
third_all_metrics_df.rename(columns={'index': 'Trimming Parameters'}, inplace=True)
# List of trimming parameters in the desired order
third_sorting_order = [
"SlidingWindow_3nt_q30",
"SlidingWindow_4nt_q30",
"SlidingWindow_4nt_q29",
"SlidingWindow_5nt_q30",
"LengthFilter_30",
"LengthFilter_35",
"LengthFilter_40",
"LengthFilter_45",
"LengthFilter_50",
"LengthFilter_55",
]
# Sorting the DataFrame according to the specified order
third_sorted_metrics_df = third_all_metrics_df.set_index('Trimming Parameters').reindex(third_sorting_order).reset_index()
third_sorted_metrics_df
| | Trimming Parameters | Total length | GC (%) | Largest Contig | N50 | N90 | L50 | L90 | # N's per 100 kbp |
|---|---|---|---|---|---|---|---|---|---|
| 0 | SlidingWindow_3nt_q30 | 4302594 | 65.40 | 139540 | 38864 | 10691 | 34 | 111 | 1.86 |
| 1 | SlidingWindow_4nt_q30 | 4310805 | 65.42 | 156069 | 45172 | 12240 | 31 | 100 | 4.64 |
| 2 | SlidingWindow_4nt_q29 | 4314963 | 65.43 | 161279 | 49758 | 12487 | 29 | 94 | 6.49 |
| 3 | SlidingWindow_5nt_q30 | 4315324 | 65.43 | 161302 | 45552 | 12261 | 31 | 98 | 6.49 |
| 4 | LengthFilter_30 | 4321024 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 5 | LengthFilter_35 | 4321024 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 6 | LengthFilter_40 | 4321012 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 7 | LengthFilter_45 | 4321017 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 8 | LengthFilter_50 | 4321006 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
| 9 | LengthFilter_55 | 4321006 | 65.41 | 209097 | 65136 | 17421 | 19 | 64 | 6.94 |
# Data filtering for the "Length Filter" trimming method
length_filter_data = third_sorted_metrics_df[third_sorted_metrics_df['Trimming Parameters'].str.contains('LengthFilter')]
# Extracting Length Filter parameters from 'Trimming Parameters' and sorting by them for line continuity
length_filter_params_and_data = length_filter_data['Trimming Parameters'].str.extract(r'LengthFilter_(\d+)')[0].astype(int)
length_filter_sorted = length_filter_params_and_data.sort_values().index
length_filter_params = length_filter_params_and_data.loc[length_filter_sorted].astype(int)
# Creating a figure and a set of subplots
fig, axs = plt.subplots(1, 2, figsize=(20, 6)) # 1 row, 2 columns
# Plotting N50 with lines connecting points on the first subplot
axs[0].plot(length_filter_params, length_filter_data.loc[length_filter_sorted, 'N50'], marker='o', linestyle='-', color='blue', label='N50')
axs[0].set_title('Variation of N50 depending on the Length Filter parameter')
axs[0].set_xlabel('Length Filter parameter')
axs[0].set_ylabel('N50 value')
axs[0].grid(True)
axs[0].legend()
# Plotting # N's per 100 kbp with lines connecting points on the second subplot
axs[1].plot(length_filter_params, length_filter_data.loc[length_filter_sorted, "# N's per 100 kbp"], marker='o', linestyle='-', color='red', label="# N's per 100 kbp")
axs[1].set_title("Change of # N's per 100 kbp depending on Length Filter parameter")
axs[1].set_xlabel('Length Filter parameter')
axs[1].set_ylabel("# N's per 100 kbp")
axs[1].grid(True)
axs[1].legend()
plt.show()
What conclusions can be drawn from this?
- As the graph shows, for the Length Filter method we found no improvement from varying the parameter around its previous best value.
- For this method, our hypothesis did not come true.
# Calculate the ratio and add it as a new column, multiplying the result by 10,000 for readability
third_sorted_metrics_df['N_Ratio_x10k'] = (third_sorted_metrics_df['# N\'s per 100 kbp'] / third_sorted_metrics_df['N50']) * 10000
# Sort the DataFrame by the new column 'N_Ratio_x10k'
third_sorted_df = third_sorted_metrics_df.sort_values(by='N_Ratio_x10k', ascending=False)
# Visualisation on one common graph
plt.figure(figsize=(12, 8))
barplot = sns.barplot(x='Trimming Parameters', y='N_Ratio_x10k', data=third_sorted_df)
plt.title('N_Ratio (# N\'s per 100 kbp / N50) for Different Trimming Methods (Values x10,000) after the third pipeline launch')
plt.xlabel('Trimming Methods')
plt.ylabel('N_Ratio (x10,000)')
# Highlight the 'SlidingWindow_4nt_q30' label in red
for label in barplot.get_xticklabels():
if label.get_text() == 'SlidingWindow_4nt_q30':
label.set_color('red')
plt.xticks(rotation=45, ha='right') # Tilt labels for better readability
plt.tight_layout()
plt.show()
What conclusions can be drawn from this?
- For the Sliding Window method, however, our hypothesis came true: we found an even better way to trim the data with new parameters, SlidingWindow_3nt_q30.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Function to extract metrics
def extract_metrics_adjusted(path, metrics_of_interest):
df = pd.read_csv(path, sep='\t', header=0) # Reading without setting the index
metrics_dict = {}
for metric_name, metric_col in metrics_of_interest:
if metric_col in df.columns:
metrics_dict[metric_name] = df[metric_col].iloc[0] # Assuming metrics are in the first row
else:
# Trying to find the metric in a row-wise layout
metric_row = df[df['Assembly'].str.contains(metric_name, case=False, na=False)]
if not metric_row.empty:
metrics_dict[metric_name] = metric_row.iloc[0, 1] # Taking the value from the second column
else:
metrics_dict[metric_name] = None # Metric not found
return pd.Series(metrics_dict)
# Paths to the files
paths = [
'./TrialsExtremes/output_scaffolds_SlidingWindow_10nt_q20.tsv',
'./TrialsExtremes/output_scaffolds_NoTrimming.tsv',
'./TrialsExtremes/output_scaffolds_SlidingWindow_3nt_q30.tsv'
]
# Metrics of interest
metrics_list = [
('Total length', 'Total length (>= 0 bp)'),
('GC', 'GC (%)'),
('Largest contig', 'Largest contig'),
('N50', 'N50'),
('N90', 'N90'),
('L50', 'L50'),
('L90', 'L90'),
("# N's per 100 kbp", "# N's per 100 kbp")
]
# Number of plots per row
plots_per_row = 2
num_metrics = len(metrics_list)
num_rows = np.ceil(num_metrics / plots_per_row).astype(int)
# Create figure with subplots
fig, axes = plt.subplots(num_rows, plots_per_row, figsize=(plots_per_row * 8, num_rows * 6))
axes = axes.flatten() # Flatten the axes array for easy indexing
# Colors for each trimming method
colors = {
'SlidingWindow_10nt_q20': 'r',
'NoTrimming': 'gray',
'SlidingWindow_3nt_q30': 'g'
}
for i, (metric_name, metric_col) in enumerate(metrics_list):
# Extract and combine metrics for the current metric
adjusted_metrics = [extract_metrics_adjusted(path, [(metric_name, metric_col)]) for path in paths]
combined_metrics_df = pd.concat(adjusted_metrics, axis=1)
combined_metrics_df.columns = ['SlidingWindow_10nt_q20', 'NoTrimming', 'SlidingWindow_3nt_q30']
# Create grouped bar chart
ax = axes[i]
n_groups = 1 # One group for each metric
bar_width = 0.25
index = np.arange(n_groups)
positions = [index - bar_width, index, index + bar_width]
for j, label in enumerate(combined_metrics_df.columns):
ax.bar(positions[j], combined_metrics_df[label], color=colors[label], width=bar_width, label=label)
ax.set_ylabel(metric_name)
ax.set_title(f'{metric_name}')
ax.legend()
# Hide any unused axes
for i in range(num_metrics, num_rows * plots_per_row):
fig.delaxes(axes[i])
fig.tight_layout()
plt.show()
What conclusions can be drawn from this?
- As the last chart (# N's per 100 kbp) shows, by running the pipeline three times and processing the data in between, we found the worst (red) and best (green) ways to trim the data compared to no trimming (gray). The N_Ratio of the best method (green) is the minimum of all the results obtained, meaning that the ratio of # N's per 100 kbp to N50 for SlidingWindow_3nt_q30 is the best of all the trimming methods found (44 of the 48 genomes assembled from the three pipeline runs).
- # N's per 100 kbp with the best-found trimming method improved 4.95-fold compared to the case where no trimming was used; a quick check from the tabulated values follows.
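A one-line arithmetic check of the improvement factor, using the # N's per 100 kbp values from the tables above:
# 9.20 with NoTrimming vs 1.86 with SlidingWindow_3nt_q30
print(round(9.20 / 1.86, 2))  # 4.95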
Practical Applications of the Optimal Trimming Method¶
The results of the research on selecting the best trimming method offer several significant practical advantages:
Improved Quality of Genome Assemblies: Applying the optimal trimming method significantly enhances the quality of genome assembly, which is crucial for subsequent analyses and studies. This leads to more accurate and reliable results in genome research.
Time and Resource Savings: Optimizing the trimming process can reduce the time and resources spent on genome data analysis. Improving data quality on the first attempt decreases the need for re-analysis and adjustments.
Enhanced Diagnostic and Research Procedures: In medical and biological research, the accuracy of genome data is critical. Improved trimming methods contribute to more precise disease diagnoses and a more detailed study of genetic characteristics of various organisms.
Development of Standards and Recommendations: Research findings can be used to develop standards and recommendations for genome data processing, helping other researchers and laboratories improve their workflows.
Reduction of Errors and Artifacts: The optimal trimming method reduces the number of errors and artifacts in the data, increasing its reliability and validity. This is important for further analysis and data interpretation.
Competitive Advantage: Laboratories and research groups using the best trimming methods can achieve higher quality results, giving them a competitive edge in genome research.
In summary, the identified trimming method ensures higher data quality, saves resources, and improves the reliability of results, which is highly valuable for practical applications in genome research and medicine.
Scientific Applications¶
The results of the study and the optimized trimming method can be applied in various scientific fields:
Genomics and Genetics:
- Human Genome Research: Optimization of trimming methods helps improve the quality of data used to study genetic variations, hereditary diseases, and individual genome features.
- Genomic Projects and Consortia: Projects like the Human Genome Project and other international initiatives can use the best trimming methods to enhance data accuracy.
Medical Research:
- Disease Diagnosis: Higher quality genome data aids in more accurate diagnosis of genetic diseases and the development of personalized treatment methods.
- Oncogenomics: Study of genetic changes in cancer cells to identify mutations associated with cancer and develop targeted therapies.
Pharmacogenomics:
- Drug Development: Optimization of genome data helps identify genetic targets for new drugs and assess their effectiveness and safety for different genetic groups.
Evolutionary Biology:
- Species Evolution Research: Analysis of genome data from different species helps understand evolutionary processes and genetic diversification.
Agrobiotechnology:
- Plant and Animal Breeding: Using optimized trimming methods to improve the quality of genome data, aiding in the breeding of new plant varieties and animal breeds with desired characteristics.
- Genetic Modification: Accurate genome editing to create GMOs with improved traits such as disease resistance and stress tolerance.
Microbiology and Biotechnology:
- Metagenomics: Study of microbiomes and ecosystems to understand their structure and function, and to develop new biotechnological applications.
- Bioremediation: Use of microorganisms to clean up contaminated environments, where precise understanding of their genomic functions is crucial.
Clinical Research and Personalized Medicine:
- Personalized Therapy: Application of genome data for the selection of individualized treatment strategies based on the patient's genetic characteristics.
Applying optimized trimming methods in these scientific fields leads to more accurate and reliable data, which in turn drives significant scientific and practical advancements.