Skip to content

Commit

Permalink
notebook: add iTOL presence absence GCF
Browse files Browse the repository at this point in the history
  • Loading branch information
matinnuhamunada committed Mar 11, 2024
1 parent 79c19c4 commit ee776a1
Showing 1 changed file with 196 additions and 3 deletions.
199 changes: 196 additions & 3 deletions workflow/notebook/automlst-wrapper.rpy.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,7 @@
"source": [
"def create_itol_multiple_barchart_annotation(df, dataset_label, output_file, width=50, sep=\"COMMA\"):\n",
" # Define the iTOL template\n",
" itol_template = \"\"\"\n",
"DATASET_MULTIBAR\n",
" itol_template = \"\"\"DATASET_MULTIBAR\n",
"#In multi-value bar charts, each ID is associated to multiple numeric values, which are displayed as a stacked or aligned bar chart\n",
"#lines starting with a hash are comments and ignored during parsing\n",
"#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).\n",
Expand Down Expand Up @@ -372,6 +371,178 @@
" f.write(itol_annotation)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a92f8c3a-69c1-49d1-9307-d97a75de8a8f",
"metadata": {},
"outputs": [],
"source": [
def create_itol_heatmap_annotation(df, dataset_label, output_file, strip_width=25, sep="COMMA", color_min="#ff0000", color_max="#0000ff", color_mid="#ffff00"):
    """Write an iTOL ``DATASET_HEATMAP`` annotation file built from a DataFrame.

    Each row of ``df`` becomes one data line: the row's index label (the
    tree node ID) followed by one value per column. The column labels are
    written as ``FIELD_LABELS``.

    Args:
        df: DataFrame whose index holds the node IDs and whose columns hold
            the values to display.
        dataset_label: Text written to the ``DATASET_LABEL`` setting.
        output_file: Path of the annotation file to create or overwrite.
        strip_width: Value written to the ``STRIP_WIDTH`` setting.
        sep: iTOL separator keyword: ``"COMMA"``, ``"TAB"`` or ``"SPACE"``.
            The matching character delimits every setting and data line.
        color_min: Value written to the ``COLOR_MIN`` setting.
        color_max: Value written to the ``COLOR_MAX`` setting.
        color_mid: Value written to the ``COLOR_MID`` setting
            (``USE_MID_COLOR`` is always written as 1).

    Raises:
        ValueError: If ``sep`` is not one of the supported keywords.
    """
    # The SEPARATOR line carries a keyword, but every other line must use
    # the character that keyword stands for; otherwise the file contradicts
    # its own declaration as soon as sep is not "COMMA".
    delimiters = {"COMMA": ",", "TAB": "\t", "SPACE": " "}
    if sep not in delimiters:
        raise ValueError(
            f"Unsupported iTOL separator {sep!r}; expected one of {sorted(delimiters)}"
        )
    delim = delimiters[sep]

    # Define the iTOL template
    itol_template = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
# MANDATORY SETTINGS #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR {sep}

#label is used in the legend table (can be changed later)
DATASET_LABEL{delim}{dataset_label}

#dataset color (can be changed later)
COLOR{delim}#ff0000

#define labels for each individual field column
FIELD_LABELS{delim}{field_labels}

#=================================================================#
# OPTIONAL SETTINGS #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2_longer_one:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1.52;
#FIELD_TREE (((f1,f5),(f2_longer_one,f3)),(f4,f6));
#FIELD_TREE (:0.1,:0.2,(:0.3,:0.4):0.5):0.0;

#=================================================================#
# all other optional settings can be set or changed later #
# in the web interface (under 'Datasets' tab) #
#=================================================================#

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
MARGIN{delim}0

#width of the individual boxes
STRIP_WIDTH{delim}{strip_width}

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
SHOW_INTERNAL{delim}0

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
SHOW_TREE{delim}1

#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN{delim}{color_min}
COLOR_MAX{delim}{color_max}

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
USE_MID_COLOR{delim}1
COLOR_MID{delim}{color_mid}

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
# Actual data follows after the "DATA" keyword #
#=================================================================#
DATA
{data}
"""

    # Column labels are stringified so that non-string labels (e.g. ints)
    # do not break the join.
    field_labels = delim.join(map(str, df.columns))

    # One line per row: node ID first, then the row values. Iterating the
    # tuples directly (instead of reset_index + apply) also works for a
    # DataFrame without rows or with a column that is already named "ID".
    data = "\n".join(
        delim.join(map(str, row)) for row in df.itertuples(index=True, name=None)
    )

    # Fill the iTOL template with the settings and the data
    itol_annotation = itol_template.format(
        sep=sep,
        delim=delim,
        dataset_label=dataset_label,
        field_labels=field_labels,
        strip_width=strip_width,
        color_min=color_min,
        color_max=color_max,
        color_mid=color_mid,
        data=data,
    )

    # Write the iTOL annotation to a file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(itol_annotation)
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94807502-d52c-480e-910e-ed5c69f6e577",
"metadata": {},
"outputs": [],
"source": [
def create_itol_binary_annotation(df, dataset_label, field_shapes, field_colors, output_file, strip_width=25, sep="COMMA", color="#ff0000", height_factor=0.3, symbol_spacing=0):
    """Write an iTOL ``DATASET_BINARY`` annotation file built from a DataFrame.

    Each row of ``df`` becomes one data line: the row's index label (the
    tree node ID) followed by one value per column. Per the template text,
    the values iTOL expects are 1, 0 and -1. Column labels are written as
    ``FIELD_LABELS`` with a ``GCF_`` prefix.

    Args:
        df: DataFrame whose index holds the node IDs and whose columns hold
            the per-field values.
        dataset_label: Text written to the ``DATASET_LABEL`` setting.
        field_shapes: One shape code per column, written as ``FIELD_SHAPES``
            (strings or ints; the template lists codes 1-5).
        field_colors: One color per column, written as ``FIELD_COLORS``.
        output_file: Path of the annotation file to create or overwrite.
        strip_width: Accepted for signature compatibility; the binary
            template has no placeholder for it, so it is not written.
        sep: iTOL separator keyword: ``"COMMA"``, ``"TAB"`` or ``"SPACE"``.
            The matching character delimits every setting and data line.
        color: Value written to the ``COLOR`` setting.
        height_factor: Value written to the ``HEIGHT_FACTOR`` setting.
        symbol_spacing: Value written to the ``SYMBOL_SPACING`` setting.

    Raises:
        ValueError: If ``sep`` is not one of the supported keywords.
    """
    # The SEPARATOR line carries a keyword, but every other line must use
    # the character that keyword stands for; otherwise the file contradicts
    # its own declaration as soon as sep is not "COMMA".
    delimiters = {"COMMA": ",", "TAB": "\t", "SPACE": " "}
    if sep not in delimiters:
        raise ValueError(
            f"Unsupported iTOL separator {sep!r}; expected one of {sorted(delimiters)}"
        )
    delim = delimiters[sep]

    # Define the iTOL template
    itol_template = """DATASET_BINARY
#lines starting with a hash are comments and ignored during parsing
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throught this file (except in the SEPARATOR line, which uses space).

#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR {sep}

#label is used in the legend table (can be changed later)
DATASET_LABEL{delim}{dataset_label}

#dataset color (can be changed later)
COLOR{delim}{color}

#Binary datasets can contain one or more values for each node. Each value will be represented by a symbol (defined in FIELD_SHAPES) with corresponding color and label (from FIELD_COLORS and FIELD_LABELS). Possible values (defined under DATA below) for each node are 1 (filled shapes), 0 (empty shapes) and -1 (completely ommited).

#define colors for each individual field column (if not defined all symbols will use the main dataset color, defined in COLOR)
#shapes for each field column; possible choices are
#1: rectangle 
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
FIELD_LABELS{delim}{field_labels}
FIELD_COLORS{delim}{field_colors}
FIELD_SHAPES{delim}{field_shapes}

#all other optional settings can be set or changed later in the web interface (under 'Datasets' tab)

#show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
SHOW_INTERNAL{delim}1

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
MARGIN{delim}0

#symbol height factor; Default symbol height will be slightly less than the available space between leaves, but you can set a multiplication factor here to increase/decrease it (values from 0 to 1 will decrease it, values above 1 will increase it)
HEIGHT_FACTOR{delim}{height_factor}

#increase/decrease the spacing between individual levels, when there is more than one binary level defined 
SYMBOL_SPACING{delim}{symbol_spacing}

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#Actual data follows after the "DATA" keyword
DATA
{data}
"""

    # Per-column settings; entries are stringified so ints (e.g. shape
    # codes) are accepted as well as strings.
    labels_line = delim.join(f"GCF_{c}" for c in df.columns)
    shapes_line = delim.join(map(str, field_shapes))
    colors_line = delim.join(map(str, field_colors))

    # One line per row: node ID first, then the row values. Iterating the
    # tuples directly (instead of reset_index + apply) also works for a
    # DataFrame without rows or with a column that is already named "ID".
    data = "\n".join(
        delim.join(map(str, row)) for row in df.itertuples(index=True, name=None)
    )

    # Fill the iTOL template with the settings and the data
    itol_annotation = itol_template.format(
        sep=sep,
        delim=delim,
        dataset_label=dataset_label,
        color=color,
        field_labels=labels_line,
        field_colors=colors_line,
        field_shapes=shapes_line,
        height_factor=height_factor,
        symbol_spacing=symbol_spacing,
        data=data,
    )

    # Write the iTOL annotation to a file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(itol_annotation)
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -472,7 +643,29 @@
" if bigscape_class in df_bigscape_class_counts.index:\n",
" # Store the count of the current BIG-SCAPE class in the genomes DataFrame\n",
" df_genomes.loc[gid, bigscape_class] = df_bigscape_class_counts[bigscape_class]\n",
"\n",
" \n",
" # create absence presence matrix\n",
" df_presence = df_gcf_presence.replace(0, -1)\n",
" for gcf_id in df_gcfs.index:\n",
" fam_type = df_gcfs.loc[gcf_id, \"fam_type\"]\n",
" if fam_type == \"unknown_family\":\n",
" df_presence[str(gcf_id)] = df_presence[str(gcf_id)].replace(1, 0)\n",
" color = \"#ff0000\"\n",
" shape = 1\n",
" field_colors = [color for i in df_presence.columns]\n",
" field_shapes = [str(shape) for i in df_presence.columns]\n",
" \n",
" # write absence presence matrix\n",
" outfile = Path(f\"assets/iTOL_annotation/iTOL_BiG-SCAPE_presence_antismash_{antismash_version}.txt\")\n",
" outfile.parent.mkdir(parents=True, exist_ok=True)\n",
" create_itol_binary_annotation(df_presence, \"GCF presence\", field_shapes, field_colors, outfile, sep=\"COMMA\", color=\"#ff0000\")\n",
" \n",
" # Create a download button for the iTOL annotation\n",
" button_link = f'<a href=\"../{outfile}\" download class=\"md-button\">Download BiG-SCAPE GCF presence</a>'\n",
" \n",
" # Add the download button to the list of button items\n",
" button_items.append(button_link)\n",
" \n",
" # Get the column names of the genomes DataFrame excluding 'Genome ID', 'Unique BGCs', and 'BGCs'\n",
" column_names = df_genomes.drop(['Genome ID', 'Unique BGCs', 'BGCs'], axis=1).columns\n",
"\n",
Expand Down

0 comments on commit ee776a1

Please sign in to comment.