# Maps the keyword written on the iTOL ``SEPARATOR`` line to the character
# that must then delimit every directive and data line of the same file.
_ITOL_DELIMITERS = {"COMMA": ",", "TAB": "\t", "SPACE": " "}

_ITOL_HEATMAP_TEMPLATE = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
# MANDATORY SETTINGS #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR {sep}

#label is used in the legend table (can be changed later)
DATASET_LABEL{delim}{dataset_label}

#dataset color (can be changed later)
COLOR{delim}#ff0000

#define labels for each individual field column
FIELD_LABELS{delim}{field_labels}

#=================================================================#
# OPTIONAL SETTINGS #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2_longer_one:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1.52;
#FIELD_TREE (((f1,f5),(f2_longer_one,f3)),(f4,f6));
#FIELD_TREE (:0.1,:0.2,(:0.3,:0.4):0.5):0.0;

#=================================================================#
# all other optional settings can be set or changed later #
# in the web interface (under 'Datasets' tab) #
#=================================================================#

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
MARGIN{delim}0

#width of the individual boxes
STRIP_WIDTH{delim}{strip_width}

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
SHOW_INTERNAL{delim}0

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
SHOW_TREE{delim}1

#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN{delim}{color_min}
COLOR_MAX{delim}{color_max}

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
USE_MID_COLOR{delim}1
COLOR_MID{delim}{color_mid}

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
# Actual data follows after the "DATA" keyword #
#=================================================================#
DATA
{data}
"""

_ITOL_BINARY_TEMPLATE = """DATASET_BINARY
#lines starting with a hash are comments and ignored during parsing
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throught this file (except in the SEPARATOR line, which uses space).

#SEPARATOR TAB
#SEPARATOR SPACE
SEPARATOR {sep}

#label is used in the legend table (can be changed later)
DATASET_LABEL{delim}{dataset_label}

#dataset color (can be changed later)
COLOR{delim}{color}

#Binary datasets can contain one or more values for each node. Each value will be represented by a symbol (defined in FIELD_SHAPES) with corresponding color and label (from FIELD_COLORS and FIELD_LABELS). Possible values (defined under DATA below) for each node are 1 (filled shapes), 0 (empty shapes) and -1 (completely ommited).

#define colors for each individual field column (if not defined all symbols will use the main dataset color, defined in COLOR)
#shapes for each field column; possible choices are
#1: rectangle 
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
FIELD_LABELS{delim}{field_labels}
FIELD_COLORS{delim}{field_colors}
FIELD_SHAPES{delim}{field_shapes}

#all other optional settings can be set or changed later in the web interface (under 'Datasets' tab)

#show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
SHOW_INTERNAL{delim}1

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
MARGIN{delim}0

#symbol height factor; Default symbol height will be slightly less than the available space between leaves, but you can set a multiplication factor here to increase/decrease it (values from 0 to 1 will decrease it, values above 1 will increase it)
HEIGHT_FACTOR{delim}{height_factor}

#increase/decrease the spacing between individual levels, when there is more than one binary level defined 
SYMBOL_SPACING{delim}{symbol_spacing}

#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#Actual data follows after the "DATA" keyword
DATA
{data}
"""


def _itol_delimiter(sep):
    """Return the delimiter character for an iTOL ``SEPARATOR`` keyword.

    Raises:
        ValueError: if ``sep`` is not one of ``COMMA``, ``TAB`` or ``SPACE``.
    """
    try:
        return _ITOL_DELIMITERS[sep]
    except KeyError:
        valid = ", ".join(sorted(_ITOL_DELIMITERS))
        raise ValueError(
            f"Unsupported iTOL separator {sep!r}; expected one of: {valid}"
        ) from None


def _itol_data_rows(df, delimiter):
    """Render ``df`` as the DATA section: one ``<index><values...>`` line per row.

    Rows are built with ``itertuples`` rather than ``reset_index`` + ``apply``:
    ``apply(axis=1)`` on a frame without rows returns an empty DataFrame, and
    joining that would emit the column labels as if they were data rows.
    """
    stringified = df.astype(str)
    return "\n".join(
        delimiter.join(str(cell) for cell in row)
        for row in stringified.itertuples(index=True, name=None)
    )


def create_itol_heatmap_annotation(df, dataset_label, output_file, strip_width=25, sep="COMMA", color_min="#ff0000", color_max="#0000ff", color_mid="#ffff00"):
    """Write an iTOL ``DATASET_HEATMAP`` annotation file for ``df``.

    Args:
        df: DataFrame whose index holds the tree node IDs and whose columns
            are the heatmap fields; every cell is written via ``str``.
        dataset_label: Value written on the ``DATASET_LABEL`` line.
        output_file: Path of the annotation file to (over)write.
        strip_width: Value written on the ``STRIP_WIDTH`` line.
        sep: iTOL separator keyword (``COMMA``, ``TAB`` or ``SPACE``). It is
            written on the ``SEPARATOR`` line and the matching character
            delimits every directive and data line.
        color_min: Value written on the ``COLOR_MIN`` line.
        color_max: Value written on the ``COLOR_MAX`` line.
        color_mid: Value written on the ``COLOR_MID`` line.

    Raises:
        ValueError: if ``sep`` is not a supported separator keyword.
    """
    delim = _itol_delimiter(sep)

    # Column labels may be non-string (e.g. integer IDs), so stringify first.
    field_labels = delim.join(str(label) for label in df.columns)

    itol_annotation = _ITOL_HEATMAP_TEMPLATE.format(
        sep=sep,
        delim=delim,
        dataset_label=dataset_label,
        field_labels=field_labels,
        strip_width=strip_width,
        color_min=color_min,
        color_max=color_max,
        color_mid=color_mid,
        data=_itol_data_rows(df, delim),
    )

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(itol_annotation)


def create_itol_binary_annotation(df, dataset_label, field_shapes, field_colors, output_file, strip_width=25, sep="COMMA", color="#ff0000", height_factor=0.3, symbol_spacing=0):
    """Write an iTOL ``DATASET_BINARY`` annotation file for ``df``.

    Args:
        df: DataFrame whose index holds the tree node IDs and whose columns
            are the binary fields; each column label is written as
            ``GCF_<label>`` on the ``FIELD_LABELS`` line.
        dataset_label: Value written on the ``DATASET_LABEL`` line.
        field_shapes: Iterable of shape codes, one per column; items may be
            strings or integers.
        field_colors: Iterable of colors, one per column.
        output_file: Path of the annotation file to (over)write.
        strip_width: Accepted for interface compatibility; the binary
            template has no ``STRIP_WIDTH`` line, so it is not written.
        sep: iTOL separator keyword (``COMMA``, ``TAB`` or ``SPACE``). It is
            written on the ``SEPARATOR`` line and the matching character
            delimits every directive and data line.
        color: Value written on the ``COLOR`` line.
        height_factor: Value written on the ``HEIGHT_FACTOR`` line.
        symbol_spacing: Value written on the ``SYMBOL_SPACING`` line.

    Raises:
        ValueError: if ``sep`` is not a supported separator keyword.
    """
    delim = _itol_delimiter(sep)

    field_labels = delim.join(f"GCF_{c}" for c in df.columns)
    # str() lets callers pass integer shape codes as well as strings.
    shapes = delim.join(str(shape) for shape in field_shapes)
    colors = delim.join(str(field_color) for field_color in field_colors)

    itol_annotation = _ITOL_BINARY_TEMPLATE.format(
        sep=sep,
        delim=delim,
        dataset_label=dataset_label,
        color=color,
        field_labels=field_labels,
        field_colors=colors,
        field_shapes=shapes,
        height_factor=height_factor,
        symbol_spacing=symbol_spacing,
        data=_itol_data_rows(df, delim),
    )

    # Explicit encoding keeps the output independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(itol_annotation)
df_genomes.loc[gid, bigscape_class] = df_bigscape_class_counts[bigscape_class]\n", - "\n", + " \n", + " # create absence presence matrix\n", + " df_presence = df_gcf_presence.replace(0, -1)\n", + " for gcf_id in df_gcfs.index:\n", + " fam_type = df_gcfs.loc[gcf_id, \"fam_type\"]\n", + " if fam_type == \"unknown_family\":\n", + " df_presence[str(gcf_id)] = df_presence[str(gcf_id)].replace(1, 0)\n", + " color = \"#ff0000\"\n", + " shape = 1\n", + " field_colors = [color for i in df_presence.columns]\n", + " field_shapes = [str(shape) for i in df_presence.columns]\n", + " \n", + " # write absence presence matrix\n", + " outfile = Path(f\"assets/iTOL_annotation/iTOL_BiG-SCAPE_presence_antismash_{antismash_version}.txt\")\n", + " outfile.parent.mkdir(parents=True, exist_ok=True)\n", + " create_itol_binary_annotation(df_presence, \"GCF presence\", field_shapes, field_colors, outfile, sep=\"COMMA\", color=\"#ff0000\")\n", + " \n", + " # Create a download button for the iTOL annotation\n", + " button_link = f'Download BiG-SCAPE GCF presence'\n", + " \n", + " # Add the download button to the list of button items\n", + " button_items.append(button_link)\n", + " \n", " # Get the column names of the genomes DataFrame excluding 'Genome ID', 'Unique BGCs', and 'BGCs'\n", " column_names = df_genomes.drop(['Genome ID', 'Unique BGCs', 'BGCs'], axis=1).columns\n", "\n",