notebook: add iTOL presence absence GCF

NBChub · Mar 11, 2024 · ee776a1 · ee776a1
1 parent 79c19c4
commit ee776a1
Showing 1 changed file with 196 additions and 3 deletions.
diff --git a/workflow/notebook/automlst-wrapper.rpy.ipynb b/workflow/notebook/automlst-wrapper.rpy.ipynb
@@ -316,8 +316,7 @@
    "source": [
     "def create_itol_multiple_barchart_annotation(df, dataset_label, output_file, width=50, sep=\"COMMA\"):\n",
     "    # Define the iTOL template\n",
-    "    itol_template = \"\"\"\n",
-    "DATASET_MULTIBAR\n",
+    "    itol_template = \"\"\"DATASET_MULTIBAR\n",
     "#In multi-value bar charts, each ID is associated to multiple numeric values, which are displayed as a stacked or aligned bar chart\n",
     "#lines starting with a hash are comments and ignored during parsing\n",
     "#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).\n",
@@ -372,6 +371,178 @@
     "        f.write(itol_annotation)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a92f8c3a-69c1-49d1-9307-d97a75de8a8f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_itol_heatmap_annotation(df, dataset_label, output_file, strip_width=25, sep=\"COMMA\", color_min=\"#ff0000\", color_max=\"#0000ff\", color_mid=\"#ffff00\"):\n",
+    "    # Define the iTOL template\n",
+    "    itol_template = \"\"\"DATASET_HEATMAP\n",
+    "#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient\n",
+    "#lines starting with a hash are comments and ignored during parsing\n",
+    "#=================================================================#\n",
+    "#                    MANDATORY SETTINGS                           #\n",
+    "#=================================================================#\n",
+    "#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).\n",
+    "#SEPARATOR TAB\n",
+    "#SEPARATOR SPACE\n",
+    "SEPARATOR {sep}\n",
+    "\n",
+    "#label is used in the legend table (can be changed later)\n",
+    "DATASET_LABEL,{dataset_label}\n",
+    "\n",
+    "#dataset color (can be changed later)\n",
+    "COLOR,#ff0000\n",
+    "\n",
+    "#define labels for each individual field column\n",
+    "FIELD_LABELS,{field_labels}\n",
+    "\n",
+    "#=================================================================#\n",
+    "#                    OPTIONAL SETTINGS                            #\n",
+    "#=================================================================#\n",
+    "\n",
+    "\n",
+    "#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.\n",
+    "#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.\n",
+    "#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.\n",
+    "#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2_longer_one:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1.52;\n",
+    "#FIELD_TREE (((f1,f5),(f2_longer_one,f3)),(f4,f6));\n",
+    "#FIELD_TREE (:0.1,:0.2,(:0.3,:0.4):0.5):0.0;\n",
+    "\n",
+    "#=================================================================#\n",
+    "#     all other optional settings can be set or changed later     #\n",
+    "#           in the web interface (under 'Datasets' tab)           #\n",
+    "#=================================================================#\n",
+    "\n",
+    "#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.\n",
+    "MARGIN,0\n",
+    "\n",
+    "#width of the individual boxes\n",
+    "STRIP_WIDTH,{strip_width}\n",
+    "\n",
+    "#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.\n",
+    "SHOW_INTERNAL,0\n",
+    "\n",
+    "#if a FIELD_TREE is present, it can be hidden by setting this option to 0\n",
+    "SHOW_TREE,1\n",
+    "\n",
+    "#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.\n",
+    "COLOR_MIN,{color_min}\n",
+    "COLOR_MAX,{color_max}\n",
+    "\n",
+    "#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color\n",
+    "USE_MID_COLOR,1\n",
+    "COLOR_MID,{color_mid}\n",
+    "\n",
+    "#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages\n",
+    "#=================================================================#\n",
+    "#       Actual data follows after the \"DATA\" keyword              #\n",
+    "#=================================================================#\n",
+    "DATA\n",
+    "{data}\n",
+    "\"\"\"\n",
+    "\n",
+    "    # Define the field labels (one per DataFrame column)\n",
+    "    field_labels = ','.join(df.columns)\n",
+    "    \n",
+    "    # Format the DataFrame to match the iTOL format\n",
+    "    df_itol = df.copy()\n",
+    "    df_itol.index.name = 'ID'\n",
+    "    df_itol = df_itol.reset_index()\n",
+    "    df_itol = df_itol.astype(str)\n",
+    "    data = '\\n'.join(df_itol.apply(lambda x: ','.join(x), axis=1))\n",
+    "\n",
+    "    # Fill the iTOL template with the field labels, colors, and data\n",
+    "    itol_annotation = itol_template.format(field_labels=field_labels, color_min=color_min, color_max=color_max, color_mid=color_mid,\n",
+    "                                           strip_width=strip_width, data=data, sep=sep, dataset_label=dataset_label)\n",
+    "\n",
+    "    # Write the iTOL annotation to a file\n",
+    "    with open(output_file, 'w') as f:\n",
+    "        f.write(itol_annotation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94807502-d52c-480e-910e-ed5c69f6e577",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_itol_binary_annotation(df, dataset_label, field_shapes, field_colors, output_file, strip_width=25, sep=\"COMMA\", color=\"#ff0000\", height_factor=0.3, symbol_spacing=0):\n",
+    "    # Define the iTOL template\n",
+    "    itol_template = \"\"\"DATASET_BINARY\n",
+    "#lines starting with a hash are comments and ignored during parsing\n",
+    "#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).\n",
+    "\n",
+    "#SEPARATOR TAB\n",
+    "#SEPARATOR SPACE\n",
+    "SEPARATOR {sep}\n",
+    "\n",
+    "#label is used in the legend table (can be changed later)\n",
+    "DATASET_LABEL,{dataset_label}\n",
+    "\n",
+    "#dataset color (can be changed later)\n",
+    "COLOR,{color}\n",
+    "\n",
+    "#Binary datasets can contain one or more values for each node. Each value will be represented by a symbol (defined in FIELD_SHAPES) with corresponding color and label (from FIELD_COLORS and FIELD_LABELS). Possible values (defined under DATA below) for each node are 1 (filled shapes), 0 (empty shapes) and -1 (completely omitted).\n",
+    "\n",
+    "#define colors for each individual field column (if not defined all symbols will use the main dataset color, defined in COLOR)\n",
+    "#shapes for each field column; possible choices are\n",
+    "#1: rectangle \n",
+    "#2: circle\n",
+    "#3: star\n",
+    "#4: right pointing triangle\n",
+    "#5: left pointing triangle\n",
+    "FIELD_LABELS,{field_labels}\n",
+    "FIELD_COLORS,{field_colors}\n",
+    "FIELD_SHAPES,{field_shapes}\n",
+    "\n",
+    "#all other optional settings can be set or changed later in the web interface (under 'Datasets' tab)\n",
+    "\n",
+    "#show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.\n",
+    "SHOW_INTERNAL,1\n",
+    "\n",
+    "#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.\n",
+    "MARGIN,0\n",
+    "\n",
+    "#symbol height factor; Default symbol height will be slightly less than the available space between leaves, but you can set a multiplication factor here to increase/decrease it (values from 0 to 1 will decrease it, values above 1 will increase it)\n",
+    "HEIGHT_FACTOR,{height_factor}\n",
+    "\n",
+    "#increase/decrease the spacing between individual levels, when there is more than one binary level defined \n",
+    "SYMBOL_SPACING,{symbol_spacing}\n",
+    "\n",
+    "#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages\n",
+    "#Actual data follows after the \"DATA\" keyword\n",
+    "DATA\n",
+    "{data}\n",
+    "\"\"\"\n",
+    "\n",
+    "    # Define the field labels (each column name prefixed with 'GCF_')\n",
+    "    field_labels = ','.join([f\"GCF_{c}\" for c in df.columns])\n",
+    "    \n",
+    "    # Format the DataFrame to match the iTOL format\n",
+    "    df_itol = df.copy()\n",
+    "    df_itol.index.name = 'ID'\n",
+    "    df_itol = df_itol.reset_index()\n",
+    "    df_itol = df_itol.astype(str)\n",
+    "    data = '\\n'.join(df_itol.apply(lambda x: ','.join(x), axis=1))\n",
+    "    field_shapes = \",\".join(field_shapes)\n",
+    "    field_colors = \",\".join(field_colors)\n",
+    "\n",
+    "    # Fill the iTOL template with the field labels, colors, and data\n",
+    "    itol_annotation = itol_template.format(field_labels=field_labels, color=color, field_shapes=field_shapes,\n",
+    "                                           field_colors=field_colors, strip_width=strip_width, data=data, \n",
+    "                                           sep=sep, symbol_spacing=symbol_spacing, height_factor=height_factor, \n",
+    "                                           dataset_label=dataset_label)\n",
+    "\n",
+    "    # Write the iTOL annotation to a file\n",
+    "    with open(output_file, 'w') as f:\n",
+    "        f.write(itol_annotation)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -472,7 +643,29 @@
     "                if bigscape_class in df_bigscape_class_counts.index:\n",
     "                    # Store the count of the current BIG-SCAPE class in the genomes DataFrame\n",
     "                    df_genomes.loc[gid, bigscape_class] = df_bigscape_class_counts[bigscape_class]\n",
-    "\n",
+    "        \n",
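+    "        # create presence/absence matrix: absent GCFs (0) become -1 (omitted in iTOL); GCFs of an unknown family are changed from 1 to 0 (empty shapes)\n",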
+    "        df_presence = df_gcf_presence.replace(0, -1)\n",
+    "        for gcf_id in df_gcfs.index:\n",
+    "            fam_type = df_gcfs.loc[gcf_id, \"fam_type\"]\n",
+    "            if fam_type == \"unknown_family\":\n",
+    "                df_presence[str(gcf_id)] = df_presence[str(gcf_id)].replace(1, 0)\n",
+    "        color = \"#ff0000\"\n",
+    "        shape = 1\n",
+    "        field_colors = [color for i in df_presence.columns]\n",
+    "        field_shapes = [str(shape) for i in df_presence.columns]\n",
+    "        \n",
+    "        # write absence presence matrix\n",
+    "        outfile = Path(f\"assets/iTOL_annotation/iTOL_BiG-SCAPE_presence_antismash_{antismash_version}.txt\")\n",
+    "        outfile.parent.mkdir(parents=True, exist_ok=True)\n",
+    "        create_itol_binary_annotation(df_presence, \"GCF presence\", field_shapes, field_colors, outfile, sep=\"COMMA\", color=\"#ff0000\")\n",
+    "        \n",
+    "        # Create a download button for the iTOL annotation\n",
+    "        button_link = f'<a href=\"../{outfile}\" download class=\"md-button\">Download BiG-SCAPE GCF presence</a>'\n",
+    "        \n",
+    "        # Add the download button to the list of button items\n",
+    "        button_items.append(button_link)\n",
+    "        \n",
     "        # Get the column names of the genomes DataFrame excluding 'Genome ID', 'Unique BGCs', and 'BGCs'\n",
     "        column_names = df_genomes.drop(['Genome ID', 'Unique BGCs', 'BGCs'], axis=1).columns\n",
     "\n",