# bcstats

#!/usr/bin/python3
#
# bcstats was built to provide summarized program information for Bugcrowd, based
# on the h1stats program released by defparam under the MIT license
#
# Authors: pmnh (h1pmnh)
#
# MIT License
# 
# Copyright (c) 2021 pmnh
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from os import dup
import requests
import json
import argparse
import csv
import html
from datetime import datetime
# used for parsing of the program HTML pages to grab React JSON data because there is no API :(
from bs4 import BeautifulSoup

def get_all_programs(args):
    """Collect every program returned by the Bugcrowd search and write a CSV.

    Pages through ``https://bugcrowd.com/programs.json`` using
    ``args.search_params`` until the number of collected entries reaches the
    ``totalHits`` value reported by the API. Each entry that carries a
    ``program_url`` gets a ``full_url`` key and, unless its ``invited_status``
    is ``joinable`` or ``waitlistable``, is enriched with the values returned
    by ``get_program_details``. Entries without a ``program_url`` are skipped.

    The result is written to ``bcstats-<year>-<month>-<day>.csv`` in the
    current working directory.

    Args:
        args: parsed command-line arguments; ``session_cookie`` and
            ``search_params`` are read.

    Raises:
        SystemExit: if the session cookie contains a ``%`` character (it is
            probably still URL encoded) or the search API reports an error.
    """
    if args.session_cookie.find("%") >= 0:
        print("ERROR: This doesn't look like a _crowdcontrol_session or may be URL encoded, make sure you are using the decoded session cookie!")
        # raise SystemExit directly: the `exit` builtin is injected by the
        # `site` module for interactive use and is not guaranteed to exist
        raise SystemExit(1)

    headers = {
        "Cookie":"_crowdcontrol_session=%s" % args.session_cookie
    }

    page = 1 # in Apr 2022 or so, BC decided to once again change the pagination API
    progcount = 0
    progtotal = 99999  # placeholder until the first response reports totalHits
    all_programs = []
    while progcount < progtotal:
        search_url = "https://bugcrowd.com/programs.json?%s&page[]=%d" % (args.search_params, page)
        resp = requests.get(search_url, headers=headers)
        data = json.loads(resp.text)
        # this typically is a fatal error such as an expired cookie
        if "error" in data:
            print(" [!] Error reported: %s" % data["error"])
            print(" [!] Aborting run")
            raise SystemExit(1)

        page_programs = data["programs"]
        if not page_programs:
            # An empty page means there is nothing left to fetch. Without this
            # guard the loop would spin forever whenever totalHits is larger
            # than the number of entries the API actually returns.
            break

        progcount += len(page_programs)
        print("[+] Collecting... (%d programs)                              "%(progcount), flush=True, end='\n')
        for program in page_programs:
            print("Extracting data for %s                              " % program["code"], end='\r')
            if "program_url" not in program:
                # use case for a "Program starting soon" program
                continue

            program["full_url"] = "https://bugcrowd.com%s" % (program["program_url"])
            if program["invited_status"] not in ["joinable","waitlistable"]:
                try:
                    prog_details = get_program_details(args, program["code"])
                except Exception as e:
                    # Best effort: keep the basic entry, but report why the
                    # details are missing. Catching Exception (not a bare
                    # except) lets Ctrl-C and SystemExit propagate.
                    print(" [!] Unable to get details for program %s: %s" % (program["code"], e))
                else:
                    if prog_details:
                        # flatten p1..p5 into the row, then add the other stats
                        program.update(prog_details["max_bounties"])
                        program.update(prog_details)

            all_programs.append(program)

        # update the total from the cursor
        progtotal = data["meta"]["totalHits"]
        page = page + 1

    columns = ["code","name","can_subscribe","in_progress?","invited_status","participation","full_url","reward_range_summary",
        "duplicate_bugs","unique_bugs","rewarded_bugs","validation_days","average_bounty","p1","p2","p3","p4","p5","shows_known_issues","has_crowdstream","has_hall_of_fame", "collaboration_enabled","managed?"]

    now = datetime.now()
    filename = "bcstats-%s-%s-%s.csv"%(str(now.year),str(now.month),str(now.day))
    # newline="" lets the csv writer control line endings itself; the explicit
    # encoding keeps non-ASCII program names from failing on platforms whose
    # default encoding is not UTF-8.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        # extrasaction="ignore" drops any API keys that are not in `columns`
        writer = csv.DictWriter(f, fieldnames=columns, extrasaction="ignore", lineterminator="\n")
        writer.writeheader()
        writer.writerows(all_programs)

    print("[+] DONE! Output in %s" % (filename))

def get_program_details(args, slug):
    """Gather detail statistics for a single program.

    Fetches the program page at ``https://bugcrowd.com/<slug>``, the
    ``/target_groups`` JSON below it, and the ``targets_url`` JSON of every
    target group, then combines them with values scraped from the page HTML.

    Args:
        args: parsed command-line arguments; only ``session_cookie`` is read.
        slug: program code, used as the URL path of the program page.

    Returns:
        ``None`` if the program page does not answer with HTTP 200 or contains
        the text "Compliance required". Otherwise a dict with the keys
        ``max_bounties`` (dict ``p1``..``p5`` holding the highest ``max``
        reward seen across target groups), ``duplicate_bugs``,
        ``unique_bugs``, ``rewarded_bugs``, ``validation_days``,
        ``average_bounty``, ``shows_known_issues``, ``has_crowdstream`` and
        ``has_hall_of_fame``. Statistics that are not present on the page keep
        their defaults (``None``, ``0`` or ``False``).
    """
    headers = {
        "Cookie":"_crowdcontrol_session=%s" % args.session_cookie
    }

    program_url = "https://bugcrowd.com/%s" % (slug)
    max_bounties = {"p1":0, "p2":0, "p3":0, "p4":0, "p5":0}
    duplicate_bugs = 0
    unique_bugs = 0
    average_bounty = None
    validation_days = None
    rewarded_bugs = None
    shows_known_issues = False
    has_crowdstream = False
    has_hall_of_fame = False

    resp = requests.get(program_url, headers=headers)
    if resp.status_code != 200:
        print("  [+] Program %s returned a status code %s, skipping program details..." % (slug, resp.status_code))
        return None

    data = resp.text
    if data.find("Compliance required") != -1:
        print("  [+] Program %s has a compliance agreement which has not been accepted - skipping" % (slug))
        return None

    soup = BeautifulSoup(data, features="lxml")
    tags = soup.find_all("div", attrs={"data-react-class": "ResearcherTargetGroups"})
    if len(tags) != 1:
        # reported only; processing continues regardless
        print("ERROR: Expected only one `ResearcherTargetGroups` React data")

    resp_groups = requests.get(program_url + "/target_groups", headers=headers)
    try:
        target_groups = json.loads(resp_groups.text)
    except json.JSONDecodeError as e:
        print(" [!] Unable to decode JSON for program %s: %s" % (slug, e))
        print(" [!] Response received was: %s" % (resp_groups.text))
        target_groups = {"groups":[]}

    for group in target_groups["groups"]:
        # get max rewards by priority
        for pri in ["1", "2", "3", "4", "5"]:
            if pri in group["reward_range"] and group["reward_range"][pri]["max"] and group["reward_range"][pri]["max"] > max_bounties["p"+pri]:
                max_bounties["p"+pri] = group["reward_range"][pri]["max"]

        # separate JSON call now for each target group stats
        resp_group_stats = requests.get("https://bugcrowd.com" + group["targets_url"], headers=headers)
        try:
            group_data = json.loads(resp_group_stats.text)
        except json.JSONDecodeError as e:
            # Same tolerance as for the target_groups call above: one
            # undecodable response should not discard the whole program.
            print(" [!] Unable to decode target JSON for program %s: %s" % (slug, e))
            continue

        # accumulate total and unique bug counts if the program discloses
        for target in group_data.get("targets", []):
            if "known_issues" in target and "totals" in target["known_issues"]:
                unique_bugs += target["known_issues"]["totals"]["unique"]
                duplicate_bugs += target["known_issues"]["totals"]["duplicate"]

    # extract other data from HTML because - no API and no structured data - ugh :~(
    brief_stats_tag = soup.find("div", attrs={"id":"user-guides__bounty-brief__stats"})
    # find() returns None when the stats block is absent; treat that as
    # "no statistics" rather than failing with AttributeError
    brief_stats_lis = brief_stats_tag.find_all("li") if brief_stats_tag is not None else []
    # remove whitespace from titles to ease parsing
    strip_whitespace = str.maketrans('', '', ' \n\t\r')
    for brief_stats_li in brief_stats_lis:
        title_span = brief_stats_li.find("span", attrs={"class":"bc-stat__title"})
        figure_span = brief_stats_li.find("span", attrs={"class":"bc-stat__fig"})
        if title_span is None or figure_span is None:
            # not a title/figure pair, nothing to read from this item
            continue

        title = title_span.text.strip().translate(strip_whitespace)
        figure = figure_span.text.strip()
        if "Validationwithin" == title:
            validation_days = figure
        elif "Vulnerabilitiesrewarded" == title:
            rewarded_bugs = figure
        elif "Averagepayout" == title:
            # drop the first character of the figure (presumably the
            # currency symbol - confirm against the live page)
            average_bounty = figure[1:]

    # look for a program which discloses known issues
    has_known_issues = soup.find("td", attrs={"data-label":"Known issues"})
    if has_known_issues and len(has_known_issues) > 0:
        shows_known_issues = True

    # look for which tabs are shown
    subnavs = soup.find_all("li", attrs={"class":"bc-subnav__item"})
    for subnav in subnavs:
        label = subnav.text.strip()
        if label == "CrowdStream":
            has_crowdstream = True
        if label == "Hall of Fame":
            has_hall_of_fame = True

    details = {
        "max_bounties":max_bounties,
        "duplicate_bugs":duplicate_bugs,
        "unique_bugs":unique_bugs,
        "rewarded_bugs":rewarded_bugs,
        "validation_days":validation_days,
        "average_bounty":average_bounty,
        "shows_known_issues":shows_known_issues,
        "has_crowdstream":has_crowdstream,
        "has_hall_of_fame":has_hall_of_fame
    }
    return details

if __name__ == "__main__":
    # Script entry point: read the session cookie and the optional search
    # parameters from the command line, then run the full collection.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-s",
        "--session_cookie",
        required=True,
        help="Value of your _crowdcontrol_session cookie",
    )
    cli.add_argument(
        "-p",
        "--search_params",
        required=False,
        default="sort[]=promoted-desc&hidden[]=false",
        help="Search params for the program search",
    )
    args = cli.parse_args()
    get_all_programs(args)