Outcome_grade, next_inspection

data-to-insight · Aug 7, 2024 · 8f3a235 · 8f3a235
1 parent 9cb3a52
commit 8f3a235
Show file tree

Hide file tree

Showing 3 changed files with 118 additions and 72 deletions.
diff --git a/index.html b/index.html
@@ -24,9 +24,9 @@
     </head>
     <body>
         <h1>Ofsted SEND Summary</h1>
-        <p>Summarised outcomes of published short and standard SEND inspection reports by Ofsted, refreshed weekly.<br/>An expanded version of the shown summary sheet, refreshed concurrently, is available to <a href="ofsted_childrens_services_overview.xlsx">download here</a> as an .xlsx file. <br/>Data summary is based on the original <i>SEND Outcomes Summary</i> published periodically by the ADCS: <a href="https://adcs.org.uk/inspection/article/ilacs-outcomes-summary">https://adcs.org.uk/inspection/article/ilacs-outcomes-summary</a>. <a href="https://github.com/data-to-insight/ofsted-send-scrape-tool/blob/main/README.md">Read the tool/project background details and future work.</a>.</p>
+        <p>Summarised outcomes of published short and standard SEND inspection reports by Ofsted, refreshed weekly.<br/>An expanded version of the shown summary sheet, refreshed concurrently, is available to <a href="ofsted_childrens_services_overview.xlsx">download here</a> as an .xlsx file. <br/>Data summary is based on the original <i>SEND Outcomes Summary</i> published periodically by the ADCS: <a href="https://www.adcs.org.uk/inspection-of-childrens-services/">https://www.adcs.org.uk/inspection-of-childrens-services/</a>. <a href="https://github.com/data-to-insight/ofsted-send-scrape-tool/blob/main/README.md">Read the tool/project background details and future work.</a>.</p>
         <p>Disclaimer: This summary is built from scraped data direct from https://reports.ofsted.gov.uk/ published PDF inspection report files. As a result of the nuances|variance within the inspection report content or pdf encoding, we're noting some problematic data extraction for a small number of LAs*.<br/> *Known LA extraction issues: <br/><a href="mailto:datatoinsight.enquiries@gmail.com?subject=Ofsted-Scrape-Tool">Feedback</a> on specific problems|inaccuracies|suggestions welcomed.*</p>
-        <p><b>Summary data last updated: 07 08 2024 13:57</b></p>
+        <p><b>Summary data last updated: 07 08 2024 16:38</b></p>
         <p><b>LA inspections last updated: []</b></p>
         <div class="container">
     <table border="1" class="dataframe">

diff --git a/ofsted_childrens_services_inspection_scrape.py b/ofsted_childrens_services_inspection_scrape.py
@@ -367,66 +367,6 @@ def extract_inspection_data_update(pdf_content):
         start_date_formatted = None
         end_date_formatted = None
 
-    # # Extract inspection judgements/grades
-    # #
-
-    # # Can be multiple tables on page 1(dodgy pdf formatting), ensure we only look at the 1st. 
-    # # 
-    # df = pd.DataFrame(tables[0])
-
-    # # Some initial clean-up / consistency checks
-    # df.columns = [col.lower().strip() for col in df.columns] # coerce consistent (headers)
-    # df = df.astype(str).applymap(lambda s: s.lower()) # coerce consistent (data+types)
-    # df = df.replace('\r', ' ', regex=True)
-
-
-    # # Check/enforce the expected grades table structure exists
-    # #
-
-    # # Expected headers exist?
-    # if not set(["judgement", "grade"]).issubset(df.columns):
-    #     # They dont't, so re-allign structure or replace(last resort
-    #     df = fix_invalid_judgement_table_structure(df)
-    #     #  If the df structure is unrecognisable/unfixable, a placeholder df with dummy vals is returned
-
-
-    # # We have a great deal of messy extracted data
-    # # incl multi-line judgement strings that don't line up with grade. Need to address this.
-    # df = fix_misalligned_judgement_table(df)  
-
-    # # Short-term fix
-    # # We have some remaining known anomolies remaining in grade value structure
-    # # This is a not-ideal brute force fix for those
-    # columns_to_replace_grade_val = ['grade', 'overall_effectiveness', 'impact_of_leaders', 'help_and_protection', 'in_care', 'care_leavers', 'in_care_and_care_leavers']
-
-    # for column in columns_to_replace_grade_val:
-    #     # handle just in-case we have a column naming mis-match 
-    #     if column in df.columns:
-    #         df[column] = df[column].replace({r'\b(be good\w*)\b': 'requires improvement', '(?i)nan': 'data_unreadable'}, regex=True)
-    #     else:
-    #         # [TESTING]
-    #         # print(f"Column '{column}' not found in the DataFrame.")
-    #         # print(df.columns)
-
-    #         # Log the column names instead of printing
-    #         logging.warning(f"Inspection date {start_date_formatted} / Column '{column}' not found in the DataFrame.")
-    #         logging.info(df.columns)
-
-
-    # # Get judgement-grades as dict
-    # inspection_grades_dict = dict(zip(df['judgement'], df['grade']))
-
-    # # Ensure not yet introduced judgement is consistent pre-introduction
-    # # new care_leavers judgement introduction date (1st January 2023)
-    # judgement_chg_date_care_leavers = parse_date("01 January 2023", '%d %B %Y')
-    # try:
-    #     # start_date_formatted is valid and pre the judgement introduction date
-    #     if start_date_formatted and start_date_formatted < judgement_chg_date_care_leavers:
-    #         # replace with default str val if inspection pre-dates judgement type
-    #         inspection_grades_dict['care_leavers'] = 'inspection_pre_dates_judgement' # reset/coerce consistency in val
-
-    # except TypeError: # invalid type
-    #     print("Date comparison failed due to invalid input.")
 
 
     return {
@@ -498,6 +438,7 @@ def extract_text_by_pages(pdf_bytes):
 
 def remove_unwanted_sections(pages_content):
      # supercedes extract_text_from_pdf in combo with extract_text_by_pages
+     # we know the last two pages of the reports are superfluous to content/outcome detail
     cleaned_pages = []
     heading_found = False
 
@@ -527,7 +468,7 @@ def extract_inspection_outcome_section(cleaned_text):
     if match:
         section = match.group(1).strip()
 
-        section = clean_text(section)
+        section = clean_text(section) # remove any further non-printing chars
 
         # Remove the last paragraph (assumes that more than 2 exist!)
         # This typically only states strategic progress publishing etc. 
@@ -541,6 +482,101 @@ def extract_inspection_outcome_section(cleaned_text):
         return "Inspection outcome section not found."
 
 
+def determine_outcome_grade(inspection_outcome_section):
+    grades = {
+        "positive experiences": 1,
+        "inconsistent experiences": 2,
+        "significant concerns": 3
+    }
+
+    for phrase, grade in grades.items():
+        if phrase in inspection_outcome_section:
+            return grade
+
+    return None  # If no matching phrase is found
+
+
+
+def extract_next_inspection(inspection_outcome_section):
+    pattern = re.compile(r"inspection will be within approximately (\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve) (years?|months?)", re.IGNORECASE)
+    match = pattern.search(inspection_outcome_section)
+
+    if match:
+        # Convert spelled-out numbers to digits, because the wording varies between reports,
+        # e.g. "5 years.", "18 months.", "3 years.", "three years."
+        number_map = {
+            "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+            "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
+            "eleven": 11, "twelve": 12
+        }
+        number_str = match.group(1).lower()
+        time_frame = number_map.get(number_str, number_str)  # Convert text to number if needed
+        unit = match.group(2).lower()
+
+        return f"{time_frame} {unit}"
+
+    return None  # If no matching time frame is found
+
+
+def calculate_next_inspection_by_date(inspection_end_date, next_inspection):
+    # Parse the inspection_end_date
+    end_date = datetime.strptime(inspection_end_date, "%d %B %Y")
+
+    # Extract number and unit from next_inspection
+    pattern = re.compile(r"(\d+) (years?|months?)", re.IGNORECASE)
+    match = pattern.search(next_inspection)
+
+    if match:
+        number = int(match.group(1))
+        unit = match.group(2).lower()
+
+        if 'year' in unit:
+            next_inspection_date = end_date + timedelta(days=365 * number)
+        elif 'month' in unit:
+            next_inspection_date = end_date + timedelta(days=30 * number)
+
+        return next_inspection_date.strftime("%d %B %Y")
+
+    return None
+
+
+def parse_inspection_date(date_string):
+    formats = ["%d %B %Y", "%d/%m/%Y"]
+    for fmt in formats:
+        try:
+            return datetime.strptime(date_string, fmt)
+        except ValueError:
+            continue
+    raise ValueError(f"Date format for {date_string} is not supported")
+
+def calculate_next_inspection_by_date(inspection_end_date, next_inspection):
+    if not next_inspection:
+        return "Next inspection time frame not found"
+
+    # Parse the inspection_end_date
+    try:
+        end_date = parse_inspection_date(inspection_end_date)
+    except ValueError as e:
+        return str(e)
+
+    # Extract number and unit from next_inspection
+    pattern = re.compile(r"(\d+) (years?|months?)", re.IGNORECASE)
+    match = pattern.search(next_inspection)
+
+    if match:
+        number = int(match.group(1))
+        unit = match.group(2).lower()
+
+        if 'year' in unit:
+            next_inspection_date = end_date + timedelta(days=365 * number)
+        elif 'month' in unit:
+            next_inspection_date = end_date + timedelta(days=30 * number)
+
+        return next_inspection_date.strftime("%d %B %Y")
+
+    return "Invalid next inspection time frame"
+
+
 
 def process_provider_links(provider_links):
     """
@@ -647,6 +683,13 @@ def process_provider_links(provider_links):
                 # print("\nInspection outcome section:")
                 # print(inspection_outcome_section)
 
+                # Determine the outcome grade
+                outcome_grade = determine_outcome_grade(inspection_outcome_section)
+                print(outcome_grade)
+
+                # Next inspection time-frame (comes back as f"{time_frame} {unit}")
+                next_inspection = extract_next_inspection(inspection_outcome_section)
+
                # Extract the local authority and inspection link, and add the data to the list
                 if not found_inspection_link:
 
@@ -701,16 +744,23 @@ def process_provider_links(provider_links):
 
                         print(f"{local_authority}") # Gives listing console output during run in the format 'data/inspection reports/urn name_of_la'
 
+
+                        next_inspection_by_date = calculate_next_inspection_by_date(inspection_end_date_formatted, next_inspection)
+
+
+
                         data.append({
                                         'urn': urn,
-                                        'local_authority': la_name_str,
-                                        'inspection_link': inspection_link,
-                                        # 'overall_effectiveness_grade': overall_effectiveness,
+                                        'local_authority':  la_name_str,
+                                        'inspection_link':  inspection_link,
+                                        'outcome_grade':    outcome_grade,
+                                        'next_inspection':  next_inspection,
                                         # 'inspection_framework': inspection_framework,
                                         # 'inspector_name': inspector_name,
                                         'inspection_start_date': inspection_start_date_formatted,
                                         'inspection_end_date': inspection_end_date_formatted,
                                         'publication_date': report_published_date,
+                                        'next_inspection_by_date': next_inspection_by_date,
                                         'local_link_to_all_inspections': provider_dir_link,
                                         'inspection_outcome_text': inspection_outcome_section,
 
@@ -754,7 +804,7 @@ def save_data_update(data, filename, file_type='csv', hyperlink_column = None):
 
         # Create a new workbook and add a worksheet
         workbook = xlsxwriter.Workbook(filename_with_extension)
-        sheet = workbook.add_worksheet('ofsted_cs_inspections_overview')  # pass the desired sheet name here
+        sheet = workbook.add_worksheet('ofsted_cs_send_inspections')  # pass the desired worksheet name here
 
         hyperlink_col_index = data.columns.get_loc(hyperlink_column) if hyperlink_column else None
 
@@ -810,7 +860,6 @@ def import_csv_from_folder(folder_name):
         return df
 
 
-
 def reposition_columns(df, key_col, cols_to_move):
     """
     Move one or more columns in a DataFrame to be immediately to the right 
@@ -889,7 +938,7 @@ def save_to_html(data, column_order, local_link_column=None, web_link_column=Non
         'An expanded version of the shown summary sheet, refreshed concurrently, is available to '
         '<a href="ofsted_childrens_services_overview.xlsx">download here</a> as an .xlsx file. '
         '<br/>Data summary is based on the original <i>SEND Outcomes Summary</i> published periodically by the ADCS: '
-        '<a href="https://adcs.org.uk/inspection/article/ilacs-outcomes-summary">https://adcs.org.uk/inspection/article/ilacs-outcomes-summary</a>. '
+        '<a href="https://www.adcs.org.uk/inspection-of-childrens-services/">https://www.adcs.org.uk/inspection-of-childrens-services/</a>. '
         '<a href="https://github.com/data-to-insight/ofsted-send-scrape-tool/blob/main/README.md">Read the tool/project background details and future work.</a>.'
     )
 
@@ -1035,9 +1084,6 @@ def save_to_html(data, column_order, local_link_column=None, web_link_column=Non
 
 
 
-
-
-
 #
 # Scrape Ofsted inspection report data
 #

diff --git a/ofsted_childrens_services_send_overview.xlsx b/ofsted_childrens_services_send_overview.xlsx