diff --git a/index.html b/index.html
index 0f507ac..726265b 100644
--- a/index.html
+++ b/index.html
@@ -24,9 +24,9 @@
Ofsted SEND Summary
- Summarised outcomes of published short and standard SEND inspection reports by Ofsted, refreshed weekly.
An expanded version of the shown summary sheet, refreshed concurrently, is available to download here as an .xlsx file.
Data summary is based on the original SEND Outcomes Summary published periodically by the ADCS: https://adcs.org.uk/inspection/article/ilacs-outcomes-summary. Read the tool/project background details and future work..
+ Summarised outcomes of published short and standard SEND inspection reports by Ofsted, refreshed weekly.
An expanded version of the shown summary sheet, refreshed concurrently, is available to download here as an .xlsx file.
Data summary is based on the original SEND Outcomes Summary published periodically by the ADCS: https://www.adcs.org.uk/inspection-of-childrens-services/. Read the tool/project background details and future work..
Disclaimer: This summary is built from scraped data direct from https://reports.ofsted.gov.uk/ published PDF inspection report files. As a result of the nuances|variance within the inspection report content or pdf encoding, we're noting some problematic data extraction for a small number of LAs*.
*Known LA extraction issues:
Feedback on specific problems|inaccuracies|suggestions welcomed.*
- Summary data last updated: 07 08 2024 13:57
+ Summary data last updated: 07 08 2024 16:38
LA inspections last updated: []
diff --git a/ofsted_childrens_services_inspection_scrape.py b/ofsted_childrens_services_inspection_scrape.py
index 20ad0b6..7b37692 100644
--- a/ofsted_childrens_services_inspection_scrape.py
+++ b/ofsted_childrens_services_inspection_scrape.py
@@ -367,66 +367,6 @@ def extract_inspection_data_update(pdf_content):
start_date_formatted = None
end_date_formatted = None
- # # Extract inspection judgements/grades
- # #
-
- # # Can be multiple tables on page 1(dodgy pdf formatting), ensure we only look at the 1st.
- # #
- # df = pd.DataFrame(tables[0])
-
- # # Some initial clean-up / consistency checks
- # df.columns = [col.lower().strip() for col in df.columns] # coerce consistent (headers)
- # df = df.astype(str).applymap(lambda s: s.lower()) # coerce consistent (data+types)
- # df = df.replace('\r', ' ', regex=True)
-
-
- # # Check/enforce the expected grades table structure exists
- # #
-
- # # Expected headers exist?
- # if not set(["judgement", "grade"]).issubset(df.columns):
- # # They dont't, so re-allign structure or replace(last resort
- # df = fix_invalid_judgement_table_structure(df)
- # # If the df structure is unrecognisable/unfixable, a placeholder df with dummy vals is returned
-
-
- # # We have a great deal of messy extracted data
- # # incl multi-line judgement strings that don't line up with grade. Need to address this.
- # df = fix_misalligned_judgement_table(df)
-
- # # Short-term fix
- # # We have some remaining known anomolies remaining in grade value structure
- # # This is a not-ideal brute force fix for those
- # columns_to_replace_grade_val = ['grade', 'overall_effectiveness', 'impact_of_leaders', 'help_and_protection', 'in_care', 'care_leavers', 'in_care_and_care_leavers']
-
- # for column in columns_to_replace_grade_val:
- # # handle just in-case we have a column naming mis-match
- # if column in df.columns:
- # df[column] = df[column].replace({r'\b(be good\w*)\b': 'requires improvement', '(?i)nan': 'data_unreadable'}, regex=True)
- # else:
- # # [TESTING]
- # # print(f"Column '{column}' not found in the DataFrame.")
- # # print(df.columns)
-
- # # Log the column names instead of printing
- # logging.warning(f"Inspection date {start_date_formatted} / Column '{column}' not found in the DataFrame.")
- # logging.info(df.columns)
-
-
- # # Get judgement-grades as dict
- # inspection_grades_dict = dict(zip(df['judgement'], df['grade']))
-
- # # Ensure not yet introduced judgement is consistent pre-introduction
- # # new care_leavers judgement introduction date (1st January 2023)
- # judgement_chg_date_care_leavers = parse_date("01 January 2023", '%d %B %Y')
- # try:
- # # start_date_formatted is valid and pre the judgement introduction date
- # if start_date_formatted and start_date_formatted < judgement_chg_date_care_leavers:
- # # replace with default str val if inspection pre-dates judgement type
- # inspection_grades_dict['care_leavers'] = 'inspection_pre_dates_judgement' # reset/coerce consistency in val
-
- # except TypeError: # invalid type
- # print("Date comparison failed due to invalid input.")
return {
@@ -498,6 +438,7 @@ def extract_text_by_pages(pdf_bytes):
def remove_unwanted_sections(pages_content):
# supercedes extract_text_from_pdf in combo with extract_text_by_pages
+ # we know the last two pages of the reports are superfluous to content/outcome detail
cleaned_pages = []
heading_found = False
@@ -527,7 +468,7 @@ def extract_inspection_outcome_section(cleaned_text):
if match:
section = match.group(1).strip()
- section = clean_text(section)
+ section = clean_text(section) # remove further non-printing chars
# Remove the last paragraph (assumes that more than 2 exist!)
# This typically only states strategic progress publishing etc.
@@ -541,6 +482,101 @@ def extract_inspection_outcome_section(cleaned_text):
return "Inspection outcome section not found."
+def determine_outcome_grade(inspection_outcome_section):
+ grades = {
+ "positive experiences": 1,
+ "inconsistent experiences": 2,
+ "significant concerns": 3
+ }
+
+ for phrase, grade in grades.items():
+ if phrase in inspection_outcome_section:
+ return grade
+
+ return None # If no matching phrase is found
+
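+# Illustrative examples (the phrases are those searched for above; surrounding wording is assumed):
+#   determine_outcome_grade("... positive experiences ...")     -> 1
+#   determine_outcome_grade("... inconsistent experiences ...") -> 2
+#   determine_outcome_grade("... significant concerns ...")     -> 3
+#   determine_outcome_grade("no recognised outcome phrase")     -> None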
+
+
+def extract_next_inspection(inspection_outcome_section):
+ pattern = re.compile(r"inspection will be within approximately (\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve) (years?|months?)", re.IGNORECASE)
+ match = pattern.search(inspection_outcome_section)
+
+ if match:
+ # Convert text numbers to numeric
+ # as in-text variations are possible, e.g. '5 years.', '18 months.', '3 years.', 'three years.'
+ number_map = {
+ "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
+ "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
+ "eleven": 11, "twelve": 12
+ }
+ number_str = match.group(1).lower()
+ time_frame = number_map.get(number_str, number_str) # Convert text to number if needed
+ unit = match.group(2).lower()
+
+ return f"{time_frame} {unit}"
+
+ return None # If no matching time frame is found
+
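+# Illustrative examples against the pattern above (report wording assumed):
+#   "... next inspection will be within approximately three years." -> "3 years"
+#   "... next inspection will be within approximately 18 months."   -> "18 months"
+#   no "within approximately ..." phrase found                      -> None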
+
+def parse_inspection_date(date_string):
+ formats = ["%d %B %Y", "%d/%m/%Y"]
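+ # accepts e.g. "21 June 2024" or "21/06/2024" (illustrative values)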
+ for fmt in formats:
+ try:
+ return datetime.strptime(date_string, fmt)
+ except ValueError:
+ continue
+ raise ValueError(f"Date format for {date_string} is not supported")
+
+def calculate_next_inspection_by_date(inspection_end_date, next_inspection):
+ if not next_inspection:
+ return "Next inspection time frame not found"
+
+ # Parse the inspection_end_date
+ try:
+ end_date = parse_inspection_date(inspection_end_date)
+ except ValueError as e:
+ return str(e)
+
+ # Extract number and unit from next_inspection
+ pattern = re.compile(r"(\d+) (years?|months?)", re.IGNORECASE)
+ match = pattern.search(next_inspection)
+
+ if match:
+ number = int(match.group(1))
+ unit = match.group(2).lower()
+
+ if 'year' in unit:
+ next_inspection_date = end_date + timedelta(days=365 * number)
+ elif 'month' in unit:
+ next_inspection_date = end_date + timedelta(days=30 * number)
+
+ return next_inspection_date.strftime("%d %B %Y")
+
+ return "Invalid next inspection time frame"
+
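+# Note: the offsets above are approximations (365-day years / 30-day months), so the
+# projected date can drift slightly around leap years, e.g. (illustrative):
+#   calculate_next_inspection_by_date("01 March 2023", "2 years") -> "28 February 2025"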
+
def process_provider_links(provider_links):
"""
@@ -647,6 +683,13 @@ def process_provider_links(provider_links):
# print("\nInspection outcome section:")
# print(inspection_outcome_section)
+ # Determine the outcome grade
+ outcome_grade = determine_outcome_grade(inspection_outcome_section)
+ # print(outcome_grade) # [TESTING]
+
+ # Next inspection time-frame (comes back as f"{time_frame} {unit}")
+ next_inspection = extract_next_inspection(inspection_outcome_section)
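+ # Both values are carried into the summary output below as 'outcome_grade',
+ # 'next_inspection' and (derived later) 'next_inspection_by_date'.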
+
# Extract the local authority and inspection link, and add the data to the list
if not found_inspection_link:
@@ -701,16 +744,23 @@ def process_provider_links(provider_links):
print(f"{local_authority}") # Gives listing console output during run in the format 'data/inspection reports/urn name_of_la'
+
+ next_inspection_by_date = calculate_next_inspection_by_date(inspection_end_date_formatted, next_inspection)
+
data.append({
'urn': urn,
- 'local_authority': la_name_str,
- 'inspection_link': inspection_link,
- # 'overall_effectiveness_grade': overall_effectiveness,
+ 'local_authority': la_name_str,
+ 'inspection_link': inspection_link,
+ 'outcome_grade': outcome_grade,
+ 'next_inspection': next_inspection,
# 'inspection_framework': inspection_framework,
# 'inspector_name': inspector_name,
'inspection_start_date': inspection_start_date_formatted,
'inspection_end_date': inspection_end_date_formatted,
'publication_date': report_published_date,
+ 'next_inspection_by_date': next_inspection_by_date,
'local_link_to_all_inspections': provider_dir_link,
'inspection_outcome_text': inspection_outcome_section,
@@ -754,7 +804,7 @@ def save_data_update(data, filename, file_type='csv', hyperlink_column = None):
# Create a new workbook and add a worksheet
workbook = xlsxwriter.Workbook(filename_with_extension)
- sheet = workbook.add_worksheet('ofsted_cs_inspections_overview') # pass the desired sheet name here
+ sheet = workbook.add_worksheet('ofsted_cs_send_inspections') # pass the desired worksheet name here
hyperlink_col_index = data.columns.get_loc(hyperlink_column) if hyperlink_column else None
@@ -810,7 +860,6 @@ def import_csv_from_folder(folder_name):
return df
-
def reposition_columns(df, key_col, cols_to_move):
"""
Move one or more columns in a DataFrame to be immediately to the right
@@ -889,7 +938,7 @@ def save_to_html(data, column_order, local_link_column=None, web_link_column=Non
'An expanded version of the shown summary sheet, refreshed concurrently, is available to '
'download here as an .xlsx file. '
'Data summary is based on the original SEND Outcomes Summary published periodically by the ADCS: '
- 'https://adcs.org.uk/inspection/article/ilacs-outcomes-summary. '
+ 'https://www.adcs.org.uk/inspection-of-childrens-services/. '
'Read the tool/project background details and future work..'
)
@@ -1035,9 +1084,6 @@ def save_to_html(data, column_order, local_link_column=None, web_link_column=Non
-
-
-
#
# Scrape Ofsted inspection report data
#
diff --git a/ofsted_childrens_services_send_overview.xlsx b/ofsted_childrens_services_send_overview.xlsx
index 91a6a6d..f0d7e3c 100644
Binary files a/ofsted_childrens_services_send_overview.xlsx and b/ofsted_childrens_services_send_overview.xlsx differ