Skip to content

Commit

Permalink
Outcome_grade, next_inspection
Browse files Browse the repository at this point in the history
  • Loading branch information
robjharrison committed Aug 7, 2024
1 parent 9cb3a52 commit 8f3a235
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 72 deletions.
4 changes: 2 additions & 2 deletions index.html
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@
</head>
<body>
<h1>Ofsted SEND Summary</h1>
<p>Summarised outcomes of published short and standard SEND inspection reports by Ofsted, refreshed weekly.<br/>An expanded version of the shown summary sheet, refreshed concurrently, is available to <a href="ofsted_childrens_services_overview.xlsx">download here</a> as an .xlsx file. <br/>Data summary is based on the original <i>SEND Outcomes Summary</i> published periodically by the ADCS: <a href="https://adcs.org.uk/inspection/article/ilacs-outcomes-summary">https://adcs.org.uk/inspection/article/ilacs-outcomes-summary</a>. <a href="https://github.com/data-to-insight/ofsted-send-scrape-tool/blob/main/README.md">Read the tool/project background details and future work.</a>.</p>
<p>Summarised outcomes of published short and standard SEND inspection reports by Ofsted, refreshed weekly.<br/>An expanded version of the shown summary sheet, refreshed concurrently, is available to <a href="ofsted_childrens_services_overview.xlsx">download here</a> as an .xlsx file. <br/>Data summary is based on the original <i>SEND Outcomes Summary</i> published periodically by the ADCS: <a href="https://www.adcs.org.uk/inspection-of-childrens-services/">https://www.adcs.org.uk/inspection-of-childrens-services/</a>. <a href="https://github.com/data-to-insight/ofsted-send-scrape-tool/blob/main/README.md">Read the tool/project background details and future work.</a>.</p>
<p>Disclaimer: This summary is built from scraped data direct from https://reports.ofsted.gov.uk/ published PDF inspection report files. As a result of the nuances|variance within the inspection report content or pdf encoding, we're noting some problematic data extraction for a small number of LAs*.<br/> *Known LA extraction issues: <br/><a href="mailto:datatoinsight.enquiries@gmail.com?subject=Ofsted-Scrape-Tool">Feedback</a> on specific problems|inaccuracies|suggestions welcomed.*</p>
<p><b>Summary data last updated: 07 08 2024 13:57</b></p>
<p><b>Summary data last updated: 07 08 2024 16:38</b></p>
<p><b>LA inspections last updated: []</b></p>
<div class="container">
<table border="1" class="dataframe">
Expand Down
186 changes: 116 additions & 70 deletions ofsted_childrens_services_inspection_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,66 +367,6 @@ def extract_inspection_data_update(pdf_content):
start_date_formatted = None
end_date_formatted = None

# # Extract inspection judgements/grades
# #

# # Can be multiple tables on page 1(dodgy pdf formatting), ensure we only look at the 1st.
# #
# df = pd.DataFrame(tables[0])

# # Some initial clean-up / consistency checks
# df.columns = [col.lower().strip() for col in df.columns] # coerce consistent (headers)
# df = df.astype(str).applymap(lambda s: s.lower()) # coerce consistent (data+types)
# df = df.replace('\r', ' ', regex=True)


# # Check/enforce the expected grades table structure exists
# #

# # Expected headers exist?
# if not set(["judgement", "grade"]).issubset(df.columns):
# # They dont't, so re-allign structure or replace(last resort
# df = fix_invalid_judgement_table_structure(df)
# # If the df structure is unrecognisable/unfixable, a placeholder df with dummy vals is returned


# # We have a great deal of messy extracted data
# # incl multi-line judgement strings that don't line up with grade. Need to address this.
# df = fix_misalligned_judgement_table(df)

# # Short-term fix
# # We have some remaining known anomolies remaining in grade value structure
# # This is a not-ideal brute force fix for those
# columns_to_replace_grade_val = ['grade', 'overall_effectiveness', 'impact_of_leaders', 'help_and_protection', 'in_care', 'care_leavers', 'in_care_and_care_leavers']

# for column in columns_to_replace_grade_val:
# # handle just in-case we have a column naming mis-match
# if column in df.columns:
# df[column] = df[column].replace({r'\b(be good\w*)\b': 'requires improvement', '(?i)nan': 'data_unreadable'}, regex=True)
# else:
# # [TESTING]
# # print(f"Column '{column}' not found in the DataFrame.")
# # print(df.columns)

# # Log the column names instead of printing
# logging.warning(f"Inspection date {start_date_formatted} / Column '{column}' not found in the DataFrame.")
# logging.info(df.columns)


# # Get judgement-grades as dict
# inspection_grades_dict = dict(zip(df['judgement'], df['grade']))

# # Ensure not yet introduced judgement is consistent pre-introduction
# # new care_leavers judgement introduction date (1st January 2023)
# judgement_chg_date_care_leavers = parse_date("01 January 2023", '%d %B %Y')
# try:
# # start_date_formatted is valid and pre the judgement introduction date
# if start_date_formatted and start_date_formatted < judgement_chg_date_care_leavers:
# # replace with default str val if inspection pre-dates judgement type
# inspection_grades_dict['care_leavers'] = 'inspection_pre_dates_judgement' # reset/coerce consistency in val

# except TypeError: # invalid type
# print("Date comparison failed due to invalid input.")


return {
Expand Down Expand Up @@ -498,6 +438,7 @@ def extract_text_by_pages(pdf_bytes):

def remove_unwanted_sections(pages_content):
# supersedes extract_text_from_pdf in combo with extract_text_by_pages
# we know the last two pages of the reports are superfluous to content/outcome detail
cleaned_pages = []
heading_found = False

Expand Down Expand Up @@ -527,7 +468,7 @@ def extract_inspection_outcome_section(cleaned_text):
if match:
section = match.group(1).strip()

section = clean_text(section)
section = clean_text(section) # rem further non-printing chars

# Remove the last paragraph (assumes that more than 2 exist!)
# This typically only states strategic progress publishing etc.
Expand All @@ -541,6 +482,101 @@ def extract_inspection_outcome_section(cleaned_text):
return "Inspection outcome section not found."


def determine_outcome_grade(inspection_outcome_section):
    """
    Map the inspection outcome text onto a numeric outcome grade.

    The text is searched for one of three key phrases and the grade paired
    with the first phrase found (checked in the order listed below) is returned:

        "positive experiences"     -> 1
        "inconsistent experiences" -> 2
        "significant concerns"     -> 3

    Matching is case-insensitive and ignores differences in whitespace, because
    text extracted from a PDF can wrap a phrase across a line break or contain
    doubled spaces.

    Args:
        inspection_outcome_section (str): Text of the inspection outcome section.

    Returns:
        int or None: 1, 2 or 3 when a key phrase is present; None when no phrase
        is found or when the input is not a string.
    """
    if not isinstance(inspection_outcome_section, str):
        # e.g. None when no outcome section could be extracted upstream
        return None

    grades = {
        "positive experiences": 1,
        "inconsistent experiences": 2,
        "significant concerns": 3,
    }

    # Collapse every run of whitespace (incl. newlines) to a single space and
    # lower-case, so "Positive\nexperiences" still matches "positive experiences"
    normalised_text = " ".join(inspection_outcome_section.split()).lower()

    for phrase, grade in grades.items():
        if phrase in normalised_text:
            return grade

    return None  # If no matching phrase is found



def extract_next_inspection(inspection_outcome_section):
    """
    Extract the stated time frame until the next inspection.

    Looks for the wording "inspection will be within approximately <n> <unit>",
    where <n> is either digits or a number word from "one" to "twelve", and
    <unit> is year(s) or month(s). In-text variations seen include
    "5 years.", "18 months.", "3 years." and "three years.".

    Matching is case-insensitive and accepts any whitespace (including line
    breaks) between the words, because text extracted from a PDF can wrap
    mid-sentence.

    Args:
        inspection_outcome_section (str): Text of the inspection outcome section.

    Returns:
        str or None: The time frame as "<number> <unit>" with the number in
        digits and the unit lower-cased (e.g. "3 years", "18 months"); None when
        the wording is not found or when the input is not a string.
    """
    if not isinstance(inspection_outcome_section, str):
        # e.g. None when no outcome section could be extracted upstream
        return None

    pattern = re.compile(
        r"inspection\s+will\s+be\s+within\s+approximately\s+"
        r"(\d+|one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve)"
        r"\s+(years?|months?)",
        re.IGNORECASE,
    )
    match = pattern.search(inspection_outcome_section)

    if not match:
        return None  # If no matching time frame is found

    # Convert text numbers to numeric; digits pass through unchanged
    number_map = {
        "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
        "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
        "eleven": 11, "twelve": 12,
    }
    number_str = match.group(1).lower()
    time_frame = number_map.get(number_str, number_str)
    unit = match.group(2).lower()

    return f"{time_frame} {unit}"


def parse_inspection_date(date_string):
    """
    Parse an inspection date given in one of the supported text formats.

    Supported formats are "%d %B %Y" (e.g. "07 August 2024") and
    "%d/%m/%Y" (e.g. "07/08/2024"). Surrounding whitespace is ignored.
    A value that is already a datetime is returned unchanged.

    Args:
        date_string (str or datetime): The date to parse.

    Returns:
        datetime: The parsed date.

    Raises:
        ValueError: If the value is not a string in a supported format
            (this includes None), so callers only need to handle one
            exception type.
    """
    if isinstance(date_string, datetime):
        return date_string

    if isinstance(date_string, str):
        candidate = date_string.strip()
        for fmt in ("%d %B %Y", "%d/%m/%Y"):
            try:
                return datetime.strptime(candidate, fmt)
            except ValueError:
                continue

    raise ValueError(f"Date format for {date_string} is not supported")


def _add_calendar_months(start_date, months):
    """
    Return start_date moved forward by a whole number of calendar months.

    The day of month is kept where possible and otherwise clamped to the last
    day of the target month (e.g. 31 August + 6 months -> 28/29 February).
    """
    import calendar  # local import: keeps this block independent of the file's import section

    month_index = start_date.month - 1 + months
    target_year = start_date.year + month_index // 12
    target_month = month_index % 12 + 1
    last_day = calendar.monthrange(target_year, target_month)[1]

    return start_date.replace(
        year=target_year,
        month=target_month,
        day=min(start_date.day, last_day),
    )


def calculate_next_inspection_by_date(inspection_end_date, next_inspection):
    """
    Work out the date by which the next inspection is due.

    The time frame is added to the inspection end date using calendar
    arithmetic (whole months/years), so "5 years" after 01 March 2024 is
    01 March 2029. Adding fixed blocks of 365 or 30 days drifts away from the
    anniversary date whenever a leap year or a 31-day month is crossed.

    Args:
        inspection_end_date (str or datetime): Inspection end date, in a format
            accepted by parse_inspection_date.
        next_inspection (str): Time frame as "<number> <unit>", where unit is
            year(s) or month(s), e.g. "3 years" or "18 months".

    Returns:
        str: The due date formatted as "%d %B %Y" on success. Otherwise one of
        the following messages is returned in place of a date:
            - "Next inspection time frame not found" (next_inspection is empty/None)
            - "Date format for <value> is not supported" (end date unparseable)
            - "Invalid next inspection time frame" (time frame not recognised,
              or the resulting date is out of range)
    """
    if not next_inspection:
        return "Next inspection time frame not found"

    # Parse the inspection_end_date
    try:
        end_date = parse_inspection_date(inspection_end_date)
    except ValueError as e:
        return str(e)

    # Extract number and unit from next_inspection
    match = re.search(r"(\d+)\s+(years?|months?)", next_inspection, re.IGNORECASE)
    if not match:
        return "Invalid next inspection time frame"

    number = int(match.group(1))
    unit = match.group(2).lower()
    months_ahead = number * 12 if 'year' in unit else number

    try:
        next_inspection_date = _add_calendar_months(end_date, months_ahead)
    except ValueError:
        # Resulting year falls outside the range datetime can represent
        return "Invalid next inspection time frame"

    return next_inspection_date.strftime("%d %B %Y")



def process_provider_links(provider_links):
"""
Expand Down Expand Up @@ -647,6 +683,13 @@ def process_provider_links(provider_links):
# print("\nInspection outcome section:")
# print(inspection_outcome_section)

# Determine the outcome grade
outcome_grade = determine_outcome_grade(inspection_outcome_section)
print(outcome_grade)

# Next inspection time-frame (comes back as f"{time_frame} {unit}")
next_inspection = extract_next_inspection(inspection_outcome_section)

# Extract the local authority and inspection link, and add the data to the list
if not found_inspection_link:

Expand Down Expand Up @@ -701,16 +744,23 @@ def process_provider_links(provider_links):

print(f"{local_authority}") # Gives listing console output during run in the format 'data/inspection reports/urn name_of_la'


next_inspection_by_date = calculate_next_inspection_by_date(inspection_end_date_formatted, next_inspection)



data.append({
'urn': urn,
'local_authority': la_name_str,
'inspection_link': inspection_link,
# 'overall_effectiveness_grade': overall_effectiveness,
'local_authority': la_name_str,
'inspection_link': inspection_link,
'outcome_grade': outcome_grade,
'next_inspection': next_inspection,
# 'inspection_framework': inspection_framework,
# 'inspector_name': inspector_name,
'inspection_start_date': inspection_start_date_formatted,
'inspection_end_date': inspection_end_date_formatted,
'publication_date': report_published_date,
'next_inspection_by_date': next_inspection_by_date,
'local_link_to_all_inspections': provider_dir_link,
'inspection_outcome_text': inspection_outcome_section,

Expand Down Expand Up @@ -754,7 +804,7 @@ def save_data_update(data, filename, file_type='csv', hyperlink_column = None):

# Create a new workbook and add a worksheet
workbook = xlsxwriter.Workbook(filename_with_extension)
sheet = workbook.add_worksheet('ofsted_cs_inspections_overview') # pass the desired sheet name here
sheet = workbook.add_worksheet('ofsted_cs_send_inspections') # pass the desired worksheet name here

hyperlink_col_index = data.columns.get_loc(hyperlink_column) if hyperlink_column else None

Expand Down Expand Up @@ -810,7 +860,6 @@ def import_csv_from_folder(folder_name):
return df



def reposition_columns(df, key_col, cols_to_move):
"""
Move one or more columns in a DataFrame to be immediately to the right
Expand Down Expand Up @@ -889,7 +938,7 @@ def save_to_html(data, column_order, local_link_column=None, web_link_column=Non
'An expanded version of the shown summary sheet, refreshed concurrently, is available to '
'<a href="ofsted_childrens_services_overview.xlsx">download here</a> as an .xlsx file. '
'<br/>Data summary is based on the original <i>SEND Outcomes Summary</i> published periodically by the ADCS: '
'<a href="https://adcs.org.uk/inspection/article/ilacs-outcomes-summary">https://adcs.org.uk/inspection/article/ilacs-outcomes-summary</a>. '
'<a href="https://www.adcs.org.uk/inspection-of-childrens-services/">https://www.adcs.org.uk/inspection-of-childrens-services/</a>. '
'<a href="https://github.com/data-to-insight/ofsted-send-scrape-tool/blob/main/README.md">Read the tool/project background details and future work.</a>.'
)

Expand Down Expand Up @@ -1035,9 +1084,6 @@ def save_to_html(data, column_order, local_link_column=None, web_link_column=Non






#
# Scrape Ofsted inspection report data
#
Expand Down
Binary file modified ofsted_childrens_services_send_overview.xlsx
Binary file not shown.

0 comments on commit 8f3a235

Please sign in to comment.