-
Notifications
You must be signed in to change notification settings - Fork 3
/
06_Score_GPT_outputs.wls
executable file
·84 lines (60 loc) · 2.63 KB
/
06_Score_GPT_outputs.wls
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env wolframscript
(* ::Package:: *)
(* create summary scores for the GPT-3.5 and GPT-4 outputs *)
(*** define functions for summarization ***)
(* Summarize one cross-validation result file.
   Reads the JSON file f and returns an Association holding the data source
   name (metadata -> datasource), the fraction of results flagged correctQ,
   the number of correct results, and the total number of results. *)
summarize[f_?FileExistsQ] := Module[
  {contents, hits},
  contents = Import[f, "RawJSON"];
  (* 1 for every entry whose correctQ is True, 0 otherwise *)
  hits = Boole[Query["results", All, "correctQ"][contents]];
  Association[
    "dataset" -> Query["metadata", "datasource"][contents],
    "accuracy" -> N[Mean[hits]],
    "n_correct" -> Total[hits],
    "n_test" -> Query["results", Length][contents]]]
(* Summarize every result file of one grouping (e.g., top1, top5) in a folder.
   Each file named <group>_*.json inside folder is summarized, and the list of
   summaries is written to <folder>/summary_<group>.json.  Returns Null. *)
summarize[folder_, group_] := Module[
  {inputs, target},
  inputs = FileNames[StringJoin[group, "_*.json"], folder];
  target = StringJoin[folder, "/summary_", group, ".json"];
  Export[target, summarize /@ inputs, "Compact" -> 2];]
(* compute the summary statistics *)
(* Work relative to the folder that contains this script.  NotebookDirectory[]
   needs a front end and yields $Failed under wolframscript, so prefer the
   script path whenever the kernel is reading this file; fall back to the
   notebook folder in a front-end session, and to the current directory
   otherwise. *)
SetDirectory@ Which[
  StringQ[$InputFileName] && $InputFileName =!= "",
    DirectoryName@ ExpandFileName@ $InputFileName,
  TrueQ[$Notebooks],
    NotebookDirectory[],
  True,
    Directory[]];
(* result folders to score, and the groupings summarized inside each of them *)
directories = {"./results/gpt-3.5", "./results/gpt-3.5_finetune", "./results/gpt-4"};
groups = {"top1", "top5"};
(* run the folder-level summarize on every (folder, grouping) pair *)
Outer[summarize, directories, groups];
(***
Despite instructions not to do so, the GPT models sometimes add erroneous O2 or H2O
to their predictions, which causes the prediction not to match the answer.
Strip these species, rescore the affected entries, and recompute the summaries...
***)
(* make a copy of the original results *)
(* CopyDirectory refuses to write into an existing target, so on a rerun the
   old copy would silently be kept and rescored again.  Remove any previous
   "_rescore" copy first so each run starts from the current original results. *)
Function[dir,
  With[{copy = dir <> "_rescore"},
    If[DirectoryQ[copy], DeleteDirectory[copy, DeleteContents -> True]];
    CopyDirectory[dir, copy]]] /@ directories;
(*** define rescoring functions ***)
(* remove offending species *)
(* Both helpers drop every occurrence of the named species at any depth of a
   (possibly nested) list and leave the remaining elements in place. *)
removeO2[species_List] := DeleteCases[species, "O2", {1, Infinity}]
removeH2O[species_List] := DeleteCases[species, "H2O", {1, Infinity}]
(* Rescore a single result entry.
   Strips "O2" and "H2O" from the entry's "prediction", compares the cleaned
   prediction against the entry's "answer", and returns the entry with
   "correctQ" and "prediction" replaced; every other key is carried over. *)
(* NOTE(review): precursorMatchQ is not defined in this file -- confirm that
   its definition is loaded before this script runs. *)
rescore[row_Association] := With[
  {truth = row["answer"],
   cleaned = removeH2O[removeO2[row["prediction"]]]},
  (* rules listed after row replace the values of keys already present, see:
     https://mathematica.stackexchange.com/a/54745/63709 *)
  Association[
    row,
    "correctQ" -> precursorMatchQ[truth, cleaned],
    "prediction" -> cleaned]]
(* Rescore a whole list of entries by rescoring each element in turn. *)
rescore[data_List] := Map[rescore, data]
(* Return the metadata with "date" set to the current time (as an
   "ISODateTime" string) and "notes" set to record the O2/H2O rescoring;
   all other keys of the original metadata are kept. *)
updateMetadata[original_Association] := Association[
  original,
  "date" -> DateString["ISODateTime"],
  "notes" -> "rescored after removing O2 and H2O"]
(* File overload: rescore the JSON result file f and overwrite it in place.
   The file is read, its "metadata" is passed through updateMetadata, its
   "results" are rescored, and the two are written back to the same path. *)
rescore[f_?FileExistsQ] := With[
  {contents = Import[f, "RawJSON"]},
  Export[
    f,
    Association[
      "metadata" -> updateMetadata[Lookup[contents, "metadata"]],
      "results" -> rescore[Lookup[contents, "results"]]],
    "Compact" -> 2]]
(* apply to the new files *)
(* rescore, in place, every "*cv*.json" file that FileSystemScan finds under
   each "_rescore" copy *)
Scan[
  FileSystemScan[rescore, # <> "_rescore", FileNameForms -> "*cv*.json"]&,
  directories];
(* recompute summary statistics for the rescored copies *)
Outer[summarize, Map[# <> "_rescore"&, directories], groups];
(* summarize the random baseline result for each grouping *)
Scan[summarize["./results/statistical_baseline", #]&, groups];