Skip to content

Commit

Permalink
Revise subject field mappings (#1291)
Browse files Browse the repository at this point in the history
* Revise subject field mappings

* Fix subject mapping value

* Add specs

* Add reused method to extraction tools
  • Loading branch information
abelemlih authored Aug 26, 2022
1 parent d57e4c4 commit 5251e9d
Show file tree
Hide file tree
Showing 9 changed files with 328 additions and 24 deletions.
18 changes: 13 additions & 5 deletions lib/marc_indexer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
require 'traject/extract_published'
require 'traject/extract_publisher_details_display'
require 'traject/extract_subject_display'
require 'traject/extract_subject'
require 'traject/extract_subject_geo'
require 'traject/extract_subject_era'
require 'traject/extract_genre'
require 'traject/extract_title_details_display'
require 'traject/extract_title_main_first_char'
require 'traject/extract_title_precise'
Expand Down Expand Up @@ -78,6 +82,10 @@
extend ExtractPublished
extend ExtractPublisherDetailsDisplay
extend ExtractSubjectDisplay
extend ExtractSubjectEra
extend ExtractSubjectGeo
extend ExtractGenre
extend ExtractSubject
extend ExtractTitleDetailsDisplay
extend ExtractTitleMainFirstChar
extend ExtractTitlePrecise
Expand Down Expand Up @@ -206,14 +214,14 @@
to_field 'author_vern_tesim', extract_author_vern

# Subject Fields
to_field 'subject_display_ssim', extract_subject_display(ATOZ, ATOG, VTOZ)
to_field 'subject_era_ssim', extract_marc("650y:651y:654y:655y"), trim_punctuation
to_field 'subject_geo_ssim', extract_marc("651a:650z"), trim_punctuation
to_field 'subject_ssim', extract_marc("600abcdq:610ab:611adc:630aa:650aa:653aa:654a"), trim_punctuation
to_field 'subject_display_ssim', extract_subject_display
to_field 'subject_era_ssim', extract_subject_era
to_field 'subject_geo_ssim', extract_subject_geo
to_field 'subject_ssim', extract_subject, trim_punctuation
to_field 'subject_tesim', extract_marc(subject_tesim_str(ATOZ))

# Genre Fields
to_field 'genre_ssim', extract_marc("655a"), trim_punctuation
to_field 'genre_ssim', extract_genre

# Publication Fields
to_field 'note_publication_dates_tesim', extract_marc('362a')
Expand Down
31 changes: 31 additions & 0 deletions lib/traject/extract_genre.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true
require 'traject/extraction_tools'
extend ExtractionTools

# Traject extraction macro for genre terms taken from MARC 655 $a.
#
# The methods call `marc21` and `extract_value`, which are not defined in this
# module; presumably the extending object gets them from ExtractionTools
# (required above) -- confirm when reusing this module elsewhere.
module ExtractGenre
  # Values of subfield $2 that are accepted when the second indicator is '7'.
  VALID_GENRE_SOURCES = %w[lcgft homoit aat rbbin rbgenr rbpap rbpri rbprov rbpub].freeze

  # Builds the lambda passed to `to_field`.
  # @return [Proc] lambda(record, accumulator) that appends every distinct,
  #   punctuation-trimmed value from accepted 655 fields and returns the accumulator
  def extract_genre
    lambda do |record, accumulator|
      tags = ['655a']

      tags.each do |tag|
        # `each` rather than `find_all`: the block is run only for its side
        # effect on the accumulator, so no filtered array should be built.
        record.fields(tag.to_i.to_s).each do |field|
          next unless valid_genre_field?(field)

          value = marc21.trim_punctuation(extract_value(tag, field))
          # Skip fields without the requested subfield and values already collected.
          accumulator << value unless value.nil? || accumulator.include?(value)
        end
      end
      accumulator
    end
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '0' or '2', or when it
  #   is '7' and the field names an accepted source in subfield $2
  def valid_genre_field?(field)
    %w[0 2].include?(field.indicator2) || valid_genre_source?(field)
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '7' and some subfield $2
  #   holds one of VALID_GENRE_SOURCES
  def valid_genre_source?(field)
    field.indicator2 == '7' && field.subfields.any? do |subfield|
      subfield.code == '2' && VALID_GENRE_SOURCES.include?(subfield.value)
    end
  end
end
31 changes: 31 additions & 0 deletions lib/traject/extract_subject.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true
require 'traject/extraction_tools'
extend ExtractionTools

# Traject extraction macro for subject headings taken from MARC 600-654 fields.
#
# The methods call `marc21` and `extract_value`, which are not defined in this
# module; presumably the extending object gets them from ExtractionTools
# (required above) -- confirm when reusing this module elsewhere.
module ExtractSubject
  # Values of subfield $2 that are accepted when the second indicator is '7'.
  VALID_SUBJECT_SOURCES = %w[lcgft homoit aat rbbin rbgenr rbpap rbpri rbprov rbpub].freeze

  # Builds the lambda passed to `to_field`.
  # @return [Proc] lambda(record, accumulator) that appends every distinct,
  #   punctuation-trimmed heading from accepted fields and returns the accumulator
  def extract_subject
    lambda do |record, accumulator|
      # Each spec is a three-digit tag followed by the subfield codes to keep.
      tags = ['600abcdq', '610ab', '611adc', '630aa', '650aa', '653aa', '654a']

      tags.each do |tag|
        # `each` rather than `find_all`: the block is run only for its side
        # effect on the accumulator, so no filtered array should be built.
        record.fields(tag.to_i.to_s).each do |field|
          next unless valid_subject_field?(field)

          # Join the kept subfields with a space. The default separator of
          # extract_value is '', which fuses neighbouring subfields
          # (e.g. "Shakespeare, William,1564-1616").
          value = marc21.trim_punctuation(extract_value(tag, field, ' '))
          # Skip fields without any requested subfield and values already collected.
          accumulator << value unless value.nil? || accumulator.include?(value)
        end
      end
      accumulator
    end
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '0' or '2', or when it
  #   is '7' and the field names an accepted source in subfield $2
  def valid_subject_field?(field)
    %w[0 2].include?(field.indicator2) || valid_subject_source?(field)
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '7' and some subfield $2
  #   holds one of VALID_SUBJECT_SOURCES
  def valid_subject_source?(field)
    field.indicator2 == '7' && field.subfields.any? do |subfield|
      subfield.code == '2' && VALID_SUBJECT_SOURCES.include?(subfield.value)
    end
  end
end
49 changes: 31 additions & 18 deletions lib/traject/extract_subject_display.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,45 @@
extend ExtractionTools

module ExtractSubjectDisplay
def extract_subject_display(atoz, atog, vtoz)
lambda do |rec, acc|
dfs_to_process = [
"600#{atoz}", "610#{atoz}", "611#{atoz}", "630#{atoz}", "650#{atog}#{vtoz}",
"651aeg#{vtoz}"
]
def extract_subject_display
atoz = ('a'..'z').to_a.join('')
atog = ('a'..'g').to_a.join('')
vtoz = ('v'..'z').to_a.join('')

dfs_to_process.each do |df|
rec.fields(df.to_i.to_s).find_all do |f|
acc << marc21.trim_punctuation(accumulate_values(df, f)) unless f.indicator2 == "4" || any_subfield_fast?(f)
lambda do |record, accumulator|
tags = ["600#{atoz}", "610#{atoz}", "611#{atoz}", "630#{atoz}", "650#{atog}#{vtoz}", "651aeg#{vtoz}"]

tags.each do |tag|
record.fields(tag.to_i.to_s).find_all do |field|
next unless valid_subject_display_field?(field)
value = marc21.trim_punctuation(subject_display_value(tag, field))
accumulator << value unless value.nil? || accumulator.include?(value)
end
end
acc
accumulator
end
end

def any_subfield_fast?(field)
field.subfields.any? { |sf| sf.code == '2' && sf.value == "fast" }
def valid_subject_display_field?(field)
((['0', '2'].include? field.indicator2) || valid_subject_display_source?(field)) && !(field.subfields.any? { |sf| sf.code == '2' && sf.value == "fast" })
end

def accumulate_values(df, field)
ret_array = []
field.each do |sf|
ret_array << sf.value if df.delete(df.to_i.to_s + "vxyz").include?(sf.code)
ret_array << "--#{sf.value}" if "vxyz".include?(sf.code)
def valid_subject_display_source?(field)
valid_sources = ['lcgft', 'homoit', 'aat', 'rbbin', 'rbgenr', 'rbpap', 'rbpri', 'rbprov', 'rbpub']
field.indicator2 == '7' && field.subfields.any? do |subfield|
subfield.code == '2' && valid_sources.include?(subfield.value)
end
end

def subject_display_value(tag, field)
valid_subfield_codes = tag.delete(tag.to_i.to_s)
field_values = []
field.subfields.each do |subfield|
next unless valid_subfield_codes.include? subfield.code

value = 'vxyz'.include?(subfield.code) ? "--#{subfield.value}" : subfield.value
field_values.append(value)
end
ret_array.join('')
field_values.empty? ? nil : field_values.join('')
end
end
31 changes: 31 additions & 0 deletions lib/traject/extract_subject_era.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true
require 'traject/extraction_tools'
extend ExtractionTools

# Traject extraction macro for chronological subdivisions ($y) of MARC
# 650, 651, 654 and 655 fields.
#
# The methods call `marc21` and `extract_value`, which are not defined in this
# module; presumably the extending object gets them from ExtractionTools
# (required above) -- confirm when reusing this module elsewhere.
module ExtractSubjectEra
  # Values of subfield $2 that are accepted when the second indicator is '7'.
  VALID_SUBJECT_ERA_SOURCES = %w[lcgft homoit aat rbbin rbgenr rbpap rbpri rbprov rbpub].freeze

  # Builds the lambda passed to `to_field`.
  # @return [Proc] lambda(record, accumulator) that appends every distinct,
  #   punctuation-trimmed $y value from accepted fields and returns the accumulator
  def extract_subject_era
    lambda do |record, accumulator|
      tags = ['650y', '651y', '654y', '655y']

      tags.each do |tag|
        # `each` rather than `find_all`: the block is run only for its side
        # effect on the accumulator, so no filtered array should be built.
        record.fields(tag.to_i.to_s).each do |field|
          next unless valid_subject_era_field?(field)

          # A field may repeat $y; join the occurrences with a space because
          # the default separator of extract_value is '' and would run the
          # values together.
          value = marc21.trim_punctuation(extract_value(tag, field, ' '))
          # Skip fields without a $y and values already collected.
          accumulator << value unless value.nil? || accumulator.include?(value)
        end
      end
      accumulator
    end
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '0' or '2', or when it
  #   is '7' and the field names an accepted source in subfield $2
  def valid_subject_era_field?(field)
    %w[0 2].include?(field.indicator2) || valid_subject_era_source?(field)
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '7' and some subfield $2
  #   holds one of VALID_SUBJECT_ERA_SOURCES
  def valid_subject_era_source?(field)
    field.indicator2 == '7' && field.subfields.any? do |subfield|
      subfield.code == '2' && VALID_SUBJECT_ERA_SOURCES.include?(subfield.value)
    end
  end
end
31 changes: 31 additions & 0 deletions lib/traject/extract_subject_geo.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true
require 'traject/extraction_tools'
extend ExtractionTools

# Traject extraction macro for geographic subject terms: MARC 651 $a and the
# $z subdivisions of 650 fields.
#
# The methods call `marc21` and `extract_value`, which are not defined in this
# module; presumably the extending object gets them from ExtractionTools
# (required above) -- confirm when reusing this module elsewhere.
module ExtractSubjectGeo
  # Values of subfield $2 that are accepted when the second indicator is '7'.
  VALID_SUBJECT_GEO_SOURCES = %w[lcgft homoit aat rbbin rbgenr rbpap rbpri rbprov rbpub].freeze

  # Builds the lambda passed to `to_field`.
  # @return [Proc] lambda(record, accumulator) that appends every distinct,
  #   punctuation-trimmed geographic value from accepted fields and returns the accumulator
  def extract_subject_geo
    lambda do |record, accumulator|
      tags = ['651a', '650z']

      tags.each do |tag|
        # `each` rather than `find_all`: the block is run only for its side
        # effect on the accumulator, so no filtered array should be built.
        record.fields(tag.to_i.to_s).each do |field|
          next unless valid_subject_geo_field?(field)

          # A 650 may repeat $z; join the occurrences with a space because the
          # default separator of extract_value is '' and would fuse them
          # (e.g. "United StatesGeorgia").
          value = marc21.trim_punctuation(extract_value(tag, field, ' '))
          # Skip fields without the requested subfield and values already collected.
          accumulator << value unless value.nil? || accumulator.include?(value)
        end
      end
      accumulator
    end
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '0' or '2', or when it
  #   is '7' and the field names an accepted source in subfield $2
  def valid_subject_geo_field?(field)
    %w[0 2].include?(field.indicator2) || valid_subject_geo_source?(field)
  end

  # @param field [#indicator2, #subfields] a MARC data field
  # @return [Boolean] true when the second indicator is '7' and some subfield $2
  #   holds one of VALID_SUBJECT_GEO_SOURCES
  def valid_subject_geo_source?(field)
    field.indicator2 == '7' && field.subfields.any? do |subfield|
      subfield.code == '2' && VALID_SUBJECT_GEO_SOURCES.include?(subfield.value)
    end
  end
end
21 changes: 20 additions & 1 deletion lib/traject/extraction_tools.rb
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,29 @@ def extract_vern_fields_strict_subfield_order(rec, fields)
build_arr
end

# Extract value of field using a given tag
# @param [String] tag three-character MARC tag followed by the subfield codes to keep, e.g. '600abcdq'
# @param [MARC::DataField] field to extract value from
# @param [String] separator placed between the kept subfield values, defaults to the empty string
# @return [String, nil] the values of the matching subfields, in field order, joined by
#   separator; nil when the field has no matching subfield
def extract_value(tag, field, separator = '')
  # Take the codes by position. String#delete removes a *set of characters*,
  # not a prefix, so deleting `tag.to_i.to_s` mishandles tags with a leading
  # zero ('041a' left "0a") and made numeric subfield codes impossible to request.
  valid_subfield_codes = tag[3..-1].to_s
  matching = field.subfields.select { |subfield| valid_subfield_codes.include?(subfield.code) }
  field_values = matching.map(&:value)
  field_values.empty? ? nil : field_values.join(separator)
end

def subject_tesim_str(atoz)
%W[
600#{atoz}:610#{atoz}:611#{atoz}:630#{atoz}:650#{atoz}
651#{atoz}:653#{atoz}:654#{atoz}:655#{atoz}
651#{atoz}:653#{atoz}:654#{atoz}:655#{atoz}:656#{atoz}
657#{atoz}:658#{atoz}:662#{atoz}:688#{atoz}:690#{atoz}
691#{atoz}:692#{atoz}:693#{atoz}:694#{atoz}:695#{atoz}
696#{atoz}:697#{atoz}:698#{atoz}:699#{atoz}
].join(':').freeze
end

Expand Down
52 changes: 52 additions & 0 deletions spec/fixtures/alma_marc_resource.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1852,5 +1852,57 @@
</record>
</metadata>
</record>
<record>
<header>
<identifier>oai:alma.01GALI_EMORY:010101010101010101</identifier>
<datestamp>2022-08-19T19:47:03Z</datestamp>
<setSpec>print_books</setSpec>
<setSpec>blacklight</setSpec>
</header>
<metadata>
<record xmlns="http://www.loc.gov/MARC21/slim" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
<leader>01974cem a2200433 a 4500</leader>
<controlfield tag="005">20210129142546.0</controlfield>
<controlfield tag="006">m f c f </controlfield>
<controlfield tag="007">aj#canzn</controlfield>
<controlfield tag="008">981211s1998 caubc cc a fs 0 eng d</controlfield>
<controlfield tag="001">010101010101010101</controlfield>
<datafield tag="650" ind1=" " ind2="0">
<subfield code="a">Test subject I</subfield>
<subfield code="z">United States I</subfield>
<subfield code="y">20th Century</subfield>
</datafield>
<datafield tag="650" ind1=" " ind2="4">
<subfield code="a">Test subject II</subfield>
<subfield code="z">United States II</subfield>
<subfield code="y">20th Century Test I</subfield>
</datafield>
<datafield tag="650" ind1=" " ind2="7">
<subfield code="a">Test subject III</subfield>
<subfield code="z">United States III</subfield>
<subfield code="y">20th Century Test II</subfield>
</datafield>
<datafield tag="650" ind1=" " ind2="7">
<subfield code="a">Test subject IV</subfield>
<subfield code="z">United States IV</subfield>
<subfield code="y">20th Century Test III</subfield>
<subfield code="2">rbpub</subfield>
</datafield>
<datafield tag="655" ind1=" " ind2="0">
<subfield code="a">Test Genre I</subfield>
</datafield>
<datafield tag="655" ind1=" " ind2="4">
<subfield code="a">Test Genre II</subfield>
</datafield>
<datafield tag="655" ind1=" " ind2="7">
<subfield code="a">Test Genre III</subfield>
</datafield>
<datafield tag="655" ind1=" " ind2="7">
<subfield code="a">Test Genre IV</subfield>
<subfield code="2">rbpub</subfield>
</datafield>
</record>
</metadata>
</record>
</ListRecords>
</OAI-PMH>
Loading

0 comments on commit 5251e9d

Please sign in to comment.