From 67938f26cee9fa42d665798f2096bda729191cb1 Mon Sep 17 00:00:00 2001 From: Yaroslav Nechaev Date: Thu, 27 Sep 2018 14:34:06 +0200 Subject: [PATCH] Changes to schema --- .../fm/alignments/index/BuildUserIndex.java | 2 +- .../alignments/index/ExtractSocialGraph.java | 52 +++++- .../eu/fbk/fm/alignments/index/db/Keys.java | 16 +- .../eu/fbk/fm/alignments/index/db/Public.java | 32 ++-- .../eu/fbk/fm/alignments/index/db/Tables.java | 12 -- .../eu/fbk/fm/alignments/index/db/UDTs.java | 29 +++ .../alignments/index/db/tables/KbIndex.java | 122 ------------ .../alignments/index/db/tables/UserIndex.java | 23 +-- .../fm/alignments/index/db/tables/UserSg.java | 23 ++- .../alignments/index/db/tables/UserText.java | 6 +- .../index/db/tables/UserTextArr.java | 122 ------------ .../db/tables/records/KbIndexRecord.java | 173 ------------------ .../db/tables/records/UserIndexRecord.java | 19 +- .../index/db/tables/records/UserSgRecord.java | 19 +- .../db/tables/records/UserTextArrRecord.java | 173 ------------------ .../db/tables/records/UserTextRecord.java | 36 ++-- .../fbk/fm/alignments/index/db/udt/Test.java | 69 +++++++ .../index/db/udt/records/TestRecord.java | 160 ++++++++++++++++ .../index/sink/PostgresFileSink.java | 8 +- .../fm/alignments/pipeline/ScoreEntities.java | 35 +++- .../alignments/pipeline/SubmitEntities.java | 15 +- .../fm/alignments/scorer/ISWC17Strategy.java | 2 +- .../fm/alignments/scorer/PAI18Strategy.java | 6 - .../embeddings/EntityDirectEmbeddings.java | 2 - .../scorer/embeddings/EntityEmbeddings.java | 9 +- .../alignments/scorer/text/DBTextScorer.java | 14 +- .../scorer/text/DBTextScorerv2.java | 77 ++++++++ .../preprocessing/text/TextProcessor.java | 16 +- alignments/src/main/resources/schema.sql | 19 +- 29 files changed, 551 insertions(+), 740 deletions(-) create mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/UDTs.java delete mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/KbIndex.java delete mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserTextArr.java delete mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/KbIndexRecord.java delete mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextArrRecord.java create mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/Test.java create mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/records/TestRecord.java create mode 100644 alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorerv2.java diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/BuildUserIndex.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/BuildUserIndex.java index accb4c8..2a128df 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/BuildUserIndex.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/BuildUserIndex.java @@ -119,7 +119,7 @@ private void startPipeline(Path input, Configuration parameters) throws Exceptio .project(1, 2); reducedTweets - .groupBy(1).reduce((value1, value2) -> new Tuple2<>(value1.f0, value1.f1 + "\n" + value2.f1)) + .groupBy(0).reduce((value1, value2) -> new Tuple2<>(value1.f0, value1.f1 + "\n" + value2.f1)) .output(textOutputFormat).withParameters(parameters); } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/ExtractSocialGraph.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/ExtractSocialGraph.java index 2ae3780..917af81 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/ExtractSocialGraph.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/ExtractSocialGraph.java @@ -15,6 +15,7 @@ import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.Utils; import org.apache.flink.api.java.operators.DataSource; +import org.apache.flink.api.java.operators.UnsortedGrouping; import org.apache.flink.api.java.tuple.Tuple3; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.fs.Path; @@ -35,6 +36,7 @@ public class ExtractSocialGraph { private static final String TWEETS_PATH = "tweets-path"; private OutputFormat> forwardOutputFormat; + private OutputFormat> forwardCondensedPGFormat; private OutputFormat> backwardOutputFormat; private void start(Path input, Path output) throws Exception { @@ -42,6 +44,7 @@ private void start(Path input, Path output) throws Exception { parameters.setString("db.file", output.getPath()); forwardOutputFormat = new PostgresFileSink>("forward").testFile(parameters); + forwardCondensedPGFormat = new PostgresFileSink>("forward.pg").testFile(parameters); backwardOutputFormat = new PostgresFileSink>("backward").testFile(parameters); startPipeline(input, parameters); @@ -75,11 +78,17 @@ private void processGraphAndOutput(DataSet tweets, GraphEmitter drop .sum(2) .filter(new GraphFilter(2)); - jointEdges - .groupBy(0) + UnsortedGrouping> forwardGrouped = jointEdges + .groupBy(0); + + forwardGrouped .reduceGroup(new EdgeNormalizer()) .output(forwardOutputFormat).withParameters(parameters); + forwardGrouped + .reduceGroup(new CondenseAndNormalizeGraph()) + .output(forwardCondensedPGFormat).setParallelism(1).withParameters(parameters); + jointEdges .groupBy(1) .reduceGroup(new EdgeNormalizer()) @@ -125,6 +134,45 @@ public void flatMap(JsonObject status, Collector> ou } } + private static class CondenseAndNormalizeGraph implements GroupReduceFunction, Tuple3> { + @Override + public void reduce(Iterable> edges, Collector> out) throws Exception { + int sum = 0; + + LinkedList> listedEdges = new LinkedList<>(); + for (Tuple3 edge : edges) { + sum += edge.f2; + listedEdges.add(edge); + } + + StringBuilder ids = new StringBuilder(); + StringBuilder weights = new StringBuilder(); + Long id = null; + for (Tuple3 edge : listedEdges) { + id = edge.f0; + if (ids.length() > 0) { + ids.append(','); + } + ids.append(edge.f1.toString()); + + if (weights.length() > 0) { + weights.append(','); + } + weights.append(String.valueOf((float) edge.f2 / sum)); + + } + if (id == null) { + return; + } + + out.collect(new Tuple3<>( + id, + "{"+ids.toString()+"}", + "{"+weights.toString()+"}" + )); + } + } + private static class EdgeNormalizer implements GroupReduceFunction, Tuple3> { @Override public void reduce(Iterable> edges, Collector> out) throws Exception { diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Keys.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Keys.java index 1ea7c53..2c0a69a 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Keys.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Keys.java @@ -5,16 +5,12 @@ import eu.fbk.fm.alignments.index.db.tables.Alignments; -import eu.fbk.fm.alignments.index.db.tables.KbIndex; -import eu.fbk.fm.alignments.index.db.tables.UserIndex; import eu.fbk.fm.alignments.index.db.tables.UserObjects; +import eu.fbk.fm.alignments.index.db.tables.UserSg; import eu.fbk.fm.alignments.index.db.tables.UserText; -import eu.fbk.fm.alignments.index.db.tables.UserTextArr; import eu.fbk.fm.alignments.index.db.tables.records.AlignmentsRecord; -import eu.fbk.fm.alignments.index.db.tables.records.KbIndexRecord; -import eu.fbk.fm.alignments.index.db.tables.records.UserIndexRecord; import eu.fbk.fm.alignments.index.db.tables.records.UserObjectsRecord; -import eu.fbk.fm.alignments.index.db.tables.records.UserTextArrRecord; +import eu.fbk.fm.alignments.index.db.tables.records.UserSgRecord; import eu.fbk.fm.alignments.index.db.tables.records.UserTextRecord; import javax.annotation.Generated; @@ -47,11 +43,9 @@ public class Keys { // ------------------------------------------------------------------------- public static final UniqueKey ALIGNMENTS_PKEY = UniqueKeys0.ALIGNMENTS_PKEY; - public static final UniqueKey KB_INDEX_PKEY = UniqueKeys0.KB_INDEX_PKEY; - public static final UniqueKey USER_INDEX_PKEY = UniqueKeys0.USER_INDEX_PKEY; public static final UniqueKey USER_OBJECTS_PKEY = UniqueKeys0.USER_OBJECTS_PKEY; + public static final UniqueKey USER_SG_PKEY = UniqueKeys0.USER_SG_PKEY; public static final UniqueKey USER_TEXT_PKEY = UniqueKeys0.USER_TEXT_PKEY; - public static final UniqueKey USER_TEXT_ARR_PKEY = UniqueKeys0.USER_TEXT_ARR_PKEY; // ------------------------------------------------------------------------- // FOREIGN KEY definitions @@ -64,10 +58,8 @@ public class Keys { private static class UniqueKeys0 extends AbstractKeys { public static final UniqueKey ALIGNMENTS_PKEY = createUniqueKey(Alignments.ALIGNMENTS, "alignments_pkey", Alignments.ALIGNMENTS.RESOURCE_ID, Alignments.ALIGNMENTS.UID); - public static final UniqueKey KB_INDEX_PKEY = createUniqueKey(KbIndex.KB_INDEX, "kb_index_pkey", KbIndex.KB_INDEX.KBID); - public static final UniqueKey USER_INDEX_PKEY = createUniqueKey(UserIndex.USER_INDEX, "user_index_pkey", UserIndex.USER_INDEX.FULLNAME, UserIndex.USER_INDEX.UID); public static final UniqueKey USER_OBJECTS_PKEY = createUniqueKey(UserObjects.USER_OBJECTS, "user_objects_pkey", UserObjects.USER_OBJECTS.UID); + public static final UniqueKey USER_SG_PKEY = createUniqueKey(UserSg.USER_SG, "user_sg_pkey", UserSg.USER_SG.UID); public static final UniqueKey USER_TEXT_PKEY = createUniqueKey(UserText.USER_TEXT, "user_text_pkey", UserText.USER_TEXT.UID); - public static final UniqueKey USER_TEXT_ARR_PKEY = createUniqueKey(UserTextArr.USER_TEXT_ARR, "user_text_arr_pkey", UserTextArr.USER_TEXT_ARR.UID); } } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Public.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Public.java index 2b79e34..d3e15b2 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Public.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Public.java @@ -5,12 +5,11 @@ import eu.fbk.fm.alignments.index.db.tables.Alignments; -import eu.fbk.fm.alignments.index.db.tables.KbIndex; import eu.fbk.fm.alignments.index.db.tables.UserIndex; import eu.fbk.fm.alignments.index.db.tables.UserObjects; import eu.fbk.fm.alignments.index.db.tables.UserSg; import eu.fbk.fm.alignments.index.db.tables.UserText; -import eu.fbk.fm.alignments.index.db.tables.UserTextArr; +import eu.fbk.fm.alignments.index.db.udt.Test; import java.util.ArrayList; import java.util.Arrays; @@ -20,6 +19,7 @@ import org.jooq.Catalog; import org.jooq.Table; +import org.jooq.UDT; import org.jooq.impl.SchemaImpl; @@ -36,7 +36,7 @@ @SuppressWarnings({ "all", "unchecked", "rawtypes" }) public class Public extends SchemaImpl { - private static final long serialVersionUID = 275761985; + private static final long serialVersionUID = 2094861583; /** * The reference instance of public @@ -48,11 +48,6 @@ public class Public extends SchemaImpl { */ public final Alignments ALIGNMENTS = eu.fbk.fm.alignments.index.db.tables.Alignments.ALIGNMENTS; - /** - * The table public.kb_index. - */ - public final KbIndex KB_INDEX = eu.fbk.fm.alignments.index.db.tables.KbIndex.KB_INDEX; - /** * The table public.user_index. */ @@ -73,11 +68,6 @@ public class Public extends SchemaImpl { */ public final UserText USER_TEXT = eu.fbk.fm.alignments.index.db.tables.UserText.USER_TEXT; - /** - * The table public.user_text_arr. - */ - public final UserTextArr USER_TEXT_ARR = eu.fbk.fm.alignments.index.db.tables.UserTextArr.USER_TEXT_ARR; - /** * No further instances allowed */ @@ -104,11 +94,21 @@ public final List> getTables() { private final List> getTables0() { return Arrays.>asList( Alignments.ALIGNMENTS, - KbIndex.KB_INDEX, UserIndex.USER_INDEX, UserObjects.USER_OBJECTS, UserSg.USER_SG, - UserText.USER_TEXT, - UserTextArr.USER_TEXT_ARR); + UserText.USER_TEXT); + } + + @Override + public final List> getUDTs() { + List result = new ArrayList(); + result.addAll(getUDTs0()); + return result; + } + + private final List> getUDTs0() { + return Arrays.>asList( + Test.TEST); } } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Tables.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Tables.java index 7349cdf..afab614 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Tables.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/Tables.java @@ -5,12 +5,10 @@ import eu.fbk.fm.alignments.index.db.tables.Alignments; -import eu.fbk.fm.alignments.index.db.tables.KbIndex; import eu.fbk.fm.alignments.index.db.tables.UserIndex; import eu.fbk.fm.alignments.index.db.tables.UserObjects; import eu.fbk.fm.alignments.index.db.tables.UserSg; import eu.fbk.fm.alignments.index.db.tables.UserText; -import eu.fbk.fm.alignments.index.db.tables.UserTextArr; import javax.annotation.Generated; @@ -33,11 +31,6 @@ public class Tables { */ public static final Alignments ALIGNMENTS = eu.fbk.fm.alignments.index.db.tables.Alignments.ALIGNMENTS; - /** - * The table public.kb_index. - */ - public static final KbIndex KB_INDEX = eu.fbk.fm.alignments.index.db.tables.KbIndex.KB_INDEX; - /** * The table public.user_index. */ @@ -57,9 +50,4 @@ public class Tables { * The table public.user_text. */ public static final UserText USER_TEXT = eu.fbk.fm.alignments.index.db.tables.UserText.USER_TEXT; - - /** - * The table public.user_text_arr. - */ - public static final UserTextArr USER_TEXT_ARR = eu.fbk.fm.alignments.index.db.tables.UserTextArr.USER_TEXT_ARR; } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/UDTs.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/UDTs.java new file mode 100644 index 0000000..c221ffa --- /dev/null +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/UDTs.java @@ -0,0 +1,29 @@ +/** + * This class is generated by jOOQ + */ +package eu.fbk.fm.alignments.index.db; + + +import eu.fbk.fm.alignments.index.db.udt.Test; + +import javax.annotation.Generated; + + +/** + * Convenience access to all UDTs in public + */ +@Generated( + value = { + "http://www.jooq.org", + "jOOQ version:3.8.4" + }, + comments = "This class is generated by jOOQ" +) +@SuppressWarnings({ "all", "unchecked", "rawtypes" }) +public class UDTs { + + /** + * The type public.test + */ + public static Test TEST = eu.fbk.fm.alignments.index.db.udt.Test.TEST; +} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/KbIndex.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/KbIndex.java deleted file mode 100644 index f2f1c2f..0000000 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/KbIndex.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * This class is generated by jOOQ - */ -package eu.fbk.fm.alignments.index.db.tables; - - -import eu.fbk.fm.alignments.index.db.Keys; -import eu.fbk.fm.alignments.index.db.Public; -import eu.fbk.fm.alignments.index.db.tables.records.KbIndexRecord; - -import java.util.Arrays; -import java.util.List; - -import javax.annotation.Generated; - -import org.jooq.Field; -import org.jooq.Schema; -import org.jooq.Table; -import org.jooq.TableField; -import org.jooq.UniqueKey; -import org.jooq.impl.TableImpl; - - -/** - * This class is generated by jOOQ. - */ -@Generated( - value = { - "http://www.jooq.org", - "jOOQ version:3.8.4" - }, - comments = "This class is generated by jOOQ" -) -@SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class KbIndex extends TableImpl { - - private static final long serialVersionUID = -1042573490; - - /** - * The reference instance of public.kb_index - */ - public static final KbIndex KB_INDEX = new KbIndex(); - - /** - * The class holding records for this type - */ - @Override - public Class getRecordType() { - return KbIndexRecord.class; - } - - /** - * The column public.kb_index.kbid. - */ - public final TableField KBID = createField("kbid", org.jooq.impl.SQLDataType.BIGINT.nullable(false), this, ""); - - /** - * The column public.kb_index.uri. - */ - public final TableField URI = createField("uri", org.jooq.impl.SQLDataType.CLOB.nullable(false), this, ""); - - /** - * Create a public.kb_index table reference - */ - public KbIndex() { - this("kb_index", null); - } - - /** - * Create an aliased public.kb_index table reference - */ - public KbIndex(String alias) { - this(alias, KB_INDEX); - } - - private KbIndex(String alias, Table aliased) { - this(alias, aliased, null); - } - - private KbIndex(String alias, Table aliased, Field[] parameters) { - super(alias, null, aliased, parameters, ""); - } - - /** - * {@inheritDoc} - */ - @Override - public Schema getSchema() { - return Public.PUBLIC; - } - - /** - * {@inheritDoc} - */ - @Override - public UniqueKey getPrimaryKey() { - return Keys.KB_INDEX_PKEY; - } - - /** - * {@inheritDoc} - */ - @Override - public List> getKeys() { - return Arrays.>asList(Keys.KB_INDEX_PKEY); - } - - /** - * {@inheritDoc} - */ - @Override - public KbIndex as(String alias) { - return new KbIndex(alias, this); - } - - /** - * Rename this table - */ - public KbIndex rename(String name) { - return new KbIndex(name, null); - } -} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserIndex.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserIndex.java index c44a7c2..49c0a88 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserIndex.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserIndex.java @@ -4,20 +4,15 @@ package eu.fbk.fm.alignments.index.db.tables; -import eu.fbk.fm.alignments.index.db.Keys; import eu.fbk.fm.alignments.index.db.Public; import eu.fbk.fm.alignments.index.db.tables.records.UserIndexRecord; -import java.util.Arrays; -import java.util.List; - import javax.annotation.Generated; import org.jooq.Field; import org.jooq.Schema; import org.jooq.Table; import org.jooq.TableField; -import org.jooq.UniqueKey; import org.jooq.impl.TableImpl; @@ -34,7 +29,7 @@ @SuppressWarnings({ "all", "unchecked", "rawtypes" }) public class UserIndex extends TableImpl { - private static final long serialVersionUID = -450306696; + private static final long serialVersionUID = -486765049; /** * The reference instance of public.user_index @@ -94,22 +89,6 @@ public Schema getSchema() { return Public.PUBLIC; } - /** - * {@inheritDoc} - */ - @Override - public UniqueKey getPrimaryKey() { - return Keys.USER_INDEX_PKEY; - } - - /** - * {@inheritDoc} - */ - @Override - public List> getKeys() { - return Arrays.>asList(Keys.USER_INDEX_PKEY); - } - /** * {@inheritDoc} */ diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserSg.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserSg.java index af8159c..ddc51db 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserSg.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserSg.java @@ -4,15 +4,20 @@ package eu.fbk.fm.alignments.index.db.tables; +import eu.fbk.fm.alignments.index.db.Keys; import eu.fbk.fm.alignments.index.db.Public; import eu.fbk.fm.alignments.index.db.tables.records.UserSgRecord; +import java.util.Arrays; +import java.util.List; + import javax.annotation.Generated; import org.jooq.Field; import org.jooq.Schema; import org.jooq.Table; import org.jooq.TableField; +import org.jooq.UniqueKey; import org.jooq.impl.TableImpl; @@ -29,7 +34,7 @@ @SuppressWarnings({ "all", "unchecked", "rawtypes" }) public class UserSg extends TableImpl { - private static final long serialVersionUID = -1823883885; + private static final long serialVersionUID = 478125782; /** * The reference instance of public.user_sg @@ -89,6 +94,22 @@ public Schema getSchema() { return Public.PUBLIC; } + /** + * {@inheritDoc} + */ + @Override + public UniqueKey getPrimaryKey() { + return Keys.USER_SG_PKEY; + } + + /** + * {@inheritDoc} + */ + @Override + public List> getKeys() { + return Arrays.>asList(Keys.USER_SG_PKEY); + } + /** * {@inheritDoc} */ diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserText.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserText.java index a168344..676a3d6 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserText.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserText.java @@ -34,7 +34,7 @@ @SuppressWarnings({ "all", "unchecked", "rawtypes" }) public class UserText extends TableImpl { - private static final long serialVersionUID = -708992843; + private static final long serialVersionUID = 436441581; /** * The reference instance of public.user_text @@ -55,9 +55,9 @@ public Class getRecordType() { public final TableField UID = createField("uid", org.jooq.impl.SQLDataType.BIGINT.nullable(false), this, ""); /** - * The column public.user_text.lsa. + * The column public.user_text.text. */ - public final TableField LSA = createField("lsa", org.jooq.impl.DefaultDataType.getDefaultDataType("USER-DEFINED"), this, ""); + public final TableField TEXT = createField("text", org.jooq.impl.SQLDataType.CLOB.nullable(false), this, ""); /** * Create a public.user_text table reference diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserTextArr.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserTextArr.java deleted file mode 100644 index 56246df..0000000 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/UserTextArr.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * This class is generated by jOOQ - */ -package eu.fbk.fm.alignments.index.db.tables; - - -import eu.fbk.fm.alignments.index.db.Keys; -import eu.fbk.fm.alignments.index.db.Public; -import eu.fbk.fm.alignments.index.db.tables.records.UserTextArrRecord; - -import java.util.Arrays; -import java.util.List; - -import javax.annotation.Generated; - -import org.jooq.Field; -import org.jooq.Schema; -import org.jooq.Table; -import org.jooq.TableField; -import org.jooq.UniqueKey; -import org.jooq.impl.TableImpl; - - -/** - * This class is generated by jOOQ. - */ -@Generated( - value = { - "http://www.jooq.org", - "jOOQ version:3.8.4" - }, - comments = "This class is generated by jOOQ" -) -@SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class UserTextArr extends TableImpl { - - private static final long serialVersionUID = 2072157874; - - /** - * The reference instance of public.user_text_arr - */ - public static final UserTextArr USER_TEXT_ARR = new UserTextArr(); - - /** - * The class holding records for this type - */ - @Override - public Class getRecordType() { - return UserTextArrRecord.class; - } - - /** - * The column public.user_text_arr.uid. - */ - public final TableField UID = createField("uid", org.jooq.impl.SQLDataType.BIGINT.nullable(false), this, ""); - - /** - * The column public.user_text_arr.lsa. - */ - public final TableField LSA = createField("lsa", org.jooq.impl.SQLDataType.REAL.getArrayDataType(), this, ""); - - /** - * Create a public.user_text_arr table reference - */ - public UserTextArr() { - this("user_text_arr", null); - } - - /** - * Create an aliased public.user_text_arr table reference - */ - public UserTextArr(String alias) { - this(alias, USER_TEXT_ARR); - } - - private UserTextArr(String alias, Table aliased) { - this(alias, aliased, null); - } - - private UserTextArr(String alias, Table aliased, Field[] parameters) { - super(alias, null, aliased, parameters, ""); - } - - /** - * {@inheritDoc} - */ - @Override - public Schema getSchema() { - return Public.PUBLIC; - } - - /** - * {@inheritDoc} - */ - @Override - public UniqueKey getPrimaryKey() { - return Keys.USER_TEXT_ARR_PKEY; - } - - /** - * {@inheritDoc} - */ - @Override - public List> getKeys() { - return Arrays.>asList(Keys.USER_TEXT_ARR_PKEY); - } - - /** - * {@inheritDoc} - */ - @Override - public UserTextArr as(String alias) { - return new UserTextArr(alias, this); - } - - /** - * Rename this table - */ - public UserTextArr rename(String name) { - return new UserTextArr(name, null); - } -} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/KbIndexRecord.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/KbIndexRecord.java deleted file mode 100644 index 896eb99..0000000 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/KbIndexRecord.java +++ /dev/null @@ -1,173 +0,0 @@ -/** - * This class is generated by jOOQ - */ -package eu.fbk.fm.alignments.index.db.tables.records; - - -import eu.fbk.fm.alignments.index.db.tables.KbIndex; - -import javax.annotation.Generated; - -import org.jooq.Field; -import org.jooq.Record1; -import org.jooq.Record2; -import org.jooq.Row2; -import org.jooq.impl.UpdatableRecordImpl; - - -/** - * This class is generated by jOOQ. - */ -@Generated( - value = { - "http://www.jooq.org", - "jOOQ version:3.8.4" - }, - comments = "This class is generated by jOOQ" -) -@SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class KbIndexRecord extends UpdatableRecordImpl implements Record2 { - - private static final long serialVersionUID = -1621118902; - - /** - * Setter for public.kb_index.kbid. - */ - public void setKbid(Long value) { - set(0, value); - } - - /** - * Getter for public.kb_index.kbid. - */ - public Long getKbid() { - return (Long) get(0); - } - - /** - * Setter for public.kb_index.uri. - */ - public void setUri(String value) { - set(1, value); - } - - /** - * Getter for public.kb_index.uri. - */ - public String getUri() { - return (String) get(1); - } - - // ------------------------------------------------------------------------- - // Primary key information - // ------------------------------------------------------------------------- - - /** - * {@inheritDoc} - */ - @Override - public Record1 key() { - return (Record1) super.key(); - } - - // ------------------------------------------------------------------------- - // Record2 type implementation - // ------------------------------------------------------------------------- - - /** - * {@inheritDoc} - */ - @Override - public Row2 fieldsRow() { - return (Row2) super.fieldsRow(); - } - - /** - * {@inheritDoc} - */ - @Override - public Row2 valuesRow() { - return (Row2) super.valuesRow(); - } - - /** - * {@inheritDoc} - */ - @Override - public Field field1() { - return KbIndex.KB_INDEX.KBID; - } - - /** - * {@inheritDoc} - */ - @Override - public Field field2() { - return KbIndex.KB_INDEX.URI; - } - - /** - * {@inheritDoc} - */ - @Override - public Long value1() { - return getKbid(); - } - - /** - * {@inheritDoc} - */ - @Override - public String value2() { - return getUri(); - } - - /** - * {@inheritDoc} - */ - @Override - public KbIndexRecord value1(Long value) { - setKbid(value); - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public KbIndexRecord value2(String value) { - setUri(value); - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public KbIndexRecord values(Long value1, String value2) { - value1(value1); - value2(value2); - return this; - } - - // ------------------------------------------------------------------------- - // Constructors - // ------------------------------------------------------------------------- - - /** - * Create a detached KbIndexRecord - */ - public KbIndexRecord() { - super(KbIndex.KB_INDEX); - } - - /** - * Create a detached, initialised KbIndexRecord - */ - public KbIndexRecord(Long kbid, String uri) { - super(KbIndex.KB_INDEX); - - set(0, kbid); - set(1, uri); - } -} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserIndexRecord.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserIndexRecord.java index c7a3a2f..635cedf 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserIndexRecord.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserIndexRecord.java @@ -9,10 +9,9 @@ import javax.annotation.Generated; import org.jooq.Field; -import org.jooq.Record2; import org.jooq.Record3; import org.jooq.Row3; -import org.jooq.impl.UpdatableRecordImpl; +import org.jooq.impl.TableRecordImpl; /** @@ -26,9 +25,9 @@ comments = "This class is generated by jOOQ" ) @SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class UserIndexRecord extends UpdatableRecordImpl implements Record3 { +public class UserIndexRecord extends TableRecordImpl implements Record3 { - private static final long serialVersionUID = -1612326482; + private static final long serialVersionUID = -915273077; /** * Setter for public.user_index.fullname. @@ -72,18 +71,6 @@ public Integer getFreq() { return (Integer) get(2); } - // ------------------------------------------------------------------------- - // Primary key information - // ------------------------------------------------------------------------- - - /** - * {@inheritDoc} - */ - @Override - public Record2 key() { - return (Record2) super.key(); - } - // ------------------------------------------------------------------------- // Record3 type implementation // ------------------------------------------------------------------------- diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserSgRecord.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserSgRecord.java index 1878b67..99c67f1 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserSgRecord.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserSgRecord.java @@ -9,9 +9,10 @@ import javax.annotation.Generated; import org.jooq.Field; +import org.jooq.Record1; import org.jooq.Record3; import org.jooq.Row3; -import org.jooq.impl.TableRecordImpl; +import org.jooq.impl.UpdatableRecordImpl; /** @@ -25,9 +26,9 @@ comments = "This class is generated by jOOQ" ) @SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class UserSgRecord extends TableRecordImpl implements Record3 { +public class UserSgRecord extends UpdatableRecordImpl implements Record3 { - private static final long serialVersionUID = 775772085; + private static final long serialVersionUID = -1749527468; /** * Setter for public.user_sg.uid. @@ -71,6 +72,18 @@ public Float[] getWeights() { return (Float[]) get(2); } + // ------------------------------------------------------------------------- + // Primary key information + // ------------------------------------------------------------------------- + + /** + * {@inheritDoc} + */ + @Override + public Record1 key() { + return (Record1) super.key(); + } + // ------------------------------------------------------------------------- // Record3 type implementation // ------------------------------------------------------------------------- diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextArrRecord.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextArrRecord.java deleted file mode 100644 index 51e95bf..0000000 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextArrRecord.java +++ /dev/null @@ -1,173 +0,0 @@ -/** - * This class is generated by jOOQ - */ -package eu.fbk.fm.alignments.index.db.tables.records; - - -import eu.fbk.fm.alignments.index.db.tables.UserTextArr; - -import javax.annotation.Generated; - -import org.jooq.Field; -import org.jooq.Record1; -import org.jooq.Record2; -import org.jooq.Row2; -import org.jooq.impl.UpdatableRecordImpl; - - -/** - * This class is generated by jOOQ. - */ -@Generated( - value = { - "http://www.jooq.org", - "jOOQ version:3.8.4" - }, - comments = "This class is generated by jOOQ" -) -@SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class UserTextArrRecord extends UpdatableRecordImpl implements Record2 { - - private static final long serialVersionUID = -159096481; - - /** - * Setter for public.user_text_arr.uid. - */ - public void setUid(Long value) { - set(0, value); - } - - /** - * Getter for public.user_text_arr.uid. - */ - public Long getUid() { - return (Long) get(0); - } - - /** - * Setter for public.user_text_arr.lsa. - */ - public void setLsa(Float[] value) { - set(1, value); - } - - /** - * Getter for public.user_text_arr.lsa. - */ - public Float[] getLsa() { - return (Float[]) get(1); - } - - // ------------------------------------------------------------------------- - // Primary key information - // ------------------------------------------------------------------------- - - /** - * {@inheritDoc} - */ - @Override - public Record1 key() { - return (Record1) super.key(); - } - - // ------------------------------------------------------------------------- - // Record2 type implementation - // ------------------------------------------------------------------------- - - /** - * {@inheritDoc} - */ - @Override - public Row2 fieldsRow() { - return (Row2) super.fieldsRow(); - } - - /** - * {@inheritDoc} - */ - @Override - public Row2 valuesRow() { - return (Row2) super.valuesRow(); - } - - /** - * {@inheritDoc} - */ - @Override - public Field field1() { - return UserTextArr.USER_TEXT_ARR.UID; - } - - /** - * {@inheritDoc} - */ - @Override - public Field field2() { - return UserTextArr.USER_TEXT_ARR.LSA; - } - - /** - * {@inheritDoc} - */ - @Override - public Long value1() { - return getUid(); - } - - /** - * {@inheritDoc} - */ - @Override - public Float[] value2() { - return getLsa(); - } - - /** - * {@inheritDoc} - */ - @Override - public UserTextArrRecord value1(Long value) { - setUid(value); - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public UserTextArrRecord value2(Float[] value) { - setLsa(value); - return this; - } - - /** - * {@inheritDoc} - */ - @Override - public UserTextArrRecord values(Long value1, Float[] value2) { - value1(value1); - value2(value2); - return this; - } - - // ------------------------------------------------------------------------- - // Constructors - // ------------------------------------------------------------------------- - - /** - * Create a detached UserTextArrRecord - */ - public UserTextArrRecord() { - super(UserTextArr.USER_TEXT_ARR); - } - - /** - * Create a detached, initialised UserTextArrRecord - */ - public UserTextArrRecord(Long uid, Float[] lsa) { - super(UserTextArr.USER_TEXT_ARR); - - set(0, uid); - set(1, lsa); - } -} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextRecord.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextRecord.java index 7ccd062..71892f1 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextRecord.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/tables/records/UserTextRecord.java @@ -26,9 +26,9 @@ comments = "This class is generated by jOOQ" ) @SuppressWarnings({ "all", "unchecked", "rawtypes" }) -public class UserTextRecord extends UpdatableRecordImpl implements Record2 { +public class UserTextRecord extends UpdatableRecordImpl implements Record2 { - private static final long serialVersionUID = -267165098; + private static final long serialVersionUID = -919260237; /** * Setter for public.user_text.uid. @@ -45,17 +45,17 @@ public Long getUid() { } /** - * Setter for public.user_text.lsa. + * Setter for public.user_text.text. */ - public void setLsa(Object value) { + public void setText(String value) { set(1, value); } /** - * Getter for public.user_text.lsa. + * Getter for public.user_text.text. */ - public Object getLsa() { - return (Object) get(1); + public String getText() { + return (String) get(1); } // ------------------------------------------------------------------------- @@ -78,7 +78,7 @@ public Record1 key() { * {@inheritDoc} */ @Override - public Row2 fieldsRow() { + public Row2 fieldsRow() { return (Row2) super.fieldsRow(); } @@ -86,7 +86,7 @@ public Row2 fieldsRow() { * {@inheritDoc} */ @Override - public Row2 valuesRow() { + public Row2 valuesRow() { return (Row2) super.valuesRow(); } @@ -102,8 +102,8 @@ public Field field1() { * {@inheritDoc} */ @Override - public Field field2() { - return UserText.USER_TEXT.LSA; + public Field field2() { + return UserText.USER_TEXT.TEXT; } /** @@ -118,8 +118,8 @@ public Long value1() { * {@inheritDoc} */ @Override - public Object value2() { - return getLsa(); + public String value2() { + return getText(); } /** @@ -135,8 +135,8 @@ public UserTextRecord value1(Long value) { * {@inheritDoc} */ @Override - public UserTextRecord value2(Object value) { - setLsa(value); + public UserTextRecord value2(String value) { + setText(value); return this; } @@ -144,7 +144,7 @@ public UserTextRecord value2(Object value) { * {@inheritDoc} */ @Override - public UserTextRecord values(Long value1, Object value2) { + public UserTextRecord values(Long value1, String value2) { value1(value1); value2(value2); return this; @@ -164,10 +164,10 @@ public UserTextRecord() { /** * Create a detached, initialised UserTextRecord */ - public UserTextRecord(Long uid, Object lsa) { + public UserTextRecord(Long uid, String text) { super(UserText.USER_TEXT); set(0, uid); - set(1, lsa); + set(1, text); } } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/Test.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/Test.java new file mode 100644 index 0000000..82a9793 --- /dev/null +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/Test.java @@ -0,0 +1,69 @@ +/** + * This class is generated by jOOQ + */ +package eu.fbk.fm.alignments.index.db.udt; + + +import eu.fbk.fm.alignments.index.db.Public; +import eu.fbk.fm.alignments.index.db.udt.records.TestRecord; + +import javax.annotation.Generated; + +import org.jooq.Schema; +import org.jooq.UDTField; +import org.jooq.impl.UDTImpl; + + +/** + * This class is generated by jOOQ. + */ +@Generated( + value = { + "http://www.jooq.org", + "jOOQ version:3.8.4" + }, + comments = "This class is generated by jOOQ" +) +@SuppressWarnings({ "all", "unchecked", "rawtypes" }) +public class Test extends UDTImpl { + + private static final long serialVersionUID = 641273527; + + /** + * The reference instance of public.test + */ + public static final Test TEST = new Test(); + + /** + * The class holding records for this type + */ + @Override + public Class getRecordType() { + return TestRecord.class; + } + + /** + * The attribute public.test.k. + */ + public static final UDTField K = createField("k", org.jooq.impl.SQLDataType.INTEGER, TEST, ""); + + /** + * The attribute public.test.v. + */ + public static final UDTField V = createField("v", org.jooq.impl.SQLDataType.CLOB, TEST, ""); + + /** + * No further instances allowed + */ + private Test() { + super("test", null); + } + + /** + * {@inheritDoc} + */ + @Override + public Schema getSchema() { + return Public.PUBLIC; + } +} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/records/TestRecord.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/records/TestRecord.java new file mode 100644 index 0000000..43bbb69 --- /dev/null +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/db/udt/records/TestRecord.java @@ -0,0 +1,160 @@ +/** + * This class is generated by jOOQ + */ +package eu.fbk.fm.alignments.index.db.udt.records; + + +import eu.fbk.fm.alignments.index.db.udt.Test; + +import javax.annotation.Generated; + +import org.jooq.Field; +import org.jooq.Record2; +import org.jooq.Row2; +import org.jooq.impl.UDTRecordImpl; + + +/** + * This class is generated by jOOQ. + */ +@Generated( + value = { + "http://www.jooq.org", + "jOOQ version:3.8.4" + }, + comments = "This class is generated by jOOQ" +) +@SuppressWarnings({ "all", "unchecked", "rawtypes" }) +public class TestRecord extends UDTRecordImpl implements Record2 { + + private static final long serialVersionUID = 659767992; + + /** + * Setter for public.test.k. + */ + public void setK(Integer value) { + set(0, value); + } + + /** + * Getter for public.test.k. + */ + public Integer getK() { + return (Integer) get(0); + } + + /** + * Setter for public.test.v. + */ + public void setV(String value) { + set(1, value); + } + + /** + * Getter for public.test.v. + */ + public String getV() { + return (String) get(1); + } + + // ------------------------------------------------------------------------- + // Record2 type implementation + // ------------------------------------------------------------------------- + + /** + * {@inheritDoc} + */ + @Override + public Row2 fieldsRow() { + return (Row2) super.fieldsRow(); + } + + /** + * {@inheritDoc} + */ + @Override + public Row2 valuesRow() { + return (Row2) super.valuesRow(); + } + + /** + * {@inheritDoc} + */ + @Override + public Field field1() { + return Test.K; + } + + /** + * {@inheritDoc} + */ + @Override + public Field field2() { + return Test.V; + } + + /** + * {@inheritDoc} + */ + @Override + public Integer value1() { + return getK(); + } + + /** + * {@inheritDoc} + */ + @Override + public String value2() { + return getV(); + } + + /** + * {@inheritDoc} + */ + @Override + public TestRecord value1(Integer value) { + setK(value); + return this; + } + + /** + * {@inheritDoc} + */ + @Override + public TestRecord value2(String value) { + setV(value); + return this; + } + + /** + * {@inheritDoc} + */ + @Override + public TestRecord values(Integer value1, String value2) { + value1(value1); + value2(value2); + return this; + } + + // ------------------------------------------------------------------------- + // Constructors + // ------------------------------------------------------------------------- + + /** + * Create a detached TestRecord + */ + public TestRecord() { + super(Test.TEST); + } + + /** + * Create a detached, initialised TestRecord + */ + public TestRecord(Integer k, String v) { + super(Test.TEST); + + set(0, k); + set(1, v); + } +} diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/index/sink/PostgresFileSink.java b/alignments/src/main/java/eu/fbk/fm/alignments/index/sink/PostgresFileSink.java index 6aafc9e..99b685a 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/index/sink/PostgresFileSink.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/index/sink/PostgresFileSink.java @@ -48,11 +48,15 @@ public void configure(Configuration parameters) { } private File prepareFile() { - return prepareFile(0, 0); + return prepareFile(0, 1); } private File prepareFile(int taskNumber, int numTasks) { - return new File(filename, appendix + "-" + taskNumber + "-" + numTasks); + if (numTasks == 1) { + return new File(filename, appendix); + } else { + return new File(filename, appendix + "-" + taskNumber + "-" + numTasks); + } } @Override diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/ScoreEntities.java b/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/ScoreEntities.java index 055fb61..faaed9e 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/ScoreEntities.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/ScoreEntities.java @@ -8,9 +8,14 @@ import eu.fbk.fm.alignments.persistence.ModelEndpoint; import eu.fbk.fm.alignments.persistence.sparql.Endpoint; import eu.fbk.fm.alignments.scorer.ISWC17Strategy; +import eu.fbk.fm.alignments.scorer.PAI18Strategy; import eu.fbk.fm.alignments.scorer.ScoringStrategy; +import eu.fbk.fm.alignments.scorer.text.LSAVectorProvider; +import eu.fbk.fm.alignments.scorer.text.MemoryEmbeddingsProvider; +import eu.fbk.fm.alignments.scorer.text.VectorProvider; import eu.fbk.fm.alignments.utils.DBUtils; import eu.fbk.utils.core.CommandLine; +import eu.fbk.utils.lsa.LSM; import eu.fbk.utils.math.Scaler; import org.jooq.Cursor; import org.jooq.Record; @@ -25,6 +30,8 @@ import javax.sql.DataSource; import java.io.FileReader; import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Paths; import java.sql.Connection; import java.sql.SQLException; import java.util.LinkedList; @@ -48,6 +55,7 @@ public class ScoreEntities { private static final String DB_PASSWORD = "db-password"; private static final String ENDPOINT = "endpoint"; private static final String LSA_PATH = "lsa-path"; + private static final String EMBEDDINGS_PATH = "embeddings-path"; private final DataSource source; private final Endpoint endpoint; @@ -152,7 +160,10 @@ private static CommandLine.Parser provideParameterList() { CommandLine.Type.STRING, true, false, true) .withOption(null, LSA_PATH, "path to LSA model", "DIRECTORY", - CommandLine.Type.STRING, true, false, true); + CommandLine.Type.STRING, true, false, true) + .withOption(null, EMBEDDINGS_PATH, + "path to embeddings to use along with LSA", "DIRECTORY", + CommandLine.Type.STRING, true, false, false); } public static void main(String[] args) throws Exception { @@ -165,10 +176,30 @@ public static void main(String[] args) throws Exception { final String dbPassword = cmd.getOptionValue(DB_PASSWORD, String.class); final String endpointUri = cmd.getOptionValue(ENDPOINT, String.class); final String lsaPath = cmd.getOptionValue(LSA_PATH, String.class); + final String embeddingsPath = cmd.getOptionValue(EMBEDDINGS_PATH, String.class); DataSource source = DBUtils.createPGDataSource(dbConnection, dbUser, dbPassword); Endpoint endpoint = new Endpoint(endpointUri); - ScoringStrategy strategy = ISWC17Strategy.builder().source(source).lsaPath(lsaPath).build(); + + PAI18Strategy strategy = new PAI18Strategy(source); + LSM lsm = new LSM(lsaPath + "/X", 100, true); + VectorProvider textVectorProvider = new LSAVectorProvider(lsm); + List allVectorProviders = new LinkedList<>(); + allVectorProviders.add(textVectorProvider); + if (embeddingsPath != null) { + LinkedList embProviders = new LinkedList<>(); + Files.list(Paths.get(embeddingsPath)).forEach((path) -> { + try { + embProviders.add(new MemoryEmbeddingsProvider(path.toString(), lsaPath)); + } catch (Exception e) { + LOGGER.error("Error while loading embedding", e); + } + }); + LOGGER.info("Loaded {} embedding models", embProviders.size()); + allVectorProviders.addAll(embProviders); + } + strategy.addProvider(ISWC17Strategy.builder().source(source).vectorProviders(allVectorProviders).build()); + ScoreEntities script = new ScoreEntities(source, endpoint, strategy); script.run(); diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/SubmitEntities.java b/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/SubmitEntities.java index 99fe4a5..c50cf41 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/SubmitEntities.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/pipeline/SubmitEntities.java @@ -1,9 +1,12 @@ package eu.fbk.fm.alignments.pipeline; +import eu.fbk.fm.alignments.evaluation.DatasetEntry; import eu.fbk.fm.alignments.index.FillFromIndex; import eu.fbk.fm.alignments.index.db.tables.records.AlignmentsRecord; import eu.fbk.fm.alignments.persistence.sparql.Endpoint; import eu.fbk.fm.alignments.query.index.AllNamesStrategy; +import eu.fbk.fm.alignments.scorer.FullyResolvedEntry; +import eu.fbk.fm.alignments.scorer.UserData; import eu.fbk.fm.alignments.utils.DBUtils; import eu.fbk.utils.core.CommandLine; import org.jooq.DSLContext; @@ -19,6 +22,7 @@ import java.util.LinkedList; import java.util.List; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import static eu.fbk.fm.alignments.index.db.tables.Alignments.ALIGNMENTS; @@ -37,7 +41,6 @@ public class SubmitEntities { private final DataSource source; private final FillFromIndex index; - private final FillFromIndex fallbackIndex; public SubmitEntities(DataSource source, Endpoint endpoint) { @@ -45,9 +48,6 @@ public SubmitEntities(DataSource source, Endpoint endpoint) { this.index = new FillFromIndex(endpoint, new AllNamesStrategy(), source); this.index.setTimeout(20); this.index.turnOffVerbose(); - this.fallbackIndex = new FillFromIndex(endpoint, new AllNamesStrategy(1), source); - this.fallbackIndex.setTimeout(20); - this.fallbackIndex.turnOffVerbose(); } public void run(String input) throws IOException { @@ -55,10 +55,9 @@ public void run(String input) throws IOException { Files.lines(Paths.get(input)).parallel().forEach(line -> { //Populating list of candidates line = line.substring(1, line.length()-1); - List uids = index.getUids(line); - if (uids.size() == 0) { - uids = fallbackIndex.getUids(line); - } + FullyResolvedEntry entry = new FullyResolvedEntry(new DatasetEntry(line)); + index.fill(entry); + List uids = entry.candidates.stream().map(UserData::getId).collect(Collectors.toList()); //Saving everything to the database DSLContext context = DSL.using(source, SQLDialect.POSTGRES); diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/ISWC17Strategy.java b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/ISWC17Strategy.java index acd9215..072f1e5 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/ISWC17Strategy.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/ISWC17Strategy.java @@ -134,7 +134,7 @@ private void initProvider(List providers, VectorProvider textVe if (source == null) { providers.add(new TextScorer(scorer).all().userData()); } else { - providers.add(new DBTextScorer(source, textVectorProvider)); + providers.add(new DBTextScorerv2(source, scorer)); } } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/PAI18Strategy.java b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/PAI18Strategy.java index 6692c3f..f86f477 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/PAI18Strategy.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/PAI18Strategy.java @@ -1,22 +1,16 @@ package eu.fbk.fm.alignments.scorer; import eu.fbk.fm.alignments.DBpediaResource; -import eu.fbk.fm.alignments.scorer.embeddings.EmbeddingsProvider; import eu.fbk.fm.alignments.scorer.embeddings.EntityDirectEmbeddings; -import eu.fbk.fm.alignments.scorer.embeddings.EntityEmbeddings; import eu.fbk.fm.alignments.scorer.embeddings.SocialGraphEmbeddings; -import eu.fbk.fm.alignments.scorer.text.LSAVectorProvider; import eu.fbk.fm.alignments.scorer.text.MemoryEmbeddingsProvider; import eu.fbk.fm.alignments.scorer.text.VectorProvider; -import eu.fbk.utils.lsa.LSM; import org.apache.flink.api.java.tuple.Tuple2; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import twitter4j.User; import javax.sql.DataSource; -import java.io.IOException; -import java.net.URISyntaxException; import java.util.*; /** diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityDirectEmbeddings.java b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityDirectEmbeddings.java index b46e693..dea8570 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityDirectEmbeddings.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityDirectEmbeddings.java @@ -7,8 +7,6 @@ import java.io.Serializable; import java.net.URISyntaxException; -import static eu.fbk.fm.alignments.index.db.Tables.KB_INDEX; - /** * Queries embeddings endpoint based on Wikidata URI of an entity * diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityEmbeddings.java b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityEmbeddings.java index 80f070f..1b36b15 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityEmbeddings.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/embeddings/EntityEmbeddings.java @@ -4,15 +4,13 @@ import twitter4j.User; import javax.sql.DataSource; -import java.io.Serializable; import java.net.URISyntaxException; -import static eu.fbk.fm.alignments.index.db.Tables.KB_INDEX; - /** * Queries kb_index table in the database and then queries embeddings endpoint * * @author Yaroslav Nechaev (remper@me.com) + * @deprecated */ public class EntityEmbeddings extends EmbeddingsProvider { @@ -25,7 +23,8 @@ public EntityEmbeddings(DataSource source, String embName) throws URISyntaxExcep @Override public double[] _getFeatures(User user, DBpediaResource resource) { - Long userVectorRaw = context + throw new UnsupportedOperationException("This way of querying embeddings is currently disabled, please do not use it"); + /*Long userVectorRaw = context .select(KB_INDEX.KBID) .from(KB_INDEX) .where(KB_INDEX.URI.eq(resource.getIdentifier())) @@ -38,6 +37,6 @@ public double[] _getFeatures(User user, DBpediaResource resource) { Long[] result = new Long[1]; result[0] = userVectorRaw; - return predict((Serializable[]) result); + return predict((Serializable[]) result);*/ } } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorer.java b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorer.java index a0da05a..c4fc32b 100644 --- a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorer.java +++ b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorer.java @@ -16,11 +16,10 @@ import java.util.List; -import static eu.fbk.fm.alignments.index.db.Tables.USER_TEXT_ARR; -import static eu.fbk.fm.alignments.index.db.tables.UserText.USER_TEXT; - /** * Compares entity's text to the precomputed LSA in the DB + * + * @deprecated */ public class DBTextScorer implements FeatureProvider { @@ -38,7 +37,8 @@ public DBTextScorer(DataSource source, VectorProvider vectorProvider) { @Override public double getFeature(User user, DBpediaResource resource) { - PGobject userVectorRaw; + throw new UnsupportedOperationException("This way of requesting entity's text is no longer supported"); + /*PGobject userVectorRaw; try { userVectorRaw = DSL.using(source, SQLDialect.POSTGRES) .select(USER_TEXT.LSA) @@ -57,7 +57,7 @@ public double getFeature(User user, DBpediaResource resource) { return 0.0d; } - return process(cubeToFloat(userVectorRaw), resource); + return process(cubeToFloat(userVectorRaw), resource);*/ } protected double process(float[] user, DBpediaResource resource) { @@ -110,7 +110,7 @@ public DBTextScorerArr(DataSource source, LSAVectorProvider lsaVectorProvider) { super(source, lsaVectorProvider); } - @Override + /*@Override public double getFeature(User user, DBpediaResource resource) { Float[] userVectorRaw = DSL.using(source, SQLDialect.POSTGRES) .select(USER_TEXT_ARR.LSA) @@ -127,7 +127,7 @@ public double getFeature(User user, DBpediaResource resource) { } return process(DBTextScorer.numberToFloat(userVectorRaw), resource); - } + }*/ } diff --git a/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorerv2.java b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorerv2.java new file mode 100644 index 0000000..5a7719f --- /dev/null +++ b/alignments/src/main/java/eu/fbk/fm/alignments/scorer/text/DBTextScorerv2.java @@ -0,0 +1,77 @@ +package eu.fbk.fm.alignments.scorer.text; + +import eu.fbk.fm.alignments.DBpediaResource; +import eu.fbk.fm.alignments.scorer.FeatureProvider; +import eu.fbk.fm.alignments.scorer.TextScorer; +import org.jooq.SQLDialect; +import org.jooq.impl.DSL; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import twitter4j.User; + +import javax.sql.DataSource; +import java.util.List; + +import static eu.fbk.fm.alignments.index.db.tables.UserText.USER_TEXT; + +/** + * Compares entity's text to the text contained in the DB + */ +public class DBTextScorerv2 implements FeatureProvider { + + private static final Logger LOGGER = LoggerFactory.getLogger(DBTextScorerv2.class); + + protected DataSource source; + protected SimilarityScorer scorer; + + protected boolean verbose = false; + + public DBTextScorerv2(DataSource source, SimilarityScorer scorer) { + this.source = source; + this.scorer = scorer; + } + + @Override + public double getFeature(User user, DBpediaResource resource) { + String userTextRaw; + try { + userTextRaw = DSL.using(source, SQLDialect.POSTGRES) + .select(USER_TEXT.TEXT) + .from(USER_TEXT) + .where(USER_TEXT.UID.eq(user.getId())) + .fetchOne(USER_TEXT.TEXT); + } catch (Exception e) { + LOGGER.error("Something happened while querying user "+user.getScreenName(), e); + throw e; + } + + if (userTextRaw == null) { + if (verbose) { + LOGGER.debug("Can't find text for user: @"+user.getScreenName()+" ("+user.getId()+")"); + } + return 0.0d; + } + if (verbose && userTextRaw.length() <= 5) { + LOGGER.warn("Extremely short text for user: @"+user.getScreenName()+" ("+user.getId()+")"); + } + + return process(userTextRaw, resource); + } + + protected double process(String userText, DBpediaResource resource) { + List resourceTexts = TextScorer.getResourceTexts(resource); + + double topScore = 0.0d; + for (String text : resourceTexts) { + double curScore = scorer.score(userText, text); + if (curScore > topScore) { + topScore = curScore; + } + } + return topScore; + } + + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } +} diff --git a/alignments/src/main/java/eu/fbk/fm/vectorize/preprocessing/text/TextProcessor.java b/alignments/src/main/java/eu/fbk/fm/vectorize/preprocessing/text/TextProcessor.java index 20a89c1..e25d27b 100644 --- a/alignments/src/main/java/eu/fbk/fm/vectorize/preprocessing/text/TextProcessor.java +++ b/alignments/src/main/java/eu/fbk/fm/vectorize/preprocessing/text/TextProcessor.java @@ -44,15 +44,13 @@ protected Tuple3 process(JsonObject status) { final LinkedList replacements = new LinkedList<>(); JsonObject entities = status.getAsJsonObject("entities"); - if (entities == null) { - return new Tuple3<>(id, userId, prepareString(originalText)); + if (entities != null) { + replacements.addAll(addReplacements(entities, "hashtags", hashtag -> breakHashtag(hashtag.get("text").getAsString()))); + replacements.addAll(addReplacements(entities, "user_mentions", mention -> mention.get("name").getAsString())); + replacements.addAll(addReplacements(entities, "urls", url -> " ")); + replacements.addAll(addReplacements(entities, "media", media -> " ")); } - replacements.addAll(addReplacements(entities, "hashtags", hashtag -> breakHashtag(hashtag.get("text").getAsString()))); - replacements.addAll(addReplacements(entities, "user_mentions", mention -> mention.get("name").getAsString())); - replacements.addAll(addReplacements(entities, "urls", url -> " ")); - replacements.addAll(addReplacements(entities, "media", media -> " ")); - // Sorting replacements replacements.sort(Comparator.comparingInt(r -> r.start)); @@ -84,8 +82,8 @@ protected Tuple3 process(JsonObject status) { //.replaceAll("^\\.", "") //.replaceAll("\\.$", "") .replaceAll("https?://[^\\s]+", " ") - .replaceAll("[,.?_!@#$%^&*():|/\\\\]", " ") - .replaceAll("[\"'`‘“´]", " ' ") + //.replaceAll("[,.?_!@#$%^&*():|/\\\\]", " ") + //.replaceAll("[\"'`‘“´]", " ' ") .replaceAll("\\s+", " "); if (noCase) { processedText = processedText.toLowerCase(); diff --git a/alignments/src/main/resources/schema.sql b/alignments/src/main/resources/schema.sql index 5eb3b48..bbb1728 100644 --- a/alignments/src/main/resources/schema.sql +++ b/alignments/src/main/resources/schema.sql @@ -73,7 +73,7 @@ WITH ( CREATE TABLE public.user_text ( uid bigint NOT NULL, - lsa real[] NOT NULL, + text text NOT NULL, CONSTRAINT user_text_pkey PRIMARY KEY (uid) ) WITH ( @@ -100,4 +100,19 @@ WITH ( CREATE INDEX kb_index_uri ON public.kb_index USING btree - (uri); \ No newline at end of file + (uri); + +-- Table: public.alignments + +-- DROP TABLE public.alignments; + +CREATE TABLE public.alignments ( + resource_id character varying(255) NOT NULL, + uid bigint NOT NULL, + score real NOT NULL, + is_alignment boolean NOT NULL, + version smallint NOT NULL, + CONSTRAINT alignments_pkey PRIMARY KEY (resource_id, uid) +); + +CREATE INDEX alignments_version_idx ON public.alignments USING btree (version); \ No newline at end of file