Add Hungarian stop words to the stop-word filter.

Summary of the hunks below:
  * gen_stopwords.py: add "hungarian" to LANGUAGES.
  * mod.rs: map Language::Hungarian to stopwords::HUNGARIAN.
  * stopwords.rs: reword one line of the license header and add the
    HUNGARIAN list (199 entries) between GERMAN and ITALIAN.

NOTE(review): "majd" appears twice in the added list; presumably this mirrors
the upstream Snowball list. Confirm before deduplicating, and update the
"+862,208" hunk count if any entry is added or removed.

diff --git a/src/tokenizer/stop_word_filter/gen_stopwords.py b/src/tokenizer/stop_word_filter/gen_stopwords.py
index 333fa92c9..5cee70fe2 100644
--- a/src/tokenizer/stop_word_filter/gen_stopwords.py
+++ b/src/tokenizer/stop_word_filter/gen_stopwords.py
@@ -6,6 +6,7 @@ LANGUAGES = [
     "finnish",
     "french",
     "german",
+    "hungarian",
     "italian",
     "norwegian",
     "portuguese",
diff --git a/src/tokenizer/stop_word_filter/mod.rs b/src/tokenizer/stop_word_filter/mod.rs
index adfbf17d4..b4367ec45 100644
--- a/src/tokenizer/stop_word_filter/mod.rs
+++ b/src/tokenizer/stop_word_filter/mod.rs
@@ -50,6 +50,7 @@ impl StopWordFilter {
             Language::Finnish => stopwords::FINNISH,
             Language::French => stopwords::FRENCH,
             Language::German => stopwords::GERMAN,
+            Language::Hungarian => stopwords::HUNGARIAN,
             Language::Italian => stopwords::ITALIAN,
             Language::Norwegian => stopwords::NORWEGIAN,
             Language::Portuguese => stopwords::PORTUGUESE,
diff --git a/src/tokenizer/stop_word_filter/stopwords.rs b/src/tokenizer/stop_word_filter/stopwords.rs
index 7fc47ac45..73a59c4fa 100644
--- a/src/tokenizer/stop_word_filter/stopwords.rs
+++ b/src/tokenizer/stop_word_filter/stopwords.rs
@@ -1,6 +1,6 @@
 /*
 These stop word lists are from the Snowball project (https://snowballstem.org/)
-which carries the following license:
+which carries the following copyright and license:
 
 Copyright (c) 2001, Dr Martin Porter
 Copyright (c) 2004,2005, Richard Boulton
@@ -862,6 +862,208 @@ pub const GERMAN: &[&str] = &[
     "zwischen",
 ];
 
+pub const HUNGARIAN: &[&str] = &[
+    "a",
+    "ahogy",
+    "ahol",
+    "aki",
+    "akik",
+    "akkor",
+    "alatt",
+    "által",
+    "általában",
+    "amely",
+    "amelyek",
+    "amelyekben",
+    "amelyeket",
+    "amelyet",
+    "amelynek",
+    "ami",
+    "amit",
+    "amolyan",
+    "amíg",
+    "amikor",
+    "át",
+    "abban",
+    "ahhoz",
+    "annak",
+    "arra",
+    "arról",
+    "az",
+    "azok",
+    "azon",
+    "azt",
+    "azzal",
+    "azért",
+    "aztán",
+    "azután",
+    "azonban",
+    "bár",
+    "be",
+    "belül",
+    "benne",
+    "cikk",
+    "cikkek",
+    "cikkeket",
+    "csak",
+    "de",
+    "e",
+    "eddig",
+    "egész",
+    "egy",
+    "egyes",
+    "egyetlen",
+    "egyéb",
+    "egyik",
+    "egyre",
+    "ekkor",
+    "el",
+    "elég",
+    "ellen",
+    "elő",
+    "először",
+    "előtt",
+    "első",
+    "én",
+    "éppen",
+    "ebben",
+    "ehhez",
+    "emilyen",
+    "ennek",
+    "erre",
+    "ez",
+    "ezt",
+    "ezek",
+    "ezen",
+    "ezzel",
+    "ezért",
+    "és",
+    "fel",
+    "felé",
+    "hanem",
+    "hiszen",
+    "hogy",
+    "hogyan",
+    "igen",
+    "így",
+    "illetve",
+    "ill.",
+    "ill",
+    "ilyen",
+    "ilyenkor",
+    "ison",
+    "ismét",
+    "itt",
+    "jó",
+    "jól",
+    "jobban",
+    "kell",
+    "kellett",
+    "keresztül",
+    "keressünk",
+    "ki",
+    "kívül",
+    "között",
+    "közül",
+    "legalább",
+    "lehet",
+    "lehetett",
+    "legyen",
+    "lenne",
+    "lenni",
+    "lesz",
+    "lett",
+    "maga",
+    "magát",
+    "majd",
+    "majd",
+    "már",
+    "más",
+    "másik",
+    "meg",
+    "még",
+    "mellett",
+    "mert",
+    "mely",
+    "melyek",
+    "mi",
+    "mit",
+    "míg",
+    "miért",
+    "milyen",
+    "mikor",
+    "minden",
+    "mindent",
+    "mindenki",
+    "mindig",
+    "mint",
+    "mintha",
+    "mivel",
+    "most",
+    "nagy",
+    "nagyobb",
+    "nagyon",
+    "ne",
+    "néha",
+    "nekem",
+    "neki",
+    "nem",
+    "néhány",
+    "nélkül",
+    "nincs",
+    "olyan",
+    "ott",
+    "össze",
+    "ő",
+    "ők",
+    "őket",
+    "pedig",
+    "persze",
+    "rá",
+    "s",
+    "saját",
+    "sem",
+    "semmi",
+    "sok",
+    "sokat",
+    "sokkal",
+    "számára",
+    "szemben",
+    "szerint",
+    "szinte",
+    "talán",
+    "tehát",
+    "teljes",
+    "tovább",
+    "továbbá",
+    "több",
+    "úgy",
+    "ugyanis",
+    "új",
+    "újabb",
+    "újra",
+    "után",
+    "utána",
+    "utolsó",
+    "vagy",
+    "vagyis",
+    "valaki",
+    "valami",
+    "valamint",
+    "való",
+    "vagyok",
+    "van",
+    "vannak",
+    "volt",
+    "voltam",
+    "voltak",
+    "voltunk",
+    "vissza",
+    "vele",
+    "viszont",
+    "volna",
+];
+
 pub const ITALIAN: &[&str] = &[
     "ad",
     "al",