diff --git a/Jenkinsfile b/Jenkinsfile index 24ac047eb..f7a96b4ea 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -28,7 +28,7 @@ pipeline { MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1' KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' - HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0' + HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-04-26-0' DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/text_normalization/hi/data/date/days.tsv b/nemo_text_processing/text_normalization/hi/data/date/days.tsv index 633e2aec0..6df0fa3d4 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/days.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/days.tsv @@ -1,40 +1,9 @@ -०१ एक -०२ दो -०३ तीन -०४ चार -०५ पाँच -०६ छः -०७ सात -०८ आठ -०९ नौ -१० दस -११ ग्यारह -१२ बारह -१३ तेरह -१४ चौदह -१५ पंद्रह -१६ सोलह -१७ सत्रह -१८ अठारह -१९ उन्नीस -२० बीस -२१ इक्कीस -२२ बाईस -२३ तेईस -२४ चौबीस -२५ पच्चीस -२६ छब्बीस -२७ सत्ताईस -२८ अट्ठाईस -२९ उनतीस -३० तीस -३१ इकतीस 01 एक 02 दो 03 तीन 04 चार 05 पाँच -06 छः +06 छह 07 सात 08 आठ 09 नौ @@ -59,4 +28,35 @@ 28 अट्ठाईस 29 उनतीस 30 तीस -31 इकतीस \ No newline at end of file +31 इकतीस +०१ एक +०२ दो +०३ तीन +०४ चार +०५ पाँच +०६ छह +०७ सात +०८ आठ +०९ नौ +१० दस +११ ग्यारह +१२ बारह +१३ तेरह +१४ चौदह +१५ पंद्रह +१६ सोलह +१७ सत्रह +१८ अठारह +१९ उन्नीस +२० बीस +२१ इक्कीस +२२ बाईस +२३ तेईस +२४ चौबीस +२५ पच्चीस +२६ छब्बीस +२७ सत्ताईस +२८ अट्ठाईस +२९ उनतीस +३० तीस +३१ इकतीस \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/months.tsv b/nemo_text_processing/text_normalization/hi/data/date/months.tsv index af770dafc..3667f07cf 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/months.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/months.tsv @@ -1,17 +1,5 @@ -०१ जनवरी -०२ फ़रवरी -०३ मार्च -०४ अप्रैल -०५ मई -०६ जून -०७ जुलाई -०८ अगस्त -०९ सितंबर -१० अक्टूबर -११ नवंबर -१२ दिसंबर 01 जनवरी -02 फ़रवरी +02 फरवरी 03 मार्च 04 अप्रैल 05 मई @@ -21,4 +9,16 @@ 09 सितंबर 10 अक्टूबर 11 नवंबर -12 दिसंबर \ No newline at end of file +12 दिसंबर +०१ जनवरी +०२ फरवरी +०३ मार्च +०४ अप्रैल +०५ मई +०६ जून +०७ जुलाई +०८ अगस्त +०९ सितंबर +१० अक्टूबर +११ नवंबर +१२ दिसंबर \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv index d4c1ca0b1..6166ec327 100644 --- a/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv +++ b/nemo_text_processing/text_normalization/hi/data/date/prefixes.tsv @@ -1,3 +1,4 @@ -सन् -सन -साल \ No newline at end of file +सन् +सन +साल +दशक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py index da917f3de..451497ed0 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/date.py +++ b/nemo_text_processing/text_normalization/hi/taggers/date.py @@ -33,23 +33,33 @@ teens_ties = pynini.union(teens_ties_hi, teens_ties_en) teens_and_ties = pynutil.add_weight(teens_ties, -0.1) -# Read suffixes from file into a list with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f: - suffixes_list = f.read().splitlines() + suffixes_list = [line.rstrip("\n") for line in f if line.strip()] with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f: - prefixes_list = f.read().splitlines() + prefixes_list = [line.rstrip("\n") for line in f if line.strip()] -# Create union of suffixes and prefixes suffix_union = pynini.union(*suffixes_list) prefix_union = pynini.union(*prefixes_list) +verbalized_hundreds = teens_ties_hi.project("output") +verbalized_unit = pynini.union(verbalized_hundreds, digit.project("output")) + +verbalized_year_sou = ( + verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1) +) + +pad_latin = pynini.union(*[pynini.cross(str(i), f"0{i}") for i in range(1, 10)]) +pad_devanagari = pynini.union(*[pynini.cross(d, f"०{d}") for d in "१२३४५६७८९"]) + class DateFst(GraphFst): """ Finite state transducer for classifying date, e.g. "०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" } - "०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" } - + "६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" } + "३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" } + "उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" } + "02-07-1970" -> date { day: "दो" month: "जुलाई" year: "उन्नीस सौ सत्तर" } Args: cardinal: cardinal GraphFst @@ -68,52 +78,137 @@ def __init__(self, cardinal: GraphFst): ) cardinal_graph = pynini.union( - digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands + digit, + teens_and_ties, + cardinal.graph_hundreds, + graph_year_thousands, + graph_year_hundreds_as_thousands, ) graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands) + graph_year_era = pynini.union( + graph_year_thousands, + graph_year_hundreds_as_thousands, + cardinal.graph_hundreds, + ) + delete_dash = pynutil.delete("-") - delete_slash = pynutil.delete("/") + delete_comma = pynutil.delete(",") + delete_space = pynutil.delete(" ") + delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1) + delete_comma_sep = delete_comma + delete_optional_space + + day_num_padded = pynini.union( + days, + teens_and_ties, + ) - days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space + day_num_bare = pynini.union( + pynini.compose(pad_latin, days), + pynini.compose(pad_devanagari, days), + ) - months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space + days_graph_padded = pynutil.insert("day: \"") + day_num_padded + pynutil.insert("\"") + insert_space + days_graph_bare = pynutil.insert("day: \"") + day_num_bare + pynutil.insert("\"") + insert_space - years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space + month_name_acceptor = pynini.project(months, "output") + + months_numeric_padded = months + + months_numeric_bare = pynini.union( + pynini.compose(pad_latin, months), + pynini.compose(pad_devanagari, months), + ) - graph_dd_mm = days_graph + delete_dash + months_graph + months_graph_numeric_padded = ( + pynutil.insert("month: \"") + months_numeric_padded + pynutil.insert("\"") + insert_space + ) + + months_fst_padded = pynini.union(months_numeric_padded, month_name_acceptor) + months_graph_padded = pynutil.insert("month: \"") + months_fst_padded + pynutil.insert("\"") + insert_space - graph_mm_dd = months_graph + delete_dash + days_graph + months_fst_bare = pynini.union(months_numeric_bare, month_name_acceptor) + months_graph_bare = pynutil.insert("month: \"") + months_fst_bare + pynutil.insert("\"") + insert_space - graph_mm_dd += pynutil.insert(" preserve_order: true ") + month_name_graph = pynutil.insert("month: \"") + month_name_acceptor + pynutil.insert("\"") + insert_space + + years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space - # Graph for era era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space range_graph = pynini.cross("-", "से") - # Graph for year century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं") century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space - # Updated logic to use suffix_union year_number = graph_year + suffix_union year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space - # Updated logic to use prefix_union - year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"") + year_prefix = pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + graph_year + pynutil.insert("\"") - delete_separator = pynini.union(delete_dash, delete_slash) - graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph + year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + graph_year + + suffix_union + + pynutil.insert("\"") + ) - graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph + graph_verbalized_year_suffix = ( + pynutil.insert("era: \"") + verbalized_year_sou + suffix_union + pynutil.insert("\"") + insert_space + ) - graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ") + graph_verbalized_year_bare = ( + pynutil.insert("era: \"") + verbalized_year_sou + pynutil.insert("\"") + insert_space + ) - graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph + graph_verbalized_year_prefix = ( + pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + verbalized_year_sou + pynutil.insert("\"") + ) - graph_year_suffix = era_graph + graph_verbalized_year_prefix_suffix = ( + pynutil.insert("era: \"") + + prefix_union + + pynini.accep(" ") + + verbalized_year_sou + + suffix_union + + pynutil.insert("\"") + ) + + graph_dd_mm = days_graph_padded + delete_dash + months_graph_padded + + graph_d_m = days_graph_bare + delete_dash + months_graph_bare + + graph_dd_mm_yyyy = days_graph_padded + delete_dash + months_graph_padded + delete_dash + years_graph + + graph_d_m_yyyy = days_graph_bare + delete_dash + months_graph_bare + delete_dash + years_graph + + graph_dd_month = days_graph_padded + delete_space + months_graph_numeric_padded + + graph_dd_month_comma_yyyy = ( + days_graph_padded + delete_space + months_graph_padded + delete_comma_sep + years_graph + ) + + graph_dd_month_comma_yyyy_era = ( + days_graph_padded + delete_space + months_graph_padded + delete_comma_sep + years_graph + era_graph + ) + + graph_month_comma_yyyy = months_graph_padded + delete_comma_sep + years_graph + + graph_month_comma_yyyy_era = months_graph_padded + delete_comma_sep + years_graph + era_graph + + graph_month_name_yyyy = month_name_graph + delete_space + years_graph + + graph_year_era_only = ( + pynutil.insert("era: \"") + + graph_year_era + + insert_space + + year_suffix + + pynutil.insert("\"") + + insert_space + ) graph_range = ( pynutil.insert("era: \"") @@ -126,21 +221,31 @@ def __init__(self, cardinal: GraphFst): + pynutil.insert(" preserve_order: true ") ) - # default assume dd_mm_yyyy + graph_year_suffix = era_graph final_graph = ( - pynutil.add_weight(graph_dd_mm, -0.001) - | graph_mm_dd + pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003) + | pynutil.add_weight(graph_month_comma_yyyy_era, -0.003) | pynutil.add_weight(graph_dd_mm_yyyy, -0.001) - | graph_mm_dd_yyyy - | pynutil.add_weight(graph_mm_yyyy, -0.2) - | pynutil.add_weight(graph_year_suffix, -0.001) + | pynutil.add_weight(graph_d_m_yyyy, -0.001) + | pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001) + | pynutil.add_weight(graph_dd_mm, -0.001) + | pynutil.add_weight(graph_d_m, -0.001) + | pynutil.add_weight(graph_dd_month, -0.001) + | pynutil.add_weight(graph_month_name_yyyy, -0.2) + | pynutil.add_weight(graph_month_comma_yyyy, -0.2) + | pynutil.add_weight(graph_year_era_only, -0.005) | pynutil.add_weight(graph_range, -0.005) + | pynutil.add_weight(graph_year_suffix, -0.001) | pynutil.add_weight(century_text, -0.001) - | pynutil.add_weight(year_text, -0.001) + | pynutil.add_weight(graph_verbalized_year_prefix_suffix, -0.012) + | pynutil.add_weight(graph_verbalized_year_prefix, -0.011) + | pynutil.add_weight(graph_verbalized_year_suffix, -0.010) + | pynutil.add_weight(graph_verbalized_year_bare, -0.009) + | pynutil.add_weight(year_prefix_suffix, -0.010) | pynutil.add_weight(year_prefix, -0.009) + | pynutil.add_weight(year_text, -0.001) ) self.final_graph = final_graph.optimize() - self.fst = self.add_tokens(self.final_graph) diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py index 3e1ded4b1..88cb04727 100644 --- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py @@ -121,7 +121,7 @@ def __init__( pynutil.add_weight(whitelist_graph, 1.01) | pynutil.add_weight(cardinal_graph, 1.1) | pynutil.add_weight(decimal_graph, 1.1) - | pynutil.add_weight(fraction_graph, 1.1) + | pynutil.add_weight(fraction_graph, 1.05) | pynutil.add_weight(date_graph, 1.1) | pynutil.add_weight(time_graph, 1.1) | pynutil.add_weight(measure_graph, 1.1) diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index 86f1f6678..97a71c9b7 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -1,20 +1,19 @@ -06-05~छः मई +06-05~छह मई ३१-०६~इकतीस जून 02-01~दो जनवरी ०४-०१~चार जनवरी -01-10~एक अक्टूबर +01-10~एक अक्टूबर १२-०७~बारह जुलाई -02-27~फ़रवरी सत्ताईस -०४-०३~चार मार्च +०४-०३~चार मार्च 25-03-2020~पच्चीस मार्च दो हज़ार बीस ३०-०५-२०७०~तीस मई दो हज़ार सत्तर -12-07-1970~बारह जुलाई उन्नीस सौ सत्तर ०९-१२-२१०१~नौ दिसंबर इक्कीस सौ एक 23-08-2024~तेईस अगस्त दो हज़ार चौबीस -१०-२९-२०००~अक्टूबर उनतीस दो हज़ार -11-14-1100~नवंबर चौदह ग्यारह सौ -०३-२०१०~मार्च दो हज़ार दस -11-2024~नवंबर दो हज़ार चौबीस +३ मार्च~तीन मार्च +६ मार्च, २०१०~छह मार्च दो हज़ार दस +३१ मई, १९९० ई.~इकतीस मई उन्नीस सौ नब्बे ईसवी +मार्च, २०२४~मार्च दो हज़ार चौबीस +जनवरी, १९९० ई.~जनवरी उन्नीस सौ नब्बे ईसवी २०७०~दो हज़ार सत्तर 2024~दो हज़ार चौबीस १२० ई. पू.~एक सौ बीस ईसा पूर्व @@ -31,4 +30,9 @@ सन 1999~सन उन्नीस सौ निन्यानबे सन् १९२०~सन् उन्नीस सौ बीस साल 1971~साल उन्नीस सौ इकहत्तर -१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक \ No newline at end of file +सन 1999 में~सन उन्नीस सौ निन्यानबे में +सन् उन्नीस सौ बीस~सन् उन्नीस सौ बीस +सन उन्नीस सौ बीस में~सन उन्नीस सौ बीस में +१९२०-२६ तक~उन्नीस सौ बीस से छब्बीस तक +2-7-1970~दो जुलाई उन्नीस सौ सत्तर +02-07-1970~दो जुलाई उन्नीस सौ सत्तर \ No newline at end of file