Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ pipeline {
MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/05-04-26-0'
DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
66 changes: 33 additions & 33 deletions nemo_text_processing/text_normalization/hi/data/date/days.tsv
Original file line number Diff line number Diff line change
@@ -1,40 +1,9 @@
०१ एक
०२ दो
०३ तीन
०४ चार
०५ पाँच
०६ छः
०७ सात
०८ आठ
०९ नौ
१० दस
११ ग्यारह
१२ बारह
१३ तेरह
१४ चौदह
१५ पंद्रह
१६ सोलह
१७ सत्रह
१८ अठारह
१९ उन्नीस
२० बीस
२१ इक्कीस
२२ बाईस
२३ तेईस
२४ चौबीस
२५ पच्चीस
२६ छब्बीस
२७ सत्ताईस
२८ अट्ठाईस
२९ उनतीस
३० तीस
३१ इकतीस
01 एक
02 दो
03 तीन
04 चार
05 पाँच
06 छः
06 छह
07 सात
08 आठ
09 नौ
Expand All @@ -59,4 +28,35 @@
28 अट्ठाईस
29 उनतीस
30 तीस
31 इकतीस
31 इकतीस
०१ एक
०२ दो
०३ तीन
०४ चार
०५ पाँच
०६ छह
०७ सात
०८ आठ
०९ नौ
१० दस
११ ग्यारह
१२ बारह
१३ तेरह
१४ चौदह
१५ पंद्रह
१६ सोलह
१७ सत्रह
१८ अठारह
१९ उन्नीस
२० बीस
२१ इक्कीस
२२ बाईस
२३ तेईस
२४ चौबीस
२५ पच्चीस
२६ छब्बीस
२७ सत्ताईस
२८ अट्ठाईस
२९ उनतीस
३० तीस
३१ इकतीस
28 changes: 14 additions & 14 deletions nemo_text_processing/text_normalization/hi/data/date/months.tsv
Original file line number Diff line number Diff line change
@@ -1,17 +1,5 @@
०१ जनवरी
०२ फ़रवरी
०३ मार्च
०४ अप्रैल
०५ मई
०६ जून
०७ जुलाई
०८ अगस्त
०९ सितंबर
१० अक्टूबर
११ नवंबर
१२ दिसंबर
01 जनवरी
02 फ़रवरी
02 फरवरी
03 मार्च
04 अप्रैल
05 मई
Expand All @@ -21,4 +9,16 @@
09 सितंबर
10 अक्टूबर
11 नवंबर
12 दिसंबर
12 दिसंबर
०१ जनवरी
०२ फरवरी
०३ मार्च
०४ अप्रैल
०५ मई
०६ जून
०७ जुलाई
०८ अगस्त
०९ सितंबर
१० अक्टूबर
११ नवंबर
१२ दिसंबर
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
सन्
सन
साल
सन्
सन
साल
दशक
171 changes: 138 additions & 33 deletions nemo_text_processing/text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,33 @@
teens_ties = pynini.union(teens_ties_hi, teens_ties_en)
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)

# Read suffixes from file into a list
with open(get_abs_path("data/date/suffixes.tsv"), "r", encoding="utf-8") as f:
suffixes_list = f.read().splitlines()
suffixes_list = [line.rstrip("\n") for line in f if line.strip()]
with open(get_abs_path("data/date/prefixes.tsv"), "r", encoding="utf-8") as f:
prefixes_list = f.read().splitlines()
prefixes_list = [line.rstrip("\n") for line in f if line.strip()]

# Create union of suffixes and prefixes
suffix_union = pynini.union(*suffixes_list)
prefix_union = pynini.union(*prefixes_list)

verbalized_hundreds = teens_ties_hi.project("output")
verbalized_unit = pynini.union(verbalized_hundreds, digit.project("output"))

verbalized_year_sou = (
verbalized_hundreds + pynini.accep(" सौ") + pynini.closure(pynini.accep(" ") + verbalized_unit, 0, 1)
)

pad_latin = pynini.union(*[pynini.cross(str(i), f"0{i}") for i in range(1, 10)])
pad_devanagari = pynini.union(*[pynini.cross(d, f"०{d}") for d in "१२३४५६७८९"])


class DateFst(GraphFst):
"""
Finite state transducer for classifying date, e.g.
"०१-०४-२०२४" -> date { day: "एक" month: "अप्रैल" year: "दो हज़ार चौबीस" }
"०४-०१-२०२४" -> date { month: "अप्रैल" day: "एक" year: "दो हज़ार चौबीस" }

"६ मार्च, २०१०" -> date { day: "छह" month: "मार्च" year: "दो हज़ार दस" }
"३१ मई, १९९० ई." -> date { day: "इकतीस" month: "मई" year: "उन्नीस सौ नब्बे" era: "ईसवी" }
"उन्नीस सौ बीस में" -> date { era: "उन्नीस सौ बीस में" }
"02-07-1970" -> date { day: "दो" month: "जुलाई" year: "उन्नीस सौ सत्तर" }

Args:
cardinal: cardinal GraphFst
Expand All @@ -68,52 +78,137 @@ def __init__(self, cardinal: GraphFst):
)

cardinal_graph = pynini.union(
digit, teens_and_ties, cardinal.graph_hundreds, graph_year_thousands, graph_year_hundreds_as_thousands
digit,
teens_and_ties,
cardinal.graph_hundreds,
graph_year_thousands,
graph_year_hundreds_as_thousands,
)

graph_year = pynini.union(graph_year_thousands, graph_year_hundreds_as_thousands)

graph_year_era = pynini.union(
graph_year_thousands,
graph_year_hundreds_as_thousands,
cardinal.graph_hundreds,
)

delete_dash = pynutil.delete("-")
delete_slash = pynutil.delete("/")
delete_comma = pynutil.delete(",")
delete_space = pynutil.delete(" ")
delete_optional_space = pynini.closure(pynutil.delete(" "), 0, 1)
delete_comma_sep = delete_comma + delete_optional_space

day_num_padded = pynini.union(
days,
teens_and_ties,
)

days_graph = pynutil.insert("day: \"") + days + pynutil.insert("\"") + insert_space
day_num_bare = pynini.union(
pynini.compose(pad_latin, days),
pynini.compose(pad_devanagari, days),
)

months_graph = pynutil.insert("month: \"") + months + pynutil.insert("\"") + insert_space
days_graph_padded = pynutil.insert("day: \"") + day_num_padded + pynutil.insert("\"") + insert_space
days_graph_bare = pynutil.insert("day: \"") + day_num_bare + pynutil.insert("\"") + insert_space

years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space
month_name_acceptor = pynini.project(months, "output")

months_numeric_padded = months

months_numeric_bare = pynini.union(
pynini.compose(pad_latin, months),
pynini.compose(pad_devanagari, months),
)

graph_dd_mm = days_graph + delete_dash + months_graph
months_graph_numeric_padded = (
pynutil.insert("month: \"") + months_numeric_padded + pynutil.insert("\"") + insert_space
)

months_fst_padded = pynini.union(months_numeric_padded, month_name_acceptor)
months_graph_padded = pynutil.insert("month: \"") + months_fst_padded + pynutil.insert("\"") + insert_space

graph_mm_dd = months_graph + delete_dash + days_graph
months_fst_bare = pynini.union(months_numeric_bare, month_name_acceptor)
months_graph_bare = pynutil.insert("month: \"") + months_fst_bare + pynutil.insert("\"") + insert_space

graph_mm_dd += pynutil.insert(" preserve_order: true ")
month_name_graph = pynutil.insert("month: \"") + month_name_acceptor + pynutil.insert("\"") + insert_space

years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space

# Graph for era
era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space

range_graph = pynini.cross("-", "से")

# Graph for year
century_number = pynini.compose(pynini.closure(NEMO_ALL_DIGIT, 1), cardinal_graph) + pynini.accep("वीं")
century_text = pynutil.insert("era: \"") + century_number + pynutil.insert("\"") + insert_space

# Updated logic to use suffix_union
year_number = graph_year + suffix_union
year_text = pynutil.insert("era: \"") + year_number + pynutil.insert("\"") + insert_space

# Updated logic to use prefix_union
year_prefix = pynutil.insert("era: \"") + prefix_union + insert_space + graph_year + pynutil.insert("\"")
year_prefix = pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + graph_year + pynutil.insert("\"")

delete_separator = pynini.union(delete_dash, delete_slash)
graph_dd_mm_yyyy = days_graph + delete_separator + months_graph + delete_separator + years_graph
year_prefix_suffix = (
pynutil.insert("era: \"")
+ prefix_union
+ pynini.accep(" ")
+ graph_year
+ suffix_union
+ pynutil.insert("\"")
)

graph_mm_dd_yyyy = months_graph + delete_separator + days_graph + delete_separator + years_graph
graph_verbalized_year_suffix = (
pynutil.insert("era: \"") + verbalized_year_sou + suffix_union + pynutil.insert("\"") + insert_space
)

graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")
graph_verbalized_year_bare = (
pynutil.insert("era: \"") + verbalized_year_sou + pynutil.insert("\"") + insert_space
)

graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph
graph_verbalized_year_prefix = (
pynutil.insert("era: \"") + prefix_union + pynini.accep(" ") + verbalized_year_sou + pynutil.insert("\"")
)

graph_year_suffix = era_graph
graph_verbalized_year_prefix_suffix = (
pynutil.insert("era: \"")
+ prefix_union
+ pynini.accep(" ")
+ verbalized_year_sou
+ suffix_union
+ pynutil.insert("\"")
)

graph_dd_mm = days_graph_padded + delete_dash + months_graph_padded

graph_d_m = days_graph_bare + delete_dash + months_graph_bare

graph_dd_mm_yyyy = days_graph_padded + delete_dash + months_graph_padded + delete_dash + years_graph

graph_d_m_yyyy = days_graph_bare + delete_dash + months_graph_bare + delete_dash + years_graph

graph_dd_month = days_graph_padded + delete_space + months_graph_numeric_padded

graph_dd_month_comma_yyyy = (
days_graph_padded + delete_space + months_graph_padded + delete_comma_sep + years_graph
)

graph_dd_month_comma_yyyy_era = (
days_graph_padded + delete_space + months_graph_padded + delete_comma_sep + years_graph + era_graph
)

graph_month_comma_yyyy = months_graph_padded + delete_comma_sep + years_graph

graph_month_comma_yyyy_era = months_graph_padded + delete_comma_sep + years_graph + era_graph

graph_month_name_yyyy = month_name_graph + delete_space + years_graph

graph_year_era_only = (
pynutil.insert("era: \"")
+ graph_year_era
+ insert_space
+ year_suffix
+ pynutil.insert("\"")
+ insert_space
)

graph_range = (
pynutil.insert("era: \"")
Expand All @@ -126,21 +221,31 @@ def __init__(self, cardinal: GraphFst):
+ pynutil.insert(" preserve_order: true ")
)

# default assume dd_mm_yyyy
graph_year_suffix = era_graph

final_graph = (
pynutil.add_weight(graph_dd_mm, -0.001)
| graph_mm_dd
pynutil.add_weight(graph_dd_month_comma_yyyy_era, -0.003)
| pynutil.add_weight(graph_month_comma_yyyy_era, -0.003)
| pynutil.add_weight(graph_dd_mm_yyyy, -0.001)
| graph_mm_dd_yyyy
| pynutil.add_weight(graph_mm_yyyy, -0.2)
| pynutil.add_weight(graph_year_suffix, -0.001)
| pynutil.add_weight(graph_d_m_yyyy, -0.001)
| pynutil.add_weight(graph_dd_month_comma_yyyy, -0.001)
| pynutil.add_weight(graph_dd_mm, -0.001)
| pynutil.add_weight(graph_d_m, -0.001)
| pynutil.add_weight(graph_dd_month, -0.001)
| pynutil.add_weight(graph_month_name_yyyy, -0.2)
| pynutil.add_weight(graph_month_comma_yyyy, -0.2)
| pynutil.add_weight(graph_year_era_only, -0.005)
| pynutil.add_weight(graph_range, -0.005)
| pynutil.add_weight(graph_year_suffix, -0.001)
| pynutil.add_weight(century_text, -0.001)
| pynutil.add_weight(year_text, -0.001)
| pynutil.add_weight(graph_verbalized_year_prefix_suffix, -0.012)
| pynutil.add_weight(graph_verbalized_year_prefix, -0.011)
| pynutil.add_weight(graph_verbalized_year_suffix, -0.010)
| pynutil.add_weight(graph_verbalized_year_bare, -0.009)
| pynutil.add_weight(year_prefix_suffix, -0.010)
| pynutil.add_weight(year_prefix, -0.009)
| pynutil.add_weight(year_text, -0.001)
)

self.final_graph = final_graph.optimize()

self.fst = self.add_tokens(self.final_graph)
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def __init__(
pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(cardinal_graph, 1.1)
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.05)
| pynutil.add_weight(date_graph, 1.1)
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
Expand Down
Loading