1
0
Fork 0
mirror of https://github.com/mealie-recipes/mealie.git synced 2025-07-19 13:19:41 +02:00
mealie/mealie/services/parser_services/crfpp/tokenizer.py
Hayden 60908e5a88
Feature/CRF++ and server side locales (#731)
* add universal toast plugin

* add server side locales

* integrate CRF++ into CI/CD Pipeline

* docs(docs): 📝 add recipe parser docs

* feat(backend):  Continued work on ingredient parsers

* add new model dest

* feat(frontend):  New ingredient parser page

* formatting

Co-authored-by: Hayden <hay-kot@pm.me>
2021-10-09 13:08:23 -08:00

38 lines
1.5 KiB
Python

import re
def clumpFractions(s: str) -> str:
    """
    Join the integer and fractional parts of a mixed number into one token.

    The whitespace between the integer and the fraction is replaced with a
    dollar sign, so the quantity is interpreted as a single token when the
    string is later split on whitespace. The rest of the string is left alone.

        clumpFractions("aaa 1 2/3 bbb")
        # => "aaa 1$2/3 bbb"
        clumpFractions("1 11/16 inch")
        # => "1$11/16 inch"
    """
    # Numerator and denominator may each span several digits (e.g. 11/16);
    # consuming the whole denominator also stops its trailing digit from being
    # mistaken for the integer part of a following fraction.
    return re.sub(r"(\d+)\s+(\d+)/(\d+)", r"\1$\2/\3", s)
def tokenize(s):
    """
    Tokenize on parenthesis, punctuation, spaces and American units followed by a slash.

    We sometimes give American units and metric units for baking recipes. For example:
        * 2 tablespoons/30 milliliters milk or cream
        * 2 1/2 cups/300 grams all-purpose flour
    The recipe database only allows for one unit, and we want to use the American one.
    But we must split the text on "cups/" etc. in order to pick it up.

    Returns a list of non-empty tokens. Commas and parentheses are kept as
    their own tokens; whitespace is dropped. Mixed numbers such as "2 1/2"
    are kept together as a single "2$1/2" token (see clumpFractions).
    """
    # Handle abbreviations like "100g" by treating them as "100 grams".
    # The trailing \b restricts the match to the bare abbreviation, so words
    # that merely start with it ("1gallon", "100grams", "2ozs") are not mangled.
    s = re.sub(r"(\d+)g\b", r"\1 grams", s)
    s = re.sub(r"(\d+)oz\b", r"\1 ounces", s)
    s = re.sub(r"(\d+)ml\b", r"\1 milliliters", s, flags=re.IGNORECASE)

    # TODO: Replace american_units with list of units from database?
    american_units = ["cup", "tablespoon", "teaspoon", "pound", "ounce", "quart", "pint"]

    # Replace the slash that follows an American unit (singular or plural)
    # with a space, so the unit becomes a token of its own.
    for unit in american_units:
        s = s.replace(unit + "/", unit + " ")
        s = s.replace(unit + "s/", unit + "s ")

    # The capturing group makes re.split keep the separators; whitespace
    # separators strip down to "" and are discarded together with the empty
    # strings produced between adjacent separators.
    pieces = (piece.strip() for piece in re.split(r"([,()\s])", clumpFractions(s)))
    return [piece for piece in pieces if piece]