diff --git a/app/app.py b/app/app.py
index f1b7b70..d673ff2 100644
--- a/app/app.py
+++ b/app/app.py
@@ -4,7 +4,7 @@ from flask_swagger import swagger
 from flask_swagger_ui import get_swaggerui_blueprint
 from pkg_resources import resource_filename
 from .api_keys import Database
-from app.language import detect_languages
+from app.language import detect_languages, transliterate
 
 api_keys_db = None
 
@@ -285,9 +285,9 @@ def create_app(args):
 
         try:
             if batch:
-                return jsonify({"translatedText": [translator.translate(text) for text in q] })
+                return jsonify({"translatedText": [translator.translate(transliterate(text, target_lang=source_lang)) for text in q] })
             else:
-                return jsonify({"translatedText": translator.translate(q) })
+                return jsonify({"translatedText": translator.translate(transliterate(q, target_lang=source_lang)) })
         except Exception as e:
             abort(500, description="Cannot translate text: %s" % str(e))
 
diff --git a/app/language.py b/app/language.py
index 17d1822..e6f3613 100644
--- a/app/language.py
+++ b/app/language.py
@@ -1,5 +1,8 @@
+import string
+
 from argostranslate import translate
 from polyglot.detect.base import Detector
+from polyglot.transliteration.base import Transliterator
 
 languages = translate.load_installed_languages()
 
@@ -68,3 +71,54 @@ def detect_languages(text):
         }
         for l in candidate_langs
     ]
+
+
+def __transliterate_line(transliterator, line_text):
+    """Transliterate one line word by word, preserving edge punctuation.
+
+    Each space-separated word has its leading and trailing punctuation
+    peeled off, the remaining core is transliterated, and the exact
+    punctuation runs are put back in their original order. A word whose
+    transliteration comes back empty is kept unchanged.
+    """
+    new_text = []
+
+    # transliteration is done word by word
+    for orig_word in line_text.split(" "):
+        # split the word into leading punctuation, core and trailing
+        # punctuation; slicing (rather than a set difference) keeps
+        # repeated marks such as "?!" or "..." intact and in order
+        l_word = orig_word.lstrip(string.punctuation)
+        prefix = orig_word[:len(orig_word) - len(l_word)]
+        core = l_word.rstrip(string.punctuation)
+        suffix = l_word[len(core):]
+
+        # the actual transliteration of the word; nothing to do when
+        # the word is empty or made of punctuation only
+        t_word = transliterator.transliterate(core) if core else ""
+
+        # if transliteration fails, default back to the original word,
+        # otherwise add back the stripped punctuation
+        new_text.append(prefix + t_word + suffix if t_word else orig_word)
+
+    # rebuild the text
+    return " ".join(new_text)
+
+
+def transliterate(text, target_lang="en"):
+    """Transliterate ``text`` with polyglot, keeping its line structure.
+
+    :param text: string to transliterate; may span several lines
+    :param target_lang: language code handed to polyglot's
+        ``Transliterator`` as ``target_lang``
+    :return: the transliterated text, one output line per input line
+    """
+    # initialize the transliterator from polyglot
+    transliterator = Transliterator(target_lang=target_lang)
+
+    # process each line separately; a single-line string is simply
+    # the one-element case of the same split/join
+    return "\n".join(
+        __transliterate_line(transliterator, line)
+        for line in text.split("\n")
+    )