Spaces:

Dionyssos
/

speech-analysis2

Running

App Files Files Community

Dionyssos commited on 18 days ago

Commit

0213c67

1 Parent(s): 61d3afa

shift TTS

Browse files

Files changed (2) hide show

app.py +558 -0
audionar.py +623 -0

app.py CHANGED Viewed

@@ -16,6 +16,538 @@ import textwrap
 from tts import StyleTTS2
 import audresample
 device = 0 if torch.cuda.is_available() else "cpu"
 duration = 2  # limit processing of audio
@@ -582,4 +1114,30 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
         submit_btn.click(recognize, input, outputs)
 demo.launch(debug=True)

 from tts import StyleTTS2
 import audresample
+# --
+# -*- coding: utf-8 -*-
+# https://huggingface.co/spaces/dpc/mmstts/tree/main
+# https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
+import json
+import soundfile
+import re
+import unicodedata
+import gradio as gr
+import textwrap
+import numpy as np
+import torch
+import nltk
+from num2words import num2words
+from num2word_greek.numbers2words import convert_numbers
+from vits import VitsModel, VitsTokenizer
+nltk.download('punkt', download_dir='./')
+nltk.download('punkt_tab', download_dir='./')
+nltk.data.path.append('.')
+device = 'cpu'
+def fix_vocals(text, lang='ron'):
+    # Longer phrases should come before shorter ones to prevent partial matches.
+    ron_replacements = {
+        'ţ': 'ț',
+        'ț': 'ts',
+        'î': 'u',
+        'â': 'a',
+        'ş': 's',
+        'w': 'oui',
+        'k': 'c',
+        'l': 'll',
+        # Math symbols
+        'sqrt': ' rădăcina pătrată din ',
+        '^': ' la puterea ',
+        '+': ' plus ',
+        ' - ': ' minus ',  # only replace if standalone so to not say minus if is a-b-c
+        '*': ' ori ',  # times
+        '/': ' împărțit la ',  # divided by
+        '=': ' egal cu ',  # equals
+        'pi': ' pi ',
+        '<': ' mai mic decât ',
+        '>': ' mai mare decât',
+        '%': ' la sută ', # percent (from previous)
+        '(': ' paranteză deschisă ',
+        ')': ' paranteză închisă ',
+        '[': ' paranteză pătrată deschisă ',
+        ']': ' paranteză pătrată închisă ',
+        '{': ' acoladă deschisă ',
+        '}': ' acoladă închisă ',
+        '≠': ' nu este egal cu ',
+        '≤': ' mai mic sau egal cu ',
+        '≥': ' mai mare sau egal cu ',
+        '≈': ' aproximativ ',
+        '∞': ' infinit ',
+        '€': ' euro ',
+        '$': ' dolar ',
+        '£': ' liră ',
+        '&': ' și ',  # and
+        '@': ' la ',  # at
+        '#': ' diez ',  # hash
+        '∑': ' sumă ',
+        '∫': ' integrală ',
+        '√': ' rădăcina pătrată a ', # more generic square root
+    }
+    eng_replacements = {
+        'wik': 'weaky',
+        'sh': 'ss',
+        'ch': 'ttss',
+        'oo': 'oeo',
+        # Math symbols for English
+        'sqrt': ' square root of ',
+        '^': ' to the power of ',
+        '+': ' plus ',
+        ' - ': ' minus ',
+        '*': ' times ',
+        ' / ': ' divided by ',
+        '=': ' equals ',
+        'pi': ' pi ',
+        '<': ' less than ',
+        '>': ' greater than ',
+        # Additional common math symbols from previous list
+        '%': ' percent ',
+        '(': ' open parenthesis ',
+        ')': ' close parenthesis ',
+        '[': ' open bracket ',
+        ']': ' close bracket ',
+        '{': ' open curly brace ',
+        '}': ' close curly brace ',
+        '∑': ' sum ',
+        '∫': ' integral ',
+        '√': ' square root of ',
+        '≠': ' not equals ',
+        '≤': ' less than or equals ',
+        '≥': ' greater than or equals ',
+        '≈': ' approximately ',
+        '∞': ' infinity ',
+        '€': ' euro ',
+        '$': ' dollar ',
+        '£': ' pound ',
+        '&': ' and ',
+        '@': ' at ',
+        '#': ' hash ',
+    }
+    serbian_replacements = {
+        'rn': 'rrn',
+        'ć': 'č',
+        'c': 'č',
+        'đ': 'd',
+        'j': 'i',
+        'l': 'lll',
+        'w': 'v',
+        #  https://huggingface.co/facebook/mms-tts-rmc-script_latin
+        'sqrt': 'kvadratni koren iz',
+        '^': ' na stepen ',
+        '+': ' plus ',
+        ' - ': ' minus ',
+        '*': ' puta ',
+        ' / ': ' podeljeno sa ',
+        '=': ' jednako ',
+        'pi': ' pi ',
+        '<': ' manje od ',
+        '>': ' veće od ',
+        '%': ' procenat ',
+        '(': ' otvorena zagrada ',
+        ')': ' zatvorena zagrada ',
+        '[': ' otvorena uglasta zagrada ',
+        ']': ' zatvorena uglasta zagrada ',
+        '{': ' otvorena vitičasta zagrada ',
+        '}': ' zatvorena vitičasta zagrada ',
+        '∑': ' suma ',
+        '∫': ' integral ',
+        '√': ' kvadratni koren ',
+        '≠': ' nije jednako ',
+        '≤': ' manje ili jednako od ',
+        '≥': ' veće ili jednako od ',
+        '≈': ' približno ',
+        '∞': ' beskonačnost ',
+        '€': ' evro ',
+        '$': ' dolar ',
+        '£': ' funta ',
+        '&': ' i ',
+        '@': ' et ',
+        '#': ' taraba ',
+        # Others
+        #     'rn': 'rrn',
+        # 'ć': 'č',
+        # 'c': 'č',
+        # 'đ': 'd',
+        # 'l': 'le',
+        # 'ij': 'i',
+        # 'ji': 'i',
+        # 'j': 'i',
+        # 'služ': 'sloooozz',  # 'službeno'
+        # 'suver': 'siuveeerra',  # 'suverena'
+        # 'država': 'dirrezav',  # 'država'
+        # 'iči': 'ici',  # 'Graniči'
+        # 's ': 'se',  # a s with space
+        # 'q': 'ku',
+        # 'w': 'aou',
+        # 'z': 's',
+        # "š": "s",
+        # 'th': 'ta',
+        # 'v': 'vv',
+        # "ć": "č",
+        # "đ": "ď",
+        # "lj": "ľ",
+        # "nj": "ň",
+        # "ž": "z",
+        # "c": "č"
+    }
+    deu_replacements = {
+        'sch': 'sh',
+        'ch': 'kh',
+        'ie': 'ee',
+        'ei': 'ai',
+        'ä': 'ae',
+        'ö': 'oe',
+        'ü': 'ue',
+        'ß': 'ss',
+        # Math symbols for German
+        'sqrt': ' Quadratwurzel aus ',
+        '^': ' hoch ',
+        '+': ' plus ',
+        ' - ': ' minus ',
+        '*': ' mal ',
+        ' / ': ' geteilt durch ',
+        '=': ' gleich ',
+        'pi': ' pi ',
+        '<': ' kleiner als ',
+        '>': ' größer als',
+        # Additional common math symbols from previous list
+        '%': ' prozent ',
+        '(': ' Klammer auf ',
+        ')': ' Klammer zu ',
+        '[': ' eckige Klammer auf ',
+        ']': ' eckige Klammer zu ',
+        '{': ' geschweifte Klammer auf ',
+        '}': ' geschweifte Klammer zu ',
+        '∑': ' Summe ',
+        '∫': ' Integral ',
+        '√': ' Quadratwurzel ',
+        '≠': ' ungleich ',
+        '≤': ' kleiner oder gleich ',
+        '≥': ' größer oder gleich ',
+        '≈': ' ungefähr ',
+        '∞': ' unendlich ',
+        '€': ' euro ',
+        '$': ' dollar ',
+        '£': ' pfund ',
+        '&': ' und ',
+        '@': ' at ', # 'Klammeraffe' is also common but 'at' is simpler
+        '#': ' raute ',
+    }
+    fra_replacements = {
+        # French specific phonetic replacements (add as needed)
+        # e.g., 'ç': 's', 'é': 'e', etc.
+        'w': 'v',
+        # Math symbols for French
+        'sqrt': ' racine carrée de ',
+        '^': ' à la puissance ',
+        '+': ' plus ',
+        ' - ': ' moins ',  # tiré ;
+        '*': ' fois ',
+        ' / ': ' divisé par ',
+        '=': ' égale ',
+        'pi': ' pi ',
+        '<': ' inférieur à ',
+        '>': ' supérieur à ',
+        # Add more common math symbols as needed for French
+        '%': ' pour cent ',
+        '(': ' parenthèse ouverte ',
+        ')': ' parenthèse fermée ',
+        '[': ' crochet ouvert ',
+        ']': ' crochet fermé ',
+        '{': ' accolade ouverte ',
+        '}': ' accolade fermée ',
+        '∑': ' somme ',
+        '∫': ' intégrale ',
+        '√': ' racine carrée ',
+        '≠': ' n\'égale pas ',
+        '≤': ' inférieur ou égal à ',
+        '≥': ' supérieur ou égal à ',
+        '≈': ' approximativement ',
+        '∞': ' infini ',
+        '€': ' euro ',
+        '$': ' dollar ',
+        '£': ' livre ',
+        '&': ' et ',
+        '@': ' arobase ',
+        '#': ' dièse ',
+    }
+    hun_replacements = {
+        # Hungarian specific phonetic replacements (add as needed)
+        # e.g., 'á': 'a', 'é': 'e', etc.
+        'ch': 'ts',
+        'cs': 'tz',
+        'g': 'gk',
+        'w': 'v',
+        'z': 'zz',
+        # Math symbols for Hungarian
+        'sqrt': ' négyzetgyök ',
+        '^': ' hatvány ',
+        '+': ' plusz ',
+        ' - ': ' mínusz ',
+        '*': ' szorozva ',
+        ' / ': ' osztva ',
+        '=': ' egyenlő ',
+        'pi': ' pi ',
+        '<': ' kisebb mint ',
+        '>': ' nagyobb mint ',
+        # Add more common math symbols as needed for Hungarian
+        '%': ' százalék ',
+        '(': ' nyitó zárójel ',
+        ')': ' záró zárójel ',
+        '[': ' nyitó szögletes zárójel ',
+        ']': ' záró szögletes zárójel ',
+        '{': ' nyitó kapcsos zárójel ',
+        '}': ' záró kapcsos zárójel ',
+        '∑': ' szumma ',
+        '∫': ' integrál ',
+        '√': ' négyzetgyök ',
+        '≠': ' nem egyenlő ',
+        '≤': ' kisebb vagy egyenlő ',
+        '≥': ' nagyobb vagy egyenlő ',
+        '≈': ' körülbelül ',
+        '∞': ' végtelen ',
+        '€': ' euró ',
+        '$': ' dollár ',
+        '£': ' font ',
+        '&': ' és ',
+        '@': ' kukac ',
+        '#': ' kettőskereszt ',
+    }
+    grc_replacements = {
+        # Ancient Greek specific phonetic replacements (add as needed)
+        # These are more about transliterating Greek letters if they are in the input text.
+        # Math symbols for Ancient Greek (literal translations)
+        'sqrt': ' τετραγωνικὴ ῥίζα ',
+        '^': ' εἰς τὴν δύναμιν ',
+        '+': ' σὺν ',
+        ' - ': ' χωρὶς ',
+        '*': ' πολλάκις ',
+        ' / ': ' διαιρέω ',
+        '=': ' ἴσον ',
+        'pi': ' πῖ ',
+        '<': ' ἔλαττον ',
+        '>': ' μεῖζον ',
+        # Add more common math symbols as needed for Ancient Greek
+        '%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
+        '(': ' ἀνοικτὴ παρένθεσις ',
+        ')': ' κλειστὴ παρένθεσις ',
+        '[': ' ἀνοικτὴ ἀγκύλη ',
+        ']': ' κλειστὴ ἀγκύλη ',
+        '{': ' ἀνοικτὴ σγουρὴ ἀγκύλ�� ',
+        '}': ' κλειστὴ σγουρὴ ἀγκύλη ',
+        '∑': ' ἄθροισμα ',
+        '∫': ' ὁλοκλήρωμα ',
+        '√': ' τετραγωνικὴ ῥίζα ',
+        '≠': ' οὐκ ἴσον ',
+        '≤': ' ἔλαττον ἢ ἴσον ',
+        '≥': ' μεῖζον ἢ ἴσον ',
+        '≈': ' περίπου ',
+        '∞': ' ἄπειρον ',
+        '€': ' εὐρώ ',
+        '$': ' δολάριον ',
+        '£': ' λίρα ',
+        '&': ' καὶ ',
+        '@': ' ἀτ ', # at
+        '#': ' δίεση ', # hash
+    }
+    # Select the appropriate replacement dictionary based on the language
+    replacements_map = {
+        'grc': grc_replacements,
+        'ron': ron_replacements,
+        'eng': eng_replacements,
+        'deu': deu_replacements,
+        'fra': fra_replacements,
+        'hun': hun_replacements,
+        'rmc-script_latin': serbian_replacements,
+    }
+    current_replacements = replacements_map.get(lang)
+    if current_replacements:
+        # Sort replacements by length of the key in descending order.
+        # This is crucial for correctly replacing multi-character strings (like 'sqrt', 'sch')
+        # before their shorter substrings ('s', 'ch', 'q', 'r', 't').
+        sorted_replacements = sorted(current_replacements.items(), key=lambda item: len(item[0]), reverse=True)
+        for old, new in sorted_replacements:
+            text = text.replace(old, new)
+        return text
+    else:
+        # If the language is not supported, return the original text
+        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
+        return text
+def _num2words(text='01234', lang=None):
+    if lang == 'grc':
+        return convert_numbers(text)
+    return num2words(text, lang=lang)  # HAS TO BE kwarg lang=lang
+def transliterate_number(number_string,
+                         lang=None):
+    if lang == 'rmc-script_latin':
+        lang = 'sr'
+        exponential_pronoun = ' puta deset na stepen od '
+        comma = ' tačka '
+    elif lang == 'ron':
+        lang = 'ro'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgulă '
+    elif lang == 'hun':
+        lang = 'hu'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgula '
+    elif lang == 'deu':
+        exponential_pronoun = ' mal zehn hoch '
+        comma = ' komma '
+    elif lang == 'fra':
+        lang = 'fr'
+        exponential_pronoun = ' puissance '
+        comma = 'virgule'
+    elif lang == 'grc':
+        exponential_pronoun = ' εις την δυναμην του '
+        comma = 'κομμα'
+    else:
+        lang = lang[:2]
+        exponential_pronoun = ' times ten to the power of '
+        comma = ' point '
+    def replace_number(match):
+        prefix = match.group(1) or ""
+        number_part = match.group(2)
+        suffix = match.group(5) or ""
+        try:
+            if 'e' in number_part.lower():
+                base, exponent = number_part.lower().split('e')
+                words = _num2words(base, lang=lang) + exponential_pronoun + _num2words(exponent, lang=lang)
+            elif '.' in number_part:
+                integer_part, decimal_part = number_part.split('.')
+                words = _num2words(integer_part, lang=lang) + comma + " ".join(
+                    [_num2words(digit, lang=lang) for digit in decimal_part])
+            else:
+                words = _num2words(number_part, lang=lang)
+            return prefix + words + suffix
+        except ValueError:
+            return match.group(0)  # Return original if conversion fails
+    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
+    return re.sub(pattern, replace_number, number_string)
+language_names = ['Ancient greek',
+                  'English',
+                  'Deutsch',
+                  'French',
+                  'Hungarian',
+                  'Romanian',
+                  'Serbian (Approx.)']
+def audionar_tts(text=None,
+                 lang='romanian'):
+    # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
+    lang = lang.lower()
+    # https://huggingface.co/spaces/mms-meta/MMS
+    if 'hun' in lang:
+        lang_code = 'hun'
+    elif any([i in lang for i in ['ser', 'bosn', 'herzegov', 'montenegr', 'macedon']]):
+        # romani carpathian (has also Vlax) - cooler voice
+        lang_code = 'rmc-script_latin'
+    elif 'rom' in lang:
+        lang_code = 'ron'
+    elif 'ger' in lang or 'deu' in lang or 'allem' in lang:
+        lang_code = 'deu'
+    elif 'french' in lang:
+        lang_code = 'fra'
+    elif 'eng' in lang:
+        lang_code = 'eng'
+    elif 'ancient greek' in lang:
+        lang_code = 'grc'
+    else:
+        lang_code = lang.split()[0].strip()   # latin & future option
+    # LATIN / GRC / CYRILLIC
+    text = only_greek_or_only_latin(text, lang=lang_code)  # assure gr-chars if lang=='grc' / latin if lang!='grc'
+    # NUMERALS (^ in math expression found & substituted here before arriving to fix_vocals)
+    text = transliterate_number(text, lang=lang_code)
+    # PRONOUNC.
+    text = fix_vocals(text, lang=lang_code)
+    # VITS
+    global cached_lang_code, cached_net_g, cached_tokenizer
+    if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
+        cached_lang_code = lang_code
+        cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
+        cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
+    net_g = cached_net_g
+    tokenizer = cached_tokenizer
+    total_audio = []
+    if not isinstance(text, list):
+        text = textwrap.wrap(text, width=439)
+    for _t in text:
+        inputs = tokenizer(_t, return_tensors="pt")
+        with torch.no_grad():
+            x = net_g(input_ids=inputs.input_ids.to(device),
+                      attention_mask=inputs.attention_mask.to(device),
+                      lang_code=lang_code,
+                      )[0, :]
+            total_audio.append(x)
+        print(f'\n\n_______________________________ {_t} {x.shape=}')
+    x = torch.cat(total_audio).cpu().numpy()
+    tmp_file = f'_speech.wav'
+    soundfile.write(tmp_file, x, 16000)
+    return tmp_file
+# --
 device = 0 if torch.cuda.is_available() else "cpu"
 duration = 2  # limit processing of audio
         submit_btn.click(recognize, input, outputs)
+    with gr.Tab("audionar TTS"):
+            with gr.Row():
+                text_input = gr.Textbox(
+                    lines=4,
+                    value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
+                    label="Type text for TTS"
+                )
+                lang_dropdown = gr.Dropdown(
+                    choices=language_names,
+                    label="TTS language",
+                    value="Ancient greek",
+                )
+            # Create a button to trigger the TTS function
+            tts_button = gr.Button("Generate Audio")
+            # Create the output audio component
+            audio_output = gr.Audio(label="Generated Audio")
+            # Link the button click event to the mms_tts function
+            tts_button.click(
+                fn=audionar_tts,
+                inputs=[text_input, lang_dropdown],
+                outputs=audio_output
+            )
 demo.launch(debug=True)

audionar.py ADDED Viewed

	@@ -0,0 +1,623 @@

+import math
+import numpy as np
+import torch
+from torch import nn
+from transformers.modeling_utils import PreTrainedModel
+from transformers.configuration_utils import PretrainedConfig
+import json
+import os
+import re
+from transformers.tokenization_utils import PreTrainedTokenizer
+import phonemizer
+import torch.nn.functional as F
+OSCILLATION = {
+        'deu': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'rmc-script_latin': [2, 2, 1, 2, 2],
+        'hun': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'fra': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'eng': [1, 2, 2, 1, 2, 2],
+        'grc': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
+        'ron': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2],
+    }
+def has_non_roman_characters(input_string):
+    # Find any character outside the ASCII range
+    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
+    # Search the input string for non-Roman characters
+    match = non_roman_pattern.search(input_string)
+    has_non_roman = match is not None
+    return has_non_roman
+class VitsConfig(PretrainedConfig):
+    model_type = "vits"
+    def __init__(
+        self,
+        vocab_size=38,
+        hidden_size=192,
+        num_hidden_layers=6,
+        num_attention_heads=2,
+        use_bias=True,
+        ffn_dim=768,
+        ffn_kernel_size=3,
+        flow_size=192,
+        # hidden_act="relu",
+        upsample_initial_channel=512,
+        upsample_rates=[8, 8, 2, 2],
+        upsample_kernel_sizes=[16, 16, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        prior_encoder_num_flows=4,
+        prior_encoder_num_wavenet_layers=4,
+        wavenet_kernel_size=5,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_bias = use_bias
+        self.ffn_dim = ffn_dim
+        self.ffn_kernel_size = ffn_kernel_size
+        self.flow_size = flow_size
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.prior_encoder_num_flows = prior_encoder_num_flows
+        self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
+        self.wavenet_kernel_size = wavenet_kernel_size
+        super().__init__()
+class VitsWaveNet(torch.nn.Module):
+    def __init__(self, config, num_layers):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.num_layers = num_layers
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        # if hasattr(nn.utils.parametrizations, "weight_norm"):
+        #     # raise ValueError
+        weight_norm = nn.utils.parametrizations.weight_norm
+        # else:
+        #     raise ValueError
+        #     # weight_norm = nn.utils.weight_norm
+        for i in range(num_layers):
+            in_layer = torch.nn.Conv1d(
+                in_channels=config.hidden_size,
+                out_channels=2 * config.hidden_size,
+                kernel_size=config.wavenet_kernel_size,
+                dilation=1,
+                padding=2,
+            )
+            in_layer = weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+            # last one is not necessary
+            if i < num_layers - 1:
+                res_skip_channels = 2 * config.hidden_size
+            else:
+                res_skip_channels = config.hidden_size
+            res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
+            res_skip_layer = weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+    def forward(self,
+                inputs):
+        outputs = torch.zeros_like(inputs)
+        num_channels = torch.IntTensor([self.hidden_size])[0]
+        for i in range(self.num_layers):
+            in_act = self.in_layers[i](inputs)
+            # global_states = torch.zeros_like(hidden_states)  # style ?
+            # acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
+            # --
+            # def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
+            # in_act = input_a #  + input_b
+            t_act = torch.tanh(in_act[:, :num_channels, :])
+            s_act = torch.sigmoid(in_act[:, num_channels:, :])
+            acts = t_act * s_act
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.num_layers - 1:
+                res_acts = res_skip_acts[:, : self.hidden_size, :]
+                inputs = inputs + res_acts
+                outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
+            else:
+                outputs = outputs + res_skip_acts
+        return outputs
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+        super().__init__()
+        self.leaky_relu_slope = leaky_relu_slope
+        self.convs1 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation[i],
+                    padding=self.get_padding(kernel_size, dilation[i]),
+                )
+                for i in range(len(dilation))
+            ]
+        )
+        self.convs2 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=1,
+                    padding=self.get_padding(kernel_size, 1),
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+    def get_padding(self, kernel_size, dilation=1):
+        # 1, 3, 5, 15
+        return (kernel_size * dilation - dilation) // 2
+    def forward(self, hidden_states):
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            residual = hidden_states
+            hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
+            hidden_states = conv1(hidden_states)
+            hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
+            hidden_states = conv2(hidden_states)
+            hidden_states = hidden_states + residual
+        return hidden_states
+class VitsHifiGan(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.conv_pre = nn.Conv1d(
+            config.flow_size,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+        )
+        self.upsampler = nn.ModuleList()
+        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            self.upsampler.append(
+                nn.ConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    kernel_size=kernel_size,
+                    stride=upsample_rate,
+                    padding=(kernel_size - upsample_rate) // 2,
+                )
+            )
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.upsampler)):
+            channels = config.upsample_initial_channel // (2 ** (i + 1))
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation))
+        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
+    def forward(self,
+                spectrogram):
+        hidden_states = self.conv_pre(spectrogram)
+        for i in range(self.num_upsamples):
+            hidden_states = F.leaky_relu(hidden_states, negative_slope=.1, inplace=True)
+            hidden_states = self.upsampler[i](hidden_states)
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels
+        hidden_states = F.leaky_relu(hidden_states, negative_slope=.01, inplace=True)
+        hidden_states = self.conv_post(hidden_states)
+        waveform = torch.tanh(hidden_states)
+        return waveform
+class VitsResidualCouplingLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.half_channels = config.flow_size // 2
+        self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
+        self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
+        self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
+    def forward(self,
+                x,
+                reverse=False):
+        first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
+        hidden_states = self.conv_pre(first_half)
+        hidden_states = self.wavenet(hidden_states)
+        mean = self.conv_post(hidden_states)
+        second_half = (second_half - mean)
+        outputs = torch.cat([first_half, second_half], dim=1)
+        return outputs
+class VitsResidualCouplingBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(config.prior_encoder_num_flows):
+            self.flows.append(VitsResidualCouplingLayer(config))
+    def forward(self, x, reverse=False):
+        # x L [1, 192, 481]
+        for flow in reversed(self.flows):
+            x = torch.flip(x, [1])  # flipud CHANNELs
+            x = flow(x, reverse=True)
+        return x
+class VitsAttention(nn.Module):
+    """has no positional info"""
+    def __init__(self, config):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.scaling = self.head_dim**-0.5
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
+    def _shape(self, tensor, seq_len, bsz):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+        self,
+        hidden_states,
+        layer_head_mask = None,
+        output_attentions = False,
+    ):
+        bsz, tgt_len, _ = hidden_states.size()
+        # Q
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # K/V
+        hidden_states = hidden_states[:, :40, :]  # drop time-frames from k/v [bs*2, time, 96=ch]
+        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_output = torch.bmm(attn_weights,
+                                value_states)
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned aross GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+        attn_output = self.out_proj(attn_output)
+        return attn_output
+class VitsFeedForward(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size, padding=1)
+        self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size, padding=1)
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = F.relu(self.conv_1(hidden_states))  # inplace changes sound ;
+        hidden_states = self.conv_2(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
+        return hidden_states
+class VitsEncoderLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attention = VitsAttention(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
+        self.feed_forward = VitsFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
+    def forward(
+        self,
+        hidden_states,
+        output_attentions = False,
+    ):
+        residual = hidden_states
+        hidden_states = self.attention(
+            hidden_states=hidden_states,
+            # attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.layer_norm(residual + hidden_states)
+        residual = hidden_states
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = self.final_layer_norm(residual + hidden_states)
+        outputs = (hidden_states,)
+        return outputs
+class VitsEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+    def forward(
+        self,
+        hidden_states):
+        for _layer in self.layers:
+            layer_outputs = _layer(hidden_states)
+            hidden_states = layer_outputs[0]
+        return hidden_states
+class VitsTextEncoder(nn.Module):
+    """
+    Has VitsEncoder
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.encoder = VitsEncoder(config)  # 6 Layers of VitsAttention
+        self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
+    def forward(self,
+                input_ids
+                ):
+        hidden_states = self.embed_tokens(input_ids) * 4      #Actually4-or-4.856406460551018-@-845-len-ids-deu
+        stats = self.project(self.encoder(hidden_states=hidden_states).transpose(1, 2)).transpose(1, 2)
+        return stats[:, :, :self.config.flow_size]  # prior_means
+class VitsPreTrainedModel(PreTrainedModel):
+    config_class = VitsConfig
+    base_model_prefix = "vits"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = True
+class VitsModel(VitsPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.text_encoder = VitsTextEncoder(config)  # has VitsEncoder that includes 6L of VitsAttention
+        self.flow = VitsResidualCouplingBlock(config)
+        self.decoder = VitsHifiGan(config)
+    def forward(
+        self,
+        input_ids = None,
+        attention_mask = None,
+        speaker_id = None,
+        output_attentions = None,
+        output_hidden_states = None,
+        return_dict = None,
+        labels = None,
+        speed = None,
+        lang_code = 'deu',  # speed oscillation pattern per voice/lang
+    ):
+        mask_dtype = self.text_encoder.embed_tokens.weight.dtype
+        if attention_mask is not None:
+            input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
+        else:
+            raise ValueError
+            input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
+        prior_means = self.text_encoder(input_ids=input_ids)
+        input_padding_mask = input_padding_mask.transpose(1, 2)
+        bs, in_len, _ = prior_means.shape
+        # VITS Duration Oscillation
+        pattern = OSCILLATION.get(lang_code, [1, 2, 1])
+        duration = torch.tensor(pattern,
+                                device=prior_means.device).repeat(int(in_len / len(pattern)) + 2)[None, None, :in_len]   # perhaps define [1, 2, 1] per voice or language
+        duration[:, :, 0] = 4
+        duration[:, :, -1] = 3
+        # ATTN
+        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
+        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
+        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
+        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
+        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
+        batch_size, _, output_length, input_length = attn_mask.shape
+        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
+        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
+        valid_indices = indices.unsqueeze(0) < cum_duration
+        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
+        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
+        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
+        attn = attn[:, 0, :, :]
+        attn = attn + 1e-4 * torch.rand_like(attn)
+        attn /= attn.sum(2, keepdims=True)
+        #print(attn)
+        prior_means = torch.matmul(attn, prior_means)  # try attn to contain .5/.5 instead of 1/0 so it smoothly interpolates repeated prior_means
+        #prior_means = F.interpolate(prior_means.transpose(1,2),   int(1.74 * prior_means.shape[1]), mode='linear').transpose(1,2)  # extend for slow speed
+        # prior means have now been replicated x duration of each prior mean
+        latents = self.flow(prior_means.transpose(1, 2), # + torch.randn_like(prior_means) * .94,
+                            reverse=True)
+        waveform = self.decoder(latents)  # [bs, 1, 16000]
+        return waveform[:, 0, :]
+class VitsTokenizer(PreTrainedTokenizer):
+    vocab_files_names = {"vocab_file": "vocab.json"}
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        unk_token="<unk>",
+        language=None,
+        add_blank=True,
+        normalize=True,
+        phonemize=True,
+        is_uroman=False,
+        **kwargs,
+    ):
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.language = language
+        self.add_blank = add_blank
+        self.normalize = normalize
+        self.phonemize = phonemize
+        self.is_uroman = is_uroman
+        super().__init__(
+            pad_token=pad_token,
+            unk_token=unk_token,
+            language=language,
+            add_blank=add_blank,
+            normalize=normalize,
+            phonemize=phonemize,
+            is_uroman=is_uroman,
+            **kwargs,
+        )
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def normalize_text(self, input_string):
+        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
+        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
+        filtered_text = ""
+        i = 0
+        while i < len(input_string):
+            found_match = False
+            for word in all_vocabulary:
+                if input_string[i : i + len(word)] == word:
+                    filtered_text += word
+                    i += len(word)
+                    found_match = True
+                    break
+            if not found_match:
+                filtered_text += input_string[i].lower()
+                i += 1
+        return filtered_text
+    def _preprocess_char(self, text):
+        """Special treatment of characters in certain languages"""
+        if self.language == "ron":
+            text = text.replace("ț", "ţ")
+        return text
+    def prepare_for_tokenization(
+        self, text: str, is_split_into_words: bool = False, normalize = None, **kwargs):
+        normalize = normalize if normalize is not None else self.normalize
+        if normalize:
+            # normalise for casing
+            text = self.normalize_text(text)
+        filtered_text = self._preprocess_char(text)
+        if has_non_roman_characters(filtered_text) and self.is_uroman:
+            # 7 langs -  For now replace all to romans in app.py
+            raise ValueError
+        if self.phonemize:
+            if not is_phonemizer_available():
+                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")
+            filtered_text = phonemizer.phonemize(
+                filtered_text,
+                language="en-us",
+                backend="espeak",
+                strip=True,
+                preserve_punctuation=True,
+                with_stress=True,
+            )
+            filtered_text = re.sub(r"\s+", " ", filtered_text)
+        elif normalize:
+            # strip any chars outside of the vocab (punctuation)
+            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()
+        return filtered_text, kwargs
+    def _tokenize(self, text):
+        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
+        tokens = list(text)
+        if self.add_blank:
+            # sounds dyslexi if no space between letters
+            # sounds disconnected if >2 spaces between letters
+            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2) # + 1)  # +1 rises slice index error if tokens odd
+            interspersed[::2] = tokens
+            tokens = interspersed + [self._convert_id_to_token(0)]  # append one last space (it has indexing error ::2 mismatch if tokens is odd)
+        return tokens
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)