diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..038a590e9a2acb40d56bd7d2165f34dca51a7521 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*/*.pyc +*.pcy +*/__pycache__ +__pycache__ \ No newline at end of file diff --git a/README.md b/README.md index 765d5855302adb78a95d2c0f0f998ace1ae75e97..3f65202411963651a869084438c18472e26558aa 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,93 @@ ---- -title: Prot Xlstm Variant Fitness -emoji: ⚡ -colorFrom: indigo -colorTo: blue -sdk: streamlit -sdk_version: 1.43.2 -app_file: app.py -pinned: false -license: apache-2.0 -short_description: 'This application enables to inspect mutational effects on a ' ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# prot-xLSTM-app + + + +## Getting started + +To make it easy for you to get started with GitLab, here's a list of recommended next steps. + +Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! 
+ +## Add your files + +- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files +- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: + +``` +cd existing_repo +git remote add origin https://git.bioinf.jku.at/chemoinformatics/prot-xlstm-app.git +git branch -M main +git push -uf origin main +``` + +## Integrate with your tools + +- [ ] [Set up project integrations](https://git.bioinf.jku.at/chemoinformatics/prot-xlstm-app/-/settings/integrations) + +## Collaborate with your team + +- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) +- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) +- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) +- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) +- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) + +## Test and Deploy + +Use the built-in continuous integration in GitLab. 
+ +- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/) +- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) +- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) +- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) +- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) + +*** + +# Editing this README + +When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. + +## Suggestions for a good README + +Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. + +## Name +Choose a self-explaining name for your project. + +## Description +Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. + +## Badges +On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. 
+ +## Visuals +Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. + +## Installation +Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. + +## Usage +Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. + +## Support +Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. + +## Roadmap +If you have ideas for releases in the future, it is a good idea to list them in the README. + +## Contributing +State if you are open to contributions and what your requirements are for accepting them. + +For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. + +You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. 
Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. + +## Authors and acknowledgment +Show your appreciation to those who have contributed to the project. + +## License +For open source projects, say how it is licensed. + +## Project status +If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..0db3d76ffe1ef9b7fd3dbcd9ad561bf3e29f7f8c --- /dev/null +++ b/app.py @@ -0,0 +1,151 @@ +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "" #disable cuda + +import streamlit as st +import numpy as np +import torch +import time +from Bio import SeqIO + +from protxlstm.applications.fitness_prediction import single_mutation_landscape_xlstm, create_mutation_df +from protxlstm.applications.msa_sampler import sample_msa +from protxlstm.models.xlstm import xLSTMLMHeadModel +from protxlstm.utils import load_model +import io + +from frontend.constants import info_text, citation_text + +DEFAULT_SEQUENCE = 
def run_model():
    """Compute the single-mutation fitness landscape for the current target.

    Used as the ``on_click`` callback of the "Check Fitness" button. Reads the
    module-level ``mutation_positions`` and ``msa_file`` together with
    ``st.session_state`` and, on success, stores:

    - ``st.session_state.mutations``: DataFrame returned by
      ``single_mutation_landscape_xlstm``,
    - ``st.session_state.fitness_duration``: elapsed wall-clock seconds,
    - ``st.session_state.fitness_done``: ``True``.

    On any failure the traceback is printed and ``fitness_done`` /
    ``fitness_duration`` are reset, so the results section never renders a
    half-updated state (previously a failed run left an absolute timestamp in
    ``fitness_duration``).
    """
    import traceback

    # Nothing to score: bail out before the (slow) model load. An empty
    # selection would otherwise produce a DataFrame without a 'position' column.
    if not mutation_positions:
        st.session_state.fitness_done = False
        st.session_state.fitness_duration = None
        print("No mutation positions selected.")
        return

    started_at = time.time()
    try:
        checkpoint = "protxlstm/checkpoints/small"
        num_context_tokens = 2**15
        df_mutations = create_mutation_df(st.session_state.target_sequence, mutation_positions)

        if msa_file is not None and st.session_state.num_context_sequences != 0:
            # Decode the full upload via getvalue(): this does not depend on
            # the current stream position and does not hand the upload to a
            # wrapper object that could close it.
            msa_text = msa_file.getvalue().decode("utf-8")
            msa_sequences = [
                str(record.seq).upper()
                for record in SeqIO.parse(io.StringIO(msa_text), "fasta")
            ]
            st.session_state.context_sequences = sample_msa(
                msa_sequences,
                max_context_sequences=st.session_state.num_context_sequences,
                context_length=num_context_tokens,
            )
            # The target sequence is always the last context entry.
            st.session_state.context_sequences += [st.session_state.target_sequence]

        config_update_kwargs = {
            "mlstm_backend": "chunkwise_variable",
            "mlstm_chunksize": 1024,
            "mlstm_return_last_state": True,
        }

        model = load_model(
            checkpoint,
            model_class=xLSTMLMHeadModel,
            device='cpu',
            dtype=torch.bfloat16,
            **config_update_kwargs,
        )
        model = model.eval()

        st.session_state.mutations, _ = single_mutation_landscape_xlstm(
            model,
            df_mutations,
            st.session_state.context_sequences,
            chunk_chunk_size=2**15,
        )
        print("fitness_done")
        st.session_state.fitness_duration = time.time() - started_at
        st.session_state.fitness_done = True
    except Exception:
        # Top-level UI boundary: report the full traceback and make sure no
        # stale or partial result is displayed.
        traceback.print_exc()
        st.session_state.fitness_done = False
        st.session_state.fitness_duration = None
# Main panel: only shown once a target sequence has been entered/loaded.
if st.session_state.target_sequence != "":
    with st.container():

        # MUTATION POSITION SELECTION
        # One selectable control per residue; options are 1-based positions
        # rendered as the amino-acid letter at that position.
        aas = list(st.session_state.target_sequence)
        mutation_indices = np.arange(1, len(aas)+1)
        mutation_positions = st.segmented_control(
            "Choose mutation positions (click to select)",
            mutation_indices,
            selection_mode="multi",
            format_func=lambda i: aas[i-1],
        )
        st.button("Check Fitness", on_click=run_model)

        # DISPLAY RESULTS
        if st.session_state.fitness_done:
            mutations = st.session_state.mutations
            st.metric(label="Running time", value=f"{st.session_state.fitness_duration:.2f} sec.")
            selected_pos = st.selectbox(
                "Visualized mutation position",
                mutations['position'].unique()
            )
            # Boolean indexing keeps only the rows of the chosen position.
            # DataFrame.where would keep every row and fill the non-matching
            # ones with NaN, which then end up in the chart data.
            selected_data = mutations[mutations['position'] == selected_pos]
            st.bar_chart(selected_data, x='mutation', y='effect', horizontal=True)
            st.dataframe(mutations, use_container_width=True)
(leave empty to use no context)") + st.markdown("**3.** Choose which amino acids to mutate (click on the AA's to select them) and press 'Check Fitness'") + st.subheader("General Information") + st.markdown(info_text, unsafe_allow_html=True) + st.markdown("") + st.subheader("Cite us / BibTex") + st.code(citation_text, language=None) \ No newline at end of file diff --git a/frontend/constants.py b/frontend/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..072194a46daab48abaab771a1317ead1edae7a1a --- /dev/null +++ b/frontend/constants.py @@ -0,0 +1,41 @@ +info_text = (""" +
+ This application enables the inspection of mutational effects on a + predefined protein sequence.
+
+
+ +
+ It is built on the Prot-xLSTM backbone model, an xLSTM model specifically + trained on protein sequences. Prot-xLSTM was trained using the + Fill-In-the-Middle (FIM) objective, which allows it to perform sequence + inpainting. Additionally, the model can be provided with a potentially + large set of homologous sequences to enhance its predictions.
+
+
+ +
+ For further information, please refer to: https://openreview.net/forum?id=IjbXZdugdj.
+
+ + This Hugging Face application is based on the following GitHub repository: + https://github.com/ml-jku/Prot-xLSTM?tab=readme-ov-file.
+ The streamlit application was developed by Elias Bürger. +
+ +
+ Please cite us as follows:
+
+ """) +citation_text = """ + @misc{ + schmidinger2024bioxlstmgenerativemodelingrepresentation, + title={Bio-xLSTM: Generative modeling, representation and in-context learning of biological and chemical sequences}, + author={Niklas Schmidinger and Lisa Schneckenreiter and Philipp Seidl and Johannes Schimunek and Pieter-Jan Hoedt and Johannes Brandstetter and Andreas Mayr and Sohvi Luukkonen and Sepp Hochreiter and Günter Klambauer}, + year={2024}, + eprint={2411.04165}, + archivePrefix={arXiv}, + primaryClass={q-bio.BM}, + url={https://arxiv.org/abs/2411.04165}, + } + """ \ No newline at end of file diff --git a/prot_xlstm_env.yml b/prot_xlstm_env.yml new file mode 100644 index 0000000000000000000000000000000000000000..23158a04688fa10283de5712910ef10ae177e3e3 --- /dev/null +++ b/prot_xlstm_env.yml @@ -0,0 +1,39 @@ +name: prot_xlstm_app +channels: + - pytorch + - nvidia + - conda-forge + - defaults +dependencies: + - cuda=12.1 + - cuda-nvcc=12.1 + - gxx_linux-64=11.2.0 + - python=3.11 + - pip + - pytorch=2.2.0 + - pytorch-cuda=12.1 + - cmake + - ninja + - pip: + - accelerate>=0.26.0 + - biopython #==1.83 + - bottleneck #==1.4.2 + - dacite #==1.8.1 + - ipykernel #==6.29.3 + - mamba_ssm==1.2.0 + - matplotlib #==3.8.4 + - numpy<2.0 #==1.26.4 + - omegaconf #==2.3.0 + - pandas #==2.2.2 + - pyhmmer #==0.10.15 + - rich #==13.7.1 + - scipy #==1.13.0 + - seaborn #==0.13.2 + - torchmetrics #==1.2.1 + - tqdm #==4.66.4 + - transformers==4.44.2 + - tueplots #==0.0.17 + - wandb #==0.17.0 + - streamlit #==1.43.2 + + diff --git a/protxlstm/__init__.py b/protxlstm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7a7fbc60b23c535e14b50988d436ab7a63aec780 --- /dev/null +++ b/protxlstm/__init__.py @@ -0,0 +1 @@ +# __version__ = "0.0.1" \ No newline at end of file diff --git a/protxlstm/applications/__init__.py b/protxlstm/applications/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 
def precompute_context_state(model, sequences, chunk_chunk_size=2**15):
    """
    Run the model once over a fixed context and return the resulting state.

    The context is tokenized with `prepare_context` and fed to the model in
    consecutive slices of at most `chunk_chunk_size` tokens, passing the state
    of each slice on to the next one. The state after the last slice is
    returned so later calls can continue from it; `None` is returned when the
    context contains no tokens.
    """
    device = next(model.parameters()).device
    input_ids, pos_ids = prepare_context(sequences)
    total_tokens = input_ids.shape[1]

    state = None
    with torch.no_grad():
        # Stepping by the chunk size visits exactly the non-empty slices.
        for start in range(0, total_tokens, chunk_chunk_size):
            stop = min(start + chunk_chunk_size, total_tokens)
            ids_slice = input_ids[:, start:stop].to(device)
            pos_slice = pos_ids[:, start:stop].to(device)
            result = model(
                input_ids=ids_slice,
                position_ids=pos_slice,
                state=state,
                output_hidden_states=True,
                return_dict=True,
            )
            state = result.state

    return state
dtype=torch.int64) + new_target = torch.cat([ + target[:,:mut_pos], # WT sequence until mutated position + AA_TO_ID[""] * t, # Mask token at the muated position + target[:,mut_pos+1:], # WT sequence after mutated position + AA_TO_ID[""] * t, # End of sequence token + AA_TO_ID[""] * t, # Mask token + ], dim=1) + new_pos_ids = torch.cat([ + pos_ids, + 0 * t, # end of sequence + mut_pos * t, # mutation position + ], dim=1) + + is_fim_dict = { AA_TO_ID[""] : pos_ids[:,mut_pos].squeeze().item()} + + return new_target, new_pos_ids, is_fim_dict + +def single_mutation_landscape_xlstm(model, single_mutations, context_sequences, chunk_chunk_size=2**15): + + device = next(model.parameters()).device + + # Tokenize WT target sequence + wt_tokens = tokenizer([context_sequences[-1]], concatenate=True) + + # Precompute hidden state of context + context_state = precompute_context_state(model, context_sequences, chunk_chunk_size=chunk_chunk_size) + + mutation_positions = sorted(single_mutations.position.unique()) + all_logits = np.zeros((len(mutation_positions), 20)) + + # Iterate over all mutated positions + for i, pos in tqdm(enumerate(mutation_positions), total=len(mutation_positions), desc="Generating mutational landscape"): # This loop can be parallelized + + # Prepare target + wt_aa_id = wt_tokens[0, pos+1].int().item() # wild type AA index + target_tokens, target_pos_ids, _ = prepare_single_mutation_target(wt_tokens, pos+1) + + with torch.no_grad(): + outputs = model(input_ids=target_tokens.to(device), + position_ids=target_pos_ids.to(device), + state=context_state, + ) + + # Extact logits and compute mutational effect + logits = outputs.logits.clone().detach() # Raw logits + logits_mut = logits[0, -1, 4:24].log_softmax(-1) # Log-softmax for mutation prediction: (4-24) correspond to natural NNs + mut_effects = logits_mut - logits_mut[wt_aa_id - 4] # Subtract log probability of ground truth + all_logits[i,:] = logits_mut.cpu() + single_mutations.loc[single_mutations.position 
def single_mutation_landscape_mamba(model, single_mutations, context_sequences):
    """
    Score all single mutations with a model driven through `generate_sequence`.

    Args:
        model: model passed to `generate_sequence`; its first parameter
            determines the device used for the inputs.
        single_mutations: DataFrame with 'position' and 'mutation_idx' columns
            (as built by `create_mutation_df`). It is modified in place: an
            'effect' column is filled per mutated position.
        context_sequences: list of sequences used as context; the last entry
            is treated as the wild-type target sequence.

    Returns:
        Tuple `(single_mutations, all_logits)` where `all_logits` is a
        `(n_positions, 20)` array of log-probabilities over token ids 4..23,
        one row per sorted unique position.
    """
    device = next(model.parameters()).device

    # Prepare context sequences (kept unchanged for every position)
    context_tokens, context_pos_ids = prepare_context(context_sequences)

    # Tokenize WT target sequence
    wt_tokens = tokenizer([context_sequences[-1]], concatenate=True)

    mutation_positions = sorted(single_mutations.position.unique())
    all_logits = np.zeros((len(mutation_positions), 20))

    # Iterate over all mutated positions
    for i, pos in tqdm(enumerate(mutation_positions), total=len(mutation_positions), desc="Generating mutational landscape"): # This loop can be parallelized

        # Prepare target
        # NOTE(review): the +1 offset presumably skips a leading special token
        # added by the tokenizer -- confirm against `tokenizer`.
        wt_aa_id = wt_tokens[0, pos+1].int().item() # wild type AA index
        target_tokens, target_pos_ids, is_fim_dict = prepare_single_mutation_target(wt_tokens, pos+1)

        # Merge context and target into fresh tensors. Re-assigning
        # `context_tokens` here (as before) made the context grow by one
        # masked target per iteration, so every position after the first was
        # scored against a polluted context.
        input_tokens = torch.cat([context_tokens, target_tokens], dim=1).to(device)
        input_pos_ids = torch.cat([context_pos_ids, target_pos_ids], dim=1).to(device)

        # Generate fim-token prediction
        output = generate_sequence(
            model,
            input_tokens,
            position_ids=input_pos_ids,
            is_fim=is_fim_dict,
            max_length=1,
            temperature=1.0,
            top_k=0,
            top_p=0.0,
            return_dict_in_generate=True,
            output_scores=True,
            eos_token_id=AA_TO_ID[""],
            device=device
        )

        # Extract logits and compute mutational effect
        logits = torch.tensor(output["scores"]) # Raw logits
        logits_mut = logits[0, 0, 4:24].log_softmax(-1) # Log-softmax over token ids 4..23 (the 20 amino acids)
        mut_effects = logits_mut - logits_mut[wt_aa_id - 4] # Subtract log probability of ground truth
        all_logits[i,:] = logits_mut.cpu()

        at_position = single_mutations.position == pos
        single_mutations.loc[at_position, 'effect'] = single_mutations.loc[at_position, 'mutation_idx'].apply(lambda x : mut_effects[x-4].item())

    return single_mutations, all_logits
+ - 'position': 0-based position in the sequence. + - 'mutation_idx': numeric index for the mutation. + """ + + AAs = {k: v for k, v in ID_TO_AA.items() if 4 <= k <= 23} + mutation_data = [] + for position in mutation_positions: + wt = sequence[position - 1] + for idx, aa in AAs.items(): + mutation = f"{wt}{position}{aa}" + mutation_data.append({'mutation': mutation, 'position': position - 1, 'mutation_idx': idx}) + return pd.DataFrame(mutation_data) \ No newline at end of file diff --git a/protxlstm/applications/generation_utils/create_sequence_df.py b/protxlstm/applications/generation_utils/create_sequence_df.py new file mode 100644 index 0000000000000000000000000000000000000000..20c5278972676533aa5b6d1cabffe72f315cfe7f --- /dev/null +++ b/protxlstm/applications/generation_utils/create_sequence_df.py @@ -0,0 +1,85 @@ +import numpy as np +import pickle +import pandas as pd + +from protxlstm.dataloaders import ProteinMemmapDataset +from protxlstm.utils import decode_sequence, reorder_masked_sequence + + +def create_sequence_df(model_name, family_idx, parameters_list=None, num_sequences = 100, data_dir="./data/"): + + #load dataset + dataset = ProteinMemmapDataset( + msa_memmap_path=f"{data_dir}open_protein_set_memmap.dat", + msa_memmap_meta_path=f"{data_dir}open_protein_set_memmap_indices.csv", + subset_path=f"{data_dir}/cluster_testing_set.txt", + sample=False, + max_msa_len=-1, + reverse=False, + seed=0, + troubleshoot=False, + fim_strategy="multiple_span", + always_mask=False, + max_position_embeddings=2048, + max_seq_position_embeddings=512, + add_position_ids="1d", + mask_fraction=0.2, + max_patches=5, + ) + + family_id = list(dataset.dataset_meta["msa_id"])[family_idx] + + if model_name == "natural": + + data = dataset[family_idx] + sequence_df = pd.DataFrame(columns=["family", "family_id", "sequence", "sequence_length"]) + tokens = data["input_ids"][None,:] + all_context = decode_sequence(tokens[0].cpu().numpy()) + list_sequences_msa = 
[reorder_masked_sequence(elem+"") for elem in all_context.split("")[1:-1]] + + rd_idxs = np.random.choice(len(list_sequences_msa), num_sequences, replace=False) + natural_sequences = [seq for i, seq in enumerate(list_sequences_msa) if i in rd_idxs] + + df_dict = {"family": [family_idx]*len(natural_sequences), + "family_id": [family_id]*len(natural_sequences), + "sequence": natural_sequences, + "sequence_length": [len(seq) for seq in natural_sequences]} + + sequence_df = pd.concat([sequence_df, pd.DataFrame(df_dict)], ignore_index = True) + + else: + + sequence_df = pd.DataFrame(columns=["family", "family_id", "n_seqs_ctx", "temperature", "top_k", "top_p", "original_sequence", "sequence", "sequence_length", "perplexity"]) + + if parameters_list is None: + parameters_list = [(10,1.,10,1.), (10,1.,15,1.), (10,1.,10,0.95), (10,0.9,10,0.95), (10,0.8,10,0.9), + (100,1.,10,1.), (100,1.,15,1.), (100,1.,10,0.95), (100,0.9,10,0.95), (100,0.8,10,0.9), + (500,1.,10,1.), (500,1.,15,1.), (500,1.,10,0.95), (500,0.9,10,0.95), (500,0.8,10,0.9), + (1000,1.,10,1.), (1000,1.,15,1.), (1000,1.,10,0.95), (1000,0.9,10,0.95), (1000,0.8,10,0.9), + (-1,1.,10,1.), (-1,1.,15,1.), (-1,1.,10,0.95), (-1,0.9,10,0.95), (-1,0.8,10,0.9)] + + for param in parameters_list: + n_seqs_ctx, temperature, top_k, top_p = param + + with open(f"evaluation/generation/generated_sequences/{model_name}/{family_idx}_{param}_{num_sequences}", "rb") as f: + gen_seqs = pickle.load(f) + + original_sequences = list(gen_seqs[family_idx][param].keys()) + reordered_sequences = [reorder_masked_sequence(seq) for seq in original_sequences] + perplexities = [gen_seqs[family_idx][param][seq]["perplexity"] for seq in original_sequences] + df_dict = {"family": [family_idx]*len(original_sequences), + "family_id": [family_id]*len(original_sequences), + "n_seqs_ctx": [n_seqs_ctx]*len(original_sequences), + "temperature": [temperature]*len(original_sequences), + "top_k": [top_k]*len(original_sequences), + "top_p": 
def align_sequences(ref_seq, query_seq, print_alignments=False):
    """
    Globally align two sequences and measure how much they differ.

    Uses the module-level `aligner` and takes its first alignment. Returns a
    tuple of (fraction of mismatching columns between the two aligned rows,
    aligned reference row, aligned query row). When `print_alignments` is set,
    the alignment score and the alignment itself are printed.
    """
    def mismatch_fraction(row_a, row_b):
        # Both rows come from the same alignment, so they must be equally long.
        assert len(row_a) == len(row_b)
        differs = np.array(list(row_a)) != np.array(list(row_b))
        return sum(differs)/len(row_a)

    best = aligner.align(ref_seq, query_seq)[0]
    if print_alignments:
        print("Score = %.1f:" % best.score)
        print(best)

    aligned_ref = best[0]
    aligned_query = best[1]
    return mismatch_fraction(aligned_ref, aligned_query), aligned_ref, aligned_query
always_mask=False, + max_position_embeddings=2048, + max_seq_position_embeddings=512, + add_position_ids="1d", + mask_fraction=0.2, + max_patches=5, + ) + + # Select a sample of the dataset to be the input + data = dataset[family_idx] + tokens = data["input_ids"][None,:] + all_context = decode_sequence(tokens[0].cpu().numpy()) + list_sequences_msa = [reorder_masked_sequence(elem+"") for elem in all_context.split("")[1:-1]] + + # sequence_df["hamming"] = pd.Series(dtype=object) + sequence_df["min_hamming"] = pd.Series() + sequence_df["median_hamming"] = pd.Series() + sequence_df["mean_hamming"] = pd.Series() + sequence_df["std_hamming"] = pd.Series() + + for seq in tqdm(list(sequence_df["sequence"])): + + all_hamming = [] + for ctx_seq in list_sequences_msa: + if ctx_seq == seq: + continue + else: + hamming, _, _ = align_sequences(ctx_seq, seq , print_alignments=False) + all_hamming.append(hamming) + + # sequence_df.loc[sequence_df["sequence"] == seq, "hamming"] = [all_hamming] + sequence_df.loc[sequence_df["sequence"] == seq, "min_hamming"] = np.min(all_hamming) + sequence_df.loc[sequence_df["sequence"] == seq, "median_hamming"] = np.median(all_hamming) + sequence_df.loc[sequence_df["sequence"] == seq, "mean_hamming"] = np.mean(all_hamming) + sequence_df.loc[sequence_df["sequence"] == seq, "std_hamming"] = np.std(all_hamming) + + return sequence_df + \ No newline at end of file diff --git a/protxlstm/applications/generation_utils/score_hmmer.py b/protxlstm/applications/generation_utils/score_hmmer.py new file mode 100644 index 0000000000000000000000000000000000000000..7b30748833a550a8c3631beb39da9271b782f9fe --- /dev/null +++ b/protxlstm/applications/generation_utils/score_hmmer.py @@ -0,0 +1,102 @@ +import string +from Bio import SeqIO +import pyhmmer +from tqdm import tqdm + +alphabet = pyhmmer.easel.Alphabet.amino() + +# This is an efficient way to delete lowercase characters and insertion characters from a string +deletekeys = 
def check_msa(msa):
    """Verify that no sequence occurs more than once in the MSA.

    Args:
        msa: Iterable of ``(description, sequence)`` pairs; only the second
            element of each pair is compared.

    Raises:
        AssertionError: If at least one sequence is repeated. The exception is
            raised explicitly (not through an ``assert`` statement) so the
            check still runs when Python is started with ``-O``.
    """
    seen = set()
    for el in msa:
        if el[1] in seen:
            raise AssertionError("There are repeated sequences in the MSA")
        seen.add(el[1])
def score_hmmer(sequence_df, family_idx, data_dir="./data/"):
    """Score the sequences of one family against an HMM built from its MSA.

    An HMM is fitted on ``{data_dir}/a3m_files/{family_id}/a3m/uniclust30.a3m``
    and every entry of ``sequence_df["sequence"]`` is searched against it.
    The bit score and E-value are stored in the columns ``score_gen`` and
    ``evalue_gen``; sequences without a hit get score ``0`` and E-value ``1``.

    Args:
        sequence_df: DataFrame with the columns ``family``, ``family_id`` and
            ``sequence``; all rows must belong to ``family_idx``.
        family_idx: Index of the family in the test dataset.
        data_dir: Directory that contains the ``a3m_files`` folder.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        AssertionError: If ``sequence_df`` mixes families or does not match
            ``family_idx``.
        Exception: If the HMM cannot be built from the family MSA; the
            original error is attached as ``__cause__``.
    """
    assert len(set(list(sequence_df["family"]))) == 1 and sequence_df["family"].iloc[0] == family_idx

    family_id = sequence_df["family_id"].iloc[0]
    msa_filepath = f"{data_dir}/a3m_files/{family_id}/a3m/uniclust30.a3m"
    try:
        hmm = make_hmm_from_a3m_msa(msa_filepath)
    except Exception as err:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real failure visible.
        raise Exception(f"Missing MSA of family {family_id}") from err

    # align sequences
    sequences = list(sequence_df["sequence"])
    scores = align_and_score_sequences_in_a3m_with_hmm(hmm, sequences_list=sequences)

    # `scores` is keyed by sequence; fill both columns in one pass instead of
    # re-scanning the whole frame for every row.
    no_hit = {"score": 0, "evalue": 1}
    sequence_df["score_gen"] = [float(scores.get(seq, no_hit)["score"]) for seq in sequences]
    sequence_df["evalue_gen"] = [float(scores.get(seq, no_hit)["evalue"]) for seq in sequences]

    return sequence_df
sequence_df.loc[sequence_df["sequence"] == seq, "ptm"] = ptm + sequence_df.loc[sequence_df["sequence"] == seq, "mean_plddt"] = mean_plddt + + return sequence_df \ No newline at end of file diff --git a/protxlstm/applications/msa_sampler.py b/protxlstm/applications/msa_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..181aec97ba6af5757a1494f362a07f02ad90bed7 --- /dev/null +++ b/protxlstm/applications/msa_sampler.py @@ -0,0 +1,196 @@ +# Original code from ProtMamba under Apache License 2.0. +# +# Modifications made by Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen +# - Modify handling of weights in `MSASampler` + +import math +import os +from typing import Optional, Callable + +import numpy as np +import torch + +from protxlstm.utils import AA_TO_ID + + +def compute_hamming_csim_torch( + seqs: torch.Tensor, + ungapped_msa: torch.Tensor, + gap_token: int, + gap_token_mask: int, +) -> torch.Tensor: + return (seqs.unsqueeze(1) == ungapped_msa).sum(dim=2) + +def _compute_homology_weights( + ungapped_msa: np.ndarray, + gap_token: int, + gap_token_mask: int, + theta: float, + hamming_csim_func: Callable, + max_memory: int = 20, + can_use_torch: bool = True, +) -> np.ndarray: + use_torch = can_use_torch + if use_torch: + hamming_csim_func = compute_hamming_csim_torch + batch_size = math.floor( + 2 + * 1024 + * 1024 + * 1024 + / (ungapped_msa.shape[0] * ungapped_msa.shape[1]) + * max_memory + / 40 + ) + + batch_size = 1 if batch_size == 0 else batch_size + + neighbors = [] + if not use_torch: + masked_ungapped_msa = ungapped_msa.copy() + else: + ungapped_msa = torch.from_numpy(ungapped_msa).byte() + masked_ungapped_msa = ungapped_msa.clone() + masked_ungapped_msa[masked_ungapped_msa == gap_token] = gap_token_mask + for b_start in range(0, len(ungapped_msa), batch_size): + b_end = b_start + batch_size + seqs = ungapped_msa[b_start:b_end] + + sim = hamming_csim_func( + seqs=seqs, + ungapped_msa=masked_ungapped_msa, + 
gap_token=gap_token, + gap_token_mask=gap_token_mask, + ) + if not use_torch: + sim = sim / (seqs != gap_token).sum(axis=1, keepdims=True) + d = 1 - sim + d = d.clamp(0, 1) + this_neighbors = (d <= theta).sum(axis=1) + else: + sim = sim / (seqs != gap_token).sum(dim=1, keepdim=True) + d = 1 - sim + # fillna + d[torch.isnan(d)] = 0 + d = d.clamp(0, 1) + this_neighbors = (d <= theta).sum(dim=1).cpu() + neighbors.append(this_neighbors) + return np.concatenate(neighbors) + +def compute_homology_weights( + ungapped_msa: np.ndarray, + theta: float = 0.2, + gap_token: int = AA_TO_ID["-"], + gap_token_mask: int = 255, + hamming_csim_func: Callable = compute_hamming_csim_torch, +) -> tuple[int, np.ndarray]: + """ + Calculate the effective number of sequences and sampling probability for the NEIGHBORS and NEIGHBORS_NO_LIMIT sampling methods using numpy. + + Parameters: + + ungapped_msa (np.ndarray): The MSA (from .fa). + theta (float, optional): A parameter used to determine the similarity between sequences. Default is 0.2. + gap_token (int, optional): The token representing gaps in the (Uniprot21 encoded) MSA. Default is 20. + gap_token_mask (int): token for masking gaps. should be a token not representing any other value. + + Returns: + + tuple[int, np.ndarray]: A tuple containing the effective number of sequences and the sampling probability for each sequence in the MSA. 
+ """ + neighbors = _compute_homology_weights( + ungapped_msa=ungapped_msa, + gap_token=gap_token, + gap_token_mask=gap_token_mask, + theta=theta, + hamming_csim_func=hamming_csim_func, + ) + n_eff = np.sum(1 / neighbors) + + p = 1 / neighbors + p /= np.sum(p) + return n_eff, p + +class MSASampler: + + def __init__(self, max_similarity, max_dissimilarity, force_include_first=True): + self.max_similarity = max_similarity + self.max_dissimilarity = max_dissimilarity + self.force_include_first = force_include_first + self.theta = 0.2 + + def _get_sim_filtered_idxs(self, msa: np.ndarray) -> np.ndarray: + nonnormalized_sim = (msa == msa[[0]]).sum(axis=1) + normfactor = msa.shape[1] + norm_sim = nonnormalized_sim / normfactor + + assert (norm_sim.min() >= 0) and (norm_sim.max() <= 1) + dsim = 1 - norm_sim + + max_sim_filter = norm_sim <= self.max_similarity + max_dissim_filter = dsim <= self.max_dissimilarity + return np.where(max_sim_filter & max_dissim_filter)[0] + + def get_weights( + self, msa: np.ndarray, + ) -> tuple[Optional[float], Optional[np.ndarray]]: + return compute_homology_weights( + ungapped_msa=msa, + theta=self.theta, + gap_token_mask=255, + + ) + + def get_sample_idxs( + self, + msa: np.ndarray, + size: int = 1, + random = False, + msa_weights_path = None, + seed = 0, + ) -> np.ndarray: + + np.random.seed(seed) + + if random: + return np.random.choice(len(msa), replace=False, size=size) if len(msa) >= size else np.arange(len(msa)) + + msa = np.array([[AA_TO_ID[aa] for aa in seq.upper()][:len(msa[0])] for seq in msa], dtype=np.uint8) + + if msa_weights_path and os.path.exists(msa_weights_path): + weights = np.load(msa_weights_path) + elif msa_weights_path: + os.makedirs(os.path.dirname(msa_weights_path), exist_ok=True) + _, weights = self.get_weights( + msa=msa, + ) + np.save(msa_weights_path, weights) + else: + _, weights = self.get_weights( + msa=msa, + ) + + + original_msa_sample_idxs = np.arange(len(msa)) + sample_idxs = 
def sample_msa(msa_sequences, msa_weights_path=None, context_length=200_000, max_context_sequences=200, seed=0, sort=True):
    """Sample MSA sequences for the context.

    The number of sampled sequences is one less than the smaller of
    ``context_length // len(msa_sequences[0])`` and the sequence cap, where
    the cap is ``max_context_sequences`` or, when that is ``0``, the size of
    the MSA. Indices are drawn with ``MSASampler.get_sample_idxs``.

    Args:
        msa_sequences: Non-empty list of MSA sequences.
        msa_weights_path: Optional path forwarded to the sampler for
            loading / caching the sampling weights.
        context_length: Token budget for the context.
        max_context_sequences: Upper bound on the number of sequences;
            ``0`` means "no explicit bound".
        seed: Seed forwarded to the sampler.
        sort: If True the sampled sequences are returned in reversed sampling
            order; if False they are returned in sampling order.

    Returns:
        List of sampled sequences.
    """
    sequence_cap = len(msa_sequences) if max_context_sequences == 0 else max_context_sequences
    # Never request a negative sample size (the budget may be smaller than a
    # single sequence).
    n_sequences = max(min(context_length // len(msa_sequences[0]), sequence_cap) - 1, 0)

    sampler = MSASampler(0.98, 0.7, force_include_first=False)
    sample_idx = sampler.get_sample_idxs(
        msa_sequences, size=n_sequences, msa_weights_path=msa_weights_path, seed=seed
    )

    context_sequences = [msa_sequences[i] for i in sample_idx]
    # Previously `context_sequences` was only bound when sort=True, so
    # sort=False raised UnboundLocalError.
    if sort:
        context_sequences = context_sequences[::-1]

    return context_sequences
the model. Given a dataset, a list of families (their indexes in the dataset) + and a set of generating parameters, it generates `n_samples_per_family` sequences for each family and each parameter set. + The function returns a dictionary with the following structure: + gen_seqs = {family_idx: {parameters: {sequence: perplexity}}} + The parameters are in a list of tuples with the following structure: + parameters_list = [(nr_seqs_ctx, temperature, top_k, top_p)] + """ + gen_seqs = {} + gen_seqs[family_idx] = {} + gen_seqs[family_idx][params] = {} + print(f"Sampling sequences for family {family_idx} and parameters {params}.") + + n_seqs_ctx , temperature, top_k, top_p = params + for _ in tqdm(range(n_samples_per_family)): + # Sample the dataset to get the input + data = dataset[family_idx] + tokens = data["input_ids"][None,:].to(device) + pos_ids = data["position_ids"][None,:].to(device) + + start_seqs = torch.argwhere(tokens[0]==0)[:,0].cpu().numpy() + + n_seqs_ctx = len(start_seqs) if len(start_seqs) < n_seqs_ctx else n_seqs_ctx + L = start_seqs[n_seqs_ctx]+1 + context_tokens = tokens[:,:L] + context_pos_ids = pos_ids[:,:L] + is_fim={} + + # Generate the new sequence + output = generate_sequence(model, + context_tokens, + position_ids=context_pos_ids, + is_fim=is_fim, + max_length=(L+max_length), + temperature=temperature, + top_k=top_k, + top_p=top_p, + return_dict_in_generate=True, + output_scores=True, + eos_token_id=torch.tensor([AA_TO_ID[""]]).to(device), + chunk_chunk_size=chunk_chunk_size, + device=device) + + # Get the perplexity of the generated sequence + output_seq = output["generated"] + loss = torch.nn.functional.cross_entropy(torch.from_numpy(output["scores"]).permute(0, 2, 1), + torch.from_numpy(output["generated_tokens"][0][None,:])) + + # save only sequences with length < max_length + if len(output_seq[0]) < max_length: + + gen_seqs[family_idx][params][output_seq[0]] = {"perplexity": torch.exp(loss).item()} + + if save_path is not None: + if not 
os.path.exists("evaluation/generation/generated_sequences"): + os.mkdir("evaluation/generation/generated_sequences") + if not os.path.exists(save_path): + os.mkdir(save_path) + with open(f'{save_path}/{family_idx}_{params}_{n_samples_per_family}', "wb") as f: + pickle.dump(gen_seqs, f) + print(f"Sequences saved for family {family_idx} and parameters {params}") + + return gen_seqs + +def generate_sequences(model_name, + checkpoint, + family_idxs=[], + parameters_list=[], + n_samples_per_family = 100, + chunk_size=1024, + chunk_chunk_size=2**15, + data_dir="data/", + device="cuda:0" + ): + + # Load the test dataset + fim_strategy = "multiple_span" + mask_fraction = 0.2 + + dataset = ProteinMemmapDataset( + msa_memmap_path=f"{data_dir}open_protein_set_memmap.dat", + msa_memmap_meta_path=f"{data_dir}open_protein_set_memmap_indices.csv", + subset_path=f"{data_dir}cluster_testing_set.txt", + sample=False, + max_msa_len=-1, + reverse=False, + seed=0, + troubleshoot=False, + fim_strategy=fim_strategy, + always_mask=False, + max_position_embeddings=2048, + max_seq_position_embeddings=512, + add_position_ids="1d", + mask_fraction=mask_fraction + ) + + if model_name == "xlstm": + model_class = xLSTMLMHeadModel + elif model_name == "mamba": + model_class = MambaLMHeadModelwithPosids + + save_path = f"evaluation/generation/generated_sequences/{checkpoint.split('/')[-1]}" + + if model_name == "xlstm": + config_update_kwargs = { + "mlstm_backend": "chunkwise_variable", + "mlstm_chunksize": chunk_size, + "mlstm_return_last_state": True + } + else: + config_update_kwargs = {} + + + #load the model + model = load_model(checkpoint, + model_class=model_class, + device=device, + dtype=torch.bfloat16, + **config_update_kwargs, + ) + model = model.eval() + print("Model loaded.") + + for family_idx in family_idxs: + for params in parameters_list: + params = tuple(params) + if not os.path.exists(f'{save_path}/{family_idx}_{params}_{n_samples_per_family}'): + gen_seqs = sample_sequences( + 
if __name__ == "__main__":
    # Command-line entry point: parse the sampling configuration and run
    # sequence generation for every (family, parameter set) combination.

    parser = argparse.ArgumentParser(
        description="Generate sequences."
    )
    # model_name selects the model class and checkpoint names the save
    # directory, so both are mandatory; any other model name leaves the
    # model class undefined downstream.
    parser.add_argument("--model_name", type=str, required=True, choices=["xlstm", "mamba"], help="Either 'xlstm' or 'mamba'.")
    parser.add_argument("--checkpoint", type=str, required=True, help="Path to model checkpoint.")
    # Both lists are JSON-encoded strings and are decoded unconditionally below.
    parser.add_argument("--family_idxs", type=str, required=True, help="List of family indices.")
    parser.add_argument("--parameters_list", type=str, required=True, help="List of sampling parameters.")
    parser.add_argument("--n_samples_per_family", type=int, default=100, help="Number of sequences to sample per family and parameter set.")
    parser.add_argument("--chunk_size", type=int, default=1024, help="Chunk size for xLSTM context encoding.")
    # 2**15 matches the default of generate_sequences/sample_sequences; the
    # previous `2*15` evaluated to 30.
    parser.add_argument("--chunk_chunk_size", type=int, default=2**15, help="Length of context sequence part processed at once.")
    parser.add_argument("--data_dir", type=str, default="data/", help="Path to dataset.")
    parser.add_argument("--device", type=str, default="cuda:0", help="Device.")

    args = parser.parse_args()

    family_idxs = json.loads(args.family_idxs)
    parameters_list = json.loads(args.parameters_list)

    # Run sequence generation
    generate_sequences(
        model_name=args.model_name,
        checkpoint=args.checkpoint,
        family_idxs=family_idxs,
        parameters_list=parameters_list,
        n_samples_per_family=args.n_samples_per_family,
        chunk_size=args.chunk_size,
        chunk_chunk_size=args.chunk_chunk_size,
        data_dir=args.data_dir,
        device=args.device,
    )
def score_sequences(model_name,
                    family_idx,
                    num_sequences = 100,
                    data_dir = "data/"):
    """Build (or resume) the scored sequence table of one family.

    The table is cached as a pickle at
    ``evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}``.
    If the cache exists it is loaded, otherwise the table is created with
    ``create_sequence_df``. Each scoring stage (Hamming, HMMER, structure) is
    run only when its marker column is missing, and the cache is rewritten
    after every stage so an interrupted run can resume.

    Args:
        model_name: Name of the model whose generations are scored.
        family_idx: Index of the family in the test dataset.
        num_sequences: Forwarded to ``create_sequence_df``.
        data_dir: Dataset directory forwarded to the scoring functions.

    Returns:
        The scored DataFrame.
    """
    out_dir = f"evaluation/generation/evaluations/{model_name}/"
    df_path = f"evaluation/generation/evaluations/{model_name}/sequence_df_{family_idx}"

    def _save(df):
        # Persist the current state of the table.
        with open(df_path, "wb") as f:
            pickle.dump(df, f)

    if os.path.isfile(df_path):
        # NOTE: pickle.load executes arbitrary code from the file; only point
        # this at cache files written by this function.
        with open(df_path, "rb") as f:
            sequence_df = pickle.load(f)
    else:
        sequence_df = create_sequence_df(model_name, family_idx, data_dir = data_dir, num_sequences = num_sequences)
        # makedirs creates missing parents and tolerates an existing directory;
        # the previous chained os.mkdir failed if "evaluation/generation" was absent.
        os.makedirs(out_dir, exist_ok=True)
        _save(sequence_df)

    # (marker column, stage) pairs, executed in this order.
    stages = (
        ("min_hamming", lambda df: score_hamming(df, family_idx, data_dir)),
        ("score_gen", lambda df: score_hmmer(df, family_idx, data_dir)),
        ("ptm", lambda df: score_structure(df, family_idx)),
    )
    for marker, stage in stages:
        if marker not in sequence_df.columns:
            sequence_df = stage(sequence_df)
            _save(sequence_df)

    return sequence_df
"dropout": 0.0}, "feedforward": {"proj_factor": 1.3, "round_proj_up_dim_up": true, "round_proj_up_to_multiple_of": 64, "_proj_up_dim": 0, "act_fn": "gelu", "embedding_dim": -1, "dropout": 0.0, "bias": false, "ff_type": "ffn_gated", "_num_blocks": 1}, "_num_blocks": 16, "_block_idx": 0}, "context_length": 2048, "num_blocks": 16, "embedding_dim": 512, "add_post_blocks_norm": true, "bias": false, "dropout": 0.0, "checkpoint_blocks": true, "slstm_at": [], "_block_map": "0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0", "vocab_size": 38, "tie_weights": false, "weight_decay_on_embedding": false, "add_embedding_dropout": false, "position_embeddings": "rot_1d", "max_position_embeddings": 2048, "max_seq_position_embeddings": 512, "rope_base_frequency": 500000} \ No newline at end of file diff --git a/protxlstm/checkpoints/small/optimizer.pt b/protxlstm/checkpoints/small/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3d2f627f5806a1025cb5f513e959ce31865a696 --- /dev/null +++ b/protxlstm/checkpoints/small/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bcae4c2a893afed859ca9b5926d24e8f7d0b22eca198e4aa950b80909be8e50 +size 207533690 diff --git a/protxlstm/checkpoints/small/pytorch_model.bin b/protxlstm/checkpoints/small/pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..52d24c4e9b15181ebb9aaa9dc120353fdb6be7c0 --- /dev/null +++ b/protxlstm/checkpoints/small/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba59f1fd544a5d9f6c4adb40730cf90b8f69a772df838246f724586cb1d602a +size 103773526 diff --git a/protxlstm/checkpoints/small/rng_state.pth b/protxlstm/checkpoints/small/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..15c8418f2938a15c2dbf98809baee72c00c4f4ff --- /dev/null +++ b/protxlstm/checkpoints/small/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5480500274058efdf9caa959d35a42a948ff3dc8536e082b9bc22f2ecd423108 +size 14244 diff --git a/protxlstm/checkpoints/small/scheduler.pt b/protxlstm/checkpoints/small/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6900fb94245d3bbd1fdf38e17104afcea503dd51 --- /dev/null +++ b/protxlstm/checkpoints/small/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25a26dbf75285e97210697d5608b6d76ef35aa0d2879be319ef2785f881153b9 +size 1000 diff --git a/protxlstm/checkpoints/small/trainer_state.json b/protxlstm/checkpoints/small/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97fbb48a14ba8891edc6398005e9b691eb97ecf5 --- /dev/null +++ b/protxlstm/checkpoints/small/trainer_state.json @@ -0,0 +1,80490 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.28442365509575285, + "eval_steps": 250, + "global_step": 76250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.730146296337743e-05, + "grad_norm": 0.5294842720031738, + "learning_rate": 0.0006, + "loss": 2.2135, + "step": 10 + }, + { + "epoch": 7.460292592675485e-05, + "grad_norm": 0.5858336091041565, + "learning_rate": 0.0006, + "loss": 2.2843, + "step": 20 + }, + { + "epoch": 0.00011190438889013227, + "grad_norm": 0.36886391043663025, + "learning_rate": 0.0006, + "loss": 2.3092, + "step": 30 + }, + { + "epoch": 0.0001492058518535097, + "grad_norm": 0.6874925494194031, + "learning_rate": 0.0006, + "loss": 2.2404, + "step": 40 + }, + { + "epoch": 0.0001865073148168871, + "grad_norm": 0.46000632643699646, + "learning_rate": 0.0006, + "loss": 2.2296, + "step": 50 + }, + { + "epoch": 0.0001865073148168871, + "eval_valid_loss": 2.2531306743621826, + "eval_valid_loss/all": 2.1096882820129395, + "eval_valid_loss/end_span": 1.2738336324691772, + "eval_valid_perplexity/batch": 8.245670318603516, + 
"eval_valid_perplexity/end_span": 3.5745296478271484, + "eval_valid_perplexity/fim": 2.6721651554107666, + "eval_valid_perplexity/first_seq": 15.088037490844727, + "eval_valid_perplexity/last_seq": 9.225531578063965, + "eval_valid_perplexity/second_seq": 13.640207290649414, + "eval_valid_perplexity/seq": 9.281828880310059, + "eval_valid_reconstruction/all": 0.2782035768032074, + "eval_valid_reconstruction/end_span": 0.7024093270301819, + "eval_valid_reconstruction/fim": 0.1826012134552002, + "eval_valid_reconstruction/first_seq": 0.1646122932434082, + "eval_valid_reconstruction/last_seq": 0.320696622133255, + "eval_valid_reconstruction/second_seq": 0.20213307440280914, + "eval_valid_runtime": 459.3712, + "eval_valid_samples_per_second": 0.418, + "eval_valid_steps_per_second": 0.418, + "step": 50 + }, + { + "epoch": 0.0001865073148168871, + "eval_train_loss": 2.2469234466552734, + "eval_train_loss/all": 2.0752158164978027, + "eval_train_loss/end_span": 1.2200616598129272, + "eval_train_perplexity/batch": 7.966265678405762, + "eval_train_perplexity/end_span": 3.3873965740203857, + "eval_train_perplexity/fim": 2.139106273651123, + "eval_train_perplexity/first_seq": 15.244900703430176, + "eval_train_perplexity/last_seq": 8.995733261108398, + "eval_train_perplexity/second_seq": 14.213336944580078, + "eval_train_perplexity/seq": 9.157821655273438, + "eval_train_reconstruction/all": 0.2694132924079895, + "eval_train_reconstruction/end_span": 0.7181357741355896, + "eval_train_reconstruction/fim": 0.14236479997634888, + "eval_train_reconstruction/first_seq": 0.15707798302173615, + "eval_train_reconstruction/last_seq": 0.3258536159992218, + "eval_train_reconstruction/second_seq": 0.18202747404575348, + "eval_train_runtime": 429.4816, + "eval_train_samples_per_second": 0.447, + "eval_train_steps_per_second": 0.447, + "step": 50 + }, + { + "epoch": 0.00022380877778026455, + "grad_norm": 0.40908321738243103, + "learning_rate": 0.0006, + "loss": 2.3034, + "step": 60 + }, + { + 
"epoch": 0.000261110240743642, + "grad_norm": 0.5082122087478638, + "learning_rate": 0.0006, + "loss": 2.328, + "step": 70 + }, + { + "epoch": 0.0002984117037070194, + "grad_norm": 0.41682225465774536, + "learning_rate": 0.0006, + "loss": 2.2783, + "step": 80 + }, + { + "epoch": 0.0003357131666703968, + "grad_norm": 0.4468563199043274, + "learning_rate": 0.0006, + "loss": 2.3862, + "step": 90 + }, + { + "epoch": 0.0003730146296337742, + "grad_norm": 0.5601614713668823, + "learning_rate": 0.0006, + "loss": 2.3744, + "step": 100 + }, + { + "epoch": 0.0003730146296337742, + "eval_valid_loss": 2.253563165664673, + "eval_valid_loss/all": 2.1095967292785645, + "eval_valid_loss/end_span": 1.4801899194717407, + "eval_valid_perplexity/batch": 8.244915962219238, + "eval_valid_perplexity/end_span": 4.39378023147583, + "eval_valid_perplexity/fim": 2.490084171295166, + "eval_valid_perplexity/first_seq": 14.838619232177734, + "eval_valid_perplexity/last_seq": 9.662276268005371, + "eval_valid_perplexity/second_seq": 14.100909233093262, + "eval_valid_perplexity/seq": 9.27451229095459, + "eval_valid_reconstruction/all": 0.2781042754650116, + "eval_valid_reconstruction/end_span": 0.6562800407409668, + "eval_valid_reconstruction/fim": 0.16842582821846008, + "eval_valid_reconstruction/first_seq": 0.17008842527866364, + "eval_valid_reconstruction/last_seq": 0.30612796545028687, + "eval_valid_reconstruction/second_seq": 0.1848699450492859, + "eval_valid_runtime": 423.8402, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 100 + }, + { + "epoch": 0.0003730146296337742, + "eval_train_loss": 2.248677968978882, + "eval_train_loss/all": 2.076627016067505, + "eval_train_loss/end_span": 1.4293369054794312, + "eval_train_perplexity/batch": 7.97751522064209, + "eval_train_perplexity/end_span": 4.175929069519043, + "eval_train_perplexity/fim": 2.261843681335449, + "eval_train_perplexity/first_seq": 15.283018112182617, + "eval_train_perplexity/last_seq": 
10.066801071166992, + "eval_train_perplexity/second_seq": 14.282281875610352, + "eval_train_perplexity/seq": 9.172231674194336, + "eval_train_reconstruction/all": 0.2689282298088074, + "eval_train_reconstruction/end_span": 0.6692345142364502, + "eval_train_reconstruction/fim": 0.15206366777420044, + "eval_train_reconstruction/first_seq": 0.15970133244991302, + "eval_train_reconstruction/last_seq": 0.2930697202682495, + "eval_train_reconstruction/second_seq": 0.18231292068958282, + "eval_train_runtime": 424.5553, + "eval_train_samples_per_second": 0.452, + "eval_train_steps_per_second": 0.452, + "step": 100 + }, + { + "epoch": 0.00041031609259715166, + "grad_norm": 0.39962038397789, + "learning_rate": 0.0006, + "loss": 2.3273, + "step": 110 + }, + { + "epoch": 0.0004476175555605291, + "grad_norm": 0.6957873106002808, + "learning_rate": 0.0006, + "loss": 2.3005, + "step": 120 + }, + { + "epoch": 0.0004849190185239065, + "grad_norm": 0.4312160015106201, + "learning_rate": 0.0006, + "loss": 2.238, + "step": 130 + }, + { + "epoch": 0.000522220481487284, + "grad_norm": 0.45052003860473633, + "learning_rate": 0.0006, + "loss": 2.3333, + "step": 140 + }, + { + "epoch": 0.0005595219444506613, + "grad_norm": 0.5772297382354736, + "learning_rate": 0.0006, + "loss": 2.2048, + "step": 150 + }, + { + "epoch": 0.0005595219444506613, + "eval_valid_loss": 2.2476377487182617, + "eval_valid_loss/all": 2.104687452316284, + "eval_valid_loss/end_span": 1.3312122821807861, + "eval_valid_perplexity/batch": 8.204538345336914, + "eval_valid_perplexity/end_span": 3.7856297492980957, + "eval_valid_perplexity/fim": 2.341428756713867, + "eval_valid_perplexity/first_seq": 14.93691635131836, + "eval_valid_perplexity/last_seq": 9.111427307128906, + "eval_valid_perplexity/second_seq": 13.53101921081543, + "eval_valid_perplexity/seq": 9.234925270080566, + "eval_valid_reconstruction/all": 0.2792404592037201, + "eval_valid_reconstruction/end_span": 0.6876538991928101, + 
"eval_valid_reconstruction/fim": 0.1575334668159485, + "eval_valid_reconstruction/first_seq": 0.16778233647346497, + "eval_valid_reconstruction/last_seq": 0.32323330640792847, + "eval_valid_reconstruction/second_seq": 0.20130904018878937, + "eval_valid_runtime": 423.9225, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 150 + }, + { + "epoch": 0.0005595219444506613, + "eval_train_loss": 2.241420030593872, + "eval_train_loss/all": 2.0703628063201904, + "eval_train_loss/end_span": 1.2840806245803833, + "eval_train_perplexity/batch": 7.927698612213135, + "eval_train_perplexity/end_span": 3.6113462448120117, + "eval_train_perplexity/fim": 2.1995162963867188, + "eval_train_perplexity/first_seq": 15.058992385864258, + "eval_train_perplexity/last_seq": 9.32058048248291, + "eval_train_perplexity/second_seq": 14.374606132507324, + "eval_train_perplexity/seq": 9.11438274383545, + "eval_train_reconstruction/all": 0.27065110206604004, + "eval_train_reconstruction/end_span": 0.7016223073005676, + "eval_train_reconstruction/fim": 0.14737126231193542, + "eval_train_reconstruction/first_seq": 0.16217762231826782, + "eval_train_reconstruction/last_seq": 0.31294456124305725, + "eval_train_reconstruction/second_seq": 0.1819918304681778, + "eval_train_runtime": 423.0699, + "eval_train_samples_per_second": 0.454, + "eval_train_steps_per_second": 0.454, + "step": 150 + }, + { + "epoch": 0.0005968234074140388, + "grad_norm": 0.34899333119392395, + "learning_rate": 0.0006, + "loss": 2.3492, + "step": 160 + }, + { + "epoch": 0.0006341248703774162, + "grad_norm": 0.31025490164756775, + "learning_rate": 0.0006, + "loss": 2.153, + "step": 170 + }, + { + "epoch": 0.0006714263333407936, + "grad_norm": 0.36072802543640137, + "learning_rate": 0.0006, + "loss": 2.2212, + "step": 180 + }, + { + "epoch": 0.0007087277963041711, + "grad_norm": 0.5274562239646912, + "learning_rate": 0.0006, + "loss": 2.2131, + "step": 190 + }, + { + "epoch": 
0.0007460292592675485, + "grad_norm": 0.4700324237346649, + "learning_rate": 0.0006, + "loss": 2.2895, + "step": 200 + }, + { + "epoch": 0.0007460292592675485, + "eval_valid_loss": 2.2506353855133057, + "eval_valid_loss/all": 2.1066203117370605, + "eval_valid_loss/end_span": 1.2818691730499268, + "eval_valid_perplexity/batch": 8.220412254333496, + "eval_valid_perplexity/end_span": 3.6033687591552734, + "eval_valid_perplexity/fim": 2.1171066761016846, + "eval_valid_perplexity/first_seq": 14.803163528442383, + "eval_valid_perplexity/last_seq": 9.780407905578613, + "eval_valid_perplexity/second_seq": 14.112411499023438, + "eval_valid_perplexity/seq": 9.239487648010254, + "eval_valid_reconstruction/all": 0.279074490070343, + "eval_valid_reconstruction/end_span": 0.6944790482521057, + "eval_valid_reconstruction/fim": 0.13914673030376434, + "eval_valid_reconstruction/first_seq": 0.17083600163459778, + "eval_valid_reconstruction/last_seq": 0.3013113737106323, + "eval_valid_reconstruction/second_seq": 0.18764500319957733, + "eval_valid_runtime": 425.6035, + "eval_valid_samples_per_second": 0.451, + "eval_valid_steps_per_second": 0.451, + "step": 200 + }, + { + "epoch": 0.0007460292592675485, + "eval_train_loss": 2.2472283840179443, + "eval_train_loss/all": 2.074686050415039, + "eval_train_loss/end_span": 1.2446478605270386, + "eval_train_perplexity/batch": 7.9620466232299805, + "eval_train_perplexity/end_span": 3.471712112426758, + "eval_train_perplexity/fim": 2.2139575481414795, + "eval_train_perplexity/first_seq": 15.228049278259277, + "eval_train_perplexity/last_seq": 9.440801620483398, + "eval_train_perplexity/second_seq": 13.89707088470459, + "eval_train_perplexity/seq": 9.143860816955566, + "eval_train_reconstruction/all": 0.2697349786758423, + "eval_train_reconstruction/end_span": 0.7061820030212402, + "eval_train_reconstruction/fim": 0.14820939302444458, + "eval_train_reconstruction/first_seq": 0.15584173798561096, + "eval_train_reconstruction/last_seq": 
0.3100087344646454, + "eval_train_reconstruction/second_seq": 0.1926582306623459, + "eval_train_runtime": 424.9654, + "eval_train_samples_per_second": 0.452, + "eval_train_steps_per_second": 0.452, + "step": 200 + }, + { + "epoch": 0.0007833307222309259, + "grad_norm": 0.8766208291053772, + "learning_rate": 0.0006, + "loss": 2.378, + "step": 210 + }, + { + "epoch": 0.0008206321851943033, + "grad_norm": 0.5716990828514099, + "learning_rate": 0.0006, + "loss": 2.344, + "step": 220 + }, + { + "epoch": 0.0008579336481576807, + "grad_norm": 0.5023060441017151, + "learning_rate": 0.0006, + "loss": 2.3834, + "step": 230 + }, + { + "epoch": 0.0008952351111210582, + "grad_norm": 0.7070898413658142, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 240 + }, + { + "epoch": 0.0009325365740844356, + "grad_norm": 0.3583345115184784, + "learning_rate": 0.0006, + "loss": 2.2116, + "step": 250 + }, + { + "epoch": 0.0009325365740844356, + "eval_valid_loss": 2.248309850692749, + "eval_valid_loss/all": 2.1048033237457275, + "eval_valid_loss/end_span": 1.5024409294128418, + "eval_valid_perplexity/batch": 8.205489158630371, + "eval_valid_perplexity/end_span": 4.492641925811768, + "eval_valid_perplexity/fim": 2.4862828254699707, + "eval_valid_perplexity/first_seq": 14.72343921661377, + "eval_valid_perplexity/last_seq": 9.438117027282715, + "eval_valid_perplexity/second_seq": 13.643280982971191, + "eval_valid_perplexity/seq": 9.22680377960205, + "eval_valid_reconstruction/all": 0.2794574797153473, + "eval_valid_reconstruction/end_span": 0.6484724283218384, + "eval_valid_reconstruction/fim": 0.1684781312942505, + "eval_valid_reconstruction/first_seq": 0.17253641784191132, + "eval_valid_reconstruction/last_seq": 0.313616544008255, + "eval_valid_reconstruction/second_seq": 0.20000982284545898, + "eval_valid_runtime": 452.2726, + "eval_valid_samples_per_second": 0.425, + "eval_valid_steps_per_second": 0.425, + "step": 250 + }, + { + "epoch": 0.0009325365740844356, + "eval_train_loss": 
2.243964910507202, + "eval_train_loss/all": 2.0721964836120605, + "eval_train_loss/end_span": 1.469992995262146, + "eval_train_perplexity/batch": 7.942248821258545, + "eval_train_perplexity/end_span": 4.3492045402526855, + "eval_train_perplexity/fim": 2.0532684326171875, + "eval_train_perplexity/first_seq": 15.334503173828125, + "eval_train_perplexity/last_seq": 9.49709415435791, + "eval_train_perplexity/second_seq": 14.265922546386719, + "eval_train_perplexity/seq": 9.121609687805176, + "eval_train_reconstruction/all": 0.2701709568500519, + "eval_train_reconstruction/end_span": 0.6583806872367859, + "eval_train_reconstruction/fim": 0.13463647663593292, + "eval_train_reconstruction/first_seq": 0.15559479594230652, + "eval_train_reconstruction/last_seq": 0.3089759349822998, + "eval_train_reconstruction/second_seq": 0.18475213646888733, + "eval_train_runtime": 429.2374, + "eval_train_samples_per_second": 0.447, + "eval_train_steps_per_second": 0.447, + "step": 250 + }, + { + "epoch": 0.000969838037047813, + "grad_norm": 0.5618013143539429, + "learning_rate": 0.0006, + "loss": 2.2042, + "step": 260 + }, + { + "epoch": 0.0010071395000111904, + "grad_norm": 0.8972188234329224, + "learning_rate": 0.0006, + "loss": 2.2338, + "step": 270 + }, + { + "epoch": 0.001044440962974568, + "grad_norm": 0.5056031942367554, + "learning_rate": 0.0006, + "loss": 2.2797, + "step": 280 + }, + { + "epoch": 0.0010817424259379452, + "grad_norm": 0.8329886198043823, + "learning_rate": 0.0006, + "loss": 2.3486, + "step": 290 + }, + { + "epoch": 0.0011190438889013227, + "grad_norm": 0.45791521668434143, + "learning_rate": 0.0006, + "loss": 2.221, + "step": 300 + }, + { + "epoch": 0.0011190438889013227, + "eval_valid_loss": 2.245884656906128, + "eval_valid_loss/all": 2.1031289100646973, + "eval_valid_loss/end_span": 1.2658547163009644, + "eval_valid_perplexity/batch": 8.191761016845703, + "eval_valid_perplexity/end_span": 3.5461223125457764, + "eval_valid_perplexity/fim": 2.2775838375091553, + 
"eval_valid_perplexity/first_seq": 15.019292831420898, + "eval_valid_perplexity/last_seq": 9.43178939819336, + "eval_valid_perplexity/second_seq": 13.908947944641113, + "eval_valid_perplexity/seq": 9.221272468566895, + "eval_valid_reconstruction/all": 0.279676616191864, + "eval_valid_reconstruction/end_span": 0.7044861912727356, + "eval_valid_reconstruction/fim": 0.15314684808254242, + "eval_valid_reconstruction/first_seq": 0.16659711301326752, + "eval_valid_reconstruction/last_seq": 0.3130665719509125, + "eval_valid_reconstruction/second_seq": 0.19150696694850922, + "eval_valid_runtime": 426.1563, + "eval_valid_samples_per_second": 0.451, + "eval_valid_steps_per_second": 0.451, + "step": 300 + }, + { + "epoch": 0.0011190438889013227, + "eval_train_loss": 2.245126724243164, + "eval_train_loss/all": 2.0739002227783203, + "eval_train_loss/end_span": 1.2507367134094238, + "eval_train_perplexity/batch": 7.95579195022583, + "eval_train_perplexity/end_span": 3.492915391921997, + "eval_train_perplexity/fim": 2.33967661857605, + "eval_train_perplexity/first_seq": 15.576973915100098, + "eval_train_perplexity/last_seq": 9.587554931640625, + "eval_train_perplexity/second_seq": 14.069046974182129, + "eval_train_perplexity/seq": 9.146804809570312, + "eval_train_reconstruction/all": 0.26941803097724915, + "eval_train_reconstruction/end_span": 0.712352991104126, + "eval_train_reconstruction/fim": 0.15894348919391632, + "eval_train_reconstruction/first_seq": 0.14921709895133972, + "eval_train_reconstruction/last_seq": 0.3080447018146515, + "eval_train_reconstruction/second_seq": 0.18935894966125488, + "eval_train_runtime": 425.0769, + "eval_train_samples_per_second": 0.452, + "eval_train_steps_per_second": 0.452, + "step": 300 + }, + { + "epoch": 0.0011563453518647002, + "grad_norm": 0.3887104392051697, + "learning_rate": 0.0006, + "loss": 2.3712, + "step": 310 + }, + { + "epoch": 0.0011936468148280777, + "grad_norm": 0.6115559935569763, + "learning_rate": 0.0006, + "loss": 
2.3256, + "step": 320 + }, + { + "epoch": 0.001230948277791455, + "grad_norm": 0.3341672420501709, + "learning_rate": 0.0006, + "loss": 2.3411, + "step": 330 + }, + { + "epoch": 0.0012682497407548324, + "grad_norm": 0.4338113069534302, + "learning_rate": 0.0006, + "loss": 2.3353, + "step": 340 + }, + { + "epoch": 0.00130555120371821, + "grad_norm": 0.42838719487190247, + "learning_rate": 0.0006, + "loss": 2.3141, + "step": 350 + }, + { + "epoch": 0.00130555120371821, + "eval_valid_loss": 2.2443888187408447, + "eval_valid_loss/all": 2.1009554862976074, + "eval_valid_loss/end_span": 1.3528696298599243, + "eval_valid_perplexity/batch": 8.173975944519043, + "eval_valid_perplexity/end_span": 3.8685107231140137, + "eval_valid_perplexity/fim": 2.700857400894165, + "eval_valid_perplexity/first_seq": 14.820439338684082, + "eval_valid_perplexity/last_seq": 9.585735321044922, + "eval_valid_perplexity/second_seq": 13.986279487609863, + "eval_valid_perplexity/seq": 9.194361686706543, + "eval_valid_reconstruction/all": 0.2806708514690399, + "eval_valid_reconstruction/end_span": 0.6846615076065063, + "eval_valid_reconstruction/fim": 0.18767519295215607, + "eval_valid_reconstruction/first_seq": 0.1725599616765976, + "eval_valid_reconstruction/last_seq": 0.3077888488769531, + "eval_valid_reconstruction/second_seq": 0.18922173976898193, + "eval_valid_runtime": 423.4396, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 350 + }, + { + "epoch": 0.00130555120371821, + "eval_train_loss": 2.242582321166992, + "eval_train_loss/all": 2.0709869861602783, + "eval_train_loss/end_span": 1.321014165878296, + "eval_train_perplexity/batch": 7.932648658752441, + "eval_train_perplexity/end_span": 3.7472198009490967, + "eval_train_perplexity/fim": 2.2454028129577637, + "eval_train_perplexity/first_seq": 15.372781753540039, + "eval_train_perplexity/last_seq": 8.907891273498535, + "eval_train_perplexity/second_seq": 14.539653778076172, + 
"eval_train_perplexity/seq": 9.118193626403809, + "eval_train_reconstruction/all": 0.2707076370716095, + "eval_train_reconstruction/end_span": 0.6962267756462097, + "eval_train_reconstruction/fim": 0.15203306078910828, + "eval_train_reconstruction/first_seq": 0.15258505940437317, + "eval_train_reconstruction/last_seq": 0.32804232835769653, + "eval_train_reconstruction/second_seq": 0.175615593791008, + "eval_train_runtime": 425.8004, + "eval_train_samples_per_second": 0.451, + "eval_train_steps_per_second": 0.451, + "step": 350 + }, + { + "epoch": 0.0013428526666815872, + "grad_norm": 0.43747881054878235, + "learning_rate": 0.0006, + "loss": 2.2589, + "step": 360 + }, + { + "epoch": 0.0013801541296449647, + "grad_norm": 0.3349955081939697, + "learning_rate": 0.0006, + "loss": 2.3769, + "step": 370 + }, + { + "epoch": 0.0014174555926083421, + "grad_norm": 0.5250808596611023, + "learning_rate": 0.0006, + "loss": 2.238, + "step": 380 + }, + { + "epoch": 0.0014547570555717194, + "grad_norm": 0.4960238039493561, + "learning_rate": 0.0006, + "loss": 2.1977, + "step": 390 + }, + { + "epoch": 0.001492058518535097, + "grad_norm": 0.34990715980529785, + "learning_rate": 0.0006, + "loss": 2.246, + "step": 400 + }, + { + "epoch": 0.001492058518535097, + "eval_valid_loss": 2.243054151535034, + "eval_valid_loss/all": 2.1000208854675293, + "eval_valid_loss/end_span": 1.532975673675537, + "eval_valid_perplexity/batch": 8.166340827941895, + "eval_valid_perplexity/end_span": 4.63193941116333, + "eval_valid_perplexity/fim": 2.6533730030059814, + "eval_valid_perplexity/first_seq": 14.365897178649902, + "eval_valid_perplexity/last_seq": 9.484182357788086, + "eval_valid_perplexity/second_seq": 13.739648818969727, + "eval_valid_perplexity/seq": 9.186469078063965, + "eval_valid_reconstruction/all": 0.2810598909854889, + "eval_valid_reconstruction/end_span": 0.6301844716072083, + "eval_valid_reconstruction/fim": 0.1824530065059662, + "eval_valid_reconstruction/first_seq": 
0.17728641629219055, + "eval_valid_reconstruction/last_seq": 0.31165972352027893, + "eval_valid_reconstruction/second_seq": 0.1962990015745163, + "eval_valid_runtime": 426.6735, + "eval_valid_samples_per_second": 0.45, + "eval_valid_steps_per_second": 0.45, + "step": 400 + }, + { + "epoch": 0.001492058518535097, + "eval_train_loss": 2.240382671356201, + "eval_train_loss/all": 2.0695576667785645, + "eval_train_loss/end_span": 1.4985074996948242, + "eval_train_perplexity/batch": 7.921318531036377, + "eval_train_perplexity/end_span": 4.475005149841309, + "eval_train_perplexity/fim": 2.1758899688720703, + "eval_train_perplexity/first_seq": 15.219279289245605, + "eval_train_perplexity/last_seq": 9.295220375061035, + "eval_train_perplexity/second_seq": 13.952699661254883, + "eval_train_perplexity/seq": 9.104668617248535, + "eval_train_reconstruction/all": 0.27126941084861755, + "eval_train_reconstruction/end_span": 0.6402729153633118, + "eval_train_reconstruction/fim": 0.14508146047592163, + "eval_train_reconstruction/first_seq": 0.15730254352092743, + "eval_train_reconstruction/last_seq": 0.315996378660202, + "eval_train_reconstruction/second_seq": 0.18928951025009155, + "eval_train_runtime": 424.433, + "eval_train_samples_per_second": 0.452, + "eval_train_steps_per_second": 0.452, + "step": 400 + }, + { + "epoch": 0.0015293599814984744, + "grad_norm": 0.5462997555732727, + "learning_rate": 0.0006, + "loss": 2.3338, + "step": 410 + }, + { + "epoch": 0.0015666614444618519, + "grad_norm": 0.4225701093673706, + "learning_rate": 0.0006, + "loss": 2.2359, + "step": 420 + }, + { + "epoch": 0.0016039629074252292, + "grad_norm": 0.5470573306083679, + "learning_rate": 0.0006, + "loss": 2.2144, + "step": 430 + }, + { + "epoch": 0.0016412643703886066, + "grad_norm": 0.7079697847366333, + "learning_rate": 0.0006, + "loss": 2.3531, + "step": 440 + }, + { + "epoch": 0.0016785658333519841, + "grad_norm": 0.6649802327156067, + "learning_rate": 0.0006, + "loss": 2.3587, + "step": 450 + 
}, + { + "epoch": 0.0016785658333519841, + "eval_valid_loss": 2.2444708347320557, + "eval_valid_loss/all": 2.1016409397125244, + "eval_valid_loss/end_span": 1.3999078273773193, + "eval_valid_perplexity/batch": 8.179580688476562, + "eval_valid_perplexity/end_span": 4.054826259613037, + "eval_valid_perplexity/fim": 2.5063130855560303, + "eval_valid_perplexity/first_seq": 14.758633613586426, + "eval_valid_perplexity/last_seq": 10.028159141540527, + "eval_valid_perplexity/second_seq": 13.625282287597656, + "eval_valid_perplexity/seq": 9.204562187194824, + "eval_valid_reconstruction/all": 0.28038668632507324, + "eval_valid_reconstruction/end_span": 0.6784474849700928, + "eval_valid_reconstruction/fim": 0.17117659747600555, + "eval_valid_reconstruction/first_seq": 0.16929873824119568, + "eval_valid_reconstruction/last_seq": 0.2971630096435547, + "eval_valid_reconstruction/second_seq": 0.197022944688797, + "eval_valid_runtime": 423.448, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 450 + }, + { + "epoch": 0.0016785658333519841, + "eval_train_loss": 2.2415096759796143, + "eval_train_loss/all": 2.0707666873931885, + "eval_train_loss/end_span": 1.3528107404708862, + "eval_train_perplexity/batch": 7.930901527404785, + "eval_train_perplexity/end_span": 3.8682830333709717, + "eval_train_perplexity/fim": 2.1766443252563477, + "eval_train_perplexity/first_seq": 15.49321174621582, + "eval_train_perplexity/last_seq": 9.448626518249512, + "eval_train_perplexity/second_seq": 14.096817970275879, + "eval_train_perplexity/seq": 9.122940063476562, + "eval_train_reconstruction/all": 0.2707836925983429, + "eval_train_reconstruction/end_span": 0.6916095018386841, + "eval_train_reconstruction/fim": 0.1456713080406189, + "eval_train_reconstruction/first_seq": 0.15504832565784454, + "eval_train_reconstruction/last_seq": 0.31087860465049744, + "eval_train_reconstruction/second_seq": 0.18818020820617676, + "eval_train_runtime": 423.7711, + 
"eval_train_samples_per_second": 0.453, + "eval_train_steps_per_second": 0.453, + "step": 450 + }, + { + "epoch": 0.0017158672963153614, + "grad_norm": 1.103627324104309, + "learning_rate": 0.0006, + "loss": 2.3398, + "step": 460 + }, + { + "epoch": 0.0017531687592787389, + "grad_norm": 0.6347258687019348, + "learning_rate": 0.0006, + "loss": 2.2729, + "step": 470 + }, + { + "epoch": 0.0017904702222421164, + "grad_norm": 0.6103149056434631, + "learning_rate": 0.0006, + "loss": 2.1979, + "step": 480 + }, + { + "epoch": 0.0018277716852054939, + "grad_norm": 0.5124384164810181, + "learning_rate": 0.0006, + "loss": 2.243, + "step": 490 + }, + { + "epoch": 0.0018650731481688711, + "grad_norm": 0.5198623538017273, + "learning_rate": 0.0006, + "loss": 2.2323, + "step": 500 + }, + { + "epoch": 0.0018650731481688711, + "eval_valid_loss": 2.2533273696899414, + "eval_valid_loss/all": 2.109740734100342, + "eval_valid_loss/end_span": 1.2636003494262695, + "eval_valid_perplexity/batch": 8.246103286743164, + "eval_valid_perplexity/end_span": 3.538137197494507, + "eval_valid_perplexity/fim": 2.2727465629577637, + "eval_valid_perplexity/first_seq": 14.446853637695312, + "eval_valid_perplexity/last_seq": 9.735045433044434, + "eval_valid_perplexity/second_seq": 13.691295623779297, + "eval_valid_perplexity/seq": 9.280083656311035, + "eval_valid_reconstruction/all": 0.2781563103199005, + "eval_valid_reconstruction/end_span": 0.706154465675354, + "eval_valid_reconstruction/fim": 0.15202882885932922, + "eval_valid_reconstruction/first_seq": 0.18029211461544037, + "eval_valid_reconstruction/last_seq": 0.3045370578765869, + "eval_valid_reconstruction/second_seq": 0.1945400983095169, + "eval_valid_runtime": 423.6481, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 500 + }, + { + "epoch": 0.0018650731481688711, + "eval_train_loss": 2.247398853302002, + "eval_train_loss/all": 2.075667381286621, + "eval_train_loss/end_span": 1.2168118953704834, + 
"eval_train_perplexity/batch": 7.969863414764404, + "eval_train_perplexity/end_span": 3.376406192779541, + "eval_train_perplexity/fim": 2.345823049545288, + "eval_train_perplexity/first_seq": 15.406208038330078, + "eval_train_perplexity/last_seq": 9.848387718200684, + "eval_train_perplexity/second_seq": 14.635083198547363, + "eval_train_perplexity/seq": 9.162555694580078, + "eval_train_reconstruction/all": 0.2693566083908081, + "eval_train_reconstruction/end_span": 0.7214562296867371, + "eval_train_reconstruction/fim": 0.1592075675725937, + "eval_train_reconstruction/first_seq": 0.15510539710521698, + "eval_train_reconstruction/last_seq": 0.2959236204624176, + "eval_train_reconstruction/second_seq": 0.1743078976869583, + "eval_train_runtime": 422.3691, + "eval_train_samples_per_second": 0.455, + "eval_train_steps_per_second": 0.455, + "step": 500 + }, + { + "epoch": 0.0019023746111322486, + "grad_norm": 0.5075757503509521, + "learning_rate": 0.0006, + "loss": 2.1106, + "step": 510 + }, + { + "epoch": 0.001939676074095626, + "grad_norm": 0.9810751676559448, + "learning_rate": 0.0006, + "loss": 2.2201, + "step": 520 + }, + { + "epoch": 0.0019769775370590036, + "grad_norm": 0.49368682503700256, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 530 + }, + { + "epoch": 0.002014279000022381, + "grad_norm": 0.4513261616230011, + "learning_rate": 0.0006, + "loss": 2.3163, + "step": 540 + }, + { + "epoch": 0.002051580462985758, + "grad_norm": 0.5958581566810608, + "learning_rate": 0.0006, + "loss": 2.1857, + "step": 550 + }, + { + "epoch": 0.002051580462985758, + "eval_valid_loss": 2.2453410625457764, + "eval_valid_loss/all": 2.1022517681121826, + "eval_valid_loss/end_span": 1.401026725769043, + "eval_valid_perplexity/batch": 8.184578895568848, + "eval_valid_perplexity/end_span": 4.059365749359131, + "eval_valid_perplexity/fim": 2.361318826675415, + "eval_valid_perplexity/first_seq": 14.299607276916504, + "eval_valid_perplexity/last_seq": 9.705738067626953, + 
"eval_valid_perplexity/second_seq": 13.971681594848633, + "eval_valid_perplexity/seq": 9.210875511169434, + "eval_valid_reconstruction/all": 0.2803081274032593, + "eval_valid_reconstruction/end_span": 0.6692107319831848, + "eval_valid_reconstruction/fim": 0.15913386642932892, + "eval_valid_reconstruction/first_seq": 0.17946910858154297, + "eval_valid_reconstruction/last_seq": 0.30482998490333557, + "eval_valid_reconstruction/second_seq": 0.19258852303028107, + "eval_valid_runtime": 422.6755, + "eval_valid_samples_per_second": 0.454, + "eval_valid_steps_per_second": 0.454, + "step": 550 + }, + { + "epoch": 0.002051580462985758, + "eval_train_loss": 2.2418103218078613, + "eval_train_loss/all": 2.0709476470947266, + "eval_train_loss/end_span": 1.3661197423934937, + "eval_train_perplexity/batch": 7.932336807250977, + "eval_train_perplexity/end_span": 3.9201102256774902, + "eval_train_perplexity/fim": 2.2450976371765137, + "eval_train_perplexity/first_seq": 15.47325611114502, + "eval_train_perplexity/last_seq": 9.65749740600586, + "eval_train_perplexity/second_seq": 13.833841323852539, + "eval_train_perplexity/seq": 9.125374794006348, + "eval_train_reconstruction/all": 0.2708406150341034, + "eval_train_reconstruction/end_span": 0.6808918118476868, + "eval_train_reconstruction/fim": 0.15220339596271515, + "eval_train_reconstruction/first_seq": 0.15244804322719574, + "eval_train_reconstruction/last_seq": 0.29988136887550354, + "eval_train_reconstruction/second_seq": 0.1922861784696579, + "eval_train_runtime": 422.3382, + "eval_train_samples_per_second": 0.455, + "eval_train_steps_per_second": 0.455, + "step": 550 + }, + { + "epoch": 0.002088881925949136, + "grad_norm": 0.7571771144866943, + "learning_rate": 0.0006, + "loss": 2.2395, + "step": 560 + }, + { + "epoch": 0.002126183388912513, + "grad_norm": 0.41135627031326294, + "learning_rate": 0.0006, + "loss": 2.2978, + "step": 570 + }, + { + "epoch": 0.0021634848518758904, + "grad_norm": 0.38260993361473083, + 
"learning_rate": 0.0006, + "loss": 2.1848, + "step": 580 + }, + { + "epoch": 0.002200786314839268, + "grad_norm": 0.40670692920684814, + "learning_rate": 0.0006, + "loss": 2.0304, + "step": 590 + }, + { + "epoch": 0.0022380877778026454, + "grad_norm": 0.5311611294746399, + "learning_rate": 0.0006, + "loss": 2.2396, + "step": 600 + }, + { + "epoch": 0.0022380877778026454, + "eval_valid_loss": 2.2458791732788086, + "eval_valid_loss/all": 2.102909564971924, + "eval_valid_loss/end_span": 1.3502534627914429, + "eval_valid_perplexity/batch": 8.189964294433594, + "eval_valid_perplexity/end_span": 3.858403444290161, + "eval_valid_perplexity/fim": 2.1810715198516846, + "eval_valid_perplexity/first_seq": 14.89786148071289, + "eval_valid_perplexity/last_seq": 9.259566307067871, + "eval_valid_perplexity/second_seq": 13.9823579788208, + "eval_valid_perplexity/seq": 9.222006797790527, + "eval_valid_reconstruction/all": 0.28018397092819214, + "eval_valid_reconstruction/end_span": 0.6787510514259338, + "eval_valid_reconstruction/fim": 0.14617964625358582, + "eval_valid_reconstruction/first_seq": 0.1677406132221222, + "eval_valid_reconstruction/last_seq": 0.31934690475463867, + "eval_valid_reconstruction/second_seq": 0.18993933498859406, + "eval_valid_runtime": 422.9791, + "eval_valid_samples_per_second": 0.454, + "eval_valid_steps_per_second": 0.454, + "step": 600 + }, + { + "epoch": 0.0022380877778026454, + "eval_train_loss": 2.240086793899536, + "eval_train_loss/all": 2.0693674087524414, + "eval_train_loss/end_span": 1.3004779815673828, + "eval_train_perplexity/batch": 7.919811725616455, + "eval_train_perplexity/end_span": 3.671051025390625, + "eval_train_perplexity/fim": 2.1240344047546387, + "eval_train_perplexity/first_seq": 15.469769477844238, + "eval_train_perplexity/last_seq": 9.547102928161621, + "eval_train_perplexity/second_seq": 14.567835807800293, + "eval_train_perplexity/seq": 9.110413551330566, + "eval_train_reconstruction/all": 0.27128589153289795, + 
"eval_train_reconstruction/end_span": 0.6970626711845398, + "eval_train_reconstruction/fim": 0.14182741940021515, + "eval_train_reconstruction/first_seq": 0.15089024603366852, + "eval_train_reconstruction/last_seq": 0.30601802468299866, + "eval_train_reconstruction/second_seq": 0.17608465254306793, + "eval_train_runtime": 423.4838, + "eval_train_samples_per_second": 0.453, + "eval_train_steps_per_second": 0.453, + "step": 600 + }, + { + "epoch": 0.0022753892407660226, + "grad_norm": 0.394218385219574, + "learning_rate": 0.0006, + "loss": 2.2475, + "step": 610 + }, + { + "epoch": 0.0023126907037294003, + "grad_norm": 0.5438580513000488, + "learning_rate": 0.0006, + "loss": 2.3021, + "step": 620 + }, + { + "epoch": 0.0023499921666927776, + "grad_norm": 0.43195679783821106, + "learning_rate": 0.0006, + "loss": 2.3423, + "step": 630 + }, + { + "epoch": 0.0023872936296561553, + "grad_norm": 0.5171619653701782, + "learning_rate": 0.0006, + "loss": 2.2695, + "step": 640 + }, + { + "epoch": 0.0024245950926195326, + "grad_norm": 0.518604040145874, + "learning_rate": 0.0006, + "loss": 2.3336, + "step": 650 + }, + { + "epoch": 0.0024245950926195326, + "eval_valid_loss": 2.2409257888793945, + "eval_valid_loss/all": 2.098336696624756, + "eval_valid_loss/end_span": 1.446785807609558, + "eval_valid_perplexity/batch": 8.15259838104248, + "eval_valid_perplexity/end_span": 4.249433994293213, + "eval_valid_perplexity/fim": 2.521286725997925, + "eval_valid_perplexity/first_seq": 14.52824592590332, + "eval_valid_perplexity/last_seq": 9.446392059326172, + "eval_valid_perplexity/second_seq": 13.337362289428711, + "eval_valid_perplexity/seq": 9.173117637634277, + "eval_valid_reconstruction/all": 0.28126123547554016, + "eval_valid_reconstruction/end_span": 0.6588543653488159, + "eval_valid_reconstruction/fim": 0.17423443496227264, + "eval_valid_reconstruction/first_seq": 0.17518284916877747, + "eval_valid_reconstruction/last_seq": 0.3128245770931244, + 
"eval_valid_reconstruction/second_seq": 0.20498354732990265, + "eval_valid_runtime": 424.1563, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 650 + }, + { + "epoch": 0.0024245950926195326, + "eval_train_loss": 2.235626697540283, + "eval_train_loss/all": 2.065098524093628, + "eval_train_loss/end_span": 1.411893367767334, + "eval_train_perplexity/batch": 7.886075019836426, + "eval_train_perplexity/end_span": 4.103717803955078, + "eval_train_perplexity/fim": 2.179630756378174, + "eval_train_perplexity/first_seq": 14.849600791931152, + "eval_train_perplexity/last_seq": 9.779481887817383, + "eval_train_perplexity/second_seq": 13.996376037597656, + "eval_train_perplexity/seq": 9.06552791595459, + "eval_train_reconstruction/all": 0.27230364084243774, + "eval_train_reconstruction/end_span": 0.6713069677352905, + "eval_train_reconstruction/fim": 0.1473502665758133, + "eval_train_reconstruction/first_seq": 0.1683163195848465, + "eval_train_reconstruction/last_seq": 0.29631680250167847, + "eval_train_reconstruction/second_seq": 0.18628011643886566, + "eval_train_runtime": 423.1736, + "eval_train_samples_per_second": 0.454, + "eval_train_steps_per_second": 0.454, + "step": 650 + }, + { + "epoch": 0.00246189655558291, + "grad_norm": 0.4265691339969635, + "learning_rate": 0.0006, + "loss": 2.0784, + "step": 660 + }, + { + "epoch": 0.0024991980185462876, + "grad_norm": 0.6192411184310913, + "learning_rate": 0.0006, + "loss": 2.3116, + "step": 670 + }, + { + "epoch": 0.002536499481509665, + "grad_norm": 0.4246861934661865, + "learning_rate": 0.0006, + "loss": 2.223, + "step": 680 + }, + { + "epoch": 0.002573800944473042, + "grad_norm": 0.5780408382415771, + "learning_rate": 0.0006, + "loss": 2.0753, + "step": 690 + }, + { + "epoch": 0.00261110240743642, + "grad_norm": 0.4032306969165802, + "learning_rate": 0.0006, + "loss": 2.431, + "step": 700 + }, + { + "epoch": 0.00261110240743642, + "eval_valid_loss": 2.2570462226867676, + 
"eval_valid_loss/all": 2.112900733947754, + "eval_valid_loss/end_span": 1.467542052268982, + "eval_valid_perplexity/batch": 8.272201538085938, + "eval_valid_perplexity/end_span": 4.338558197021484, + "eval_valid_perplexity/fim": 2.313089370727539, + "eval_valid_perplexity/first_seq": 14.657777786254883, + "eval_valid_perplexity/last_seq": 9.61377239227295, + "eval_valid_perplexity/second_seq": 13.539374351501465, + "eval_valid_perplexity/seq": 9.3034029006958, + "eval_valid_reconstruction/all": 0.2769138216972351, + "eval_valid_reconstruction/end_span": 0.6734012961387634, + "eval_valid_reconstruction/fim": 0.15179264545440674, + "eval_valid_reconstruction/first_seq": 0.17034246027469635, + "eval_valid_reconstruction/last_seq": 0.3070250153541565, + "eval_valid_reconstruction/second_seq": 0.20221304893493652, + "eval_valid_runtime": 423.787, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 700 + }, + { + "epoch": 0.00261110240743642, + "eval_train_loss": 2.251260757446289, + "eval_train_loss/all": 2.0791680812835693, + "eval_train_loss/end_span": 1.366320013999939, + "eval_train_perplexity/batch": 7.997812747955322, + "eval_train_perplexity/end_span": 3.9208953380584717, + "eval_train_perplexity/fim": 2.169670820236206, + "eval_train_perplexity/first_seq": 15.318888664245605, + "eval_train_perplexity/last_seq": 9.560327529907227, + "eval_train_perplexity/second_seq": 13.995455741882324, + "eval_train_perplexity/seq": 9.194334983825684, + "eval_train_reconstruction/all": 0.26783961057662964, + "eval_train_reconstruction/end_span": 0.6910691261291504, + "eval_train_reconstruction/fim": 0.14273840188980103, + "eval_train_reconstruction/first_seq": 0.1571597009897232, + "eval_train_reconstruction/last_seq": 0.3072175681591034, + "eval_train_reconstruction/second_seq": 0.18926985561847687, + "eval_train_runtime": 423.9243, + "eval_train_samples_per_second": 0.453, + "eval_train_steps_per_second": 0.453, + "step": 700 + }, + { + 
"epoch": 0.002648403870399797, + "grad_norm": 0.3995530903339386, + "learning_rate": 0.0006, + "loss": 2.2408, + "step": 710 + }, + { + "epoch": 0.0026857053333631743, + "grad_norm": 0.5143084526062012, + "learning_rate": 0.0006, + "loss": 2.3706, + "step": 720 + }, + { + "epoch": 0.002723006796326552, + "grad_norm": 0.7673223614692688, + "learning_rate": 0.0006, + "loss": 2.253, + "step": 730 + }, + { + "epoch": 0.0027603082592899293, + "grad_norm": 0.4931619167327881, + "learning_rate": 0.0006, + "loss": 2.157, + "step": 740 + }, + { + "epoch": 0.0027976097222533066, + "grad_norm": 0.5465825200080872, + "learning_rate": 0.0006, + "loss": 2.135, + "step": 750 + }, + { + "epoch": 0.0027976097222533066, + "eval_valid_loss": 2.2584950923919678, + "eval_valid_loss/all": 2.114321231842041, + "eval_valid_loss/end_span": 1.4141557216644287, + "eval_valid_perplexity/batch": 8.283961296081543, + "eval_valid_perplexity/end_span": 4.113012313842773, + "eval_valid_perplexity/fim": 2.218484401702881, + "eval_valid_perplexity/first_seq": 14.638894081115723, + "eval_valid_perplexity/last_seq": 9.71036434173584, + "eval_valid_perplexity/second_seq": 13.773860931396484, + "eval_valid_perplexity/seq": 9.331141471862793, + "eval_valid_reconstruction/all": 0.27683115005493164, + "eval_valid_reconstruction/end_span": 0.6746647953987122, + "eval_valid_reconstruction/fim": 0.1456776261329651, + "eval_valid_reconstruction/first_seq": 0.17465437948703766, + "eval_valid_reconstruction/last_seq": 0.305462509393692, + "eval_valid_reconstruction/second_seq": 0.19755341112613678, + "eval_valid_runtime": 424.216, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 750 + }, + { + "epoch": 0.0027976097222533066, + "eval_train_loss": 2.2502596378326416, + "eval_train_loss/all": 2.07905650138855, + "eval_train_loss/end_span": 1.3618639707565308, + "eval_train_perplexity/batch": 7.996920108795166, + "eval_train_perplexity/end_span": 3.9034624099731445, + 
"eval_train_perplexity/fim": 1.9489219188690186, + "eval_train_perplexity/first_seq": 15.351205825805664, + "eval_train_perplexity/last_seq": 10.131032943725586, + "eval_train_perplexity/second_seq": 14.184494972229004, + "eval_train_perplexity/seq": 9.20925235748291, + "eval_train_reconstruction/all": 0.2686041295528412, + "eval_train_reconstruction/end_span": 0.6871709823608398, + "eval_train_reconstruction/fim": 0.12324519455432892, + "eval_train_reconstruction/first_seq": 0.15772439539432526, + "eval_train_reconstruction/last_seq": 0.28852760791778564, + "eval_train_reconstruction/second_seq": 0.18146495521068573, + "eval_train_runtime": 423.3213, + "eval_train_samples_per_second": 0.454, + "eval_train_steps_per_second": 0.454, + "step": 750 + }, + { + "epoch": 0.0028349111852166843, + "grad_norm": 2.4759130477905273, + "learning_rate": 0.0006, + "loss": 2.3881, + "step": 760 + }, + { + "epoch": 0.0028722126481800616, + "grad_norm": 0.4680320620536804, + "learning_rate": 0.0006, + "loss": 2.2437, + "step": 770 + }, + { + "epoch": 0.002909514111143439, + "grad_norm": 0.4531870484352112, + "learning_rate": 0.0006, + "loss": 2.2053, + "step": 780 + }, + { + "epoch": 0.0029468155741068165, + "grad_norm": 0.7503834366798401, + "learning_rate": 0.0006, + "loss": 2.2606, + "step": 790 + }, + { + "epoch": 0.002984117037070194, + "grad_norm": 0.4256652295589447, + "learning_rate": 0.0006, + "loss": 2.1878, + "step": 800 + }, + { + "epoch": 0.002984117037070194, + "eval_valid_loss": 2.2478582859039307, + "eval_valid_loss/all": 2.1047351360321045, + "eval_valid_loss/end_span": 1.4422577619552612, + "eval_valid_perplexity/batch": 8.20492935180664, + "eval_valid_perplexity/end_span": 4.230236053466797, + "eval_valid_perplexity/fim": 2.434185266494751, + "eval_valid_perplexity/first_seq": 14.912623405456543, + "eval_valid_perplexity/last_seq": 9.40966510772705, + "eval_valid_perplexity/second_seq": 13.78257942199707, + "eval_valid_perplexity/seq": 9.234667778015137, + 
"eval_valid_reconstruction/all": 0.27925172448158264, + "eval_valid_reconstruction/end_span": 0.6576027274131775, + "eval_valid_reconstruction/fim": 0.16590046882629395, + "eval_valid_reconstruction/first_seq": 0.16950972378253937, + "eval_valid_reconstruction/last_seq": 0.314631849527359, + "eval_valid_reconstruction/second_seq": 0.19763003289699554, + "eval_valid_runtime": 427.6396, + "eval_valid_samples_per_second": 0.449, + "eval_valid_steps_per_second": 0.449, + "step": 800 + }, + { + "epoch": 0.002984117037070194, + "eval_train_loss": 2.2421748638153076, + "eval_train_loss/all": 2.0715391635894775, + "eval_train_loss/end_span": 1.413275957107544, + "eval_train_perplexity/batch": 7.93703031539917, + "eval_train_perplexity/end_span": 4.109395503997803, + "eval_train_perplexity/fim": 2.247498035430908, + "eval_train_perplexity/first_seq": 15.157039642333984, + "eval_train_perplexity/last_seq": 9.419574737548828, + "eval_train_perplexity/second_seq": 14.357809066772461, + "eval_train_perplexity/seq": 9.127965927124023, + "eval_train_reconstruction/all": 0.27035680413246155, + "eval_train_reconstruction/end_span": 0.6663448214530945, + "eval_train_reconstruction/fim": 0.15150530636310577, + "eval_train_reconstruction/first_seq": 0.15888845920562744, + "eval_train_reconstruction/last_seq": 0.3131220042705536, + "eval_train_reconstruction/second_seq": 0.18131813406944275, + "eval_train_runtime": 428.2548, + "eval_train_samples_per_second": 0.448, + "eval_train_steps_per_second": 0.448, + "step": 800 + }, + { + "epoch": 0.0030214185000335715, + "grad_norm": 0.5186172127723694, + "learning_rate": 0.0006, + "loss": 2.3685, + "step": 810 + }, + { + "epoch": 0.003058719962996949, + "grad_norm": 0.40983784198760986, + "learning_rate": 0.0006, + "loss": 2.0957, + "step": 820 + }, + { + "epoch": 0.003096021425960326, + "grad_norm": 1.4866093397140503, + "learning_rate": 0.0006, + "loss": 2.2743, + "step": 830 + }, + { + "epoch": 0.0031333228889237038, + "grad_norm": 
0.5193774700164795, + "learning_rate": 0.0006, + "loss": 2.2733, + "step": 840 + }, + { + "epoch": 0.003170624351887081, + "grad_norm": 0.5017924308776855, + "learning_rate": 0.0006, + "loss": 2.4245, + "step": 850 + }, + { + "epoch": 0.003170624351887081, + "eval_valid_loss": 2.250948667526245, + "eval_valid_loss/all": 2.1072299480438232, + "eval_valid_loss/end_span": 1.3621842861175537, + "eval_valid_perplexity/batch": 8.225424766540527, + "eval_valid_perplexity/end_span": 3.9047129154205322, + "eval_valid_perplexity/fim": 2.5248663425445557, + "eval_valid_perplexity/first_seq": 14.826288223266602, + "eval_valid_perplexity/last_seq": 9.525287628173828, + "eval_valid_perplexity/second_seq": 13.823966979980469, + "eval_valid_perplexity/seq": 9.250818252563477, + "eval_valid_reconstruction/all": 0.2790178954601288, + "eval_valid_reconstruction/end_span": 0.6849936842918396, + "eval_valid_reconstruction/fim": 0.17273283004760742, + "eval_valid_reconstruction/first_seq": 0.1687050759792328, + "eval_valid_reconstruction/last_seq": 0.3117651343345642, + "eval_valid_reconstruction/second_seq": 0.19452589750289917, + "eval_valid_runtime": 426.404, + "eval_valid_samples_per_second": 0.45, + "eval_valid_steps_per_second": 0.45, + "step": 850 + }, + { + "epoch": 0.003170624351887081, + "eval_train_loss": 2.2475216388702393, + "eval_train_loss/all": 2.0758097171783447, + "eval_train_loss/end_span": 1.3354228734970093, + "eval_train_perplexity/batch": 7.970998287200928, + "eval_train_perplexity/end_span": 3.801603317260742, + "eval_train_perplexity/fim": 2.213397979736328, + "eval_train_perplexity/first_seq": 15.014763832092285, + "eval_train_perplexity/last_seq": 9.345738410949707, + "eval_train_perplexity/second_seq": 14.279513359069824, + "eval_train_perplexity/seq": 9.162177085876465, + "eval_train_reconstruction/all": 0.2694701552391052, + "eval_train_reconstruction/end_span": 0.6933578848838806, + "eval_train_reconstruction/fim": 0.148654505610466, + 
"eval_train_reconstruction/first_seq": 0.1626291275024414, + "eval_train_reconstruction/last_seq": 0.3107066750526428, + "eval_train_reconstruction/second_seq": 0.18382275104522705, + "eval_train_runtime": 428.2509, + "eval_train_samples_per_second": 0.448, + "eval_train_steps_per_second": 0.448, + "step": 850 + }, + { + "epoch": 0.0032079258148504583, + "grad_norm": 0.40767884254455566, + "learning_rate": 0.0006, + "loss": 2.1674, + "step": 860 + }, + { + "epoch": 0.003245227277813836, + "grad_norm": 0.41380131244659424, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 870 + }, + { + "epoch": 0.0032825287407772133, + "grad_norm": 0.43658992648124695, + "learning_rate": 0.0006, + "loss": 2.1014, + "step": 880 + }, + { + "epoch": 0.0033198302037405905, + "grad_norm": 0.610468327999115, + "learning_rate": 0.0006, + "loss": 2.297, + "step": 890 + }, + { + "epoch": 0.0033571316667039683, + "grad_norm": 0.46477416157722473, + "learning_rate": 0.0006, + "loss": 2.2376, + "step": 900 + }, + { + "epoch": 0.0033571316667039683, + "eval_valid_loss": 2.2530500888824463, + "eval_valid_loss/all": 2.109557628631592, + "eval_valid_loss/end_span": 1.2702674865722656, + "eval_valid_perplexity/batch": 8.244593620300293, + "eval_valid_perplexity/end_span": 3.561805248260498, + "eval_valid_perplexity/fim": 2.3163225650787354, + "eval_valid_perplexity/first_seq": 14.963674545288086, + "eval_valid_perplexity/last_seq": 9.649340629577637, + "eval_valid_perplexity/second_seq": 13.917495727539062, + "eval_valid_perplexity/seq": 9.276806831359863, + "eval_valid_reconstruction/all": 0.27748072147369385, + "eval_valid_reconstruction/end_span": 0.7057514190673828, + "eval_valid_reconstruction/fim": 0.15599589049816132, + "eval_valid_reconstruction/first_seq": 0.1638992577791214, + "eval_valid_reconstruction/last_seq": 0.304791659116745, + "eval_valid_reconstruction/second_seq": 0.19183850288391113, + "eval_valid_runtime": 428.3102, + "eval_valid_samples_per_second": 0.448, + 
"eval_valid_steps_per_second": 0.448, + "step": 900 + }, + { + "epoch": 0.0033571316667039683, + "eval_train_loss": 2.247819185256958, + "eval_train_loss/all": 2.0762863159179688, + "eval_train_loss/end_span": 1.2318798303604126, + "eval_train_perplexity/batch": 7.97479772567749, + "eval_train_perplexity/end_span": 3.4276669025421143, + "eval_train_perplexity/fim": 2.1326470375061035, + "eval_train_perplexity/first_seq": 15.279287338256836, + "eval_train_perplexity/last_seq": 9.825380325317383, + "eval_train_perplexity/second_seq": 14.111409187316895, + "eval_train_perplexity/seq": 9.169953346252441, + "eval_train_reconstruction/all": 0.2685600817203522, + "eval_train_reconstruction/end_span": 0.7169285416603088, + "eval_train_reconstruction/fim": 0.1412089467048645, + "eval_train_reconstruction/first_seq": 0.15858246386051178, + "eval_train_reconstruction/last_seq": 0.2985265552997589, + "eval_train_reconstruction/second_seq": 0.18437035381793976, + "eval_train_runtime": 426.5567, + "eval_train_samples_per_second": 0.45, + "eval_train_steps_per_second": 0.45, + "step": 900 + }, + { + "epoch": 0.0033944331296673455, + "grad_norm": 0.5421077013015747, + "learning_rate": 0.0006, + "loss": 2.2684, + "step": 910 + }, + { + "epoch": 0.003431734592630723, + "grad_norm": 0.5041180849075317, + "learning_rate": 0.0006, + "loss": 2.2506, + "step": 920 + }, + { + "epoch": 0.0034690360555941005, + "grad_norm": 0.3851730227470398, + "learning_rate": 0.0006, + "loss": 2.2867, + "step": 930 + }, + { + "epoch": 0.0035063375185574778, + "grad_norm": 0.4953295886516571, + "learning_rate": 0.0006, + "loss": 2.2281, + "step": 940 + }, + { + "epoch": 0.003543638981520855, + "grad_norm": 0.843044638633728, + "learning_rate": 0.0006, + "loss": 2.2775, + "step": 950 + }, + { + "epoch": 0.003543638981520855, + "eval_valid_loss": 2.249528646469116, + "eval_valid_loss/all": 2.1060314178466797, + "eval_valid_loss/end_span": 1.2688151597976685, + "eval_valid_perplexity/batch": 
8.215572357177734, + "eval_valid_perplexity/end_span": 3.556636095046997, + "eval_valid_perplexity/fim": 2.51434588432312, + "eval_valid_perplexity/first_seq": 14.79843807220459, + "eval_valid_perplexity/last_seq": 9.490052223205566, + "eval_valid_perplexity/second_seq": 13.55102252960205, + "eval_valid_perplexity/seq": 9.243762016296387, + "eval_valid_reconstruction/all": 0.2788875997066498, + "eval_valid_reconstruction/end_span": 0.6985920667648315, + "eval_valid_reconstruction/fim": 0.17202021181583405, + "eval_valid_reconstruction/first_seq": 0.17094913125038147, + "eval_valid_reconstruction/last_seq": 0.3134399354457855, + "eval_valid_reconstruction/second_seq": 0.1999603509902954, + "eval_valid_runtime": 426.3542, + "eval_valid_samples_per_second": 0.45, + "eval_valid_steps_per_second": 0.45, + "step": 950 + }, + { + "epoch": 0.003543638981520855, + "eval_train_loss": 2.2447140216827393, + "eval_train_loss/all": 2.0735087394714355, + "eval_train_loss/end_span": 1.2302989959716797, + "eval_train_perplexity/batch": 7.952678203582764, + "eval_train_perplexity/end_span": 3.422252655029297, + "eval_train_perplexity/fim": 2.3452322483062744, + "eval_train_perplexity/first_seq": 14.949406623840332, + "eval_train_perplexity/last_seq": 9.437347412109375, + "eval_train_perplexity/second_seq": 14.229103088378906, + "eval_train_perplexity/seq": 9.14814281463623, + "eval_train_reconstruction/all": 0.2698134183883667, + "eval_train_reconstruction/end_span": 0.7110738754272461, + "eval_train_reconstruction/fim": 0.15896075963974, + "eval_train_reconstruction/first_seq": 0.16466525197029114, + "eval_train_reconstruction/last_seq": 0.310940682888031, + "eval_train_reconstruction/second_seq": 0.18264156579971313, + "eval_train_runtime": 425.9326, + "eval_train_samples_per_second": 0.451, + "eval_train_steps_per_second": 0.451, + "step": 950 + }, + { + "epoch": 0.0035809404444842327, + "grad_norm": 0.5834395885467529, + "learning_rate": 0.0006, + "loss": 2.1897, + "step": 960 + 
}, + { + "epoch": 0.00361824190744761, + "grad_norm": 0.39764344692230225, + "learning_rate": 0.0006, + "loss": 2.3303, + "step": 970 + }, + { + "epoch": 0.0036555433704109877, + "grad_norm": 0.4281321167945862, + "learning_rate": 0.0006, + "loss": 2.1527, + "step": 980 + }, + { + "epoch": 0.003692844833374365, + "grad_norm": 0.526283860206604, + "learning_rate": 0.0006, + "loss": 2.3607, + "step": 990 + }, + { + "epoch": 0.0037301462963377423, + "grad_norm": 0.46033525466918945, + "learning_rate": 0.0006, + "loss": 2.1341, + "step": 1000 + }, + { + "epoch": 0.0037301462963377423, + "eval_valid_loss": 2.251211404800415, + "eval_valid_loss/all": 2.108123540878296, + "eval_valid_loss/end_span": 1.2776356935501099, + "eval_valid_perplexity/batch": 8.232778549194336, + "eval_valid_perplexity/end_span": 3.588146209716797, + "eval_valid_perplexity/fim": 2.4623191356658936, + "eval_valid_perplexity/first_seq": 14.84987735748291, + "eval_valid_perplexity/last_seq": 10.085597038269043, + "eval_valid_perplexity/second_seq": 13.688599586486816, + "eval_valid_perplexity/seq": 9.266373634338379, + "eval_valid_reconstruction/all": 0.2782413363456726, + "eval_valid_reconstruction/end_span": 0.7119661569595337, + "eval_valid_reconstruction/fim": 0.16658063232898712, + "eval_valid_reconstruction/first_seq": 0.16951173543930054, + "eval_valid_reconstruction/last_seq": 0.29352492094039917, + "eval_valid_reconstruction/second_seq": 0.19971616566181183, + "eval_valid_runtime": 427.3798, + "eval_valid_samples_per_second": 0.449, + "eval_valid_steps_per_second": 0.449, + "step": 1000 + }, + { + "epoch": 0.0037301462963377423, + "eval_train_loss": 2.2469704151153564, + "eval_train_loss/all": 2.0758914947509766, + "eval_train_loss/end_span": 1.241217851638794, + "eval_train_perplexity/batch": 7.971650123596191, + "eval_train_perplexity/end_span": 3.459824562072754, + "eval_train_perplexity/fim": 2.1171669960021973, + "eval_train_perplexity/first_seq": 15.428604125976562, + 
"eval_train_perplexity/last_seq": 9.717828750610352, + "eval_train_perplexity/second_seq": 14.396509170532227, + "eval_train_perplexity/seq": 9.174412727355957, + "eval_train_reconstruction/all": 0.26897984743118286, + "eval_train_reconstruction/end_span": 0.7254242897033691, + "eval_train_reconstruction/fim": 0.14009523391723633, + "eval_train_reconstruction/first_seq": 0.15570683777332306, + "eval_train_reconstruction/last_seq": 0.30401769280433655, + "eval_train_reconstruction/second_seq": 0.18058492243289948, + "eval_train_runtime": 427.4397, + "eval_train_samples_per_second": 0.449, + "eval_train_steps_per_second": 0.449, + "step": 1000 + }, + { + "epoch": 0.00376744775930112, + "grad_norm": 0.38100719451904297, + "learning_rate": 0.0006, + "loss": 2.1878, + "step": 1010 + }, + { + "epoch": 0.0038047492222644972, + "grad_norm": 0.9534198641777039, + "learning_rate": 0.0006, + "loss": 2.2324, + "step": 1020 + }, + { + "epoch": 0.0038420506852278745, + "grad_norm": 0.5537413954734802, + "learning_rate": 0.0006, + "loss": 2.2661, + "step": 1030 + }, + { + "epoch": 0.003879352148191252, + "grad_norm": 0.45522740483283997, + "learning_rate": 0.0006, + "loss": 2.3597, + "step": 1040 + }, + { + "epoch": 0.0039166536111546295, + "grad_norm": 0.4478371739387512, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 1050 + }, + { + "epoch": 0.0039166536111546295, + "eval_valid_loss": 2.244525671005249, + "eval_valid_loss/all": 2.101825475692749, + "eval_valid_loss/end_span": 1.4508376121520996, + "eval_valid_perplexity/batch": 8.181090354919434, + "eval_valid_perplexity/end_span": 4.266686916351318, + "eval_valid_perplexity/fim": 2.7096734046936035, + "eval_valid_perplexity/first_seq": 15.000930786132812, + "eval_valid_perplexity/last_seq": 9.733146667480469, + "eval_valid_perplexity/second_seq": 13.445528984069824, + "eval_valid_perplexity/seq": 9.21006965637207, + "eval_valid_reconstruction/all": 0.28018850088119507, + "eval_valid_reconstruction/end_span": 
0.6650782823562622, + "eval_valid_reconstruction/fim": 0.1863560527563095, + "eval_valid_reconstruction/first_seq": 0.1630769968032837, + "eval_valid_reconstruction/last_seq": 0.30173182487487793, + "eval_valid_reconstruction/second_seq": 0.2052842378616333, + "eval_valid_runtime": 428.2009, + "eval_valid_samples_per_second": 0.448, + "eval_valid_steps_per_second": 0.448, + "step": 1050 + }, + { + "epoch": 0.0039166536111546295, + "eval_train_loss": 2.2434675693511963, + "eval_train_loss/all": 2.072892189025879, + "eval_train_loss/end_span": 1.417616844177246, + "eval_train_perplexity/batch": 7.9477763175964355, + "eval_train_perplexity/end_span": 4.127272605895996, + "eval_train_perplexity/fim": 2.031221866607666, + "eval_train_perplexity/first_seq": 15.434487342834473, + "eval_train_perplexity/last_seq": 9.608955383300781, + "eval_train_perplexity/second_seq": 14.344670295715332, + "eval_train_perplexity/seq": 9.146366119384766, + "eval_train_reconstruction/all": 0.26999548077583313, + "eval_train_reconstruction/end_span": 0.6771023869514465, + "eval_train_reconstruction/fim": 0.1327347308397293, + "eval_train_reconstruction/first_seq": 0.15337003767490387, + "eval_train_reconstruction/last_seq": 0.30308812856674194, + "eval_train_reconstruction/second_seq": 0.1809796541929245, + "eval_train_runtime": 428.739, + "eval_train_samples_per_second": 0.448, + "eval_train_steps_per_second": 0.448, + "step": 1050 + }, + { + "epoch": 0.003953955074118007, + "grad_norm": 0.5603331327438354, + "learning_rate": 0.0006, + "loss": 2.158, + "step": 1060 + }, + { + "epoch": 0.003991256537081384, + "grad_norm": 0.5724406838417053, + "learning_rate": 0.0006, + "loss": 2.2604, + "step": 1070 + }, + { + "epoch": 0.004028558000044762, + "grad_norm": 0.42018675804138184, + "learning_rate": 0.0006, + "loss": 2.2801, + "step": 1080 + }, + { + "epoch": 0.004065859463008139, + "grad_norm": 0.38499951362609863, + "learning_rate": 0.0006, + "loss": 2.4312, + "step": 1090 + }, + { + "epoch": 
0.004103160925971516, + "grad_norm": 0.5315465331077576, + "learning_rate": 0.0006, + "loss": 2.3525, + "step": 1100 + }, + { + "epoch": 0.004103160925971516, + "eval_valid_loss": 2.246853828430176, + "eval_valid_loss/all": 2.1042068004608154, + "eval_valid_loss/end_span": 1.4185065031051636, + "eval_valid_perplexity/batch": 8.20059585571289, + "eval_valid_perplexity/end_span": 4.130946159362793, + "eval_valid_perplexity/fim": 2.1849782466888428, + "eval_valid_perplexity/first_seq": 14.793771743774414, + "eval_valid_perplexity/last_seq": 9.271878242492676, + "eval_valid_perplexity/second_seq": 13.881256103515625, + "eval_valid_perplexity/seq": 9.239014625549316, + "eval_valid_reconstruction/all": 0.2801474630832672, + "eval_valid_reconstruction/end_span": 0.6761884093284607, + "eval_valid_reconstruction/fim": 0.1467161327600479, + "eval_valid_reconstruction/first_seq": 0.17093312740325928, + "eval_valid_reconstruction/last_seq": 0.31766247749328613, + "eval_valid_reconstruction/second_seq": 0.19591836631298065, + "eval_valid_runtime": 427.8964, + "eval_valid_samples_per_second": 0.449, + "eval_valid_steps_per_second": 0.449, + "step": 1100 + }, + { + "epoch": 0.004103160925971516, + "eval_train_loss": 2.2402536869049072, + "eval_train_loss/all": 2.069697856903076, + "eval_train_loss/end_span": 1.3789020776748657, + "eval_train_perplexity/batch": 7.922429084777832, + "eval_train_perplexity/end_span": 3.9705398082733154, + "eval_train_perplexity/fim": 2.374708652496338, + "eval_train_perplexity/first_seq": 15.297523498535156, + "eval_train_perplexity/last_seq": 9.154006958007812, + "eval_train_perplexity/second_seq": 13.919699668884277, + "eval_train_perplexity/seq": 9.113212585449219, + "eval_train_reconstruction/all": 0.2716251015663147, + "eval_train_reconstruction/end_span": 0.686896800994873, + "eval_train_reconstruction/fim": 0.1635919064283371, + "eval_train_reconstruction/first_seq": 0.15695813298225403, + "eval_train_reconstruction/last_seq": 
0.3220425248146057, + "eval_train_reconstruction/second_seq": 0.19098903238773346, + "eval_train_runtime": 427.3341, + "eval_train_samples_per_second": 0.449, + "eval_train_steps_per_second": 0.449, + "step": 1100 + }, + { + "epoch": 0.004140462388934894, + "grad_norm": 0.387356698513031, + "learning_rate": 0.0006, + "loss": 2.3707, + "step": 1110 + }, + { + "epoch": 0.004177763851898272, + "grad_norm": 0.3665528893470764, + "learning_rate": 0.0006, + "loss": 2.4098, + "step": 1120 + }, + { + "epoch": 0.0042150653148616485, + "grad_norm": 0.4210330545902252, + "learning_rate": 0.0006, + "loss": 2.2734, + "step": 1130 + }, + { + "epoch": 0.004252366777825026, + "grad_norm": 0.5299875736236572, + "learning_rate": 0.0006, + "loss": 2.3296, + "step": 1140 + }, + { + "epoch": 0.004289668240788404, + "grad_norm": 0.40548354387283325, + "learning_rate": 0.0006, + "loss": 2.2099, + "step": 1150 + }, + { + "epoch": 0.004289668240788404, + "eval_valid_loss": 2.2397286891937256, + "eval_valid_loss/all": 2.097534656524658, + "eval_valid_loss/end_span": 1.3184131383895874, + "eval_valid_perplexity/batch": 8.146061897277832, + "eval_valid_perplexity/end_span": 3.737485885620117, + "eval_valid_perplexity/fim": 2.279911994934082, + "eval_valid_perplexity/first_seq": 14.879349708557129, + "eval_valid_perplexity/last_seq": 9.469600677490234, + "eval_valid_perplexity/second_seq": 14.015905380249023, + "eval_valid_perplexity/seq": 9.17330265045166, + "eval_valid_reconstruction/all": 0.28196606040000916, + "eval_valid_reconstruction/end_span": 0.6903328895568848, + "eval_valid_reconstruction/fim": 0.1556854099035263, + "eval_valid_reconstruction/first_seq": 0.16806766390800476, + "eval_valid_reconstruction/last_seq": 0.31513512134552, + "eval_valid_reconstruction/second_seq": 0.18865345418453217, + "eval_valid_runtime": 427.1315, + "eval_valid_samples_per_second": 0.45, + "eval_valid_steps_per_second": 0.45, + "step": 1150 + }, + { + "epoch": 0.004289668240788404, + "eval_train_loss": 
2.2354462146759033, + "eval_train_loss/all": 2.065378427505493, + "eval_train_loss/end_span": 1.2735751867294312, + "eval_train_perplexity/batch": 7.888282299041748, + "eval_train_perplexity/end_span": 3.573606014251709, + "eval_train_perplexity/fim": 2.102180004119873, + "eval_train_perplexity/first_seq": 15.319217681884766, + "eval_train_perplexity/last_seq": 9.641938209533691, + "eval_train_perplexity/second_seq": 14.017719268798828, + "eval_train_perplexity/seq": 9.071979522705078, + "eval_train_reconstruction/all": 0.27240926027297974, + "eval_train_reconstruction/end_span": 0.7027363181114197, + "eval_train_reconstruction/fim": 0.1411815583705902, + "eval_train_reconstruction/first_seq": 0.15913507342338562, + "eval_train_reconstruction/last_seq": 0.30327996611595154, + "eval_train_reconstruction/second_seq": 0.19200041890144348, + "eval_train_runtime": 426.364, + "eval_train_samples_per_second": 0.45, + "eval_train_steps_per_second": 0.45, + "step": 1150 + }, + { + "epoch": 0.004326969703751781, + "grad_norm": 0.5264424681663513, + "learning_rate": 0.0006, + "loss": 2.2859, + "step": 1160 + }, + { + "epoch": 0.0043642711667151585, + "grad_norm": 0.660700798034668, + "learning_rate": 0.0006, + "loss": 2.0688, + "step": 1170 + }, + { + "epoch": 0.004401572629678536, + "grad_norm": 0.5559983849525452, + "learning_rate": 0.0006, + "loss": 2.3488, + "step": 1180 + }, + { + "epoch": 0.004438874092641913, + "grad_norm": 0.36125487089157104, + "learning_rate": 0.0006, + "loss": 2.08, + "step": 1190 + }, + { + "epoch": 0.004476175555605291, + "grad_norm": 0.4029681980609894, + "learning_rate": 0.0006, + "loss": 2.3539, + "step": 1200 + }, + { + "epoch": 0.004476175555605291, + "eval_valid_loss": 2.2425410747528076, + "eval_valid_loss/all": 2.1003148555755615, + "eval_valid_loss/end_span": 1.355057954788208, + "eval_valid_perplexity/batch": 8.168741226196289, + "eval_valid_perplexity/end_span": 3.876985549926758, + "eval_valid_perplexity/fim": 2.334646224975586, + 
"eval_valid_perplexity/first_seq": 14.783363342285156, + "eval_valid_perplexity/last_seq": 9.129022598266602, + "eval_valid_perplexity/second_seq": 13.935602188110352, + "eval_valid_perplexity/seq": 9.206737518310547, + "eval_valid_reconstruction/all": 0.28074273467063904, + "eval_valid_reconstruction/end_span": 0.6800510883331299, + "eval_valid_reconstruction/fim": 0.16040174663066864, + "eval_valid_reconstruction/first_seq": 0.16775235533714294, + "eval_valid_reconstruction/last_seq": 0.3224773705005646, + "eval_valid_reconstruction/second_seq": 0.19389955699443817, + "eval_valid_runtime": 428.2607, + "eval_valid_samples_per_second": 0.448, + "eval_valid_steps_per_second": 0.448, + "step": 1200 + }, + { + "epoch": 0.004476175555605291, + "eval_train_loss": 2.2400882244110107, + "eval_train_loss/all": 2.070129632949829, + "eval_train_loss/end_span": 1.3239089250564575, + "eval_train_perplexity/batch": 7.9258503913879395, + "eval_train_perplexity/end_span": 3.758082866668701, + "eval_train_perplexity/fim": 2.1517577171325684, + "eval_train_perplexity/first_seq": 15.541817665100098, + "eval_train_perplexity/last_seq": 9.483269691467285, + "eval_train_perplexity/second_seq": 14.265711784362793, + "eval_train_perplexity/seq": 9.120699882507324, + "eval_train_reconstruction/all": 0.27090123295783997, + "eval_train_reconstruction/end_span": 0.6901019811630249, + "eval_train_reconstruction/fim": 0.14374570548534393, + "eval_train_reconstruction/first_seq": 0.153324156999588, + "eval_train_reconstruction/last_seq": 0.3118365705013275, + "eval_train_reconstruction/second_seq": 0.17940190434455872, + "eval_train_runtime": 428.175, + "eval_train_samples_per_second": 0.448, + "eval_train_steps_per_second": 0.448, + "step": 1200 + }, + { + "epoch": 0.004513477018568668, + "grad_norm": 0.422221302986145, + "learning_rate": 0.0006, + "loss": 2.2539, + "step": 1210 + }, + { + "epoch": 0.004550778481532045, + "grad_norm": 0.5181116461753845, + "learning_rate": 0.0006, + "loss": 
2.2896, + "step": 1220 + }, + { + "epoch": 0.004588079944495423, + "grad_norm": 0.40529292821884155, + "learning_rate": 0.0006, + "loss": 2.1465, + "step": 1230 + }, + { + "epoch": 0.004625381407458801, + "grad_norm": 0.6013476252555847, + "learning_rate": 0.0006, + "loss": 2.3999, + "step": 1240 + }, + { + "epoch": 0.004662682870422178, + "grad_norm": 0.7363296747207642, + "learning_rate": 0.0006, + "loss": 2.3717, + "step": 1250 + }, + { + "epoch": 0.004662682870422178, + "eval_valid_loss": 2.2522671222686768, + "eval_valid_loss/all": 2.1091275215148926, + "eval_valid_loss/end_span": 1.2763117551803589, + "eval_valid_perplexity/batch": 8.241047859191895, + "eval_valid_perplexity/end_span": 3.5833988189697266, + "eval_valid_perplexity/fim": 2.222531795501709, + "eval_valid_perplexity/first_seq": 14.751067161560059, + "eval_valid_perplexity/last_seq": 9.1687593460083, + "eval_valid_perplexity/second_seq": 14.022235870361328, + "eval_valid_perplexity/seq": 9.276593208312988, + "eval_valid_reconstruction/all": 0.277487576007843, + "eval_valid_reconstruction/end_span": 0.7064905166625977, + "eval_valid_reconstruction/fim": 0.1476643681526184, + "eval_valid_reconstruction/first_seq": 0.17082582414150238, + "eval_valid_reconstruction/last_seq": 0.31884241104125977, + "eval_valid_reconstruction/second_seq": 0.1914825141429901, + "eval_valid_runtime": 427.9273, + "eval_valid_samples_per_second": 0.449, + "eval_valid_steps_per_second": 0.449, + "step": 1250 + }, + { + "epoch": 0.004662682870422178, + "eval_train_loss": 2.2472307682037354, + "eval_train_loss/all": 2.0758349895477295, + "eval_train_loss/end_span": 1.2453538179397583, + "eval_train_perplexity/batch": 7.9711995124816895, + "eval_train_perplexity/end_span": 3.474163770675659, + "eval_train_perplexity/fim": 2.235272169113159, + "eval_train_perplexity/first_seq": 15.448709487915039, + "eval_train_perplexity/last_seq": 9.53329849243164, + "eval_train_perplexity/second_seq": 14.118819236755371, + 
"eval_train_perplexity/seq": 9.162629127502441, + "eval_train_reconstruction/all": 0.2688543200492859, + "eval_train_reconstruction/end_span": 0.7168065309524536, + "eval_train_reconstruction/fim": 0.14990796148777008, + "eval_train_reconstruction/first_seq": 0.15358343720436096, + "eval_train_reconstruction/last_seq": 0.3078901469707489, + "eval_train_reconstruction/second_seq": 0.18455477058887482, + "eval_train_runtime": 428.044, + "eval_train_samples_per_second": 0.449, + "eval_train_steps_per_second": 0.449, + "step": 1250 + }, + { + "epoch": 0.004699984333385555, + "grad_norm": 0.6701756119728088, + "learning_rate": 0.0006, + "loss": 2.2579, + "step": 1260 + }, + { + "epoch": 0.004737285796348933, + "grad_norm": 0.5675057768821716, + "learning_rate": 0.0006, + "loss": 2.2748, + "step": 1270 + }, + { + "epoch": 0.004774587259312311, + "grad_norm": 0.3854740560054779, + "learning_rate": 0.0006, + "loss": 2.2074, + "step": 1280 + }, + { + "epoch": 0.0048118887222756875, + "grad_norm": 0.5680768489837646, + "learning_rate": 0.0006, + "loss": 2.287, + "step": 1290 + }, + { + "epoch": 0.004849190185239065, + "grad_norm": 1.1996582746505737, + "learning_rate": 0.0006, + "loss": 2.2882, + "step": 1300 + }, + { + "epoch": 0.004849190185239065, + "eval_valid_loss": 2.25736927986145, + "eval_valid_loss/all": 2.114164352416992, + "eval_valid_loss/end_span": 1.380411982536316, + "eval_valid_perplexity/batch": 8.282661437988281, + "eval_valid_perplexity/end_span": 3.9765396118164062, + "eval_valid_perplexity/fim": 2.4821078777313232, + "eval_valid_perplexity/first_seq": 14.879983901977539, + "eval_valid_perplexity/last_seq": 9.61940860748291, + "eval_valid_perplexity/second_seq": 14.199934005737305, + "eval_valid_perplexity/seq": 9.321694374084473, + "eval_valid_reconstruction/all": 0.27608224749565125, + "eval_valid_reconstruction/end_span": 0.6714438796043396, + "eval_valid_reconstruction/fim": 0.16746948659420013, + "eval_valid_reconstruction/first_seq": 
0.16703921556472778, + "eval_valid_reconstruction/last_seq": 0.3072807788848877, + "eval_valid_reconstruction/second_seq": 0.18534398078918457, + "eval_valid_runtime": 427.5448, + "eval_valid_samples_per_second": 0.449, + "eval_valid_steps_per_second": 0.449, + "step": 1300 + }, + { + "epoch": 0.004849190185239065, + "eval_train_loss": 2.252511501312256, + "eval_train_loss/all": 2.0804972648620605, + "eval_train_loss/end_span": 1.3537791967391968, + "eval_train_perplexity/batch": 8.008450508117676, + "eval_train_perplexity/end_span": 3.8720309734344482, + "eval_train_perplexity/fim": 2.020843982696533, + "eval_train_perplexity/first_seq": 15.23729133605957, + "eval_train_perplexity/last_seq": 9.621651649475098, + "eval_train_perplexity/second_seq": 14.539057731628418, + "eval_train_perplexity/seq": 9.207404136657715, + "eval_train_reconstruction/all": 0.2676314413547516, + "eval_train_reconstruction/end_span": 0.6821051239967346, + "eval_train_reconstruction/fim": 0.13043265044689178, + "eval_train_reconstruction/first_seq": 0.15669766068458557, + "eval_train_reconstruction/last_seq": 0.3052464723587036, + "eval_train_reconstruction/second_seq": 0.17579303681850433, + "eval_train_runtime": 428.3795, + "eval_train_samples_per_second": 0.448, + "eval_train_steps_per_second": 0.448, + "step": 1300 + }, + { + "epoch": 0.004886491648202443, + "grad_norm": 7.9912638664245605, + "learning_rate": 0.0006, + "loss": 2.1688, + "step": 1310 + }, + { + "epoch": 0.00492379311116582, + "grad_norm": 0.604499101638794, + "learning_rate": 0.0006, + "loss": 2.259, + "step": 1320 + }, + { + "epoch": 0.004961094574129197, + "grad_norm": 0.5359344482421875, + "learning_rate": 0.0006, + "loss": 2.3315, + "step": 1330 + }, + { + "epoch": 0.004998396037092575, + "grad_norm": 0.3678419291973114, + "learning_rate": 0.0006, + "loss": 2.1778, + "step": 1340 + }, + { + "epoch": 0.005035697500055952, + "grad_norm": 0.46724584698677063, + "learning_rate": 0.0006, + "loss": 2.18, + "step": 1350 + 
}, + { + "epoch": 0.005035697500055952, + "eval_valid_loss": 2.249326467514038, + "eval_valid_loss/all": 2.1063520908355713, + "eval_valid_loss/end_span": 1.3081533908843994, + "eval_valid_perplexity/batch": 8.218207359313965, + "eval_valid_perplexity/end_span": 3.699336290359497, + "eval_valid_perplexity/fim": 2.533858060836792, + "eval_valid_perplexity/first_seq": 15.161561012268066, + "eval_valid_perplexity/last_seq": 9.38131046295166, + "eval_valid_perplexity/second_seq": 13.735158920288086, + "eval_valid_perplexity/seq": 9.260449409484863, + "eval_valid_reconstruction/all": 0.279072105884552, + "eval_valid_reconstruction/end_span": 0.6962647438049316, + "eval_valid_reconstruction/fim": 0.1734921932220459, + "eval_valid_reconstruction/first_seq": 0.15919487178325653, + "eval_valid_reconstruction/last_seq": 0.31700873374938965, + "eval_valid_reconstruction/second_seq": 0.1970754861831665, + "eval_valid_runtime": 426.0854, + "eval_valid_samples_per_second": 0.451, + "eval_valid_steps_per_second": 0.451, + "step": 1350 + }, + { + "epoch": 0.005035697500055952, + "eval_train_loss": 2.245746612548828, + "eval_train_loss/all": 2.0745909214019775, + "eval_train_loss/end_span": 1.2874573469161987, + "eval_train_perplexity/batch": 7.961288928985596, + "eval_train_perplexity/end_span": 3.623561382293701, + "eval_train_perplexity/fim": 2.097330093383789, + "eval_train_perplexity/first_seq": 15.45327377319336, + "eval_train_perplexity/last_seq": 9.44531536102295, + "eval_train_perplexity/second_seq": 14.393306732177734, + "eval_train_perplexity/seq": 9.163071632385254, + "eval_train_reconstruction/all": 0.26992231607437134, + "eval_train_reconstruction/end_span": 0.7018282413482666, + "eval_train_reconstruction/fim": 0.13929210603237152, + "eval_train_reconstruction/first_seq": 0.1546434760093689, + "eval_train_reconstruction/last_seq": 0.311982125043869, + "eval_train_reconstruction/second_seq": 0.18295583128929138, + "eval_train_runtime": 427.4901, + 
"eval_train_samples_per_second": 0.449, + "eval_train_steps_per_second": 0.449, + "step": 1350 + }, + { + "epoch": 0.00507299896301933, + "grad_norm": 0.34481343626976013, + "learning_rate": 0.0006, + "loss": 2.3682, + "step": 1360 + }, + { + "epoch": 0.005110300425982707, + "grad_norm": 0.47248056530952454, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 1370 + }, + { + "epoch": 0.005147601888946084, + "grad_norm": 0.35224831104278564, + "learning_rate": 0.0006, + "loss": 2.25, + "step": 1380 + }, + { + "epoch": 0.005184903351909462, + "grad_norm": 0.5016693472862244, + "learning_rate": 0.0006, + "loss": 2.3128, + "step": 1390 + }, + { + "epoch": 0.00522220481487284, + "grad_norm": 0.8465663194656372, + "learning_rate": 0.0006, + "loss": 2.2985, + "step": 1400 + }, + { + "epoch": 0.00522220481487284, + "eval_valid_loss": 2.2510740756988525, + "eval_valid_loss/all": 2.107717990875244, + "eval_valid_loss/end_span": 1.3131840229034424, + "eval_valid_perplexity/batch": 8.229439735412598, + "eval_valid_perplexity/end_span": 3.7179930210113525, + "eval_valid_perplexity/fim": 2.6065099239349365, + "eval_valid_perplexity/first_seq": 14.773634910583496, + "eval_valid_perplexity/last_seq": 9.73746395111084, + "eval_valid_perplexity/second_seq": 14.190474510192871, + "eval_valid_perplexity/seq": 9.271027565002441, + "eval_valid_reconstruction/all": 0.27864405512809753, + "eval_valid_reconstruction/end_span": 0.6900674700737, + "eval_valid_reconstruction/fim": 0.1771087497472763, + "eval_valid_reconstruction/first_seq": 0.16744092106819153, + "eval_valid_reconstruction/last_seq": 0.3022063076496124, + "eval_valid_reconstruction/second_seq": 0.18859981000423431, + "eval_valid_runtime": 427.8785, + "eval_valid_samples_per_second": 0.449, + "eval_valid_steps_per_second": 0.449, + "step": 1400 + }, + { + "epoch": 0.00522220481487284, + "eval_train_loss": 2.2470695972442627, + "eval_train_loss/all": 2.075305938720703, + "eval_train_loss/end_span": 1.2949750423431396, + 
"eval_train_perplexity/batch": 7.966983318328857, + "eval_train_perplexity/end_span": 3.650904893875122, + "eval_train_perplexity/fim": 2.125025987625122, + "eval_train_perplexity/first_seq": 15.531187057495117, + "eval_train_perplexity/last_seq": 9.55296802520752, + "eval_train_perplexity/second_seq": 13.964810371398926, + "eval_train_perplexity/seq": 9.159149169921875, + "eval_train_reconstruction/all": 0.2695596516132355, + "eval_train_reconstruction/end_span": 0.6974720358848572, + "eval_train_reconstruction/fim": 0.14067675173282623, + "eval_train_reconstruction/first_seq": 0.15116524696350098, + "eval_train_reconstruction/last_seq": 0.3034098446369171, + "eval_train_reconstruction/second_seq": 0.18959078192710876, + "eval_train_runtime": 427.0809, + "eval_train_samples_per_second": 0.45, + "eval_train_steps_per_second": 0.45, + "step": 1400 + }, + { + "epoch": 0.0052595062778362164, + "grad_norm": 0.4825917184352875, + "learning_rate": 0.0006, + "loss": 2.2985, + "step": 1410 + }, + { + "epoch": 0.005296807740799594, + "grad_norm": 0.7532668709754944, + "learning_rate": 0.0006, + "loss": 2.2569, + "step": 1420 + }, + { + "epoch": 0.005334109203762972, + "grad_norm": 0.45389536023139954, + "learning_rate": 0.0006, + "loss": 2.2531, + "step": 1430 + }, + { + "epoch": 0.005371410666726349, + "grad_norm": 0.4354783296585083, + "learning_rate": 0.0006, + "loss": 2.3607, + "step": 1440 + }, + { + "epoch": 0.005408712129689726, + "grad_norm": 0.5845543742179871, + "learning_rate": 0.0006, + "loss": 2.3272, + "step": 1450 + }, + { + "epoch": 0.005408712129689726, + "eval_valid_loss": 2.2522218227386475, + "eval_valid_loss/all": 2.108693838119507, + "eval_valid_loss/end_span": 1.344927191734314, + "eval_valid_perplexity/batch": 8.23747444152832, + "eval_valid_perplexity/end_span": 3.837907075881958, + "eval_valid_perplexity/fim": 2.4781458377838135, + "eval_valid_perplexity/first_seq": 14.965204238891602, + "eval_valid_perplexity/last_seq": 9.800877571105957, + 
"eval_valid_perplexity/second_seq": 13.771546363830566, + "eval_valid_perplexity/seq": 9.272320747375488, + "eval_valid_reconstruction/all": 0.2779235541820526, + "eval_valid_reconstruction/end_span": 0.6958125233650208, + "eval_valid_reconstruction/fim": 0.16885563731193542, + "eval_valid_reconstruction/first_seq": 0.1636795997619629, + "eval_valid_reconstruction/last_seq": 0.30098679661750793, + "eval_valid_reconstruction/second_seq": 0.19633926451206207, + "eval_valid_runtime": 423.7406, + "eval_valid_samples_per_second": 0.453, + "eval_valid_steps_per_second": 0.453, + "step": 1450 + }, + { + "epoch": 0.005408712129689726, + "eval_train_loss": 2.2488768100738525, + "eval_train_loss/all": 2.076909303665161, + "eval_train_loss/end_span": 1.3173754215240479, + "eval_train_perplexity/batch": 7.979767799377441, + "eval_train_perplexity/end_span": 3.733609437942505, + "eval_train_perplexity/fim": 2.183814764022827, + "eval_train_perplexity/first_seq": 15.164304733276367, + "eval_train_perplexity/last_seq": 9.73963737487793, + "eval_train_perplexity/second_seq": 13.849471092224121, + "eval_train_perplexity/seq": 9.174666404724121, + "eval_train_reconstruction/all": 0.26878079771995544, + "eval_train_reconstruction/end_span": 0.7039033770561218, + "eval_train_reconstruction/fim": 0.14584006369113922, + "eval_train_reconstruction/first_seq": 0.160858616232872, + "eval_train_reconstruction/last_seq": 0.29802364110946655, + "eval_train_reconstruction/second_seq": 0.19455444812774658, + "eval_train_runtime": 422.8449, + "eval_train_samples_per_second": 0.454, + "eval_train_steps_per_second": 0.454, + "step": 1450 + }, + { + "epoch": 0.005446013592653104, + "grad_norm": 0.4925304651260376, + "learning_rate": 0.0006, + "loss": 2.3412, + "step": 1460 + }, + { + "epoch": 0.005483315055616481, + "grad_norm": 0.36810386180877686, + "learning_rate": 0.0006, + "loss": 2.1549, + "step": 1470 + }, + { + "epoch": 0.005520616518579859, + "grad_norm": 0.5069787502288818, + 
"learning_rate": 0.0006, + "loss": 2.2448, + "step": 1480 + }, + { + "epoch": 0.005557917981543236, + "grad_norm": 0.46132609248161316, + "learning_rate": 0.0006, + "loss": 2.2792, + "step": 1490 + }, + { + "epoch": 0.005595219444506613, + "grad_norm": 0.687202513217926, + "learning_rate": 0.0006, + "loss": 2.187, + "step": 1500 + }, + { + "epoch": 0.005595219444506613, + "eval_valid_loss": 2.2407093048095703, + "eval_valid_loss/all": 2.0982167720794678, + "eval_valid_loss/end_span": 1.3512235879898071, + "eval_valid_perplexity/batch": 8.151620864868164, + "eval_valid_perplexity/end_span": 3.8621482849121094, + "eval_valid_perplexity/fim": 2.321497917175293, + "eval_valid_perplexity/first_seq": 14.895954132080078, + "eval_valid_perplexity/last_seq": 9.392706871032715, + "eval_valid_perplexity/second_seq": 13.893900871276855, + "eval_valid_perplexity/seq": 9.176228523254395, + "eval_valid_reconstruction/all": 0.2814292907714844, + "eval_valid_reconstruction/end_span": 0.6849014759063721, + "eval_valid_reconstruction/fim": 0.15909643471240997, + "eval_valid_reconstruction/first_seq": 0.172890305519104, + "eval_valid_reconstruction/last_seq": 0.3122873306274414, + "eval_valid_reconstruction/second_seq": 0.19346646964550018, + "eval_valid_runtime": 425.7103, + "eval_valid_samples_per_second": 0.451, + "eval_valid_steps_per_second": 0.451, + "step": 1500 + }, + { + "epoch": 0.005595219444506613, + "eval_train_loss": 2.2386314868927, + "eval_train_loss/all": 2.068192958831787, + "eval_train_loss/end_span": 1.3195552825927734, + "eval_train_perplexity/batch": 7.910515785217285, + "eval_train_perplexity/end_span": 3.7417569160461426, + "eval_train_perplexity/fim": 2.077040910720825, + "eval_train_perplexity/first_seq": 15.34037971496582, + "eval_train_perplexity/last_seq": 9.529004096984863, + "eval_train_perplexity/second_seq": 14.365530967712402, + "eval_train_perplexity/seq": 9.100088119506836, + "eval_train_reconstruction/all": 0.27133679389953613, + 
"eval_train_reconstruction/end_span": 0.694854736328125, + "eval_train_reconstruction/fim": 0.13817083835601807, + "eval_train_reconstruction/first_seq": 0.15606053173542023, + "eval_train_reconstruction/last_seq": 0.30496957898139954, + "eval_train_reconstruction/second_seq": 0.18000876903533936, + "eval_train_runtime": 426.1729, + "eval_train_samples_per_second": 0.451, + "eval_train_steps_per_second": 0.451, + "step": 1500 + }, + { + "epoch": 0.005632520907469991, + "grad_norm": 0.5072914361953735, + "learning_rate": 0.0006, + "loss": 2.1803, + "step": 1510 + }, + { + "epoch": 0.005669822370433369, + "grad_norm": 0.44118696451187134, + "learning_rate": 0.0006, + "loss": 2.18, + "step": 1520 + }, + { + "epoch": 0.005707123833396745, + "grad_norm": 0.4138711094856262, + "learning_rate": 0.0006, + "loss": 2.2747, + "step": 1530 + }, + { + "epoch": 0.005744425296360123, + "grad_norm": 0.5873846411705017, + "learning_rate": 0.0006, + "loss": 2.1964, + "step": 1540 + }, + { + "epoch": 0.005781726759323501, + "grad_norm": 0.4115417003631592, + "learning_rate": 0.0006, + "loss": 2.3463, + "step": 1550 + }, + { + "epoch": 0.005781726759323501, + "eval_valid_loss": 2.2445859909057617, + "eval_valid_loss/all": 2.1017966270446777, + "eval_valid_loss/end_span": 1.365036129951477, + "eval_valid_perplexity/batch": 8.180854797363281, + "eval_valid_perplexity/end_span": 3.9158644676208496, + "eval_valid_perplexity/fim": 2.4075698852539062, + "eval_valid_perplexity/first_seq": 14.933049201965332, + "eval_valid_perplexity/last_seq": 9.58634090423584, + "eval_valid_perplexity/second_seq": 14.032699584960938, + "eval_valid_perplexity/seq": 9.209527015686035, + "eval_valid_reconstruction/all": 0.2798616290092468, + "eval_valid_reconstruction/end_span": 0.6757923364639282, + "eval_valid_reconstruction/fim": 0.1631661206483841, + "eval_valid_reconstruction/first_seq": 0.16687236726284027, + "eval_valid_reconstruction/last_seq": 0.3041427433490753, + 
"eval_valid_reconstruction/second_seq": 0.1879492998123169, + "eval_valid_runtime": 422.8199, + "eval_valid_samples_per_second": 0.454, + "eval_valid_steps_per_second": 0.454, + "step": 1550 + }, + { + "epoch": 0.005781726759323501, + "eval_train_loss": 2.2438018321990967, + "eval_train_loss/all": 2.072707414627075, + "eval_train_loss/end_span": 1.3346112966537476, + "eval_train_perplexity/batch": 7.946308135986328, + "eval_train_perplexity/end_span": 3.7985191345214844, + "eval_train_perplexity/fim": 2.073694944381714, + "eval_train_perplexity/first_seq": 15.529191017150879, + "eval_train_perplexity/last_seq": 9.739995002746582, + "eval_train_perplexity/second_seq": 14.258859634399414, + "eval_train_perplexity/seq": 9.13597297668457, + "eval_train_reconstruction/all": 0.26986318826675415, + "eval_train_reconstruction/end_span": 0.6901471614837646, + "eval_train_reconstruction/fim": 0.1359255462884903, + "eval_train_reconstruction/first_seq": 0.15355746448040009, + "eval_train_reconstruction/last_seq": 0.30007103085517883, + "eval_train_reconstruction/second_seq": 0.18271999061107635, + "eval_train_runtime": 423.6315, + "eval_train_samples_per_second": 0.453, + "eval_train_steps_per_second": 0.453, + "step": 1550 + }, + { + "epoch": 0.005819028222286878, + "grad_norm": 0.5179300904273987, + "learning_rate": 0.0006, + "loss": 2.3237, + "step": 1560 + }, + { + "epoch": 0.005856329685250255, + "grad_norm": 0.33153852820396423, + "learning_rate": 0.0006, + "loss": 2.1575, + "step": 1570 + }, + { + "epoch": 0.005893631148213633, + "grad_norm": 0.6335381269454956, + "learning_rate": 0.0006, + "loss": 2.2301, + "step": 1580 + }, + { + "epoch": 0.005930932611177011, + "grad_norm": 0.9745264053344727, + "learning_rate": 0.0006, + "loss": 2.2808, + "step": 1590 + }, + { + "epoch": 0.005968234074140388, + "grad_norm": 0.5520747900009155, + "learning_rate": 0.0006, + "loss": 2.3487, + "step": 1600 + }, + { + "epoch": 0.005968234074140388, + "eval_valid_loss": 
2.245920419692993, + "eval_valid_loss/all": 2.103649377822876, + "eval_valid_loss/end_span": 1.3559367656707764, + "eval_valid_perplexity/batch": 8.196025848388672, + "eval_valid_perplexity/end_span": 3.880394220352173, + "eval_valid_perplexity/fim": 2.209564208984375, + "eval_valid_perplexity/first_seq": 14.925773620605469, + "eval_valid_perplexity/last_seq": 9.881972312927246, + "eval_valid_perplexity/second_seq": 13.840446472167969, + "eval_valid_perplexity/seq": 9.229300498962402, + "eval_valid_reconstruction/all": 0.28033533692359924, + "eval_valid_reconstruction/end_span": 0.6793596744537354, + "eval_valid_reconstruction/fim": 0.14849084615707397, + "eval_valid_reconstruction/first_seq": 0.17047317326068878, + "eval_valid_reconstruction/last_seq": 0.3008950650691986, + "eval_valid_reconstruction/second_seq": 0.19336320459842682, + "eval_valid_runtime": 774.0416, + "eval_valid_samples_per_second": 0.248, + "eval_valid_steps_per_second": 0.248, + "step": 1600 + }, + { + "epoch": 0.005968234074140388, + "eval_train_loss": 2.242734670639038, + "eval_train_loss/all": 2.0721378326416016, + "eval_train_loss/end_span": 1.3125208616256714, + "eval_train_perplexity/batch": 7.941783428192139, + "eval_train_perplexity/end_span": 3.7155282497406006, + "eval_train_perplexity/fim": 2.246643304824829, + "eval_train_perplexity/first_seq": 15.426717758178711, + "eval_train_perplexity/last_seq": 9.615175247192383, + "eval_train_perplexity/second_seq": 14.028227806091309, + "eval_train_perplexity/seq": 9.136488914489746, + "eval_train_reconstruction/all": 0.270599365234375, + "eval_train_reconstruction/end_span": 0.6934203505516052, + "eval_train_reconstruction/fim": 0.15281403064727783, + "eval_train_reconstruction/first_seq": 0.15584971010684967, + "eval_train_reconstruction/last_seq": 0.3039902150630951, + "eval_train_reconstruction/second_seq": 0.1892123520374298, + "eval_train_runtime": 645.4861, + "eval_train_samples_per_second": 0.297, + "eval_train_steps_per_second": 
0.297, + "step": 1600 + }, + { + "epoch": 0.006005535537103765, + "grad_norm": 0.4024076759815216, + "learning_rate": 0.0006, + "loss": 2.0817, + "step": 1610 + }, + { + "epoch": 0.006042837000067143, + "grad_norm": 0.44341957569122314, + "learning_rate": 0.0006, + "loss": 2.2461, + "step": 1620 + }, + { + "epoch": 0.00608013846303052, + "grad_norm": 0.47213342785835266, + "learning_rate": 0.0006, + "loss": 2.1994, + "step": 1630 + }, + { + "epoch": 0.006117439925993898, + "grad_norm": 0.40573737025260925, + "learning_rate": 0.0006, + "loss": 2.1351, + "step": 1640 + }, + { + "epoch": 0.006154741388957275, + "grad_norm": 0.42623022198677063, + "learning_rate": 0.0006, + "loss": 2.4295, + "step": 1650 + }, + { + "epoch": 0.006154741388957275, + "eval_valid_loss": 2.2460267543792725, + "eval_valid_loss/all": 2.103203773498535, + "eval_valid_loss/end_span": 1.3801193237304688, + "eval_valid_perplexity/batch": 8.192374229431152, + "eval_valid_perplexity/end_span": 3.9753758907318115, + "eval_valid_perplexity/fim": 2.1222944259643555, + "eval_valid_perplexity/first_seq": 14.5342435836792, + "eval_valid_perplexity/last_seq": 9.614267349243164, + "eval_valid_perplexity/second_seq": 13.527344703674316, + "eval_valid_perplexity/seq": 9.22684383392334, + "eval_valid_reconstruction/all": 0.2799656391143799, + "eval_valid_reconstruction/end_span": 0.6786834597587585, + "eval_valid_reconstruction/fim": 0.1410410851240158, + "eval_valid_reconstruction/first_seq": 0.17314158380031586, + "eval_valid_reconstruction/last_seq": 0.30284154415130615, + "eval_valid_reconstruction/second_seq": 0.19561368227005005, + "eval_valid_runtime": 655.563, + "eval_valid_samples_per_second": 0.293, + "eval_valid_steps_per_second": 0.293, + "step": 1650 + }, + { + "epoch": 0.006154741388957275, + "eval_train_loss": 2.242732286453247, + "eval_train_loss/all": 2.072218418121338, + "eval_train_loss/end_span": 1.3270665407180786, + "eval_train_perplexity/batch": 7.942423343658447, + 
"eval_train_perplexity/end_span": 3.769968032836914, + "eval_train_perplexity/fim": 2.1275041103363037, + "eval_train_perplexity/first_seq": 15.212873458862305, + "eval_train_perplexity/last_seq": 9.622066497802734, + "eval_train_perplexity/second_seq": 14.322379112243652, + "eval_train_perplexity/seq": 9.136427879333496, + "eval_train_reconstruction/all": 0.27028873562812805, + "eval_train_reconstruction/end_span": 0.6940090656280518, + "eval_train_reconstruction/fim": 0.14209643006324768, + "eval_train_reconstruction/first_seq": 0.156190425157547, + "eval_train_reconstruction/last_seq": 0.30103713274002075, + "eval_train_reconstruction/second_seq": 0.1819799244403839, + "eval_train_runtime": 644.0692, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 1650 + }, + { + "epoch": 0.006192042851920652, + "grad_norm": 0.6498154997825623, + "learning_rate": 0.0006, + "loss": 2.1971, + "step": 1660 + }, + { + "epoch": 0.00622934431488403, + "grad_norm": 0.4013831615447998, + "learning_rate": 0.0006, + "loss": 2.2984, + "step": 1670 + }, + { + "epoch": 0.0062666457778474075, + "grad_norm": 0.37511542439460754, + "learning_rate": 0.0006, + "loss": 2.4193, + "step": 1680 + }, + { + "epoch": 0.006303947240810784, + "grad_norm": 0.38086622953414917, + "learning_rate": 0.0006, + "loss": 2.361, + "step": 1690 + }, + { + "epoch": 0.006341248703774162, + "grad_norm": 0.6298802495002747, + "learning_rate": 0.0006, + "loss": 2.2136, + "step": 1700 + }, + { + "epoch": 0.006341248703774162, + "eval_valid_loss": 2.2519187927246094, + "eval_valid_loss/all": 2.1085784435272217, + "eval_valid_loss/end_span": 1.4499006271362305, + "eval_valid_perplexity/batch": 8.23652458190918, + "eval_valid_perplexity/end_span": 4.262691020965576, + "eval_valid_perplexity/fim": 2.429232597351074, + "eval_valid_perplexity/first_seq": 14.701018333435059, + "eval_valid_perplexity/last_seq": 9.148624420166016, + "eval_valid_perplexity/second_seq": 14.048918724060059, 
+ "eval_valid_perplexity/seq": 9.273892402648926, + "eval_valid_reconstruction/all": 0.27878516912460327, + "eval_valid_reconstruction/end_span": 0.6613112092018127, + "eval_valid_reconstruction/fim": 0.16514593362808228, + "eval_valid_reconstruction/first_seq": 0.17077438533306122, + "eval_valid_reconstruction/last_seq": 0.32254558801651, + "eval_valid_reconstruction/second_seq": 0.18779674172401428, + "eval_valid_runtime": 629.7584, + "eval_valid_samples_per_second": 0.305, + "eval_valid_steps_per_second": 0.305, + "step": 1700 + }, + { + "epoch": 0.006341248703774162, + "eval_train_loss": 2.246004343032837, + "eval_train_loss/all": 2.0747058391571045, + "eval_train_loss/end_span": 1.4123071432113647, + "eval_train_perplexity/batch": 7.9622039794921875, + "eval_train_perplexity/end_span": 4.105416297912598, + "eval_train_perplexity/fim": 2.2790610790252686, + "eval_train_perplexity/first_seq": 15.31831169128418, + "eval_train_perplexity/last_seq": 9.665667533874512, + "eval_train_perplexity/second_seq": 13.944708824157715, + "eval_train_perplexity/seq": 9.154427528381348, + "eval_train_reconstruction/all": 0.2697245180606842, + "eval_train_reconstruction/end_span": 0.6739349365234375, + "eval_train_reconstruction/fim": 0.15466901659965515, + "eval_train_reconstruction/first_seq": 0.15588602423667908, + "eval_train_reconstruction/last_seq": 0.30310529470443726, + "eval_train_reconstruction/second_seq": 0.19257180392742157, + "eval_train_runtime": 641.6392, + "eval_train_samples_per_second": 0.299, + "eval_train_steps_per_second": 0.299, + "step": 1700 + }, + { + "epoch": 0.00637855016673754, + "grad_norm": 0.3127906322479248, + "learning_rate": 0.0006, + "loss": 2.1979, + "step": 1710 + }, + { + "epoch": 0.006415851629700917, + "grad_norm": 0.3999284505844116, + "learning_rate": 0.0006, + "loss": 2.2116, + "step": 1720 + }, + { + "epoch": 0.006453153092664294, + "grad_norm": 0.5503367185592651, + "learning_rate": 0.0006, + "loss": 2.4374, + "step": 1730 + }, + { + 
"epoch": 0.006490454555627672, + "grad_norm": 0.35121697187423706, + "learning_rate": 0.0006, + "loss": 2.2487, + "step": 1740 + }, + { + "epoch": 0.006527756018591049, + "grad_norm": 0.44704508781433105, + "learning_rate": 0.0006, + "loss": 2.2565, + "step": 1750 + }, + { + "epoch": 0.006527756018591049, + "eval_valid_loss": 2.247415542602539, + "eval_valid_loss/all": 2.104451894760132, + "eval_valid_loss/end_span": 1.359117031097412, + "eval_valid_perplexity/batch": 8.202606201171875, + "eval_valid_perplexity/end_span": 3.892754554748535, + "eval_valid_perplexity/fim": 2.5430355072021484, + "eval_valid_perplexity/first_seq": 14.86651611328125, + "eval_valid_perplexity/last_seq": 9.369755744934082, + "eval_valid_perplexity/second_seq": 14.33614730834961, + "eval_valid_perplexity/seq": 9.240581512451172, + "eval_valid_reconstruction/all": 0.280115008354187, + "eval_valid_reconstruction/end_span": 0.6839351654052734, + "eval_valid_reconstruction/fim": 0.1749510020017624, + "eval_valid_reconstruction/first_seq": 0.16955673694610596, + "eval_valid_reconstruction/last_seq": 0.3157842755317688, + "eval_valid_reconstruction/second_seq": 0.17962650954723358, + "eval_valid_runtime": 619.017, + "eval_valid_samples_per_second": 0.31, + "eval_valid_steps_per_second": 0.31, + "step": 1750 + }, + { + "epoch": 0.006527756018591049, + "eval_train_loss": 2.240398406982422, + "eval_train_loss/all": 2.069863796234131, + "eval_train_loss/end_span": 1.3209656476974487, + "eval_train_perplexity/batch": 7.923743724822998, + "eval_train_perplexity/end_span": 3.747037887573242, + "eval_train_perplexity/fim": 2.188920259475708, + "eval_train_perplexity/first_seq": 15.588814735412598, + "eval_train_perplexity/last_seq": 9.64716625213623, + "eval_train_perplexity/second_seq": 14.483451843261719, + "eval_train_perplexity/seq": 9.111109733581543, + "eval_train_reconstruction/all": 0.2713903486728668, + "eval_train_reconstruction/end_span": 0.699569821357727, + "eval_train_reconstruction/fim": 
0.14826807379722595, + "eval_train_reconstruction/first_seq": 0.15207692980766296, + "eval_train_reconstruction/last_seq": 0.30586203932762146, + "eval_train_reconstruction/second_seq": 0.180084690451622, + "eval_train_runtime": 639.1242, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 1750 + }, + { + "epoch": 0.0065650574815544266, + "grad_norm": 0.4837321639060974, + "learning_rate": 0.0006, + "loss": 2.3402, + "step": 1760 + }, + { + "epoch": 0.006602358944517804, + "grad_norm": 0.46977171301841736, + "learning_rate": 0.0006, + "loss": 2.3552, + "step": 1770 + }, + { + "epoch": 0.006639660407481181, + "grad_norm": 0.3700715899467468, + "learning_rate": 0.0006, + "loss": 2.2705, + "step": 1780 + }, + { + "epoch": 0.006676961870444559, + "grad_norm": 0.23813456296920776, + "learning_rate": 0.0006, + "loss": 2.3701, + "step": 1790 + }, + { + "epoch": 0.0067142633334079365, + "grad_norm": 0.43839913606643677, + "learning_rate": 0.0006, + "loss": 2.2305, + "step": 1800 + }, + { + "epoch": 0.0067142633334079365, + "eval_valid_loss": 2.2463271617889404, + "eval_valid_loss/all": 2.103729486465454, + "eval_valid_loss/end_span": 1.224543571472168, + "eval_valid_perplexity/batch": 8.19668197631836, + "eval_valid_perplexity/end_span": 3.4026126861572266, + "eval_valid_perplexity/fim": 2.4097704887390137, + "eval_valid_perplexity/first_seq": 15.483729362487793, + "eval_valid_perplexity/last_seq": 9.667292594909668, + "eval_valid_perplexity/second_seq": 14.08749771118164, + "eval_valid_perplexity/seq": 9.232823371887207, + "eval_valid_reconstruction/all": 0.27974772453308105, + "eval_valid_reconstruction/end_span": 0.7148684859275818, + "eval_valid_reconstruction/fim": 0.16418352723121643, + "eval_valid_reconstruction/first_seq": 0.15567205846309662, + "eval_valid_reconstruction/last_seq": 0.306972861289978, + "eval_valid_reconstruction/second_seq": 0.19032643735408783, + "eval_valid_runtime": 631.4591, + 
"eval_valid_samples_per_second": 0.304, + "eval_valid_steps_per_second": 0.304, + "step": 1800 + }, + { + "epoch": 0.0067142633334079365, + "eval_train_loss": 2.2409555912017822, + "eval_train_loss/all": 2.070718288421631, + "eval_train_loss/end_span": 1.1773655414581299, + "eval_train_perplexity/batch": 7.930517673492432, + "eval_train_perplexity/end_span": 3.245811939239502, + "eval_train_perplexity/fim": 2.2915046215057373, + "eval_train_perplexity/first_seq": 15.384885787963867, + "eval_train_perplexity/last_seq": 9.27503490447998, + "eval_train_perplexity/second_seq": 14.317779541015625, + "eval_train_perplexity/seq": 9.12890911102295, + "eval_train_reconstruction/all": 0.27070340514183044, + "eval_train_reconstruction/end_span": 0.7306892275810242, + "eval_train_reconstruction/fim": 0.156253844499588, + "eval_train_reconstruction/first_seq": 0.15489491820335388, + "eval_train_reconstruction/last_seq": 0.3159467577934265, + "eval_train_reconstruction/second_seq": 0.18164759874343872, + "eval_train_runtime": 635.4954, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 1800 + }, + { + "epoch": 0.006751564796371313, + "grad_norm": 0.298348069190979, + "learning_rate": 0.0006, + "loss": 2.1259, + "step": 1810 + }, + { + "epoch": 0.006788866259334691, + "grad_norm": 0.3750624656677246, + "learning_rate": 0.0006, + "loss": 2.322, + "step": 1820 + }, + { + "epoch": 0.006826167722298069, + "grad_norm": 0.4514702260494232, + "learning_rate": 0.0006, + "loss": 2.3591, + "step": 1830 + }, + { + "epoch": 0.006863469185261446, + "grad_norm": 0.41193366050720215, + "learning_rate": 0.0006, + "loss": 2.1954, + "step": 1840 + }, + { + "epoch": 0.006900770648224823, + "grad_norm": 0.4568136930465698, + "learning_rate": 0.0006, + "loss": 2.2019, + "step": 1850 + }, + { + "epoch": 0.006900770648224823, + "eval_valid_loss": 2.242241859436035, + "eval_valid_loss/all": 2.0995750427246094, + "eval_valid_loss/end_span": 1.2407517433166504, + 
"eval_valid_perplexity/batch": 8.162700653076172, + "eval_valid_perplexity/end_span": 3.45821213722229, + "eval_valid_perplexity/fim": 2.2322020530700684, + "eval_valid_perplexity/first_seq": 14.352822303771973, + "eval_valid_perplexity/last_seq": 9.300773620605469, + "eval_valid_perplexity/second_seq": 14.194681167602539, + "eval_valid_perplexity/seq": 9.188125610351562, + "eval_valid_reconstruction/all": 0.28123199939727783, + "eval_valid_reconstruction/end_span": 0.7096807956695557, + "eval_valid_reconstruction/fim": 0.15111194550991058, + "eval_valid_reconstruction/first_seq": 0.17770424485206604, + "eval_valid_reconstruction/last_seq": 0.3195769786834717, + "eval_valid_reconstruction/second_seq": 0.18613553047180176, + "eval_valid_runtime": 625.3918, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 1850 + }, + { + "epoch": 0.006900770648224823, + "eval_train_loss": 2.239694356918335, + "eval_train_loss/all": 2.0690901279449463, + "eval_train_loss/end_span": 1.2115142345428467, + "eval_train_perplexity/batch": 7.91761589050293, + "eval_train_perplexity/end_span": 3.3585665225982666, + "eval_train_perplexity/fim": 2.2483599185943604, + "eval_train_perplexity/first_seq": 15.578682899475098, + "eval_train_perplexity/last_seq": 9.460016250610352, + "eval_train_perplexity/second_seq": 14.357412338256836, + "eval_train_perplexity/seq": 9.10731029510498, + "eval_train_reconstruction/all": 0.271443247795105, + "eval_train_reconstruction/end_span": 0.722449779510498, + "eval_train_reconstruction/fim": 0.15269255638122559, + "eval_train_reconstruction/first_seq": 0.15318846702575684, + "eval_train_reconstruction/last_seq": 0.3099044859409332, + "eval_train_reconstruction/second_seq": 0.17997196316719055, + "eval_train_runtime": 632.4011, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 1850 + }, + { + "epoch": 0.006938072111188201, + "grad_norm": 0.6219358444213867, + "learning_rate": 
0.0006, + "loss": 2.2524, + "step": 1860 + }, + { + "epoch": 0.006975373574151578, + "grad_norm": 0.43958398699760437, + "learning_rate": 0.0006, + "loss": 2.3515, + "step": 1870 + }, + { + "epoch": 0.0070126750371149555, + "grad_norm": 0.3278726637363434, + "learning_rate": 0.0006, + "loss": 2.3189, + "step": 1880 + }, + { + "epoch": 0.007049976500078333, + "grad_norm": 0.4087875485420227, + "learning_rate": 0.0006, + "loss": 2.2173, + "step": 1890 + }, + { + "epoch": 0.00708727796304171, + "grad_norm": 0.48097413778305054, + "learning_rate": 0.0006, + "loss": 2.1712, + "step": 1900 + }, + { + "epoch": 0.00708727796304171, + "eval_valid_loss": 2.242457866668701, + "eval_valid_loss/all": 2.0997891426086426, + "eval_valid_loss/end_span": 1.217121958732605, + "eval_valid_perplexity/batch": 8.164447784423828, + "eval_valid_perplexity/end_span": 3.377453327178955, + "eval_valid_perplexity/fim": 2.480557441711426, + "eval_valid_perplexity/first_seq": 14.955142974853516, + "eval_valid_perplexity/last_seq": 9.612415313720703, + "eval_valid_perplexity/second_seq": 14.155356407165527, + "eval_valid_perplexity/seq": 9.195425033569336, + "eval_valid_reconstruction/all": 0.28096655011177063, + "eval_valid_reconstruction/end_span": 0.7220392227172852, + "eval_valid_reconstruction/fim": 0.16940589249134064, + "eval_valid_reconstruction/first_seq": 0.16607755422592163, + "eval_valid_reconstruction/last_seq": 0.3087802827358246, + "eval_valid_reconstruction/second_seq": 0.185500368475914, + "eval_valid_runtime": 625.7875, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 1900 + }, + { + "epoch": 0.00708727796304171, + "eval_train_loss": 2.2393271923065186, + "eval_train_loss/all": 2.0685627460479736, + "eval_train_loss/end_span": 1.1728997230529785, + "eval_train_perplexity/batch": 7.913441181182861, + "eval_train_perplexity/end_span": 3.231348991394043, + "eval_train_perplexity/fim": 2.418208599090576, + "eval_train_perplexity/first_seq": 
15.516197204589844, + "eval_train_perplexity/last_seq": 9.619400978088379, + "eval_train_perplexity/second_seq": 14.487268447875977, + "eval_train_perplexity/seq": 9.104260444641113, + "eval_train_reconstruction/all": 0.2712896764278412, + "eval_train_reconstruction/end_span": 0.7371212244033813, + "eval_train_reconstruction/fim": 0.1660480946302414, + "eval_train_reconstruction/first_seq": 0.15065868198871613, + "eval_train_reconstruction/last_seq": 0.3045074939727783, + "eval_train_reconstruction/second_seq": 0.17706382274627686, + "eval_train_runtime": 639.4378, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 1900 + }, + { + "epoch": 0.007124579426005088, + "grad_norm": 0.373972088098526, + "learning_rate": 0.0006, + "loss": 2.2489, + "step": 1910 + }, + { + "epoch": 0.0071618808889684655, + "grad_norm": 0.28186315298080444, + "learning_rate": 0.0006, + "loss": 2.2109, + "step": 1920 + }, + { + "epoch": 0.007199182351931843, + "grad_norm": 0.44769856333732605, + "learning_rate": 0.0006, + "loss": 2.3451, + "step": 1930 + }, + { + "epoch": 0.00723648381489522, + "grad_norm": 0.3849020302295685, + "learning_rate": 0.0006, + "loss": 2.2524, + "step": 1940 + }, + { + "epoch": 0.007273785277858598, + "grad_norm": 0.409803181886673, + "learning_rate": 0.0006, + "loss": 2.2518, + "step": 1950 + }, + { + "epoch": 0.007273785277858598, + "eval_valid_loss": 2.241473436355591, + "eval_valid_loss/all": 2.0987229347229004, + "eval_valid_loss/end_span": 1.2849832773208618, + "eval_valid_perplexity/batch": 8.155747413635254, + "eval_valid_perplexity/end_span": 3.614607572555542, + "eval_valid_perplexity/fim": 2.132458448410034, + "eval_valid_perplexity/first_seq": 15.107767105102539, + "eval_valid_perplexity/last_seq": 9.422272682189941, + "eval_valid_perplexity/second_seq": 13.65632438659668, + "eval_valid_perplexity/seq": 9.181624412536621, + "eval_valid_reconstruction/all": 0.28114643692970276, + "eval_valid_reconstruction/end_span": 
0.7010515332221985, + "eval_valid_reconstruction/fim": 0.1424456536769867, + "eval_valid_reconstruction/first_seq": 0.16092748939990997, + "eval_valid_reconstruction/last_seq": 0.31528612971305847, + "eval_valid_reconstruction/second_seq": 0.19793008267879486, + "eval_valid_runtime": 636.9618, + "eval_valid_samples_per_second": 0.301, + "eval_valid_steps_per_second": 0.301, + "step": 1950 + }, + { + "epoch": 0.007273785277858598, + "eval_train_loss": 2.23796010017395, + "eval_train_loss/all": 2.0670711994171143, + "eval_train_loss/end_span": 1.2404931783676147, + "eval_train_perplexity/batch": 7.901646614074707, + "eval_train_perplexity/end_span": 3.457318067550659, + "eval_train_perplexity/fim": 2.1007285118103027, + "eval_train_perplexity/first_seq": 15.204852104187012, + "eval_train_perplexity/last_seq": 9.518413543701172, + "eval_train_perplexity/second_seq": 13.957541465759277, + "eval_train_perplexity/seq": 9.0835599899292, + "eval_train_reconstruction/all": 0.2717253863811493, + "eval_train_reconstruction/end_span": 0.7140697836875916, + "eval_train_reconstruction/fim": 0.14078885316848755, + "eval_train_reconstruction/first_seq": 0.16013920307159424, + "eval_train_reconstruction/last_seq": 0.309790700674057, + "eval_train_reconstruction/second_seq": 0.19082729518413544, + "eval_train_runtime": 647.5833, + "eval_train_samples_per_second": 0.296, + "eval_train_steps_per_second": 0.296, + "step": 1950 + }, + { + "epoch": 0.0073110867408219754, + "grad_norm": 0.41368186473846436, + "learning_rate": 0.0006, + "loss": 2.3988, + "step": 1960 + }, + { + "epoch": 0.007348388203785352, + "grad_norm": 0.3983208239078522, + "learning_rate": 0.0006, + "loss": 2.2156, + "step": 1970 + }, + { + "epoch": 0.00738568966674873, + "grad_norm": 0.3345157206058502, + "learning_rate": 0.0006, + "loss": 2.4248, + "step": 1980 + }, + { + "epoch": 0.007422991129712108, + "grad_norm": 0.3138173222541809, + "learning_rate": 0.0006, + "loss": 2.3003, + "step": 1990 + }, + { + "epoch": 
0.0074602925926754845, + "grad_norm": 0.35836175084114075, + "learning_rate": 0.0006, + "loss": 2.2451, + "step": 2000 + }, + { + "epoch": 0.0074602925926754845, + "eval_valid_loss": 2.237790107727051, + "eval_valid_loss/all": 2.095613718032837, + "eval_valid_loss/end_span": 1.2597285509109497, + "eval_valid_perplexity/batch": 8.1304292678833, + "eval_valid_perplexity/end_span": 3.5244646072387695, + "eval_valid_perplexity/fim": 2.6033596992492676, + "eval_valid_perplexity/first_seq": 14.65806770324707, + "eval_valid_perplexity/last_seq": 9.659465789794922, + "eval_valid_perplexity/second_seq": 13.931994438171387, + "eval_valid_perplexity/seq": 9.151252746582031, + "eval_valid_reconstruction/all": 0.28177163004875183, + "eval_valid_reconstruction/end_span": 0.6954950094223022, + "eval_valid_reconstruction/fim": 0.17980250716209412, + "eval_valid_reconstruction/first_seq": 0.1736697554588318, + "eval_valid_reconstruction/last_seq": 0.30690810084342957, + "eval_valid_reconstruction/second_seq": 0.1900932639837265, + "eval_valid_runtime": 646.5063, + "eval_valid_samples_per_second": 0.297, + "eval_valid_steps_per_second": 0.297, + "step": 2000 + }, + { + "epoch": 0.0074602925926754845, + "eval_train_loss": 2.2355480194091797, + "eval_train_loss/all": 2.0653116703033447, + "eval_train_loss/end_span": 1.2312179803848267, + "eval_train_perplexity/batch": 7.887755870819092, + "eval_train_perplexity/end_span": 3.4253990650177, + "eval_train_perplexity/fim": 2.0345404148101807, + "eval_train_perplexity/first_seq": 15.544281959533691, + "eval_train_perplexity/last_seq": 9.68161678314209, + "eval_train_perplexity/second_seq": 14.44493579864502, + "eval_train_perplexity/seq": 9.071104049682617, + "eval_train_reconstruction/all": 0.27199655771255493, + "eval_train_reconstruction/end_span": 0.7066817879676819, + "eval_train_reconstruction/fim": 0.13477636873722076, + "eval_train_reconstruction/first_seq": 0.14709660410881042, + "eval_train_reconstruction/last_seq": 
0.30546554923057556, + "eval_train_reconstruction/second_seq": 0.1804620623588562, + "eval_train_runtime": 646.0528, + "eval_train_samples_per_second": 0.297, + "eval_train_steps_per_second": 0.297, + "step": 2000 + }, + { + "epoch": 0.007497594055638862, + "grad_norm": 0.38600027561187744, + "learning_rate": 0.0006, + "loss": 2.368, + "step": 2010 + }, + { + "epoch": 0.00753489551860224, + "grad_norm": 0.3272983133792877, + "learning_rate": 0.0006, + "loss": 2.2853, + "step": 2020 + }, + { + "epoch": 0.007572196981565617, + "grad_norm": 0.41665783524513245, + "learning_rate": 0.0006, + "loss": 2.2797, + "step": 2030 + }, + { + "epoch": 0.0076094984445289945, + "grad_norm": 0.4070238173007965, + "learning_rate": 0.0006, + "loss": 2.3492, + "step": 2040 + }, + { + "epoch": 0.007646799907492372, + "grad_norm": 0.47201308608055115, + "learning_rate": 0.0006, + "loss": 2.3766, + "step": 2050 + }, + { + "epoch": 0.007646799907492372, + "eval_valid_loss": 2.241835832595825, + "eval_valid_loss/all": 2.0992271900177, + "eval_valid_loss/end_span": 1.2236748933792114, + "eval_valid_perplexity/batch": 8.15986156463623, + "eval_valid_perplexity/end_span": 3.399658203125, + "eval_valid_perplexity/fim": 2.12453556060791, + "eval_valid_perplexity/first_seq": 15.067255973815918, + "eval_valid_perplexity/last_seq": 9.480020523071289, + "eval_valid_perplexity/second_seq": 13.950920104980469, + "eval_valid_perplexity/seq": 9.18645191192627, + "eval_valid_reconstruction/all": 0.28109312057495117, + "eval_valid_reconstruction/end_span": 0.7141945362091064, + "eval_valid_reconstruction/fim": 0.1421593427658081, + "eval_valid_reconstruction/first_seq": 0.1637456864118576, + "eval_valid_reconstruction/last_seq": 0.31397271156311035, + "eval_valid_reconstruction/second_seq": 0.18850868940353394, + "eval_valid_runtime": 626.9605, + "eval_valid_samples_per_second": 0.306, + "eval_valid_steps_per_second": 0.306, + "step": 2050 + }, + { + "epoch": 0.007646799907492372, + "eval_train_loss": 
2.240173578262329, + "eval_train_loss/all": 2.069831371307373, + "eval_train_loss/end_span": 1.1850767135620117, + "eval_train_perplexity/batch": 7.923486709594727, + "eval_train_perplexity/end_span": 3.27093768119812, + "eval_train_perplexity/fim": 2.1765575408935547, + "eval_train_perplexity/first_seq": 15.771288871765137, + "eval_train_perplexity/last_seq": 9.517255783081055, + "eval_train_perplexity/second_seq": 14.319974899291992, + "eval_train_perplexity/seq": 9.120083808898926, + "eval_train_reconstruction/all": 0.2711729109287262, + "eval_train_reconstruction/end_span": 0.7263896465301514, + "eval_train_reconstruction/fim": 0.1469624936580658, + "eval_train_reconstruction/first_seq": 0.14884717762470245, + "eval_train_reconstruction/last_seq": 0.3087036907672882, + "eval_train_reconstruction/second_seq": 0.18550162017345428, + "eval_train_runtime": 635.9816, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 2050 + }, + { + "epoch": 0.007684101370455749, + "grad_norm": 0.5402010679244995, + "learning_rate": 0.0006, + "loss": 2.3624, + "step": 2060 + }, + { + "epoch": 0.007721402833419127, + "grad_norm": 0.39548781514167786, + "learning_rate": 0.0006, + "loss": 2.3497, + "step": 2070 + }, + { + "epoch": 0.007758704296382504, + "grad_norm": 0.3721226453781128, + "learning_rate": 0.0006, + "loss": 2.3595, + "step": 2080 + }, + { + "epoch": 0.007796005759345881, + "grad_norm": 0.5265329480171204, + "learning_rate": 0.0006, + "loss": 2.1538, + "step": 2090 + }, + { + "epoch": 0.007833307222309259, + "grad_norm": 0.5272058248519897, + "learning_rate": 0.0006, + "loss": 2.3162, + "step": 2100 + }, + { + "epoch": 0.007833307222309259, + "eval_valid_loss": 2.250322103500366, + "eval_valid_loss/all": 2.1069390773773193, + "eval_valid_loss/end_span": 1.3185569047927856, + "eval_valid_perplexity/batch": 8.22303295135498, + "eval_valid_perplexity/end_span": 3.738023042678833, + "eval_valid_perplexity/fim": 2.3998353481292725, + 
"eval_valid_perplexity/first_seq": 14.865644454956055, + "eval_valid_perplexity/last_seq": 9.287750244140625, + "eval_valid_perplexity/second_seq": 14.109410285949707, + "eval_valid_perplexity/seq": 9.253811836242676, + "eval_valid_reconstruction/all": 0.2787409722805023, + "eval_valid_reconstruction/end_span": 0.6936265230178833, + "eval_valid_reconstruction/fim": 0.16298659145832062, + "eval_valid_reconstruction/first_seq": 0.1673172563314438, + "eval_valid_reconstruction/last_seq": 0.31877174973487854, + "eval_valid_reconstruction/second_seq": 0.18629372119903564, + "eval_valid_runtime": 634.4215, + "eval_valid_samples_per_second": 0.303, + "eval_valid_steps_per_second": 0.303, + "step": 2100 + }, + { + "epoch": 0.007833307222309259, + "eval_train_loss": 2.247313976287842, + "eval_train_loss/all": 2.075646162033081, + "eval_train_loss/end_span": 1.2854506969451904, + "eval_train_perplexity/batch": 7.9696946144104, + "eval_train_perplexity/end_span": 3.616297483444214, + "eval_train_perplexity/fim": 2.2463109493255615, + "eval_train_perplexity/first_seq": 15.741111755371094, + "eval_train_perplexity/last_seq": 9.68239688873291, + "eval_train_perplexity/second_seq": 14.335671424865723, + "eval_train_perplexity/seq": 9.163172721862793, + "eval_train_reconstruction/all": 0.2693150043487549, + "eval_train_reconstruction/end_span": 0.7050355672836304, + "eval_train_reconstruction/fim": 0.1497105062007904, + "eval_train_reconstruction/first_seq": 0.14910955727100372, + "eval_train_reconstruction/last_seq": 0.3035958707332611, + "eval_train_reconstruction/second_seq": 0.1843090057373047, + "eval_train_runtime": 634.994, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 2100 + }, + { + "epoch": 0.007870608685272637, + "grad_norm": 0.5018007159233093, + "learning_rate": 0.0006, + "loss": 2.2436, + "step": 2110 + }, + { + "epoch": 0.007907910148236014, + "grad_norm": 0.4556424617767334, + "learning_rate": 0.0006, + "loss": 2.3434, 
+ "step": 2120 + }, + { + "epoch": 0.007945211611199392, + "grad_norm": 0.5410382747650146, + "learning_rate": 0.0006, + "loss": 1.9881, + "step": 2130 + }, + { + "epoch": 0.007982513074162768, + "grad_norm": 0.4289705157279968, + "learning_rate": 0.0006, + "loss": 2.0446, + "step": 2140 + }, + { + "epoch": 0.008019814537126146, + "grad_norm": 0.7037988305091858, + "learning_rate": 0.0006, + "loss": 2.3986, + "step": 2150 + }, + { + "epoch": 0.008019814537126146, + "eval_valid_loss": 2.2474124431610107, + "eval_valid_loss/all": 2.1046881675720215, + "eval_valid_loss/end_span": 1.328887939453125, + "eval_valid_perplexity/batch": 8.204544067382812, + "eval_valid_perplexity/end_span": 3.776840925216675, + "eval_valid_perplexity/fim": 2.2567074298858643, + "eval_valid_perplexity/first_seq": 14.979233741760254, + "eval_valid_perplexity/last_seq": 9.20396614074707, + "eval_valid_perplexity/second_seq": 14.15402603149414, + "eval_valid_perplexity/seq": 9.236544609069824, + "eval_valid_reconstruction/all": 0.2793480157852173, + "eval_valid_reconstruction/end_span": 0.690510630607605, + "eval_valid_reconstruction/fim": 0.15213409066200256, + "eval_valid_reconstruction/first_seq": 0.16845886409282684, + "eval_valid_reconstruction/last_seq": 0.3194420337677002, + "eval_valid_reconstruction/second_seq": 0.18445347249507904, + "eval_valid_runtime": 630.4981, + "eval_valid_samples_per_second": 0.305, + "eval_valid_steps_per_second": 0.305, + "step": 2150 + }, + { + "epoch": 0.008019814537126146, + "eval_train_loss": 2.242889642715454, + "eval_train_loss/all": 2.07201886177063, + "eval_train_loss/end_span": 1.280889630317688, + "eval_train_perplexity/batch": 7.94083833694458, + "eval_train_perplexity/end_span": 3.5998408794403076, + "eval_train_perplexity/fim": 2.2475738525390625, + "eval_train_perplexity/first_seq": 15.54761791229248, + "eval_train_perplexity/last_seq": 9.0745267868042, + "eval_train_perplexity/second_seq": 14.373156547546387, + "eval_train_perplexity/seq": 
9.130599021911621, + "eval_train_reconstruction/all": 0.2705344259738922, + "eval_train_reconstruction/end_span": 0.7055846452713013, + "eval_train_reconstruction/fim": 0.15172220766544342, + "eval_train_reconstruction/first_seq": 0.14930158853530884, + "eval_train_reconstruction/last_seq": 0.3225395083427429, + "eval_train_reconstruction/second_seq": 0.1816522181034088, + "eval_train_runtime": 643.5497, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 2150 + }, + { + "epoch": 0.008057116000089523, + "grad_norm": 0.4329743981361389, + "learning_rate": 0.0006, + "loss": 2.3315, + "step": 2160 + }, + { + "epoch": 0.008094417463052901, + "grad_norm": 0.5637417435646057, + "learning_rate": 0.0006, + "loss": 2.278, + "step": 2170 + }, + { + "epoch": 0.008131718926016279, + "grad_norm": 0.3693992495536804, + "learning_rate": 0.0006, + "loss": 2.1783, + "step": 2180 + }, + { + "epoch": 0.008169020388979657, + "grad_norm": 0.808887243270874, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 2190 + }, + { + "epoch": 0.008206321851943033, + "grad_norm": 0.33626142144203186, + "learning_rate": 0.0006, + "loss": 2.2263, + "step": 2200 + }, + { + "epoch": 0.008206321851943033, + "eval_valid_loss": 2.2393622398376465, + "eval_valid_loss/all": 2.0968613624572754, + "eval_valid_loss/end_span": 1.322473406791687, + "eval_valid_perplexity/batch": 8.140579223632812, + "eval_valid_perplexity/end_span": 3.7526917457580566, + "eval_valid_perplexity/fim": 2.3649039268493652, + "eval_valid_perplexity/first_seq": 15.021223068237305, + "eval_valid_perplexity/last_seq": 9.42583179473877, + "eval_valid_perplexity/second_seq": 14.13618278503418, + "eval_valid_perplexity/seq": 9.161613464355469, + "eval_valid_reconstruction/all": 0.2818208932876587, + "eval_valid_reconstruction/end_span": 0.6908425688743591, + "eval_valid_reconstruction/fim": 0.16137754917144775, + "eval_valid_reconstruction/first_seq": 0.1658455729484558, + 
"eval_valid_reconstruction/last_seq": 0.31510964035987854, + "eval_valid_reconstruction/second_seq": 0.18534503877162933, + "eval_valid_runtime": 628.7885, + "eval_valid_samples_per_second": 0.305, + "eval_valid_steps_per_second": 0.305, + "step": 2200 + }, + { + "epoch": 0.008206321851943033, + "eval_train_loss": 2.236042022705078, + "eval_train_loss/all": 2.0659799575805664, + "eval_train_loss/end_span": 1.2782838344573975, + "eval_train_perplexity/batch": 7.893028736114502, + "eval_train_perplexity/end_span": 3.59047269821167, + "eval_train_perplexity/fim": 2.1677284240722656, + "eval_train_perplexity/first_seq": 15.830184936523438, + "eval_train_perplexity/last_seq": 8.970903396606445, + "eval_train_perplexity/second_seq": 14.232221603393555, + "eval_train_perplexity/seq": 9.076751708984375, + "eval_train_reconstruction/all": 0.2723698616027832, + "eval_train_reconstruction/end_span": 0.7040180563926697, + "eval_train_reconstruction/fim": 0.145681232213974, + "eval_train_reconstruction/first_seq": 0.14854763448238373, + "eval_train_reconstruction/last_seq": 0.32645246386528015, + "eval_train_reconstruction/second_seq": 0.18584184348583221, + "eval_train_runtime": 639.0778, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 2200 + }, + { + "epoch": 0.00824362331490641, + "grad_norm": 0.31903693079948425, + "learning_rate": 0.0006, + "loss": 2.2961, + "step": 2210 + }, + { + "epoch": 0.008280924777869788, + "grad_norm": 0.40031635761260986, + "learning_rate": 0.0006, + "loss": 2.3372, + "step": 2220 + }, + { + "epoch": 0.008318226240833166, + "grad_norm": 0.3510777950286865, + "learning_rate": 0.0006, + "loss": 2.3589, + "step": 2230 + }, + { + "epoch": 0.008355527703796543, + "grad_norm": 0.5148348212242126, + "learning_rate": 0.0006, + "loss": 2.2554, + "step": 2240 + }, + { + "epoch": 0.008392829166759921, + "grad_norm": 0.4196660816669464, + "learning_rate": 0.0006, + "loss": 2.4534, + "step": 2250 + }, + { + "epoch": 
0.008392829166759921, + "eval_valid_loss": 2.237905502319336, + "eval_valid_loss/all": 2.095485210418701, + "eval_valid_loss/end_span": 1.470793604850769, + "eval_valid_perplexity/batch": 8.12938404083252, + "eval_valid_perplexity/end_span": 4.352688312530518, + "eval_valid_perplexity/fim": 2.3983237743377686, + "eval_valid_perplexity/first_seq": 14.849035263061523, + "eval_valid_perplexity/last_seq": 9.503610610961914, + "eval_valid_perplexity/second_seq": 13.936535835266113, + "eval_valid_perplexity/seq": 9.149510383605957, + "eval_valid_reconstruction/all": 0.28186550736427307, + "eval_valid_reconstruction/end_span": 0.6470407247543335, + "eval_valid_reconstruction/fim": 0.1653302013874054, + "eval_valid_reconstruction/first_seq": 0.16777466237545013, + "eval_valid_reconstruction/last_seq": 0.31099992990493774, + "eval_valid_reconstruction/second_seq": 0.18996739387512207, + "eval_valid_runtime": 641.5632, + "eval_valid_samples_per_second": 0.299, + "eval_valid_steps_per_second": 0.299, + "step": 2250 + }, + { + "epoch": 0.008392829166759921, + "eval_train_loss": 2.234093427658081, + "eval_train_loss/all": 2.064101457595825, + "eval_train_loss/end_span": 1.4324020147323608, + "eval_train_perplexity/batch": 7.878215789794922, + "eval_train_perplexity/end_span": 4.188748359680176, + "eval_train_perplexity/fim": 2.228761672973633, + "eval_train_perplexity/first_seq": 15.394355773925781, + "eval_train_perplexity/last_seq": 9.062739372253418, + "eval_train_perplexity/second_seq": 14.310056686401367, + "eval_train_perplexity/seq": 9.0599946975708, + "eval_train_reconstruction/all": 0.2726552188396454, + "eval_train_reconstruction/end_span": 0.6562138199806213, + "eval_train_reconstruction/fim": 0.15186849236488342, + "eval_train_reconstruction/first_seq": 0.15375898778438568, + "eval_train_reconstruction/last_seq": 0.32320815324783325, + "eval_train_reconstruction/second_seq": 0.17947395145893097, + "eval_train_runtime": 648.189, + "eval_train_samples_per_second": 
0.296, + "eval_train_steps_per_second": 0.296, + "step": 2250 + }, + { + "epoch": 0.008430130629723297, + "grad_norm": 0.39679625630378723, + "learning_rate": 0.0006, + "loss": 2.3, + "step": 2260 + }, + { + "epoch": 0.008467432092686675, + "grad_norm": 0.46270424127578735, + "learning_rate": 0.0006, + "loss": 2.241, + "step": 2270 + }, + { + "epoch": 0.008504733555650052, + "grad_norm": 0.45431438088417053, + "learning_rate": 0.0006, + "loss": 2.2942, + "step": 2280 + }, + { + "epoch": 0.00854203501861343, + "grad_norm": 0.55674147605896, + "learning_rate": 0.0006, + "loss": 2.224, + "step": 2290 + }, + { + "epoch": 0.008579336481576808, + "grad_norm": 0.419575035572052, + "learning_rate": 0.0006, + "loss": 2.2897, + "step": 2300 + }, + { + "epoch": 0.008579336481576808, + "eval_valid_loss": 2.236297607421875, + "eval_valid_loss/all": 2.0937557220458984, + "eval_valid_loss/end_span": 1.3741178512573242, + "eval_valid_perplexity/batch": 8.115337371826172, + "eval_valid_perplexity/end_span": 3.951589345932007, + "eval_valid_perplexity/fim": 2.213665246963501, + "eval_valid_perplexity/first_seq": 15.084228515625, + "eval_valid_perplexity/last_seq": 9.571427345275879, + "eval_valid_perplexity/second_seq": 13.92862606048584, + "eval_valid_perplexity/seq": 9.135486602783203, + "eval_valid_reconstruction/all": 0.28273817896842957, + "eval_valid_reconstruction/end_span": 0.6832786798477173, + "eval_valid_reconstruction/fim": 0.15027910470962524, + "eval_valid_reconstruction/first_seq": 0.1614372581243515, + "eval_valid_reconstruction/last_seq": 0.3063466250896454, + "eval_valid_reconstruction/second_seq": 0.19458769261837006, + "eval_valid_runtime": 651.2853, + "eval_valid_samples_per_second": 0.295, + "eval_valid_steps_per_second": 0.295, + "step": 2300 + }, + { + "epoch": 0.008579336481576808, + "eval_train_loss": 2.2353713512420654, + "eval_train_loss/all": 2.0649266242980957, + "eval_train_loss/end_span": 1.3320050239562988, + "eval_train_perplexity/batch": 
7.884719371795654, + "eval_train_perplexity/end_span": 3.7886321544647217, + "eval_train_perplexity/fim": 1.909315586090088, + "eval_train_perplexity/first_seq": 15.525529861450195, + "eval_train_perplexity/last_seq": 9.081412315368652, + "eval_train_perplexity/second_seq": 14.382306098937988, + "eval_train_perplexity/seq": 9.069936752319336, + "eval_train_reconstruction/all": 0.2724943161010742, + "eval_train_reconstruction/end_span": 0.6949769854545593, + "eval_train_reconstruction/fim": 0.12246307730674744, + "eval_train_reconstruction/first_seq": 0.15225553512573242, + "eval_train_reconstruction/last_seq": 0.32182347774505615, + "eval_train_reconstruction/second_seq": 0.17911818623542786, + "eval_train_runtime": 649.5118, + "eval_train_samples_per_second": 0.296, + "eval_train_steps_per_second": 0.296, + "step": 2300 + }, + { + "epoch": 0.008616637944540186, + "grad_norm": 0.31903156638145447, + "learning_rate": 0.0006, + "loss": 2.2779, + "step": 2310 + }, + { + "epoch": 0.008653939407503562, + "grad_norm": 0.3503912389278412, + "learning_rate": 0.0006, + "loss": 2.1477, + "step": 2320 + }, + { + "epoch": 0.00869124087046694, + "grad_norm": 0.5036163926124573, + "learning_rate": 0.0006, + "loss": 2.1661, + "step": 2330 + }, + { + "epoch": 0.008728542333430317, + "grad_norm": 0.5090698003768921, + "learning_rate": 0.0006, + "loss": 2.3276, + "step": 2340 + }, + { + "epoch": 0.008765843796393695, + "grad_norm": 0.40694355964660645, + "learning_rate": 0.0006, + "loss": 2.2977, + "step": 2350 + }, + { + "epoch": 0.008765843796393695, + "eval_valid_loss": 2.243588924407959, + "eval_valid_loss/all": 2.100980043411255, + "eval_valid_loss/end_span": 1.345572590827942, + "eval_valid_perplexity/batch": 8.174177169799805, + "eval_valid_perplexity/end_span": 3.8403849601745605, + "eval_valid_perplexity/fim": 2.4077413082122803, + "eval_valid_perplexity/first_seq": 14.92616081237793, + "eval_valid_perplexity/last_seq": 9.610549926757812, + 
"eval_valid_perplexity/second_seq": 13.798062324523926, + "eval_valid_perplexity/seq": 9.194979667663574, + "eval_valid_reconstruction/all": 0.2805027961730957, + "eval_valid_reconstruction/end_span": 0.6832299828529358, + "eval_valid_reconstruction/fim": 0.16498133540153503, + "eval_valid_reconstruction/first_seq": 0.1672661006450653, + "eval_valid_reconstruction/last_seq": 0.30906054377555847, + "eval_valid_reconstruction/second_seq": 0.1957000494003296, + "eval_valid_runtime": 656.9811, + "eval_valid_samples_per_second": 0.292, + "eval_valid_steps_per_second": 0.292, + "step": 2350 + }, + { + "epoch": 0.008765843796393695, + "eval_train_loss": 2.2396137714385986, + "eval_train_loss/all": 2.0688483715057373, + "eval_train_loss/end_span": 1.303581714630127, + "eval_train_perplexity/batch": 7.915701866149902, + "eval_train_perplexity/end_span": 3.682462692260742, + "eval_train_perplexity/fim": 2.0766618251800537, + "eval_train_perplexity/first_seq": 15.201045989990234, + "eval_train_perplexity/last_seq": 9.346453666687012, + "eval_train_perplexity/second_seq": 14.356515884399414, + "eval_train_perplexity/seq": 9.101983070373535, + "eval_train_reconstruction/all": 0.27135932445526123, + "eval_train_reconstruction/end_span": 0.6937721371650696, + "eval_train_reconstruction/fim": 0.13718047738075256, + "eval_train_reconstruction/first_seq": 0.15875525772571564, + "eval_train_reconstruction/last_seq": 0.31719768047332764, + "eval_train_reconstruction/second_seq": 0.1805734783411026, + "eval_train_runtime": 647.8537, + "eval_train_samples_per_second": 0.296, + "eval_train_steps_per_second": 0.296, + "step": 2350 + }, + { + "epoch": 0.008803145259357072, + "grad_norm": 0.4405331313610077, + "learning_rate": 0.0006, + "loss": 2.2958, + "step": 2360 + }, + { + "epoch": 0.00884044672232045, + "grad_norm": 0.3900534510612488, + "learning_rate": 0.0006, + "loss": 2.1664, + "step": 2370 + }, + { + "epoch": 0.008877748185283826, + "grad_norm": 1.1057474613189697, + 
"learning_rate": 0.0006, + "loss": 2.2836, + "step": 2380 + }, + { + "epoch": 0.008915049648247204, + "grad_norm": 0.472281277179718, + "learning_rate": 0.0006, + "loss": 2.4382, + "step": 2390 + }, + { + "epoch": 0.008952351111210581, + "grad_norm": 0.5062803030014038, + "learning_rate": 0.0006, + "loss": 2.149, + "step": 2400 + }, + { + "epoch": 0.008952351111210581, + "eval_valid_loss": 2.2405800819396973, + "eval_valid_loss/all": 2.0982751846313477, + "eval_valid_loss/end_span": 1.2924737930297852, + "eval_valid_perplexity/batch": 8.15209674835205, + "eval_valid_perplexity/end_span": 3.641784429550171, + "eval_valid_perplexity/fim": 2.0833959579467773, + "eval_valid_perplexity/first_seq": 14.77958869934082, + "eval_valid_perplexity/last_seq": 9.441902160644531, + "eval_valid_perplexity/second_seq": 13.25843334197998, + "eval_valid_perplexity/seq": 9.173760414123535, + "eval_valid_reconstruction/all": 0.28127503395080566, + "eval_valid_reconstruction/end_span": 0.699150562286377, + "eval_valid_reconstruction/fim": 0.1370319128036499, + "eval_valid_reconstruction/first_seq": 0.17342543601989746, + "eval_valid_reconstruction/last_seq": 0.3118465542793274, + "eval_valid_reconstruction/second_seq": 0.21033529937267303, + "eval_valid_runtime": 657.5655, + "eval_valid_samples_per_second": 0.292, + "eval_valid_steps_per_second": 0.292, + "step": 2400 + }, + { + "epoch": 0.008952351111210581, + "eval_train_loss": 2.2347357273101807, + "eval_train_loss/all": 2.064485788345337, + "eval_train_loss/end_span": 1.2491209506988525, + "eval_train_perplexity/batch": 7.88124418258667, + "eval_train_perplexity/end_span": 3.487276077270508, + "eval_train_perplexity/fim": 2.0941858291625977, + "eval_train_perplexity/first_seq": 15.430083274841309, + "eval_train_perplexity/last_seq": 9.563831329345703, + "eval_train_perplexity/second_seq": 14.34086799621582, + "eval_train_perplexity/seq": 9.062447547912598, + "eval_train_reconstruction/all": 0.2727017402648926, + 
"eval_train_reconstruction/end_span": 0.7115058898925781, + "eval_train_reconstruction/fim": 0.14067508280277252, + "eval_train_reconstruction/first_seq": 0.15442608296871185, + "eval_train_reconstruction/last_seq": 0.3074468970298767, + "eval_train_reconstruction/second_seq": 0.18471863865852356, + "eval_train_runtime": 649.8977, + "eval_train_samples_per_second": 0.295, + "eval_train_steps_per_second": 0.295, + "step": 2400 + }, + { + "epoch": 0.008989652574173959, + "grad_norm": 0.38787928223609924, + "learning_rate": 0.0006, + "loss": 2.3825, + "step": 2410 + }, + { + "epoch": 0.009026954037137337, + "grad_norm": 0.576867938041687, + "learning_rate": 0.0006, + "loss": 2.2039, + "step": 2420 + }, + { + "epoch": 0.009064255500100715, + "grad_norm": 0.350676029920578, + "learning_rate": 0.0006, + "loss": 2.3203, + "step": 2430 + }, + { + "epoch": 0.00910155696306409, + "grad_norm": 0.4171850085258484, + "learning_rate": 0.0006, + "loss": 2.1613, + "step": 2440 + }, + { + "epoch": 0.009138858426027468, + "grad_norm": 0.4075266420841217, + "learning_rate": 0.0006, + "loss": 2.2054, + "step": 2450 + }, + { + "epoch": 0.009138858426027468, + "eval_valid_loss": 2.243842363357544, + "eval_valid_loss/all": 2.1013970375061035, + "eval_valid_loss/end_span": 1.3389216661453247, + "eval_valid_perplexity/batch": 8.177586555480957, + "eval_valid_perplexity/end_span": 3.814927577972412, + "eval_valid_perplexity/fim": 2.4268085956573486, + "eval_valid_perplexity/first_seq": 15.137747764587402, + "eval_valid_perplexity/last_seq": 9.151108741760254, + "eval_valid_perplexity/second_seq": 14.427836418151855, + "eval_valid_perplexity/seq": 9.208247184753418, + "eval_valid_reconstruction/all": 0.280428946018219, + "eval_valid_reconstruction/end_span": 0.6846887469291687, + "eval_valid_reconstruction/fim": 0.16560493409633636, + "eval_valid_reconstruction/first_seq": 0.16091537475585938, + "eval_valid_reconstruction/last_seq": 0.3215288519859314, + 
"eval_valid_reconstruction/second_seq": 0.18326090276241302, + "eval_valid_runtime": 659.6291, + "eval_valid_samples_per_second": 0.291, + "eval_valid_steps_per_second": 0.291, + "step": 2450 + }, + { + "epoch": 0.009138858426027468, + "eval_train_loss": 2.2407829761505127, + "eval_train_loss/all": 2.070347309112549, + "eval_train_loss/end_span": 1.2955939769744873, + "eval_train_perplexity/batch": 7.927576065063477, + "eval_train_perplexity/end_span": 3.653165340423584, + "eval_train_perplexity/fim": 2.331585645675659, + "eval_train_perplexity/first_seq": 15.93541431427002, + "eval_train_perplexity/last_seq": 9.818436622619629, + "eval_train_perplexity/second_seq": 14.139833450317383, + "eval_train_perplexity/seq": 9.118256568908691, + "eval_train_reconstruction/all": 0.2708781063556671, + "eval_train_reconstruction/end_span": 0.6966611742973328, + "eval_train_reconstruction/fim": 0.15831290185451508, + "eval_train_reconstruction/first_seq": 0.14335060119628906, + "eval_train_reconstruction/last_seq": 0.3004459738731384, + "eval_train_reconstruction/second_seq": 0.1870720088481903, + "eval_train_runtime": 658.0864, + "eval_train_samples_per_second": 0.292, + "eval_train_steps_per_second": 0.292, + "step": 2450 + }, + { + "epoch": 0.009176159888990846, + "grad_norm": 0.4742264151573181, + "learning_rate": 0.0006, + "loss": 2.3011, + "step": 2460 + }, + { + "epoch": 0.009213461351954224, + "grad_norm": 4.1947479248046875, + "learning_rate": 0.0006, + "loss": 2.2747, + "step": 2470 + }, + { + "epoch": 0.009250762814917601, + "grad_norm": 0.6589099764823914, + "learning_rate": 0.0006, + "loss": 2.3261, + "step": 2480 + }, + { + "epoch": 0.009288064277880979, + "grad_norm": 0.46395954489707947, + "learning_rate": 0.0006, + "loss": 2.2657, + "step": 2490 + }, + { + "epoch": 0.009325365740844357, + "grad_norm": 0.45722681283950806, + "learning_rate": 0.0006, + "loss": 2.3181, + "step": 2500 + }, + { + "epoch": 0.009325365740844357, + "eval_valid_loss": 
2.2516396045684814, + "eval_valid_loss/all": 2.1088249683380127, + "eval_valid_loss/end_span": 1.2655965089797974, + "eval_valid_perplexity/batch": 8.238554954528809, + "eval_valid_perplexity/end_span": 3.5452067852020264, + "eval_valid_perplexity/fim": 2.447633743286133, + "eval_valid_perplexity/first_seq": 14.687074661254883, + "eval_valid_perplexity/last_seq": 9.741211891174316, + "eval_valid_perplexity/second_seq": 14.060194969177246, + "eval_valid_perplexity/seq": 9.2902193069458, + "eval_valid_reconstruction/all": 0.27858421206474304, + "eval_valid_reconstruction/end_span": 0.7019385099411011, + "eval_valid_reconstruction/fim": 0.16734899580478668, + "eval_valid_reconstruction/first_seq": 0.17336443066596985, + "eval_valid_reconstruction/last_seq": 0.30081042647361755, + "eval_valid_reconstruction/second_seq": 0.18961845338344574, + "eval_valid_runtime": 649.3333, + "eval_valid_samples_per_second": 0.296, + "eval_valid_steps_per_second": 0.296, + "step": 2500 + }, + { + "epoch": 0.009325365740844357, + "eval_train_loss": 2.2446868419647217, + "eval_train_loss/all": 2.072404146194458, + "eval_train_loss/end_span": 1.2295786142349243, + "eval_train_perplexity/batch": 7.943898677825928, + "eval_train_perplexity/end_span": 3.419788122177124, + "eval_train_perplexity/fim": 2.121339797973633, + "eval_train_perplexity/first_seq": 15.416558265686035, + "eval_train_perplexity/last_seq": 9.68443489074707, + "eval_train_perplexity/second_seq": 14.278995513916016, + "eval_train_perplexity/seq": 9.124174118041992, + "eval_train_reconstruction/all": 0.2703578770160675, + "eval_train_reconstruction/end_span": 0.7131937742233276, + "eval_train_reconstruction/fim": 0.14193075895309448, + "eval_train_reconstruction/first_seq": 0.1539783924818039, + "eval_train_reconstruction/last_seq": 0.303122878074646, + "eval_train_reconstruction/second_seq": 0.1835634708404541, + "eval_train_runtime": 651.0166, + "eval_train_samples_per_second": 0.295, + "eval_train_steps_per_second": 
0.295, + "step": 2500 + }, + { + "epoch": 0.009362667203807733, + "grad_norm": 0.31506773829460144, + "learning_rate": 0.0006, + "loss": 2.3125, + "step": 2510 + }, + { + "epoch": 0.00939996866677111, + "grad_norm": 0.4985862374305725, + "learning_rate": 0.0006, + "loss": 2.3028, + "step": 2520 + }, + { + "epoch": 0.009437270129734488, + "grad_norm": 0.5626375079154968, + "learning_rate": 0.0006, + "loss": 2.1492, + "step": 2530 + }, + { + "epoch": 0.009474571592697866, + "grad_norm": 0.3860004246234894, + "learning_rate": 0.0006, + "loss": 2.3468, + "step": 2540 + }, + { + "epoch": 0.009511873055661244, + "grad_norm": 0.41654497385025024, + "learning_rate": 0.0006, + "loss": 2.3609, + "step": 2550 + }, + { + "epoch": 0.009511873055661244, + "eval_valid_loss": 2.239797830581665, + "eval_valid_loss/all": 2.0973832607269287, + "eval_valid_loss/end_span": 1.3683054447174072, + "eval_valid_perplexity/batch": 8.144828796386719, + "eval_valid_perplexity/end_span": 3.928687572479248, + "eval_valid_perplexity/fim": 2.7563974857330322, + "eval_valid_perplexity/first_seq": 14.635571479797363, + "eval_valid_perplexity/last_seq": 9.782234191894531, + "eval_valid_perplexity/second_seq": 13.71532154083252, + "eval_valid_perplexity/seq": 9.163992881774902, + "eval_valid_reconstruction/all": 0.28199416399002075, + "eval_valid_reconstruction/end_span": 0.6763367056846619, + "eval_valid_reconstruction/fim": 0.1919696182012558, + "eval_valid_reconstruction/first_seq": 0.17156575620174408, + "eval_valid_reconstruction/last_seq": 0.30152302980422974, + "eval_valid_reconstruction/second_seq": 0.19962801039218903, + "eval_valid_runtime": 653.3283, + "eval_valid_samples_per_second": 0.294, + "eval_valid_steps_per_second": 0.294, + "step": 2550 + }, + { + "epoch": 0.009511873055661244, + "eval_train_loss": 2.2349796295166016, + "eval_train_loss/all": 2.0647876262664795, + "eval_train_loss/end_span": 1.328717827796936, + "eval_train_perplexity/batch": 7.8836236000061035, + 
"eval_train_perplexity/end_span": 3.776198625564575, + "eval_train_perplexity/fim": 2.1323535442352295, + "eval_train_perplexity/first_seq": 15.335370063781738, + "eval_train_perplexity/last_seq": 9.3065824508667, + "eval_train_perplexity/second_seq": 14.68083667755127, + "eval_train_perplexity/seq": 9.06314468383789, + "eval_train_reconstruction/all": 0.2727149426937103, + "eval_train_reconstruction/end_span": 0.687257707118988, + "eval_train_reconstruction/fim": 0.14397260546684265, + "eval_train_reconstruction/first_seq": 0.15720146894454956, + "eval_train_reconstruction/last_seq": 0.3180648386478424, + "eval_train_reconstruction/second_seq": 0.1703791320323944, + "eval_train_runtime": 651.7701, + "eval_train_samples_per_second": 0.295, + "eval_train_steps_per_second": 0.295, + "step": 2550 + }, + { + "epoch": 0.009549174518624621, + "grad_norm": 0.33923444151878357, + "learning_rate": 0.0006, + "loss": 2.2444, + "step": 2560 + }, + { + "epoch": 0.009586475981587997, + "grad_norm": 0.30096036195755005, + "learning_rate": 0.0006, + "loss": 2.1801, + "step": 2570 + }, + { + "epoch": 0.009623777444551375, + "grad_norm": 0.5186877250671387, + "learning_rate": 0.0006, + "loss": 2.0677, + "step": 2580 + }, + { + "epoch": 0.009661078907514753, + "grad_norm": 0.4888685643672943, + "learning_rate": 0.0006, + "loss": 2.3045, + "step": 2590 + }, + { + "epoch": 0.00969838037047813, + "grad_norm": 0.6931986212730408, + "learning_rate": 0.0006, + "loss": 2.289, + "step": 2600 + }, + { + "epoch": 0.00969838037047813, + "eval_valid_loss": 2.248986005783081, + "eval_valid_loss/all": 2.105652332305908, + "eval_valid_loss/end_span": 1.2487999200820923, + "eval_valid_perplexity/batch": 8.212458610534668, + "eval_valid_perplexity/end_span": 3.486156702041626, + "eval_valid_perplexity/fim": 2.323934555053711, + "eval_valid_perplexity/first_seq": 14.982698440551758, + "eval_valid_perplexity/last_seq": 9.463404655456543, + "eval_valid_perplexity/second_seq": 13.694295883178711, + 
"eval_valid_perplexity/seq": 9.240920066833496, + "eval_valid_reconstruction/all": 0.2792055308818817, + "eval_valid_reconstruction/end_span": 0.7110755443572998, + "eval_valid_reconstruction/fim": 0.15691469609737396, + "eval_valid_reconstruction/first_seq": 0.16544070839881897, + "eval_valid_reconstruction/last_seq": 0.31005123257637024, + "eval_valid_reconstruction/second_seq": 0.19562852382659912, + "eval_valid_runtime": 648.811, + "eval_valid_samples_per_second": 0.296, + "eval_valid_steps_per_second": 0.296, + "step": 2600 + }, + { + "epoch": 0.00969838037047813, + "eval_train_loss": 2.2445240020751953, + "eval_train_loss/all": 2.0732362270355225, + "eval_train_loss/end_span": 1.2087589502334595, + "eval_train_perplexity/batch": 7.9505109786987305, + "eval_train_perplexity/end_span": 3.34932541847229, + "eval_train_perplexity/fim": 2.1699883937835693, + "eval_train_perplexity/first_seq": 15.702327728271484, + "eval_train_perplexity/last_seq": 9.727694511413574, + "eval_train_perplexity/second_seq": 14.019704818725586, + "eval_train_perplexity/seq": 9.143997192382812, + "eval_train_reconstruction/all": 0.26985496282577515, + "eval_train_reconstruction/end_span": 0.7220368981361389, + "eval_train_reconstruction/fim": 0.14465299248695374, + "eval_train_reconstruction/first_seq": 0.15252430737018585, + "eval_train_reconstruction/last_seq": 0.3011466860771179, + "eval_train_reconstruction/second_seq": 0.18753691017627716, + "eval_train_runtime": 657.7212, + "eval_train_samples_per_second": 0.292, + "eval_train_steps_per_second": 0.292, + "step": 2600 + }, + { + "epoch": 0.009735681833441508, + "grad_norm": 0.9431376457214355, + "learning_rate": 0.0006, + "loss": 2.3126, + "step": 2610 + }, + { + "epoch": 0.009772983296404886, + "grad_norm": 0.6110846400260925, + "learning_rate": 0.0006, + "loss": 2.167, + "step": 2620 + }, + { + "epoch": 0.009810284759368262, + "grad_norm": 0.4448404908180237, + "learning_rate": 0.0006, + "loss": 2.3257, + "step": 2630 + }, + { + 
"epoch": 0.00984758622233164, + "grad_norm": 0.3604948818683624, + "learning_rate": 0.0006, + "loss": 2.2652, + "step": 2640 + }, + { + "epoch": 0.009884887685295017, + "grad_norm": 0.5226991772651672, + "learning_rate": 0.0006, + "loss": 2.2497, + "step": 2650 + }, + { + "epoch": 0.009884887685295017, + "eval_valid_loss": 2.240445137023926, + "eval_valid_loss/all": 2.0979905128479004, + "eval_valid_loss/end_span": 1.353060007095337, + "eval_valid_perplexity/batch": 8.149776458740234, + "eval_valid_perplexity/end_span": 3.8692474365234375, + "eval_valid_perplexity/fim": 2.0927927494049072, + "eval_valid_perplexity/first_seq": 14.781854629516602, + "eval_valid_perplexity/last_seq": 9.286974906921387, + "eval_valid_perplexity/second_seq": 14.089553833007812, + "eval_valid_perplexity/seq": 9.173700332641602, + "eval_valid_reconstruction/all": 0.2813124358654022, + "eval_valid_reconstruction/end_span": 0.679826021194458, + "eval_valid_reconstruction/fim": 0.13856248557567596, + "eval_valid_reconstruction/first_seq": 0.1704687625169754, + "eval_valid_reconstruction/last_seq": 0.31882035732269287, + "eval_valid_reconstruction/second_seq": 0.18617551028728485, + "eval_valid_runtime": 678.2768, + "eval_valid_samples_per_second": 0.283, + "eval_valid_steps_per_second": 0.283, + "step": 2650 + }, + { + "epoch": 0.009884887685295017, + "eval_train_loss": 2.2369027137756348, + "eval_train_loss/all": 2.0669429302215576, + "eval_train_loss/end_span": 1.307533860206604, + "eval_train_perplexity/batch": 7.900633335113525, + "eval_train_perplexity/end_span": 3.697045087814331, + "eval_train_perplexity/fim": 2.3731391429901123, + "eval_train_perplexity/first_seq": 15.063639640808105, + "eval_train_perplexity/last_seq": 9.591606140136719, + "eval_train_perplexity/second_seq": 14.532164573669434, + "eval_train_perplexity/seq": 9.08755111694336, + "eval_train_reconstruction/all": 0.27150315046310425, + "eval_train_reconstruction/end_span": 0.6934643983840942, + 
"eval_train_reconstruction/fim": 0.16285255551338196, + "eval_train_reconstruction/first_seq": 0.16210339963436127, + "eval_train_reconstruction/last_seq": 0.3062501549720764, + "eval_train_reconstruction/second_seq": 0.17651161551475525, + "eval_train_runtime": 661.3334, + "eval_train_samples_per_second": 0.29, + "eval_train_steps_per_second": 0.29, + "step": 2650 + }, + { + "epoch": 0.009922189148258395, + "grad_norm": 0.36978384852409363, + "learning_rate": 0.0006, + "loss": 2.3149, + "step": 2660 + }, + { + "epoch": 0.009959490611221773, + "grad_norm": 0.28990164399147034, + "learning_rate": 0.0006, + "loss": 2.3573, + "step": 2670 + }, + { + "epoch": 0.00999679207418515, + "grad_norm": 0.45847243070602417, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 2680 + }, + { + "epoch": 0.010034093537148526, + "grad_norm": 0.5381180047988892, + "learning_rate": 0.0006, + "loss": 2.1478, + "step": 2690 + }, + { + "epoch": 0.010071395000111904, + "grad_norm": 0.4938194453716278, + "learning_rate": 0.0006, + "loss": 2.192, + "step": 2700 + }, + { + "epoch": 0.010071395000111904, + "eval_valid_loss": 2.2383320331573486, + "eval_valid_loss/all": 2.0963990688323975, + "eval_valid_loss/end_span": 1.4336568117141724, + "eval_valid_perplexity/batch": 8.13681697845459, + "eval_valid_perplexity/end_span": 4.194007873535156, + "eval_valid_perplexity/fim": 2.4839160442352295, + "eval_valid_perplexity/first_seq": 15.251800537109375, + "eval_valid_perplexity/last_seq": 9.702035903930664, + "eval_valid_perplexity/second_seq": 13.959261894226074, + "eval_valid_perplexity/seq": 9.16515827178955, + "eval_valid_reconstruction/all": 0.2813456058502197, + "eval_valid_reconstruction/end_span": 0.670672595500946, + "eval_valid_reconstruction/fim": 0.17087748646736145, + "eval_valid_reconstruction/first_seq": 0.1618269830942154, + "eval_valid_reconstruction/last_seq": 0.3027329444885254, + "eval_valid_reconstruction/second_seq": 0.1907668560743332, + "eval_valid_runtime": 737.6201, + 
"eval_valid_samples_per_second": 0.26, + "eval_valid_steps_per_second": 0.26, + "step": 2700 + }, + { + "epoch": 0.010071395000111904, + "eval_train_loss": 2.2333905696868896, + "eval_train_loss/all": 2.06390643119812, + "eval_train_loss/end_span": 1.3978935480117798, + "eval_train_perplexity/batch": 7.876679420471191, + "eval_train_perplexity/end_span": 4.046667098999023, + "eval_train_perplexity/fim": 2.152695417404175, + "eval_train_perplexity/first_seq": 15.515113830566406, + "eval_train_perplexity/last_seq": 9.626914978027344, + "eval_train_perplexity/second_seq": 14.351104736328125, + "eval_train_perplexity/seq": 9.061674118041992, + "eval_train_reconstruction/all": 0.27216607332229614, + "eval_train_reconstruction/end_span": 0.6826670169830322, + "eval_train_reconstruction/fim": 0.14461824297904968, + "eval_train_reconstruction/first_seq": 0.15116646885871887, + "eval_train_reconstruction/last_seq": 0.3019360601902008, + "eval_train_reconstruction/second_seq": 0.18156394362449646, + "eval_train_runtime": 636.2695, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 2700 + }, + { + "epoch": 0.010108696463075282, + "grad_norm": 0.5465103387832642, + "learning_rate": 0.0006, + "loss": 2.3583, + "step": 2710 + }, + { + "epoch": 0.01014599792603866, + "grad_norm": 0.2723056375980377, + "learning_rate": 0.0006, + "loss": 2.3051, + "step": 2720 + }, + { + "epoch": 0.010183299389002037, + "grad_norm": 0.4918171167373657, + "learning_rate": 0.0006, + "loss": 2.2651, + "step": 2730 + }, + { + "epoch": 0.010220600851965415, + "grad_norm": 0.4382787048816681, + "learning_rate": 0.0006, + "loss": 2.4225, + "step": 2740 + }, + { + "epoch": 0.01025790231492879, + "grad_norm": 0.45062056183815, + "learning_rate": 0.0006, + "loss": 2.303, + "step": 2750 + }, + { + "epoch": 0.01025790231492879, + "eval_valid_loss": 2.2378711700439453, + "eval_valid_loss/all": 2.0957064628601074, + "eval_valid_loss/end_span": 1.431689977645874, + 
"eval_valid_perplexity/batch": 8.131183624267578, + "eval_valid_perplexity/end_span": 4.18576717376709, + "eval_valid_perplexity/fim": 2.4632441997528076, + "eval_valid_perplexity/first_seq": 14.93223762512207, + "eval_valid_perplexity/last_seq": 9.86180591583252, + "eval_valid_perplexity/second_seq": 13.98403549194336, + "eval_valid_perplexity/seq": 9.154555320739746, + "eval_valid_reconstruction/all": 0.2819445729255676, + "eval_valid_reconstruction/end_span": 0.670857310295105, + "eval_valid_reconstruction/fim": 0.17022569477558136, + "eval_valid_reconstruction/first_seq": 0.16416865587234497, + "eval_valid_reconstruction/last_seq": 0.30440229177474976, + "eval_valid_reconstruction/second_seq": 0.19314855337142944, + "eval_valid_runtime": 625.954, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 2750 + }, + { + "epoch": 0.01025790231492879, + "eval_train_loss": 2.232393980026245, + "eval_train_loss/all": 2.0623371601104736, + "eval_train_loss/end_span": 1.3729870319366455, + "eval_train_perplexity/batch": 7.864328384399414, + "eval_train_perplexity/end_span": 3.9471232891082764, + "eval_train_perplexity/fim": 2.0527749061584473, + "eval_train_perplexity/first_seq": 15.421234130859375, + "eval_train_perplexity/last_seq": 9.403865814208984, + "eval_train_perplexity/second_seq": 13.877188682556152, + "eval_train_perplexity/seq": 9.040068626403809, + "eval_train_reconstruction/all": 0.27310889959335327, + "eval_train_reconstruction/end_span": 0.6861286759376526, + "eval_train_reconstruction/fim": 0.1365414261817932, + "eval_train_reconstruction/first_seq": 0.15278296172618866, + "eval_train_reconstruction/last_seq": 0.31286242604255676, + "eval_train_reconstruction/second_seq": 0.19265058636665344, + "eval_train_runtime": 631.3306, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 2750 + }, + { + "epoch": 0.010295203777892168, + "grad_norm": 0.43150272965431213, + "learning_rate": 
0.0006, + "loss": 2.2867, + "step": 2760 + }, + { + "epoch": 0.010332505240855546, + "grad_norm": 0.34960973262786865, + "learning_rate": 0.0006, + "loss": 2.3431, + "step": 2770 + }, + { + "epoch": 0.010369806703818924, + "grad_norm": 0.46093183755874634, + "learning_rate": 0.0006, + "loss": 2.3035, + "step": 2780 + }, + { + "epoch": 0.010407108166782301, + "grad_norm": 0.40990370512008667, + "learning_rate": 0.0006, + "loss": 2.3704, + "step": 2790 + }, + { + "epoch": 0.01044440962974568, + "grad_norm": 0.5488452315330505, + "learning_rate": 0.0006, + "loss": 2.1743, + "step": 2800 + }, + { + "epoch": 0.01044440962974568, + "eval_valid_loss": 2.2448220252990723, + "eval_valid_loss/all": 2.103107452392578, + "eval_valid_loss/end_span": 1.1652129888534546, + "eval_valid_perplexity/batch": 8.191585540771484, + "eval_valid_perplexity/end_span": 3.2066056728363037, + "eval_valid_perplexity/fim": 2.413757085800171, + "eval_valid_perplexity/first_seq": 15.231440544128418, + "eval_valid_perplexity/last_seq": 9.688790321350098, + "eval_valid_perplexity/second_seq": 13.722308158874512, + "eval_valid_perplexity/seq": 9.246403694152832, + "eval_valid_reconstruction/all": 0.27938395738601685, + "eval_valid_reconstruction/end_span": 0.7356024384498596, + "eval_valid_reconstruction/fim": 0.16526278853416443, + "eval_valid_reconstruction/first_seq": 0.15872450172901154, + "eval_valid_reconstruction/last_seq": 0.30536654591560364, + "eval_valid_reconstruction/second_seq": 0.19833673536777496, + "eval_valid_runtime": 633.014, + "eval_valid_samples_per_second": 0.303, + "eval_valid_steps_per_second": 0.303, + "step": 2800 + }, + { + "epoch": 0.01044440962974568, + "eval_train_loss": 2.2407124042510986, + "eval_train_loss/all": 2.0710935592651367, + "eval_train_loss/end_span": 1.1277847290039062, + "eval_train_perplexity/batch": 7.9334940910339355, + "eval_train_perplexity/end_span": 3.088806390762329, + "eval_train_perplexity/fim": 2.260180950164795, + 
"eval_train_perplexity/first_seq": 15.384224891662598, + "eval_train_perplexity/last_seq": 9.953792572021484, + "eval_train_perplexity/second_seq": 14.158724784851074, + "eval_train_perplexity/seq": 9.144603729248047, + "eval_train_reconstruction/all": 0.27010586857795715, + "eval_train_reconstruction/end_span": 0.7466735243797302, + "eval_train_reconstruction/fim": 0.15252169966697693, + "eval_train_reconstruction/first_seq": 0.15441571176052094, + "eval_train_reconstruction/last_seq": 0.29414254426956177, + "eval_train_reconstruction/second_seq": 0.1848381757736206, + "eval_train_runtime": 628.7008, + "eval_train_samples_per_second": 0.305, + "eval_train_steps_per_second": 0.305, + "step": 2800 + }, + { + "epoch": 0.010481711092709057, + "grad_norm": 0.7592857480049133, + "learning_rate": 0.0006, + "loss": 2.3418, + "step": 2810 + }, + { + "epoch": 0.010519012555672433, + "grad_norm": 0.5863295793533325, + "learning_rate": 0.0006, + "loss": 2.2254, + "step": 2820 + }, + { + "epoch": 0.01055631401863581, + "grad_norm": 0.38668978214263916, + "learning_rate": 0.0006, + "loss": 2.3775, + "step": 2830 + }, + { + "epoch": 0.010593615481599188, + "grad_norm": 0.4689910411834717, + "learning_rate": 0.0006, + "loss": 2.3104, + "step": 2840 + }, + { + "epoch": 0.010630916944562566, + "grad_norm": 0.4519980251789093, + "learning_rate": 0.0006, + "loss": 2.1094, + "step": 2850 + }, + { + "epoch": 0.010630916944562566, + "eval_valid_loss": 2.238795042037964, + "eval_valid_loss/all": 2.0965449810028076, + "eval_valid_loss/end_span": 1.3437488079071045, + "eval_valid_perplexity/batch": 8.138004302978516, + "eval_valid_perplexity/end_span": 3.8333871364593506, + "eval_valid_perplexity/fim": 2.5405073165893555, + "eval_valid_perplexity/first_seq": 15.103460311889648, + "eval_valid_perplexity/last_seq": 9.481815338134766, + "eval_valid_perplexity/second_seq": 13.682514190673828, + "eval_valid_perplexity/seq": 9.166695594787598, + "eval_valid_reconstruction/all": 
0.28195425868034363, + "eval_valid_reconstruction/end_span": 0.6863104701042175, + "eval_valid_reconstruction/fim": 0.17596358060836792, + "eval_valid_reconstruction/first_seq": 0.16080453991889954, + "eval_valid_reconstruction/last_seq": 0.3102346360683441, + "eval_valid_reconstruction/second_seq": 0.20119808614253998, + "eval_valid_runtime": 629.1521, + "eval_valid_samples_per_second": 0.305, + "eval_valid_steps_per_second": 0.305, + "step": 2850 + }, + { + "epoch": 0.010630916944562566, + "eval_train_loss": 2.2354557514190674, + "eval_train_loss/all": 2.065437078475952, + "eval_train_loss/end_span": 1.316832423210144, + "eval_train_perplexity/batch": 7.888745307922363, + "eval_train_perplexity/end_span": 3.7315826416015625, + "eval_train_perplexity/fim": 2.2824923992156982, + "eval_train_perplexity/first_seq": 15.56185531616211, + "eval_train_perplexity/last_seq": 9.293750762939453, + "eval_train_perplexity/second_seq": 14.294800758361816, + "eval_train_perplexity/seq": 9.072797775268555, + "eval_train_reconstruction/all": 0.27245569229125977, + "eval_train_reconstruction/end_span": 0.6954235434532166, + "eval_train_reconstruction/fim": 0.15606914460659027, + "eval_train_reconstruction/first_seq": 0.15200689435005188, + "eval_train_reconstruction/last_seq": 0.31392547488212585, + "eval_train_reconstruction/second_seq": 0.18116015195846558, + "eval_train_runtime": 633.7779, + "eval_train_samples_per_second": 0.303, + "eval_train_steps_per_second": 0.303, + "step": 2850 + }, + { + "epoch": 0.010668218407525944, + "grad_norm": 0.40998339653015137, + "learning_rate": 0.0006, + "loss": 2.454, + "step": 2860 + }, + { + "epoch": 0.010705519870489321, + "grad_norm": 0.36535611748695374, + "learning_rate": 0.0006, + "loss": 2.3692, + "step": 2870 + }, + { + "epoch": 0.010742821333452697, + "grad_norm": 0.48843252658843994, + "learning_rate": 0.0006, + "loss": 2.1627, + "step": 2880 + }, + { + "epoch": 0.010780122796416075, + "grad_norm": 0.46384456753730774, + 
"learning_rate": 0.0006, + "loss": 2.2598, + "step": 2890 + }, + { + "epoch": 0.010817424259379453, + "grad_norm": 0.5006093382835388, + "learning_rate": 0.0006, + "loss": 2.3422, + "step": 2900 + }, + { + "epoch": 0.010817424259379453, + "eval_valid_loss": 2.23498272895813, + "eval_valid_loss/all": 2.0931849479675293, + "eval_valid_loss/end_span": 1.248676061630249, + "eval_valid_perplexity/batch": 8.110706329345703, + "eval_valid_perplexity/end_span": 3.485724925994873, + "eval_valid_perplexity/fim": 2.696570634841919, + "eval_valid_perplexity/first_seq": 15.19736099243164, + "eval_valid_perplexity/last_seq": 9.638774871826172, + "eval_valid_perplexity/second_seq": 13.572365760803223, + "eval_valid_perplexity/seq": 9.136712074279785, + "eval_valid_reconstruction/all": 0.28261980414390564, + "eval_valid_reconstruction/end_span": 0.711817741394043, + "eval_valid_reconstruction/fim": 0.18864195048809052, + "eval_valid_reconstruction/first_seq": 0.15994293987751007, + "eval_valid_reconstruction/last_seq": 0.30513039231300354, + "eval_valid_reconstruction/second_seq": 0.1990858018398285, + "eval_valid_runtime": 626.7595, + "eval_valid_samples_per_second": 0.306, + "eval_valid_steps_per_second": 0.306, + "step": 2900 + }, + { + "epoch": 0.010817424259379453, + "eval_train_loss": 2.229048490524292, + "eval_train_loss/all": 2.0597236156463623, + "eval_train_loss/end_span": 1.2203004360198975, + "eval_train_perplexity/batch": 7.843801498413086, + "eval_train_perplexity/end_span": 3.3882055282592773, + "eval_train_perplexity/fim": 2.3369388580322266, + "eval_train_perplexity/first_seq": 15.498188018798828, + "eval_train_perplexity/last_seq": 9.383918762207031, + "eval_train_perplexity/second_seq": 13.994264602661133, + "eval_train_perplexity/seq": 9.020405769348145, + "eval_train_reconstruction/all": 0.2735934853553772, + "eval_train_reconstruction/end_span": 0.7247974872589111, + "eval_train_reconstruction/fim": 0.16156534850597382, + 
"eval_train_reconstruction/first_seq": 0.15307553112506866, + "eval_train_reconstruction/last_seq": 0.31131550669670105, + "eval_train_reconstruction/second_seq": 0.19073636829853058, + "eval_train_runtime": 631.9305, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 2900 + }, + { + "epoch": 0.01085472572234283, + "grad_norm": 0.518629789352417, + "learning_rate": 0.0006, + "loss": 2.2897, + "step": 2910 + }, + { + "epoch": 0.010892027185306208, + "grad_norm": 0.4529993236064911, + "learning_rate": 0.0006, + "loss": 2.2474, + "step": 2920 + }, + { + "epoch": 0.010929328648269586, + "grad_norm": 0.3913392722606659, + "learning_rate": 0.0006, + "loss": 2.2569, + "step": 2930 + }, + { + "epoch": 0.010966630111232962, + "grad_norm": 0.43155673146247864, + "learning_rate": 0.0006, + "loss": 2.3941, + "step": 2940 + }, + { + "epoch": 0.01100393157419634, + "grad_norm": 1.7701342105865479, + "learning_rate": 0.0006, + "loss": 2.4038, + "step": 2950 + }, + { + "epoch": 0.01100393157419634, + "eval_valid_loss": 2.2452151775360107, + "eval_valid_loss/all": 2.102062225341797, + "eval_valid_loss/end_span": 1.3129692077636719, + "eval_valid_perplexity/batch": 8.183028221130371, + "eval_valid_perplexity/end_span": 3.7171945571899414, + "eval_valid_perplexity/fim": 2.2711803913116455, + "eval_valid_perplexity/first_seq": 14.960773468017578, + "eval_valid_perplexity/last_seq": 9.06563949584961, + "eval_valid_perplexity/second_seq": 13.59514331817627, + "eval_valid_perplexity/seq": 9.20980167388916, + "eval_valid_reconstruction/all": 0.2804698050022125, + "eval_valid_reconstruction/end_span": 0.688946008682251, + "eval_valid_reconstruction/fim": 0.15473392605781555, + "eval_valid_reconstruction/first_seq": 0.1638316512107849, + "eval_valid_reconstruction/last_seq": 0.32469668984413147, + "eval_valid_reconstruction/second_seq": 0.19861139357089996, + "eval_valid_runtime": 633.2245, + "eval_valid_samples_per_second": 0.303, + 
"eval_valid_steps_per_second": 0.303, + "step": 2950 + }, + { + "epoch": 0.01100393157419634, + "eval_train_loss": 2.2400753498077393, + "eval_train_loss/all": 2.0689167976379395, + "eval_train_loss/end_span": 1.2713981866836548, + "eval_train_perplexity/batch": 7.916243553161621, + "eval_train_perplexity/end_span": 3.5658347606658936, + "eval_train_perplexity/fim": 2.090874671936035, + "eval_train_perplexity/first_seq": 15.182646751403809, + "eval_train_perplexity/last_seq": 9.402154922485352, + "eval_train_perplexity/second_seq": 14.46171760559082, + "eval_train_perplexity/seq": 9.098865509033203, + "eval_train_reconstruction/all": 0.2713530361652374, + "eval_train_reconstruction/end_span": 0.700893759727478, + "eval_train_reconstruction/fim": 0.1389196366071701, + "eval_train_reconstruction/first_seq": 0.15752404928207397, + "eval_train_reconstruction/last_seq": 0.3140520453453064, + "eval_train_reconstruction/second_seq": 0.18198175728321075, + "eval_train_runtime": 642.4467, + "eval_train_samples_per_second": 0.299, + "eval_train_steps_per_second": 0.299, + "step": 2950 + }, + { + "epoch": 0.011041233037159717, + "grad_norm": 0.744533121585846, + "learning_rate": 0.0006, + "loss": 2.2245, + "step": 2960 + }, + { + "epoch": 0.011078534500123095, + "grad_norm": 0.3928196430206299, + "learning_rate": 0.0006, + "loss": 2.3438, + "step": 2970 + }, + { + "epoch": 0.011115835963086473, + "grad_norm": 0.43920326232910156, + "learning_rate": 0.0006, + "loss": 2.2122, + "step": 2980 + }, + { + "epoch": 0.01115313742604985, + "grad_norm": 0.4215492308139801, + "learning_rate": 0.0006, + "loss": 2.171, + "step": 2990 + }, + { + "epoch": 0.011190438889013226, + "grad_norm": 0.5520058274269104, + "learning_rate": 0.0006, + "loss": 2.1824, + "step": 3000 + }, + { + "epoch": 0.011190438889013226, + "eval_valid_loss": 2.237058401107788, + "eval_valid_loss/all": 2.0950167179107666, + "eval_valid_loss/end_span": 1.204143762588501, + "eval_valid_perplexity/batch": 
8.125576972961426, + "eval_valid_perplexity/end_span": 3.3339033126831055, + "eval_valid_perplexity/fim": 2.8758676052093506, + "eval_valid_perplexity/first_seq": 14.596900939941406, + "eval_valid_perplexity/last_seq": 9.487760543823242, + "eval_valid_perplexity/second_seq": 13.72890567779541, + "eval_valid_perplexity/seq": 9.152220726013184, + "eval_valid_reconstruction/all": 0.2822195291519165, + "eval_valid_reconstruction/end_span": 0.7162766456604004, + "eval_valid_reconstruction/fim": 0.20070290565490723, + "eval_valid_reconstruction/first_seq": 0.17344136536121368, + "eval_valid_reconstruction/last_seq": 0.3133264183998108, + "eval_valid_reconstruction/second_seq": 0.19695094227790833, + "eval_valid_runtime": 637.3795, + "eval_valid_samples_per_second": 0.301, + "eval_valid_steps_per_second": 0.301, + "step": 3000 + }, + { + "epoch": 0.011190438889013226, + "eval_train_loss": 2.233081340789795, + "eval_train_loss/all": 2.063384771347046, + "eval_train_loss/end_span": 1.177062749862671, + "eval_train_perplexity/batch": 7.8725714683532715, + "eval_train_perplexity/end_span": 3.2448294162750244, + "eval_train_perplexity/fim": 1.9237778186798096, + "eval_train_perplexity/first_seq": 15.537031173706055, + "eval_train_perplexity/last_seq": 9.39516544342041, + "eval_train_perplexity/second_seq": 14.404738426208496, + "eval_train_perplexity/seq": 9.054924964904785, + "eval_train_reconstruction/all": 0.2726360559463501, + "eval_train_reconstruction/end_span": 0.7280105352401733, + "eval_train_reconstruction/fim": 0.12405122071504593, + "eval_train_reconstruction/first_seq": 0.15201930701732635, + "eval_train_reconstruction/last_seq": 0.3097652196884155, + "eval_train_reconstruction/second_seq": 0.18234018981456757, + "eval_train_runtime": 634.9558, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 3000 + }, + { + "epoch": 0.011227740351976604, + "grad_norm": 0.41169795393943787, + "learning_rate": 0.0006, + "loss": 2.2711, + 
"step": 3010 + }, + { + "epoch": 0.011265041814939982, + "grad_norm": 0.3990393280982971, + "learning_rate": 0.0006, + "loss": 2.0544, + "step": 3020 + }, + { + "epoch": 0.01130234327790336, + "grad_norm": 0.8220118284225464, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 3030 + }, + { + "epoch": 0.011339644740866737, + "grad_norm": 8.324554443359375, + "learning_rate": 0.0006, + "loss": 2.3528, + "step": 3040 + }, + { + "epoch": 0.011376946203830115, + "grad_norm": 0.4924704432487488, + "learning_rate": 0.0006, + "loss": 2.3232, + "step": 3050 + }, + { + "epoch": 0.011376946203830115, + "eval_valid_loss": 2.244220733642578, + "eval_valid_loss/all": 2.101409673690796, + "eval_valid_loss/end_span": 1.325554370880127, + "eval_valid_perplexity/batch": 8.177689552307129, + "eval_valid_perplexity/end_span": 3.7642714977264404, + "eval_valid_perplexity/fim": 2.3290698528289795, + "eval_valid_perplexity/first_seq": 14.991559982299805, + "eval_valid_perplexity/last_seq": 9.544842720031738, + "eval_valid_perplexity/second_seq": 13.966415405273438, + "eval_valid_perplexity/seq": 9.212881088256836, + "eval_valid_reconstruction/all": 0.280001163482666, + "eval_valid_reconstruction/end_span": 0.6779699325561523, + "eval_valid_reconstruction/fim": 0.15766319632530212, + "eval_valid_reconstruction/first_seq": 0.16606874763965607, + "eval_valid_reconstruction/last_seq": 0.30813997983932495, + "eval_valid_reconstruction/second_seq": 0.18979860842227936, + "eval_valid_runtime": 636.2576, + "eval_valid_samples_per_second": 0.302, + "eval_valid_steps_per_second": 0.302, + "step": 3050 + }, + { + "epoch": 0.011376946203830115, + "eval_train_loss": 2.2414300441741943, + "eval_train_loss/all": 2.0712904930114746, + "eval_train_loss/end_span": 1.2895894050598145, + "eval_train_perplexity/batch": 7.935056686401367, + "eval_train_perplexity/end_span": 3.6312952041625977, + "eval_train_perplexity/fim": 2.223907470703125, + "eval_train_perplexity/first_seq": 15.2896728515625, + 
"eval_train_perplexity/last_seq": 9.228426933288574, + "eval_train_perplexity/second_seq": 14.022445678710938, + "eval_train_perplexity/seq": 9.131929397583008, + "eval_train_reconstruction/all": 0.27037540078163147, + "eval_train_reconstruction/end_span": 0.6892746090888977, + "eval_train_reconstruction/fim": 0.1498618721961975, + "eval_train_reconstruction/first_seq": 0.16029909253120422, + "eval_train_reconstruction/last_seq": 0.31957218050956726, + "eval_train_reconstruction/second_seq": 0.18954826891422272, + "eval_train_runtime": 634.5301, + "eval_train_samples_per_second": 0.303, + "eval_train_steps_per_second": 0.303, + "step": 3050 + }, + { + "epoch": 0.01141424766679349, + "grad_norm": 3.798004627227783, + "learning_rate": 0.0006, + "loss": 2.298, + "step": 3060 + }, + { + "epoch": 0.011451549129756869, + "grad_norm": 0.42388078570365906, + "learning_rate": 0.0006, + "loss": 2.2543, + "step": 3070 + }, + { + "epoch": 0.011488850592720246, + "grad_norm": 0.4335278570652008, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 3080 + }, + { + "epoch": 0.011526152055683624, + "grad_norm": 1.824471116065979, + "learning_rate": 0.0006, + "loss": 2.2741, + "step": 3090 + }, + { + "epoch": 0.011563453518647002, + "grad_norm": 0.4980129599571228, + "learning_rate": 0.0006, + "loss": 2.3572, + "step": 3100 + }, + { + "epoch": 0.011563453518647002, + "eval_valid_loss": 2.250598192214966, + "eval_valid_loss/all": 2.1078264713287354, + "eval_valid_loss/end_span": 1.3677912950515747, + "eval_valid_perplexity/batch": 8.23033332824707, + "eval_valid_perplexity/end_span": 3.926668167114258, + "eval_valid_perplexity/fim": 2.3704962730407715, + "eval_valid_perplexity/first_seq": 15.080050468444824, + "eval_valid_perplexity/last_seq": 10.020319938659668, + "eval_valid_perplexity/second_seq": 13.960789680480957, + "eval_valid_perplexity/seq": 9.285089492797852, + "eval_valid_reconstruction/all": 0.2787753641605377, + "eval_valid_reconstruction/end_span": 0.679497241973877, 
+ "eval_valid_reconstruction/fim": 0.1590288132429123, + "eval_valid_reconstruction/first_seq": 0.16616708040237427, + "eval_valid_reconstruction/last_seq": 0.2944413125514984, + "eval_valid_reconstruction/second_seq": 0.19234125316143036, + "eval_valid_runtime": 640.9569, + "eval_valid_samples_per_second": 0.3, + "eval_valid_steps_per_second": 0.3, + "step": 3100 + }, + { + "epoch": 0.011563453518647002, + "eval_train_loss": 2.2473790645599365, + "eval_train_loss/all": 2.0764670372009277, + "eval_train_loss/end_span": 1.3288748264312744, + "eval_train_perplexity/batch": 7.976239204406738, + "eval_train_perplexity/end_span": 3.7767913341522217, + "eval_train_perplexity/fim": 2.1269707679748535, + "eval_train_perplexity/first_seq": 15.476642608642578, + "eval_train_perplexity/last_seq": 9.579445838928223, + "eval_train_perplexity/second_seq": 14.75365161895752, + "eval_train_perplexity/seq": 9.193331718444824, + "eval_train_reconstruction/all": 0.2693946957588196, + "eval_train_reconstruction/end_span": 0.6909093856811523, + "eval_train_reconstruction/fim": 0.13951371610164642, + "eval_train_reconstruction/first_seq": 0.1550808846950531, + "eval_train_reconstruction/last_seq": 0.304462730884552, + "eval_train_reconstruction/second_seq": 0.1737789809703827, + "eval_train_runtime": 631.6904, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 3100 + }, + { + "epoch": 0.01160075498161038, + "grad_norm": 0.3510059416294098, + "learning_rate": 0.0006, + "loss": 2.2604, + "step": 3110 + }, + { + "epoch": 0.011638056444573755, + "grad_norm": 0.7145357131958008, + "learning_rate": 0.0006, + "loss": 2.1687, + "step": 3120 + }, + { + "epoch": 0.011675357907537133, + "grad_norm": 0.4780406653881073, + "learning_rate": 0.0006, + "loss": 2.204, + "step": 3130 + }, + { + "epoch": 0.01171265937050051, + "grad_norm": 0.39783942699432373, + "learning_rate": 0.0006, + "loss": 2.1808, + "step": 3140 + }, + { + "epoch": 0.011749960833463888, + 
"grad_norm": 0.39985620975494385, + "learning_rate": 0.0006, + "loss": 2.2427, + "step": 3150 + }, + { + "epoch": 0.011749960833463888, + "eval_valid_loss": 2.245640277862549, + "eval_valid_loss/all": 2.1039116382598877, + "eval_valid_loss/end_span": 1.425553798675537, + "eval_valid_perplexity/batch": 8.198175430297852, + "eval_valid_perplexity/end_span": 4.160161018371582, + "eval_valid_perplexity/fim": 2.475196361541748, + "eval_valid_perplexity/first_seq": 15.20156478881836, + "eval_valid_perplexity/last_seq": 9.77794361114502, + "eval_valid_perplexity/second_seq": 14.121078491210938, + "eval_valid_perplexity/seq": 9.24499797821045, + "eval_valid_reconstruction/all": 0.2795087397098541, + "eval_valid_reconstruction/end_span": 0.6656819581985474, + "eval_valid_reconstruction/fim": 0.16932125389575958, + "eval_valid_reconstruction/first_seq": 0.1578778177499771, + "eval_valid_reconstruction/last_seq": 0.30497628450393677, + "eval_valid_reconstruction/second_seq": 0.1884879469871521, + "eval_valid_runtime": 626.0237, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 3150 + }, + { + "epoch": 0.011749960833463888, + "eval_train_loss": 2.239335775375366, + "eval_train_loss/all": 2.069234848022461, + "eval_train_loss/end_span": 1.3970671892166138, + "eval_train_perplexity/batch": 7.918761730194092, + "eval_train_perplexity/end_span": 4.0433244705200195, + "eval_train_perplexity/fim": 2.2372519969940186, + "eval_train_perplexity/first_seq": 15.7595796585083, + "eval_train_perplexity/last_seq": 9.563583374023438, + "eval_train_perplexity/second_seq": 14.461175918579102, + "eval_train_perplexity/seq": 9.117464065551758, + "eval_train_reconstruction/all": 0.2712610960006714, + "eval_train_reconstruction/end_span": 0.6751156449317932, + "eval_train_reconstruction/fim": 0.1509379893541336, + "eval_train_reconstruction/first_seq": 0.14426498115062714, + "eval_train_reconstruction/last_seq": 0.3074382245540619, + 
"eval_train_reconstruction/second_seq": 0.1789693385362625, + "eval_train_runtime": 633.1567, + "eval_train_samples_per_second": 0.303, + "eval_train_steps_per_second": 0.303, + "step": 3150 + }, + { + "epoch": 0.011787262296427266, + "grad_norm": 0.34158405661582947, + "learning_rate": 0.0006, + "loss": 2.3158, + "step": 3160 + }, + { + "epoch": 0.011824563759390644, + "grad_norm": 0.3013451099395752, + "learning_rate": 0.0006, + "loss": 2.2867, + "step": 3170 + }, + { + "epoch": 0.011861865222354022, + "grad_norm": 0.3809345066547394, + "learning_rate": 0.0006, + "loss": 2.3402, + "step": 3180 + }, + { + "epoch": 0.011899166685317398, + "grad_norm": 0.479637086391449, + "learning_rate": 0.0006, + "loss": 2.3232, + "step": 3190 + }, + { + "epoch": 0.011936468148280775, + "grad_norm": 0.549404501914978, + "learning_rate": 0.0006, + "loss": 2.2715, + "step": 3200 + }, + { + "epoch": 0.011936468148280775, + "eval_valid_loss": 2.237370491027832, + "eval_valid_loss/all": 2.0957000255584717, + "eval_valid_loss/end_span": 1.327511191368103, + "eval_valid_perplexity/batch": 8.131131172180176, + "eval_valid_perplexity/end_span": 3.7716448307037354, + "eval_valid_perplexity/fim": 2.565765380859375, + "eval_valid_perplexity/first_seq": 14.832175254821777, + "eval_valid_perplexity/last_seq": 9.675580024719238, + "eval_valid_perplexity/second_seq": 14.354745864868164, + "eval_valid_perplexity/seq": 9.158791542053223, + "eval_valid_reconstruction/all": 0.2820015549659729, + "eval_valid_reconstruction/end_span": 0.6933510303497314, + "eval_valid_reconstruction/fim": 0.17845632135868073, + "eval_valid_reconstruction/first_seq": 0.16668355464935303, + "eval_valid_reconstruction/last_seq": 0.304824560880661, + "eval_valid_reconstruction/second_seq": 0.1812986582517624, + "eval_valid_runtime": 631.9082, + "eval_valid_samples_per_second": 0.304, + "eval_valid_steps_per_second": 0.304, + "step": 3200 + }, + { + "epoch": 0.011936468148280775, + "eval_train_loss": 2.233816146850586, + 
"eval_train_loss/all": 2.0641353130340576, + "eval_train_loss/end_span": 1.2982984781265259, + "eval_train_perplexity/batch": 7.878482341766357, + "eval_train_perplexity/end_span": 3.6630585193634033, + "eval_train_perplexity/fim": 2.17440128326416, + "eval_train_perplexity/first_seq": 15.555214881896973, + "eval_train_perplexity/last_seq": 9.276386260986328, + "eval_train_perplexity/second_seq": 14.252276420593262, + "eval_train_perplexity/seq": 9.065841674804688, + "eval_train_reconstruction/all": 0.2724575698375702, + "eval_train_reconstruction/end_span": 0.7004371881484985, + "eval_train_reconstruction/fim": 0.1472661793231964, + "eval_train_reconstruction/first_seq": 0.1476985216140747, + "eval_train_reconstruction/last_seq": 0.31370487809181213, + "eval_train_reconstruction/second_seq": 0.17969566583633423, + "eval_train_runtime": 637.8532, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 3200 + }, + { + "epoch": 0.011973769611244153, + "grad_norm": 0.4011614918708801, + "learning_rate": 0.0006, + "loss": 2.1354, + "step": 3210 + }, + { + "epoch": 0.01201107107420753, + "grad_norm": 0.39745160937309265, + "learning_rate": 0.0006, + "loss": 2.2798, + "step": 3220 + }, + { + "epoch": 0.012048372537170908, + "grad_norm": 0.37895891070365906, + "learning_rate": 0.0006, + "loss": 2.3148, + "step": 3230 + }, + { + "epoch": 0.012085674000134286, + "grad_norm": 0.38363516330718994, + "learning_rate": 0.0006, + "loss": 2.3315, + "step": 3240 + }, + { + "epoch": 0.012122975463097662, + "grad_norm": 0.3017808496952057, + "learning_rate": 0.0006, + "loss": 2.3081, + "step": 3250 + }, + { + "epoch": 0.012122975463097662, + "eval_valid_loss": 2.236523151397705, + "eval_valid_loss/all": 2.0947093963623047, + "eval_valid_loss/end_span": 1.2298357486724854, + "eval_valid_perplexity/batch": 8.123080253601074, + "eval_valid_perplexity/end_span": 3.4206676483154297, + "eval_valid_perplexity/fim": 2.232482433319092, + 
"eval_valid_perplexity/first_seq": 15.221660614013672, + "eval_valid_perplexity/last_seq": 9.28419017791748, + "eval_valid_perplexity/second_seq": 13.77272891998291, + "eval_valid_perplexity/seq": 9.145561218261719, + "eval_valid_reconstruction/all": 0.2827528417110443, + "eval_valid_reconstruction/end_span": 0.7189801335334778, + "eval_valid_reconstruction/fim": 0.1524771749973297, + "eval_valid_reconstruction/first_seq": 0.1622503697872162, + "eval_valid_reconstruction/last_seq": 0.3174804449081421, + "eval_valid_reconstruction/second_seq": 0.19553202390670776, + "eval_valid_runtime": 628.955, + "eval_valid_samples_per_second": 0.305, + "eval_valid_steps_per_second": 0.305, + "step": 3250 + }, + { + "epoch": 0.012122975463097662, + "eval_train_loss": 2.2320914268493652, + "eval_train_loss/all": 2.0625360012054443, + "eval_train_loss/end_span": 1.1960901021957397, + "eval_train_perplexity/batch": 7.86589241027832, + "eval_train_perplexity/end_span": 3.3071608543395996, + "eval_train_perplexity/fim": 2.1487934589385986, + "eval_train_perplexity/first_seq": 15.59323787689209, + "eval_train_perplexity/last_seq": 9.091526985168457, + "eval_train_perplexity/second_seq": 14.117402076721191, + "eval_train_perplexity/seq": 9.047944068908691, + "eval_train_reconstruction/all": 0.27346500754356384, + "eval_train_reconstruction/end_span": 0.7318431735038757, + "eval_train_reconstruction/fim": 0.14618314802646637, + "eval_train_reconstruction/first_seq": 0.1504058986902237, + "eval_train_reconstruction/last_seq": 0.3209180533885956, + "eval_train_reconstruction/second_seq": 0.18628592789173126, + "eval_train_runtime": 641.0424, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 3250 + }, + { + "epoch": 0.01216027692606104, + "grad_norm": 0.3448388874530792, + "learning_rate": 0.0006, + "loss": 2.3865, + "step": 3260 + }, + { + "epoch": 0.012197578389024417, + "grad_norm": 0.47547927498817444, + "learning_rate": 0.0006, + "loss": 2.3436, + 
"step": 3270 + }, + { + "epoch": 0.012234879851987795, + "grad_norm": 0.3414527475833893, + "learning_rate": 0.0006, + "loss": 2.1618, + "step": 3280 + }, + { + "epoch": 0.012272181314951173, + "grad_norm": 0.5709380507469177, + "learning_rate": 0.0006, + "loss": 2.3517, + "step": 3290 + }, + { + "epoch": 0.01230948277791455, + "grad_norm": 0.5732380747795105, + "learning_rate": 0.0006, + "loss": 2.2592, + "step": 3300 + }, + { + "epoch": 0.01230948277791455, + "eval_valid_loss": 2.2373201847076416, + "eval_valid_loss/all": 2.095163583755493, + "eval_valid_loss/end_span": 1.291953206062317, + "eval_valid_perplexity/batch": 8.12677001953125, + "eval_valid_perplexity/end_span": 3.6398890018463135, + "eval_valid_perplexity/fim": 2.252021074295044, + "eval_valid_perplexity/first_seq": 14.982537269592285, + "eval_valid_perplexity/last_seq": 8.926508903503418, + "eval_valid_perplexity/second_seq": 13.470845222473145, + "eval_valid_perplexity/seq": 9.151323318481445, + "eval_valid_reconstruction/all": 0.2821175754070282, + "eval_valid_reconstruction/end_span": 0.6922345757484436, + "eval_valid_reconstruction/fim": 0.15304780006408691, + "eval_valid_reconstruction/first_seq": 0.1634484976530075, + "eval_valid_reconstruction/last_seq": 0.33043670654296875, + "eval_valid_reconstruction/second_seq": 0.20225097239017487, + "eval_valid_runtime": 645.9564, + "eval_valid_samples_per_second": 0.297, + "eval_valid_steps_per_second": 0.297, + "step": 3300 + }, + { + "epoch": 0.01230948277791455, + "eval_train_loss": 2.2338664531707764, + "eval_train_loss/all": 2.0640504360198975, + "eval_train_loss/end_span": 1.2552000284194946, + "eval_train_perplexity/batch": 7.877813816070557, + "eval_train_perplexity/end_span": 3.508540153503418, + "eval_train_perplexity/fim": 2.1064624786376953, + "eval_train_perplexity/first_seq": 15.713180541992188, + "eval_train_perplexity/last_seq": 9.062037467956543, + "eval_train_perplexity/second_seq": 14.450305938720703, + "eval_train_perplexity/seq": 
9.063944816589355, + "eval_train_reconstruction/all": 0.27247339487075806, + "eval_train_reconstruction/end_span": 0.7041807174682617, + "eval_train_reconstruction/fim": 0.14100728929042816, + "eval_train_reconstruction/first_seq": 0.14856557548046112, + "eval_train_reconstruction/last_seq": 0.3206441104412079, + "eval_train_reconstruction/second_seq": 0.17997053265571594, + "eval_train_runtime": 654.8667, + "eval_train_samples_per_second": 0.293, + "eval_train_steps_per_second": 0.293, + "step": 3300 + }, + { + "epoch": 0.012346784240877927, + "grad_norm": 0.6147153973579407, + "learning_rate": 0.0006, + "loss": 2.3118, + "step": 3310 + }, + { + "epoch": 0.012384085703841304, + "grad_norm": 0.4449121057987213, + "learning_rate": 0.0006, + "loss": 2.2698, + "step": 3320 + }, + { + "epoch": 0.012421387166804682, + "grad_norm": 0.41787609457969666, + "learning_rate": 0.0006, + "loss": 2.2513, + "step": 3330 + }, + { + "epoch": 0.01245868862976806, + "grad_norm": 0.26382482051849365, + "learning_rate": 0.0006, + "loss": 2.2872, + "step": 3340 + }, + { + "epoch": 0.012495990092731437, + "grad_norm": 0.4243542551994324, + "learning_rate": 0.0006, + "loss": 2.4251, + "step": 3350 + }, + { + "epoch": 0.012495990092731437, + "eval_valid_loss": 2.2383220195770264, + "eval_valid_loss/all": 2.0961318016052246, + "eval_valid_loss/end_span": 1.3348474502563477, + "eval_valid_perplexity/batch": 8.134642601013184, + "eval_valid_perplexity/end_span": 3.7994163036346436, + "eval_valid_perplexity/fim": 2.4524099826812744, + "eval_valid_perplexity/first_seq": 15.124017715454102, + "eval_valid_perplexity/last_seq": 9.334041595458984, + "eval_valid_perplexity/second_seq": 14.245505332946777, + "eval_valid_perplexity/seq": 9.161725044250488, + "eval_valid_reconstruction/all": 0.2817348539829254, + "eval_valid_reconstruction/end_span": 0.6846392750740051, + "eval_valid_reconstruction/fim": 0.1697850078344345, + "eval_valid_reconstruction/first_seq": 0.162269726395607, + 
"eval_valid_reconstruction/last_seq": 0.31631869077682495, + "eval_valid_reconstruction/second_seq": 0.1809832602739334, + "eval_valid_runtime": 641.112, + "eval_valid_samples_per_second": 0.299, + "eval_valid_steps_per_second": 0.299, + "step": 3350 + }, + { + "epoch": 0.012495990092731437, + "eval_train_loss": 2.2332847118377686, + "eval_train_loss/all": 2.063401699066162, + "eval_train_loss/end_span": 1.3035318851470947, + "eval_train_perplexity/batch": 7.872704982757568, + "eval_train_perplexity/end_span": 3.682279109954834, + "eval_train_perplexity/fim": 2.0145490169525146, + "eval_train_perplexity/first_seq": 15.914155960083008, + "eval_train_perplexity/last_seq": 9.371662139892578, + "eval_train_perplexity/second_seq": 14.483894348144531, + "eval_train_perplexity/seq": 9.058902740478516, + "eval_train_reconstruction/all": 0.2726314663887024, + "eval_train_reconstruction/end_span": 0.6939992904663086, + "eval_train_reconstruction/fim": 0.13283924758434296, + "eval_train_reconstruction/first_seq": 0.14398498833179474, + "eval_train_reconstruction/last_seq": 0.3093707859516144, + "eval_train_reconstruction/second_seq": 0.17529188096523285, + "eval_train_runtime": 654.0421, + "eval_train_samples_per_second": 0.294, + "eval_train_steps_per_second": 0.294, + "step": 3350 + }, + { + "epoch": 0.012533291555694815, + "grad_norm": 0.399493932723999, + "learning_rate": 0.0006, + "loss": 2.1946, + "step": 3360 + }, + { + "epoch": 0.012570593018658191, + "grad_norm": 0.4826202094554901, + "learning_rate": 0.0006, + "loss": 2.2178, + "step": 3370 + }, + { + "epoch": 0.012607894481621569, + "grad_norm": 0.381093829870224, + "learning_rate": 0.0006, + "loss": 2.2817, + "step": 3380 + }, + { + "epoch": 0.012645195944584946, + "grad_norm": 0.46390125155448914, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 3390 + }, + { + "epoch": 0.012682497407548324, + "grad_norm": 0.2952154874801636, + "learning_rate": 0.0006, + "loss": 2.3319, + "step": 3400 + }, + { + "epoch": 
0.012682497407548324, + "eval_valid_loss": 2.239671230316162, + "eval_valid_loss/all": 2.097308874130249, + "eval_valid_loss/end_span": 1.2595233917236328, + "eval_valid_perplexity/batch": 8.1442232131958, + "eval_valid_perplexity/end_span": 3.5237417221069336, + "eval_valid_perplexity/fim": 2.579524278640747, + "eval_valid_perplexity/first_seq": 14.972506523132324, + "eval_valid_perplexity/last_seq": 9.243182182312012, + "eval_valid_perplexity/second_seq": 14.201735496520996, + "eval_valid_perplexity/seq": 9.16110610961914, + "eval_valid_reconstruction/all": 0.2818159759044647, + "eval_valid_reconstruction/end_span": 0.704980194568634, + "eval_valid_reconstruction/fim": 0.17852187156677246, + "eval_valid_reconstruction/first_seq": 0.16360723972320557, + "eval_valid_reconstruction/last_seq": 0.32134339213371277, + "eval_valid_reconstruction/second_seq": 0.18741914629936218, + "eval_valid_runtime": 640.6155, + "eval_valid_samples_per_second": 0.3, + "eval_valid_steps_per_second": 0.3, + "step": 3400 + }, + { + "epoch": 0.012682497407548324, + "eval_train_loss": 2.2354071140289307, + "eval_train_loss/all": 2.0650293827056885, + "eval_train_loss/end_span": 1.2246111631393433, + "eval_train_perplexity/batch": 7.885529518127441, + "eval_train_perplexity/end_span": 3.4028427600860596, + "eval_train_perplexity/fim": 2.0298707485198975, + "eval_train_perplexity/first_seq": 15.612652778625488, + "eval_train_perplexity/last_seq": 9.655922889709473, + "eval_train_perplexity/second_seq": 13.787046432495117, + "eval_train_perplexity/seq": 9.065977096557617, + "eval_train_reconstruction/all": 0.27259477972984314, + "eval_train_reconstruction/end_span": 0.7160407304763794, + "eval_train_reconstruction/fim": 0.13329586386680603, + "eval_train_reconstruction/first_seq": 0.14931970834732056, + "eval_train_reconstruction/last_seq": 0.30465587973594666, + "eval_train_reconstruction/second_seq": 0.1954329013824463, + "eval_train_runtime": 653.4599, + "eval_train_samples_per_second": 
0.294, + "eval_train_steps_per_second": 0.294, + "step": 3400 + }, + { + "epoch": 0.012719798870511702, + "grad_norm": 0.3428683280944824, + "learning_rate": 0.0006, + "loss": 2.2732, + "step": 3410 + }, + { + "epoch": 0.01275710033347508, + "grad_norm": 0.3397960960865021, + "learning_rate": 0.0006, + "loss": 2.0615, + "step": 3420 + }, + { + "epoch": 0.012794401796438456, + "grad_norm": 0.4350416660308838, + "learning_rate": 0.0006, + "loss": 2.3447, + "step": 3430 + }, + { + "epoch": 0.012831703259401833, + "grad_norm": 0.5346946120262146, + "learning_rate": 0.0006, + "loss": 2.2762, + "step": 3440 + }, + { + "epoch": 0.012869004722365211, + "grad_norm": 0.3463191092014313, + "learning_rate": 0.0006, + "loss": 2.2793, + "step": 3450 + }, + { + "epoch": 0.012869004722365211, + "eval_valid_loss": 2.2344114780426025, + "eval_valid_loss/all": 2.0927786827087402, + "eval_valid_loss/end_span": 1.3683785200119019, + "eval_valid_perplexity/batch": 8.10741138458252, + "eval_valid_perplexity/end_span": 3.9289748668670654, + "eval_valid_perplexity/fim": 2.64216947555542, + "eval_valid_perplexity/first_seq": 14.578572273254395, + "eval_valid_perplexity/last_seq": 9.662315368652344, + "eval_valid_perplexity/second_seq": 13.86181926727295, + "eval_valid_perplexity/seq": 9.13008975982666, + "eval_valid_reconstruction/all": 0.28328636288642883, + "eval_valid_reconstruction/end_span": 0.684842050075531, + "eval_valid_reconstruction/fim": 0.1851770281791687, + "eval_valid_reconstruction/first_seq": 0.17351113259792328, + "eval_valid_reconstruction/last_seq": 0.30641496181488037, + "eval_valid_reconstruction/second_seq": 0.19567866623401642, + "eval_valid_runtime": 650.1647, + "eval_valid_samples_per_second": 0.295, + "eval_valid_steps_per_second": 0.295, + "step": 3450 + }, + { + "epoch": 0.012869004722365211, + "eval_train_loss": 2.2310774326324463, + "eval_train_loss/all": 2.061882972717285, + "eval_train_loss/end_span": 1.340378999710083, + "eval_train_perplexity/batch": 
7.860757350921631, + "eval_train_perplexity/end_span": 3.820491313934326, + "eval_train_perplexity/fim": 2.2534399032592773, + "eval_train_perplexity/first_seq": 14.918529510498047, + "eval_train_perplexity/last_seq": 9.051750183105469, + "eval_train_perplexity/second_seq": 14.377615928649902, + "eval_train_perplexity/seq": 9.045440673828125, + "eval_train_reconstruction/all": 0.2733670771121979, + "eval_train_reconstruction/end_span": 0.6942178606987, + "eval_train_reconstruction/fim": 0.15422526001930237, + "eval_train_reconstruction/first_seq": 0.16039884090423584, + "eval_train_reconstruction/last_seq": 0.3196142315864563, + "eval_train_reconstruction/second_seq": 0.17662011086940765, + "eval_train_runtime": 637.465, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 3450 + }, + { + "epoch": 0.012906306185328589, + "grad_norm": 0.4599052965641022, + "learning_rate": 0.0006, + "loss": 2.2182, + "step": 3460 + }, + { + "epoch": 0.012943607648291966, + "grad_norm": 0.8503429293632507, + "learning_rate": 0.0006, + "loss": 2.3085, + "step": 3470 + }, + { + "epoch": 0.012980909111255344, + "grad_norm": 0.2791067659854889, + "learning_rate": 0.0006, + "loss": 2.2486, + "step": 3480 + }, + { + "epoch": 0.01301821057421872, + "grad_norm": 0.5813776254653931, + "learning_rate": 0.0006, + "loss": 2.2951, + "step": 3490 + }, + { + "epoch": 0.013055512037182098, + "grad_norm": 0.41493624448776245, + "learning_rate": 0.0006, + "loss": 2.186, + "step": 3500 + }, + { + "epoch": 0.013055512037182098, + "eval_valid_loss": 2.2371017932891846, + "eval_valid_loss/all": 2.094789981842041, + "eval_valid_loss/end_span": 1.3399226665496826, + "eval_valid_perplexity/batch": 8.123734474182129, + "eval_valid_perplexity/end_span": 3.8187482357025146, + "eval_valid_perplexity/fim": 2.935004949569702, + "eval_valid_perplexity/first_seq": 14.734557151794434, + "eval_valid_perplexity/last_seq": 9.136434555053711, + "eval_valid_perplexity/second_seq": 
13.850035667419434, + "eval_valid_perplexity/seq": 9.143270492553711, + "eval_valid_reconstruction/all": 0.2826443314552307, + "eval_valid_reconstruction/end_span": 0.6792078614234924, + "eval_valid_reconstruction/fim": 0.20402498543262482, + "eval_valid_reconstruction/first_seq": 0.17021989822387695, + "eval_valid_reconstruction/last_seq": 0.32451990246772766, + "eval_valid_reconstruction/second_seq": 0.1945054978132248, + "eval_valid_runtime": 628.4093, + "eval_valid_samples_per_second": 0.306, + "eval_valid_steps_per_second": 0.306, + "step": 3500 + }, + { + "epoch": 0.013055512037182098, + "eval_train_loss": 2.2338039875030518, + "eval_train_loss/all": 2.0637919902801514, + "eval_train_loss/end_span": 1.3036144971847534, + "eval_train_perplexity/batch": 7.8757781982421875, + "eval_train_perplexity/end_span": 3.6825833320617676, + "eval_train_perplexity/fim": 2.0380048751831055, + "eval_train_perplexity/first_seq": 15.239300727844238, + "eval_train_perplexity/last_seq": 9.255334854125977, + "eval_train_perplexity/second_seq": 14.378209114074707, + "eval_train_perplexity/seq": 9.057385444641113, + "eval_train_reconstruction/all": 0.27288195490837097, + "eval_train_reconstruction/end_span": 0.6900887489318848, + "eval_train_reconstruction/fim": 0.1345026195049286, + "eval_train_reconstruction/first_seq": 0.15957871079444885, + "eval_train_reconstruction/last_seq": 0.3150023818016052, + "eval_train_reconstruction/second_seq": 0.1805114597082138, + "eval_train_runtime": 617.1532, + "eval_train_samples_per_second": 0.311, + "eval_train_steps_per_second": 0.311, + "step": 3500 + }, + { + "epoch": 0.013092813500145475, + "grad_norm": 0.4867928624153137, + "learning_rate": 0.0006, + "loss": 2.2462, + "step": 3510 + }, + { + "epoch": 0.013130114963108853, + "grad_norm": 0.4192355275154114, + "learning_rate": 0.0006, + "loss": 2.2506, + "step": 3520 + }, + { + "epoch": 0.01316741642607223, + "grad_norm": 0.5449298620223999, + "learning_rate": 0.0006, + "loss": 2.3341, + 
"step": 3530 + }, + { + "epoch": 0.013204717889035609, + "grad_norm": 0.4231148362159729, + "learning_rate": 0.0006, + "loss": 2.2203, + "step": 3540 + }, + { + "epoch": 0.013242019351998986, + "grad_norm": 0.47459104657173157, + "learning_rate": 0.0006, + "loss": 2.2514, + "step": 3550 + }, + { + "epoch": 0.013242019351998986, + "eval_valid_loss": 2.2335128784179688, + "eval_valid_loss/all": 2.0918374061584473, + "eval_valid_loss/end_span": 1.4229276180267334, + "eval_valid_perplexity/batch": 8.099783897399902, + "eval_valid_perplexity/end_span": 4.149250030517578, + "eval_valid_perplexity/fim": 2.5138113498687744, + "eval_valid_perplexity/first_seq": 14.842508316040039, + "eval_valid_perplexity/last_seq": 9.36950397491455, + "eval_valid_perplexity/second_seq": 14.030515670776367, + "eval_valid_perplexity/seq": 9.117632865905762, + "eval_valid_reconstruction/all": 0.283500999212265, + "eval_valid_reconstruction/end_span": 0.6714109778404236, + "eval_valid_reconstruction/fim": 0.17522306740283966, + "eval_valid_reconstruction/first_seq": 0.16596171259880066, + "eval_valid_reconstruction/last_seq": 0.32019343972206116, + "eval_valid_reconstruction/second_seq": 0.19077153503894806, + "eval_valid_runtime": 639.1968, + "eval_valid_samples_per_second": 0.3, + "eval_valid_steps_per_second": 0.3, + "step": 3550 + }, + { + "epoch": 0.013242019351998986, + "eval_train_loss": 2.2291290760040283, + "eval_train_loss/all": 2.0599052906036377, + "eval_train_loss/end_span": 1.3745530843734741, + "eval_train_perplexity/batch": 7.845226764678955, + "eval_train_perplexity/end_span": 3.9533095359802246, + "eval_train_perplexity/fim": 1.9623310565948486, + "eval_train_perplexity/first_seq": 15.633888244628906, + "eval_train_perplexity/last_seq": 9.452184677124023, + "eval_train_perplexity/second_seq": 14.330074310302734, + "eval_train_perplexity/seq": 9.02160358428955, + "eval_train_reconstruction/all": 0.27414992451667786, + "eval_train_reconstruction/end_span": 0.6853320002555847, + 
"eval_train_reconstruction/fim": 0.1283591240644455, + "eval_train_reconstruction/first_seq": 0.15066230297088623, + "eval_train_reconstruction/last_seq": 0.3138449788093567, + "eval_train_reconstruction/second_seq": 0.1814688891172409, + "eval_train_runtime": 639.0993, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 3550 + }, + { + "epoch": 0.013279320814962362, + "grad_norm": 0.4176058769226074, + "learning_rate": 0.0006, + "loss": 2.3301, + "step": 3560 + }, + { + "epoch": 0.01331662227792574, + "grad_norm": 0.27476760745048523, + "learning_rate": 0.0006, + "loss": 2.3241, + "step": 3570 + }, + { + "epoch": 0.013353923740889118, + "grad_norm": 0.3388148248195648, + "learning_rate": 0.0006, + "loss": 2.3303, + "step": 3580 + }, + { + "epoch": 0.013391225203852495, + "grad_norm": 0.3774171769618988, + "learning_rate": 0.0006, + "loss": 2.1306, + "step": 3590 + }, + { + "epoch": 0.013428526666815873, + "grad_norm": 0.5836585164070129, + "learning_rate": 0.0006, + "loss": 2.2445, + "step": 3600 + }, + { + "epoch": 0.013428526666815873, + "eval_valid_loss": 2.2364273071289062, + "eval_valid_loss/all": 2.0946178436279297, + "eval_valid_loss/end_span": 1.236499547958374, + "eval_valid_perplexity/batch": 8.122336387634277, + "eval_valid_perplexity/end_span": 3.4435384273529053, + "eval_valid_perplexity/fim": 2.4450347423553467, + "eval_valid_perplexity/first_seq": 14.810608863830566, + "eval_valid_perplexity/last_seq": 9.077751159667969, + "eval_valid_perplexity/second_seq": 13.359610557556152, + "eval_valid_perplexity/seq": 9.145376205444336, + "eval_valid_reconstruction/all": 0.28226733207702637, + "eval_valid_reconstruction/end_span": 0.7067632079124451, + "eval_valid_reconstruction/fim": 0.16902025043964386, + "eval_valid_reconstruction/first_seq": 0.1679963320493698, + "eval_valid_reconstruction/last_seq": 0.32594287395477295, + "eval_valid_reconstruction/second_seq": 0.20436932146549225, + "eval_valid_runtime": 642.0118, + 
"eval_valid_samples_per_second": 0.299, + "eval_valid_steps_per_second": 0.299, + "step": 3600 + }, + { + "epoch": 0.013428526666815873, + "eval_train_loss": 2.230036497116089, + "eval_train_loss/all": 2.060854911804199, + "eval_train_loss/end_span": 1.205040693283081, + "eval_train_perplexity/batch": 7.852680206298828, + "eval_train_perplexity/end_span": 3.336894989013672, + "eval_train_perplexity/fim": 2.126919984817505, + "eval_train_perplexity/first_seq": 15.352486610412598, + "eval_train_perplexity/last_seq": 9.383328437805176, + "eval_train_perplexity/second_seq": 13.795431137084961, + "eval_train_perplexity/seq": 9.033454895019531, + "eval_train_reconstruction/all": 0.2737453281879425, + "eval_train_reconstruction/end_span": 0.7178654074668884, + "eval_train_reconstruction/fim": 0.14474143087863922, + "eval_train_reconstruction/first_seq": 0.15536218881607056, + "eval_train_reconstruction/last_seq": 0.3126924932003021, + "eval_train_reconstruction/second_seq": 0.1923324465751648, + "eval_train_runtime": 636.2222, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 3600 + }, + { + "epoch": 0.01346582812977925, + "grad_norm": 0.5770428776741028, + "learning_rate": 0.0006, + "loss": 2.3599, + "step": 3610 + }, + { + "epoch": 0.013503129592742627, + "grad_norm": 0.42496657371520996, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 3620 + }, + { + "epoch": 0.013540431055706004, + "grad_norm": 0.3918019235134125, + "learning_rate": 0.0006, + "loss": 2.3998, + "step": 3630 + }, + { + "epoch": 0.013577732518669382, + "grad_norm": 0.4680219292640686, + "learning_rate": 0.0006, + "loss": 2.0802, + "step": 3640 + }, + { + "epoch": 0.01361503398163276, + "grad_norm": 0.5557373762130737, + "learning_rate": 0.0006, + "loss": 2.182, + "step": 3650 + }, + { + "epoch": 0.01361503398163276, + "eval_valid_loss": 2.2381670475006104, + "eval_valid_loss/all": 2.096210241317749, + "eval_valid_loss/end_span": 1.31547212600708, + 
"eval_valid_perplexity/batch": 8.13528060913086, + "eval_valid_perplexity/end_span": 3.7265100479125977, + "eval_valid_perplexity/fim": 2.458261013031006, + "eval_valid_perplexity/first_seq": 14.81864070892334, + "eval_valid_perplexity/last_seq": 9.671778678894043, + "eval_valid_perplexity/second_seq": 13.746661186218262, + "eval_valid_perplexity/seq": 9.159918785095215, + "eval_valid_reconstruction/all": 0.28186216950416565, + "eval_valid_reconstruction/end_span": 0.6876087784767151, + "eval_valid_reconstruction/fim": 0.1698118895292282, + "eval_valid_reconstruction/first_seq": 0.1690341681241989, + "eval_valid_reconstruction/last_seq": 0.30485814809799194, + "eval_valid_reconstruction/second_seq": 0.19408662617206573, + "eval_valid_runtime": 640.764, + "eval_valid_samples_per_second": 0.3, + "eval_valid_steps_per_second": 0.3, + "step": 3650 + }, + { + "epoch": 0.01361503398163276, + "eval_train_loss": 2.234342575073242, + "eval_train_loss/all": 2.0648837089538574, + "eval_train_loss/end_span": 1.2826167345046997, + "eval_train_perplexity/batch": 7.88438081741333, + "eval_train_perplexity/end_span": 3.6060636043548584, + "eval_train_perplexity/fim": 2.0244596004486084, + "eval_train_perplexity/first_seq": 15.460909843444824, + "eval_train_perplexity/last_seq": 9.548373222351074, + "eval_train_perplexity/second_seq": 14.87026309967041, + "eval_train_perplexity/seq": 9.07052993774414, + "eval_train_reconstruction/all": 0.27236947417259216, + "eval_train_reconstruction/end_span": 0.6980687975883484, + "eval_train_reconstruction/fim": 0.1329820156097412, + "eval_train_reconstruction/first_seq": 0.15422990918159485, + "eval_train_reconstruction/last_seq": 0.3082634210586548, + "eval_train_reconstruction/second_seq": 0.16910460591316223, + "eval_train_runtime": 644.5138, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 3650 + }, + { + "epoch": 0.013652335444596138, + "grad_norm": 0.4897306263446808, + "learning_rate": 0.0006, 
+ "loss": 2.3207, + "step": 3660 + }, + { + "epoch": 0.013689636907559515, + "grad_norm": 0.33535459637641907, + "learning_rate": 0.0006, + "loss": 2.3814, + "step": 3670 + }, + { + "epoch": 0.013726938370522891, + "grad_norm": 0.5480711460113525, + "learning_rate": 0.0006, + "loss": 2.2788, + "step": 3680 + }, + { + "epoch": 0.013764239833486269, + "grad_norm": 1.0079611539840698, + "learning_rate": 0.0006, + "loss": 2.1606, + "step": 3690 + }, + { + "epoch": 0.013801541296449647, + "grad_norm": 0.379345178604126, + "learning_rate": 0.0006, + "loss": 2.2881, + "step": 3700 + }, + { + "epoch": 0.013801541296449647, + "eval_valid_loss": 2.2348287105560303, + "eval_valid_loss/all": 2.0930068492889404, + "eval_valid_loss/end_span": 1.334142804145813, + "eval_valid_perplexity/batch": 8.109261512756348, + "eval_valid_perplexity/end_span": 3.7967400550842285, + "eval_valid_perplexity/fim": 2.2022879123687744, + "eval_valid_perplexity/first_seq": 14.801688194274902, + "eval_valid_perplexity/last_seq": 8.91120719909668, + "eval_valid_perplexity/second_seq": 14.161128044128418, + "eval_valid_perplexity/seq": 9.123900413513184, + "eval_valid_reconstruction/all": 0.28341689705848694, + "eval_valid_reconstruction/end_span": 0.6937452554702759, + "eval_valid_reconstruction/fim": 0.15033087134361267, + "eval_valid_reconstruction/first_seq": 0.16720686852931976, + "eval_valid_reconstruction/last_seq": 0.33225589990615845, + "eval_valid_reconstruction/second_seq": 0.18558509647846222, + "eval_valid_runtime": 638.1676, + "eval_valid_samples_per_second": 0.301, + "eval_valid_steps_per_second": 0.301, + "step": 3700 + }, + { + "epoch": 0.013801541296449647, + "eval_train_loss": 2.231764316558838, + "eval_train_loss/all": 2.062101364135742, + "eval_train_loss/end_span": 1.2942548990249634, + "eval_train_perplexity/batch": 7.86247444152832, + "eval_train_perplexity/end_span": 3.6482765674591064, + "eval_train_perplexity/fim": 2.236943244934082, + "eval_train_perplexity/first_seq": 
15.467409133911133, + "eval_train_perplexity/last_seq": 9.3272123336792, + "eval_train_perplexity/second_seq": 14.341859817504883, + "eval_train_perplexity/seq": 9.040043830871582, + "eval_train_reconstruction/all": 0.2737829089164734, + "eval_train_reconstruction/end_span": 0.7056939601898193, + "eval_train_reconstruction/fim": 0.15425974130630493, + "eval_train_reconstruction/first_seq": 0.15344570577144623, + "eval_train_reconstruction/last_seq": 0.3115333616733551, + "eval_train_reconstruction/second_seq": 0.1805306375026703, + "eval_train_runtime": 643.7157, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 3700 + }, + { + "epoch": 0.013838842759413024, + "grad_norm": 0.47433042526245117, + "learning_rate": 0.0006, + "loss": 2.3246, + "step": 3710 + }, + { + "epoch": 0.013876144222376402, + "grad_norm": 0.5069348812103271, + "learning_rate": 0.0006, + "loss": 2.3283, + "step": 3720 + }, + { + "epoch": 0.01391344568533978, + "grad_norm": 0.39530032873153687, + "learning_rate": 0.0006, + "loss": 2.2148, + "step": 3730 + }, + { + "epoch": 0.013950747148303156, + "grad_norm": 0.35723328590393066, + "learning_rate": 0.0006, + "loss": 2.2857, + "step": 3740 + }, + { + "epoch": 0.013988048611266533, + "grad_norm": 0.3161740005016327, + "learning_rate": 0.0006, + "loss": 2.1737, + "step": 3750 + }, + { + "epoch": 0.013988048611266533, + "eval_valid_loss": 2.233489751815796, + "eval_valid_loss/all": 2.092071056365967, + "eval_valid_loss/end_span": 1.3848586082458496, + "eval_valid_perplexity/batch": 8.101676940917969, + "eval_valid_perplexity/end_span": 3.9942610263824463, + "eval_valid_perplexity/fim": 2.21337628364563, + "eval_valid_perplexity/first_seq": 15.110934257507324, + "eval_valid_perplexity/last_seq": 9.461509704589844, + "eval_valid_perplexity/second_seq": 13.829161643981934, + "eval_valid_perplexity/seq": 9.122177124023438, + "eval_valid_reconstruction/all": 0.2831518054008484, + 
"eval_valid_reconstruction/end_span": 0.6664128303527832, + "eval_valid_reconstruction/fim": 0.15113958716392517, + "eval_valid_reconstruction/first_seq": 0.1620136946439743, + "eval_valid_reconstruction/last_seq": 0.3109237551689148, + "eval_valid_reconstruction/second_seq": 0.19366197288036346, + "eval_valid_runtime": 634.0743, + "eval_valid_samples_per_second": 0.303, + "eval_valid_steps_per_second": 0.303, + "step": 3750 + }, + { + "epoch": 0.013988048611266533, + "eval_train_loss": 2.229159116744995, + "eval_train_loss/all": 2.0601212978363037, + "eval_train_loss/end_span": 1.3580024242401123, + "eval_train_perplexity/batch": 7.846921443939209, + "eval_train_perplexity/end_span": 3.888418197631836, + "eval_train_perplexity/fim": 2.0624451637268066, + "eval_train_perplexity/first_seq": 15.257242202758789, + "eval_train_perplexity/last_seq": 9.328234672546387, + "eval_train_perplexity/second_seq": 14.130016326904297, + "eval_train_perplexity/seq": 9.026741027832031, + "eval_train_reconstruction/all": 0.27395233511924744, + "eval_train_reconstruction/end_span": 0.6780880093574524, + "eval_train_reconstruction/fim": 0.138339564204216, + "eval_train_reconstruction/first_seq": 0.1544002890586853, + "eval_train_reconstruction/last_seq": 0.3145005404949188, + "eval_train_reconstruction/second_seq": 0.18212953209877014, + "eval_train_runtime": 640.404, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 3750 + }, + { + "epoch": 0.014025350074229911, + "grad_norm": 0.5087934136390686, + "learning_rate": 0.0006, + "loss": 2.1872, + "step": 3760 + }, + { + "epoch": 0.014062651537193289, + "grad_norm": 0.46080291271209717, + "learning_rate": 0.0006, + "loss": 2.283, + "step": 3770 + }, + { + "epoch": 0.014099953000156666, + "grad_norm": 1.1836562156677246, + "learning_rate": 0.0006, + "loss": 2.1373, + "step": 3780 + }, + { + "epoch": 0.014137254463120044, + "grad_norm": 0.3101639449596405, + "learning_rate": 0.0006, + "loss": 2.0796, + 
"step": 3790 + }, + { + "epoch": 0.01417455592608342, + "grad_norm": 0.3567272126674652, + "learning_rate": 0.0006, + "loss": 2.3382, + "step": 3800 + }, + { + "epoch": 0.01417455592608342, + "eval_valid_loss": 2.238217353820801, + "eval_valid_loss/all": 2.096195697784424, + "eval_valid_loss/end_span": 1.2652077674865723, + "eval_valid_perplexity/batch": 8.135162353515625, + "eval_valid_perplexity/end_span": 3.5438289642333984, + "eval_valid_perplexity/fim": 2.4622397422790527, + "eval_valid_perplexity/first_seq": 14.79817008972168, + "eval_valid_perplexity/last_seq": 9.526175498962402, + "eval_valid_perplexity/second_seq": 13.822896003723145, + "eval_valid_perplexity/seq": 9.155091285705566, + "eval_valid_reconstruction/all": 0.2818489372730255, + "eval_valid_reconstruction/end_span": 0.708312451839447, + "eval_valid_reconstruction/fim": 0.17078858613967896, + "eval_valid_reconstruction/first_seq": 0.17074304819107056, + "eval_valid_reconstruction/last_seq": 0.31208717823028564, + "eval_valid_reconstruction/second_seq": 0.19597426056861877, + "eval_valid_runtime": 646.6023, + "eval_valid_samples_per_second": 0.297, + "eval_valid_steps_per_second": 0.297, + "step": 3800 + }, + { + "epoch": 0.01417455592608342, + "eval_train_loss": 2.2327980995178223, + "eval_train_loss/all": 2.062577724456787, + "eval_train_loss/end_span": 1.2255072593688965, + "eval_train_perplexity/batch": 7.866220474243164, + "eval_train_perplexity/end_span": 3.405893325805664, + "eval_train_perplexity/fim": 2.1284708976745605, + "eval_train_perplexity/first_seq": 15.227380752563477, + "eval_train_perplexity/last_seq": 9.41067886352539, + "eval_train_perplexity/second_seq": 13.989120483398438, + "eval_train_perplexity/seq": 9.036921501159668, + "eval_train_reconstruction/all": 0.27316373586654663, + "eval_train_reconstruction/end_span": 0.7207258343696594, + "eval_train_reconstruction/fim": 0.14337417483329773, + "eval_train_reconstruction/first_seq": 0.1551467329263687, + 
"eval_train_reconstruction/last_seq": 0.3142099976539612, + "eval_train_reconstruction/second_seq": 0.18554838001728058, + "eval_train_runtime": 648.8357, + "eval_train_samples_per_second": 0.296, + "eval_train_steps_per_second": 0.296, + "step": 3800 + }, + { + "epoch": 0.014211857389046798, + "grad_norm": 0.29138436913490295, + "learning_rate": 0.0006, + "loss": 2.2798, + "step": 3810 + }, + { + "epoch": 0.014249158852010176, + "grad_norm": 0.43248024582862854, + "learning_rate": 0.0006, + "loss": 2.2322, + "step": 3820 + }, + { + "epoch": 0.014286460314973553, + "grad_norm": 0.42783430218696594, + "learning_rate": 0.0006, + "loss": 2.1992, + "step": 3830 + }, + { + "epoch": 0.014323761777936931, + "grad_norm": 0.35935142636299133, + "learning_rate": 0.0006, + "loss": 2.2947, + "step": 3840 + }, + { + "epoch": 0.014361063240900309, + "grad_norm": 0.36883312463760376, + "learning_rate": 0.0006, + "loss": 2.2933, + "step": 3850 + }, + { + "epoch": 0.014361063240900309, + "eval_valid_loss": 2.234553575515747, + "eval_valid_loss/all": 2.0932040214538574, + "eval_valid_loss/end_span": 1.3218953609466553, + "eval_valid_perplexity/batch": 8.110860824584961, + "eval_valid_perplexity/end_span": 3.750523328781128, + "eval_valid_perplexity/fim": 2.486238956451416, + "eval_valid_perplexity/first_seq": 14.598234176635742, + "eval_valid_perplexity/last_seq": 9.395788192749023, + "eval_valid_perplexity/second_seq": 14.181360244750977, + "eval_valid_perplexity/seq": 9.130393028259277, + "eval_valid_reconstruction/all": 0.28293925523757935, + "eval_valid_reconstruction/end_span": 0.680872917175293, + "eval_valid_reconstruction/fim": 0.17276249825954437, + "eval_valid_reconstruction/first_seq": 0.1721656173467636, + "eval_valid_reconstruction/last_seq": 0.313448965549469, + "eval_valid_reconstruction/second_seq": 0.18613076210021973, + "eval_valid_runtime": 646.1366, + "eval_valid_samples_per_second": 0.297, + "eval_valid_steps_per_second": 0.297, + "step": 3850 + }, + { + 
"epoch": 0.014361063240900309, + "eval_train_loss": 2.228746175765991, + "eval_train_loss/all": 2.0594446659088135, + "eval_train_loss/end_span": 1.2788612842559814, + "eval_train_perplexity/batch": 7.84161376953125, + "eval_train_perplexity/end_span": 3.5925464630126953, + "eval_train_perplexity/fim": 2.1094791889190674, + "eval_train_perplexity/first_seq": 15.458174705505371, + "eval_train_perplexity/last_seq": 9.306675910949707, + "eval_train_perplexity/second_seq": 14.332200050354004, + "eval_train_perplexity/seq": 9.015843391418457, + "eval_train_reconstruction/all": 0.27419358491897583, + "eval_train_reconstruction/end_span": 0.6972054839134216, + "eval_train_reconstruction/fim": 0.14264506101608276, + "eval_train_reconstruction/first_seq": 0.15355591475963593, + "eval_train_reconstruction/last_seq": 0.3161221742630005, + "eval_train_reconstruction/second_seq": 0.1864585429430008, + "eval_train_runtime": 647.067, + "eval_train_samples_per_second": 0.297, + "eval_train_steps_per_second": 0.297, + "step": 3850 + }, + { + "epoch": 0.014398364703863686, + "grad_norm": 0.5359156727790833, + "learning_rate": 0.0006, + "loss": 2.3538, + "step": 3860 + }, + { + "epoch": 0.014435666166827062, + "grad_norm": 0.3969944417476654, + "learning_rate": 0.0006, + "loss": 2.2679, + "step": 3870 + }, + { + "epoch": 0.01447296762979044, + "grad_norm": 0.43567436933517456, + "learning_rate": 0.0006, + "loss": 2.1415, + "step": 3880 + }, + { + "epoch": 0.014510269092753818, + "grad_norm": 0.41271117329597473, + "learning_rate": 0.0006, + "loss": 2.3444, + "step": 3890 + }, + { + "epoch": 0.014547570555717195, + "grad_norm": 0.3451387882232666, + "learning_rate": 0.0006, + "loss": 2.4341, + "step": 3900 + }, + { + "epoch": 0.014547570555717195, + "eval_valid_loss": 2.231435775756836, + "eval_valid_loss/all": 2.0899465084075928, + "eval_valid_loss/end_span": 1.2479803562164307, + "eval_valid_perplexity/batch": 8.08448314666748, + "eval_valid_perplexity/end_span": 3.4833009243011475, 
+ "eval_valid_perplexity/fim": 2.3731954097747803, + "eval_valid_perplexity/first_seq": 14.487272262573242, + "eval_valid_perplexity/last_seq": 9.546185493469238, + "eval_valid_perplexity/second_seq": 13.91146183013916, + "eval_valid_perplexity/seq": 9.101770401000977, + "eval_valid_reconstruction/all": 0.28350207209587097, + "eval_valid_reconstruction/end_span": 0.7034285664558411, + "eval_valid_reconstruction/fim": 0.1651536077260971, + "eval_valid_reconstruction/first_seq": 0.17665204405784607, + "eval_valid_reconstruction/last_seq": 0.3115645945072174, + "eval_valid_reconstruction/second_seq": 0.1908068060874939, + "eval_valid_runtime": 644.1337, + "eval_valid_samples_per_second": 0.298, + "eval_valid_steps_per_second": 0.298, + "step": 3900 + }, + { + "epoch": 0.014547570555717195, + "eval_train_loss": 2.2286527156829834, + "eval_train_loss/all": 2.059382915496826, + "eval_train_loss/end_span": 1.2143396139144897, + "eval_train_perplexity/batch": 7.841129779815674, + "eval_train_perplexity/end_span": 3.3680691719055176, + "eval_train_perplexity/fim": 2.221275806427002, + "eval_train_perplexity/first_seq": 15.54979419708252, + "eval_train_perplexity/last_seq": 9.342114448547363, + "eval_train_perplexity/second_seq": 14.003449440002441, + "eval_train_perplexity/seq": 9.017388343811035, + "eval_train_reconstruction/all": 0.27381211519241333, + "eval_train_reconstruction/end_span": 0.7171858549118042, + "eval_train_reconstruction/fim": 0.1521056741476059, + "eval_train_reconstruction/first_seq": 0.15194213390350342, + "eval_train_reconstruction/last_seq": 0.3142174780368805, + "eval_train_reconstruction/second_seq": 0.1895935833454132, + "eval_train_runtime": 645.3502, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 3900 + }, + { + "epoch": 0.014584872018680573, + "grad_norm": 0.4993818998336792, + "learning_rate": 0.0006, + "loss": 2.3491, + "step": 3910 + }, + { + "epoch": 0.014622173481643951, + "grad_norm": 
0.45917221903800964, + "learning_rate": 0.0006, + "loss": 2.216, + "step": 3920 + }, + { + "epoch": 0.014659474944607327, + "grad_norm": 0.5157418847084045, + "learning_rate": 0.0006, + "loss": 2.2846, + "step": 3930 + }, + { + "epoch": 0.014696776407570705, + "grad_norm": 0.4093446731567383, + "learning_rate": 0.0006, + "loss": 2.206, + "step": 3940 + }, + { + "epoch": 0.014734077870534082, + "grad_norm": 0.40173858404159546, + "learning_rate": 0.0006, + "loss": 2.1296, + "step": 3950 + }, + { + "epoch": 0.014734077870534082, + "eval_valid_loss": 2.234490156173706, + "eval_valid_loss/all": 2.092639446258545, + "eval_valid_loss/end_span": 1.4963828325271606, + "eval_valid_perplexity/batch": 8.106283187866211, + "eval_valid_perplexity/end_span": 4.465507507324219, + "eval_valid_perplexity/fim": 2.3831582069396973, + "eval_valid_perplexity/first_seq": 14.76630687713623, + "eval_valid_perplexity/last_seq": 9.733378410339355, + "eval_valid_perplexity/second_seq": 14.137190818786621, + "eval_valid_perplexity/seq": 9.120440483093262, + "eval_valid_reconstruction/all": 0.2828347682952881, + "eval_valid_reconstruction/end_span": 0.6447808742523193, + "eval_valid_reconstruction/fim": 0.16513767838478088, + "eval_valid_reconstruction/first_seq": 0.16751372814178467, + "eval_valid_reconstruction/last_seq": 0.302472859621048, + "eval_valid_reconstruction/second_seq": 0.1880224347114563, + "eval_valid_runtime": 645.4835, + "eval_valid_samples_per_second": 0.297, + "eval_valid_steps_per_second": 0.297, + "step": 3950 + }, + { + "epoch": 0.014734077870534082, + "eval_train_loss": 2.2315919399261475, + "eval_train_loss/all": 2.062215566635132, + "eval_train_loss/end_span": 1.4604376554489136, + "eval_train_perplexity/batch": 7.863372325897217, + "eval_train_perplexity/end_span": 4.307844638824463, + "eval_train_perplexity/fim": 2.258281707763672, + "eval_train_perplexity/first_seq": 15.36870002746582, + "eval_train_perplexity/last_seq": 9.534683227539062, + 
"eval_train_perplexity/second_seq": 13.881281852722168, + "eval_train_perplexity/seq": 9.048176765441895, + "eval_train_reconstruction/all": 0.2732280194759369, + "eval_train_reconstruction/end_span": 0.6535377502441406, + "eval_train_reconstruction/fim": 0.15501421689987183, + "eval_train_reconstruction/first_seq": 0.1569657027721405, + "eval_train_reconstruction/last_seq": 0.3067358136177063, + "eval_train_reconstruction/second_seq": 0.18988868594169617, + "eval_train_runtime": 646.307, + "eval_train_samples_per_second": 0.297, + "eval_train_steps_per_second": 0.297, + "step": 3950 + }, + { + "epoch": 0.01477137933349746, + "grad_norm": 0.7998047471046448, + "learning_rate": 0.0006, + "loss": 2.3395, + "step": 3960 + }, + { + "epoch": 0.014808680796460838, + "grad_norm": 0.3427029848098755, + "learning_rate": 0.0006, + "loss": 2.2836, + "step": 3970 + }, + { + "epoch": 0.014845982259424215, + "grad_norm": 0.884882926940918, + "learning_rate": 0.0006, + "loss": 2.3097, + "step": 3980 + }, + { + "epoch": 0.014883283722387591, + "grad_norm": 0.3916429877281189, + "learning_rate": 0.0006, + "loss": 2.3949, + "step": 3990 + }, + { + "epoch": 0.014920585185350969, + "grad_norm": 0.3618682026863098, + "learning_rate": 0.0006, + "loss": 2.3211, + "step": 4000 + }, + { + "epoch": 0.014920585185350969, + "eval_valid_loss": 2.234410524368286, + "eval_valid_loss/all": 2.093080997467041, + "eval_valid_loss/end_span": 1.3147789239883423, + "eval_valid_perplexity/batch": 8.10986328125, + "eval_valid_perplexity/end_span": 3.7239277362823486, + "eval_valid_perplexity/fim": 2.5009119510650635, + "eval_valid_perplexity/first_seq": 14.94644546508789, + "eval_valid_perplexity/last_seq": 9.48786449432373, + "eval_valid_perplexity/second_seq": 13.64065933227539, + "eval_valid_perplexity/seq": 9.137085914611816, + "eval_valid_reconstruction/all": 0.28277167677879333, + "eval_valid_reconstruction/end_span": 0.6888542771339417, + "eval_valid_reconstruction/fim": 0.17381539940834045, + 
"eval_valid_reconstruction/first_seq": 0.1660507619380951, + "eval_valid_reconstruction/last_seq": 0.31218770146369934, + "eval_valid_reconstruction/second_seq": 0.19652283191680908, + "eval_valid_runtime": 638.6973, + "eval_valid_samples_per_second": 0.301, + "eval_valid_steps_per_second": 0.301, + "step": 4000 + }, + { + "epoch": 0.014920585185350969, + "eval_train_loss": 2.2304697036743164, + "eval_train_loss/all": 2.061603546142578, + "eval_train_loss/end_span": 1.2776758670806885, + "eval_train_perplexity/batch": 7.8585615158081055, + "eval_train_perplexity/end_span": 3.5882904529571533, + "eval_train_perplexity/fim": 2.2365775108337402, + "eval_train_perplexity/first_seq": 15.648283004760742, + "eval_train_perplexity/last_seq": 9.465004920959473, + "eval_train_perplexity/second_seq": 14.230076789855957, + "eval_train_perplexity/seq": 9.048266410827637, + "eval_train_reconstruction/all": 0.27337732911109924, + "eval_train_reconstruction/end_span": 0.7027313113212585, + "eval_train_reconstruction/fim": 0.1532665193080902, + "eval_train_reconstruction/first_seq": 0.15014581382274628, + "eval_train_reconstruction/last_seq": 0.3111930191516876, + "eval_train_reconstruction/second_seq": 0.1813594251871109, + "eval_train_runtime": 642.5722, + "eval_train_samples_per_second": 0.299, + "eval_train_steps_per_second": 0.299, + "step": 4000 + }, + { + "epoch": 0.014957886648314347, + "grad_norm": 0.3505386412143707, + "learning_rate": 0.0006, + "loss": 2.2937, + "step": 4010 + }, + { + "epoch": 0.014995188111277724, + "grad_norm": 0.40146493911743164, + "learning_rate": 0.0006, + "loss": 2.3528, + "step": 4020 + }, + { + "epoch": 0.015032489574241102, + "grad_norm": 0.30962827801704407, + "learning_rate": 0.0006, + "loss": 2.323, + "step": 4030 + }, + { + "epoch": 0.01506979103720448, + "grad_norm": 0.29506611824035645, + "learning_rate": 0.0006, + "loss": 2.3811, + "step": 4040 + }, + { + "epoch": 0.015107092500167856, + "grad_norm": 0.42562124133110046, + 
"learning_rate": 0.0006, + "loss": 2.2249, + "step": 4050 + }, + { + "epoch": 0.015107092500167856, + "eval_valid_loss": 2.229008674621582, + "eval_valid_loss/all": 2.088139057159424, + "eval_valid_loss/end_span": 1.1980332136154175, + "eval_valid_perplexity/batch": 8.069883346557617, + "eval_valid_perplexity/end_span": 3.3135933876037598, + "eval_valid_perplexity/fim": 2.2218217849731445, + "eval_valid_perplexity/first_seq": 15.062554359436035, + "eval_valid_perplexity/last_seq": 9.355186462402344, + "eval_valid_perplexity/second_seq": 14.068581581115723, + "eval_valid_perplexity/seq": 9.088667869567871, + "eval_valid_reconstruction/all": 0.28433558344841003, + "eval_valid_reconstruction/end_span": 0.7211799621582031, + "eval_valid_reconstruction/fim": 0.1530834585428238, + "eval_valid_reconstruction/first_seq": 0.16627109050750732, + "eval_valid_reconstruction/last_seq": 0.31571686267852783, + "eval_valid_reconstruction/second_seq": 0.18965952098369598, + "eval_valid_runtime": 637.7344, + "eval_valid_samples_per_second": 0.301, + "eval_valid_steps_per_second": 0.301, + "step": 4050 + }, + { + "epoch": 0.015107092500167856, + "eval_train_loss": 2.225217580795288, + "eval_train_loss/all": 2.0568249225616455, + "eval_train_loss/end_span": 1.164444923400879, + "eval_train_perplexity/batch": 7.8210978507995605, + "eval_train_perplexity/end_span": 3.204143762588501, + "eval_train_perplexity/fim": 2.035727024078369, + "eval_train_perplexity/first_seq": 15.735671043395996, + "eval_train_perplexity/last_seq": 9.5162353515625, + "eval_train_perplexity/second_seq": 14.288728713989258, + "eval_train_perplexity/seq": 8.998577117919922, + "eval_train_reconstruction/all": 0.2750249207019806, + "eval_train_reconstruction/end_span": 0.732541024684906, + "eval_train_reconstruction/fim": 0.1367030292749405, + "eval_train_reconstruction/first_seq": 0.14871960878372192, + "eval_train_reconstruction/last_seq": 0.30687445402145386, + "eval_train_reconstruction/second_seq": 
0.18602043390274048, + "eval_train_runtime": 636.9638, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 4050 + }, + { + "epoch": 0.015144393963131234, + "grad_norm": 0.3367210626602173, + "learning_rate": 0.0006, + "loss": 2.2682, + "step": 4060 + }, + { + "epoch": 0.015181695426094611, + "grad_norm": 0.38677269220352173, + "learning_rate": 0.0006, + "loss": 2.373, + "step": 4070 + }, + { + "epoch": 0.015218996889057989, + "grad_norm": 0.49202895164489746, + "learning_rate": 0.0006, + "loss": 2.2026, + "step": 4080 + }, + { + "epoch": 0.015256298352021367, + "grad_norm": 0.4977208077907562, + "learning_rate": 0.0006, + "loss": 2.0848, + "step": 4090 + }, + { + "epoch": 0.015293599814984744, + "grad_norm": 0.3353670537471771, + "learning_rate": 0.0006, + "loss": 2.1671, + "step": 4100 + }, + { + "epoch": 0.015293599814984744, + "eval_valid_loss": 2.2337753772735596, + "eval_valid_loss/all": 2.092475414276123, + "eval_valid_loss/end_span": 1.3886467218399048, + "eval_valid_perplexity/batch": 8.10495376586914, + "eval_valid_perplexity/end_span": 4.009420394897461, + "eval_valid_perplexity/fim": 2.4165074825286865, + "eval_valid_perplexity/first_seq": 15.224752426147461, + "eval_valid_perplexity/last_seq": 9.119988441467285, + "eval_valid_perplexity/second_seq": 13.68887996673584, + "eval_valid_perplexity/seq": 9.13150691986084, + "eval_valid_reconstruction/all": 0.28309366106987, + "eval_valid_reconstruction/end_span": 0.6674948930740356, + "eval_valid_reconstruction/fim": 0.16740870475769043, + "eval_valid_reconstruction/first_seq": 0.1606975495815277, + "eval_valid_reconstruction/last_seq": 0.3231741786003113, + "eval_valid_reconstruction/second_seq": 0.19765295088291168, + "eval_valid_runtime": 639.7949, + "eval_valid_samples_per_second": 0.3, + "eval_valid_steps_per_second": 0.3, + "step": 4100 + }, + { + "epoch": 0.015293599814984744, + "eval_train_loss": 2.2306301593780518, + "eval_train_loss/all": 2.061248302459717, + 
"eval_train_loss/end_span": 1.361680507659912, + "eval_train_perplexity/batch": 7.855770111083984, + "eval_train_perplexity/end_span": 3.9027464389801025, + "eval_train_perplexity/fim": 2.0620005130767822, + "eval_train_perplexity/first_seq": 15.215110778808594, + "eval_train_perplexity/last_seq": 9.640843391418457, + "eval_train_perplexity/second_seq": 14.386277198791504, + "eval_train_perplexity/seq": 9.0402193069458, + "eval_train_reconstruction/all": 0.2736521065235138, + "eval_train_reconstruction/end_span": 0.6758408546447754, + "eval_train_reconstruction/fim": 0.13757145404815674, + "eval_train_reconstruction/first_seq": 0.15810851752758026, + "eval_train_reconstruction/last_seq": 0.30447259545326233, + "eval_train_reconstruction/second_seq": 0.1790093034505844, + "eval_train_runtime": 642.5033, + "eval_train_samples_per_second": 0.299, + "eval_train_steps_per_second": 0.299, + "step": 4100 + }, + { + "epoch": 0.01533090127794812, + "grad_norm": 0.4362899363040924, + "learning_rate": 0.0006, + "loss": 2.3137, + "step": 4110 + }, + { + "epoch": 0.015368202740911498, + "grad_norm": 0.4825122356414795, + "learning_rate": 0.0006, + "loss": 2.1358, + "step": 4120 + }, + { + "epoch": 0.015405504203874876, + "grad_norm": 0.3469723165035248, + "learning_rate": 0.0006, + "loss": 2.015, + "step": 4130 + }, + { + "epoch": 0.015442805666838253, + "grad_norm": 0.3038555085659027, + "learning_rate": 0.0006, + "loss": 2.3881, + "step": 4140 + }, + { + "epoch": 0.015480107129801631, + "grad_norm": 0.3069309890270233, + "learning_rate": 0.0006, + "loss": 2.1342, + "step": 4150 + }, + { + "epoch": 0.015480107129801631, + "eval_valid_loss": 2.2285637855529785, + "eval_valid_loss/all": 2.0874452590942383, + "eval_valid_loss/end_span": 1.3471710681915283, + "eval_valid_perplexity/batch": 8.064286231994629, + "eval_valid_perplexity/end_span": 3.8465285301208496, + "eval_valid_perplexity/fim": 2.274289131164551, + "eval_valid_perplexity/first_seq": 15.06218147277832, + 
"eval_valid_perplexity/last_seq": 9.218493461608887, + "eval_valid_perplexity/second_seq": 13.681949615478516, + "eval_valid_perplexity/seq": 9.081692695617676, + "eval_valid_reconstruction/all": 0.2840255796909332, + "eval_valid_reconstruction/end_span": 0.6782660484313965, + "eval_valid_reconstruction/fim": 0.15680131316184998, + "eval_valid_reconstruction/first_seq": 0.17008717358112335, + "eval_valid_reconstruction/last_seq": 0.32334455847740173, + "eval_valid_reconstruction/second_seq": 0.19521154463291168, + "eval_valid_runtime": 641.8172, + "eval_valid_samples_per_second": 0.299, + "eval_valid_steps_per_second": 0.299, + "step": 4150 + }, + { + "epoch": 0.015480107129801631, + "eval_train_loss": 2.226008653640747, + "eval_train_loss/all": 2.0572354793548584, + "eval_train_loss/end_span": 1.306489109992981, + "eval_train_perplexity/batch": 7.824309349060059, + "eval_train_perplexity/end_span": 3.6931846141815186, + "eval_train_perplexity/fim": 2.145888328552246, + "eval_train_perplexity/first_seq": 15.419260025024414, + "eval_train_perplexity/last_seq": 9.72305965423584, + "eval_train_perplexity/second_seq": 13.933116912841797, + "eval_train_perplexity/seq": 9.002638816833496, + "eval_train_reconstruction/all": 0.2744427025318146, + "eval_train_reconstruction/end_span": 0.6916390657424927, + "eval_train_reconstruction/fim": 0.14590397477149963, + "eval_train_reconstruction/first_seq": 0.15221448242664337, + "eval_train_reconstruction/last_seq": 0.2987931966781616, + "eval_train_reconstruction/second_seq": 0.19087396562099457, + "eval_train_runtime": 639.9713, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 4150 + }, + { + "epoch": 0.015517408592765009, + "grad_norm": 0.40021786093711853, + "learning_rate": 0.0006, + "loss": 2.0418, + "step": 4160 + }, + { + "epoch": 0.015554710055728385, + "grad_norm": 0.5722165703773499, + "learning_rate": 0.0006, + "loss": 2.3013, + "step": 4170 + }, + { + "epoch": 
0.015592011518691763, + "grad_norm": 0.39669522643089294, + "learning_rate": 0.0006, + "loss": 2.3077, + "step": 4180 + }, + { + "epoch": 0.015629312981655142, + "grad_norm": 0.37049970030784607, + "learning_rate": 0.0006, + "loss": 2.1828, + "step": 4190 + }, + { + "epoch": 0.015666614444618518, + "grad_norm": 0.38377901911735535, + "learning_rate": 0.0006, + "loss": 2.1878, + "step": 4200 + }, + { + "epoch": 0.015666614444618518, + "eval_valid_loss": 2.231642007827759, + "eval_valid_loss/all": 2.0903799533843994, + "eval_valid_loss/end_span": 1.296165108680725, + "eval_valid_perplexity/batch": 8.087987899780273, + "eval_valid_perplexity/end_span": 3.65525221824646, + "eval_valid_perplexity/fim": 2.427898406982422, + "eval_valid_perplexity/first_seq": 14.996628761291504, + "eval_valid_perplexity/last_seq": 9.056838035583496, + "eval_valid_perplexity/second_seq": 13.57192611694336, + "eval_valid_perplexity/seq": 9.111409187316895, + "eval_valid_reconstruction/all": 0.2838113605976105, + "eval_valid_reconstruction/end_span": 0.6994197964668274, + "eval_valid_reconstruction/fim": 0.16920991241931915, + "eval_valid_reconstruction/first_seq": 0.16519396007061005, + "eval_valid_reconstruction/last_seq": 0.32794129848480225, + "eval_valid_reconstruction/second_seq": 0.2034221887588501, + "eval_valid_runtime": 643.7541, + "eval_valid_samples_per_second": 0.298, + "eval_valid_steps_per_second": 0.298, + "step": 4200 + }, + { + "epoch": 0.015666614444618518, + "eval_train_loss": 2.226926326751709, + "eval_train_loss/all": 2.0582010746002197, + "eval_train_loss/end_span": 1.2736027240753174, + "eval_train_perplexity/batch": 7.8318681716918945, + "eval_train_perplexity/end_span": 3.573704481124878, + "eval_train_perplexity/fim": 1.992614507675171, + "eval_train_perplexity/first_seq": 15.250887870788574, + "eval_train_perplexity/last_seq": 9.467254638671875, + "eval_train_perplexity/second_seq": 14.68325138092041, + "eval_train_perplexity/seq": 9.010746002197266, + 
"eval_train_reconstruction/all": 0.2745562195777893, + "eval_train_reconstruction/end_span": 0.7098854184150696, + "eval_train_reconstruction/fim": 0.1314917802810669, + "eval_train_reconstruction/first_seq": 0.1552983820438385, + "eval_train_reconstruction/last_seq": 0.31215226650238037, + "eval_train_reconstruction/second_seq": 0.1712840348482132, + "eval_train_runtime": 646.1219, + "eval_train_samples_per_second": 0.297, + "eval_train_steps_per_second": 0.297, + "step": 4200 + }, + { + "epoch": 0.015703915907581894, + "grad_norm": 0.28604018688201904, + "learning_rate": 0.0006, + "loss": 2.2906, + "step": 4210 + }, + { + "epoch": 0.015741217370545273, + "grad_norm": 0.35277000069618225, + "learning_rate": 0.0006, + "loss": 2.2941, + "step": 4220 + }, + { + "epoch": 0.01577851883350865, + "grad_norm": 0.27277716994285583, + "learning_rate": 0.0006, + "loss": 2.2433, + "step": 4230 + }, + { + "epoch": 0.01581582029647203, + "grad_norm": 0.5680980682373047, + "learning_rate": 0.0006, + "loss": 2.338, + "step": 4240 + }, + { + "epoch": 0.015853121759435405, + "grad_norm": 0.3664103150367737, + "learning_rate": 0.0006, + "loss": 2.3316, + "step": 4250 + }, + { + "epoch": 0.015853121759435405, + "eval_valid_loss": 2.2275009155273438, + "eval_valid_loss/all": 2.0864100456237793, + "eval_valid_loss/end_span": 1.3671163320541382, + "eval_valid_perplexity/batch": 8.05594253540039, + "eval_valid_perplexity/end_span": 3.9240188598632812, + "eval_valid_perplexity/fim": 2.29783296585083, + "eval_valid_perplexity/first_seq": 14.809700965881348, + "eval_valid_perplexity/last_seq": 9.420282363891602, + "eval_valid_perplexity/second_seq": 13.480561256408691, + "eval_valid_perplexity/seq": 9.06992244720459, + "eval_valid_reconstruction/all": 0.28447970747947693, + "eval_valid_reconstruction/end_span": 0.6755506992340088, + "eval_valid_reconstruction/fim": 0.15873955190181732, + "eval_valid_reconstruction/first_seq": 0.173526331782341, + "eval_valid_reconstruction/last_seq": 
0.31024158000946045, + "eval_valid_reconstruction/second_seq": 0.20192575454711914, + "eval_valid_runtime": 643.3823, + "eval_valid_samples_per_second": 0.298, + "eval_valid_steps_per_second": 0.298, + "step": 4250 + }, + { + "epoch": 0.015853121759435405, + "eval_train_loss": 2.225499391555786, + "eval_train_loss/all": 2.056792736053467, + "eval_train_loss/end_span": 1.3396373987197876, + "eval_train_perplexity/batch": 7.820846080780029, + "eval_train_perplexity/end_span": 3.8176589012145996, + "eval_train_perplexity/fim": 2.0777485370635986, + "eval_train_perplexity/first_seq": 15.116822242736816, + "eval_train_perplexity/last_seq": 9.274219512939453, + "eval_train_perplexity/second_seq": 14.156624794006348, + "eval_train_perplexity/seq": 8.996254920959473, + "eval_train_reconstruction/all": 0.27484387159347534, + "eval_train_reconstruction/end_span": 0.6877486109733582, + "eval_train_reconstruction/fim": 0.13991665840148926, + "eval_train_reconstruction/first_seq": 0.16244418919086456, + "eval_train_reconstruction/last_seq": 0.31516164541244507, + "eval_train_reconstruction/second_seq": 0.18291187286376953, + "eval_train_runtime": 638.2235, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 4250 + }, + { + "epoch": 0.015890423222398784, + "grad_norm": 0.3918021321296692, + "learning_rate": 0.0006, + "loss": 2.318, + "step": 4260 + }, + { + "epoch": 0.01592772468536216, + "grad_norm": 0.48994916677474976, + "learning_rate": 0.0006, + "loss": 2.3702, + "step": 4270 + }, + { + "epoch": 0.015965026148325536, + "grad_norm": 0.3902009427547455, + "learning_rate": 0.0006, + "loss": 2.2637, + "step": 4280 + }, + { + "epoch": 0.016002327611288916, + "grad_norm": 0.3831264078617096, + "learning_rate": 0.0006, + "loss": 2.2911, + "step": 4290 + }, + { + "epoch": 0.01603962907425229, + "grad_norm": 0.38734757900238037, + "learning_rate": 0.0006, + "loss": 2.4573, + "step": 4300 + }, + { + "epoch": 0.01603962907425229, + 
"eval_valid_loss": 2.237391710281372, + "eval_valid_loss/all": 2.095573902130127, + "eval_valid_loss/end_span": 1.3734709024429321, + "eval_valid_perplexity/batch": 8.130105972290039, + "eval_valid_perplexity/end_span": 3.949033737182617, + "eval_valid_perplexity/fim": 2.2237560749053955, + "eval_valid_perplexity/first_seq": 15.29196548461914, + "eval_valid_perplexity/last_seq": 9.07364273071289, + "eval_valid_perplexity/second_seq": 13.210837364196777, + "eval_valid_perplexity/seq": 9.160521507263184, + "eval_valid_reconstruction/all": 0.2819597125053406, + "eval_valid_reconstruction/end_span": 0.6738675236701965, + "eval_valid_reconstruction/fim": 0.15125274658203125, + "eval_valid_reconstruction/first_seq": 0.15784184634685516, + "eval_valid_reconstruction/last_seq": 0.3248141407966614, + "eval_valid_reconstruction/second_seq": 0.20915111899375916, + "eval_valid_runtime": 646.9076, + "eval_valid_samples_per_second": 0.297, + "eval_valid_steps_per_second": 0.297, + "step": 4300 + }, + { + "epoch": 0.01603962907425229, + "eval_train_loss": 2.2294833660125732, + "eval_train_loss/all": 2.0596415996551514, + "eval_train_loss/end_span": 1.3577324151992798, + "eval_train_perplexity/batch": 7.84315824508667, + "eval_train_perplexity/end_span": 3.8873684406280518, + "eval_train_perplexity/fim": 2.005563259124756, + "eval_train_perplexity/first_seq": 15.636758804321289, + "eval_train_perplexity/last_seq": 9.627945899963379, + "eval_train_perplexity/second_seq": 14.234481811523438, + "eval_train_perplexity/seq": 9.019646644592285, + "eval_train_reconstruction/all": 0.27406027913093567, + "eval_train_reconstruction/end_span": 0.6805217862129211, + "eval_train_reconstruction/fim": 0.13376228511333466, + "eval_train_reconstruction/first_seq": 0.15122300386428833, + "eval_train_reconstruction/last_seq": 0.30238237977027893, + "eval_train_reconstruction/second_seq": 0.18701811134815216, + "eval_train_runtime": 640.3735, + "eval_train_samples_per_second": 0.3, + 
"eval_train_steps_per_second": 0.3, + "step": 4300 + }, + { + "epoch": 0.01607693053721567, + "grad_norm": 0.5173091292381287, + "learning_rate": 0.0006, + "loss": 2.0407, + "step": 4310 + }, + { + "epoch": 0.016114232000179047, + "grad_norm": 0.5938049554824829, + "learning_rate": 0.0006, + "loss": 2.2905, + "step": 4320 + }, + { + "epoch": 0.016151533463142423, + "grad_norm": 0.4561450779438019, + "learning_rate": 0.0006, + "loss": 2.203, + "step": 4330 + }, + { + "epoch": 0.016188834926105802, + "grad_norm": 0.35070788860321045, + "learning_rate": 0.0006, + "loss": 2.3348, + "step": 4340 + }, + { + "epoch": 0.01622613638906918, + "grad_norm": 0.3673841953277588, + "learning_rate": 0.0006, + "loss": 2.2997, + "step": 4350 + }, + { + "epoch": 0.01622613638906918, + "eval_valid_loss": 2.2338688373565674, + "eval_valid_loss/all": 2.092867374420166, + "eval_valid_loss/end_span": 1.3542094230651855, + "eval_valid_perplexity/batch": 8.10813045501709, + "eval_valid_perplexity/end_span": 3.873697280883789, + "eval_valid_perplexity/fim": 2.4586172103881836, + "eval_valid_perplexity/first_seq": 15.005348205566406, + "eval_valid_perplexity/last_seq": 9.13229751586914, + "eval_valid_perplexity/second_seq": 14.111308097839355, + "eval_valid_perplexity/seq": 9.142011642456055, + "eval_valid_reconstruction/all": 0.2828872799873352, + "eval_valid_reconstruction/end_span": 0.6835617423057556, + "eval_valid_reconstruction/fim": 0.17158667743206024, + "eval_valid_reconstruction/first_seq": 0.1656300127506256, + "eval_valid_reconstruction/last_seq": 0.32091590762138367, + "eval_valid_reconstruction/second_seq": 0.18932481110095978, + "eval_valid_runtime": 645.0578, + "eval_valid_samples_per_second": 0.298, + "eval_valid_steps_per_second": 0.298, + "step": 4350 + }, + { + "epoch": 0.01622613638906918, + "eval_train_loss": 2.2306272983551025, + "eval_train_loss/all": 2.061833143234253, + "eval_train_loss/end_span": 1.324395775794983, + "eval_train_perplexity/batch": 7.860365867614746, 
+ "eval_train_perplexity/end_span": 3.7599127292633057, + "eval_train_perplexity/fim": 2.1137807369232178, + "eval_train_perplexity/first_seq": 15.348804473876953, + "eval_train_perplexity/last_seq": 9.031394004821777, + "eval_train_perplexity/second_seq": 14.267581939697266, + "eval_train_perplexity/seq": 9.04806137084961, + "eval_train_reconstruction/all": 0.2737312912940979, + "eval_train_reconstruction/end_span": 0.6962027549743652, + "eval_train_reconstruction/fim": 0.14268538355827332, + "eval_train_reconstruction/first_seq": 0.15831315517425537, + "eval_train_reconstruction/last_seq": 0.3263493478298187, + "eval_train_reconstruction/second_seq": 0.1866607964038849, + "eval_train_runtime": 645.0675, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 4350 + }, + { + "epoch": 0.016263437852032558, + "grad_norm": 0.5385019779205322, + "learning_rate": 0.0006, + "loss": 2.1916, + "step": 4360 + }, + { + "epoch": 0.016300739314995934, + "grad_norm": 0.35015472769737244, + "learning_rate": 0.0006, + "loss": 2.3433, + "step": 4370 + }, + { + "epoch": 0.016338040777959313, + "grad_norm": 0.33204588294029236, + "learning_rate": 0.0006, + "loss": 2.2293, + "step": 4380 + }, + { + "epoch": 0.01637534224092269, + "grad_norm": 0.5091801881790161, + "learning_rate": 0.0006, + "loss": 2.3352, + "step": 4390 + }, + { + "epoch": 0.016412643703886065, + "grad_norm": 0.4651685655117035, + "learning_rate": 0.0006, + "loss": 2.2071, + "step": 4400 + }, + { + "epoch": 0.016412643703886065, + "eval_valid_loss": 2.232802629470825, + "eval_valid_loss/all": 2.091840982437134, + "eval_valid_loss/end_span": 1.414384365081787, + "eval_valid_perplexity/batch": 8.099813461303711, + "eval_valid_perplexity/end_span": 4.113953113555908, + "eval_valid_perplexity/fim": 2.5655364990234375, + "eval_valid_perplexity/first_seq": 14.430588722229004, + "eval_valid_perplexity/last_seq": 9.09383773803711, + "eval_valid_perplexity/second_seq": 13.830131530761719, 
+ "eval_valid_perplexity/seq": 9.130921363830566, + "eval_valid_reconstruction/all": 0.2831990420818329, + "eval_valid_reconstruction/end_span": 0.663615882396698, + "eval_valid_reconstruction/fim": 0.17781421542167664, + "eval_valid_reconstruction/first_seq": 0.17768874764442444, + "eval_valid_reconstruction/last_seq": 0.3243776559829712, + "eval_valid_reconstruction/second_seq": 0.19129416346549988, + "eval_valid_runtime": 641.7219, + "eval_valid_samples_per_second": 0.299, + "eval_valid_steps_per_second": 0.299, + "step": 4400 + }, + { + "epoch": 0.016412643703886065, + "eval_train_loss": 2.2268245220184326, + "eval_train_loss/all": 2.058184862136841, + "eval_train_loss/end_span": 1.38126802444458, + "eval_train_perplexity/batch": 7.8317413330078125, + "eval_train_perplexity/end_span": 3.979945182800293, + "eval_train_perplexity/fim": 2.3571577072143555, + "eval_train_perplexity/first_seq": 15.262080192565918, + "eval_train_perplexity/last_seq": 9.21839714050293, + "eval_train_perplexity/second_seq": 14.361342430114746, + "eval_train_perplexity/seq": 9.015708923339844, + "eval_train_reconstruction/all": 0.2745019495487213, + "eval_train_reconstruction/end_span": 0.674477219581604, + "eval_train_reconstruction/fim": 0.1630319058895111, + "eval_train_reconstruction/first_seq": 0.15727181732654572, + "eval_train_reconstruction/last_seq": 0.3180443048477173, + "eval_train_reconstruction/second_seq": 0.18241256475448608, + "eval_train_runtime": 637.8552, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 4400 + }, + { + "epoch": 0.016449945166849445, + "grad_norm": 0.29968419671058655, + "learning_rate": 0.0006, + "loss": 2.2513, + "step": 4410 + }, + { + "epoch": 0.01648724662981282, + "grad_norm": 0.30826643109321594, + "learning_rate": 0.0006, + "loss": 2.2744, + "step": 4420 + }, + { + "epoch": 0.0165245480927762, + "grad_norm": 0.5630642771720886, + "learning_rate": 0.0006, + "loss": 2.3825, + "step": 4430 + }, + { + 
"epoch": 0.016561849555739576, + "grad_norm": 0.521980345249176, + "learning_rate": 0.0006, + "loss": 2.1731, + "step": 4440 + }, + { + "epoch": 0.016599151018702952, + "grad_norm": 0.4145258963108063, + "learning_rate": 0.0006, + "loss": 2.1231, + "step": 4450 + }, + { + "epoch": 0.016599151018702952, + "eval_valid_loss": 2.2287673950195312, + "eval_valid_loss/all": 2.0876877307891846, + "eval_valid_loss/end_span": 1.1896390914916992, + "eval_valid_perplexity/batch": 8.066242218017578, + "eval_valid_perplexity/end_span": 3.2858951091766357, + "eval_valid_perplexity/fim": 2.2246546745300293, + "eval_valid_perplexity/first_seq": 14.559488296508789, + "eval_valid_perplexity/last_seq": 9.416278839111328, + "eval_valid_perplexity/second_seq": 13.793615341186523, + "eval_valid_perplexity/seq": 9.087289810180664, + "eval_valid_reconstruction/all": 0.2841479480266571, + "eval_valid_reconstruction/end_span": 0.7208221554756165, + "eval_valid_reconstruction/fim": 0.15246246755123138, + "eval_valid_reconstruction/first_seq": 0.17422667145729065, + "eval_valid_reconstruction/last_seq": 0.31237316131591797, + "eval_valid_reconstruction/second_seq": 0.19583648443222046, + "eval_valid_runtime": 625.7404, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 4450 + }, + { + "epoch": 0.016599151018702952, + "eval_train_loss": 2.224670648574829, + "eval_train_loss/all": 2.0561766624450684, + "eval_train_loss/end_span": 1.1563949584960938, + "eval_train_perplexity/batch": 7.816029071807861, + "eval_train_perplexity/end_span": 3.1784541606903076, + "eval_train_perplexity/fim": 2.1035706996917725, + "eval_train_perplexity/first_seq": 15.788963317871094, + "eval_train_perplexity/last_seq": 9.08879566192627, + "eval_train_perplexity/second_seq": 14.099090576171875, + "eval_train_perplexity/seq": 8.992533683776855, + "eval_train_reconstruction/all": 0.2748214602470398, + "eval_train_reconstruction/end_span": 0.7327064275741577, + 
"eval_train_reconstruction/fim": 0.14174334704875946, + "eval_train_reconstruction/first_seq": 0.14901228249073029, + "eval_train_reconstruction/last_seq": 0.31947702169418335, + "eval_train_reconstruction/second_seq": 0.18845418095588684, + "eval_train_runtime": 620.8423, + "eval_train_samples_per_second": 0.309, + "eval_train_steps_per_second": 0.309, + "step": 4450 + }, + { + "epoch": 0.01663645248166633, + "grad_norm": 0.2748422920703888, + "learning_rate": 0.0006, + "loss": 2.2575, + "step": 4460 + }, + { + "epoch": 0.016673753944629707, + "grad_norm": 0.3697293996810913, + "learning_rate": 0.0006, + "loss": 2.395, + "step": 4470 + }, + { + "epoch": 0.016711055407593087, + "grad_norm": 0.6534759998321533, + "learning_rate": 0.0006, + "loss": 2.2079, + "step": 4480 + }, + { + "epoch": 0.016748356870556463, + "grad_norm": 0.2730923891067505, + "learning_rate": 0.0006, + "loss": 2.1771, + "step": 4490 + }, + { + "epoch": 0.016785658333519842, + "grad_norm": 0.5567862391471863, + "learning_rate": 0.0006, + "loss": 2.2502, + "step": 4500 + }, + { + "epoch": 0.016785658333519842, + "eval_valid_loss": 2.2320494651794434, + "eval_valid_loss/all": 2.0902481079101562, + "eval_valid_loss/end_span": 1.4285364151000977, + "eval_valid_perplexity/batch": 8.086921691894531, + "eval_valid_perplexity/end_span": 4.172587871551514, + "eval_valid_perplexity/fim": 2.2679977416992188, + "eval_valid_perplexity/first_seq": 14.948437690734863, + "eval_valid_perplexity/last_seq": 9.353021621704102, + "eval_valid_perplexity/second_seq": 14.081698417663574, + "eval_valid_perplexity/seq": 9.106839179992676, + "eval_valid_reconstruction/all": 0.28340572118759155, + "eval_valid_reconstruction/end_span": 0.6676019430160522, + "eval_valid_reconstruction/fim": 0.15587952733039856, + "eval_valid_reconstruction/first_seq": 0.16513970494270325, + "eval_valid_reconstruction/last_seq": 0.3146131932735443, + "eval_valid_reconstruction/second_seq": 0.18892312049865723, + "eval_valid_runtime": 624.716, 
+ "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 4500 + }, + { + "epoch": 0.016785658333519842, + "eval_train_loss": 2.2288951873779297, + "eval_train_loss/all": 2.0594778060913086, + "eval_train_loss/end_span": 1.3987557888031006, + "eval_train_perplexity/batch": 7.841873645782471, + "eval_train_perplexity/end_span": 4.05015754699707, + "eval_train_perplexity/fim": 1.9879062175750732, + "eval_train_perplexity/first_seq": 15.405256271362305, + "eval_train_perplexity/last_seq": 9.04935073852539, + "eval_train_perplexity/second_seq": 13.971181869506836, + "eval_train_perplexity/seq": 9.018119812011719, + "eval_train_reconstruction/all": 0.27401724457740784, + "eval_train_reconstruction/end_span": 0.679174542427063, + "eval_train_reconstruction/fim": 0.13108481466770172, + "eval_train_reconstruction/first_seq": 0.15193718671798706, + "eval_train_reconstruction/last_seq": 0.3231136202812195, + "eval_train_reconstruction/second_seq": 0.18610884249210358, + "eval_train_runtime": 616.7494, + "eval_train_samples_per_second": 0.311, + "eval_train_steps_per_second": 0.311, + "step": 4500 + }, + { + "epoch": 0.016822959796483218, + "grad_norm": 0.34820616245269775, + "learning_rate": 0.0006, + "loss": 2.2156, + "step": 4510 + }, + { + "epoch": 0.016860261259446594, + "grad_norm": 0.5130972266197205, + "learning_rate": 0.0006, + "loss": 2.3034, + "step": 4520 + }, + { + "epoch": 0.016897562722409974, + "grad_norm": 0.38629448413848877, + "learning_rate": 0.0006, + "loss": 2.1689, + "step": 4530 + }, + { + "epoch": 0.01693486418537335, + "grad_norm": 0.6877207159996033, + "learning_rate": 0.0006, + "loss": 2.1155, + "step": 4540 + }, + { + "epoch": 0.01697216564833673, + "grad_norm": 0.34946903586387634, + "learning_rate": 0.0006, + "loss": 2.2151, + "step": 4550 + }, + { + "epoch": 0.01697216564833673, + "eval_valid_loss": 2.227294921875, + "eval_valid_loss/all": 2.0863304138183594, + "eval_valid_loss/end_span": 1.2908527851104736, + 
"eval_valid_perplexity/batch": 8.055301666259766, + "eval_valid_perplexity/end_span": 3.6358859539031982, + "eval_valid_perplexity/fim": 2.6039326190948486, + "eval_valid_perplexity/first_seq": 14.73962688446045, + "eval_valid_perplexity/last_seq": 9.656063079833984, + "eval_valid_perplexity/second_seq": 13.929828643798828, + "eval_valid_perplexity/seq": 9.07287883758545, + "eval_valid_reconstruction/all": 0.2848661243915558, + "eval_valid_reconstruction/end_span": 0.6976925134658813, + "eval_valid_reconstruction/fim": 0.18250826001167297, + "eval_valid_reconstruction/first_seq": 0.1721092164516449, + "eval_valid_reconstruction/last_seq": 0.3052064776420593, + "eval_valid_reconstruction/second_seq": 0.192360982298851, + "eval_valid_runtime": 610.2694, + "eval_valid_samples_per_second": 0.315, + "eval_valid_steps_per_second": 0.315, + "step": 4550 + }, + { + "epoch": 0.01697216564833673, + "eval_train_loss": 2.2243268489837646, + "eval_train_loss/all": 2.0559558868408203, + "eval_train_loss/end_span": 1.2465910911560059, + "eval_train_perplexity/batch": 7.814303874969482, + "eval_train_perplexity/end_span": 3.4784648418426514, + "eval_train_perplexity/fim": 1.9555267095565796, + "eval_train_perplexity/first_seq": 15.51515007019043, + "eval_train_perplexity/last_seq": 9.298136711120605, + "eval_train_perplexity/second_seq": 13.940802574157715, + "eval_train_perplexity/seq": 8.99685001373291, + "eval_train_reconstruction/all": 0.2751784324645996, + "eval_train_reconstruction/end_span": 0.7124889492988586, + "eval_train_reconstruction/fim": 0.12813672423362732, + "eval_train_reconstruction/first_seq": 0.1517573595046997, + "eval_train_reconstruction/last_seq": 0.31476712226867676, + "eval_train_reconstruction/second_seq": 0.19200605154037476, + "eval_train_runtime": 619.9464, + "eval_train_samples_per_second": 0.31, + "eval_train_steps_per_second": 0.31, + "step": 4550 + }, + { + "epoch": 0.017009467111300105, + "grad_norm": 0.4107982814311981, + "learning_rate": 
0.0006, + "loss": 2.388, + "step": 4560 + }, + { + "epoch": 0.017046768574263484, + "grad_norm": 0.5851079821586609, + "learning_rate": 0.0006, + "loss": 2.3329, + "step": 4570 + }, + { + "epoch": 0.01708407003722686, + "grad_norm": 0.44056159257888794, + "learning_rate": 0.0006, + "loss": 2.2729, + "step": 4580 + }, + { + "epoch": 0.017121371500190236, + "grad_norm": 0.2878628969192505, + "learning_rate": 0.0006, + "loss": 2.1913, + "step": 4590 + }, + { + "epoch": 0.017158672963153616, + "grad_norm": 0.4410535395145416, + "learning_rate": 0.0006, + "loss": 2.1664, + "step": 4600 + }, + { + "epoch": 0.017158672963153616, + "eval_valid_loss": 2.2312333583831787, + "eval_valid_loss/all": 2.089841842651367, + "eval_valid_loss/end_span": 1.294942021369934, + "eval_valid_perplexity/batch": 8.083636283874512, + "eval_valid_perplexity/end_span": 3.6507842540740967, + "eval_valid_perplexity/fim": 2.4358794689178467, + "eval_valid_perplexity/first_seq": 15.093009948730469, + "eval_valid_perplexity/last_seq": 9.181614875793457, + "eval_valid_perplexity/second_seq": 14.082353591918945, + "eval_valid_perplexity/seq": 9.102770805358887, + "eval_valid_reconstruction/all": 0.2836804687976837, + "eval_valid_reconstruction/end_span": 0.6902173757553101, + "eval_valid_reconstruction/fim": 0.1697360873222351, + "eval_valid_reconstruction/first_seq": 0.16503621637821198, + "eval_valid_reconstruction/last_seq": 0.3204755187034607, + "eval_valid_reconstruction/second_seq": 0.1893726885318756, + "eval_valid_runtime": 616.2512, + "eval_valid_samples_per_second": 0.312, + "eval_valid_steps_per_second": 0.312, + "step": 4600 + }, + { + "epoch": 0.017158672963153616, + "eval_train_loss": 2.2272768020629883, + "eval_train_loss/all": 2.0582098960876465, + "eval_train_loss/end_span": 1.266724705696106, + "eval_train_perplexity/batch": 7.831937313079834, + "eval_train_perplexity/end_span": 3.549208879470825, + "eval_train_perplexity/fim": 2.3434572219848633, + "eval_train_perplexity/first_seq": 
15.37260627746582, + "eval_train_perplexity/last_seq": 9.480881690979004, + "eval_train_perplexity/second_seq": 14.350910186767578, + "eval_train_perplexity/seq": 9.013201713562012, + "eval_train_reconstruction/all": 0.27437663078308105, + "eval_train_reconstruction/end_span": 0.6992981433868408, + "eval_train_reconstruction/fim": 0.16213542222976685, + "eval_train_reconstruction/first_seq": 0.15416370332241058, + "eval_train_reconstruction/last_seq": 0.30925193428993225, + "eval_train_reconstruction/second_seq": 0.18082047998905182, + "eval_train_runtime": 613.4386, + "eval_train_samples_per_second": 0.313, + "eval_train_steps_per_second": 0.313, + "step": 4600 + }, + { + "epoch": 0.01719597442611699, + "grad_norm": 0.2552168369293213, + "learning_rate": 0.0006, + "loss": 2.3134, + "step": 4610 + }, + { + "epoch": 0.01723327588908037, + "grad_norm": 0.341110497713089, + "learning_rate": 0.0006, + "loss": 2.3755, + "step": 4620 + }, + { + "epoch": 0.017270577352043747, + "grad_norm": 0.4102616310119629, + "learning_rate": 0.0006, + "loss": 2.3535, + "step": 4630 + }, + { + "epoch": 0.017307878815007123, + "grad_norm": 0.5866985321044922, + "learning_rate": 0.0006, + "loss": 2.1891, + "step": 4640 + }, + { + "epoch": 0.017345180277970502, + "grad_norm": 0.8637084364891052, + "learning_rate": 0.0006, + "loss": 2.1715, + "step": 4650 + }, + { + "epoch": 0.017345180277970502, + "eval_valid_loss": 2.2300479412078857, + "eval_valid_loss/all": 2.0893802642822266, + "eval_valid_loss/end_span": 1.223044514656067, + "eval_valid_perplexity/batch": 8.079906463623047, + "eval_valid_perplexity/end_span": 3.3975157737731934, + "eval_valid_perplexity/fim": 2.153467893600464, + "eval_valid_perplexity/first_seq": 15.019768714904785, + "eval_valid_perplexity/last_seq": 9.262357711791992, + "eval_valid_perplexity/second_seq": 13.898820877075195, + "eval_valid_perplexity/seq": 9.09907054901123, + "eval_valid_reconstruction/all": 0.2836824357509613, + 
"eval_valid_reconstruction/end_span": 0.7106736898422241, + "eval_valid_reconstruction/fim": 0.14739897847175598, + "eval_valid_reconstruction/first_seq": 0.1638099104166031, + "eval_valid_reconstruction/last_seq": 0.3192760646343231, + "eval_valid_reconstruction/second_seq": 0.1901453137397766, + "eval_valid_runtime": 624.5766, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 4650 + }, + { + "epoch": 0.017345180277970502, + "eval_train_loss": 2.2243154048919678, + "eval_train_loss/all": 2.055288076400757, + "eval_train_loss/end_span": 1.1992385387420654, + "eval_train_perplexity/batch": 7.80908727645874, + "eval_train_perplexity/end_span": 3.31758975982666, + "eval_train_perplexity/fim": 2.142611265182495, + "eval_train_perplexity/first_seq": 15.577479362487793, + "eval_train_perplexity/last_seq": 8.98991870880127, + "eval_train_perplexity/second_seq": 14.30414867401123, + "eval_train_perplexity/seq": 8.973538398742676, + "eval_train_reconstruction/all": 0.27523940801620483, + "eval_train_reconstruction/end_span": 0.7202255725860596, + "eval_train_reconstruction/fim": 0.14689932763576508, + "eval_train_reconstruction/first_seq": 0.1491987109184265, + "eval_train_reconstruction/last_seq": 0.32478317618370056, + "eval_train_reconstruction/second_seq": 0.18278232216835022, + "eval_train_runtime": 631.1169, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 4650 + }, + { + "epoch": 0.01738248174093388, + "grad_norm": 0.4004186689853668, + "learning_rate": 0.0006, + "loss": 2.0332, + "step": 4660 + }, + { + "epoch": 0.017419783203897258, + "grad_norm": 0.521919846534729, + "learning_rate": 0.0006, + "loss": 2.3446, + "step": 4670 + }, + { + "epoch": 0.017457084666860634, + "grad_norm": 0.5244372487068176, + "learning_rate": 0.0006, + "loss": 2.2136, + "step": 4680 + }, + { + "epoch": 0.017494386129824013, + "grad_norm": 0.9562855958938599, + "learning_rate": 0.0006, + "loss": 2.2263, + 
"step": 4690 + }, + { + "epoch": 0.01753168759278739, + "grad_norm": 0.4678341746330261, + "learning_rate": 0.0006, + "loss": 2.2703, + "step": 4700 + }, + { + "epoch": 0.01753168759278739, + "eval_valid_loss": 2.2364864349365234, + "eval_valid_loss/all": 2.0946850776672363, + "eval_valid_loss/end_span": 1.489964485168457, + "eval_valid_perplexity/batch": 8.122882843017578, + "eval_valid_perplexity/end_span": 4.4369378089904785, + "eval_valid_perplexity/fim": 2.3065614700317383, + "eval_valid_perplexity/first_seq": 14.54867935180664, + "eval_valid_perplexity/last_seq": 9.590672492980957, + "eval_valid_perplexity/second_seq": 13.692773818969727, + "eval_valid_perplexity/seq": 9.150603294372559, + "eval_valid_reconstruction/all": 0.28235524892807007, + "eval_valid_reconstruction/end_span": 0.6506796479225159, + "eval_valid_reconstruction/fim": 0.1580173820257187, + "eval_valid_reconstruction/first_seq": 0.17667251825332642, + "eval_valid_reconstruction/last_seq": 0.3088613450527191, + "eval_valid_reconstruction/second_seq": 0.1988171935081482, + "eval_valid_runtime": 612.5347, + "eval_valid_samples_per_second": 0.313, + "eval_valid_steps_per_second": 0.313, + "step": 4700 + }, + { + "epoch": 0.01753168759278739, + "eval_train_loss": 2.2346560955047607, + "eval_train_loss/all": 2.0648529529571533, + "eval_train_loss/end_span": 1.4666569232940674, + "eval_train_perplexity/batch": 7.884138584136963, + "eval_train_perplexity/end_span": 4.334719657897949, + "eval_train_perplexity/fim": 2.235015392303467, + "eval_train_perplexity/first_seq": 15.347168922424316, + "eval_train_perplexity/last_seq": 9.096695899963379, + "eval_train_perplexity/second_seq": 14.150089263916016, + "eval_train_perplexity/seq": 9.076361656188965, + "eval_train_reconstruction/all": 0.2724628448486328, + "eval_train_reconstruction/end_span": 0.6562486290931702, + "eval_train_reconstruction/fim": 0.1532498300075531, + "eval_train_reconstruction/first_seq": 0.1551852971315384, + 
"eval_train_reconstruction/last_seq": 0.3229004144668579, + "eval_train_reconstruction/second_seq": 0.18329092860221863, + "eval_train_runtime": 605.1426, + "eval_train_samples_per_second": 0.317, + "eval_train_steps_per_second": 0.317, + "step": 4700 + }, + { + "epoch": 0.017568989055750765, + "grad_norm": 0.4017414450645447, + "learning_rate": 0.0006, + "loss": 2.3058, + "step": 4710 + }, + { + "epoch": 0.017606290518714145, + "grad_norm": 0.604438066482544, + "learning_rate": 0.0006, + "loss": 2.052, + "step": 4720 + }, + { + "epoch": 0.01764359198167752, + "grad_norm": 0.42955106496810913, + "learning_rate": 0.0006, + "loss": 2.1686, + "step": 4730 + }, + { + "epoch": 0.0176808934446409, + "grad_norm": 0.3129159212112427, + "learning_rate": 0.0006, + "loss": 2.1858, + "step": 4740 + }, + { + "epoch": 0.017718194907604276, + "grad_norm": 0.5254055261611938, + "learning_rate": 0.0006, + "loss": 2.2804, + "step": 4750 + }, + { + "epoch": 0.017718194907604276, + "eval_valid_loss": 2.2299091815948486, + "eval_valid_loss/all": 2.088559627532959, + "eval_valid_loss/end_span": 1.2533888816833496, + "eval_valid_perplexity/batch": 8.073278427124023, + "eval_valid_perplexity/end_span": 3.5021913051605225, + "eval_valid_perplexity/fim": 2.5596330165863037, + "eval_valid_perplexity/first_seq": 14.933544158935547, + "eval_valid_perplexity/last_seq": 8.950615882873535, + "eval_valid_perplexity/second_seq": 13.586867332458496, + "eval_valid_perplexity/seq": 9.093388557434082, + "eval_valid_reconstruction/all": 0.28417351841926575, + "eval_valid_reconstruction/end_span": 0.7094215750694275, + "eval_valid_reconstruction/fim": 0.17872099578380585, + "eval_valid_reconstruction/first_seq": 0.16711844503879547, + "eval_valid_reconstruction/last_seq": 0.33038222789764404, + "eval_valid_reconstruction/second_seq": 0.19986216723918915, + "eval_valid_runtime": 602.5272, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 4750 + }, + { + "epoch": 
0.017718194907604276, + "eval_train_loss": 2.2266712188720703, + "eval_train_loss/all": 2.0575344562530518, + "eval_train_loss/end_span": 1.2174235582351685, + "eval_train_perplexity/batch": 7.826649188995361, + "eval_train_perplexity/end_span": 3.378472089767456, + "eval_train_perplexity/fim": 2.2767174243927, + "eval_train_perplexity/first_seq": 15.262691497802734, + "eval_train_perplexity/last_seq": 8.925394058227539, + "eval_train_perplexity/second_seq": 14.329066276550293, + "eval_train_perplexity/seq": 9.005291938781738, + "eval_train_reconstruction/all": 0.27442818880081177, + "eval_train_reconstruction/end_span": 0.7213900089263916, + "eval_train_reconstruction/fim": 0.15747161209583282, + "eval_train_reconstruction/first_seq": 0.15514856576919556, + "eval_train_reconstruction/last_seq": 0.32928431034088135, + "eval_train_reconstruction/second_seq": 0.18088431656360626, + "eval_train_runtime": 611.6702, + "eval_train_samples_per_second": 0.314, + "eval_train_steps_per_second": 0.314, + "step": 4750 + }, + { + "epoch": 0.017755496370567652, + "grad_norm": 1.4588587284088135, + "learning_rate": 0.0006, + "loss": 2.3523, + "step": 4760 + }, + { + "epoch": 0.01779279783353103, + "grad_norm": 0.32515838742256165, + "learning_rate": 0.0006, + "loss": 2.1345, + "step": 4770 + }, + { + "epoch": 0.017830099296494407, + "grad_norm": 0.3720155358314514, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 4780 + }, + { + "epoch": 0.017867400759457787, + "grad_norm": 0.7856097221374512, + "learning_rate": 0.0006, + "loss": 2.2616, + "step": 4790 + }, + { + "epoch": 0.017904702222421163, + "grad_norm": 0.2867686450481415, + "learning_rate": 0.0006, + "loss": 2.1768, + "step": 4800 + }, + { + "epoch": 0.017904702222421163, + "eval_valid_loss": 2.225231647491455, + "eval_valid_loss/all": 2.084277868270874, + "eval_valid_loss/end_span": 1.4202536344528198, + "eval_valid_perplexity/batch": 8.03878402709961, + "eval_valid_perplexity/end_span": 4.138169765472412, + 
"eval_valid_perplexity/fim": 2.3549647331237793, + "eval_valid_perplexity/first_seq": 14.404911041259766, + "eval_valid_perplexity/last_seq": 9.504473686218262, + "eval_valid_perplexity/second_seq": 13.572288513183594, + "eval_valid_perplexity/seq": 9.056797981262207, + "eval_valid_reconstruction/all": 0.2855435609817505, + "eval_valid_reconstruction/end_span": 0.6659060716629028, + "eval_valid_reconstruction/fim": 0.16498568654060364, + "eval_valid_reconstruction/first_seq": 0.18008102476596832, + "eval_valid_reconstruction/last_seq": 0.31172698736190796, + "eval_valid_reconstruction/second_seq": 0.20117619633674622, + "eval_valid_runtime": 615.0725, + "eval_valid_samples_per_second": 0.312, + "eval_valid_steps_per_second": 0.312, + "step": 4800 + }, + { + "epoch": 0.017904702222421163, + "eval_train_loss": 2.2245700359344482, + "eval_train_loss/all": 2.0560598373413086, + "eval_train_loss/end_span": 1.389546275138855, + "eval_train_perplexity/batch": 7.8151164054870605, + "eval_train_perplexity/end_span": 4.013028621673584, + "eval_train_perplexity/fim": 2.010032892227173, + "eval_train_perplexity/first_seq": 15.348640441894531, + "eval_train_perplexity/last_seq": 9.361695289611816, + "eval_train_perplexity/second_seq": 14.364962577819824, + "eval_train_perplexity/seq": 8.99653434753418, + "eval_train_reconstruction/all": 0.27518653869628906, + "eval_train_reconstruction/end_span": 0.6732972264289856, + "eval_train_reconstruction/fim": 0.13449639081954956, + "eval_train_reconstruction/first_seq": 0.1534252017736435, + "eval_train_reconstruction/last_seq": 0.31147894263267517, + "eval_train_reconstruction/second_seq": 0.18188191950321198, + "eval_train_runtime": 616.3791, + "eval_train_samples_per_second": 0.311, + "eval_train_steps_per_second": 0.311, + "step": 4800 + }, + { + "epoch": 0.017942003685384542, + "grad_norm": 0.4236049950122833, + "learning_rate": 0.0006, + "loss": 2.3575, + "step": 4810 + }, + { + "epoch": 0.017979305148347918, + "grad_norm": 
0.43978074193000793, + "learning_rate": 0.0006, + "loss": 2.2016, + "step": 4820 + }, + { + "epoch": 0.018016606611311294, + "grad_norm": 0.39589375257492065, + "learning_rate": 0.0006, + "loss": 2.3893, + "step": 4830 + }, + { + "epoch": 0.018053908074274674, + "grad_norm": 0.4501151740550995, + "learning_rate": 0.0006, + "loss": 2.3092, + "step": 4840 + }, + { + "epoch": 0.01809120953723805, + "grad_norm": 0.6066282987594604, + "learning_rate": 0.0006, + "loss": 2.2393, + "step": 4850 + }, + { + "epoch": 0.01809120953723805, + "eval_valid_loss": 2.23250412940979, + "eval_valid_loss/all": 2.0906832218170166, + "eval_valid_loss/end_span": 1.2017816305160522, + "eval_valid_perplexity/batch": 8.09044075012207, + "eval_valid_perplexity/end_span": 3.3260374069213867, + "eval_valid_perplexity/fim": 2.3028972148895264, + "eval_valid_perplexity/first_seq": 15.104219436645508, + "eval_valid_perplexity/last_seq": 9.223464965820312, + "eval_valid_perplexity/second_seq": 13.99393367767334, + "eval_valid_perplexity/seq": 9.109251976013184, + "eval_valid_reconstruction/all": 0.2833459973335266, + "eval_valid_reconstruction/end_span": 0.7237949967384338, + "eval_valid_reconstruction/fim": 0.15931959450244904, + "eval_valid_reconstruction/first_seq": 0.161219984292984, + "eval_valid_reconstruction/last_seq": 0.3204266130924225, + "eval_valid_reconstruction/second_seq": 0.1894390881061554, + "eval_valid_runtime": 601.6867, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 4850 + }, + { + "epoch": 0.01809120953723805, + "eval_train_loss": 2.229912281036377, + "eval_train_loss/all": 2.0602447986602783, + "eval_train_loss/end_span": 1.1812368631362915, + "eval_train_perplexity/batch": 7.847890853881836, + "eval_train_perplexity/end_span": 3.258401870727539, + "eval_train_perplexity/fim": 1.957936406135559, + "eval_train_perplexity/first_seq": 15.310362815856934, + "eval_train_perplexity/last_seq": 8.988624572753906, + 
"eval_train_perplexity/second_seq": 14.526022911071777, + "eval_train_perplexity/seq": 9.026004791259766, + "eval_train_reconstruction/all": 0.2736186385154724, + "eval_train_reconstruction/end_span": 0.7327582240104675, + "eval_train_reconstruction/fim": 0.1286165565252304, + "eval_train_reconstruction/first_seq": 0.1578386127948761, + "eval_train_reconstruction/last_seq": 0.3244125247001648, + "eval_train_reconstruction/second_seq": 0.17765478789806366, + "eval_train_runtime": 612.8667, + "eval_train_samples_per_second": 0.313, + "eval_train_steps_per_second": 0.313, + "step": 4850 + }, + { + "epoch": 0.01812851100020143, + "grad_norm": 0.732518196105957, + "learning_rate": 0.0006, + "loss": 2.2425, + "step": 4860 + }, + { + "epoch": 0.018165812463164805, + "grad_norm": 0.5586615204811096, + "learning_rate": 0.0006, + "loss": 2.279, + "step": 4870 + }, + { + "epoch": 0.01820311392612818, + "grad_norm": 0.4198490083217621, + "learning_rate": 0.0006, + "loss": 2.2719, + "step": 4880 + }, + { + "epoch": 0.01824041538909156, + "grad_norm": 0.43557679653167725, + "learning_rate": 0.0006, + "loss": 2.373, + "step": 4890 + }, + { + "epoch": 0.018277716852054936, + "grad_norm": 0.3317711055278778, + "learning_rate": 0.0006, + "loss": 2.1947, + "step": 4900 + }, + { + "epoch": 0.018277716852054936, + "eval_valid_loss": 2.2307026386260986, + "eval_valid_loss/all": 2.0891175270080566, + "eval_valid_loss/end_span": 1.2823426723480225, + "eval_valid_perplexity/batch": 8.077783584594727, + "eval_valid_perplexity/end_span": 3.6050753593444824, + "eval_valid_perplexity/fim": 2.186474084854126, + "eval_valid_perplexity/first_seq": 14.555427551269531, + "eval_valid_perplexity/last_seq": 9.531627655029297, + "eval_valid_perplexity/second_seq": 13.79279613494873, + "eval_valid_perplexity/seq": 9.095075607299805, + "eval_valid_reconstruction/all": 0.28403037786483765, + "eval_valid_reconstruction/end_span": 0.7001482248306274, + "eval_valid_reconstruction/fim": 0.14967234432697296, + 
"eval_valid_reconstruction/first_seq": 0.17501509189605713, + "eval_valid_reconstruction/last_seq": 0.31028473377227783, + "eval_valid_reconstruction/second_seq": 0.1958203762769699, + "eval_valid_runtime": 604.6888, + "eval_valid_samples_per_second": 0.318, + "eval_valid_steps_per_second": 0.318, + "step": 4900 + }, + { + "epoch": 0.018277716852054936, + "eval_train_loss": 2.227597236633301, + "eval_train_loss/all": 2.0584394931793213, + "eval_train_loss/end_span": 1.2472718954086304, + "eval_train_perplexity/batch": 7.833735466003418, + "eval_train_perplexity/end_span": 3.4808340072631836, + "eval_train_perplexity/fim": 2.2530910968780518, + "eval_train_perplexity/first_seq": 15.492059707641602, + "eval_train_perplexity/last_seq": 9.82913875579834, + "eval_train_perplexity/second_seq": 14.175606727600098, + "eval_train_perplexity/seq": 9.01259708404541, + "eval_train_reconstruction/all": 0.2744061350822449, + "eval_train_reconstruction/end_span": 0.7109367251396179, + "eval_train_reconstruction/fim": 0.15538859367370605, + "eval_train_reconstruction/first_seq": 0.15211409330368042, + "eval_train_reconstruction/last_seq": 0.2971043884754181, + "eval_train_reconstruction/second_seq": 0.1848725825548172, + "eval_train_runtime": 614.5328, + "eval_train_samples_per_second": 0.312, + "eval_train_steps_per_second": 0.312, + "step": 4900 + }, + { + "epoch": 0.018315018315018316, + "grad_norm": 0.4724692404270172, + "learning_rate": 0.0006, + "loss": 2.3199, + "step": 4910 + }, + { + "epoch": 0.018352319777981692, + "grad_norm": 0.4334056079387665, + "learning_rate": 0.0006, + "loss": 2.0263, + "step": 4920 + }, + { + "epoch": 0.01838962124094507, + "grad_norm": 0.40327703952789307, + "learning_rate": 0.0006, + "loss": 2.1882, + "step": 4930 + }, + { + "epoch": 0.018426922703908447, + "grad_norm": 0.3734162449836731, + "learning_rate": 0.0006, + "loss": 2.3402, + "step": 4940 + }, + { + "epoch": 0.018464224166871823, + "grad_norm": 0.31740161776542664, + "learning_rate": 
0.0006, + "loss": 2.2129, + "step": 4950 + }, + { + "epoch": 0.018464224166871823, + "eval_valid_loss": 2.2280523777008057, + "eval_valid_loss/all": 2.0872409343719482, + "eval_valid_loss/end_span": 1.278497576713562, + "eval_valid_perplexity/batch": 8.062639236450195, + "eval_valid_perplexity/end_span": 3.591240167617798, + "eval_valid_perplexity/fim": 2.4374959468841553, + "eval_valid_perplexity/first_seq": 14.887173652648926, + "eval_valid_perplexity/last_seq": 9.234759330749512, + "eval_valid_perplexity/second_seq": 14.46737003326416, + "eval_valid_perplexity/seq": 9.084884643554688, + "eval_valid_reconstruction/all": 0.2846405804157257, + "eval_valid_reconstruction/end_span": 0.7024102807044983, + "eval_valid_reconstruction/fim": 0.1697819083929062, + "eval_valid_reconstruction/first_seq": 0.1695229709148407, + "eval_valid_reconstruction/last_seq": 0.3206486701965332, + "eval_valid_reconstruction/second_seq": 0.1808481067419052, + "eval_valid_runtime": 599.9898, + "eval_valid_samples_per_second": 0.32, + "eval_valid_steps_per_second": 0.32, + "step": 4950 + }, + { + "epoch": 0.018464224166871823, + "eval_train_loss": 2.2255027294158936, + "eval_train_loss/all": 2.0571765899658203, + "eval_train_loss/end_span": 1.2386592626571655, + "eval_train_perplexity/batch": 7.823848724365234, + "eval_train_perplexity/end_span": 3.4509835243225098, + "eval_train_perplexity/fim": 2.060685634613037, + "eval_train_perplexity/first_seq": 15.28388500213623, + "eval_train_perplexity/last_seq": 9.5148286819458, + "eval_train_perplexity/second_seq": 14.276500701904297, + "eval_train_perplexity/seq": 9.005280494689941, + "eval_train_reconstruction/all": 0.27468645572662354, + "eval_train_reconstruction/end_span": 0.7139393091201782, + "eval_train_reconstruction/fim": 0.13724322617053986, + "eval_train_reconstruction/first_seq": 0.1553366482257843, + "eval_train_reconstruction/last_seq": 0.30828747153282166, + "eval_train_reconstruction/second_seq": 0.18095189332962036, + 
"eval_train_runtime": 613.8576, + "eval_train_samples_per_second": 0.313, + "eval_train_steps_per_second": 0.313, + "step": 4950 + }, + { + "epoch": 0.018501525629835203, + "grad_norm": 0.6763415932655334, + "learning_rate": 0.0006, + "loss": 2.1878, + "step": 4960 + }, + { + "epoch": 0.01853882709279858, + "grad_norm": 0.28912076354026794, + "learning_rate": 0.0006, + "loss": 2.229, + "step": 4970 + }, + { + "epoch": 0.018576128555761958, + "grad_norm": 0.30021932721138, + "learning_rate": 0.0006, + "loss": 2.3569, + "step": 4980 + }, + { + "epoch": 0.018613430018725334, + "grad_norm": 0.3913536071777344, + "learning_rate": 0.0006, + "loss": 2.0082, + "step": 4990 + }, + { + "epoch": 0.018650731481688713, + "grad_norm": 0.40157201886177063, + "learning_rate": 0.0006, + "loss": 2.3102, + "step": 5000 + }, + { + "epoch": 0.018650731481688713, + "eval_valid_loss": 2.231816291809082, + "eval_valid_loss/all": 2.090116500854492, + "eval_valid_loss/end_span": 1.2536567449569702, + "eval_valid_perplexity/batch": 8.085857391357422, + "eval_valid_perplexity/end_span": 3.503129720687866, + "eval_valid_perplexity/fim": 2.4444992542266846, + "eval_valid_perplexity/first_seq": 14.988932609558105, + "eval_valid_perplexity/last_seq": 9.657147407531738, + "eval_valid_perplexity/second_seq": 13.899940490722656, + "eval_valid_perplexity/seq": 9.109465599060059, + "eval_valid_reconstruction/all": 0.28369349241256714, + "eval_valid_reconstruction/end_span": 0.7013691663742065, + "eval_valid_reconstruction/fim": 0.17008022964000702, + "eval_valid_reconstruction/first_seq": 0.1685396134853363, + "eval_valid_reconstruction/last_seq": 0.30779141187667847, + "eval_valid_reconstruction/second_seq": 0.19448576867580414, + "eval_valid_runtime": 610.9045, + "eval_valid_samples_per_second": 0.314, + "eval_valid_steps_per_second": 0.314, + "step": 5000 + }, + { + "epoch": 0.018650731481688713, + "eval_train_loss": 2.228693962097168, + "eval_train_loss/all": 2.059605836868286, + 
"eval_train_loss/end_span": 1.212339997291565, + "eval_train_perplexity/batch": 7.8428778648376465, + "eval_train_perplexity/end_span": 3.3613409996032715, + "eval_train_perplexity/fim": 2.0835564136505127, + "eval_train_perplexity/first_seq": 15.848560333251953, + "eval_train_perplexity/last_seq": 9.602889060974121, + "eval_train_perplexity/second_seq": 14.153904914855957, + "eval_train_perplexity/seq": 9.027050971984863, + "eval_train_reconstruction/all": 0.2740766108036041, + "eval_train_reconstruction/end_span": 0.7130835652351379, + "eval_train_reconstruction/fim": 0.1401520073413849, + "eval_train_reconstruction/first_seq": 0.14190377295017242, + "eval_train_reconstruction/last_seq": 0.30299317836761475, + "eval_train_reconstruction/second_seq": 0.18802404403686523, + "eval_train_runtime": 642.6545, + "eval_train_samples_per_second": 0.299, + "eval_train_steps_per_second": 0.299, + "step": 5000 + }, + { + "epoch": 0.01868803294465209, + "grad_norm": 0.48281627893447876, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 5010 + }, + { + "epoch": 0.018725334407615465, + "grad_norm": 0.2789275646209717, + "learning_rate": 0.0006, + "loss": 2.3358, + "step": 5020 + }, + { + "epoch": 0.018762635870578845, + "grad_norm": 0.45100677013397217, + "learning_rate": 0.0006, + "loss": 1.9855, + "step": 5030 + }, + { + "epoch": 0.01879993733354222, + "grad_norm": 0.30100253224372864, + "learning_rate": 0.0006, + "loss": 2.3277, + "step": 5040 + }, + { + "epoch": 0.0188372387965056, + "grad_norm": 0.38927093148231506, + "learning_rate": 0.0006, + "loss": 2.2916, + "step": 5050 + }, + { + "epoch": 0.0188372387965056, + "eval_valid_loss": 2.227132558822632, + "eval_valid_loss/all": 2.085930109024048, + "eval_valid_loss/end_span": 1.2624846696853638, + "eval_valid_perplexity/batch": 8.052077293395996, + "eval_valid_perplexity/end_span": 3.534191846847534, + "eval_valid_perplexity/fim": 2.2765214443206787, + "eval_valid_perplexity/first_seq": 14.79440975189209, + 
"eval_valid_perplexity/last_seq": 9.217061042785645, + "eval_valid_perplexity/second_seq": 13.78470230102539, + "eval_valid_perplexity/seq": 9.07117748260498, + "eval_valid_reconstruction/all": 0.2844759523868561, + "eval_valid_reconstruction/end_span": 0.6955403685569763, + "eval_valid_reconstruction/fim": 0.15760444104671478, + "eval_valid_reconstruction/first_seq": 0.16912469267845154, + "eval_valid_reconstruction/last_seq": 0.320186585187912, + "eval_valid_reconstruction/second_seq": 0.19071704149246216, + "eval_valid_runtime": 653.2786, + "eval_valid_samples_per_second": 0.294, + "eval_valid_steps_per_second": 0.294, + "step": 5050 + }, + { + "epoch": 0.0188372387965056, + "eval_train_loss": 2.2241568565368652, + "eval_train_loss/all": 2.055790424346924, + "eval_train_loss/end_span": 1.2317681312561035, + "eval_train_perplexity/batch": 7.813011169433594, + "eval_train_perplexity/end_span": 3.427284002304077, + "eval_train_perplexity/fim": 2.0652787685394287, + "eval_train_perplexity/first_seq": 15.561202049255371, + "eval_train_perplexity/last_seq": 9.26778507232666, + "eval_train_perplexity/second_seq": 14.341804504394531, + "eval_train_perplexity/seq": 8.993287086486816, + "eval_train_reconstruction/all": 0.27489402890205383, + "eval_train_reconstruction/end_span": 0.7034566402435303, + "eval_train_reconstruction/fim": 0.1394719034433365, + "eval_train_reconstruction/first_seq": 0.1502341330051422, + "eval_train_reconstruction/last_seq": 0.31635117530822754, + "eval_train_reconstruction/second_seq": 0.1842377930879593, + "eval_train_runtime": 637.4956, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 5050 + }, + { + "epoch": 0.018874540259468976, + "grad_norm": 0.2959967851638794, + "learning_rate": 0.0006, + "loss": 2.337, + "step": 5060 + }, + { + "epoch": 0.018911841722432352, + "grad_norm": 0.7781514525413513, + "learning_rate": 0.0006, + "loss": 1.9013, + "step": 5070 + }, + { + "epoch": 0.01894914318539573, + 
"grad_norm": 0.5320495367050171, + "learning_rate": 0.0006, + "loss": 2.1126, + "step": 5080 + }, + { + "epoch": 0.018986444648359108, + "grad_norm": 0.3831121623516083, + "learning_rate": 0.0006, + "loss": 2.3091, + "step": 5090 + }, + { + "epoch": 0.019023746111322487, + "grad_norm": 0.5194263458251953, + "learning_rate": 0.0006, + "loss": 2.3377, + "step": 5100 + }, + { + "epoch": 0.019023746111322487, + "eval_valid_loss": 2.227402687072754, + "eval_valid_loss/all": 2.086432695388794, + "eval_valid_loss/end_span": 1.30880868434906, + "eval_valid_perplexity/batch": 8.05612564086914, + "eval_valid_perplexity/end_span": 3.70176100730896, + "eval_valid_perplexity/fim": 2.214779853820801, + "eval_valid_perplexity/first_seq": 15.038663864135742, + "eval_valid_perplexity/last_seq": 9.72414493560791, + "eval_valid_perplexity/second_seq": 13.865025520324707, + "eval_valid_perplexity/seq": 9.072736740112305, + "eval_valid_reconstruction/all": 0.2850112020969391, + "eval_valid_reconstruction/end_span": 0.7015289664268494, + "eval_valid_reconstruction/fim": 0.1530964970588684, + "eval_valid_reconstruction/first_seq": 0.16551221907138824, + "eval_valid_reconstruction/last_seq": 0.3024412989616394, + "eval_valid_reconstruction/second_seq": 0.19184912741184235, + "eval_valid_runtime": 633.8835, + "eval_valid_samples_per_second": 0.303, + "eval_valid_steps_per_second": 0.303, + "step": 5100 + }, + { + "epoch": 0.019023746111322487, + "eval_train_loss": 2.225003957748413, + "eval_train_loss/all": 2.056514263153076, + "eval_train_loss/end_span": 1.2762327194213867, + "eval_train_perplexity/batch": 7.818668365478516, + "eval_train_perplexity/end_span": 3.583115577697754, + "eval_train_perplexity/fim": 2.443842649459839, + "eval_train_perplexity/first_seq": 15.545442581176758, + "eval_train_perplexity/last_seq": 9.259650230407715, + "eval_train_perplexity/second_seq": 14.307703018188477, + "eval_train_perplexity/seq": 8.996603012084961, + "eval_train_reconstruction/all": 
0.2753709554672241, + "eval_train_reconstruction/end_span": 0.7100023627281189, + "eval_train_reconstruction/fim": 0.17151781916618347, + "eval_train_reconstruction/first_seq": 0.1499118208885193, + "eval_train_reconstruction/last_seq": 0.31858545541763306, + "eval_train_reconstruction/second_seq": 0.1809888780117035, + "eval_train_runtime": 631.776, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 5100 + }, + { + "epoch": 0.019061047574285863, + "grad_norm": 0.3301027715206146, + "learning_rate": 0.0006, + "loss": 2.3583, + "step": 5110 + }, + { + "epoch": 0.019098349037249242, + "grad_norm": 0.4677412211894989, + "learning_rate": 0.0006, + "loss": 2.1986, + "step": 5120 + }, + { + "epoch": 0.01913565050021262, + "grad_norm": 0.38866379857063293, + "learning_rate": 0.0006, + "loss": 2.2974, + "step": 5130 + }, + { + "epoch": 0.019172951963175994, + "grad_norm": 0.4129382073879242, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 5140 + }, + { + "epoch": 0.019210253426139374, + "grad_norm": 0.44830232858657837, + "learning_rate": 0.0006, + "loss": 2.3012, + "step": 5150 + }, + { + "epoch": 0.019210253426139374, + "eval_valid_loss": 2.225977659225464, + "eval_valid_loss/all": 2.0851752758026123, + "eval_valid_loss/end_span": 1.3512799739837646, + "eval_valid_perplexity/batch": 8.046001434326172, + "eval_valid_perplexity/end_span": 3.862366199493408, + "eval_valid_perplexity/fim": 2.296969175338745, + "eval_valid_perplexity/first_seq": 14.772443771362305, + "eval_valid_perplexity/last_seq": 9.171225547790527, + "eval_valid_perplexity/second_seq": 13.553791046142578, + "eval_valid_perplexity/seq": 9.061826705932617, + "eval_valid_reconstruction/all": 0.2848491668701172, + "eval_valid_reconstruction/end_span": 0.6825999617576599, + "eval_valid_reconstruction/fim": 0.15991421043872833, + "eval_valid_reconstruction/first_seq": 0.16807648539543152, + "eval_valid_reconstruction/last_seq": 0.31942927837371826, + 
"eval_valid_reconstruction/second_seq": 0.2023216336965561, + "eval_valid_runtime": 628.3583, + "eval_valid_samples_per_second": 0.306, + "eval_valid_steps_per_second": 0.306, + "step": 5150 + }, + { + "epoch": 0.019210253426139374, + "eval_train_loss": 2.224853992462158, + "eval_train_loss/all": 2.0563454627990723, + "eval_train_loss/end_span": 1.3113752603530884, + "eval_train_perplexity/batch": 7.817348957061768, + "eval_train_perplexity/end_span": 3.7112741470336914, + "eval_train_perplexity/fim": 2.176968574523926, + "eval_train_perplexity/first_seq": 15.429152488708496, + "eval_train_perplexity/last_seq": 9.326526641845703, + "eval_train_perplexity/second_seq": 14.11571979522705, + "eval_train_perplexity/seq": 8.996790885925293, + "eval_train_reconstruction/all": 0.27492955327033997, + "eval_train_reconstruction/end_span": 0.6919100880622864, + "eval_train_reconstruction/fim": 0.14926201105117798, + "eval_train_reconstruction/first_seq": 0.15306755900382996, + "eval_train_reconstruction/last_seq": 0.31820371747016907, + "eval_train_reconstruction/second_seq": 0.18610483407974243, + "eval_train_runtime": 644.1761, + "eval_train_samples_per_second": 0.298, + "eval_train_steps_per_second": 0.298, + "step": 5150 + }, + { + "epoch": 0.01924755488910275, + "grad_norm": 0.47000399231910706, + "learning_rate": 0.0006, + "loss": 2.2647, + "step": 5160 + }, + { + "epoch": 0.01928485635206613, + "grad_norm": 0.44813117384910583, + "learning_rate": 0.0006, + "loss": 2.2749, + "step": 5170 + }, + { + "epoch": 0.019322157815029505, + "grad_norm": 0.6485859155654907, + "learning_rate": 0.0006, + "loss": 2.302, + "step": 5180 + }, + { + "epoch": 0.01935945927799288, + "grad_norm": 0.28421950340270996, + "learning_rate": 0.0006, + "loss": 2.2496, + "step": 5190 + }, + { + "epoch": 0.01939676074095626, + "grad_norm": 0.4684405028820038, + "learning_rate": 0.0006, + "loss": 2.2938, + "step": 5200 + }, + { + "epoch": 0.01939676074095626, + "eval_valid_loss": 2.226271390914917, + 
"eval_valid_loss/all": 2.0852508544921875, + "eval_valid_loss/end_span": 1.2940599918365479, + "eval_valid_perplexity/batch": 8.046609878540039, + "eval_valid_perplexity/end_span": 3.6475656032562256, + "eval_valid_perplexity/fim": 2.42531156539917, + "eval_valid_perplexity/first_seq": 14.643198013305664, + "eval_valid_perplexity/last_seq": 9.772431373596191, + "eval_valid_perplexity/second_seq": 13.282038688659668, + "eval_valid_perplexity/seq": 9.059696197509766, + "eval_valid_reconstruction/all": 0.2849670946598053, + "eval_valid_reconstruction/end_span": 0.6937979459762573, + "eval_valid_reconstruction/fim": 0.16962699592113495, + "eval_valid_reconstruction/first_seq": 0.17015300691127777, + "eval_valid_reconstruction/last_seq": 0.30368638038635254, + "eval_valid_reconstruction/second_seq": 0.2095472514629364, + "eval_valid_runtime": 652.9466, + "eval_valid_samples_per_second": 0.294, + "eval_valid_steps_per_second": 0.294, + "step": 5200 + }, + { + "epoch": 0.01939676074095626, + "eval_train_loss": 2.2217652797698975, + "eval_train_loss/all": 2.053574562072754, + "eval_train_loss/end_span": 1.2580814361572266, + "eval_train_perplexity/batch": 7.795717716217041, + "eval_train_perplexity/end_span": 3.5186641216278076, + "eval_train_perplexity/fim": 2.2992005348205566, + "eval_train_perplexity/first_seq": 15.37436580657959, + "eval_train_perplexity/last_seq": 9.337796211242676, + "eval_train_perplexity/second_seq": 13.902645111083984, + "eval_train_perplexity/seq": 8.969064712524414, + "eval_train_reconstruction/all": 0.2758619785308838, + "eval_train_reconstruction/end_span": 0.7040315866470337, + "eval_train_reconstruction/fim": 0.16077245771884918, + "eval_train_reconstruction/first_seq": 0.15287049114704132, + "eval_train_reconstruction/last_seq": 0.3154832124710083, + "eval_train_reconstruction/second_seq": 0.18982715904712677, + "eval_train_runtime": 639.7137, + "eval_train_samples_per_second": 0.3, + "eval_train_steps_per_second": 0.3, + "step": 5200 + }, 
+ { + "epoch": 0.019434062203919637, + "grad_norm": 0.34524935483932495, + "learning_rate": 0.0006, + "loss": 2.4792, + "step": 5210 + }, + { + "epoch": 0.019471363666883016, + "grad_norm": 0.3673032224178314, + "learning_rate": 0.0006, + "loss": 2.241, + "step": 5220 + }, + { + "epoch": 0.019508665129846392, + "grad_norm": 0.39859187602996826, + "learning_rate": 0.0006, + "loss": 2.3462, + "step": 5230 + }, + { + "epoch": 0.01954596659280977, + "grad_norm": 0.6623960733413696, + "learning_rate": 0.0006, + "loss": 2.3643, + "step": 5240 + }, + { + "epoch": 0.019583268055773147, + "grad_norm": 0.39964166283607483, + "learning_rate": 0.0006, + "loss": 2.2536, + "step": 5250 + }, + { + "epoch": 0.019583268055773147, + "eval_valid_loss": 2.226854085922241, + "eval_valid_loss/all": 2.085862398147583, + "eval_valid_loss/end_span": 1.303253173828125, + "eval_valid_perplexity/batch": 8.051531791687012, + "eval_valid_perplexity/end_span": 3.681252956390381, + "eval_valid_perplexity/fim": 2.1341159343719482, + "eval_valid_perplexity/first_seq": 14.666471481323242, + "eval_valid_perplexity/last_seq": 9.107754707336426, + "eval_valid_perplexity/second_seq": 13.936253547668457, + "eval_valid_perplexity/seq": 9.06920337677002, + "eval_valid_reconstruction/all": 0.28466036915779114, + "eval_valid_reconstruction/end_span": 0.6938146948814392, + "eval_valid_reconstruction/fim": 0.14431296288967133, + "eval_valid_reconstruction/first_seq": 0.1712976098060608, + "eval_valid_reconstruction/last_seq": 0.32341745495796204, + "eval_valid_reconstruction/second_seq": 0.19354331493377686, + "eval_valid_runtime": 636.1856, + "eval_valid_samples_per_second": 0.302, + "eval_valid_steps_per_second": 0.302, + "step": 5250 + }, + { + "epoch": 0.019583268055773147, + "eval_train_loss": 2.224202871322632, + "eval_train_loss/all": 2.0561442375183105, + "eval_train_loss/end_span": 1.2582424879074097, + "eval_train_perplexity/batch": 7.8157758712768555, + "eval_train_perplexity/end_span": 
3.519230842590332, + "eval_train_perplexity/fim": 2.2101547718048096, + "eval_train_perplexity/first_seq": 15.462177276611328, + "eval_train_perplexity/last_seq": 9.374906539916992, + "eval_train_perplexity/second_seq": 14.363807678222656, + "eval_train_perplexity/seq": 8.998396873474121, + "eval_train_reconstruction/all": 0.2749027609825134, + "eval_train_reconstruction/end_span": 0.7050789594650269, + "eval_train_reconstruction/fim": 0.1511363834142685, + "eval_train_reconstruction/first_seq": 0.15225601196289062, + "eval_train_reconstruction/last_seq": 0.3129848539829254, + "eval_train_reconstruction/second_seq": 0.18341432511806488, + "eval_train_runtime": 636.9946, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 5250 + }, + { + "epoch": 0.019620569518736523, + "grad_norm": 0.36008375883102417, + "learning_rate": 0.0006, + "loss": 2.3764, + "step": 5260 + }, + { + "epoch": 0.019657870981699903, + "grad_norm": 0.4349192976951599, + "learning_rate": 0.0006, + "loss": 2.3199, + "step": 5270 + }, + { + "epoch": 0.01969517244466328, + "grad_norm": 0.39684122800827026, + "learning_rate": 0.0006, + "loss": 2.3173, + "step": 5280 + }, + { + "epoch": 0.019732473907626658, + "grad_norm": 0.3665699064731598, + "learning_rate": 0.0006, + "loss": 2.1448, + "step": 5290 + }, + { + "epoch": 0.019769775370590034, + "grad_norm": 0.8430445790290833, + "learning_rate": 0.0006, + "loss": 2.3505, + "step": 5300 + }, + { + "epoch": 0.019769775370590034, + "eval_valid_loss": 2.235642910003662, + "eval_valid_loss/all": 2.0934579372406006, + "eval_valid_loss/end_span": 1.400103211402893, + "eval_valid_perplexity/batch": 8.112920761108398, + "eval_valid_perplexity/end_span": 4.055618762969971, + "eval_valid_perplexity/fim": 2.8553531169891357, + "eval_valid_perplexity/first_seq": 14.688867568969727, + "eval_valid_perplexity/last_seq": 9.368022918701172, + "eval_valid_perplexity/second_seq": 13.700928688049316, + "eval_valid_perplexity/seq": 
9.133345603942871, + "eval_valid_reconstruction/all": 0.28261154890060425, + "eval_valid_reconstruction/end_span": 0.6830517649650574, + "eval_valid_reconstruction/fim": 0.19901086390018463, + "eval_valid_reconstruction/first_seq": 0.16993001103401184, + "eval_valid_reconstruction/last_seq": 0.31837302446365356, + "eval_valid_reconstruction/second_seq": 0.1961522251367569, + "eval_valid_runtime": 620.5474, + "eval_valid_samples_per_second": 0.309, + "eval_valid_steps_per_second": 0.309, + "step": 5300 + }, + { + "epoch": 0.019769775370590034, + "eval_train_loss": 2.2334187030792236, + "eval_train_loss/all": 2.0634891986846924, + "eval_train_loss/end_span": 1.3552987575531006, + "eval_train_perplexity/batch": 7.873393535614014, + "eval_train_perplexity/end_span": 3.8779194355010986, + "eval_train_perplexity/fim": 2.409327507019043, + "eval_train_perplexity/first_seq": 15.613229751586914, + "eval_train_perplexity/last_seq": 9.556877136230469, + "eval_train_perplexity/second_seq": 14.638660430908203, + "eval_train_perplexity/seq": 9.056438446044922, + "eval_train_reconstruction/all": 0.27283409237861633, + "eval_train_reconstruction/end_span": 0.6912071108818054, + "eval_train_reconstruction/fim": 0.16771847009658813, + "eval_train_reconstruction/first_seq": 0.15180912613868713, + "eval_train_reconstruction/last_seq": 0.3068557381629944, + "eval_train_reconstruction/second_seq": 0.17229226231575012, + "eval_train_runtime": 628.3035, + "eval_train_samples_per_second": 0.306, + "eval_train_steps_per_second": 0.306, + "step": 5300 + }, + { + "epoch": 0.019807076833553414, + "grad_norm": 0.43051597476005554, + "learning_rate": 0.0006, + "loss": 2.165, + "step": 5310 + }, + { + "epoch": 0.01984437829651679, + "grad_norm": 0.5563859939575195, + "learning_rate": 0.0006, + "loss": 2.1383, + "step": 5320 + }, + { + "epoch": 0.019881679759480166, + "grad_norm": 0.42010268568992615, + "learning_rate": 0.0006, + "loss": 2.2134, + "step": 5330 + }, + { + "epoch": 
0.019918981222443545, + "grad_norm": 0.4273611605167389, + "learning_rate": 0.0006, + "loss": 2.3716, + "step": 5340 + }, + { + "epoch": 0.01995628268540692, + "grad_norm": 0.37603655457496643, + "learning_rate": 0.0006, + "loss": 2.3578, + "step": 5350 + }, + { + "epoch": 0.01995628268540692, + "eval_valid_loss": 2.2276954650878906, + "eval_valid_loss/all": 2.0864548683166504, + "eval_valid_loss/end_span": 1.3063768148422241, + "eval_valid_perplexity/batch": 8.056303977966309, + "eval_valid_perplexity/end_span": 3.692769765853882, + "eval_valid_perplexity/fim": 2.2280538082122803, + "eval_valid_perplexity/first_seq": 14.635997772216797, + "eval_valid_perplexity/last_seq": 9.188366889953613, + "eval_valid_perplexity/second_seq": 13.803945541381836, + "eval_valid_perplexity/seq": 9.069985389709473, + "eval_valid_reconstruction/all": 0.2847791314125061, + "eval_valid_reconstruction/end_span": 0.6997108459472656, + "eval_valid_reconstruction/fim": 0.15400776267051697, + "eval_valid_reconstruction/first_seq": 0.17217788100242615, + "eval_valid_reconstruction/last_seq": 0.3254532217979431, + "eval_valid_reconstruction/second_seq": 0.19384706020355225, + "eval_valid_runtime": 626.0223, + "eval_valid_samples_per_second": 0.307, + "eval_valid_steps_per_second": 0.307, + "step": 5350 + }, + { + "epoch": 0.01995628268540692, + "eval_train_loss": 2.2241427898406982, + "eval_train_loss/all": 2.0555710792541504, + "eval_train_loss/end_span": 1.2703876495361328, + "eval_train_perplexity/batch": 7.811297416687012, + "eval_train_perplexity/end_span": 3.5622332096099854, + "eval_train_perplexity/fim": 2.394813060760498, + "eval_train_perplexity/first_seq": 15.236427307128906, + "eval_train_perplexity/last_seq": 9.462759971618652, + "eval_train_perplexity/second_seq": 14.25904655456543, + "eval_train_perplexity/seq": 8.986101150512695, + "eval_train_reconstruction/all": 0.27536070346832275, + "eval_train_reconstruction/end_span": 0.7085290551185608, + 
"eval_train_reconstruction/fim": 0.16807813942432404, + "eval_train_reconstruction/first_seq": 0.15581969916820526, + "eval_train_reconstruction/last_seq": 0.3128945529460907, + "eval_train_reconstruction/second_seq": 0.1814635694026947, + "eval_train_runtime": 638.7717, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 5350 + }, + { + "epoch": 0.0199935841483703, + "grad_norm": 0.32589343190193176, + "learning_rate": 0.0006, + "loss": 2.3281, + "step": 5360 + }, + { + "epoch": 0.020030885611333676, + "grad_norm": 0.5252499580383301, + "learning_rate": 0.0006, + "loss": 2.3947, + "step": 5370 + }, + { + "epoch": 0.020068187074297052, + "grad_norm": 0.2597188353538513, + "learning_rate": 0.0006, + "loss": 2.292, + "step": 5380 + }, + { + "epoch": 0.020105488537260432, + "grad_norm": 0.3524569571018219, + "learning_rate": 0.0006, + "loss": 2.2579, + "step": 5390 + }, + { + "epoch": 0.020142790000223808, + "grad_norm": 0.3509899973869324, + "learning_rate": 0.0006, + "loss": 2.124, + "step": 5400 + }, + { + "epoch": 0.020142790000223808, + "eval_valid_loss": 2.2254810333251953, + "eval_valid_loss/all": 2.0846779346466064, + "eval_valid_loss/end_span": 1.3202440738677979, + "eval_valid_perplexity/batch": 8.042000770568848, + "eval_valid_perplexity/end_span": 3.744335174560547, + "eval_valid_perplexity/fim": 2.3799054622650146, + "eval_valid_perplexity/first_seq": 14.804607391357422, + "eval_valid_perplexity/last_seq": 9.42984390258789, + "eval_valid_perplexity/second_seq": 13.727029800415039, + "eval_valid_perplexity/seq": 9.054476737976074, + "eval_valid_reconstruction/all": 0.2851100265979767, + "eval_valid_reconstruction/end_span": 0.6990776658058167, + "eval_valid_reconstruction/fim": 0.1656053364276886, + "eval_valid_reconstruction/first_seq": 0.17080235481262207, + "eval_valid_reconstruction/last_seq": 0.31348463892936707, + "eval_valid_reconstruction/second_seq": 0.1954200714826584, + "eval_valid_runtime": 621.0904, + 
"eval_valid_samples_per_second": 0.309, + "eval_valid_steps_per_second": 0.309, + "step": 5400 + }, + { + "epoch": 0.020142790000223808, + "eval_train_loss": 2.2214365005493164, + "eval_train_loss/all": 2.053316116333008, + "eval_train_loss/end_span": 1.2743879556655884, + "eval_train_perplexity/batch": 7.793703079223633, + "eval_train_perplexity/end_span": 3.576511859893799, + "eval_train_perplexity/fim": 2.0456182956695557, + "eval_train_perplexity/first_seq": 15.431746482849121, + "eval_train_perplexity/last_seq": 8.633806228637695, + "eval_train_perplexity/second_seq": 14.535938262939453, + "eval_train_perplexity/seq": 8.962868690490723, + "eval_train_reconstruction/all": 0.27574869990348816, + "eval_train_reconstruction/end_span": 0.7075408697128296, + "eval_train_reconstruction/fim": 0.13783547282218933, + "eval_train_reconstruction/first_seq": 0.15272250771522522, + "eval_train_reconstruction/last_seq": 0.3361918032169342, + "eval_train_reconstruction/second_seq": 0.18186639249324799, + "eval_train_runtime": 642.6493, + "eval_train_samples_per_second": 0.299, + "eval_train_steps_per_second": 0.299, + "step": 5400 + }, + { + "epoch": 0.020180091463187187, + "grad_norm": 0.49137645959854126, + "learning_rate": 0.0006, + "loss": 2.2461, + "step": 5410 + }, + { + "epoch": 0.020217392926150563, + "grad_norm": 0.35641536116600037, + "learning_rate": 0.0006, + "loss": 2.2303, + "step": 5420 + }, + { + "epoch": 0.020254694389113943, + "grad_norm": 0.2702391445636749, + "learning_rate": 0.0006, + "loss": 2.4121, + "step": 5430 + }, + { + "epoch": 0.02029199585207732, + "grad_norm": 0.3115755319595337, + "learning_rate": 0.0006, + "loss": 2.3107, + "step": 5440 + }, + { + "epoch": 0.020329297315040695, + "grad_norm": 0.5538231730461121, + "learning_rate": 0.0006, + "loss": 2.3389, + "step": 5450 + }, + { + "epoch": 0.020329297315040695, + "eval_valid_loss": 2.2277586460113525, + "eval_valid_loss/all": 2.086458206176758, + "eval_valid_loss/end_span": 
1.2638357877731323, + "eval_valid_perplexity/batch": 8.056330680847168, + "eval_valid_perplexity/end_span": 3.5389702320098877, + "eval_valid_perplexity/fim": 2.848733901977539, + "eval_valid_perplexity/first_seq": 15.146917343139648, + "eval_valid_perplexity/last_seq": 9.147021293640137, + "eval_valid_perplexity/second_seq": 13.863161087036133, + "eval_valid_perplexity/seq": 9.0701265335083, + "eval_valid_reconstruction/all": 0.28483420610427856, + "eval_valid_reconstruction/end_span": 0.706188976764679, + "eval_valid_reconstruction/fim": 0.20034952461719513, + "eval_valid_reconstruction/first_seq": 0.16450129449367523, + "eval_valid_reconstruction/last_seq": 0.32139620184898376, + "eval_valid_reconstruction/second_seq": 0.19363604485988617, + "eval_valid_runtime": 615.9645, + "eval_valid_samples_per_second": 0.312, + "eval_valid_steps_per_second": 0.312, + "step": 5450 + }, + { + "epoch": 0.020329297315040695, + "eval_train_loss": 2.224703788757324, + "eval_train_loss/all": 2.0562520027160645, + "eval_train_loss/end_span": 1.2423838376998901, + "eval_train_perplexity/batch": 7.816617965698242, + "eval_train_perplexity/end_span": 3.4638609886169434, + "eval_train_perplexity/fim": 2.0145890712738037, + "eval_train_perplexity/first_seq": 15.851958274841309, + "eval_train_perplexity/last_seq": 9.26349925994873, + "eval_train_perplexity/second_seq": 14.154516220092773, + "eval_train_perplexity/seq": 8.994952201843262, + "eval_train_reconstruction/all": 0.2751127779483795, + "eval_train_reconstruction/end_span": 0.7112782001495361, + "eval_train_reconstruction/fim": 0.1346932351589203, + "eval_train_reconstruction/first_seq": 0.1455805003643036, + "eval_train_reconstruction/last_seq": 0.3146388828754425, + "eval_train_reconstruction/second_seq": 0.1848672777414322, + "eval_train_runtime": 633.5226, + "eval_train_samples_per_second": 0.303, + "eval_train_steps_per_second": 0.303, + "step": 5450 + }, + { + "epoch": 0.020366598778004074, + "grad_norm": 1.942842721939087, 
+ "learning_rate": 0.0006, + "loss": 2.2924, + "step": 5460 + }, + { + "epoch": 0.02040390024096745, + "grad_norm": 0.32994523644447327, + "learning_rate": 0.0006, + "loss": 2.2433, + "step": 5470 + }, + { + "epoch": 0.02044120170393083, + "grad_norm": 0.36973246932029724, + "learning_rate": 0.0006, + "loss": 2.3229, + "step": 5480 + }, + { + "epoch": 0.020478503166894205, + "grad_norm": 1.0619710683822632, + "learning_rate": 0.0006, + "loss": 2.2547, + "step": 5490 + }, + { + "epoch": 0.02051580462985758, + "grad_norm": 0.41720080375671387, + "learning_rate": 0.0006, + "loss": 2.2479, + "step": 5500 + }, + { + "epoch": 0.02051580462985758, + "eval_valid_loss": 2.229055166244507, + "eval_valid_loss/all": 2.087860107421875, + "eval_valid_loss/end_span": 1.3387422561645508, + "eval_valid_perplexity/batch": 8.067632675170898, + "eval_valid_perplexity/end_span": 3.8142430782318115, + "eval_valid_perplexity/fim": 2.348160743713379, + "eval_valid_perplexity/first_seq": 14.84227466583252, + "eval_valid_perplexity/last_seq": 9.54340934753418, + "eval_valid_perplexity/second_seq": 13.858898162841797, + "eval_valid_perplexity/seq": 9.08495044708252, + "eval_valid_reconstruction/all": 0.2842037081718445, + "eval_valid_reconstruction/end_span": 0.6851649284362793, + "eval_valid_reconstruction/fim": 0.16353793442249298, + "eval_valid_reconstruction/first_seq": 0.16774193942546844, + "eval_valid_reconstruction/last_seq": 0.31016993522644043, + "eval_valid_reconstruction/second_seq": 0.19547441601753235, + "eval_valid_runtime": 622.3294, + "eval_valid_samples_per_second": 0.309, + "eval_valid_steps_per_second": 0.309, + "step": 5500 + }, + { + "epoch": 0.02051580462985758, + "eval_train_loss": 2.226181745529175, + "eval_train_loss/all": 2.057957172393799, + "eval_train_loss/end_span": 1.3065080642700195, + "eval_train_perplexity/batch": 7.829958438873291, + "eval_train_perplexity/end_span": 3.6932544708251953, + "eval_train_perplexity/fim": 2.360867977142334, + 
"eval_train_perplexity/first_seq": 15.11514949798584, + "eval_train_perplexity/last_seq": 8.895079612731934, + "eval_train_perplexity/second_seq": 14.104120254516602, + "eval_train_perplexity/seq": 9.011122703552246, + "eval_train_reconstruction/all": 0.27445095777511597, + "eval_train_reconstruction/end_span": 0.6960899829864502, + "eval_train_reconstruction/fim": 0.16422565281391144, + "eval_train_reconstruction/first_seq": 0.15801124274730682, + "eval_train_reconstruction/last_seq": 0.327866792678833, + "eval_train_reconstruction/second_seq": 0.18709352612495422, + "eval_train_runtime": 636.4682, + "eval_train_samples_per_second": 0.302, + "eval_train_steps_per_second": 0.302, + "step": 5500 + }, + { + "epoch": 0.02055310609282096, + "grad_norm": 0.4887006878852844, + "learning_rate": 0.0006, + "loss": 2.2541, + "step": 5510 + }, + { + "epoch": 0.020590407555784337, + "grad_norm": 0.45396241545677185, + "learning_rate": 0.0006, + "loss": 2.1317, + "step": 5520 + }, + { + "epoch": 0.020627709018747716, + "grad_norm": 0.5951597094535828, + "learning_rate": 0.0006, + "loss": 2.2705, + "step": 5530 + }, + { + "epoch": 0.020665010481711092, + "grad_norm": 0.4685579538345337, + "learning_rate": 0.0006, + "loss": 2.2903, + "step": 5540 + }, + { + "epoch": 0.02070231194467447, + "grad_norm": 0.43754062056541443, + "learning_rate": 0.0006, + "loss": 2.2542, + "step": 5550 + }, + { + "epoch": 0.02070231194467447, + "eval_valid_loss": 2.223252534866333, + "eval_valid_loss/all": 2.0826680660247803, + "eval_valid_loss/end_span": 1.2186239957809448, + "eval_valid_perplexity/batch": 8.025854110717773, + "eval_valid_perplexity/end_span": 3.3825302124023438, + "eval_valid_perplexity/fim": 2.0830109119415283, + "eval_valid_perplexity/first_seq": 15.052003860473633, + "eval_valid_perplexity/last_seq": 9.024721145629883, + "eval_valid_perplexity/second_seq": 13.956100463867188, + "eval_valid_perplexity/seq": 9.038651466369629, + "eval_valid_reconstruction/all": 0.28543227910995483, 
+ "eval_valid_reconstruction/end_span": 0.7123230695724487, + "eval_valid_reconstruction/fim": 0.14062009751796722, + "eval_valid_reconstruction/first_seq": 0.1651817113161087, + "eval_valid_reconstruction/last_seq": 0.3275787830352783, + "eval_valid_reconstruction/second_seq": 0.19329069554805756, + "eval_valid_runtime": 618.518, + "eval_valid_samples_per_second": 0.31, + "eval_valid_steps_per_second": 0.31, + "step": 5550 + }, + { + "epoch": 0.02070231194467447, + "eval_train_loss": 2.2200191020965576, + "eval_train_loss/all": 2.0522613525390625, + "eval_train_loss/end_span": 1.1985273361206055, + "eval_train_perplexity/batch": 7.785487174987793, + "eval_train_perplexity/end_span": 3.3152310848236084, + "eval_train_perplexity/fim": 2.4175727367401123, + "eval_train_perplexity/first_seq": 15.401650428771973, + "eval_train_perplexity/last_seq": 9.389399528503418, + "eval_train_perplexity/second_seq": 14.430773735046387, + "eval_train_perplexity/seq": 8.95844841003418, + "eval_train_reconstruction/all": 0.2760007381439209, + "eval_train_reconstruction/end_span": 0.7204875946044922, + "eval_train_reconstruction/fim": 0.17067842185497284, + "eval_train_reconstruction/first_seq": 0.15638568997383118, + "eval_train_reconstruction/last_seq": 0.31108349561691284, + "eval_train_reconstruction/second_seq": 0.17974330484867096, + "eval_train_runtime": 637.1829, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 5550 + }, + { + "epoch": 0.020739613407637848, + "grad_norm": 0.4627315402030945, + "learning_rate": 0.0006, + "loss": 2.1987, + "step": 5560 + }, + { + "epoch": 0.020776914870601224, + "grad_norm": 0.5638169646263123, + "learning_rate": 0.0006, + "loss": 1.9881, + "step": 5570 + }, + { + "epoch": 0.020814216333564603, + "grad_norm": 0.44906434416770935, + "learning_rate": 0.0006, + "loss": 2.2474, + "step": 5580 + }, + { + "epoch": 0.02085151779652798, + "grad_norm": 0.3960329592227936, + "learning_rate": 0.0006, + "loss": 
2.1621, + "step": 5590 + }, + { + "epoch": 0.02088881925949136, + "grad_norm": 0.5010073781013489, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 5600 + }, + { + "epoch": 0.02088881925949136, + "eval_valid_loss": 2.224942445755005, + "eval_valid_loss/all": 2.0843186378479004, + "eval_valid_loss/end_span": 1.317182183265686, + "eval_valid_perplexity/batch": 8.039112091064453, + "eval_valid_perplexity/end_span": 3.7328879833221436, + "eval_valid_perplexity/fim": 2.2653753757476807, + "eval_valid_perplexity/first_seq": 14.908516883850098, + "eval_valid_perplexity/last_seq": 9.568769454956055, + "eval_valid_perplexity/second_seq": 13.707858085632324, + "eval_valid_perplexity/seq": 9.055315971374512, + "eval_valid_reconstruction/all": 0.28573325276374817, + "eval_valid_reconstruction/end_span": 0.6901636123657227, + "eval_valid_reconstruction/fim": 0.15741552412509918, + "eval_valid_reconstruction/first_seq": 0.16631394624710083, + "eval_valid_reconstruction/last_seq": 0.307340145111084, + "eval_valid_reconstruction/second_seq": 0.20056705176830292, + "eval_valid_runtime": 613.9261, + "eval_valid_samples_per_second": 0.313, + "eval_valid_steps_per_second": 0.313, + "step": 5600 + }, + { + "epoch": 0.02088881925949136, + "eval_train_loss": 2.221632242202759, + "eval_train_loss/all": 2.054037094116211, + "eval_train_loss/end_span": 1.2907874584197998, + "eval_train_perplexity/batch": 7.799324035644531, + "eval_train_perplexity/end_span": 3.635648250579834, + "eval_train_perplexity/fim": 2.0166988372802734, + "eval_train_perplexity/first_seq": 15.441314697265625, + "eval_train_perplexity/last_seq": 9.46691608428955, + "eval_train_perplexity/second_seq": 14.6240234375, + "eval_train_perplexity/seq": 8.982029914855957, + "eval_train_reconstruction/all": 0.2758518159389496, + "eval_train_reconstruction/end_span": 0.697433590888977, + "eval_train_reconstruction/fim": 0.13500377535820007, + "eval_train_reconstruction/first_seq": 0.15390749275684357, + 
"eval_train_reconstruction/last_seq": 0.3088766634464264, + "eval_train_reconstruction/second_seq": 0.17629976570606232, + "eval_train_runtime": 629.3967, + "eval_train_samples_per_second": 0.305, + "eval_train_steps_per_second": 0.305, + "step": 5600 + }, + { + "epoch": 0.020926120722454734, + "grad_norm": 0.5752795934677124, + "learning_rate": 0.0006, + "loss": 2.4711, + "step": 5610 + }, + { + "epoch": 0.020963422185418114, + "grad_norm": 0.2949106693267822, + "learning_rate": 0.0006, + "loss": 2.3182, + "step": 5620 + }, + { + "epoch": 0.02100072364838149, + "grad_norm": 0.25569188594818115, + "learning_rate": 0.0006, + "loss": 2.3128, + "step": 5630 + }, + { + "epoch": 0.021038025111344866, + "grad_norm": 0.38897904753685, + "learning_rate": 0.0006, + "loss": 2.3042, + "step": 5640 + }, + { + "epoch": 0.021075326574308245, + "grad_norm": 0.4309357702732086, + "learning_rate": 0.0006, + "loss": 2.2485, + "step": 5650 + }, + { + "epoch": 0.021075326574308245, + "eval_valid_loss": 2.2240171432495117, + "eval_valid_loss/all": 2.0833334922790527, + "eval_valid_loss/end_span": 1.3540927171707153, + "eval_valid_perplexity/batch": 8.031196594238281, + "eval_valid_perplexity/end_span": 3.8732452392578125, + "eval_valid_perplexity/fim": 2.28928279876709, + "eval_valid_perplexity/first_seq": 14.781220436096191, + "eval_valid_perplexity/last_seq": 9.075334548950195, + "eval_valid_perplexity/second_seq": 13.88262939453125, + "eval_valid_perplexity/seq": 9.045472145080566, + "eval_valid_reconstruction/all": 0.28584086894989014, + "eval_valid_reconstruction/end_span": 0.6817315220832825, + "eval_valid_reconstruction/fim": 0.15875989198684692, + "eval_valid_reconstruction/first_seq": 0.17014314234256744, + "eval_valid_reconstruction/last_seq": 0.32251739501953125, + "eval_valid_reconstruction/second_seq": 0.19592933356761932, + "eval_valid_runtime": 618.2886, + "eval_valid_samples_per_second": 0.311, + "eval_valid_steps_per_second": 0.311, + "step": 5650 + }, + { + "epoch": 
0.021075326574308245, + "eval_train_loss": 2.2211596965789795, + "eval_train_loss/all": 2.053325891494751, + "eval_train_loss/end_span": 1.3346431255340576, + "eval_train_perplexity/batch": 7.793779373168945, + "eval_train_perplexity/end_span": 3.798640012741089, + "eval_train_perplexity/fim": 2.3539490699768066, + "eval_train_perplexity/first_seq": 15.523615837097168, + "eval_train_perplexity/last_seq": 9.278877258300781, + "eval_train_perplexity/second_seq": 14.814210891723633, + "eval_train_perplexity/seq": 8.966780662536621, + "eval_train_reconstruction/all": 0.27607980370521545, + "eval_train_reconstruction/end_span": 0.6851200461387634, + "eval_train_reconstruction/fim": 0.16376009583473206, + "eval_train_reconstruction/first_seq": 0.15156963467597961, + "eval_train_reconstruction/last_seq": 0.3206188976764679, + "eval_train_reconstruction/second_seq": 0.17311903834342957, + "eval_train_runtime": 637.4209, + "eval_train_samples_per_second": 0.301, + "eval_train_steps_per_second": 0.301, + "step": 5650 + }, + { + "epoch": 0.02111262803727162, + "grad_norm": 0.4569694697856903, + "learning_rate": 0.0006, + "loss": 2.2917, + "step": 5660 + }, + { + "epoch": 0.021149929500235, + "grad_norm": 0.3607451319694519, + "learning_rate": 0.0006, + "loss": 2.1736, + "step": 5670 + }, + { + "epoch": 0.021187230963198377, + "grad_norm": 0.5112768411636353, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 5680 + }, + { + "epoch": 0.021224532426161753, + "grad_norm": 0.3941221535205841, + "learning_rate": 0.0006, + "loss": 2.3363, + "step": 5690 + }, + { + "epoch": 0.021261833889125132, + "grad_norm": 0.46821942925453186, + "learning_rate": 0.0006, + "loss": 2.0144, + "step": 5700 + }, + { + "epoch": 0.021261833889125132, + "eval_valid_loss": 2.230893850326538, + "eval_valid_loss/all": 2.089656352996826, + "eval_valid_loss/end_span": 1.2725584506988525, + "eval_valid_perplexity/batch": 8.082137107849121, + "eval_valid_perplexity/end_span": 3.569974422454834, + 
"eval_valid_perplexity/fim": 2.254312753677368, + "eval_valid_perplexity/first_seq": 15.328804969787598, + "eval_valid_perplexity/last_seq": 9.107589721679688, + "eval_valid_perplexity/second_seq": 13.877903938293457, + "eval_valid_perplexity/seq": 9.1005859375, + "eval_valid_reconstruction/all": 0.2839661240577698, + "eval_valid_reconstruction/end_span": 0.7044311165809631, + "eval_valid_reconstruction/fim": 0.15477031469345093, + "eval_valid_reconstruction/first_seq": 0.16021312773227692, + "eval_valid_reconstruction/last_seq": 0.32547467947006226, + "eval_valid_reconstruction/second_seq": 0.1949607878923416, + "eval_valid_runtime": 611.9393, + "eval_valid_samples_per_second": 0.314, + "eval_valid_steps_per_second": 0.314, + "step": 5700 + }, + { + "epoch": 0.021261833889125132, + "eval_train_loss": 2.228093385696411, + "eval_train_loss/all": 2.059436798095703, + "eval_train_loss/end_span": 1.2363907098770142, + "eval_train_perplexity/batch": 7.841552257537842, + "eval_train_perplexity/end_span": 3.4431636333465576, + "eval_train_perplexity/fim": 2.0090537071228027, + "eval_train_perplexity/first_seq": 15.407328605651855, + "eval_train_perplexity/last_seq": 9.824735641479492, + "eval_train_perplexity/second_seq": 14.200204849243164, + "eval_train_perplexity/seq": 9.025213241577148, + "eval_train_reconstruction/all": 0.27409040927886963, + "eval_train_reconstruction/end_span": 0.7130901217460632, + "eval_train_reconstruction/fim": 0.1325710266828537, + "eval_train_reconstruction/first_seq": 0.1577351689338684, + "eval_train_reconstruction/last_seq": 0.29236680269241333, + "eval_train_reconstruction/second_seq": 0.18546386063098907, + "eval_train_runtime": 631.5654, + "eval_train_samples_per_second": 0.304, + "eval_train_steps_per_second": 0.304, + "step": 5700 + }, + { + "epoch": 0.021299135352088508, + "grad_norm": 0.43353271484375, + "learning_rate": 0.0006, + "loss": 2.3788, + "step": 5710 + }, + { + "epoch": 0.021336436815051887, + "grad_norm": 
0.3123222291469574, + "learning_rate": 0.0006, + "loss": 2.2794, + "step": 5720 + }, + { + "epoch": 0.021373738278015263, + "grad_norm": 0.4540006220340729, + "learning_rate": 0.0006, + "loss": 2.2606, + "step": 5730 + }, + { + "epoch": 0.021411039740978643, + "grad_norm": 0.33468228578567505, + "learning_rate": 0.0006, + "loss": 2.3697, + "step": 5740 + }, + { + "epoch": 0.02144834120394202, + "grad_norm": 0.5110843181610107, + "learning_rate": 0.0006, + "loss": 2.325, + "step": 5750 + }, + { + "epoch": 0.02144834120394202, + "eval_valid_loss": 2.2301886081695557, + "eval_valid_loss/all": 2.088395833969116, + "eval_valid_loss/end_span": 1.2141698598861694, + "eval_valid_perplexity/batch": 8.071955680847168, + "eval_valid_perplexity/end_span": 3.367497444152832, + "eval_valid_perplexity/fim": 2.250370502471924, + "eval_valid_perplexity/first_seq": 15.197168350219727, + "eval_valid_perplexity/last_seq": 9.219261169433594, + "eval_valid_perplexity/second_seq": 13.823446273803711, + "eval_valid_perplexity/seq": 9.085515022277832, + "eval_valid_reconstruction/all": 0.284136027097702, + "eval_valid_reconstruction/end_span": 0.7241054177284241, + "eval_valid_reconstruction/fim": 0.15409541130065918, + "eval_valid_reconstruction/first_seq": 0.16178953647613525, + "eval_valid_reconstruction/last_seq": 0.3181784749031067, + "eval_valid_reconstruction/second_seq": 0.1947881281375885, + "eval_valid_runtime": 620.3293, + "eval_valid_samples_per_second": 0.31, + "eval_valid_steps_per_second": 0.31, + "step": 5750 + }, + { + "epoch": 0.02144834120394202, + "eval_train_loss": 2.226713180541992, + "eval_train_loss/all": 2.057969093322754, + "eval_train_loss/end_span": 1.1781142950057983, + "eval_train_perplexity/batch": 7.830051422119141, + "eval_train_perplexity/end_span": 3.2482430934906006, + "eval_train_perplexity/fim": 2.491793394088745, + "eval_train_perplexity/first_seq": 15.528861999511719, + "eval_train_perplexity/last_seq": 9.603857040405273, + 
"eval_train_perplexity/second_seq": 14.604172706604004, + "eval_train_perplexity/seq": 9.009343147277832, + "eval_train_reconstruction/all": 0.2744409441947937, + "eval_train_reconstruction/end_span": 0.732449471950531, + "eval_train_reconstruction/fim": 0.17502953112125397, + "eval_train_reconstruction/first_seq": 0.15183259546756744, + "eval_train_reconstruction/last_seq": 0.30335474014282227, + "eval_train_reconstruction/second_seq": 0.1750030517578125, + "eval_train_runtime": 624.6104, + "eval_train_samples_per_second": 0.307, + "eval_train_steps_per_second": 0.307, + "step": 5750 + }, + { + "epoch": 0.021485642666905395, + "grad_norm": 0.31182876229286194, + "learning_rate": 0.0006, + "loss": 2.3115, + "step": 5760 + }, + { + "epoch": 0.021522944129868774, + "grad_norm": 0.6920693516731262, + "learning_rate": 0.0006, + "loss": 2.1662, + "step": 5770 + }, + { + "epoch": 0.02156024559283215, + "grad_norm": 0.3557916283607483, + "learning_rate": 0.0006, + "loss": 2.354, + "step": 5780 + }, + { + "epoch": 0.02159754705579553, + "grad_norm": 0.40260428190231323, + "learning_rate": 0.0006, + "loss": 2.1393, + "step": 5790 + }, + { + "epoch": 0.021634848518758906, + "grad_norm": 0.39083054661750793, + "learning_rate": 0.0006, + "loss": 2.3242, + "step": 5800 + }, + { + "epoch": 0.021634848518758906, + "eval_valid_loss": 2.225937604904175, + "eval_valid_loss/all": 2.0849974155426025, + "eval_valid_loss/end_span": 1.312736988067627, + "eval_valid_perplexity/batch": 8.044570922851562, + "eval_valid_perplexity/end_span": 3.7163312435150146, + "eval_valid_perplexity/fim": 2.371581554412842, + "eval_valid_perplexity/first_seq": 14.957759857177734, + "eval_valid_perplexity/last_seq": 9.259546279907227, + "eval_valid_perplexity/second_seq": 14.09085750579834, + "eval_valid_perplexity/seq": 9.057320594787598, + "eval_valid_reconstruction/all": 0.28513604402542114, + "eval_valid_reconstruction/end_span": 0.7009559869766235, + "eval_valid_reconstruction/fim": 
0.16605991125106812, + "eval_valid_reconstruction/first_seq": 0.1629207879304886, + "eval_valid_reconstruction/last_seq": 0.3188686668872833, + "eval_valid_reconstruction/second_seq": 0.1875770092010498, + "eval_valid_runtime": 612.4705, + "eval_valid_samples_per_second": 0.313, + "eval_valid_steps_per_second": 0.313, + "step": 5800 + }, + { + "epoch": 0.021634848518758906, + "eval_train_loss": 2.2216286659240723, + "eval_train_loss/all": 2.0534286499023438, + "eval_train_loss/end_span": 1.2769421339035034, + "eval_train_perplexity/batch": 7.794580459594727, + "eval_train_perplexity/end_span": 3.585658550262451, + "eval_train_perplexity/fim": 2.021721839904785, + "eval_train_perplexity/first_seq": 15.363116264343262, + "eval_train_perplexity/last_seq": 9.307095527648926, + "eval_train_perplexity/second_seq": 14.001900672912598, + "eval_train_perplexity/seq": 8.965397834777832, + "eval_train_reconstruction/all": 0.2757527530193329, + "eval_train_reconstruction/end_span": 0.7084291577339172, + "eval_train_reconstruction/fim": 0.13645309209823608, + "eval_train_reconstruction/first_seq": 0.1544342041015625, + "eval_train_reconstruction/last_seq": 0.31303513050079346, + "eval_train_reconstruction/second_seq": 0.18588270246982574, + "eval_train_runtime": 616.8271, + "eval_train_samples_per_second": 0.311, + "eval_train_steps_per_second": 0.311, + "step": 5800 + }, + { + "epoch": 0.02167214998172228, + "grad_norm": 0.4894244074821472, + "learning_rate": 0.0006, + "loss": 2.1619, + "step": 5810 + }, + { + "epoch": 0.02170945144468566, + "grad_norm": 0.4187922477722168, + "learning_rate": 0.0006, + "loss": 2.1742, + "step": 5820 + }, + { + "epoch": 0.021746752907649037, + "grad_norm": 0.37085816264152527, + "learning_rate": 0.0006, + "loss": 2.2565, + "step": 5830 + }, + { + "epoch": 0.021784054370612416, + "grad_norm": 0.5761265158653259, + "learning_rate": 0.0006, + "loss": 2.4394, + "step": 5840 + }, + { + "epoch": 0.021821355833575792, + "grad_norm": 
0.5983036756515503, + "learning_rate": 0.0006, + "loss": 2.2042, + "step": 5850 + }, + { + "epoch": 0.021821355833575792, + "eval_valid_loss": 2.2329487800598145, + "eval_valid_loss/all": 2.0917744636535645, + "eval_valid_loss/end_span": 1.2465213537216187, + "eval_valid_perplexity/batch": 8.099274635314941, + "eval_valid_perplexity/end_span": 3.478222370147705, + "eval_valid_perplexity/fim": 2.1068356037139893, + "eval_valid_perplexity/first_seq": 15.319539070129395, + "eval_valid_perplexity/last_seq": 8.818501472473145, + "eval_valid_perplexity/second_seq": 14.240500450134277, + "eval_valid_perplexity/seq": 9.126779556274414, + "eval_valid_reconstruction/all": 0.28379765152931213, + "eval_valid_reconstruction/end_span": 0.7071787714958191, + "eval_valid_reconstruction/fim": 0.14184463024139404, + "eval_valid_reconstruction/first_seq": 0.15937118232250214, + "eval_valid_reconstruction/last_seq": 0.3354894518852234, + "eval_valid_reconstruction/second_seq": 0.18110890686511993, + "eval_valid_runtime": 601.2968, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 5850 + }, + { + "epoch": 0.021821355833575792, + "eval_train_loss": 2.228079319000244, + "eval_train_loss/all": 2.0594589710235596, + "eval_train_loss/end_span": 1.2262057065963745, + "eval_train_perplexity/batch": 7.841725826263428, + "eval_train_perplexity/end_span": 3.4082729816436768, + "eval_train_perplexity/fim": 2.081106662750244, + "eval_train_perplexity/first_seq": 15.173215866088867, + "eval_train_perplexity/last_seq": 9.172502517700195, + "eval_train_perplexity/second_seq": 14.524949073791504, + "eval_train_perplexity/seq": 9.026809692382812, + "eval_train_reconstruction/all": 0.2743070125579834, + "eval_train_reconstruction/end_span": 0.7124117612838745, + "eval_train_reconstruction/fim": 0.1396891474723816, + "eval_train_reconstruction/first_seq": 0.15727145969867706, + "eval_train_reconstruction/last_seq": 0.3197454512119293, + 
"eval_train_reconstruction/second_seq": 0.17550143599510193, + "eval_train_runtime": 607.5027, + "eval_train_samples_per_second": 0.316, + "eval_train_steps_per_second": 0.316, + "step": 5850 + }, + { + "epoch": 0.021858657296539172, + "grad_norm": 0.34768933057785034, + "learning_rate": 0.0006, + "loss": 2.1973, + "step": 5860 + }, + { + "epoch": 0.021895958759502548, + "grad_norm": 0.5033017992973328, + "learning_rate": 0.0006, + "loss": 2.1196, + "step": 5870 + }, + { + "epoch": 0.021933260222465924, + "grad_norm": 0.9934744238853455, + "learning_rate": 0.0006, + "loss": 2.3106, + "step": 5880 + }, + { + "epoch": 0.021970561685429303, + "grad_norm": 0.3728700280189514, + "learning_rate": 0.0006, + "loss": 2.3226, + "step": 5890 + }, + { + "epoch": 0.02200786314839268, + "grad_norm": 0.514291524887085, + "learning_rate": 0.0006, + "loss": 2.1989, + "step": 5900 + }, + { + "epoch": 0.02200786314839268, + "eval_valid_loss": 2.2286550998687744, + "eval_valid_loss/all": 2.088334560394287, + "eval_valid_loss/end_span": 1.3648717403411865, + "eval_valid_perplexity/batch": 8.07146167755127, + "eval_valid_perplexity/end_span": 3.9152207374572754, + "eval_valid_perplexity/fim": 2.512554883956909, + "eval_valid_perplexity/first_seq": 14.729471206665039, + "eval_valid_perplexity/last_seq": 9.144680976867676, + "eval_valid_perplexity/second_seq": 13.8806734085083, + "eval_valid_perplexity/seq": 9.096248626708984, + "eval_valid_reconstruction/all": 0.2839546799659729, + "eval_valid_reconstruction/end_span": 0.6725719571113586, + "eval_valid_reconstruction/fim": 0.1750318557024002, + "eval_valid_reconstruction/first_seq": 0.17040997743606567, + "eval_valid_reconstruction/last_seq": 0.32418039441108704, + "eval_valid_reconstruction/second_seq": 0.1919833868741989, + "eval_valid_runtime": 603.5676, + "eval_valid_samples_per_second": 0.318, + "eval_valid_steps_per_second": 0.318, + "step": 5900 + }, + { + "epoch": 0.02200786314839268, + "eval_train_loss": 2.2237472534179688, + 
"eval_train_loss/all": 2.0560858249664307, + "eval_train_loss/end_span": 1.3433138132095337, + "eval_train_perplexity/batch": 7.815319538116455, + "eval_train_perplexity/end_span": 3.8317201137542725, + "eval_train_perplexity/fim": 2.1929714679718018, + "eval_train_perplexity/first_seq": 15.585864067077637, + "eval_train_perplexity/last_seq": 9.53062629699707, + "eval_train_perplexity/second_seq": 14.054883003234863, + "eval_train_perplexity/seq": 9.000680923461914, + "eval_train_reconstruction/all": 0.2745686173439026, + "eval_train_reconstruction/end_span": 0.6775015592575073, + "eval_train_reconstruction/fim": 0.15067213773727417, + "eval_train_reconstruction/first_seq": 0.14856810867786407, + "eval_train_reconstruction/last_seq": 0.3092955946922302, + "eval_train_reconstruction/second_seq": 0.1835942268371582, + "eval_train_runtime": 602.3226, + "eval_train_samples_per_second": 0.319, + "eval_train_steps_per_second": 0.319, + "step": 5900 + }, + { + "epoch": 0.02204516461135606, + "grad_norm": 0.5243600606918335, + "learning_rate": 0.0006, + "loss": 2.339, + "step": 5910 + }, + { + "epoch": 0.022082466074319435, + "grad_norm": 0.4888463616371155, + "learning_rate": 0.0006, + "loss": 2.2918, + "step": 5920 + }, + { + "epoch": 0.02211976753728281, + "grad_norm": 0.35090774297714233, + "learning_rate": 0.0006, + "loss": 2.1408, + "step": 5930 + }, + { + "epoch": 0.02215706900024619, + "grad_norm": 0.3568800985813141, + "learning_rate": 0.0006, + "loss": 2.1934, + "step": 5940 + }, + { + "epoch": 0.022194370463209566, + "grad_norm": 0.5147931575775146, + "learning_rate": 0.0006, + "loss": 2.307, + "step": 5950 + }, + { + "epoch": 0.022194370463209566, + "eval_valid_loss": 2.23185133934021, + "eval_valid_loss/all": 2.090348958969116, + "eval_valid_loss/end_span": 1.2652267217636108, + "eval_valid_perplexity/batch": 8.087737083435059, + "eval_valid_perplexity/end_span": 3.543896198272705, + "eval_valid_perplexity/fim": 2.239328145980835, + 
"eval_valid_perplexity/first_seq": 14.596807479858398, + "eval_valid_perplexity/last_seq": 9.452531814575195, + "eval_valid_perplexity/second_seq": 14.354691505432129, + "eval_valid_perplexity/seq": 9.101688385009766, + "eval_valid_reconstruction/all": 0.2844664454460144, + "eval_valid_reconstruction/end_span": 0.7134448885917664, + "eval_valid_reconstruction/fim": 0.15370072424411774, + "eval_valid_reconstruction/first_seq": 0.17818482220172882, + "eval_valid_reconstruction/last_seq": 0.31359970569610596, + "eval_valid_reconstruction/second_seq": 0.17890752851963043, + "eval_valid_runtime": 602.064, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 5950 + }, + { + "epoch": 0.022194370463209566, + "eval_train_loss": 2.229752540588379, + "eval_train_loss/all": 2.060256004333496, + "eval_train_loss/end_span": 1.2348239421844482, + "eval_train_perplexity/batch": 7.847978591918945, + "eval_train_perplexity/end_span": 3.4377732276916504, + "eval_train_perplexity/fim": 2.0498147010803223, + "eval_train_perplexity/first_seq": 15.629010200500488, + "eval_train_perplexity/last_seq": 9.185834884643555, + "eval_train_perplexity/second_seq": 14.376083374023438, + "eval_train_perplexity/seq": 9.022818565368652, + "eval_train_reconstruction/all": 0.2743455171585083, + "eval_train_reconstruction/end_span": 0.7209550738334656, + "eval_train_reconstruction/fim": 0.1368505209684372, + "eval_train_reconstruction/first_seq": 0.14687088131904602, + "eval_train_reconstruction/last_seq": 0.31651830673217773, + "eval_train_reconstruction/second_seq": 0.18012282252311707, + "eval_train_runtime": 611.8484, + "eval_train_samples_per_second": 0.314, + "eval_train_steps_per_second": 0.314, + "step": 5950 + }, + { + "epoch": 0.022231671926172945, + "grad_norm": 0.4207587242126465, + "learning_rate": 0.0006, + "loss": 2.2286, + "step": 5960 + }, + { + "epoch": 0.02226897338913632, + "grad_norm": 0.5164647698402405, + "learning_rate": 0.0006, + "loss": 
2.2597, + "step": 5970 + }, + { + "epoch": 0.0223062748520997, + "grad_norm": 0.37840592861175537, + "learning_rate": 0.0006, + "loss": 2.2186, + "step": 5980 + }, + { + "epoch": 0.022343576315063077, + "grad_norm": 0.4667038321495056, + "learning_rate": 0.0006, + "loss": 2.3312, + "step": 5990 + }, + { + "epoch": 0.022380877778026453, + "grad_norm": 0.3310874402523041, + "learning_rate": 0.0006, + "loss": 2.3291, + "step": 6000 + }, + { + "epoch": 0.022380877778026453, + "eval_valid_loss": 2.229374408721924, + "eval_valid_loss/all": 2.0883255004882812, + "eval_valid_loss/end_span": 1.2111467123031616, + "eval_valid_perplexity/batch": 8.071388244628906, + "eval_valid_perplexity/end_span": 3.357332229614258, + "eval_valid_perplexity/fim": 2.214534044265747, + "eval_valid_perplexity/first_seq": 14.428644180297852, + "eval_valid_perplexity/last_seq": 9.447149276733398, + "eval_valid_perplexity/second_seq": 14.028589248657227, + "eval_valid_perplexity/seq": 9.087388038635254, + "eval_valid_reconstruction/all": 0.284432590007782, + "eval_valid_reconstruction/end_span": 0.7177093029022217, + "eval_valid_reconstruction/fim": 0.15196554362773895, + "eval_valid_reconstruction/first_seq": 0.1755508929491043, + "eval_valid_reconstruction/last_seq": 0.3131129741668701, + "eval_valid_reconstruction/second_seq": 0.19220338761806488, + "eval_valid_runtime": 608.3554, + "eval_valid_samples_per_second": 0.316, + "eval_valid_steps_per_second": 0.316, + "step": 6000 + }, + { + "epoch": 0.022380877778026453, + "eval_train_loss": 2.2265028953552246, + "eval_train_loss/all": 2.057814359664917, + "eval_train_loss/end_span": 1.1861721277236938, + "eval_train_perplexity/batch": 7.828840255737305, + "eval_train_perplexity/end_span": 3.2745227813720703, + "eval_train_perplexity/fim": 1.9618529081344604, + "eval_train_perplexity/first_seq": 15.24968433380127, + "eval_train_perplexity/last_seq": 9.141945838928223, + "eval_train_perplexity/second_seq": 14.2843656539917, + 
"eval_train_perplexity/seq": 8.999735832214355, + "eval_train_reconstruction/all": 0.2746197283267975, + "eval_train_reconstruction/end_span": 0.7254465222358704, + "eval_train_reconstruction/fim": 0.12922163307666779, + "eval_train_reconstruction/first_seq": 0.15583932399749756, + "eval_train_reconstruction/last_seq": 0.3213326334953308, + "eval_train_reconstruction/second_seq": 0.1806483417749405, + "eval_train_runtime": 602.426, + "eval_train_samples_per_second": 0.319, + "eval_train_steps_per_second": 0.319, + "step": 6000 + }, + { + "epoch": 0.022418179240989832, + "grad_norm": 0.4117010831832886, + "learning_rate": 0.0006, + "loss": 2.3071, + "step": 6010 + }, + { + "epoch": 0.022455480703953208, + "grad_norm": 0.39749374985694885, + "learning_rate": 0.0006, + "loss": 2.1826, + "step": 6020 + }, + { + "epoch": 0.022492782166916588, + "grad_norm": 0.5297390818595886, + "learning_rate": 0.0006, + "loss": 2.3501, + "step": 6030 + }, + { + "epoch": 0.022530083629879964, + "grad_norm": 0.4210558831691742, + "learning_rate": 0.0006, + "loss": 2.2559, + "step": 6040 + }, + { + "epoch": 0.022567385092843343, + "grad_norm": 0.4082106351852417, + "learning_rate": 0.0006, + "loss": 2.4088, + "step": 6050 + }, + { + "epoch": 0.022567385092843343, + "eval_valid_loss": 2.2321877479553223, + "eval_valid_loss/all": 2.0911612510681152, + "eval_valid_loss/end_span": 1.2914451360702515, + "eval_valid_perplexity/batch": 8.094308853149414, + "eval_valid_perplexity/end_span": 3.63804030418396, + "eval_valid_perplexity/fim": 2.3151135444641113, + "eval_valid_perplexity/first_seq": 14.798049926757812, + "eval_valid_perplexity/last_seq": 9.24698257446289, + "eval_valid_perplexity/second_seq": 13.446891784667969, + "eval_valid_perplexity/seq": 9.114487648010254, + "eval_valid_reconstruction/all": 0.28234219551086426, + "eval_valid_reconstruction/end_span": 0.7021956443786621, + "eval_valid_reconstruction/fim": 0.15922337770462036, + "eval_valid_reconstruction/first_seq": 
0.1702926605939865, + "eval_valid_reconstruction/last_seq": 0.3184945583343506, + "eval_valid_reconstruction/second_seq": 0.2000347524881363, + "eval_valid_runtime": 599.729, + "eval_valid_samples_per_second": 0.32, + "eval_valid_steps_per_second": 0.32, + "step": 6050 + }, + { + "epoch": 0.022567385092843343, + "eval_train_loss": 2.2273762226104736, + "eval_train_loss/all": 2.05810284614563, + "eval_train_loss/end_span": 1.2572195529937744, + "eval_train_perplexity/batch": 7.831099033355713, + "eval_train_perplexity/end_span": 3.5156328678131104, + "eval_train_perplexity/fim": 2.1155848503112793, + "eval_train_perplexity/first_seq": 15.463666915893555, + "eval_train_perplexity/last_seq": 9.144914627075195, + "eval_train_perplexity/second_seq": 14.285694122314453, + "eval_train_perplexity/seq": 9.002165794372559, + "eval_train_reconstruction/all": 0.27368178963661194, + "eval_train_reconstruction/end_span": 0.7129805088043213, + "eval_train_reconstruction/fim": 0.14327654242515564, + "eval_train_reconstruction/first_seq": 0.1542086899280548, + "eval_train_reconstruction/last_seq": 0.3229828178882599, + "eval_train_reconstruction/second_seq": 0.18394748866558075, + "eval_train_runtime": 612.2422, + "eval_train_samples_per_second": 0.314, + "eval_train_steps_per_second": 0.314, + "step": 6050 + }, + { + "epoch": 0.02260468655580672, + "grad_norm": 0.39888519048690796, + "learning_rate": 0.0006, + "loss": 2.331, + "step": 6060 + }, + { + "epoch": 0.022641988018770095, + "grad_norm": 0.3682010769844055, + "learning_rate": 0.0006, + "loss": 2.1431, + "step": 6070 + }, + { + "epoch": 0.022679289481733474, + "grad_norm": 0.4392210841178894, + "learning_rate": 0.0006, + "loss": 2.2191, + "step": 6080 + }, + { + "epoch": 0.02271659094469685, + "grad_norm": 0.39702165126800537, + "learning_rate": 0.0006, + "loss": 2.3326, + "step": 6090 + }, + { + "epoch": 0.02275389240766023, + "grad_norm": 0.3697516918182373, + "learning_rate": 0.0006, + "loss": 2.1239, + "step": 6100 + }, 
+ { + "epoch": 0.02275389240766023, + "eval_valid_loss": 2.2341485023498535, + "eval_valid_loss/all": 2.092970132827759, + "eval_valid_loss/end_span": 1.270001769065857, + "eval_valid_perplexity/batch": 8.108963966369629, + "eval_valid_perplexity/end_span": 3.560858964920044, + "eval_valid_perplexity/fim": 2.3972995281219482, + "eval_valid_perplexity/first_seq": 14.92536449432373, + "eval_valid_perplexity/last_seq": 9.251628875732422, + "eval_valid_perplexity/second_seq": 13.891376495361328, + "eval_valid_perplexity/seq": 9.137398719787598, + "eval_valid_reconstruction/all": 0.28289350867271423, + "eval_valid_reconstruction/end_span": 0.7055772542953491, + "eval_valid_reconstruction/fim": 0.16595803201198578, + "eval_valid_reconstruction/first_seq": 0.16728660464286804, + "eval_valid_reconstruction/last_seq": 0.3189758360385895, + "eval_valid_reconstruction/second_seq": 0.1962626427412033, + "eval_valid_runtime": 601.637, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 6100 + }, + { + "epoch": 0.02275389240766023, + "eval_train_loss": 2.2299740314483643, + "eval_train_loss/all": 2.061197519302368, + "eval_train_loss/end_span": 1.233515977859497, + "eval_train_perplexity/batch": 7.855370998382568, + "eval_train_perplexity/end_span": 3.4332797527313232, + "eval_train_perplexity/fim": 2.0730462074279785, + "eval_train_perplexity/first_seq": 15.736215591430664, + "eval_train_perplexity/last_seq": 9.682657241821289, + "eval_train_perplexity/second_seq": 14.290741920471191, + "eval_train_perplexity/seq": 9.039433479309082, + "eval_train_reconstruction/all": 0.2733984589576721, + "eval_train_reconstruction/end_span": 0.7175217270851135, + "eval_train_reconstruction/fim": 0.13871271908283234, + "eval_train_reconstruction/first_seq": 0.1502525508403778, + "eval_train_reconstruction/last_seq": 0.3004494607448578, + "eval_train_reconstruction/second_seq": 0.1812901794910431, + "eval_train_runtime": 603.6412, + 
"eval_train_samples_per_second": 0.318, + "eval_train_steps_per_second": 0.318, + "step": 6100 + }, + { + "epoch": 0.022791193870623606, + "grad_norm": 0.3332523703575134, + "learning_rate": 0.0006, + "loss": 2.2746, + "step": 6110 + }, + { + "epoch": 0.02282849533358698, + "grad_norm": 0.44973069429397583, + "learning_rate": 0.0006, + "loss": 2.2704, + "step": 6120 + }, + { + "epoch": 0.02286579679655036, + "grad_norm": 0.4384797513484955, + "learning_rate": 0.0006, + "loss": 2.2101, + "step": 6130 + }, + { + "epoch": 0.022903098259513737, + "grad_norm": 0.4375651478767395, + "learning_rate": 0.0006, + "loss": 2.2516, + "step": 6140 + }, + { + "epoch": 0.022940399722477117, + "grad_norm": 0.5450620055198669, + "learning_rate": 0.0006, + "loss": 2.2696, + "step": 6150 + }, + { + "epoch": 0.022940399722477117, + "eval_valid_loss": 2.2330782413482666, + "eval_valid_loss/all": 2.091163396835327, + "eval_valid_loss/end_span": 1.2480982542037964, + "eval_valid_perplexity/batch": 8.094326972961426, + "eval_valid_perplexity/end_span": 3.4837114810943604, + "eval_valid_perplexity/fim": 2.159867286682129, + "eval_valid_perplexity/first_seq": 15.037975311279297, + "eval_valid_perplexity/last_seq": 9.424782752990723, + "eval_valid_perplexity/second_seq": 14.215546607971191, + "eval_valid_perplexity/seq": 9.113609313964844, + "eval_valid_reconstruction/all": 0.28288203477859497, + "eval_valid_reconstruction/end_span": 0.7085219025611877, + "eval_valid_reconstruction/fim": 0.146192267537117, + "eval_valid_reconstruction/first_seq": 0.16407668590545654, + "eval_valid_reconstruction/last_seq": 0.31285926699638367, + "eval_valid_reconstruction/second_seq": 0.18435214459896088, + "eval_valid_runtime": 602.4949, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 6150 + }, + { + "epoch": 0.022940399722477117, + "eval_train_loss": 2.2296836376190186, + "eval_train_loss/all": 2.060642957687378, + "eval_train_loss/end_span": 1.2146376371383667, + 
"eval_train_perplexity/batch": 7.851016044616699, + "eval_train_perplexity/end_span": 3.369072914123535, + "eval_train_perplexity/fim": 2.240095853805542, + "eval_train_perplexity/first_seq": 15.327971458435059, + "eval_train_perplexity/last_seq": 9.332841873168945, + "eval_train_perplexity/second_seq": 14.159527778625488, + "eval_train_perplexity/seq": 9.029488563537598, + "eval_train_reconstruction/all": 0.27323803305625916, + "eval_train_reconstruction/end_span": 0.7215155959129333, + "eval_train_reconstruction/fim": 0.15265235304832458, + "eval_train_reconstruction/first_seq": 0.15747909247875214, + "eval_train_reconstruction/last_seq": 0.3142969310283661, + "eval_train_reconstruction/second_seq": 0.18711917102336884, + "eval_train_runtime": 602.6753, + "eval_train_samples_per_second": 0.319, + "eval_train_steps_per_second": 0.319, + "step": 6150 + }, + { + "epoch": 0.022977701185440493, + "grad_norm": 0.5640407204627991, + "learning_rate": 0.0006, + "loss": 2.2521, + "step": 6160 + }, + { + "epoch": 0.023015002648403872, + "grad_norm": 0.40167298913002014, + "learning_rate": 0.0006, + "loss": 2.2965, + "step": 6170 + }, + { + "epoch": 0.023052304111367248, + "grad_norm": 0.5831903219223022, + "learning_rate": 0.0006, + "loss": 2.407, + "step": 6180 + }, + { + "epoch": 0.023089605574330624, + "grad_norm": 0.37961438298225403, + "learning_rate": 0.0006, + "loss": 2.3128, + "step": 6190 + }, + { + "epoch": 0.023126907037294003, + "grad_norm": 0.31330952048301697, + "learning_rate": 0.0006, + "loss": 2.3207, + "step": 6200 + }, + { + "epoch": 0.023126907037294003, + "eval_valid_loss": 2.2221081256866455, + "eval_valid_loss/all": 2.0816338062286377, + "eval_valid_loss/end_span": 1.3317588567733765, + "eval_valid_perplexity/batch": 8.017557144165039, + "eval_valid_perplexity/end_span": 3.7876994609832764, + "eval_valid_perplexity/fim": 2.1832892894744873, + "eval_valid_perplexity/first_seq": 14.705568313598633, + "eval_valid_perplexity/last_seq": 9.36191177368164, + 
"eval_valid_perplexity/second_seq": 13.8805570602417, + "eval_valid_perplexity/seq": 9.023948669433594, + "eval_valid_reconstruction/all": 0.28610673546791077, + "eval_valid_reconstruction/end_span": 0.6877150535583496, + "eval_valid_reconstruction/fim": 0.14933575689792633, + "eval_valid_reconstruction/first_seq": 0.17104196548461914, + "eval_valid_reconstruction/last_seq": 0.31734517216682434, + "eval_valid_reconstruction/second_seq": 0.1927083283662796, + "eval_valid_runtime": 599.8068, + "eval_valid_samples_per_second": 0.32, + "eval_valid_steps_per_second": 0.32, + "step": 6200 + }, + { + "epoch": 0.023126907037294003, + "eval_train_loss": 2.21795916557312, + "eval_train_loss/all": 2.050316572189331, + "eval_train_loss/end_span": 1.2966856956481934, + "eval_train_perplexity/batch": 7.770360469818115, + "eval_train_perplexity/end_span": 3.6571555137634277, + "eval_train_perplexity/fim": 2.241058588027954, + "eval_train_perplexity/first_seq": 15.314616203308105, + "eval_train_perplexity/last_seq": 9.012935638427734, + "eval_train_perplexity/second_seq": 14.193510055541992, + "eval_train_perplexity/seq": 8.937355995178223, + "eval_train_reconstruction/all": 0.27664873003959656, + "eval_train_reconstruction/end_span": 0.6987565159797668, + "eval_train_reconstruction/fim": 0.15638460218906403, + "eval_train_reconstruction/first_seq": 0.15727639198303223, + "eval_train_reconstruction/last_seq": 0.3275987505912781, + "eval_train_reconstruction/second_seq": 0.18553946912288666, + "eval_train_runtime": 606.8263, + "eval_train_samples_per_second": 0.316, + "eval_train_steps_per_second": 0.316, + "step": 6200 + }, + { + "epoch": 0.02316420850025738, + "grad_norm": 0.43879613280296326, + "learning_rate": 0.0006, + "loss": 2.248, + "step": 6210 + }, + { + "epoch": 0.02320150996322076, + "grad_norm": 0.3091483414173126, + "learning_rate": 0.0006, + "loss": 2.3148, + "step": 6220 + }, + { + "epoch": 0.023238811426184135, + "grad_norm": 0.327532023191452, + "learning_rate": 
0.0006, + "loss": 2.2646, + "step": 6230 + }, + { + "epoch": 0.02327611288914751, + "grad_norm": 0.4704710841178894, + "learning_rate": 0.0006, + "loss": 2.21, + "step": 6240 + }, + { + "epoch": 0.02331341435211089, + "grad_norm": 0.36659350991249084, + "learning_rate": 0.0006, + "loss": 2.263, + "step": 6250 + }, + { + "epoch": 0.02331341435211089, + "eval_valid_loss": 2.2232449054718018, + "eval_valid_loss/all": 2.08259654045105, + "eval_valid_loss/end_span": 1.416275978088379, + "eval_valid_perplexity/batch": 8.025279998779297, + "eval_valid_perplexity/end_span": 4.121742248535156, + "eval_valid_perplexity/fim": 2.515721321105957, + "eval_valid_perplexity/first_seq": 14.873720169067383, + "eval_valid_perplexity/last_seq": 9.615693092346191, + "eval_valid_perplexity/second_seq": 14.283336639404297, + "eval_valid_perplexity/seq": 9.033584594726562, + "eval_valid_reconstruction/all": 0.2860322892665863, + "eval_valid_reconstruction/end_span": 0.6692600846290588, + "eval_valid_reconstruction/fim": 0.17819172143936157, + "eval_valid_reconstruction/first_seq": 0.17285819351673126, + "eval_valid_reconstruction/last_seq": 0.3052835464477539, + "eval_valid_reconstruction/second_seq": 0.18229283392429352, + "eval_valid_runtime": 593.1921, + "eval_valid_samples_per_second": 0.324, + "eval_valid_steps_per_second": 0.324, + "step": 6250 + }, + { + "epoch": 0.02331341435211089, + "eval_train_loss": 2.22149658203125, + "eval_train_loss/all": 2.053450107574463, + "eval_train_loss/end_span": 1.3865370750427246, + "eval_train_perplexity/batch": 7.794747352600098, + "eval_train_perplexity/end_span": 4.000970840454102, + "eval_train_perplexity/fim": 2.0246424674987793, + "eval_train_perplexity/first_seq": 15.29924488067627, + "eval_train_perplexity/last_seq": 9.598743438720703, + "eval_train_perplexity/second_seq": 14.078559875488281, + "eval_train_perplexity/seq": 8.96500015258789, + "eval_train_reconstruction/all": 0.27574220299720764, + "eval_train_reconstruction/end_span": 
0.679185152053833, + "eval_train_reconstruction/fim": 0.13614942133426666, + "eval_train_reconstruction/first_seq": 0.15656499564647675, + "eval_train_reconstruction/last_seq": 0.3017086684703827, + "eval_train_reconstruction/second_seq": 0.18623170256614685, + "eval_train_runtime": 608.9498, + "eval_train_samples_per_second": 0.315, + "eval_train_steps_per_second": 0.315, + "step": 6250 + }, + { + "epoch": 0.023350715815074266, + "grad_norm": 0.31579044461250305, + "learning_rate": 0.0006, + "loss": 2.2266, + "step": 6260 + }, + { + "epoch": 0.023388017278037646, + "grad_norm": 0.8024464249610901, + "learning_rate": 0.0006, + "loss": 2.3743, + "step": 6270 + }, + { + "epoch": 0.02342531874100102, + "grad_norm": 0.3776596784591675, + "learning_rate": 0.0006, + "loss": 2.1624, + "step": 6280 + }, + { + "epoch": 0.0234626202039644, + "grad_norm": 0.6123577356338501, + "learning_rate": 0.0006, + "loss": 2.2548, + "step": 6290 + }, + { + "epoch": 0.023499921666927777, + "grad_norm": 0.3050857484340668, + "learning_rate": 0.0006, + "loss": 2.3469, + "step": 6300 + }, + { + "epoch": 0.023499921666927777, + "eval_valid_loss": 2.2263195514678955, + "eval_valid_loss/all": 2.0855228900909424, + "eval_valid_loss/end_span": 1.2978293895721436, + "eval_valid_perplexity/batch": 8.048798561096191, + "eval_valid_perplexity/end_span": 3.6613407135009766, + "eval_valid_perplexity/fim": 2.312067747116089, + "eval_valid_perplexity/first_seq": 15.181781768798828, + "eval_valid_perplexity/last_seq": 9.63138484954834, + "eval_valid_perplexity/second_seq": 13.627039909362793, + "eval_valid_perplexity/seq": 9.055657386779785, + "eval_valid_reconstruction/all": 0.28554779291152954, + "eval_valid_reconstruction/end_span": 0.693481981754303, + "eval_valid_reconstruction/fim": 0.16132214665412903, + "eval_valid_reconstruction/first_seq": 0.1615125685930252, + "eval_valid_reconstruction/last_seq": 0.30730730295181274, + "eval_valid_reconstruction/second_seq": 0.19875293970108032, + 
"eval_valid_runtime": 609.2188, + "eval_valid_samples_per_second": 0.315, + "eval_valid_steps_per_second": 0.315, + "step": 6300 + }, + { + "epoch": 0.023499921666927777, + "eval_train_loss": 2.223898410797119, + "eval_train_loss/all": 2.0548830032348633, + "eval_train_loss/end_span": 1.260101079940796, + "eval_train_perplexity/batch": 7.805924415588379, + "eval_train_perplexity/end_span": 3.525777816772461, + "eval_train_perplexity/fim": 2.0754997730255127, + "eval_train_perplexity/first_seq": 15.610040664672852, + "eval_train_perplexity/last_seq": 8.981379508972168, + "eval_train_perplexity/second_seq": 14.190528869628906, + "eval_train_perplexity/seq": 8.970931053161621, + "eval_train_reconstruction/all": 0.2756856381893158, + "eval_train_reconstruction/end_span": 0.7062078714370728, + "eval_train_reconstruction/fim": 0.14045560359954834, + "eval_train_reconstruction/first_seq": 0.1533806473016739, + "eval_train_reconstruction/last_seq": 0.3273247182369232, + "eval_train_reconstruction/second_seq": 0.18543966114521027, + "eval_train_runtime": 608.5491, + "eval_train_samples_per_second": 0.316, + "eval_train_steps_per_second": 0.316, + "step": 6300 + }, + { + "epoch": 0.023537223129891153, + "grad_norm": 0.3667004704475403, + "learning_rate": 0.0006, + "loss": 2.3224, + "step": 6310 + }, + { + "epoch": 0.023574524592854532, + "grad_norm": 0.6337170600891113, + "learning_rate": 0.0006, + "loss": 2.1619, + "step": 6320 + }, + { + "epoch": 0.02361182605581791, + "grad_norm": 0.38882386684417725, + "learning_rate": 0.0006, + "loss": 2.2235, + "step": 6330 + }, + { + "epoch": 0.023649127518781288, + "grad_norm": 0.4202781617641449, + "learning_rate": 0.0006, + "loss": 2.162, + "step": 6340 + }, + { + "epoch": 0.023686428981744664, + "grad_norm": 1.040164589881897, + "learning_rate": 0.0006, + "loss": 2.1866, + "step": 6350 + }, + { + "epoch": 0.023686428981744664, + "eval_valid_loss": 2.2251148223876953, + "eval_valid_loss/all": 2.0843493938446045, + 
"eval_valid_loss/end_span": 1.2832343578338623, + "eval_valid_perplexity/batch": 8.039359092712402, + "eval_valid_perplexity/end_span": 3.6082913875579834, + "eval_valid_perplexity/fim": 2.691561222076416, + "eval_valid_perplexity/first_seq": 14.702970504760742, + "eval_valid_perplexity/last_seq": 9.291967391967773, + "eval_valid_perplexity/second_seq": 13.58441162109375, + "eval_valid_perplexity/seq": 9.047445297241211, + "eval_valid_reconstruction/all": 0.2853447198867798, + "eval_valid_reconstruction/end_span": 0.7062013149261475, + "eval_valid_reconstruction/fim": 0.18894731998443604, + "eval_valid_reconstruction/first_seq": 0.17415715754032135, + "eval_valid_reconstruction/last_seq": 0.31867530941963196, + "eval_valid_reconstruction/second_seq": 0.1995791643857956, + "eval_valid_runtime": 602.0828, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 6350 + }, + { + "epoch": 0.023686428981744664, + "eval_train_loss": 2.222472906112671, + "eval_train_loss/all": 2.0538830757141113, + "eval_train_loss/end_span": 1.2641737461090088, + "eval_train_perplexity/batch": 7.798122882843018, + "eval_train_perplexity/end_span": 3.5401663780212402, + "eval_train_perplexity/fim": 2.3112940788269043, + "eval_train_perplexity/first_seq": 15.300558090209961, + "eval_train_perplexity/last_seq": 9.052369117736816, + "eval_train_perplexity/second_seq": 14.182229042053223, + "eval_train_perplexity/seq": 8.9636812210083, + "eval_train_reconstruction/all": 0.2755407392978668, + "eval_train_reconstruction/end_span": 0.7141571044921875, + "eval_train_reconstruction/fim": 0.16132202744483948, + "eval_train_reconstruction/first_seq": 0.15370289981365204, + "eval_train_reconstruction/last_seq": 0.32158854603767395, + "eval_train_reconstruction/second_seq": 0.1796800047159195, + "eval_train_runtime": 613.1566, + "eval_train_samples_per_second": 0.313, + "eval_train_steps_per_second": 0.313, + "step": 6350 + }, + { + "epoch": 0.023723730444708043, + 
"grad_norm": 0.3900385797023773, + "learning_rate": 0.0006, + "loss": 2.2413, + "step": 6360 + }, + { + "epoch": 0.02376103190767142, + "grad_norm": 0.302117258310318, + "learning_rate": 0.0006, + "loss": 2.1816, + "step": 6370 + }, + { + "epoch": 0.023798333370634795, + "grad_norm": 0.38771000504493713, + "learning_rate": 0.0006, + "loss": 2.3792, + "step": 6380 + }, + { + "epoch": 0.023835634833598175, + "grad_norm": 0.47032812237739563, + "learning_rate": 0.0006, + "loss": 2.3512, + "step": 6390 + }, + { + "epoch": 0.02387293629656155, + "grad_norm": 0.3102048635482788, + "learning_rate": 0.0006, + "loss": 2.3513, + "step": 6400 + }, + { + "epoch": 0.02387293629656155, + "eval_valid_loss": 2.2225093841552734, + "eval_valid_loss/all": 2.081709861755371, + "eval_valid_loss/end_span": 1.2543482780456543, + "eval_valid_perplexity/batch": 8.018167495727539, + "eval_valid_perplexity/end_span": 3.5055530071258545, + "eval_valid_perplexity/fim": 2.465724468231201, + "eval_valid_perplexity/first_seq": 14.980337142944336, + "eval_valid_perplexity/last_seq": 9.486498832702637, + "eval_valid_perplexity/second_seq": 14.088845252990723, + "eval_valid_perplexity/seq": 9.023531913757324, + "eval_valid_reconstruction/all": 0.28621864318847656, + "eval_valid_reconstruction/end_span": 0.7070499658584595, + "eval_valid_reconstruction/fim": 0.17387829720973969, + "eval_valid_reconstruction/first_seq": 0.16517110168933868, + "eval_valid_reconstruction/last_seq": 0.3083125948905945, + "eval_valid_reconstruction/second_seq": 0.1877843141555786, + "eval_valid_runtime": 595.0816, + "eval_valid_samples_per_second": 0.323, + "eval_valid_steps_per_second": 0.323, + "step": 6400 + }, + { + "epoch": 0.02387293629656155, + "eval_train_loss": 2.2209746837615967, + "eval_train_loss/all": 2.0525646209716797, + "eval_train_loss/end_span": 1.2263178825378418, + "eval_train_perplexity/batch": 7.787848472595215, + "eval_train_perplexity/end_span": 3.4086554050445557, + "eval_train_perplexity/fim": 
2.5668792724609375, + "eval_train_perplexity/first_seq": 15.55329418182373, + "eval_train_perplexity/last_seq": 9.289532661437988, + "eval_train_perplexity/second_seq": 14.545939445495605, + "eval_train_perplexity/seq": 8.954216003417969, + "eval_train_reconstruction/all": 0.27589985728263855, + "eval_train_reconstruction/end_span": 0.7160133719444275, + "eval_train_reconstruction/fim": 0.1820361614227295, + "eval_train_reconstruction/first_seq": 0.1519412100315094, + "eval_train_reconstruction/last_seq": 0.31549519300460815, + "eval_train_reconstruction/second_seq": 0.1720115840435028, + "eval_train_runtime": 604.3457, + "eval_train_samples_per_second": 0.318, + "eval_train_steps_per_second": 0.318, + "step": 6400 + }, + { + "epoch": 0.02391023775952493, + "grad_norm": 0.5097938179969788, + "learning_rate": 0.0006, + "loss": 2.154, + "step": 6410 + }, + { + "epoch": 0.023947539222488306, + "grad_norm": 1.3765785694122314, + "learning_rate": 0.0006, + "loss": 2.3185, + "step": 6420 + }, + { + "epoch": 0.023984840685451682, + "grad_norm": 0.3142150342464447, + "learning_rate": 0.0006, + "loss": 2.2176, + "step": 6430 + }, + { + "epoch": 0.02402214214841506, + "grad_norm": 0.39577776193618774, + "learning_rate": 0.0006, + "loss": 2.4162, + "step": 6440 + }, + { + "epoch": 0.024059443611378437, + "grad_norm": 0.3069345951080322, + "learning_rate": 0.0006, + "loss": 2.03, + "step": 6450 + }, + { + "epoch": 0.024059443611378437, + "eval_valid_loss": 2.2270448207855225, + "eval_valid_loss/all": 2.086339235305786, + "eval_valid_loss/end_span": 1.1840721368789673, + "eval_valid_perplexity/batch": 8.05537223815918, + "eval_valid_perplexity/end_span": 3.267653465270996, + "eval_valid_perplexity/fim": 2.7153689861297607, + "eval_valid_perplexity/first_seq": 14.80854320526123, + "eval_valid_perplexity/last_seq": 9.691040992736816, + "eval_valid_perplexity/second_seq": 13.952367782592773, + "eval_valid_perplexity/seq": 9.073946952819824, + "eval_valid_reconstruction/all": 
0.284978449344635, + "eval_valid_reconstruction/end_span": 0.7219473719596863, + "eval_valid_reconstruction/fim": 0.1918305903673172, + "eval_valid_reconstruction/first_seq": 0.17140105366706848, + "eval_valid_reconstruction/last_seq": 0.3031459450721741, + "eval_valid_reconstruction/second_seq": 0.18819159269332886, + "eval_valid_runtime": 603.2691, + "eval_valid_samples_per_second": 0.318, + "eval_valid_steps_per_second": 0.318, + "step": 6450 + }, + { + "epoch": 0.024059443611378437, + "eval_train_loss": 2.2219855785369873, + "eval_train_loss/all": 2.0540177822113037, + "eval_train_loss/end_span": 1.1576368808746338, + "eval_train_perplexity/batch": 7.799173831939697, + "eval_train_perplexity/end_span": 3.182404041290283, + "eval_train_perplexity/fim": 2.1491689682006836, + "eval_train_perplexity/first_seq": 15.636073112487793, + "eval_train_perplexity/last_seq": 8.866277694702148, + "eval_train_perplexity/second_seq": 14.060526847839355, + "eval_train_perplexity/seq": 8.970417022705078, + "eval_train_reconstruction/all": 0.27574601769447327, + "eval_train_reconstruction/end_span": 0.730849027633667, + "eval_train_reconstruction/fim": 0.14772289991378784, + "eval_train_reconstruction/first_seq": 0.15078076720237732, + "eval_train_reconstruction/last_seq": 0.32950103282928467, + "eval_train_reconstruction/second_seq": 0.18694286048412323, + "eval_train_runtime": 612.1745, + "eval_train_samples_per_second": 0.314, + "eval_train_steps_per_second": 0.314, + "step": 6450 + }, + { + "epoch": 0.024096745074341817, + "grad_norm": 0.4404148757457733, + "learning_rate": 0.0006, + "loss": 2.2928, + "step": 6460 + }, + { + "epoch": 0.024134046537305193, + "grad_norm": 0.3626629412174225, + "learning_rate": 0.0006, + "loss": 2.1037, + "step": 6470 + }, + { + "epoch": 0.024171348000268572, + "grad_norm": 0.4184604287147522, + "learning_rate": 0.0006, + "loss": 2.3115, + "step": 6480 + }, + { + "epoch": 0.024208649463231948, + "grad_norm": 1.6755082607269287, + 
"learning_rate": 0.0006, + "loss": 2.3086, + "step": 6490 + }, + { + "epoch": 0.024245950926195324, + "grad_norm": 0.42397749423980713, + "learning_rate": 0.0006, + "loss": 2.218, + "step": 6500 + }, + { + "epoch": 0.024245950926195324, + "eval_valid_loss": 2.224391460418701, + "eval_valid_loss/all": 2.083630323410034, + "eval_valid_loss/end_span": 1.1959770917892456, + "eval_valid_perplexity/batch": 8.033580780029297, + "eval_valid_perplexity/end_span": 3.3067872524261475, + "eval_valid_perplexity/fim": 2.52307391166687, + "eval_valid_perplexity/first_seq": 14.966246604919434, + "eval_valid_perplexity/last_seq": 9.23080825805664, + "eval_valid_perplexity/second_seq": 13.563981056213379, + "eval_valid_perplexity/seq": 9.046969413757324, + "eval_valid_reconstruction/all": 0.285874605178833, + "eval_valid_reconstruction/end_span": 0.7141649723052979, + "eval_valid_reconstruction/fim": 0.17783962190151215, + "eval_valid_reconstruction/first_seq": 0.16550248861312866, + "eval_valid_reconstruction/last_seq": 0.32271620631217957, + "eval_valid_reconstruction/second_seq": 0.20147451758384705, + "eval_valid_runtime": 597.6865, + "eval_valid_samples_per_second": 0.321, + "eval_valid_steps_per_second": 0.321, + "step": 6500 + }, + { + "epoch": 0.024245950926195324, + "eval_train_loss": 2.22031831741333, + "eval_train_loss/all": 2.0524890422821045, + "eval_train_loss/end_span": 1.169360876083374, + "eval_train_perplexity/batch": 7.787260055541992, + "eval_train_perplexity/end_span": 3.2199339866638184, + "eval_train_perplexity/fim": 2.1037938594818115, + "eval_train_perplexity/first_seq": 15.417267799377441, + "eval_train_perplexity/last_seq": 9.444279670715332, + "eval_train_perplexity/second_seq": 14.189159393310547, + "eval_train_perplexity/seq": 8.957655906677246, + "eval_train_reconstruction/all": 0.2762277126312256, + "eval_train_reconstruction/end_span": 0.7250908017158508, + "eval_train_reconstruction/fim": 0.14395777881145477, + "eval_train_reconstruction/first_seq": 
0.15256018936634064, + "eval_train_reconstruction/last_seq": 0.310092031955719, + "eval_train_reconstruction/second_seq": 0.18595421314239502, + "eval_train_runtime": 605.4197, + "eval_train_samples_per_second": 0.317, + "eval_train_steps_per_second": 0.317, + "step": 6500 + }, + { + "epoch": 0.024283252389158703, + "grad_norm": 0.4086765646934509, + "learning_rate": 0.0006, + "loss": 2.1618, + "step": 6510 + }, + { + "epoch": 0.02432055385212208, + "grad_norm": 0.6164826154708862, + "learning_rate": 0.0006, + "loss": 2.1915, + "step": 6520 + }, + { + "epoch": 0.02435785531508546, + "grad_norm": 0.4964602589607239, + "learning_rate": 0.0006, + "loss": 2.258, + "step": 6530 + }, + { + "epoch": 0.024395156778048835, + "grad_norm": 0.4670292139053345, + "learning_rate": 0.0006, + "loss": 2.2217, + "step": 6540 + }, + { + "epoch": 0.02443245824101221, + "grad_norm": 1.032976508140564, + "learning_rate": 0.0006, + "loss": 2.3757, + "step": 6550 + }, + { + "epoch": 0.02443245824101221, + "eval_valid_loss": 2.250242233276367, + "eval_valid_loss/all": 2.106294870376587, + "eval_valid_loss/end_span": 1.2639355659484863, + "eval_valid_perplexity/batch": 8.217737197875977, + "eval_valid_perplexity/end_span": 3.539323329925537, + "eval_valid_perplexity/fim": 2.279515027999878, + "eval_valid_perplexity/first_seq": 14.798163414001465, + "eval_valid_perplexity/last_seq": 9.809117317199707, + "eval_valid_perplexity/second_seq": 13.507341384887695, + "eval_valid_perplexity/seq": 9.23919677734375, + "eval_valid_reconstruction/all": 0.2791549861431122, + "eval_valid_reconstruction/end_span": 0.7112416625022888, + "eval_valid_reconstruction/fim": 0.15384064614772797, + "eval_valid_reconstruction/first_seq": 0.16938410699367523, + "eval_valid_reconstruction/last_seq": 0.29915305972099304, + "eval_valid_reconstruction/second_seq": 0.19955503940582275, + "eval_valid_runtime": 599.045, + "eval_valid_samples_per_second": 0.321, + "eval_valid_steps_per_second": 0.321, + "step": 6550 + }, + 
{ + "epoch": 0.02443245824101221, + "eval_train_loss": 2.2496674060821533, + "eval_train_loss/all": 2.0779504776000977, + "eval_train_loss/end_span": 1.2433310747146606, + "eval_train_perplexity/batch": 7.9880805015563965, + "eval_train_perplexity/end_span": 3.4671435356140137, + "eval_train_perplexity/fim": 2.0173821449279785, + "eval_train_perplexity/first_seq": 15.296819686889648, + "eval_train_perplexity/last_seq": 9.832136154174805, + "eval_train_perplexity/second_seq": 14.54130744934082, + "eval_train_perplexity/seq": 9.181118965148926, + "eval_train_reconstruction/all": 0.2686065137386322, + "eval_train_reconstruction/end_span": 0.7199960350990295, + "eval_train_reconstruction/fim": 0.13075606524944305, + "eval_train_reconstruction/first_seq": 0.15333057940006256, + "eval_train_reconstruction/last_seq": 0.2974081039428711, + "eval_train_reconstruction/second_seq": 0.17547091841697693, + "eval_train_runtime": 603.9932, + "eval_train_samples_per_second": 0.318, + "eval_train_steps_per_second": 0.318, + "step": 6550 + }, + { + "epoch": 0.02446975970397559, + "grad_norm": 0.8046674132347107, + "learning_rate": 0.0006, + "loss": 2.1627, + "step": 6560 + }, + { + "epoch": 0.024507061166938966, + "grad_norm": 3.4960100650787354, + "learning_rate": 0.0006, + "loss": 2.359, + "step": 6570 + }, + { + "epoch": 0.024544362629902346, + "grad_norm": 0.4094212055206299, + "learning_rate": 0.0006, + "loss": 2.3802, + "step": 6580 + }, + { + "epoch": 0.02458166409286572, + "grad_norm": 0.38265326619148254, + "learning_rate": 0.0006, + "loss": 2.3514, + "step": 6590 + }, + { + "epoch": 0.0246189655558291, + "grad_norm": 0.3776273727416992, + "learning_rate": 0.0006, + "loss": 2.3393, + "step": 6600 + }, + { + "epoch": 0.0246189655558291, + "eval_valid_loss": 2.2391600608825684, + "eval_valid_loss/all": 2.096794605255127, + "eval_valid_loss/end_span": 1.3085190057754517, + "eval_valid_perplexity/batch": 8.140035629272461, + "eval_valid_perplexity/end_span": 3.7006888389587402, 
+ "eval_valid_perplexity/fim": 2.2379398345947266, + "eval_valid_perplexity/first_seq": 15.052058219909668, + "eval_valid_perplexity/last_seq": 9.175411224365234, + "eval_valid_perplexity/second_seq": 13.833112716674805, + "eval_valid_perplexity/seq": 9.158960342407227, + "eval_valid_reconstruction/all": 0.2813115417957306, + "eval_valid_reconstruction/end_span": 0.6952963471412659, + "eval_valid_reconstruction/fim": 0.1527610421180725, + "eval_valid_reconstruction/first_seq": 0.16186220943927765, + "eval_valid_reconstruction/last_seq": 0.32280558347702026, + "eval_valid_reconstruction/second_seq": 0.1948552429676056, + "eval_valid_runtime": 602.7162, + "eval_valid_samples_per_second": 0.319, + "eval_valid_steps_per_second": 0.319, + "step": 6600 + }, + { + "epoch": 0.0246189655558291, + "eval_train_loss": 2.2348363399505615, + "eval_train_loss/all": 2.06514310836792, + "eval_train_loss/end_span": 1.2811574935913086, + "eval_train_perplexity/batch": 7.8864264488220215, + "eval_train_perplexity/end_span": 3.6008052825927734, + "eval_train_perplexity/fim": 2.1441850662231445, + "eval_train_perplexity/first_seq": 15.370392799377441, + "eval_train_perplexity/last_seq": 9.151573181152344, + "eval_train_perplexity/second_seq": 14.182076454162598, + "eval_train_perplexity/seq": 9.069623947143555, + "eval_train_reconstruction/all": 0.2719305157661438, + "eval_train_reconstruction/end_span": 0.703273355960846, + "eval_train_reconstruction/fim": 0.14497262239456177, + "eval_train_reconstruction/first_seq": 0.15372353792190552, + "eval_train_reconstruction/last_seq": 0.321162611246109, + "eval_train_reconstruction/second_seq": 0.1835900992155075, + "eval_train_runtime": 607.3348, + "eval_train_samples_per_second": 0.316, + "eval_train_steps_per_second": 0.316, + "step": 6600 + }, + { + "epoch": 0.024656267018792477, + "grad_norm": 0.41233178973197937, + "learning_rate": 0.0006, + "loss": 2.3419, + "step": 6610 + }, + { + "epoch": 0.024693568481755853, + "grad_norm": 
0.33389797806739807, + "learning_rate": 0.0006, + "loss": 2.1805, + "step": 6620 + }, + { + "epoch": 0.024730869944719232, + "grad_norm": 0.4743545651435852, + "learning_rate": 0.0006, + "loss": 2.3027, + "step": 6630 + }, + { + "epoch": 0.02476817140768261, + "grad_norm": 0.5216214060783386, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 6640 + }, + { + "epoch": 0.024805472870645988, + "grad_norm": 0.5047735571861267, + "learning_rate": 0.0006, + "loss": 2.2026, + "step": 6650 + }, + { + "epoch": 0.024805472870645988, + "eval_valid_loss": 2.2273359298706055, + "eval_valid_loss/all": 2.0864720344543457, + "eval_valid_loss/end_span": 1.1863270998001099, + "eval_valid_perplexity/batch": 8.056442260742188, + "eval_valid_perplexity/end_span": 3.2750301361083984, + "eval_valid_perplexity/fim": 2.6551663875579834, + "eval_valid_perplexity/first_seq": 14.916690826416016, + "eval_valid_perplexity/last_seq": 9.192713737487793, + "eval_valid_perplexity/second_seq": 13.628592491149902, + "eval_valid_perplexity/seq": 9.070818901062012, + "eval_valid_reconstruction/all": 0.2844131886959076, + "eval_valid_reconstruction/end_span": 0.7212960720062256, + "eval_valid_reconstruction/fim": 0.18643487989902496, + "eval_valid_reconstruction/first_seq": 0.16689127683639526, + "eval_valid_reconstruction/last_seq": 0.3211696445941925, + "eval_valid_reconstruction/second_seq": 0.1978505253791809, + "eval_valid_runtime": 604.5099, + "eval_valid_samples_per_second": 0.318, + "eval_valid_steps_per_second": 0.318, + "step": 6650 + }, + { + "epoch": 0.024805472870645988, + "eval_train_loss": 2.2224464416503906, + "eval_train_loss/all": 2.054297924041748, + "eval_train_loss/end_span": 1.1688193082809448, + "eval_train_perplexity/batch": 7.801358699798584, + "eval_train_perplexity/end_span": 3.2181906700134277, + "eval_train_perplexity/fim": 2.0337398052215576, + "eval_train_perplexity/first_seq": 15.498930931091309, + "eval_train_perplexity/last_seq": 9.682703971862793, + 
"eval_train_perplexity/second_seq": 13.878684043884277, + "eval_train_perplexity/seq": 8.971156120300293, + "eval_train_reconstruction/all": 0.2751652002334595, + "eval_train_reconstruction/end_span": 0.7284727692604065, + "eval_train_reconstruction/fim": 0.13628144562244415, + "eval_train_reconstruction/first_seq": 0.1515382081270218, + "eval_train_reconstruction/last_seq": 0.30167168378829956, + "eval_train_reconstruction/second_seq": 0.18993069231510162, + "eval_train_runtime": 604.1323, + "eval_train_samples_per_second": 0.318, + "eval_train_steps_per_second": 0.318, + "step": 6650 + }, + { + "epoch": 0.024842774333609364, + "grad_norm": 0.40449604392051697, + "learning_rate": 0.0006, + "loss": 2.0786, + "step": 6660 + }, + { + "epoch": 0.024880075796572743, + "grad_norm": 0.7739540934562683, + "learning_rate": 0.0006, + "loss": 2.1145, + "step": 6670 + }, + { + "epoch": 0.02491737725953612, + "grad_norm": 0.5637632608413696, + "learning_rate": 0.0006, + "loss": 2.3991, + "step": 6680 + }, + { + "epoch": 0.024954678722499495, + "grad_norm": 0.39901483058929443, + "learning_rate": 0.0006, + "loss": 2.2045, + "step": 6690 + }, + { + "epoch": 0.024991980185462875, + "grad_norm": 0.3972116708755493, + "learning_rate": 0.0006, + "loss": 2.2696, + "step": 6700 + }, + { + "epoch": 0.024991980185462875, + "eval_valid_loss": 2.2277963161468506, + "eval_valid_loss/all": 2.086979627609253, + "eval_valid_loss/end_span": 1.2502943277359009, + "eval_valid_perplexity/batch": 8.060532569885254, + "eval_valid_perplexity/end_span": 3.491370439529419, + "eval_valid_perplexity/fim": 2.5443150997161865, + "eval_valid_perplexity/first_seq": 14.910368919372559, + "eval_valid_perplexity/last_seq": 9.706411361694336, + "eval_valid_perplexity/second_seq": 13.964923858642578, + "eval_valid_perplexity/seq": 9.079867362976074, + "eval_valid_reconstruction/all": 0.2846279442310333, + "eval_valid_reconstruction/end_span": 0.7019811272621155, + "eval_valid_reconstruction/fim": 
0.1789139211177826, + "eval_valid_reconstruction/first_seq": 0.16503849625587463, + "eval_valid_reconstruction/last_seq": 0.3056081235408783, + "eval_valid_reconstruction/second_seq": 0.187629833817482, + "eval_valid_runtime": 603.9807, + "eval_valid_samples_per_second": 0.318, + "eval_valid_steps_per_second": 0.318, + "step": 6700 + }, + { + "epoch": 0.024991980185462875, + "eval_train_loss": 2.2247698307037354, + "eval_train_loss/all": 2.056619644165039, + "eval_train_loss/end_span": 1.2290050983428955, + "eval_train_perplexity/batch": 7.819492340087891, + "eval_train_perplexity/end_span": 3.4178273677825928, + "eval_train_perplexity/fim": 2.151702880859375, + "eval_train_perplexity/first_seq": 15.758606910705566, + "eval_train_perplexity/last_seq": 8.769942283630371, + "eval_train_perplexity/second_seq": 13.956512451171875, + "eval_train_perplexity/seq": 9.001006126403809, + "eval_train_reconstruction/all": 0.2748183310031891, + "eval_train_reconstruction/end_span": 0.7101523280143738, + "eval_train_reconstruction/fim": 0.14783205091953278, + "eval_train_reconstruction/first_seq": 0.14541862905025482, + "eval_train_reconstruction/last_seq": 0.33222949504852295, + "eval_train_reconstruction/second_seq": 0.19308850169181824, + "eval_train_runtime": 605.2982, + "eval_train_samples_per_second": 0.317, + "eval_train_steps_per_second": 0.317, + "step": 6700 + }, + { + "epoch": 0.02502928164842625, + "grad_norm": 0.43115144968032837, + "learning_rate": 0.0006, + "loss": 2.2908, + "step": 6710 + }, + { + "epoch": 0.02506658311138963, + "grad_norm": 1.034358024597168, + "learning_rate": 0.0006, + "loss": 2.2352, + "step": 6720 + }, + { + "epoch": 0.025103884574353006, + "grad_norm": 0.4487384557723999, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 6730 + }, + { + "epoch": 0.025141186037316382, + "grad_norm": 0.4793391823768616, + "learning_rate": 0.0006, + "loss": 2.184, + "step": 6740 + }, + { + "epoch": 0.02517848750027976, + "grad_norm": 0.3735288083553314, + 
"learning_rate": 0.0006, + "loss": 2.4198, + "step": 6750 + }, + { + "epoch": 0.02517848750027976, + "eval_valid_loss": 2.2273333072662354, + "eval_valid_loss/all": 2.0863661766052246, + "eval_valid_loss/end_span": 1.2783747911453247, + "eval_valid_perplexity/batch": 8.05558967590332, + "eval_valid_perplexity/end_span": 3.59079909324646, + "eval_valid_perplexity/fim": 2.140719413757324, + "eval_valid_perplexity/first_seq": 14.815892219543457, + "eval_valid_perplexity/last_seq": 9.25161361694336, + "eval_valid_perplexity/second_seq": 13.804409980773926, + "eval_valid_perplexity/seq": 9.078352928161621, + "eval_valid_reconstruction/all": 0.2847959101200104, + "eval_valid_reconstruction/end_span": 0.7071819305419922, + "eval_valid_reconstruction/fim": 0.14568762481212616, + "eval_valid_reconstruction/first_seq": 0.16771894693374634, + "eval_valid_reconstruction/last_seq": 0.3237209618091583, + "eval_valid_reconstruction/second_seq": 0.19309715926647186, + "eval_valid_runtime": 610.2963, + "eval_valid_samples_per_second": 0.315, + "eval_valid_steps_per_second": 0.315, + "step": 6750 + }, + { + "epoch": 0.02517848750027976, + "eval_train_loss": 2.224346160888672, + "eval_train_loss/all": 2.055845260620117, + "eval_train_loss/end_span": 1.2374720573425293, + "eval_train_perplexity/batch": 7.81343936920166, + "eval_train_perplexity/end_span": 3.4468889236450195, + "eval_train_perplexity/fim": 2.148872137069702, + "eval_train_perplexity/first_seq": 15.594625473022461, + "eval_train_perplexity/last_seq": 9.082453727722168, + "eval_train_perplexity/second_seq": 14.375603675842285, + "eval_train_perplexity/seq": 8.991738319396973, + "eval_train_reconstruction/all": 0.27496057748794556, + "eval_train_reconstruction/end_span": 0.7191065549850464, + "eval_train_reconstruction/fim": 0.14722001552581787, + "eval_train_reconstruction/first_seq": 0.15070469677448273, + "eval_train_reconstruction/last_seq": 0.32163557410240173, + "eval_train_reconstruction/second_seq": 
0.1801588237285614, + "eval_train_runtime": 621.5441, + "eval_train_samples_per_second": 0.309, + "eval_train_steps_per_second": 0.309, + "step": 6750 + }, + { + "epoch": 0.025215788963243137, + "grad_norm": 0.3946113884449005, + "learning_rate": 0.0006, + "loss": 2.2811, + "step": 6760 + }, + { + "epoch": 0.025253090426206517, + "grad_norm": 0.4606720507144928, + "learning_rate": 0.0006, + "loss": 2.2387, + "step": 6770 + }, + { + "epoch": 0.025290391889169893, + "grad_norm": 0.3140448033809662, + "learning_rate": 0.0006, + "loss": 2.1431, + "step": 6780 + }, + { + "epoch": 0.025327693352133272, + "grad_norm": 0.5946782231330872, + "learning_rate": 0.0006, + "loss": 2.1313, + "step": 6790 + }, + { + "epoch": 0.025364994815096648, + "grad_norm": 0.2587294280529022, + "learning_rate": 0.0006, + "loss": 2.3684, + "step": 6800 + }, + { + "epoch": 0.025364994815096648, + "eval_valid_loss": 2.2262401580810547, + "eval_valid_loss/all": 2.0855839252471924, + "eval_valid_loss/end_span": 1.2245616912841797, + "eval_valid_perplexity/batch": 8.049290657043457, + "eval_valid_perplexity/end_span": 3.402674436569214, + "eval_valid_perplexity/fim": 2.3504505157470703, + "eval_valid_perplexity/first_seq": 14.67208480834961, + "eval_valid_perplexity/last_seq": 9.455596923828125, + "eval_valid_perplexity/second_seq": 13.49730396270752, + "eval_valid_perplexity/seq": 9.06385326385498, + "eval_valid_reconstruction/all": 0.2849823534488678, + "eval_valid_reconstruction/end_span": 0.7104405760765076, + "eval_valid_reconstruction/fim": 0.16367295384407043, + "eval_valid_reconstruction/first_seq": 0.17551030218601227, + "eval_valid_reconstruction/last_seq": 0.31127220392227173, + "eval_valid_reconstruction/second_seq": 0.20395444333553314, + "eval_valid_runtime": 610.5878, + "eval_valid_samples_per_second": 0.314, + "eval_valid_steps_per_second": 0.314, + "step": 6800 + }, + { + "epoch": 0.025364994815096648, + "eval_train_loss": 2.2215354442596436, + "eval_train_loss/all": 
2.0535247325897217, + "eval_train_loss/end_span": 1.1885032653808594, + "eval_train_perplexity/batch": 7.7953290939331055, + "eval_train_perplexity/end_span": 3.282165050506592, + "eval_train_perplexity/fim": 2.479611873626709, + "eval_train_perplexity/first_seq": 15.404712677001953, + "eval_train_perplexity/last_seq": 8.924766540527344, + "eval_train_perplexity/second_seq": 14.324567794799805, + "eval_train_perplexity/seq": 8.967202186584473, + "eval_train_reconstruction/all": 0.27557751536369324, + "eval_train_reconstruction/end_span": 0.7233011722564697, + "eval_train_reconstruction/fim": 0.17406342923641205, + "eval_train_reconstruction/first_seq": 0.15400689840316772, + "eval_train_reconstruction/last_seq": 0.3265511989593506, + "eval_train_reconstruction/second_seq": 0.17838197946548462, + "eval_train_runtime": 616.1435, + "eval_train_samples_per_second": 0.312, + "eval_train_steps_per_second": 0.312, + "step": 6800 + }, + { + "epoch": 0.025402296278060024, + "grad_norm": 0.4615229070186615, + "learning_rate": 0.0006, + "loss": 2.2086, + "step": 6810 + }, + { + "epoch": 0.025439597741023404, + "grad_norm": 0.5184123516082764, + "learning_rate": 0.0006, + "loss": 2.2189, + "step": 6820 + }, + { + "epoch": 0.02547689920398678, + "grad_norm": 0.29173555970191956, + "learning_rate": 0.0006, + "loss": 2.4262, + "step": 6830 + }, + { + "epoch": 0.02551420066695016, + "grad_norm": 0.401439368724823, + "learning_rate": 0.0006, + "loss": 2.1739, + "step": 6840 + }, + { + "epoch": 0.025551502129913535, + "grad_norm": 0.3060283958911896, + "learning_rate": 0.0006, + "loss": 2.1942, + "step": 6850 + }, + { + "epoch": 0.025551502129913535, + "eval_valid_loss": 2.226011276245117, + "eval_valid_loss/all": 2.085466146469116, + "eval_valid_loss/end_span": 1.4158436059951782, + "eval_valid_perplexity/batch": 8.04834270477295, + "eval_valid_perplexity/end_span": 4.119960784912109, + "eval_valid_perplexity/fim": 2.4479551315307617, + "eval_valid_perplexity/first_seq": 
14.988103866577148, + "eval_valid_perplexity/last_seq": 9.20923900604248, + "eval_valid_perplexity/second_seq": 13.93584156036377, + "eval_valid_perplexity/seq": 9.066746711730957, + "eval_valid_reconstruction/all": 0.2847752571105957, + "eval_valid_reconstruction/end_span": 0.6729004383087158, + "eval_valid_reconstruction/fim": 0.17118090391159058, + "eval_valid_reconstruction/first_seq": 0.16474473476409912, + "eval_valid_reconstruction/last_seq": 0.32052305340766907, + "eval_valid_reconstruction/second_seq": 0.19221192598342896, + "eval_valid_runtime": 614.0657, + "eval_valid_samples_per_second": 0.313, + "eval_valid_steps_per_second": 0.313, + "step": 6850 + }, + { + "epoch": 0.025551502129913535, + "eval_train_loss": 2.2223103046417236, + "eval_train_loss/all": 2.054509162902832, + "eval_train_loss/end_span": 1.3795536756515503, + "eval_train_perplexity/batch": 7.803007125854492, + "eval_train_perplexity/end_span": 3.973127841949463, + "eval_train_perplexity/fim": 2.2688186168670654, + "eval_train_perplexity/first_seq": 15.32369613647461, + "eval_train_perplexity/last_seq": 9.269186019897461, + "eval_train_perplexity/second_seq": 14.500226974487305, + "eval_train_perplexity/seq": 8.980362892150879, + "eval_train_reconstruction/all": 0.27527323365211487, + "eval_train_reconstruction/end_span": 0.6848731637001038, + "eval_train_reconstruction/fim": 0.15741941332817078, + "eval_train_reconstruction/first_seq": 0.1536063700914383, + "eval_train_reconstruction/last_seq": 0.3156627118587494, + "eval_train_reconstruction/second_seq": 0.17811015248298645, + "eval_train_runtime": 624.3314, + "eval_train_samples_per_second": 0.308, + "eval_train_steps_per_second": 0.308, + "step": 6850 + }, + { + "epoch": 0.02558880359287691, + "grad_norm": 0.30855512619018555, + "learning_rate": 0.0006, + "loss": 2.4366, + "step": 6860 + }, + { + "epoch": 0.02562610505584029, + "grad_norm": 0.5783065557479858, + "learning_rate": 0.0006, + "loss": 2.2839, + "step": 6870 + }, + { + 
"epoch": 0.025663406518803666, + "grad_norm": 0.32737088203430176, + "learning_rate": 0.0006, + "loss": 2.3693, + "step": 6880 + }, + { + "epoch": 0.025700707981767046, + "grad_norm": 0.45555487275123596, + "learning_rate": 0.0006, + "loss": 2.3245, + "step": 6890 + }, + { + "epoch": 0.025738009444730422, + "grad_norm": 0.32238754630088806, + "learning_rate": 0.0006, + "loss": 2.166, + "step": 6900 + }, + { + "epoch": 0.025738009444730422, + "eval_valid_loss": 2.228475570678711, + "eval_valid_loss/all": 2.0873849391937256, + "eval_valid_loss/end_span": 1.2457152605056763, + "eval_valid_perplexity/batch": 8.063799858093262, + "eval_valid_perplexity/end_span": 3.475419759750366, + "eval_valid_perplexity/fim": 2.3897554874420166, + "eval_valid_perplexity/first_seq": 14.996350288391113, + "eval_valid_perplexity/last_seq": 9.294014930725098, + "eval_valid_perplexity/second_seq": 13.933801651000977, + "eval_valid_perplexity/seq": 9.081498146057129, + "eval_valid_reconstruction/all": 0.2843603789806366, + "eval_valid_reconstruction/end_span": 0.7057467103004456, + "eval_valid_reconstruction/fim": 0.16604599356651306, + "eval_valid_reconstruction/first_seq": 0.16591662168502808, + "eval_valid_reconstruction/last_seq": 0.31775492429733276, + "eval_valid_reconstruction/second_seq": 0.19285006821155548, + "eval_valid_runtime": 607.9745, + "eval_valid_samples_per_second": 0.316, + "eval_valid_steps_per_second": 0.316, + "step": 6900 + }, + { + "epoch": 0.025738009444730422, + "eval_train_loss": 2.2235727310180664, + "eval_train_loss/all": 2.055382490158081, + "eval_train_loss/end_span": 1.2196130752563477, + "eval_train_perplexity/batch": 7.809824466705322, + "eval_train_perplexity/end_span": 3.3858773708343506, + "eval_train_perplexity/fim": 2.1195342540740967, + "eval_train_perplexity/first_seq": 15.471975326538086, + "eval_train_perplexity/last_seq": 9.547769546508789, + "eval_train_perplexity/second_seq": 14.307723999023438, + "eval_train_perplexity/seq": 8.98719310760498, 
+ "eval_train_reconstruction/all": 0.2749779224395752, + "eval_train_reconstruction/end_span": 0.7162814736366272, + "eval_train_reconstruction/fim": 0.14352557063102722, + "eval_train_reconstruction/first_seq": 0.15294404327869415, + "eval_train_reconstruction/last_seq": 0.3073093891143799, + "eval_train_reconstruction/second_seq": 0.18139097094535828, + "eval_train_runtime": 627.8484, + "eval_train_samples_per_second": 0.306, + "eval_train_steps_per_second": 0.306, + "step": 6900 + }, + { + "epoch": 0.0257753109076938, + "grad_norm": 0.316364049911499, + "learning_rate": 0.0006, + "loss": 2.2397, + "step": 6910 + }, + { + "epoch": 0.025812612370657177, + "grad_norm": 0.3074314594268799, + "learning_rate": 0.0006, + "loss": 2.3218, + "step": 6920 + }, + { + "epoch": 0.025849913833620553, + "grad_norm": 0.26034829020500183, + "learning_rate": 0.0006, + "loss": 2.2907, + "step": 6930 + }, + { + "epoch": 0.025887215296583933, + "grad_norm": 0.3267327845096588, + "learning_rate": 0.0006, + "loss": 2.3518, + "step": 6940 + }, + { + "epoch": 0.02592451675954731, + "grad_norm": 0.5537161827087402, + "learning_rate": 0.0006, + "loss": 2.2567, + "step": 6950 + }, + { + "epoch": 0.02592451675954731, + "eval_valid_loss": 2.2264795303344727, + "eval_valid_loss/all": 2.085676431655884, + "eval_valid_loss/end_span": 1.3543908596038818, + "eval_valid_perplexity/batch": 8.050034523010254, + "eval_valid_perplexity/end_span": 3.8744001388549805, + "eval_valid_perplexity/fim": 2.4315829277038574, + "eval_valid_perplexity/first_seq": 14.707738876342773, + "eval_valid_perplexity/last_seq": 9.38271713256836, + "eval_valid_perplexity/second_seq": 13.92912483215332, + "eval_valid_perplexity/seq": 9.06961441040039, + "eval_valid_reconstruction/all": 0.2849905490875244, + "eval_valid_reconstruction/end_span": 0.6838538646697998, + "eval_valid_reconstruction/fim": 0.17032340168952942, + "eval_valid_reconstruction/first_seq": 0.17297323048114777, + "eval_valid_reconstruction/last_seq": 
0.3127536177635193, + "eval_valid_reconstruction/second_seq": 0.19009406864643097, + "eval_valid_runtime": 623.9863, + "eval_valid_samples_per_second": 0.308, + "eval_valid_steps_per_second": 0.308, + "step": 6950 + }, + { + "epoch": 0.02592451675954731, + "eval_train_loss": 2.2233712673187256, + "eval_train_loss/all": 2.055243968963623, + "eval_train_loss/end_span": 1.3050254583358765, + "eval_train_perplexity/batch": 7.808742523193359, + "eval_train_perplexity/end_span": 3.6877830028533936, + "eval_train_perplexity/fim": 2.1569106578826904, + "eval_train_perplexity/first_seq": 15.337154388427734, + "eval_train_perplexity/last_seq": 9.357843399047852, + "eval_train_perplexity/second_seq": 14.227420806884766, + "eval_train_perplexity/seq": 8.985564231872559, + "eval_train_reconstruction/all": 0.2755472958087921, + "eval_train_reconstruction/end_span": 0.6972677707672119, + "eval_train_reconstruction/fim": 0.1476304978132248, + "eval_train_reconstruction/first_seq": 0.1570245772600174, + "eval_train_reconstruction/last_seq": 0.3127562403678894, + "eval_train_reconstruction/second_seq": 0.18288125097751617, + "eval_train_runtime": 617.2411, + "eval_train_samples_per_second": 0.311, + "eval_train_steps_per_second": 0.311, + "step": 6950 + }, + { + "epoch": 0.025961818222510688, + "grad_norm": 0.37574347853660583, + "learning_rate": 0.0006, + "loss": 2.1227, + "step": 6960 + }, + { + "epoch": 0.025999119685474064, + "grad_norm": 0.33706972002983093, + "learning_rate": 0.0006, + "loss": 2.297, + "step": 6970 + }, + { + "epoch": 0.02603642114843744, + "grad_norm": 0.44636234641075134, + "learning_rate": 0.0006, + "loss": 2.3053, + "step": 6980 + }, + { + "epoch": 0.02607372261140082, + "grad_norm": 0.3688061833381653, + "learning_rate": 0.0006, + "loss": 2.1619, + "step": 6990 + }, + { + "epoch": 0.026111024074364195, + "grad_norm": 0.39284953474998474, + "learning_rate": 0.0006, + "loss": 2.1465, + "step": 7000 + }, + { + "epoch": 0.026111024074364195, + 
"eval_valid_loss": 2.2263829708099365, + "eval_valid_loss/all": 2.085742235183716, + "eval_valid_loss/end_span": 1.2787052392959595, + "eval_valid_perplexity/batch": 8.050564765930176, + "eval_valid_perplexity/end_span": 3.5919859409332275, + "eval_valid_perplexity/fim": 2.2111380100250244, + "eval_valid_perplexity/first_seq": 14.917359352111816, + "eval_valid_perplexity/last_seq": 9.556831359863281, + "eval_valid_perplexity/second_seq": 14.441956520080566, + "eval_valid_perplexity/seq": 9.072091102600098, + "eval_valid_reconstruction/all": 0.28490880131721497, + "eval_valid_reconstruction/end_span": 0.69398033618927, + "eval_valid_reconstruction/fim": 0.15230168402194977, + "eval_valid_reconstruction/first_seq": 0.16530892252922058, + "eval_valid_reconstruction/last_seq": 0.3076981008052826, + "eval_valid_reconstruction/second_seq": 0.18226687610149384, + "eval_valid_runtime": 608.997, + "eval_valid_samples_per_second": 0.315, + "eval_valid_steps_per_second": 0.315, + "step": 7000 + }, + { + "epoch": 0.026111024074364195, + "eval_train_loss": 2.220607042312622, + "eval_train_loss/all": 2.052891492843628, + "eval_train_loss/end_span": 1.2489867210388184, + "eval_train_perplexity/batch": 7.790394306182861, + "eval_train_perplexity/end_span": 3.4868080615997314, + "eval_train_perplexity/fim": 2.0380706787109375, + "eval_train_perplexity/first_seq": 15.794404029846191, + "eval_train_perplexity/last_seq": 9.329318046569824, + "eval_train_perplexity/second_seq": 14.401126861572266, + "eval_train_perplexity/seq": 8.963994979858398, + "eval_train_reconstruction/all": 0.2761158049106598, + "eval_train_reconstruction/end_span": 0.7028924822807312, + "eval_train_reconstruction/fim": 0.1373618245124817, + "eval_train_reconstruction/first_seq": 0.14673066139221191, + "eval_train_reconstruction/last_seq": 0.31290164589881897, + "eval_train_reconstruction/second_seq": 0.18309716880321503, + "eval_train_runtime": 620.4363, + "eval_train_samples_per_second": 0.309, + 
"eval_train_steps_per_second": 0.309, + "step": 7000 + }, + { + "epoch": 0.026148325537327575, + "grad_norm": 0.35145384073257446, + "learning_rate": 0.0006, + "loss": 2.3858, + "step": 7010 + }, + { + "epoch": 0.02618562700029095, + "grad_norm": 0.42375096678733826, + "learning_rate": 0.0006, + "loss": 2.2954, + "step": 7020 + }, + { + "epoch": 0.02622292846325433, + "grad_norm": 0.49889907240867615, + "learning_rate": 0.0006, + "loss": 2.2855, + "step": 7030 + }, + { + "epoch": 0.026260229926217706, + "grad_norm": 0.5499227643013, + "learning_rate": 0.0006, + "loss": 2.3072, + "step": 7040 + }, + { + "epoch": 0.026297531389181082, + "grad_norm": 0.47621333599090576, + "learning_rate": 0.0006, + "loss": 2.2215, + "step": 7050 + }, + { + "epoch": 0.026297531389181082, + "eval_valid_loss": 2.227569818496704, + "eval_valid_loss/all": 2.0870494842529297, + "eval_valid_loss/end_span": 1.2553235292434692, + "eval_valid_perplexity/batch": 8.061095237731934, + "eval_valid_perplexity/end_span": 3.5089733600616455, + "eval_valid_perplexity/fim": 2.295464277267456, + "eval_valid_perplexity/first_seq": 14.797355651855469, + "eval_valid_perplexity/last_seq": 9.490907669067383, + "eval_valid_perplexity/second_seq": 13.73488712310791, + "eval_valid_perplexity/seq": 9.082905769348145, + "eval_valid_reconstruction/all": 0.28443393111228943, + "eval_valid_reconstruction/end_span": 0.7131630182266235, + "eval_valid_reconstruction/fim": 0.15924197435379028, + "eval_valid_reconstruction/first_seq": 0.16787809133529663, + "eval_valid_reconstruction/last_seq": 0.31085315346717834, + "eval_valid_reconstruction/second_seq": 0.19618616998195648, + "eval_valid_runtime": 613.1612, + "eval_valid_samples_per_second": 0.313, + "eval_valid_steps_per_second": 0.313, + "step": 7050 + }, + { + "epoch": 0.026297531389181082, + "eval_train_loss": 2.221165418624878, + "eval_train_loss/all": 2.0532822608947754, + "eval_train_loss/end_span": 1.2291686534881592, + "eval_train_perplexity/batch": 
7.7934393882751465, + "eval_train_perplexity/end_span": 3.418386459350586, + "eval_train_perplexity/fim": 2.09423565864563, + "eval_train_perplexity/first_seq": 15.285383224487305, + "eval_train_perplexity/last_seq": 9.241679191589355, + "eval_train_perplexity/second_seq": 14.004036903381348, + "eval_train_perplexity/seq": 8.966509819030762, + "eval_train_reconstruction/all": 0.27559253573417664, + "eval_train_reconstruction/end_span": 0.7206329107284546, + "eval_train_reconstruction/fim": 0.1424606293439865, + "eval_train_reconstruction/first_seq": 0.1533026099205017, + "eval_train_reconstruction/last_seq": 0.3190266191959381, + "eval_train_reconstruction/second_seq": 0.18640752136707306, + "eval_train_runtime": 613.6899, + "eval_train_samples_per_second": 0.313, + "eval_train_steps_per_second": 0.313, + "step": 7050 + }, + { + "epoch": 0.02633483285214446, + "grad_norm": 0.3623928129673004, + "learning_rate": 0.0006, + "loss": 2.1937, + "step": 7060 + }, + { + "epoch": 0.026372134315107838, + "grad_norm": 0.32521429657936096, + "learning_rate": 0.0006, + "loss": 2.2883, + "step": 7070 + }, + { + "epoch": 0.026409435778071217, + "grad_norm": 0.345354288816452, + "learning_rate": 0.0006, + "loss": 2.1625, + "step": 7080 + }, + { + "epoch": 0.026446737241034593, + "grad_norm": 0.3726172149181366, + "learning_rate": 0.0006, + "loss": 2.2397, + "step": 7090 + }, + { + "epoch": 0.026484038703997972, + "grad_norm": 0.5694578289985657, + "learning_rate": 0.0006, + "loss": 2.3058, + "step": 7100 + }, + { + "epoch": 0.026484038703997972, + "eval_valid_loss": 2.224196434020996, + "eval_valid_loss/all": 2.083839178085327, + "eval_valid_loss/end_span": 1.3756181001663208, + "eval_valid_perplexity/batch": 8.035258293151855, + "eval_valid_perplexity/end_span": 3.95752215385437, + "eval_valid_perplexity/fim": 2.3797504901885986, + "eval_valid_perplexity/first_seq": 14.863312721252441, + "eval_valid_perplexity/last_seq": 8.770219802856445, + "eval_valid_perplexity/second_seq": 
13.568273544311523, + "eval_valid_perplexity/seq": 9.055670738220215, + "eval_valid_reconstruction/all": 0.28563934564590454, + "eval_valid_reconstruction/end_span": 0.6858310699462891, + "eval_valid_reconstruction/fim": 0.16680286824703217, + "eval_valid_reconstruction/first_seq": 0.1693394035100937, + "eval_valid_reconstruction/last_seq": 0.3344682455062866, + "eval_valid_reconstruction/second_seq": 0.1996517926454544, + "eval_valid_runtime": 608.2924, + "eval_valid_samples_per_second": 0.316, + "eval_valid_steps_per_second": 0.316, + "step": 7100 + }, + { + "epoch": 0.026484038703997972, + "eval_train_loss": 2.219783067703247, + "eval_train_loss/all": 2.052061080932617, + "eval_train_loss/end_span": 1.3405177593231201, + "eval_train_perplexity/batch": 7.783927917480469, + "eval_train_perplexity/end_span": 3.821021318435669, + "eval_train_perplexity/fim": 2.054439067840576, + "eval_train_perplexity/first_seq": 15.630928993225098, + "eval_train_perplexity/last_seq": 9.554252624511719, + "eval_train_perplexity/second_seq": 14.470719337463379, + "eval_train_perplexity/seq": 8.95528507232666, + "eval_train_reconstruction/all": 0.27617815136909485, + "eval_train_reconstruction/end_span": 0.6939637660980225, + "eval_train_reconstruction/fim": 0.1383761763572693, + "eval_train_reconstruction/first_seq": 0.1519167721271515, + "eval_train_reconstruction/last_seq": 0.3053806722164154, + "eval_train_reconstruction/second_seq": 0.1756424754858017, + "eval_train_runtime": 614.0418, + "eval_train_samples_per_second": 0.313, + "eval_train_steps_per_second": 0.313, + "step": 7100 + }, + { + "epoch": 0.02652134016696135, + "grad_norm": 0.46749186515808105, + "learning_rate": 0.0006, + "loss": 2.2176, + "step": 7110 + }, + { + "epoch": 0.026558641629924724, + "grad_norm": 0.6576831340789795, + "learning_rate": 0.0006, + "loss": 2.3448, + "step": 7120 + }, + { + "epoch": 0.026595943092888104, + "grad_norm": 0.3564291298389435, + "learning_rate": 0.0006, + "loss": 2.3053, + "step": 
7130 + }, + { + "epoch": 0.02663324455585148, + "grad_norm": 0.4611573815345764, + "learning_rate": 0.0006, + "loss": 2.3153, + "step": 7140 + }, + { + "epoch": 0.02667054601881486, + "grad_norm": 0.35199499130249023, + "learning_rate": 0.0006, + "loss": 2.2604, + "step": 7150 + }, + { + "epoch": 0.02667054601881486, + "eval_valid_loss": 2.224355459213257, + "eval_valid_loss/all": 2.084012985229492, + "eval_valid_loss/end_span": 1.3550379276275635, + "eval_valid_perplexity/batch": 8.03665542602539, + "eval_valid_perplexity/end_span": 3.87690806388855, + "eval_valid_perplexity/fim": 2.2553205490112305, + "eval_valid_perplexity/first_seq": 15.084362030029297, + "eval_valid_perplexity/last_seq": 9.274800300598145, + "eval_valid_perplexity/second_seq": 14.043966293334961, + "eval_valid_perplexity/seq": 9.051594734191895, + "eval_valid_reconstruction/all": 0.2858107089996338, + "eval_valid_reconstruction/end_span": 0.6872190237045288, + "eval_valid_reconstruction/fim": 0.1565578430891037, + "eval_valid_reconstruction/first_seq": 0.16472162306308746, + "eval_valid_reconstruction/last_seq": 0.31927725672721863, + "eval_valid_reconstruction/second_seq": 0.18896548449993134, + "eval_valid_runtime": 615.0378, + "eval_valid_samples_per_second": 0.312, + "eval_valid_steps_per_second": 0.312, + "step": 7150 + }, + { + "epoch": 0.02667054601881486, + "eval_train_loss": 2.220426321029663, + "eval_train_loss/all": 2.052621603012085, + "eval_train_loss/end_span": 1.3351078033447266, + "eval_train_perplexity/batch": 7.788292407989502, + "eval_train_perplexity/end_span": 3.800405740737915, + "eval_train_perplexity/fim": 2.0264036655426025, + "eval_train_perplexity/first_seq": 15.425790786743164, + "eval_train_perplexity/last_seq": 9.497272491455078, + "eval_train_perplexity/second_seq": 14.411262512207031, + "eval_train_perplexity/seq": 8.959238052368164, + "eval_train_reconstruction/all": 0.27632221579551697, + "eval_train_reconstruction/end_span": 0.6930846571922302, + 
"eval_train_reconstruction/fim": 0.13630910217761993, + "eval_train_reconstruction/first_seq": 0.15485098958015442, + "eval_train_reconstruction/last_seq": 0.3095373213291168, + "eval_train_reconstruction/second_seq": 0.17973315715789795, + "eval_train_runtime": 609.5426, + "eval_train_samples_per_second": 0.315, + "eval_train_steps_per_second": 0.315, + "step": 7150 + }, + { + "epoch": 0.026707847481778235, + "grad_norm": 0.45397815108299255, + "learning_rate": 0.0006, + "loss": 2.2457, + "step": 7160 + }, + { + "epoch": 0.02674514894474161, + "grad_norm": 0.4419160783290863, + "learning_rate": 0.0006, + "loss": 2.3189, + "step": 7170 + }, + { + "epoch": 0.02678245040770499, + "grad_norm": 0.4883061647415161, + "learning_rate": 0.0006, + "loss": 2.2201, + "step": 7180 + }, + { + "epoch": 0.026819751870668367, + "grad_norm": 0.3073897063732147, + "learning_rate": 0.0006, + "loss": 2.2773, + "step": 7190 + }, + { + "epoch": 0.026857053333631746, + "grad_norm": 6.725080490112305, + "learning_rate": 0.0006, + "loss": 2.1531, + "step": 7200 + }, + { + "epoch": 0.026857053333631746, + "eval_valid_loss": 2.2260539531707764, + "eval_valid_loss/all": 2.0854859352111816, + "eval_valid_loss/end_span": 1.2435933351516724, + "eval_valid_perplexity/batch": 8.048501968383789, + "eval_valid_perplexity/end_span": 3.468052864074707, + "eval_valid_perplexity/fim": 2.3104100227355957, + "eval_valid_perplexity/first_seq": 15.02281665802002, + "eval_valid_perplexity/last_seq": 8.89590835571289, + "eval_valid_perplexity/second_seq": 13.894493103027344, + "eval_valid_perplexity/seq": 9.065804481506348, + "eval_valid_reconstruction/all": 0.28520098328590393, + "eval_valid_reconstruction/end_span": 0.7099679708480835, + "eval_valid_reconstruction/fim": 0.16128437221050262, + "eval_valid_reconstruction/first_seq": 0.1619115173816681, + "eval_valid_reconstruction/last_seq": 0.3319006562232971, + "eval_valid_reconstruction/second_seq": 0.19195975363254547, + "eval_valid_runtime": 606.0685, + 
"eval_valid_samples_per_second": 0.317, + "eval_valid_steps_per_second": 0.317, + "step": 7200 + }, + { + "epoch": 0.026857053333631746, + "eval_train_loss": 2.2231791019439697, + "eval_train_loss/all": 2.055422782897949, + "eval_train_loss/end_span": 1.2118316888809204, + "eval_train_perplexity/batch": 7.810139179229736, + "eval_train_perplexity/end_span": 3.359632730484009, + "eval_train_perplexity/fim": 2.1736133098602295, + "eval_train_perplexity/first_seq": 15.508386611938477, + "eval_train_perplexity/last_seq": 9.472892761230469, + "eval_train_perplexity/second_seq": 14.089046478271484, + "eval_train_perplexity/seq": 8.98962116241455, + "eval_train_reconstruction/all": 0.27542996406555176, + "eval_train_reconstruction/end_span": 0.7183751463890076, + "eval_train_reconstruction/fim": 0.14907602965831757, + "eval_train_reconstruction/first_seq": 0.15246917307376862, + "eval_train_reconstruction/last_seq": 0.3095250427722931, + "eval_train_reconstruction/second_seq": 0.18688246607780457, + "eval_train_runtime": 614.6501, + "eval_train_samples_per_second": 0.312, + "eval_train_steps_per_second": 0.312, + "step": 7200 + }, + { + "epoch": 0.026894354796595122, + "grad_norm": 0.2948090434074402, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 7210 + }, + { + "epoch": 0.0269316562595585, + "grad_norm": 0.43722984194755554, + "learning_rate": 0.0006, + "loss": 2.1177, + "step": 7220 + }, + { + "epoch": 0.026968957722521877, + "grad_norm": 0.40496018528938293, + "learning_rate": 0.0006, + "loss": 2.3014, + "step": 7230 + }, + { + "epoch": 0.027006259185485253, + "grad_norm": 0.5858156085014343, + "learning_rate": 0.0006, + "loss": 2.181, + "step": 7240 + }, + { + "epoch": 0.027043560648448633, + "grad_norm": 0.33544278144836426, + "learning_rate": 0.0006, + "loss": 2.2072, + "step": 7250 + }, + { + "epoch": 0.027043560648448633, + "eval_valid_loss": 2.2206525802612305, + "eval_valid_loss/all": 2.08064341545105, + "eval_valid_loss/end_span": 1.46475088596344, + 
"eval_valid_perplexity/batch": 8.009620666503906, + "eval_valid_perplexity/end_span": 4.326465129852295, + "eval_valid_perplexity/fim": 2.299574851989746, + "eval_valid_perplexity/first_seq": 15.094985961914062, + "eval_valid_perplexity/last_seq": 9.376527786254883, + "eval_valid_perplexity/second_seq": 13.887269973754883, + "eval_valid_perplexity/seq": 9.026476860046387, + "eval_valid_reconstruction/all": 0.28658589720726013, + "eval_valid_reconstruction/end_span": 0.6612400412559509, + "eval_valid_reconstruction/fim": 0.16113528609275818, + "eval_valid_reconstruction/first_seq": 0.1617087870836258, + "eval_valid_reconstruction/last_seq": 0.3142935335636139, + "eval_valid_reconstruction/second_seq": 0.19439978897571564, + "eval_valid_runtime": 620.402, + "eval_valid_samples_per_second": 0.309, + "eval_valid_steps_per_second": 0.309, + "step": 7250 + }, + { + "epoch": 0.027043560648448633, + "eval_train_loss": 2.219939947128296, + "eval_train_loss/all": 2.052365303039551, + "eval_train_loss/end_span": 1.4385714530944824, + "eval_train_perplexity/batch": 7.786296367645264, + "eval_train_perplexity/end_span": 4.214670658111572, + "eval_train_perplexity/fim": 2.0927932262420654, + "eval_train_perplexity/first_seq": 15.184246063232422, + "eval_train_perplexity/last_seq": 9.015652656555176, + "eval_train_perplexity/second_seq": 14.369370460510254, + "eval_train_perplexity/seq": 8.962275505065918, + "eval_train_reconstruction/all": 0.2761179506778717, + "eval_train_reconstruction/end_span": 0.6692336201667786, + "eval_train_reconstruction/fim": 0.14249028265476227, + "eval_train_reconstruction/first_seq": 0.15807639062404633, + "eval_train_reconstruction/last_seq": 0.3243390917778015, + "eval_train_reconstruction/second_seq": 0.18156640231609344, + "eval_train_runtime": 617.2437, + "eval_train_samples_per_second": 0.311, + "eval_train_steps_per_second": 0.311, + "step": 7250 + }, + { + "epoch": 0.02708086211141201, + "grad_norm": 0.5748642683029175, + "learning_rate": 
0.0006, + "loss": 2.1897, + "step": 7260 + }, + { + "epoch": 0.027118163574375388, + "grad_norm": 0.4642566442489624, + "learning_rate": 0.0006, + "loss": 2.316, + "step": 7270 + }, + { + "epoch": 0.027155465037338764, + "grad_norm": 0.32935619354248047, + "learning_rate": 0.0006, + "loss": 2.121, + "step": 7280 + }, + { + "epoch": 0.02719276650030214, + "grad_norm": 0.33677589893341064, + "learning_rate": 0.0006, + "loss": 2.3955, + "step": 7290 + }, + { + "epoch": 0.02723006796326552, + "grad_norm": 0.5243373513221741, + "learning_rate": 0.0006, + "loss": 2.2941, + "step": 7300 + }, + { + "epoch": 0.02723006796326552, + "eval_valid_loss": 2.222505807876587, + "eval_valid_loss/all": 2.082172155380249, + "eval_valid_loss/end_span": 1.2867467403411865, + "eval_valid_perplexity/batch": 8.02187442779541, + "eval_valid_perplexity/end_span": 3.6209874153137207, + "eval_valid_perplexity/fim": 2.2497780323028564, + "eval_valid_perplexity/first_seq": 14.931365013122559, + "eval_valid_perplexity/last_seq": 8.615735054016113, + "eval_valid_perplexity/second_seq": 13.368720054626465, + "eval_valid_perplexity/seq": 9.036252975463867, + "eval_valid_reconstruction/all": 0.2858424186706543, + "eval_valid_reconstruction/end_span": 0.6991544961929321, + "eval_valid_reconstruction/fim": 0.15614667534828186, + "eval_valid_reconstruction/first_seq": 0.16715282201766968, + "eval_valid_reconstruction/last_seq": 0.34276241064071655, + "eval_valid_reconstruction/second_seq": 0.20438706874847412, + "eval_valid_runtime": 597.549, + "eval_valid_samples_per_second": 0.321, + "eval_valid_steps_per_second": 0.321, + "step": 7300 + }, + { + "epoch": 0.02723006796326552, + "eval_train_loss": 2.219003677368164, + "eval_train_loss/all": 2.051278591156006, + "eval_train_loss/end_span": 1.258683443069458, + "eval_train_perplexity/batch": 7.777839660644531, + "eval_train_perplexity/end_span": 3.5207831859588623, + "eval_train_perplexity/fim": 2.4547150135040283, + "eval_train_perplexity/first_seq": 
15.78964900970459, + "eval_train_perplexity/last_seq": 9.350115776062012, + "eval_train_perplexity/second_seq": 14.502636909484863, + "eval_train_perplexity/seq": 8.950167655944824, + "eval_train_reconstruction/all": 0.2761732339859009, + "eval_train_reconstruction/end_span": 0.7058776617050171, + "eval_train_reconstruction/fim": 0.17265266180038452, + "eval_train_reconstruction/first_seq": 0.14707453548908234, + "eval_train_reconstruction/last_seq": 0.3127956986427307, + "eval_train_reconstruction/second_seq": 0.17567811906337738, + "eval_train_runtime": 621.9501, + "eval_train_samples_per_second": 0.309, + "eval_train_steps_per_second": 0.309, + "step": 7300 + }, + { + "epoch": 0.027267369426228896, + "grad_norm": 0.5498791933059692, + "learning_rate": 0.0006, + "loss": 2.2185, + "step": 7310 + }, + { + "epoch": 0.027304670889192275, + "grad_norm": 0.28497937321662903, + "learning_rate": 0.0006, + "loss": 2.3606, + "step": 7320 + }, + { + "epoch": 0.02734197235215565, + "grad_norm": 0.37158071994781494, + "learning_rate": 0.0006, + "loss": 2.3422, + "step": 7330 + }, + { + "epoch": 0.02737927381511903, + "grad_norm": 0.36103522777557373, + "learning_rate": 0.0006, + "loss": 2.1883, + "step": 7340 + }, + { + "epoch": 0.027416575278082406, + "grad_norm": 0.413963258266449, + "learning_rate": 0.0006, + "loss": 2.3994, + "step": 7350 + }, + { + "epoch": 0.027416575278082406, + "eval_valid_loss": 2.2249515056610107, + "eval_valid_loss/all": 2.084127187728882, + "eval_valid_loss/end_span": 1.3101481199264526, + "eval_valid_perplexity/batch": 8.037572860717773, + "eval_valid_perplexity/end_span": 3.7067227363586426, + "eval_valid_perplexity/fim": 2.5221402645111084, + "eval_valid_perplexity/first_seq": 15.099909782409668, + "eval_valid_perplexity/last_seq": 9.426955223083496, + "eval_valid_perplexity/second_seq": 13.7286958694458, + "eval_valid_perplexity/seq": 9.046500205993652, + "eval_valid_reconstruction/all": 0.2850819230079651, + 
"eval_valid_reconstruction/end_span": 0.6973950266838074, + "eval_valid_reconstruction/fim": 0.17805065214633942, + "eval_valid_reconstruction/first_seq": 0.1607467234134674, + "eval_valid_reconstruction/last_seq": 0.31464797258377075, + "eval_valid_reconstruction/second_seq": 0.200054332613945, + "eval_valid_runtime": 611.3438, + "eval_valid_samples_per_second": 0.314, + "eval_valid_steps_per_second": 0.314, + "step": 7350 + }, + { + "epoch": 0.027416575278082406, + "eval_train_loss": 2.2235987186431885, + "eval_train_loss/all": 2.0551390647888184, + "eval_train_loss/end_span": 1.2936197519302368, + "eval_train_perplexity/batch": 7.807923793792725, + "eval_train_perplexity/end_span": 3.6459600925445557, + "eval_train_perplexity/fim": 2.067871332168579, + "eval_train_perplexity/first_seq": 15.745848655700684, + "eval_train_perplexity/last_seq": 9.242958068847656, + "eval_train_perplexity/second_seq": 14.466307640075684, + "eval_train_perplexity/seq": 8.983086585998535, + "eval_train_reconstruction/all": 0.27506929636001587, + "eval_train_reconstruction/end_span": 0.7059118151664734, + "eval_train_reconstruction/fim": 0.14018858969211578, + "eval_train_reconstruction/first_seq": 0.1460767537355423, + "eval_train_reconstruction/last_seq": 0.3205454647541046, + "eval_train_reconstruction/second_seq": 0.17730627954006195, + "eval_train_runtime": 611.7724, + "eval_train_samples_per_second": 0.314, + "eval_train_steps_per_second": 0.314, + "step": 7350 + }, + { + "epoch": 0.027453876741045782, + "grad_norm": 0.49834397435188293, + "learning_rate": 0.0006, + "loss": 2.3516, + "step": 7360 + }, + { + "epoch": 0.027491178204009162, + "grad_norm": 0.36372825503349304, + "learning_rate": 0.0006, + "loss": 2.2486, + "step": 7370 + }, + { + "epoch": 0.027528479666972538, + "grad_norm": 0.32320359349250793, + "learning_rate": 0.0006, + "loss": 2.4326, + "step": 7380 + }, + { + "epoch": 0.027565781129935917, + "grad_norm": 0.7311534285545349, + "learning_rate": 0.0006, + "loss": 
2.233, + "step": 7390 + }, + { + "epoch": 0.027603082592899293, + "grad_norm": 0.2759965658187866, + "learning_rate": 0.0006, + "loss": 2.377, + "step": 7400 + }, + { + "epoch": 0.027603082592899293, + "eval_valid_loss": 2.232095718383789, + "eval_valid_loss/all": 2.0908138751983643, + "eval_valid_loss/end_span": 1.3325822353363037, + "eval_valid_perplexity/batch": 8.091498374938965, + "eval_valid_perplexity/end_span": 3.7908196449279785, + "eval_valid_perplexity/fim": 2.1759562492370605, + "eval_valid_perplexity/first_seq": 14.272161483764648, + "eval_valid_perplexity/last_seq": 9.11779499053955, + "eval_valid_perplexity/second_seq": 13.739652633666992, + "eval_valid_perplexity/seq": 9.11326789855957, + "eval_valid_reconstruction/all": 0.2836109697818756, + "eval_valid_reconstruction/end_span": 0.7000173926353455, + "eval_valid_reconstruction/fim": 0.14794450998306274, + "eval_valid_reconstruction/first_seq": 0.18138104677200317, + "eval_valid_reconstruction/last_seq": 0.32103320956230164, + "eval_valid_reconstruction/second_seq": 0.1947653442621231, + "eval_valid_runtime": 595.9627, + "eval_valid_samples_per_second": 0.322, + "eval_valid_steps_per_second": 0.322, + "step": 7400 + }, + { + "epoch": 0.027603082592899293, + "eval_train_loss": 2.2292490005493164, + "eval_train_loss/all": 2.060447931289673, + "eval_train_loss/end_span": 1.2931748628616333, + "eval_train_perplexity/batch": 7.849484920501709, + "eval_train_perplexity/end_span": 3.644338369369507, + "eval_train_perplexity/fim": 1.974335789680481, + "eval_train_perplexity/first_seq": 15.654500007629395, + "eval_train_perplexity/last_seq": 9.38444709777832, + "eval_train_perplexity/second_seq": 14.2576322555542, + "eval_train_perplexity/seq": 9.03937816619873, + "eval_train_reconstruction/all": 0.2735862135887146, + "eval_train_reconstruction/end_span": 0.7095617055892944, + "eval_train_reconstruction/fim": 0.13015274703502655, + "eval_train_reconstruction/first_seq": 0.1510711908340454, + 
"eval_train_reconstruction/last_seq": 0.310514360666275, + "eval_train_reconstruction/second_seq": 0.182538703083992, + "eval_train_runtime": 620.9126, + "eval_train_samples_per_second": 0.309, + "eval_train_steps_per_second": 0.309, + "step": 7400 + }, + { + "epoch": 0.027640384055862673, + "grad_norm": 0.43528908491134644, + "learning_rate": 0.0006, + "loss": 2.253, + "step": 7410 + }, + { + "epoch": 0.02767768551882605, + "grad_norm": 0.4429200291633606, + "learning_rate": 0.0006, + "loss": 2.2247, + "step": 7420 + }, + { + "epoch": 0.027714986981789425, + "grad_norm": 0.56882643699646, + "learning_rate": 0.0006, + "loss": 2.3104, + "step": 7430 + }, + { + "epoch": 0.027752288444752804, + "grad_norm": 0.49955272674560547, + "learning_rate": 0.0006, + "loss": 2.2035, + "step": 7440 + }, + { + "epoch": 0.02778958990771618, + "grad_norm": 0.38944751024246216, + "learning_rate": 0.0006, + "loss": 2.3206, + "step": 7450 + }, + { + "epoch": 0.02778958990771618, + "eval_valid_loss": 2.222792387008667, + "eval_valid_loss/all": 2.082007646560669, + "eval_valid_loss/end_span": 1.2853859663009644, + "eval_valid_perplexity/batch": 8.02055549621582, + "eval_valid_perplexity/end_span": 3.616063356399536, + "eval_valid_perplexity/fim": 2.5965592861175537, + "eval_valid_perplexity/first_seq": 14.748268127441406, + "eval_valid_perplexity/last_seq": 9.543522834777832, + "eval_valid_perplexity/second_seq": 13.523736953735352, + "eval_valid_perplexity/seq": 9.02851676940918, + "eval_valid_reconstruction/all": 0.28614541888237, + "eval_valid_reconstruction/end_span": 0.7005766034126282, + "eval_valid_reconstruction/fim": 0.18335755169391632, + "eval_valid_reconstruction/first_seq": 0.17094732820987701, + "eval_valid_reconstruction/last_seq": 0.3088093400001526, + "eval_valid_reconstruction/second_seq": 0.19754555821418762, + "eval_valid_runtime": 436.3717, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 7450 + }, + { + "epoch": 
0.02778958990771618, + "eval_train_loss": 2.221623182296753, + "eval_train_loss/all": 2.0536062717437744, + "eval_train_loss/end_span": 1.257736325263977, + "eval_train_perplexity/batch": 7.79596471786499, + "eval_train_perplexity/end_span": 3.5174500942230225, + "eval_train_perplexity/fim": 1.9985638856887817, + "eval_train_perplexity/first_seq": 15.43333911895752, + "eval_train_perplexity/last_seq": 9.069578170776367, + "eval_train_perplexity/second_seq": 14.557083129882812, + "eval_train_perplexity/seq": 8.972777366638184, + "eval_train_reconstruction/all": 0.2756182849407196, + "eval_train_reconstruction/end_span": 0.7107647657394409, + "eval_train_reconstruction/fim": 0.13326168060302734, + "eval_train_reconstruction/first_seq": 0.15212683379650116, + "eval_train_reconstruction/last_seq": 0.32299554347991943, + "eval_train_reconstruction/second_seq": 0.17194806039333344, + "eval_train_runtime": 436.5056, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 7450 + }, + { + "epoch": 0.02782689137067956, + "grad_norm": 0.3539562225341797, + "learning_rate": 0.0006, + "loss": 2.3236, + "step": 7460 + }, + { + "epoch": 0.027864192833642935, + "grad_norm": 0.3801068067550659, + "learning_rate": 0.0006, + "loss": 2.2742, + "step": 7470 + }, + { + "epoch": 0.02790149429660631, + "grad_norm": 0.4202333986759186, + "learning_rate": 0.0006, + "loss": 2.3787, + "step": 7480 + }, + { + "epoch": 0.02793879575956969, + "grad_norm": 0.49254801869392395, + "learning_rate": 0.0006, + "loss": 2.1543, + "step": 7490 + }, + { + "epoch": 0.027976097222533067, + "grad_norm": 0.3356799781322479, + "learning_rate": 0.0006, + "loss": 2.2786, + "step": 7500 + }, + { + "epoch": 0.027976097222533067, + "eval_valid_loss": 2.2222917079925537, + "eval_valid_loss/all": 2.0818116664886475, + "eval_valid_loss/end_span": 1.2542800903320312, + "eval_valid_perplexity/batch": 8.018983840942383, + "eval_valid_perplexity/end_span": 3.5053138732910156, + 
"eval_valid_perplexity/fim": 2.5436158180236816, + "eval_valid_perplexity/first_seq": 14.615671157836914, + "eval_valid_perplexity/last_seq": 9.405158996582031, + "eval_valid_perplexity/second_seq": 14.194701194763184, + "eval_valid_perplexity/seq": 9.029688835144043, + "eval_valid_reconstruction/all": 0.28615713119506836, + "eval_valid_reconstruction/end_span": 0.7065688371658325, + "eval_valid_reconstruction/fim": 0.17949548363685608, + "eval_valid_reconstruction/first_seq": 0.174249067902565, + "eval_valid_reconstruction/last_seq": 0.3142777979373932, + "eval_valid_reconstruction/second_seq": 0.18709513545036316, + "eval_valid_runtime": 435.4334, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 7500 + }, + { + "epoch": 0.027976097222533067, + "eval_train_loss": 2.2209198474884033, + "eval_train_loss/all": 2.052722930908203, + "eval_train_loss/end_span": 1.2237673997879028, + "eval_train_perplexity/batch": 7.789081573486328, + "eval_train_perplexity/end_span": 3.399972677230835, + "eval_train_perplexity/fim": 2.2712535858154297, + "eval_train_perplexity/first_seq": 15.489012718200684, + "eval_train_perplexity/last_seq": 9.247760772705078, + "eval_train_perplexity/second_seq": 14.791190147399902, + "eval_train_perplexity/seq": 8.96228313446045, + "eval_train_reconstruction/all": 0.2758922576904297, + "eval_train_reconstruction/end_span": 0.7165600657463074, + "eval_train_reconstruction/fim": 0.15850435197353363, + "eval_train_reconstruction/first_seq": 0.15257732570171356, + "eval_train_reconstruction/last_seq": 0.31614187359809875, + "eval_train_reconstruction/second_seq": 0.16988146305084229, + "eval_train_runtime": 433.2205, + "eval_train_samples_per_second": 0.443, + "eval_train_steps_per_second": 0.443, + "step": 7500 + }, + { + "epoch": 0.028013398685496446, + "grad_norm": 0.40286919474601746, + "learning_rate": 0.0006, + "loss": 2.4336, + "step": 7510 + }, + { + "epoch": 0.028050700148459822, + "grad_norm": 
0.4528322219848633, + "learning_rate": 0.0006, + "loss": 2.1514, + "step": 7520 + }, + { + "epoch": 0.0280880016114232, + "grad_norm": 0.39976105093955994, + "learning_rate": 0.0006, + "loss": 2.2712, + "step": 7530 + }, + { + "epoch": 0.028125303074386578, + "grad_norm": 0.33679917454719543, + "learning_rate": 0.0006, + "loss": 2.2059, + "step": 7540 + }, + { + "epoch": 0.028162604537349954, + "grad_norm": 0.284929484128952, + "learning_rate": 0.0006, + "loss": 2.2257, + "step": 7550 + }, + { + "epoch": 0.028162604537349954, + "eval_valid_loss": 2.2254679203033447, + "eval_valid_loss/all": 2.0845727920532227, + "eval_valid_loss/end_span": 1.3556504249572754, + "eval_valid_perplexity/batch": 8.041155815124512, + "eval_valid_perplexity/end_span": 3.8792834281921387, + "eval_valid_perplexity/fim": 2.515770435333252, + "eval_valid_perplexity/first_seq": 15.238995552062988, + "eval_valid_perplexity/last_seq": 9.274471282958984, + "eval_valid_perplexity/second_seq": 13.740442276000977, + "eval_valid_perplexity/seq": 9.055619239807129, + "eval_valid_reconstruction/all": 0.28526046872138977, + "eval_valid_reconstruction/end_span": 0.6905751824378967, + "eval_valid_reconstruction/fim": 0.17720071971416473, + "eval_valid_reconstruction/first_seq": 0.15830017626285553, + "eval_valid_reconstruction/last_seq": 0.31909412145614624, + "eval_valid_reconstruction/second_seq": 0.1923086792230606, + "eval_valid_runtime": 434.0361, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 7550 + }, + { + "epoch": 0.028162604537349954, + "eval_train_loss": 2.2220065593719482, + "eval_train_loss/all": 2.0537986755371094, + "eval_train_loss/end_span": 1.3175972700119019, + "eval_train_perplexity/batch": 7.797464847564697, + "eval_train_perplexity/end_span": 3.7344377040863037, + "eval_train_perplexity/fim": 2.2129616737365723, + "eval_train_perplexity/first_seq": 15.821182250976562, + "eval_train_perplexity/last_seq": 9.420080184936523, + 
"eval_train_perplexity/second_seq": 14.180179595947266, + "eval_train_perplexity/seq": 8.974220275878906, + "eval_train_reconstruction/all": 0.2756424844264984, + "eval_train_reconstruction/end_span": 0.6995081305503845, + "eval_train_reconstruction/fim": 0.15319202840328217, + "eval_train_reconstruction/first_seq": 0.14341531693935394, + "eval_train_reconstruction/last_seq": 0.3115840256214142, + "eval_train_reconstruction/second_seq": 0.18482211232185364, + "eval_train_runtime": 434.946, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 7550 + }, + { + "epoch": 0.028199906000313333, + "grad_norm": 0.5421603918075562, + "learning_rate": 0.0006, + "loss": 2.193, + "step": 7560 + }, + { + "epoch": 0.02823720746327671, + "grad_norm": 0.3641490340232849, + "learning_rate": 0.0006, + "loss": 2.2781, + "step": 7570 + }, + { + "epoch": 0.02827450892624009, + "grad_norm": 0.6282708644866943, + "learning_rate": 0.0006, + "loss": 2.1363, + "step": 7580 + }, + { + "epoch": 0.028311810389203464, + "grad_norm": 0.5038862824440002, + "learning_rate": 0.0006, + "loss": 2.2027, + "step": 7590 + }, + { + "epoch": 0.02834911185216684, + "grad_norm": 0.27220094203948975, + "learning_rate": 0.0006, + "loss": 2.157, + "step": 7600 + }, + { + "epoch": 0.02834911185216684, + "eval_valid_loss": 2.228092670440674, + "eval_valid_loss/all": 2.087188959121704, + "eval_valid_loss/end_span": 1.2563732862472534, + "eval_valid_perplexity/batch": 8.062219619750977, + "eval_valid_perplexity/end_span": 3.5126588344573975, + "eval_valid_perplexity/fim": 2.2341179847717285, + "eval_valid_perplexity/first_seq": 14.998734474182129, + "eval_valid_perplexity/last_seq": 9.138055801391602, + "eval_valid_perplexity/second_seq": 14.138858795166016, + "eval_valid_perplexity/seq": 9.089489936828613, + "eval_valid_reconstruction/all": 0.2844823896884918, + "eval_valid_reconstruction/end_span": 0.7044139504432678, + "eval_valid_reconstruction/fim": 0.15490929782390594, + 
"eval_valid_reconstruction/first_seq": 0.1666962057352066, + "eval_valid_reconstruction/last_seq": 0.32081589102745056, + "eval_valid_reconstruction/second_seq": 0.18692365288734436, + "eval_valid_runtime": 436.7601, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 7600 + }, + { + "epoch": 0.02834911185216684, + "eval_train_loss": 2.226202964782715, + "eval_train_loss/all": 2.058171033859253, + "eval_train_loss/end_span": 1.225319266319275, + "eval_train_perplexity/batch": 7.8316330909729, + "eval_train_perplexity/end_span": 3.4052531719207764, + "eval_train_perplexity/fim": 2.0173797607421875, + "eval_train_perplexity/first_seq": 15.584466934204102, + "eval_train_perplexity/last_seq": 9.04158878326416, + "eval_train_perplexity/second_seq": 14.642136573791504, + "eval_train_perplexity/seq": 9.0224027633667, + "eval_train_reconstruction/all": 0.27433958649635315, + "eval_train_reconstruction/end_span": 0.7123144268989563, + "eval_train_reconstruction/fim": 0.13430656492710114, + "eval_train_reconstruction/first_seq": 0.14977186918258667, + "eval_train_reconstruction/last_seq": 0.3254989981651306, + "eval_train_reconstruction/second_seq": 0.17081518471240997, + "eval_train_runtime": 435.9913, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 7600 + }, + { + "epoch": 0.02838641331513022, + "grad_norm": 0.3323260545730591, + "learning_rate": 0.0006, + "loss": 2.3928, + "step": 7610 + }, + { + "epoch": 0.028423714778093596, + "grad_norm": 0.7840943336486816, + "learning_rate": 0.0006, + "loss": 2.1813, + "step": 7620 + }, + { + "epoch": 0.028461016241056975, + "grad_norm": 0.32754138112068176, + "learning_rate": 0.0006, + "loss": 2.2623, + "step": 7630 + }, + { + "epoch": 0.02849831770402035, + "grad_norm": 0.4716815948486328, + "learning_rate": 0.0006, + "loss": 2.2879, + "step": 7640 + }, + { + "epoch": 0.02853561916698373, + "grad_norm": 0.47583094239234924, + "learning_rate": 0.0006, + 
"loss": 2.4308, + "step": 7650 + }, + { + "epoch": 0.02853561916698373, + "eval_valid_loss": 2.2255489826202393, + "eval_valid_loss/all": 2.084859848022461, + "eval_valid_loss/end_span": 1.3012703657150269, + "eval_valid_perplexity/batch": 8.043463706970215, + "eval_valid_perplexity/end_span": 3.6739609241485596, + "eval_valid_perplexity/fim": 2.950833797454834, + "eval_valid_perplexity/first_seq": 14.68184757232666, + "eval_valid_perplexity/last_seq": 8.937191009521484, + "eval_valid_perplexity/second_seq": 13.647347450256348, + "eval_valid_perplexity/seq": 9.057693481445312, + "eval_valid_reconstruction/all": 0.28535133600234985, + "eval_valid_reconstruction/end_span": 0.6951091289520264, + "eval_valid_reconstruction/fim": 0.20796240866184235, + "eval_valid_reconstruction/first_seq": 0.1713166981935501, + "eval_valid_reconstruction/last_seq": 0.33233439922332764, + "eval_valid_reconstruction/second_seq": 0.19954191148281097, + "eval_valid_runtime": 435.3036, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 7650 + }, + { + "epoch": 0.02853561916698373, + "eval_train_loss": 2.2228660583496094, + "eval_train_loss/all": 2.0547986030578613, + "eval_train_loss/end_span": 1.265215277671814, + "eval_train_perplexity/batch": 7.8052659034729, + "eval_train_perplexity/end_span": 3.543855667114258, + "eval_train_perplexity/fim": 2.1651763916015625, + "eval_train_perplexity/first_seq": 15.625493049621582, + "eval_train_perplexity/last_seq": 9.22199821472168, + "eval_train_perplexity/second_seq": 14.243817329406738, + "eval_train_perplexity/seq": 8.981643676757812, + "eval_train_reconstruction/all": 0.27521196007728577, + "eval_train_reconstruction/end_span": 0.705437183380127, + "eval_train_reconstruction/fim": 0.1480054408311844, + "eval_train_reconstruction/first_seq": 0.14950905740261078, + "eval_train_reconstruction/last_seq": 0.3153078556060791, + "eval_train_reconstruction/second_seq": 0.18342705070972443, + 
"eval_train_runtime": 432.0904, + "eval_train_samples_per_second": 0.444, + "eval_train_steps_per_second": 0.444, + "step": 7650 + }, + { + "epoch": 0.028572920629947107, + "grad_norm": 0.40656477212905884, + "learning_rate": 0.0006, + "loss": 2.2021, + "step": 7660 + }, + { + "epoch": 0.028610222092910483, + "grad_norm": 0.4316297173500061, + "learning_rate": 0.0006, + "loss": 2.2038, + "step": 7670 + }, + { + "epoch": 0.028647523555873862, + "grad_norm": 0.2767009437084198, + "learning_rate": 0.0006, + "loss": 2.2173, + "step": 7680 + }, + { + "epoch": 0.028684825018837238, + "grad_norm": 0.9198713898658752, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 7690 + }, + { + "epoch": 0.028722126481800617, + "grad_norm": 0.5561797022819519, + "learning_rate": 0.0006, + "loss": 2.1881, + "step": 7700 + }, + { + "epoch": 0.028722126481800617, + "eval_valid_loss": 2.2261548042297363, + "eval_valid_loss/all": 2.085034132003784, + "eval_valid_loss/end_span": 1.2754329442977905, + "eval_valid_perplexity/batch": 8.044865608215332, + "eval_valid_perplexity/end_span": 3.5802512168884277, + "eval_valid_perplexity/fim": 2.078505754470825, + "eval_valid_perplexity/first_seq": 15.0607271194458, + "eval_valid_perplexity/last_seq": 9.083709716796875, + "eval_valid_perplexity/second_seq": 14.173673629760742, + "eval_valid_perplexity/seq": 9.063185691833496, + "eval_valid_reconstruction/all": 0.2850181460380554, + "eval_valid_reconstruction/end_span": 0.7031712532043457, + "eval_valid_reconstruction/fim": 0.13999763131141663, + "eval_valid_reconstruction/first_seq": 0.16300034523010254, + "eval_valid_reconstruction/last_seq": 0.3221077620983124, + "eval_valid_reconstruction/second_seq": 0.18441396951675415, + "eval_valid_runtime": 436.4743, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 7700 + }, + { + "epoch": 0.028722126481800617, + "eval_train_loss": 2.222698211669922, + "eval_train_loss/all": 2.0543737411499023, + 
"eval_train_loss/end_span": 1.2375651597976685, + "eval_train_perplexity/batch": 7.801950454711914, + "eval_train_perplexity/end_span": 3.4472098350524902, + "eval_train_perplexity/fim": 2.1034812927246094, + "eval_train_perplexity/first_seq": 15.608752250671387, + "eval_train_perplexity/last_seq": 9.603992462158203, + "eval_train_perplexity/second_seq": 14.46031379699707, + "eval_train_perplexity/seq": 8.979918479919434, + "eval_train_reconstruction/all": 0.275299072265625, + "eval_train_reconstruction/end_span": 0.7128337621688843, + "eval_train_reconstruction/fim": 0.14244137704372406, + "eval_train_reconstruction/first_seq": 0.15171901881694794, + "eval_train_reconstruction/last_seq": 0.3049113154411316, + "eval_train_reconstruction/second_seq": 0.17929093539714813, + "eval_train_runtime": 438.7785, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 7700 + }, + { + "epoch": 0.028759427944763993, + "grad_norm": 0.36158299446105957, + "learning_rate": 0.0006, + "loss": 2.3049, + "step": 7710 + }, + { + "epoch": 0.028796729407727373, + "grad_norm": 0.6197908520698547, + "learning_rate": 0.0006, + "loss": 2.1866, + "step": 7720 + }, + { + "epoch": 0.02883403087069075, + "grad_norm": 0.3487332761287689, + "learning_rate": 0.0006, + "loss": 2.3396, + "step": 7730 + }, + { + "epoch": 0.028871332333654125, + "grad_norm": 0.45350322127342224, + "learning_rate": 0.0006, + "loss": 2.372, + "step": 7740 + }, + { + "epoch": 0.028908633796617504, + "grad_norm": 0.3626973330974579, + "learning_rate": 0.0006, + "loss": 2.127, + "step": 7750 + }, + { + "epoch": 0.028908633796617504, + "eval_valid_loss": 2.2204341888427734, + "eval_valid_loss/all": 2.079927444458008, + "eval_valid_loss/end_span": 1.325140357017517, + "eval_valid_perplexity/batch": 8.003888130187988, + "eval_valid_perplexity/end_span": 3.7627134323120117, + "eval_valid_perplexity/fim": 2.359593391418457, + "eval_valid_perplexity/first_seq": 15.023407936096191, + 
"eval_valid_perplexity/last_seq": 9.40653133392334, + "eval_valid_perplexity/second_seq": 13.63530445098877, + "eval_valid_perplexity/seq": 9.015280723571777, + "eval_valid_reconstruction/all": 0.28676649928092957, + "eval_valid_reconstruction/end_span": 0.6946278810501099, + "eval_valid_reconstruction/fim": 0.1667136400938034, + "eval_valid_reconstruction/first_seq": 0.16384708881378174, + "eval_valid_reconstruction/last_seq": 0.3148046135902405, + "eval_valid_reconstruction/second_seq": 0.19738571345806122, + "eval_valid_runtime": 440.5679, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 7750 + }, + { + "epoch": 0.028908633796617504, + "eval_train_loss": 2.217578172683716, + "eval_train_loss/all": 2.0498533248901367, + "eval_train_loss/end_span": 1.301703929901123, + "eval_train_perplexity/batch": 7.766761779785156, + "eval_train_perplexity/end_span": 3.6755542755126953, + "eval_train_perplexity/fim": 2.1062214374542236, + "eval_train_perplexity/first_seq": 15.545853614807129, + "eval_train_perplexity/last_seq": 9.055680274963379, + "eval_train_perplexity/second_seq": 13.890422821044922, + "eval_train_perplexity/seq": 8.936196327209473, + "eval_train_reconstruction/all": 0.2769434452056885, + "eval_train_reconstruction/end_span": 0.7055041790008545, + "eval_train_reconstruction/fim": 0.14407531917095184, + "eval_train_reconstruction/first_seq": 0.1526583582162857, + "eval_train_reconstruction/last_seq": 0.32418686151504517, + "eval_train_reconstruction/second_seq": 0.19128598272800446, + "eval_train_runtime": 438.8551, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 7750 + }, + { + "epoch": 0.02894593525958088, + "grad_norm": 0.31763145327568054, + "learning_rate": 0.0006, + "loss": 2.2561, + "step": 7760 + }, + { + "epoch": 0.02898323672254426, + "grad_norm": 0.27665629982948303, + "learning_rate": 0.0006, + "loss": 2.1794, + "step": 7770 + }, + { + "epoch": 
0.029020538185507636, + "grad_norm": 0.2890951633453369, + "learning_rate": 0.0006, + "loss": 2.2652, + "step": 7780 + }, + { + "epoch": 0.02905783964847101, + "grad_norm": 0.5025522708892822, + "learning_rate": 0.0006, + "loss": 2.1203, + "step": 7790 + }, + { + "epoch": 0.02909514111143439, + "grad_norm": 0.3712475001811981, + "learning_rate": 0.0006, + "loss": 2.2402, + "step": 7800 + }, + { + "epoch": 0.02909514111143439, + "eval_valid_loss": 2.2241811752319336, + "eval_valid_loss/all": 2.083547830581665, + "eval_valid_loss/end_span": 1.255028486251831, + "eval_valid_perplexity/batch": 8.032917976379395, + "eval_valid_perplexity/end_span": 3.5079383850097656, + "eval_valid_perplexity/fim": 2.497415542602539, + "eval_valid_perplexity/first_seq": 14.57496452331543, + "eval_valid_perplexity/last_seq": 9.293460845947266, + "eval_valid_perplexity/second_seq": 14.00655460357666, + "eval_valid_perplexity/seq": 9.050492286682129, + "eval_valid_reconstruction/all": 0.2855800688266754, + "eval_valid_reconstruction/end_span": 0.7040148377418518, + "eval_valid_reconstruction/fim": 0.1759541928768158, + "eval_valid_reconstruction/first_seq": 0.17463918030261993, + "eval_valid_reconstruction/last_seq": 0.3184466063976288, + "eval_valid_reconstruction/second_seq": 0.18925942480564117, + "eval_valid_runtime": 441.522, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 7800 + }, + { + "epoch": 0.02909514111143439, + "eval_train_loss": 2.221876859664917, + "eval_train_loss/all": 2.053868055343628, + "eval_train_loss/end_span": 1.2400001287460327, + "eval_train_perplexity/batch": 7.798006057739258, + "eval_train_perplexity/end_span": 3.455613851547241, + "eval_train_perplexity/fim": 1.9838844537734985, + "eval_train_perplexity/first_seq": 15.810684204101562, + "eval_train_perplexity/last_seq": 9.57387924194336, + "eval_train_perplexity/second_seq": 14.114925384521484, + "eval_train_perplexity/seq": 8.976716995239258, + 
"eval_train_reconstruction/all": 0.27564242482185364, + "eval_train_reconstruction/end_span": 0.7097896337509155, + "eval_train_reconstruction/fim": 0.13199163973331451, + "eval_train_reconstruction/first_seq": 0.14491750299930573, + "eval_train_reconstruction/last_seq": 0.3057933747768402, + "eval_train_reconstruction/second_seq": 0.18855004012584686, + "eval_train_runtime": 439.3575, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 7800 + }, + { + "epoch": 0.029132442574397767, + "grad_norm": 0.2864561676979065, + "learning_rate": 0.0006, + "loss": 2.1088, + "step": 7810 + }, + { + "epoch": 0.029169744037361146, + "grad_norm": 0.31966325640678406, + "learning_rate": 0.0006, + "loss": 2.2728, + "step": 7820 + }, + { + "epoch": 0.029207045500324522, + "grad_norm": 0.8171157240867615, + "learning_rate": 0.0006, + "loss": 2.3256, + "step": 7830 + }, + { + "epoch": 0.029244346963287902, + "grad_norm": 0.46794700622558594, + "learning_rate": 0.0006, + "loss": 2.1608, + "step": 7840 + }, + { + "epoch": 0.029281648426251278, + "grad_norm": 0.35895687341690063, + "learning_rate": 0.0006, + "loss": 2.2541, + "step": 7850 + }, + { + "epoch": 0.029281648426251278, + "eval_valid_loss": 2.221090316772461, + "eval_valid_loss/all": 2.0807955265045166, + "eval_valid_loss/end_span": 1.2397150993347168, + "eval_valid_perplexity/batch": 8.010839462280273, + "eval_valid_perplexity/end_span": 3.4546291828155518, + "eval_valid_perplexity/fim": 2.362971544265747, + "eval_valid_perplexity/first_seq": 15.186907768249512, + "eval_valid_perplexity/last_seq": 9.453225135803223, + "eval_valid_perplexity/second_seq": 14.136512756347656, + "eval_valid_perplexity/seq": 9.024133682250977, + "eval_valid_reconstruction/all": 0.28604304790496826, + "eval_valid_reconstruction/end_span": 0.7109188437461853, + "eval_valid_reconstruction/fim": 0.16623280942440033, + "eval_valid_reconstruction/first_seq": 0.16020001471042633, + 
"eval_valid_reconstruction/last_seq": 0.31091228127479553, + "eval_valid_reconstruction/second_seq": 0.1872672438621521, + "eval_valid_runtime": 437.6191, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 7850 + }, + { + "epoch": 0.029281648426251278, + "eval_train_loss": 2.2184431552886963, + "eval_train_loss/all": 2.0509390830993652, + "eval_train_loss/end_span": 1.2208212614059448, + "eval_train_perplexity/batch": 7.7751994132995605, + "eval_train_perplexity/end_span": 3.389970541000366, + "eval_train_perplexity/fim": 2.096590518951416, + "eval_train_perplexity/first_seq": 15.526229858398438, + "eval_train_perplexity/last_seq": 9.526107788085938, + "eval_train_perplexity/second_seq": 14.395938873291016, + "eval_train_perplexity/seq": 8.94961166381836, + "eval_train_reconstruction/all": 0.27622267603874207, + "eval_train_reconstruction/end_span": 0.718319296836853, + "eval_train_reconstruction/fim": 0.143172025680542, + "eval_train_reconstruction/first_seq": 0.1506127268075943, + "eval_train_reconstruction/last_seq": 0.3085927963256836, + "eval_train_reconstruction/second_seq": 0.177469402551651, + "eval_train_runtime": 436.6737, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 7850 + }, + { + "epoch": 0.029318949889214654, + "grad_norm": 0.40021517872810364, + "learning_rate": 0.0006, + "loss": 2.2685, + "step": 7860 + }, + { + "epoch": 0.029356251352178033, + "grad_norm": 0.5204357504844666, + "learning_rate": 0.0006, + "loss": 2.1803, + "step": 7870 + }, + { + "epoch": 0.02939355281514141, + "grad_norm": 0.3872905969619751, + "learning_rate": 0.0006, + "loss": 2.2908, + "step": 7880 + }, + { + "epoch": 0.02943085427810479, + "grad_norm": 0.4115806818008423, + "learning_rate": 0.0006, + "loss": 2.0992, + "step": 7890 + }, + { + "epoch": 0.029468155741068165, + "grad_norm": 0.5441638827323914, + "learning_rate": 0.0006, + "loss": 2.1826, + "step": 7900 + }, + { + "epoch": 
0.029468155741068165, + "eval_valid_loss": 2.2260005474090576, + "eval_valid_loss/all": 2.0848817825317383, + "eval_valid_loss/end_span": 1.2469652891159058, + "eval_valid_perplexity/batch": 8.04364013671875, + "eval_valid_perplexity/end_span": 3.479766845703125, + "eval_valid_perplexity/fim": 2.376546621322632, + "eval_valid_perplexity/first_seq": 14.941043853759766, + "eval_valid_perplexity/last_seq": 9.263331413269043, + "eval_valid_perplexity/second_seq": 13.5570650100708, + "eval_valid_perplexity/seq": 9.053938865661621, + "eval_valid_reconstruction/all": 0.2850548326969147, + "eval_valid_reconstruction/end_span": 0.7090751528739929, + "eval_valid_reconstruction/fim": 0.16495181620121002, + "eval_valid_reconstruction/first_seq": 0.1696702539920807, + "eval_valid_reconstruction/last_seq": 0.31656011939048767, + "eval_valid_reconstruction/second_seq": 0.20081135630607605, + "eval_valid_runtime": 435.6348, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 7900 + }, + { + "epoch": 0.029468155741068165, + "eval_train_loss": 2.222243070602417, + "eval_train_loss/all": 2.05371356010437, + "eval_train_loss/end_span": 1.2238829135894775, + "eval_train_perplexity/batch": 7.7968010902404785, + "eval_train_perplexity/end_span": 3.4003653526306152, + "eval_train_perplexity/fim": 2.168593168258667, + "eval_train_perplexity/first_seq": 15.7245512008667, + "eval_train_perplexity/last_seq": 9.381771087646484, + "eval_train_perplexity/second_seq": 14.227682113647461, + "eval_train_perplexity/seq": 8.967095375061035, + "eval_train_reconstruction/all": 0.2756841480731964, + "eval_train_reconstruction/end_span": 0.7196734547615051, + "eval_train_reconstruction/fim": 0.14863160252571106, + "eval_train_reconstruction/first_seq": 0.14995025098323822, + "eval_train_reconstruction/last_seq": 0.314522922039032, + "eval_train_reconstruction/second_seq": 0.18116220831871033, + "eval_train_runtime": 439.3658, + "eval_train_samples_per_second": 
0.437, + "eval_train_steps_per_second": 0.437, + "step": 7900 + }, + { + "epoch": 0.02950545720403154, + "grad_norm": 0.5126205682754517, + "learning_rate": 0.0006, + "loss": 2.0971, + "step": 7910 + }, + { + "epoch": 0.02954275866699492, + "grad_norm": 0.4067913889884949, + "learning_rate": 0.0006, + "loss": 2.1207, + "step": 7920 + }, + { + "epoch": 0.029580060129958296, + "grad_norm": 0.43059730529785156, + "learning_rate": 0.0006, + "loss": 2.3786, + "step": 7930 + }, + { + "epoch": 0.029617361592921675, + "grad_norm": 0.2663590908050537, + "learning_rate": 0.0006, + "loss": 2.2994, + "step": 7940 + }, + { + "epoch": 0.02965466305588505, + "grad_norm": 0.30312415957450867, + "learning_rate": 0.0006, + "loss": 2.4151, + "step": 7950 + }, + { + "epoch": 0.02965466305588505, + "eval_valid_loss": 2.2183237075805664, + "eval_valid_loss/all": 2.078179121017456, + "eval_valid_loss/end_span": 1.27963387966156, + "eval_valid_perplexity/batch": 7.9899067878723145, + "eval_valid_perplexity/end_span": 3.595323085784912, + "eval_valid_perplexity/fim": 2.2676877975463867, + "eval_valid_perplexity/first_seq": 15.134185791015625, + "eval_valid_perplexity/last_seq": 8.90773868560791, + "eval_valid_perplexity/second_seq": 13.660277366638184, + "eval_valid_perplexity/seq": 9.002283096313477, + "eval_valid_reconstruction/all": 0.2869047224521637, + "eval_valid_reconstruction/end_span": 0.7020336985588074, + "eval_valid_reconstruction/fim": 0.15807436406612396, + "eval_valid_reconstruction/first_seq": 0.16210760176181793, + "eval_valid_reconstruction/last_seq": 0.3303763270378113, + "eval_valid_reconstruction/second_seq": 0.19750072062015533, + "eval_valid_runtime": 434.8808, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 7950 + }, + { + "epoch": 0.02965466305588505, + "eval_train_loss": 2.218984365463257, + "eval_train_loss/all": 2.0515661239624023, + "eval_train_loss/end_span": 1.24853515625, + "eval_train_perplexity/batch": 
7.780076026916504, + "eval_train_perplexity/end_span": 3.485233783721924, + "eval_train_perplexity/fim": 2.348776340484619, + "eval_train_perplexity/first_seq": 15.391427040100098, + "eval_train_perplexity/last_seq": 8.928326606750488, + "eval_train_perplexity/second_seq": 14.09713077545166, + "eval_train_perplexity/seq": 8.956035614013672, + "eval_train_reconstruction/all": 0.2762492001056671, + "eval_train_reconstruction/end_span": 0.7119227051734924, + "eval_train_reconstruction/fim": 0.1646970808506012, + "eval_train_reconstruction/first_seq": 0.1533990055322647, + "eval_train_reconstruction/last_seq": 0.32383230328559875, + "eval_train_reconstruction/second_seq": 0.1842638999223709, + "eval_train_runtime": 437.9889, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 7950 + }, + { + "epoch": 0.02969196451884843, + "grad_norm": 0.39584267139434814, + "learning_rate": 0.0006, + "loss": 1.9615, + "step": 7960 + }, + { + "epoch": 0.029729265981811807, + "grad_norm": 0.357891321182251, + "learning_rate": 0.0006, + "loss": 2.1695, + "step": 7970 + }, + { + "epoch": 0.029766567444775183, + "grad_norm": 0.4130466878414154, + "learning_rate": 0.0006, + "loss": 2.2725, + "step": 7980 + }, + { + "epoch": 0.029803868907738562, + "grad_norm": 0.30060169100761414, + "learning_rate": 0.0006, + "loss": 2.2036, + "step": 7990 + }, + { + "epoch": 0.029841170370701938, + "grad_norm": 0.43324920535087585, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 8000 + }, + { + "epoch": 0.029841170370701938, + "eval_valid_loss": 2.2198359966278076, + "eval_valid_loss/all": 2.0802230834960938, + "eval_valid_loss/end_span": 1.3085120916366577, + "eval_valid_perplexity/batch": 8.006255149841309, + "eval_valid_perplexity/end_span": 3.7006633281707764, + "eval_valid_perplexity/fim": 2.3382747173309326, + "eval_valid_perplexity/first_seq": 14.725166320800781, + "eval_valid_perplexity/last_seq": 9.64765453338623, + "eval_valid_perplexity/second_seq": 
14.04266357421875, + "eval_valid_perplexity/seq": 9.026308059692383, + "eval_valid_reconstruction/all": 0.28639349341392517, + "eval_valid_reconstruction/end_span": 0.6950857639312744, + "eval_valid_reconstruction/fim": 0.16406771540641785, + "eval_valid_reconstruction/first_seq": 0.16861192882061005, + "eval_valid_reconstruction/last_seq": 0.3058162331581116, + "eval_valid_reconstruction/second_seq": 0.18993321061134338, + "eval_valid_runtime": 440.5039, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 8000 + }, + { + "epoch": 0.029841170370701938, + "eval_train_loss": 2.213136672973633, + "eval_train_loss/all": 2.0463144779205322, + "eval_train_loss/end_span": 1.2867366075515747, + "eval_train_perplexity/batch": 7.739325046539307, + "eval_train_perplexity/end_span": 3.620950698852539, + "eval_train_perplexity/fim": 1.9280495643615723, + "eval_train_perplexity/first_seq": 15.967706680297852, + "eval_train_perplexity/last_seq": 8.978793144226074, + "eval_train_perplexity/second_seq": 14.067085266113281, + "eval_train_perplexity/seq": 8.909972190856934, + "eval_train_reconstruction/all": 0.27788087725639343, + "eval_train_reconstruction/end_span": 0.7021157145500183, + "eval_train_reconstruction/fim": 0.12743626534938812, + "eval_train_reconstruction/first_seq": 0.14198122918605804, + "eval_train_reconstruction/last_seq": 0.32514268159866333, + "eval_train_reconstruction/second_seq": 0.18711143732070923, + "eval_train_runtime": 438.641, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 8000 + }, + { + "epoch": 0.029878471833665318, + "grad_norm": 0.49459072947502136, + "learning_rate": 0.0006, + "loss": 2.1595, + "step": 8010 + }, + { + "epoch": 0.029915773296628694, + "grad_norm": 0.6214321851730347, + "learning_rate": 0.0006, + "loss": 2.097, + "step": 8020 + }, + { + "epoch": 0.02995307475959207, + "grad_norm": 0.3601341247558594, + "learning_rate": 0.0006, + "loss": 2.0846, + 
"step": 8030 + }, + { + "epoch": 0.02999037622255545, + "grad_norm": 0.8145695328712463, + "learning_rate": 0.0006, + "loss": 2.3083, + "step": 8040 + }, + { + "epoch": 0.030027677685518825, + "grad_norm": 0.3509317934513092, + "learning_rate": 0.0006, + "loss": 2.3652, + "step": 8050 + }, + { + "epoch": 0.030027677685518825, + "eval_valid_loss": 2.2222094535827637, + "eval_valid_loss/all": 2.0816996097564697, + "eval_valid_loss/end_span": 1.2857691049575806, + "eval_valid_perplexity/batch": 8.018084526062012, + "eval_valid_perplexity/end_span": 3.6174490451812744, + "eval_valid_perplexity/fim": 2.232440710067749, + "eval_valid_perplexity/first_seq": 14.744649887084961, + "eval_valid_perplexity/last_seq": 9.343406677246094, + "eval_valid_perplexity/second_seq": 13.931751251220703, + "eval_valid_perplexity/seq": 9.028935432434082, + "eval_valid_reconstruction/all": 0.28614234924316406, + "eval_valid_reconstruction/end_span": 0.7070154547691345, + "eval_valid_reconstruction/fim": 0.15419578552246094, + "eval_valid_reconstruction/first_seq": 0.16993448138237, + "eval_valid_reconstruction/last_seq": 0.31732842326164246, + "eval_valid_reconstruction/second_seq": 0.19052599370479584, + "eval_valid_runtime": 466.9996, + "eval_valid_samples_per_second": 0.411, + "eval_valid_steps_per_second": 0.411, + "step": 8050 + }, + { + "epoch": 0.030027677685518825, + "eval_train_loss": 2.2222254276275635, + "eval_train_loss/all": 2.054147243499756, + "eval_train_loss/end_span": 1.2566391229629517, + "eval_train_perplexity/batch": 7.800183296203613, + "eval_train_perplexity/end_span": 3.5135929584503174, + "eval_train_perplexity/fim": 2.153684377670288, + "eval_train_perplexity/first_seq": 15.532815933227539, + "eval_train_perplexity/last_seq": 9.452659606933594, + "eval_train_perplexity/second_seq": 14.232001304626465, + "eval_train_perplexity/seq": 8.974921226501465, + "eval_train_reconstruction/all": 0.2755422294139862, + "eval_train_reconstruction/end_span": 0.7150220274925232, + 
"eval_train_reconstruction/fim": 0.1469663828611374, + "eval_train_reconstruction/first_seq": 0.15132811665534973, + "eval_train_reconstruction/last_seq": 0.3105738162994385, + "eval_train_reconstruction/second_seq": 0.18191568553447723, + "eval_train_runtime": 468.1882, + "eval_train_samples_per_second": 0.41, + "eval_train_steps_per_second": 0.41, + "step": 8050 + }, + { + "epoch": 0.030064979148482204, + "grad_norm": 0.4080299139022827, + "learning_rate": 0.0006, + "loss": 2.3196, + "step": 8060 + }, + { + "epoch": 0.03010228061144558, + "grad_norm": 0.2844442129135132, + "learning_rate": 0.0006, + "loss": 2.3287, + "step": 8070 + }, + { + "epoch": 0.03013958207440896, + "grad_norm": 0.5495111346244812, + "learning_rate": 0.0006, + "loss": 2.1128, + "step": 8080 + }, + { + "epoch": 0.030176883537372336, + "grad_norm": 0.5183519721031189, + "learning_rate": 0.0006, + "loss": 2.1964, + "step": 8090 + }, + { + "epoch": 0.03021418500033571, + "grad_norm": 0.4428865313529968, + "learning_rate": 0.0006, + "loss": 2.3634, + "step": 8100 + }, + { + "epoch": 0.03021418500033571, + "eval_valid_loss": 2.2192370891571045, + "eval_valid_loss/all": 2.078864097595215, + "eval_valid_loss/end_span": 1.2002654075622559, + "eval_valid_perplexity/batch": 7.995381832122803, + "eval_valid_perplexity/end_span": 3.320998191833496, + "eval_valid_perplexity/fim": 2.6996428966522217, + "eval_valid_perplexity/first_seq": 14.95901870727539, + "eval_valid_perplexity/last_seq": 8.885926246643066, + "eval_valid_perplexity/second_seq": 13.889545440673828, + "eval_valid_perplexity/seq": 9.005951881408691, + "eval_valid_reconstruction/all": 0.2867523729801178, + "eval_valid_reconstruction/end_span": 0.7189337611198425, + "eval_valid_reconstruction/fim": 0.1915690004825592, + "eval_valid_reconstruction/first_seq": 0.16472052037715912, + "eval_valid_reconstruction/last_seq": 0.33088162541389465, + "eval_valid_reconstruction/second_seq": 0.19062499701976776, + "eval_valid_runtime": 468.4607, + 
"eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 8100 + }, + { + "epoch": 0.03021418500033571, + "eval_train_loss": 2.2172467708587646, + "eval_train_loss/all": 2.050017833709717, + "eval_train_loss/end_span": 1.1792619228363037, + "eval_train_perplexity/batch": 7.768039703369141, + "eval_train_perplexity/end_span": 3.2519731521606445, + "eval_train_perplexity/fim": 2.1626572608947754, + "eval_train_perplexity/first_seq": 15.597384452819824, + "eval_train_perplexity/last_seq": 9.562586784362793, + "eval_train_perplexity/second_seq": 14.527716636657715, + "eval_train_perplexity/seq": 8.943334579467773, + "eval_train_reconstruction/all": 0.27665331959724426, + "eval_train_reconstruction/end_span": 0.723924458026886, + "eval_train_reconstruction/fim": 0.1484825164079666, + "eval_train_reconstruction/first_seq": 0.1471756547689438, + "eval_train_reconstruction/last_seq": 0.30829814076423645, + "eval_train_reconstruction/second_seq": 0.17588599026203156, + "eval_train_runtime": 472.494, + "eval_train_samples_per_second": 0.406, + "eval_train_steps_per_second": 0.406, + "step": 8100 + }, + { + "epoch": 0.03025148646329909, + "grad_norm": 0.4946253299713135, + "learning_rate": 0.0006, + "loss": 2.2765, + "step": 8110 + }, + { + "epoch": 0.030288787926262467, + "grad_norm": 0.4752463102340698, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 8120 + }, + { + "epoch": 0.030326089389225847, + "grad_norm": 0.2518109679222107, + "learning_rate": 0.0006, + "loss": 2.418, + "step": 8130 + }, + { + "epoch": 0.030363390852189222, + "grad_norm": 0.3058793544769287, + "learning_rate": 0.0006, + "loss": 2.2982, + "step": 8140 + }, + { + "epoch": 0.030400692315152602, + "grad_norm": 0.36457329988479614, + "learning_rate": 0.0006, + "loss": 2.3442, + "step": 8150 + }, + { + "epoch": 0.030400692315152602, + "eval_valid_loss": 2.2199275493621826, + "eval_valid_loss/all": 2.079389810562134, + "eval_valid_loss/end_span": 1.2905887365341187, + 
"eval_valid_perplexity/batch": 7.99958610534668, + "eval_valid_perplexity/end_span": 3.6349258422851562, + "eval_valid_perplexity/fim": 2.5121209621429443, + "eval_valid_perplexity/first_seq": 14.674823760986328, + "eval_valid_perplexity/last_seq": 9.056486129760742, + "eval_valid_perplexity/second_seq": 13.764719009399414, + "eval_valid_perplexity/seq": 9.009686470031738, + "eval_valid_reconstruction/all": 0.28657010197639465, + "eval_valid_reconstruction/end_span": 0.6926809549331665, + "eval_valid_reconstruction/fim": 0.17715802788734436, + "eval_valid_reconstruction/first_seq": 0.17256669700145721, + "eval_valid_reconstruction/last_seq": 0.3218238651752472, + "eval_valid_reconstruction/second_seq": 0.19348736107349396, + "eval_valid_runtime": 466.8951, + "eval_valid_samples_per_second": 0.411, + "eval_valid_steps_per_second": 0.411, + "step": 8150 + }, + { + "epoch": 0.030400692315152602, + "eval_train_loss": 2.216952085494995, + "eval_train_loss/all": 2.0493428707122803, + "eval_train_loss/end_span": 1.2498794794082642, + "eval_train_perplexity/batch": 7.762798309326172, + "eval_train_perplexity/end_span": 3.489922285079956, + "eval_train_perplexity/fim": 2.115382432937622, + "eval_train_perplexity/first_seq": 15.529120445251465, + "eval_train_perplexity/last_seq": 9.540156364440918, + "eval_train_perplexity/second_seq": 14.144486427307129, + "eval_train_perplexity/seq": 8.929192543029785, + "eval_train_reconstruction/all": 0.2768203020095825, + "eval_train_reconstruction/end_span": 0.7070000767707825, + "eval_train_reconstruction/fim": 0.14416968822479248, + "eval_train_reconstruction/first_seq": 0.15129679441452026, + "eval_train_reconstruction/last_seq": 0.30857715010643005, + "eval_train_reconstruction/second_seq": 0.18772955238819122, + "eval_train_runtime": 466.7447, + "eval_train_samples_per_second": 0.411, + "eval_train_steps_per_second": 0.411, + "step": 8150 + }, + { + "epoch": 0.030437993778115978, + "grad_norm": 0.4344286024570465, + 
"learning_rate": 0.0006, + "loss": 2.3854, + "step": 8160 + }, + { + "epoch": 0.030475295241079354, + "grad_norm": 0.5003383755683899, + "learning_rate": 0.0006, + "loss": 2.3064, + "step": 8170 + }, + { + "epoch": 0.030512596704042733, + "grad_norm": 0.387129008769989, + "learning_rate": 0.0006, + "loss": 2.3131, + "step": 8180 + }, + { + "epoch": 0.03054989816700611, + "grad_norm": 0.3677014410495758, + "learning_rate": 0.0006, + "loss": 2.1769, + "step": 8190 + }, + { + "epoch": 0.03058719962996949, + "grad_norm": 0.37771403789520264, + "learning_rate": 0.0006, + "loss": 2.2539, + "step": 8200 + }, + { + "epoch": 0.03058719962996949, + "eval_valid_loss": 2.2232844829559326, + "eval_valid_loss/all": 2.0827503204345703, + "eval_valid_loss/end_span": 1.450866460800171, + "eval_valid_perplexity/batch": 8.026514053344727, + "eval_valid_perplexity/end_span": 4.266809940338135, + "eval_valid_perplexity/fim": 2.5641703605651855, + "eval_valid_perplexity/first_seq": 15.144786834716797, + "eval_valid_perplexity/last_seq": 8.95588493347168, + "eval_valid_perplexity/second_seq": 13.966145515441895, + "eval_valid_perplexity/seq": 9.04181957244873, + "eval_valid_reconstruction/all": 0.2859438359737396, + "eval_valid_reconstruction/end_span": 0.6693181395530701, + "eval_valid_reconstruction/fim": 0.18176591396331787, + "eval_valid_reconstruction/first_seq": 0.1620791256427765, + "eval_valid_reconstruction/last_seq": 0.3300292193889618, + "eval_valid_reconstruction/second_seq": 0.19151011109352112, + "eval_valid_runtime": 467.4575, + "eval_valid_samples_per_second": 0.411, + "eval_valid_steps_per_second": 0.411, + "step": 8200 + }, + { + "epoch": 0.03058719962996949, + "eval_train_loss": 2.221015691757202, + "eval_train_loss/all": 2.0531466007232666, + "eval_train_loss/end_span": 1.4047691822052002, + "eval_train_perplexity/batch": 7.79238224029541, + "eval_train_perplexity/end_span": 4.074585914611816, + "eval_train_perplexity/fim": 1.9739106893539429, + 
"eval_train_perplexity/first_seq": 15.690401077270508, + "eval_train_perplexity/last_seq": 8.764891624450684, + "eval_train_perplexity/second_seq": 14.190650939941406, + "eval_train_perplexity/seq": 8.968337059020996, + "eval_train_reconstruction/all": 0.2760203182697296, + "eval_train_reconstruction/end_span": 0.682147204875946, + "eval_train_reconstruction/fim": 0.1314891129732132, + "eval_train_reconstruction/first_seq": 0.14653617143630981, + "eval_train_reconstruction/last_seq": 0.3327248990535736, + "eval_train_reconstruction/second_seq": 0.18611212074756622, + "eval_train_runtime": 470.4259, + "eval_train_samples_per_second": 0.408, + "eval_train_steps_per_second": 0.408, + "step": 8200 + }, + { + "epoch": 0.030624501092932865, + "grad_norm": 0.42161276936531067, + "learning_rate": 0.0006, + "loss": 2.1711, + "step": 8210 + }, + { + "epoch": 0.03066180255589624, + "grad_norm": 0.3122326135635376, + "learning_rate": 0.0006, + "loss": 2.3165, + "step": 8220 + }, + { + "epoch": 0.03069910401885962, + "grad_norm": 0.4913749694824219, + "learning_rate": 0.0006, + "loss": 2.28, + "step": 8230 + }, + { + "epoch": 0.030736405481822996, + "grad_norm": 0.31483083963394165, + "learning_rate": 0.0006, + "loss": 2.3058, + "step": 8240 + }, + { + "epoch": 0.030773706944786376, + "grad_norm": 0.35547471046447754, + "learning_rate": 0.0006, + "loss": 2.1252, + "step": 8250 + }, + { + "epoch": 0.030773706944786376, + "eval_valid_loss": 2.2235820293426514, + "eval_valid_loss/all": 2.0832107067108154, + "eval_valid_loss/end_span": 1.2978357076644897, + "eval_valid_perplexity/batch": 8.030210494995117, + "eval_valid_perplexity/end_span": 3.6613638401031494, + "eval_valid_perplexity/fim": 2.4652061462402344, + "eval_valid_perplexity/first_seq": 15.353727340698242, + "eval_valid_perplexity/last_seq": 9.60564136505127, + "eval_valid_perplexity/second_seq": 13.851065635681152, + "eval_valid_perplexity/seq": 9.043061256408691, + "eval_valid_reconstruction/all": 0.28550612926483154, 
+ "eval_valid_reconstruction/end_span": 0.7019895911216736, + "eval_valid_reconstruction/fim": 0.17307886481285095, + "eval_valid_reconstruction/first_seq": 0.15672516822814941, + "eval_valid_reconstruction/last_seq": 0.30664557218551636, + "eval_valid_reconstruction/second_seq": 0.19488370418548584, + "eval_valid_runtime": 468.601, + "eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 8250 + }, + { + "epoch": 0.030773706944786376, + "eval_train_loss": 2.220669984817505, + "eval_train_loss/all": 2.0528523921966553, + "eval_train_loss/end_span": 1.2586288452148438, + "eval_train_perplexity/batch": 7.7900896072387695, + "eval_train_perplexity/end_span": 3.5205907821655273, + "eval_train_perplexity/fim": 2.0417635440826416, + "eval_train_perplexity/first_seq": 15.801078796386719, + "eval_train_perplexity/last_seq": 9.214112281799316, + "eval_train_perplexity/second_seq": 14.558700561523438, + "eval_train_perplexity/seq": 8.961934089660645, + "eval_train_reconstruction/all": 0.27571436762809753, + "eval_train_reconstruction/end_span": 0.7138824462890625, + "eval_train_reconstruction/fim": 0.1376553773880005, + "eval_train_reconstruction/first_seq": 0.14611616730690002, + "eval_train_reconstruction/last_seq": 0.3165859580039978, + "eval_train_reconstruction/second_seq": 0.17705656588077545, + "eval_train_runtime": 464.5355, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 8250 + }, + { + "epoch": 0.03081100840774975, + "grad_norm": 0.26217710971832275, + "learning_rate": 0.0006, + "loss": 2.2957, + "step": 8260 + }, + { + "epoch": 0.03084830987071313, + "grad_norm": 0.37034595012664795, + "learning_rate": 0.0006, + "loss": 2.2247, + "step": 8270 + }, + { + "epoch": 0.030885611333676507, + "grad_norm": 0.5273758172988892, + "learning_rate": 0.0006, + "loss": 2.1913, + "step": 8280 + }, + { + "epoch": 0.030922912796639883, + "grad_norm": 0.5263083577156067, + "learning_rate": 0.0006, + "loss": 
2.3307, + "step": 8290 + }, + { + "epoch": 0.030960214259603262, + "grad_norm": 0.3890683650970459, + "learning_rate": 0.0006, + "loss": 2.3585, + "step": 8300 + }, + { + "epoch": 0.030960214259603262, + "eval_valid_loss": 2.223881959915161, + "eval_valid_loss/all": 2.083380937576294, + "eval_valid_loss/end_span": 1.3477871417999268, + "eval_valid_perplexity/batch": 8.031577110290527, + "eval_valid_perplexity/end_span": 3.8488991260528564, + "eval_valid_perplexity/fim": 2.4755070209503174, + "eval_valid_perplexity/first_seq": 14.543866157531738, + "eval_valid_perplexity/last_seq": 9.122130393981934, + "eval_valid_perplexity/second_seq": 13.903132438659668, + "eval_valid_perplexity/seq": 9.044459342956543, + "eval_valid_reconstruction/all": 0.2860049903392792, + "eval_valid_reconstruction/end_span": 0.6969264149665833, + "eval_valid_reconstruction/fim": 0.17442117631435394, + "eval_valid_reconstruction/first_seq": 0.17839638888835907, + "eval_valid_reconstruction/last_seq": 0.3239574432373047, + "eval_valid_reconstruction/second_seq": 0.19161269068717957, + "eval_valid_runtime": 462.5181, + "eval_valid_samples_per_second": 0.415, + "eval_valid_steps_per_second": 0.415, + "step": 8300 + }, + { + "epoch": 0.030960214259603262, + "eval_train_loss": 2.2209677696228027, + "eval_train_loss/all": 2.0530757904052734, + "eval_train_loss/end_span": 1.3191190958023071, + "eval_train_perplexity/batch": 7.791830539703369, + "eval_train_perplexity/end_span": 3.7401251792907715, + "eval_train_perplexity/fim": 2.030287265777588, + "eval_train_perplexity/first_seq": 15.858528137207031, + "eval_train_perplexity/last_seq": 9.184927940368652, + "eval_train_perplexity/second_seq": 14.459220886230469, + "eval_train_perplexity/seq": 8.967452049255371, + "eval_train_reconstruction/all": 0.276398628950119, + "eval_train_reconstruction/end_span": 0.7028287053108215, + "eval_train_reconstruction/fim": 0.1370125263929367, + "eval_train_reconstruction/first_seq": 0.14282383024692535, + 
"eval_train_reconstruction/last_seq": 0.31556791067123413, + "eval_train_reconstruction/second_seq": 0.1754496544599533, + "eval_train_runtime": 465.668, + "eval_train_samples_per_second": 0.412, + "eval_train_steps_per_second": 0.412, + "step": 8300 + }, + { + "epoch": 0.030997515722566638, + "grad_norm": 0.3210814595222473, + "learning_rate": 0.0006, + "loss": 2.34, + "step": 8310 + }, + { + "epoch": 0.031034817185530018, + "grad_norm": 0.5099297761917114, + "learning_rate": 0.0006, + "loss": 2.291, + "step": 8320 + }, + { + "epoch": 0.031072118648493394, + "grad_norm": 0.4379767179489136, + "learning_rate": 0.0006, + "loss": 2.2682, + "step": 8330 + }, + { + "epoch": 0.03110942011145677, + "grad_norm": 0.32827550172805786, + "learning_rate": 0.0006, + "loss": 2.252, + "step": 8340 + }, + { + "epoch": 0.03114672157442015, + "grad_norm": 0.49803268909454346, + "learning_rate": 0.0006, + "loss": 2.3608, + "step": 8350 + }, + { + "epoch": 0.03114672157442015, + "eval_valid_loss": 2.226435899734497, + "eval_valid_loss/all": 2.0857656002044678, + "eval_valid_loss/end_span": 1.2448418140411377, + "eval_valid_perplexity/batch": 8.050752639770508, + "eval_valid_perplexity/end_span": 3.4723854064941406, + "eval_valid_perplexity/fim": 2.4430692195892334, + "eval_valid_perplexity/first_seq": 15.050216674804688, + "eval_valid_perplexity/last_seq": 9.488208770751953, + "eval_valid_perplexity/second_seq": 13.625077247619629, + "eval_valid_perplexity/seq": 9.06729507446289, + "eval_valid_reconstruction/all": 0.28486168384552, + "eval_valid_reconstruction/end_span": 0.7132643461227417, + "eval_valid_reconstruction/fim": 0.17015066742897034, + "eval_valid_reconstruction/first_seq": 0.16502167284488678, + "eval_valid_reconstruction/last_seq": 0.3153192698955536, + "eval_valid_reconstruction/second_seq": 0.19750207662582397, + "eval_valid_runtime": 469.7061, + "eval_valid_samples_per_second": 0.409, + "eval_valid_steps_per_second": 0.409, + "step": 8350 + }, + { + "epoch": 
0.03114672157442015, + "eval_train_loss": 2.222024440765381, + "eval_train_loss/all": 2.053440809249878, + "eval_train_loss/end_span": 1.2045625448226929, + "eval_train_perplexity/batch": 7.794674873352051, + "eval_train_perplexity/end_span": 3.3352997303009033, + "eval_train_perplexity/fim": 2.151560068130493, + "eval_train_perplexity/first_seq": 15.729931831359863, + "eval_train_perplexity/last_seq": 9.379487991333008, + "eval_train_perplexity/second_seq": 14.406755447387695, + "eval_train_perplexity/seq": 8.96304702758789, + "eval_train_reconstruction/all": 0.2758825123310089, + "eval_train_reconstruction/end_span": 0.7248477339744568, + "eval_train_reconstruction/fim": 0.14791636168956757, + "eval_train_reconstruction/first_seq": 0.14782917499542236, + "eval_train_reconstruction/last_seq": 0.3131287693977356, + "eval_train_reconstruction/second_seq": 0.1827831119298935, + "eval_train_runtime": 469.83, + "eval_train_samples_per_second": 0.409, + "eval_train_steps_per_second": 0.409, + "step": 8350 + }, + { + "epoch": 0.031184023037383525, + "grad_norm": 0.33905482292175293, + "learning_rate": 0.0006, + "loss": 2.2574, + "step": 8360 + }, + { + "epoch": 0.031221324500346904, + "grad_norm": 0.37466853857040405, + "learning_rate": 0.0006, + "loss": 2.2009, + "step": 8370 + }, + { + "epoch": 0.031258625963310284, + "grad_norm": 0.45091161131858826, + "learning_rate": 0.0006, + "loss": 2.267, + "step": 8380 + }, + { + "epoch": 0.031295927426273656, + "grad_norm": 0.30719617009162903, + "learning_rate": 0.0006, + "loss": 2.3446, + "step": 8390 + }, + { + "epoch": 0.031333228889237036, + "grad_norm": 0.4254585802555084, + "learning_rate": 0.0006, + "loss": 2.0522, + "step": 8400 + }, + { + "epoch": 0.031333228889237036, + "eval_valid_loss": 2.2204737663269043, + "eval_valid_loss/all": 2.080605983734131, + "eval_valid_loss/end_span": 1.2682467699050903, + "eval_valid_perplexity/batch": 8.009321212768555, + "eval_valid_perplexity/end_span": 3.554615020751953, + 
"eval_valid_perplexity/fim": 2.270019054412842, + "eval_valid_perplexity/first_seq": 14.705154418945312, + "eval_valid_perplexity/last_seq": 9.667094230651855, + "eval_valid_perplexity/second_seq": 13.958033561706543, + "eval_valid_perplexity/seq": 9.025825500488281, + "eval_valid_reconstruction/all": 0.2862657308578491, + "eval_valid_reconstruction/end_span": 0.7091875672340393, + "eval_valid_reconstruction/fim": 0.1586640328168869, + "eval_valid_reconstruction/first_seq": 0.17142009735107422, + "eval_valid_reconstruction/last_seq": 0.3030620217323303, + "eval_valid_reconstruction/second_seq": 0.19268520176410675, + "eval_valid_runtime": 468.1161, + "eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 8400 + }, + { + "epoch": 0.031333228889237036, + "eval_train_loss": 2.2175652980804443, + "eval_train_loss/all": 2.0501456260681152, + "eval_train_loss/end_span": 1.2477226257324219, + "eval_train_perplexity/batch": 7.7690324783325195, + "eval_train_perplexity/end_span": 3.48240327835083, + "eval_train_perplexity/fim": 2.21486496925354, + "eval_train_perplexity/first_seq": 15.729347229003906, + "eval_train_perplexity/last_seq": 9.110230445861816, + "eval_train_perplexity/second_seq": 14.599863052368164, + "eval_train_perplexity/seq": 8.939513206481934, + "eval_train_reconstruction/all": 0.27665647864341736, + "eval_train_reconstruction/end_span": 0.7145592570304871, + "eval_train_reconstruction/fim": 0.15326009690761566, + "eval_train_reconstruction/first_seq": 0.14849722385406494, + "eval_train_reconstruction/last_seq": 0.32290545105934143, + "eval_train_reconstruction/second_seq": 0.17360951006412506, + "eval_train_runtime": 467.64, + "eval_train_samples_per_second": 0.411, + "eval_train_steps_per_second": 0.411, + "step": 8400 + }, + { + "epoch": 0.031370530352200415, + "grad_norm": 0.47656935453414917, + "learning_rate": 0.0006, + "loss": 2.2323, + "step": 8410 + }, + { + "epoch": 0.03140783181516379, + "grad_norm": 
0.5478979349136353, + "learning_rate": 0.0006, + "loss": 2.1661, + "step": 8420 + }, + { + "epoch": 0.03144513327812717, + "grad_norm": 0.3226728141307831, + "learning_rate": 0.0006, + "loss": 2.3671, + "step": 8430 + }, + { + "epoch": 0.03148243474109055, + "grad_norm": 0.3202236294746399, + "learning_rate": 0.0006, + "loss": 2.2003, + "step": 8440 + }, + { + "epoch": 0.031519736204053926, + "grad_norm": 0.4776422381401062, + "learning_rate": 0.0006, + "loss": 2.3123, + "step": 8450 + }, + { + "epoch": 0.031519736204053926, + "eval_valid_loss": 2.217763662338257, + "eval_valid_loss/all": 2.0779945850372314, + "eval_valid_loss/end_span": 1.3177316188812256, + "eval_valid_perplexity/batch": 7.988432884216309, + "eval_valid_perplexity/end_span": 3.7349395751953125, + "eval_valid_perplexity/fim": 2.4129014015197754, + "eval_valid_perplexity/first_seq": 14.372272491455078, + "eval_valid_perplexity/last_seq": 9.40708065032959, + "eval_valid_perplexity/second_seq": 13.694177627563477, + "eval_valid_perplexity/seq": 9.00173282623291, + "eval_valid_reconstruction/all": 0.28731951117515564, + "eval_valid_reconstruction/end_span": 0.6936770081520081, + "eval_valid_reconstruction/fim": 0.17165862023830414, + "eval_valid_reconstruction/first_seq": 0.17863066494464874, + "eval_valid_reconstruction/last_seq": 0.31234318017959595, + "eval_valid_reconstruction/second_seq": 0.19774140417575836, + "eval_valid_runtime": 469.3879, + "eval_valid_samples_per_second": 0.409, + "eval_valid_steps_per_second": 0.409, + "step": 8450 + }, + { + "epoch": 0.031519736204053926, + "eval_train_loss": 2.215461492538452, + "eval_train_loss/all": 2.0483248233795166, + "eval_train_loss/end_span": 1.2991069555282593, + "eval_train_perplexity/batch": 7.754899501800537, + "eval_train_perplexity/end_span": 3.6660213470458984, + "eval_train_perplexity/fim": 2.1185460090637207, + "eval_train_perplexity/first_seq": 15.710498809814453, + "eval_train_perplexity/last_seq": 9.226139068603516, + 
"eval_train_perplexity/second_seq": 14.406631469726562, + "eval_train_perplexity/seq": 8.9240140914917, + "eval_train_reconstruction/all": 0.2771148085594177, + "eval_train_reconstruction/end_span": 0.6984090805053711, + "eval_train_reconstruction/fim": 0.14538748562335968, + "eval_train_reconstruction/first_seq": 0.14879082143306732, + "eval_train_reconstruction/last_seq": 0.31440553069114685, + "eval_train_reconstruction/second_seq": 0.18085889518260956, + "eval_train_runtime": 466.7112, + "eval_train_samples_per_second": 0.411, + "eval_train_steps_per_second": 0.411, + "step": 8450 + }, + { + "epoch": 0.0315570376670173, + "grad_norm": 0.41233283281326294, + "learning_rate": 0.0006, + "loss": 2.2751, + "step": 8460 + }, + { + "epoch": 0.03159433912998068, + "grad_norm": 0.7606602907180786, + "learning_rate": 0.0006, + "loss": 2.0374, + "step": 8470 + }, + { + "epoch": 0.03163164059294406, + "grad_norm": 0.4562098979949951, + "learning_rate": 0.0006, + "loss": 2.2496, + "step": 8480 + }, + { + "epoch": 0.03166894205590743, + "grad_norm": 0.36072325706481934, + "learning_rate": 0.0006, + "loss": 2.2413, + "step": 8490 + }, + { + "epoch": 0.03170624351887081, + "grad_norm": 1.4077857732772827, + "learning_rate": 0.0006, + "loss": 2.1496, + "step": 8500 + }, + { + "epoch": 0.03170624351887081, + "eval_valid_loss": 2.2210886478424072, + "eval_valid_loss/all": 2.0805952548980713, + "eval_valid_loss/end_span": 1.259790062904358, + "eval_valid_perplexity/batch": 8.009235382080078, + "eval_valid_perplexity/end_span": 3.524681568145752, + "eval_valid_perplexity/fim": 2.2767484188079834, + "eval_valid_perplexity/first_seq": 14.723593711853027, + "eval_valid_perplexity/last_seq": 9.335211753845215, + "eval_valid_perplexity/second_seq": 13.425060272216797, + "eval_valid_perplexity/seq": 9.02608585357666, + "eval_valid_reconstruction/all": 0.28651249408721924, + "eval_valid_reconstruction/end_span": 0.7112993001937866, + "eval_valid_reconstruction/fim": 0.1583636850118637, + 
"eval_valid_reconstruction/first_seq": 0.17088203132152557, + "eval_valid_reconstruction/last_seq": 0.3153434097766876, + "eval_valid_reconstruction/second_seq": 0.20835444331169128, + "eval_valid_runtime": 464.9894, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 8500 + }, + { + "epoch": 0.03170624351887081, + "eval_train_loss": 2.2172467708587646, + "eval_train_loss/all": 2.0497617721557617, + "eval_train_loss/end_span": 1.2354601621627808, + "eval_train_perplexity/batch": 7.766050815582275, + "eval_train_perplexity/end_span": 3.4399611949920654, + "eval_train_perplexity/fim": 2.3529751300811768, + "eval_train_perplexity/first_seq": 15.791625022888184, + "eval_train_perplexity/last_seq": 9.523025512695312, + "eval_train_perplexity/second_seq": 14.044465065002441, + "eval_train_perplexity/seq": 8.937295913696289, + "eval_train_reconstruction/all": 0.2768336236476898, + "eval_train_reconstruction/end_span": 0.7191101312637329, + "eval_train_reconstruction/fim": 0.16512928903102875, + "eval_train_reconstruction/first_seq": 0.14631634950637817, + "eval_train_reconstruction/last_seq": 0.3080640733242035, + "eval_train_reconstruction/second_seq": 0.18832047283649445, + "eval_train_runtime": 470.0709, + "eval_train_samples_per_second": 0.408, + "eval_train_steps_per_second": 0.408, + "step": 8500 + }, + { + "epoch": 0.03174354498183419, + "grad_norm": 0.3755190968513489, + "learning_rate": 0.0006, + "loss": 2.2819, + "step": 8510 + }, + { + "epoch": 0.03178084644479757, + "grad_norm": 0.4374644458293915, + "learning_rate": 0.0006, + "loss": 2.2796, + "step": 8520 + }, + { + "epoch": 0.03181814790776094, + "grad_norm": 0.3273330330848694, + "learning_rate": 0.0006, + "loss": 2.2856, + "step": 8530 + }, + { + "epoch": 0.03185544937072432, + "grad_norm": 0.41302478313446045, + "learning_rate": 0.0006, + "loss": 2.2497, + "step": 8540 + }, + { + "epoch": 0.0318927508336877, + "grad_norm": 0.5523306727409363, + "learning_rate": 
0.0006, + "loss": 2.1086, + "step": 8550 + }, + { + "epoch": 0.0318927508336877, + "eval_valid_loss": 2.2235517501831055, + "eval_valid_loss/all": 2.083268880844116, + "eval_valid_loss/end_span": 1.2474108934402466, + "eval_valid_perplexity/batch": 8.030677795410156, + "eval_valid_perplexity/end_span": 3.4813177585601807, + "eval_valid_perplexity/fim": 2.9106712341308594, + "eval_valid_perplexity/first_seq": 14.96973705291748, + "eval_valid_perplexity/last_seq": 9.342517852783203, + "eval_valid_perplexity/second_seq": 14.249037742614746, + "eval_valid_perplexity/seq": 9.049901008605957, + "eval_valid_reconstruction/all": 0.28570353984832764, + "eval_valid_reconstruction/end_span": 0.7096364498138428, + "eval_valid_reconstruction/fim": 0.2054285705089569, + "eval_valid_reconstruction/first_seq": 0.16768480837345123, + "eval_valid_reconstruction/last_seq": 0.31588926911354065, + "eval_valid_reconstruction/second_seq": 0.1866312026977539, + "eval_valid_runtime": 469.3828, + "eval_valid_samples_per_second": 0.409, + "eval_valid_steps_per_second": 0.409, + "step": 8550 + }, + { + "epoch": 0.0318927508336877, + "eval_train_loss": 2.220475673675537, + "eval_train_loss/all": 2.0529119968414307, + "eval_train_loss/end_span": 1.2161504030227661, + "eval_train_perplexity/batch": 7.790554046630859, + "eval_train_perplexity/end_span": 3.374173402786255, + "eval_train_perplexity/fim": 2.1680352687835693, + "eval_train_perplexity/first_seq": 15.7829008102417, + "eval_train_perplexity/last_seq": 9.617632865905762, + "eval_train_perplexity/second_seq": 14.243026733398438, + "eval_train_perplexity/seq": 8.967455863952637, + "eval_train_reconstruction/all": 0.27575013041496277, + "eval_train_reconstruction/end_span": 0.7195966243743896, + "eval_train_reconstruction/fim": 0.1484094262123108, + "eval_train_reconstruction/first_seq": 0.14618103206157684, + "eval_train_reconstruction/last_seq": 0.3040156066417694, + "eval_train_reconstruction/second_seq": 0.1859830617904663, + 
"eval_train_runtime": 468.3207, + "eval_train_samples_per_second": 0.41, + "eval_train_steps_per_second": 0.41, + "step": 8550 + }, + { + "epoch": 0.03193005229665107, + "grad_norm": 0.44907575845718384, + "learning_rate": 0.0006, + "loss": 2.2985, + "step": 8560 + }, + { + "epoch": 0.03196735375961445, + "grad_norm": 0.43024879693984985, + "learning_rate": 0.0006, + "loss": 2.2165, + "step": 8570 + }, + { + "epoch": 0.03200465522257783, + "grad_norm": 0.5523467659950256, + "learning_rate": 0.0006, + "loss": 2.3121, + "step": 8580 + }, + { + "epoch": 0.032041956685541204, + "grad_norm": 0.4683813154697418, + "learning_rate": 0.0006, + "loss": 2.2254, + "step": 8590 + }, + { + "epoch": 0.03207925814850458, + "grad_norm": 0.2852557301521301, + "learning_rate": 0.0006, + "loss": 2.4128, + "step": 8600 + }, + { + "epoch": 0.03207925814850458, + "eval_valid_loss": 2.2231123447418213, + "eval_valid_loss/all": 2.0825774669647217, + "eval_valid_loss/end_span": 1.347214698791504, + "eval_valid_perplexity/batch": 8.025126457214355, + "eval_valid_perplexity/end_span": 3.846696376800537, + "eval_valid_perplexity/fim": 2.6996517181396484, + "eval_valid_perplexity/first_seq": 14.915349960327148, + "eval_valid_perplexity/last_seq": 9.097068786621094, + "eval_valid_perplexity/second_seq": 13.835424423217773, + "eval_valid_perplexity/seq": 9.039337158203125, + "eval_valid_reconstruction/all": 0.28590378165245056, + "eval_valid_reconstruction/end_span": 0.6846988797187805, + "eval_valid_reconstruction/fim": 0.19143171608448029, + "eval_valid_reconstruction/first_seq": 0.16699008643627167, + "eval_valid_reconstruction/last_seq": 0.3246649503707886, + "eval_valid_reconstruction/second_seq": 0.19522501528263092, + "eval_valid_runtime": 466.1173, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 8600 + }, + { + "epoch": 0.03207925814850458, + "eval_train_loss": 2.219613790512085, + "eval_train_loss/all": 2.05125093460083, + 
"eval_train_loss/end_span": 1.3008010387420654, + "eval_train_perplexity/batch": 7.777624130249023, + "eval_train_perplexity/end_span": 3.6722371578216553, + "eval_train_perplexity/fim": 2.0106048583984375, + "eval_train_perplexity/first_seq": 15.411335945129395, + "eval_train_perplexity/last_seq": 9.080795288085938, + "eval_train_perplexity/second_seq": 14.026010513305664, + "eval_train_perplexity/seq": 8.945547103881836, + "eval_train_reconstruction/all": 0.2765738070011139, + "eval_train_reconstruction/end_span": 0.694435179233551, + "eval_train_reconstruction/fim": 0.13486678898334503, + "eval_train_reconstruction/first_seq": 0.15400339663028717, + "eval_train_reconstruction/last_seq": 0.3248569965362549, + "eval_train_reconstruction/second_seq": 0.18959519267082214, + "eval_train_runtime": 466.594, + "eval_train_samples_per_second": 0.411, + "eval_train_steps_per_second": 0.411, + "step": 8600 + }, + { + "epoch": 0.03211655961146796, + "grad_norm": 0.40717455744743347, + "learning_rate": 0.0006, + "loss": 2.2579, + "step": 8610 + }, + { + "epoch": 0.03215386107443134, + "grad_norm": 0.5705510377883911, + "learning_rate": 0.0006, + "loss": 2.1716, + "step": 8620 + }, + { + "epoch": 0.032191162537394714, + "grad_norm": 0.602489709854126, + "learning_rate": 0.0006, + "loss": 2.272, + "step": 8630 + }, + { + "epoch": 0.032228464000358094, + "grad_norm": 0.6269575357437134, + "learning_rate": 0.0006, + "loss": 2.0195, + "step": 8640 + }, + { + "epoch": 0.03226576546332147, + "grad_norm": 0.37681353092193604, + "learning_rate": 0.0006, + "loss": 2.3785, + "step": 8650 + }, + { + "epoch": 0.03226576546332147, + "eval_valid_loss": 2.2287628650665283, + "eval_valid_loss/all": 2.0876870155334473, + "eval_valid_loss/end_span": 1.2876908779144287, + "eval_valid_perplexity/batch": 8.06623649597168, + "eval_valid_perplexity/end_span": 3.6244077682495117, + "eval_valid_perplexity/fim": 2.1246767044067383, + "eval_valid_perplexity/first_seq": 14.473182678222656, + 
"eval_valid_perplexity/last_seq": 8.876007080078125, + "eval_valid_perplexity/second_seq": 14.103780746459961, + "eval_valid_perplexity/seq": 9.082903861999512, + "eval_valid_reconstruction/all": 0.2842976152896881, + "eval_valid_reconstruction/end_span": 0.6986424922943115, + "eval_valid_reconstruction/fim": 0.14346380531787872, + "eval_valid_reconstruction/first_seq": 0.1790168136358261, + "eval_valid_reconstruction/last_seq": 0.33437564969062805, + "eval_valid_reconstruction/second_seq": 0.18974319100379944, + "eval_valid_runtime": 467.8994, + "eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 8650 + }, + { + "epoch": 0.03226576546332147, + "eval_train_loss": 2.226836919784546, + "eval_train_loss/all": 2.0583608150482178, + "eval_train_loss/end_span": 1.2534879446029663, + "eval_train_perplexity/batch": 7.8331193923950195, + "eval_train_perplexity/end_span": 3.5025384426116943, + "eval_train_perplexity/fim": 1.9653364419937134, + "eval_train_perplexity/first_seq": 15.641024589538574, + "eval_train_perplexity/last_seq": 9.065827369689941, + "eval_train_perplexity/second_seq": 14.325657844543457, + "eval_train_perplexity/seq": 9.016491889953613, + "eval_train_reconstruction/all": 0.2742689251899719, + "eval_train_reconstruction/end_span": 0.7083103060722351, + "eval_train_reconstruction/fim": 0.1298341006040573, + "eval_train_reconstruction/first_seq": 0.14911724627017975, + "eval_train_reconstruction/last_seq": 0.32472917437553406, + "eval_train_reconstruction/second_seq": 0.1830068677663803, + "eval_train_runtime": 467.4404, + "eval_train_samples_per_second": 0.411, + "eval_train_steps_per_second": 0.411, + "step": 8650 + }, + { + "epoch": 0.032303066926284846, + "grad_norm": 0.55269855260849, + "learning_rate": 0.0006, + "loss": 2.2899, + "step": 8660 + }, + { + "epoch": 0.032340368389248225, + "grad_norm": 0.45333564281463623, + "learning_rate": 0.0006, + "loss": 2.184, + "step": 8670 + }, + { + "epoch": 
0.032377669852211605, + "grad_norm": 0.48598456382751465, + "learning_rate": 0.0006, + "loss": 2.3476, + "step": 8680 + }, + { + "epoch": 0.032414971315174984, + "grad_norm": 0.5327557921409607, + "learning_rate": 0.0006, + "loss": 2.0989, + "step": 8690 + }, + { + "epoch": 0.03245227277813836, + "grad_norm": 0.37348315119743347, + "learning_rate": 0.0006, + "loss": 1.9873, + "step": 8700 + }, + { + "epoch": 0.03245227277813836, + "eval_valid_loss": 2.22004771232605, + "eval_valid_loss/all": 2.0804364681243896, + "eval_valid_loss/end_span": 1.32597815990448, + "eval_valid_perplexity/batch": 8.007963180541992, + "eval_valid_perplexity/end_span": 3.765867233276367, + "eval_valid_perplexity/fim": 2.5299172401428223, + "eval_valid_perplexity/first_seq": 14.548725128173828, + "eval_valid_perplexity/last_seq": 9.168951034545898, + "eval_valid_perplexity/second_seq": 14.288496971130371, + "eval_valid_perplexity/seq": 9.034161567687988, + "eval_valid_reconstruction/all": 0.28637877106666565, + "eval_valid_reconstruction/end_span": 0.6835506558418274, + "eval_valid_reconstruction/fim": 0.17941653728485107, + "eval_valid_reconstruction/first_seq": 0.17469152808189392, + "eval_valid_reconstruction/last_seq": 0.3218110203742981, + "eval_valid_reconstruction/second_seq": 0.18396006524562836, + "eval_valid_runtime": 465.0823, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 8700 + }, + { + "epoch": 0.03245227277813836, + "eval_train_loss": 2.2180583477020264, + "eval_train_loss/all": 2.0514512062072754, + "eval_train_loss/end_span": 1.295496940612793, + "eval_train_perplexity/batch": 7.779181957244873, + "eval_train_perplexity/end_span": 3.65281081199646, + "eval_train_perplexity/fim": 2.1341192722320557, + "eval_train_perplexity/first_seq": 15.702601432800293, + "eval_train_perplexity/last_seq": 9.193018913269043, + "eval_train_perplexity/second_seq": 14.461986541748047, + "eval_train_perplexity/seq": 8.966890335083008, + 
"eval_train_reconstruction/all": 0.2760319709777832, + "eval_train_reconstruction/end_span": 0.6928083300590515, + "eval_train_reconstruction/fim": 0.1456572264432907, + "eval_train_reconstruction/first_seq": 0.1468769609928131, + "eval_train_reconstruction/last_seq": 0.3162009119987488, + "eval_train_reconstruction/second_seq": 0.18087275326251984, + "eval_train_runtime": 463.5425, + "eval_train_samples_per_second": 0.414, + "eval_train_steps_per_second": 0.414, + "step": 8700 + }, + { + "epoch": 0.032489574241101736, + "grad_norm": 0.4094570279121399, + "learning_rate": 0.0006, + "loss": 2.226, + "step": 8710 + }, + { + "epoch": 0.032526875704065115, + "grad_norm": 0.3607815206050873, + "learning_rate": 0.0006, + "loss": 2.1412, + "step": 8720 + }, + { + "epoch": 0.03256417716702849, + "grad_norm": 0.40218067169189453, + "learning_rate": 0.0006, + "loss": 2.4132, + "step": 8730 + }, + { + "epoch": 0.03260147862999187, + "grad_norm": 0.28553974628448486, + "learning_rate": 0.0006, + "loss": 2.2726, + "step": 8740 + }, + { + "epoch": 0.03263878009295525, + "grad_norm": 0.4071624279022217, + "learning_rate": 0.0006, + "loss": 1.9778, + "step": 8750 + }, + { + "epoch": 0.03263878009295525, + "eval_valid_loss": 2.2207462787628174, + "eval_valid_loss/all": 2.0808026790618896, + "eval_valid_loss/end_span": 1.2793611288070679, + "eval_valid_perplexity/batch": 8.010896682739258, + "eval_valid_perplexity/end_span": 3.5943427085876465, + "eval_valid_perplexity/fim": 2.497926950454712, + "eval_valid_perplexity/first_seq": 14.922453880310059, + "eval_valid_perplexity/last_seq": 9.22656536102295, + "eval_valid_perplexity/second_seq": 14.056896209716797, + "eval_valid_perplexity/seq": 9.03387451171875, + "eval_valid_reconstruction/all": 0.2861577570438385, + "eval_valid_reconstruction/end_span": 0.7057814002037048, + "eval_valid_reconstruction/fim": 0.17700614035129547, + "eval_valid_reconstruction/first_seq": 0.1674332171678543, + "eval_valid_reconstruction/last_seq": 
0.317121297121048, + "eval_valid_reconstruction/second_seq": 0.18700356781482697, + "eval_valid_runtime": 466.8275, + "eval_valid_samples_per_second": 0.411, + "eval_valid_steps_per_second": 0.411, + "step": 8750 + }, + { + "epoch": 0.03263878009295525, + "eval_train_loss": 2.216513156890869, + "eval_train_loss/all": 2.049680233001709, + "eval_train_loss/end_span": 1.248522400856018, + "eval_train_perplexity/batch": 7.765417575836182, + "eval_train_perplexity/end_span": 3.485189437866211, + "eval_train_perplexity/fim": 2.0728495121002197, + "eval_train_perplexity/first_seq": 15.54817008972168, + "eval_train_perplexity/last_seq": 9.239057540893555, + "eval_train_perplexity/second_seq": 14.374095916748047, + "eval_train_perplexity/seq": 8.944513320922852, + "eval_train_reconstruction/all": 0.2766624391078949, + "eval_train_reconstruction/end_span": 0.7141925096511841, + "eval_train_reconstruction/fim": 0.14181476831436157, + "eval_train_reconstruction/first_seq": 0.15237648785114288, + "eval_train_reconstruction/last_seq": 0.31713008880615234, + "eval_train_reconstruction/second_seq": 0.17818209528923035, + "eval_train_runtime": 465.7626, + "eval_train_samples_per_second": 0.412, + "eval_train_steps_per_second": 0.412, + "step": 8750 + }, + { + "epoch": 0.032676081555918626, + "grad_norm": 0.40939974784851074, + "learning_rate": 0.0006, + "loss": 2.2755, + "step": 8760 + }, + { + "epoch": 0.032713383018882, + "grad_norm": 0.2819778323173523, + "learning_rate": 0.0006, + "loss": 2.1661, + "step": 8770 + }, + { + "epoch": 0.03275068448184538, + "grad_norm": 0.31199920177459717, + "learning_rate": 0.0006, + "loss": 2.1923, + "step": 8780 + }, + { + "epoch": 0.03278798594480876, + "grad_norm": 0.4417700469493866, + "learning_rate": 0.0006, + "loss": 2.3798, + "step": 8790 + }, + { + "epoch": 0.03282528740777213, + "grad_norm": 0.2550967037677765, + "learning_rate": 0.0006, + "loss": 2.2651, + "step": 8800 + }, + { + "epoch": 0.03282528740777213, + "eval_valid_loss": 
2.2200448513031006, + "eval_valid_loss/all": 2.0802812576293945, + "eval_valid_loss/end_span": 1.327476143836975, + "eval_valid_perplexity/batch": 8.006720542907715, + "eval_valid_perplexity/end_span": 3.771512508392334, + "eval_valid_perplexity/fim": 2.3924100399017334, + "eval_valid_perplexity/first_seq": 14.776480674743652, + "eval_valid_perplexity/last_seq": 9.123113632202148, + "eval_valid_perplexity/second_seq": 13.465152740478516, + "eval_valid_perplexity/seq": 9.026049613952637, + "eval_valid_reconstruction/all": 0.2862643897533417, + "eval_valid_reconstruction/end_span": 0.6853287220001221, + "eval_valid_reconstruction/fim": 0.1683828979730606, + "eval_valid_reconstruction/first_seq": 0.16829851269721985, + "eval_valid_reconstruction/last_seq": 0.3231923580169678, + "eval_valid_reconstruction/second_seq": 0.2043873369693756, + "eval_valid_runtime": 464.8784, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 8800 + }, + { + "epoch": 0.03282528740777213, + "eval_train_loss": 2.2146387100219727, + "eval_train_loss/all": 2.0476202964782715, + "eval_train_loss/end_span": 1.290834665298462, + "eval_train_perplexity/batch": 7.7494378089904785, + "eval_train_perplexity/end_span": 3.635819911956787, + "eval_train_perplexity/fim": 2.3275084495544434, + "eval_train_perplexity/first_seq": 15.367636680603027, + "eval_train_perplexity/last_seq": 9.083285331726074, + "eval_train_perplexity/second_seq": 14.498601913452148, + "eval_train_perplexity/seq": 8.918198585510254, + "eval_train_reconstruction/all": 0.27717798948287964, + "eval_train_reconstruction/end_span": 0.6962039470672607, + "eval_train_reconstruction/fim": 0.16409847140312195, + "eval_train_reconstruction/first_seq": 0.15401571989059448, + "eval_train_reconstruction/last_seq": 0.32208675146102905, + "eval_train_reconstruction/second_seq": 0.17563875019550323, + "eval_train_runtime": 470.7577, + "eval_train_samples_per_second": 0.408, + "eval_train_steps_per_second": 
0.408, + "step": 8800 + }, + { + "epoch": 0.03286258887073551, + "grad_norm": 0.3338204026222229, + "learning_rate": 0.0006, + "loss": 2.2806, + "step": 8810 + }, + { + "epoch": 0.03289989033369889, + "grad_norm": 0.42017999291419983, + "learning_rate": 0.0006, + "loss": 2.375, + "step": 8820 + }, + { + "epoch": 0.03293719179666227, + "grad_norm": 0.41201847791671753, + "learning_rate": 0.0006, + "loss": 2.2966, + "step": 8830 + }, + { + "epoch": 0.03297449325962564, + "grad_norm": 0.36560940742492676, + "learning_rate": 0.0006, + "loss": 2.3597, + "step": 8840 + }, + { + "epoch": 0.03301179472258902, + "grad_norm": 0.5346307754516602, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 8850 + }, + { + "epoch": 0.03301179472258902, + "eval_valid_loss": 2.2186715602874756, + "eval_valid_loss/all": 2.078472375869751, + "eval_valid_loss/end_span": 1.2971646785736084, + "eval_valid_perplexity/batch": 7.992250442504883, + "eval_valid_perplexity/end_span": 3.658907890319824, + "eval_valid_perplexity/fim": 2.3294596672058105, + "eval_valid_perplexity/first_seq": 14.823326110839844, + "eval_valid_perplexity/last_seq": 8.992000579833984, + "eval_valid_perplexity/second_seq": 13.706357955932617, + "eval_valid_perplexity/seq": 9.003240585327148, + "eval_valid_reconstruction/all": 0.2870820462703705, + "eval_valid_reconstruction/end_span": 0.7013009786605835, + "eval_valid_reconstruction/fim": 0.16377408802509308, + "eval_valid_reconstruction/first_seq": 0.16649700701236725, + "eval_valid_reconstruction/last_seq": 0.3278668224811554, + "eval_valid_reconstruction/second_seq": 0.19683033227920532, + "eval_valid_runtime": 468.2296, + "eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 8850 + }, + { + "epoch": 0.03301179472258902, + "eval_train_loss": 2.21597957611084, + "eval_train_loss/all": 2.0485877990722656, + "eval_train_loss/end_span": 1.2579885721206665, + "eval_train_perplexity/batch": 7.756938934326172, + 
"eval_train_perplexity/end_span": 3.5183374881744385, + "eval_train_perplexity/fim": 1.9850646257400513, + "eval_train_perplexity/first_seq": 15.174355506896973, + "eval_train_perplexity/last_seq": 9.133200645446777, + "eval_train_perplexity/second_seq": 14.011548042297363, + "eval_train_perplexity/seq": 8.921717643737793, + "eval_train_reconstruction/all": 0.2772871255874634, + "eval_train_reconstruction/end_span": 0.7114126682281494, + "eval_train_reconstruction/fim": 0.13315880298614502, + "eval_train_reconstruction/first_seq": 0.16117699444293976, + "eval_train_reconstruction/last_seq": 0.3201013207435608, + "eval_train_reconstruction/second_seq": 0.18992584943771362, + "eval_train_runtime": 464.5504, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 8850 + }, + { + "epoch": 0.0330490961855524, + "grad_norm": 0.37217065691947937, + "learning_rate": 0.0006, + "loss": 2.2946, + "step": 8860 + }, + { + "epoch": 0.03308639764851577, + "grad_norm": 0.5599445700645447, + "learning_rate": 0.0006, + "loss": 2.319, + "step": 8870 + }, + { + "epoch": 0.03312369911147915, + "grad_norm": 0.43288859724998474, + "learning_rate": 0.0006, + "loss": 2.3758, + "step": 8880 + }, + { + "epoch": 0.03316100057444253, + "grad_norm": 0.3104209303855896, + "learning_rate": 0.0006, + "loss": 2.2822, + "step": 8890 + }, + { + "epoch": 0.033198302037405904, + "grad_norm": 0.5122387409210205, + "learning_rate": 0.0006, + "loss": 2.2349, + "step": 8900 + }, + { + "epoch": 0.033198302037405904, + "eval_valid_loss": 2.216063976287842, + "eval_valid_loss/all": 2.0763959884643555, + "eval_valid_loss/end_span": 1.2419095039367676, + "eval_valid_perplexity/batch": 7.975672721862793, + "eval_valid_perplexity/end_span": 3.4622182846069336, + "eval_valid_perplexity/fim": 2.167693614959717, + "eval_valid_perplexity/first_seq": 14.85090446472168, + "eval_valid_perplexity/last_seq": 9.432546615600586, + "eval_valid_perplexity/second_seq": 13.905171394348145, + 
"eval_valid_perplexity/seq": 8.98421573638916, + "eval_valid_reconstruction/all": 0.2877236306667328, + "eval_valid_reconstruction/end_span": 0.7152217626571655, + "eval_valid_reconstruction/fim": 0.15005257725715637, + "eval_valid_reconstruction/first_seq": 0.16571997106075287, + "eval_valid_reconstruction/last_seq": 0.31286755204200745, + "eval_valid_reconstruction/second_seq": 0.19241034984588623, + "eval_valid_runtime": 464.9059, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 8900 + }, + { + "epoch": 0.033198302037405904, + "eval_train_loss": 2.213270902633667, + "eval_train_loss/all": 2.0463569164276123, + "eval_train_loss/end_span": 1.2079675197601318, + "eval_train_perplexity/batch": 7.739653587341309, + "eval_train_perplexity/end_span": 3.3466756343841553, + "eval_train_perplexity/fim": 2.0600616931915283, + "eval_train_perplexity/first_seq": 15.268653869628906, + "eval_train_perplexity/last_seq": 9.104372024536133, + "eval_train_perplexity/second_seq": 14.302096366882324, + "eval_train_perplexity/seq": 8.906899452209473, + "eval_train_reconstruction/all": 0.27779361605644226, + "eval_train_reconstruction/end_span": 0.7246349453926086, + "eval_train_reconstruction/fim": 0.14019151031970978, + "eval_train_reconstruction/first_seq": 0.1540229320526123, + "eval_train_reconstruction/last_seq": 0.3225269317626953, + "eval_train_reconstruction/second_seq": 0.1812945157289505, + "eval_train_runtime": 466.5265, + "eval_train_samples_per_second": 0.412, + "eval_train_steps_per_second": 0.412, + "step": 8900 + }, + { + "epoch": 0.03323560350036928, + "grad_norm": 0.5190155506134033, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 8910 + }, + { + "epoch": 0.03327290496333266, + "grad_norm": 0.3451908826828003, + "learning_rate": 0.0006, + "loss": 2.1773, + "step": 8920 + }, + { + "epoch": 0.03331020642629604, + "grad_norm": 0.5671521425247192, + "learning_rate": 0.0006, + "loss": 2.1876, + "step": 8930 + }, + { + 
"epoch": 0.033347507889259415, + "grad_norm": 0.5964276790618896, + "learning_rate": 0.0006, + "loss": 2.1227, + "step": 8940 + }, + { + "epoch": 0.033384809352222794, + "grad_norm": 0.5191889405250549, + "learning_rate": 0.0006, + "loss": 2.2473, + "step": 8950 + }, + { + "epoch": 0.033384809352222794, + "eval_valid_loss": 2.221781015396118, + "eval_valid_loss/all": 2.081570863723755, + "eval_valid_loss/end_span": 1.3014123439788818, + "eval_valid_perplexity/batch": 8.01705265045166, + "eval_valid_perplexity/end_span": 3.674482583999634, + "eval_valid_perplexity/fim": 2.4742226600646973, + "eval_valid_perplexity/first_seq": 15.06629753112793, + "eval_valid_perplexity/last_seq": 9.201353073120117, + "eval_valid_perplexity/second_seq": 14.230531692504883, + "eval_valid_perplexity/seq": 9.03309154510498, + "eval_valid_reconstruction/all": 0.28642070293426514, + "eval_valid_reconstruction/end_span": 0.6913637518882751, + "eval_valid_reconstruction/fim": 0.17432254552841187, + "eval_valid_reconstruction/first_seq": 0.16254882514476776, + "eval_valid_reconstruction/last_seq": 0.32026591897010803, + "eval_valid_reconstruction/second_seq": 0.18838565051555634, + "eval_valid_runtime": 466.0049, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 8950 + }, + { + "epoch": 0.033384809352222794, + "eval_train_loss": 2.2187652587890625, + "eval_train_loss/all": 2.0512375831604004, + "eval_train_loss/end_span": 1.2751108407974243, + "eval_train_perplexity/batch": 7.777520656585693, + "eval_train_perplexity/end_span": 3.5790979862213135, + "eval_train_perplexity/fim": 2.1761960983276367, + "eval_train_perplexity/first_seq": 15.413015365600586, + "eval_train_perplexity/last_seq": 9.372901916503906, + "eval_train_perplexity/second_seq": 14.428256034851074, + "eval_train_perplexity/seq": 8.952366828918457, + "eval_train_reconstruction/all": 0.2766684591770172, + "eval_train_reconstruction/end_span": 0.697542130947113, + 
"eval_train_reconstruction/fim": 0.15083369612693787, + "eval_train_reconstruction/first_seq": 0.15236133337020874, + "eval_train_reconstruction/last_seq": 0.31185805797576904, + "eval_train_reconstruction/second_seq": 0.18014536798000336, + "eval_train_runtime": 462.5726, + "eval_train_samples_per_second": 0.415, + "eval_train_steps_per_second": 0.415, + "step": 8950 + }, + { + "epoch": 0.03342211081518617, + "grad_norm": 0.4652065634727478, + "learning_rate": 0.0006, + "loss": 2.2439, + "step": 8960 + }, + { + "epoch": 0.033459412278149546, + "grad_norm": 0.38321611285209656, + "learning_rate": 0.0006, + "loss": 2.2618, + "step": 8970 + }, + { + "epoch": 0.033496713741112925, + "grad_norm": 0.4312056303024292, + "learning_rate": 0.0006, + "loss": 2.3023, + "step": 8980 + }, + { + "epoch": 0.033534015204076305, + "grad_norm": 0.3309481143951416, + "learning_rate": 0.0006, + "loss": 2.2549, + "step": 8990 + }, + { + "epoch": 0.033571316667039684, + "grad_norm": 0.2126668393611908, + "learning_rate": 0.0006, + "loss": 2.4446, + "step": 9000 + }, + { + "epoch": 0.033571316667039684, + "eval_valid_loss": 2.2240025997161865, + "eval_valid_loss/all": 2.0835912227630615, + "eval_valid_loss/end_span": 1.2254598140716553, + "eval_valid_perplexity/batch": 8.033266067504883, + "eval_valid_perplexity/end_span": 3.405731678009033, + "eval_valid_perplexity/fim": 2.243037700653076, + "eval_valid_perplexity/first_seq": 15.189811706542969, + "eval_valid_perplexity/last_seq": 9.056807518005371, + "eval_valid_perplexity/second_seq": 13.653944969177246, + "eval_valid_perplexity/seq": 9.045975685119629, + "eval_valid_reconstruction/all": 0.28509706258773804, + "eval_valid_reconstruction/end_span": 0.7200580835342407, + "eval_valid_reconstruction/fim": 0.15577921271324158, + "eval_valid_reconstruction/first_seq": 0.1628239005804062, + "eval_valid_reconstruction/last_seq": 0.3256344497203827, + "eval_valid_reconstruction/second_seq": 0.1966639757156372, + "eval_valid_runtime": 463.3243, 
+ "eval_valid_samples_per_second": 0.414, + "eval_valid_steps_per_second": 0.414, + "step": 9000 + }, + { + "epoch": 0.033571316667039684, + "eval_train_loss": 2.2182579040527344, + "eval_train_loss/all": 2.050346612930298, + "eval_train_loss/end_span": 1.1897165775299072, + "eval_train_perplexity/batch": 7.770594120025635, + "eval_train_perplexity/end_span": 3.286149740219116, + "eval_train_perplexity/fim": 1.9731264114379883, + "eval_train_perplexity/first_seq": 15.239380836486816, + "eval_train_perplexity/last_seq": 9.11078929901123, + "eval_train_perplexity/second_seq": 14.049365043640137, + "eval_train_perplexity/seq": 8.936761856079102, + "eval_train_reconstruction/all": 0.2761944532394409, + "eval_train_reconstruction/end_span": 0.7307943105697632, + "eval_train_reconstruction/fim": 0.13067962229251862, + "eval_train_reconstruction/first_seq": 0.1569502204656601, + "eval_train_reconstruction/last_seq": 0.3196830749511719, + "eval_train_reconstruction/second_seq": 0.1873394250869751, + "eval_train_runtime": 465.0389, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 9000 + }, + { + "epoch": 0.03360861813000306, + "grad_norm": 0.49098771810531616, + "learning_rate": 0.0006, + "loss": 2.1756, + "step": 9010 + }, + { + "epoch": 0.033645919592966436, + "grad_norm": 1.5143095254898071, + "learning_rate": 0.0006, + "loss": 2.1697, + "step": 9020 + }, + { + "epoch": 0.033683221055929816, + "grad_norm": 0.8357464671134949, + "learning_rate": 0.0006, + "loss": 2.2544, + "step": 9030 + }, + { + "epoch": 0.03372052251889319, + "grad_norm": 0.3392675518989563, + "learning_rate": 0.0006, + "loss": 2.2227, + "step": 9040 + }, + { + "epoch": 0.03375782398185657, + "grad_norm": 0.37298306822776794, + "learning_rate": 0.0006, + "loss": 2.2129, + "step": 9050 + }, + { + "epoch": 0.03375782398185657, + "eval_valid_loss": 2.219590425491333, + "eval_valid_loss/all": 2.0796616077423096, + "eval_valid_loss/end_span": 1.461114525794983, + 
"eval_valid_perplexity/batch": 8.001760482788086, + "eval_valid_perplexity/end_span": 4.310761451721191, + "eval_valid_perplexity/fim": 2.477993965148926, + "eval_valid_perplexity/first_seq": 14.709940910339355, + "eval_valid_perplexity/last_seq": 9.103514671325684, + "eval_valid_perplexity/second_seq": 13.566917419433594, + "eval_valid_perplexity/seq": 9.01078987121582, + "eval_valid_reconstruction/all": 0.28633686900138855, + "eval_valid_reconstruction/end_span": 0.6606748700141907, + "eval_valid_reconstruction/fim": 0.17538192868232727, + "eval_valid_reconstruction/first_seq": 0.16785883903503418, + "eval_valid_reconstruction/last_seq": 0.3199123740196228, + "eval_valid_reconstruction/second_seq": 0.20038509368896484, + "eval_valid_runtime": 465.8291, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 9050 + }, + { + "epoch": 0.03375782398185657, + "eval_train_loss": 2.2160489559173584, + "eval_train_loss/all": 2.0487136840820312, + "eval_train_loss/end_span": 1.43361496925354, + "eval_train_perplexity/batch": 7.757915496826172, + "eval_train_perplexity/end_span": 4.1938323974609375, + "eval_train_perplexity/fim": 2.1883134841918945, + "eval_train_perplexity/first_seq": 15.280548095703125, + "eval_train_perplexity/last_seq": 9.356402397155762, + "eval_train_perplexity/second_seq": 14.334365844726562, + "eval_train_perplexity/seq": 8.923949241638184, + "eval_train_reconstruction/all": 0.2767198979854584, + "eval_train_reconstruction/end_span": 0.668885350227356, + "eval_train_reconstruction/fim": 0.1516704261302948, + "eval_train_reconstruction/first_seq": 0.1552097201347351, + "eval_train_reconstruction/last_seq": 0.31368061900138855, + "eval_train_reconstruction/second_seq": 0.182173952460289, + "eval_train_runtime": 466.0872, + "eval_train_samples_per_second": 0.412, + "eval_train_steps_per_second": 0.412, + "step": 9050 + }, + { + "epoch": 0.03379512544481995, + "grad_norm": 0.35396721959114075, + "learning_rate": 
0.0006, + "loss": 2.1318, + "step": 9060 + }, + { + "epoch": 0.033832426907783326, + "grad_norm": 0.36641615629196167, + "learning_rate": 0.0006, + "loss": 2.3854, + "step": 9070 + }, + { + "epoch": 0.0338697283707467, + "grad_norm": 0.514470636844635, + "learning_rate": 0.0006, + "loss": 2.054, + "step": 9080 + }, + { + "epoch": 0.03390702983371008, + "grad_norm": 0.5940414667129517, + "learning_rate": 0.0006, + "loss": 2.3485, + "step": 9090 + }, + { + "epoch": 0.03394433129667346, + "grad_norm": 0.42025235295295715, + "learning_rate": 0.0006, + "loss": 2.1316, + "step": 9100 + }, + { + "epoch": 0.03394433129667346, + "eval_valid_loss": 2.223933458328247, + "eval_valid_loss/all": 2.0836684703826904, + "eval_valid_loss/end_span": 1.3279285430908203, + "eval_valid_perplexity/batch": 8.033886909484863, + "eval_valid_perplexity/end_span": 3.773219108581543, + "eval_valid_perplexity/fim": 2.0606637001037598, + "eval_valid_perplexity/first_seq": 14.890254974365234, + "eval_valid_perplexity/last_seq": 9.283330917358398, + "eval_valid_perplexity/second_seq": 13.66118335723877, + "eval_valid_perplexity/seq": 9.055785179138184, + "eval_valid_reconstruction/all": 0.28599223494529724, + "eval_valid_reconstruction/end_span": 0.6942961812019348, + "eval_valid_reconstruction/fim": 0.13905254006385803, + "eval_valid_reconstruction/first_seq": 0.16878315806388855, + "eval_valid_reconstruction/last_seq": 0.3182950019836426, + "eval_valid_reconstruction/second_seq": 0.19866549968719482, + "eval_valid_runtime": 465.4631, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 9100 + }, + { + "epoch": 0.03394433129667346, + "eval_train_loss": 2.2187254428863525, + "eval_train_loss/all": 2.0514144897460938, + "eval_train_loss/end_span": 1.2953379154205322, + "eval_train_perplexity/batch": 7.778896331787109, + "eval_train_perplexity/end_span": 3.6522300243377686, + "eval_train_perplexity/fim": 2.404940605163574, + "eval_train_perplexity/first_seq": 
15.428027153015137, + "eval_train_perplexity/last_seq": 9.35257339477539, + "eval_train_perplexity/second_seq": 14.310019493103027, + "eval_train_perplexity/seq": 8.95559310913086, + "eval_train_reconstruction/all": 0.27663686871528625, + "eval_train_reconstruction/end_span": 0.7039353847503662, + "eval_train_reconstruction/fim": 0.1697070300579071, + "eval_train_reconstruction/first_seq": 0.15388575196266174, + "eval_train_reconstruction/last_seq": 0.3112809360027313, + "eval_train_reconstruction/second_seq": 0.1815289705991745, + "eval_train_runtime": 462.0423, + "eval_train_samples_per_second": 0.416, + "eval_train_steps_per_second": 0.416, + "step": 9100 + }, + { + "epoch": 0.03398163275963683, + "grad_norm": 0.46771475672721863, + "learning_rate": 0.0006, + "loss": 2.2239, + "step": 9110 + }, + { + "epoch": 0.03401893422260021, + "grad_norm": 0.43693986535072327, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 9120 + }, + { + "epoch": 0.03405623568556359, + "grad_norm": 0.23247359693050385, + "learning_rate": 0.0006, + "loss": 2.1911, + "step": 9130 + }, + { + "epoch": 0.03409353714852697, + "grad_norm": 0.6169652342796326, + "learning_rate": 0.0006, + "loss": 2.2377, + "step": 9140 + }, + { + "epoch": 0.03413083861149034, + "grad_norm": 0.340801477432251, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 9150 + }, + { + "epoch": 0.03413083861149034, + "eval_valid_loss": 2.2260773181915283, + "eval_valid_loss/all": 2.086254835128784, + "eval_valid_loss/end_span": 1.2067692279815674, + "eval_valid_perplexity/batch": 8.054692268371582, + "eval_valid_perplexity/end_span": 3.342667818069458, + "eval_valid_perplexity/fim": 2.337585926055908, + "eval_valid_perplexity/first_seq": 15.058676719665527, + "eval_valid_perplexity/last_seq": 9.118853569030762, + "eval_valid_perplexity/second_seq": 13.666176795959473, + "eval_valid_perplexity/seq": 9.086249351501465, + "eval_valid_reconstruction/all": 0.2848498225212097, + "eval_valid_reconstruction/end_span": 
0.7196375727653503, + "eval_valid_reconstruction/fim": 0.16222652792930603, + "eval_valid_reconstruction/first_seq": 0.16319330036640167, + "eval_valid_reconstruction/last_seq": 0.322971910238266, + "eval_valid_reconstruction/second_seq": 0.1956542432308197, + "eval_valid_runtime": 464.2777, + "eval_valid_samples_per_second": 0.414, + "eval_valid_steps_per_second": 0.414, + "step": 9150 + }, + { + "epoch": 0.03413083861149034, + "eval_train_loss": 2.21683406829834, + "eval_train_loss/all": 2.0484824180603027, + "eval_train_loss/end_span": 1.1612447500228882, + "eval_train_perplexity/batch": 7.756121635437012, + "eval_train_perplexity/end_span": 3.193906307220459, + "eval_train_perplexity/fim": 1.9844236373901367, + "eval_train_perplexity/first_seq": 15.539039611816406, + "eval_train_perplexity/last_seq": 9.50313949584961, + "eval_train_perplexity/second_seq": 14.07639217376709, + "eval_train_perplexity/seq": 8.910714149475098, + "eval_train_reconstruction/all": 0.27697882056236267, + "eval_train_reconstruction/end_span": 0.730262815952301, + "eval_train_reconstruction/fim": 0.13235493004322052, + "eval_train_reconstruction/first_seq": 0.15105213224887848, + "eval_train_reconstruction/last_seq": 0.30962517857551575, + "eval_train_reconstruction/second_seq": 0.18510569632053375, + "eval_train_runtime": 464.9473, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 9150 + }, + { + "epoch": 0.03416814007445372, + "grad_norm": 0.4479118883609772, + "learning_rate": 0.0006, + "loss": 2.0657, + "step": 9160 + }, + { + "epoch": 0.0342054415374171, + "grad_norm": 0.38013479113578796, + "learning_rate": 0.0006, + "loss": 2.2265, + "step": 9170 + }, + { + "epoch": 0.03424274300038047, + "grad_norm": 0.30539795756340027, + "learning_rate": 0.0006, + "loss": 2.302, + "step": 9180 + }, + { + "epoch": 0.03428004446334385, + "grad_norm": 0.3491242527961731, + "learning_rate": 0.0006, + "loss": 2.3092, + "step": 9190 + }, + { + "epoch": 
0.03431734592630723, + "grad_norm": 0.43061333894729614, + "learning_rate": 0.0006, + "loss": 2.3019, + "step": 9200 + }, + { + "epoch": 0.03431734592630723, + "eval_valid_loss": 2.220285415649414, + "eval_valid_loss/all": 2.080223798751831, + "eval_valid_loss/end_span": 1.3075568675994873, + "eval_valid_perplexity/batch": 8.006260871887207, + "eval_valid_perplexity/end_span": 3.697129964828491, + "eval_valid_perplexity/fim": 2.401113271713257, + "eval_valid_perplexity/first_seq": 15.151049613952637, + "eval_valid_perplexity/last_seq": 8.979823112487793, + "eval_valid_perplexity/second_seq": 13.77459716796875, + "eval_valid_perplexity/seq": 9.026315689086914, + "eval_valid_reconstruction/all": 0.28679347038269043, + "eval_valid_reconstruction/end_span": 0.703781247138977, + "eval_valid_reconstruction/fim": 0.16927027702331543, + "eval_valid_reconstruction/first_seq": 0.16166192293167114, + "eval_valid_reconstruction/last_seq": 0.3261943459510803, + "eval_valid_reconstruction/second_seq": 0.19510911405086517, + "eval_valid_runtime": 465.9406, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 9200 + }, + { + "epoch": 0.03431734592630723, + "eval_train_loss": 2.217423439025879, + "eval_train_loss/all": 2.0502195358276367, + "eval_train_loss/end_span": 1.2486096620559692, + "eval_train_perplexity/batch": 7.769606590270996, + "eval_train_perplexity/end_span": 3.4854936599731445, + "eval_train_perplexity/fim": 2.175987958908081, + "eval_train_perplexity/first_seq": 15.583036422729492, + "eval_train_perplexity/last_seq": 9.298174858093262, + "eval_train_perplexity/second_seq": 14.09563159942627, + "eval_train_perplexity/seq": 8.943734169006348, + "eval_train_reconstruction/all": 0.2769818902015686, + "eval_train_reconstruction/end_span": 0.7157794833183289, + "eval_train_reconstruction/fim": 0.15050669014453888, + "eval_train_reconstruction/first_seq": 0.15027953684329987, + "eval_train_reconstruction/last_seq": 
0.31367263197898865, + "eval_train_reconstruction/second_seq": 0.18702682852745056, + "eval_train_runtime": 462.2277, + "eval_train_samples_per_second": 0.415, + "eval_train_steps_per_second": 0.415, + "step": 9200 + }, + { + "epoch": 0.034354647389270604, + "grad_norm": 0.7296591997146606, + "learning_rate": 0.0006, + "loss": 2.0607, + "step": 9210 + }, + { + "epoch": 0.03439194885223398, + "grad_norm": 0.3030862510204315, + "learning_rate": 0.0006, + "loss": 2.3565, + "step": 9220 + }, + { + "epoch": 0.03442925031519736, + "grad_norm": 0.36113330721855164, + "learning_rate": 0.0006, + "loss": 2.0072, + "step": 9230 + }, + { + "epoch": 0.03446655177816074, + "grad_norm": 0.691631019115448, + "learning_rate": 0.0006, + "loss": 2.1421, + "step": 9240 + }, + { + "epoch": 0.034503853241124115, + "grad_norm": 0.3816870152950287, + "learning_rate": 0.0006, + "loss": 2.178, + "step": 9250 + }, + { + "epoch": 0.034503853241124115, + "eval_valid_loss": 2.2253963947296143, + "eval_valid_loss/all": 2.084583282470703, + "eval_valid_loss/end_span": 1.3709518909454346, + "eval_valid_perplexity/batch": 8.041239738464355, + "eval_valid_perplexity/end_span": 3.939098596572876, + "eval_valid_perplexity/fim": 2.5152394771575928, + "eval_valid_perplexity/first_seq": 14.816125869750977, + "eval_valid_perplexity/last_seq": 9.1087064743042, + "eval_valid_perplexity/second_seq": 13.889498710632324, + "eval_valid_perplexity/seq": 9.05260181427002, + "eval_valid_reconstruction/all": 0.2848278880119324, + "eval_valid_reconstruction/end_span": 0.6886031031608582, + "eval_valid_reconstruction/fim": 0.17603711783885956, + "eval_valid_reconstruction/first_seq": 0.16865843534469604, + "eval_valid_reconstruction/last_seq": 0.32477349042892456, + "eval_valid_reconstruction/second_seq": 0.19165191054344177, + "eval_valid_runtime": 465.4253, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 9250 + }, + { + "epoch": 0.034503853241124115, + "eval_train_loss": 
2.2207887172698975, + "eval_train_loss/all": 2.0527029037475586, + "eval_train_loss/end_span": 1.32279372215271, + "eval_train_perplexity/batch": 7.7889251708984375, + "eval_train_perplexity/end_span": 3.753894090652466, + "eval_train_perplexity/fim": 2.259028196334839, + "eval_train_perplexity/first_seq": 15.872395515441895, + "eval_train_perplexity/last_seq": 9.382804870605469, + "eval_train_perplexity/second_seq": 14.146047592163086, + "eval_train_perplexity/seq": 8.959355354309082, + "eval_train_reconstruction/all": 0.2755616009235382, + "eval_train_reconstruction/end_span": 0.6993608474731445, + "eval_train_reconstruction/fim": 0.15632295608520508, + "eval_train_reconstruction/first_seq": 0.1443149745464325, + "eval_train_reconstruction/last_seq": 0.3124321699142456, + "eval_train_reconstruction/second_seq": 0.1871049851179123, + "eval_train_runtime": 462.2988, + "eval_train_samples_per_second": 0.415, + "eval_train_steps_per_second": 0.415, + "step": 9250 + }, + { + "epoch": 0.034541154704087494, + "grad_norm": 0.3640006482601166, + "learning_rate": 0.0006, + "loss": 2.1905, + "step": 9260 + }, + { + "epoch": 0.034578456167050874, + "grad_norm": 0.40258073806762695, + "learning_rate": 0.0006, + "loss": 2.3095, + "step": 9270 + }, + { + "epoch": 0.034615757630014246, + "grad_norm": 0.2762550413608551, + "learning_rate": 0.0006, + "loss": 2.136, + "step": 9280 + }, + { + "epoch": 0.034653059092977626, + "grad_norm": 0.2855776846408844, + "learning_rate": 0.0006, + "loss": 2.2711, + "step": 9290 + }, + { + "epoch": 0.034690360555941005, + "grad_norm": 0.36063364148139954, + "learning_rate": 0.0006, + "loss": 2.1263, + "step": 9300 + }, + { + "epoch": 0.034690360555941005, + "eval_valid_loss": 2.2202444076538086, + "eval_valid_loss/all": 2.0803775787353516, + "eval_valid_loss/end_span": 1.3902987241744995, + "eval_valid_perplexity/batch": 8.007492065429688, + "eval_valid_perplexity/end_span": 4.016049385070801, + "eval_valid_perplexity/fim": 2.188524007797241, + 
"eval_valid_perplexity/first_seq": 14.531689643859863, + "eval_valid_perplexity/last_seq": 8.942764282226562, + "eval_valid_perplexity/second_seq": 13.828950881958008, + "eval_valid_perplexity/seq": 9.027034759521484, + "eval_valid_reconstruction/all": 0.28632310032844543, + "eval_valid_reconstruction/end_span": 0.6663110852241516, + "eval_valid_reconstruction/fim": 0.15005700290203094, + "eval_valid_reconstruction/first_seq": 0.17483443021774292, + "eval_valid_reconstruction/last_seq": 0.32842105627059937, + "eval_valid_reconstruction/second_seq": 0.19389936327934265, + "eval_valid_runtime": 464.5441, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 9300 + }, + { + "epoch": 0.034690360555941005, + "eval_train_loss": 2.2157859802246094, + "eval_train_loss/all": 2.048990249633789, + "eval_train_loss/end_span": 1.3533180952072144, + "eval_train_perplexity/batch": 7.760061264038086, + "eval_train_perplexity/end_span": 3.870246171951294, + "eval_train_perplexity/fim": 2.143291473388672, + "eval_train_perplexity/first_seq": 15.661186218261719, + "eval_train_perplexity/last_seq": 8.89844799041748, + "eval_train_perplexity/second_seq": 14.3004150390625, + "eval_train_perplexity/seq": 8.9335355758667, + "eval_train_reconstruction/all": 0.27700552344322205, + "eval_train_reconstruction/end_span": 0.6751505732536316, + "eval_train_reconstruction/fim": 0.14686892926692963, + "eval_train_reconstruction/first_seq": 0.14803843200206757, + "eval_train_reconstruction/last_seq": 0.3249898850917816, + "eval_train_reconstruction/second_seq": 0.18215768039226532, + "eval_train_runtime": 460.0672, + "eval_train_samples_per_second": 0.417, + "eval_train_steps_per_second": 0.417, + "step": 9300 + }, + { + "epoch": 0.034727662018904384, + "grad_norm": 0.636521577835083, + "learning_rate": 0.0006, + "loss": 2.2838, + "step": 9310 + }, + { + "epoch": 0.03476496348186776, + "grad_norm": 0.4443003833293915, + "learning_rate": 0.0006, + "loss": 2.293, 
+ "step": 9320 + }, + { + "epoch": 0.034802264944831136, + "grad_norm": 0.3850710391998291, + "learning_rate": 0.0006, + "loss": 2.2145, + "step": 9330 + }, + { + "epoch": 0.034839566407794516, + "grad_norm": 0.5204742550849915, + "learning_rate": 0.0006, + "loss": 2.1797, + "step": 9340 + }, + { + "epoch": 0.03487686787075789, + "grad_norm": 0.7805878520011902, + "learning_rate": 0.0006, + "loss": 2.3266, + "step": 9350 + }, + { + "epoch": 0.03487686787075789, + "eval_valid_loss": 2.235086679458618, + "eval_valid_loss/all": 2.0943820476531982, + "eval_valid_loss/end_span": 1.3955861330032349, + "eval_valid_perplexity/batch": 8.120421409606934, + "eval_valid_perplexity/end_span": 4.03734016418457, + "eval_valid_perplexity/fim": 2.796900510787964, + "eval_valid_perplexity/first_seq": 14.894660949707031, + "eval_valid_perplexity/last_seq": 9.239457130432129, + "eval_valid_perplexity/second_seq": 14.12704849243164, + "eval_valid_perplexity/seq": 9.168241500854492, + "eval_valid_reconstruction/all": 0.28230106830596924, + "eval_valid_reconstruction/end_span": 0.6830254793167114, + "eval_valid_reconstruction/fim": 0.19451391696929932, + "eval_valid_reconstruction/first_seq": 0.16906079649925232, + "eval_valid_reconstruction/last_seq": 0.31665360927581787, + "eval_valid_reconstruction/second_seq": 0.18461501598358154, + "eval_valid_runtime": 463.2538, + "eval_valid_samples_per_second": 0.414, + "eval_valid_steps_per_second": 0.414, + "step": 9350 + }, + { + "epoch": 0.03487686787075789, + "eval_train_loss": 2.2255046367645264, + "eval_train_loss/all": 2.0574140548706055, + "eval_train_loss/end_span": 1.2991266250610352, + "eval_train_perplexity/batch": 7.825706958770752, + "eval_train_perplexity/end_span": 3.666093349456787, + "eval_train_perplexity/fim": 2.078824996948242, + "eval_train_perplexity/first_seq": 15.435811996459961, + "eval_train_perplexity/last_seq": 9.28121566772461, + "eval_train_perplexity/second_seq": 14.13683032989502, + "eval_train_perplexity/seq": 
9.008723258972168, + "eval_train_reconstruction/all": 0.2746112644672394, + "eval_train_reconstruction/end_span": 0.7031432390213013, + "eval_train_reconstruction/fim": 0.14054621756076813, + "eval_train_reconstruction/first_seq": 0.15187974274158478, + "eval_train_reconstruction/last_seq": 0.31520092487335205, + "eval_train_reconstruction/second_seq": 0.18683968484401703, + "eval_train_runtime": 465.0945, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 9350 + }, + { + "epoch": 0.03491416933372127, + "grad_norm": 0.47141581773757935, + "learning_rate": 0.0006, + "loss": 2.1076, + "step": 9360 + }, + { + "epoch": 0.03495147079668465, + "grad_norm": 0.5039620399475098, + "learning_rate": 0.0006, + "loss": 2.1577, + "step": 9370 + }, + { + "epoch": 0.03498877225964803, + "grad_norm": 0.6916616559028625, + "learning_rate": 0.0006, + "loss": 2.3827, + "step": 9380 + }, + { + "epoch": 0.0350260737226114, + "grad_norm": 0.5240175724029541, + "learning_rate": 0.0006, + "loss": 2.1021, + "step": 9390 + }, + { + "epoch": 0.03506337518557478, + "grad_norm": 0.501106858253479, + "learning_rate": 0.0006, + "loss": 2.3329, + "step": 9400 + }, + { + "epoch": 0.03506337518557478, + "eval_valid_loss": 2.2343156337738037, + "eval_valid_loss/all": 2.093390941619873, + "eval_valid_loss/end_span": 1.2967735528945923, + "eval_valid_perplexity/batch": 8.112377166748047, + "eval_valid_perplexity/end_span": 3.6574769020080566, + "eval_valid_perplexity/fim": 2.728208303451538, + "eval_valid_perplexity/first_seq": 15.16482162475586, + "eval_valid_perplexity/last_seq": 9.353779792785645, + "eval_valid_perplexity/second_seq": 14.050765037536621, + "eval_valid_perplexity/seq": 9.152647018432617, + "eval_valid_reconstruction/all": 0.28245100378990173, + "eval_valid_reconstruction/end_span": 0.6956449747085571, + "eval_valid_reconstruction/fim": 0.19109737873077393, + "eval_valid_reconstruction/first_seq": 0.16114869713783264, + 
"eval_valid_reconstruction/last_seq": 0.31664615869522095, + "eval_valid_reconstruction/second_seq": 0.19018904864788055, + "eval_valid_runtime": 466.4372, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 9400 + }, + { + "epoch": 0.03506337518557478, + "eval_train_loss": 2.230400323867798, + "eval_train_loss/all": 2.0620157718658447, + "eval_train_loss/end_span": 1.2719392776489258, + "eval_train_perplexity/batch": 7.861801624298096, + "eval_train_perplexity/end_span": 3.5677647590637207, + "eval_train_perplexity/fim": 1.9515866041183472, + "eval_train_perplexity/first_seq": 15.898812294006348, + "eval_train_perplexity/last_seq": 9.314855575561523, + "eval_train_perplexity/second_seq": 14.499542236328125, + "eval_train_perplexity/seq": 9.057774543762207, + "eval_train_reconstruction/all": 0.2730945944786072, + "eval_train_reconstruction/end_span": 0.703055739402771, + "eval_train_reconstruction/fim": 0.12774397432804108, + "eval_train_reconstruction/first_seq": 0.14080096781253815, + "eval_train_reconstruction/last_seq": 0.31381428241729736, + "eval_train_reconstruction/second_seq": 0.17636972665786743, + "eval_train_runtime": 463.1944, + "eval_train_samples_per_second": 0.415, + "eval_train_steps_per_second": 0.415, + "step": 9400 + }, + { + "epoch": 0.03510067664853816, + "grad_norm": 0.3161376118659973, + "learning_rate": 0.0006, + "loss": 2.1586, + "step": 9410 + }, + { + "epoch": 0.03513797811150153, + "grad_norm": 3.8680408000946045, + "learning_rate": 0.0006, + "loss": 2.3642, + "step": 9420 + }, + { + "epoch": 0.03517527957446491, + "grad_norm": 0.3459031581878662, + "learning_rate": 0.0006, + "loss": 2.2447, + "step": 9430 + }, + { + "epoch": 0.03521258103742829, + "grad_norm": 0.48915910720825195, + "learning_rate": 0.0006, + "loss": 2.3485, + "step": 9440 + }, + { + "epoch": 0.03524988250039167, + "grad_norm": 0.42997655272483826, + "learning_rate": 0.0006, + "loss": 2.3539, + "step": 9450 + }, + { + "epoch": 
0.03524988250039167, + "eval_valid_loss": 2.2211291790008545, + "eval_valid_loss/all": 2.0809996128082275, + "eval_valid_loss/end_span": 1.3471040725708008, + "eval_valid_perplexity/batch": 8.012474060058594, + "eval_valid_perplexity/end_span": 3.846270799636841, + "eval_valid_perplexity/fim": 2.4651567935943604, + "eval_valid_perplexity/first_seq": 15.108131408691406, + "eval_valid_perplexity/last_seq": 9.62601089477539, + "eval_valid_perplexity/second_seq": 13.717287063598633, + "eval_valid_perplexity/seq": 9.031173706054688, + "eval_valid_reconstruction/all": 0.28638923168182373, + "eval_valid_reconstruction/end_span": 0.6870018839836121, + "eval_valid_reconstruction/fim": 0.17361241579055786, + "eval_valid_reconstruction/first_seq": 0.16155149042606354, + "eval_valid_reconstruction/last_seq": 0.30565178394317627, + "eval_valid_reconstruction/second_seq": 0.1970677226781845, + "eval_valid_runtime": 464.5236, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 9450 + }, + { + "epoch": 0.03524988250039167, + "eval_train_loss": 2.216425657272339, + "eval_train_loss/all": 2.04929780960083, + "eval_train_loss/end_span": 1.317695140838623, + "eval_train_perplexity/batch": 7.762448310852051, + "eval_train_perplexity/end_span": 3.7348031997680664, + "eval_train_perplexity/fim": 2.1102023124694824, + "eval_train_perplexity/first_seq": 15.411347389221191, + "eval_train_perplexity/last_seq": 9.198657989501953, + "eval_train_perplexity/second_seq": 13.954732894897461, + "eval_train_perplexity/seq": 8.93346881866455, + "eval_train_reconstruction/all": 0.27702611684799194, + "eval_train_reconstruction/end_span": 0.6955589652061462, + "eval_train_reconstruction/fim": 0.14443866908550262, + "eval_train_reconstruction/first_seq": 0.15512995421886444, + "eval_train_reconstruction/last_seq": 0.3175833821296692, + "eval_train_reconstruction/second_seq": 0.19410154223442078, + "eval_train_runtime": 467.5083, + "eval_train_samples_per_second": 
0.411, + "eval_train_steps_per_second": 0.411, + "step": 9450 + }, + { + "epoch": 0.03528718396335504, + "grad_norm": 0.2883741855621338, + "learning_rate": 0.0006, + "loss": 2.3694, + "step": 9460 + }, + { + "epoch": 0.03532448542631842, + "grad_norm": 0.3782793879508972, + "learning_rate": 0.0006, + "loss": 2.1167, + "step": 9470 + }, + { + "epoch": 0.0353617868892818, + "grad_norm": 0.3819097578525543, + "learning_rate": 0.0006, + "loss": 2.261, + "step": 9480 + }, + { + "epoch": 0.03539908835224517, + "grad_norm": 0.4258311688899994, + "learning_rate": 0.0006, + "loss": 2.2446, + "step": 9490 + }, + { + "epoch": 0.03543638981520855, + "grad_norm": 0.457147479057312, + "learning_rate": 0.0006, + "loss": 2.1158, + "step": 9500 + }, + { + "epoch": 0.03543638981520855, + "eval_valid_loss": 2.226499557495117, + "eval_valid_loss/all": 2.085606575012207, + "eval_valid_loss/end_span": 1.2214058637619019, + "eval_valid_perplexity/batch": 8.04947280883789, + "eval_valid_perplexity/end_span": 3.3919529914855957, + "eval_valid_perplexity/fim": 2.3186333179473877, + "eval_valid_perplexity/first_seq": 14.88896656036377, + "eval_valid_perplexity/last_seq": 9.19442367553711, + "eval_valid_perplexity/second_seq": 13.901816368103027, + "eval_valid_perplexity/seq": 9.069849967956543, + "eval_valid_reconstruction/all": 0.2846889793872833, + "eval_valid_reconstruction/end_span": 0.7061522603034973, + "eval_valid_reconstruction/fim": 0.16111800074577332, + "eval_valid_reconstruction/first_seq": 0.16602575778961182, + "eval_valid_reconstruction/last_seq": 0.31966322660446167, + "eval_valid_reconstruction/second_seq": 0.1944422423839569, + "eval_valid_runtime": 449.8733, + "eval_valid_samples_per_second": 0.427, + "eval_valid_steps_per_second": 0.427, + "step": 9500 + }, + { + "epoch": 0.03543638981520855, + "eval_train_loss": 2.2214763164520264, + "eval_train_loss/all": 2.0533034801483154, + "eval_train_loss/end_span": 1.1854456663131714, + "eval_train_perplexity/batch": 
7.793604850769043, + "eval_train_perplexity/end_span": 3.2721447944641113, + "eval_train_perplexity/fim": 1.9126882553100586, + "eval_train_perplexity/first_seq": 15.716863632202148, + "eval_train_perplexity/last_seq": 8.901344299316406, + "eval_train_perplexity/second_seq": 14.24664306640625, + "eval_train_perplexity/seq": 8.96338176727295, + "eval_train_reconstruction/all": 0.2753438353538513, + "eval_train_reconstruction/end_span": 0.717656672000885, + "eval_train_reconstruction/fim": 0.12455517798662186, + "eval_train_reconstruction/first_seq": 0.14868353307247162, + "eval_train_reconstruction/last_seq": 0.32701751589775085, + "eval_train_reconstruction/second_seq": 0.18775416910648346, + "eval_train_runtime": 439.8923, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 9500 + }, + { + "epoch": 0.03547369127817193, + "grad_norm": 0.3296717703342438, + "learning_rate": 0.0006, + "loss": 2.2717, + "step": 9510 + }, + { + "epoch": 0.035510992741135304, + "grad_norm": 0.4179694354534149, + "learning_rate": 0.0006, + "loss": 2.1868, + "step": 9520 + }, + { + "epoch": 0.035548294204098684, + "grad_norm": 0.5229015946388245, + "learning_rate": 0.0006, + "loss": 2.2305, + "step": 9530 + }, + { + "epoch": 0.03558559566706206, + "grad_norm": 0.39848592877388, + "learning_rate": 0.0006, + "loss": 2.1486, + "step": 9540 + }, + { + "epoch": 0.03562289713002544, + "grad_norm": 0.7172048091888428, + "learning_rate": 0.0006, + "loss": 2.2854, + "step": 9550 + }, + { + "epoch": 0.03562289713002544, + "eval_valid_loss": 2.2191014289855957, + "eval_valid_loss/all": 2.079472303390503, + "eval_valid_loss/end_span": 1.2619317770004272, + "eval_valid_perplexity/batch": 8.000246047973633, + "eval_valid_perplexity/end_span": 3.532238483428955, + "eval_valid_perplexity/fim": 2.345200538635254, + "eval_valid_perplexity/first_seq": 15.298836708068848, + "eval_valid_perplexity/last_seq": 8.936573028564453, + "eval_valid_perplexity/second_seq": 
13.925734519958496, + "eval_valid_perplexity/seq": 9.017666816711426, + "eval_valid_reconstruction/all": 0.28667184710502625, + "eval_valid_reconstruction/end_span": 0.7057824730873108, + "eval_valid_reconstruction/fim": 0.16405312716960907, + "eval_valid_reconstruction/first_seq": 0.15774138271808624, + "eval_valid_reconstruction/last_seq": 0.32933682203292847, + "eval_valid_reconstruction/second_seq": 0.19068756699562073, + "eval_valid_runtime": 438.5589, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 9550 + }, + { + "epoch": 0.03562289713002544, + "eval_train_loss": 2.215059995651245, + "eval_train_loss/all": 2.0484731197357178, + "eval_train_loss/end_span": 1.2253074645996094, + "eval_train_perplexity/batch": 7.756049633026123, + "eval_train_perplexity/end_span": 3.405212879180908, + "eval_train_perplexity/fim": 2.120964527130127, + "eval_train_perplexity/first_seq": 15.421693801879883, + "eval_train_perplexity/last_seq": 9.047106742858887, + "eval_train_perplexity/second_seq": 14.264589309692383, + "eval_train_perplexity/seq": 8.934479713439941, + "eval_train_reconstruction/all": 0.27713543176651, + "eval_train_reconstruction/end_span": 0.7184069752693176, + "eval_train_reconstruction/fim": 0.14530740678310394, + "eval_train_reconstruction/first_seq": 0.1526412069797516, + "eval_train_reconstruction/last_seq": 0.3221638798713684, + "eval_train_reconstruction/second_seq": 0.18388201296329498, + "eval_train_runtime": 436.7518, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 9550 + }, + { + "epoch": 0.035660198592988815, + "grad_norm": 0.542579174041748, + "learning_rate": 0.0006, + "loss": 2.22, + "step": 9560 + }, + { + "epoch": 0.035697500055952194, + "grad_norm": 0.38160479068756104, + "learning_rate": 0.0006, + "loss": 2.136, + "step": 9570 + }, + { + "epoch": 0.035734801518915574, + "grad_norm": 0.2799321711063385, + "learning_rate": 0.0006, + "loss": 2.292, + "step": 
9580 + }, + { + "epoch": 0.035772102981878946, + "grad_norm": 0.4706970751285553, + "learning_rate": 0.0006, + "loss": 2.1211, + "step": 9590 + }, + { + "epoch": 0.035809404444842326, + "grad_norm": 0.5618536472320557, + "learning_rate": 0.0006, + "loss": 2.0968, + "step": 9600 + }, + { + "epoch": 0.035809404444842326, + "eval_valid_loss": 2.222827196121216, + "eval_valid_loss/all": 2.0826847553253174, + "eval_valid_loss/end_span": 1.2248904705047607, + "eval_valid_perplexity/batch": 8.02598762512207, + "eval_valid_perplexity/end_span": 3.4037933349609375, + "eval_valid_perplexity/fim": 2.4069020748138428, + "eval_valid_perplexity/first_seq": 15.222821235656738, + "eval_valid_perplexity/last_seq": 9.1251802444458, + "eval_valid_perplexity/second_seq": 13.526193618774414, + "eval_valid_perplexity/seq": 9.051177024841309, + "eval_valid_reconstruction/all": 0.2856924831867218, + "eval_valid_reconstruction/end_span": 0.7194085717201233, + "eval_valid_reconstruction/fim": 0.16957354545593262, + "eval_valid_reconstruction/first_seq": 0.16124378144741058, + "eval_valid_reconstruction/last_seq": 0.32356515526771545, + "eval_valid_reconstruction/second_seq": 0.20277166366577148, + "eval_valid_runtime": 436.7041, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 9600 + }, + { + "epoch": 0.035809404444842326, + "eval_train_loss": 2.2182607650756836, + "eval_train_loss/all": 2.051211357116699, + "eval_train_loss/end_span": 1.1841845512390137, + "eval_train_perplexity/batch": 7.777316570281982, + "eval_train_perplexity/end_span": 3.2680208683013916, + "eval_train_perplexity/fim": 2.1555988788604736, + "eval_train_perplexity/first_seq": 15.650450706481934, + "eval_train_perplexity/last_seq": 9.474828720092773, + "eval_train_perplexity/second_seq": 14.34839916229248, + "eval_train_perplexity/seq": 8.956098556518555, + "eval_train_reconstruction/all": 0.27636849880218506, + "eval_train_reconstruction/end_span": 0.7352930903434753, + 
"eval_train_reconstruction/fim": 0.1479799598455429, + "eval_train_reconstruction/first_seq": 0.14867596328258514, + "eval_train_reconstruction/last_seq": 0.30744072794914246, + "eval_train_reconstruction/second_seq": 0.18254226446151733, + "eval_train_runtime": 435.0289, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 9600 + }, + { + "epoch": 0.035846705907805705, + "grad_norm": 0.6432527303695679, + "learning_rate": 0.0006, + "loss": 2.1821, + "step": 9610 + }, + { + "epoch": 0.035884007370769085, + "grad_norm": 0.31976377964019775, + "learning_rate": 0.0006, + "loss": 2.2111, + "step": 9620 + }, + { + "epoch": 0.03592130883373246, + "grad_norm": 14.550761222839355, + "learning_rate": 0.0006, + "loss": 2.3886, + "step": 9630 + }, + { + "epoch": 0.035958610296695837, + "grad_norm": 1.6877764463424683, + "learning_rate": 0.0006, + "loss": 2.2214, + "step": 9640 + }, + { + "epoch": 0.035995911759659216, + "grad_norm": 24.65630531311035, + "learning_rate": 0.0006, + "loss": 2.1682, + "step": 9650 + }, + { + "epoch": 0.035995911759659216, + "eval_valid_loss": 2.2453835010528564, + "eval_valid_loss/all": 2.1035847663879395, + "eval_valid_loss/end_span": 1.4007318019866943, + "eval_valid_perplexity/batch": 8.195496559143066, + "eval_valid_perplexity/end_span": 4.058168888092041, + "eval_valid_perplexity/fim": 2.1781563758850098, + "eval_valid_perplexity/first_seq": 15.0755033493042, + "eval_valid_perplexity/last_seq": 9.880012512207031, + "eval_valid_perplexity/second_seq": 13.708822250366211, + "eval_valid_perplexity/seq": 9.247340202331543, + "eval_valid_reconstruction/all": 0.27942946553230286, + "eval_valid_reconstruction/end_span": 0.6636845469474792, + "eval_valid_reconstruction/fim": 0.14579373598098755, + "eval_valid_reconstruction/first_seq": 0.16419851779937744, + "eval_valid_reconstruction/last_seq": 0.29832714796066284, + "eval_valid_reconstruction/second_seq": 0.1986415535211563, + "eval_valid_runtime": 433.9339, 
+ "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 9650 + }, + { + "epoch": 0.035995911759659216, + "eval_train_loss": 2.2395756244659424, + "eval_train_loss/all": 2.070389747619629, + "eval_train_loss/end_span": 1.367322564125061, + "eval_train_perplexity/batch": 7.92791223526001, + "eval_train_perplexity/end_span": 3.924828052520752, + "eval_train_perplexity/fim": 2.025921106338501, + "eval_train_perplexity/first_seq": 15.795963287353516, + "eval_train_perplexity/last_seq": 9.705460548400879, + "eval_train_perplexity/second_seq": 14.62612247467041, + "eval_train_perplexity/seq": 9.137495040893555, + "eval_train_reconstruction/all": 0.27032557129859924, + "eval_train_reconstruction/end_span": 0.6771472692489624, + "eval_train_reconstruction/fim": 0.13250719010829926, + "eval_train_reconstruction/first_seq": 0.14820565283298492, + "eval_train_reconstruction/last_seq": 0.2996368110179901, + "eval_train_reconstruction/second_seq": 0.17211273312568665, + "eval_train_runtime": 438.3083, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 9650 + }, + { + "epoch": 0.03603321322262259, + "grad_norm": 0.3829328119754791, + "learning_rate": 0.0006, + "loss": 2.2531, + "step": 9660 + }, + { + "epoch": 0.03607051468558597, + "grad_norm": 0.8179451823234558, + "learning_rate": 0.0006, + "loss": 2.2607, + "step": 9670 + }, + { + "epoch": 0.03610781614854935, + "grad_norm": 0.3766489028930664, + "learning_rate": 0.0006, + "loss": 2.3677, + "step": 9680 + }, + { + "epoch": 0.03614511761151273, + "grad_norm": 0.4772166609764099, + "learning_rate": 0.0006, + "loss": 2.1164, + "step": 9690 + }, + { + "epoch": 0.0361824190744761, + "grad_norm": 0.5250348448753357, + "learning_rate": 0.0006, + "loss": 2.3823, + "step": 9700 + }, + { + "epoch": 0.0361824190744761, + "eval_valid_loss": 2.234534978866577, + "eval_valid_loss/all": 2.093507766723633, + "eval_valid_loss/end_span": 1.2881711721420288, + 
"eval_valid_perplexity/batch": 8.113325119018555, + "eval_valid_perplexity/end_span": 3.6261489391326904, + "eval_valid_perplexity/fim": 2.389601707458496, + "eval_valid_perplexity/first_seq": 15.23676872253418, + "eval_valid_perplexity/last_seq": 9.826929092407227, + "eval_valid_perplexity/second_seq": 14.056109428405762, + "eval_valid_perplexity/seq": 9.152141571044922, + "eval_valid_reconstruction/all": 0.2825137972831726, + "eval_valid_reconstruction/end_span": 0.702551007270813, + "eval_valid_reconstruction/fim": 0.16415002942085266, + "eval_valid_reconstruction/first_seq": 0.1626325249671936, + "eval_valid_reconstruction/last_seq": 0.3024257719516754, + "eval_valid_reconstruction/second_seq": 0.19465257227420807, + "eval_valid_runtime": 441.586, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 9700 + }, + { + "epoch": 0.0361824190744761, + "eval_train_loss": 2.2306039333343506, + "eval_train_loss/all": 2.062084913253784, + "eval_train_loss/end_span": 1.249167561531067, + "eval_train_perplexity/batch": 7.862345218658447, + "eval_train_perplexity/end_span": 3.487438678741455, + "eval_train_perplexity/fim": 2.032740592956543, + "eval_train_perplexity/first_seq": 15.77372932434082, + "eval_train_perplexity/last_seq": 9.468232154846191, + "eval_train_perplexity/second_seq": 13.937004089355469, + "eval_train_perplexity/seq": 9.060038566589355, + "eval_train_reconstruction/all": 0.27319660782814026, + "eval_train_reconstruction/end_span": 0.7138579487800598, + "eval_train_reconstruction/fim": 0.13506464660167694, + "eval_train_reconstruction/first_seq": 0.15100093185901642, + "eval_train_reconstruction/last_seq": 0.30912521481513977, + "eval_train_reconstruction/second_seq": 0.1944839209318161, + "eval_train_runtime": 437.4773, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 9700 + }, + { + "epoch": 0.03621972053743948, + "grad_norm": 0.4029107093811035, + "learning_rate": 0.0006, 
+ "loss": 2.3124, + "step": 9710 + }, + { + "epoch": 0.03625702200040286, + "grad_norm": 0.40300223231315613, + "learning_rate": 0.0006, + "loss": 2.3857, + "step": 9720 + }, + { + "epoch": 0.03629432346336623, + "grad_norm": 0.623062252998352, + "learning_rate": 0.0006, + "loss": 2.1273, + "step": 9730 + }, + { + "epoch": 0.03633162492632961, + "grad_norm": 5.688636779785156, + "learning_rate": 0.0006, + "loss": 2.0408, + "step": 9740 + }, + { + "epoch": 0.03636892638929299, + "grad_norm": 0.40580594539642334, + "learning_rate": 0.0006, + "loss": 2.2761, + "step": 9750 + }, + { + "epoch": 0.03636892638929299, + "eval_valid_loss": 2.230525255203247, + "eval_valid_loss/all": 2.0896518230438232, + "eval_valid_loss/end_span": 1.2756131887435913, + "eval_valid_perplexity/batch": 8.082100868225098, + "eval_valid_perplexity/end_span": 3.5808966159820557, + "eval_valid_perplexity/fim": 2.54815411567688, + "eval_valid_perplexity/first_seq": 15.275473594665527, + "eval_valid_perplexity/last_seq": 9.758148193359375, + "eval_valid_perplexity/second_seq": 13.84870433807373, + "eval_valid_perplexity/seq": 9.11037540435791, + "eval_valid_reconstruction/all": 0.28357651829719543, + "eval_valid_reconstruction/end_span": 0.7028471231460571, + "eval_valid_reconstruction/fim": 0.17877893149852753, + "eval_valid_reconstruction/first_seq": 0.15691739320755005, + "eval_valid_reconstruction/last_seq": 0.2994128167629242, + "eval_valid_reconstruction/second_seq": 0.19387508928775787, + "eval_valid_runtime": 440.6252, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 9750 + }, + { + "epoch": 0.03636892638929299, + "eval_train_loss": 2.2263011932373047, + "eval_train_loss/all": 2.058617353439331, + "eval_train_loss/end_span": 1.2244778871536255, + "eval_train_perplexity/batch": 7.835129261016846, + "eval_train_perplexity/end_span": 3.4023892879486084, + "eval_train_perplexity/fim": 2.2433950901031494, + "eval_train_perplexity/first_seq": 
16.13576889038086, + "eval_train_perplexity/last_seq": 8.811736106872559, + "eval_train_perplexity/second_seq": 14.766172409057617, + "eval_train_perplexity/seq": 9.025357246398926, + "eval_train_reconstruction/all": 0.27417391538619995, + "eval_train_reconstruction/end_span": 0.718726396560669, + "eval_train_reconstruction/fim": 0.15448597073554993, + "eval_train_reconstruction/first_seq": 0.14373956620693207, + "eval_train_reconstruction/last_seq": 0.32965579628944397, + "eval_train_reconstruction/second_seq": 0.17062394320964813, + "eval_train_runtime": 437.9972, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 9750 + }, + { + "epoch": 0.03640622785225636, + "grad_norm": 0.4903717637062073, + "learning_rate": 0.0006, + "loss": 2.329, + "step": 9760 + }, + { + "epoch": 0.03644352931521974, + "grad_norm": 0.6308809518814087, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 9770 + }, + { + "epoch": 0.03648083077818312, + "grad_norm": 0.7618521451950073, + "learning_rate": 0.0006, + "loss": 2.1103, + "step": 9780 + }, + { + "epoch": 0.0365181322411465, + "grad_norm": 0.37255144119262695, + "learning_rate": 0.0006, + "loss": 2.3008, + "step": 9790 + }, + { + "epoch": 0.03655543370410987, + "grad_norm": 0.48169445991516113, + "learning_rate": 0.0006, + "loss": 2.3517, + "step": 9800 + }, + { + "epoch": 0.03655543370410987, + "eval_valid_loss": 2.229982376098633, + "eval_valid_loss/all": 2.08921217918396, + "eval_valid_loss/end_span": 1.2193739414215088, + "eval_valid_perplexity/batch": 8.078548431396484, + "eval_valid_perplexity/end_span": 3.3850677013397217, + "eval_valid_perplexity/fim": 2.3004331588745117, + "eval_valid_perplexity/first_seq": 14.684462547302246, + "eval_valid_perplexity/last_seq": 9.461397171020508, + "eval_valid_perplexity/second_seq": 14.169942855834961, + "eval_valid_perplexity/seq": 9.103353500366211, + "eval_valid_reconstruction/all": 0.28390589356422424, + "eval_valid_reconstruction/end_span": 
0.719799816608429, + "eval_valid_reconstruction/fim": 0.15780189633369446, + "eval_valid_reconstruction/first_seq": 0.17458494007587433, + "eval_valid_reconstruction/last_seq": 0.3128577172756195, + "eval_valid_reconstruction/second_seq": 0.1859976351261139, + "eval_valid_runtime": 439.8133, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 9800 + }, + { + "epoch": 0.03655543370410987, + "eval_train_loss": 2.2257606983184814, + "eval_train_loss/all": 2.057896375656128, + "eval_train_loss/end_span": 1.1786620616912842, + "eval_train_perplexity/batch": 7.829482078552246, + "eval_train_perplexity/end_span": 3.2500228881835938, + "eval_train_perplexity/fim": 2.020098924636841, + "eval_train_perplexity/first_seq": 15.661921501159668, + "eval_train_perplexity/last_seq": 9.548104286193848, + "eval_train_perplexity/second_seq": 14.260743141174316, + "eval_train_perplexity/seq": 9.014659881591797, + "eval_train_reconstruction/all": 0.2745119333267212, + "eval_train_reconstruction/end_span": 0.7339416146278381, + "eval_train_reconstruction/fim": 0.1352272927761078, + "eval_train_reconstruction/first_seq": 0.15231236815452576, + "eval_train_reconstruction/last_seq": 0.3083364963531494, + "eval_train_reconstruction/second_seq": 0.18485449254512787, + "eval_train_runtime": 438.6663, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 9800 + }, + { + "epoch": 0.03659273516707325, + "grad_norm": 0.45223739743232727, + "learning_rate": 0.0006, + "loss": 2.2936, + "step": 9810 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 0.7335954308509827, + "learning_rate": 0.0006, + "loss": 2.2971, + "step": 9820 + }, + { + "epoch": 0.036667338093000004, + "grad_norm": 0.5189562439918518, + "learning_rate": 0.0006, + "loss": 2.2963, + "step": 9830 + }, + { + "epoch": 0.036704639555963384, + "grad_norm": 0.5580240488052368, + "learning_rate": 0.0006, + "loss": 2.2801, + "step": 9840 + }, + { + "epoch": 
0.03674194101892676, + "grad_norm": 0.40440425276756287, + "learning_rate": 0.0006, + "loss": 2.222, + "step": 9850 + }, + { + "epoch": 0.03674194101892676, + "eval_valid_loss": 2.2293832302093506, + "eval_valid_loss/all": 2.088606357574463, + "eval_valid_loss/end_span": 1.2555575370788574, + "eval_valid_perplexity/batch": 8.073655128479004, + "eval_valid_perplexity/end_span": 3.5097947120666504, + "eval_valid_perplexity/fim": 2.239253044128418, + "eval_valid_perplexity/first_seq": 14.869880676269531, + "eval_valid_perplexity/last_seq": 9.005791664123535, + "eval_valid_perplexity/second_seq": 14.132752418518066, + "eval_valid_perplexity/seq": 9.097413063049316, + "eval_valid_reconstruction/all": 0.28418681025505066, + "eval_valid_reconstruction/end_span": 0.7064937949180603, + "eval_valid_reconstruction/fim": 0.1534498780965805, + "eval_valid_reconstruction/first_seq": 0.1669715791940689, + "eval_valid_reconstruction/last_seq": 0.32552552223205566, + "eval_valid_reconstruction/second_seq": 0.1897023767232895, + "eval_valid_runtime": 440.8028, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 9850 + }, + { + "epoch": 0.03674194101892676, + "eval_train_loss": 2.225740432739258, + "eval_train_loss/all": 2.057816505432129, + "eval_train_loss/end_span": 1.2208362817764282, + "eval_train_perplexity/batch": 7.828856945037842, + "eval_train_perplexity/end_span": 3.390021562576294, + "eval_train_perplexity/fim": 2.1694908142089844, + "eval_train_perplexity/first_seq": 15.481517791748047, + "eval_train_perplexity/last_seq": 9.270078659057617, + "eval_train_perplexity/second_seq": 14.708780288696289, + "eval_train_perplexity/seq": 9.015852928161621, + "eval_train_reconstruction/all": 0.27458083629608154, + "eval_train_reconstruction/end_span": 0.7166823744773865, + "eval_train_reconstruction/fim": 0.14775387942790985, + "eval_train_reconstruction/first_seq": 0.15015292167663574, + "eval_train_reconstruction/last_seq": 
0.3149588704109192, + "eval_train_reconstruction/second_seq": 0.17191366851329803, + "eval_train_runtime": 440.7987, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 9850 + }, + { + "epoch": 0.03677924248189014, + "grad_norm": 0.7074773907661438, + "learning_rate": 0.0006, + "loss": 2.0422, + "step": 9860 + }, + { + "epoch": 0.036816543944853515, + "grad_norm": 0.3344417214393616, + "learning_rate": 0.0006, + "loss": 2.092, + "step": 9870 + }, + { + "epoch": 0.036853845407816894, + "grad_norm": 0.4004669189453125, + "learning_rate": 0.0006, + "loss": 2.2885, + "step": 9880 + }, + { + "epoch": 0.036891146870780274, + "grad_norm": 0.6072329878807068, + "learning_rate": 0.0006, + "loss": 2.0132, + "step": 9890 + }, + { + "epoch": 0.036928448333743646, + "grad_norm": 0.30712974071502686, + "learning_rate": 0.0006, + "loss": 2.2493, + "step": 9900 + }, + { + "epoch": 0.036928448333743646, + "eval_valid_loss": 2.2255194187164307, + "eval_valid_loss/all": 2.0850751399993896, + "eval_valid_loss/end_span": 1.2766764163970947, + "eval_valid_perplexity/batch": 8.045195579528809, + "eval_valid_perplexity/end_span": 3.5847058296203613, + "eval_valid_perplexity/fim": 2.292935848236084, + "eval_valid_perplexity/first_seq": 14.843806266784668, + "eval_valid_perplexity/last_seq": 9.618403434753418, + "eval_valid_perplexity/second_seq": 14.136870384216309, + "eval_valid_perplexity/seq": 9.072409629821777, + "eval_valid_reconstruction/all": 0.28486672043800354, + "eval_valid_reconstruction/end_span": 0.7046602368354797, + "eval_valid_reconstruction/fim": 0.15935444831848145, + "eval_valid_reconstruction/first_seq": 0.16825342178344727, + "eval_valid_reconstruction/last_seq": 0.304237425327301, + "eval_valid_reconstruction/second_seq": 0.1885761320590973, + "eval_valid_runtime": 441.8348, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 9900 + }, + { + "epoch": 0.036928448333743646, + 
"eval_train_loss": 2.2205073833465576, + "eval_train_loss/all": 2.053267240524292, + "eval_train_loss/end_span": 1.2444255352020264, + "eval_train_perplexity/batch": 7.7933220863342285, + "eval_train_perplexity/end_span": 3.470940351486206, + "eval_train_perplexity/fim": 2.4889252185821533, + "eval_train_perplexity/first_seq": 15.70843505859375, + "eval_train_perplexity/last_seq": 9.064695358276367, + "eval_train_perplexity/second_seq": 14.473114013671875, + "eval_train_perplexity/seq": 8.979167938232422, + "eval_train_reconstruction/all": 0.2755695581436157, + "eval_train_reconstruction/end_span": 0.716468334197998, + "eval_train_reconstruction/fim": 0.17500096559524536, + "eval_train_reconstruction/first_seq": 0.1513783186674118, + "eval_train_reconstruction/last_seq": 0.3224453628063202, + "eval_train_reconstruction/second_seq": 0.17603854835033417, + "eval_train_runtime": 443.6303, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 9900 + }, + { + "epoch": 0.036965749796707026, + "grad_norm": 0.3676368296146393, + "learning_rate": 0.0006, + "loss": 2.1953, + "step": 9910 + }, + { + "epoch": 0.037003051259670405, + "grad_norm": 0.2758420705795288, + "learning_rate": 0.0006, + "loss": 2.2218, + "step": 9920 + }, + { + "epoch": 0.037040352722633785, + "grad_norm": 0.4100590944290161, + "learning_rate": 0.0006, + "loss": 2.3325, + "step": 9930 + }, + { + "epoch": 0.03707765418559716, + "grad_norm": 0.3185981214046478, + "learning_rate": 0.0006, + "loss": 2.25, + "step": 9940 + }, + { + "epoch": 0.03711495564856054, + "grad_norm": 0.43998128175735474, + "learning_rate": 0.0006, + "loss": 2.2239, + "step": 9950 + }, + { + "epoch": 0.03711495564856054, + "eval_valid_loss": 2.2231714725494385, + "eval_valid_loss/all": 2.083064317703247, + "eval_valid_loss/end_span": 1.2342498302459717, + "eval_valid_perplexity/batch": 8.029034614562988, + "eval_valid_perplexity/end_span": 3.435800075531006, + "eval_valid_perplexity/fim": 
2.7332334518432617, + "eval_valid_perplexity/first_seq": 14.494423866271973, + "eval_valid_perplexity/last_seq": 8.866095542907715, + "eval_valid_perplexity/second_seq": 13.816816329956055, + "eval_valid_perplexity/seq": 9.0473051071167, + "eval_valid_reconstruction/all": 0.2860480844974518, + "eval_valid_reconstruction/end_span": 0.7214062213897705, + "eval_valid_reconstruction/fim": 0.1935451328754425, + "eval_valid_reconstruction/first_seq": 0.17665743827819824, + "eval_valid_reconstruction/last_seq": 0.3289623260498047, + "eval_valid_reconstruction/second_seq": 0.1957724541425705, + "eval_valid_runtime": 439.4938, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 9950 + }, + { + "epoch": 0.03711495564856054, + "eval_train_loss": 2.2187745571136475, + "eval_train_loss/all": 2.051208972930908, + "eval_train_loss/end_span": 1.1922962665557861, + "eval_train_perplexity/batch": 7.7772979736328125, + "eval_train_perplexity/end_span": 3.29463791847229, + "eval_train_perplexity/fim": 1.9657719135284424, + "eval_train_perplexity/first_seq": 15.670658111572266, + "eval_train_perplexity/last_seq": 9.671412467956543, + "eval_train_perplexity/second_seq": 14.18537712097168, + "eval_train_perplexity/seq": 8.949352264404297, + "eval_train_reconstruction/all": 0.2766723334789276, + "eval_train_reconstruction/end_span": 0.7344506978988647, + "eval_train_reconstruction/fim": 0.13069461286067963, + "eval_train_reconstruction/first_seq": 0.15027903020381927, + "eval_train_reconstruction/last_seq": 0.29975342750549316, + "eval_train_reconstruction/second_seq": 0.18787306547164917, + "eval_train_runtime": 439.3141, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 9950 + }, + { + "epoch": 0.037152257111523916, + "grad_norm": 0.3626756966114044, + "learning_rate": 0.0006, + "loss": 2.2479, + "step": 9960 + }, + { + "epoch": 0.03718955857448729, + "grad_norm": 0.33658289909362793, + "learning_rate": 
0.0006, + "loss": 2.2534, + "step": 9970 + }, + { + "epoch": 0.03722686003745067, + "grad_norm": 0.3263479769229889, + "learning_rate": 0.0006, + "loss": 2.339, + "step": 9980 + }, + { + "epoch": 0.03726416150041405, + "grad_norm": 0.4103391766548157, + "learning_rate": 0.0006, + "loss": 2.2432, + "step": 9990 + }, + { + "epoch": 0.03730146296337743, + "grad_norm": 0.4006844460964203, + "learning_rate": 0.0006, + "loss": 2.2329, + "step": 10000 + }, + { + "epoch": 0.03730146296337743, + "eval_valid_loss": 2.2260794639587402, + "eval_valid_loss/all": 2.085944890975952, + "eval_valid_loss/end_span": 1.3774303197860718, + "eval_valid_perplexity/batch": 8.052196502685547, + "eval_valid_perplexity/end_span": 3.96470046043396, + "eval_valid_perplexity/fim": 2.4473166465759277, + "eval_valid_perplexity/first_seq": 14.56863784790039, + "eval_valid_perplexity/last_seq": 8.79513168334961, + "eval_valid_perplexity/second_seq": 13.34836196899414, + "eval_valid_perplexity/seq": 9.080236434936523, + "eval_valid_reconstruction/all": 0.28497689962387085, + "eval_valid_reconstruction/end_span": 0.6732320189476013, + "eval_valid_reconstruction/fim": 0.17078958451747894, + "eval_valid_reconstruction/first_seq": 0.17081685364246368, + "eval_valid_reconstruction/last_seq": 0.3367895483970642, + "eval_valid_reconstruction/second_seq": 0.20610515773296356, + "eval_valid_runtime": 441.5914, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 10000 + }, + { + "epoch": 0.03730146296337743, + "eval_train_loss": 2.222076892852783, + "eval_train_loss/all": 2.0547823905944824, + "eval_train_loss/end_span": 1.3395143747329712, + "eval_train_perplexity/batch": 7.805139064788818, + "eval_train_perplexity/end_span": 3.8171892166137695, + "eval_train_perplexity/fim": 2.157956838607788, + "eval_train_perplexity/first_seq": 15.39345645904541, + "eval_train_perplexity/last_seq": 9.411904335021973, + "eval_train_perplexity/second_seq": 14.187342643737793, + 
"eval_train_perplexity/seq": 8.991207122802734, + "eval_train_reconstruction/all": 0.27539506554603577, + "eval_train_reconstruction/end_span": 0.6846321225166321, + "eval_train_reconstruction/fim": 0.14751572906970978, + "eval_train_reconstruction/first_seq": 0.15420134365558624, + "eval_train_reconstruction/last_seq": 0.30987897515296936, + "eval_train_reconstruction/second_seq": 0.1846802830696106, + "eval_train_runtime": 439.797, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 10000 + }, + { + "epoch": 0.0373387644263408, + "grad_norm": 0.32284700870513916, + "learning_rate": 0.0006, + "loss": 2.3785, + "step": 10010 + }, + { + "epoch": 0.03737606588930418, + "grad_norm": 0.526276171207428, + "learning_rate": 0.0006, + "loss": 2.3108, + "step": 10020 + }, + { + "epoch": 0.03741336735226756, + "grad_norm": 0.3721446096897125, + "learning_rate": 0.0006, + "loss": 2.4144, + "step": 10030 + }, + { + "epoch": 0.03745066881523093, + "grad_norm": 0.2934119701385498, + "learning_rate": 0.0006, + "loss": 2.2577, + "step": 10040 + }, + { + "epoch": 0.03748797027819431, + "grad_norm": 0.2606014311313629, + "learning_rate": 0.0006, + "loss": 2.2847, + "step": 10050 + }, + { + "epoch": 0.03748797027819431, + "eval_valid_loss": 2.2199482917785645, + "eval_valid_loss/all": 2.0799214839935303, + "eval_valid_loss/end_span": 1.2525837421417236, + "eval_valid_perplexity/batch": 8.003840446472168, + "eval_valid_perplexity/end_span": 3.499372720718384, + "eval_valid_perplexity/fim": 2.298851490020752, + "eval_valid_perplexity/first_seq": 15.02573299407959, + "eval_valid_perplexity/last_seq": 9.172614097595215, + "eval_valid_perplexity/second_seq": 13.571427345275879, + "eval_valid_perplexity/seq": 9.019379615783691, + "eval_valid_reconstruction/all": 0.2868565320968628, + "eval_valid_reconstruction/end_span": 0.7075344920158386, + "eval_valid_reconstruction/fim": 0.16111783683300018, + "eval_valid_reconstruction/first_seq": 
0.16336508095264435, + "eval_valid_reconstruction/last_seq": 0.31893712282180786, + "eval_valid_reconstruction/second_seq": 0.20141011476516724, + "eval_valid_runtime": 440.1658, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 10050 + }, + { + "epoch": 0.03748797027819431, + "eval_train_loss": 2.218284845352173, + "eval_train_loss/all": 2.0508956909179688, + "eval_train_loss/end_span": 1.221977710723877, + "eval_train_perplexity/batch": 7.774861812591553, + "eval_train_perplexity/end_span": 3.393893241882324, + "eval_train_perplexity/fim": 2.4558043479919434, + "eval_train_perplexity/first_seq": 15.227129936218262, + "eval_train_perplexity/last_seq": 9.47222900390625, + "eval_train_perplexity/second_seq": 14.299378395080566, + "eval_train_perplexity/seq": 8.95108699798584, + "eval_train_reconstruction/all": 0.27665331959724426, + "eval_train_reconstruction/end_span": 0.716549277305603, + "eval_train_reconstruction/fim": 0.17402803897857666, + "eval_train_reconstruction/first_seq": 0.16146717965602875, + "eval_train_reconstruction/last_seq": 0.3121015727519989, + "eval_train_reconstruction/second_seq": 0.18252019584178925, + "eval_train_runtime": 437.8562, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 10050 + }, + { + "epoch": 0.03752527174115769, + "grad_norm": 0.5163314938545227, + "learning_rate": 0.0006, + "loss": 2.3053, + "step": 10060 + }, + { + "epoch": 0.03756257320412106, + "grad_norm": 0.28915509581565857, + "learning_rate": 0.0006, + "loss": 2.2836, + "step": 10070 + }, + { + "epoch": 0.03759987466708444, + "grad_norm": 0.4770803153514862, + "learning_rate": 0.0006, + "loss": 2.3175, + "step": 10080 + }, + { + "epoch": 0.03763717613004782, + "grad_norm": 0.37647977471351624, + "learning_rate": 0.0006, + "loss": 2.2923, + "step": 10090 + }, + { + "epoch": 0.0376744775930112, + "grad_norm": 0.4997406005859375, + "learning_rate": 0.0006, + "loss": 2.1538, + "step": 
10100 + }, + { + "epoch": 0.0376744775930112, + "eval_valid_loss": 2.222053289413452, + "eval_valid_loss/all": 2.0810134410858154, + "eval_valid_loss/end_span": 1.3212352991104126, + "eval_valid_perplexity/batch": 8.012584686279297, + "eval_valid_perplexity/end_span": 3.7480485439300537, + "eval_valid_perplexity/fim": 2.4686477184295654, + "eval_valid_perplexity/first_seq": 14.95934009552002, + "eval_valid_perplexity/last_seq": 9.48231029510498, + "eval_valid_perplexity/second_seq": 13.633119583129883, + "eval_valid_perplexity/seq": 9.023175239562988, + "eval_valid_reconstruction/all": 0.2862562835216522, + "eval_valid_reconstruction/end_span": 0.6922799348831177, + "eval_valid_reconstruction/fim": 0.1732095330953598, + "eval_valid_reconstruction/first_seq": 0.16578340530395508, + "eval_valid_reconstruction/last_seq": 0.3122941255569458, + "eval_valid_reconstruction/second_seq": 0.20068927109241486, + "eval_valid_runtime": 438.5251, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 10100 + }, + { + "epoch": 0.0376744775930112, + "eval_train_loss": 2.2198567390441895, + "eval_train_loss/all": 2.051974058151245, + "eval_train_loss/end_span": 1.2853260040283203, + "eval_train_perplexity/batch": 7.783250331878662, + "eval_train_perplexity/end_span": 3.615846633911133, + "eval_train_perplexity/fim": 2.0753040313720703, + "eval_train_perplexity/first_seq": 15.276034355163574, + "eval_train_perplexity/last_seq": 9.856529235839844, + "eval_train_perplexity/second_seq": 14.552464485168457, + "eval_train_perplexity/seq": 8.954876899719238, + "eval_train_reconstruction/all": 0.27598774433135986, + "eval_train_reconstruction/end_span": 0.7021946907043457, + "eval_train_reconstruction/fim": 0.13994361460208893, + "eval_train_reconstruction/first_seq": 0.15650902688503265, + "eval_train_reconstruction/last_seq": 0.29924145340919495, + "eval_train_reconstruction/second_seq": 0.17618128657341003, + "eval_train_runtime": 434.864, + 
"eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 10100 + }, + { + "epoch": 0.03771177905597457, + "grad_norm": 0.4215572774410248, + "learning_rate": 0.0006, + "loss": 2.1255, + "step": 10110 + }, + { + "epoch": 0.03774908051893795, + "grad_norm": 0.44912588596343994, + "learning_rate": 0.0006, + "loss": 2.2979, + "step": 10120 + }, + { + "epoch": 0.03778638198190133, + "grad_norm": 0.5368903875350952, + "learning_rate": 0.0006, + "loss": 2.259, + "step": 10130 + }, + { + "epoch": 0.037823683444864704, + "grad_norm": 0.5064523816108704, + "learning_rate": 0.0006, + "loss": 2.3618, + "step": 10140 + }, + { + "epoch": 0.037860984907828084, + "grad_norm": 0.5163928270339966, + "learning_rate": 0.0006, + "loss": 2.1184, + "step": 10150 + }, + { + "epoch": 0.037860984907828084, + "eval_valid_loss": 2.2246463298797607, + "eval_valid_loss/all": 2.0832011699676514, + "eval_valid_loss/end_span": 1.2321802377700806, + "eval_valid_perplexity/batch": 8.030133247375488, + "eval_valid_perplexity/end_span": 3.428696870803833, + "eval_valid_perplexity/fim": 2.209479808807373, + "eval_valid_perplexity/first_seq": 14.744049072265625, + "eval_valid_perplexity/last_seq": 9.255450248718262, + "eval_valid_perplexity/second_seq": 13.735843658447266, + "eval_valid_perplexity/seq": 9.037487030029297, + "eval_valid_reconstruction/all": 0.2862468957901001, + "eval_valid_reconstruction/end_span": 0.7010535001754761, + "eval_valid_reconstruction/fim": 0.15254339575767517, + "eval_valid_reconstruction/first_seq": 0.1697356253862381, + "eval_valid_reconstruction/last_seq": 0.31839558482170105, + "eval_valid_reconstruction/second_seq": 0.2009049355983734, + "eval_valid_runtime": 436.9371, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 10150 + }, + { + "epoch": 0.037860984907828084, + "eval_train_loss": 2.220421075820923, + "eval_train_loss/all": 2.052267074584961, + "eval_train_loss/end_span": 
1.1933298110961914, + "eval_train_perplexity/batch": 7.785531520843506, + "eval_train_perplexity/end_span": 3.2980449199676514, + "eval_train_perplexity/fim": 2.1614134311676025, + "eval_train_perplexity/first_seq": 15.788899421691895, + "eval_train_perplexity/last_seq": 9.037881851196289, + "eval_train_perplexity/second_seq": 14.119428634643555, + "eval_train_perplexity/seq": 8.95644760131836, + "eval_train_reconstruction/all": 0.27678680419921875, + "eval_train_reconstruction/end_span": 0.7134736776351929, + "eval_train_reconstruction/fim": 0.14890466630458832, + "eval_train_reconstruction/first_seq": 0.14799292385578156, + "eval_train_reconstruction/last_seq": 0.3246222138404846, + "eval_train_reconstruction/second_seq": 0.18625646829605103, + "eval_train_runtime": 439.1483, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 10150 + }, + { + "epoch": 0.03789828637079146, + "grad_norm": 0.3924078047275543, + "learning_rate": 0.0006, + "loss": 2.2409, + "step": 10160 + }, + { + "epoch": 0.03793558783375484, + "grad_norm": 0.4943222999572754, + "learning_rate": 0.0006, + "loss": 2.3527, + "step": 10170 + }, + { + "epoch": 0.037972889296718215, + "grad_norm": 0.393647164106369, + "learning_rate": 0.0006, + "loss": 2.2222, + "step": 10180 + }, + { + "epoch": 0.038010190759681595, + "grad_norm": 0.701164186000824, + "learning_rate": 0.0006, + "loss": 2.2249, + "step": 10190 + }, + { + "epoch": 0.038047492222644974, + "grad_norm": 0.5145665407180786, + "learning_rate": 0.0006, + "loss": 2.1842, + "step": 10200 + }, + { + "epoch": 0.038047492222644974, + "eval_valid_loss": 2.2177131175994873, + "eval_valid_loss/all": 2.077821969985962, + "eval_valid_loss/end_span": 1.3447153568267822, + "eval_valid_perplexity/batch": 7.987053871154785, + "eval_valid_perplexity/end_span": 3.8370940685272217, + "eval_valid_perplexity/fim": 2.463463544845581, + "eval_valid_perplexity/first_seq": 14.798381805419922, + 
"eval_valid_perplexity/last_seq": 9.199179649353027, + "eval_valid_perplexity/second_seq": 13.968767166137695, + "eval_valid_perplexity/seq": 8.997425079345703, + "eval_valid_reconstruction/all": 0.2874409556388855, + "eval_valid_reconstruction/end_span": 0.6837655901908875, + "eval_valid_reconstruction/fim": 0.17412731051445007, + "eval_valid_reconstruction/first_seq": 0.17051222920417786, + "eval_valid_reconstruction/last_seq": 0.3205607235431671, + "eval_valid_reconstruction/second_seq": 0.18751560151576996, + "eval_valid_runtime": 438.2175, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 10200 + }, + { + "epoch": 0.038047492222644974, + "eval_train_loss": 2.214303731918335, + "eval_train_loss/all": 2.0473504066467285, + "eval_train_loss/end_span": 1.3136922121047974, + "eval_train_perplexity/batch": 7.7473464012146, + "eval_train_perplexity/end_span": 3.7198829650878906, + "eval_train_perplexity/fim": 2.1328728199005127, + "eval_train_perplexity/first_seq": 15.385637283325195, + "eval_train_perplexity/last_seq": 9.450027465820312, + "eval_train_perplexity/second_seq": 14.660710334777832, + "eval_train_perplexity/seq": 8.919060707092285, + "eval_train_reconstruction/all": 0.27765828371047974, + "eval_train_reconstruction/end_span": 0.6928001046180725, + "eval_train_reconstruction/fim": 0.1475876420736313, + "eval_train_reconstruction/first_seq": 0.15169037878513336, + "eval_train_reconstruction/last_seq": 0.30927079916000366, + "eval_train_reconstruction/second_seq": 0.17315973341464996, + "eval_train_runtime": 439.8787, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 10200 + }, + { + "epoch": 0.03808479368560835, + "grad_norm": 0.28357014060020447, + "learning_rate": 0.0006, + "loss": 2.3031, + "step": 10210 + }, + { + "epoch": 0.038122095148571726, + "grad_norm": 0.41614529490470886, + "learning_rate": 0.0006, + "loss": 2.1743, + "step": 10220 + }, + { + "epoch": 
0.038159396611535105, + "grad_norm": 0.44281449913978577, + "learning_rate": 0.0006, + "loss": 2.1595, + "step": 10230 + }, + { + "epoch": 0.038196698074498485, + "grad_norm": 0.28738316893577576, + "learning_rate": 0.0006, + "loss": 2.2537, + "step": 10240 + }, + { + "epoch": 0.03823399953746186, + "grad_norm": 0.5112762451171875, + "learning_rate": 0.0006, + "loss": 2.1831, + "step": 10250 + }, + { + "epoch": 0.03823399953746186, + "eval_valid_loss": 2.2187252044677734, + "eval_valid_loss/all": 2.0790560245513916, + "eval_valid_loss/end_span": 1.340148687362671, + "eval_valid_perplexity/batch": 7.9969162940979, + "eval_valid_perplexity/end_span": 3.8196113109588623, + "eval_valid_perplexity/fim": 2.2761409282684326, + "eval_valid_perplexity/first_seq": 14.933639526367188, + "eval_valid_perplexity/last_seq": 8.573948860168457, + "eval_valid_perplexity/second_seq": 14.151445388793945, + "eval_valid_perplexity/seq": 9.016963005065918, + "eval_valid_reconstruction/all": 0.28707337379455566, + "eval_valid_reconstruction/end_span": 0.6863319277763367, + "eval_valid_reconstruction/fim": 0.15928608179092407, + "eval_valid_reconstruction/first_seq": 0.1691119521856308, + "eval_valid_reconstruction/last_seq": 0.3413967490196228, + "eval_valid_reconstruction/second_seq": 0.1855752021074295, + "eval_valid_runtime": 440.7245, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 10250 + }, + { + "epoch": 0.03823399953746186, + "eval_train_loss": 2.215214729309082, + "eval_train_loss/all": 2.048661708831787, + "eval_train_loss/end_span": 1.300461769104004, + "eval_train_perplexity/batch": 7.75751256942749, + "eval_train_perplexity/end_span": 3.6709914207458496, + "eval_train_perplexity/fim": 2.1868064403533936, + "eval_train_perplexity/first_seq": 15.37048053741455, + "eval_train_perplexity/last_seq": 9.329129219055176, + "eval_train_perplexity/second_seq": 13.937602043151855, + "eval_train_perplexity/seq": 8.935932159423828, + 
"eval_train_reconstruction/all": 0.2772330641746521, + "eval_train_reconstruction/end_span": 0.697318434715271, + "eval_train_reconstruction/fim": 0.15150907635688782, + "eval_train_reconstruction/first_seq": 0.15568728744983673, + "eval_train_reconstruction/last_seq": 0.31490546464920044, + "eval_train_reconstruction/second_seq": 0.1928306519985199, + "eval_train_runtime": 438.7025, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 10250 + }, + { + "epoch": 0.03827130100042524, + "grad_norm": 0.5067139267921448, + "learning_rate": 0.0006, + "loss": 2.2016, + "step": 10260 + }, + { + "epoch": 0.038308602463388616, + "grad_norm": 0.3765329122543335, + "learning_rate": 0.0006, + "loss": 2.2989, + "step": 10270 + }, + { + "epoch": 0.03834590392635199, + "grad_norm": 0.4014286994934082, + "learning_rate": 0.0006, + "loss": 2.3482, + "step": 10280 + }, + { + "epoch": 0.03838320538931537, + "grad_norm": 0.3947312533855438, + "learning_rate": 0.0006, + "loss": 2.2816, + "step": 10290 + }, + { + "epoch": 0.03842050685227875, + "grad_norm": 0.2860790491104126, + "learning_rate": 0.0006, + "loss": 2.2686, + "step": 10300 + }, + { + "epoch": 0.03842050685227875, + "eval_valid_loss": 2.221849203109741, + "eval_valid_loss/all": 2.0813403129577637, + "eval_valid_loss/end_span": 1.267986536026001, + "eval_valid_perplexity/batch": 8.015204429626465, + "eval_valid_perplexity/end_span": 3.553690195083618, + "eval_valid_perplexity/fim": 2.2640247344970703, + "eval_valid_perplexity/first_seq": 14.822672843933105, + "eval_valid_perplexity/last_seq": 9.058276176452637, + "eval_valid_perplexity/second_seq": 13.69229793548584, + "eval_valid_perplexity/seq": 9.027206420898438, + "eval_valid_reconstruction/all": 0.2863800525665283, + "eval_valid_reconstruction/end_span": 0.7042117714881897, + "eval_valid_reconstruction/fim": 0.15685446560382843, + "eval_valid_reconstruction/first_seq": 0.16901883482933044, + "eval_valid_reconstruction/last_seq": 
0.3187856674194336, + "eval_valid_reconstruction/second_seq": 0.1983402669429779, + "eval_valid_runtime": 435.2302, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 10300 + }, + { + "epoch": 0.03842050685227875, + "eval_train_loss": 2.2187705039978027, + "eval_train_loss/all": 2.0508012771606445, + "eval_train_loss/end_span": 1.214084506034851, + "eval_train_perplexity/batch": 7.774127960205078, + "eval_train_perplexity/end_span": 3.3672099113464355, + "eval_train_perplexity/fim": 1.9720350503921509, + "eval_train_perplexity/first_seq": 15.667990684509277, + "eval_train_perplexity/last_seq": 9.488622665405273, + "eval_train_perplexity/second_seq": 14.53754997253418, + "eval_train_perplexity/seq": 8.941712379455566, + "eval_train_reconstruction/all": 0.27678725123405457, + "eval_train_reconstruction/end_span": 0.718757688999176, + "eval_train_reconstruction/fim": 0.13165292143821716, + "eval_train_reconstruction/first_seq": 0.1467314213514328, + "eval_train_reconstruction/last_seq": 0.30514752864837646, + "eval_train_reconstruction/second_seq": 0.17638269066810608, + "eval_train_runtime": 436.986, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 10300 + }, + { + "epoch": 0.03845780831524213, + "grad_norm": 1.2233446836471558, + "learning_rate": 0.0006, + "loss": 2.4039, + "step": 10310 + }, + { + "epoch": 0.0384951097782055, + "grad_norm": 0.33476465940475464, + "learning_rate": 0.0006, + "loss": 2.2993, + "step": 10320 + }, + { + "epoch": 0.03853241124116888, + "grad_norm": 0.4084964096546173, + "learning_rate": 0.0006, + "loss": 2.2613, + "step": 10330 + }, + { + "epoch": 0.03856971270413226, + "grad_norm": 0.4019353985786438, + "learning_rate": 0.0006, + "loss": 2.2374, + "step": 10340 + }, + { + "epoch": 0.03860701416709563, + "grad_norm": 0.3497420847415924, + "learning_rate": 0.0006, + "loss": 2.3073, + "step": 10350 + }, + { + "epoch": 0.03860701416709563, + 
"eval_valid_loss": 2.2161951065063477, + "eval_valid_loss/all": 2.07645320892334, + "eval_valid_loss/end_span": 1.31941556930542, + "eval_valid_perplexity/batch": 7.976129055023193, + "eval_valid_perplexity/end_span": 3.741234302520752, + "eval_valid_perplexity/fim": 2.698162794113159, + "eval_valid_perplexity/first_seq": 15.038534164428711, + "eval_valid_perplexity/last_seq": 9.124238014221191, + "eval_valid_perplexity/second_seq": 13.732110023498535, + "eval_valid_perplexity/seq": 8.983841896057129, + "eval_valid_reconstruction/all": 0.2873912751674652, + "eval_valid_reconstruction/end_span": 0.7034971714019775, + "eval_valid_reconstruction/fim": 0.19233034551143646, + "eval_valid_reconstruction/first_seq": 0.16320489346981049, + "eval_valid_reconstruction/last_seq": 0.32163259387016296, + "eval_valid_reconstruction/second_seq": 0.19858431816101074, + "eval_valid_runtime": 436.9498, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 10350 + }, + { + "epoch": 0.03860701416709563, + "eval_train_loss": 2.213923215866089, + "eval_train_loss/all": 2.0469236373901367, + "eval_train_loss/end_span": 1.2744213342666626, + "eval_train_perplexity/batch": 7.7440409660339355, + "eval_train_perplexity/end_span": 3.5766310691833496, + "eval_train_perplexity/fim": 2.1983959674835205, + "eval_train_perplexity/first_seq": 15.499725341796875, + "eval_train_perplexity/last_seq": 9.428593635559082, + "eval_train_perplexity/second_seq": 14.27139949798584, + "eval_train_perplexity/seq": 8.912152290344238, + "eval_train_reconstruction/all": 0.2773873805999756, + "eval_train_reconstruction/end_span": 0.7154496908187866, + "eval_train_reconstruction/fim": 0.15236659348011017, + "eval_train_reconstruction/first_seq": 0.15065480768680573, + "eval_train_reconstruction/last_seq": 0.31153836846351624, + "eval_train_reconstruction/second_seq": 0.1854580044746399, + "eval_train_runtime": 437.9879, + "eval_train_samples_per_second": 0.438, + 
"eval_train_steps_per_second": 0.438, + "step": 10350 + }, + { + "epoch": 0.03864431563005901, + "grad_norm": 0.4419501721858978, + "learning_rate": 0.0006, + "loss": 2.1896, + "step": 10360 + }, + { + "epoch": 0.03868161709302239, + "grad_norm": 0.45138421654701233, + "learning_rate": 0.0006, + "loss": 2.2423, + "step": 10370 + }, + { + "epoch": 0.03871891855598576, + "grad_norm": 0.3743667006492615, + "learning_rate": 0.0006, + "loss": 2.2928, + "step": 10380 + }, + { + "epoch": 0.03875622001894914, + "grad_norm": 0.4062157869338989, + "learning_rate": 0.0006, + "loss": 2.2324, + "step": 10390 + }, + { + "epoch": 0.03879352148191252, + "grad_norm": 0.4648546874523163, + "learning_rate": 0.0006, + "loss": 2.1939, + "step": 10400 + }, + { + "epoch": 0.03879352148191252, + "eval_valid_loss": 2.2192060947418213, + "eval_valid_loss/all": 2.078037738800049, + "eval_valid_loss/end_span": 1.2575994729995728, + "eval_valid_perplexity/batch": 7.9887776374816895, + "eval_valid_perplexity/end_span": 3.5169687271118164, + "eval_valid_perplexity/fim": 2.4556405544281006, + "eval_valid_perplexity/first_seq": 14.785809516906738, + "eval_valid_perplexity/last_seq": 9.320040702819824, + "eval_valid_perplexity/second_seq": 13.736678123474121, + "eval_valid_perplexity/seq": 8.997135162353516, + "eval_valid_reconstruction/all": 0.28732097148895264, + "eval_valid_reconstruction/end_span": 0.7069845795631409, + "eval_valid_reconstruction/fim": 0.17322464287281036, + "eval_valid_reconstruction/first_seq": 0.16978976130485535, + "eval_valid_reconstruction/last_seq": 0.3168080747127533, + "eval_valid_reconstruction/second_seq": 0.194230318069458, + "eval_valid_runtime": 443.0592, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 10400 + }, + { + "epoch": 0.03879352148191252, + "eval_train_loss": 2.21453595161438, + "eval_train_loss/all": 2.047356128692627, + "eval_train_loss/end_span": 1.2250710725784302, + "eval_train_perplexity/batch": 
7.7473907470703125, + "eval_train_perplexity/end_span": 3.4044079780578613, + "eval_train_perplexity/fim": 1.9744279384613037, + "eval_train_perplexity/first_seq": 15.4453763961792, + "eval_train_perplexity/last_seq": 9.174087524414062, + "eval_train_perplexity/second_seq": 14.140393257141113, + "eval_train_perplexity/seq": 8.91486930847168, + "eval_train_reconstruction/all": 0.27757954597473145, + "eval_train_reconstruction/end_span": 0.7171574831008911, + "eval_train_reconstruction/fim": 0.13225650787353516, + "eval_train_reconstruction/first_seq": 0.1506856232881546, + "eval_train_reconstruction/last_seq": 0.3200245499610901, + "eval_train_reconstruction/second_seq": 0.18605269491672516, + "eval_train_runtime": 439.5158, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 10400 + }, + { + "epoch": 0.0388308229448759, + "grad_norm": 0.3671760857105255, + "learning_rate": 0.0006, + "loss": 2.3036, + "step": 10410 + }, + { + "epoch": 0.03886812440783927, + "grad_norm": 0.3641047775745392, + "learning_rate": 0.0006, + "loss": 2.1958, + "step": 10420 + }, + { + "epoch": 0.03890542587080265, + "grad_norm": 0.3237457275390625, + "learning_rate": 0.0006, + "loss": 2.2922, + "step": 10430 + }, + { + "epoch": 0.03894272733376603, + "grad_norm": 0.3962446451187134, + "learning_rate": 0.0006, + "loss": 2.2431, + "step": 10440 + }, + { + "epoch": 0.038980028796729405, + "grad_norm": 0.3411063551902771, + "learning_rate": 0.0006, + "loss": 2.3555, + "step": 10450 + }, + { + "epoch": 0.038980028796729405, + "eval_valid_loss": 2.212932825088501, + "eval_valid_loss/all": 2.073396921157837, + "eval_valid_loss/end_span": 1.3413846492767334, + "eval_valid_perplexity/batch": 7.951788902282715, + "eval_valid_perplexity/end_span": 3.8243350982666016, + "eval_valid_perplexity/fim": 2.6268656253814697, + "eval_valid_perplexity/first_seq": 14.968273162841797, + "eval_valid_perplexity/last_seq": 9.521736145019531, + 
"eval_valid_perplexity/second_seq": 13.610960006713867, + "eval_valid_perplexity/seq": 8.957984924316406, + "eval_valid_reconstruction/all": 0.28836342692375183, + "eval_valid_reconstruction/end_span": 0.6891397833824158, + "eval_valid_reconstruction/fim": 0.1888030618429184, + "eval_valid_reconstruction/first_seq": 0.16759078204631805, + "eval_valid_reconstruction/last_seq": 0.3091568350791931, + "eval_valid_reconstruction/second_seq": 0.19892849028110504, + "eval_valid_runtime": 441.8744, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 10450 + }, + { + "epoch": 0.038980028796729405, + "eval_train_loss": 2.21185302734375, + "eval_train_loss/all": 2.0452802181243896, + "eval_train_loss/end_span": 1.30446195602417, + "eval_train_perplexity/batch": 7.731324672698975, + "eval_train_perplexity/end_span": 3.6857054233551025, + "eval_train_perplexity/fim": 2.1771888732910156, + "eval_train_perplexity/first_seq": 15.65198040008545, + "eval_train_perplexity/last_seq": 8.785441398620605, + "eval_train_perplexity/second_seq": 14.319220542907715, + "eval_train_perplexity/seq": 8.899816513061523, + "eval_train_reconstruction/all": 0.2780803143978119, + "eval_train_reconstruction/end_span": 0.6957790851593018, + "eval_train_reconstruction/fim": 0.1519780158996582, + "eval_train_reconstruction/first_seq": 0.15014998614788055, + "eval_train_reconstruction/last_seq": 0.33010217547416687, + "eval_train_reconstruction/second_seq": 0.1821691393852234, + "eval_train_runtime": 440.3353, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 10450 + }, + { + "epoch": 0.039017330259692784, + "grad_norm": 0.2801978588104248, + "learning_rate": 0.0006, + "loss": 2.2844, + "step": 10460 + }, + { + "epoch": 0.03905463172265616, + "grad_norm": 0.45250698924064636, + "learning_rate": 0.0006, + "loss": 2.1623, + "step": 10470 + }, + { + "epoch": 0.03909193318561954, + "grad_norm": 0.34125763177871704, + 
"learning_rate": 0.0006, + "loss": 2.3731, + "step": 10480 + }, + { + "epoch": 0.039129234648582915, + "grad_norm": 0.2789068818092346, + "learning_rate": 0.0006, + "loss": 2.1737, + "step": 10490 + }, + { + "epoch": 0.039166536111546295, + "grad_norm": 0.8543901443481445, + "learning_rate": 0.0006, + "loss": 2.3086, + "step": 10500 + }, + { + "epoch": 0.039166536111546295, + "eval_valid_loss": 2.222409963607788, + "eval_valid_loss/all": 2.0821821689605713, + "eval_valid_loss/end_span": 1.2398217916488647, + "eval_valid_perplexity/batch": 8.021955490112305, + "eval_valid_perplexity/end_span": 3.4549977779388428, + "eval_valid_perplexity/fim": 2.215672492980957, + "eval_valid_perplexity/first_seq": 15.071161270141602, + "eval_valid_perplexity/last_seq": 9.333030700683594, + "eval_valid_perplexity/second_seq": 13.899771690368652, + "eval_valid_perplexity/seq": 9.04394817352295, + "eval_valid_reconstruction/all": 0.286011666059494, + "eval_valid_reconstruction/end_span": 0.7100625038146973, + "eval_valid_reconstruction/fim": 0.15395516157150269, + "eval_valid_reconstruction/first_seq": 0.15936535596847534, + "eval_valid_reconstruction/last_seq": 0.3171016573905945, + "eval_valid_reconstruction/second_seq": 0.192466601729393, + "eval_valid_runtime": 436.7152, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 10500 + }, + { + "epoch": 0.039166536111546295, + "eval_train_loss": 2.2196130752563477, + "eval_train_loss/all": 2.0521552562713623, + "eval_train_loss/end_span": 1.2023499011993408, + "eval_train_perplexity/batch": 7.784660816192627, + "eval_train_perplexity/end_span": 3.327928066253662, + "eval_train_perplexity/fim": 2.34967041015625, + "eval_train_perplexity/first_seq": 15.631235122680664, + "eval_train_perplexity/last_seq": 9.116046905517578, + "eval_train_perplexity/second_seq": 14.453165054321289, + "eval_train_perplexity/seq": 8.962693214416504, + "eval_train_reconstruction/all": 0.27627015113830566, + 
"eval_train_reconstruction/end_span": 0.7206929922103882, + "eval_train_reconstruction/fim": 0.16457873582839966, + "eval_train_reconstruction/first_seq": 0.14850962162017822, + "eval_train_reconstruction/last_seq": 0.3212226629257202, + "eval_train_reconstruction/second_seq": 0.17462091147899628, + "eval_train_runtime": 437.689, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 10500 + }, + { + "epoch": 0.039203837574509674, + "grad_norm": 0.542627215385437, + "learning_rate": 0.0006, + "loss": 2.3109, + "step": 10510 + }, + { + "epoch": 0.03924113903747305, + "grad_norm": 0.4040825068950653, + "learning_rate": 0.0006, + "loss": 2.1203, + "step": 10520 + }, + { + "epoch": 0.039278440500436426, + "grad_norm": 0.3342771828174591, + "learning_rate": 0.0006, + "loss": 2.1028, + "step": 10530 + }, + { + "epoch": 0.039315741963399806, + "grad_norm": 0.3145125210285187, + "learning_rate": 0.0006, + "loss": 2.2633, + "step": 10540 + }, + { + "epoch": 0.039353043426363185, + "grad_norm": 0.4098788797855377, + "learning_rate": 0.0006, + "loss": 2.2512, + "step": 10550 + }, + { + "epoch": 0.039353043426363185, + "eval_valid_loss": 2.218336820602417, + "eval_valid_loss/all": 2.0783274173736572, + "eval_valid_loss/end_span": 1.3004604578018188, + "eval_valid_perplexity/batch": 7.991092205047607, + "eval_valid_perplexity/end_span": 3.6709866523742676, + "eval_valid_perplexity/fim": 2.4293949604034424, + "eval_valid_perplexity/first_seq": 15.106424331665039, + "eval_valid_perplexity/last_seq": 8.967920303344727, + "eval_valid_perplexity/second_seq": 13.96121883392334, + "eval_valid_perplexity/seq": 9.00298023223877, + "eval_valid_reconstruction/all": 0.28701964020729065, + "eval_valid_reconstruction/end_span": 0.7023047208786011, + "eval_valid_reconstruction/fim": 0.17126010358333588, + "eval_valid_reconstruction/first_seq": 0.16146831214427948, + "eval_valid_reconstruction/last_seq": 0.3299212157726288, + 
"eval_valid_reconstruction/second_seq": 0.19079436361789703, + "eval_valid_runtime": 433.643, + "eval_valid_samples_per_second": 0.443, + "eval_valid_steps_per_second": 0.443, + "step": 10550 + }, + { + "epoch": 0.039353043426363185, + "eval_train_loss": 2.216939687728882, + "eval_train_loss/all": 2.0494544506073, + "eval_train_loss/end_span": 1.263033151626587, + "eval_train_perplexity/batch": 7.763664245605469, + "eval_train_perplexity/end_span": 3.536130905151367, + "eval_train_perplexity/fim": 2.1103081703186035, + "eval_train_perplexity/first_seq": 15.72196102142334, + "eval_train_perplexity/last_seq": 9.23574161529541, + "eval_train_perplexity/second_seq": 14.361520767211914, + "eval_train_perplexity/seq": 8.937264442443848, + "eval_train_reconstruction/all": 0.27675288915634155, + "eval_train_reconstruction/end_span": 0.7120328545570374, + "eval_train_reconstruction/fim": 0.14424918591976166, + "eval_train_reconstruction/first_seq": 0.14720268547534943, + "eval_train_reconstruction/last_seq": 0.316317081451416, + "eval_train_reconstruction/second_seq": 0.17712517082691193, + "eval_train_runtime": 435.3218, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 10550 + }, + { + "epoch": 0.03939034488932656, + "grad_norm": 0.2733953297138214, + "learning_rate": 0.0006, + "loss": 2.1188, + "step": 10560 + }, + { + "epoch": 0.03942764635228994, + "grad_norm": 0.3509595990180969, + "learning_rate": 0.0006, + "loss": 2.1635, + "step": 10570 + }, + { + "epoch": 0.039464947815253316, + "grad_norm": 0.5110011100769043, + "learning_rate": 0.0006, + "loss": 2.0572, + "step": 10580 + }, + { + "epoch": 0.03950224927821669, + "grad_norm": 0.44775035977363586, + "learning_rate": 0.0006, + "loss": 2.3483, + "step": 10590 + }, + { + "epoch": 0.03953955074118007, + "grad_norm": 0.39442452788352966, + "learning_rate": 0.0006, + "loss": 2.3165, + "step": 10600 + }, + { + "epoch": 0.03953955074118007, + "eval_valid_loss": 2.217559814453125, + 
"eval_valid_loss/all": 2.078124761581421, + "eval_valid_loss/end_span": 1.2341865301132202, + "eval_valid_perplexity/batch": 7.98947286605835, + "eval_valid_perplexity/end_span": 3.4355826377868652, + "eval_valid_perplexity/fim": 2.4816999435424805, + "eval_valid_perplexity/first_seq": 14.871546745300293, + "eval_valid_perplexity/last_seq": 9.5575590133667, + "eval_valid_perplexity/second_seq": 13.369370460510254, + "eval_valid_perplexity/seq": 9.008673667907715, + "eval_valid_reconstruction/all": 0.28711554408073425, + "eval_valid_reconstruction/end_span": 0.7089093327522278, + "eval_valid_reconstruction/fim": 0.17609524726867676, + "eval_valid_reconstruction/first_seq": 0.17015540599822998, + "eval_valid_reconstruction/last_seq": 0.3058595657348633, + "eval_valid_reconstruction/second_seq": 0.20353081822395325, + "eval_valid_runtime": 437.0587, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 10600 + }, + { + "epoch": 0.03953955074118007, + "eval_train_loss": 2.2152788639068604, + "eval_train_loss/all": 2.048658609390259, + "eval_train_loss/end_span": 1.1989362239837646, + "eval_train_perplexity/batch": 7.757488250732422, + "eval_train_perplexity/end_span": 3.316586971282959, + "eval_train_perplexity/fim": 2.3168249130249023, + "eval_train_perplexity/first_seq": 15.622870445251465, + "eval_train_perplexity/last_seq": 9.364608764648438, + "eval_train_perplexity/second_seq": 14.307668685913086, + "eval_train_perplexity/seq": 8.934948921203613, + "eval_train_reconstruction/all": 0.27717316150665283, + "eval_train_reconstruction/end_span": 0.7186933755874634, + "eval_train_reconstruction/fim": 0.1625223308801651, + "eval_train_reconstruction/first_seq": 0.14900662004947662, + "eval_train_reconstruction/last_seq": 0.31351977586746216, + "eval_train_reconstruction/second_seq": 0.18642185628414154, + "eval_train_runtime": 437.4855, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 10600 
+ }, + { + "epoch": 0.03957685220414345, + "grad_norm": 0.3928414583206177, + "learning_rate": 0.0006, + "loss": 2.2049, + "step": 10610 + }, + { + "epoch": 0.03961415366710683, + "grad_norm": 0.36928436160087585, + "learning_rate": 0.0006, + "loss": 2.1566, + "step": 10620 + }, + { + "epoch": 0.0396514551300702, + "grad_norm": 0.4445672929286957, + "learning_rate": 0.0006, + "loss": 2.3433, + "step": 10630 + }, + { + "epoch": 0.03968875659303358, + "grad_norm": 0.38633203506469727, + "learning_rate": 0.0006, + "loss": 2.2997, + "step": 10640 + }, + { + "epoch": 0.03972605805599696, + "grad_norm": 0.4041095972061157, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 10650 + }, + { + "epoch": 0.03972605805599696, + "eval_valid_loss": 2.2191007137298584, + "eval_valid_loss/all": 2.0794336795806885, + "eval_valid_loss/end_span": 1.2540414333343506, + "eval_valid_perplexity/batch": 7.999937057495117, + "eval_valid_perplexity/end_span": 3.5044775009155273, + "eval_valid_perplexity/fim": 2.4007577896118164, + "eval_valid_perplexity/first_seq": 14.74061107635498, + "eval_valid_perplexity/last_seq": 9.295794486999512, + "eval_valid_perplexity/second_seq": 13.935781478881836, + "eval_valid_perplexity/seq": 9.014182090759277, + "eval_valid_reconstruction/all": 0.2866951525211334, + "eval_valid_reconstruction/end_span": 0.7044673562049866, + "eval_valid_reconstruction/fim": 0.1686001718044281, + "eval_valid_reconstruction/first_seq": 0.16976675391197205, + "eval_valid_reconstruction/last_seq": 0.3168206810951233, + "eval_valid_reconstruction/second_seq": 0.19457529485225677, + "eval_valid_runtime": 436.3658, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 10650 + }, + { + "epoch": 0.03972605805599696, + "eval_train_loss": 2.2172317504882812, + "eval_train_loss/all": 2.050290107727051, + "eval_train_loss/end_span": 1.213322639465332, + "eval_train_perplexity/batch": 7.77015495300293, + "eval_train_perplexity/end_span": 
3.3646457195281982, + "eval_train_perplexity/fim": 2.525331735610962, + "eval_train_perplexity/first_seq": 15.260039329528809, + "eval_train_perplexity/last_seq": 9.41765022277832, + "eval_train_perplexity/second_seq": 14.117486953735352, + "eval_train_perplexity/seq": 8.947708129882812, + "eval_train_reconstruction/all": 0.27667856216430664, + "eval_train_reconstruction/end_span": 0.7169296145439148, + "eval_train_reconstruction/fim": 0.17890901863574982, + "eval_train_reconstruction/first_seq": 0.1582394540309906, + "eval_train_reconstruction/last_seq": 0.3121328353881836, + "eval_train_reconstruction/second_seq": 0.1870896816253662, + "eval_train_runtime": 434.6302, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 10650 + }, + { + "epoch": 0.03976335951896033, + "grad_norm": 0.3187023997306824, + "learning_rate": 0.0006, + "loss": 2.3197, + "step": 10660 + }, + { + "epoch": 0.03980066098192371, + "grad_norm": 0.40843576192855835, + "learning_rate": 0.0006, + "loss": 2.1133, + "step": 10670 + }, + { + "epoch": 0.03983796244488709, + "grad_norm": 0.3515080213546753, + "learning_rate": 0.0006, + "loss": 2.417, + "step": 10680 + }, + { + "epoch": 0.03987526390785046, + "grad_norm": 0.4035349488258362, + "learning_rate": 0.0006, + "loss": 2.1997, + "step": 10690 + }, + { + "epoch": 0.03991256537081384, + "grad_norm": 0.2983191907405853, + "learning_rate": 0.0006, + "loss": 2.3673, + "step": 10700 + }, + { + "epoch": 0.03991256537081384, + "eval_valid_loss": 2.2282590866088867, + "eval_valid_loss/all": 2.085580587387085, + "eval_valid_loss/end_span": 1.2631211280822754, + "eval_valid_perplexity/batch": 8.049263000488281, + "eval_valid_perplexity/end_span": 3.5364420413970947, + "eval_valid_perplexity/fim": 2.3079800605773926, + "eval_valid_perplexity/first_seq": 14.915851593017578, + "eval_valid_perplexity/last_seq": 8.910079002380371, + "eval_valid_perplexity/second_seq": 13.838400840759277, + "eval_valid_perplexity/seq": 
9.061771392822266, + "eval_valid_reconstruction/all": 0.2851901650428772, + "eval_valid_reconstruction/end_span": 0.7090027332305908, + "eval_valid_reconstruction/fim": 0.16190089285373688, + "eval_valid_reconstruction/first_seq": 0.1670731157064438, + "eval_valid_reconstruction/last_seq": 0.332342267036438, + "eval_valid_reconstruction/second_seq": 0.19472217559814453, + "eval_valid_runtime": 435.244, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 10700 + }, + { + "epoch": 0.03991256537081384, + "eval_train_loss": 2.2213008403778076, + "eval_train_loss/all": 2.0525498390197754, + "eval_train_loss/end_span": 1.2247787714004517, + "eval_train_perplexity/batch": 7.78773307800293, + "eval_train_perplexity/end_span": 3.4034130573272705, + "eval_train_perplexity/fim": 2.061828851699829, + "eval_train_perplexity/first_seq": 15.33043098449707, + "eval_train_perplexity/last_seq": 8.976615905761719, + "eval_train_perplexity/second_seq": 14.557517051696777, + "eval_train_perplexity/seq": 8.956358909606934, + "eval_train_reconstruction/all": 0.27619871497154236, + "eval_train_reconstruction/end_span": 0.7236608862876892, + "eval_train_reconstruction/fim": 0.14061447978019714, + "eval_train_reconstruction/first_seq": 0.15324363112449646, + "eval_train_reconstruction/last_seq": 0.32492873072624207, + "eval_train_reconstruction/second_seq": 0.1792726069688797, + "eval_train_runtime": 434.7498, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 10700 + }, + { + "epoch": 0.03994986683377722, + "grad_norm": 0.4069550931453705, + "learning_rate": 0.0006, + "loss": 2.123, + "step": 10710 + }, + { + "epoch": 0.0399871682967406, + "grad_norm": 0.5383780598640442, + "learning_rate": 0.0006, + "loss": 2.3494, + "step": 10720 + }, + { + "epoch": 0.04002446975970397, + "grad_norm": 0.3822064697742462, + "learning_rate": 0.0006, + "loss": 2.2276, + "step": 10730 + }, + { + "epoch": 0.04006177122266735, + 
"grad_norm": 0.319685697555542, + "learning_rate": 0.0006, + "loss": 2.1928, + "step": 10740 + }, + { + "epoch": 0.04009907268563073, + "grad_norm": 0.2783815264701843, + "learning_rate": 0.0006, + "loss": 2.1474, + "step": 10750 + }, + { + "epoch": 0.04009907268563073, + "eval_valid_loss": 2.2189207077026367, + "eval_valid_loss/all": 2.0788776874542236, + "eval_valid_loss/end_span": 1.3706971406936646, + "eval_valid_perplexity/batch": 7.995490550994873, + "eval_valid_perplexity/end_span": 3.9380950927734375, + "eval_valid_perplexity/fim": 2.4408676624298096, + "eval_valid_perplexity/first_seq": 14.876174926757812, + "eval_valid_perplexity/last_seq": 9.235333442687988, + "eval_valid_perplexity/second_seq": 13.673754692077637, + "eval_valid_perplexity/seq": 9.012642860412598, + "eval_valid_reconstruction/all": 0.2870250940322876, + "eval_valid_reconstruction/end_span": 0.6828985810279846, + "eval_valid_reconstruction/fim": 0.1720137596130371, + "eval_valid_reconstruction/first_seq": 0.16775648295879364, + "eval_valid_reconstruction/last_seq": 0.31877121329307556, + "eval_valid_reconstruction/second_seq": 0.196670264005661, + "eval_valid_runtime": 438.0059, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 10750 + }, + { + "epoch": 0.04009907268563073, + "eval_train_loss": 2.2191131114959717, + "eval_train_loss/all": 2.052107334136963, + "eval_train_loss/end_span": 1.3255661725997925, + "eval_train_perplexity/batch": 7.784287929534912, + "eval_train_perplexity/end_span": 3.7643160820007324, + "eval_train_perplexity/fim": 2.0067288875579834, + "eval_train_perplexity/first_seq": 15.800897598266602, + "eval_train_perplexity/last_seq": 9.224618911743164, + "eval_train_perplexity/second_seq": 14.375003814697266, + "eval_train_perplexity/seq": 8.968428611755371, + "eval_train_reconstruction/all": 0.2762526571750641, + "eval_train_reconstruction/end_span": 0.6930804252624512, + "eval_train_reconstruction/fim": 0.13473451137542725, + 
"eval_train_reconstruction/first_seq": 0.1451055407524109, + "eval_train_reconstruction/last_seq": 0.31302395462989807, + "eval_train_reconstruction/second_seq": 0.1799958348274231, + "eval_train_runtime": 434.6946, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 10750 + }, + { + "epoch": 0.040136374148594105, + "grad_norm": 0.3655882477760315, + "learning_rate": 0.0006, + "loss": 2.1759, + "step": 10760 + }, + { + "epoch": 0.040173675611557484, + "grad_norm": 0.3230573534965515, + "learning_rate": 0.0006, + "loss": 2.3413, + "step": 10770 + }, + { + "epoch": 0.040210977074520864, + "grad_norm": 0.682584285736084, + "learning_rate": 0.0006, + "loss": 2.3535, + "step": 10780 + }, + { + "epoch": 0.04024827853748424, + "grad_norm": 0.3421095311641693, + "learning_rate": 0.0006, + "loss": 1.9734, + "step": 10790 + }, + { + "epoch": 0.040285580000447616, + "grad_norm": 0.3368506133556366, + "learning_rate": 0.0006, + "loss": 1.9695, + "step": 10800 + }, + { + "epoch": 0.040285580000447616, + "eval_valid_loss": 2.223522186279297, + "eval_valid_loss/all": 2.083833932876587, + "eval_valid_loss/end_span": 1.3978856801986694, + "eval_valid_perplexity/batch": 8.035216331481934, + "eval_valid_perplexity/end_span": 4.046635150909424, + "eval_valid_perplexity/fim": 2.281499147415161, + "eval_valid_perplexity/first_seq": 15.096375465393066, + "eval_valid_perplexity/last_seq": 9.08193588256836, + "eval_valid_perplexity/second_seq": 13.740346908569336, + "eval_valid_perplexity/seq": 9.056405067443848, + "eval_valid_reconstruction/all": 0.2847721576690674, + "eval_valid_reconstruction/end_span": 0.6767321228981018, + "eval_valid_reconstruction/fim": 0.1587757021188736, + "eval_valid_reconstruction/first_seq": 0.1642773449420929, + "eval_valid_reconstruction/last_seq": 0.3225133717060089, + "eval_valid_reconstruction/second_seq": 0.19679652154445648, + "eval_valid_runtime": 436.1547, + "eval_valid_samples_per_second": 0.44, + 
"eval_valid_steps_per_second": 0.44, + "step": 10800 + }, + { + "epoch": 0.040285580000447616, + "eval_train_loss": 2.220597743988037, + "eval_train_loss/all": 2.0523486137390137, + "eval_train_loss/end_span": 1.3492116928100586, + "eval_train_perplexity/batch": 7.786166191101074, + "eval_train_perplexity/end_span": 3.8543858528137207, + "eval_train_perplexity/fim": 2.09283447265625, + "eval_train_perplexity/first_seq": 15.709966659545898, + "eval_train_perplexity/last_seq": 9.453527450561523, + "eval_train_perplexity/second_seq": 14.744895935058594, + "eval_train_perplexity/seq": 8.956374168395996, + "eval_train_reconstruction/all": 0.2758592665195465, + "eval_train_reconstruction/end_span": 0.6902903914451599, + "eval_train_reconstruction/fim": 0.14271248877048492, + "eval_train_reconstruction/first_seq": 0.1502404808998108, + "eval_train_reconstruction/last_seq": 0.3087535500526428, + "eval_train_reconstruction/second_seq": 0.1718226820230484, + "eval_train_runtime": 435.9019, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 10800 + }, + { + "epoch": 0.040322881463410995, + "grad_norm": 0.44915834069252014, + "learning_rate": 0.0006, + "loss": 2.3049, + "step": 10810 + }, + { + "epoch": 0.040360182926374374, + "grad_norm": 0.45107409358024597, + "learning_rate": 0.0006, + "loss": 2.2993, + "step": 10820 + }, + { + "epoch": 0.04039748438933775, + "grad_norm": 0.4536902904510498, + "learning_rate": 0.0006, + "loss": 2.1372, + "step": 10830 + }, + { + "epoch": 0.040434785852301126, + "grad_norm": 0.3731260895729065, + "learning_rate": 0.0006, + "loss": 2.2768, + "step": 10840 + }, + { + "epoch": 0.040472087315264506, + "grad_norm": 1.04689359664917, + "learning_rate": 0.0006, + "loss": 2.2006, + "step": 10850 + }, + { + "epoch": 0.040472087315264506, + "eval_valid_loss": 2.2177274227142334, + "eval_valid_loss/all": 2.077800989151001, + "eval_valid_loss/end_span": 1.2727274894714355, + "eval_valid_perplexity/batch": 
7.986886501312256, + "eval_valid_perplexity/end_span": 3.570578098297119, + "eval_valid_perplexity/fim": 2.2457425594329834, + "eval_valid_perplexity/first_seq": 14.907756805419922, + "eval_valid_perplexity/last_seq": 8.994742393493652, + "eval_valid_perplexity/second_seq": 13.77404499053955, + "eval_valid_perplexity/seq": 8.999503135681152, + "eval_valid_reconstruction/all": 0.2873460650444031, + "eval_valid_reconstruction/end_span": 0.704488217830658, + "eval_valid_reconstruction/fim": 0.15636663138866425, + "eval_valid_reconstruction/first_seq": 0.16967415809631348, + "eval_valid_reconstruction/last_seq": 0.3256979286670685, + "eval_valid_reconstruction/second_seq": 0.19681456685066223, + "eval_valid_runtime": 436.3745, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 10850 + }, + { + "epoch": 0.040472087315264506, + "eval_train_loss": 2.2182130813598633, + "eval_train_loss/all": 2.0511178970336914, + "eval_train_loss/end_span": 1.2381317615509033, + "eval_train_perplexity/batch": 7.776589870452881, + "eval_train_perplexity/end_span": 3.4491636753082275, + "eval_train_perplexity/fim": 2.103696584701538, + "eval_train_perplexity/first_seq": 15.600020408630371, + "eval_train_perplexity/last_seq": 9.232295989990234, + "eval_train_perplexity/second_seq": 14.229683876037598, + "eval_train_perplexity/seq": 8.954937934875488, + "eval_train_reconstruction/all": 0.2766326665878296, + "eval_train_reconstruction/end_span": 0.7153058648109436, + "eval_train_reconstruction/fim": 0.14456693828105927, + "eval_train_reconstruction/first_seq": 0.1511942446231842, + "eval_train_reconstruction/last_seq": 0.31672507524490356, + "eval_train_reconstruction/second_seq": 0.18209075927734375, + "eval_train_runtime": 437.2743, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 10850 + }, + { + "epoch": 0.040509388778227885, + "grad_norm": 0.2914244830608368, + "learning_rate": 0.0006, + "loss": 2.1646, + 
"step": 10860 + }, + { + "epoch": 0.04054669024119126, + "grad_norm": 0.3572905361652374, + "learning_rate": 0.0006, + "loss": 2.2644, + "step": 10870 + }, + { + "epoch": 0.04058399170415464, + "grad_norm": 0.2916671335697174, + "learning_rate": 0.0006, + "loss": 2.3523, + "step": 10880 + }, + { + "epoch": 0.04062129316711802, + "grad_norm": 0.3039584755897522, + "learning_rate": 0.0006, + "loss": 2.3316, + "step": 10890 + }, + { + "epoch": 0.04065859463008139, + "grad_norm": 0.3473069369792938, + "learning_rate": 0.0006, + "loss": 2.2518, + "step": 10900 + }, + { + "epoch": 0.04065859463008139, + "eval_valid_loss": 2.2164905071258545, + "eval_valid_loss/all": 2.0767436027526855, + "eval_valid_loss/end_span": 1.2195141315460205, + "eval_valid_perplexity/batch": 7.978445529937744, + "eval_valid_perplexity/end_span": 3.385542392730713, + "eval_valid_perplexity/fim": 2.2364001274108887, + "eval_valid_perplexity/first_seq": 14.845385551452637, + "eval_valid_perplexity/last_seq": 9.082204818725586, + "eval_valid_perplexity/second_seq": 13.745733261108398, + "eval_valid_perplexity/seq": 8.987125396728516, + "eval_valid_reconstruction/all": 0.28776776790618896, + "eval_valid_reconstruction/end_span": 0.7141048908233643, + "eval_valid_reconstruction/fim": 0.1553756445646286, + "eval_valid_reconstruction/first_seq": 0.16813461482524872, + "eval_valid_reconstruction/last_seq": 0.3286627233028412, + "eval_valid_reconstruction/second_seq": 0.19676555693149567, + "eval_valid_runtime": 439.5292, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 10900 + }, + { + "epoch": 0.04065859463008139, + "eval_train_loss": 2.2142114639282227, + "eval_train_loss/all": 2.0473477840423584, + "eval_train_loss/end_span": 1.1919243335723877, + "eval_train_perplexity/batch": 7.747326374053955, + "eval_train_perplexity/end_span": 3.293412685394287, + "eval_train_perplexity/fim": 2.1449685096740723, + "eval_train_perplexity/first_seq": 15.822468757629395, + 
"eval_train_perplexity/last_seq": 8.885318756103516, + "eval_train_perplexity/second_seq": 14.358586311340332, + "eval_train_perplexity/seq": 8.920157432556152, + "eval_train_reconstruction/all": 0.2777148485183716, + "eval_train_reconstruction/end_span": 0.7249764800071716, + "eval_train_reconstruction/fim": 0.14877280592918396, + "eval_train_reconstruction/first_seq": 0.14522601664066315, + "eval_train_reconstruction/last_seq": 0.3303799033164978, + "eval_train_reconstruction/second_seq": 0.18039613962173462, + "eval_train_runtime": 436.283, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 10900 + }, + { + "epoch": 0.04069589609304477, + "grad_norm": 0.5308670401573181, + "learning_rate": 0.0006, + "loss": 2.3003, + "step": 10910 + }, + { + "epoch": 0.04073319755600815, + "grad_norm": 0.4959028661251068, + "learning_rate": 0.0006, + "loss": 2.2647, + "step": 10920 + }, + { + "epoch": 0.04077049901897153, + "grad_norm": 0.5087364315986633, + "learning_rate": 0.0006, + "loss": 2.0678, + "step": 10930 + }, + { + "epoch": 0.0408078004819349, + "grad_norm": 0.25619930028915405, + "learning_rate": 0.0006, + "loss": 2.1841, + "step": 10940 + }, + { + "epoch": 0.04084510194489828, + "grad_norm": 0.49177539348602295, + "learning_rate": 0.0006, + "loss": 2.2998, + "step": 10950 + }, + { + "epoch": 0.04084510194489828, + "eval_valid_loss": 2.2167885303497314, + "eval_valid_loss/all": 2.077268600463867, + "eval_valid_loss/end_span": 1.2754970788955688, + "eval_valid_perplexity/batch": 7.982635498046875, + "eval_valid_perplexity/end_span": 3.5804808139801025, + "eval_valid_perplexity/fim": 2.345586061477661, + "eval_valid_perplexity/first_seq": 14.746502876281738, + "eval_valid_perplexity/last_seq": 9.169740676879883, + "eval_valid_perplexity/second_seq": 13.844637870788574, + "eval_valid_perplexity/seq": 8.997748374938965, + "eval_valid_reconstruction/all": 0.28748250007629395, + "eval_valid_reconstruction/end_span": 
0.7052673101425171, + "eval_valid_reconstruction/fim": 0.16549953818321228, + "eval_valid_reconstruction/first_seq": 0.17216506600379944, + "eval_valid_reconstruction/last_seq": 0.32027727365493774, + "eval_valid_reconstruction/second_seq": 0.19503065943717957, + "eval_valid_runtime": 437.7521, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 10950 + }, + { + "epoch": 0.04084510194489828, + "eval_train_loss": 2.2139883041381836, + "eval_train_loss/all": 2.0470499992370605, + "eval_train_loss/end_span": 1.2341645956039429, + "eval_train_perplexity/batch": 7.745019435882568, + "eval_train_perplexity/end_span": 3.435507297515869, + "eval_train_perplexity/fim": 2.363816022872925, + "eval_train_perplexity/first_seq": 15.334872245788574, + "eval_train_perplexity/last_seq": 8.784443855285645, + "eval_train_perplexity/second_seq": 13.887227058410645, + "eval_train_perplexity/seq": 8.9132661819458, + "eval_train_reconstruction/all": 0.2775636613368988, + "eval_train_reconstruction/end_span": 0.7174922227859497, + "eval_train_reconstruction/fim": 0.16709275543689728, + "eval_train_reconstruction/first_seq": 0.15619385242462158, + "eval_train_reconstruction/last_seq": 0.3330250382423401, + "eval_train_reconstruction/second_seq": 0.1902436465024948, + "eval_train_runtime": 442.8215, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 10950 + }, + { + "epoch": 0.04088240340786166, + "grad_norm": 0.37826862931251526, + "learning_rate": 0.0006, + "loss": 2.2762, + "step": 10960 + }, + { + "epoch": 0.04091970487082503, + "grad_norm": 0.3682517111301422, + "learning_rate": 0.0006, + "loss": 2.2737, + "step": 10970 + }, + { + "epoch": 0.04095700633378841, + "grad_norm": 0.36473944783210754, + "learning_rate": 0.0006, + "loss": 2.3379, + "step": 10980 + }, + { + "epoch": 0.04099430779675179, + "grad_norm": 0.3571345806121826, + "learning_rate": 0.0006, + "loss": 2.3646, + "step": 10990 + }, + { + 
"epoch": 0.04103160925971516, + "grad_norm": 0.3394929766654968, + "learning_rate": 0.0006, + "loss": 2.1731, + "step": 11000 + }, + { + "epoch": 0.04103160925971516, + "eval_valid_loss": 2.2121660709381104, + "eval_valid_loss/all": 2.0731287002563477, + "eval_valid_loss/end_span": 1.2969621419906616, + "eval_valid_perplexity/batch": 7.9496564865112305, + "eval_valid_perplexity/end_span": 3.6581668853759766, + "eval_valid_perplexity/fim": 2.448836088180542, + "eval_valid_perplexity/first_seq": 14.848231315612793, + "eval_valid_perplexity/last_seq": 8.902912139892578, + "eval_valid_perplexity/second_seq": 14.102593421936035, + "eval_valid_perplexity/seq": 8.96030044555664, + "eval_valid_reconstruction/all": 0.288625031709671, + "eval_valid_reconstruction/end_span": 0.6992884874343872, + "eval_valid_reconstruction/fim": 0.17443640530109406, + "eval_valid_reconstruction/first_seq": 0.16678035259246826, + "eval_valid_reconstruction/last_seq": 0.32908540964126587, + "eval_valid_reconstruction/second_seq": 0.18869885802268982, + "eval_valid_runtime": 439.1333, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 11000 + }, + { + "epoch": 0.04103160925971516, + "eval_train_loss": 2.211759567260742, + "eval_train_loss/all": 2.0455996990203857, + "eval_train_loss/end_span": 1.257738471031189, + "eval_train_perplexity/batch": 7.733795166015625, + "eval_train_perplexity/end_span": 3.5174577236175537, + "eval_train_perplexity/fim": 2.084280490875244, + "eval_train_perplexity/first_seq": 15.634977340698242, + "eval_train_perplexity/last_seq": 9.064783096313477, + "eval_train_perplexity/second_seq": 14.316659927368164, + "eval_train_perplexity/seq": 8.907862663269043, + "eval_train_reconstruction/all": 0.27788373827934265, + "eval_train_reconstruction/end_span": 0.7127713561058044, + "eval_train_reconstruction/fim": 0.14262141287326813, + "eval_train_reconstruction/first_seq": 0.14996160566806793, + "eval_train_reconstruction/last_seq": 
0.3209982216358185, + "eval_train_reconstruction/second_seq": 0.1809970587491989, + "eval_train_runtime": 441.4239, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 11000 + }, + { + "epoch": 0.04106891072267854, + "grad_norm": 0.432656466960907, + "learning_rate": 0.0006, + "loss": 2.155, + "step": 11010 + }, + { + "epoch": 0.04110621218564192, + "grad_norm": 0.4332851767539978, + "learning_rate": 0.0006, + "loss": 2.2711, + "step": 11020 + }, + { + "epoch": 0.0411435136486053, + "grad_norm": 0.20350037515163422, + "learning_rate": 0.0006, + "loss": 2.4156, + "step": 11030 + }, + { + "epoch": 0.041180815111568674, + "grad_norm": 0.4422769248485565, + "learning_rate": 0.0006, + "loss": 2.2086, + "step": 11040 + }, + { + "epoch": 0.04121811657453205, + "grad_norm": 0.5508260726928711, + "learning_rate": 0.0006, + "loss": 2.3049, + "step": 11050 + }, + { + "epoch": 0.04121811657453205, + "eval_valid_loss": 2.2136640548706055, + "eval_valid_loss/all": 2.0741984844207764, + "eval_valid_loss/end_span": 1.253113031387329, + "eval_valid_perplexity/batch": 7.958165168762207, + "eval_valid_perplexity/end_span": 3.501225471496582, + "eval_valid_perplexity/fim": 2.3737285137176514, + "eval_valid_perplexity/first_seq": 14.770951271057129, + "eval_valid_perplexity/last_seq": 9.112630844116211, + "eval_valid_perplexity/second_seq": 13.457003593444824, + "eval_valid_perplexity/seq": 8.9640474319458, + "eval_valid_reconstruction/all": 0.28854185342788696, + "eval_valid_reconstruction/end_span": 0.7125769257545471, + "eval_valid_reconstruction/fim": 0.1679811179637909, + "eval_valid_reconstruction/first_seq": 0.16879966855049133, + "eval_valid_reconstruction/last_seq": 0.3208836019039154, + "eval_valid_reconstruction/second_seq": 0.20374199748039246, + "eval_valid_runtime": 438.2417, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 11050 + }, + { + "epoch": 0.04121811657453205, + "eval_train_loss": 
2.21441650390625, + "eval_train_loss/all": 2.047522783279419, + "eval_train_loss/end_span": 1.2063125371932983, + "eval_train_perplexity/batch": 7.748682022094727, + "eval_train_perplexity/end_span": 3.34114146232605, + "eval_train_perplexity/fim": 2.046243906021118, + "eval_train_perplexity/first_seq": 15.450882911682129, + "eval_train_perplexity/last_seq": 9.507218360900879, + "eval_train_perplexity/second_seq": 14.330046653747559, + "eval_train_perplexity/seq": 8.917357444763184, + "eval_train_reconstruction/all": 0.27741140127182007, + "eval_train_reconstruction/end_span": 0.72446608543396, + "eval_train_reconstruction/fim": 0.1387166827917099, + "eval_train_reconstruction/first_seq": 0.1534758061170578, + "eval_train_reconstruction/last_seq": 0.3078805208206177, + "eval_train_reconstruction/second_seq": 0.18217115104198456, + "eval_train_runtime": 437.8747, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 11050 + }, + { + "epoch": 0.04125541803749543, + "grad_norm": 0.45991864800453186, + "learning_rate": 0.0006, + "loss": 2.4972, + "step": 11060 + }, + { + "epoch": 0.041292719500458805, + "grad_norm": 0.29867836833000183, + "learning_rate": 0.0006, + "loss": 2.2084, + "step": 11070 + }, + { + "epoch": 0.041330020963422184, + "grad_norm": 0.3857950270175934, + "learning_rate": 0.0006, + "loss": 2.2335, + "step": 11080 + }, + { + "epoch": 0.041367322426385564, + "grad_norm": 0.3804987967014313, + "learning_rate": 0.0006, + "loss": 2.3272, + "step": 11090 + }, + { + "epoch": 0.04140462388934894, + "grad_norm": 0.3924068510532379, + "learning_rate": 0.0006, + "loss": 2.1379, + "step": 11100 + }, + { + "epoch": 0.04140462388934894, + "eval_valid_loss": 2.216885566711426, + "eval_valid_loss/all": 2.0773487091064453, + "eval_valid_loss/end_span": 1.3911691904067993, + "eval_valid_perplexity/batch": 7.983274936676025, + "eval_valid_perplexity/end_span": 4.019546985626221, + "eval_valid_perplexity/fim": 2.449533462524414, + 
"eval_valid_perplexity/first_seq": 14.532081604003906, + "eval_valid_perplexity/last_seq": 9.243464469909668, + "eval_valid_perplexity/second_seq": 13.77292251586914, + "eval_valid_perplexity/seq": 9.002659797668457, + "eval_valid_reconstruction/all": 0.2872016429901123, + "eval_valid_reconstruction/end_span": 0.6637821793556213, + "eval_valid_reconstruction/fim": 0.1736585646867752, + "eval_valid_reconstruction/first_seq": 0.17677079141139984, + "eval_valid_reconstruction/last_seq": 0.31958842277526855, + "eval_valid_reconstruction/second_seq": 0.19478096067905426, + "eval_valid_runtime": 435.408, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 11100 + }, + { + "epoch": 0.04140462388934894, + "eval_train_loss": 2.2160074710845947, + "eval_train_loss/all": 2.0493314266204834, + "eval_train_loss/end_span": 1.3597302436828613, + "eval_train_perplexity/batch": 7.762709617614746, + "eval_train_perplexity/end_span": 3.8951423168182373, + "eval_train_perplexity/fim": 2.1392579078674316, + "eval_train_perplexity/first_seq": 15.858010292053223, + "eval_train_perplexity/last_seq": 8.800400733947754, + "eval_train_perplexity/second_seq": 14.603877067565918, + "eval_train_perplexity/seq": 8.942390441894531, + "eval_train_reconstruction/all": 0.2767683267593384, + "eval_train_reconstruction/end_span": 0.6758331060409546, + "eval_train_reconstruction/fim": 0.14697247743606567, + "eval_train_reconstruction/first_seq": 0.1441321223974228, + "eval_train_reconstruction/last_seq": 0.33191201090812683, + "eval_train_reconstruction/second_seq": 0.1769159883260727, + "eval_train_runtime": 438.9607, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 11100 + }, + { + "epoch": 0.041441925352312316, + "grad_norm": 0.33161213994026184, + "learning_rate": 0.0006, + "loss": 2.2636, + "step": 11110 + }, + { + "epoch": 0.041479226815275695, + "grad_norm": 0.2487470507621765, + "learning_rate": 0.0006, + "loss": 
2.3091, + "step": 11120 + }, + { + "epoch": 0.041516528278239075, + "grad_norm": 0.2305283397436142, + "learning_rate": 0.0006, + "loss": 2.1389, + "step": 11130 + }, + { + "epoch": 0.04155382974120245, + "grad_norm": 0.5547738075256348, + "learning_rate": 0.0006, + "loss": 2.1161, + "step": 11140 + }, + { + "epoch": 0.04159113120416583, + "grad_norm": 0.32129934430122375, + "learning_rate": 0.0006, + "loss": 2.2941, + "step": 11150 + }, + { + "epoch": 0.04159113120416583, + "eval_valid_loss": 2.2139580249786377, + "eval_valid_loss/all": 2.0746982097625732, + "eval_valid_loss/end_span": 1.2814007997512817, + "eval_valid_perplexity/batch": 7.962143421173096, + "eval_valid_perplexity/end_span": 3.6016814708709717, + "eval_valid_perplexity/fim": 2.3935558795928955, + "eval_valid_perplexity/first_seq": 15.140433311462402, + "eval_valid_perplexity/last_seq": 9.55765438079834, + "eval_valid_perplexity/second_seq": 14.092211723327637, + "eval_valid_perplexity/seq": 8.977940559387207, + "eval_valid_reconstruction/all": 0.2882041931152344, + "eval_valid_reconstruction/end_span": 0.7010846138000488, + "eval_valid_reconstruction/fim": 0.1695018708705902, + "eval_valid_reconstruction/first_seq": 0.16213497519493103, + "eval_valid_reconstruction/last_seq": 0.3096252381801605, + "eval_valid_reconstruction/second_seq": 0.18454281985759735, + "eval_valid_runtime": 436.5598, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 11150 + }, + { + "epoch": 0.04159113120416583, + "eval_train_loss": 2.2131259441375732, + "eval_train_loss/all": 2.0465505123138428, + "eval_train_loss/end_span": 1.2447960376739502, + "eval_train_perplexity/batch": 7.741151809692383, + "eval_train_perplexity/end_span": 3.472226619720459, + "eval_train_perplexity/fim": 2.1021037101745605, + "eval_train_perplexity/first_seq": 15.752945899963379, + "eval_train_perplexity/last_seq": 9.262730598449707, + "eval_train_perplexity/second_seq": 14.11569595336914, + 
"eval_train_perplexity/seq": 8.914824485778809, + "eval_train_reconstruction/all": 0.277767539024353, + "eval_train_reconstruction/end_span": 0.7123242616653442, + "eval_train_reconstruction/fim": 0.14509467780590057, + "eval_train_reconstruction/first_seq": 0.1445029228925705, + "eval_train_reconstruction/last_seq": 0.31780949234962463, + "eval_train_reconstruction/second_seq": 0.1873338669538498, + "eval_train_runtime": 442.8585, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 11150 + }, + { + "epoch": 0.041628432667129206, + "grad_norm": 0.3571701943874359, + "learning_rate": 0.0006, + "loss": 2.2886, + "step": 11160 + }, + { + "epoch": 0.041665734130092585, + "grad_norm": 0.28803718090057373, + "learning_rate": 0.0006, + "loss": 2.3241, + "step": 11170 + }, + { + "epoch": 0.04170303559305596, + "grad_norm": 0.33930703997612, + "learning_rate": 0.0006, + "loss": 2.3525, + "step": 11180 + }, + { + "epoch": 0.04174033705601934, + "grad_norm": 0.5475901961326599, + "learning_rate": 0.0006, + "loss": 2.2189, + "step": 11190 + }, + { + "epoch": 0.04177763851898272, + "grad_norm": 0.3038954734802246, + "learning_rate": 0.0006, + "loss": 1.9968, + "step": 11200 + }, + { + "epoch": 0.04177763851898272, + "eval_valid_loss": 2.216698408126831, + "eval_valid_loss/all": 2.077389717102051, + "eval_valid_loss/end_span": 1.1910905838012695, + "eval_valid_perplexity/batch": 7.983602046966553, + "eval_valid_perplexity/end_span": 3.29066801071167, + "eval_valid_perplexity/fim": 2.5410423278808594, + "eval_valid_perplexity/first_seq": 14.532233238220215, + "eval_valid_perplexity/last_seq": 9.097526550292969, + "eval_valid_perplexity/second_seq": 13.862985610961914, + "eval_valid_perplexity/seq": 9.002846717834473, + "eval_valid_reconstruction/all": 0.28719085454940796, + "eval_valid_reconstruction/end_span": 0.717026948928833, + "eval_valid_reconstruction/fim": 0.18218696117401123, + "eval_valid_reconstruction/first_seq": 
0.17329847812652588, + "eval_valid_reconstruction/last_seq": 0.32210707664489746, + "eval_valid_reconstruction/second_seq": 0.19507598876953125, + "eval_valid_runtime": 439.5977, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 11200 + }, + { + "epoch": 0.04177763851898272, + "eval_train_loss": 2.2146987915039062, + "eval_train_loss/all": 2.0480003356933594, + "eval_train_loss/end_span": 1.1570605039596558, + "eval_train_perplexity/batch": 7.752383232116699, + "eval_train_perplexity/end_span": 3.180570363998413, + "eval_train_perplexity/fim": 2.0330257415771484, + "eval_train_perplexity/first_seq": 15.439577102661133, + "eval_train_perplexity/last_seq": 9.227910041809082, + "eval_train_perplexity/second_seq": 14.0864839553833, + "eval_train_perplexity/seq": 8.92527961730957, + "eval_train_reconstruction/all": 0.27727365493774414, + "eval_train_reconstruction/end_span": 0.7307128310203552, + "eval_train_reconstruction/fim": 0.13783325254917145, + "eval_train_reconstruction/first_seq": 0.1548551321029663, + "eval_train_reconstruction/last_seq": 0.3122663199901581, + "eval_train_reconstruction/second_seq": 0.18554894626140594, + "eval_train_runtime": 439.7918, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 11200 + }, + { + "epoch": 0.04181493998194609, + "grad_norm": 0.35268041491508484, + "learning_rate": 0.0006, + "loss": 2.329, + "step": 11210 + }, + { + "epoch": 0.04185224144490947, + "grad_norm": 0.24242432415485382, + "learning_rate": 0.0006, + "loss": 2.2834, + "step": 11220 + }, + { + "epoch": 0.04188954290787285, + "grad_norm": 0.3589034676551819, + "learning_rate": 0.0006, + "loss": 2.3807, + "step": 11230 + }, + { + "epoch": 0.04192684437083623, + "grad_norm": 0.40835312008857727, + "learning_rate": 0.0006, + "loss": 2.0383, + "step": 11240 + }, + { + "epoch": 0.0419641458337996, + "grad_norm": 0.48060178756713867, + "learning_rate": 0.0006, + "loss": 2.3279, + "step": 
11250 + }, + { + "epoch": 0.0419641458337996, + "eval_valid_loss": 2.2132656574249268, + "eval_valid_loss/all": 2.0740232467651367, + "eval_valid_loss/end_span": 1.257102370262146, + "eval_valid_perplexity/batch": 7.956770896911621, + "eval_valid_perplexity/end_span": 3.515220880508423, + "eval_valid_perplexity/fim": 2.493088960647583, + "eval_valid_perplexity/first_seq": 14.73799991607666, + "eval_valid_perplexity/last_seq": 9.044279098510742, + "eval_valid_perplexity/second_seq": 13.948981285095215, + "eval_valid_perplexity/seq": 8.968101501464844, + "eval_valid_reconstruction/all": 0.28839728236198425, + "eval_valid_reconstruction/end_span": 0.703361451625824, + "eval_valid_reconstruction/fim": 0.1788293570280075, + "eval_valid_reconstruction/first_seq": 0.1685882955789566, + "eval_valid_reconstruction/last_seq": 0.3271196484565735, + "eval_valid_reconstruction/second_seq": 0.18768306076526642, + "eval_valid_runtime": 442.1805, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 11250 + }, + { + "epoch": 0.0419641458337996, + "eval_train_loss": 2.2109215259552, + "eval_train_loss/all": 2.0443532466888428, + "eval_train_loss/end_span": 1.223685383796692, + "eval_train_perplexity/batch": 7.724161148071289, + "eval_train_perplexity/end_span": 3.3996939659118652, + "eval_train_perplexity/fim": 1.89543879032135, + "eval_train_perplexity/first_seq": 15.413581848144531, + "eval_train_perplexity/last_seq": 9.35201358795166, + "eval_train_perplexity/second_seq": 14.446196556091309, + "eval_train_perplexity/seq": 8.891281127929688, + "eval_train_reconstruction/all": 0.27845701575279236, + "eval_train_reconstruction/end_span": 0.7146368026733398, + "eval_train_reconstruction/fim": 0.12424223870038986, + "eval_train_reconstruction/first_seq": 0.15345963835716248, + "eval_train_reconstruction/last_seq": 0.3097224831581116, + "eval_train_reconstruction/second_seq": 0.17582501471042633, + "eval_train_runtime": 441.1812, + 
"eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 11250 + }, + { + "epoch": 0.04200144729676298, + "grad_norm": 0.3172040581703186, + "learning_rate": 0.0006, + "loss": 2.2668, + "step": 11260 + }, + { + "epoch": 0.04203874875972636, + "grad_norm": 0.305164635181427, + "learning_rate": 0.0006, + "loss": 2.3536, + "step": 11270 + }, + { + "epoch": 0.04207605022268973, + "grad_norm": 0.6238866448402405, + "learning_rate": 0.0006, + "loss": 2.2199, + "step": 11280 + }, + { + "epoch": 0.04211335168565311, + "grad_norm": 0.42227259278297424, + "learning_rate": 0.0006, + "loss": 2.0217, + "step": 11290 + }, + { + "epoch": 0.04215065314861649, + "grad_norm": 0.5829721689224243, + "learning_rate": 0.0006, + "loss": 2.305, + "step": 11300 + }, + { + "epoch": 0.04215065314861649, + "eval_valid_loss": 2.2151296138763428, + "eval_valid_loss/all": 2.075509548187256, + "eval_valid_loss/end_span": 1.2429254055023193, + "eval_valid_perplexity/batch": 7.968605995178223, + "eval_valid_perplexity/end_span": 3.4657373428344727, + "eval_valid_perplexity/fim": 2.2226667404174805, + "eval_valid_perplexity/first_seq": 14.57115650177002, + "eval_valid_perplexity/last_seq": 9.665151596069336, + "eval_valid_perplexity/second_seq": 13.866827011108398, + "eval_valid_perplexity/seq": 8.98391342163086, + "eval_valid_reconstruction/all": 0.2878977060317993, + "eval_valid_reconstruction/end_span": 0.7160455584526062, + "eval_valid_reconstruction/fim": 0.1554456353187561, + "eval_valid_reconstruction/first_seq": 0.17085452377796173, + "eval_valid_reconstruction/last_seq": 0.3064330220222473, + "eval_valid_reconstruction/second_seq": 0.1933450549840927, + "eval_valid_runtime": 443.3584, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 11300 + }, + { + "epoch": 0.04215065314861649, + "eval_train_loss": 2.2139086723327637, + "eval_train_loss/all": 2.0471065044403076, + "eval_train_loss/end_span": 1.211394190788269, + 
"eval_train_perplexity/batch": 7.745457172393799, + "eval_train_perplexity/end_span": 3.358163356781006, + "eval_train_perplexity/fim": 2.0399346351623535, + "eval_train_perplexity/first_seq": 15.450005531311035, + "eval_train_perplexity/last_seq": 9.104764938354492, + "eval_train_perplexity/second_seq": 14.349455833435059, + "eval_train_perplexity/seq": 8.917035102844238, + "eval_train_reconstruction/all": 0.27760347723960876, + "eval_train_reconstruction/end_span": 0.7291724681854248, + "eval_train_reconstruction/fim": 0.13812556862831116, + "eval_train_reconstruction/first_seq": 0.15223552286624908, + "eval_train_reconstruction/last_seq": 0.3230181932449341, + "eval_train_reconstruction/second_seq": 0.17936304211616516, + "eval_train_runtime": 440.0069, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 11300 + }, + { + "epoch": 0.04218795461157986, + "grad_norm": 0.5212255120277405, + "learning_rate": 0.0006, + "loss": 2.2111, + "step": 11310 + }, + { + "epoch": 0.04222525607454324, + "grad_norm": 0.3632051646709442, + "learning_rate": 0.0006, + "loss": 2.25, + "step": 11320 + }, + { + "epoch": 0.04226255753750662, + "grad_norm": 0.3673204481601715, + "learning_rate": 0.0006, + "loss": 2.2792, + "step": 11330 + }, + { + "epoch": 0.04229985900047, + "grad_norm": 0.4800594747066498, + "learning_rate": 0.0006, + "loss": 2.3136, + "step": 11340 + }, + { + "epoch": 0.042337160463433374, + "grad_norm": 0.28818216919898987, + "learning_rate": 0.0006, + "loss": 2.3429, + "step": 11350 + }, + { + "epoch": 0.042337160463433374, + "eval_valid_loss": 2.2165844440460205, + "eval_valid_loss/all": 2.0765280723571777, + "eval_valid_loss/end_span": 1.2533841133117676, + "eval_valid_perplexity/batch": 7.976726055145264, + "eval_valid_perplexity/end_span": 3.5021746158599854, + "eval_valid_perplexity/fim": 2.126397132873535, + "eval_valid_perplexity/first_seq": 15.039230346679688, + "eval_valid_perplexity/last_seq": 9.0037260055542, + 
"eval_valid_perplexity/second_seq": 14.006478309631348, + "eval_valid_perplexity/seq": 8.985698699951172, + "eval_valid_reconstruction/all": 0.287375807762146, + "eval_valid_reconstruction/end_span": 0.7091257572174072, + "eval_valid_reconstruction/fim": 0.14614713191986084, + "eval_valid_reconstruction/first_seq": 0.16400383412837982, + "eval_valid_reconstruction/last_seq": 0.325344055891037, + "eval_valid_reconstruction/second_seq": 0.19071869552135468, + "eval_valid_runtime": 439.313, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 11350 + }, + { + "epoch": 0.042337160463433374, + "eval_train_loss": 2.2152528762817383, + "eval_train_loss/all": 2.048196792602539, + "eval_train_loss/end_span": 1.2230476140975952, + "eval_train_perplexity/batch": 7.753906726837158, + "eval_train_perplexity/end_span": 3.397526264190674, + "eval_train_perplexity/fim": 2.4733927249908447, + "eval_train_perplexity/first_seq": 15.379369735717773, + "eval_train_perplexity/last_seq": 9.40481185913086, + "eval_train_perplexity/second_seq": 14.440452575683594, + "eval_train_perplexity/seq": 8.921294212341309, + "eval_train_reconstruction/all": 0.27704083919525146, + "eval_train_reconstruction/end_span": 0.7200354933738708, + "eval_train_reconstruction/fim": 0.17593644559383392, + "eval_train_reconstruction/first_seq": 0.15556353330612183, + "eval_train_reconstruction/last_seq": 0.3120805323123932, + "eval_train_reconstruction/second_seq": 0.17936280369758606, + "eval_train_runtime": 436.325, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 11350 + }, + { + "epoch": 0.04237446192639675, + "grad_norm": 0.4179999530315399, + "learning_rate": 0.0006, + "loss": 2.1857, + "step": 11360 + }, + { + "epoch": 0.04241176338936013, + "grad_norm": 0.36656877398490906, + "learning_rate": 0.0006, + "loss": 2.3282, + "step": 11370 + }, + { + "epoch": 0.042449064852323505, + "grad_norm": 0.2737393379211426, + 
"learning_rate": 0.0006, + "loss": 2.3065, + "step": 11380 + }, + { + "epoch": 0.042486366315286885, + "grad_norm": 0.29581156373023987, + "learning_rate": 0.0006, + "loss": 2.3941, + "step": 11390 + }, + { + "epoch": 0.042523667778250264, + "grad_norm": 0.3634241819381714, + "learning_rate": 0.0006, + "loss": 2.0652, + "step": 11400 + }, + { + "epoch": 0.042523667778250264, + "eval_valid_loss": 2.2136948108673096, + "eval_valid_loss/all": 2.0740041732788086, + "eval_valid_loss/end_span": 1.2763453722000122, + "eval_valid_perplexity/batch": 7.9566192626953125, + "eval_valid_perplexity/end_span": 3.583519220352173, + "eval_valid_perplexity/fim": 2.2701292037963867, + "eval_valid_perplexity/first_seq": 14.974433898925781, + "eval_valid_perplexity/last_seq": 9.258634567260742, + "eval_valid_perplexity/second_seq": 13.879925727844238, + "eval_valid_perplexity/seq": 8.968924522399902, + "eval_valid_reconstruction/all": 0.2882853150367737, + "eval_valid_reconstruction/end_span": 0.7008174657821655, + "eval_valid_reconstruction/fim": 0.15861058235168457, + "eval_valid_reconstruction/first_seq": 0.1653282791376114, + "eval_valid_reconstruction/last_seq": 0.31960850954055786, + "eval_valid_reconstruction/second_seq": 0.19666209816932678, + "eval_valid_runtime": 435.0205, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 11400 + }, + { + "epoch": 0.042523667778250264, + "eval_train_loss": 2.2119667530059814, + "eval_train_loss/all": 2.045241117477417, + "eval_train_loss/end_span": 1.2364853620529175, + "eval_train_perplexity/batch": 7.731022357940674, + "eval_train_perplexity/end_span": 3.4434895515441895, + "eval_train_perplexity/fim": 2.137047529220581, + "eval_train_perplexity/first_seq": 15.604083061218262, + "eval_train_perplexity/last_seq": 9.180987358093262, + "eval_train_perplexity/second_seq": 14.195557594299316, + "eval_train_perplexity/seq": 8.895350456237793, + "eval_train_reconstruction/all": 0.27813997864723206, + 
"eval_train_reconstruction/end_span": 0.7129189968109131, + "eval_train_reconstruction/fim": 0.14832545816898346, + "eval_train_reconstruction/first_seq": 0.15045906603336334, + "eval_train_reconstruction/last_seq": 0.31624624133110046, + "eval_train_reconstruction/second_seq": 0.1863194704055786, + "eval_train_runtime": 434.5122, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 11400 + }, + { + "epoch": 0.04256096924121364, + "grad_norm": 0.22253786027431488, + "learning_rate": 0.0006, + "loss": 2.3541, + "step": 11410 + }, + { + "epoch": 0.042598270704177016, + "grad_norm": 0.8500723242759705, + "learning_rate": 0.0006, + "loss": 2.1063, + "step": 11420 + }, + { + "epoch": 0.042635572167140395, + "grad_norm": 0.4510164260864258, + "learning_rate": 0.0006, + "loss": 2.2683, + "step": 11430 + }, + { + "epoch": 0.042672873630103775, + "grad_norm": 0.2677662670612335, + "learning_rate": 0.0006, + "loss": 2.2362, + "step": 11440 + }, + { + "epoch": 0.04271017509306715, + "grad_norm": 0.3809254765510559, + "learning_rate": 0.0006, + "loss": 2.1288, + "step": 11450 + }, + { + "epoch": 0.04271017509306715, + "eval_valid_loss": 2.2133986949920654, + "eval_valid_loss/all": 2.073709726333618, + "eval_valid_loss/end_span": 1.4088494777679443, + "eval_valid_perplexity/batch": 7.9542765617370605, + "eval_valid_perplexity/end_span": 4.091245651245117, + "eval_valid_perplexity/fim": 2.0579965114593506, + "eval_valid_perplexity/first_seq": 14.44345474243164, + "eval_valid_perplexity/last_seq": 9.106402397155762, + "eval_valid_perplexity/second_seq": 13.71292495727539, + "eval_valid_perplexity/seq": 8.959639549255371, + "eval_valid_reconstruction/all": 0.2884698510169983, + "eval_valid_reconstruction/end_span": 0.6660495400428772, + "eval_valid_reconstruction/fim": 0.14046154916286469, + "eval_valid_reconstruction/first_seq": 0.1743244230747223, + "eval_valid_reconstruction/last_seq": 0.3227086663246155, + 
"eval_valid_reconstruction/second_seq": 0.19417275488376617, + "eval_valid_runtime": 434.5434, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 11450 + }, + { + "epoch": 0.04271017509306715, + "eval_train_loss": 2.212627649307251, + "eval_train_loss/all": 2.0454719066619873, + "eval_train_loss/end_span": 1.366927981376648, + "eval_train_perplexity/batch": 7.73280668258667, + "eval_train_perplexity/end_span": 3.9232797622680664, + "eval_train_perplexity/fim": 2.2998170852661133, + "eval_train_perplexity/first_seq": 15.46430492401123, + "eval_train_perplexity/last_seq": 9.264090538024902, + "eval_train_perplexity/second_seq": 14.462462425231934, + "eval_train_perplexity/seq": 8.896740913391113, + "eval_train_reconstruction/all": 0.27820950746536255, + "eval_train_reconstruction/end_span": 0.6784636974334717, + "eval_train_reconstruction/fim": 0.1616477221250534, + "eval_train_reconstruction/first_seq": 0.15141834318637848, + "eval_train_reconstruction/last_seq": 0.31575462222099304, + "eval_train_reconstruction/second_seq": 0.17845740914344788, + "eval_train_runtime": 435.4974, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 11450 + }, + { + "epoch": 0.04274747655603053, + "grad_norm": 0.4401298761367798, + "learning_rate": 0.0006, + "loss": 2.3024, + "step": 11460 + }, + { + "epoch": 0.042784778018993906, + "grad_norm": 0.5881706476211548, + "learning_rate": 0.0006, + "loss": 2.2533, + "step": 11470 + }, + { + "epoch": 0.042822079481957286, + "grad_norm": 1.0685759782791138, + "learning_rate": 0.0006, + "loss": 2.3233, + "step": 11480 + }, + { + "epoch": 0.04285938094492066, + "grad_norm": 0.297518789768219, + "learning_rate": 0.0006, + "loss": 2.2226, + "step": 11490 + }, + { + "epoch": 0.04289668240788404, + "grad_norm": 0.3843104839324951, + "learning_rate": 0.0006, + "loss": 2.2448, + "step": 11500 + }, + { + "epoch": 0.04289668240788404, + "eval_valid_loss": 
2.212491750717163, + "eval_valid_loss/all": 2.072793483734131, + "eval_valid_loss/end_span": 1.35763680934906, + "eval_valid_perplexity/batch": 7.946991920471191, + "eval_valid_perplexity/end_span": 3.8869967460632324, + "eval_valid_perplexity/fim": 2.6321980953216553, + "eval_valid_perplexity/first_seq": 15.055646896362305, + "eval_valid_perplexity/last_seq": 9.546467781066895, + "eval_valid_perplexity/second_seq": 13.626256942749023, + "eval_valid_perplexity/seq": 8.957236289978027, + "eval_valid_reconstruction/all": 0.28881826996803284, + "eval_valid_reconstruction/end_span": 0.6806825399398804, + "eval_valid_reconstruction/fim": 0.1881900131702423, + "eval_valid_reconstruction/first_seq": 0.16299760341644287, + "eval_valid_reconstruction/last_seq": 0.3076598644256592, + "eval_valid_reconstruction/second_seq": 0.1994556039571762, + "eval_valid_runtime": 434.2727, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 11500 + }, + { + "epoch": 0.04289668240788404, + "eval_train_loss": 2.2087113857269287, + "eval_train_loss/all": 2.04236102104187, + "eval_train_loss/end_span": 1.3203696012496948, + "eval_train_perplexity/batch": 7.7087883949279785, + "eval_train_perplexity/end_span": 3.744805097579956, + "eval_train_perplexity/fim": 2.1593637466430664, + "eval_train_perplexity/first_seq": 15.59399700164795, + "eval_train_perplexity/last_seq": 8.84363079071045, + "eval_train_perplexity/second_seq": 14.072821617126465, + "eval_train_perplexity/seq": 8.870293617248535, + "eval_train_reconstruction/all": 0.2792717516422272, + "eval_train_reconstruction/end_span": 0.6920132040977478, + "eval_train_reconstruction/fim": 0.15075714886188507, + "eval_train_reconstruction/first_seq": 0.15153814852237701, + "eval_train_reconstruction/last_seq": 0.3309516906738281, + "eval_train_reconstruction/second_seq": 0.18660478293895721, + "eval_train_runtime": 434.7914, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 
0.442, + "step": 11500 + }, + { + "epoch": 0.04293398387084742, + "grad_norm": 0.34705299139022827, + "learning_rate": 0.0006, + "loss": 2.1625, + "step": 11510 + }, + { + "epoch": 0.04297128533381079, + "grad_norm": 0.31218522787094116, + "learning_rate": 0.0006, + "loss": 2.2754, + "step": 11520 + }, + { + "epoch": 0.04300858679677417, + "grad_norm": 0.43104878067970276, + "learning_rate": 0.0006, + "loss": 2.295, + "step": 11530 + }, + { + "epoch": 0.04304588825973755, + "grad_norm": 0.5702462792396545, + "learning_rate": 0.0006, + "loss": 2.1394, + "step": 11540 + }, + { + "epoch": 0.04308318972270093, + "grad_norm": 0.2925890386104584, + "learning_rate": 0.0006, + "loss": 2.1348, + "step": 11550 + }, + { + "epoch": 0.04308318972270093, + "eval_valid_loss": 2.2159156799316406, + "eval_valid_loss/all": 2.0766475200653076, + "eval_valid_loss/end_span": 1.2819547653198242, + "eval_valid_perplexity/batch": 7.9776787757873535, + "eval_valid_perplexity/end_span": 3.603677272796631, + "eval_valid_perplexity/fim": 2.2303435802459717, + "eval_valid_perplexity/first_seq": 15.072821617126465, + "eval_valid_perplexity/last_seq": 9.223919868469238, + "eval_valid_perplexity/second_seq": 13.639823913574219, + "eval_valid_perplexity/seq": 9.005413055419922, + "eval_valid_reconstruction/all": 0.2876065671443939, + "eval_valid_reconstruction/end_span": 0.6962568163871765, + "eval_valid_reconstruction/fim": 0.15579159557819366, + "eval_valid_reconstruction/first_seq": 0.1645914614200592, + "eval_valid_reconstruction/last_seq": 0.3169287443161011, + "eval_valid_reconstruction/second_seq": 0.20070147514343262, + "eval_valid_runtime": 435.9723, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 11550 + }, + { + "epoch": 0.04308318972270093, + "eval_train_loss": 2.213552474975586, + "eval_train_loss/all": 2.0470056533813477, + "eval_train_loss/end_span": 1.2373117208480835, + "eval_train_perplexity/batch": 7.744676113128662, + 
"eval_train_perplexity/end_span": 3.446336269378662, + "eval_train_perplexity/fim": 2.188189744949341, + "eval_train_perplexity/first_seq": 15.802826881408691, + "eval_train_perplexity/last_seq": 9.337108612060547, + "eval_train_perplexity/second_seq": 14.071358680725098, + "eval_train_perplexity/seq": 8.923213005065918, + "eval_train_reconstruction/all": 0.2776026129722595, + "eval_train_reconstruction/end_span": 0.7111269235610962, + "eval_train_reconstruction/fim": 0.15170851349830627, + "eval_train_reconstruction/first_seq": 0.14755240082740784, + "eval_train_reconstruction/last_seq": 0.312894731760025, + "eval_train_reconstruction/second_seq": 0.18663889169692993, + "eval_train_runtime": 436.9649, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 11550 + }, + { + "epoch": 0.0431204911856643, + "grad_norm": 0.3233534097671509, + "learning_rate": 0.0006, + "loss": 2.3383, + "step": 11560 + }, + { + "epoch": 0.04315779264862768, + "grad_norm": 0.2598138749599457, + "learning_rate": 0.0006, + "loss": 2.317, + "step": 11570 + }, + { + "epoch": 0.04319509411159106, + "grad_norm": 0.7258971929550171, + "learning_rate": 0.0006, + "loss": 2.3158, + "step": 11580 + }, + { + "epoch": 0.04323239557455443, + "grad_norm": 0.30168643593788147, + "learning_rate": 0.0006, + "loss": 2.3094, + "step": 11590 + }, + { + "epoch": 0.04326969703751781, + "grad_norm": 0.29025742411613464, + "learning_rate": 0.0006, + "loss": 2.2051, + "step": 11600 + }, + { + "epoch": 0.04326969703751781, + "eval_valid_loss": 2.214344024658203, + "eval_valid_loss/all": 2.074971914291382, + "eval_valid_loss/end_span": 1.3057801723480225, + "eval_valid_perplexity/batch": 7.964322566986084, + "eval_valid_perplexity/end_span": 3.6905672550201416, + "eval_valid_perplexity/fim": 2.1678128242492676, + "eval_valid_perplexity/first_seq": 14.697919845581055, + "eval_valid_perplexity/last_seq": 9.365166664123535, + "eval_valid_perplexity/second_seq": 13.811912536621094, 
+ "eval_valid_perplexity/seq": 8.983515739440918, + "eval_valid_reconstruction/all": 0.2879156768321991, + "eval_valid_reconstruction/end_span": 0.6910619139671326, + "eval_valid_reconstruction/fim": 0.15075333416461945, + "eval_valid_reconstruction/first_seq": 0.17299160361289978, + "eval_valid_reconstruction/last_seq": 0.3191975951194763, + "eval_valid_reconstruction/second_seq": 0.19317464530467987, + "eval_valid_runtime": 439.4171, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 11600 + }, + { + "epoch": 0.04326969703751781, + "eval_train_loss": 2.210228204727173, + "eval_train_loss/all": 2.0438690185546875, + "eval_train_loss/end_span": 1.2693734169006348, + "eval_train_perplexity/batch": 7.72042179107666, + "eval_train_perplexity/end_span": 3.558622121810913, + "eval_train_perplexity/fim": 2.2260029315948486, + "eval_train_perplexity/first_seq": 15.389324188232422, + "eval_train_perplexity/last_seq": 9.286043167114258, + "eval_train_perplexity/second_seq": 14.628591537475586, + "eval_train_perplexity/seq": 8.886870384216309, + "eval_train_reconstruction/all": 0.27832382917404175, + "eval_train_reconstruction/end_span": 0.6999173164367676, + "eval_train_reconstruction/fim": 0.15547676384449005, + "eval_train_reconstruction/first_seq": 0.157231405377388, + "eval_train_reconstruction/last_seq": 0.31306561827659607, + "eval_train_reconstruction/second_seq": 0.17465665936470032, + "eval_train_runtime": 437.0564, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 11600 + }, + { + "epoch": 0.04330699850048119, + "grad_norm": 0.3701659142971039, + "learning_rate": 0.0006, + "loss": 2.1007, + "step": 11610 + }, + { + "epoch": 0.04334429996344456, + "grad_norm": 0.3717748522758484, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 11620 + }, + { + "epoch": 0.04338160142640794, + "grad_norm": 0.39399877190589905, + "learning_rate": 0.0006, + "loss": 2.3097, + "step": 11630 + }, + { 
+ "epoch": 0.04341890288937132, + "grad_norm": 0.31280970573425293, + "learning_rate": 0.0006, + "loss": 2.2363, + "step": 11640 + }, + { + "epoch": 0.0434562043523347, + "grad_norm": 0.36010026931762695, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 11650 + }, + { + "epoch": 0.0434562043523347, + "eval_valid_loss": 2.2174885272979736, + "eval_valid_loss/all": 2.077322483062744, + "eval_valid_loss/end_span": 1.2244073152542114, + "eval_valid_perplexity/batch": 7.983065605163574, + "eval_valid_perplexity/end_span": 3.402149200439453, + "eval_valid_perplexity/fim": 2.2154998779296875, + "eval_valid_perplexity/first_seq": 14.790816307067871, + "eval_valid_perplexity/last_seq": 8.907999992370605, + "eval_valid_perplexity/second_seq": 13.760239601135254, + "eval_valid_perplexity/seq": 8.99770450592041, + "eval_valid_reconstruction/all": 0.28697311878204346, + "eval_valid_reconstruction/end_span": 0.7109810709953308, + "eval_valid_reconstruction/fim": 0.1539004147052765, + "eval_valid_reconstruction/first_seq": 0.17042779922485352, + "eval_valid_reconstruction/last_seq": 0.3261950612068176, + "eval_valid_reconstruction/second_seq": 0.19517379999160767, + "eval_valid_runtime": 437.1205, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 11650 + }, + { + "epoch": 0.0434562043523347, + "eval_train_loss": 2.215139150619507, + "eval_train_loss/all": 2.047668933868408, + "eval_train_loss/end_span": 1.1827094554901123, + "eval_train_perplexity/batch": 7.749814510345459, + "eval_train_perplexity/end_span": 3.2632036209106445, + "eval_train_perplexity/fim": 2.2779791355133057, + "eval_train_perplexity/first_seq": 15.334993362426758, + "eval_train_perplexity/last_seq": 9.170486450195312, + "eval_train_perplexity/second_seq": 13.987046241760254, + "eval_train_perplexity/seq": 8.916594505310059, + "eval_train_reconstruction/all": 0.2770116329193115, + "eval_train_reconstruction/end_span": 0.7223621606826782, + 
"eval_train_reconstruction/fim": 0.15992189943790436, + "eval_train_reconstruction/first_seq": 0.15426217019557953, + "eval_train_reconstruction/last_seq": 0.3171784281730652, + "eval_train_reconstruction/second_seq": 0.19217674434185028, + "eval_train_runtime": 440.8943, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 11650 + }, + { + "epoch": 0.043493505815298074, + "grad_norm": 0.5999832153320312, + "learning_rate": 0.0006, + "loss": 2.0905, + "step": 11660 + }, + { + "epoch": 0.04353080727826145, + "grad_norm": 0.34570837020874023, + "learning_rate": 0.0006, + "loss": 2.4143, + "step": 11670 + }, + { + "epoch": 0.04356810874122483, + "grad_norm": 0.41852375864982605, + "learning_rate": 0.0006, + "loss": 2.1169, + "step": 11680 + }, + { + "epoch": 0.043605410204188205, + "grad_norm": 0.29271695017814636, + "learning_rate": 0.0006, + "loss": 2.2768, + "step": 11690 + }, + { + "epoch": 0.043642711667151585, + "grad_norm": 0.4363093376159668, + "learning_rate": 0.0006, + "loss": 2.3071, + "step": 11700 + }, + { + "epoch": 0.043642711667151585, + "eval_valid_loss": 2.212031602859497, + "eval_valid_loss/all": 2.0726613998413086, + "eval_valid_loss/end_span": 1.3534468412399292, + "eval_valid_perplexity/batch": 7.945942401885986, + "eval_valid_perplexity/end_span": 3.870744466781616, + "eval_valid_perplexity/fim": 2.2332448959350586, + "eval_valid_perplexity/first_seq": 14.930855751037598, + "eval_valid_perplexity/last_seq": 9.393205642700195, + "eval_valid_perplexity/second_seq": 14.009320259094238, + "eval_valid_perplexity/seq": 8.95643424987793, + "eval_valid_reconstruction/all": 0.28864750266075134, + "eval_valid_reconstruction/end_span": 0.6810712218284607, + "eval_valid_reconstruction/fim": 0.15645435452461243, + "eval_valid_reconstruction/first_seq": 0.167913556098938, + "eval_valid_reconstruction/last_seq": 0.31173476576805115, + "eval_valid_reconstruction/second_seq": 0.19296695291996002, + "eval_valid_runtime": 
440.3825, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 11700 + }, + { + "epoch": 0.043642711667151585, + "eval_train_loss": 2.2096164226531982, + "eval_train_loss/all": 2.043038845062256, + "eval_train_loss/end_span": 1.3196017742156982, + "eval_train_perplexity/batch": 7.714015483856201, + "eval_train_perplexity/end_span": 3.7419309616088867, + "eval_train_perplexity/fim": 2.049931049346924, + "eval_train_perplexity/first_seq": 15.819952011108398, + "eval_train_perplexity/last_seq": 9.374262809753418, + "eval_train_perplexity/second_seq": 14.216471672058105, + "eval_train_perplexity/seq": 8.878835678100586, + "eval_train_reconstruction/all": 0.27872660756111145, + "eval_train_reconstruction/end_span": 0.6894676685333252, + "eval_train_reconstruction/fim": 0.1403888612985611, + "eval_train_reconstruction/first_seq": 0.1463700532913208, + "eval_train_reconstruction/last_seq": 0.3145099878311157, + "eval_train_reconstruction/second_seq": 0.18049414455890656, + "eval_train_runtime": 441.4556, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 11700 + }, + { + "epoch": 0.043680013130114964, + "grad_norm": 0.35412272810935974, + "learning_rate": 0.0006, + "loss": 2.4039, + "step": 11710 + }, + { + "epoch": 0.043717314593078344, + "grad_norm": 0.3961142301559448, + "learning_rate": 0.0006, + "loss": 2.3555, + "step": 11720 + }, + { + "epoch": 0.043754616056041716, + "grad_norm": 0.6132240891456604, + "learning_rate": 0.0006, + "loss": 2.2837, + "step": 11730 + }, + { + "epoch": 0.043791917519005095, + "grad_norm": 0.4165128469467163, + "learning_rate": 0.0006, + "loss": 2.0843, + "step": 11740 + }, + { + "epoch": 0.043829218981968475, + "grad_norm": 0.3607114255428314, + "learning_rate": 0.0006, + "loss": 2.2151, + "step": 11750 + }, + { + "epoch": 0.043829218981968475, + "eval_valid_loss": 2.221630096435547, + "eval_valid_loss/all": 2.0814998149871826, + "eval_valid_loss/end_span": 
1.3114084005355835, + "eval_valid_perplexity/batch": 8.016483306884766, + "eval_valid_perplexity/end_span": 3.711397171020508, + "eval_valid_perplexity/fim": 2.3954856395721436, + "eval_valid_perplexity/first_seq": 15.04220962524414, + "eval_valid_perplexity/last_seq": 9.25216007232666, + "eval_valid_perplexity/second_seq": 13.75609016418457, + "eval_valid_perplexity/seq": 9.038973808288574, + "eval_valid_reconstruction/all": 0.2858095169067383, + "eval_valid_reconstruction/end_span": 0.690156877040863, + "eval_valid_reconstruction/fim": 0.16889718174934387, + "eval_valid_reconstruction/first_seq": 0.16160082817077637, + "eval_valid_reconstruction/last_seq": 0.319904088973999, + "eval_valid_reconstruction/second_seq": 0.1959277242422104, + "eval_valid_runtime": 439.575, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 11750 + }, + { + "epoch": 0.043829218981968475, + "eval_train_loss": 2.2188010215759277, + "eval_train_loss/all": 2.0514001846313477, + "eval_train_loss/end_span": 1.2806652784347534, + "eval_train_perplexity/batch": 7.778785228729248, + "eval_train_perplexity/end_span": 3.5990333557128906, + "eval_train_perplexity/fim": 2.1853294372558594, + "eval_train_perplexity/first_seq": 15.474579811096191, + "eval_train_perplexity/last_seq": 8.704236030578613, + "eval_train_perplexity/second_seq": 14.341435432434082, + "eval_train_perplexity/seq": 8.957703590393066, + "eval_train_reconstruction/all": 0.27601513266563416, + "eval_train_reconstruction/end_span": 0.6997577548027039, + "eval_train_reconstruction/fim": 0.15121619403362274, + "eval_train_reconstruction/first_seq": 0.15038831532001495, + "eval_train_reconstruction/last_seq": 0.3331592381000519, + "eval_train_reconstruction/second_seq": 0.18118001520633698, + "eval_train_runtime": 440.6801, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 11750 + }, + { + "epoch": 0.04386652044493185, + "grad_norm": 
0.4611087143421173, + "learning_rate": 0.0006, + "loss": 2.2433, + "step": 11760 + }, + { + "epoch": 0.04390382190789523, + "grad_norm": 0.49455296993255615, + "learning_rate": 0.0006, + "loss": 2.4147, + "step": 11770 + }, + { + "epoch": 0.043941123370858606, + "grad_norm": 0.4618271291255951, + "learning_rate": 0.0006, + "loss": 2.3862, + "step": 11780 + }, + { + "epoch": 0.043978424833821986, + "grad_norm": 0.511441707611084, + "learning_rate": 0.0006, + "loss": 2.2174, + "step": 11790 + }, + { + "epoch": 0.04401572629678536, + "grad_norm": 0.3447416126728058, + "learning_rate": 0.0006, + "loss": 2.1791, + "step": 11800 + }, + { + "epoch": 0.04401572629678536, + "eval_valid_loss": 2.215059280395508, + "eval_valid_loss/all": 2.0755293369293213, + "eval_valid_loss/end_span": 1.219788908958435, + "eval_valid_perplexity/batch": 7.96876335144043, + "eval_valid_perplexity/end_span": 3.386472702026367, + "eval_valid_perplexity/fim": 2.3184332847595215, + "eval_valid_perplexity/first_seq": 15.020206451416016, + "eval_valid_perplexity/last_seq": 9.00639533996582, + "eval_valid_perplexity/second_seq": 13.751696586608887, + "eval_valid_perplexity/seq": 8.982399940490723, + "eval_valid_reconstruction/all": 0.28764528036117554, + "eval_valid_reconstruction/end_span": 0.7123776078224182, + "eval_valid_reconstruction/fim": 0.16316522657871246, + "eval_valid_reconstruction/first_seq": 0.16306453943252563, + "eval_valid_reconstruction/last_seq": 0.3281959891319275, + "eval_valid_reconstruction/second_seq": 0.19756710529327393, + "eval_valid_runtime": 437.8739, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 11800 + }, + { + "epoch": 0.04401572629678536, + "eval_train_loss": 2.2125909328460693, + "eval_train_loss/all": 2.045891284942627, + "eval_train_loss/end_span": 1.1908446550369263, + "eval_train_perplexity/batch": 7.736050605773926, + "eval_train_perplexity/end_span": 3.289858818054199, + "eval_train_perplexity/fim": 
2.0236899852752686, + "eval_train_perplexity/first_seq": 15.286275863647461, + "eval_train_perplexity/last_seq": 8.620269775390625, + "eval_train_perplexity/second_seq": 14.021767616271973, + "eval_train_perplexity/seq": 8.907063484191895, + "eval_train_reconstruction/all": 0.2776961326599121, + "eval_train_reconstruction/end_span": 0.7219382524490356, + "eval_train_reconstruction/fim": 0.1366698294878006, + "eval_train_reconstruction/first_seq": 0.15204545855522156, + "eval_train_reconstruction/last_seq": 0.33761757612228394, + "eval_train_reconstruction/second_seq": 0.19097429513931274, + "eval_train_runtime": 437.4141, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 11800 + }, + { + "epoch": 0.04405302775974874, + "grad_norm": 0.34387046098709106, + "learning_rate": 0.0006, + "loss": 2.1723, + "step": 11810 + }, + { + "epoch": 0.04409032922271212, + "grad_norm": 0.5362696647644043, + "learning_rate": 0.0006, + "loss": 2.1694, + "step": 11820 + }, + { + "epoch": 0.04412763068567549, + "grad_norm": 0.2974952161312103, + "learning_rate": 0.0006, + "loss": 2.342, + "step": 11830 + }, + { + "epoch": 0.04416493214863887, + "grad_norm": 0.5134224891662598, + "learning_rate": 0.0006, + "loss": 2.1287, + "step": 11840 + }, + { + "epoch": 0.04420223361160225, + "grad_norm": 0.35918283462524414, + "learning_rate": 0.0006, + "loss": 2.1846, + "step": 11850 + }, + { + "epoch": 0.04420223361160225, + "eval_valid_loss": 2.21124529838562, + "eval_valid_loss/all": 2.071962594985962, + "eval_valid_loss/end_span": 1.1739201545715332, + "eval_valid_perplexity/batch": 7.940391540527344, + "eval_valid_perplexity/end_span": 3.2346482276916504, + "eval_valid_perplexity/fim": 2.4892489910125732, + "eval_valid_perplexity/first_seq": 14.505738258361816, + "eval_valid_perplexity/last_seq": 9.003185272216797, + "eval_valid_perplexity/second_seq": 13.461172103881836, + "eval_valid_perplexity/seq": 8.9465913772583, + "eval_valid_reconstruction/all": 
0.2889372408390045, + "eval_valid_reconstruction/end_span": 0.7270562648773193, + "eval_valid_reconstruction/fim": 0.1783241182565689, + "eval_valid_reconstruction/first_seq": 0.17302030324935913, + "eval_valid_reconstruction/last_seq": 0.32549530267715454, + "eval_valid_reconstruction/second_seq": 0.20356956124305725, + "eval_valid_runtime": 440.4971, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 11850 + }, + { + "epoch": 0.04420223361160225, + "eval_train_loss": 2.210725784301758, + "eval_train_loss/all": 2.0442373752593994, + "eval_train_loss/end_span": 1.1483744382858276, + "eval_train_perplexity/batch": 7.723266124725342, + "eval_train_perplexity/end_span": 3.1530632972717285, + "eval_train_perplexity/fim": 2.2823472023010254, + "eval_train_perplexity/first_seq": 15.605868339538574, + "eval_train_perplexity/last_seq": 8.978754043579102, + "eval_train_perplexity/second_seq": 14.566856384277344, + "eval_train_perplexity/seq": 8.893165588378906, + "eval_train_reconstruction/all": 0.278357595205307, + "eval_train_reconstruction/end_span": 0.734709620475769, + "eval_train_reconstruction/fim": 0.16162529587745667, + "eval_train_reconstruction/first_seq": 0.14850199222564697, + "eval_train_reconstruction/last_seq": 0.3243437111377716, + "eval_train_reconstruction/second_seq": 0.17736060917377472, + "eval_train_runtime": 435.6616, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 11850 + }, + { + "epoch": 0.04423953507456562, + "grad_norm": 0.42952290177345276, + "learning_rate": 0.0006, + "loss": 2.3329, + "step": 11860 + }, + { + "epoch": 0.044276836537529, + "grad_norm": 0.48747923970222473, + "learning_rate": 0.0006, + "loss": 2.2329, + "step": 11870 + }, + { + "epoch": 0.04431413800049238, + "grad_norm": 0.43687960505485535, + "learning_rate": 0.0006, + "loss": 2.0643, + "step": 11880 + }, + { + "epoch": 0.04435143946345576, + "grad_norm": 1.3337947130203247, + 
"learning_rate": 0.0006, + "loss": 2.3002, + "step": 11890 + }, + { + "epoch": 0.04438874092641913, + "grad_norm": 0.5380728840827942, + "learning_rate": 0.0006, + "loss": 2.1597, + "step": 11900 + }, + { + "epoch": 0.04438874092641913, + "eval_valid_loss": 2.216745615005493, + "eval_valid_loss/all": 2.0767874717712402, + "eval_valid_loss/end_span": 1.231370449066162, + "eval_valid_perplexity/batch": 7.978795528411865, + "eval_valid_perplexity/end_span": 3.4259214401245117, + "eval_valid_perplexity/fim": 2.3017992973327637, + "eval_valid_perplexity/first_seq": 15.074755668640137, + "eval_valid_perplexity/last_seq": 8.945270538330078, + "eval_valid_perplexity/second_seq": 13.926554679870605, + "eval_valid_perplexity/seq": 8.991541862487793, + "eval_valid_reconstruction/all": 0.28763628005981445, + "eval_valid_reconstruction/end_span": 0.7025799751281738, + "eval_valid_reconstruction/fim": 0.16151688992977142, + "eval_valid_reconstruction/first_seq": 0.1604032814502716, + "eval_valid_reconstruction/last_seq": 0.3296840190887451, + "eval_valid_reconstruction/second_seq": 0.19285009801387787, + "eval_valid_runtime": 436.9112, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 11900 + }, + { + "epoch": 0.04438874092641913, + "eval_train_loss": 2.21575665473938, + "eval_train_loss/all": 2.0480735301971436, + "eval_train_loss/end_span": 1.2045109272003174, + "eval_train_perplexity/batch": 7.752950668334961, + "eval_train_perplexity/end_span": 3.335127592086792, + "eval_train_perplexity/fim": 2.0214269161224365, + "eval_train_perplexity/first_seq": 15.254281044006348, + "eval_train_perplexity/last_seq": 8.883903503417969, + "eval_train_perplexity/second_seq": 14.544576644897461, + "eval_train_perplexity/seq": 8.921435356140137, + "eval_train_reconstruction/all": 0.27750250697135925, + "eval_train_reconstruction/end_span": 0.7101824283599854, + "eval_train_reconstruction/fim": 0.1367679387331009, + 
"eval_train_reconstruction/first_seq": 0.15779699385166168, + "eval_train_reconstruction/last_seq": 0.3290710747241974, + "eval_train_reconstruction/second_seq": 0.1770634800195694, + "eval_train_runtime": 439.227, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 11900 + }, + { + "epoch": 0.04442604238938251, + "grad_norm": 0.30455681681632996, + "learning_rate": 0.0006, + "loss": 2.3422, + "step": 11910 + }, + { + "epoch": 0.04446334385234589, + "grad_norm": 0.36478281021118164, + "learning_rate": 0.0006, + "loss": 2.4063, + "step": 11920 + }, + { + "epoch": 0.04450064531530926, + "grad_norm": 0.28319627046585083, + "learning_rate": 0.0006, + "loss": 2.1524, + "step": 11930 + }, + { + "epoch": 0.04453794677827264, + "grad_norm": 0.3103103041648865, + "learning_rate": 0.0006, + "loss": 2.2681, + "step": 11940 + }, + { + "epoch": 0.04457524824123602, + "grad_norm": 0.3427082896232605, + "learning_rate": 0.0006, + "loss": 2.3972, + "step": 11950 + }, + { + "epoch": 0.04457524824123602, + "eval_valid_loss": 2.216935634613037, + "eval_valid_loss/all": 2.0765726566314697, + "eval_valid_loss/end_span": 1.2348154783248901, + "eval_valid_perplexity/batch": 7.977081775665283, + "eval_valid_perplexity/end_span": 3.437744140625, + "eval_valid_perplexity/fim": 2.259078025817871, + "eval_valid_perplexity/first_seq": 14.889391899108887, + "eval_valid_perplexity/last_seq": 9.284172058105469, + "eval_valid_perplexity/second_seq": 13.527351379394531, + "eval_valid_perplexity/seq": 8.980953216552734, + "eval_valid_reconstruction/all": 0.2877699136734009, + "eval_valid_reconstruction/end_span": 0.704632580280304, + "eval_valid_reconstruction/fim": 0.15798693895339966, + "eval_valid_reconstruction/first_seq": 0.16623513400554657, + "eval_valid_reconstruction/last_seq": 0.31917956471443176, + "eval_valid_reconstruction/second_seq": 0.20196422934532166, + "eval_valid_runtime": 434.4649, + "eval_valid_samples_per_second": 0.442, + 
"eval_valid_steps_per_second": 0.442, + "step": 11950 + }, + { + "epoch": 0.04457524824123602, + "eval_train_loss": 2.2171542644500732, + "eval_train_loss/all": 2.049283981323242, + "eval_train_loss/end_span": 1.200032114982605, + "eval_train_perplexity/batch": 7.762341022491455, + "eval_train_perplexity/end_span": 3.320223569869995, + "eval_train_perplexity/fim": 2.328061103820801, + "eval_train_perplexity/first_seq": 15.583671569824219, + "eval_train_perplexity/last_seq": 9.144879341125488, + "eval_train_perplexity/second_seq": 14.231882095336914, + "eval_train_perplexity/seq": 8.925163269042969, + "eval_train_reconstruction/all": 0.27705907821655273, + "eval_train_reconstruction/end_span": 0.715813159942627, + "eval_train_reconstruction/fim": 0.16370531916618347, + "eval_train_reconstruction/first_seq": 0.1518932282924652, + "eval_train_reconstruction/last_seq": 0.3216812014579773, + "eval_train_reconstruction/second_seq": 0.1841191202402115, + "eval_train_runtime": 436.731, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 11950 + }, + { + "epoch": 0.0446125497041994, + "grad_norm": 0.40458160638809204, + "learning_rate": 0.0006, + "loss": 2.3404, + "step": 11960 + }, + { + "epoch": 0.044649851167162774, + "grad_norm": 0.38108062744140625, + "learning_rate": 0.0006, + "loss": 2.2356, + "step": 11970 + }, + { + "epoch": 0.04468715263012615, + "grad_norm": 0.3916551470756531, + "learning_rate": 0.0006, + "loss": 2.2551, + "step": 11980 + }, + { + "epoch": 0.04472445409308953, + "grad_norm": 0.5915437340736389, + "learning_rate": 0.0006, + "loss": 2.2419, + "step": 11990 + }, + { + "epoch": 0.044761755556052905, + "grad_norm": 0.4045082926750183, + "learning_rate": 0.0006, + "loss": 2.1441, + "step": 12000 + }, + { + "epoch": 0.044761755556052905, + "eval_valid_loss": 2.219221353530884, + "eval_valid_loss/all": 2.0792229175567627, + "eval_valid_loss/end_span": 1.3024053573608398, + "eval_valid_perplexity/batch": 
7.998250961303711, + "eval_valid_perplexity/end_span": 3.678133249282837, + "eval_valid_perplexity/fim": 2.257565975189209, + "eval_valid_perplexity/first_seq": 15.253972053527832, + "eval_valid_perplexity/last_seq": 9.60629653930664, + "eval_valid_perplexity/second_seq": 13.86599349975586, + "eval_valid_perplexity/seq": 9.013260841369629, + "eval_valid_reconstruction/all": 0.2869069576263428, + "eval_valid_reconstruction/end_span": 0.6954297423362732, + "eval_valid_reconstruction/fim": 0.15706656873226166, + "eval_valid_reconstruction/first_seq": 0.15851955115795135, + "eval_valid_reconstruction/last_seq": 0.304921418428421, + "eval_valid_reconstruction/second_seq": 0.19171354174613953, + "eval_valid_runtime": 440.9303, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 12000 + }, + { + "epoch": 0.044761755556052905, + "eval_train_loss": 2.2215025424957275, + "eval_train_loss/all": 2.0538551807403564, + "eval_train_loss/end_span": 1.2710084915161133, + "eval_train_perplexity/batch": 7.797905445098877, + "eval_train_perplexity/end_span": 3.5644454956054688, + "eval_train_perplexity/fim": 2.420186996459961, + "eval_train_perplexity/first_seq": 15.721263885498047, + "eval_train_perplexity/last_seq": 9.396742820739746, + "eval_train_perplexity/second_seq": 14.808024406433105, + "eval_train_perplexity/seq": 8.978919982910156, + "eval_train_reconstruction/all": 0.275528222322464, + "eval_train_reconstruction/end_span": 0.7053816914558411, + "eval_train_reconstruction/fim": 0.16971716284751892, + "eval_train_reconstruction/first_seq": 0.14586365222930908, + "eval_train_reconstruction/last_seq": 0.3132595419883728, + "eval_train_reconstruction/second_seq": 0.17413605749607086, + "eval_train_runtime": 440.7437, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 12000 + }, + { + "epoch": 0.044799057019016285, + "grad_norm": 0.39017102122306824, + "learning_rate": 0.0006, + "loss": 2.1458, + 
"step": 12010 + }, + { + "epoch": 0.044836358481979664, + "grad_norm": 0.4626333713531494, + "learning_rate": 0.0006, + "loss": 2.2833, + "step": 12020 + }, + { + "epoch": 0.044873659944943044, + "grad_norm": 0.3614653944969177, + "learning_rate": 0.0006, + "loss": 2.2689, + "step": 12030 + }, + { + "epoch": 0.044910961407906416, + "grad_norm": 0.282240092754364, + "learning_rate": 0.0006, + "loss": 2.3501, + "step": 12040 + }, + { + "epoch": 0.044948262870869796, + "grad_norm": 0.3558536469936371, + "learning_rate": 0.0006, + "loss": 2.3211, + "step": 12050 + }, + { + "epoch": 0.044948262870869796, + "eval_valid_loss": 2.216665029525757, + "eval_valid_loss/all": 2.0765511989593506, + "eval_valid_loss/end_span": 1.3595019578933716, + "eval_valid_perplexity/batch": 7.976910591125488, + "eval_valid_perplexity/end_span": 3.8942532539367676, + "eval_valid_perplexity/fim": 2.1450533866882324, + "eval_valid_perplexity/first_seq": 14.70189094543457, + "eval_valid_perplexity/last_seq": 9.446784019470215, + "eval_valid_perplexity/second_seq": 14.262014389038086, + "eval_valid_perplexity/seq": 8.985639572143555, + "eval_valid_reconstruction/all": 0.2877844274044037, + "eval_valid_reconstruction/end_span": 0.6835764050483704, + "eval_valid_reconstruction/fim": 0.1483185738325119, + "eval_valid_reconstruction/first_seq": 0.17111530900001526, + "eval_valid_reconstruction/last_seq": 0.31289300322532654, + "eval_valid_reconstruction/second_seq": 0.18467991054058075, + "eval_valid_runtime": 440.7134, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 12050 + }, + { + "epoch": 0.044948262870869796, + "eval_train_loss": 2.2160918712615967, + "eval_train_loss/all": 2.0487029552459717, + "eval_train_loss/end_span": 1.3220305442810059, + "eval_train_perplexity/batch": 7.7578325271606445, + "eval_train_perplexity/end_span": 3.751030206680298, + "eval_train_perplexity/fim": 2.209406614303589, + "eval_train_perplexity/first_seq": 
15.535442352294922, + "eval_train_perplexity/last_seq": 9.191374778747559, + "eval_train_perplexity/second_seq": 14.222818374633789, + "eval_train_perplexity/seq": 8.921189308166504, + "eval_train_reconstruction/all": 0.27727577090263367, + "eval_train_reconstruction/end_span": 0.6926162242889404, + "eval_train_reconstruction/fim": 0.15347278118133545, + "eval_train_reconstruction/first_seq": 0.14939099550247192, + "eval_train_reconstruction/last_seq": 0.3197101354598999, + "eval_train_reconstruction/second_seq": 0.18412260711193085, + "eval_train_runtime": 440.6139, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 12050 + }, + { + "epoch": 0.044985564333833175, + "grad_norm": 0.3608415722846985, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 12060 + }, + { + "epoch": 0.04502286579679655, + "grad_norm": 0.5139852166175842, + "learning_rate": 0.0006, + "loss": 2.1095, + "step": 12070 + }, + { + "epoch": 0.04506016725975993, + "grad_norm": 0.41564589738845825, + "learning_rate": 0.0006, + "loss": 2.32, + "step": 12080 + }, + { + "epoch": 0.045097468722723306, + "grad_norm": 0.3837343454360962, + "learning_rate": 0.0006, + "loss": 2.2579, + "step": 12090 + }, + { + "epoch": 0.045134770185686686, + "grad_norm": 0.4701690673828125, + "learning_rate": 0.0006, + "loss": 2.0711, + "step": 12100 + }, + { + "epoch": 0.045134770185686686, + "eval_valid_loss": 2.21893048286438, + "eval_valid_loss/all": 2.079138994216919, + "eval_valid_loss/end_span": 1.218531847000122, + "eval_valid_perplexity/batch": 7.997580051422119, + "eval_valid_perplexity/end_span": 3.382218360900879, + "eval_valid_perplexity/fim": 2.2108864784240723, + "eval_valid_perplexity/first_seq": 14.717809677124023, + "eval_valid_perplexity/last_seq": 9.552851676940918, + "eval_valid_perplexity/second_seq": 13.928683280944824, + "eval_valid_perplexity/seq": 9.016316413879395, + "eval_valid_reconstruction/all": 0.28666701912879944, + 
"eval_valid_reconstruction/end_span": 0.7112732529640198, + "eval_valid_reconstruction/fim": 0.15461765229701996, + "eval_valid_reconstruction/first_seq": 0.17102177441120148, + "eval_valid_reconstruction/last_seq": 0.3093704879283905, + "eval_valid_reconstruction/second_seq": 0.19232448935508728, + "eval_valid_runtime": 440.8415, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 12100 + }, + { + "epoch": 0.045134770185686686, + "eval_train_loss": 2.2161543369293213, + "eval_train_loss/all": 2.049006938934326, + "eval_train_loss/end_span": 1.1854660511016846, + "eval_train_perplexity/batch": 7.760190963745117, + "eval_train_perplexity/end_span": 3.2722115516662598, + "eval_train_perplexity/fim": 2.0183045864105225, + "eval_train_perplexity/first_seq": 15.55749225616455, + "eval_train_perplexity/last_seq": 8.823683738708496, + "eval_train_perplexity/second_seq": 14.439353942871094, + "eval_train_perplexity/seq": 8.9378662109375, + "eval_train_reconstruction/all": 0.2767783999443054, + "eval_train_reconstruction/end_span": 0.7227404117584229, + "eval_train_reconstruction/fim": 0.13646450638771057, + "eval_train_reconstruction/first_seq": 0.1501264125108719, + "eval_train_reconstruction/last_seq": 0.33027592301368713, + "eval_train_reconstruction/second_seq": 0.17852197587490082, + "eval_train_runtime": 439.6142, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 12100 + }, + { + "epoch": 0.04517207164865006, + "grad_norm": 0.46942901611328125, + "learning_rate": 0.0006, + "loss": 2.1173, + "step": 12110 + }, + { + "epoch": 0.04520937311161344, + "grad_norm": 0.5951825976371765, + "learning_rate": 0.0006, + "loss": 2.2571, + "step": 12120 + }, + { + "epoch": 0.04524667457457682, + "grad_norm": 0.4289625585079193, + "learning_rate": 0.0006, + "loss": 2.325, + "step": 12130 + }, + { + "epoch": 0.04528397603754019, + "grad_norm": 0.47703346610069275, + "learning_rate": 0.0006, + "loss": 
2.3163, + "step": 12140 + }, + { + "epoch": 0.04532127750050357, + "grad_norm": 0.6313005685806274, + "learning_rate": 0.0006, + "loss": 2.1893, + "step": 12150 + }, + { + "epoch": 0.04532127750050357, + "eval_valid_loss": 2.217172861099243, + "eval_valid_loss/all": 2.0776360034942627, + "eval_valid_loss/end_span": 1.3605711460113525, + "eval_valid_perplexity/batch": 7.985568523406982, + "eval_valid_perplexity/end_span": 3.898419141769409, + "eval_valid_perplexity/fim": 2.4191315174102783, + "eval_valid_perplexity/first_seq": 14.89986801147461, + "eval_valid_perplexity/last_seq": 8.949009895324707, + "eval_valid_perplexity/second_seq": 13.799213409423828, + "eval_valid_perplexity/seq": 9.006550788879395, + "eval_valid_reconstruction/all": 0.28719937801361084, + "eval_valid_reconstruction/end_span": 0.6802120208740234, + "eval_valid_reconstruction/fim": 0.17106060683727264, + "eval_valid_reconstruction/first_seq": 0.16472285985946655, + "eval_valid_reconstruction/last_seq": 0.32735881209373474, + "eval_valid_reconstruction/second_seq": 0.19406288862228394, + "eval_valid_runtime": 438.2952, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 12150 + }, + { + "epoch": 0.04532127750050357, + "eval_train_loss": 2.2125041484832764, + "eval_train_loss/all": 2.0458943843841553, + "eval_train_loss/end_span": 1.3260971307754517, + "eval_train_perplexity/batch": 7.736074447631836, + "eval_train_perplexity/end_span": 3.766315221786499, + "eval_train_perplexity/fim": 2.010164499282837, + "eval_train_perplexity/first_seq": 15.674947738647461, + "eval_train_perplexity/last_seq": 9.571765899658203, + "eval_train_perplexity/second_seq": 14.593647956848145, + "eval_train_perplexity/seq": 8.9102783203125, + "eval_train_reconstruction/all": 0.2779650092124939, + "eval_train_reconstruction/end_span": 0.6895828247070312, + "eval_train_reconstruction/fim": 0.13648062944412231, + "eval_train_reconstruction/first_seq": 0.1463707536458969, + 
"eval_train_reconstruction/last_seq": 0.30391183495521545, + "eval_train_reconstruction/second_seq": 0.17508698999881744, + "eval_train_runtime": 442.5613, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 12150 + }, + { + "epoch": 0.04535857896346695, + "grad_norm": 0.5797920823097229, + "learning_rate": 0.0006, + "loss": 2.3349, + "step": 12160 + }, + { + "epoch": 0.04539588042643032, + "grad_norm": 0.4439983665943146, + "learning_rate": 0.0006, + "loss": 2.3634, + "step": 12170 + }, + { + "epoch": 0.0454331818893937, + "grad_norm": 0.2984038293361664, + "learning_rate": 0.0006, + "loss": 2.4243, + "step": 12180 + }, + { + "epoch": 0.04547048335235708, + "grad_norm": 0.43167760968208313, + "learning_rate": 0.0006, + "loss": 2.2712, + "step": 12190 + }, + { + "epoch": 0.04550778481532046, + "grad_norm": 0.33466836810112, + "learning_rate": 0.0006, + "loss": 2.2531, + "step": 12200 + }, + { + "epoch": 0.04550778481532046, + "eval_valid_loss": 2.2109715938568115, + "eval_valid_loss/all": 2.0718326568603516, + "eval_valid_loss/end_span": 1.2983893156051636, + "eval_valid_perplexity/batch": 7.93936014175415, + "eval_valid_perplexity/end_span": 3.663391351699829, + "eval_valid_perplexity/fim": 2.1968331336975098, + "eval_valid_perplexity/first_seq": 14.63715934753418, + "eval_valid_perplexity/last_seq": 9.337961196899414, + "eval_valid_perplexity/second_seq": 13.427861213684082, + "eval_valid_perplexity/seq": 8.947670936584473, + "eval_valid_reconstruction/all": 0.28887251019477844, + "eval_valid_reconstruction/end_span": 0.6993207335472107, + "eval_valid_reconstruction/fim": 0.15307189524173737, + "eval_valid_reconstruction/first_seq": 0.16808128356933594, + "eval_valid_reconstruction/last_seq": 0.317375123500824, + "eval_valid_reconstruction/second_seq": 0.2064545750617981, + "eval_valid_runtime": 441.9026, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 12200 + }, + { + "epoch": 
0.04550778481532046, + "eval_train_loss": 2.2096526622772217, + "eval_train_loss/all": 2.043477773666382, + "eval_train_loss/end_span": 1.2671395540237427, + "eval_train_perplexity/batch": 7.71740198135376, + "eval_train_perplexity/end_span": 3.5506815910339355, + "eval_train_perplexity/fim": 2.0319135189056396, + "eval_train_perplexity/first_seq": 15.263965606689453, + "eval_train_perplexity/last_seq": 9.203950881958008, + "eval_train_perplexity/second_seq": 14.170014381408691, + "eval_train_perplexity/seq": 8.882392883300781, + "eval_train_reconstruction/all": 0.27859264612197876, + "eval_train_reconstruction/end_span": 0.7098202109336853, + "eval_train_reconstruction/fim": 0.1377580761909485, + "eval_train_reconstruction/first_seq": 0.15638737380504608, + "eval_train_reconstruction/last_seq": 0.3177782893180847, + "eval_train_reconstruction/second_seq": 0.18479570746421814, + "eval_train_runtime": 441.7412, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 12200 + }, + { + "epoch": 0.04554508627828383, + "grad_norm": 0.28465259075164795, + "learning_rate": 0.0006, + "loss": 2.4049, + "step": 12210 + }, + { + "epoch": 0.04558238774124721, + "grad_norm": 0.3204372823238373, + "learning_rate": 0.0006, + "loss": 2.2219, + "step": 12220 + }, + { + "epoch": 0.04561968920421059, + "grad_norm": 0.42368006706237793, + "learning_rate": 0.0006, + "loss": 2.3153, + "step": 12230 + }, + { + "epoch": 0.04565699066717396, + "grad_norm": 0.42806684970855713, + "learning_rate": 0.0006, + "loss": 2.1419, + "step": 12240 + }, + { + "epoch": 0.04569429213013734, + "grad_norm": 0.4132760465145111, + "learning_rate": 0.0006, + "loss": 2.1581, + "step": 12250 + }, + { + "epoch": 0.04569429213013734, + "eval_valid_loss": 2.209690809249878, + "eval_valid_loss/all": 2.070521354675293, + "eval_valid_loss/end_span": 1.2975034713745117, + "eval_valid_perplexity/batch": 7.928956031799316, + "eval_valid_perplexity/end_span": 3.6601476669311523, + 
"eval_valid_perplexity/fim": 2.3755486011505127, + "eval_valid_perplexity/first_seq": 14.57175064086914, + "eval_valid_perplexity/last_seq": 9.258647918701172, + "eval_valid_perplexity/second_seq": 13.626058578491211, + "eval_valid_perplexity/seq": 8.936570167541504, + "eval_valid_reconstruction/all": 0.28927698731422424, + "eval_valid_reconstruction/end_span": 0.6993624567985535, + "eval_valid_reconstruction/fim": 0.16693732142448425, + "eval_valid_reconstruction/first_seq": 0.17364704608917236, + "eval_valid_reconstruction/last_seq": 0.3163146376609802, + "eval_valid_reconstruction/second_seq": 0.19971401989459991, + "eval_valid_runtime": 441.1783, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 12250 + }, + { + "epoch": 0.04569429213013734, + "eval_train_loss": 2.208315134048462, + "eval_train_loss/all": 2.042165994644165, + "eval_train_loss/end_span": 1.2546526193618774, + "eval_train_perplexity/batch": 7.707284927368164, + "eval_train_perplexity/end_span": 3.506619930267334, + "eval_train_perplexity/fim": 2.1964807510375977, + "eval_train_perplexity/first_seq": 15.77270221710205, + "eval_train_perplexity/last_seq": 9.318954467773438, + "eval_train_perplexity/second_seq": 14.408795356750488, + "eval_train_perplexity/seq": 8.8720121383667, + "eval_train_reconstruction/all": 0.27887818217277527, + "eval_train_reconstruction/end_span": 0.7127431631088257, + "eval_train_reconstruction/fim": 0.15352991223335266, + "eval_train_reconstruction/first_seq": 0.14950141310691833, + "eval_train_reconstruction/last_seq": 0.31270474195480347, + "eval_train_reconstruction/second_seq": 0.18027862906455994, + "eval_train_runtime": 439.414, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 12250 + }, + { + "epoch": 0.04573159359310072, + "grad_norm": 0.46687939763069153, + "learning_rate": 0.0006, + "loss": 2.1358, + "step": 12260 + }, + { + "epoch": 0.0457688950560641, + "grad_norm": 
2.3754866123199463, + "learning_rate": 0.0006, + "loss": 2.2605, + "step": 12270 + }, + { + "epoch": 0.045806196519027474, + "grad_norm": 0.3113176226615906, + "learning_rate": 0.0006, + "loss": 2.244, + "step": 12280 + }, + { + "epoch": 0.045843497981990854, + "grad_norm": 1.460611343383789, + "learning_rate": 0.0006, + "loss": 2.1497, + "step": 12290 + }, + { + "epoch": 0.04588079944495423, + "grad_norm": 3.1130034923553467, + "learning_rate": 0.0006, + "loss": 2.265, + "step": 12300 + }, + { + "epoch": 0.04588079944495423, + "eval_valid_loss": 2.2284224033355713, + "eval_valid_loss/all": 2.0877041816711426, + "eval_valid_loss/end_span": 1.4166600704193115, + "eval_valid_perplexity/batch": 8.066374778747559, + "eval_valid_perplexity/end_span": 4.123325824737549, + "eval_valid_perplexity/fim": 2.350910186767578, + "eval_valid_perplexity/first_seq": 15.146098136901855, + "eval_valid_perplexity/last_seq": 9.324872970581055, + "eval_valid_perplexity/second_seq": 14.245193481445312, + "eval_valid_perplexity/seq": 9.09333324432373, + "eval_valid_reconstruction/all": 0.28464043140411377, + "eval_valid_reconstruction/end_span": 0.6690438389778137, + "eval_valid_reconstruction/fim": 0.15229587256908417, + "eval_valid_reconstruction/first_seq": 0.1619284749031067, + "eval_valid_reconstruction/last_seq": 0.31698259711265564, + "eval_valid_reconstruction/second_seq": 0.18990832567214966, + "eval_valid_runtime": 441.9637, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 12300 + }, + { + "epoch": 0.04588079944495423, + "eval_train_loss": 2.2272393703460693, + "eval_train_loss/all": 2.0587308406829834, + "eval_train_loss/end_span": 1.3595350980758667, + "eval_train_perplexity/batch": 7.8360185623168945, + "eval_train_perplexity/end_span": 3.8943824768066406, + "eval_train_perplexity/fim": 2.1408965587615967, + "eval_train_perplexity/first_seq": 15.660558700561523, + "eval_train_perplexity/last_seq": 9.259037017822266, + 
"eval_train_perplexity/second_seq": 14.821856498718262, + "eval_train_perplexity/seq": 9.024727821350098, + "eval_train_reconstruction/all": 0.27454453706741333, + "eval_train_reconstruction/end_span": 0.6877096891403198, + "eval_train_reconstruction/fim": 0.13640516996383667, + "eval_train_reconstruction/first_seq": 0.15092407166957855, + "eval_train_reconstruction/last_seq": 0.3180675506591797, + "eval_train_reconstruction/second_seq": 0.16612103581428528, + "eval_train_runtime": 438.8555, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 12300 + }, + { + "epoch": 0.045918100907917606, + "grad_norm": 0.5067312121391296, + "learning_rate": 0.0006, + "loss": 2.0704, + "step": 12310 + }, + { + "epoch": 0.045955402370880985, + "grad_norm": 0.2815656363964081, + "learning_rate": 0.0006, + "loss": 2.2746, + "step": 12320 + }, + { + "epoch": 0.045992703833844364, + "grad_norm": 0.4213363230228424, + "learning_rate": 0.0006, + "loss": 2.3978, + "step": 12330 + }, + { + "epoch": 0.046030005296807744, + "grad_norm": 0.3177335858345032, + "learning_rate": 0.0006, + "loss": 2.4071, + "step": 12340 + }, + { + "epoch": 0.046067306759771116, + "grad_norm": 0.3909590244293213, + "learning_rate": 0.0006, + "loss": 2.1546, + "step": 12350 + }, + { + "epoch": 0.046067306759771116, + "eval_valid_loss": 2.215296983718872, + "eval_valid_loss/all": 2.0755512714385986, + "eval_valid_loss/end_span": 1.3216358423233032, + "eval_valid_perplexity/batch": 7.96893835067749, + "eval_valid_perplexity/end_span": 3.7495501041412354, + "eval_valid_perplexity/fim": 2.3951663970947266, + "eval_valid_perplexity/first_seq": 14.873028755187988, + "eval_valid_perplexity/last_seq": 9.12944507598877, + "eval_valid_perplexity/second_seq": 13.648776054382324, + "eval_valid_perplexity/seq": 8.980051040649414, + "eval_valid_reconstruction/all": 0.2876434028148651, + "eval_valid_reconstruction/end_span": 0.6964362859725952, + "eval_valid_reconstruction/fim": 
0.1687004119157791, + "eval_valid_reconstruction/first_seq": 0.16549216210842133, + "eval_valid_reconstruction/last_seq": 0.32181060314178467, + "eval_valid_reconstruction/second_seq": 0.19843228161334991, + "eval_valid_runtime": 441.0465, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 12350 + }, + { + "epoch": 0.046067306759771116, + "eval_train_loss": 2.2131640911102295, + "eval_train_loss/all": 2.046226739883423, + "eval_train_loss/end_span": 1.274835467338562, + "eval_train_perplexity/batch": 7.738646030426025, + "eval_train_perplexity/end_span": 3.5781126022338867, + "eval_train_perplexity/fim": 2.2772483825683594, + "eval_train_perplexity/first_seq": 15.455923080444336, + "eval_train_perplexity/last_seq": 9.682419776916504, + "eval_train_perplexity/second_seq": 13.843976974487305, + "eval_train_perplexity/seq": 8.906108856201172, + "eval_train_reconstruction/all": 0.2776827812194824, + "eval_train_reconstruction/end_span": 0.7113771438598633, + "eval_train_reconstruction/fim": 0.15810252726078033, + "eval_train_reconstruction/first_seq": 0.15037614107131958, + "eval_train_reconstruction/last_seq": 0.2998175024986267, + "eval_train_reconstruction/second_seq": 0.19430576264858246, + "eval_train_runtime": 438.7488, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 12350 + }, + { + "epoch": 0.046104608222734496, + "grad_norm": 0.7889612913131714, + "learning_rate": 0.0006, + "loss": 2.246, + "step": 12360 + }, + { + "epoch": 0.046141909685697875, + "grad_norm": 0.3681749403476715, + "learning_rate": 0.0006, + "loss": 2.241, + "step": 12370 + }, + { + "epoch": 0.04617921114866125, + "grad_norm": 0.7821197509765625, + "learning_rate": 0.0006, + "loss": 2.2638, + "step": 12380 + }, + { + "epoch": 0.04621651261162463, + "grad_norm": 0.3978370130062103, + "learning_rate": 0.0006, + "loss": 2.3178, + "step": 12390 + }, + { + "epoch": 0.04625381407458801, + "grad_norm": 
0.27283188700675964, + "learning_rate": 0.0006, + "loss": 2.176, + "step": 12400 + }, + { + "epoch": 0.04625381407458801, + "eval_valid_loss": 2.2167627811431885, + "eval_valid_loss/all": 2.077265739440918, + "eval_valid_loss/end_span": 1.281856894493103, + "eval_valid_perplexity/batch": 7.982612609863281, + "eval_valid_perplexity/end_span": 3.6033244132995605, + "eval_valid_perplexity/fim": 2.5020341873168945, + "eval_valid_perplexity/first_seq": 14.841633796691895, + "eval_valid_perplexity/last_seq": 9.257118225097656, + "eval_valid_perplexity/second_seq": 13.912426948547363, + "eval_valid_perplexity/seq": 9.000395774841309, + "eval_valid_reconstruction/all": 0.2876220941543579, + "eval_valid_reconstruction/end_span": 0.7004004716873169, + "eval_valid_reconstruction/fim": 0.17665180563926697, + "eval_valid_reconstruction/first_seq": 0.16660131514072418, + "eval_valid_reconstruction/last_seq": 0.319058895111084, + "eval_valid_reconstruction/second_seq": 0.19014403223991394, + "eval_valid_runtime": 438.7198, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 12400 + }, + { + "epoch": 0.04625381407458801, + "eval_train_loss": 2.2164409160614014, + "eval_train_loss/all": 2.0494754314422607, + "eval_train_loss/end_span": 1.251387357711792, + "eval_train_perplexity/batch": 7.763827323913574, + "eval_train_perplexity/end_span": 3.4951887130737305, + "eval_train_perplexity/fim": 2.1952667236328125, + "eval_train_perplexity/first_seq": 15.464982986450195, + "eval_train_perplexity/last_seq": 9.039816856384277, + "eval_train_perplexity/second_seq": 14.491703987121582, + "eval_train_perplexity/seq": 8.940777778625488, + "eval_train_reconstruction/all": 0.27715441584587097, + "eval_train_reconstruction/end_span": 0.7114467620849609, + "eval_train_reconstruction/fim": 0.15073536336421967, + "eval_train_reconstruction/first_seq": 0.1532393842935562, + "eval_train_reconstruction/last_seq": 0.3238137662410736, + 
"eval_train_reconstruction/second_seq": 0.18067380785942078, + "eval_train_runtime": 441.655, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 12400 + }, + { + "epoch": 0.046291115537551386, + "grad_norm": 0.5263310670852661, + "learning_rate": 0.0006, + "loss": 2.2199, + "step": 12410 + }, + { + "epoch": 0.04632841700051476, + "grad_norm": 0.34755679965019226, + "learning_rate": 0.0006, + "loss": 2.2543, + "step": 12420 + }, + { + "epoch": 0.04636571846347814, + "grad_norm": 0.3377625346183777, + "learning_rate": 0.0006, + "loss": 2.0539, + "step": 12430 + }, + { + "epoch": 0.04640301992644152, + "grad_norm": 0.37855204939842224, + "learning_rate": 0.0006, + "loss": 2.312, + "step": 12440 + }, + { + "epoch": 0.04644032138940489, + "grad_norm": 0.4998064339160919, + "learning_rate": 0.0006, + "loss": 2.2487, + "step": 12450 + }, + { + "epoch": 0.04644032138940489, + "eval_valid_loss": 2.2215664386749268, + "eval_valid_loss/all": 2.0816009044647217, + "eval_valid_loss/end_span": 1.2996412515640259, + "eval_valid_perplexity/batch": 8.017293930053711, + "eval_valid_perplexity/end_span": 3.667980432510376, + "eval_valid_perplexity/fim": 2.2112739086151123, + "eval_valid_perplexity/first_seq": 15.167146682739258, + "eval_valid_perplexity/last_seq": 9.367500305175781, + "eval_valid_perplexity/second_seq": 14.053619384765625, + "eval_valid_perplexity/seq": 9.039353370666504, + "eval_valid_reconstruction/all": 0.28623080253601074, + "eval_valid_reconstruction/end_span": 0.6882862448692322, + "eval_valid_reconstruction/fim": 0.14970141649246216, + "eval_valid_reconstruction/first_seq": 0.16242335736751556, + "eval_valid_reconstruction/last_seq": 0.31440192461013794, + "eval_valid_reconstruction/second_seq": 0.18621644377708435, + "eval_valid_runtime": 439.0879, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 12450 + }, + { + "epoch": 0.04644032138940489, + "eval_train_loss": 
2.220456838607788, + "eval_train_loss/all": 2.0529792308807373, + "eval_train_loss/end_span": 1.2701961994171143, + "eval_train_perplexity/batch": 7.791078090667725, + "eval_train_perplexity/end_span": 3.561551332473755, + "eval_train_perplexity/fim": 2.12119460105896, + "eval_train_perplexity/first_seq": 15.84982681274414, + "eval_train_perplexity/last_seq": 9.318221092224121, + "eval_train_perplexity/second_seq": 14.626460075378418, + "eval_train_perplexity/seq": 8.972411155700684, + "eval_train_reconstruction/all": 0.27604779601097107, + "eval_train_reconstruction/end_span": 0.6995605230331421, + "eval_train_reconstruction/fim": 0.14196886122226715, + "eval_train_reconstruction/first_seq": 0.14766305685043335, + "eval_train_reconstruction/last_seq": 0.3157116770744324, + "eval_train_reconstruction/second_seq": 0.17702318727970123, + "eval_train_runtime": 442.2465, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 12450 + }, + { + "epoch": 0.04647762285236827, + "grad_norm": 0.43056556582450867, + "learning_rate": 0.0006, + "loss": 2.2044, + "step": 12460 + }, + { + "epoch": 0.04651492431533165, + "grad_norm": 0.4676961600780487, + "learning_rate": 0.0006, + "loss": 2.3915, + "step": 12470 + }, + { + "epoch": 0.04655222577829502, + "grad_norm": 0.398835688829422, + "learning_rate": 0.0006, + "loss": 2.1888, + "step": 12480 + }, + { + "epoch": 0.0465895272412584, + "grad_norm": 0.33784303069114685, + "learning_rate": 0.0006, + "loss": 2.1548, + "step": 12490 + }, + { + "epoch": 0.04662682870422178, + "grad_norm": 0.41083699464797974, + "learning_rate": 0.0006, + "loss": 2.2906, + "step": 12500 + }, + { + "epoch": 0.04662682870422178, + "eval_valid_loss": 2.2178053855895996, + "eval_valid_loss/all": 2.0782554149627686, + "eval_valid_loss/end_span": 1.4150742292404175, + "eval_valid_perplexity/batch": 7.990516662597656, + "eval_valid_perplexity/end_span": 4.11679220199585, + "eval_valid_perplexity/fim": 2.5975375175476074, + 
"eval_valid_perplexity/first_seq": 14.750412940979004, + "eval_valid_perplexity/last_seq": 9.471494674682617, + "eval_valid_perplexity/second_seq": 13.616806030273438, + "eval_valid_perplexity/seq": 9.00549030303955, + "eval_valid_reconstruction/all": 0.28690680861473083, + "eval_valid_reconstruction/end_span": 0.6736792922019958, + "eval_valid_reconstruction/fim": 0.18311920762062073, + "eval_valid_reconstruction/first_seq": 0.16964617371559143, + "eval_valid_reconstruction/last_seq": 0.3112727403640747, + "eval_valid_reconstruction/second_seq": 0.19872145354747772, + "eval_valid_runtime": 437.9606, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 12500 + }, + { + "epoch": 0.04662682870422178, + "eval_train_loss": 2.2156150341033936, + "eval_train_loss/all": 2.0480618476867676, + "eval_train_loss/end_span": 1.3613845109939575, + "eval_train_perplexity/batch": 7.7528605461120605, + "eval_train_perplexity/end_span": 3.9015913009643555, + "eval_train_perplexity/fim": 2.0662763118743896, + "eval_train_perplexity/first_seq": 15.385791778564453, + "eval_train_perplexity/last_seq": 9.078658103942871, + "eval_train_perplexity/second_seq": 13.905439376831055, + "eval_train_perplexity/seq": 8.917832374572754, + "eval_train_reconstruction/all": 0.2770264148712158, + "eval_train_reconstruction/end_span": 0.6860935091972351, + "eval_train_reconstruction/fim": 0.1397777944803238, + "eval_train_reconstruction/first_seq": 0.15364298224449158, + "eval_train_reconstruction/last_seq": 0.32273656129837036, + "eval_train_reconstruction/second_seq": 0.19269193708896637, + "eval_train_runtime": 440.3165, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 12500 + }, + { + "epoch": 0.04666413016718516, + "grad_norm": 0.3776227831840515, + "learning_rate": 0.0006, + "loss": 2.3323, + "step": 12510 + }, + { + "epoch": 0.04670143163014853, + "grad_norm": 0.3055909276008606, + "learning_rate": 0.0006, + 
"loss": 2.3018, + "step": 12520 + }, + { + "epoch": 0.04673873309311191, + "grad_norm": 0.4571980834007263, + "learning_rate": 0.0006, + "loss": 2.0759, + "step": 12530 + }, + { + "epoch": 0.04677603455607529, + "grad_norm": 0.39204975962638855, + "learning_rate": 0.0006, + "loss": 2.2051, + "step": 12540 + }, + { + "epoch": 0.046813336019038664, + "grad_norm": 0.373680055141449, + "learning_rate": 0.0006, + "loss": 2.0795, + "step": 12550 + }, + { + "epoch": 0.046813336019038664, + "eval_valid_loss": 2.2240750789642334, + "eval_valid_loss/all": 2.0842506885528564, + "eval_valid_loss/end_span": 1.2688626050949097, + "eval_valid_perplexity/batch": 8.038565635681152, + "eval_valid_perplexity/end_span": 3.556804656982422, + "eval_valid_perplexity/fim": 2.4295365810394287, + "eval_valid_perplexity/first_seq": 15.375901222229004, + "eval_valid_perplexity/last_seq": 9.423728942871094, + "eval_valid_perplexity/second_seq": 13.653599739074707, + "eval_valid_perplexity/seq": 9.07474136352539, + "eval_valid_reconstruction/all": 0.28555697202682495, + "eval_valid_reconstruction/end_span": 0.7034760117530823, + "eval_valid_reconstruction/fim": 0.16986598074436188, + "eval_valid_reconstruction/first_seq": 0.15654920041561127, + "eval_valid_reconstruction/last_seq": 0.3082543909549713, + "eval_valid_reconstruction/second_seq": 0.19756615161895752, + "eval_valid_runtime": 438.722, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 12550 + }, + { + "epoch": 0.046813336019038664, + "eval_train_loss": 2.219472885131836, + "eval_train_loss/all": 2.0508482456207275, + "eval_train_loss/end_span": 1.2253363132476807, + "eval_train_perplexity/batch": 7.774493217468262, + "eval_train_perplexity/end_span": 3.405311107635498, + "eval_train_perplexity/fim": 2.0031418800354004, + "eval_train_perplexity/first_seq": 15.718756675720215, + "eval_train_perplexity/last_seq": 9.00826358795166, + "eval_train_perplexity/second_seq": 14.55888843536377, + 
"eval_train_perplexity/seq": 8.932082176208496, + "eval_train_reconstruction/all": 0.2763846814632416, + "eval_train_reconstruction/end_span": 0.7156223654747009, + "eval_train_reconstruction/fim": 0.13280272483825684, + "eval_train_reconstruction/first_seq": 0.14756476879119873, + "eval_train_reconstruction/last_seq": 0.3250249922275543, + "eval_train_reconstruction/second_seq": 0.17784063518047333, + "eval_train_runtime": 438.595, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 12550 + }, + { + "epoch": 0.04685063748200204, + "grad_norm": 0.40133264660835266, + "learning_rate": 0.0006, + "loss": 2.2915, + "step": 12560 + }, + { + "epoch": 0.04688793894496542, + "grad_norm": 0.4837457537651062, + "learning_rate": 0.0006, + "loss": 2.3319, + "step": 12570 + }, + { + "epoch": 0.0469252404079288, + "grad_norm": 0.34557560086250305, + "learning_rate": 0.0006, + "loss": 2.2912, + "step": 12580 + }, + { + "epoch": 0.046962541870892174, + "grad_norm": 0.31892120838165283, + "learning_rate": 0.0006, + "loss": 2.2887, + "step": 12590 + }, + { + "epoch": 0.046999843333855554, + "grad_norm": 0.3541051745414734, + "learning_rate": 0.0006, + "loss": 2.27, + "step": 12600 + }, + { + "epoch": 0.046999843333855554, + "eval_valid_loss": 2.215589761734009, + "eval_valid_loss/all": 2.0758652687072754, + "eval_valid_loss/end_span": 1.3056575059890747, + "eval_valid_perplexity/batch": 7.97144079208374, + "eval_valid_perplexity/end_span": 3.6901144981384277, + "eval_valid_perplexity/fim": 2.2885851860046387, + "eval_valid_perplexity/first_seq": 15.124994277954102, + "eval_valid_perplexity/last_seq": 8.977652549743652, + "eval_valid_perplexity/second_seq": 14.241539001464844, + "eval_valid_perplexity/seq": 8.979483604431152, + "eval_valid_reconstruction/all": 0.28789013624191284, + "eval_valid_reconstruction/end_span": 0.701396107673645, + "eval_valid_reconstruction/fim": 0.16029128432273865, + "eval_valid_reconstruction/first_seq": 
0.16020287573337555, + "eval_valid_reconstruction/last_seq": 0.3262048363685608, + "eval_valid_reconstruction/second_seq": 0.18323175609111786, + "eval_valid_runtime": 437.6172, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 12600 + }, + { + "epoch": 0.046999843333855554, + "eval_train_loss": 2.213597059249878, + "eval_train_loss/all": 2.0466420650482178, + "eval_train_loss/end_span": 1.2537503242492676, + "eval_train_perplexity/batch": 7.741860866546631, + "eval_train_perplexity/end_span": 3.503457546234131, + "eval_train_perplexity/fim": 2.266465425491333, + "eval_train_perplexity/first_seq": 15.741003036499023, + "eval_train_perplexity/last_seq": 9.349674224853516, + "eval_train_perplexity/second_seq": 14.004948616027832, + "eval_train_perplexity/seq": 8.906388282775879, + "eval_train_reconstruction/all": 0.27764421701431274, + "eval_train_reconstruction/end_span": 0.7129287123680115, + "eval_train_reconstruction/fim": 0.1592654287815094, + "eval_train_reconstruction/first_seq": 0.1480683982372284, + "eval_train_reconstruction/last_seq": 0.31248292326927185, + "eval_train_reconstruction/second_seq": 0.18844664096832275, + "eval_train_runtime": 439.8896, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 12600 + }, + { + "epoch": 0.04703714479681893, + "grad_norm": 0.2981853783130646, + "learning_rate": 0.0006, + "loss": 2.1355, + "step": 12610 + }, + { + "epoch": 0.047074446259782306, + "grad_norm": 0.3213859498500824, + "learning_rate": 0.0006, + "loss": 2.3511, + "step": 12620 + }, + { + "epoch": 0.047111747722745685, + "grad_norm": 0.3316155672073364, + "learning_rate": 0.0006, + "loss": 2.1707, + "step": 12630 + }, + { + "epoch": 0.047149049185709065, + "grad_norm": 0.25531166791915894, + "learning_rate": 0.0006, + "loss": 2.2678, + "step": 12640 + }, + { + "epoch": 0.047186350648672444, + "grad_norm": 0.47539111971855164, + "learning_rate": 0.0006, + "loss": 2.223, + 
"step": 12650 + }, + { + "epoch": 0.047186350648672444, + "eval_valid_loss": 2.2140820026397705, + "eval_valid_loss/all": 2.074657440185547, + "eval_valid_loss/end_span": 1.2501893043518066, + "eval_valid_perplexity/batch": 7.961818695068359, + "eval_valid_perplexity/end_span": 3.4910037517547607, + "eval_valid_perplexity/fim": 2.5360655784606934, + "eval_valid_perplexity/first_seq": 14.479543685913086, + "eval_valid_perplexity/last_seq": 9.038578033447266, + "eval_valid_perplexity/second_seq": 13.775654792785645, + "eval_valid_perplexity/seq": 8.97016429901123, + "eval_valid_reconstruction/all": 0.28844863176345825, + "eval_valid_reconstruction/end_span": 0.7024658918380737, + "eval_valid_reconstruction/fim": 0.18054942786693573, + "eval_valid_reconstruction/first_seq": 0.17566920816898346, + "eval_valid_reconstruction/last_seq": 0.3270966112613678, + "eval_valid_reconstruction/second_seq": 0.19642312824726105, + "eval_valid_runtime": 440.5732, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 12650 + }, + { + "epoch": 0.047186350648672444, + "eval_train_loss": 2.2120141983032227, + "eval_train_loss/all": 2.0450267791748047, + "eval_train_loss/end_span": 1.2178072929382324, + "eval_train_perplexity/batch": 7.729365348815918, + "eval_train_perplexity/end_span": 3.3797688484191895, + "eval_train_perplexity/fim": 2.192157506942749, + "eval_train_perplexity/first_seq": 15.56551742553711, + "eval_train_perplexity/last_seq": 9.116724967956543, + "eval_train_perplexity/second_seq": 14.165302276611328, + "eval_train_perplexity/seq": 8.891751289367676, + "eval_train_reconstruction/all": 0.27826330065727234, + "eval_train_reconstruction/end_span": 0.7116580605506897, + "eval_train_reconstruction/fim": 0.15230555832386017, + "eval_train_reconstruction/first_seq": 0.1527089923620224, + "eval_train_reconstruction/last_seq": 0.3259797990322113, + "eval_train_reconstruction/second_seq": 0.18765981495380402, + "eval_train_runtime": 
437.891, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 12650 + }, + { + "epoch": 0.04722365211163582, + "grad_norm": 0.515141487121582, + "learning_rate": 0.0006, + "loss": 2.0935, + "step": 12660 + }, + { + "epoch": 0.047260953574599196, + "grad_norm": 0.5598498582839966, + "learning_rate": 0.0006, + "loss": 2.2473, + "step": 12670 + }, + { + "epoch": 0.047298255037562575, + "grad_norm": 0.2668553590774536, + "learning_rate": 0.0006, + "loss": 2.1357, + "step": 12680 + }, + { + "epoch": 0.04733555650052595, + "grad_norm": 0.4343683421611786, + "learning_rate": 0.0006, + "loss": 2.2502, + "step": 12690 + }, + { + "epoch": 0.04737285796348933, + "grad_norm": 0.46989670395851135, + "learning_rate": 0.0006, + "loss": 2.2813, + "step": 12700 + }, + { + "epoch": 0.04737285796348933, + "eval_valid_loss": 2.2188849449157715, + "eval_valid_loss/all": 2.0789034366607666, + "eval_valid_loss/end_span": 1.411570429801941, + "eval_valid_perplexity/batch": 7.995696544647217, + "eval_valid_perplexity/end_span": 4.102392673492432, + "eval_valid_perplexity/fim": 2.2765867710113525, + "eval_valid_perplexity/first_seq": 15.003974914550781, + "eval_valid_perplexity/last_seq": 9.453904151916504, + "eval_valid_perplexity/second_seq": 13.493967056274414, + "eval_valid_perplexity/seq": 9.004093170166016, + "eval_valid_reconstruction/all": 0.28692325949668884, + "eval_valid_reconstruction/end_span": 0.6703702211380005, + "eval_valid_reconstruction/fim": 0.15757107734680176, + "eval_valid_reconstruction/first_seq": 0.1673782467842102, + "eval_valid_reconstruction/last_seq": 0.31360548734664917, + "eval_valid_reconstruction/second_seq": 0.20119331777095795, + "eval_valid_runtime": 437.5515, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 12700 + }, + { + "epoch": 0.04737285796348933, + "eval_train_loss": 2.2162675857543945, + "eval_train_loss/all": 2.048819065093994, + "eval_train_loss/end_span": 
1.3750859498977661, + "eval_train_perplexity/batch": 7.75873327255249, + "eval_train_perplexity/end_span": 3.955416679382324, + "eval_train_perplexity/fim": 1.9639155864715576, + "eval_train_perplexity/first_seq": 15.38581371307373, + "eval_train_perplexity/last_seq": 9.148089408874512, + "eval_train_perplexity/second_seq": 14.512608528137207, + "eval_train_perplexity/seq": 8.924192428588867, + "eval_train_reconstruction/all": 0.2770612835884094, + "eval_train_reconstruction/end_span": 0.6811978220939636, + "eval_train_reconstruction/fim": 0.13109742105007172, + "eval_train_reconstruction/first_seq": 0.15410102903842926, + "eval_train_reconstruction/last_seq": 0.31714773178100586, + "eval_train_reconstruction/second_seq": 0.17923285067081451, + "eval_train_runtime": 432.7498, + "eval_train_samples_per_second": 0.444, + "eval_train_steps_per_second": 0.444, + "step": 12700 + }, + { + "epoch": 0.04741015942645271, + "grad_norm": 0.5595499873161316, + "learning_rate": 0.0006, + "loss": 2.2156, + "step": 12710 + }, + { + "epoch": 0.047447460889416086, + "grad_norm": 0.3958284258842468, + "learning_rate": 0.0006, + "loss": 2.1928, + "step": 12720 + }, + { + "epoch": 0.04748476235237946, + "grad_norm": 0.315793514251709, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 12730 + }, + { + "epoch": 0.04752206381534284, + "grad_norm": 0.314822793006897, + "learning_rate": 0.0006, + "loss": 2.2909, + "step": 12740 + }, + { + "epoch": 0.04755936527830622, + "grad_norm": 0.3039688169956207, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 12750 + }, + { + "epoch": 0.04755936527830622, + "eval_valid_loss": 2.2139627933502197, + "eval_valid_loss/all": 2.0747828483581543, + "eval_valid_loss/end_span": 1.2541462182998657, + "eval_valid_perplexity/batch": 7.962817192077637, + "eval_valid_perplexity/end_span": 3.5048446655273438, + "eval_valid_perplexity/fim": 2.23258900642395, + "eval_valid_perplexity/first_seq": 14.833413124084473, + "eval_valid_perplexity/last_seq": 
9.362306594848633, + "eval_valid_perplexity/second_seq": 13.961678504943848, + "eval_valid_perplexity/seq": 8.974966049194336, + "eval_valid_reconstruction/all": 0.2882605493068695, + "eval_valid_reconstruction/end_span": 0.7061067819595337, + "eval_valid_reconstruction/fim": 0.1559944450855255, + "eval_valid_reconstruction/first_seq": 0.16996949911117554, + "eval_valid_reconstruction/last_seq": 0.31397509574890137, + "eval_valid_reconstruction/second_seq": 0.19203373789787292, + "eval_valid_runtime": 436.5484, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 12750 + }, + { + "epoch": 0.04755936527830622, + "eval_train_loss": 2.208491086959839, + "eval_train_loss/all": 2.0420420169830322, + "eval_train_loss/end_span": 1.2126545906066895, + "eval_train_perplexity/batch": 7.706329822540283, + "eval_train_perplexity/end_span": 3.362398624420166, + "eval_train_perplexity/fim": 1.9351415634155273, + "eval_train_perplexity/first_seq": 15.099743843078613, + "eval_train_perplexity/last_seq": 9.570124626159668, + "eval_train_perplexity/second_seq": 14.495162963867188, + "eval_train_perplexity/seq": 8.86621379852295, + "eval_train_reconstruction/all": 0.2790068984031677, + "eval_train_reconstruction/end_span": 0.7183368802070618, + "eval_train_reconstruction/fim": 0.12841886281967163, + "eval_train_reconstruction/first_seq": 0.16187356412410736, + "eval_train_reconstruction/last_seq": 0.30676737427711487, + "eval_train_reconstruction/second_seq": 0.17684125900268555, + "eval_train_runtime": 439.0678, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 12750 + }, + { + "epoch": 0.04759666674126959, + "grad_norm": 0.4002349078655243, + "learning_rate": 0.0006, + "loss": 2.1869, + "step": 12760 + }, + { + "epoch": 0.04763396820423297, + "grad_norm": 0.45673587918281555, + "learning_rate": 0.0006, + "loss": 2.2829, + "step": 12770 + }, + { + "epoch": 0.04767126966719635, + "grad_norm": 
0.3826148211956024, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 12780 + }, + { + "epoch": 0.04770857113015972, + "grad_norm": 0.4591665267944336, + "learning_rate": 0.0006, + "loss": 2.1264, + "step": 12790 + }, + { + "epoch": 0.0477458725931231, + "grad_norm": 0.2617729902267456, + "learning_rate": 0.0006, + "loss": 2.3664, + "step": 12800 + }, + { + "epoch": 0.0477458725931231, + "eval_valid_loss": 2.212651491165161, + "eval_valid_loss/all": 2.0735599994659424, + "eval_valid_loss/end_span": 1.2639107704162598, + "eval_valid_perplexity/batch": 7.953085899353027, + "eval_valid_perplexity/end_span": 3.5392355918884277, + "eval_valid_perplexity/fim": 2.534198522567749, + "eval_valid_perplexity/first_seq": 14.860799789428711, + "eval_valid_perplexity/last_seq": 9.143506050109863, + "eval_valid_perplexity/second_seq": 13.638678550720215, + "eval_valid_perplexity/seq": 8.96545696258545, + "eval_valid_reconstruction/all": 0.288617342710495, + "eval_valid_reconstruction/end_span": 0.7041816115379333, + "eval_valid_reconstruction/fim": 0.1804472804069519, + "eval_valid_reconstruction/first_seq": 0.16815324127674103, + "eval_valid_reconstruction/last_seq": 0.3203454315662384, + "eval_valid_reconstruction/second_seq": 0.19880206882953644, + "eval_valid_runtime": 439.2565, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 12800 + }, + { + "epoch": 0.0477458725931231, + "eval_train_loss": 2.2083518505096436, + "eval_train_loss/all": 2.0420515537261963, + "eval_train_loss/end_span": 1.2226911783218384, + "eval_train_perplexity/batch": 7.7064032554626465, + "eval_train_perplexity/end_span": 3.396315574645996, + "eval_train_perplexity/fim": 2.4027163982391357, + "eval_train_perplexity/first_seq": 15.802894592285156, + "eval_train_perplexity/last_seq": 8.915459632873535, + "eval_train_perplexity/second_seq": 14.120933532714844, + "eval_train_perplexity/seq": 8.868818283081055, + "eval_train_reconstruction/all": 
0.2790583074092865, + "eval_train_reconstruction/end_span": 0.7172303795814514, + "eval_train_reconstruction/fim": 0.17127728462219238, + "eval_train_reconstruction/first_seq": 0.14282546937465668, + "eval_train_reconstruction/last_seq": 0.3221534490585327, + "eval_train_reconstruction/second_seq": 0.1853337436914444, + "eval_train_runtime": 438.1923, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 12800 + }, + { + "epoch": 0.04778317405608648, + "grad_norm": 0.41188251972198486, + "learning_rate": 0.0006, + "loss": 2.3368, + "step": 12810 + }, + { + "epoch": 0.04782047551904986, + "grad_norm": 0.3744537830352783, + "learning_rate": 0.0006, + "loss": 2.1311, + "step": 12820 + }, + { + "epoch": 0.04785777698201323, + "grad_norm": 0.46409979462623596, + "learning_rate": 0.0006, + "loss": 2.313, + "step": 12830 + }, + { + "epoch": 0.04789507844497661, + "grad_norm": 0.47171398997306824, + "learning_rate": 0.0006, + "loss": 2.0774, + "step": 12840 + }, + { + "epoch": 0.04793237990793999, + "grad_norm": 0.2694631814956665, + "learning_rate": 0.0006, + "loss": 2.2204, + "step": 12850 + }, + { + "epoch": 0.04793237990793999, + "eval_valid_loss": 2.218601942062378, + "eval_valid_loss/all": 2.0792362689971924, + "eval_valid_loss/end_span": 1.3162341117858887, + "eval_valid_perplexity/batch": 7.998357772827148, + "eval_valid_perplexity/end_span": 3.7293505668640137, + "eval_valid_perplexity/fim": 2.43462872505188, + "eval_valid_perplexity/first_seq": 14.737462997436523, + "eval_valid_perplexity/last_seq": 9.33358097076416, + "eval_valid_perplexity/second_seq": 14.148365020751953, + "eval_valid_perplexity/seq": 9.017090797424316, + "eval_valid_reconstruction/all": 0.2867591083049774, + "eval_valid_reconstruction/end_span": 0.6956920027732849, + "eval_valid_reconstruction/fim": 0.17212136089801788, + "eval_valid_reconstruction/first_seq": 0.167795330286026, + "eval_valid_reconstruction/last_seq": 0.31586793065071106, + 
"eval_valid_reconstruction/second_seq": 0.18705157935619354, + "eval_valid_runtime": 435.4604, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 12850 + }, + { + "epoch": 0.04793237990793999, + "eval_train_loss": 2.216127872467041, + "eval_train_loss/all": 2.049074649810791, + "eval_train_loss/end_span": 1.2795116901397705, + "eval_train_perplexity/batch": 7.760716438293457, + "eval_train_perplexity/end_span": 3.594883918762207, + "eval_train_perplexity/fim": 1.9354588985443115, + "eval_train_perplexity/first_seq": 15.657743453979492, + "eval_train_perplexity/last_seq": 8.906933784484863, + "eval_train_perplexity/second_seq": 14.51048755645752, + "eval_train_perplexity/seq": 8.928804397583008, + "eval_train_reconstruction/all": 0.27707672119140625, + "eval_train_reconstruction/end_span": 0.7085534930229187, + "eval_train_reconstruction/fim": 0.12803302705287933, + "eval_train_reconstruction/first_seq": 0.1492641270160675, + "eval_train_reconstruction/last_seq": 0.3263343274593353, + "eval_train_reconstruction/second_seq": 0.17720003426074982, + "eval_train_runtime": 436.6508, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 12850 + }, + { + "epoch": 0.047969681370903364, + "grad_norm": 0.4358513653278351, + "learning_rate": 0.0006, + "loss": 2.2331, + "step": 12860 + }, + { + "epoch": 0.04800698283386674, + "grad_norm": 0.4316340684890747, + "learning_rate": 0.0006, + "loss": 2.1154, + "step": 12870 + }, + { + "epoch": 0.04804428429683012, + "grad_norm": 0.3147183954715729, + "learning_rate": 0.0006, + "loss": 2.3497, + "step": 12880 + }, + { + "epoch": 0.0480815857597935, + "grad_norm": 0.382026344537735, + "learning_rate": 0.0006, + "loss": 2.3317, + "step": 12890 + }, + { + "epoch": 0.048118887222756875, + "grad_norm": 0.6166332960128784, + "learning_rate": 0.0006, + "loss": 2.3916, + "step": 12900 + }, + { + "epoch": 0.048118887222756875, + "eval_valid_loss": 2.2156875133514404, 
+ "eval_valid_loss/all": 2.0763230323791504, + "eval_valid_loss/end_span": 1.2962219715118408, + "eval_valid_perplexity/batch": 7.975090980529785, + "eval_valid_perplexity/end_span": 3.6554601192474365, + "eval_valid_perplexity/fim": 2.301935911178589, + "eval_valid_perplexity/first_seq": 15.285773277282715, + "eval_valid_perplexity/last_seq": 8.927973747253418, + "eval_valid_perplexity/second_seq": 14.198620796203613, + "eval_valid_perplexity/seq": 8.986319541931152, + "eval_valid_reconstruction/all": 0.28789544105529785, + "eval_valid_reconstruction/end_span": 0.6985515356063843, + "eval_valid_reconstruction/fim": 0.16181282699108124, + "eval_valid_reconstruction/first_seq": 0.15944425761699677, + "eval_valid_reconstruction/last_seq": 0.3287734389305115, + "eval_valid_reconstruction/second_seq": 0.18448390066623688, + "eval_valid_runtime": 436.8056, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 12900 + }, + { + "epoch": 0.048118887222756875, + "eval_train_loss": 2.212467908859253, + "eval_train_loss/all": 2.045537233352661, + "eval_train_loss/end_span": 1.2492414712905884, + "eval_train_perplexity/batch": 7.733312129974365, + "eval_train_perplexity/end_span": 3.487696409225464, + "eval_train_perplexity/fim": 2.0030434131622314, + "eval_train_perplexity/first_seq": 15.560489654541016, + "eval_train_perplexity/last_seq": 8.967496871948242, + "eval_train_perplexity/second_seq": 14.0930814743042, + "eval_train_perplexity/seq": 8.893233299255371, + "eval_train_reconstruction/all": 0.2782629430294037, + "eval_train_reconstruction/end_span": 0.7130756974220276, + "eval_train_reconstruction/fim": 0.13539999723434448, + "eval_train_reconstruction/first_seq": 0.152681365609169, + "eval_train_reconstruction/last_seq": 0.32643041014671326, + "eval_train_reconstruction/second_seq": 0.18735969066619873, + "eval_train_runtime": 434.6066, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 12900 
+ }, + { + "epoch": 0.048156188685720254, + "grad_norm": 0.3791089355945587, + "learning_rate": 0.0006, + "loss": 2.3039, + "step": 12910 + }, + { + "epoch": 0.04819349014868363, + "grad_norm": 0.6336148977279663, + "learning_rate": 0.0006, + "loss": 2.1595, + "step": 12920 + }, + { + "epoch": 0.048230791611647006, + "grad_norm": 0.3682132065296173, + "learning_rate": 0.0006, + "loss": 2.0888, + "step": 12930 + }, + { + "epoch": 0.048268093074610385, + "grad_norm": 0.4174031913280487, + "learning_rate": 0.0006, + "loss": 2.3273, + "step": 12940 + }, + { + "epoch": 0.048305394537573765, + "grad_norm": 0.41403186321258545, + "learning_rate": 0.0006, + "loss": 2.3118, + "step": 12950 + }, + { + "epoch": 0.048305394537573765, + "eval_valid_loss": 2.2169392108917236, + "eval_valid_loss/all": 2.077746629714966, + "eval_valid_loss/end_span": 1.295844554901123, + "eval_valid_perplexity/batch": 7.986452102661133, + "eval_valid_perplexity/end_span": 3.654080629348755, + "eval_valid_perplexity/fim": 2.3074026107788086, + "eval_valid_perplexity/first_seq": 14.747859954833984, + "eval_valid_perplexity/last_seq": 8.843514442443848, + "eval_valid_perplexity/second_seq": 13.646492004394531, + "eval_valid_perplexity/seq": 9.002530097961426, + "eval_valid_reconstruction/all": 0.2872076630592346, + "eval_valid_reconstruction/end_span": 0.6910849809646606, + "eval_valid_reconstruction/fim": 0.1627642810344696, + "eval_valid_reconstruction/first_seq": 0.1730135828256607, + "eval_valid_reconstruction/last_seq": 0.3325306475162506, + "eval_valid_reconstruction/second_seq": 0.19925479590892792, + "eval_valid_runtime": 436.0594, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 12950 + }, + { + "epoch": 0.048305394537573765, + "eval_train_loss": 2.2125189304351807, + "eval_train_loss/all": 2.045823097229004, + "eval_train_loss/end_span": 1.2642678022384644, + "eval_train_perplexity/batch": 7.735523223876953, + "eval_train_perplexity/end_span": 
3.540499448776245, + "eval_train_perplexity/fim": 2.5209457874298096, + "eval_train_perplexity/first_seq": 15.64102840423584, + "eval_train_perplexity/last_seq": 8.77253532409668, + "eval_train_perplexity/second_seq": 14.450643539428711, + "eval_train_perplexity/seq": 8.899998664855957, + "eval_train_reconstruction/all": 0.2779601216316223, + "eval_train_reconstruction/end_span": 0.6999250054359436, + "eval_train_reconstruction/fim": 0.1793598085641861, + "eval_train_reconstruction/first_seq": 0.14998456835746765, + "eval_train_reconstruction/last_seq": 0.3308401107788086, + "eval_train_reconstruction/second_seq": 0.17475968599319458, + "eval_train_runtime": 438.7456, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 12950 + }, + { + "epoch": 0.048342696000537144, + "grad_norm": 0.32443884015083313, + "learning_rate": 0.0006, + "loss": 2.2369, + "step": 12960 + }, + { + "epoch": 0.04837999746350052, + "grad_norm": 1.3876465559005737, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 12970 + }, + { + "epoch": 0.048417298926463896, + "grad_norm": 0.5419057011604309, + "learning_rate": 0.0006, + "loss": 2.2843, + "step": 12980 + }, + { + "epoch": 0.048454600389427276, + "grad_norm": 0.4049803614616394, + "learning_rate": 0.0006, + "loss": 2.1282, + "step": 12990 + }, + { + "epoch": 0.04849190185239065, + "grad_norm": 0.4268828332424164, + "learning_rate": 0.0006, + "loss": 2.1121, + "step": 13000 + }, + { + "epoch": 0.04849190185239065, + "eval_valid_loss": 2.2195024490356445, + "eval_valid_loss/all": 2.0799319744110107, + "eval_valid_loss/end_span": 1.26791512966156, + "eval_valid_perplexity/batch": 8.003924369812012, + "eval_valid_perplexity/end_span": 3.553436279296875, + "eval_valid_perplexity/fim": 2.2725040912628174, + "eval_valid_perplexity/first_seq": 14.830591201782227, + "eval_valid_perplexity/last_seq": 9.660170555114746, + "eval_valid_perplexity/second_seq": 14.322122573852539, + "eval_valid_perplexity/seq": 
9.021291732788086, + "eval_valid_reconstruction/all": 0.28642216324806213, + "eval_valid_reconstruction/end_span": 0.6931393146514893, + "eval_valid_reconstruction/fim": 0.15867362916469574, + "eval_valid_reconstruction/first_seq": 0.1718374788761139, + "eval_valid_reconstruction/last_seq": 0.3009859621524811, + "eval_valid_reconstruction/second_seq": 0.18420863151550293, + "eval_valid_runtime": 434.8247, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 13000 + }, + { + "epoch": 0.04849190185239065, + "eval_train_loss": 2.215606927871704, + "eval_train_loss/all": 2.048720121383667, + "eval_train_loss/end_span": 1.2317899465560913, + "eval_train_perplexity/batch": 7.757965564727783, + "eval_train_perplexity/end_span": 3.427358865737915, + "eval_train_perplexity/fim": 1.9615886211395264, + "eval_train_perplexity/first_seq": 15.491945266723633, + "eval_train_perplexity/last_seq": 9.437950134277344, + "eval_train_perplexity/second_seq": 14.65314769744873, + "eval_train_perplexity/seq": 8.930153846740723, + "eval_train_reconstruction/all": 0.27711790800094604, + "eval_train_reconstruction/end_span": 0.7056905627250671, + "eval_train_reconstruction/fim": 0.13100220263004303, + "eval_train_reconstruction/first_seq": 0.14983214437961578, + "eval_train_reconstruction/last_seq": 0.31127503514289856, + "eval_train_reconstruction/second_seq": 0.1739792674779892, + "eval_train_runtime": 442.0181, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 13000 + }, + { + "epoch": 0.04852920331535403, + "grad_norm": 0.36405086517333984, + "learning_rate": 0.0006, + "loss": 2.1419, + "step": 13010 + }, + { + "epoch": 0.04856650477831741, + "grad_norm": 0.6410958170890808, + "learning_rate": 0.0006, + "loss": 2.2492, + "step": 13020 + }, + { + "epoch": 0.048603806241280786, + "grad_norm": 0.6680881977081299, + "learning_rate": 0.0006, + "loss": 2.0254, + "step": 13030 + }, + { + "epoch": 
0.04864110770424416, + "grad_norm": 0.6594759821891785, + "learning_rate": 0.0006, + "loss": 2.1325, + "step": 13040 + }, + { + "epoch": 0.04867840916720754, + "grad_norm": 0.2992919981479645, + "learning_rate": 0.0006, + "loss": 2.3441, + "step": 13050 + }, + { + "epoch": 0.04867840916720754, + "eval_valid_loss": 2.2164149284362793, + "eval_valid_loss/all": 2.0771782398223877, + "eval_valid_loss/end_span": 1.3015292882919312, + "eval_valid_perplexity/batch": 7.981914043426514, + "eval_valid_perplexity/end_span": 3.674912452697754, + "eval_valid_perplexity/fim": 2.2076191902160645, + "eval_valid_perplexity/first_seq": 15.061732292175293, + "eval_valid_perplexity/last_seq": 9.392070770263672, + "eval_valid_perplexity/second_seq": 13.876213073730469, + "eval_valid_perplexity/seq": 8.998517036437988, + "eval_valid_reconstruction/all": 0.2870772182941437, + "eval_valid_reconstruction/end_span": 0.6967833042144775, + "eval_valid_reconstruction/fim": 0.15303893387317657, + "eval_valid_reconstruction/first_seq": 0.16705894470214844, + "eval_valid_reconstruction/last_seq": 0.31002917885780334, + "eval_valid_reconstruction/second_seq": 0.19630737602710724, + "eval_valid_runtime": 441.6559, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 13050 + }, + { + "epoch": 0.04867840916720754, + "eval_train_loss": 2.2135579586029053, + "eval_train_loss/all": 2.046520471572876, + "eval_train_loss/end_span": 1.2577953338623047, + "eval_train_perplexity/batch": 7.740919589996338, + "eval_train_perplexity/end_span": 3.51765775680542, + "eval_train_perplexity/fim": 2.107259511947632, + "eval_train_perplexity/first_seq": 15.485320091247559, + "eval_train_perplexity/last_seq": 9.12241268157959, + "eval_train_perplexity/second_seq": 14.302048683166504, + "eval_train_perplexity/seq": 8.905777931213379, + "eval_train_reconstruction/all": 0.27750062942504883, + "eval_train_reconstruction/end_span": 0.7084078192710876, + "eval_train_reconstruction/fim": 
0.14426951110363007, + "eval_train_reconstruction/first_seq": 0.15086351335048676, + "eval_train_reconstruction/last_seq": 0.32037466764450073, + "eval_train_reconstruction/second_seq": 0.18489138782024384, + "eval_train_runtime": 439.3771, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 13050 + }, + { + "epoch": 0.04871571063017092, + "grad_norm": 0.23677994310855865, + "learning_rate": 0.0006, + "loss": 2.3005, + "step": 13060 + }, + { + "epoch": 0.04875301209313429, + "grad_norm": 0.4948999583721161, + "learning_rate": 0.0006, + "loss": 2.3237, + "step": 13070 + }, + { + "epoch": 0.04879031355609767, + "grad_norm": 0.2905726432800293, + "learning_rate": 0.0006, + "loss": 2.278, + "step": 13080 + }, + { + "epoch": 0.04882761501906105, + "grad_norm": 0.4521219730377197, + "learning_rate": 0.0006, + "loss": 2.2354, + "step": 13090 + }, + { + "epoch": 0.04886491648202442, + "grad_norm": 0.3550070524215698, + "learning_rate": 0.0006, + "loss": 2.4036, + "step": 13100 + }, + { + "epoch": 0.04886491648202442, + "eval_valid_loss": 2.2154972553253174, + "eval_valid_loss/all": 2.076136827468872, + "eval_valid_loss/end_span": 1.296722650527954, + "eval_valid_perplexity/batch": 7.973606109619141, + "eval_valid_perplexity/end_span": 3.6572906970977783, + "eval_valid_perplexity/fim": 2.2155745029449463, + "eval_valid_perplexity/first_seq": 15.194658279418945, + "eval_valid_perplexity/last_seq": 9.014055252075195, + "eval_valid_perplexity/second_seq": 14.082139015197754, + "eval_valid_perplexity/seq": 8.987589836120605, + "eval_valid_reconstruction/all": 0.28793662786483765, + "eval_valid_reconstruction/end_span": 0.7055102586746216, + "eval_valid_reconstruction/fim": 0.1538240611553192, + "eval_valid_reconstruction/first_seq": 0.16061931848526, + "eval_valid_reconstruction/last_seq": 0.32771921157836914, + "eval_valid_reconstruction/second_seq": 0.18711403012275696, + "eval_valid_runtime": 441.6402, + 
"eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 13100 + }, + { + "epoch": 0.04886491648202442, + "eval_train_loss": 2.2134053707122803, + "eval_train_loss/all": 2.046565532684326, + "eval_train_loss/end_span": 1.2520283460617065, + "eval_train_perplexity/batch": 7.741268157958984, + "eval_train_perplexity/end_span": 3.497429847717285, + "eval_train_perplexity/fim": 2.2120871543884277, + "eval_train_perplexity/first_seq": 15.619879722595215, + "eval_train_perplexity/last_seq": 8.975180625915527, + "eval_train_perplexity/second_seq": 14.13963794708252, + "eval_train_perplexity/seq": 8.905285835266113, + "eval_train_reconstruction/all": 0.27799415588378906, + "eval_train_reconstruction/end_span": 0.7190221548080444, + "eval_train_reconstruction/fim": 0.1537824273109436, + "eval_train_reconstruction/first_seq": 0.14859728515148163, + "eval_train_reconstruction/last_seq": 0.3251590430736542, + "eval_train_reconstruction/second_seq": 0.18792279064655304, + "eval_train_runtime": 436.5147, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 13100 + }, + { + "epoch": 0.0489022179449878, + "grad_norm": 0.3444182574748993, + "learning_rate": 0.0006, + "loss": 2.1457, + "step": 13110 + }, + { + "epoch": 0.04893951940795118, + "grad_norm": 0.30173489451408386, + "learning_rate": 0.0006, + "loss": 2.17, + "step": 13120 + }, + { + "epoch": 0.04897682087091456, + "grad_norm": 0.25227227807044983, + "learning_rate": 0.0006, + "loss": 2.3987, + "step": 13130 + }, + { + "epoch": 0.04901412233387793, + "grad_norm": 0.49565038084983826, + "learning_rate": 0.0006, + "loss": 2.2407, + "step": 13140 + }, + { + "epoch": 0.04905142379684131, + "grad_norm": 0.37831154465675354, + "learning_rate": 0.0006, + "loss": 2.2893, + "step": 13150 + }, + { + "epoch": 0.04905142379684131, + "eval_valid_loss": 2.216531276702881, + "eval_valid_loss/all": 2.0768420696258545, + "eval_valid_loss/end_span": 1.3784013986587524, + 
"eval_valid_perplexity/batch": 7.979231357574463, + "eval_valid_perplexity/end_span": 3.968552350997925, + "eval_valid_perplexity/fim": 2.3357717990875244, + "eval_valid_perplexity/first_seq": 14.910568237304688, + "eval_valid_perplexity/last_seq": 9.029378890991211, + "eval_valid_perplexity/second_seq": 13.99412727355957, + "eval_valid_perplexity/seq": 8.993865013122559, + "eval_valid_reconstruction/all": 0.287471741437912, + "eval_valid_reconstruction/end_span": 0.6781964302062988, + "eval_valid_reconstruction/fim": 0.16429883241653442, + "eval_valid_reconstruction/first_seq": 0.16729867458343506, + "eval_valid_reconstruction/last_seq": 0.32454681396484375, + "eval_valid_reconstruction/second_seq": 0.191118061542511, + "eval_valid_runtime": 437.0392, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 13150 + }, + { + "epoch": 0.04905142379684131, + "eval_train_loss": 2.2134218215942383, + "eval_train_loss/all": 2.0463905334472656, + "eval_train_loss/end_span": 1.3450292348861694, + "eval_train_perplexity/batch": 7.739913463592529, + "eval_train_perplexity/end_span": 3.838298797607422, + "eval_train_perplexity/fim": 2.422973871231079, + "eval_train_perplexity/first_seq": 15.29910659790039, + "eval_train_perplexity/last_seq": 9.488073348999023, + "eval_train_perplexity/second_seq": 14.321928024291992, + "eval_train_perplexity/seq": 8.900727272033691, + "eval_train_reconstruction/all": 0.2777182459831238, + "eval_train_reconstruction/end_span": 0.6886370778083801, + "eval_train_reconstruction/fim": 0.1717231124639511, + "eval_train_reconstruction/first_seq": 0.1558101326227188, + "eval_train_reconstruction/last_seq": 0.308251291513443, + "eval_train_reconstruction/second_seq": 0.18226857483386993, + "eval_train_runtime": 434.586, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 13150 + }, + { + "epoch": 0.04908872525980469, + "grad_norm": 0.3789633810520172, + "learning_rate": 
0.0006, + "loss": 2.1706, + "step": 13160 + }, + { + "epoch": 0.049126026722768064, + "grad_norm": 0.48633989691734314, + "learning_rate": 0.0006, + "loss": 2.322, + "step": 13170 + }, + { + "epoch": 0.04916332818573144, + "grad_norm": 0.35645633935928345, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 13180 + }, + { + "epoch": 0.04920062964869482, + "grad_norm": 0.4326246976852417, + "learning_rate": 0.0006, + "loss": 2.1304, + "step": 13190 + }, + { + "epoch": 0.0492379311116582, + "grad_norm": 0.3690440356731415, + "learning_rate": 0.0006, + "loss": 2.1934, + "step": 13200 + }, + { + "epoch": 0.0492379311116582, + "eval_valid_loss": 2.222445487976074, + "eval_valid_loss/all": 2.0828425884246826, + "eval_valid_loss/end_span": 1.292852520942688, + "eval_valid_perplexity/batch": 8.027255058288574, + "eval_valid_perplexity/end_span": 3.6431639194488525, + "eval_valid_perplexity/fim": 2.4330310821533203, + "eval_valid_perplexity/first_seq": 15.03282356262207, + "eval_valid_perplexity/last_seq": 9.624340057373047, + "eval_valid_perplexity/second_seq": 13.513477325439453, + "eval_valid_perplexity/seq": 9.058330535888672, + "eval_valid_reconstruction/all": 0.2862447202205658, + "eval_valid_reconstruction/end_span": 0.6948514580726624, + "eval_valid_reconstruction/fim": 0.17193453013896942, + "eval_valid_reconstruction/first_seq": 0.1623140275478363, + "eval_valid_reconstruction/last_seq": 0.30716973543167114, + "eval_valid_reconstruction/second_seq": 0.20230591297149658, + "eval_valid_runtime": 435.4509, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 13200 + }, + { + "epoch": 0.0492379311116582, + "eval_train_loss": 2.217282295227051, + "eval_train_loss/all": 2.050405263900757, + "eval_train_loss/end_span": 1.2522082328796387, + "eval_train_perplexity/batch": 7.771049976348877, + "eval_train_perplexity/end_span": 3.498059034347534, + "eval_train_perplexity/fim": 2.099656105041504, + "eval_train_perplexity/first_seq": 
15.586079597473145, + "eval_train_perplexity/last_seq": 8.9158935546875, + "eval_train_perplexity/second_seq": 13.726378440856934, + "eval_train_perplexity/seq": 8.949015617370605, + "eval_train_reconstruction/all": 0.27684348821640015, + "eval_train_reconstruction/end_span": 0.7072614431381226, + "eval_train_reconstruction/fim": 0.14309880137443542, + "eval_train_reconstruction/first_seq": 0.14846840500831604, + "eval_train_reconstruction/last_seq": 0.3281749486923218, + "eval_train_reconstruction/second_seq": 0.19940780103206635, + "eval_train_runtime": 433.6444, + "eval_train_samples_per_second": 0.443, + "eval_train_steps_per_second": 0.443, + "step": 13200 + }, + { + "epoch": 0.049275232574621575, + "grad_norm": 0.38552945852279663, + "learning_rate": 0.0006, + "loss": 2.3736, + "step": 13210 + }, + { + "epoch": 0.049312534037584954, + "grad_norm": 0.3133887052536011, + "learning_rate": 0.0006, + "loss": 2.2963, + "step": 13220 + }, + { + "epoch": 0.049349835500548334, + "grad_norm": 0.2922358810901642, + "learning_rate": 0.0006, + "loss": 2.2656, + "step": 13230 + }, + { + "epoch": 0.049387136963511706, + "grad_norm": 0.3447176516056061, + "learning_rate": 0.0006, + "loss": 2.2322, + "step": 13240 + }, + { + "epoch": 0.049424438426475086, + "grad_norm": 0.3384292423725128, + "learning_rate": 0.0006, + "loss": 2.253, + "step": 13250 + }, + { + "epoch": 0.049424438426475086, + "eval_valid_loss": 2.214923143386841, + "eval_valid_loss/all": 2.075775384902954, + "eval_valid_loss/end_span": 1.2970834970474243, + "eval_valid_perplexity/batch": 7.970724582672119, + "eval_valid_perplexity/end_span": 3.6586108207702637, + "eval_valid_perplexity/fim": 2.313537359237671, + "eval_valid_perplexity/first_seq": 14.971075057983398, + "eval_valid_perplexity/last_seq": 9.059249877929688, + "eval_valid_perplexity/second_seq": 13.903281211853027, + "eval_valid_perplexity/seq": 8.985391616821289, + "eval_valid_reconstruction/all": 0.28785690665245056, + 
"eval_valid_reconstruction/end_span": 0.7062989473342896, + "eval_valid_reconstruction/fim": 0.16266533732414246, + "eval_valid_reconstruction/first_seq": 0.16584451496601105, + "eval_valid_reconstruction/last_seq": 0.3273540437221527, + "eval_valid_reconstruction/second_seq": 0.19294562935829163, + "eval_valid_runtime": 433.9159, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 13250 + }, + { + "epoch": 0.049424438426475086, + "eval_train_loss": 2.211219549179077, + "eval_train_loss/all": 2.044658660888672, + "eval_train_loss/end_span": 1.246802806854248, + "eval_train_perplexity/batch": 7.726520538330078, + "eval_train_perplexity/end_span": 3.479201555252075, + "eval_train_perplexity/fim": 2.232971429824829, + "eval_train_perplexity/first_seq": 15.728960990905762, + "eval_train_perplexity/last_seq": 8.099337577819824, + "eval_train_perplexity/second_seq": 14.324263572692871, + "eval_train_perplexity/seq": 8.889939308166504, + "eval_train_reconstruction/all": 0.2782776653766632, + "eval_train_reconstruction/end_span": 0.7188184857368469, + "eval_train_reconstruction/fim": 0.1568090170621872, + "eval_train_reconstruction/first_seq": 0.14932869374752045, + "eval_train_reconstruction/last_seq": 0.3572607934474945, + "eval_train_reconstruction/second_seq": 0.1832800805568695, + "eval_train_runtime": 437.6573, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 13250 + }, + { + "epoch": 0.049461739889438465, + "grad_norm": 0.3259712755680084, + "learning_rate": 0.0006, + "loss": 2.305, + "step": 13260 + }, + { + "epoch": 0.049499041352401844, + "grad_norm": 0.324116587638855, + "learning_rate": 0.0006, + "loss": 2.2855, + "step": 13270 + }, + { + "epoch": 0.04953634281536522, + "grad_norm": 0.5034117698669434, + "learning_rate": 0.0006, + "loss": 2.1782, + "step": 13280 + }, + { + "epoch": 0.049573644278328596, + "grad_norm": 0.3190314471721649, + "learning_rate": 0.0006, + "loss": 
2.2303, + "step": 13290 + }, + { + "epoch": 0.049610945741291976, + "grad_norm": 0.36371564865112305, + "learning_rate": 0.0006, + "loss": 2.1538, + "step": 13300 + }, + { + "epoch": 0.049610945741291976, + "eval_valid_loss": 2.213306188583374, + "eval_valid_loss/all": 2.0739595890045166, + "eval_valid_loss/end_span": 1.1946340799331665, + "eval_valid_perplexity/batch": 7.956264495849609, + "eval_valid_perplexity/end_span": 3.302349090576172, + "eval_valid_perplexity/fim": 2.409766674041748, + "eval_valid_perplexity/first_seq": 14.924663543701172, + "eval_valid_perplexity/last_seq": 9.005134582519531, + "eval_valid_perplexity/second_seq": 13.81096076965332, + "eval_valid_perplexity/seq": 8.968768119812012, + "eval_valid_reconstruction/all": 0.2883370518684387, + "eval_valid_reconstruction/end_span": 0.7100250720977783, + "eval_valid_reconstruction/fim": 0.17019858956336975, + "eval_valid_reconstruction/first_seq": 0.1681784987449646, + "eval_valid_reconstruction/last_seq": 0.3248508870601654, + "eval_valid_reconstruction/second_seq": 0.19336974620819092, + "eval_valid_runtime": 438.2098, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 13300 + }, + { + "epoch": 0.049610945741291976, + "eval_train_loss": 2.211113691329956, + "eval_train_loss/all": 2.044215440750122, + "eval_train_loss/end_span": 1.1631016731262207, + "eval_train_perplexity/batch": 7.72309684753418, + "eval_train_perplexity/end_span": 3.199842691421509, + "eval_train_perplexity/fim": 2.0000104904174805, + "eval_train_perplexity/first_seq": 15.395471572875977, + "eval_train_perplexity/last_seq": 9.245871543884277, + "eval_train_perplexity/second_seq": 14.460569381713867, + "eval_train_perplexity/seq": 8.883781433105469, + "eval_train_reconstruction/all": 0.27838045358657837, + "eval_train_reconstruction/end_span": 0.7228372097015381, + "eval_train_reconstruction/fim": 0.13410541415214539, + "eval_train_reconstruction/first_seq": 0.15313056111335754, + 
"eval_train_reconstruction/last_seq": 0.31534039974212646, + "eval_train_reconstruction/second_seq": 0.1766602098941803, + "eval_train_runtime": 440.0907, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 13300 + }, + { + "epoch": 0.04964824720425535, + "grad_norm": 0.3435182571411133, + "learning_rate": 0.0006, + "loss": 2.2604, + "step": 13310 + }, + { + "epoch": 0.04968554866721873, + "grad_norm": 0.3158823549747467, + "learning_rate": 0.0006, + "loss": 2.3614, + "step": 13320 + }, + { + "epoch": 0.04972285013018211, + "grad_norm": 0.2635186016559601, + "learning_rate": 0.0006, + "loss": 2.2669, + "step": 13330 + }, + { + "epoch": 0.04976015159314549, + "grad_norm": 0.44977691769599915, + "learning_rate": 0.0006, + "loss": 2.0883, + "step": 13340 + }, + { + "epoch": 0.04979745305610886, + "grad_norm": 0.3806319832801819, + "learning_rate": 0.0006, + "loss": 2.3242, + "step": 13350 + }, + { + "epoch": 0.04979745305610886, + "eval_valid_loss": 2.217358350753784, + "eval_valid_loss/all": 2.0776925086975098, + "eval_valid_loss/end_span": 1.1480077505111694, + "eval_valid_perplexity/batch": 7.986020088195801, + "eval_valid_perplexity/end_span": 3.151907205581665, + "eval_valid_perplexity/fim": 2.594017744064331, + "eval_valid_perplexity/first_seq": 14.952155113220215, + "eval_valid_perplexity/last_seq": 9.326809883117676, + "eval_valid_perplexity/second_seq": 13.637690544128418, + "eval_valid_perplexity/seq": 8.999669075012207, + "eval_valid_reconstruction/all": 0.2873119115829468, + "eval_valid_reconstruction/end_span": 0.7305806279182434, + "eval_valid_reconstruction/fim": 0.18491563200950623, + "eval_valid_reconstruction/first_seq": 0.16822117567062378, + "eval_valid_reconstruction/last_seq": 0.31890276074409485, + "eval_valid_reconstruction/second_seq": 0.19921737909317017, + "eval_valid_runtime": 437.5953, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 13350 + }, + { + 
"epoch": 0.04979745305610886, + "eval_train_loss": 2.2140796184539795, + "eval_train_loss/all": 2.0470314025878906, + "eval_train_loss/end_span": 1.120246410369873, + "eval_train_perplexity/batch": 7.744875431060791, + "eval_train_perplexity/end_span": 3.0656094551086426, + "eval_train_perplexity/fim": 2.0861830711364746, + "eval_train_perplexity/first_seq": 15.541491508483887, + "eval_train_perplexity/last_seq": 9.188103675842285, + "eval_train_perplexity/second_seq": 14.094512939453125, + "eval_train_perplexity/seq": 8.913037300109863, + "eval_train_reconstruction/all": 0.27784618735313416, + "eval_train_reconstruction/end_span": 0.7408708930015564, + "eval_train_reconstruction/fim": 0.1427527219057083, + "eval_train_reconstruction/first_seq": 0.15426288545131683, + "eval_train_reconstruction/last_seq": 0.3196532130241394, + "eval_train_reconstruction/second_seq": 0.18765471875667572, + "eval_train_runtime": 439.1979, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 13350 + }, + { + "epoch": 0.04983475451907224, + "grad_norm": 0.30469509959220886, + "learning_rate": 0.0006, + "loss": 2.2724, + "step": 13360 + }, + { + "epoch": 0.04987205598203562, + "grad_norm": 0.43637916445732117, + "learning_rate": 0.0006, + "loss": 2.23, + "step": 13370 + }, + { + "epoch": 0.04990935744499899, + "grad_norm": 0.4532536566257477, + "learning_rate": 0.0006, + "loss": 2.0858, + "step": 13380 + }, + { + "epoch": 0.04994665890796237, + "grad_norm": 0.3618452847003937, + "learning_rate": 0.0006, + "loss": 2.2209, + "step": 13390 + }, + { + "epoch": 0.04998396037092575, + "grad_norm": 0.2970339059829712, + "learning_rate": 0.0006, + "loss": 2.2697, + "step": 13400 + }, + { + "epoch": 0.04998396037092575, + "eval_valid_loss": 2.216634511947632, + "eval_valid_loss/all": 2.076956272125244, + "eval_valid_loss/end_span": 1.2338911294937134, + "eval_valid_perplexity/batch": 7.980142593383789, + "eval_valid_perplexity/end_span": 3.434567928314209, 
+ "eval_valid_perplexity/fim": 2.4081923961639404, + "eval_valid_perplexity/first_seq": 14.865612983703613, + "eval_valid_perplexity/last_seq": 9.098402976989746, + "eval_valid_perplexity/second_seq": 14.282670021057129, + "eval_valid_perplexity/seq": 8.987170219421387, + "eval_valid_reconstruction/all": 0.2880011796951294, + "eval_valid_reconstruction/end_span": 0.7087203860282898, + "eval_valid_reconstruction/fim": 0.1716124266386032, + "eval_valid_reconstruction/first_seq": 0.1647862195968628, + "eval_valid_reconstruction/last_seq": 0.32005929946899414, + "eval_valid_reconstruction/second_seq": 0.18139848113059998, + "eval_valid_runtime": 439.4059, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 13400 + }, + { + "epoch": 0.04998396037092575, + "eval_train_loss": 2.2131357192993164, + "eval_train_loss/all": 2.0455856323242188, + "eval_train_loss/end_span": 1.1951014995574951, + "eval_train_perplexity/batch": 7.733686447143555, + "eval_train_perplexity/end_span": 3.3038930892944336, + "eval_train_perplexity/fim": 2.4965436458587646, + "eval_train_perplexity/first_seq": 15.10476016998291, + "eval_train_perplexity/last_seq": 9.13987922668457, + "eval_train_perplexity/second_seq": 14.55286693572998, + "eval_train_perplexity/seq": 8.89145565032959, + "eval_train_reconstruction/all": 0.2785017192363739, + "eval_train_reconstruction/end_span": 0.7205225229263306, + "eval_train_reconstruction/fim": 0.1777075231075287, + "eval_train_reconstruction/first_seq": 0.16089394688606262, + "eval_train_reconstruction/last_seq": 0.3189181089401245, + "eval_train_reconstruction/second_seq": 0.1770789474248886, + "eval_train_runtime": 437.2616, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 13400 + }, + { + "epoch": 0.05002126183388912, + "grad_norm": 0.4091651141643524, + "learning_rate": 0.0006, + "loss": 2.1831, + "step": 13410 + }, + { + "epoch": 0.0500585632968525, + "grad_norm": 
0.3064252436161041, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 13420 + }, + { + "epoch": 0.05009586475981588, + "grad_norm": 0.3726728558540344, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 13430 + }, + { + "epoch": 0.05013316622277926, + "grad_norm": 0.3199009895324707, + "learning_rate": 0.0006, + "loss": 2.1681, + "step": 13440 + }, + { + "epoch": 0.05017046768574263, + "grad_norm": 0.3518046438694, + "learning_rate": 0.0006, + "loss": 2.1262, + "step": 13450 + }, + { + "epoch": 0.05017046768574263, + "eval_valid_loss": 2.214230537414551, + "eval_valid_loss/all": 2.0750114917755127, + "eval_valid_loss/end_span": 1.271993637084961, + "eval_valid_perplexity/batch": 7.964637756347656, + "eval_valid_perplexity/end_span": 3.5679585933685303, + "eval_valid_perplexity/fim": 2.3527443408966064, + "eval_valid_perplexity/first_seq": 15.046977043151855, + "eval_valid_perplexity/last_seq": 9.067928314208984, + "eval_valid_perplexity/second_seq": 13.772012710571289, + "eval_valid_perplexity/seq": 8.974964141845703, + "eval_valid_reconstruction/all": 0.28774774074554443, + "eval_valid_reconstruction/end_span": 0.696876049041748, + "eval_valid_reconstruction/fim": 0.16573195159435272, + "eval_valid_reconstruction/first_seq": 0.16261371970176697, + "eval_valid_reconstruction/last_seq": 0.3263283371925354, + "eval_valid_reconstruction/second_seq": 0.19725893437862396, + "eval_valid_runtime": 436.8819, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 13450 + }, + { + "epoch": 0.05017046768574263, + "eval_train_loss": 2.211230516433716, + "eval_train_loss/all": 2.0446321964263916, + "eval_train_loss/end_span": 1.2353421449661255, + "eval_train_perplexity/batch": 7.726316452026367, + "eval_train_perplexity/end_span": 3.4395551681518555, + "eval_train_perplexity/fim": 2.2391955852508545, + "eval_train_perplexity/first_seq": 15.315284729003906, + "eval_train_perplexity/last_seq": 9.08856201171875, + 
"eval_train_perplexity/second_seq": 13.959228515625, + "eval_train_perplexity/seq": 8.892413139343262, + "eval_train_reconstruction/all": 0.27812737226486206, + "eval_train_reconstruction/end_span": 0.7073827385902405, + "eval_train_reconstruction/fim": 0.1567888855934143, + "eval_train_reconstruction/first_seq": 0.15847118198871613, + "eval_train_reconstruction/last_seq": 0.3192347288131714, + "eval_train_reconstruction/second_seq": 0.19212155044078827, + "eval_train_runtime": 437.8744, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 13450 + }, + { + "epoch": 0.05020776914870601, + "grad_norm": 0.3957997262477875, + "learning_rate": 0.0006, + "loss": 2.2031, + "step": 13460 + }, + { + "epoch": 0.05024507061166939, + "grad_norm": 0.2857617735862732, + "learning_rate": 0.0006, + "loss": 2.2502, + "step": 13470 + }, + { + "epoch": 0.050282372074632764, + "grad_norm": 0.37773171067237854, + "learning_rate": 0.0006, + "loss": 2.2367, + "step": 13480 + }, + { + "epoch": 0.050319673537596143, + "grad_norm": 2.6467044353485107, + "learning_rate": 0.0006, + "loss": 2.196, + "step": 13490 + }, + { + "epoch": 0.05035697500055952, + "grad_norm": 0.3717648386955261, + "learning_rate": 0.0006, + "loss": 2.3189, + "step": 13500 + }, + { + "epoch": 0.05035697500055952, + "eval_valid_loss": 2.2153351306915283, + "eval_valid_loss/all": 2.0760068893432617, + "eval_valid_loss/end_span": 1.2311214208602905, + "eval_valid_perplexity/batch": 7.972569942474365, + "eval_valid_perplexity/end_span": 3.4250683784484863, + "eval_valid_perplexity/fim": 2.5234057903289795, + "eval_valid_perplexity/first_seq": 15.093006134033203, + "eval_valid_perplexity/last_seq": 9.423492431640625, + "eval_valid_perplexity/second_seq": 14.050004005432129, + "eval_valid_perplexity/seq": 8.987297058105469, + "eval_valid_reconstruction/all": 0.2875899076461792, + "eval_valid_reconstruction/end_span": 0.7059550285339355, + "eval_valid_reconstruction/fim": 
0.17975300550460815, + "eval_valid_reconstruction/first_seq": 0.16255789995193481, + "eval_valid_reconstruction/last_seq": 0.3118882477283478, + "eval_valid_reconstruction/second_seq": 0.18972913920879364, + "eval_valid_runtime": 440.2075, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 13500 + }, + { + "epoch": 0.05035697500055952, + "eval_train_loss": 2.2131447792053223, + "eval_train_loss/all": 2.0466597080230713, + "eval_train_loss/end_span": 1.2004643678665161, + "eval_train_perplexity/batch": 7.741997241973877, + "eval_train_perplexity/end_span": 3.3216590881347656, + "eval_train_perplexity/fim": 2.5302939414978027, + "eval_train_perplexity/first_seq": 15.379783630371094, + "eval_train_perplexity/last_seq": 9.19922161102295, + "eval_train_perplexity/second_seq": 14.312028884887695, + "eval_train_perplexity/seq": 8.910431861877441, + "eval_train_reconstruction/all": 0.2774913012981415, + "eval_train_reconstruction/end_span": 0.7168916463851929, + "eval_train_reconstruction/fim": 0.18024586141109467, + "eval_train_reconstruction/first_seq": 0.15478040277957916, + "eval_train_reconstruction/last_seq": 0.31590089201927185, + "eval_train_reconstruction/second_seq": 0.18140625953674316, + "eval_train_runtime": 439.6587, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 13500 + }, + { + "epoch": 0.0503942764635229, + "grad_norm": 0.3408631682395935, + "learning_rate": 0.0006, + "loss": 2.1706, + "step": 13510 + }, + { + "epoch": 0.050431577926486275, + "grad_norm": 0.3995738625526428, + "learning_rate": 0.0006, + "loss": 2.1126, + "step": 13520 + }, + { + "epoch": 0.050468879389449654, + "grad_norm": 0.5147782564163208, + "learning_rate": 0.0006, + "loss": 2.26, + "step": 13530 + }, + { + "epoch": 0.050506180852413034, + "grad_norm": 0.2545604109764099, + "learning_rate": 0.0006, + "loss": 2.1846, + "step": 13540 + }, + { + "epoch": 0.050543482315376406, + "grad_norm": 
0.5802586078643799, + "learning_rate": 0.0006, + "loss": 2.2196, + "step": 13550 + }, + { + "epoch": 0.050543482315376406, + "eval_valid_loss": 2.2136223316192627, + "eval_valid_loss/all": 2.074248790740967, + "eval_valid_loss/end_span": 1.3437315225601196, + "eval_valid_perplexity/batch": 7.958565711975098, + "eval_valid_perplexity/end_span": 3.8333208560943604, + "eval_valid_perplexity/fim": 2.208385467529297, + "eval_valid_perplexity/first_seq": 15.344487190246582, + "eval_valid_perplexity/last_seq": 9.547626495361328, + "eval_valid_perplexity/second_seq": 14.050309181213379, + "eval_valid_perplexity/seq": 8.969005584716797, + "eval_valid_reconstruction/all": 0.28857189416885376, + "eval_valid_reconstruction/end_span": 0.6937714219093323, + "eval_valid_reconstruction/fim": 0.15489275753498077, + "eval_valid_reconstruction/first_seq": 0.1574029177427292, + "eval_valid_reconstruction/last_seq": 0.3106134533882141, + "eval_valid_reconstruction/second_seq": 0.18704406917095184, + "eval_valid_runtime": 435.1259, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 13550 + }, + { + "epoch": 0.050543482315376406, + "eval_train_loss": 2.2116222381591797, + "eval_train_loss/all": 2.044841766357422, + "eval_train_loss/end_span": 1.2969239950180054, + "eval_train_perplexity/batch": 7.727935791015625, + "eval_train_perplexity/end_span": 3.658027172088623, + "eval_train_perplexity/fim": 2.0435678958892822, + "eval_train_perplexity/first_seq": 15.191969871520996, + "eval_train_perplexity/last_seq": 9.052623748779297, + "eval_train_perplexity/second_seq": 14.327658653259277, + "eval_train_perplexity/seq": 8.89140796661377, + "eval_train_reconstruction/all": 0.27860531210899353, + "eval_train_reconstruction/end_span": 0.7052955031394958, + "eval_train_reconstruction/fim": 0.1404624879360199, + "eval_train_reconstruction/first_seq": 0.16100400686264038, + "eval_train_reconstruction/last_seq": 0.323369562625885, + 
"eval_train_reconstruction/second_seq": 0.17962056398391724, + "eval_train_runtime": 437.8307, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 13550 + }, + { + "epoch": 0.050580783778339786, + "grad_norm": 0.35379311442375183, + "learning_rate": 0.0006, + "loss": 2.1944, + "step": 13560 + }, + { + "epoch": 0.050618085241303165, + "grad_norm": 0.3889213800430298, + "learning_rate": 0.0006, + "loss": 2.2825, + "step": 13570 + }, + { + "epoch": 0.050655386704266545, + "grad_norm": 0.29208678007125854, + "learning_rate": 0.0006, + "loss": 2.2752, + "step": 13580 + }, + { + "epoch": 0.05069268816722992, + "grad_norm": 0.2902907431125641, + "learning_rate": 0.0006, + "loss": 2.342, + "step": 13590 + }, + { + "epoch": 0.050729989630193296, + "grad_norm": 0.4899529218673706, + "learning_rate": 0.0006, + "loss": 2.0722, + "step": 13600 + }, + { + "epoch": 0.050729989630193296, + "eval_valid_loss": 2.2141408920288086, + "eval_valid_loss/all": 2.0747509002685547, + "eval_valid_loss/end_span": 1.3526948690414429, + "eval_valid_perplexity/batch": 7.962562561035156, + "eval_valid_perplexity/end_span": 3.8678348064422607, + "eval_valid_perplexity/fim": 2.4296140670776367, + "eval_valid_perplexity/first_seq": 14.731462478637695, + "eval_valid_perplexity/last_seq": 8.820192337036133, + "eval_valid_perplexity/second_seq": 14.005760192871094, + "eval_valid_perplexity/seq": 8.9746675491333, + "eval_valid_reconstruction/all": 0.28801339864730835, + "eval_valid_reconstruction/end_span": 0.6792513728141785, + "eval_valid_reconstruction/fim": 0.17273835837841034, + "eval_valid_reconstruction/first_seq": 0.16982680559158325, + "eval_valid_reconstruction/last_seq": 0.3318479061126709, + "eval_valid_reconstruction/second_seq": 0.1906067430973053, + "eval_valid_runtime": 437.557, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 13600 + }, + { + "epoch": 0.050729989630193296, + "eval_train_loss": 
2.210371255874634, + "eval_train_loss/all": 2.043259859085083, + "eval_train_loss/end_span": 1.3124560117721558, + "eval_train_perplexity/batch": 7.715720176696777, + "eval_train_perplexity/end_span": 3.715287208557129, + "eval_train_perplexity/fim": 1.9916056394577026, + "eval_train_perplexity/first_seq": 15.655059814453125, + "eval_train_perplexity/last_seq": 9.015738487243652, + "eval_train_perplexity/second_seq": 14.637253761291504, + "eval_train_perplexity/seq": 8.877367973327637, + "eval_train_reconstruction/all": 0.27851226925849915, + "eval_train_reconstruction/end_span": 0.6912732720375061, + "eval_train_reconstruction/fim": 0.13432322442531586, + "eval_train_reconstruction/first_seq": 0.15079325437545776, + "eval_train_reconstruction/last_seq": 0.32555267214775085, + "eval_train_reconstruction/second_seq": 0.1750316172838211, + "eval_train_runtime": 438.1978, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 13600 + }, + { + "epoch": 0.050767291093156676, + "grad_norm": 0.4748624265193939, + "learning_rate": 0.0006, + "loss": 2.2445, + "step": 13610 + }, + { + "epoch": 0.05080459255612005, + "grad_norm": 0.31605514883995056, + "learning_rate": 0.0006, + "loss": 2.3636, + "step": 13620 + }, + { + "epoch": 0.05084189401908343, + "grad_norm": 0.3606204688549042, + "learning_rate": 0.0006, + "loss": 2.2001, + "step": 13630 + }, + { + "epoch": 0.05087919548204681, + "grad_norm": 0.4100816249847412, + "learning_rate": 0.0006, + "loss": 2.1362, + "step": 13640 + }, + { + "epoch": 0.05091649694501019, + "grad_norm": 0.552841067314148, + "learning_rate": 0.0006, + "loss": 2.3948, + "step": 13650 + }, + { + "epoch": 0.05091649694501019, + "eval_valid_loss": 2.2142763137817383, + "eval_valid_loss/all": 2.0745396614074707, + "eval_valid_loss/end_span": 1.2317464351654053, + "eval_valid_perplexity/batch": 7.960880756378174, + "eval_valid_perplexity/end_span": 3.4272096157073975, + "eval_valid_perplexity/fim": 
2.216813564300537, + "eval_valid_perplexity/first_seq": 15.233979225158691, + "eval_valid_perplexity/last_seq": 9.570131301879883, + "eval_valid_perplexity/second_seq": 14.099843978881836, + "eval_valid_perplexity/seq": 8.96650218963623, + "eval_valid_reconstruction/all": 0.28842803835868835, + "eval_valid_reconstruction/end_span": 0.7100651860237122, + "eval_valid_reconstruction/fim": 0.15472081303596497, + "eval_valid_reconstruction/first_seq": 0.16106411814689636, + "eval_valid_reconstruction/last_seq": 0.307179719209671, + "eval_valid_reconstruction/second_seq": 0.18473465740680695, + "eval_valid_runtime": 435.8353, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 13650 + }, + { + "epoch": 0.05091649694501019, + "eval_train_loss": 2.2127087116241455, + "eval_train_loss/all": 2.045536756515503, + "eval_train_loss/end_span": 1.1986597776412964, + "eval_train_perplexity/batch": 7.7333083152771, + "eval_train_perplexity/end_span": 3.3156702518463135, + "eval_train_perplexity/fim": 2.1286232471466064, + "eval_train_perplexity/first_seq": 15.407052993774414, + "eval_train_perplexity/last_seq": 9.164191246032715, + "eval_train_perplexity/second_seq": 14.498785018920898, + "eval_train_perplexity/seq": 8.89915943145752, + "eval_train_reconstruction/all": 0.27811291813850403, + "eval_train_reconstruction/end_span": 0.719169557094574, + "eval_train_reconstruction/fim": 0.1473022848367691, + "eval_train_reconstruction/first_seq": 0.15398311614990234, + "eval_train_reconstruction/last_seq": 0.31744319200515747, + "eval_train_reconstruction/second_seq": 0.17655616998672485, + "eval_train_runtime": 438.7942, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 13650 + }, + { + "epoch": 0.05095379840797356, + "grad_norm": 0.3929978907108307, + "learning_rate": 0.0006, + "loss": 2.1063, + "step": 13660 + }, + { + "epoch": 0.05099109987093694, + "grad_norm": 0.2294468730688095, + "learning_rate": 
0.0006, + "loss": 2.277, + "step": 13670 + }, + { + "epoch": 0.05102840133390032, + "grad_norm": 0.3358318507671356, + "learning_rate": 0.0006, + "loss": 2.3673, + "step": 13680 + }, + { + "epoch": 0.05106570279686369, + "grad_norm": 0.5096798539161682, + "learning_rate": 0.0006, + "loss": 2.2503, + "step": 13690 + }, + { + "epoch": 0.05110300425982707, + "grad_norm": 0.2572123110294342, + "learning_rate": 0.0006, + "loss": 2.313, + "step": 13700 + }, + { + "epoch": 0.05110300425982707, + "eval_valid_loss": 2.2167811393737793, + "eval_valid_loss/all": 2.0769052505493164, + "eval_valid_loss/end_span": 1.252472996711731, + "eval_valid_perplexity/batch": 7.979735374450684, + "eval_valid_perplexity/end_span": 3.4989852905273438, + "eval_valid_perplexity/fim": 2.861790180206299, + "eval_valid_perplexity/first_seq": 14.563231468200684, + "eval_valid_perplexity/last_seq": 8.921342849731445, + "eval_valid_perplexity/second_seq": 13.743780136108398, + "eval_valid_perplexity/seq": 8.994100570678711, + "eval_valid_reconstruction/all": 0.28736478090286255, + "eval_valid_reconstruction/end_span": 0.7060254812240601, + "eval_valid_reconstruction/fim": 0.2038099318742752, + "eval_valid_reconstruction/first_seq": 0.17412391304969788, + "eval_valid_reconstruction/last_seq": 0.3285522758960724, + "eval_valid_reconstruction/second_seq": 0.197398379445076, + "eval_valid_runtime": 436.9241, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 13700 + }, + { + "epoch": 0.05110300425982707, + "eval_train_loss": 2.213855743408203, + "eval_train_loss/all": 2.046717405319214, + "eval_train_loss/end_span": 1.2134838104248047, + "eval_train_perplexity/batch": 7.742444038391113, + "eval_train_perplexity/end_span": 3.365187883377075, + "eval_train_perplexity/fim": 2.2447190284729004, + "eval_train_perplexity/first_seq": 15.92732048034668, + "eval_train_perplexity/last_seq": 9.029195785522461, + "eval_train_perplexity/second_seq": 14.269490242004395, + 
"eval_train_perplexity/seq": 8.910277366638184, + "eval_train_reconstruction/all": 0.2775915563106537, + "eval_train_reconstruction/end_span": 0.7170628905296326, + "eval_train_reconstruction/fim": 0.15730151534080505, + "eval_train_reconstruction/first_seq": 0.14307640492916107, + "eval_train_reconstruction/last_seq": 0.32431498169898987, + "eval_train_reconstruction/second_seq": 0.18058796226978302, + "eval_train_runtime": 437.9453, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 13700 + }, + { + "epoch": 0.05114030572279045, + "grad_norm": 0.38771846890449524, + "learning_rate": 0.0006, + "loss": 2.0433, + "step": 13710 + }, + { + "epoch": 0.05117760718575382, + "grad_norm": 0.48152247071266174, + "learning_rate": 0.0006, + "loss": 2.2869, + "step": 13720 + }, + { + "epoch": 0.0512149086487172, + "grad_norm": 0.328829288482666, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 13730 + }, + { + "epoch": 0.05125221011168058, + "grad_norm": 0.2876850366592407, + "learning_rate": 0.0006, + "loss": 2.0891, + "step": 13740 + }, + { + "epoch": 0.05128951157464396, + "grad_norm": 0.44283586740493774, + "learning_rate": 0.0006, + "loss": 2.2973, + "step": 13750 + }, + { + "epoch": 0.05128951157464396, + "eval_valid_loss": 2.2140214443206787, + "eval_valid_loss/all": 2.0745861530303955, + "eval_valid_loss/end_span": 1.2922312021255493, + "eval_valid_perplexity/batch": 7.961251258850098, + "eval_valid_perplexity/end_span": 3.6409010887145996, + "eval_valid_perplexity/fim": 2.1884381771087646, + "eval_valid_perplexity/first_seq": 15.011009216308594, + "eval_valid_perplexity/last_seq": 9.169550895690918, + "eval_valid_perplexity/second_seq": 14.068202018737793, + "eval_valid_perplexity/seq": 8.971506118774414, + "eval_valid_reconstruction/all": 0.28818511962890625, + "eval_valid_reconstruction/end_span": 0.6975691318511963, + "eval_valid_reconstruction/fim": 0.15284472703933716, + "eval_valid_reconstruction/first_seq": 
0.16394343972206116, + "eval_valid_reconstruction/last_seq": 0.321599543094635, + "eval_valid_reconstruction/second_seq": 0.19082202017307281, + "eval_valid_runtime": 436.9726, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 13750 + }, + { + "epoch": 0.05128951157464396, + "eval_train_loss": 2.2111027240753174, + "eval_train_loss/all": 2.044459581375122, + "eval_train_loss/end_span": 1.2522064447402954, + "eval_train_perplexity/batch": 7.724982738494873, + "eval_train_perplexity/end_span": 3.4980525970458984, + "eval_train_perplexity/fim": 2.149350643157959, + "eval_train_perplexity/first_seq": 15.34825611114502, + "eval_train_perplexity/last_seq": 9.25373363494873, + "eval_train_perplexity/second_seq": 14.117822647094727, + "eval_train_perplexity/seq": 8.892848014831543, + "eval_train_reconstruction/all": 0.2783288061618805, + "eval_train_reconstruction/end_span": 0.7097756266593933, + "eval_train_reconstruction/fim": 0.14944185316562653, + "eval_train_reconstruction/first_seq": 0.15465980768203735, + "eval_train_reconstruction/last_seq": 0.3164205551147461, + "eval_train_reconstruction/second_seq": 0.1860402226448059, + "eval_train_runtime": 439.5459, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 13750 + }, + { + "epoch": 0.05132681303760733, + "grad_norm": 0.40904664993286133, + "learning_rate": 0.0006, + "loss": 2.3122, + "step": 13760 + }, + { + "epoch": 0.05136411450057071, + "grad_norm": 0.3615592122077942, + "learning_rate": 0.0006, + "loss": 2.2315, + "step": 13770 + }, + { + "epoch": 0.05140141596353409, + "grad_norm": 0.3834556043148041, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 13780 + }, + { + "epoch": 0.051438717426497464, + "grad_norm": 0.3161361515522003, + "learning_rate": 0.0006, + "loss": 2.2462, + "step": 13790 + }, + { + "epoch": 0.051476018889460844, + "grad_norm": 0.32696062326431274, + "learning_rate": 0.0006, + "loss": 2.19, + "step": 13800 
+ }, + { + "epoch": 0.051476018889460844, + "eval_valid_loss": 2.214834213256836, + "eval_valid_loss/all": 2.0754127502441406, + "eval_valid_loss/end_span": 1.344208002090454, + "eval_valid_perplexity/batch": 7.96783447265625, + "eval_valid_perplexity/end_span": 3.8351478576660156, + "eval_valid_perplexity/fim": 2.174506902694702, + "eval_valid_perplexity/first_seq": 15.059492111206055, + "eval_valid_perplexity/last_seq": 9.26364517211914, + "eval_valid_perplexity/second_seq": 13.762533187866211, + "eval_valid_perplexity/seq": 8.981911659240723, + "eval_valid_reconstruction/all": 0.28796684741973877, + "eval_valid_reconstruction/end_span": 0.6811622381210327, + "eval_valid_reconstruction/fim": 0.1507079154253006, + "eval_valid_reconstruction/first_seq": 0.16671122610569, + "eval_valid_reconstruction/last_seq": 0.3221757411956787, + "eval_valid_reconstruction/second_seq": 0.19472792744636536, + "eval_valid_runtime": 436.7333, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 13800 + }, + { + "epoch": 0.051476018889460844, + "eval_train_loss": 2.2122299671173096, + "eval_train_loss/all": 2.0459086894989014, + "eval_train_loss/end_span": 1.3092966079711914, + "eval_train_perplexity/batch": 7.736185073852539, + "eval_train_perplexity/end_span": 3.7035677433013916, + "eval_train_perplexity/fim": 1.9161279201507568, + "eval_train_perplexity/first_seq": 15.288776397705078, + "eval_train_perplexity/last_seq": 9.103614807128906, + "eval_train_perplexity/second_seq": 14.07499885559082, + "eval_train_perplexity/seq": 8.905755043029785, + "eval_train_reconstruction/all": 0.2779303789138794, + "eval_train_reconstruction/end_span": 0.691881537437439, + "eval_train_reconstruction/fim": 0.12670481204986572, + "eval_train_reconstruction/first_seq": 0.15864892303943634, + "eval_train_reconstruction/last_seq": 0.3200385570526123, + "eval_train_reconstruction/second_seq": 0.18545834720134735, + "eval_train_runtime": 436.6907, + 
"eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 13800 + }, + { + "epoch": 0.05151332035242422, + "grad_norm": 0.7055468559265137, + "learning_rate": 0.0006, + "loss": 2.1033, + "step": 13810 + }, + { + "epoch": 0.0515506218153876, + "grad_norm": 0.695976197719574, + "learning_rate": 0.0006, + "loss": 2.142, + "step": 13820 + }, + { + "epoch": 0.051587923278350975, + "grad_norm": 0.4816497266292572, + "learning_rate": 0.0006, + "loss": 2.2971, + "step": 13830 + }, + { + "epoch": 0.051625224741314354, + "grad_norm": 0.3835015594959259, + "learning_rate": 0.0006, + "loss": 2.1988, + "step": 13840 + }, + { + "epoch": 0.051662526204277734, + "grad_norm": 0.3730984926223755, + "learning_rate": 0.0006, + "loss": 2.2059, + "step": 13850 + }, + { + "epoch": 0.051662526204277734, + "eval_valid_loss": 2.210470676422119, + "eval_valid_loss/all": 2.0714731216430664, + "eval_valid_loss/end_span": 1.2619315385818481, + "eval_valid_perplexity/batch": 7.9365057945251465, + "eval_valid_perplexity/end_span": 3.5322375297546387, + "eval_valid_perplexity/fim": 2.485218048095703, + "eval_valid_perplexity/first_seq": 14.684225082397461, + "eval_valid_perplexity/last_seq": 9.264917373657227, + "eval_valid_perplexity/second_seq": 13.345412254333496, + "eval_valid_perplexity/seq": 8.948028564453125, + "eval_valid_reconstruction/all": 0.28888174891471863, + "eval_valid_reconstruction/end_span": 0.7030796408653259, + "eval_valid_reconstruction/fim": 0.17783229053020477, + "eval_valid_reconstruction/first_seq": 0.17182311415672302, + "eval_valid_reconstruction/last_seq": 0.31686803698539734, + "eval_valid_reconstruction/second_seq": 0.2030840963125229, + "eval_valid_runtime": 437.976, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 13850 + }, + { + "epoch": 0.051662526204277734, + "eval_train_loss": 2.207407236099243, + "eval_train_loss/all": 2.0414388179779053, + "eval_train_loss/end_span": 1.2226401567459106, 
+ "eval_train_perplexity/batch": 7.7016825675964355, + "eval_train_perplexity/end_span": 3.3961422443389893, + "eval_train_perplexity/fim": 2.1760895252227783, + "eval_train_perplexity/first_seq": 15.657384872436523, + "eval_train_perplexity/last_seq": 9.451824188232422, + "eval_train_perplexity/second_seq": 14.2188720703125, + "eval_train_perplexity/seq": 8.867584228515625, + "eval_train_reconstruction/all": 0.2790379822254181, + "eval_train_reconstruction/end_span": 0.7168264389038086, + "eval_train_reconstruction/fim": 0.15200884640216827, + "eval_train_reconstruction/first_seq": 0.15104222297668457, + "eval_train_reconstruction/last_seq": 0.308328241109848, + "eval_train_reconstruction/second_seq": 0.18246038258075714, + "eval_train_runtime": 435.6138, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 13850 + }, + { + "epoch": 0.051699827667241106, + "grad_norm": 0.4951052665710449, + "learning_rate": 0.0006, + "loss": 2.3467, + "step": 13860 + }, + { + "epoch": 0.051737129130204486, + "grad_norm": 0.384351521730423, + "learning_rate": 0.0006, + "loss": 2.2953, + "step": 13870 + }, + { + "epoch": 0.051774430593167865, + "grad_norm": 0.3428598940372467, + "learning_rate": 0.0006, + "loss": 2.2001, + "step": 13880 + }, + { + "epoch": 0.051811732056131245, + "grad_norm": 0.39459946751594543, + "learning_rate": 0.0006, + "loss": 2.1744, + "step": 13890 + }, + { + "epoch": 0.05184903351909462, + "grad_norm": 0.48046451807022095, + "learning_rate": 0.0006, + "loss": 2.3365, + "step": 13900 + }, + { + "epoch": 0.05184903351909462, + "eval_valid_loss": 2.2125051021575928, + "eval_valid_loss/all": 2.073115348815918, + "eval_valid_loss/end_span": 1.3345575332641602, + "eval_valid_perplexity/batch": 7.949550151824951, + "eval_valid_perplexity/end_span": 3.7983150482177734, + "eval_valid_perplexity/fim": 2.493842124938965, + "eval_valid_perplexity/first_seq": 14.911410331726074, + "eval_valid_perplexity/last_seq": 
8.537471771240234, + "eval_valid_perplexity/second_seq": 13.544068336486816, + "eval_valid_perplexity/seq": 8.960097312927246, + "eval_valid_reconstruction/all": 0.2889116406440735, + "eval_valid_reconstruction/end_span": 0.6829123497009277, + "eval_valid_reconstruction/fim": 0.17828570306301117, + "eval_valid_reconstruction/first_seq": 0.16369201242923737, + "eval_valid_reconstruction/last_seq": 0.3453501760959625, + "eval_valid_reconstruction/second_seq": 0.20139238238334656, + "eval_valid_runtime": 438.1896, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 13900 + }, + { + "epoch": 0.05184903351909462, + "eval_train_loss": 2.208622932434082, + "eval_train_loss/all": 2.0422351360321045, + "eval_train_loss/end_span": 1.2914502620697021, + "eval_train_perplexity/batch": 7.707818031311035, + "eval_train_perplexity/end_span": 3.63805890083313, + "eval_train_perplexity/fim": 2.1777586936950684, + "eval_train_perplexity/first_seq": 15.131450653076172, + "eval_train_perplexity/last_seq": 9.508642196655273, + "eval_train_perplexity/second_seq": 13.94313907623291, + "eval_train_perplexity/seq": 8.86836051940918, + "eval_train_reconstruction/all": 0.279228150844574, + "eval_train_reconstruction/end_span": 0.6970134377479553, + "eval_train_reconstruction/fim": 0.15259402990341187, + "eval_train_reconstruction/first_seq": 0.16121987998485565, + "eval_train_reconstruction/last_seq": 0.30698931217193604, + "eval_train_reconstruction/second_seq": 0.19094431400299072, + "eval_train_runtime": 434.4165, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 13900 + }, + { + "epoch": 0.051886334982058, + "grad_norm": 0.22049924731254578, + "learning_rate": 0.0006, + "loss": 2.2417, + "step": 13910 + }, + { + "epoch": 0.051923636445021376, + "grad_norm": 0.39221495389938354, + "learning_rate": 0.0006, + "loss": 2.3713, + "step": 13920 + }, + { + "epoch": 0.05196093790798475, + "grad_norm": 
0.6425257325172424, + "learning_rate": 0.0006, + "loss": 2.0827, + "step": 13930 + }, + { + "epoch": 0.05199823937094813, + "grad_norm": 0.3850648105144501, + "learning_rate": 0.0006, + "loss": 2.3172, + "step": 13940 + }, + { + "epoch": 0.05203554083391151, + "grad_norm": 0.29891231656074524, + "learning_rate": 0.0006, + "loss": 2.4267, + "step": 13950 + }, + { + "epoch": 0.05203554083391151, + "eval_valid_loss": 2.216592788696289, + "eval_valid_loss/all": 2.076758623123169, + "eval_valid_loss/end_span": 1.2883405685424805, + "eval_valid_perplexity/batch": 7.978565216064453, + "eval_valid_perplexity/end_span": 3.626763105392456, + "eval_valid_perplexity/fim": 2.5413272380828857, + "eval_valid_perplexity/first_seq": 14.756688117980957, + "eval_valid_perplexity/last_seq": 8.693455696105957, + "eval_valid_perplexity/second_seq": 13.8410005569458, + "eval_valid_perplexity/seq": 8.986092567443848, + "eval_valid_reconstruction/all": 0.2875613570213318, + "eval_valid_reconstruction/end_span": 0.6956309676170349, + "eval_valid_reconstruction/fim": 0.18022985756397247, + "eval_valid_reconstruction/first_seq": 0.1672792285680771, + "eval_valid_reconstruction/last_seq": 0.33693012595176697, + "eval_valid_reconstruction/second_seq": 0.1982685774564743, + "eval_valid_runtime": 438.0964, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 13950 + }, + { + "epoch": 0.05203554083391151, + "eval_train_loss": 2.2119579315185547, + "eval_train_loss/all": 2.044935464859009, + "eval_train_loss/end_span": 1.2585601806640625, + "eval_train_perplexity/batch": 7.728659629821777, + "eval_train_perplexity/end_span": 3.5203492641448975, + "eval_train_perplexity/fim": 2.226001739501953, + "eval_train_perplexity/first_seq": 15.47686767578125, + "eval_train_perplexity/last_seq": 9.59518814086914, + "eval_train_perplexity/second_seq": 14.313773155212402, + "eval_train_perplexity/seq": 8.888607025146484, + "eval_train_reconstruction/all": 
0.27834123373031616, + "eval_train_reconstruction/end_span": 0.7066487073898315, + "eval_train_reconstruction/fim": 0.15504062175750732, + "eval_train_reconstruction/first_seq": 0.1531129777431488, + "eval_train_reconstruction/last_seq": 0.3042271137237549, + "eval_train_reconstruction/second_seq": 0.18506361544132233, + "eval_train_runtime": 435.5897, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 13950 + }, + { + "epoch": 0.05207284229687488, + "grad_norm": 0.2890293002128601, + "learning_rate": 0.0006, + "loss": 2.3134, + "step": 13960 + }, + { + "epoch": 0.05211014375983826, + "grad_norm": 0.30046147108078003, + "learning_rate": 0.0006, + "loss": 2.2476, + "step": 13970 + }, + { + "epoch": 0.05214744522280164, + "grad_norm": 0.29838472604751587, + "learning_rate": 0.0006, + "loss": 2.3726, + "step": 13980 + }, + { + "epoch": 0.05218474668576502, + "grad_norm": 0.2975066006183624, + "learning_rate": 0.0006, + "loss": 2.2398, + "step": 13990 + }, + { + "epoch": 0.05222204814872839, + "grad_norm": 0.48332011699676514, + "learning_rate": 0.0006, + "loss": 2.2672, + "step": 14000 + }, + { + "epoch": 0.05222204814872839, + "eval_valid_loss": 2.2128000259399414, + "eval_valid_loss/all": 2.0736169815063477, + "eval_valid_loss/end_span": 1.2038066387176514, + "eval_valid_perplexity/batch": 7.95353889465332, + "eval_valid_perplexity/end_span": 3.3327794075012207, + "eval_valid_perplexity/fim": 2.2859272956848145, + "eval_valid_perplexity/first_seq": 14.972042083740234, + "eval_valid_perplexity/last_seq": 8.989681243896484, + "eval_valid_perplexity/second_seq": 13.540487289428711, + "eval_valid_perplexity/seq": 8.965553283691406, + "eval_valid_reconstruction/all": 0.28895217180252075, + "eval_valid_reconstruction/end_span": 0.7185767889022827, + "eval_valid_reconstruction/fim": 0.16108538210391998, + "eval_valid_reconstruction/first_seq": 0.16686005890369415, + "eval_valid_reconstruction/last_seq": 0.3282117247581482, + 
"eval_valid_reconstruction/second_seq": 0.20129378139972687, + "eval_valid_runtime": 439.9953, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 14000 + }, + { + "epoch": 0.05222204814872839, + "eval_train_loss": 2.209865093231201, + "eval_train_loss/all": 2.043574810028076, + "eval_train_loss/end_span": 1.1659072637557983, + "eval_train_perplexity/batch": 7.718150615692139, + "eval_train_perplexity/end_span": 3.2088327407836914, + "eval_train_perplexity/fim": 1.9907957315444946, + "eval_train_perplexity/first_seq": 15.526259422302246, + "eval_train_perplexity/last_seq": 9.022679328918457, + "eval_train_perplexity/second_seq": 14.400975227355957, + "eval_train_perplexity/seq": 8.883591651916504, + "eval_train_reconstruction/all": 0.27893415093421936, + "eval_train_reconstruction/end_span": 0.7323862910270691, + "eval_train_reconstruction/fim": 0.13425679504871368, + "eval_train_reconstruction/first_seq": 0.15286670625209808, + "eval_train_reconstruction/last_seq": 0.32425129413604736, + "eval_train_reconstruction/second_seq": 0.1814427524805069, + "eval_train_runtime": 437.9644, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 14000 + }, + { + "epoch": 0.05225934961169177, + "grad_norm": 0.32073283195495605, + "learning_rate": 0.0006, + "loss": 2.3618, + "step": 14010 + }, + { + "epoch": 0.05229665107465515, + "grad_norm": 0.5209726691246033, + "learning_rate": 0.0006, + "loss": 2.1384, + "step": 14020 + }, + { + "epoch": 0.05233395253761852, + "grad_norm": 0.25257325172424316, + "learning_rate": 0.0006, + "loss": 2.366, + "step": 14030 + }, + { + "epoch": 0.0523712540005819, + "grad_norm": 0.23700077831745148, + "learning_rate": 0.0006, + "loss": 2.3101, + "step": 14040 + }, + { + "epoch": 0.05240855546354528, + "grad_norm": 0.3806556761264801, + "learning_rate": 0.0006, + "loss": 2.117, + "step": 14050 + }, + { + "epoch": 0.05240855546354528, + "eval_valid_loss": 
2.2085025310516357, + "eval_valid_loss/all": 2.069612503051758, + "eval_valid_loss/end_span": 1.2721962928771973, + "eval_valid_perplexity/batch": 7.9217529296875, + "eval_valid_perplexity/end_span": 3.5686817169189453, + "eval_valid_perplexity/fim": 2.2619755268096924, + "eval_valid_perplexity/first_seq": 15.215880393981934, + "eval_valid_perplexity/last_seq": 8.708518028259277, + "eval_valid_perplexity/second_seq": 13.488080978393555, + "eval_valid_perplexity/seq": 8.92920207977295, + "eval_valid_reconstruction/all": 0.2898170053958893, + "eval_valid_reconstruction/end_span": 0.6917202472686768, + "eval_valid_reconstruction/fim": 0.1599043756723404, + "eval_valid_reconstruction/first_seq": 0.1636548936367035, + "eval_valid_reconstruction/last_seq": 0.33519116044044495, + "eval_valid_reconstruction/second_seq": 0.2058330774307251, + "eval_valid_runtime": 436.7655, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 14050 + }, + { + "epoch": 0.05240855546354528, + "eval_train_loss": 2.206167459487915, + "eval_train_loss/all": 2.040451765060425, + "eval_train_loss/end_span": 1.2266473770141602, + "eval_train_perplexity/batch": 7.694084167480469, + "eval_train_perplexity/end_span": 3.409778594970703, + "eval_train_perplexity/fim": 2.2117557525634766, + "eval_train_perplexity/first_seq": 15.665188789367676, + "eval_train_perplexity/last_seq": 8.815058708190918, + "eval_train_perplexity/second_seq": 14.47135066986084, + "eval_train_perplexity/seq": 8.858677864074707, + "eval_train_reconstruction/all": 0.2798105478286743, + "eval_train_reconstruction/end_span": 0.7067437767982483, + "eval_train_reconstruction/fim": 0.15550701320171356, + "eval_train_reconstruction/first_seq": 0.14749488234519958, + "eval_train_reconstruction/last_seq": 0.33364132046699524, + "eval_train_reconstruction/second_seq": 0.17560026049613953, + "eval_train_runtime": 439.5491, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, 
+ "step": 14050 + }, + { + "epoch": 0.05244585692650866, + "grad_norm": 0.31421297788619995, + "learning_rate": 0.0006, + "loss": 2.3145, + "step": 14060 + }, + { + "epoch": 0.05248315838947203, + "grad_norm": 0.3692414164543152, + "learning_rate": 0.0006, + "loss": 2.1359, + "step": 14070 + }, + { + "epoch": 0.05252045985243541, + "grad_norm": 0.4761316478252411, + "learning_rate": 0.0006, + "loss": 2.2862, + "step": 14080 + }, + { + "epoch": 0.05255776131539879, + "grad_norm": 0.345133513212204, + "learning_rate": 0.0006, + "loss": 2.2148, + "step": 14090 + }, + { + "epoch": 0.052595062778362164, + "grad_norm": 0.28841307759284973, + "learning_rate": 0.0006, + "loss": 2.3074, + "step": 14100 + }, + { + "epoch": 0.052595062778362164, + "eval_valid_loss": 2.2122809886932373, + "eval_valid_loss/all": 2.0726945400238037, + "eval_valid_loss/end_span": 1.2526538372039795, + "eval_valid_perplexity/batch": 7.9462056159973145, + "eval_valid_perplexity/end_span": 3.4996180534362793, + "eval_valid_perplexity/fim": 2.3948442935943604, + "eval_valid_perplexity/first_seq": 14.878692626953125, + "eval_valid_perplexity/last_seq": 9.228246688842773, + "eval_valid_perplexity/second_seq": 14.0301513671875, + "eval_valid_perplexity/seq": 8.950901985168457, + "eval_valid_reconstruction/all": 0.28873810172080994, + "eval_valid_reconstruction/end_span": 0.7013241648674011, + "eval_valid_reconstruction/fim": 0.1711030751466751, + "eval_valid_reconstruction/first_seq": 0.16747307777404785, + "eval_valid_reconstruction/last_seq": 0.3190557658672333, + "eval_valid_reconstruction/second_seq": 0.18815414607524872, + "eval_valid_runtime": 438.604, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 14100 + }, + { + "epoch": 0.052595062778362164, + "eval_train_loss": 2.208415985107422, + "eval_train_loss/all": 2.0416338443756104, + "eval_train_loss/end_span": 1.2284941673278809, + "eval_train_perplexity/batch": 7.703184604644775, + 
"eval_train_perplexity/end_span": 3.416081666946411, + "eval_train_perplexity/fim": 2.016767978668213, + "eval_train_perplexity/first_seq": 15.333633422851562, + "eval_train_perplexity/last_seq": 9.357928276062012, + "eval_train_perplexity/second_seq": 14.02641487121582, + "eval_train_perplexity/seq": 8.858963966369629, + "eval_train_reconstruction/all": 0.27936941385269165, + "eval_train_reconstruction/end_span": 0.7119804620742798, + "eval_train_reconstruction/fim": 0.13735359907150269, + "eval_train_reconstruction/first_seq": 0.15574724972248077, + "eval_train_reconstruction/last_seq": 0.3104930520057678, + "eval_train_reconstruction/second_seq": 0.19182585179805756, + "eval_train_runtime": 437.7279, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 14100 + }, + { + "epoch": 0.052632364241325544, + "grad_norm": 0.45169028639793396, + "learning_rate": 0.0006, + "loss": 2.2589, + "step": 14110 + }, + { + "epoch": 0.05266966570428892, + "grad_norm": 0.34194162487983704, + "learning_rate": 0.0006, + "loss": 2.2733, + "step": 14120 + }, + { + "epoch": 0.0527069671672523, + "grad_norm": 0.581635594367981, + "learning_rate": 0.0006, + "loss": 2.181, + "step": 14130 + }, + { + "epoch": 0.052744268630215675, + "grad_norm": 0.39112192392349243, + "learning_rate": 0.0006, + "loss": 2.1247, + "step": 14140 + }, + { + "epoch": 0.052781570093179055, + "grad_norm": 0.37492042779922485, + "learning_rate": 0.0006, + "loss": 2.2446, + "step": 14150 + }, + { + "epoch": 0.052781570093179055, + "eval_valid_loss": 2.212548017501831, + "eval_valid_loss/all": 2.073103427886963, + "eval_valid_loss/end_span": 1.3072350025177002, + "eval_valid_perplexity/batch": 7.949455261230469, + "eval_valid_perplexity/end_span": 3.6959402561187744, + "eval_valid_perplexity/fim": 2.336015462875366, + "eval_valid_perplexity/first_seq": 15.061624526977539, + "eval_valid_perplexity/last_seq": 9.174004554748535, + "eval_valid_perplexity/second_seq": 
14.10387134552002, + "eval_valid_perplexity/seq": 8.953107833862305, + "eval_valid_reconstruction/all": 0.288442462682724, + "eval_valid_reconstruction/end_span": 0.6926196813583374, + "eval_valid_reconstruction/fim": 0.1660800576210022, + "eval_valid_reconstruction/first_seq": 0.16874714195728302, + "eval_valid_reconstruction/last_seq": 0.31819021701812744, + "eval_valid_reconstruction/second_seq": 0.18513000011444092, + "eval_valid_runtime": 437.8697, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 14150 + }, + { + "epoch": 0.052781570093179055, + "eval_train_loss": 2.2129886150360107, + "eval_train_loss/all": 2.0445973873138428, + "eval_train_loss/end_span": 1.2696665525436401, + "eval_train_perplexity/batch": 7.726047515869141, + "eval_train_perplexity/end_span": 3.5596654415130615, + "eval_train_perplexity/fim": 2.212909698486328, + "eval_train_perplexity/first_seq": 15.508312225341797, + "eval_train_perplexity/last_seq": 9.43122673034668, + "eval_train_perplexity/second_seq": 14.526206016540527, + "eval_train_perplexity/seq": 8.881361961364746, + "eval_train_reconstruction/all": 0.27808448672294617, + "eval_train_reconstruction/end_span": 0.7047573924064636, + "eval_train_reconstruction/fim": 0.15451717376708984, + "eval_train_reconstruction/first_seq": 0.14868509769439697, + "eval_train_reconstruction/last_seq": 0.311159610748291, + "eval_train_reconstruction/second_seq": 0.17727342247962952, + "eval_train_runtime": 435.434, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 14150 + }, + { + "epoch": 0.052818871556142434, + "grad_norm": 0.3037920296192169, + "learning_rate": 0.0006, + "loss": 2.1927, + "step": 14160 + }, + { + "epoch": 0.05285617301910581, + "grad_norm": 0.5145536065101624, + "learning_rate": 0.0006, + "loss": 2.1834, + "step": 14170 + }, + { + "epoch": 0.052893474482069186, + "grad_norm": 1.161399245262146, + "learning_rate": 0.0006, + "loss": 2.3843, + 
"step": 14180 + }, + { + "epoch": 0.052930775945032565, + "grad_norm": 0.3581426441669464, + "learning_rate": 0.0006, + "loss": 2.2415, + "step": 14190 + }, + { + "epoch": 0.052968077407995945, + "grad_norm": 0.45331767201423645, + "learning_rate": 0.0006, + "loss": 2.2364, + "step": 14200 + }, + { + "epoch": 0.052968077407995945, + "eval_valid_loss": 2.2269134521484375, + "eval_valid_loss/all": 2.086449146270752, + "eval_valid_loss/end_span": 1.3517727851867676, + "eval_valid_perplexity/batch": 8.056257247924805, + "eval_valid_perplexity/end_span": 3.864269971847534, + "eval_valid_perplexity/fim": 2.384434700012207, + "eval_valid_perplexity/first_seq": 15.34715461730957, + "eval_valid_perplexity/last_seq": 9.135842323303223, + "eval_valid_perplexity/second_seq": 13.786832809448242, + "eval_valid_perplexity/seq": 9.078059196472168, + "eval_valid_reconstruction/all": 0.2843259274959564, + "eval_valid_reconstruction/end_span": 0.6929914355278015, + "eval_valid_reconstruction/fim": 0.16583819687366486, + "eval_valid_reconstruction/first_seq": 0.15770691633224487, + "eval_valid_reconstruction/last_seq": 0.31912365555763245, + "eval_valid_reconstruction/second_seq": 0.1992333084344864, + "eval_valid_runtime": 440.8845, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 14200 + }, + { + "epoch": 0.052968077407995945, + "eval_train_loss": 2.2227256298065186, + "eval_train_loss/all": 2.0540876388549805, + "eval_train_loss/end_span": 1.3184930086135864, + "eval_train_perplexity/batch": 7.799718379974365, + "eval_train_perplexity/end_span": 3.7377843856811523, + "eval_train_perplexity/fim": 1.9856661558151245, + "eval_train_perplexity/first_seq": 15.786588668823242, + "eval_train_perplexity/last_seq": 9.156512260437012, + "eval_train_perplexity/second_seq": 13.830507278442383, + "eval_train_perplexity/seq": 8.973944664001465, + "eval_train_reconstruction/all": 0.2754027545452118, + "eval_train_reconstruction/end_span": 
0.703872561454773, + "eval_train_reconstruction/fim": 0.13137002289295197, + "eval_train_reconstruction/first_seq": 0.14808495342731476, + "eval_train_reconstruction/last_seq": 0.31746789813041687, + "eval_train_reconstruction/second_seq": 0.19559292495250702, + "eval_train_runtime": 436.4723, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 14200 + }, + { + "epoch": 0.05300537887095932, + "grad_norm": 0.31447845697402954, + "learning_rate": 0.0006, + "loss": 2.0645, + "step": 14210 + }, + { + "epoch": 0.0530426803339227, + "grad_norm": 0.2981111407279968, + "learning_rate": 0.0006, + "loss": 2.4358, + "step": 14220 + }, + { + "epoch": 0.053079981796886076, + "grad_norm": 0.3609547019004822, + "learning_rate": 0.0006, + "loss": 2.27, + "step": 14230 + }, + { + "epoch": 0.05311728325984945, + "grad_norm": 0.40516385436058044, + "learning_rate": 0.0006, + "loss": 2.3452, + "step": 14240 + }, + { + "epoch": 0.05315458472281283, + "grad_norm": 0.37222054600715637, + "learning_rate": 0.0006, + "loss": 2.0699, + "step": 14250 + }, + { + "epoch": 0.05315458472281283, + "eval_valid_loss": 2.2155306339263916, + "eval_valid_loss/all": 2.076165199279785, + "eval_valid_loss/end_span": 1.2231005430221558, + "eval_valid_perplexity/batch": 7.973832130432129, + "eval_valid_perplexity/end_span": 3.3977062702178955, + "eval_valid_perplexity/fim": 2.108241319656372, + "eval_valid_perplexity/first_seq": 14.995001792907715, + "eval_valid_perplexity/last_seq": 9.065274238586426, + "eval_valid_perplexity/second_seq": 14.198705673217773, + "eval_valid_perplexity/seq": 8.988280296325684, + "eval_valid_reconstruction/all": 0.2873506247997284, + "eval_valid_reconstruction/end_span": 0.7094658613204956, + "eval_valid_reconstruction/fim": 0.1445332169532776, + "eval_valid_reconstruction/first_seq": 0.16772010922431946, + "eval_valid_reconstruction/last_seq": 0.32483208179473877, + "eval_valid_reconstruction/second_seq": 0.18719302117824554, + 
"eval_valid_runtime": 436.7283, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 14250 + }, + { + "epoch": 0.05315458472281283, + "eval_train_loss": 2.2119390964508057, + "eval_train_loss/all": 2.044794797897339, + "eval_train_loss/end_span": 1.1860650777816772, + "eval_train_perplexity/batch": 7.727572441101074, + "eval_train_perplexity/end_span": 3.274172306060791, + "eval_train_perplexity/fim": 2.024315595626831, + "eval_train_perplexity/first_seq": 15.890437126159668, + "eval_train_perplexity/last_seq": 9.032752990722656, + "eval_train_perplexity/second_seq": 14.692787170410156, + "eval_train_perplexity/seq": 8.891885757446289, + "eval_train_reconstruction/all": 0.27808502316474915, + "eval_train_reconstruction/end_span": 0.7231913208961487, + "eval_train_reconstruction/fim": 0.13717924058437347, + "eval_train_reconstruction/first_seq": 0.1455383449792862, + "eval_train_reconstruction/last_seq": 0.32153385877609253, + "eval_train_reconstruction/second_seq": 0.17549051344394684, + "eval_train_runtime": 435.8342, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 14250 + }, + { + "epoch": 0.05319188618577621, + "grad_norm": 0.4061858355998993, + "learning_rate": 0.0006, + "loss": 2.4085, + "step": 14260 + }, + { + "epoch": 0.05322918764873958, + "grad_norm": 0.5518601536750793, + "learning_rate": 0.0006, + "loss": 2.1308, + "step": 14270 + }, + { + "epoch": 0.05326648911170296, + "grad_norm": 0.4834247827529907, + "learning_rate": 0.0006, + "loss": 2.2333, + "step": 14280 + }, + { + "epoch": 0.05330379057466634, + "grad_norm": 0.35346513986587524, + "learning_rate": 0.0006, + "loss": 2.1911, + "step": 14290 + }, + { + "epoch": 0.05334109203762972, + "grad_norm": 0.5011093616485596, + "learning_rate": 0.0006, + "loss": 2.2591, + "step": 14300 + }, + { + "epoch": 0.05334109203762972, + "eval_valid_loss": 2.2103803157806396, + "eval_valid_loss/all": 2.0713388919830322, + 
"eval_valid_loss/end_span": 1.2896206378936768, + "eval_valid_perplexity/batch": 7.935440540313721, + "eval_valid_perplexity/end_span": 3.63140869140625, + "eval_valid_perplexity/fim": 2.2998335361480713, + "eval_valid_perplexity/first_seq": 15.080999374389648, + "eval_valid_perplexity/last_seq": 8.967866897583008, + "eval_valid_perplexity/second_seq": 13.528583526611328, + "eval_valid_perplexity/seq": 8.941742897033691, + "eval_valid_reconstruction/all": 0.2890538275241852, + "eval_valid_reconstruction/end_span": 0.7019956707954407, + "eval_valid_reconstruction/fim": 0.16272670030593872, + "eval_valid_reconstruction/first_seq": 0.16378651559352875, + "eval_valid_reconstruction/last_seq": 0.3267209529876709, + "eval_valid_reconstruction/second_seq": 0.2033989280462265, + "eval_valid_runtime": 436.3866, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 14300 + }, + { + "epoch": 0.05334109203762972, + "eval_train_loss": 2.2081665992736816, + "eval_train_loss/all": 2.0410101413726807, + "eval_train_loss/end_span": 1.2637070417404175, + "eval_train_perplexity/batch": 7.6983819007873535, + "eval_train_perplexity/end_span": 3.5385146141052246, + "eval_train_perplexity/fim": 2.0872161388397217, + "eval_train_perplexity/first_seq": 15.285717964172363, + "eval_train_perplexity/last_seq": 9.145387649536133, + "eval_train_perplexity/second_seq": 14.216272354125977, + "eval_train_perplexity/seq": 8.857029914855957, + "eval_train_reconstruction/all": 0.27935004234313965, + "eval_train_reconstruction/end_span": 0.710123598575592, + "eval_train_reconstruction/fim": 0.14412936568260193, + "eval_train_reconstruction/first_seq": 0.1555052101612091, + "eval_train_reconstruction/last_seq": 0.31954362988471985, + "eval_train_reconstruction/second_seq": 0.18339861929416656, + "eval_train_runtime": 437.8329, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 14300 + }, + { + "epoch": 0.05337839350059309, + 
"grad_norm": 0.4438125491142273, + "learning_rate": 0.0006, + "loss": 2.2726, + "step": 14310 + }, + { + "epoch": 0.05341569496355647, + "grad_norm": 0.38581350445747375, + "learning_rate": 0.0006, + "loss": 2.2648, + "step": 14320 + }, + { + "epoch": 0.05345299642651985, + "grad_norm": 0.30637243390083313, + "learning_rate": 0.0006, + "loss": 2.1771, + "step": 14330 + }, + { + "epoch": 0.05349029788948322, + "grad_norm": 0.3755868375301361, + "learning_rate": 0.0006, + "loss": 2.2521, + "step": 14340 + }, + { + "epoch": 0.0535275993524466, + "grad_norm": 0.5236443281173706, + "learning_rate": 0.0006, + "loss": 2.0682, + "step": 14350 + }, + { + "epoch": 0.0535275993524466, + "eval_valid_loss": 2.2108402252197266, + "eval_valid_loss/all": 2.0718748569488525, + "eval_valid_loss/end_span": 1.2155648469924927, + "eval_valid_perplexity/batch": 7.939694881439209, + "eval_valid_perplexity/end_span": 3.3721983432769775, + "eval_valid_perplexity/fim": 2.015650749206543, + "eval_valid_perplexity/first_seq": 15.126498222351074, + "eval_valid_perplexity/last_seq": 8.72557258605957, + "eval_valid_perplexity/second_seq": 13.763416290283203, + "eval_valid_perplexity/seq": 8.951125144958496, + "eval_valid_reconstruction/all": 0.2888835370540619, + "eval_valid_reconstruction/end_span": 0.7136959433555603, + "eval_valid_reconstruction/fim": 0.13724090158939362, + "eval_valid_reconstruction/first_seq": 0.16239936649799347, + "eval_valid_reconstruction/last_seq": 0.33790627121925354, + "eval_valid_reconstruction/second_seq": 0.19880393147468567, + "eval_valid_runtime": 440.1889, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 14350 + }, + { + "epoch": 0.0535275993524466, + "eval_train_loss": 2.208038568496704, + "eval_train_loss/all": 2.041684865951538, + "eval_train_loss/end_span": 1.1765811443328857, + "eval_train_perplexity/batch": 7.703577995300293, + "eval_train_perplexity/end_span": 3.243267059326172, + "eval_train_perplexity/fim": 
2.204021692276001, + "eval_train_perplexity/first_seq": 15.582085609436035, + "eval_train_perplexity/last_seq": 9.560074806213379, + "eval_train_perplexity/second_seq": 14.087907791137695, + "eval_train_perplexity/seq": 8.864544868469238, + "eval_train_reconstruction/all": 0.279090017080307, + "eval_train_reconstruction/end_span": 0.7260403633117676, + "eval_train_reconstruction/fim": 0.1542651355266571, + "eval_train_reconstruction/first_seq": 0.1501864492893219, + "eval_train_reconstruction/last_seq": 0.30609115958213806, + "eval_train_reconstruction/second_seq": 0.18681402504444122, + "eval_train_runtime": 441.0444, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 14350 + }, + { + "epoch": 0.05356490081540998, + "grad_norm": 0.5202054381370544, + "learning_rate": 0.0006, + "loss": 2.2517, + "step": 14360 + }, + { + "epoch": 0.05360220227837336, + "grad_norm": 0.3084091544151306, + "learning_rate": 0.0006, + "loss": 2.3977, + "step": 14370 + }, + { + "epoch": 0.05363950374133673, + "grad_norm": 0.47532787919044495, + "learning_rate": 0.0006, + "loss": 2.4015, + "step": 14380 + }, + { + "epoch": 0.05367680520430011, + "grad_norm": 0.35925403237342834, + "learning_rate": 0.0006, + "loss": 2.15, + "step": 14390 + }, + { + "epoch": 0.05371410666726349, + "grad_norm": 0.39670267701148987, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 14400 + }, + { + "epoch": 0.05371410666726349, + "eval_valid_loss": 2.210951089859009, + "eval_valid_loss/all": 2.071929931640625, + "eval_valid_loss/end_span": 1.1800466775894165, + "eval_valid_perplexity/batch": 7.940132141113281, + "eval_valid_perplexity/end_span": 3.254526138305664, + "eval_valid_perplexity/fim": 2.4551539421081543, + "eval_valid_perplexity/first_seq": 14.91812801361084, + "eval_valid_perplexity/last_seq": 9.669408798217773, + "eval_valid_perplexity/second_seq": 13.669366836547852, + "eval_valid_perplexity/seq": 8.949783325195312, + "eval_valid_reconstruction/all": 
0.28869324922561646, + "eval_valid_reconstruction/end_span": 0.731060802936554, + "eval_valid_reconstruction/fim": 0.17602258920669556, + "eval_valid_reconstruction/first_seq": 0.16828270256519318, + "eval_valid_reconstruction/last_seq": 0.3051030933856964, + "eval_valid_reconstruction/second_seq": 0.19788725674152374, + "eval_valid_runtime": 439.0141, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 14400 + }, + { + "epoch": 0.05371410666726349, + "eval_train_loss": 2.207105875015259, + "eval_train_loss/all": 2.0410478115081787, + "eval_train_loss/end_span": 1.1419947147369385, + "eval_train_perplexity/batch": 7.698671817779541, + "eval_train_perplexity/end_span": 3.13301157951355, + "eval_train_perplexity/fim": 2.154959201812744, + "eval_train_perplexity/first_seq": 15.514946937561035, + "eval_train_perplexity/last_seq": 8.59494400024414, + "eval_train_perplexity/second_seq": 14.621854782104492, + "eval_train_perplexity/seq": 8.861682891845703, + "eval_train_reconstruction/all": 0.279092401266098, + "eval_train_reconstruction/end_span": 0.7416321635246277, + "eval_train_reconstruction/fim": 0.15056940913200378, + "eval_train_reconstruction/first_seq": 0.15187321603298187, + "eval_train_reconstruction/last_seq": 0.3360902667045593, + "eval_train_reconstruction/second_seq": 0.17292527854442596, + "eval_train_runtime": 441.7764, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 14400 + }, + { + "epoch": 0.053751408130226865, + "grad_norm": 0.30167528986930847, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 14410 + }, + { + "epoch": 0.053788709593190244, + "grad_norm": 0.821892499923706, + "learning_rate": 0.0006, + "loss": 2.2725, + "step": 14420 + }, + { + "epoch": 0.05382601105615362, + "grad_norm": 0.5540940165519714, + "learning_rate": 0.0006, + "loss": 2.2921, + "step": 14430 + }, + { + "epoch": 0.053863312519117, + "grad_norm": 0.2752131521701813, + "learning_rate": 
0.0006, + "loss": 2.2091, + "step": 14440 + }, + { + "epoch": 0.053900613982080375, + "grad_norm": 0.2831604480743408, + "learning_rate": 0.0006, + "loss": 2.2654, + "step": 14450 + }, + { + "epoch": 0.053900613982080375, + "eval_valid_loss": 2.2130279541015625, + "eval_valid_loss/all": 2.0735020637512207, + "eval_valid_loss/end_span": 1.2998967170715332, + "eval_valid_perplexity/batch": 7.952624797821045, + "eval_valid_perplexity/end_span": 3.668917655944824, + "eval_valid_perplexity/fim": 2.4348745346069336, + "eval_valid_perplexity/first_seq": 14.798759460449219, + "eval_valid_perplexity/last_seq": 9.50021743774414, + "eval_valid_perplexity/second_seq": 13.944199562072754, + "eval_valid_perplexity/seq": 8.960954666137695, + "eval_valid_reconstruction/all": 0.2884306013584137, + "eval_valid_reconstruction/end_span": 0.6919389367103577, + "eval_valid_reconstruction/fim": 0.1731739491224289, + "eval_valid_reconstruction/first_seq": 0.16855792701244354, + "eval_valid_reconstruction/last_seq": 0.3094586133956909, + "eval_valid_reconstruction/second_seq": 0.1887999176979065, + "eval_valid_runtime": 437.9534, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 14450 + }, + { + "epoch": 0.053900613982080375, + "eval_train_loss": 2.209195852279663, + "eval_train_loss/all": 2.0425846576690674, + "eval_train_loss/end_span": 1.2716556787490845, + "eval_train_perplexity/batch": 7.710512638092041, + "eval_train_perplexity/end_span": 3.5667531490325928, + "eval_train_perplexity/fim": 2.158080577850342, + "eval_train_perplexity/first_seq": 15.561361312866211, + "eval_train_perplexity/last_seq": 9.036849975585938, + "eval_train_perplexity/second_seq": 14.221641540527344, + "eval_train_perplexity/seq": 8.874107360839844, + "eval_train_reconstruction/all": 0.2787482440471649, + "eval_train_reconstruction/end_span": 0.7013913989067078, + "eval_train_reconstruction/fim": 0.1506417989730835, + "eval_train_reconstruction/first_seq": 
0.14978572726249695, + "eval_train_reconstruction/last_seq": 0.3222389817237854, + "eval_train_reconstruction/second_seq": 0.1838632971048355, + "eval_train_runtime": 438.479, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 14450 + }, + { + "epoch": 0.053937915445043755, + "grad_norm": 0.4304949641227722, + "learning_rate": 0.0006, + "loss": 1.9499, + "step": 14460 + }, + { + "epoch": 0.053975216908007134, + "grad_norm": 0.29628047347068787, + "learning_rate": 0.0006, + "loss": 2.2997, + "step": 14470 + }, + { + "epoch": 0.05401251837097051, + "grad_norm": 0.34781453013420105, + "learning_rate": 0.0006, + "loss": 2.257, + "step": 14480 + }, + { + "epoch": 0.054049819833933886, + "grad_norm": 0.2835012376308441, + "learning_rate": 0.0006, + "loss": 2.26, + "step": 14490 + }, + { + "epoch": 0.054087121296897266, + "grad_norm": 0.49867042899131775, + "learning_rate": 0.0006, + "loss": 2.227, + "step": 14500 + }, + { + "epoch": 0.054087121296897266, + "eval_valid_loss": 2.2100183963775635, + "eval_valid_loss/all": 2.070930242538452, + "eval_valid_loss/end_span": 1.3595482110977173, + "eval_valid_perplexity/batch": 7.932198524475098, + "eval_valid_perplexity/end_span": 3.8944334983825684, + "eval_valid_perplexity/fim": 2.402085065841675, + "eval_valid_perplexity/first_seq": 14.829212188720703, + "eval_valid_perplexity/last_seq": 9.42387866973877, + "eval_valid_perplexity/second_seq": 13.703365325927734, + "eval_valid_perplexity/seq": 8.94162368774414, + "eval_valid_reconstruction/all": 0.28891077637672424, + "eval_valid_reconstruction/end_span": 0.682004988193512, + "eval_valid_reconstruction/fim": 0.17139850556850433, + "eval_valid_reconstruction/first_seq": 0.16782167553901672, + "eval_valid_reconstruction/last_seq": 0.31392157077789307, + "eval_valid_reconstruction/second_seq": 0.19831953942775726, + "eval_valid_runtime": 435.8732, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 
14500 + }, + { + "epoch": 0.054087121296897266, + "eval_train_loss": 2.2074859142303467, + "eval_train_loss/all": 2.04162335395813, + "eval_train_loss/end_span": 1.314995288848877, + "eval_train_perplexity/batch": 7.703104019165039, + "eval_train_perplexity/end_span": 3.724733352661133, + "eval_train_perplexity/fim": 2.1553962230682373, + "eval_train_perplexity/first_seq": 15.454279899597168, + "eval_train_perplexity/last_seq": 9.13154125213623, + "eval_train_perplexity/second_seq": 14.3528470993042, + "eval_train_perplexity/seq": 8.870530128479004, + "eval_train_reconstruction/all": 0.2790939211845398, + "eval_train_reconstruction/end_span": 0.6959392428398132, + "eval_train_reconstruction/fim": 0.15057241916656494, + "eval_train_reconstruction/first_seq": 0.15155836939811707, + "eval_train_reconstruction/last_seq": 0.3200291395187378, + "eval_train_reconstruction/second_seq": 0.18184134364128113, + "eval_train_runtime": 434.4137, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 14500 + }, + { + "epoch": 0.054124422759860645, + "grad_norm": 0.4263724386692047, + "learning_rate": 0.0006, + "loss": 2.3482, + "step": 14510 + }, + { + "epoch": 0.05416172422282402, + "grad_norm": 0.4056016206741333, + "learning_rate": 0.0006, + "loss": 2.135, + "step": 14520 + }, + { + "epoch": 0.0541990256857874, + "grad_norm": 0.40068453550338745, + "learning_rate": 0.0006, + "loss": 2.2402, + "step": 14530 + }, + { + "epoch": 0.054236327148750776, + "grad_norm": 0.3775457739830017, + "learning_rate": 0.0006, + "loss": 2.2569, + "step": 14540 + }, + { + "epoch": 0.05427362861171415, + "grad_norm": 0.42796334624290466, + "learning_rate": 0.0006, + "loss": 2.2804, + "step": 14550 + }, + { + "epoch": 0.05427362861171415, + "eval_valid_loss": 2.210376501083374, + "eval_valid_loss/all": 2.0710086822509766, + "eval_valid_loss/end_span": 1.2592424154281616, + "eval_valid_perplexity/batch": 7.932820796966553, + "eval_valid_perplexity/end_span": 
3.522751808166504, + "eval_valid_perplexity/fim": 2.6050143241882324, + "eval_valid_perplexity/first_seq": 15.035393714904785, + "eval_valid_perplexity/last_seq": 9.390796661376953, + "eval_valid_perplexity/second_seq": 13.637466430664062, + "eval_valid_perplexity/seq": 8.940110206604004, + "eval_valid_reconstruction/all": 0.2892795205116272, + "eval_valid_reconstruction/end_span": 0.6985699534416199, + "eval_valid_reconstruction/fim": 0.18755748867988586, + "eval_valid_reconstruction/first_seq": 0.16477984189987183, + "eval_valid_reconstruction/last_seq": 0.3149089217185974, + "eval_valid_reconstruction/second_seq": 0.19970425963401794, + "eval_valid_runtime": 438.6268, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 14550 + }, + { + "epoch": 0.05427362861171415, + "eval_train_loss": 2.2063095569610596, + "eval_train_loss/all": 2.040179491043091, + "eval_train_loss/end_span": 1.2222235202789307, + "eval_train_perplexity/batch": 7.691989898681641, + "eval_train_perplexity/end_span": 3.3947277069091797, + "eval_train_perplexity/fim": 2.225386142730713, + "eval_train_perplexity/first_seq": 15.80762004852295, + "eval_train_perplexity/last_seq": 9.143244743347168, + "eval_train_perplexity/second_seq": 14.224466323852539, + "eval_train_perplexity/seq": 8.8523588180542, + "eval_train_reconstruction/all": 0.27977216243743896, + "eval_train_reconstruction/end_span": 0.7102017998695374, + "eval_train_reconstruction/fim": 0.15668481588363647, + "eval_train_reconstruction/first_seq": 0.14791959524154663, + "eval_train_reconstruction/last_seq": 0.3198028802871704, + "eval_train_reconstruction/second_seq": 0.18709677457809448, + "eval_train_runtime": 435.4671, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 14550 + }, + { + "epoch": 0.05431093007467753, + "grad_norm": 0.35708534717559814, + "learning_rate": 0.0006, + "loss": 2.1763, + "step": 14560 + }, + { + "epoch": 0.05434823153764091, + 
"grad_norm": 0.51427161693573, + "learning_rate": 0.0006, + "loss": 2.0432, + "step": 14570 + }, + { + "epoch": 0.05438553300060428, + "grad_norm": 0.5193996429443359, + "learning_rate": 0.0006, + "loss": 2.2385, + "step": 14580 + }, + { + "epoch": 0.05442283446356766, + "grad_norm": 0.3440590798854828, + "learning_rate": 0.0006, + "loss": 2.226, + "step": 14590 + }, + { + "epoch": 0.05446013592653104, + "grad_norm": 0.40886056423187256, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 14600 + }, + { + "epoch": 0.05446013592653104, + "eval_valid_loss": 2.210634231567383, + "eval_valid_loss/all": 2.071443557739258, + "eval_valid_loss/end_span": 1.3434066772460938, + "eval_valid_perplexity/batch": 7.9362711906433105, + "eval_valid_perplexity/end_span": 3.832075834274292, + "eval_valid_perplexity/fim": 2.2673144340515137, + "eval_valid_perplexity/first_seq": 14.975109100341797, + "eval_valid_perplexity/last_seq": 8.871973991394043, + "eval_valid_perplexity/second_seq": 13.676920890808105, + "eval_valid_perplexity/seq": 8.944686889648438, + "eval_valid_reconstruction/all": 0.2891407608985901, + "eval_valid_reconstruction/end_span": 0.690875232219696, + "eval_valid_reconstruction/fim": 0.15880215167999268, + "eval_valid_reconstruction/first_seq": 0.16585305333137512, + "eval_valid_reconstruction/last_seq": 0.3323828876018524, + "eval_valid_reconstruction/second_seq": 0.19532275199890137, + "eval_valid_runtime": 434.5, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 14600 + }, + { + "epoch": 0.05446013592653104, + "eval_train_loss": 2.2068727016448975, + "eval_train_loss/all": 2.0410726070404053, + "eval_train_loss/end_span": 1.3129017353057861, + "eval_train_perplexity/batch": 7.698862552642822, + "eval_train_perplexity/end_span": 3.7169437408447266, + "eval_train_perplexity/fim": 2.1066129207611084, + "eval_train_perplexity/first_seq": 15.707079887390137, + "eval_train_perplexity/last_seq": 8.668389320373535, + 
"eval_train_perplexity/second_seq": 14.215553283691406, + "eval_train_perplexity/seq": 8.859485626220703, + "eval_train_reconstruction/all": 0.2792874872684479, + "eval_train_reconstruction/end_span": 0.6986305713653564, + "eval_train_reconstruction/fim": 0.14487716555595398, + "eval_train_reconstruction/first_seq": 0.1492154896259308, + "eval_train_reconstruction/last_seq": 0.33656972646713257, + "eval_train_reconstruction/second_seq": 0.1810092031955719, + "eval_train_runtime": 436.68, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 14600 + }, + { + "epoch": 0.05449743738949442, + "grad_norm": 0.37237250804901123, + "learning_rate": 0.0006, + "loss": 2.2931, + "step": 14610 + }, + { + "epoch": 0.05453473885245779, + "grad_norm": 0.2960972785949707, + "learning_rate": 0.0006, + "loss": 2.2885, + "step": 14620 + }, + { + "epoch": 0.05457204031542117, + "grad_norm": 0.3977106213569641, + "learning_rate": 0.0006, + "loss": 2.3597, + "step": 14630 + }, + { + "epoch": 0.05460934177838455, + "grad_norm": 0.3131948709487915, + "learning_rate": 0.0006, + "loss": 2.2712, + "step": 14640 + }, + { + "epoch": 0.05464664324134792, + "grad_norm": 0.4789983928203583, + "learning_rate": 0.0006, + "loss": 2.2418, + "step": 14650 + }, + { + "epoch": 0.05464664324134792, + "eval_valid_loss": 2.211676597595215, + "eval_valid_loss/all": 2.0723838806152344, + "eval_valid_loss/end_span": 1.248513102531433, + "eval_valid_perplexity/batch": 7.943737506866455, + "eval_valid_perplexity/end_span": 3.485157012939453, + "eval_valid_perplexity/fim": 2.2114601135253906, + "eval_valid_perplexity/first_seq": 15.072426795959473, + "eval_valid_perplexity/last_seq": 9.250936508178711, + "eval_valid_perplexity/second_seq": 14.380817413330078, + "eval_valid_perplexity/seq": 8.955195426940918, + "eval_valid_reconstruction/all": 0.2890656292438507, + "eval_valid_reconstruction/end_span": 0.7040707468986511, + "eval_valid_reconstruction/fim": 0.15493108332157135, 
+ "eval_valid_reconstruction/first_seq": 0.16244786977767944, + "eval_valid_reconstruction/last_seq": 0.31518927216529846, + "eval_valid_reconstruction/second_seq": 0.1806870996952057, + "eval_valid_runtime": 435.8422, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 14650 + }, + { + "epoch": 0.05464664324134792, + "eval_train_loss": 2.207615613937378, + "eval_train_loss/all": 2.0414865016937256, + "eval_train_loss/end_span": 1.2195053100585938, + "eval_train_perplexity/batch": 7.702049732208252, + "eval_train_perplexity/end_span": 3.385512590408325, + "eval_train_perplexity/fim": 2.0237176418304443, + "eval_train_perplexity/first_seq": 15.596023559570312, + "eval_train_perplexity/last_seq": 9.064818382263184, + "eval_train_perplexity/second_seq": 14.301557540893555, + "eval_train_perplexity/seq": 8.864821434020996, + "eval_train_reconstruction/all": 0.27935341000556946, + "eval_train_reconstruction/end_span": 0.7126399278640747, + "eval_train_reconstruction/fim": 0.13780908286571503, + "eval_train_reconstruction/first_seq": 0.14862145483493805, + "eval_train_reconstruction/last_seq": 0.3217496871948242, + "eval_train_reconstruction/second_seq": 0.1819525510072708, + "eval_train_runtime": 436.1565, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 14650 + }, + { + "epoch": 0.0546839447043113, + "grad_norm": 0.29622790217399597, + "learning_rate": 0.0006, + "loss": 2.3016, + "step": 14660 + }, + { + "epoch": 0.05472124616727468, + "grad_norm": 0.3486441969871521, + "learning_rate": 0.0006, + "loss": 2.2704, + "step": 14670 + }, + { + "epoch": 0.05475854763023806, + "grad_norm": 0.373584508895874, + "learning_rate": 0.0006, + "loss": 2.2478, + "step": 14680 + }, + { + "epoch": 0.05479584909320143, + "grad_norm": 0.5372012257575989, + "learning_rate": 0.0006, + "loss": 2.3356, + "step": 14690 + }, + { + "epoch": 0.05483315055616481, + "grad_norm": 10.811714172363281, + "learning_rate": 
0.0006, + "loss": 2.171, + "step": 14700 + }, + { + "epoch": 0.05483315055616481, + "eval_valid_loss": 2.2123286724090576, + "eval_valid_loss/all": 2.0729875564575195, + "eval_valid_loss/end_span": 1.2192881107330322, + "eval_valid_perplexity/batch": 7.9485344886779785, + "eval_valid_perplexity/end_span": 3.384777307510376, + "eval_valid_perplexity/fim": 2.3751556873321533, + "eval_valid_perplexity/first_seq": 14.880012512207031, + "eval_valid_perplexity/last_seq": 9.310142517089844, + "eval_valid_perplexity/second_seq": 13.7443208694458, + "eval_valid_perplexity/seq": 8.948249816894531, + "eval_valid_reconstruction/all": 0.2880995273590088, + "eval_valid_reconstruction/end_span": 0.7221905589103699, + "eval_valid_reconstruction/fim": 0.16735321283340454, + "eval_valid_reconstruction/first_seq": 0.16716347634792328, + "eval_valid_reconstruction/last_seq": 0.31706464290618896, + "eval_valid_reconstruction/second_seq": 0.19832652807235718, + "eval_valid_runtime": 431.7601, + "eval_valid_samples_per_second": 0.445, + "eval_valid_steps_per_second": 0.445, + "step": 14700 + }, + { + "epoch": 0.05483315055616481, + "eval_train_loss": 2.210329294204712, + "eval_train_loss/all": 2.043581485748291, + "eval_train_loss/end_span": 1.174951195716858, + "eval_train_perplexity/batch": 7.718202590942383, + "eval_train_perplexity/end_span": 3.2379848957061768, + "eval_train_perplexity/fim": 2.260444164276123, + "eval_train_perplexity/first_seq": 15.294864654541016, + "eval_train_perplexity/last_seq": 9.148067474365234, + "eval_train_perplexity/second_seq": 14.018271446228027, + "eval_train_perplexity/seq": 8.874146461486816, + "eval_train_reconstruction/all": 0.27800998091697693, + "eval_train_reconstruction/end_span": 0.7348856925964355, + "eval_train_reconstruction/fim": 0.15776222944259644, + "eval_train_reconstruction/first_seq": 0.15555442869663239, + "eval_train_reconstruction/last_seq": 0.32079002261161804, + "eval_train_reconstruction/second_seq": 0.18894344568252563, + 
"eval_train_runtime": 435.2976, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 14700 + }, + { + "epoch": 0.05487045201912819, + "grad_norm": 0.32744869589805603, + "learning_rate": 0.0006, + "loss": 2.2533, + "step": 14710 + }, + { + "epoch": 0.054907753482091565, + "grad_norm": 0.2749288082122803, + "learning_rate": 0.0006, + "loss": 2.3192, + "step": 14720 + }, + { + "epoch": 0.054945054945054944, + "grad_norm": 0.30212071537971497, + "learning_rate": 0.0006, + "loss": 2.2197, + "step": 14730 + }, + { + "epoch": 0.054982356408018324, + "grad_norm": 0.41284090280532837, + "learning_rate": 0.0006, + "loss": 2.3013, + "step": 14740 + }, + { + "epoch": 0.0550196578709817, + "grad_norm": 0.45811590552330017, + "learning_rate": 0.0006, + "loss": 2.2275, + "step": 14750 + }, + { + "epoch": 0.0550196578709817, + "eval_valid_loss": 2.2114474773406982, + "eval_valid_loss/all": 2.072172164916992, + "eval_valid_loss/end_span": 1.3901633024215698, + "eval_valid_perplexity/batch": 7.942055702209473, + "eval_valid_perplexity/end_span": 4.015505790710449, + "eval_valid_perplexity/fim": 2.3515865802764893, + "eval_valid_perplexity/first_seq": 14.875174522399902, + "eval_valid_perplexity/last_seq": 8.860702514648438, + "eval_valid_perplexity/second_seq": 14.030742645263672, + "eval_valid_perplexity/seq": 8.947840690612793, + "eval_valid_reconstruction/all": 0.2891574203968048, + "eval_valid_reconstruction/end_span": 0.671418309211731, + "eval_valid_reconstruction/fim": 0.16751350462436676, + "eval_valid_reconstruction/first_seq": 0.1679454892873764, + "eval_valid_reconstruction/last_seq": 0.3325899541378021, + "eval_valid_reconstruction/second_seq": 0.18925754725933075, + "eval_valid_runtime": 435.1022, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 14750 + }, + { + "epoch": 0.0550196578709817, + "eval_train_loss": 2.20768141746521, + "eval_train_loss/all": 2.0414392948150635, + 
"eval_train_loss/end_span": 1.3552192449569702, + "eval_train_perplexity/batch": 7.701686382293701, + "eval_train_perplexity/end_span": 3.877610921859741, + "eval_train_perplexity/fim": 2.0996289253234863, + "eval_train_perplexity/first_seq": 15.124406814575195, + "eval_train_perplexity/last_seq": 8.689337730407715, + "eval_train_perplexity/second_seq": 14.098115921020508, + "eval_train_perplexity/seq": 8.860466957092285, + "eval_train_reconstruction/all": 0.2794196605682373, + "eval_train_reconstruction/end_span": 0.6804445385932922, + "eval_train_reconstruction/fim": 0.14601729810237885, + "eval_train_reconstruction/first_seq": 0.15983006358146667, + "eval_train_reconstruction/last_seq": 0.33616259694099426, + "eval_train_reconstruction/second_seq": 0.19038191437721252, + "eval_train_runtime": 434.4438, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 14750 + }, + { + "epoch": 0.055056959333945076, + "grad_norm": 0.32891449332237244, + "learning_rate": 0.0006, + "loss": 2.2946, + "step": 14760 + }, + { + "epoch": 0.055094260796908455, + "grad_norm": 0.4361763298511505, + "learning_rate": 0.0006, + "loss": 2.293, + "step": 14770 + }, + { + "epoch": 0.055131562259871834, + "grad_norm": 0.30128785967826843, + "learning_rate": 0.0006, + "loss": 2.1983, + "step": 14780 + }, + { + "epoch": 0.05516886372283521, + "grad_norm": 0.24318256974220276, + "learning_rate": 0.0006, + "loss": 2.1977, + "step": 14790 + }, + { + "epoch": 0.055206165185798586, + "grad_norm": 0.4573081135749817, + "learning_rate": 0.0006, + "loss": 2.034, + "step": 14800 + }, + { + "epoch": 0.055206165185798586, + "eval_valid_loss": 2.215014696121216, + "eval_valid_loss/all": 2.075496196746826, + "eval_valid_loss/end_span": 1.3140673637390137, + "eval_valid_perplexity/batch": 7.968499183654785, + "eval_valid_perplexity/end_span": 3.721278667449951, + "eval_valid_perplexity/fim": 2.5102035999298096, + "eval_valid_perplexity/first_seq": 15.023977279663086, + 
"eval_valid_perplexity/last_seq": 9.069610595703125, + "eval_valid_perplexity/second_seq": 13.916438102722168, + "eval_valid_perplexity/seq": 8.975824356079102, + "eval_valid_reconstruction/all": 0.28678610920906067, + "eval_valid_reconstruction/end_span": 0.6916280388832092, + "eval_valid_reconstruction/fim": 0.1773296594619751, + "eval_valid_reconstruction/first_seq": 0.16306868195533752, + "eval_valid_reconstruction/last_seq": 0.32596921920776367, + "eval_valid_reconstruction/second_seq": 0.19009144604206085, + "eval_valid_runtime": 434.5443, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 14800 + }, + { + "epoch": 0.055206165185798586, + "eval_train_loss": 2.212902307510376, + "eval_train_loss/all": 2.045855760574341, + "eval_train_loss/end_span": 1.2714240550994873, + "eval_train_perplexity/batch": 7.735775470733643, + "eval_train_perplexity/end_span": 3.565927028656006, + "eval_train_perplexity/fim": 2.169118881225586, + "eval_train_perplexity/first_seq": 15.608942031860352, + "eval_train_perplexity/last_seq": 9.099899291992188, + "eval_train_perplexity/second_seq": 14.32688045501709, + "eval_train_perplexity/seq": 8.895846366882324, + "eval_train_reconstruction/all": 0.27751943469047546, + "eval_train_reconstruction/end_span": 0.7032757997512817, + "eval_train_reconstruction/fim": 0.15073207020759583, + "eval_train_reconstruction/first_seq": 0.14920294284820557, + "eval_train_reconstruction/last_seq": 0.32219570875167847, + "eval_train_reconstruction/second_seq": 0.1839357614517212, + "eval_train_runtime": 436.8853, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 14800 + }, + { + "epoch": 0.055243466648761966, + "grad_norm": 0.555713415145874, + "learning_rate": 0.0006, + "loss": 2.1951, + "step": 14810 + }, + { + "epoch": 0.055280768111725345, + "grad_norm": 0.34099313616752625, + "learning_rate": 0.0006, + "loss": 2.3568, + "step": 14820 + }, + { + "epoch": 
0.05531806957468872, + "grad_norm": 0.39469513297080994, + "learning_rate": 0.0006, + "loss": 2.2542, + "step": 14830 + }, + { + "epoch": 0.0553553710376521, + "grad_norm": 0.34062716364860535, + "learning_rate": 0.0006, + "loss": 2.21, + "step": 14840 + }, + { + "epoch": 0.05539267250061548, + "grad_norm": 0.37436002492904663, + "learning_rate": 0.0006, + "loss": 2.3733, + "step": 14850 + }, + { + "epoch": 0.05539267250061548, + "eval_valid_loss": 2.2121222019195557, + "eval_valid_loss/all": 2.0730016231536865, + "eval_valid_loss/end_span": 1.280105710029602, + "eval_valid_perplexity/batch": 7.948646068572998, + "eval_valid_perplexity/end_span": 3.597019910812378, + "eval_valid_perplexity/fim": 2.519871950149536, + "eval_valid_perplexity/first_seq": 14.809743881225586, + "eval_valid_perplexity/last_seq": 8.850278854370117, + "eval_valid_perplexity/second_seq": 14.243590354919434, + "eval_valid_perplexity/seq": 8.952779769897461, + "eval_valid_reconstruction/all": 0.2881387174129486, + "eval_valid_reconstruction/end_span": 0.7013394236564636, + "eval_valid_reconstruction/fim": 0.17973586916923523, + "eval_valid_reconstruction/first_seq": 0.1666325032711029, + "eval_valid_reconstruction/last_seq": 0.33147403597831726, + "eval_valid_reconstruction/second_seq": 0.18547846376895905, + "eval_valid_runtime": 434.6576, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 14850 + }, + { + "epoch": 0.05539267250061548, + "eval_train_loss": 2.2062666416168213, + "eval_train_loss/all": 2.039886236190796, + "eval_train_loss/end_span": 1.2322171926498413, + "eval_train_perplexity/batch": 7.68973445892334, + "eval_train_perplexity/end_span": 3.428823471069336, + "eval_train_perplexity/fim": 2.467855215072632, + "eval_train_perplexity/first_seq": 15.707270622253418, + "eval_train_perplexity/last_seq": 8.93169116973877, + "eval_train_perplexity/second_seq": 13.815716743469238, + "eval_train_perplexity/seq": 8.845722198486328, + 
"eval_train_reconstruction/all": 0.2794303297996521, + "eval_train_reconstruction/end_span": 0.7163914442062378, + "eval_train_reconstruction/fim": 0.17683564126491547, + "eval_train_reconstruction/first_seq": 0.14730994403362274, + "eval_train_reconstruction/last_seq": 0.32701703906059265, + "eval_train_reconstruction/second_seq": 0.1916656345129013, + "eval_train_runtime": 439.5387, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 14850 + }, + { + "epoch": 0.05542997396357885, + "grad_norm": 0.2514016032218933, + "learning_rate": 0.0006, + "loss": 2.4777, + "step": 14860 + }, + { + "epoch": 0.05546727542654223, + "grad_norm": 0.4139682352542877, + "learning_rate": 0.0006, + "loss": 2.3179, + "step": 14870 + }, + { + "epoch": 0.05550457688950561, + "grad_norm": 0.7526871562004089, + "learning_rate": 0.0006, + "loss": 2.1972, + "step": 14880 + }, + { + "epoch": 0.05554187835246898, + "grad_norm": 0.2883457839488983, + "learning_rate": 0.0006, + "loss": 2.2826, + "step": 14890 + }, + { + "epoch": 0.05557917981543236, + "grad_norm": 0.3600759208202362, + "learning_rate": 0.0006, + "loss": 2.0602, + "step": 14900 + }, + { + "epoch": 0.05557917981543236, + "eval_valid_loss": 2.217109441757202, + "eval_valid_loss/all": 2.0774877071380615, + "eval_valid_loss/end_span": 1.379617691040039, + "eval_valid_perplexity/batch": 7.984384536743164, + "eval_valid_perplexity/end_span": 3.9733822345733643, + "eval_valid_perplexity/fim": 2.314455986022949, + "eval_valid_perplexity/first_seq": 15.0635347366333, + "eval_valid_perplexity/last_seq": 9.204991340637207, + "eval_valid_perplexity/second_seq": 13.80150032043457, + "eval_valid_perplexity/seq": 8.99567699432373, + "eval_valid_reconstruction/all": 0.2873586118221283, + "eval_valid_reconstruction/end_span": 0.6784994602203369, + "eval_valid_reconstruction/fim": 0.16248512268066406, + "eval_valid_reconstruction/first_seq": 0.16470371186733246, + "eval_valid_reconstruction/last_seq": 
0.3209441900253296, + "eval_valid_reconstruction/second_seq": 0.19366292655467987, + "eval_valid_runtime": 442.0752, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 14900 + }, + { + "epoch": 0.05557917981543236, + "eval_train_loss": 2.214109420776367, + "eval_train_loss/all": 2.0471103191375732, + "eval_train_loss/end_span": 1.3395824432373047, + "eval_train_perplexity/batch": 7.745486736297607, + "eval_train_perplexity/end_span": 3.8174490928649902, + "eval_train_perplexity/fim": 2.0754570960998535, + "eval_train_perplexity/first_seq": 15.795435905456543, + "eval_train_perplexity/last_seq": 9.385527610778809, + "eval_train_perplexity/second_seq": 14.145784378051758, + "eval_train_perplexity/seq": 8.915614128112793, + "eval_train_reconstruction/all": 0.27766209840774536, + "eval_train_reconstruction/end_span": 0.6876314878463745, + "eval_train_reconstruction/fim": 0.14269210398197174, + "eval_train_reconstruction/first_seq": 0.14615283906459808, + "eval_train_reconstruction/last_seq": 0.31065329909324646, + "eval_train_reconstruction/second_seq": 0.19066187739372253, + "eval_train_runtime": 437.4248, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 14900 + }, + { + "epoch": 0.05561648127839574, + "grad_norm": 0.26834797859191895, + "learning_rate": 0.0006, + "loss": 2.3664, + "step": 14910 + }, + { + "epoch": 0.05565378274135912, + "grad_norm": 0.3587716221809387, + "learning_rate": 0.0006, + "loss": 2.1314, + "step": 14920 + }, + { + "epoch": 0.05569108420432249, + "grad_norm": 0.32516810297966003, + "learning_rate": 0.0006, + "loss": 2.2483, + "step": 14930 + }, + { + "epoch": 0.05572838566728587, + "grad_norm": 0.4058746099472046, + "learning_rate": 0.0006, + "loss": 2.3012, + "step": 14940 + }, + { + "epoch": 0.05576568713024925, + "grad_norm": 0.27909180521965027, + "learning_rate": 0.0006, + "loss": 2.3329, + "step": 14950 + }, + { + "epoch": 0.05576568713024925, + 
"eval_valid_loss": 2.209442377090454, + "eval_valid_loss/all": 2.0704119205474854, + "eval_valid_loss/end_span": 1.3714542388916016, + "eval_valid_perplexity/batch": 7.928088188171387, + "eval_valid_perplexity/end_span": 3.941077709197998, + "eval_valid_perplexity/fim": 2.2687442302703857, + "eval_valid_perplexity/first_seq": 14.897015571594238, + "eval_valid_perplexity/last_seq": 8.703781127929688, + "eval_valid_perplexity/second_seq": 14.101268768310547, + "eval_valid_perplexity/seq": 8.932235717773438, + "eval_valid_reconstruction/all": 0.28940048813819885, + "eval_valid_reconstruction/end_span": 0.673137903213501, + "eval_valid_reconstruction/fim": 0.16069962084293365, + "eval_valid_reconstruction/first_seq": 0.17022675275802612, + "eval_valid_reconstruction/last_seq": 0.3368738889694214, + "eval_valid_reconstruction/second_seq": 0.1867307871580124, + "eval_valid_runtime": 441.3564, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 14950 + }, + { + "epoch": 0.05576568713024925, + "eval_train_loss": 2.205059289932251, + "eval_train_loss/all": 2.0388805866241455, + "eval_train_loss/end_span": 1.3275141716003418, + "eval_train_perplexity/batch": 7.682004928588867, + "eval_train_perplexity/end_span": 3.771656036376953, + "eval_train_perplexity/fim": 2.3291072845458984, + "eval_train_perplexity/first_seq": 15.454817771911621, + "eval_train_perplexity/last_seq": 9.396080017089844, + "eval_train_perplexity/second_seq": 14.4950590133667, + "eval_train_perplexity/seq": 8.8355131149292, + "eval_train_reconstruction/all": 0.279968798160553, + "eval_train_reconstruction/end_span": 0.6842421293258667, + "eval_train_reconstruction/fim": 0.16687196493148804, + "eval_train_reconstruction/first_seq": 0.15548133850097656, + "eval_train_reconstruction/last_seq": 0.310263991355896, + "eval_train_reconstruction/second_seq": 0.17582063376903534, + "eval_train_runtime": 440.2811, + "eval_train_samples_per_second": 0.436, + 
"eval_train_steps_per_second": 0.436, + "step": 14950 + }, + { + "epoch": 0.05580298859321262, + "grad_norm": 0.3419393301010132, + "learning_rate": 0.0006, + "loss": 2.1971, + "step": 14960 + }, + { + "epoch": 0.055840290056176, + "grad_norm": 0.446483850479126, + "learning_rate": 0.0006, + "loss": 2.3707, + "step": 14970 + }, + { + "epoch": 0.05587759151913938, + "grad_norm": 0.29740676283836365, + "learning_rate": 0.0006, + "loss": 2.391, + "step": 14980 + }, + { + "epoch": 0.05591489298210276, + "grad_norm": 0.4207051396369934, + "learning_rate": 0.0006, + "loss": 2.0825, + "step": 14990 + }, + { + "epoch": 0.055952194445066133, + "grad_norm": 0.2925545871257782, + "learning_rate": 0.0006, + "loss": 2.2901, + "step": 15000 + }, + { + "epoch": 0.055952194445066133, + "eval_valid_loss": 2.2095701694488525, + "eval_valid_loss/all": 2.070547580718994, + "eval_valid_loss/end_span": 1.2578082084655762, + "eval_valid_perplexity/batch": 7.929163932800293, + "eval_valid_perplexity/end_span": 3.517703056335449, + "eval_valid_perplexity/fim": 2.3364152908325195, + "eval_valid_perplexity/first_seq": 15.056597709655762, + "eval_valid_perplexity/last_seq": 9.117953300476074, + "eval_valid_perplexity/second_seq": 13.417799949645996, + "eval_valid_perplexity/seq": 8.936121940612793, + "eval_valid_reconstruction/all": 0.28945857286453247, + "eval_valid_reconstruction/end_span": 0.7072235345840454, + "eval_valid_reconstruction/fim": 0.16558830440044403, + "eval_valid_reconstruction/first_seq": 0.16599427163600922, + "eval_valid_reconstruction/last_seq": 0.324506551027298, + "eval_valid_reconstruction/second_seq": 0.2035544216632843, + "eval_valid_runtime": 438.011, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 15000 + }, + { + "epoch": 0.055952194445066133, + "eval_train_loss": 2.2069995403289795, + "eval_train_loss/all": 2.0408825874328613, + "eval_train_loss/end_span": 1.2307780981063843, + "eval_train_perplexity/batch": 
7.697399616241455, + "eval_train_perplexity/end_span": 3.4238927364349365, + "eval_train_perplexity/fim": 2.284696340560913, + "eval_train_perplexity/first_seq": 15.532886505126953, + "eval_train_perplexity/last_seq": 8.832820892333984, + "eval_train_perplexity/second_seq": 14.232245445251465, + "eval_train_perplexity/seq": 8.857756614685059, + "eval_train_reconstruction/all": 0.279495507478714, + "eval_train_reconstruction/end_span": 0.7154518961906433, + "eval_train_reconstruction/fim": 0.1615787297487259, + "eval_train_reconstruction/first_seq": 0.1508777141571045, + "eval_train_reconstruction/last_seq": 0.3318234086036682, + "eval_train_reconstruction/second_seq": 0.18311794102191925, + "eval_train_runtime": 434.1858, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 15000 + }, + { + "epoch": 0.05598949590802951, + "grad_norm": 0.42752212285995483, + "learning_rate": 0.0006, + "loss": 2.346, + "step": 15010 + }, + { + "epoch": 0.05602679737099289, + "grad_norm": 0.3541978597640991, + "learning_rate": 0.0006, + "loss": 2.2021, + "step": 15020 + }, + { + "epoch": 0.056064098833956265, + "grad_norm": 0.8126165866851807, + "learning_rate": 0.0006, + "loss": 2.1158, + "step": 15030 + }, + { + "epoch": 0.056101400296919644, + "grad_norm": 0.3865251839160919, + "learning_rate": 0.0006, + "loss": 2.2256, + "step": 15040 + }, + { + "epoch": 0.056138701759883024, + "grad_norm": 0.35017111897468567, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 15050 + }, + { + "epoch": 0.056138701759883024, + "eval_valid_loss": 2.215061902999878, + "eval_valid_loss/all": 2.0755629539489746, + "eval_valid_loss/end_span": 1.3607646226882935, + "eval_valid_perplexity/batch": 7.96903133392334, + "eval_valid_perplexity/end_span": 3.8991734981536865, + "eval_valid_perplexity/fim": 2.5764763355255127, + "eval_valid_perplexity/first_seq": 14.7987699508667, + "eval_valid_perplexity/last_seq": 9.446077346801758, + 
"eval_valid_perplexity/second_seq": 14.117914199829102, + "eval_valid_perplexity/seq": 8.978886604309082, + "eval_valid_reconstruction/all": 0.28839391469955444, + "eval_valid_reconstruction/end_span": 0.6757712364196777, + "eval_valid_reconstruction/fim": 0.184268981218338, + "eval_valid_reconstruction/first_seq": 0.17001888155937195, + "eval_valid_reconstruction/last_seq": 0.3120085299015045, + "eval_valid_reconstruction/second_seq": 0.18876324594020844, + "eval_valid_runtime": 433.3364, + "eval_valid_samples_per_second": 0.443, + "eval_valid_steps_per_second": 0.443, + "step": 15050 + }, + { + "epoch": 0.056138701759883024, + "eval_train_loss": 2.2108352184295654, + "eval_train_loss/all": 2.0440683364868164, + "eval_train_loss/end_span": 1.3242279291152954, + "eval_train_perplexity/batch": 7.72196102142334, + "eval_train_perplexity/end_span": 3.759281873703003, + "eval_train_perplexity/fim": 2.177886486053467, + "eval_train_perplexity/first_seq": 15.522165298461914, + "eval_train_perplexity/last_seq": 8.973539352416992, + "eval_train_perplexity/second_seq": 14.566523551940918, + "eval_train_perplexity/seq": 8.883387565612793, + "eval_train_reconstruction/all": 0.27909189462661743, + "eval_train_reconstruction/end_span": 0.687606930732727, + "eval_train_reconstruction/fim": 0.15257945656776428, + "eval_train_reconstruction/first_seq": 0.15171892940998077, + "eval_train_reconstruction/last_seq": 0.32605183124542236, + "eval_train_reconstruction/second_seq": 0.17662276327610016, + "eval_train_runtime": 438.1138, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 15050 + }, + { + "epoch": 0.0561760032228464, + "grad_norm": 0.275616317987442, + "learning_rate": 0.0006, + "loss": 2.3961, + "step": 15060 + }, + { + "epoch": 0.056213304685809776, + "grad_norm": 0.4256030023097992, + "learning_rate": 0.0006, + "loss": 2.1945, + "step": 15070 + }, + { + "epoch": 0.056250606148773155, + "grad_norm": 0.3273322284221649, + 
"learning_rate": 0.0006, + "loss": 2.3691, + "step": 15080 + }, + { + "epoch": 0.056287907611736535, + "grad_norm": 0.49707117676734924, + "learning_rate": 0.0006, + "loss": 2.3357, + "step": 15090 + }, + { + "epoch": 0.05632520907469991, + "grad_norm": 0.334383487701416, + "learning_rate": 0.0006, + "loss": 2.3888, + "step": 15100 + }, + { + "epoch": 0.05632520907469991, + "eval_valid_loss": 2.2084057331085205, + "eval_valid_loss/all": 2.069413661956787, + "eval_valid_loss/end_span": 1.3048745393753052, + "eval_valid_perplexity/batch": 7.920177936553955, + "eval_valid_perplexity/end_span": 3.6872265338897705, + "eval_valid_perplexity/fim": 2.4416520595550537, + "eval_valid_perplexity/first_seq": 15.259031295776367, + "eval_valid_perplexity/last_seq": 9.159934043884277, + "eval_valid_perplexity/second_seq": 13.884218215942383, + "eval_valid_perplexity/seq": 8.922670364379883, + "eval_valid_reconstruction/all": 0.2897886037826538, + "eval_valid_reconstruction/end_span": 0.6989333629608154, + "eval_valid_reconstruction/fim": 0.17588670551776886, + "eval_valid_reconstruction/first_seq": 0.1597823053598404, + "eval_valid_reconstruction/last_seq": 0.32468900084495544, + "eval_valid_reconstruction/second_seq": 0.1915217787027359, + "eval_valid_runtime": 437.2077, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 15100 + }, + { + "epoch": 0.05632520907469991, + "eval_train_loss": 2.2040913105010986, + "eval_train_loss/all": 2.0380144119262695, + "eval_train_loss/end_span": 1.2707217931747437, + "eval_train_perplexity/batch": 7.67535400390625, + "eval_train_perplexity/end_span": 3.5634236335754395, + "eval_train_perplexity/fim": 2.0058412551879883, + "eval_train_perplexity/first_seq": 15.52706241607666, + "eval_train_perplexity/last_seq": 8.554338455200195, + "eval_train_perplexity/second_seq": 14.106006622314453, + "eval_train_perplexity/seq": 8.829280853271484, + "eval_train_reconstruction/all": 0.2803772985935211, + 
"eval_train_reconstruction/end_span": 0.707983136177063, + "eval_train_reconstruction/fim": 0.13724558055400848, + "eval_train_reconstruction/first_seq": 0.15207447111606598, + "eval_train_reconstruction/last_seq": 0.3421391248703003, + "eval_train_reconstruction/second_seq": 0.18730084598064423, + "eval_train_runtime": 437.5695, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 15100 + }, + { + "epoch": 0.056362510537663287, + "grad_norm": 0.3645230531692505, + "learning_rate": 0.0006, + "loss": 2.2013, + "step": 15110 + }, + { + "epoch": 0.056399812000626666, + "grad_norm": 0.2787967324256897, + "learning_rate": 0.0006, + "loss": 2.2742, + "step": 15120 + }, + { + "epoch": 0.056437113463590045, + "grad_norm": 0.36131975054740906, + "learning_rate": 0.0006, + "loss": 2.1889, + "step": 15130 + }, + { + "epoch": 0.05647441492655342, + "grad_norm": 0.3055667579174042, + "learning_rate": 0.0006, + "loss": 2.0542, + "step": 15140 + }, + { + "epoch": 0.0565117163895168, + "grad_norm": 0.2621464133262634, + "learning_rate": 0.0006, + "loss": 2.4054, + "step": 15150 + }, + { + "epoch": 0.0565117163895168, + "eval_valid_loss": 2.2084579467773438, + "eval_valid_loss/all": 2.069218635559082, + "eval_valid_loss/end_span": 1.1734716892242432, + "eval_valid_perplexity/batch": 7.918633460998535, + "eval_valid_perplexity/end_span": 3.2331979274749756, + "eval_valid_perplexity/fim": 2.352687358856201, + "eval_valid_perplexity/first_seq": 15.092391014099121, + "eval_valid_perplexity/last_seq": 8.91752815246582, + "eval_valid_perplexity/second_seq": 13.818041801452637, + "eval_valid_perplexity/seq": 8.917262077331543, + "eval_valid_reconstruction/all": 0.2896179258823395, + "eval_valid_reconstruction/end_span": 0.7226393222808838, + "eval_valid_reconstruction/fim": 0.16809481382369995, + "eval_valid_reconstruction/first_seq": 0.1646193563938141, + "eval_valid_reconstruction/last_seq": 0.32928335666656494, + 
"eval_valid_reconstruction/second_seq": 0.1947813332080841, + "eval_valid_runtime": 439.4307, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 15150 + }, + { + "epoch": 0.0565117163895168, + "eval_train_loss": 2.205725908279419, + "eval_train_loss/all": 2.0395896434783936, + "eval_train_loss/end_span": 1.1517608165740967, + "eval_train_perplexity/batch": 7.687453746795654, + "eval_train_perplexity/end_span": 3.1637587547302246, + "eval_train_perplexity/fim": 2.042656660079956, + "eval_train_perplexity/first_seq": 15.288914680480957, + "eval_train_perplexity/last_seq": 9.490973472595215, + "eval_train_perplexity/second_seq": 14.339042663574219, + "eval_train_perplexity/seq": 8.843685150146484, + "eval_train_reconstruction/all": 0.2798509895801544, + "eval_train_reconstruction/end_span": 0.7293519973754883, + "eval_train_reconstruction/fim": 0.14017024636268616, + "eval_train_reconstruction/first_seq": 0.1562524437904358, + "eval_train_reconstruction/last_seq": 0.30764007568359375, + "eval_train_reconstruction/second_seq": 0.17938737571239471, + "eval_train_runtime": 436.2957, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 15150 + }, + { + "epoch": 0.05654901785248018, + "grad_norm": 0.30381178855895996, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 15160 + }, + { + "epoch": 0.05658631931544355, + "grad_norm": 0.3982648253440857, + "learning_rate": 0.0006, + "loss": 2.3254, + "step": 15170 + }, + { + "epoch": 0.05662362077840693, + "grad_norm": 0.33388248085975647, + "learning_rate": 0.0006, + "loss": 2.1447, + "step": 15180 + }, + { + "epoch": 0.05666092224137031, + "grad_norm": 0.2910587191581726, + "learning_rate": 0.0006, + "loss": 2.0749, + "step": 15190 + }, + { + "epoch": 0.05669822370433368, + "grad_norm": 0.3028600811958313, + "learning_rate": 0.0006, + "loss": 2.2227, + "step": 15200 + }, + { + "epoch": 0.05669822370433368, + "eval_valid_loss": 2.208015203475952, 
+ "eval_valid_loss/all": 2.068950891494751, + "eval_valid_loss/end_span": 1.2982852458953857, + "eval_valid_perplexity/batch": 7.916513442993164, + "eval_valid_perplexity/end_span": 3.6630101203918457, + "eval_valid_perplexity/fim": 2.5176000595092773, + "eval_valid_perplexity/first_seq": 14.848560333251953, + "eval_valid_perplexity/last_seq": 9.046749114990234, + "eval_valid_perplexity/second_seq": 14.016727447509766, + "eval_valid_perplexity/seq": 8.913647651672363, + "eval_valid_reconstruction/all": 0.28976351022720337, + "eval_valid_reconstruction/end_span": 0.6911337971687317, + "eval_valid_reconstruction/fim": 0.18237662315368652, + "eval_valid_reconstruction/first_seq": 0.16833974421024323, + "eval_valid_reconstruction/last_seq": 0.325303852558136, + "eval_valid_reconstruction/second_seq": 0.19012998044490814, + "eval_valid_runtime": 438.4541, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 15200 + }, + { + "epoch": 0.05669822370433368, + "eval_train_loss": 2.20381760597229, + "eval_train_loss/all": 2.03725528717041, + "eval_train_loss/end_span": 1.2748743295669556, + "eval_train_perplexity/batch": 7.669529438018799, + "eval_train_perplexity/end_span": 3.578251600265503, + "eval_train_perplexity/fim": 2.1416547298431396, + "eval_train_perplexity/first_seq": 15.47061824798584, + "eval_train_perplexity/last_seq": 8.516252517700195, + "eval_train_perplexity/second_seq": 14.033482551574707, + "eval_train_perplexity/seq": 8.816516876220703, + "eval_train_reconstruction/all": 0.2806057035923004, + "eval_train_reconstruction/end_span": 0.7031840085983276, + "eval_train_reconstruction/fim": 0.15026244521141052, + "eval_train_reconstruction/first_seq": 0.1514231264591217, + "eval_train_reconstruction/last_seq": 0.3403529226779938, + "eval_train_reconstruction/second_seq": 0.19039389491081238, + "eval_train_runtime": 433.4569, + "eval_train_samples_per_second": 0.443, + "eval_train_steps_per_second": 0.443, + "step": 15200 + 
}, + { + "epoch": 0.05673552516729706, + "grad_norm": 0.29324042797088623, + "learning_rate": 0.0006, + "loss": 2.2856, + "step": 15210 + }, + { + "epoch": 0.05677282663026044, + "grad_norm": 0.301666796207428, + "learning_rate": 0.0006, + "loss": 2.2862, + "step": 15220 + }, + { + "epoch": 0.05681012809322382, + "grad_norm": 0.2774173617362976, + "learning_rate": 0.0006, + "loss": 2.3315, + "step": 15230 + }, + { + "epoch": 0.05684742955618719, + "grad_norm": 0.4275822341442108, + "learning_rate": 0.0006, + "loss": 2.2535, + "step": 15240 + }, + { + "epoch": 0.05688473101915057, + "grad_norm": 0.4238768517971039, + "learning_rate": 0.0006, + "loss": 2.2531, + "step": 15250 + }, + { + "epoch": 0.05688473101915057, + "eval_valid_loss": 2.2098886966705322, + "eval_valid_loss/all": 2.0707290172576904, + "eval_valid_loss/end_span": 1.2833638191223145, + "eval_valid_perplexity/batch": 7.930602550506592, + "eval_valid_perplexity/end_span": 3.6087584495544434, + "eval_valid_perplexity/fim": 2.2477893829345703, + "eval_valid_perplexity/first_seq": 14.482460975646973, + "eval_valid_perplexity/last_seq": 9.111994743347168, + "eval_valid_perplexity/second_seq": 13.492779731750488, + "eval_valid_perplexity/seq": 8.932229042053223, + "eval_valid_reconstruction/all": 0.2894878089427948, + "eval_valid_reconstruction/end_span": 0.7001257538795471, + "eval_valid_reconstruction/fim": 0.15795260667800903, + "eval_valid_reconstruction/first_seq": 0.17507338523864746, + "eval_valid_reconstruction/last_seq": 0.32239529490470886, + "eval_valid_reconstruction/second_seq": 0.20428809523582458, + "eval_valid_runtime": 435.4538, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 15250 + }, + { + "epoch": 0.05688473101915057, + "eval_train_loss": 2.207704544067383, + "eval_train_loss/all": 2.0408270359039307, + "eval_train_loss/end_span": 1.2443207502365112, + "eval_train_perplexity/batch": 7.696972370147705, + "eval_train_perplexity/end_span": 
3.470576524734497, + "eval_train_perplexity/fim": 2.4386651515960693, + "eval_train_perplexity/first_seq": 15.262873649597168, + "eval_train_perplexity/last_seq": 9.324443817138672, + "eval_train_perplexity/second_seq": 14.196444511413574, + "eval_train_perplexity/seq": 8.852603912353516, + "eval_train_reconstruction/all": 0.27951204776763916, + "eval_train_reconstruction/end_span": 0.7125576138496399, + "eval_train_reconstruction/fim": 0.17520642280578613, + "eval_train_reconstruction/first_seq": 0.15716548264026642, + "eval_train_reconstruction/last_seq": 0.31263959407806396, + "eval_train_reconstruction/second_seq": 0.18289804458618164, + "eval_train_runtime": 435.7317, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 15250 + }, + { + "epoch": 0.05692203248211395, + "grad_norm": 0.30153730511665344, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 15260 + }, + { + "epoch": 0.05695933394507732, + "grad_norm": 0.5039985775947571, + "learning_rate": 0.0006, + "loss": 2.017, + "step": 15270 + }, + { + "epoch": 0.0569966354080407, + "grad_norm": 0.38497817516326904, + "learning_rate": 0.0006, + "loss": 2.2126, + "step": 15280 + }, + { + "epoch": 0.05703393687100408, + "grad_norm": 0.3932197093963623, + "learning_rate": 0.0006, + "loss": 2.361, + "step": 15290 + }, + { + "epoch": 0.05707123833396746, + "grad_norm": 0.2598981559276581, + "learning_rate": 0.0006, + "loss": 2.3188, + "step": 15300 + }, + { + "epoch": 0.05707123833396746, + "eval_valid_loss": 2.203249216079712, + "eval_valid_loss/all": 2.0645670890808105, + "eval_valid_loss/end_span": 1.238220453262329, + "eval_valid_perplexity/batch": 7.881885051727295, + "eval_valid_perplexity/end_span": 3.449469566345215, + "eval_valid_perplexity/fim": 2.2416059970855713, + "eval_valid_perplexity/first_seq": 14.947734832763672, + "eval_valid_perplexity/last_seq": 9.107565879821777, + "eval_valid_perplexity/second_seq": 13.85996150970459, + "eval_valid_perplexity/seq": 
8.877276420593262, + "eval_valid_reconstruction/all": 0.29110708832740784, + "eval_valid_reconstruction/end_span": 0.7142094373703003, + "eval_valid_reconstruction/fim": 0.1594255566596985, + "eval_valid_reconstruction/first_seq": 0.1665564924478531, + "eval_valid_reconstruction/last_seq": 0.3226204216480255, + "eval_valid_reconstruction/second_seq": 0.1978253424167633, + "eval_valid_runtime": 435.5545, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 15300 + }, + { + "epoch": 0.05707123833396746, + "eval_train_loss": 2.202925443649292, + "eval_train_loss/all": 2.0370004177093506, + "eval_train_loss/end_span": 1.206291913986206, + "eval_train_perplexity/batch": 7.667575359344482, + "eval_train_perplexity/end_span": 3.3410725593566895, + "eval_train_perplexity/fim": 2.2660186290740967, + "eval_train_perplexity/first_seq": 15.506470680236816, + "eval_train_perplexity/last_seq": 9.248353958129883, + "eval_train_perplexity/second_seq": 14.478891372680664, + "eval_train_perplexity/seq": 8.8192777633667, + "eval_train_reconstruction/all": 0.2805224657058716, + "eval_train_reconstruction/end_span": 0.7240597009658813, + "eval_train_reconstruction/fim": 0.16179674863815308, + "eval_train_reconstruction/first_seq": 0.153407484292984, + "eval_train_reconstruction/last_seq": 0.3156402111053467, + "eval_train_reconstruction/second_seq": 0.17692308127880096, + "eval_train_runtime": 434.8873, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 15300 + }, + { + "epoch": 0.057108539796930834, + "grad_norm": 0.4237917959690094, + "learning_rate": 0.0006, + "loss": 2.2923, + "step": 15310 + }, + { + "epoch": 0.05714584125989421, + "grad_norm": 0.34609025716781616, + "learning_rate": 0.0006, + "loss": 2.172, + "step": 15320 + }, + { + "epoch": 0.05718314272285759, + "grad_norm": 0.46841198205947876, + "learning_rate": 0.0006, + "loss": 2.3776, + "step": 15330 + }, + { + "epoch": 0.057220444185820965, + 
"grad_norm": 0.446713924407959, + "learning_rate": 0.0006, + "loss": 2.3898, + "step": 15340 + }, + { + "epoch": 0.057257745648784344, + "grad_norm": 0.2850036323070526, + "learning_rate": 0.0006, + "loss": 2.3255, + "step": 15350 + }, + { + "epoch": 0.057257745648784344, + "eval_valid_loss": 2.2049310207366943, + "eval_valid_loss/all": 2.065884828567505, + "eval_valid_loss/end_span": 1.2887332439422607, + "eval_valid_perplexity/batch": 7.89227819442749, + "eval_valid_perplexity/end_span": 3.628187656402588, + "eval_valid_perplexity/fim": 2.242690324783325, + "eval_valid_perplexity/first_seq": 15.098930358886719, + "eval_valid_perplexity/last_seq": 8.971652030944824, + "eval_valid_perplexity/second_seq": 13.741787910461426, + "eval_valid_perplexity/seq": 8.888869285583496, + "eval_valid_reconstruction/all": 0.29063183069229126, + "eval_valid_reconstruction/end_span": 0.6963390111923218, + "eval_valid_reconstruction/fim": 0.1589116007089615, + "eval_valid_reconstruction/first_seq": 0.16101381182670593, + "eval_valid_reconstruction/last_seq": 0.33115634322166443, + "eval_valid_reconstruction/second_seq": 0.19750326871871948, + "eval_valid_runtime": 435.768, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 15350 + }, + { + "epoch": 0.057257745648784344, + "eval_train_loss": 2.2033603191375732, + "eval_train_loss/all": 2.0371153354644775, + "eval_train_loss/end_span": 1.249991774559021, + "eval_train_perplexity/batch": 7.668456554412842, + "eval_train_perplexity/end_span": 3.490314245223999, + "eval_train_perplexity/fim": 2.105848550796509, + "eval_train_perplexity/first_seq": 15.396678924560547, + "eval_train_perplexity/last_seq": 9.042908668518066, + "eval_train_perplexity/second_seq": 14.411035537719727, + "eval_train_perplexity/seq": 8.817492485046387, + "eval_train_reconstruction/all": 0.2803962230682373, + "eval_train_reconstruction/end_span": 0.710037112236023, + "eval_train_reconstruction/fim": 0.14630520343780518, + 
"eval_train_reconstruction/first_seq": 0.15449288487434387, + "eval_train_reconstruction/last_seq": 0.3238840103149414, + "eval_train_reconstruction/second_seq": 0.178530752658844, + "eval_train_runtime": 436.3798, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 15350 + }, + { + "epoch": 0.057295047111747724, + "grad_norm": 0.3716357946395874, + "learning_rate": 0.0006, + "loss": 2.3415, + "step": 15360 + }, + { + "epoch": 0.0573323485747111, + "grad_norm": 0.227377787232399, + "learning_rate": 0.0006, + "loss": 2.1739, + "step": 15370 + }, + { + "epoch": 0.057369650037674476, + "grad_norm": 0.31249988079071045, + "learning_rate": 0.0006, + "loss": 2.1704, + "step": 15380 + }, + { + "epoch": 0.057406951500637855, + "grad_norm": 0.3068172335624695, + "learning_rate": 0.0006, + "loss": 2.1098, + "step": 15390 + }, + { + "epoch": 0.057444252963601235, + "grad_norm": 0.3323993980884552, + "learning_rate": 0.0006, + "loss": 2.1059, + "step": 15400 + }, + { + "epoch": 0.057444252963601235, + "eval_valid_loss": 2.2111623287200928, + "eval_valid_loss/all": 2.0718634128570557, + "eval_valid_loss/end_span": 1.2967718839645386, + "eval_valid_perplexity/batch": 7.93960428237915, + "eval_valid_perplexity/end_span": 3.657470941543579, + "eval_valid_perplexity/fim": 2.58504056930542, + "eval_valid_perplexity/first_seq": 15.085020065307617, + "eval_valid_perplexity/last_seq": 8.587733268737793, + "eval_valid_perplexity/second_seq": 13.707571029663086, + "eval_valid_perplexity/seq": 8.9461030960083, + "eval_valid_reconstruction/all": 0.28898951411247253, + "eval_valid_reconstruction/end_span": 0.6949018836021423, + "eval_valid_reconstruction/fim": 0.18563957512378693, + "eval_valid_reconstruction/first_seq": 0.16335177421569824, + "eval_valid_reconstruction/last_seq": 0.34120315313339233, + "eval_valid_reconstruction/second_seq": 0.197401225566864, + "eval_valid_runtime": 434.5148, + "eval_valid_samples_per_second": 0.442, + 
"eval_valid_steps_per_second": 0.442, + "step": 15400 + }, + { + "epoch": 0.057444252963601235, + "eval_train_loss": 2.209859848022461, + "eval_train_loss/all": 2.0434200763702393, + "eval_train_loss/end_span": 1.2589881420135498, + "eval_train_perplexity/batch": 7.716956615447998, + "eval_train_perplexity/end_span": 3.5218560695648193, + "eval_train_perplexity/fim": 2.3573427200317383, + "eval_train_perplexity/first_seq": 15.516401290893555, + "eval_train_perplexity/last_seq": 8.987634658813477, + "eval_train_perplexity/second_seq": 14.330142974853516, + "eval_train_perplexity/seq": 8.878632545471191, + "eval_train_reconstruction/all": 0.2787145674228668, + "eval_train_reconstruction/end_span": 0.7057673335075378, + "eval_train_reconstruction/fim": 0.16654819250106812, + "eval_train_reconstruction/first_seq": 0.152101069688797, + "eval_train_reconstruction/last_seq": 0.32169491052627563, + "eval_train_reconstruction/second_seq": 0.18500706553459167, + "eval_train_runtime": 435.5665, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 15400 + }, + { + "epoch": 0.05748155442656461, + "grad_norm": 0.27412065863609314, + "learning_rate": 0.0006, + "loss": 2.0222, + "step": 15410 + }, + { + "epoch": 0.05751885588952799, + "grad_norm": 0.494976282119751, + "learning_rate": 0.0006, + "loss": 2.1667, + "step": 15420 + }, + { + "epoch": 0.057556157352491366, + "grad_norm": 0.3348826766014099, + "learning_rate": 0.0006, + "loss": 2.3002, + "step": 15430 + }, + { + "epoch": 0.057593458815454746, + "grad_norm": 0.34057220816612244, + "learning_rate": 0.0006, + "loss": 2.4481, + "step": 15440 + }, + { + "epoch": 0.05763076027841812, + "grad_norm": 0.25129687786102295, + "learning_rate": 0.0006, + "loss": 2.264, + "step": 15450 + }, + { + "epoch": 0.05763076027841812, + "eval_valid_loss": 2.208996534347534, + "eval_valid_loss/all": 2.069962501525879, + "eval_valid_loss/end_span": 1.2243062257766724, + "eval_valid_perplexity/batch": 
7.924525737762451, + "eval_valid_perplexity/end_span": 3.4018051624298096, + "eval_valid_perplexity/fim": 2.4797239303588867, + "eval_valid_perplexity/first_seq": 15.041102409362793, + "eval_valid_perplexity/last_seq": 8.927085876464844, + "eval_valid_perplexity/second_seq": 13.632232666015625, + "eval_valid_perplexity/seq": 8.933798789978027, + "eval_valid_reconstruction/all": 0.28964075446128845, + "eval_valid_reconstruction/end_span": 0.7119657397270203, + "eval_valid_reconstruction/fim": 0.17740994691848755, + "eval_valid_reconstruction/first_seq": 0.16096197068691254, + "eval_valid_reconstruction/last_seq": 0.33177244663238525, + "eval_valid_reconstruction/second_seq": 0.19846971333026886, + "eval_valid_runtime": 436.4247, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 15450 + }, + { + "epoch": 0.05763076027841812, + "eval_train_loss": 2.2060022354125977, + "eval_train_loss/all": 2.0397417545318604, + "eval_train_loss/end_span": 1.1907788515090942, + "eval_train_perplexity/batch": 7.688623428344727, + "eval_train_perplexity/end_span": 3.289642333984375, + "eval_train_perplexity/fim": 2.1572377681732178, + "eval_train_perplexity/first_seq": 15.7332181930542, + "eval_train_perplexity/last_seq": 9.22176456451416, + "eval_train_perplexity/second_seq": 13.94601821899414, + "eval_train_perplexity/seq": 8.845362663269043, + "eval_train_reconstruction/all": 0.2797752022743225, + "eval_train_reconstruction/end_span": 0.7223257422447205, + "eval_train_reconstruction/fim": 0.15177281200885773, + "eval_train_reconstruction/first_seq": 0.14481203258037567, + "eval_train_reconstruction/last_seq": 0.31729331612586975, + "eval_train_reconstruction/second_seq": 0.1934831589460373, + "eval_train_runtime": 439.9113, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 15450 + }, + { + "epoch": 0.0576680617413815, + "grad_norm": 0.3115614354610443, + "learning_rate": 0.0006, + "loss": 2.2461, + 
"step": 15460 + }, + { + "epoch": 0.05770536320434488, + "grad_norm": 0.34772467613220215, + "learning_rate": 0.0006, + "loss": 2.127, + "step": 15470 + }, + { + "epoch": 0.05774266466730825, + "grad_norm": 0.5180483460426331, + "learning_rate": 0.0006, + "loss": 2.1393, + "step": 15480 + }, + { + "epoch": 0.05777996613027163, + "grad_norm": 0.3321819603443146, + "learning_rate": 0.0006, + "loss": 2.4079, + "step": 15490 + }, + { + "epoch": 0.05781726759323501, + "grad_norm": 0.36722126603126526, + "learning_rate": 0.0006, + "loss": 2.4199, + "step": 15500 + }, + { + "epoch": 0.05781726759323501, + "eval_valid_loss": 2.2099344730377197, + "eval_valid_loss/all": 2.0709614753723145, + "eval_valid_loss/end_span": 1.2311733961105347, + "eval_valid_perplexity/batch": 7.932446479797363, + "eval_valid_perplexity/end_span": 3.425246238708496, + "eval_valid_perplexity/fim": 2.3205626010894775, + "eval_valid_perplexity/first_seq": 14.767193794250488, + "eval_valid_perplexity/last_seq": 9.12717056274414, + "eval_valid_perplexity/second_seq": 13.3098783493042, + "eval_valid_perplexity/seq": 8.93567943572998, + "eval_valid_reconstruction/all": 0.2896878123283386, + "eval_valid_reconstruction/end_span": 0.7150929570198059, + "eval_valid_reconstruction/fim": 0.16436010599136353, + "eval_valid_reconstruction/first_seq": 0.17191942036151886, + "eval_valid_reconstruction/last_seq": 0.3228168189525604, + "eval_valid_reconstruction/second_seq": 0.20808564126491547, + "eval_valid_runtime": 438.4634, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 15500 + }, + { + "epoch": 0.05781726759323501, + "eval_train_loss": 2.2064783573150635, + "eval_train_loss/all": 2.039937734603882, + "eval_train_loss/end_span": 1.2011533975601196, + "eval_train_perplexity/batch": 7.690130233764648, + "eval_train_perplexity/end_span": 3.323948621749878, + "eval_train_perplexity/fim": 2.046243667602539, + "eval_train_perplexity/first_seq": 15.31204605102539, + 
"eval_train_perplexity/last_seq": 9.098377227783203, + "eval_train_perplexity/second_seq": 14.276578903198242, + "eval_train_perplexity/seq": 8.843338012695312, + "eval_train_reconstruction/all": 0.2799583077430725, + "eval_train_reconstruction/end_span": 0.7245171666145325, + "eval_train_reconstruction/fim": 0.14123882353305817, + "eval_train_reconstruction/first_seq": 0.15844444930553436, + "eval_train_reconstruction/last_seq": 0.32258227467536926, + "eval_train_reconstruction/second_seq": 0.18074393272399902, + "eval_train_runtime": 436.6528, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 15500 + }, + { + "epoch": 0.05785456905619838, + "grad_norm": 0.3464924097061157, + "learning_rate": 0.0006, + "loss": 2.2323, + "step": 15510 + }, + { + "epoch": 0.05789187051916176, + "grad_norm": 0.25450849533081055, + "learning_rate": 0.0006, + "loss": 2.1168, + "step": 15520 + }, + { + "epoch": 0.05792917198212514, + "grad_norm": 0.3434736728668213, + "learning_rate": 0.0006, + "loss": 2.2592, + "step": 15530 + }, + { + "epoch": 0.05796647344508852, + "grad_norm": 0.3432539403438568, + "learning_rate": 0.0006, + "loss": 2.0944, + "step": 15540 + }, + { + "epoch": 0.05800377490805189, + "grad_norm": 0.25916922092437744, + "learning_rate": 0.0006, + "loss": 2.2364, + "step": 15550 + }, + { + "epoch": 0.05800377490805189, + "eval_valid_loss": 2.2125661373138428, + "eval_valid_loss/all": 2.0731170177459717, + "eval_valid_loss/end_span": 1.299618124961853, + "eval_valid_perplexity/batch": 7.949563503265381, + "eval_valid_perplexity/end_span": 3.667895793914795, + "eval_valid_perplexity/fim": 2.3443973064422607, + "eval_valid_perplexity/first_seq": 15.145819664001465, + "eval_valid_perplexity/last_seq": 8.646425247192383, + "eval_valid_perplexity/second_seq": 13.500036239624023, + "eval_valid_perplexity/seq": 8.948477745056152, + "eval_valid_reconstruction/all": 0.28860586881637573, + "eval_valid_reconstruction/end_span": 
0.7036203742027283, + "eval_valid_reconstruction/fim": 0.16676779091358185, + "eval_valid_reconstruction/first_seq": 0.16420100629329681, + "eval_valid_reconstruction/last_seq": 0.3409932851791382, + "eval_valid_reconstruction/second_seq": 0.20128071308135986, + "eval_valid_runtime": 438.1941, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 15550 + }, + { + "epoch": 0.05800377490805189, + "eval_train_loss": 2.2127740383148193, + "eval_train_loss/all": 2.0453431606292725, + "eval_train_loss/end_span": 1.2637096643447876, + "eval_train_perplexity/batch": 7.7318115234375, + "eval_train_perplexity/end_span": 3.5385239124298096, + "eval_train_perplexity/fim": 1.9487804174423218, + "eval_train_perplexity/first_seq": 15.43691635131836, + "eval_train_perplexity/last_seq": 9.179537773132324, + "eval_train_perplexity/second_seq": 14.831340789794922, + "eval_train_perplexity/seq": 8.889184951782227, + "eval_train_reconstruction/all": 0.27825015783309937, + "eval_train_reconstruction/end_span": 0.7158316969871521, + "eval_train_reconstruction/fim": 0.1302383542060852, + "eval_train_reconstruction/first_seq": 0.15249183773994446, + "eval_train_reconstruction/last_seq": 0.3186715245246887, + "eval_train_reconstruction/second_seq": 0.16997027397155762, + "eval_train_runtime": 438.3897, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 15550 + }, + { + "epoch": 0.05804107637101527, + "grad_norm": 0.5200812220573425, + "learning_rate": 0.0006, + "loss": 1.9549, + "step": 15560 + }, + { + "epoch": 0.05807837783397865, + "grad_norm": 0.35883402824401855, + "learning_rate": 0.0006, + "loss": 2.3507, + "step": 15570 + }, + { + "epoch": 0.05811567929694202, + "grad_norm": 0.2198290377855301, + "learning_rate": 0.0006, + "loss": 2.0746, + "step": 15580 + }, + { + "epoch": 0.0581529807599054, + "grad_norm": 0.42738229036331177, + "learning_rate": 0.0006, + "loss": 2.2449, + "step": 15590 + }, + { + 
"epoch": 0.05819028222286878, + "grad_norm": 0.47211354970932007, + "learning_rate": 0.0006, + "loss": 2.2811, + "step": 15600 + }, + { + "epoch": 0.05819028222286878, + "eval_valid_loss": 2.2075893878936768, + "eval_valid_loss/all": 2.068902015686035, + "eval_valid_loss/end_span": 1.2868726253509521, + "eval_valid_perplexity/batch": 7.916126728057861, + "eval_valid_perplexity/end_span": 3.621443271636963, + "eval_valid_perplexity/fim": 2.321438789367676, + "eval_valid_perplexity/first_seq": 14.678169250488281, + "eval_valid_perplexity/last_seq": 9.32141399383545, + "eval_valid_perplexity/second_seq": 14.054617881774902, + "eval_valid_perplexity/seq": 8.91508960723877, + "eval_valid_reconstruction/all": 0.2897503674030304, + "eval_valid_reconstruction/end_span": 0.7039545774459839, + "eval_valid_reconstruction/fim": 0.16640456020832062, + "eval_valid_reconstruction/first_seq": 0.17253407835960388, + "eval_valid_reconstruction/last_seq": 0.31623464822769165, + "eval_valid_reconstruction/second_seq": 0.18717831373214722, + "eval_valid_runtime": 434.9745, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 15600 + }, + { + "epoch": 0.05819028222286878, + "eval_train_loss": 2.2036640644073486, + "eval_train_loss/all": 2.0376060009002686, + "eval_train_loss/end_span": 1.2522987127304077, + "eval_train_perplexity/batch": 7.672219753265381, + "eval_train_perplexity/end_span": 3.498375415802002, + "eval_train_perplexity/fim": 2.1533238887786865, + "eval_train_perplexity/first_seq": 15.524323463439941, + "eval_train_perplexity/last_seq": 9.076937675476074, + "eval_train_perplexity/second_seq": 14.022439002990723, + "eval_train_perplexity/seq": 8.82264232635498, + "eval_train_reconstruction/all": 0.280381441116333, + "eval_train_reconstruction/end_span": 0.7140973806381226, + "eval_train_reconstruction/fim": 0.15120215713977814, + "eval_train_reconstruction/first_seq": 0.15224511921405792, + "eval_train_reconstruction/last_seq": 
0.3236485421657562, + "eval_train_reconstruction/second_seq": 0.18828800320625305, + "eval_train_runtime": 434.8427, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 15600 + }, + { + "epoch": 0.05822758368583216, + "grad_norm": 0.31297874450683594, + "learning_rate": 0.0006, + "loss": 2.27, + "step": 15610 + }, + { + "epoch": 0.058264885148795534, + "grad_norm": 0.2777692377567291, + "learning_rate": 0.0006, + "loss": 2.4136, + "step": 15620 + }, + { + "epoch": 0.05830218661175891, + "grad_norm": 0.2792506814002991, + "learning_rate": 0.0006, + "loss": 2.3777, + "step": 15630 + }, + { + "epoch": 0.05833948807472229, + "grad_norm": 0.3431245684623718, + "learning_rate": 0.0006, + "loss": 2.272, + "step": 15640 + }, + { + "epoch": 0.058376789537685665, + "grad_norm": 0.2969723641872406, + "learning_rate": 0.0006, + "loss": 2.304, + "step": 15650 + }, + { + "epoch": 0.058376789537685665, + "eval_valid_loss": 2.2119743824005127, + "eval_valid_loss/all": 2.0728323459625244, + "eval_valid_loss/end_span": 1.3237608671188354, + "eval_valid_perplexity/batch": 7.947300910949707, + "eval_valid_perplexity/end_span": 3.757526397705078, + "eval_valid_perplexity/fim": 2.5917739868164062, + "eval_valid_perplexity/first_seq": 15.178162574768066, + "eval_valid_perplexity/last_seq": 8.728914260864258, + "eval_valid_perplexity/second_seq": 14.199636459350586, + "eval_valid_perplexity/seq": 8.95117473602295, + "eval_valid_reconstruction/all": 0.2887938916683197, + "eval_valid_reconstruction/end_span": 0.6806427836418152, + "eval_valid_reconstruction/fim": 0.1864612102508545, + "eval_valid_reconstruction/first_seq": 0.16360780596733093, + "eval_valid_reconstruction/last_seq": 0.3381120264530182, + "eval_valid_reconstruction/second_seq": 0.19051256775856018, + "eval_valid_runtime": 437.1944, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 15650 + }, + { + "epoch": 0.058376789537685665, + 
"eval_train_loss": 2.208164691925049, + "eval_train_loss/all": 2.0415754318237305, + "eval_train_loss/end_span": 1.2954189777374268, + "eval_train_perplexity/batch": 7.70273494720459, + "eval_train_perplexity/end_span": 3.6525259017944336, + "eval_train_perplexity/fim": 2.1184146404266357, + "eval_train_perplexity/first_seq": 15.626465797424316, + "eval_train_perplexity/last_seq": 9.302962303161621, + "eval_train_perplexity/second_seq": 13.967314720153809, + "eval_train_perplexity/seq": 8.85845947265625, + "eval_train_reconstruction/all": 0.27918919920921326, + "eval_train_reconstruction/end_span": 0.6899374127388, + "eval_train_reconstruction/fim": 0.1482449173927307, + "eval_train_reconstruction/first_seq": 0.14979828894138336, + "eval_train_reconstruction/last_seq": 0.3136025369167328, + "eval_train_reconstruction/second_seq": 0.19089803099632263, + "eval_train_runtime": 436.3214, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 15650 + }, + { + "epoch": 0.058414091000649045, + "grad_norm": 0.38503047823905945, + "learning_rate": 0.0006, + "loss": 2.3589, + "step": 15660 + }, + { + "epoch": 0.058451392463612424, + "grad_norm": 0.3192938566207886, + "learning_rate": 0.0006, + "loss": 2.1956, + "step": 15670 + }, + { + "epoch": 0.058488693926575804, + "grad_norm": 0.35297203063964844, + "learning_rate": 0.0006, + "loss": 2.0912, + "step": 15680 + }, + { + "epoch": 0.058525995389539176, + "grad_norm": 0.31012117862701416, + "learning_rate": 0.0006, + "loss": 2.3237, + "step": 15690 + }, + { + "epoch": 0.058563296852502555, + "grad_norm": 0.3994489312171936, + "learning_rate": 0.0006, + "loss": 2.2758, + "step": 15700 + }, + { + "epoch": 0.058563296852502555, + "eval_valid_loss": 2.2144529819488525, + "eval_valid_loss/all": 2.0747036933898926, + "eval_valid_loss/end_span": 1.312268614768982, + "eval_valid_perplexity/batch": 7.962186813354492, + "eval_valid_perplexity/end_span": 3.7145910263061523, + 
"eval_valid_perplexity/fim": 2.2586143016815186, + "eval_valid_perplexity/first_seq": 14.714048385620117, + "eval_valid_perplexity/last_seq": 9.207947731018066, + "eval_valid_perplexity/second_seq": 13.794739723205566, + "eval_valid_perplexity/seq": 8.970234870910645, + "eval_valid_reconstruction/all": 0.28819867968559265, + "eval_valid_reconstruction/end_span": 0.6895773410797119, + "eval_valid_reconstruction/fim": 0.15878574550151825, + "eval_valid_reconstruction/first_seq": 0.16788284480571747, + "eval_valid_reconstruction/last_seq": 0.3207498788833618, + "eval_valid_reconstruction/second_seq": 0.19718986749649048, + "eval_valid_runtime": 436.3474, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 15700 + }, + { + "epoch": 0.058563296852502555, + "eval_train_loss": 2.2104079723358154, + "eval_train_loss/all": 2.043473958969116, + "eval_train_loss/end_span": 1.278417944908142, + "eval_train_perplexity/batch": 7.717372417449951, + "eval_train_perplexity/end_span": 3.590954065322876, + "eval_train_perplexity/fim": 2.384901285171509, + "eval_train_perplexity/first_seq": 15.258471488952637, + "eval_train_perplexity/last_seq": 9.562454223632812, + "eval_train_perplexity/second_seq": 14.250835418701172, + "eval_train_perplexity/seq": 8.875109672546387, + "eval_train_reconstruction/all": 0.2786162793636322, + "eval_train_reconstruction/end_span": 0.7015708684921265, + "eval_train_reconstruction/fim": 0.1695461869239807, + "eval_train_reconstruction/first_seq": 0.15661604702472687, + "eval_train_reconstruction/last_seq": 0.30553990602493286, + "eval_train_reconstruction/second_seq": 0.18415221571922302, + "eval_train_runtime": 435.5764, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 15700 + }, + { + "epoch": 0.058600598315465935, + "grad_norm": 0.4026651382446289, + "learning_rate": 0.0006, + "loss": 2.1257, + "step": 15710 + }, + { + "epoch": 0.05863789977842931, + "grad_norm": 
0.3088865578174591, + "learning_rate": 0.0006, + "loss": 2.2085, + "step": 15720 + }, + { + "epoch": 0.05867520124139269, + "grad_norm": 0.3838180899620056, + "learning_rate": 0.0006, + "loss": 2.2002, + "step": 15730 + }, + { + "epoch": 0.058712502704356066, + "grad_norm": 0.2739745080471039, + "learning_rate": 0.0006, + "loss": 2.1695, + "step": 15740 + }, + { + "epoch": 0.058749804167319446, + "grad_norm": 0.8015409111976624, + "learning_rate": 0.0006, + "loss": 2.177, + "step": 15750 + }, + { + "epoch": 0.058749804167319446, + "eval_valid_loss": 2.208021402359009, + "eval_valid_loss/all": 2.069010019302368, + "eval_valid_loss/end_span": 1.2199311256408691, + "eval_valid_perplexity/batch": 7.9169816970825195, + "eval_valid_perplexity/end_span": 3.3869545459747314, + "eval_valid_perplexity/fim": 2.1829545497894287, + "eval_valid_perplexity/first_seq": 14.886669158935547, + "eval_valid_perplexity/last_seq": 9.503213882446289, + "eval_valid_perplexity/second_seq": 13.864017486572266, + "eval_valid_perplexity/seq": 8.919093132019043, + "eval_valid_reconstruction/all": 0.29035452008247375, + "eval_valid_reconstruction/end_span": 0.7126247882843018, + "eval_valid_reconstruction/fim": 0.15381135046482086, + "eval_valid_reconstruction/first_seq": 0.16821369528770447, + "eval_valid_reconstruction/last_seq": 0.309526264667511, + "eval_valid_reconstruction/second_seq": 0.19289496541023254, + "eval_valid_runtime": 437.4975, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 15750 + }, + { + "epoch": 0.058749804167319446, + "eval_train_loss": 2.205836057662964, + "eval_train_loss/all": 2.0397748947143555, + "eval_train_loss/end_span": 1.1738370656967163, + "eval_train_perplexity/batch": 7.688878059387207, + "eval_train_perplexity/end_span": 3.234379291534424, + "eval_train_perplexity/fim": 2.1677169799804688, + "eval_train_perplexity/first_seq": 15.476125717163086, + "eval_train_perplexity/last_seq": 9.051106452941895, + 
"eval_train_perplexity/second_seq": 14.344708442687988, + "eval_train_perplexity/seq": 8.85048770904541, + "eval_train_reconstruction/all": 0.28008511662483215, + "eval_train_reconstruction/end_span": 0.726636528968811, + "eval_train_reconstruction/fim": 0.15180537104606628, + "eval_train_reconstruction/first_seq": 0.15440987050533295, + "eval_train_reconstruction/last_seq": 0.3213885426521301, + "eval_train_reconstruction/second_seq": 0.1802038550376892, + "eval_train_runtime": 434.9756, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 15750 + }, + { + "epoch": 0.05878710563028282, + "grad_norm": 0.6998051404953003, + "learning_rate": 0.0006, + "loss": 2.3318, + "step": 15760 + }, + { + "epoch": 0.0588244070932462, + "grad_norm": 0.5018815994262695, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 15770 + }, + { + "epoch": 0.05886170855620958, + "grad_norm": 0.347044974565506, + "learning_rate": 0.0006, + "loss": 2.3186, + "step": 15780 + }, + { + "epoch": 0.05889901001917295, + "grad_norm": 0.46822187304496765, + "learning_rate": 0.0006, + "loss": 2.1962, + "step": 15790 + }, + { + "epoch": 0.05893631148213633, + "grad_norm": 0.1939675509929657, + "learning_rate": 0.0006, + "loss": 2.2336, + "step": 15800 + }, + { + "epoch": 0.05893631148213633, + "eval_valid_loss": 2.2161667346954346, + "eval_valid_loss/all": 2.076704502105713, + "eval_valid_loss/end_span": 1.2883667945861816, + "eval_valid_perplexity/batch": 7.978133678436279, + "eval_valid_perplexity/end_span": 3.6268582344055176, + "eval_valid_perplexity/fim": 2.2868845462799072, + "eval_valid_perplexity/first_seq": 14.634737968444824, + "eval_valid_perplexity/last_seq": 9.131254196166992, + "eval_valid_perplexity/second_seq": 13.776761054992676, + "eval_valid_perplexity/seq": 8.982954978942871, + "eval_valid_reconstruction/all": 0.28656959533691406, + "eval_valid_reconstruction/end_span": 0.7014126777648926, + "eval_valid_reconstruction/fim": 
0.16068679094314575, + "eval_valid_reconstruction/first_seq": 0.16821666061878204, + "eval_valid_reconstruction/last_seq": 0.32310202717781067, + "eval_valid_reconstruction/second_seq": 0.19350111484527588, + "eval_valid_runtime": 439.1806, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 15800 + }, + { + "epoch": 0.05893631148213633, + "eval_train_loss": 2.2078797817230225, + "eval_train_loss/all": 2.040318727493286, + "eval_train_loss/end_span": 1.2510793209075928, + "eval_train_perplexity/batch": 7.693060874938965, + "eval_train_perplexity/end_span": 3.494112253189087, + "eval_train_perplexity/fim": 1.919388771057129, + "eval_train_perplexity/first_seq": 15.403897285461426, + "eval_train_perplexity/last_seq": 9.099385261535645, + "eval_train_perplexity/second_seq": 14.356330871582031, + "eval_train_perplexity/seq": 8.837705612182617, + "eval_train_reconstruction/all": 0.27893510460853577, + "eval_train_reconstruction/end_span": 0.7141375541687012, + "eval_train_reconstruction/fim": 0.12696115672588348, + "eval_train_reconstruction/first_seq": 0.15355180203914642, + "eval_train_reconstruction/last_seq": 0.32239872217178345, + "eval_train_reconstruction/second_seq": 0.1805019974708557, + "eval_train_runtime": 439.2901, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 15800 + }, + { + "epoch": 0.05897361294509971, + "grad_norm": 0.30693691968917847, + "learning_rate": 0.0006, + "loss": 2.216, + "step": 15810 + }, + { + "epoch": 0.05901091440806308, + "grad_norm": 0.3499642312526703, + "learning_rate": 0.0006, + "loss": 2.1962, + "step": 15820 + }, + { + "epoch": 0.05904821587102646, + "grad_norm": 0.25594717264175415, + "learning_rate": 0.0006, + "loss": 2.2743, + "step": 15830 + }, + { + "epoch": 0.05908551733398984, + "grad_norm": 0.4443255662918091, + "learning_rate": 0.0006, + "loss": 2.2454, + "step": 15840 + }, + { + "epoch": 0.05912281879695322, + "grad_norm": 
0.4606648087501526, + "learning_rate": 0.0006, + "loss": 2.3682, + "step": 15850 + }, + { + "epoch": 0.05912281879695322, + "eval_valid_loss": 2.2090306282043457, + "eval_valid_loss/all": 2.070251941680908, + "eval_valid_loss/end_span": 1.210762619972229, + "eval_valid_perplexity/batch": 7.926819801330566, + "eval_valid_perplexity/end_span": 3.3560431003570557, + "eval_valid_perplexity/fim": 2.312634229660034, + "eval_valid_perplexity/first_seq": 14.303842544555664, + "eval_valid_perplexity/last_seq": 9.114808082580566, + "eval_valid_perplexity/second_seq": 13.60760498046875, + "eval_valid_perplexity/seq": 8.924084663391113, + "eval_valid_reconstruction/all": 0.28918567299842834, + "eval_valid_reconstruction/end_span": 0.7136244177818298, + "eval_valid_reconstruction/fim": 0.16474924981594086, + "eval_valid_reconstruction/first_seq": 0.17870576679706573, + "eval_valid_reconstruction/last_seq": 0.3196561336517334, + "eval_valid_reconstruction/second_seq": 0.2000386118888855, + "eval_valid_runtime": 435.1669, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 0.441, + "step": 15850 + }, + { + "epoch": 0.05912281879695322, + "eval_train_loss": 2.2040975093841553, + "eval_train_loss/all": 2.0376341342926025, + "eval_train_loss/end_span": 1.173200011253357, + "eval_train_perplexity/batch": 7.672435760498047, + "eval_train_perplexity/end_span": 3.2323195934295654, + "eval_train_perplexity/fim": 2.3005597591400146, + "eval_train_perplexity/first_seq": 15.30875301361084, + "eval_train_perplexity/last_seq": 9.278200149536133, + "eval_train_perplexity/second_seq": 13.909253120422363, + "eval_train_perplexity/seq": 8.82132625579834, + "eval_train_reconstruction/all": 0.2803526520729065, + "eval_train_reconstruction/end_span": 0.7267974615097046, + "eval_train_reconstruction/fim": 0.16292795538902283, + "eval_train_reconstruction/first_seq": 0.1577852964401245, + "eval_train_reconstruction/last_seq": 0.3176397383213043, + 
"eval_train_reconstruction/second_seq": 0.18933777511119843, + "eval_train_runtime": 435.7193, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 15850 + }, + { + "epoch": 0.05916012025991659, + "grad_norm": 0.35947176814079285, + "learning_rate": 0.0006, + "loss": 2.1113, + "step": 15860 + }, + { + "epoch": 0.05919742172287997, + "grad_norm": 0.29065603017807007, + "learning_rate": 0.0006, + "loss": 2.3012, + "step": 15870 + }, + { + "epoch": 0.05923472318584335, + "grad_norm": 0.3237083852291107, + "learning_rate": 0.0006, + "loss": 2.3707, + "step": 15880 + }, + { + "epoch": 0.05927202464880672, + "grad_norm": 0.4132396876811981, + "learning_rate": 0.0006, + "loss": 2.339, + "step": 15890 + }, + { + "epoch": 0.0593093261117701, + "grad_norm": 0.363695353269577, + "learning_rate": 0.0006, + "loss": 2.1396, + "step": 15900 + }, + { + "epoch": 0.0593093261117701, + "eval_valid_loss": 2.208996534347534, + "eval_valid_loss/all": 2.070042610168457, + "eval_valid_loss/end_span": 1.2252445220947266, + "eval_valid_perplexity/batch": 7.925160884857178, + "eval_valid_perplexity/end_span": 3.404998540878296, + "eval_valid_perplexity/fim": 2.2801623344421387, + "eval_valid_perplexity/first_seq": 14.845629692077637, + "eval_valid_perplexity/last_seq": 9.455071449279785, + "eval_valid_perplexity/second_seq": 14.012252807617188, + "eval_valid_perplexity/seq": 8.929398536682129, + "eval_valid_reconstruction/all": 0.28974905610084534, + "eval_valid_reconstruction/end_span": 0.7184356451034546, + "eval_valid_reconstruction/fim": 0.16148050129413605, + "eval_valid_reconstruction/first_seq": 0.16828453540802002, + "eval_valid_reconstruction/last_seq": 0.31396031379699707, + "eval_valid_reconstruction/second_seq": 0.19162023067474365, + "eval_valid_runtime": 460.9726, + "eval_valid_samples_per_second": 0.417, + "eval_valid_steps_per_second": 0.417, + "step": 15900 + }, + { + "epoch": 0.0593093261117701, + "eval_train_loss": 2.205110549926758, 
+ "eval_train_loss/all": 2.0384418964385986, + "eval_train_loss/end_span": 1.179408311843872, + "eval_train_perplexity/batch": 7.678635597229004, + "eval_train_perplexity/end_span": 3.2524492740631104, + "eval_train_perplexity/fim": 2.1630706787109375, + "eval_train_perplexity/first_seq": 15.213968276977539, + "eval_train_perplexity/last_seq": 8.997441291809082, + "eval_train_perplexity/second_seq": 14.487894058227539, + "eval_train_perplexity/seq": 8.835232734680176, + "eval_train_reconstruction/all": 0.2801797688007355, + "eval_train_reconstruction/end_span": 0.733696699142456, + "eval_train_reconstruction/fim": 0.15158244967460632, + "eval_train_reconstruction/first_seq": 0.15763981640338898, + "eval_train_reconstruction/last_seq": 0.3267212212085724, + "eval_train_reconstruction/second_seq": 0.17526398599147797, + "eval_train_runtime": 461.1163, + "eval_train_samples_per_second": 0.416, + "eval_train_steps_per_second": 0.416, + "step": 15900 + }, + { + "epoch": 0.05934662757473348, + "grad_norm": 0.28963443636894226, + "learning_rate": 0.0006, + "loss": 2.2144, + "step": 15910 + }, + { + "epoch": 0.05938392903769686, + "grad_norm": 0.30375605821609497, + "learning_rate": 0.0006, + "loss": 2.2851, + "step": 15920 + }, + { + "epoch": 0.059421230500660234, + "grad_norm": 0.3184267580509186, + "learning_rate": 0.0006, + "loss": 2.2603, + "step": 15930 + }, + { + "epoch": 0.05945853196362361, + "grad_norm": 0.27106016874313354, + "learning_rate": 0.0006, + "loss": 2.243, + "step": 15940 + }, + { + "epoch": 0.05949583342658699, + "grad_norm": 0.526357114315033, + "learning_rate": 0.0006, + "loss": 2.2247, + "step": 15950 + }, + { + "epoch": 0.05949583342658699, + "eval_valid_loss": 2.2076468467712402, + "eval_valid_loss/all": 2.068855047225952, + "eval_valid_loss/end_span": 1.2781282663345337, + "eval_valid_perplexity/batch": 7.915754795074463, + "eval_valid_perplexity/end_span": 3.589914083480835, + "eval_valid_perplexity/fim": 2.5539371967315674, + 
"eval_valid_perplexity/first_seq": 15.047769546508789, + "eval_valid_perplexity/last_seq": 9.37348747253418, + "eval_valid_perplexity/second_seq": 13.90249252319336, + "eval_valid_perplexity/seq": 8.916056632995605, + "eval_valid_reconstruction/all": 0.28975892066955566, + "eval_valid_reconstruction/end_span": 0.6968381404876709, + "eval_valid_reconstruction/fim": 0.18421989679336548, + "eval_valid_reconstruction/first_seq": 0.16320198774337769, + "eval_valid_reconstruction/last_seq": 0.31585997343063354, + "eval_valid_reconstruction/second_seq": 0.19184757769107819, + "eval_valid_runtime": 465.4886, + "eval_valid_samples_per_second": 0.412, + "eval_valid_steps_per_second": 0.412, + "step": 15950 + }, + { + "epoch": 0.05949583342658699, + "eval_train_loss": 2.2035434246063232, + "eval_train_loss/all": 2.037433624267578, + "eval_train_loss/end_span": 1.2353782653808594, + "eval_train_perplexity/batch": 7.670897483825684, + "eval_train_perplexity/end_span": 3.4396793842315674, + "eval_train_perplexity/fim": 2.181225061416626, + "eval_train_perplexity/first_seq": 15.460467338562012, + "eval_train_perplexity/last_seq": 8.751411437988281, + "eval_train_perplexity/second_seq": 14.066645622253418, + "eval_train_perplexity/seq": 8.824516296386719, + "eval_train_reconstruction/all": 0.28025639057159424, + "eval_train_reconstruction/end_span": 0.7122951745986938, + "eval_train_reconstruction/fim": 0.15417535603046417, + "eval_train_reconstruction/first_seq": 0.15179166197776794, + "eval_train_reconstruction/last_seq": 0.3310989439487457, + "eval_train_reconstruction/second_seq": 0.18869119882583618, + "eval_train_runtime": 465.0756, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 15950 + }, + { + "epoch": 0.059533134889550365, + "grad_norm": 0.3924109637737274, + "learning_rate": 0.0006, + "loss": 2.2563, + "step": 15960 + }, + { + "epoch": 0.059570436352513745, + "grad_norm": 0.43430522084236145, + "learning_rate": 0.0006, + 
"loss": 2.3611, + "step": 15970 + }, + { + "epoch": 0.059607737815477124, + "grad_norm": 0.5587413311004639, + "learning_rate": 0.0006, + "loss": 2.3902, + "step": 15980 + }, + { + "epoch": 0.059645039278440504, + "grad_norm": 0.39695560932159424, + "learning_rate": 0.0006, + "loss": 2.2347, + "step": 15990 + }, + { + "epoch": 0.059682340741403876, + "grad_norm": 0.4715542495250702, + "learning_rate": 0.0006, + "loss": 2.1428, + "step": 16000 + }, + { + "epoch": 0.059682340741403876, + "eval_valid_loss": 2.206249475479126, + "eval_valid_loss/all": 2.067502737045288, + "eval_valid_loss/end_span": 1.3046995401382446, + "eval_valid_perplexity/batch": 7.905057430267334, + "eval_valid_perplexity/end_span": 3.6865813732147217, + "eval_valid_perplexity/fim": 2.352104425430298, + "eval_valid_perplexity/first_seq": 14.435288429260254, + "eval_valid_perplexity/last_seq": 8.917730331420898, + "eval_valid_perplexity/second_seq": 13.336121559143066, + "eval_valid_perplexity/seq": 8.906798362731934, + "eval_valid_reconstruction/all": 0.2903445065021515, + "eval_valid_reconstruction/end_span": 0.6859942078590393, + "eval_valid_reconstruction/fim": 0.16774508357048035, + "eval_valid_reconstruction/first_seq": 0.17535021901130676, + "eval_valid_reconstruction/last_seq": 0.328011691570282, + "eval_valid_reconstruction/second_seq": 0.2080812007188797, + "eval_valid_runtime": 448.9638, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 16000 + }, + { + "epoch": 0.059682340741403876, + "eval_train_loss": 2.2043490409851074, + "eval_train_loss/all": 2.03836727142334, + "eval_train_loss/end_span": 1.2846274375915527, + "eval_train_perplexity/batch": 7.678062915802002, + "eval_train_perplexity/end_span": 3.613321542739868, + "eval_train_perplexity/fim": 2.3915443420410156, + "eval_train_perplexity/first_seq": 15.357085227966309, + "eval_train_perplexity/last_seq": 8.612405776977539, + "eval_train_perplexity/second_seq": 14.488473892211914, + 
"eval_train_perplexity/seq": 8.835918426513672, + "eval_train_reconstruction/all": 0.2801284193992615, + "eval_train_reconstruction/end_span": 0.6937751173973083, + "eval_train_reconstruction/fim": 0.17091916501522064, + "eval_train_reconstruction/first_seq": 0.15453536808490753, + "eval_train_reconstruction/last_seq": 0.3383578062057495, + "eval_train_reconstruction/second_seq": 0.1786516308784485, + "eval_train_runtime": 453.9031, + "eval_train_samples_per_second": 0.423, + "eval_train_steps_per_second": 0.423, + "step": 16000 + }, + { + "epoch": 0.059719642204367256, + "grad_norm": 0.4372246265411377, + "learning_rate": 0.0006, + "loss": 2.2926, + "step": 16010 + }, + { + "epoch": 0.059756943667330635, + "grad_norm": 0.3755424916744232, + "learning_rate": 0.0006, + "loss": 2.2876, + "step": 16020 + }, + { + "epoch": 0.05979424513029401, + "grad_norm": 0.4728882610797882, + "learning_rate": 0.0006, + "loss": 2.1802, + "step": 16030 + }, + { + "epoch": 0.05983154659325739, + "grad_norm": 0.31563636660575867, + "learning_rate": 0.0006, + "loss": 2.3484, + "step": 16040 + }, + { + "epoch": 0.059868848056220766, + "grad_norm": 0.38896358013153076, + "learning_rate": 0.0006, + "loss": 2.2705, + "step": 16050 + }, + { + "epoch": 0.059868848056220766, + "eval_valid_loss": 2.20760440826416, + "eval_valid_loss/all": 2.0685877799987793, + "eval_valid_loss/end_span": 1.1988275051116943, + "eval_valid_perplexity/batch": 7.913639545440674, + "eval_valid_perplexity/end_span": 3.3162264823913574, + "eval_valid_perplexity/fim": 2.2648425102233887, + "eval_valid_perplexity/first_seq": 14.720203399658203, + "eval_valid_perplexity/last_seq": 9.254457473754883, + "eval_valid_perplexity/second_seq": 13.761286735534668, + "eval_valid_perplexity/seq": 8.915349006652832, + "eval_valid_reconstruction/all": 0.2903876304626465, + "eval_valid_reconstruction/end_span": 0.7215939164161682, + "eval_valid_reconstruction/fim": 0.1603883057832718, + "eval_valid_reconstruction/first_seq": 
0.1702900379896164, + "eval_valid_reconstruction/last_seq": 0.31791776418685913, + "eval_valid_reconstruction/second_seq": 0.19688844680786133, + "eval_valid_runtime": 455.5592, + "eval_valid_samples_per_second": 0.421, + "eval_valid_steps_per_second": 0.421, + "step": 16050 + }, + { + "epoch": 0.059868848056220766, + "eval_train_loss": 2.203984498977661, + "eval_train_loss/all": 2.0380403995513916, + "eval_train_loss/end_span": 1.160093903541565, + "eval_train_perplexity/batch": 7.675553321838379, + "eval_train_perplexity/end_span": 3.190232753753662, + "eval_train_perplexity/fim": 2.2685015201568604, + "eval_train_perplexity/first_seq": 15.63698673248291, + "eval_train_perplexity/last_seq": 9.55860710144043, + "eval_train_perplexity/second_seq": 14.190711975097656, + "eval_train_perplexity/seq": 8.832854270935059, + "eval_train_reconstruction/all": 0.2805117964744568, + "eval_train_reconstruction/end_span": 0.7363554239273071, + "eval_train_reconstruction/fim": 0.16043388843536377, + "eval_train_reconstruction/first_seq": 0.14824768900871277, + "eval_train_reconstruction/last_seq": 0.30674096941947937, + "eval_train_reconstruction/second_seq": 0.18690364062786102, + "eval_train_runtime": 464.4293, + "eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 16050 + }, + { + "epoch": 0.05990614951918414, + "grad_norm": 0.34125563502311707, + "learning_rate": 0.0006, + "loss": 2.393, + "step": 16060 + }, + { + "epoch": 0.05994345098214752, + "grad_norm": 0.3529304265975952, + "learning_rate": 0.0006, + "loss": 2.029, + "step": 16070 + }, + { + "epoch": 0.0599807524451109, + "grad_norm": 0.412006676197052, + "learning_rate": 0.0006, + "loss": 2.2779, + "step": 16080 + }, + { + "epoch": 0.06001805390807428, + "grad_norm": 0.5722240805625916, + "learning_rate": 0.0006, + "loss": 2.2835, + "step": 16090 + }, + { + "epoch": 0.06005535537103765, + "grad_norm": 0.3228626847267151, + "learning_rate": 0.0006, + "loss": 2.4398, + "step": 16100 
+ }, + { + "epoch": 0.06005535537103765, + "eval_valid_loss": 2.212984800338745, + "eval_valid_loss/all": 2.0736546516418457, + "eval_valid_loss/end_span": 1.2749228477478027, + "eval_valid_perplexity/batch": 7.953838348388672, + "eval_valid_perplexity/end_span": 3.578425407409668, + "eval_valid_perplexity/fim": 2.1934621334075928, + "eval_valid_perplexity/first_seq": 14.630788803100586, + "eval_valid_perplexity/last_seq": 8.79987621307373, + "eval_valid_perplexity/second_seq": 13.630139350891113, + "eval_valid_perplexity/seq": 8.964255332946777, + "eval_valid_reconstruction/all": 0.2886354625225067, + "eval_valid_reconstruction/end_span": 0.6944872140884399, + "eval_valid_reconstruction/fim": 0.15306082367897034, + "eval_valid_reconstruction/first_seq": 0.17437370121479034, + "eval_valid_reconstruction/last_seq": 0.3333587944507599, + "eval_valid_reconstruction/second_seq": 0.2005409151315689, + "eval_valid_runtime": 459.5365, + "eval_valid_samples_per_second": 0.418, + "eval_valid_steps_per_second": 0.418, + "step": 16100 + }, + { + "epoch": 0.06005535537103765, + "eval_train_loss": 2.209968328475952, + "eval_train_loss/all": 2.043301820755005, + "eval_train_loss/end_span": 1.2399351596832275, + "eval_train_perplexity/batch": 7.716043949127197, + "eval_train_perplexity/end_span": 3.4553894996643066, + "eval_train_perplexity/fim": 2.042915105819702, + "eval_train_perplexity/first_seq": 15.655444145202637, + "eval_train_perplexity/last_seq": 9.00047492980957, + "eval_train_perplexity/second_seq": 14.25905990600586, + "eval_train_perplexity/seq": 8.87804889678955, + "eval_train_reconstruction/all": 0.27869051694869995, + "eval_train_reconstruction/end_span": 0.7078365683555603, + "eval_train_reconstruction/fim": 0.13898715376853943, + "eval_train_reconstruction/first_seq": 0.14955079555511475, + "eval_train_reconstruction/last_seq": 0.3265548050403595, + "eval_train_reconstruction/second_seq": 0.18303559720516205, + "eval_train_runtime": 465.2439, + 
"eval_train_samples_per_second": 0.413, + "eval_train_steps_per_second": 0.413, + "step": 16100 + }, + { + "epoch": 0.06009265683400103, + "grad_norm": 0.3953229784965515, + "learning_rate": 0.0006, + "loss": 2.3304, + "step": 16110 + }, + { + "epoch": 0.06012995829696441, + "grad_norm": 0.4067004323005676, + "learning_rate": 0.0006, + "loss": 2.2022, + "step": 16120 + }, + { + "epoch": 0.06016725975992778, + "grad_norm": 0.3774869740009308, + "learning_rate": 0.0006, + "loss": 2.2646, + "step": 16130 + }, + { + "epoch": 0.06020456122289116, + "grad_norm": 0.5325101017951965, + "learning_rate": 0.0006, + "loss": 2.083, + "step": 16140 + }, + { + "epoch": 0.06024186268585454, + "grad_norm": 0.3990691006183624, + "learning_rate": 0.0006, + "loss": 2.3813, + "step": 16150 + }, + { + "epoch": 0.06024186268585454, + "eval_valid_loss": 2.209702253341675, + "eval_valid_loss/all": 2.0701839923858643, + "eval_valid_loss/end_span": 1.203547716140747, + "eval_valid_perplexity/batch": 7.926281452178955, + "eval_valid_perplexity/end_span": 3.331916570663452, + "eval_valid_perplexity/fim": 2.267758846282959, + "eval_valid_perplexity/first_seq": 14.752065658569336, + "eval_valid_perplexity/last_seq": 9.342671394348145, + "eval_valid_perplexity/second_seq": 13.751236915588379, + "eval_valid_perplexity/seq": 8.92745590209961, + "eval_valid_reconstruction/all": 0.2896157205104828, + "eval_valid_reconstruction/end_span": 0.7155151963233948, + "eval_valid_reconstruction/fim": 0.1597665250301361, + "eval_valid_reconstruction/first_seq": 0.16837060451507568, + "eval_valid_reconstruction/last_seq": 0.3163624703884125, + "eval_valid_reconstruction/second_seq": 0.1974448561668396, + "eval_valid_runtime": 464.9822, + "eval_valid_samples_per_second": 0.413, + "eval_valid_steps_per_second": 0.413, + "step": 16150 + }, + { + "epoch": 0.06024186268585454, + "eval_train_loss": 2.208784341812134, + "eval_train_loss/all": 2.042158603668213, + "eval_train_loss/end_span": 1.1739858388900757, + 
"eval_train_perplexity/batch": 7.707228183746338, + "eval_train_perplexity/end_span": 3.23486065864563, + "eval_train_perplexity/fim": 2.1680092811584473, + "eval_train_perplexity/first_seq": 15.43130874633789, + "eval_train_perplexity/last_seq": 9.103361129760742, + "eval_train_perplexity/second_seq": 13.979334831237793, + "eval_train_perplexity/seq": 8.868794441223145, + "eval_train_reconstruction/all": 0.27918022871017456, + "eval_train_reconstruction/end_span": 0.7278331518173218, + "eval_train_reconstruction/fim": 0.15222972631454468, + "eval_train_reconstruction/first_seq": 0.15006375312805176, + "eval_train_reconstruction/last_seq": 0.32120952010154724, + "eval_train_reconstruction/second_seq": 0.19057892262935638, + "eval_train_runtime": 463.1503, + "eval_train_samples_per_second": 0.415, + "eval_train_steps_per_second": 0.415, + "step": 16150 + }, + { + "epoch": 0.06027916414881792, + "grad_norm": 0.32074278593063354, + "learning_rate": 0.0006, + "loss": 2.0901, + "step": 16160 + }, + { + "epoch": 0.06031646561178129, + "grad_norm": 0.24767936766147614, + "learning_rate": 0.0006, + "loss": 2.1863, + "step": 16170 + }, + { + "epoch": 0.06035376707474467, + "grad_norm": 0.4243989884853363, + "learning_rate": 0.0006, + "loss": 2.2795, + "step": 16180 + }, + { + "epoch": 0.06039106853770805, + "grad_norm": 0.19615331292152405, + "learning_rate": 0.0006, + "loss": 2.3417, + "step": 16190 + }, + { + "epoch": 0.06042837000067142, + "grad_norm": 0.509572446346283, + "learning_rate": 0.0006, + "loss": 2.3447, + "step": 16200 + }, + { + "epoch": 0.06042837000067142, + "eval_valid_loss": 2.2064714431762695, + "eval_valid_loss/all": 2.0674149990081787, + "eval_valid_loss/end_span": 1.2642878293991089, + "eval_valid_perplexity/batch": 7.904364109039307, + "eval_valid_perplexity/end_span": 3.5405702590942383, + "eval_valid_perplexity/fim": 2.05656361579895, + "eval_valid_perplexity/first_seq": 14.603859901428223, + "eval_valid_perplexity/last_seq": 9.239665985107422, + 
"eval_valid_perplexity/second_seq": 14.1366548538208, + "eval_valid_perplexity/seq": 8.901878356933594, + "eval_valid_reconstruction/all": 0.2906929552555084, + "eval_valid_reconstruction/end_span": 0.7040456533432007, + "eval_valid_reconstruction/fim": 0.14124181866645813, + "eval_valid_reconstruction/first_seq": 0.17697083950042725, + "eval_valid_reconstruction/last_seq": 0.3188515901565552, + "eval_valid_reconstruction/second_seq": 0.18710964918136597, + "eval_valid_runtime": 484.6803, + "eval_valid_samples_per_second": 0.396, + "eval_valid_steps_per_second": 0.396, + "step": 16200 + }, + { + "epoch": 0.06042837000067142, + "eval_train_loss": 2.2060139179229736, + "eval_train_loss/all": 2.039698362350464, + "eval_train_loss/end_span": 1.2328916788101196, + "eval_train_perplexity/batch": 7.688289642333984, + "eval_train_perplexity/end_span": 3.4311368465423584, + "eval_train_perplexity/fim": 2.052722215652466, + "eval_train_perplexity/first_seq": 15.505369186401367, + "eval_train_perplexity/last_seq": 8.935362815856934, + "eval_train_perplexity/second_seq": 14.616848945617676, + "eval_train_perplexity/seq": 8.843098640441895, + "eval_train_reconstruction/all": 0.2800649404525757, + "eval_train_reconstruction/end_span": 0.7145199775695801, + "eval_train_reconstruction/fim": 0.14038977026939392, + "eval_train_reconstruction/first_seq": 0.15250951051712036, + "eval_train_reconstruction/last_seq": 0.3269274830818176, + "eval_train_reconstruction/second_seq": 0.17697206139564514, + "eval_train_runtime": 473.5301, + "eval_train_samples_per_second": 0.405, + "eval_train_steps_per_second": 0.405, + "step": 16200 + }, + { + "epoch": 0.0604656714636348, + "grad_norm": 0.25501394271850586, + "learning_rate": 0.0006, + "loss": 2.3369, + "step": 16210 + }, + { + "epoch": 0.06050297292659818, + "grad_norm": 0.330944299697876, + "learning_rate": 0.0006, + "loss": 2.2026, + "step": 16220 + }, + { + "epoch": 0.06054027438956156, + "grad_norm": 0.29708877205848694, + 
"learning_rate": 0.0006, + "loss": 2.136, + "step": 16230 + }, + { + "epoch": 0.060577575852524934, + "grad_norm": 0.344900906085968, + "learning_rate": 0.0006, + "loss": 2.0643, + "step": 16240 + }, + { + "epoch": 0.060614877315488314, + "grad_norm": 0.42992985248565674, + "learning_rate": 0.0006, + "loss": 2.1329, + "step": 16250 + }, + { + "epoch": 0.060614877315488314, + "eval_valid_loss": 2.208789825439453, + "eval_valid_loss/all": 2.069464921951294, + "eval_valid_loss/end_span": 1.279086947441101, + "eval_valid_perplexity/batch": 7.920583724975586, + "eval_valid_perplexity/end_span": 3.5933573246002197, + "eval_valid_perplexity/fim": 2.0919580459594727, + "eval_valid_perplexity/first_seq": 15.458565711975098, + "eval_valid_perplexity/last_seq": 8.773578643798828, + "eval_valid_perplexity/second_seq": 13.851627349853516, + "eval_valid_perplexity/seq": 8.92396068572998, + "eval_valid_reconstruction/all": 0.2898334562778473, + "eval_valid_reconstruction/end_span": 0.6989811062812805, + "eval_valid_reconstruction/fim": 0.14412033557891846, + "eval_valid_reconstruction/first_seq": 0.15641652047634125, + "eval_valid_reconstruction/last_seq": 0.3364299237728119, + "eval_valid_reconstruction/second_seq": 0.19242270290851593, + "eval_valid_runtime": 455.429, + "eval_valid_samples_per_second": 0.422, + "eval_valid_steps_per_second": 0.422, + "step": 16250 + }, + { + "epoch": 0.060614877315488314, + "eval_train_loss": 2.2066261768341064, + "eval_train_loss/all": 2.0407021045684814, + "eval_train_loss/end_span": 1.2420244216918945, + "eval_train_perplexity/batch": 7.696010589599609, + "eval_train_perplexity/end_span": 3.462616205215454, + "eval_train_perplexity/fim": 2.051340103149414, + "eval_train_perplexity/first_seq": 15.693184852600098, + "eval_train_perplexity/last_seq": 8.993809700012207, + "eval_train_perplexity/second_seq": 14.714662551879883, + "eval_train_perplexity/seq": 8.858534812927246, + "eval_train_reconstruction/all": 0.2793899178504944, + 
"eval_train_reconstruction/end_span": 0.7098856568336487, + "eval_train_reconstruction/fim": 0.14021959900856018, + "eval_train_reconstruction/first_seq": 0.14806059002876282, + "eval_train_reconstruction/last_seq": 0.3237880766391754, + "eval_train_reconstruction/second_seq": 0.1705067902803421, + "eval_train_runtime": 460.4883, + "eval_train_samples_per_second": 0.417, + "eval_train_steps_per_second": 0.417, + "step": 16250 + }, + { + "epoch": 0.06065217877845169, + "grad_norm": 0.37815821170806885, + "learning_rate": 0.0006, + "loss": 2.0202, + "step": 16260 + }, + { + "epoch": 0.060689480241415066, + "grad_norm": 0.5192695260047913, + "learning_rate": 0.0006, + "loss": 2.2528, + "step": 16270 + }, + { + "epoch": 0.060726781704378445, + "grad_norm": 0.6371777057647705, + "learning_rate": 0.0006, + "loss": 2.2203, + "step": 16280 + }, + { + "epoch": 0.060764083167341824, + "grad_norm": 0.4065472185611725, + "learning_rate": 0.0006, + "loss": 2.3444, + "step": 16290 + }, + { + "epoch": 0.060801384630305204, + "grad_norm": 0.504209041595459, + "learning_rate": 0.0006, + "loss": 2.3343, + "step": 16300 + }, + { + "epoch": 0.060801384630305204, + "eval_valid_loss": 2.206383466720581, + "eval_valid_loss/all": 2.0674490928649902, + "eval_valid_loss/end_span": 1.2725811004638672, + "eval_valid_perplexity/batch": 7.904633522033691, + "eval_valid_perplexity/end_span": 3.5700552463531494, + "eval_valid_perplexity/fim": 2.2489540576934814, + "eval_valid_perplexity/first_seq": 14.933910369873047, + "eval_valid_perplexity/last_seq": 8.892248153686523, + "eval_valid_perplexity/second_seq": 13.674637794494629, + "eval_valid_perplexity/seq": 8.905536651611328, + "eval_valid_reconstruction/all": 0.29038089513778687, + "eval_valid_reconstruction/end_span": 0.6957823634147644, + "eval_valid_reconstruction/fim": 0.1592526137828827, + "eval_valid_reconstruction/first_seq": 0.1681443750858307, + "eval_valid_reconstruction/last_seq": 0.3304498791694641, + 
"eval_valid_reconstruction/second_seq": 0.19641751050949097, + "eval_valid_runtime": 455.0754, + "eval_valid_samples_per_second": 0.422, + "eval_valid_steps_per_second": 0.422, + "step": 16300 + }, + { + "epoch": 0.060801384630305204, + "eval_train_loss": 2.2036216259002686, + "eval_train_loss/all": 2.037787914276123, + "eval_train_loss/end_span": 1.24857497215271, + "eval_train_perplexity/batch": 7.6736159324646, + "eval_train_perplexity/end_span": 3.48537278175354, + "eval_train_perplexity/fim": 2.1771905422210693, + "eval_train_perplexity/first_seq": 15.198299407958984, + "eval_train_perplexity/last_seq": 9.047607421875, + "eval_train_perplexity/second_seq": 13.851210594177246, + "eval_train_perplexity/seq": 8.827052116394043, + "eval_train_reconstruction/all": 0.2803453505039215, + "eval_train_reconstruction/end_span": 0.7047781944274902, + "eval_train_reconstruction/fim": 0.1527198851108551, + "eval_train_reconstruction/first_seq": 0.15798324346542358, + "eval_train_reconstruction/last_seq": 0.3227371275424957, + "eval_train_reconstruction/second_seq": 0.19238047301769257, + "eval_train_runtime": 455.6239, + "eval_train_samples_per_second": 0.421, + "eval_train_steps_per_second": 0.421, + "step": 16300 + }, + { + "epoch": 0.060838686093268576, + "grad_norm": 0.7140604853630066, + "learning_rate": 0.0006, + "loss": 2.3379, + "step": 16310 + }, + { + "epoch": 0.060875987556231956, + "grad_norm": 0.42322999238967896, + "learning_rate": 0.0006, + "loss": 2.3673, + "step": 16320 + }, + { + "epoch": 0.060913289019195335, + "grad_norm": 0.3415990471839905, + "learning_rate": 0.0006, + "loss": 2.2966, + "step": 16330 + }, + { + "epoch": 0.06095059048215871, + "grad_norm": 0.5154109001159668, + "learning_rate": 0.0006, + "loss": 2.2689, + "step": 16340 + }, + { + "epoch": 0.06098789194512209, + "grad_norm": 0.2317839115858078, + "learning_rate": 0.0006, + "loss": 2.3841, + "step": 16350 + }, + { + "epoch": 0.06098789194512209, + "eval_valid_loss": 2.2090117931365967, + 
"eval_valid_loss/all": 2.0699234008789062, + "eval_valid_loss/end_span": 1.2826728820800781, + "eval_valid_perplexity/batch": 7.924216270446777, + "eval_valid_perplexity/end_span": 3.6062660217285156, + "eval_valid_perplexity/fim": 2.5983500480651855, + "eval_valid_perplexity/first_seq": 14.476122856140137, + "eval_valid_perplexity/last_seq": 8.799180030822754, + "eval_valid_perplexity/second_seq": 13.783062934875488, + "eval_valid_perplexity/seq": 8.925216674804688, + "eval_valid_reconstruction/all": 0.2895297408103943, + "eval_valid_reconstruction/end_span": 0.6910330057144165, + "eval_valid_reconstruction/fim": 0.18707972764968872, + "eval_valid_reconstruction/first_seq": 0.17612378299236298, + "eval_valid_reconstruction/last_seq": 0.33581167459487915, + "eval_valid_reconstruction/second_seq": 0.19496774673461914, + "eval_valid_runtime": 447.0009, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 16350 + }, + { + "epoch": 0.06098789194512209, + "eval_train_loss": 2.2062366008758545, + "eval_train_loss/all": 2.0401229858398438, + "eval_train_loss/end_span": 1.2497189044952393, + "eval_train_perplexity/batch": 7.691555023193359, + "eval_train_perplexity/end_span": 3.4893620014190674, + "eval_train_perplexity/fim": 2.003290891647339, + "eval_train_perplexity/first_seq": 15.58956527709961, + "eval_train_perplexity/last_seq": 9.261310577392578, + "eval_train_perplexity/second_seq": 14.033308982849121, + "eval_train_perplexity/seq": 8.850056648254395, + "eval_train_reconstruction/all": 0.2794967293739319, + "eval_train_reconstruction/end_span": 0.7028581500053406, + "eval_train_reconstruction/fim": 0.1363561451435089, + "eval_train_reconstruction/first_seq": 0.15034353733062744, + "eval_train_reconstruction/last_seq": 0.3138892352581024, + "eval_train_reconstruction/second_seq": 0.18648208677768707, + "eval_train_runtime": 459.4607, + "eval_train_samples_per_second": 0.418, + "eval_train_steps_per_second": 0.418, + "step": 16350 
+ }, + { + "epoch": 0.06102519340808547, + "grad_norm": 0.3163726329803467, + "learning_rate": 0.0006, + "loss": 2.1134, + "step": 16360 + }, + { + "epoch": 0.06106249487104884, + "grad_norm": 0.3706524670124054, + "learning_rate": 0.0006, + "loss": 2.1868, + "step": 16370 + }, + { + "epoch": 0.06109979633401222, + "grad_norm": 0.3858804404735565, + "learning_rate": 0.0006, + "loss": 2.1825, + "step": 16380 + }, + { + "epoch": 0.0611370977969756, + "grad_norm": 0.6705479621887207, + "learning_rate": 0.0006, + "loss": 2.2647, + "step": 16390 + }, + { + "epoch": 0.06117439925993898, + "grad_norm": 0.4253837764263153, + "learning_rate": 0.0006, + "loss": 2.1813, + "step": 16400 + }, + { + "epoch": 0.06117439925993898, + "eval_valid_loss": 2.2100954055786133, + "eval_valid_loss/all": 2.070897102355957, + "eval_valid_loss/end_span": 1.1929571628570557, + "eval_valid_perplexity/batch": 7.931935787200928, + "eval_valid_perplexity/end_span": 3.296816110610962, + "eval_valid_perplexity/fim": 2.2394368648529053, + "eval_valid_perplexity/first_seq": 14.444876670837402, + "eval_valid_perplexity/last_seq": 8.56992244720459, + "eval_valid_perplexity/second_seq": 13.624303817749023, + "eval_valid_perplexity/seq": 8.933653831481934, + "eval_valid_reconstruction/all": 0.28925999999046326, + "eval_valid_reconstruction/end_span": 0.724030077457428, + "eval_valid_reconstruction/fim": 0.15735922753810883, + "eval_valid_reconstruction/first_seq": 0.17338085174560547, + "eval_valid_reconstruction/last_seq": 0.34421393275260925, + "eval_valid_reconstruction/second_seq": 0.1976654827594757, + "eval_valid_runtime": 460.0881, + "eval_valid_samples_per_second": 0.417, + "eval_valid_steps_per_second": 0.417, + "step": 16400 + }, + { + "epoch": 0.06117439925993898, + "eval_train_loss": 2.2076094150543213, + "eval_train_loss/all": 2.041166067123413, + "eval_train_loss/end_span": 1.1493960618972778, + "eval_train_perplexity/batch": 7.699582099914551, + "eval_train_perplexity/end_span": 
3.1562862396240234, + "eval_train_perplexity/fim": 2.0655806064605713, + "eval_train_perplexity/first_seq": 15.624412536621094, + "eval_train_perplexity/last_seq": 9.071271896362305, + "eval_train_perplexity/second_seq": 14.433244705200195, + "eval_train_perplexity/seq": 8.856629371643066, + "eval_train_reconstruction/all": 0.27938854694366455, + "eval_train_reconstruction/end_span": 0.7377427220344543, + "eval_train_reconstruction/fim": 0.14170721173286438, + "eval_train_reconstruction/first_seq": 0.15025626122951508, + "eval_train_reconstruction/last_seq": 0.32474201917648315, + "eval_train_reconstruction/second_seq": 0.17921705543994904, + "eval_train_runtime": 463.0281, + "eval_train_samples_per_second": 0.415, + "eval_train_steps_per_second": 0.415, + "step": 16400 + }, + { + "epoch": 0.06121170072290235, + "grad_norm": 0.4093838334083557, + "learning_rate": 0.0006, + "loss": 2.2408, + "step": 16410 + }, + { + "epoch": 0.06124900218586573, + "grad_norm": 0.3654783070087433, + "learning_rate": 0.0006, + "loss": 2.2707, + "step": 16420 + }, + { + "epoch": 0.06128630364882911, + "grad_norm": 0.37434086203575134, + "learning_rate": 0.0006, + "loss": 2.0328, + "step": 16430 + }, + { + "epoch": 0.06132360511179248, + "grad_norm": 0.22930118441581726, + "learning_rate": 0.0006, + "loss": 2.4545, + "step": 16440 + }, + { + "epoch": 0.06136090657475586, + "grad_norm": 0.35906749963760376, + "learning_rate": 0.0006, + "loss": 2.1724, + "step": 16450 + }, + { + "epoch": 0.06136090657475586, + "eval_valid_loss": 2.2082154750823975, + "eval_valid_loss/all": 2.0692944526672363, + "eval_valid_loss/end_span": 1.2146438360214233, + "eval_valid_perplexity/batch": 7.919233798980713, + "eval_valid_perplexity/end_span": 3.369093894958496, + "eval_valid_perplexity/fim": 2.4673893451690674, + "eval_valid_perplexity/first_seq": 14.655590057373047, + "eval_valid_perplexity/last_seq": 8.718371391296387, + "eval_valid_perplexity/second_seq": 13.80409049987793, + 
"eval_valid_perplexity/seq": 8.922091484069824, + "eval_valid_reconstruction/all": 0.2898320257663727, + "eval_valid_reconstruction/end_span": 0.7171245813369751, + "eval_valid_reconstruction/fim": 0.177970752120018, + "eval_valid_reconstruction/first_seq": 0.1716148853302002, + "eval_valid_reconstruction/last_seq": 0.3389855623245239, + "eval_valid_reconstruction/second_seq": 0.1951722502708435, + "eval_valid_runtime": 456.7332, + "eval_valid_samples_per_second": 0.42, + "eval_valid_steps_per_second": 0.42, + "step": 16450 + }, + { + "epoch": 0.06136090657475586, + "eval_train_loss": 2.2052078247070312, + "eval_train_loss/all": 2.039335012435913, + "eval_train_loss/end_span": 1.1853320598602295, + "eval_train_perplexity/batch": 7.685496807098389, + "eval_train_perplexity/end_span": 3.271773099899292, + "eval_train_perplexity/fim": 2.324549913406372, + "eval_train_perplexity/first_seq": 15.46573543548584, + "eval_train_perplexity/last_seq": 9.429097175598145, + "eval_train_perplexity/second_seq": 14.068672180175781, + "eval_train_perplexity/seq": 8.842108726501465, + "eval_train_reconstruction/all": 0.2799321413040161, + "eval_train_reconstruction/end_span": 0.7271942496299744, + "eval_train_reconstruction/fim": 0.16624927520751953, + "eval_train_reconstruction/first_seq": 0.15154264867305756, + "eval_train_reconstruction/last_seq": 0.3085850477218628, + "eval_train_reconstruction/second_seq": 0.18608087301254272, + "eval_train_runtime": 453.5656, + "eval_train_samples_per_second": 0.423, + "eval_train_steps_per_second": 0.423, + "step": 16450 + }, + { + "epoch": 0.06139820803771924, + "grad_norm": 0.30793634057044983, + "learning_rate": 0.0006, + "loss": 2.2413, + "step": 16460 + }, + { + "epoch": 0.06143550950068262, + "grad_norm": 0.6378623843193054, + "learning_rate": 0.0006, + "loss": 2.2746, + "step": 16470 + }, + { + "epoch": 0.06147281096364599, + "grad_norm": 0.609226405620575, + "learning_rate": 0.0006, + "loss": 2.1243, + "step": 16480 + }, + { + 
"epoch": 0.06151011242660937, + "grad_norm": 0.3348153829574585, + "learning_rate": 0.0006, + "loss": 2.3877, + "step": 16490 + }, + { + "epoch": 0.06154741388957275, + "grad_norm": 0.43033111095428467, + "learning_rate": 0.0006, + "loss": 2.2871, + "step": 16500 + }, + { + "epoch": 0.06154741388957275, + "eval_valid_loss": 2.2045347690582275, + "eval_valid_loss/all": 2.0655484199523926, + "eval_valid_loss/end_span": 1.173404574394226, + "eval_valid_perplexity/batch": 7.889623641967773, + "eval_valid_perplexity/end_span": 3.232980966567993, + "eval_valid_perplexity/fim": 2.665104389190674, + "eval_valid_perplexity/first_seq": 14.478650093078613, + "eval_valid_perplexity/last_seq": 9.362449645996094, + "eval_valid_perplexity/second_seq": 13.721500396728516, + "eval_valid_perplexity/seq": 8.887020111083984, + "eval_valid_reconstruction/all": 0.2909027636051178, + "eval_valid_reconstruction/end_span": 0.7312396764755249, + "eval_valid_reconstruction/fim": 0.1926562488079071, + "eval_valid_reconstruction/first_seq": 0.17783388495445251, + "eval_valid_reconstruction/last_seq": 0.3152323067188263, + "eval_valid_reconstruction/second_seq": 0.19673356413841248, + "eval_valid_runtime": 455.4329, + "eval_valid_samples_per_second": 0.422, + "eval_valid_steps_per_second": 0.422, + "step": 16500 + }, + { + "epoch": 0.06154741388957275, + "eval_train_loss": 2.2028791904449463, + "eval_train_loss/all": 2.0368943214416504, + "eval_train_loss/end_span": 1.1488988399505615, + "eval_train_perplexity/batch": 7.666761875152588, + "eval_train_perplexity/end_span": 3.154717206954956, + "eval_train_perplexity/fim": 2.1075847148895264, + "eval_train_perplexity/first_seq": 15.551351547241211, + "eval_train_perplexity/last_seq": 9.119112014770508, + "eval_train_perplexity/second_seq": 14.150510787963867, + "eval_train_perplexity/seq": 8.818666458129883, + "eval_train_reconstruction/all": 0.28074944019317627, + "eval_train_reconstruction/end_span": 0.7411795258522034, + 
"eval_train_reconstruction/fim": 0.14737442135810852, + "eval_train_reconstruction/first_seq": 0.15239088237285614, + "eval_train_reconstruction/last_seq": 0.3206944465637207, + "eval_train_reconstruction/second_seq": 0.1866534948348999, + "eval_train_runtime": 452.1892, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 16500 + }, + { + "epoch": 0.061584715352536124, + "grad_norm": 0.2852994501590729, + "learning_rate": 0.0006, + "loss": 2.1792, + "step": 16510 + }, + { + "epoch": 0.0616220168154995, + "grad_norm": 0.2798702120780945, + "learning_rate": 0.0006, + "loss": 2.3084, + "step": 16520 + }, + { + "epoch": 0.06165931827846288, + "grad_norm": 0.34691134095191956, + "learning_rate": 0.0006, + "loss": 2.1264, + "step": 16530 + }, + { + "epoch": 0.06169661974142626, + "grad_norm": 0.39865830540657043, + "learning_rate": 0.0006, + "loss": 2.2852, + "step": 16540 + }, + { + "epoch": 0.061733921204389634, + "grad_norm": 0.44452938437461853, + "learning_rate": 0.0006, + "loss": 2.2125, + "step": 16550 + }, + { + "epoch": 0.061733921204389634, + "eval_valid_loss": 2.2059223651885986, + "eval_valid_loss/all": 2.0666701793670654, + "eval_valid_loss/end_span": 1.2064728736877441, + "eval_valid_perplexity/batch": 7.898478984832764, + "eval_valid_perplexity/end_span": 3.34167742729187, + "eval_valid_perplexity/fim": 2.484147548675537, + "eval_valid_perplexity/first_seq": 15.099550247192383, + "eval_valid_perplexity/last_seq": 8.798332214355469, + "eval_valid_perplexity/second_seq": 13.692444801330566, + "eval_valid_perplexity/seq": 8.89377212524414, + "eval_valid_reconstruction/all": 0.2906671166419983, + "eval_valid_reconstruction/end_span": 0.7128626108169556, + "eval_valid_reconstruction/fim": 0.17978119850158691, + "eval_valid_reconstruction/first_seq": 0.16341131925582886, + "eval_valid_reconstruction/last_seq": 0.3343311548233032, + "eval_valid_reconstruction/second_seq": 0.1978125274181366, + "eval_valid_runtime": 
457.6543, + "eval_valid_samples_per_second": 0.42, + "eval_valid_steps_per_second": 0.42, + "step": 16550 + }, + { + "epoch": 0.061733921204389634, + "eval_train_loss": 2.2042629718780518, + "eval_train_loss/all": 2.0383477210998535, + "eval_train_loss/end_span": 1.1863676309585571, + "eval_train_perplexity/batch": 7.677912712097168, + "eval_train_perplexity/end_span": 3.275162935256958, + "eval_train_perplexity/fim": 2.048257827758789, + "eval_train_perplexity/first_seq": 15.355217933654785, + "eval_train_perplexity/last_seq": 9.059852600097656, + "eval_train_perplexity/second_seq": 14.212924003601074, + "eval_train_perplexity/seq": 8.829479217529297, + "eval_train_reconstruction/all": 0.2804108262062073, + "eval_train_reconstruction/end_span": 0.7218869924545288, + "eval_train_reconstruction/fim": 0.14122889935970306, + "eval_train_reconstruction/first_seq": 0.15179240703582764, + "eval_train_reconstruction/last_seq": 0.32303470373153687, + "eval_train_reconstruction/second_seq": 0.18074500560760498, + "eval_train_runtime": 452.6227, + "eval_train_samples_per_second": 0.424, + "eval_train_steps_per_second": 0.424, + "step": 16550 + }, + { + "epoch": 0.061771222667353014, + "grad_norm": 0.40048664808273315, + "learning_rate": 0.0006, + "loss": 2.3403, + "step": 16560 + }, + { + "epoch": 0.06180852413031639, + "grad_norm": 0.288134902715683, + "learning_rate": 0.0006, + "loss": 2.1094, + "step": 16570 + }, + { + "epoch": 0.061845825593279766, + "grad_norm": 0.4658225476741791, + "learning_rate": 0.0006, + "loss": 2.1974, + "step": 16580 + }, + { + "epoch": 0.061883127056243145, + "grad_norm": 0.4468512535095215, + "learning_rate": 0.0006, + "loss": 2.2741, + "step": 16590 + }, + { + "epoch": 0.061920428519206525, + "grad_norm": 0.36721402406692505, + "learning_rate": 0.0006, + "loss": 2.2347, + "step": 16600 + }, + { + "epoch": 0.061920428519206525, + "eval_valid_loss": 2.207622766494751, + "eval_valid_loss/all": 2.068384885787964, + "eval_valid_loss/end_span": 
1.2638860940933228, + "eval_valid_perplexity/batch": 7.912034034729004, + "eval_valid_perplexity/end_span": 3.5391483306884766, + "eval_valid_perplexity/fim": 2.376085042953491, + "eval_valid_perplexity/first_seq": 15.028559684753418, + "eval_valid_perplexity/last_seq": 8.711851119995117, + "eval_valid_perplexity/second_seq": 13.809245109558105, + "eval_valid_perplexity/seq": 8.908820152282715, + "eval_valid_reconstruction/all": 0.28999412059783936, + "eval_valid_reconstruction/end_span": 0.6940533518791199, + "eval_valid_reconstruction/fim": 0.16982541978359222, + "eval_valid_reconstruction/first_seq": 0.1636536419391632, + "eval_valid_reconstruction/last_seq": 0.3384073078632355, + "eval_valid_reconstruction/second_seq": 0.19331437349319458, + "eval_valid_runtime": 458.7571, + "eval_valid_samples_per_second": 0.419, + "eval_valid_steps_per_second": 0.419, + "step": 16600 + }, + { + "epoch": 0.061920428519206525, + "eval_train_loss": 2.2054855823516846, + "eval_train_loss/all": 2.0391833782196045, + "eval_train_loss/end_span": 1.2399446964263916, + "eval_train_perplexity/batch": 7.68433141708374, + "eval_train_perplexity/end_span": 3.4554224014282227, + "eval_train_perplexity/fim": 1.855580449104309, + "eval_train_perplexity/first_seq": 15.310935974121094, + "eval_train_perplexity/last_seq": 8.810358047485352, + "eval_train_perplexity/second_seq": 14.49140739440918, + "eval_train_perplexity/seq": 8.836360931396484, + "eval_train_reconstruction/all": 0.27997255325317383, + "eval_train_reconstruction/end_span": 0.7032756209373474, + "eval_train_reconstruction/fim": 0.12190930545330048, + "eval_train_reconstruction/first_seq": 0.15891961753368378, + "eval_train_reconstruction/last_seq": 0.3285973072052002, + "eval_train_reconstruction/second_seq": 0.17823000252246857, + "eval_train_runtime": 461.8539, + "eval_train_samples_per_second": 0.416, + "eval_train_steps_per_second": 0.416, + "step": 16600 + }, + { + "epoch": 0.061957729982169904, + "grad_norm": 
0.32819533348083496, + "learning_rate": 0.0006, + "loss": 2.0541, + "step": 16610 + }, + { + "epoch": 0.061995031445133277, + "grad_norm": 0.23910640180110931, + "learning_rate": 0.0006, + "loss": 2.2897, + "step": 16620 + }, + { + "epoch": 0.062032332908096656, + "grad_norm": 0.4013685882091522, + "learning_rate": 0.0006, + "loss": 2.1631, + "step": 16630 + }, + { + "epoch": 0.062069634371060035, + "grad_norm": 0.3407006561756134, + "learning_rate": 0.0006, + "loss": 2.2301, + "step": 16640 + }, + { + "epoch": 0.06210693583402341, + "grad_norm": 0.40532195568084717, + "learning_rate": 0.0006, + "loss": 2.35, + "step": 16650 + }, + { + "epoch": 0.06210693583402341, + "eval_valid_loss": 2.206138849258423, + "eval_valid_loss/all": 2.0671966075897217, + "eval_valid_loss/end_span": 1.2107940912246704, + "eval_valid_perplexity/batch": 7.902637958526611, + "eval_valid_perplexity/end_span": 3.3561487197875977, + "eval_valid_perplexity/fim": 2.3553483486175537, + "eval_valid_perplexity/first_seq": 14.918305397033691, + "eval_valid_perplexity/last_seq": 8.932584762573242, + "eval_valid_perplexity/second_seq": 13.747804641723633, + "eval_valid_perplexity/seq": 8.902447700500488, + "eval_valid_reconstruction/all": 0.29041028022766113, + "eval_valid_reconstruction/end_span": 0.7127775549888611, + "eval_valid_reconstruction/fim": 0.16682016849517822, + "eval_valid_reconstruction/first_seq": 0.16570955514907837, + "eval_valid_reconstruction/last_seq": 0.33452484011650085, + "eval_valid_reconstruction/second_seq": 0.1978864222764969, + "eval_valid_runtime": 460.8626, + "eval_valid_samples_per_second": 0.417, + "eval_valid_steps_per_second": 0.417, + "step": 16650 + }, + { + "epoch": 0.06210693583402341, + "eval_train_loss": 2.203796148300171, + "eval_train_loss/all": 2.0379064083099365, + "eval_train_loss/end_span": 1.1875778436660767, + "eval_train_perplexity/batch": 7.674525260925293, + "eval_train_perplexity/end_span": 3.2791290283203125, + "eval_train_perplexity/fim": 
2.0662741661071777, + "eval_train_perplexity/first_seq": 15.059545516967773, + "eval_train_perplexity/last_seq": 9.681941986083984, + "eval_train_perplexity/second_seq": 14.498418807983398, + "eval_train_perplexity/seq": 8.830435752868652, + "eval_train_reconstruction/all": 0.2803823947906494, + "eval_train_reconstruction/end_span": 0.7219556570053101, + "eval_train_reconstruction/fim": 0.1422489732503891, + "eval_train_reconstruction/first_seq": 0.1620495468378067, + "eval_train_reconstruction/last_seq": 0.30236944556236267, + "eval_train_reconstruction/second_seq": 0.17916430532932281, + "eval_train_runtime": 454.6346, + "eval_train_samples_per_second": 0.422, + "eval_train_steps_per_second": 0.422, + "step": 16650 + }, + { + "epoch": 0.06214423729698679, + "grad_norm": 0.8113141655921936, + "learning_rate": 0.0006, + "loss": 2.2396, + "step": 16660 + }, + { + "epoch": 0.06218153875995017, + "grad_norm": 0.34694990515708923, + "learning_rate": 0.0006, + "loss": 2.2359, + "step": 16670 + }, + { + "epoch": 0.06221884022291354, + "grad_norm": 0.2796533405780792, + "learning_rate": 0.0006, + "loss": 2.3466, + "step": 16680 + }, + { + "epoch": 0.06225614168587692, + "grad_norm": 0.3651605546474457, + "learning_rate": 0.0006, + "loss": 2.3572, + "step": 16690 + }, + { + "epoch": 0.0622934431488403, + "grad_norm": 0.2954265773296356, + "learning_rate": 0.0006, + "loss": 2.2538, + "step": 16700 + }, + { + "epoch": 0.0622934431488403, + "eval_valid_loss": 2.2096164226531982, + "eval_valid_loss/all": 2.0704128742218018, + "eval_valid_loss/end_span": 1.2637263536453247, + "eval_valid_perplexity/batch": 7.928095817565918, + "eval_valid_perplexity/end_span": 3.5385830402374268, + "eval_valid_perplexity/fim": 2.1009323596954346, + "eval_valid_perplexity/first_seq": 14.706865310668945, + "eval_valid_perplexity/last_seq": 9.081753730773926, + "eval_valid_perplexity/second_seq": 14.011557579040527, + "eval_valid_perplexity/seq": 8.931205749511719, + 
"eval_valid_reconstruction/all": 0.28956684470176697, + "eval_valid_reconstruction/end_span": 0.7025202512741089, + "eval_valid_reconstruction/fim": 0.14496612548828125, + "eval_valid_reconstruction/first_seq": 0.17119264602661133, + "eval_valid_reconstruction/last_seq": 0.32194486260414124, + "eval_valid_reconstruction/second_seq": 0.19131310284137726, + "eval_valid_runtime": 444.8791, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 16700 + }, + { + "epoch": 0.0622934431488403, + "eval_train_loss": 2.208177328109741, + "eval_train_loss/all": 2.0420079231262207, + "eval_train_loss/end_span": 1.2242913246154785, + "eval_train_perplexity/batch": 7.706067085266113, + "eval_train_perplexity/end_span": 3.401754379272461, + "eval_train_perplexity/fim": 2.168829917907715, + "eval_train_perplexity/first_seq": 15.716954231262207, + "eval_train_perplexity/last_seq": 9.13165283203125, + "eval_train_perplexity/second_seq": 14.15624713897705, + "eval_train_perplexity/seq": 8.869257926940918, + "eval_train_reconstruction/all": 0.27902963757514954, + "eval_train_reconstruction/end_span": 0.7162798643112183, + "eval_train_reconstruction/fim": 0.15242743492126465, + "eval_train_reconstruction/first_seq": 0.14743755757808685, + "eval_train_reconstruction/last_seq": 0.323713481426239, + "eval_train_reconstruction/second_seq": 0.18778479099273682, + "eval_train_runtime": 446.5577, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 16700 + }, + { + "epoch": 0.06233074461180368, + "grad_norm": 0.3902156352996826, + "learning_rate": 0.0006, + "loss": 2.2243, + "step": 16710 + }, + { + "epoch": 0.06236804607476705, + "grad_norm": 0.3725644052028656, + "learning_rate": 0.0006, + "loss": 2.3218, + "step": 16720 + }, + { + "epoch": 0.06240534753773043, + "grad_norm": 0.42521893978118896, + "learning_rate": 0.0006, + "loss": 2.2687, + "step": 16730 + }, + { + "epoch": 0.06244264900069381, + "grad_norm": 
0.4514099955558777, + "learning_rate": 0.0006, + "loss": 2.3475, + "step": 16740 + }, + { + "epoch": 0.06247995046365718, + "grad_norm": 0.4223182499408722, + "learning_rate": 0.0006, + "loss": 2.326, + "step": 16750 + }, + { + "epoch": 0.06247995046365718, + "eval_valid_loss": 2.2115652561187744, + "eval_valid_loss/all": 2.072418212890625, + "eval_valid_loss/end_span": 1.2750383615493774, + "eval_valid_perplexity/batch": 7.944010257720947, + "eval_valid_perplexity/end_span": 3.578838586807251, + "eval_valid_perplexity/fim": 2.306821584701538, + "eval_valid_perplexity/first_seq": 14.776290893554688, + "eval_valid_perplexity/last_seq": 9.400671005249023, + "eval_valid_perplexity/second_seq": 13.91221809387207, + "eval_valid_perplexity/seq": 8.952540397644043, + "eval_valid_reconstruction/all": 0.28863126039505005, + "eval_valid_reconstruction/end_span": 0.7034371495246887, + "eval_valid_reconstruction/fim": 0.163665771484375, + "eval_valid_reconstruction/first_seq": 0.17089082300662994, + "eval_valid_reconstruction/last_seq": 0.3104456961154938, + "eval_valid_reconstruction/second_seq": 0.1904027909040451, + "eval_valid_runtime": 448.908, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 16750 + }, + { + "epoch": 0.06247995046365718, + "eval_train_loss": 2.2060234546661377, + "eval_train_loss/all": 2.039979934692383, + "eval_train_loss/end_span": 1.2422176599502563, + "eval_train_perplexity/batch": 7.690454959869385, + "eval_train_perplexity/end_span": 3.463285446166992, + "eval_train_perplexity/fim": 2.197038173675537, + "eval_train_perplexity/first_seq": 15.015365600585938, + "eval_train_perplexity/last_seq": 9.575079917907715, + "eval_train_perplexity/second_seq": 14.120354652404785, + "eval_train_perplexity/seq": 8.847088813781738, + "eval_train_reconstruction/all": 0.2796037495136261, + "eval_train_reconstruction/end_span": 0.7129827737808228, + "eval_train_reconstruction/fim": 0.15391312539577484, + 
"eval_train_reconstruction/first_seq": 0.1625814437866211, + "eval_train_reconstruction/last_seq": 0.3043384253978729, + "eval_train_reconstruction/second_seq": 0.18861983716487885, + "eval_train_runtime": 432.9402, + "eval_train_samples_per_second": 0.443, + "eval_train_steps_per_second": 0.443, + "step": 16750 + }, + { + "epoch": 0.06251725192662057, + "grad_norm": 0.9513124227523804, + "learning_rate": 0.0006, + "loss": 2.0919, + "step": 16760 + }, + { + "epoch": 0.06255455338958393, + "grad_norm": 0.5207437872886658, + "learning_rate": 0.0006, + "loss": 2.29, + "step": 16770 + }, + { + "epoch": 0.06259185485254731, + "grad_norm": 0.45944273471832275, + "learning_rate": 0.0006, + "loss": 2.1475, + "step": 16780 + }, + { + "epoch": 0.06262915631551069, + "grad_norm": 0.5766828656196594, + "learning_rate": 0.0006, + "loss": 2.0268, + "step": 16790 + }, + { + "epoch": 0.06266645777847407, + "grad_norm": 0.3064805269241333, + "learning_rate": 0.0006, + "loss": 2.2956, + "step": 16800 + }, + { + "epoch": 0.06266645777847407, + "eval_valid_loss": 2.2105119228363037, + "eval_valid_loss/all": 2.071324586868286, + "eval_valid_loss/end_span": 1.287631869316101, + "eval_valid_perplexity/batch": 7.935327053070068, + "eval_valid_perplexity/end_span": 3.6241939067840576, + "eval_valid_perplexity/fim": 2.414199113845825, + "eval_valid_perplexity/first_seq": 15.14476490020752, + "eval_valid_perplexity/last_seq": 9.226055145263672, + "eval_valid_perplexity/second_seq": 13.873305320739746, + "eval_valid_perplexity/seq": 8.945977210998535, + "eval_valid_reconstruction/all": 0.2891952693462372, + "eval_valid_reconstruction/end_span": 0.6978175044059753, + "eval_valid_reconstruction/fim": 0.1726168394088745, + "eval_valid_reconstruction/first_seq": 0.1652025431394577, + "eval_valid_reconstruction/last_seq": 0.31883031129837036, + "eval_valid_reconstruction/second_seq": 0.19292834401130676, + "eval_valid_runtime": 434.6801, + "eval_valid_samples_per_second": 0.442, + 
"eval_valid_steps_per_second": 0.442, + "step": 16800 + }, + { + "epoch": 0.06266645777847407, + "eval_train_loss": 2.2080132961273193, + "eval_train_loss/all": 2.041898488998413, + "eval_train_loss/end_span": 1.2497025728225708, + "eval_train_perplexity/batch": 7.705223560333252, + "eval_train_perplexity/end_span": 3.489305019378662, + "eval_train_perplexity/fim": 2.0188629627227783, + "eval_train_perplexity/first_seq": 15.55063247680664, + "eval_train_perplexity/last_seq": 9.073064804077148, + "eval_train_perplexity/second_seq": 14.629902839660645, + "eval_train_perplexity/seq": 8.871274948120117, + "eval_train_reconstruction/all": 0.2790544629096985, + "eval_train_reconstruction/end_span": 0.7103548049926758, + "eval_train_reconstruction/fim": 0.13803480565547943, + "eval_train_reconstruction/first_seq": 0.15419717133045197, + "eval_train_reconstruction/last_seq": 0.321951687335968, + "eval_train_reconstruction/second_seq": 0.175464928150177, + "eval_train_runtime": 435.3069, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 16800 + }, + { + "epoch": 0.06270375924143745, + "grad_norm": 0.5714784860610962, + "learning_rate": 0.0006, + "loss": 2.2777, + "step": 16810 + }, + { + "epoch": 0.06274106070440083, + "grad_norm": 0.7288968563079834, + "learning_rate": 0.0006, + "loss": 2.1574, + "step": 16820 + }, + { + "epoch": 0.06277836216736421, + "grad_norm": 0.3957771956920624, + "learning_rate": 0.0006, + "loss": 2.2345, + "step": 16830 + }, + { + "epoch": 0.06281566363032758, + "grad_norm": 0.36259686946868896, + "learning_rate": 0.0006, + "loss": 2.158, + "step": 16840 + }, + { + "epoch": 0.06285296509329096, + "grad_norm": 0.2810764014720917, + "learning_rate": 0.0006, + "loss": 2.31, + "step": 16850 + }, + { + "epoch": 0.06285296509329096, + "eval_valid_loss": 2.2084405422210693, + "eval_valid_loss/all": 2.069603681564331, + "eval_valid_loss/end_span": 1.197165608406067, + "eval_valid_perplexity/batch": 
7.921682834625244, + "eval_valid_perplexity/end_span": 3.3107197284698486, + "eval_valid_perplexity/fim": 2.1811788082122803, + "eval_valid_perplexity/first_seq": 14.979025840759277, + "eval_valid_perplexity/last_seq": 9.57774829864502, + "eval_valid_perplexity/second_seq": 13.701820373535156, + "eval_valid_perplexity/seq": 8.929252624511719, + "eval_valid_reconstruction/all": 0.2896047532558441, + "eval_valid_reconstruction/end_span": 0.716896653175354, + "eval_valid_reconstruction/fim": 0.1517719328403473, + "eval_valid_reconstruction/first_seq": 0.167887344956398, + "eval_valid_reconstruction/last_seq": 0.30720949172973633, + "eval_valid_reconstruction/second_seq": 0.19934336841106415, + "eval_valid_runtime": 434.5227, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 16850 + }, + { + "epoch": 0.06285296509329096, + "eval_train_loss": 2.2051942348480225, + "eval_train_loss/all": 2.039201021194458, + "eval_train_loss/end_span": 1.1601457595825195, + "eval_train_perplexity/batch": 7.68446683883667, + "eval_train_perplexity/end_span": 3.1903982162475586, + "eval_train_perplexity/fim": 2.2571094036102295, + "eval_train_perplexity/first_seq": 15.21480655670166, + "eval_train_perplexity/last_seq": 9.033713340759277, + "eval_train_perplexity/second_seq": 14.366472244262695, + "eval_train_perplexity/seq": 8.84033489227295, + "eval_train_reconstruction/all": 0.2799444794654846, + "eval_train_reconstruction/end_span": 0.7301816940307617, + "eval_train_reconstruction/fim": 0.15928469598293304, + "eval_train_reconstruction/first_seq": 0.15889111161231995, + "eval_train_reconstruction/last_seq": 0.3219064474105835, + "eval_train_reconstruction/second_seq": 0.18049243092536926, + "eval_train_runtime": 437.3155, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 16850 + }, + { + "epoch": 0.06289026655625433, + "grad_norm": 0.2904658615589142, + "learning_rate": 0.0006, + "loss": 2.2274, + 
"step": 16860 + }, + { + "epoch": 0.06292756801921771, + "grad_norm": 0.40127524733543396, + "learning_rate": 0.0006, + "loss": 2.023, + "step": 16870 + }, + { + "epoch": 0.0629648694821811, + "grad_norm": 0.3738210201263428, + "learning_rate": 0.0006, + "loss": 2.2483, + "step": 16880 + }, + { + "epoch": 0.06300217094514447, + "grad_norm": 0.3727729916572571, + "learning_rate": 0.0006, + "loss": 2.3157, + "step": 16890 + }, + { + "epoch": 0.06303947240810785, + "grad_norm": 0.31189367175102234, + "learning_rate": 0.0006, + "loss": 2.1977, + "step": 16900 + }, + { + "epoch": 0.06303947240810785, + "eval_valid_loss": 2.2086212635040283, + "eval_valid_loss/all": 2.069530487060547, + "eval_valid_loss/end_span": 1.2552032470703125, + "eval_valid_perplexity/batch": 7.921103000640869, + "eval_valid_perplexity/end_span": 3.5085513591766357, + "eval_valid_perplexity/fim": 2.3629391193389893, + "eval_valid_perplexity/first_seq": 15.009641647338867, + "eval_valid_perplexity/last_seq": 8.96075439453125, + "eval_valid_perplexity/second_seq": 13.619029998779297, + "eval_valid_perplexity/seq": 8.923439979553223, + "eval_valid_reconstruction/all": 0.28999775648117065, + "eval_valid_reconstruction/end_span": 0.7054839730262756, + "eval_valid_reconstruction/fim": 0.16866132616996765, + "eval_valid_reconstruction/first_seq": 0.16413678228855133, + "eval_valid_reconstruction/last_seq": 0.3284708857536316, + "eval_valid_reconstruction/second_seq": 0.19883911311626434, + "eval_valid_runtime": 432.9457, + "eval_valid_samples_per_second": 0.443, + "eval_valid_steps_per_second": 0.443, + "step": 16900 + }, + { + "epoch": 0.06303947240810785, + "eval_train_loss": 2.2068045139312744, + "eval_train_loss/all": 2.040313243865967, + "eval_train_loss/end_span": 1.2114468812942505, + "eval_train_perplexity/batch": 7.693018436431885, + "eval_train_perplexity/end_span": 3.358340263366699, + "eval_train_perplexity/fim": 2.005955457687378, + "eval_train_perplexity/first_seq": 15.42122745513916, + 
"eval_train_perplexity/last_seq": 8.97310733795166, + "eval_train_perplexity/second_seq": 14.114824295043945, + "eval_train_perplexity/seq": 8.847951889038086, + "eval_train_reconstruction/all": 0.2797382175922394, + "eval_train_reconstruction/end_span": 0.7180362343788147, + "eval_train_reconstruction/fim": 0.13696298003196716, + "eval_train_reconstruction/first_seq": 0.15460151433944702, + "eval_train_reconstruction/last_seq": 0.3237588107585907, + "eval_train_reconstruction/second_seq": 0.18677574396133423, + "eval_train_runtime": 437.6236, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 16900 + }, + { + "epoch": 0.06307677387107122, + "grad_norm": 0.432925283908844, + "learning_rate": 0.0006, + "loss": 2.1093, + "step": 16910 + }, + { + "epoch": 0.0631140753340346, + "grad_norm": 0.4030383825302124, + "learning_rate": 0.0006, + "loss": 2.2786, + "step": 16920 + }, + { + "epoch": 0.06315137679699798, + "grad_norm": 0.2982783317565918, + "learning_rate": 0.0006, + "loss": 2.4399, + "step": 16930 + }, + { + "epoch": 0.06318867825996136, + "grad_norm": 0.4126668870449066, + "learning_rate": 0.0006, + "loss": 2.3744, + "step": 16940 + }, + { + "epoch": 0.06322597972292474, + "grad_norm": 0.4237982928752899, + "learning_rate": 0.0006, + "loss": 2.3131, + "step": 16950 + }, + { + "epoch": 0.06322597972292474, + "eval_valid_loss": 2.21050763130188, + "eval_valid_loss/all": 2.071065902709961, + "eval_valid_loss/end_span": 1.296093225479126, + "eval_valid_perplexity/batch": 7.933274745941162, + "eval_valid_perplexity/end_span": 3.65498948097229, + "eval_valid_perplexity/fim": 2.3229081630706787, + "eval_valid_perplexity/first_seq": 15.02991771697998, + "eval_valid_perplexity/last_seq": 9.30334758758545, + "eval_valid_perplexity/second_seq": 13.68828296661377, + "eval_valid_perplexity/seq": 8.932323455810547, + "eval_valid_reconstruction/all": 0.2896097004413605, + "eval_valid_reconstruction/end_span": 0.6927871108055115, + 
"eval_valid_reconstruction/fim": 0.1645335704088211, + "eval_valid_reconstruction/first_seq": 0.1620151251554489, + "eval_valid_reconstruction/last_seq": 0.31508147716522217, + "eval_valid_reconstruction/second_seq": 0.19726699590682983, + "eval_valid_runtime": 441.1447, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 16950 + }, + { + "epoch": 0.06322597972292474, + "eval_train_loss": 2.2078254222869873, + "eval_train_loss/all": 2.041383981704712, + "eval_train_loss/end_span": 1.2673066854476929, + "eval_train_perplexity/batch": 7.701260089874268, + "eval_train_perplexity/end_span": 3.5512750148773193, + "eval_train_perplexity/fim": 2.084296464920044, + "eval_train_perplexity/first_seq": 15.786558151245117, + "eval_train_perplexity/last_seq": 9.270476341247559, + "eval_train_perplexity/second_seq": 14.285629272460938, + "eval_train_perplexity/seq": 8.854965209960938, + "eval_train_reconstruction/all": 0.27952563762664795, + "eval_train_reconstruction/end_span": 0.7012412548065186, + "eval_train_reconstruction/fim": 0.1434290111064911, + "eval_train_reconstruction/first_seq": 0.1454884111881256, + "eval_train_reconstruction/last_seq": 0.31467103958129883, + "eval_train_reconstruction/second_seq": 0.18429026007652283, + "eval_train_runtime": 439.9759, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 16950 + }, + { + "epoch": 0.06326328118588811, + "grad_norm": 0.44444143772125244, + "learning_rate": 0.0006, + "loss": 2.1212, + "step": 16960 + }, + { + "epoch": 0.0633005826488515, + "grad_norm": 0.40001365542411804, + "learning_rate": 0.0006, + "loss": 2.0704, + "step": 16970 + }, + { + "epoch": 0.06333788411181486, + "grad_norm": 0.35864007472991943, + "learning_rate": 0.0006, + "loss": 2.1112, + "step": 16980 + }, + { + "epoch": 0.06337518557477824, + "grad_norm": 0.3314301371574402, + "learning_rate": 0.0006, + "loss": 2.2003, + "step": 16990 + }, + { + "epoch": 
0.06341248703774162, + "grad_norm": 0.4008799195289612, + "learning_rate": 0.0006, + "loss": 2.1465, + "step": 17000 + }, + { + "epoch": 0.06341248703774162, + "eval_valid_loss": 2.208420515060425, + "eval_valid_loss/all": 2.069228410720825, + "eval_valid_loss/end_span": 1.1967415809631348, + "eval_valid_perplexity/batch": 7.918710708618164, + "eval_valid_perplexity/end_span": 3.3093161582946777, + "eval_valid_perplexity/fim": 2.1928553581237793, + "eval_valid_perplexity/first_seq": 14.737027168273926, + "eval_valid_perplexity/last_seq": 8.783669471740723, + "eval_valid_perplexity/second_seq": 13.988850593566895, + "eval_valid_perplexity/seq": 8.919461250305176, + "eval_valid_reconstruction/all": 0.28982073068618774, + "eval_valid_reconstruction/end_span": 0.7181103825569153, + "eval_valid_reconstruction/fim": 0.15322284400463104, + "eval_valid_reconstruction/first_seq": 0.1716921478509903, + "eval_valid_reconstruction/last_seq": 0.333927720785141, + "eval_valid_reconstruction/second_seq": 0.1908818781375885, + "eval_valid_runtime": 437.4063, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 17000 + }, + { + "epoch": 0.06341248703774162, + "eval_train_loss": 2.208475351333618, + "eval_train_loss/all": 2.042099714279175, + "eval_train_loss/end_span": 1.163535237312317, + "eval_train_perplexity/batch": 7.7067742347717285, + "eval_train_perplexity/end_span": 3.20123028755188, + "eval_train_perplexity/fim": 2.0738096237182617, + "eval_train_perplexity/first_seq": 15.411170959472656, + "eval_train_perplexity/last_seq": 9.4505615234375, + "eval_train_perplexity/second_seq": 14.362325668334961, + "eval_train_perplexity/seq": 8.865768432617188, + "eval_train_reconstruction/all": 0.27908891439437866, + "eval_train_reconstruction/end_span": 0.7278059720993042, + "eval_train_reconstruction/fim": 0.14272215962409973, + "eval_train_reconstruction/first_seq": 0.15099844336509705, + "eval_train_reconstruction/last_seq": 
0.30891287326812744, + "eval_train_reconstruction/second_seq": 0.1823355257511139, + "eval_train_runtime": 442.3774, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 17000 + }, + { + "epoch": 0.063449788500705, + "grad_norm": 0.5139293074607849, + "learning_rate": 0.0006, + "loss": 2.2873, + "step": 17010 + }, + { + "epoch": 0.06348708996366838, + "grad_norm": 0.3175686001777649, + "learning_rate": 0.0006, + "loss": 2.299, + "step": 17020 + }, + { + "epoch": 0.06352439142663176, + "grad_norm": 0.41984260082244873, + "learning_rate": 0.0006, + "loss": 2.0411, + "step": 17030 + }, + { + "epoch": 0.06356169288959514, + "grad_norm": 0.330082505941391, + "learning_rate": 0.0006, + "loss": 2.2041, + "step": 17040 + }, + { + "epoch": 0.0635989943525585, + "grad_norm": 0.408496618270874, + "learning_rate": 0.0006, + "loss": 2.286, + "step": 17050 + }, + { + "epoch": 0.0635989943525585, + "eval_valid_loss": 2.2051355838775635, + "eval_valid_loss/all": 2.066706895828247, + "eval_valid_loss/end_span": 1.2608109712600708, + "eval_valid_perplexity/batch": 7.898768901824951, + "eval_valid_perplexity/end_span": 3.5282816886901855, + "eval_valid_perplexity/fim": 2.5137979984283447, + "eval_valid_perplexity/first_seq": 14.7957181930542, + "eval_valid_perplexity/last_seq": 9.21772289276123, + "eval_valid_perplexity/second_seq": 13.640177726745605, + "eval_valid_perplexity/seq": 8.90294361114502, + "eval_valid_reconstruction/all": 0.29060637950897217, + "eval_valid_reconstruction/end_span": 0.7040082812309265, + "eval_valid_reconstruction/fim": 0.18130898475646973, + "eval_valid_reconstruction/first_seq": 0.16875697672367096, + "eval_valid_reconstruction/last_seq": 0.31860363483428955, + "eval_valid_reconstruction/second_seq": 0.19836114346981049, + "eval_valid_runtime": 441.3476, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 17050 + }, + { + "epoch": 0.0635989943525585, + "eval_train_loss": 
2.2040886878967285, + "eval_train_loss/all": 2.038774251937866, + "eval_train_loss/end_span": 1.2290915250778198, + "eval_train_perplexity/batch": 7.681188106536865, + "eval_train_perplexity/end_span": 3.4181227684020996, + "eval_train_perplexity/fim": 1.9681313037872314, + "eval_train_perplexity/first_seq": 15.298063278198242, + "eval_train_perplexity/last_seq": 8.94711685180664, + "eval_train_perplexity/second_seq": 14.303801536560059, + "eval_train_perplexity/seq": 8.842785835266113, + "eval_train_reconstruction/all": 0.2802531123161316, + "eval_train_reconstruction/end_span": 0.7136614918708801, + "eval_train_reconstruction/fim": 0.1333940625190735, + "eval_train_reconstruction/first_seq": 0.1573759913444519, + "eval_train_reconstruction/last_seq": 0.3302062451839447, + "eval_train_reconstruction/second_seq": 0.18408162891864777, + "eval_train_runtime": 441.0262, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 17050 + }, + { + "epoch": 0.06363629581552188, + "grad_norm": 0.3666945993900299, + "learning_rate": 0.0006, + "loss": 2.2442, + "step": 17060 + }, + { + "epoch": 0.06367359727848526, + "grad_norm": 0.38367176055908203, + "learning_rate": 0.0006, + "loss": 2.3147, + "step": 17070 + }, + { + "epoch": 0.06371089874144864, + "grad_norm": 0.2686998248100281, + "learning_rate": 0.0006, + "loss": 2.3112, + "step": 17080 + }, + { + "epoch": 0.06374820020441202, + "grad_norm": 0.35908177495002747, + "learning_rate": 0.0006, + "loss": 2.2549, + "step": 17090 + }, + { + "epoch": 0.0637855016673754, + "grad_norm": 0.5090898275375366, + "learning_rate": 0.0006, + "loss": 2.1452, + "step": 17100 + }, + { + "epoch": 0.0637855016673754, + "eval_valid_loss": 2.2051899433135986, + "eval_valid_loss/all": 2.0666983127593994, + "eval_valid_loss/end_span": 1.2827321290969849, + "eval_valid_perplexity/batch": 7.898701190948486, + "eval_valid_perplexity/end_span": 3.6064796447753906, + "eval_valid_perplexity/fim": 2.070187568664551, + 
"eval_valid_perplexity/first_seq": 14.855989456176758, + "eval_valid_perplexity/last_seq": 9.255924224853516, + "eval_valid_perplexity/second_seq": 14.13073444366455, + "eval_valid_perplexity/seq": 8.903207778930664, + "eval_valid_reconstruction/all": 0.29058516025543213, + "eval_valid_reconstruction/end_span": 0.6901475191116333, + "eval_valid_reconstruction/fim": 0.1426021307706833, + "eval_valid_reconstruction/first_seq": 0.16766098141670227, + "eval_valid_reconstruction/last_seq": 0.31558799743652344, + "eval_valid_reconstruction/second_seq": 0.18571653962135315, + "eval_valid_runtime": 439.379, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 17100 + }, + { + "epoch": 0.0637855016673754, + "eval_train_loss": 2.2029364109039307, + "eval_train_loss/all": 2.037569522857666, + "eval_train_loss/end_span": 1.2439297437667847, + "eval_train_perplexity/batch": 7.671939849853516, + "eval_train_perplexity/end_span": 3.469219923019409, + "eval_train_perplexity/fim": 1.9977431297302246, + "eval_train_perplexity/first_seq": 15.431572914123535, + "eval_train_perplexity/last_seq": 8.883329391479492, + "eval_train_perplexity/second_seq": 14.385763168334961, + "eval_train_perplexity/seq": 8.834092140197754, + "eval_train_reconstruction/all": 0.28053417801856995, + "eval_train_reconstruction/end_span": 0.7028520703315735, + "eval_train_reconstruction/fim": 0.1366160809993744, + "eval_train_reconstruction/first_seq": 0.15130330622196198, + "eval_train_reconstruction/last_seq": 0.32935425639152527, + "eval_train_reconstruction/second_seq": 0.17960475385189056, + "eval_train_runtime": 440.2611, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 17100 + }, + { + "epoch": 0.06382280313033876, + "grad_norm": 0.4841967225074768, + "learning_rate": 0.0006, + "loss": 2.1649, + "step": 17110 + }, + { + "epoch": 0.06386010459330214, + "grad_norm": 0.5250768661499023, + "learning_rate": 0.0006, + "loss": 
2.0079, + "step": 17120 + }, + { + "epoch": 0.06389740605626552, + "grad_norm": 0.3517023026943207, + "learning_rate": 0.0006, + "loss": 2.3268, + "step": 17130 + }, + { + "epoch": 0.0639347075192289, + "grad_norm": 0.4897274672985077, + "learning_rate": 0.0006, + "loss": 2.3183, + "step": 17140 + }, + { + "epoch": 0.06397200898219228, + "grad_norm": 0.3725831210613251, + "learning_rate": 0.0006, + "loss": 2.2551, + "step": 17150 + }, + { + "epoch": 0.06397200898219228, + "eval_valid_loss": 2.204721212387085, + "eval_valid_loss/all": 2.0661449432373047, + "eval_valid_loss/end_span": 1.234635353088379, + "eval_valid_perplexity/batch": 7.894331455230713, + "eval_valid_perplexity/end_span": 3.4371249675750732, + "eval_valid_perplexity/fim": 2.5103037357330322, + "eval_valid_perplexity/first_seq": 15.011234283447266, + "eval_valid_perplexity/last_seq": 9.217052459716797, + "eval_valid_perplexity/second_seq": 13.31436538696289, + "eval_valid_perplexity/seq": 8.897321701049805, + "eval_valid_reconstruction/all": 0.29068222641944885, + "eval_valid_reconstruction/end_span": 0.7116243839263916, + "eval_valid_reconstruction/fim": 0.18135663866996765, + "eval_valid_reconstruction/first_seq": 0.16425491869449615, + "eval_valid_reconstruction/last_seq": 0.317200243473053, + "eval_valid_reconstruction/second_seq": 0.20401640236377716, + "eval_valid_runtime": 439.978, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 17150 + }, + { + "epoch": 0.06397200898219228, + "eval_train_loss": 2.204707145690918, + "eval_train_loss/all": 2.0391345024108887, + "eval_train_loss/end_span": 1.1998329162597656, + "eval_train_perplexity/batch": 7.683955669403076, + "eval_train_perplexity/end_span": 3.3195621967315674, + "eval_train_perplexity/fim": 1.9876041412353516, + "eval_train_perplexity/first_seq": 15.505059242248535, + "eval_train_perplexity/last_seq": 9.101312637329102, + "eval_train_perplexity/second_seq": 14.397597312927246, + 
"eval_train_perplexity/seq": 8.846264839172363, + "eval_train_reconstruction/all": 0.2799433469772339, + "eval_train_reconstruction/end_span": 0.7241559028625488, + "eval_train_reconstruction/fim": 0.13545958697795868, + "eval_train_reconstruction/first_seq": 0.15312965214252472, + "eval_train_reconstruction/last_seq": 0.3219890296459198, + "eval_train_reconstruction/second_seq": 0.17815446853637695, + "eval_train_runtime": 440.8523, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 17150 + }, + { + "epoch": 0.06400931044515566, + "grad_norm": 0.40101510286331177, + "learning_rate": 0.0006, + "loss": 2.1185, + "step": 17160 + }, + { + "epoch": 0.06404661190811904, + "grad_norm": 0.34414246678352356, + "learning_rate": 0.0006, + "loss": 2.3313, + "step": 17170 + }, + { + "epoch": 0.06408391337108241, + "grad_norm": 0.3864780068397522, + "learning_rate": 0.0006, + "loss": 2.2764, + "step": 17180 + }, + { + "epoch": 0.06412121483404579, + "grad_norm": 0.33570295572280884, + "learning_rate": 0.0006, + "loss": 2.2103, + "step": 17190 + }, + { + "epoch": 0.06415851629700917, + "grad_norm": 0.34661680459976196, + "learning_rate": 0.0006, + "loss": 2.2287, + "step": 17200 + }, + { + "epoch": 0.06415851629700917, + "eval_valid_loss": 2.204219341278076, + "eval_valid_loss/all": 2.0655486583709717, + "eval_valid_loss/end_span": 1.2461705207824707, + "eval_valid_perplexity/batch": 7.889625549316406, + "eval_valid_perplexity/end_span": 3.4770023822784424, + "eval_valid_perplexity/fim": 2.4989380836486816, + "eval_valid_perplexity/first_seq": 14.838244438171387, + "eval_valid_perplexity/last_seq": 9.246977806091309, + "eval_valid_perplexity/second_seq": 13.633652687072754, + "eval_valid_perplexity/seq": 8.88991641998291, + "eval_valid_reconstruction/all": 0.2909214496612549, + "eval_valid_reconstruction/end_span": 0.7097470760345459, + "eval_valid_reconstruction/fim": 0.180849090218544, + "eval_valid_reconstruction/first_seq": 
0.1646798849105835, + "eval_valid_reconstruction/last_seq": 0.3196781277656555, + "eval_valid_reconstruction/second_seq": 0.19906125962734222, + "eval_valid_runtime": 438.3265, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 17200 + }, + { + "epoch": 0.06415851629700917, + "eval_train_loss": 2.202112913131714, + "eval_train_loss/all": 2.036597967147827, + "eval_train_loss/end_span": 1.2219096422195435, + "eval_train_perplexity/batch": 7.66448974609375, + "eval_train_perplexity/end_span": 3.393662214279175, + "eval_train_perplexity/fim": 2.084202289581299, + "eval_train_perplexity/first_seq": 15.601177215576172, + "eval_train_perplexity/last_seq": 9.412617683410645, + "eval_train_perplexity/second_seq": 14.26282024383545, + "eval_train_perplexity/seq": 8.817914009094238, + "eval_train_reconstruction/all": 0.28079476952552795, + "eval_train_reconstruction/end_span": 0.7199487090110779, + "eval_train_reconstruction/fim": 0.14420029520988464, + "eval_train_reconstruction/first_seq": 0.14854787290096283, + "eval_train_reconstruction/last_seq": 0.3106667697429657, + "eval_train_reconstruction/second_seq": 0.18265193700790405, + "eval_train_runtime": 437.671, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 17200 + }, + { + "epoch": 0.06419581775997255, + "grad_norm": 0.5442944169044495, + "learning_rate": 0.0006, + "loss": 2.2832, + "step": 17210 + }, + { + "epoch": 0.06423311922293592, + "grad_norm": 0.34395915269851685, + "learning_rate": 0.0006, + "loss": 2.2693, + "step": 17220 + }, + { + "epoch": 0.0642704206858993, + "grad_norm": 0.37931519746780396, + "learning_rate": 0.0006, + "loss": 2.1966, + "step": 17230 + }, + { + "epoch": 0.06430772214886268, + "grad_norm": 0.2874719202518463, + "learning_rate": 0.0006, + "loss": 2.0499, + "step": 17240 + }, + { + "epoch": 0.06434502361182605, + "grad_norm": 0.2768102288246155, + "learning_rate": 0.0006, + "loss": 2.3893, + "step": 17250 
+ }, + { + "epoch": 0.06434502361182605, + "eval_valid_loss": 2.205016851425171, + "eval_valid_loss/all": 2.0661089420318604, + "eval_valid_loss/end_span": 1.2767220735549927, + "eval_valid_perplexity/batch": 7.894047260284424, + "eval_valid_perplexity/end_span": 3.584869623184204, + "eval_valid_perplexity/fim": 2.417060613632202, + "eval_valid_perplexity/first_seq": 14.613717079162598, + "eval_valid_perplexity/last_seq": 8.950968742370605, + "eval_valid_perplexity/second_seq": 13.982661247253418, + "eval_valid_perplexity/seq": 8.889126777648926, + "eval_valid_reconstruction/all": 0.2904740571975708, + "eval_valid_reconstruction/end_span": 0.6940946578979492, + "eval_valid_reconstruction/fim": 0.17357251048088074, + "eval_valid_reconstruction/first_seq": 0.1713528037071228, + "eval_valid_reconstruction/last_seq": 0.32537955045700073, + "eval_valid_reconstruction/second_seq": 0.18852150440216064, + "eval_valid_runtime": 440.8253, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 17250 + }, + { + "epoch": 0.06434502361182605, + "eval_train_loss": 2.2046868801116943, + "eval_train_loss/all": 2.038994789123535, + "eval_train_loss/end_span": 1.2594183683395386, + "eval_train_perplexity/batch": 7.682882308959961, + "eval_train_perplexity/end_span": 3.523371696472168, + "eval_train_perplexity/fim": 2.155731201171875, + "eval_train_perplexity/first_seq": 15.713953018188477, + "eval_train_perplexity/last_seq": 9.63805103302002, + "eval_train_perplexity/second_seq": 14.14068603515625, + "eval_train_perplexity/seq": 8.840811729431152, + "eval_train_reconstruction/all": 0.2798282206058502, + "eval_train_reconstruction/end_span": 0.7009328007698059, + "eval_train_reconstruction/fim": 0.1494562327861786, + "eval_train_reconstruction/first_seq": 0.15038339793682098, + "eval_train_reconstruction/last_seq": 0.3046478033065796, + "eval_train_reconstruction/second_seq": 0.18446627259254456, + "eval_train_runtime": 441.5908, + 
"eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 17250 + }, + { + "epoch": 0.06438232507478943, + "grad_norm": 0.4988112449645996, + "learning_rate": 0.0006, + "loss": 2.2484, + "step": 17260 + }, + { + "epoch": 0.06441962653775281, + "grad_norm": 0.4829193651676178, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 17270 + }, + { + "epoch": 0.06445692800071619, + "grad_norm": 0.29255902767181396, + "learning_rate": 0.0006, + "loss": 2.2383, + "step": 17280 + }, + { + "epoch": 0.06449422946367957, + "grad_norm": 0.42050012946128845, + "learning_rate": 0.0006, + "loss": 2.0718, + "step": 17290 + }, + { + "epoch": 0.06453153092664295, + "grad_norm": 0.3323630690574646, + "learning_rate": 0.0006, + "loss": 2.3576, + "step": 17300 + }, + { + "epoch": 0.06453153092664295, + "eval_valid_loss": 2.2045323848724365, + "eval_valid_loss/all": 2.065951347351074, + "eval_valid_loss/end_span": 1.3356965780258179, + "eval_valid_perplexity/batch": 7.892803192138672, + "eval_valid_perplexity/end_span": 3.8026437759399414, + "eval_valid_perplexity/fim": 2.5699877738952637, + "eval_valid_perplexity/first_seq": 15.105602264404297, + "eval_valid_perplexity/last_seq": 8.874878883361816, + "eval_valid_perplexity/second_seq": 13.845532417297363, + "eval_valid_perplexity/seq": 8.889626502990723, + "eval_valid_reconstruction/all": 0.29066210985183716, + "eval_valid_reconstruction/end_span": 0.6869437098503113, + "eval_valid_reconstruction/fim": 0.18540255725383759, + "eval_valid_reconstruction/first_seq": 0.162835493683815, + "eval_valid_reconstruction/last_seq": 0.3297816514968872, + "eval_valid_reconstruction/second_seq": 0.19433020055294037, + "eval_valid_runtime": 440.6832, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 17300 + }, + { + "epoch": 0.06453153092664295, + "eval_train_loss": 2.202974557876587, + "eval_train_loss/all": 2.037426710128784, + "eval_train_loss/end_span": 1.3011507987976074, 
+ "eval_train_perplexity/batch": 7.670844554901123, + "eval_train_perplexity/end_span": 3.6735217571258545, + "eval_train_perplexity/fim": 2.23480486869812, + "eval_train_perplexity/first_seq": 15.347326278686523, + "eval_train_perplexity/last_seq": 8.614851951599121, + "eval_train_perplexity/second_seq": 14.270110130310059, + "eval_train_perplexity/seq": 8.824762344360352, + "eval_train_reconstruction/all": 0.2802385091781616, + "eval_train_reconstruction/end_span": 0.6991001963615417, + "eval_train_reconstruction/fim": 0.15830793976783752, + "eval_train_reconstruction/first_seq": 0.15418139100074768, + "eval_train_reconstruction/last_seq": 0.34064945578575134, + "eval_train_reconstruction/second_seq": 0.18426410853862762, + "eval_train_runtime": 438.7405, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 17300 + }, + { + "epoch": 0.06456883238960633, + "grad_norm": 0.3707965612411499, + "learning_rate": 0.0006, + "loss": 2.3326, + "step": 17310 + }, + { + "epoch": 0.06460613385256969, + "grad_norm": 0.2807806432247162, + "learning_rate": 0.0006, + "loss": 2.3118, + "step": 17320 + }, + { + "epoch": 0.06464343531553307, + "grad_norm": 0.32658615708351135, + "learning_rate": 0.0006, + "loss": 2.1586, + "step": 17330 + }, + { + "epoch": 0.06468073677849645, + "grad_norm": 0.2712886929512024, + "learning_rate": 0.0006, + "loss": 2.2681, + "step": 17340 + }, + { + "epoch": 0.06471803824145983, + "grad_norm": 0.2918921113014221, + "learning_rate": 0.0006, + "loss": 2.3449, + "step": 17350 + }, + { + "epoch": 0.06471803824145983, + "eval_valid_loss": 2.2037365436553955, + "eval_valid_loss/all": 2.065108060836792, + "eval_valid_loss/end_span": 1.2135435342788696, + "eval_valid_perplexity/batch": 7.886149883270264, + "eval_valid_perplexity/end_span": 3.365388870239258, + "eval_valid_perplexity/fim": 2.224059820175171, + "eval_valid_perplexity/first_seq": 14.5764799118042, + "eval_valid_perplexity/last_seq": 8.889090538024902, + 
"eval_valid_perplexity/second_seq": 13.700702667236328, + "eval_valid_perplexity/seq": 8.885540008544922, + "eval_valid_reconstruction/all": 0.29115256667137146, + "eval_valid_reconstruction/end_span": 0.717705488204956, + "eval_valid_reconstruction/fim": 0.15702186524868011, + "eval_valid_reconstruction/first_seq": 0.173740416765213, + "eval_valid_reconstruction/last_seq": 0.3284836709499359, + "eval_valid_reconstruction/second_seq": 0.1935892254114151, + "eval_valid_runtime": 438.3655, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 17350 + }, + { + "epoch": 0.06471803824145983, + "eval_train_loss": 2.202590227127075, + "eval_train_loss/all": 2.0370752811431885, + "eval_train_loss/end_span": 1.1822388172149658, + "eval_train_perplexity/batch": 7.668148994445801, + "eval_train_perplexity/end_span": 3.2616682052612305, + "eval_train_perplexity/fim": 2.176187753677368, + "eval_train_perplexity/first_seq": 15.381917953491211, + "eval_train_perplexity/last_seq": 9.205414772033691, + "eval_train_perplexity/second_seq": 14.605677604675293, + "eval_train_perplexity/seq": 8.822879791259766, + "eval_train_reconstruction/all": 0.2805061340332031, + "eval_train_reconstruction/end_span": 0.7290237545967102, + "eval_train_reconstruction/fim": 0.1519421935081482, + "eval_train_reconstruction/first_seq": 0.15554958581924438, + "eval_train_reconstruction/last_seq": 0.31898412108421326, + "eval_train_reconstruction/second_seq": 0.17569394409656525, + "eval_train_runtime": 438.8812, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 17350 + }, + { + "epoch": 0.06475533970442321, + "grad_norm": 0.6782407164573669, + "learning_rate": 0.0006, + "loss": 2.2678, + "step": 17360 + }, + { + "epoch": 0.06479264116738659, + "grad_norm": 0.29044729471206665, + "learning_rate": 0.0006, + "loss": 2.1701, + "step": 17370 + }, + { + "epoch": 0.06482994263034997, + "grad_norm": 0.27901870012283325, + 
"learning_rate": 0.0006, + "loss": 2.3225, + "step": 17380 + }, + { + "epoch": 0.06486724409331333, + "grad_norm": 0.32466959953308105, + "learning_rate": 0.0006, + "loss": 2.3017, + "step": 17390 + }, + { + "epoch": 0.06490454555627671, + "grad_norm": 0.4065795838832855, + "learning_rate": 0.0006, + "loss": 2.3779, + "step": 17400 + }, + { + "epoch": 0.06490454555627671, + "eval_valid_loss": 2.2056350708007812, + "eval_valid_loss/all": 2.066469430923462, + "eval_valid_loss/end_span": 1.2505515813827515, + "eval_valid_perplexity/batch": 7.896893501281738, + "eval_valid_perplexity/end_span": 3.4922688007354736, + "eval_valid_perplexity/fim": 2.3857622146606445, + "eval_valid_perplexity/first_seq": 15.197900772094727, + "eval_valid_perplexity/last_seq": 9.190814018249512, + "eval_valid_perplexity/second_seq": 13.461268424987793, + "eval_valid_perplexity/seq": 8.888294219970703, + "eval_valid_reconstruction/all": 0.29058122634887695, + "eval_valid_reconstruction/end_span": 0.7080420255661011, + "eval_valid_reconstruction/fim": 0.17089639604091644, + "eval_valid_reconstruction/first_seq": 0.15926937758922577, + "eval_valid_reconstruction/last_seq": 0.32114502787590027, + "eval_valid_reconstruction/second_seq": 0.1979648917913437, + "eval_valid_runtime": 439.8823, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 17400 + }, + { + "epoch": 0.06490454555627671, + "eval_train_loss": 2.2061588764190674, + "eval_train_loss/all": 2.040290355682373, + "eval_train_loss/end_span": 1.227301836013794, + "eval_train_perplexity/batch": 7.692842483520508, + "eval_train_perplexity/end_span": 3.412010908126831, + "eval_train_perplexity/fim": 2.1164298057556152, + "eval_train_perplexity/first_seq": 15.362833976745605, + "eval_train_perplexity/last_seq": 9.022006034851074, + "eval_train_perplexity/second_seq": 14.011054039001465, + "eval_train_perplexity/seq": 8.846420288085938, + "eval_train_reconstruction/all": 0.2795927822589874, + 
"eval_train_reconstruction/end_span": 0.7187016010284424, + "eval_train_reconstruction/fim": 0.14718015491962433, + "eval_train_reconstruction/first_seq": 0.15327733755111694, + "eval_train_reconstruction/last_seq": 0.32510438561439514, + "eval_train_reconstruction/second_seq": 0.1886182278394699, + "eval_train_runtime": 439.2936, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 17400 + }, + { + "epoch": 0.06494184701924009, + "grad_norm": 0.35778841376304626, + "learning_rate": 0.0006, + "loss": 2.2877, + "step": 17410 + }, + { + "epoch": 0.06497914848220347, + "grad_norm": 0.46742284297943115, + "learning_rate": 0.0006, + "loss": 2.3554, + "step": 17420 + }, + { + "epoch": 0.06501644994516685, + "grad_norm": 0.3544106185436249, + "learning_rate": 0.0006, + "loss": 2.2342, + "step": 17430 + }, + { + "epoch": 0.06505375140813023, + "grad_norm": 0.28844186663627625, + "learning_rate": 0.0006, + "loss": 2.255, + "step": 17440 + }, + { + "epoch": 0.06509105287109361, + "grad_norm": 0.29648932814598083, + "learning_rate": 0.0006, + "loss": 2.2433, + "step": 17450 + }, + { + "epoch": 0.06509105287109361, + "eval_valid_loss": 2.201200485229492, + "eval_valid_loss/all": 2.0629160404205322, + "eval_valid_loss/end_span": 1.2713781595230103, + "eval_valid_perplexity/batch": 7.868882179260254, + "eval_valid_perplexity/end_span": 3.565763473510742, + "eval_valid_perplexity/fim": 2.167325258255005, + "eval_valid_perplexity/first_seq": 14.795761108398438, + "eval_valid_perplexity/last_seq": 8.902054786682129, + "eval_valid_perplexity/second_seq": 13.675085067749023, + "eval_valid_perplexity/seq": 8.864005088806152, + "eval_valid_reconstruction/all": 0.29161450266838074, + "eval_valid_reconstruction/end_span": 0.7020437121391296, + "eval_valid_reconstruction/fim": 0.152689591050148, + "eval_valid_reconstruction/first_seq": 0.16714009642601013, + "eval_valid_reconstruction/last_seq": 0.3303394615650177, + 
"eval_valid_reconstruction/second_seq": 0.19969727098941803, + "eval_valid_runtime": 440.6131, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 17450 + }, + { + "epoch": 0.06509105287109361, + "eval_train_loss": 2.2019217014312744, + "eval_train_loss/all": 2.0367400646209717, + "eval_train_loss/end_span": 1.2454556226730347, + "eval_train_perplexity/batch": 7.665579319000244, + "eval_train_perplexity/end_span": 3.474517583847046, + "eval_train_perplexity/fim": 2.5629868507385254, + "eval_train_perplexity/first_seq": 15.519586563110352, + "eval_train_perplexity/last_seq": 8.952396392822266, + "eval_train_perplexity/second_seq": 14.327159881591797, + "eval_train_perplexity/seq": 8.820452690124512, + "eval_train_reconstruction/all": 0.2806962728500366, + "eval_train_reconstruction/end_span": 0.7121132612228394, + "eval_train_reconstruction/fim": 0.18512730300426483, + "eval_train_reconstruction/first_seq": 0.15173552930355072, + "eval_train_reconstruction/last_seq": 0.3265437185764313, + "eval_train_reconstruction/second_seq": 0.1807125359773636, + "eval_train_runtime": 439.1162, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 17450 + }, + { + "epoch": 0.06512835433405698, + "grad_norm": 0.3482985198497772, + "learning_rate": 0.0006, + "loss": 2.1667, + "step": 17460 + }, + { + "epoch": 0.06516565579702036, + "grad_norm": 0.35290277004241943, + "learning_rate": 0.0006, + "loss": 2.3493, + "step": 17470 + }, + { + "epoch": 0.06520295725998373, + "grad_norm": 0.40872922539711, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 17480 + }, + { + "epoch": 0.06524025872294711, + "grad_norm": 0.306988388299942, + "learning_rate": 0.0006, + "loss": 2.2592, + "step": 17490 + }, + { + "epoch": 0.0652775601859105, + "grad_norm": 0.326811820268631, + "learning_rate": 0.0006, + "loss": 2.3346, + "step": 17500 + }, + { + "epoch": 0.0652775601859105, + "eval_valid_loss": 2.2050318717956543, + 
"eval_valid_loss/all": 2.0661416053771973, + "eval_valid_loss/end_span": 1.275955080986023, + "eval_valid_perplexity/batch": 7.8943047523498535, + "eval_valid_perplexity/end_span": 3.582120895385742, + "eval_valid_perplexity/fim": 2.1098473072052, + "eval_valid_perplexity/first_seq": 14.941339492797852, + "eval_valid_perplexity/last_seq": 9.346471786499023, + "eval_valid_perplexity/second_seq": 13.806035995483398, + "eval_valid_perplexity/seq": 8.891205787658691, + "eval_valid_reconstruction/all": 0.2909901440143585, + "eval_valid_reconstruction/end_span": 0.6947676539421082, + "eval_valid_reconstruction/fim": 0.14668451249599457, + "eval_valid_reconstruction/first_seq": 0.168117955327034, + "eval_valid_reconstruction/last_seq": 0.31426572799682617, + "eval_valid_reconstruction/second_seq": 0.19416075944900513, + "eval_valid_runtime": 442.5592, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 17500 + }, + { + "epoch": 0.0652775601859105, + "eval_train_loss": 2.205604314804077, + "eval_train_loss/all": 2.039785385131836, + "eval_train_loss/end_span": 1.2383230924606323, + "eval_train_perplexity/batch": 7.688958644866943, + "eval_train_perplexity/end_span": 3.4498236179351807, + "eval_train_perplexity/fim": 2.1202239990234375, + "eval_train_perplexity/first_seq": 15.428983688354492, + "eval_train_perplexity/last_seq": 8.465779304504395, + "eval_train_perplexity/second_seq": 14.175451278686523, + "eval_train_perplexity/seq": 8.848319053649902, + "eval_train_reconstruction/all": 0.28012615442276, + "eval_train_reconstruction/end_span": 0.708379328250885, + "eval_train_reconstruction/fim": 0.14722248911857605, + "eval_train_reconstruction/first_seq": 0.1542363464832306, + "eval_train_reconstruction/last_seq": 0.3451928198337555, + "eval_train_reconstruction/second_seq": 0.18762776255607605, + "eval_train_runtime": 442.7038, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 17500 + }, + 
{ + "epoch": 0.06531486164887387, + "grad_norm": 0.34624555706977844, + "learning_rate": 0.0006, + "loss": 2.1367, + "step": 17510 + }, + { + "epoch": 0.06535216311183725, + "grad_norm": 0.41883257031440735, + "learning_rate": 0.0006, + "loss": 2.1914, + "step": 17520 + }, + { + "epoch": 0.06538946457480062, + "grad_norm": 0.37207290530204773, + "learning_rate": 0.0006, + "loss": 2.3192, + "step": 17530 + }, + { + "epoch": 0.065426766037764, + "grad_norm": 0.43459001183509827, + "learning_rate": 0.0006, + "loss": 2.3192, + "step": 17540 + }, + { + "epoch": 0.06546406750072738, + "grad_norm": 0.3196081519126892, + "learning_rate": 0.0006, + "loss": 2.2495, + "step": 17550 + }, + { + "epoch": 0.06546406750072738, + "eval_valid_loss": 2.2066361904144287, + "eval_valid_loss/all": 2.0677413940429688, + "eval_valid_loss/end_span": 1.1740938425064087, + "eval_valid_perplexity/batch": 7.906944274902344, + "eval_valid_perplexity/end_span": 3.2352099418640137, + "eval_valid_perplexity/fim": 2.5577964782714844, + "eval_valid_perplexity/first_seq": 14.855982780456543, + "eval_valid_perplexity/last_seq": 9.489855766296387, + "eval_valid_perplexity/second_seq": 13.8558349609375, + "eval_valid_perplexity/seq": 8.908842086791992, + "eval_valid_reconstruction/all": 0.29029911756515503, + "eval_valid_reconstruction/end_span": 0.7315037250518799, + "eval_valid_reconstruction/fim": 0.18486350774765015, + "eval_valid_reconstruction/first_seq": 0.16954702138900757, + "eval_valid_reconstruction/last_seq": 0.31227579712867737, + "eval_valid_reconstruction/second_seq": 0.19435232877731323, + "eval_valid_runtime": 434.0364, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 17550 + }, + { + "epoch": 0.06546406750072738, + "eval_train_loss": 2.20531964302063, + "eval_train_loss/all": 2.0396153926849365, + "eval_train_loss/end_span": 1.1497751474380493, + "eval_train_perplexity/batch": 7.687652111053467, + "eval_train_perplexity/end_span": 
3.157482862472534, + "eval_train_perplexity/fim": 1.9075795412063599, + "eval_train_perplexity/first_seq": 15.57772445678711, + "eval_train_perplexity/last_seq": 9.46765422821045, + "eval_train_perplexity/second_seq": 14.470553398132324, + "eval_train_perplexity/seq": 8.849845886230469, + "eval_train_reconstruction/all": 0.27993717789649963, + "eval_train_reconstruction/end_span": 0.7416537404060364, + "eval_train_reconstruction/fim": 0.12698708474636078, + "eval_train_reconstruction/first_seq": 0.1510947048664093, + "eval_train_reconstruction/last_seq": 0.3089234232902527, + "eval_train_reconstruction/second_seq": 0.18122750520706177, + "eval_train_runtime": 435.0088, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 17550 + }, + { + "epoch": 0.06550136896369076, + "grad_norm": 0.28783494234085083, + "learning_rate": 0.0006, + "loss": 2.2933, + "step": 17560 + }, + { + "epoch": 0.06553867042665414, + "grad_norm": 0.4054516851902008, + "learning_rate": 0.0006, + "loss": 2.1771, + "step": 17570 + }, + { + "epoch": 0.06557597188961752, + "grad_norm": 0.2681034505367279, + "learning_rate": 0.0006, + "loss": 2.2652, + "step": 17580 + }, + { + "epoch": 0.0656132733525809, + "grad_norm": 0.609931468963623, + "learning_rate": 0.0006, + "loss": 2.2134, + "step": 17590 + }, + { + "epoch": 0.06565057481554426, + "grad_norm": 0.3764537274837494, + "learning_rate": 0.0006, + "loss": 2.2985, + "step": 17600 + }, + { + "epoch": 0.06565057481554426, + "eval_valid_loss": 2.2024056911468506, + "eval_valid_loss/all": 2.0638482570648193, + "eval_valid_loss/end_span": 1.2497648000717163, + "eval_valid_perplexity/batch": 7.876221179962158, + "eval_valid_perplexity/end_span": 3.4895222187042236, + "eval_valid_perplexity/fim": 2.315495491027832, + "eval_valid_perplexity/first_seq": 14.763226509094238, + "eval_valid_perplexity/last_seq": 9.055158615112305, + "eval_valid_perplexity/second_seq": 14.1345853805542, + "eval_valid_perplexity/seq": 
8.873209953308105, + "eval_valid_reconstruction/all": 0.2914625406265259, + "eval_valid_reconstruction/end_span": 0.7101960182189941, + "eval_valid_reconstruction/fim": 0.16487547755241394, + "eval_valid_reconstruction/first_seq": 0.1684875637292862, + "eval_valid_reconstruction/last_seq": 0.3250545263290405, + "eval_valid_reconstruction/second_seq": 0.18926125764846802, + "eval_valid_runtime": 436.4584, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 17600 + }, + { + "epoch": 0.06565057481554426, + "eval_train_loss": 2.2003557682037354, + "eval_train_loss/all": 2.0350797176361084, + "eval_train_loss/end_span": 1.2137032747268677, + "eval_train_perplexity/batch": 7.652862071990967, + "eval_train_perplexity/end_span": 3.365926504135132, + "eval_train_perplexity/fim": 2.1910219192504883, + "eval_train_perplexity/first_seq": 14.98520565032959, + "eval_train_perplexity/last_seq": 9.278297424316406, + "eval_train_perplexity/second_seq": 14.210565567016602, + "eval_train_perplexity/seq": 8.807867050170898, + "eval_train_reconstruction/all": 0.2812176048755646, + "eval_train_reconstruction/end_span": 0.7219129800796509, + "eval_train_reconstruction/fim": 0.15361273288726807, + "eval_train_reconstruction/first_seq": 0.16269207000732422, + "eval_train_reconstruction/last_seq": 0.31431248784065247, + "eval_train_reconstruction/second_seq": 0.18337103724479675, + "eval_train_runtime": 440.1467, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 17600 + }, + { + "epoch": 0.06568787627850764, + "grad_norm": 1.633780837059021, + "learning_rate": 0.0006, + "loss": 2.1797, + "step": 17610 + }, + { + "epoch": 0.06572517774147102, + "grad_norm": 0.3077356219291687, + "learning_rate": 0.0006, + "loss": 2.4141, + "step": 17620 + }, + { + "epoch": 0.0657624792044344, + "grad_norm": 0.4899127781391144, + "learning_rate": 0.0006, + "loss": 2.1634, + "step": 17630 + }, + { + "epoch": 0.06579978066739778, + 
"grad_norm": 0.2858479917049408, + "learning_rate": 0.0006, + "loss": 2.2389, + "step": 17640 + }, + { + "epoch": 0.06583708213036116, + "grad_norm": 0.4243810176849365, + "learning_rate": 0.0006, + "loss": 2.3317, + "step": 17650 + }, + { + "epoch": 0.06583708213036116, + "eval_valid_loss": 2.1989266872406006, + "eval_valid_loss/all": 2.0605411529541016, + "eval_valid_loss/end_span": 1.348014235496521, + "eval_valid_perplexity/batch": 7.850216865539551, + "eval_valid_perplexity/end_span": 3.8497731685638428, + "eval_valid_perplexity/fim": 1.9795321226119995, + "eval_valid_perplexity/first_seq": 14.800054550170898, + "eval_valid_perplexity/last_seq": 9.197611808776855, + "eval_valid_perplexity/second_seq": 13.700225830078125, + "eval_valid_perplexity/seq": 8.843443870544434, + "eval_valid_reconstruction/all": 0.2926102578639984, + "eval_valid_reconstruction/end_span": 0.6878118515014648, + "eval_valid_reconstruction/fim": 0.1350904256105423, + "eval_valid_reconstruction/first_seq": 0.17015673220157623, + "eval_valid_reconstruction/last_seq": 0.320488303899765, + "eval_valid_reconstruction/second_seq": 0.19668591022491455, + "eval_valid_runtime": 447.728, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 17650 + }, + { + "epoch": 0.06583708213036116, + "eval_train_loss": 2.1994450092315674, + "eval_train_loss/all": 2.034148693084717, + "eval_train_loss/end_span": 1.3187779188156128, + "eval_train_perplexity/batch": 7.645740509033203, + "eval_train_perplexity/end_span": 3.738849401473999, + "eval_train_perplexity/fim": 2.3899412155151367, + "eval_train_perplexity/first_seq": 15.738638877868652, + "eval_train_perplexity/last_seq": 9.197218894958496, + "eval_train_perplexity/second_seq": 14.456622123718262, + "eval_train_perplexity/seq": 8.797224998474121, + "eval_train_reconstruction/all": 0.28152427077293396, + "eval_train_reconstruction/end_span": 0.6978294253349304, + "eval_train_reconstruction/fim": 0.17210640013217926, + 
"eval_train_reconstruction/first_seq": 0.14784960448741913, + "eval_train_reconstruction/last_seq": 0.31991130113601685, + "eval_train_reconstruction/second_seq": 0.18075956404209137, + "eval_train_runtime": 440.392, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 17650 + }, + { + "epoch": 0.06587438359332454, + "grad_norm": 0.3801358938217163, + "learning_rate": 0.0006, + "loss": 2.1464, + "step": 17660 + }, + { + "epoch": 0.0659116850562879, + "grad_norm": 0.2997782826423645, + "learning_rate": 0.0006, + "loss": 2.2644, + "step": 17670 + }, + { + "epoch": 0.06594898651925128, + "grad_norm": 0.23190025985240936, + "learning_rate": 0.0006, + "loss": 2.3422, + "step": 17680 + }, + { + "epoch": 0.06598628798221466, + "grad_norm": 0.36781179904937744, + "learning_rate": 0.0006, + "loss": 2.244, + "step": 17690 + }, + { + "epoch": 0.06602358944517804, + "grad_norm": 0.3409995138645172, + "learning_rate": 0.0006, + "loss": 2.3806, + "step": 17700 + }, + { + "epoch": 0.06602358944517804, + "eval_valid_loss": 2.2074873447418213, + "eval_valid_loss/all": 2.068268060684204, + "eval_valid_loss/end_span": 1.3341150283813477, + "eval_valid_perplexity/batch": 7.911109447479248, + "eval_valid_perplexity/end_span": 3.7966344356536865, + "eval_valid_perplexity/fim": 2.214458703994751, + "eval_valid_perplexity/first_seq": 14.41763687133789, + "eval_valid_perplexity/last_seq": 9.346017837524414, + "eval_valid_perplexity/second_seq": 14.054208755493164, + "eval_valid_perplexity/seq": 8.911446571350098, + "eval_valid_reconstruction/all": 0.29087141156196594, + "eval_valid_reconstruction/end_span": 0.6817686557769775, + "eval_valid_reconstruction/fim": 0.1522877812385559, + "eval_valid_reconstruction/first_seq": 0.17786066234111786, + "eval_valid_reconstruction/last_seq": 0.3159785866737366, + "eval_valid_reconstruction/second_seq": 0.18856793642044067, + "eval_valid_runtime": 434.9997, + "eval_valid_samples_per_second": 0.441, + 
"eval_valid_steps_per_second": 0.441, + "step": 17700 + }, + { + "epoch": 0.06602358944517804, + "eval_train_loss": 2.2076284885406494, + "eval_train_loss/all": 2.0415303707122803, + "eval_train_loss/end_span": 1.2855504751205444, + "eval_train_perplexity/batch": 7.702387809753418, + "eval_train_perplexity/end_span": 3.6166582107543945, + "eval_train_perplexity/fim": 1.9970893859863281, + "eval_train_perplexity/first_seq": 15.58353042602539, + "eval_train_perplexity/last_seq": 9.01694393157959, + "eval_train_perplexity/second_seq": 13.899609565734863, + "eval_train_perplexity/seq": 8.867552757263184, + "eval_train_reconstruction/all": 0.27992871403694153, + "eval_train_reconstruction/end_span": 0.6946414709091187, + "eval_train_reconstruction/fim": 0.13373036682605743, + "eval_train_reconstruction/first_seq": 0.15187186002731323, + "eval_train_reconstruction/last_seq": 0.3260221779346466, + "eval_train_reconstruction/second_seq": 0.19497013092041016, + "eval_train_runtime": 436.2097, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 17700 + }, + { + "epoch": 0.06606089090814142, + "grad_norm": 0.36607658863067627, + "learning_rate": 0.0006, + "loss": 2.1029, + "step": 17710 + }, + { + "epoch": 0.0660981923711048, + "grad_norm": 0.35287147760391235, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 17720 + }, + { + "epoch": 0.06613549383406817, + "grad_norm": 0.46916326880455017, + "learning_rate": 0.0006, + "loss": 2.2588, + "step": 17730 + }, + { + "epoch": 0.06617279529703154, + "grad_norm": 0.4434422552585602, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 17740 + }, + { + "epoch": 0.06621009675999492, + "grad_norm": 0.4514656066894531, + "learning_rate": 0.0006, + "loss": 2.2084, + "step": 17750 + }, + { + "epoch": 0.06621009675999492, + "eval_valid_loss": 2.2057547569274902, + "eval_valid_loss/all": 2.0668416023254395, + "eval_valid_loss/end_span": 1.2766070365905762, + "eval_valid_perplexity/batch": 
7.899832725524902, + "eval_valid_perplexity/end_span": 3.5844571590423584, + "eval_valid_perplexity/fim": 2.046754837036133, + "eval_valid_perplexity/first_seq": 14.635665893554688, + "eval_valid_perplexity/last_seq": 8.920339584350586, + "eval_valid_perplexity/second_seq": 14.107335090637207, + "eval_valid_perplexity/seq": 8.903899192810059, + "eval_valid_reconstruction/all": 0.29028868675231934, + "eval_valid_reconstruction/end_span": 0.6987256407737732, + "eval_valid_reconstruction/fim": 0.1407282054424286, + "eval_valid_reconstruction/first_seq": 0.17126138508319855, + "eval_valid_reconstruction/last_seq": 0.33111751079559326, + "eval_valid_reconstruction/second_seq": 0.18849493563175201, + "eval_valid_runtime": 443.1209, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 17750 + }, + { + "epoch": 0.06621009675999492, + "eval_train_loss": 2.204484224319458, + "eval_train_loss/all": 2.0387465953826904, + "eval_train_loss/end_span": 1.251460313796997, + "eval_train_perplexity/batch": 7.680975914001465, + "eval_train_perplexity/end_span": 3.49544358253479, + "eval_train_perplexity/fim": 1.9278438091278076, + "eval_train_perplexity/first_seq": 15.435124397277832, + "eval_train_perplexity/last_seq": 8.971579551696777, + "eval_train_perplexity/second_seq": 14.296013832092285, + "eval_train_perplexity/seq": 8.841012001037598, + "eval_train_reconstruction/all": 0.2797943949699402, + "eval_train_reconstruction/end_span": 0.7095245718955994, + "eval_train_reconstruction/fim": 0.12803076207637787, + "eval_train_reconstruction/first_seq": 0.15231521427631378, + "eval_train_reconstruction/last_seq": 0.3241617977619171, + "eval_train_reconstruction/second_seq": 0.18001689016819, + "eval_train_runtime": 437.7855, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 17750 + }, + { + "epoch": 0.0662473982229583, + "grad_norm": 0.33042195439338684, + "learning_rate": 0.0006, + "loss": 2.324, + 
"step": 17760 + }, + { + "epoch": 0.06628469968592168, + "grad_norm": 0.21840901672840118, + "learning_rate": 0.0006, + "loss": 2.1964, + "step": 17770 + }, + { + "epoch": 0.06632200114888506, + "grad_norm": 0.36393794417381287, + "learning_rate": 0.0006, + "loss": 2.1221, + "step": 17780 + }, + { + "epoch": 0.06635930261184844, + "grad_norm": 0.4579876959323883, + "learning_rate": 0.0006, + "loss": 2.2303, + "step": 17790 + }, + { + "epoch": 0.06639660407481181, + "grad_norm": 0.46068552136421204, + "learning_rate": 0.0006, + "loss": 2.1284, + "step": 17800 + }, + { + "epoch": 0.06643390553777519, + "grad_norm": 0.2806225121021271, + "learning_rate": 0.0006, + "loss": 2.3116, + "step": 17810 + }, + { + "epoch": 0.06647120700073857, + "grad_norm": 0.620272696018219, + "learning_rate": 0.0006, + "loss": 2.2317, + "step": 17820 + }, + { + "epoch": 0.06650850846370195, + "grad_norm": 0.4640248119831085, + "learning_rate": 0.0006, + "loss": 2.3218, + "step": 17830 + }, + { + "epoch": 0.06654580992666533, + "grad_norm": 0.3709312379360199, + "learning_rate": 0.0006, + "loss": 2.213, + "step": 17840 + }, + { + "epoch": 0.0665831113896287, + "grad_norm": 0.3582509458065033, + "learning_rate": 0.0006, + "loss": 2.2505, + "step": 17850 + }, + { + "epoch": 0.06662041285259208, + "grad_norm": 0.30682259798049927, + "learning_rate": 0.0006, + "loss": 2.1518, + "step": 17860 + }, + { + "epoch": 0.06665771431555545, + "grad_norm": 0.4937511086463928, + "learning_rate": 0.0006, + "loss": 2.068, + "step": 17870 + }, + { + "epoch": 0.06669501577851883, + "grad_norm": 0.45193812251091003, + "learning_rate": 0.0006, + "loss": 2.1444, + "step": 17880 + }, + { + "epoch": 0.06673231724148221, + "grad_norm": 0.44489166140556335, + "learning_rate": 0.0006, + "loss": 2.2024, + "step": 17890 + }, + { + "epoch": 0.06676961870444559, + "grad_norm": 0.30093735456466675, + "learning_rate": 0.0006, + "loss": 2.3643, + "step": 17900 + }, + { + "epoch": 0.06680692016740897, + "grad_norm": 
0.30381956696510315, + "learning_rate": 0.0006, + "loss": 2.2405, + "step": 17910 + }, + { + "epoch": 0.06684422163037235, + "grad_norm": 0.2575649321079254, + "learning_rate": 0.0006, + "loss": 2.1379, + "step": 17920 + }, + { + "epoch": 0.06688152309333573, + "grad_norm": 0.4452294707298279, + "learning_rate": 0.0006, + "loss": 2.0994, + "step": 17930 + }, + { + "epoch": 0.06691882455629909, + "grad_norm": 0.3104233741760254, + "learning_rate": 0.0006, + "loss": 2.3661, + "step": 17940 + }, + { + "epoch": 0.06695612601926247, + "grad_norm": 0.4568098485469818, + "learning_rate": 0.0006, + "loss": 2.0187, + "step": 17950 + }, + { + "epoch": 0.06699342748222585, + "grad_norm": 0.4275231659412384, + "learning_rate": 0.0006, + "loss": 2.075, + "step": 17960 + }, + { + "epoch": 0.06703072894518923, + "grad_norm": 0.43656718730926514, + "learning_rate": 0.0006, + "loss": 2.1771, + "step": 17970 + }, + { + "epoch": 0.06706803040815261, + "grad_norm": 0.3754292130470276, + "learning_rate": 0.0006, + "loss": 2.2796, + "step": 17980 + }, + { + "epoch": 0.06710533187111599, + "grad_norm": 0.45015668869018555, + "learning_rate": 0.0006, + "loss": 2.3238, + "step": 17990 + }, + { + "epoch": 0.06714263333407937, + "grad_norm": 0.3770838677883148, + "learning_rate": 0.0006, + "loss": 2.3116, + "step": 18000 + }, + { + "epoch": 0.06714263333407937, + "eval_valid_loss": 2.205200433731079, + "eval_valid_loss/all": 2.066322088241577, + "eval_valid_loss/end_span": 1.250537633895874, + "eval_valid_perplexity/batch": 7.895730018615723, + "eval_valid_perplexity/end_span": 3.492219924926758, + "eval_valid_perplexity/fim": 2.1721081733703613, + "eval_valid_perplexity/first_seq": 14.880928039550781, + "eval_valid_perplexity/last_seq": 9.1593599319458, + "eval_valid_perplexity/second_seq": 13.687209129333496, + "eval_valid_perplexity/seq": 8.89204216003418, + "eval_valid_reconstruction/all": 0.29058700799942017, + "eval_valid_reconstruction/end_span": 0.7123508453369141, + 
"eval_valid_reconstruction/fim": 0.15229596197605133, + "eval_valid_reconstruction/first_seq": 0.16689904034137726, + "eval_valid_reconstruction/last_seq": 0.3236975371837616, + "eval_valid_reconstruction/second_seq": 0.19510455429553986, + "eval_valid_runtime": 440.047, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 18000 + }, + { + "epoch": 0.06714263333407937, + "eval_train_loss": 2.202077627182007, + "eval_train_loss/all": 2.0361571311950684, + "eval_train_loss/end_span": 1.2108876705169678, + "eval_train_perplexity/batch": 7.661111831665039, + "eval_train_perplexity/end_span": 3.3564627170562744, + "eval_train_perplexity/fim": 1.970818281173706, + "eval_train_perplexity/first_seq": 15.514824867248535, + "eval_train_perplexity/last_seq": 9.324966430664062, + "eval_train_perplexity/second_seq": 14.020828247070312, + "eval_train_perplexity/seq": 8.814906120300293, + "eval_train_reconstruction/all": 0.28068387508392334, + "eval_train_reconstruction/end_span": 0.7256144285202026, + "eval_train_reconstruction/fim": 0.13379718363285065, + "eval_train_reconstruction/first_seq": 0.15282824635505676, + "eval_train_reconstruction/last_seq": 0.314238041639328, + "eval_train_reconstruction/second_seq": 0.18833108246326447, + "eval_train_runtime": 439.6051, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 18000 + }, + { + "epoch": 0.06717993479704273, + "grad_norm": 0.4909980595111847, + "learning_rate": 0.0006, + "loss": 2.189, + "step": 18010 + }, + { + "epoch": 0.06721723626000611, + "grad_norm": 0.5949395298957825, + "learning_rate": 0.0006, + "loss": 2.2837, + "step": 18020 + }, + { + "epoch": 0.06725453772296949, + "grad_norm": 0.3247208297252655, + "learning_rate": 0.0006, + "loss": 2.2577, + "step": 18030 + }, + { + "epoch": 0.06729183918593287, + "grad_norm": 0.5312273502349854, + "learning_rate": 0.0006, + "loss": 2.2819, + "step": 18040 + }, + { + "epoch": 0.06732914064889625, 
+ "grad_norm": 0.33282843232154846, + "learning_rate": 0.0006, + "loss": 2.1435, + "step": 18050 + }, + { + "epoch": 0.06736644211185963, + "grad_norm": 0.4168374538421631, + "learning_rate": 0.0006, + "loss": 2.1597, + "step": 18060 + }, + { + "epoch": 0.06740374357482301, + "grad_norm": 0.308117151260376, + "learning_rate": 0.0006, + "loss": 2.2184, + "step": 18070 + }, + { + "epoch": 0.06744104503778638, + "grad_norm": 0.27267226576805115, + "learning_rate": 0.0006, + "loss": 2.1593, + "step": 18080 + }, + { + "epoch": 0.06747834650074976, + "grad_norm": 0.330649733543396, + "learning_rate": 0.0006, + "loss": 2.0464, + "step": 18090 + }, + { + "epoch": 0.06751564796371314, + "grad_norm": 0.39877837896347046, + "learning_rate": 0.0006, + "loss": 2.369, + "step": 18100 + }, + { + "epoch": 0.06755294942667651, + "grad_norm": 0.32183369994163513, + "learning_rate": 0.0006, + "loss": 2.1566, + "step": 18110 + }, + { + "epoch": 0.0675902508896399, + "grad_norm": 0.34610891342163086, + "learning_rate": 0.0006, + "loss": 2.2191, + "step": 18120 + }, + { + "epoch": 0.06762755235260327, + "grad_norm": 0.4018115699291229, + "learning_rate": 0.0006, + "loss": 2.0545, + "step": 18130 + }, + { + "epoch": 0.06766485381556665, + "grad_norm": 0.3465992510318756, + "learning_rate": 0.0006, + "loss": 2.2552, + "step": 18140 + }, + { + "epoch": 0.06770215527853002, + "grad_norm": 0.4398839771747589, + "learning_rate": 0.0006, + "loss": 2.3323, + "step": 18150 + }, + { + "epoch": 0.0677394567414934, + "grad_norm": 0.3550983667373657, + "learning_rate": 0.0006, + "loss": 2.1914, + "step": 18160 + }, + { + "epoch": 0.06777675820445678, + "grad_norm": 0.4492173194885254, + "learning_rate": 0.0006, + "loss": 2.2821, + "step": 18170 + }, + { + "epoch": 0.06781405966742016, + "grad_norm": 0.23469533026218414, + "learning_rate": 0.0006, + "loss": 2.241, + "step": 18180 + }, + { + "epoch": 0.06785136113038354, + "grad_norm": 0.377481609582901, + "learning_rate": 0.0006, + "loss": 2.2697, + 
"step": 18190 + }, + { + "epoch": 0.06788866259334692, + "grad_norm": 0.31940555572509766, + "learning_rate": 0.0006, + "loss": 2.1696, + "step": 18200 + }, + { + "epoch": 0.0679259640563103, + "grad_norm": 0.25468188524246216, + "learning_rate": 0.0006, + "loss": 2.2796, + "step": 18210 + }, + { + "epoch": 0.06796326551927366, + "grad_norm": 0.5631659626960754, + "learning_rate": 0.0006, + "loss": 2.2669, + "step": 18220 + }, + { + "epoch": 0.06800056698223704, + "grad_norm": 0.3117319345474243, + "learning_rate": 0.0006, + "loss": 2.0457, + "step": 18230 + }, + { + "epoch": 0.06803786844520042, + "grad_norm": 0.4360451400279999, + "learning_rate": 0.0006, + "loss": 2.1052, + "step": 18240 + }, + { + "epoch": 0.0680751699081638, + "grad_norm": 0.5439634323120117, + "learning_rate": 0.0006, + "loss": 2.2919, + "step": 18250 + }, + { + "epoch": 0.0680751699081638, + "eval_valid_loss": 2.2044594287872314, + "eval_valid_loss/all": 2.0657031536102295, + "eval_valid_loss/end_span": 1.2719799280166626, + "eval_valid_perplexity/batch": 7.890844345092773, + "eval_valid_perplexity/end_span": 3.5679097175598145, + "eval_valid_perplexity/fim": 2.264911651611328, + "eval_valid_perplexity/first_seq": 14.570267677307129, + "eval_valid_perplexity/last_seq": 9.08146858215332, + "eval_valid_perplexity/second_seq": 13.746726989746094, + "eval_valid_perplexity/seq": 8.890393257141113, + "eval_valid_reconstruction/all": 0.2909683585166931, + "eval_valid_reconstruction/end_span": 0.7044093608856201, + "eval_valid_reconstruction/fim": 0.16221068799495697, + "eval_valid_reconstruction/first_seq": 0.17275913059711456, + "eval_valid_reconstruction/last_seq": 0.32379674911499023, + "eval_valid_reconstruction/second_seq": 0.19485031068325043, + "eval_valid_runtime": 439.1257, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 18250 + }, + { + "epoch": 0.0680751699081638, + "eval_train_loss": 2.202782392501831, + "eval_train_loss/all": 
2.037187337875366, + "eval_train_loss/end_span": 1.2380880117416382, + "eval_train_perplexity/batch": 7.669008731842041, + "eval_train_perplexity/end_span": 3.4490127563476562, + "eval_train_perplexity/fim": 2.2454535961151123, + "eval_train_perplexity/first_seq": 15.478070259094238, + "eval_train_perplexity/last_seq": 9.298595428466797, + "eval_train_perplexity/second_seq": 14.264684677124023, + "eval_train_perplexity/seq": 8.82649040222168, + "eval_train_reconstruction/all": 0.28047218918800354, + "eval_train_reconstruction/end_span": 0.7142038941383362, + "eval_train_reconstruction/fim": 0.1588621884584427, + "eval_train_reconstruction/first_seq": 0.15455682575702667, + "eval_train_reconstruction/last_seq": 0.31452569365501404, + "eval_train_reconstruction/second_seq": 0.1829262226819992, + "eval_train_runtime": 434.357, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 18250 + }, + { + "epoch": 0.06811247137112718, + "grad_norm": 0.2895471751689911, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 18260 + }, + { + "epoch": 0.06814977283409056, + "grad_norm": 0.4097922742366791, + "learning_rate": 0.0006, + "loss": 1.9829, + "step": 18270 + }, + { + "epoch": 0.06818707429705394, + "grad_norm": 0.2736271619796753, + "learning_rate": 0.0006, + "loss": 1.9095, + "step": 18280 + }, + { + "epoch": 0.0682243757600173, + "grad_norm": 0.35072943568229675, + "learning_rate": 0.0006, + "loss": 2.2743, + "step": 18290 + }, + { + "epoch": 0.06826167722298068, + "grad_norm": 0.430340051651001, + "learning_rate": 0.0006, + "loss": 2.2239, + "step": 18300 + }, + { + "epoch": 0.06829897868594406, + "grad_norm": 0.6883416175842285, + "learning_rate": 0.0006, + "loss": 2.1485, + "step": 18310 + }, + { + "epoch": 0.06833628014890744, + "grad_norm": 0.46861788630485535, + "learning_rate": 0.0006, + "loss": 2.1713, + "step": 18320 + }, + { + "epoch": 0.06837358161187082, + "grad_norm": 0.36971697211265564, + "learning_rate": 0.0006, + 
"loss": 2.0549, + "step": 18330 + }, + { + "epoch": 0.0684108830748342, + "grad_norm": 0.36235034465789795, + "learning_rate": 0.0006, + "loss": 2.3093, + "step": 18340 + }, + { + "epoch": 0.06844818453779757, + "grad_norm": 0.27287495136260986, + "learning_rate": 0.0006, + "loss": 2.3186, + "step": 18350 + }, + { + "epoch": 0.06848548600076095, + "grad_norm": 0.2999447286128998, + "learning_rate": 0.0006, + "loss": 2.3982, + "step": 18360 + }, + { + "epoch": 0.06852278746372432, + "grad_norm": 0.3612465262413025, + "learning_rate": 0.0006, + "loss": 2.1412, + "step": 18370 + }, + { + "epoch": 0.0685600889266877, + "grad_norm": 0.2991202473640442, + "learning_rate": 0.0006, + "loss": 2.3353, + "step": 18380 + }, + { + "epoch": 0.06859739038965108, + "grad_norm": 0.2366013526916504, + "learning_rate": 0.0006, + "loss": 2.2486, + "step": 18390 + }, + { + "epoch": 0.06863469185261446, + "grad_norm": 2.759899139404297, + "learning_rate": 0.0006, + "loss": 2.1093, + "step": 18400 + }, + { + "epoch": 0.06867199331557784, + "grad_norm": 0.33159157633781433, + "learning_rate": 0.0006, + "loss": 2.0338, + "step": 18410 + }, + { + "epoch": 0.06870929477854121, + "grad_norm": 0.46613723039627075, + "learning_rate": 0.0006, + "loss": 2.3266, + "step": 18420 + }, + { + "epoch": 0.06874659624150459, + "grad_norm": 0.49459031224250793, + "learning_rate": 0.0006, + "loss": 2.24, + "step": 18430 + }, + { + "epoch": 0.06878389770446797, + "grad_norm": 0.4476267993450165, + "learning_rate": 0.0006, + "loss": 2.1943, + "step": 18440 + }, + { + "epoch": 0.06882119916743135, + "grad_norm": 5.7038702964782715, + "learning_rate": 0.0006, + "loss": 2.2389, + "step": 18450 + }, + { + "epoch": 0.06885850063039473, + "grad_norm": 0.6400591731071472, + "learning_rate": 0.0006, + "loss": 1.9713, + "step": 18460 + }, + { + "epoch": 0.0688958020933581, + "grad_norm": 0.4469614326953888, + "learning_rate": 0.0006, + "loss": 2.3337, + "step": 18470 + }, + { + "epoch": 0.06893310355632148, + 
"grad_norm": 0.29109224677085876, + "learning_rate": 0.0006, + "loss": 2.1796, + "step": 18480 + }, + { + "epoch": 0.06897040501928485, + "grad_norm": 0.2916240096092224, + "learning_rate": 0.0006, + "loss": 2.1879, + "step": 18490 + }, + { + "epoch": 0.06900770648224823, + "grad_norm": 0.3404162526130676, + "learning_rate": 0.0006, + "loss": 2.3057, + "step": 18500 + }, + { + "epoch": 0.06900770648224823, + "eval_valid_loss": 2.215487241744995, + "eval_valid_loss/all": 2.076376438140869, + "eval_valid_loss/end_span": 1.3117009401321411, + "eval_valid_perplexity/batch": 7.9755167961120605, + "eval_valid_perplexity/end_span": 3.7124831676483154, + "eval_valid_perplexity/fim": 2.2027857303619385, + "eval_valid_perplexity/first_seq": 14.847908973693848, + "eval_valid_perplexity/last_seq": 9.436278343200684, + "eval_valid_perplexity/second_seq": 13.689072608947754, + "eval_valid_perplexity/seq": 8.992331504821777, + "eval_valid_reconstruction/all": 0.28803128004074097, + "eval_valid_reconstruction/end_span": 0.6978315711021423, + "eval_valid_reconstruction/fim": 0.15349173545837402, + "eval_valid_reconstruction/first_seq": 0.17080843448638916, + "eval_valid_reconstruction/last_seq": 0.3133516311645508, + "eval_valid_reconstruction/second_seq": 0.2016409933567047, + "eval_valid_runtime": 437.4612, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 18500 + }, + { + "epoch": 0.06900770648224823, + "eval_train_loss": 2.211092472076416, + "eval_train_loss/all": 2.043949842453003, + "eval_train_loss/end_span": 1.2684624195098877, + "eval_train_perplexity/batch": 7.721045970916748, + "eval_train_perplexity/end_span": 3.5553817749023438, + "eval_train_perplexity/fim": 2.100829839706421, + "eval_train_perplexity/first_seq": 15.496906280517578, + "eval_train_perplexity/last_seq": 8.926090240478516, + "eval_train_perplexity/second_seq": 14.505472183227539, + "eval_train_perplexity/seq": 8.874128341674805, + "eval_train_reconstruction/all": 
0.2789021134376526, + "eval_train_reconstruction/end_span": 0.7090796232223511, + "eval_train_reconstruction/fim": 0.14531309902668, + "eval_train_reconstruction/first_seq": 0.1530827134847641, + "eval_train_reconstruction/last_seq": 0.3290387988090515, + "eval_train_reconstruction/second_seq": 0.17714068293571472, + "eval_train_runtime": 436.7546, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 18500 + }, + { + "epoch": 0.06904500794521161, + "grad_norm": 0.4549565017223358, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 18510 + }, + { + "epoch": 0.06908230940817499, + "grad_norm": 0.3812030553817749, + "learning_rate": 0.0006, + "loss": 2.2434, + "step": 18520 + }, + { + "epoch": 0.06911961087113837, + "grad_norm": 0.3364100158214569, + "learning_rate": 0.0006, + "loss": 2.3868, + "step": 18530 + }, + { + "epoch": 0.06915691233410175, + "grad_norm": 0.45900583267211914, + "learning_rate": 0.0006, + "loss": 2.1748, + "step": 18540 + }, + { + "epoch": 0.06919421379706513, + "grad_norm": 0.40494081377983093, + "learning_rate": 0.0006, + "loss": 2.3167, + "step": 18550 + }, + { + "epoch": 0.06923151526002849, + "grad_norm": 0.416813462972641, + "learning_rate": 0.0006, + "loss": 2.3065, + "step": 18560 + }, + { + "epoch": 0.06926881672299187, + "grad_norm": 0.29929593205451965, + "learning_rate": 0.0006, + "loss": 2.2113, + "step": 18570 + }, + { + "epoch": 0.06930611818595525, + "grad_norm": 0.34512582421302795, + "learning_rate": 0.0006, + "loss": 2.3, + "step": 18580 + }, + { + "epoch": 0.06934341964891863, + "grad_norm": 0.3241370916366577, + "learning_rate": 0.0006, + "loss": 2.2436, + "step": 18590 + }, + { + "epoch": 0.06938072111188201, + "grad_norm": 0.313599169254303, + "learning_rate": 0.0006, + "loss": 2.1837, + "step": 18600 + }, + { + "epoch": 0.06941802257484539, + "grad_norm": 0.40209150314331055, + "learning_rate": 0.0006, + "loss": 2.3391, + "step": 18610 + }, + { + "epoch": 0.06945532403780877, + 
"grad_norm": 0.520918607711792, + "learning_rate": 0.0006, + "loss": 2.1896, + "step": 18620 + }, + { + "epoch": 0.06949262550077213, + "grad_norm": 0.43504229187965393, + "learning_rate": 0.0006, + "loss": 2.3268, + "step": 18630 + }, + { + "epoch": 0.06952992696373551, + "grad_norm": 0.42055919766426086, + "learning_rate": 0.0006, + "loss": 2.3281, + "step": 18640 + }, + { + "epoch": 0.0695672284266989, + "grad_norm": 0.3137040138244629, + "learning_rate": 0.0006, + "loss": 2.2351, + "step": 18650 + }, + { + "epoch": 0.06960452988966227, + "grad_norm": 0.8122360706329346, + "learning_rate": 0.0006, + "loss": 2.206, + "step": 18660 + }, + { + "epoch": 0.06964183135262565, + "grad_norm": 0.3499402105808258, + "learning_rate": 0.0006, + "loss": 2.344, + "step": 18670 + }, + { + "epoch": 0.06967913281558903, + "grad_norm": 0.3920418918132782, + "learning_rate": 0.0006, + "loss": 2.0803, + "step": 18680 + }, + { + "epoch": 0.06971643427855241, + "grad_norm": 0.3869876265525818, + "learning_rate": 0.0006, + "loss": 2.2729, + "step": 18690 + }, + { + "epoch": 0.06975373574151578, + "grad_norm": 0.2728255093097687, + "learning_rate": 0.0006, + "loss": 2.2545, + "step": 18700 + }, + { + "epoch": 0.06979103720447916, + "grad_norm": 0.48461270332336426, + "learning_rate": 0.0006, + "loss": 2.1924, + "step": 18710 + }, + { + "epoch": 0.06982833866744254, + "grad_norm": 0.43247556686401367, + "learning_rate": 0.0006, + "loss": 2.3783, + "step": 18720 + }, + { + "epoch": 0.06986564013040591, + "grad_norm": 0.2717408239841461, + "learning_rate": 0.0006, + "loss": 2.3051, + "step": 18730 + }, + { + "epoch": 0.0699029415933693, + "grad_norm": 0.47314417362213135, + "learning_rate": 0.0006, + "loss": 2.2772, + "step": 18740 + }, + { + "epoch": 0.06994024305633267, + "grad_norm": 0.36961013078689575, + "learning_rate": 0.0006, + "loss": 2.0925, + "step": 18750 + }, + { + "epoch": 0.06994024305633267, + "eval_valid_loss": 2.203244686126709, + "eval_valid_loss/all": 
2.0649688243865967, + "eval_valid_loss/end_span": 1.2230905294418335, + "eval_valid_perplexity/batch": 7.88505220413208, + "eval_valid_perplexity/end_span": 3.397672176361084, + "eval_valid_perplexity/fim": 2.1278858184814453, + "eval_valid_perplexity/first_seq": 14.513175964355469, + "eval_valid_perplexity/last_seq": 8.860660552978516, + "eval_valid_perplexity/second_seq": 13.838898658752441, + "eval_valid_perplexity/seq": 8.888459205627441, + "eval_valid_reconstruction/all": 0.2909010648727417, + "eval_valid_reconstruction/end_span": 0.7053136229515076, + "eval_valid_reconstruction/fim": 0.14850179851055145, + "eval_valid_reconstruction/first_seq": 0.17722098529338837, + "eval_valid_reconstruction/last_seq": 0.3325238525867462, + "eval_valid_reconstruction/second_seq": 0.19466030597686768, + "eval_valid_runtime": 486.305, + "eval_valid_samples_per_second": 0.395, + "eval_valid_steps_per_second": 0.395, + "step": 18750 + }, + { + "epoch": 0.06994024305633267, + "eval_train_loss": 2.201887607574463, + "eval_train_loss/all": 2.0367231369018555, + "eval_train_loss/end_span": 1.1950348615646362, + "eval_train_perplexity/batch": 7.665449142456055, + "eval_train_perplexity/end_span": 3.303673028945923, + "eval_train_perplexity/fim": 1.9690994024276733, + "eval_train_perplexity/first_seq": 15.390124320983887, + "eval_train_perplexity/last_seq": 9.312630653381348, + "eval_train_perplexity/second_seq": 14.386489868164062, + "eval_train_perplexity/seq": 8.82325267791748, + "eval_train_reconstruction/all": 0.280596524477005, + "eval_train_reconstruction/end_span": 0.7158154249191284, + "eval_train_reconstruction/fim": 0.13336387276649475, + "eval_train_reconstruction/first_seq": 0.15604448318481445, + "eval_train_reconstruction/last_seq": 0.31670260429382324, + "eval_train_reconstruction/second_seq": 0.17901599407196045, + "eval_train_runtime": 471.8451, + "eval_train_samples_per_second": 0.407, + "eval_train_steps_per_second": 0.407, + "step": 18750 + }, + { + "epoch": 
0.06997754451929605, + "grad_norm": 0.494963675737381, + "learning_rate": 0.0006, + "loss": 2.2397, + "step": 18760 + }, + { + "epoch": 0.07001484598225942, + "grad_norm": 0.19322346150875092, + "learning_rate": 0.0006, + "loss": 2.2011, + "step": 18770 + }, + { + "epoch": 0.0700521474452228, + "grad_norm": 0.3864636719226837, + "learning_rate": 0.0006, + "loss": 2.2108, + "step": 18780 + }, + { + "epoch": 0.07008944890818618, + "grad_norm": 0.4945468008518219, + "learning_rate": 0.0006, + "loss": 2.0997, + "step": 18790 + }, + { + "epoch": 0.07012675037114956, + "grad_norm": 0.3658040761947632, + "learning_rate": 0.0006, + "loss": 2.1782, + "step": 18800 + }, + { + "epoch": 0.07016405183411294, + "grad_norm": 0.3002001643180847, + "learning_rate": 0.0006, + "loss": 2.3684, + "step": 18810 + }, + { + "epoch": 0.07020135329707632, + "grad_norm": 0.39419060945510864, + "learning_rate": 0.0006, + "loss": 2.3085, + "step": 18820 + }, + { + "epoch": 0.0702386547600397, + "grad_norm": 0.20758317410945892, + "learning_rate": 0.0006, + "loss": 2.1888, + "step": 18830 + }, + { + "epoch": 0.07027595622300306, + "grad_norm": 0.3958282768726349, + "learning_rate": 0.0006, + "loss": 1.957, + "step": 18840 + }, + { + "epoch": 0.07031325768596644, + "grad_norm": 0.8485749959945679, + "learning_rate": 0.0006, + "loss": 2.1405, + "step": 18850 + }, + { + "epoch": 0.07035055914892982, + "grad_norm": 0.3457006812095642, + "learning_rate": 0.0006, + "loss": 2.3096, + "step": 18860 + }, + { + "epoch": 0.0703878606118932, + "grad_norm": 4.340597629547119, + "learning_rate": 0.0006, + "loss": 2.0373, + "step": 18870 + }, + { + "epoch": 0.07042516207485658, + "grad_norm": 0.32348617911338806, + "learning_rate": 0.0006, + "loss": 2.2235, + "step": 18880 + }, + { + "epoch": 0.07046246353781996, + "grad_norm": 0.2956148087978363, + "learning_rate": 0.0006, + "loss": 2.3143, + "step": 18890 + }, + { + "epoch": 0.07049976500078334, + "grad_norm": 0.30575042963027954, + "learning_rate": 0.0006, 
+ "loss": 2.337, + "step": 18900 + }, + { + "epoch": 0.0705370664637467, + "grad_norm": 0.4632345139980316, + "learning_rate": 0.0006, + "loss": 2.277, + "step": 18910 + }, + { + "epoch": 0.07057436792671008, + "grad_norm": 0.3905487358570099, + "learning_rate": 0.0006, + "loss": 2.238, + "step": 18920 + }, + { + "epoch": 0.07061166938967346, + "grad_norm": 0.35045352578163147, + "learning_rate": 0.0006, + "loss": 2.3021, + "step": 18930 + }, + { + "epoch": 0.07064897085263684, + "grad_norm": 0.41687914729118347, + "learning_rate": 0.0006, + "loss": 2.2096, + "step": 18940 + }, + { + "epoch": 0.07068627231560022, + "grad_norm": 0.46258753538131714, + "learning_rate": 0.0006, + "loss": 2.1429, + "step": 18950 + }, + { + "epoch": 0.0707235737785636, + "grad_norm": 66.3487548828125, + "learning_rate": 0.0006, + "loss": 2.2374, + "step": 18960 + }, + { + "epoch": 0.07076087524152697, + "grad_norm": 0.33060991764068604, + "learning_rate": 0.0006, + "loss": 2.3059, + "step": 18970 + }, + { + "epoch": 0.07079817670449035, + "grad_norm": 0.3307645916938782, + "learning_rate": 0.0006, + "loss": 2.3074, + "step": 18980 + }, + { + "epoch": 0.07083547816745372, + "grad_norm": 4.7067036628723145, + "learning_rate": 0.0006, + "loss": 2.2091, + "step": 18990 + }, + { + "epoch": 0.0708727796304171, + "grad_norm": 0.3118055462837219, + "learning_rate": 0.0006, + "loss": 2.2779, + "step": 19000 + }, + { + "epoch": 0.0708727796304171, + "eval_valid_loss": 2.203562021255493, + "eval_valid_loss/all": 2.0645389556884766, + "eval_valid_loss/end_span": 1.283961534500122, + "eval_valid_perplexity/batch": 7.8816633224487305, + "eval_valid_perplexity/end_span": 3.6109161376953125, + "eval_valid_perplexity/fim": 2.431058168411255, + "eval_valid_perplexity/first_seq": 14.681017875671387, + "eval_valid_perplexity/last_seq": 8.782727241516113, + "eval_valid_perplexity/second_seq": 13.789268493652344, + "eval_valid_perplexity/seq": 8.877230644226074, + "eval_valid_reconstruction/all": 
0.29101812839508057, + "eval_valid_reconstruction/end_span": 0.7015789747238159, + "eval_valid_reconstruction/fim": 0.17392556369304657, + "eval_valid_reconstruction/first_seq": 0.1722097098827362, + "eval_valid_reconstruction/last_seq": 0.3362840712070465, + "eval_valid_reconstruction/second_seq": 0.19336096942424774, + "eval_valid_runtime": 443.7284, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 19000 + }, + { + "epoch": 0.0708727796304171, + "eval_train_loss": 2.2013156414031982, + "eval_train_loss/all": 2.03580379486084, + "eval_train_loss/end_span": 1.2387621402740479, + "eval_train_perplexity/batch": 7.658405303955078, + "eval_train_perplexity/end_span": 3.451338529586792, + "eval_train_perplexity/fim": 2.1058385372161865, + "eval_train_perplexity/first_seq": 15.483366966247559, + "eval_train_perplexity/last_seq": 8.9310302734375, + "eval_train_perplexity/second_seq": 14.304888725280762, + "eval_train_perplexity/seq": 8.810328483581543, + "eval_train_reconstruction/all": 0.28084036707878113, + "eval_train_reconstruction/end_span": 0.7144591212272644, + "eval_train_reconstruction/fim": 0.14651191234588623, + "eval_train_reconstruction/first_seq": 0.15204308927059174, + "eval_train_reconstruction/last_seq": 0.329267680644989, + "eval_train_reconstruction/second_seq": 0.18270762264728546, + "eval_train_runtime": 441.5854, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 19000 + }, + { + "epoch": 0.07091008109338048, + "grad_norm": 0.3978573679924011, + "learning_rate": 0.0006, + "loss": 2.1718, + "step": 19010 + }, + { + "epoch": 0.07094738255634386, + "grad_norm": 0.36573415994644165, + "learning_rate": 0.0006, + "loss": 2.3548, + "step": 19020 + }, + { + "epoch": 0.07098468401930724, + "grad_norm": 0.28348520398139954, + "learning_rate": 0.0006, + "loss": 2.2188, + "step": 19030 + }, + { + "epoch": 0.07102198548227061, + "grad_norm": 0.2443002164363861, + "learning_rate": 
0.0006, + "loss": 2.184, + "step": 19040 + }, + { + "epoch": 0.07105928694523399, + "grad_norm": 0.3999202847480774, + "learning_rate": 0.0006, + "loss": 2.0823, + "step": 19050 + }, + { + "epoch": 0.07109658840819737, + "grad_norm": 0.5695214867591858, + "learning_rate": 0.0006, + "loss": 2.3498, + "step": 19060 + }, + { + "epoch": 0.07113388987116075, + "grad_norm": 0.3835121691226959, + "learning_rate": 0.0006, + "loss": 2.0555, + "step": 19070 + }, + { + "epoch": 0.07117119133412413, + "grad_norm": 0.3607633709907532, + "learning_rate": 0.0006, + "loss": 2.202, + "step": 19080 + }, + { + "epoch": 0.0712084927970875, + "grad_norm": 0.29053235054016113, + "learning_rate": 0.0006, + "loss": 2.1224, + "step": 19090 + }, + { + "epoch": 0.07124579426005088, + "grad_norm": 0.4597359597682953, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 19100 + }, + { + "epoch": 0.07128309572301425, + "grad_norm": 0.3167975842952728, + "learning_rate": 0.0006, + "loss": 2.229, + "step": 19110 + }, + { + "epoch": 0.07132039718597763, + "grad_norm": 0.2936479449272156, + "learning_rate": 0.0006, + "loss": 2.2176, + "step": 19120 + }, + { + "epoch": 0.07135769864894101, + "grad_norm": 0.4514843821525574, + "learning_rate": 0.0006, + "loss": 2.318, + "step": 19130 + }, + { + "epoch": 0.07139500011190439, + "grad_norm": 0.3532397747039795, + "learning_rate": 0.0006, + "loss": 2.2913, + "step": 19140 + }, + { + "epoch": 0.07143230157486777, + "grad_norm": 0.30686578154563904, + "learning_rate": 0.0006, + "loss": 2.1694, + "step": 19150 + }, + { + "epoch": 0.07146960303783115, + "grad_norm": 0.3092157244682312, + "learning_rate": 0.0006, + "loss": 2.3099, + "step": 19160 + }, + { + "epoch": 0.07150690450079453, + "grad_norm": 0.46181511878967285, + "learning_rate": 0.0006, + "loss": 2.2913, + "step": 19170 + }, + { + "epoch": 0.07154420596375789, + "grad_norm": 0.49155285954475403, + "learning_rate": 0.0006, + "loss": 2.1811, + "step": 19180 + }, + { + "epoch": 0.07158150742672127, 
+ "grad_norm": 2.006441116333008, + "learning_rate": 0.0006, + "loss": 2.331, + "step": 19190 + }, + { + "epoch": 0.07161880888968465, + "grad_norm": 0.37267202138900757, + "learning_rate": 0.0006, + "loss": 2.1964, + "step": 19200 + }, + { + "epoch": 0.07165611035264803, + "grad_norm": 0.3454102575778961, + "learning_rate": 0.0006, + "loss": 2.1652, + "step": 19210 + }, + { + "epoch": 0.07169341181561141, + "grad_norm": 0.33908215165138245, + "learning_rate": 0.0006, + "loss": 2.3079, + "step": 19220 + }, + { + "epoch": 0.07173071327857479, + "grad_norm": 0.25000908970832825, + "learning_rate": 0.0006, + "loss": 2.2381, + "step": 19230 + }, + { + "epoch": 0.07176801474153817, + "grad_norm": 0.23822802305221558, + "learning_rate": 0.0006, + "loss": 2.0867, + "step": 19240 + }, + { + "epoch": 0.07180531620450153, + "grad_norm": 0.30579549074172974, + "learning_rate": 0.0006, + "loss": 2.261, + "step": 19250 + }, + { + "epoch": 0.07180531620450153, + "eval_valid_loss": 2.214451789855957, + "eval_valid_loss/all": 2.0757033824920654, + "eval_valid_loss/end_span": 1.4417310953140259, + "eval_valid_perplexity/batch": 7.970150470733643, + "eval_valid_perplexity/end_span": 4.22800874710083, + "eval_valid_perplexity/fim": 2.499366283416748, + "eval_valid_perplexity/first_seq": 14.801794052124023, + "eval_valid_perplexity/last_seq": 9.505407333374023, + "eval_valid_perplexity/second_seq": 13.946250915527344, + "eval_valid_perplexity/seq": 8.998655319213867, + "eval_valid_reconstruction/all": 0.28854817152023315, + "eval_valid_reconstruction/end_span": 0.6796367168426514, + "eval_valid_reconstruction/fim": 0.1776282787322998, + "eval_valid_reconstruction/first_seq": 0.1698693484067917, + "eval_valid_reconstruction/last_seq": 0.31140926480293274, + "eval_valid_reconstruction/second_seq": 0.191496342420578, + "eval_valid_runtime": 438.6957, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 19250 + }, + { + "epoch": 0.07180531620450153, 
+ "eval_train_loss": 2.210818290710449, + "eval_train_loss/all": 2.045069694519043, + "eval_train_loss/end_span": 1.370963454246521, + "eval_train_perplexity/batch": 7.729697227478027, + "eval_train_perplexity/end_span": 3.9391441345214844, + "eval_train_perplexity/fim": 1.9894185066223145, + "eval_train_perplexity/first_seq": 15.611580848693848, + "eval_train_perplexity/last_seq": 9.055437088012695, + "eval_train_perplexity/second_seq": 14.358689308166504, + "eval_train_perplexity/seq": 8.910304069519043, + "eval_train_reconstruction/all": 0.27852359414100647, + "eval_train_reconstruction/end_span": 0.6934024691581726, + "eval_train_reconstruction/fim": 0.13415426015853882, + "eval_train_reconstruction/first_seq": 0.15167737007141113, + "eval_train_reconstruction/last_seq": 0.3231690227985382, + "eval_train_reconstruction/second_seq": 0.183973491191864, + "eval_train_runtime": 433.9273, + "eval_train_samples_per_second": 0.442, + "eval_train_steps_per_second": 0.442, + "step": 19250 + }, + { + "epoch": 0.07184261766746491, + "grad_norm": 0.4351430833339691, + "learning_rate": 0.0006, + "loss": 2.2914, + "step": 19260 + }, + { + "epoch": 0.0718799191304283, + "grad_norm": 0.412994921207428, + "learning_rate": 0.0006, + "loss": 2.3365, + "step": 19270 + }, + { + "epoch": 0.07191722059339167, + "grad_norm": 0.32911431789398193, + "learning_rate": 0.0006, + "loss": 2.3425, + "step": 19280 + }, + { + "epoch": 0.07195452205635505, + "grad_norm": 0.7754552960395813, + "learning_rate": 0.0006, + "loss": 2.2391, + "step": 19290 + }, + { + "epoch": 0.07199182351931843, + "grad_norm": 0.3202420473098755, + "learning_rate": 0.0006, + "loss": 2.2357, + "step": 19300 + }, + { + "epoch": 0.07202912498228181, + "grad_norm": 0.3107485771179199, + "learning_rate": 0.0006, + "loss": 2.2234, + "step": 19310 + }, + { + "epoch": 0.07206642644524518, + "grad_norm": 0.3028591275215149, + "learning_rate": 0.0006, + "loss": 2.1548, + "step": 19320 + }, + { + "epoch": 0.07210372790820856, + 
"grad_norm": 0.5087175369262695, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 19330 + }, + { + "epoch": 0.07214102937117194, + "grad_norm": 0.3016623854637146, + "learning_rate": 0.0006, + "loss": 2.0921, + "step": 19340 + }, + { + "epoch": 0.07217833083413532, + "grad_norm": 0.3543054759502411, + "learning_rate": 0.0006, + "loss": 2.3936, + "step": 19350 + }, + { + "epoch": 0.0722156322970987, + "grad_norm": 0.29812753200531006, + "learning_rate": 0.0006, + "loss": 2.3996, + "step": 19360 + }, + { + "epoch": 0.07225293376006207, + "grad_norm": 0.5087674856185913, + "learning_rate": 0.0006, + "loss": 2.1978, + "step": 19370 + }, + { + "epoch": 0.07229023522302545, + "grad_norm": 0.4418676197528839, + "learning_rate": 0.0006, + "loss": 2.1978, + "step": 19380 + }, + { + "epoch": 0.07232753668598882, + "grad_norm": 0.6610691547393799, + "learning_rate": 0.0006, + "loss": 2.1652, + "step": 19390 + }, + { + "epoch": 0.0723648381489522, + "grad_norm": 0.3318582773208618, + "learning_rate": 0.0006, + "loss": 2.2998, + "step": 19400 + }, + { + "epoch": 0.07240213961191558, + "grad_norm": 0.35804927349090576, + "learning_rate": 0.0006, + "loss": 2.2284, + "step": 19410 + }, + { + "epoch": 0.07243944107487896, + "grad_norm": 0.4223398268222809, + "learning_rate": 0.0006, + "loss": 2.1922, + "step": 19420 + }, + { + "epoch": 0.07247674253784234, + "grad_norm": 0.34437429904937744, + "learning_rate": 0.0006, + "loss": 2.2697, + "step": 19430 + }, + { + "epoch": 0.07251404400080572, + "grad_norm": 0.2826463282108307, + "learning_rate": 0.0006, + "loss": 2.2823, + "step": 19440 + }, + { + "epoch": 0.0725513454637691, + "grad_norm": 0.2638222873210907, + "learning_rate": 0.0006, + "loss": 2.0907, + "step": 19450 + }, + { + "epoch": 0.07258864692673246, + "grad_norm": 0.24040374159812927, + "learning_rate": 0.0006, + "loss": 2.1953, + "step": 19460 + }, + { + "epoch": 0.07262594838969584, + "grad_norm": 0.4006545841693878, + "learning_rate": 0.0006, + "loss": 2.1957, + 
"step": 19470 + }, + { + "epoch": 0.07266324985265922, + "grad_norm": 0.602562665939331, + "learning_rate": 0.0006, + "loss": 2.2279, + "step": 19480 + }, + { + "epoch": 0.0727005513156226, + "grad_norm": 0.3253021836280823, + "learning_rate": 0.0006, + "loss": 2.2585, + "step": 19490 + }, + { + "epoch": 0.07273785277858598, + "grad_norm": 0.32018768787384033, + "learning_rate": 0.0006, + "loss": 2.1292, + "step": 19500 + }, + { + "epoch": 0.07273785277858598, + "eval_valid_loss": 2.204693555831909, + "eval_valid_loss/all": 2.0662622451782227, + "eval_valid_loss/end_span": 1.4502620697021484, + "eval_valid_perplexity/batch": 7.895257472991943, + "eval_valid_perplexity/end_span": 4.2642316818237305, + "eval_valid_perplexity/fim": 2.166198968887329, + "eval_valid_perplexity/first_seq": 14.765820503234863, + "eval_valid_perplexity/last_seq": 8.992956161499023, + "eval_valid_perplexity/second_seq": 13.504597663879395, + "eval_valid_perplexity/seq": 8.897915840148926, + "eval_valid_reconstruction/all": 0.2902951240539551, + "eval_valid_reconstruction/end_span": 0.6640446186065674, + "eval_valid_reconstruction/fim": 0.15209895372390747, + "eval_valid_reconstruction/first_seq": 0.16817691922187805, + "eval_valid_reconstruction/last_seq": 0.3268628716468811, + "eval_valid_reconstruction/second_seq": 0.1993161141872406, + "eval_valid_runtime": 438.4809, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 19500 + }, + { + "epoch": 0.07273785277858598, + "eval_train_loss": 2.2003676891326904, + "eval_train_loss/all": 2.035033702850342, + "eval_train_loss/end_span": 1.4098132848739624, + "eval_train_perplexity/batch": 7.652510166168213, + "eval_train_perplexity/end_span": 4.095190525054932, + "eval_train_perplexity/fim": 1.9954041242599487, + "eval_train_perplexity/first_seq": 15.500394821166992, + "eval_train_perplexity/last_seq": 8.947970390319824, + "eval_train_perplexity/second_seq": 14.234142303466797, + "eval_train_perplexity/seq": 
8.805912017822266, + "eval_train_reconstruction/all": 0.28094911575317383, + "eval_train_reconstruction/end_span": 0.6765978932380676, + "eval_train_reconstruction/fim": 0.13682657480239868, + "eval_train_reconstruction/first_seq": 0.1520499587059021, + "eval_train_reconstruction/last_seq": 0.32811734080314636, + "eval_train_reconstruction/second_seq": 0.18400610983371735, + "eval_train_runtime": 438.0463, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 19500 + }, + { + "epoch": 0.07277515424154936, + "grad_norm": 0.296619176864624, + "learning_rate": 0.0006, + "loss": 2.1594, + "step": 19510 + }, + { + "epoch": 0.07281245570451272, + "grad_norm": 0.4463210105895996, + "learning_rate": 0.0006, + "loss": 2.0757, + "step": 19520 + }, + { + "epoch": 0.0728497571674761, + "grad_norm": 0.3320896625518799, + "learning_rate": 0.0006, + "loss": 2.3925, + "step": 19530 + }, + { + "epoch": 0.07288705863043948, + "grad_norm": 0.3265852928161621, + "learning_rate": 0.0006, + "loss": 2.2434, + "step": 19540 + }, + { + "epoch": 0.07292436009340286, + "grad_norm": 0.4063425660133362, + "learning_rate": 0.0006, + "loss": 2.3762, + "step": 19550 + }, + { + "epoch": 0.07296166155636624, + "grad_norm": 0.34964215755462646, + "learning_rate": 0.0006, + "loss": 2.2862, + "step": 19560 + }, + { + "epoch": 0.07299896301932962, + "grad_norm": 0.38222312927246094, + "learning_rate": 0.0006, + "loss": 2.1516, + "step": 19570 + }, + { + "epoch": 0.073036264482293, + "grad_norm": 0.5383628606796265, + "learning_rate": 0.0006, + "loss": 2.3163, + "step": 19580 + }, + { + "epoch": 0.07307356594525637, + "grad_norm": 0.21230870485305786, + "learning_rate": 0.0006, + "loss": 2.2223, + "step": 19590 + }, + { + "epoch": 0.07311086740821975, + "grad_norm": 0.7448016405105591, + "learning_rate": 0.0006, + "loss": 2.2675, + "step": 19600 + }, + { + "epoch": 0.07314816887118313, + "grad_norm": 0.42028000950813293, + "learning_rate": 0.0006, + "loss": 2.2101, 
+ "step": 19610 + }, + { + "epoch": 0.0731854703341465, + "grad_norm": 0.39149293303489685, + "learning_rate": 0.0006, + "loss": 2.3216, + "step": 19620 + }, + { + "epoch": 0.07322277179710988, + "grad_norm": 0.4473921060562134, + "learning_rate": 0.0006, + "loss": 2.1044, + "step": 19630 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 0.3700016736984253, + "learning_rate": 0.0006, + "loss": 2.3688, + "step": 19640 + }, + { + "epoch": 0.07329737472303664, + "grad_norm": 0.2673030495643616, + "learning_rate": 0.0006, + "loss": 2.1755, + "step": 19650 + }, + { + "epoch": 0.07333467618600001, + "grad_norm": 0.34864288568496704, + "learning_rate": 0.0006, + "loss": 2.0856, + "step": 19660 + }, + { + "epoch": 0.07337197764896339, + "grad_norm": 0.458988755941391, + "learning_rate": 0.0006, + "loss": 2.2786, + "step": 19670 + }, + { + "epoch": 0.07340927911192677, + "grad_norm": 0.4346422255039215, + "learning_rate": 0.0006, + "loss": 2.1532, + "step": 19680 + }, + { + "epoch": 0.07344658057489015, + "grad_norm": 0.434661328792572, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 19690 + }, + { + "epoch": 0.07348388203785353, + "grad_norm": 0.4151219427585602, + "learning_rate": 0.0006, + "loss": 2.4172, + "step": 19700 + }, + { + "epoch": 0.0735211835008169, + "grad_norm": 0.31244856119155884, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 19710 + }, + { + "epoch": 0.07355848496378029, + "grad_norm": 0.4042627215385437, + "learning_rate": 0.0006, + "loss": 2.2677, + "step": 19720 + }, + { + "epoch": 0.07359578642674365, + "grad_norm": 0.24911600351333618, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 19730 + }, + { + "epoch": 0.07363308788970703, + "grad_norm": 0.42040205001831055, + "learning_rate": 0.0006, + "loss": 2.1553, + "step": 19740 + }, + { + "epoch": 0.07367038935267041, + "grad_norm": 0.20605123043060303, + "learning_rate": 0.0006, + "loss": 2.2318, + "step": 19750 + }, + { + "epoch": 0.07367038935267041, + "eval_valid_loss": 
2.2055671215057373, + "eval_valid_loss/all": 2.06667160987854, + "eval_valid_loss/end_span": 1.2213637828826904, + "eval_valid_perplexity/batch": 7.898489952087402, + "eval_valid_perplexity/end_span": 3.391810178756714, + "eval_valid_perplexity/fim": 2.1455397605895996, + "eval_valid_perplexity/first_seq": 14.992517471313477, + "eval_valid_perplexity/last_seq": 9.022255897521973, + "eval_valid_perplexity/second_seq": 13.74503231048584, + "eval_valid_perplexity/seq": 8.89660358428955, + "eval_valid_reconstruction/all": 0.29061180353164673, + "eval_valid_reconstruction/end_span": 0.712500274181366, + "eval_valid_reconstruction/fim": 0.14963436126708984, + "eval_valid_reconstruction/first_seq": 0.16557854413986206, + "eval_valid_reconstruction/last_seq": 0.32665979862213135, + "eval_valid_reconstruction/second_seq": 0.1936480700969696, + "eval_valid_runtime": 438.3442, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 19750 + }, + { + "epoch": 0.07367038935267041, + "eval_train_loss": 2.201927900314331, + "eval_train_loss/all": 2.0359530448913574, + "eval_train_loss/end_span": 1.184378981590271, + "eval_train_perplexity/batch": 7.659548759460449, + "eval_train_perplexity/end_span": 3.2686562538146973, + "eval_train_perplexity/fim": 2.263960599899292, + "eval_train_perplexity/first_seq": 15.279327392578125, + "eval_train_perplexity/last_seq": 8.976593017578125, + "eval_train_perplexity/second_seq": 14.310694694519043, + "eval_train_perplexity/seq": 8.807610511779785, + "eval_train_reconstruction/all": 0.2809242308139801, + "eval_train_reconstruction/end_span": 0.7228122353553772, + "eval_train_reconstruction/fim": 0.16080620884895325, + "eval_train_reconstruction/first_seq": 0.16042836010456085, + "eval_train_reconstruction/last_seq": 0.32508254051208496, + "eval_train_reconstruction/second_seq": 0.17982955276966095, + "eval_train_runtime": 437.2729, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 
0.439, + "step": 19750 + }, + { + "epoch": 0.07370769081563379, + "grad_norm": 0.6959152221679688, + "learning_rate": 0.0006, + "loss": 2.1904, + "step": 19760 + }, + { + "epoch": 0.07374499227859717, + "grad_norm": 0.45020654797554016, + "learning_rate": 0.0006, + "loss": 2.294, + "step": 19770 + }, + { + "epoch": 0.07378229374156055, + "grad_norm": 0.48432984948158264, + "learning_rate": 0.0006, + "loss": 2.1146, + "step": 19780 + }, + { + "epoch": 0.07381959520452393, + "grad_norm": 0.2676103413105011, + "learning_rate": 0.0006, + "loss": 2.2234, + "step": 19790 + }, + { + "epoch": 0.07385689666748729, + "grad_norm": 0.347054660320282, + "learning_rate": 0.0006, + "loss": 2.3143, + "step": 19800 + }, + { + "epoch": 0.07389419813045067, + "grad_norm": 0.43490567803382874, + "learning_rate": 0.0006, + "loss": 2.0286, + "step": 19810 + }, + { + "epoch": 0.07393149959341405, + "grad_norm": 0.3797460198402405, + "learning_rate": 0.0006, + "loss": 2.0317, + "step": 19820 + }, + { + "epoch": 0.07396880105637743, + "grad_norm": 0.40680935978889465, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 19830 + }, + { + "epoch": 0.07400610251934081, + "grad_norm": 0.42112693190574646, + "learning_rate": 0.0006, + "loss": 2.2815, + "step": 19840 + }, + { + "epoch": 0.07404340398230419, + "grad_norm": 0.3572753071784973, + "learning_rate": 0.0006, + "loss": 2.113, + "step": 19850 + }, + { + "epoch": 0.07408070544526757, + "grad_norm": 0.36063557863235474, + "learning_rate": 0.0006, + "loss": 2.2382, + "step": 19860 + }, + { + "epoch": 0.07411800690823094, + "grad_norm": 0.3130436837673187, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 19870 + }, + { + "epoch": 0.07415530837119431, + "grad_norm": 0.2955836057662964, + "learning_rate": 0.0006, + "loss": 2.1806, + "step": 19880 + }, + { + "epoch": 0.0741926098341577, + "grad_norm": 0.39103198051452637, + "learning_rate": 0.0006, + "loss": 2.2727, + "step": 19890 + }, + { + "epoch": 0.07422991129712107, + "grad_norm": 
0.3119851350784302, + "learning_rate": 0.0006, + "loss": 2.265, + "step": 19900 + }, + { + "epoch": 0.07426721276008445, + "grad_norm": 0.3067319095134735, + "learning_rate": 0.0006, + "loss": 2.1277, + "step": 19910 + }, + { + "epoch": 0.07430451422304783, + "grad_norm": 0.38696199655532837, + "learning_rate": 0.0006, + "loss": 2.2169, + "step": 19920 + }, + { + "epoch": 0.07434181568601121, + "grad_norm": 0.39403852820396423, + "learning_rate": 0.0006, + "loss": 2.0838, + "step": 19930 + }, + { + "epoch": 0.07437911714897458, + "grad_norm": 0.2953088879585266, + "learning_rate": 0.0006, + "loss": 2.1543, + "step": 19940 + }, + { + "epoch": 0.07441641861193796, + "grad_norm": 0.27476492524147034, + "learning_rate": 0.0006, + "loss": 2.2083, + "step": 19950 + }, + { + "epoch": 0.07445372007490134, + "grad_norm": 0.4667927920818329, + "learning_rate": 0.0006, + "loss": 2.2501, + "step": 19960 + }, + { + "epoch": 0.07449102153786472, + "grad_norm": 0.2908404469490051, + "learning_rate": 0.0006, + "loss": 2.3354, + "step": 19970 + }, + { + "epoch": 0.0745283230008281, + "grad_norm": 0.3784105181694031, + "learning_rate": 0.0006, + "loss": 2.1035, + "step": 19980 + }, + { + "epoch": 0.07456562446379147, + "grad_norm": 0.4002230763435364, + "learning_rate": 0.0006, + "loss": 2.2626, + "step": 19990 + }, + { + "epoch": 0.07460292592675485, + "grad_norm": 0.4575248658657074, + "learning_rate": 0.0006, + "loss": 2.3033, + "step": 20000 + }, + { + "epoch": 0.07460292592675485, + "eval_valid_loss": 2.204876661300659, + "eval_valid_loss/all": 2.0665786266326904, + "eval_valid_loss/end_span": 1.309346079826355, + "eval_valid_perplexity/batch": 7.8977556228637695, + "eval_valid_perplexity/end_span": 3.7037508487701416, + "eval_valid_perplexity/fim": 2.4637465476989746, + "eval_valid_perplexity/first_seq": 14.558225631713867, + "eval_valid_perplexity/last_seq": 8.676453590393066, + "eval_valid_perplexity/second_seq": 13.324210166931152, + "eval_valid_perplexity/seq": 
8.900126457214355, + "eval_valid_reconstruction/all": 0.29079166054725647, + "eval_valid_reconstruction/end_span": 0.6953045129776001, + "eval_valid_reconstruction/fim": 0.17771482467651367, + "eval_valid_reconstruction/first_seq": 0.17660865187644958, + "eval_valid_reconstruction/last_seq": 0.3401395082473755, + "eval_valid_reconstruction/second_seq": 0.20956042408943176, + "eval_valid_runtime": 437.0226, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 20000 + }, + { + "epoch": 0.07460292592675485, + "eval_train_loss": 2.203460931777954, + "eval_train_loss/all": 2.0383212566375732, + "eval_train_loss/end_span": 1.2803579568862915, + "eval_train_perplexity/batch": 7.677709579467773, + "eval_train_perplexity/end_span": 3.5979273319244385, + "eval_train_perplexity/fim": 2.58758282661438, + "eval_train_perplexity/first_seq": 15.68045711517334, + "eval_train_perplexity/last_seq": 8.364130973815918, + "eval_train_perplexity/second_seq": 14.202091217041016, + "eval_train_perplexity/seq": 8.839497566223145, + "eval_train_reconstruction/all": 0.2802712023258209, + "eval_train_reconstruction/end_span": 0.7072843909263611, + "eval_train_reconstruction/fim": 0.18641288578510284, + "eval_train_reconstruction/first_seq": 0.14774101972579956, + "eval_train_reconstruction/last_seq": 0.3506273627281189, + "eval_train_reconstruction/second_seq": 0.1805422306060791, + "eval_train_runtime": 438.3831, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 20000 + }, + { + "epoch": 0.07464022738971822, + "grad_norm": 0.4865427017211914, + "learning_rate": 0.0006, + "loss": 1.9957, + "step": 20010 + }, + { + "epoch": 0.0746775288526816, + "grad_norm": 0.3547218441963196, + "learning_rate": 0.0006, + "loss": 2.303, + "step": 20020 + }, + { + "epoch": 0.07471483031564498, + "grad_norm": 0.45891544222831726, + "learning_rate": 0.0006, + "loss": 2.1025, + "step": 20030 + }, + { + "epoch": 0.07475213177860836, + 
"grad_norm": 0.556763768196106, + "learning_rate": 0.0006, + "loss": 2.2021, + "step": 20040 + }, + { + "epoch": 0.07478943324157174, + "grad_norm": 0.35524114966392517, + "learning_rate": 0.0006, + "loss": 2.1606, + "step": 20050 + }, + { + "epoch": 0.07482673470453512, + "grad_norm": 0.309732586145401, + "learning_rate": 0.0006, + "loss": 2.2621, + "step": 20060 + }, + { + "epoch": 0.0748640361674985, + "grad_norm": 0.3610772490501404, + "learning_rate": 0.0006, + "loss": 2.169, + "step": 20070 + }, + { + "epoch": 0.07490133763046186, + "grad_norm": 0.36683183908462524, + "learning_rate": 0.0006, + "loss": 2.319, + "step": 20080 + }, + { + "epoch": 0.07493863909342524, + "grad_norm": 0.37566137313842773, + "learning_rate": 0.0006, + "loss": 2.2926, + "step": 20090 + }, + { + "epoch": 0.07497594055638862, + "grad_norm": 0.28001144528388977, + "learning_rate": 0.0006, + "loss": 2.1232, + "step": 20100 + }, + { + "epoch": 0.075013242019352, + "grad_norm": 0.3588540852069855, + "learning_rate": 0.0006, + "loss": 2.2145, + "step": 20110 + }, + { + "epoch": 0.07505054348231538, + "grad_norm": 0.30391696095466614, + "learning_rate": 0.0006, + "loss": 2.3541, + "step": 20120 + }, + { + "epoch": 0.07508784494527876, + "grad_norm": 0.2721695899963379, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 20130 + }, + { + "epoch": 0.07512514640824212, + "grad_norm": 0.4675159752368927, + "learning_rate": 0.0006, + "loss": 1.9882, + "step": 20140 + }, + { + "epoch": 0.0751624478712055, + "grad_norm": 0.2646726667881012, + "learning_rate": 0.0006, + "loss": 2.1449, + "step": 20150 + }, + { + "epoch": 0.07519974933416888, + "grad_norm": 0.25743287801742554, + "learning_rate": 0.0006, + "loss": 2.1982, + "step": 20160 + }, + { + "epoch": 0.07523705079713226, + "grad_norm": 0.3346676230430603, + "learning_rate": 0.0006, + "loss": 2.1191, + "step": 20170 + }, + { + "epoch": 0.07527435226009564, + "grad_norm": 0.547634482383728, + "learning_rate": 0.0006, + "loss": 2.1633, + 
"step": 20180 + }, + { + "epoch": 0.07531165372305902, + "grad_norm": 0.3406450152397156, + "learning_rate": 0.0006, + "loss": 2.127, + "step": 20190 + }, + { + "epoch": 0.0753489551860224, + "grad_norm": 0.3657689690589905, + "learning_rate": 0.0006, + "loss": 2.0018, + "step": 20200 + }, + { + "epoch": 0.07538625664898577, + "grad_norm": 0.3908751606941223, + "learning_rate": 0.0006, + "loss": 2.2543, + "step": 20210 + }, + { + "epoch": 0.07542355811194915, + "grad_norm": 0.5571268796920776, + "learning_rate": 0.0006, + "loss": 2.1559, + "step": 20220 + }, + { + "epoch": 0.07546085957491253, + "grad_norm": 1.9713027477264404, + "learning_rate": 0.0006, + "loss": 2.3028, + "step": 20230 + }, + { + "epoch": 0.0754981610378759, + "grad_norm": 0.2825464904308319, + "learning_rate": 0.0006, + "loss": 2.2008, + "step": 20240 + }, + { + "epoch": 0.07553546250083928, + "grad_norm": 0.38932037353515625, + "learning_rate": 0.0006, + "loss": 2.2316, + "step": 20250 + }, + { + "epoch": 0.07553546250083928, + "eval_valid_loss": 2.2072341442108154, + "eval_valid_loss/all": 2.0688226222991943, + "eval_valid_loss/end_span": 1.2321404218673706, + "eval_valid_perplexity/batch": 7.91549825668335, + "eval_valid_perplexity/end_span": 3.428560256958008, + "eval_valid_perplexity/fim": 2.1352787017822266, + "eval_valid_perplexity/first_seq": 15.153874397277832, + "eval_valid_perplexity/last_seq": 8.842114448547363, + "eval_valid_perplexity/second_seq": 13.95496940612793, + "eval_valid_perplexity/seq": 8.921995162963867, + "eval_valid_reconstruction/all": 0.2899520695209503, + "eval_valid_reconstruction/end_span": 0.707033097743988, + "eval_valid_reconstruction/fim": 0.1487504243850708, + "eval_valid_reconstruction/first_seq": 0.16017699241638184, + "eval_valid_reconstruction/last_seq": 0.33017539978027344, + "eval_valid_reconstruction/second_seq": 0.18926197290420532, + "eval_valid_runtime": 437.1326, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + 
"step": 20250 + }, + { + "epoch": 0.07553546250083928, + "eval_train_loss": 2.203810453414917, + "eval_train_loss/all": 2.0385053157806396, + "eval_train_loss/end_span": 1.200669288635254, + "eval_train_perplexity/batch": 7.6791229248046875, + "eval_train_perplexity/end_span": 3.3223397731781006, + "eval_train_perplexity/fim": 2.1116364002227783, + "eval_train_perplexity/first_seq": 15.62664794921875, + "eval_train_perplexity/last_seq": 9.019941329956055, + "eval_train_perplexity/second_seq": 14.343206405639648, + "eval_train_perplexity/seq": 8.84354305267334, + "eval_train_reconstruction/all": 0.28023332357406616, + "eval_train_reconstruction/end_span": 0.7176878452301025, + "eval_train_reconstruction/fim": 0.14715933799743652, + "eval_train_reconstruction/first_seq": 0.14929436147212982, + "eval_train_reconstruction/last_seq": 0.32709166407585144, + "eval_train_reconstruction/second_seq": 0.18111875653266907, + "eval_train_runtime": 439.4857, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 20250 + }, + { + "epoch": 0.07557276396380266, + "grad_norm": 0.27639055252075195, + "learning_rate": 0.0006, + "loss": 2.1993, + "step": 20260 + }, + { + "epoch": 0.07561006542676604, + "grad_norm": 0.36597761511802673, + "learning_rate": 0.0006, + "loss": 2.1551, + "step": 20270 + }, + { + "epoch": 0.07564736688972941, + "grad_norm": 0.4164372980594635, + "learning_rate": 0.0006, + "loss": 2.3682, + "step": 20280 + }, + { + "epoch": 0.07568466835269279, + "grad_norm": 0.3923951983451843, + "learning_rate": 0.0006, + "loss": 2.1403, + "step": 20290 + }, + { + "epoch": 0.07572196981565617, + "grad_norm": 0.33674705028533936, + "learning_rate": 0.0006, + "loss": 2.1847, + "step": 20300 + }, + { + "epoch": 0.07575927127861955, + "grad_norm": 0.5353322625160217, + "learning_rate": 0.0006, + "loss": 2.1839, + "step": 20310 + }, + { + "epoch": 0.07579657274158293, + "grad_norm": 3.3512752056121826, + "learning_rate": 0.0006, + "loss": 
2.1059, + "step": 20320 + }, + { + "epoch": 0.0758338742045463, + "grad_norm": 0.36969050765037537, + "learning_rate": 0.0006, + "loss": 2.0387, + "step": 20330 + }, + { + "epoch": 0.07587117566750969, + "grad_norm": 0.3276326358318329, + "learning_rate": 0.0006, + "loss": 2.3217, + "step": 20340 + }, + { + "epoch": 0.07590847713047305, + "grad_norm": 0.37636566162109375, + "learning_rate": 0.0006, + "loss": 2.1147, + "step": 20350 + }, + { + "epoch": 0.07594577859343643, + "grad_norm": 0.35967516899108887, + "learning_rate": 0.0006, + "loss": 2.1733, + "step": 20360 + }, + { + "epoch": 0.07598308005639981, + "grad_norm": 0.4410649538040161, + "learning_rate": 0.0006, + "loss": 2.2229, + "step": 20370 + }, + { + "epoch": 0.07602038151936319, + "grad_norm": 0.33467522263526917, + "learning_rate": 0.0006, + "loss": 2.1045, + "step": 20380 + }, + { + "epoch": 0.07605768298232657, + "grad_norm": 0.3778712749481201, + "learning_rate": 0.0006, + "loss": 2.2633, + "step": 20390 + }, + { + "epoch": 0.07609498444528995, + "grad_norm": 0.46993520855903625, + "learning_rate": 0.0006, + "loss": 2.2887, + "step": 20400 + }, + { + "epoch": 0.07613228590825333, + "grad_norm": 0.31059902906417847, + "learning_rate": 0.0006, + "loss": 2.1348, + "step": 20410 + }, + { + "epoch": 0.0761695873712167, + "grad_norm": 0.2863432765007019, + "learning_rate": 0.0006, + "loss": 2.2971, + "step": 20420 + }, + { + "epoch": 0.07620688883418007, + "grad_norm": 0.3676382899284363, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 20430 + }, + { + "epoch": 0.07624419029714345, + "grad_norm": 0.4108024537563324, + "learning_rate": 0.0006, + "loss": 2.3294, + "step": 20440 + }, + { + "epoch": 0.07628149176010683, + "grad_norm": 0.4851938486099243, + "learning_rate": 0.0006, + "loss": 2.2191, + "step": 20450 + }, + { + "epoch": 0.07631879322307021, + "grad_norm": 0.4267195761203766, + "learning_rate": 0.0006, + "loss": 2.2112, + "step": 20460 + }, + { + "epoch": 0.07635609468603359, + 
"grad_norm": 0.4440667927265167, + "learning_rate": 0.0006, + "loss": 2.2066, + "step": 20470 + }, + { + "epoch": 0.07639339614899697, + "grad_norm": 0.3251100778579712, + "learning_rate": 0.0006, + "loss": 2.2851, + "step": 20480 + }, + { + "epoch": 0.07643069761196034, + "grad_norm": 0.2953893840312958, + "learning_rate": 0.0006, + "loss": 2.2812, + "step": 20490 + }, + { + "epoch": 0.07646799907492371, + "grad_norm": 0.3171637952327728, + "learning_rate": 0.0006, + "loss": 2.092, + "step": 20500 + }, + { + "epoch": 0.07646799907492371, + "eval_valid_loss": 2.203902244567871, + "eval_valid_loss/all": 2.065427541732788, + "eval_valid_loss/end_span": 1.212340235710144, + "eval_valid_perplexity/batch": 7.888669967651367, + "eval_valid_perplexity/end_span": 3.361341714859009, + "eval_valid_perplexity/fim": 2.441906452178955, + "eval_valid_perplexity/first_seq": 14.801271438598633, + "eval_valid_perplexity/last_seq": 8.861023902893066, + "eval_valid_perplexity/second_seq": 13.44740104675293, + "eval_valid_perplexity/seq": 8.891554832458496, + "eval_valid_reconstruction/all": 0.2907044291496277, + "eval_valid_reconstruction/end_span": 0.7207106947898865, + "eval_valid_reconstruction/fim": 0.17573796212673187, + "eval_valid_reconstruction/first_seq": 0.16987761855125427, + "eval_valid_reconstruction/last_seq": 0.33176863193511963, + "eval_valid_reconstruction/second_seq": 0.2037733942270279, + "eval_valid_runtime": 439.6713, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 20500 + }, + { + "epoch": 0.07646799907492371, + "eval_train_loss": 2.20145583152771, + "eval_train_loss/all": 2.0359504222869873, + "eval_train_loss/end_span": 1.1626349687576294, + "eval_train_perplexity/batch": 7.6595282554626465, + "eval_train_perplexity/end_span": 3.198349714279175, + "eval_train_perplexity/fim": 2.4194445610046387, + "eval_train_perplexity/first_seq": 15.33916187286377, + "eval_train_perplexity/last_seq": 9.32441234588623, + 
"eval_train_perplexity/second_seq": 14.45250415802002, + "eval_train_perplexity/seq": 8.814362525939941, + "eval_train_reconstruction/all": 0.2808133065700531, + "eval_train_reconstruction/end_span": 0.7385844588279724, + "eval_train_reconstruction/fim": 0.17440740764141083, + "eval_train_reconstruction/first_seq": 0.15651874244213104, + "eval_train_reconstruction/last_seq": 0.31047752499580383, + "eval_train_reconstruction/second_seq": 0.1790585070848465, + "eval_train_runtime": 440.3291, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 20500 + }, + { + "epoch": 0.0765053005378871, + "grad_norm": 0.34475842118263245, + "learning_rate": 0.0006, + "loss": 2.0548, + "step": 20510 + }, + { + "epoch": 0.07654260200085047, + "grad_norm": 0.7814569473266602, + "learning_rate": 0.0006, + "loss": 2.263, + "step": 20520 + }, + { + "epoch": 0.07657990346381385, + "grad_norm": 0.3121640086174011, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 20530 + }, + { + "epoch": 0.07661720492677723, + "grad_norm": 0.43017953634262085, + "learning_rate": 0.0006, + "loss": 2.2925, + "step": 20540 + }, + { + "epoch": 0.07665450638974061, + "grad_norm": 0.32597634196281433, + "learning_rate": 0.0006, + "loss": 2.2186, + "step": 20550 + }, + { + "epoch": 0.07669180785270398, + "grad_norm": 0.4209960401058197, + "learning_rate": 0.0006, + "loss": 2.1759, + "step": 20560 + }, + { + "epoch": 0.07672910931566736, + "grad_norm": 0.4531393349170685, + "learning_rate": 0.0006, + "loss": 2.2607, + "step": 20570 + }, + { + "epoch": 0.07676641077863074, + "grad_norm": 0.31657981872558594, + "learning_rate": 0.0006, + "loss": 2.3357, + "step": 20580 + }, + { + "epoch": 0.07680371224159412, + "grad_norm": 0.3939512372016907, + "learning_rate": 0.0006, + "loss": 2.2738, + "step": 20590 + }, + { + "epoch": 0.0768410137045575, + "grad_norm": 0.3266310691833496, + "learning_rate": 0.0006, + "loss": 2.2851, + "step": 20600 + }, + { + "epoch": 
0.07687831516752087, + "grad_norm": 0.3328496217727661, + "learning_rate": 0.0006, + "loss": 2.1989, + "step": 20610 + }, + { + "epoch": 0.07691561663048425, + "grad_norm": 0.3506198227405548, + "learning_rate": 0.0006, + "loss": 2.1744, + "step": 20620 + }, + { + "epoch": 0.07695291809344762, + "grad_norm": 0.3508453965187073, + "learning_rate": 0.0006, + "loss": 2.1686, + "step": 20630 + }, + { + "epoch": 0.076990219556411, + "grad_norm": 0.465751051902771, + "learning_rate": 0.0006, + "loss": 2.2018, + "step": 20640 + }, + { + "epoch": 0.07702752101937438, + "grad_norm": 0.4617888927459717, + "learning_rate": 0.0006, + "loss": 2.1572, + "step": 20650 + }, + { + "epoch": 0.07706482248233776, + "grad_norm": 0.3633979558944702, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 20660 + }, + { + "epoch": 0.07710212394530114, + "grad_norm": 0.4335210621356964, + "learning_rate": 0.0006, + "loss": 2.2681, + "step": 20670 + }, + { + "epoch": 0.07713942540826452, + "grad_norm": 0.38980832695961, + "learning_rate": 0.0006, + "loss": 2.1566, + "step": 20680 + }, + { + "epoch": 0.0771767268712279, + "grad_norm": 0.3394508361816406, + "learning_rate": 0.0006, + "loss": 2.2192, + "step": 20690 + }, + { + "epoch": 0.07721402833419126, + "grad_norm": 0.2917945683002472, + "learning_rate": 0.0006, + "loss": 2.381, + "step": 20700 + }, + { + "epoch": 0.07725132979715464, + "grad_norm": 0.32472869753837585, + "learning_rate": 0.0006, + "loss": 2.1984, + "step": 20710 + }, + { + "epoch": 0.07728863126011802, + "grad_norm": 0.3487061858177185, + "learning_rate": 0.0006, + "loss": 2.332, + "step": 20720 + }, + { + "epoch": 0.0773259327230814, + "grad_norm": 0.4870343804359436, + "learning_rate": 0.0006, + "loss": 2.2422, + "step": 20730 + }, + { + "epoch": 0.07736323418604478, + "grad_norm": 0.4816491901874542, + "learning_rate": 0.0006, + "loss": 2.1601, + "step": 20740 + }, + { + "epoch": 0.07740053564900816, + "grad_norm": 0.39884012937545776, + "learning_rate": 0.0006, + 
"loss": 2.154, + "step": 20750 + }, + { + "epoch": 0.07740053564900816, + "eval_valid_loss": 2.205946683883667, + "eval_valid_loss/all": 2.0673234462738037, + "eval_valid_loss/end_span": 1.2326160669326782, + "eval_valid_perplexity/batch": 7.903640270233154, + "eval_valid_perplexity/end_span": 3.4301915168762207, + "eval_valid_perplexity/fim": 2.533297061920166, + "eval_valid_perplexity/first_seq": 15.208550453186035, + "eval_valid_perplexity/last_seq": 8.835030555725098, + "eval_valid_perplexity/second_seq": 14.311455726623535, + "eval_valid_perplexity/seq": 8.909985542297363, + "eval_valid_reconstruction/all": 0.29070785641670227, + "eval_valid_reconstruction/end_span": 0.724861741065979, + "eval_valid_reconstruction/fim": 0.18201522529125214, + "eval_valid_reconstruction/first_seq": 0.15993013978004456, + "eval_valid_reconstruction/last_seq": 0.3347350060939789, + "eval_valid_reconstruction/second_seq": 0.1812952607870102, + "eval_valid_runtime": 440.5724, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 20750 + }, + { + "epoch": 0.07740053564900816, + "eval_train_loss": 2.2020576000213623, + "eval_train_loss/all": 2.0366110801696777, + "eval_train_loss/end_span": 1.162888765335083, + "eval_train_perplexity/batch": 7.664590358734131, + "eval_train_perplexity/end_span": 3.1991615295410156, + "eval_train_perplexity/fim": 1.8450416326522827, + "eval_train_perplexity/first_seq": 15.648473739624023, + "eval_train_perplexity/last_seq": 8.743465423583984, + "eval_train_perplexity/second_seq": 14.26180362701416, + "eval_train_perplexity/seq": 8.820669174194336, + "eval_train_reconstruction/all": 0.2807987928390503, + "eval_train_reconstruction/end_span": 0.7414664030075073, + "eval_train_reconstruction/fim": 0.12101425975561142, + "eval_train_reconstruction/first_seq": 0.15040333569049835, + "eval_train_reconstruction/last_seq": 0.33269065618515015, + "eval_train_reconstruction/second_seq": 0.18134254217147827, + 
"eval_train_runtime": 435.0842, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 20750 + }, + { + "epoch": 0.07743783711197152, + "grad_norm": 0.33621522784233093, + "learning_rate": 0.0006, + "loss": 2.3063, + "step": 20760 + }, + { + "epoch": 0.0774751385749349, + "grad_norm": 0.3992060422897339, + "learning_rate": 0.0006, + "loss": 2.2805, + "step": 20770 + }, + { + "epoch": 0.07751244003789828, + "grad_norm": 0.3541458249092102, + "learning_rate": 0.0006, + "loss": 2.0227, + "step": 20780 + }, + { + "epoch": 0.07754974150086166, + "grad_norm": 0.4933648705482483, + "learning_rate": 0.0006, + "loss": 2.2297, + "step": 20790 + }, + { + "epoch": 0.07758704296382504, + "grad_norm": 0.31731149554252625, + "learning_rate": 0.0006, + "loss": 2.2045, + "step": 20800 + }, + { + "epoch": 0.07762434442678842, + "grad_norm": 0.33849117159843445, + "learning_rate": 0.0006, + "loss": 2.3196, + "step": 20810 + }, + { + "epoch": 0.0776616458897518, + "grad_norm": 0.4469691514968872, + "learning_rate": 0.0006, + "loss": 2.2058, + "step": 20820 + }, + { + "epoch": 0.07769894735271517, + "grad_norm": 0.3849935531616211, + "learning_rate": 0.0006, + "loss": 2.2393, + "step": 20830 + }, + { + "epoch": 0.07773624881567855, + "grad_norm": 0.3082422912120819, + "learning_rate": 0.0006, + "loss": 2.0883, + "step": 20840 + }, + { + "epoch": 0.07777355027864193, + "grad_norm": 0.399243026971817, + "learning_rate": 0.0006, + "loss": 2.2957, + "step": 20850 + }, + { + "epoch": 0.0778108517416053, + "grad_norm": 0.4022061824798584, + "learning_rate": 0.0006, + "loss": 2.2141, + "step": 20860 + }, + { + "epoch": 0.07784815320456868, + "grad_norm": 0.34863758087158203, + "learning_rate": 0.0006, + "loss": 2.3361, + "step": 20870 + }, + { + "epoch": 0.07788545466753206, + "grad_norm": 0.3755008280277252, + "learning_rate": 0.0006, + "loss": 2.1243, + "step": 20880 + }, + { + "epoch": 0.07792275613049544, + "grad_norm": 0.4842625558376312, + 
"learning_rate": 0.0006, + "loss": 2.2397, + "step": 20890 + }, + { + "epoch": 0.07796005759345881, + "grad_norm": 0.3555808961391449, + "learning_rate": 0.0006, + "loss": 2.36, + "step": 20900 + }, + { + "epoch": 0.07799735905642219, + "grad_norm": 0.5149843692779541, + "learning_rate": 0.0006, + "loss": 2.0797, + "step": 20910 + }, + { + "epoch": 0.07803466051938557, + "grad_norm": 0.23355206847190857, + "learning_rate": 0.0006, + "loss": 2.301, + "step": 20920 + }, + { + "epoch": 0.07807196198234895, + "grad_norm": 0.26561516523361206, + "learning_rate": 0.0006, + "loss": 2.2237, + "step": 20930 + }, + { + "epoch": 0.07810926344531233, + "grad_norm": 0.21108679473400116, + "learning_rate": 0.0006, + "loss": 2.2117, + "step": 20940 + }, + { + "epoch": 0.0781465649082757, + "grad_norm": 0.4606146216392517, + "learning_rate": 0.0006, + "loss": 2.3238, + "step": 20950 + }, + { + "epoch": 0.07818386637123909, + "grad_norm": 0.3692648112773895, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 20960 + }, + { + "epoch": 0.07822116783420245, + "grad_norm": 0.2946014702320099, + "learning_rate": 0.0006, + "loss": 2.2261, + "step": 20970 + }, + { + "epoch": 0.07825846929716583, + "grad_norm": 0.3360518217086792, + "learning_rate": 0.0006, + "loss": 2.0134, + "step": 20980 + }, + { + "epoch": 0.07829577076012921, + "grad_norm": 0.4462694823741913, + "learning_rate": 0.0006, + "loss": 2.1258, + "step": 20990 + }, + { + "epoch": 0.07833307222309259, + "grad_norm": 2.2985894680023193, + "learning_rate": 0.0006, + "loss": 2.0706, + "step": 21000 + }, + { + "epoch": 0.07833307222309259, + "eval_valid_loss": 2.2079896926879883, + "eval_valid_loss/all": 2.069401264190674, + "eval_valid_loss/end_span": 1.2818390130996704, + "eval_valid_perplexity/batch": 7.920079708099365, + "eval_valid_perplexity/end_span": 3.603260040283203, + "eval_valid_perplexity/fim": 2.4566750526428223, + "eval_valid_perplexity/first_seq": 15.122625350952148, + "eval_valid_perplexity/last_seq": 
9.211037635803223, + "eval_valid_perplexity/second_seq": 13.724267959594727, + "eval_valid_perplexity/seq": 8.927891731262207, + "eval_valid_reconstruction/all": 0.28975051641464233, + "eval_valid_reconstruction/end_span": 0.6945086717605591, + "eval_valid_reconstruction/fim": 0.17529675364494324, + "eval_valid_reconstruction/first_seq": 0.16339072585105896, + "eval_valid_reconstruction/last_seq": 0.32041680812835693, + "eval_valid_reconstruction/second_seq": 0.1940975934267044, + "eval_valid_runtime": 446.0742, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 21000 + }, + { + "epoch": 0.07833307222309259, + "eval_train_loss": 2.2028751373291016, + "eval_train_loss/all": 2.037306070327759, + "eval_train_loss/end_span": 1.2453207969665527, + "eval_train_perplexity/batch": 7.669919013977051, + "eval_train_perplexity/end_span": 3.4740490913391113, + "eval_train_perplexity/fim": 2.129671335220337, + "eval_train_perplexity/first_seq": 15.527595520019531, + "eval_train_perplexity/last_seq": 9.423490524291992, + "eval_train_perplexity/second_seq": 14.572101593017578, + "eval_train_perplexity/seq": 8.827340126037598, + "eval_train_reconstruction/all": 0.28044992685317993, + "eval_train_reconstruction/end_span": 0.7038647532463074, + "eval_train_reconstruction/fim": 0.14852818846702576, + "eval_train_reconstruction/first_seq": 0.15217719972133636, + "eval_train_reconstruction/last_seq": 0.31276360154151917, + "eval_train_reconstruction/second_seq": 0.17699511349201202, + "eval_train_runtime": 443.291, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 21000 + }, + { + "epoch": 0.07837037368605597, + "grad_norm": 0.3710562586784363, + "learning_rate": 0.0006, + "loss": 2.2911, + "step": 21010 + }, + { + "epoch": 0.07840767514901935, + "grad_norm": 0.3887312710285187, + "learning_rate": 0.0006, + "loss": 2.2461, + "step": 21020 + }, + { + "epoch": 0.07844497661198273, + "grad_norm": 
0.44818115234375, + "learning_rate": 0.0006, + "loss": 2.1128, + "step": 21030 + }, + { + "epoch": 0.0784822780749461, + "grad_norm": 0.5259755849838257, + "learning_rate": 0.0006, + "loss": 2.3143, + "step": 21040 + }, + { + "epoch": 0.07851957953790947, + "grad_norm": 0.3888412117958069, + "learning_rate": 0.0006, + "loss": 2.0268, + "step": 21050 + }, + { + "epoch": 0.07855688100087285, + "grad_norm": 0.4029580354690552, + "learning_rate": 0.0006, + "loss": 2.1128, + "step": 21060 + }, + { + "epoch": 0.07859418246383623, + "grad_norm": 0.37885522842407227, + "learning_rate": 0.0006, + "loss": 2.3272, + "step": 21070 + }, + { + "epoch": 0.07863148392679961, + "grad_norm": 0.46182966232299805, + "learning_rate": 0.0006, + "loss": 2.1862, + "step": 21080 + }, + { + "epoch": 0.07866878538976299, + "grad_norm": 0.36659887433052063, + "learning_rate": 0.0006, + "loss": 2.2618, + "step": 21090 + }, + { + "epoch": 0.07870608685272637, + "grad_norm": 0.36990708112716675, + "learning_rate": 0.0006, + "loss": 2.1085, + "step": 21100 + }, + { + "epoch": 0.07874338831568974, + "grad_norm": 0.40200522541999817, + "learning_rate": 0.0006, + "loss": 2.2782, + "step": 21110 + }, + { + "epoch": 0.07878068977865312, + "grad_norm": 0.28192612528800964, + "learning_rate": 0.0006, + "loss": 2.283, + "step": 21120 + }, + { + "epoch": 0.0788179912416165, + "grad_norm": 1.0771962404251099, + "learning_rate": 0.0006, + "loss": 2.2007, + "step": 21130 + }, + { + "epoch": 0.07885529270457987, + "grad_norm": 0.2984173893928528, + "learning_rate": 0.0006, + "loss": 2.3038, + "step": 21140 + }, + { + "epoch": 0.07889259416754325, + "grad_norm": 0.3358711302280426, + "learning_rate": 0.0006, + "loss": 2.284, + "step": 21150 + }, + { + "epoch": 0.07892989563050663, + "grad_norm": 0.2772088944911957, + "learning_rate": 0.0006, + "loss": 2.2018, + "step": 21160 + }, + { + "epoch": 0.07896719709347001, + "grad_norm": 0.3906197249889374, + "learning_rate": 0.0006, + "loss": 2.2939, + "step": 21170 
+ }, + { + "epoch": 0.07900449855643338, + "grad_norm": 0.4041854441165924, + "learning_rate": 0.0006, + "loss": 2.1187, + "step": 21180 + }, + { + "epoch": 0.07904180001939676, + "grad_norm": 0.4544260501861572, + "learning_rate": 0.0006, + "loss": 2.1997, + "step": 21190 + }, + { + "epoch": 0.07907910148236014, + "grad_norm": 0.47467291355133057, + "learning_rate": 0.0006, + "loss": 2.2387, + "step": 21200 + }, + { + "epoch": 0.07911640294532352, + "grad_norm": 0.4928154945373535, + "learning_rate": 0.0006, + "loss": 2.2015, + "step": 21210 + }, + { + "epoch": 0.0791537044082869, + "grad_norm": 0.3742581307888031, + "learning_rate": 0.0006, + "loss": 2.1436, + "step": 21220 + }, + { + "epoch": 0.07919100587125028, + "grad_norm": 0.4131743609905243, + "learning_rate": 0.0006, + "loss": 2.0406, + "step": 21230 + }, + { + "epoch": 0.07922830733421365, + "grad_norm": 0.45185503363609314, + "learning_rate": 0.0006, + "loss": 2.0746, + "step": 21240 + }, + { + "epoch": 0.07926560879717702, + "grad_norm": 0.30080774426460266, + "learning_rate": 0.0006, + "loss": 2.2361, + "step": 21250 + }, + { + "epoch": 0.07926560879717702, + "eval_valid_loss": 2.2063703536987305, + "eval_valid_loss/all": 2.0681777000427246, + "eval_valid_loss/end_span": 1.2869441509246826, + "eval_valid_perplexity/batch": 7.910394668579102, + "eval_valid_perplexity/end_span": 3.621702194213867, + "eval_valid_perplexity/fim": 2.106157064437866, + "eval_valid_perplexity/first_seq": 14.605500221252441, + "eval_valid_perplexity/last_seq": 8.79613208770752, + "eval_valid_perplexity/second_seq": 14.250574111938477, + "eval_valid_perplexity/seq": 8.922480583190918, + "eval_valid_reconstruction/all": 0.2899196147918701, + "eval_valid_reconstruction/end_span": 0.7012337446212769, + "eval_valid_reconstruction/fim": 0.1455768197774887, + "eval_valid_reconstruction/first_seq": 0.17251135408878326, + "eval_valid_reconstruction/last_seq": 0.33254775404930115, + "eval_valid_reconstruction/second_seq": 
0.183371439576149, + "eval_valid_runtime": 440.6119, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 21250 + }, + { + "epoch": 0.07926560879717702, + "eval_train_loss": 2.2023797035217285, + "eval_train_loss/all": 2.0372636318206787, + "eval_train_loss/end_span": 1.2489068508148193, + "eval_train_perplexity/batch": 7.669593811035156, + "eval_train_perplexity/end_span": 3.486529588699341, + "eval_train_perplexity/fim": 2.2198662757873535, + "eval_train_perplexity/first_seq": 15.406700134277344, + "eval_train_perplexity/last_seq": 9.307602882385254, + "eval_train_perplexity/second_seq": 14.430000305175781, + "eval_train_perplexity/seq": 8.835655212402344, + "eval_train_reconstruction/all": 0.2803916335105896, + "eval_train_reconstruction/end_span": 0.7119981646537781, + "eval_train_reconstruction/fim": 0.15718328952789307, + "eval_train_reconstruction/first_seq": 0.15281899273395538, + "eval_train_reconstruction/last_seq": 0.312486857175827, + "eval_train_reconstruction/second_seq": 0.17964041233062744, + "eval_train_runtime": 443.2565, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 21250 + }, + { + "epoch": 0.0793029102601404, + "grad_norm": 0.2590612471103668, + "learning_rate": 0.0006, + "loss": 2.1793, + "step": 21260 + }, + { + "epoch": 0.07934021172310378, + "grad_norm": 0.3838469386100769, + "learning_rate": 0.0006, + "loss": 2.0381, + "step": 21270 + }, + { + "epoch": 0.07937751318606716, + "grad_norm": 0.3028656840324402, + "learning_rate": 0.0006, + "loss": 2.2614, + "step": 21280 + }, + { + "epoch": 0.07941481464903054, + "grad_norm": 0.4028010368347168, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 21290 + }, + { + "epoch": 0.07945211611199392, + "grad_norm": 0.5406781435012817, + "learning_rate": 0.0006, + "loss": 2.2089, + "step": 21300 + }, + { + "epoch": 0.0794894175749573, + "grad_norm": 0.36720627546310425, + "learning_rate": 0.0006, + "loss": 2.2339, + 
"step": 21310 + }, + { + "epoch": 0.07952671903792066, + "grad_norm": 0.26086118817329407, + "learning_rate": 0.0006, + "loss": 2.2627, + "step": 21320 + }, + { + "epoch": 0.07956402050088404, + "grad_norm": 0.44473353028297424, + "learning_rate": 0.0006, + "loss": 1.8683, + "step": 21330 + }, + { + "epoch": 0.07960132196384742, + "grad_norm": 0.46587803959846497, + "learning_rate": 0.0006, + "loss": 2.1623, + "step": 21340 + }, + { + "epoch": 0.0796386234268108, + "grad_norm": 0.333270400762558, + "learning_rate": 0.0006, + "loss": 2.2195, + "step": 21350 + }, + { + "epoch": 0.07967592488977418, + "grad_norm": 0.37026673555374146, + "learning_rate": 0.0006, + "loss": 2.2875, + "step": 21360 + }, + { + "epoch": 0.07971322635273756, + "grad_norm": 0.4445638954639435, + "learning_rate": 0.0006, + "loss": 2.335, + "step": 21370 + }, + { + "epoch": 0.07975052781570093, + "grad_norm": 0.37841638922691345, + "learning_rate": 0.0006, + "loss": 2.2707, + "step": 21380 + }, + { + "epoch": 0.0797878292786643, + "grad_norm": 0.37961462140083313, + "learning_rate": 0.0006, + "loss": 2.1268, + "step": 21390 + }, + { + "epoch": 0.07982513074162768, + "grad_norm": 0.7060167789459229, + "learning_rate": 0.0006, + "loss": 2.1832, + "step": 21400 + }, + { + "epoch": 0.07986243220459106, + "grad_norm": 0.29775598645210266, + "learning_rate": 0.0006, + "loss": 2.3469, + "step": 21410 + }, + { + "epoch": 0.07989973366755444, + "grad_norm": 0.37313541769981384, + "learning_rate": 0.0006, + "loss": 2.1455, + "step": 21420 + }, + { + "epoch": 0.07993703513051782, + "grad_norm": 0.35064035654067993, + "learning_rate": 0.0006, + "loss": 2.2229, + "step": 21430 + }, + { + "epoch": 0.0799743365934812, + "grad_norm": 0.35152825713157654, + "learning_rate": 0.0006, + "loss": 2.1611, + "step": 21440 + }, + { + "epoch": 0.08001163805644457, + "grad_norm": 0.5679817795753479, + "learning_rate": 0.0006, + "loss": 2.3203, + "step": 21450 + }, + { + "epoch": 0.08004893951940795, + "grad_norm": 
0.5587647557258606, + "learning_rate": 0.0006, + "loss": 2.1835, + "step": 21460 + }, + { + "epoch": 0.08008624098237133, + "grad_norm": 0.3017193675041199, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 21470 + }, + { + "epoch": 0.0801235424453347, + "grad_norm": 0.32012173533439636, + "learning_rate": 0.0006, + "loss": 2.1439, + "step": 21480 + }, + { + "epoch": 0.08016084390829809, + "grad_norm": 0.3317354917526245, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 21490 + }, + { + "epoch": 0.08019814537126146, + "grad_norm": 0.25114530324935913, + "learning_rate": 0.0006, + "loss": 2.2883, + "step": 21500 + }, + { + "epoch": 0.08019814537126146, + "eval_valid_loss": 2.200291395187378, + "eval_valid_loss/all": 2.0626180171966553, + "eval_valid_loss/end_span": 1.2530750036239624, + "eval_valid_perplexity/batch": 7.866537570953369, + "eval_valid_perplexity/end_span": 3.5010921955108643, + "eval_valid_perplexity/fim": 2.1807284355163574, + "eval_valid_perplexity/first_seq": 14.647873878479004, + "eval_valid_perplexity/last_seq": 9.394142150878906, + "eval_valid_perplexity/second_seq": 13.769120216369629, + "eval_valid_perplexity/seq": 8.865253448486328, + "eval_valid_reconstruction/all": 0.29170599579811096, + "eval_valid_reconstruction/end_span": 0.7001531720161438, + "eval_valid_reconstruction/fim": 0.15437522530555725, + "eval_valid_reconstruction/first_seq": 0.16980750858783722, + "eval_valid_reconstruction/last_seq": 0.3096804618835449, + "eval_valid_reconstruction/second_seq": 0.1954815685749054, + "eval_valid_runtime": 438.2174, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 21500 + }, + { + "epoch": 0.08019814537126146, + "eval_train_loss": 2.1968624591827393, + "eval_train_loss/all": 2.0320816040039062, + "eval_train_loss/end_span": 1.2177342176437378, + "eval_train_perplexity/batch": 7.629952430725098, + "eval_train_perplexity/end_span": 3.3795218467712402, + "eval_train_perplexity/fim": 
2.10129976272583, + "eval_train_perplexity/first_seq": 15.446474075317383, + "eval_train_perplexity/last_seq": 8.805591583251953, + "eval_train_perplexity/second_seq": 14.36095905303955, + "eval_train_perplexity/seq": 8.778796195983887, + "eval_train_reconstruction/all": 0.2819790542125702, + "eval_train_reconstruction/end_span": 0.7110966444015503, + "eval_train_reconstruction/fim": 0.14790357649326324, + "eval_train_reconstruction/first_seq": 0.15166912972927094, + "eval_train_reconstruction/last_seq": 0.32617151737213135, + "eval_train_reconstruction/second_seq": 0.1776961088180542, + "eval_train_runtime": 439.0206, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 21500 + }, + { + "epoch": 0.08023544683422484, + "grad_norm": 0.23223979771137238, + "learning_rate": 0.0006, + "loss": 2.2949, + "step": 21510 + }, + { + "epoch": 0.08027274829718821, + "grad_norm": 0.2395848035812378, + "learning_rate": 0.0006, + "loss": 2.1402, + "step": 21520 + }, + { + "epoch": 0.08031004976015159, + "grad_norm": 0.32226040959358215, + "learning_rate": 0.0006, + "loss": 2.2549, + "step": 21530 + }, + { + "epoch": 0.08034735122311497, + "grad_norm": 0.3734733760356903, + "learning_rate": 0.0006, + "loss": 2.1435, + "step": 21540 + }, + { + "epoch": 0.08038465268607835, + "grad_norm": 0.26460719108581543, + "learning_rate": 0.0006, + "loss": 2.2739, + "step": 21550 + }, + { + "epoch": 0.08042195414904173, + "grad_norm": 0.3145654499530792, + "learning_rate": 0.0006, + "loss": 2.395, + "step": 21560 + }, + { + "epoch": 0.0804592556120051, + "grad_norm": 8.644425392150879, + "learning_rate": 0.0006, + "loss": 2.4164, + "step": 21570 + }, + { + "epoch": 0.08049655707496849, + "grad_norm": 0.3779350817203522, + "learning_rate": 0.0006, + "loss": 2.3369, + "step": 21580 + }, + { + "epoch": 0.08053385853793185, + "grad_norm": 0.4675690829753876, + "learning_rate": 0.0006, + "loss": 2.1764, + "step": 21590 + }, + { + "epoch": 0.08057116000089523, 
+ "grad_norm": 0.27358826994895935, + "learning_rate": 0.0006, + "loss": 2.1888, + "step": 21600 + }, + { + "epoch": 0.08060846146385861, + "grad_norm": 0.3014376163482666, + "learning_rate": 0.0006, + "loss": 2.3144, + "step": 21610 + }, + { + "epoch": 0.08064576292682199, + "grad_norm": 0.26363492012023926, + "learning_rate": 0.0006, + "loss": 2.0702, + "step": 21620 + }, + { + "epoch": 0.08068306438978537, + "grad_norm": 0.2824566066265106, + "learning_rate": 0.0006, + "loss": 2.1283, + "step": 21630 + }, + { + "epoch": 0.08072036585274875, + "grad_norm": 0.39595067501068115, + "learning_rate": 0.0006, + "loss": 2.2978, + "step": 21640 + }, + { + "epoch": 0.08075766731571213, + "grad_norm": 0.2530269920825958, + "learning_rate": 0.0006, + "loss": 2.1997, + "step": 21650 + }, + { + "epoch": 0.0807949687786755, + "grad_norm": 0.33327361941337585, + "learning_rate": 0.0006, + "loss": 2.1827, + "step": 21660 + }, + { + "epoch": 0.08083227024163887, + "grad_norm": 0.26762282848358154, + "learning_rate": 0.0006, + "loss": 2.2394, + "step": 21670 + }, + { + "epoch": 0.08086957170460225, + "grad_norm": 0.49346446990966797, + "learning_rate": 0.0006, + "loss": 2.1651, + "step": 21680 + }, + { + "epoch": 0.08090687316756563, + "grad_norm": 0.3560800850391388, + "learning_rate": 0.0006, + "loss": 2.1174, + "step": 21690 + }, + { + "epoch": 0.08094417463052901, + "grad_norm": 0.28409042954444885, + "learning_rate": 0.0006, + "loss": 2.4411, + "step": 21700 + }, + { + "epoch": 0.08098147609349239, + "grad_norm": 0.3263789713382721, + "learning_rate": 0.0006, + "loss": 2.2559, + "step": 21710 + }, + { + "epoch": 0.08101877755645577, + "grad_norm": 0.3316487669944763, + "learning_rate": 0.0006, + "loss": 2.1916, + "step": 21720 + }, + { + "epoch": 0.08105607901941914, + "grad_norm": 0.32204681634902954, + "learning_rate": 0.0006, + "loss": 2.2446, + "step": 21730 + }, + { + "epoch": 0.08109338048238252, + "grad_norm": 0.3185572326183319, + "learning_rate": 0.0006, + "loss": 
2.1254, + "step": 21740 + }, + { + "epoch": 0.0811306819453459, + "grad_norm": 0.34711501002311707, + "learning_rate": 0.0006, + "loss": 2.2117, + "step": 21750 + }, + { + "epoch": 0.0811306819453459, + "eval_valid_loss": 2.204639196395874, + "eval_valid_loss/all": 2.0660758018493652, + "eval_valid_loss/end_span": 1.2659870386123657, + "eval_valid_perplexity/batch": 7.89378547668457, + "eval_valid_perplexity/end_span": 3.5465915203094482, + "eval_valid_perplexity/fim": 2.0139248371124268, + "eval_valid_perplexity/first_seq": 14.710687637329102, + "eval_valid_perplexity/last_seq": 9.288948059082031, + "eval_valid_perplexity/second_seq": 13.853654861450195, + "eval_valid_perplexity/seq": 8.896154403686523, + "eval_valid_reconstruction/all": 0.290813684463501, + "eval_valid_reconstruction/end_span": 0.7009071111679077, + "eval_valid_reconstruction/fim": 0.1377006322145462, + "eval_valid_reconstruction/first_seq": 0.17470501363277435, + "eval_valid_reconstruction/last_seq": 0.3166534900665283, + "eval_valid_reconstruction/second_seq": 0.19389371573925018, + "eval_valid_runtime": 444.679, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 21750 + }, + { + "epoch": 0.0811306819453459, + "eval_train_loss": 2.2024123668670654, + "eval_train_loss/all": 2.03705096244812, + "eval_train_loss/end_span": 1.2278326749801636, + "eval_train_perplexity/batch": 7.667962551116943, + "eval_train_perplexity/end_span": 3.413822650909424, + "eval_train_perplexity/fim": 2.451698064804077, + "eval_train_perplexity/first_seq": 15.34334945678711, + "eval_train_perplexity/last_seq": 8.827119827270508, + "eval_train_perplexity/second_seq": 13.802480697631836, + "eval_train_perplexity/seq": 8.82840347290039, + "eval_train_reconstruction/all": 0.2807720899581909, + "eval_train_reconstruction/end_span": 0.7129246592521667, + "eval_train_reconstruction/fim": 0.17718516290187836, + "eval_train_reconstruction/first_seq": 0.15499483048915863, + 
"eval_train_reconstruction/last_seq": 0.33160290122032166, + "eval_train_reconstruction/second_seq": 0.19438092410564423, + "eval_train_runtime": 439.5969, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 21750 + }, + { + "epoch": 0.08116798340830927, + "grad_norm": 0.30665111541748047, + "learning_rate": 0.0006, + "loss": 2.3281, + "step": 21760 + }, + { + "epoch": 0.08120528487127265, + "grad_norm": 0.35182568430900574, + "learning_rate": 0.0006, + "loss": 2.1145, + "step": 21770 + }, + { + "epoch": 0.08124258633423603, + "grad_norm": 0.44817447662353516, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 21780 + }, + { + "epoch": 0.08127988779719941, + "grad_norm": 0.41452154517173767, + "learning_rate": 0.0006, + "loss": 2.1574, + "step": 21790 + }, + { + "epoch": 0.08131718926016278, + "grad_norm": 0.25155892968177795, + "learning_rate": 0.0006, + "loss": 2.2465, + "step": 21800 + }, + { + "epoch": 0.08135449072312616, + "grad_norm": 0.22013774514198303, + "learning_rate": 0.0006, + "loss": 2.331, + "step": 21810 + }, + { + "epoch": 0.08139179218608954, + "grad_norm": 0.3539905250072479, + "learning_rate": 0.0006, + "loss": 2.1408, + "step": 21820 + }, + { + "epoch": 0.08142909364905292, + "grad_norm": 0.3372999429702759, + "learning_rate": 0.0006, + "loss": 2.221, + "step": 21830 + }, + { + "epoch": 0.0814663951120163, + "grad_norm": 1.8709436655044556, + "learning_rate": 0.0006, + "loss": 2.1805, + "step": 21840 + }, + { + "epoch": 0.08150369657497968, + "grad_norm": 1.0069453716278076, + "learning_rate": 0.0006, + "loss": 2.092, + "step": 21850 + }, + { + "epoch": 0.08154099803794305, + "grad_norm": 0.4430239796638489, + "learning_rate": 0.0006, + "loss": 2.1688, + "step": 21860 + }, + { + "epoch": 0.08157829950090642, + "grad_norm": 0.2651693522930145, + "learning_rate": 0.0006, + "loss": 2.0214, + "step": 21870 + }, + { + "epoch": 0.0816156009638698, + "grad_norm": 0.7575454115867615, + "learning_rate": 
0.0006, + "loss": 2.1483, + "step": 21880 + }, + { + "epoch": 0.08165290242683318, + "grad_norm": 0.34332600235939026, + "learning_rate": 0.0006, + "loss": 2.2861, + "step": 21890 + }, + { + "epoch": 0.08169020388979656, + "grad_norm": 0.4793160855770111, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 21900 + }, + { + "epoch": 0.08172750535275994, + "grad_norm": 0.31493061780929565, + "learning_rate": 0.0006, + "loss": 2.2869, + "step": 21910 + }, + { + "epoch": 0.08176480681572332, + "grad_norm": 0.5682433247566223, + "learning_rate": 0.0006, + "loss": 2.1168, + "step": 21920 + }, + { + "epoch": 0.08180210827868668, + "grad_norm": 0.33288225531578064, + "learning_rate": 0.0006, + "loss": 2.305, + "step": 21930 + }, + { + "epoch": 0.08183940974165006, + "grad_norm": 0.5038242340087891, + "learning_rate": 0.0006, + "loss": 2.0936, + "step": 21940 + }, + { + "epoch": 0.08187671120461344, + "grad_norm": 0.29998335242271423, + "learning_rate": 0.0006, + "loss": 2.2774, + "step": 21950 + }, + { + "epoch": 0.08191401266757682, + "grad_norm": 0.2987041473388672, + "learning_rate": 0.0006, + "loss": 2.1678, + "step": 21960 + }, + { + "epoch": 0.0819513141305402, + "grad_norm": 0.29671502113342285, + "learning_rate": 0.0006, + "loss": 2.13, + "step": 21970 + }, + { + "epoch": 0.08198861559350358, + "grad_norm": 0.328075110912323, + "learning_rate": 0.0006, + "loss": 2.1224, + "step": 21980 + }, + { + "epoch": 0.08202591705646696, + "grad_norm": 0.6911919116973877, + "learning_rate": 0.0006, + "loss": 2.2764, + "step": 21990 + }, + { + "epoch": 0.08206321851943033, + "grad_norm": 0.3173505365848541, + "learning_rate": 0.0006, + "loss": 2.254, + "step": 22000 + }, + { + "epoch": 0.08206321851943033, + "eval_valid_loss": 2.1991939544677734, + "eval_valid_loss/all": 2.0612986087799072, + "eval_valid_loss/end_span": 1.2960567474365234, + "eval_valid_perplexity/batch": 7.856165409088135, + "eval_valid_perplexity/end_span": 3.6548562049865723, + 
"eval_valid_perplexity/fim": 2.3379945755004883, + "eval_valid_perplexity/first_seq": 15.161051750183105, + "eval_valid_perplexity/last_seq": 8.853443145751953, + "eval_valid_perplexity/second_seq": 13.633174896240234, + "eval_valid_perplexity/seq": 8.85531997680664, + "eval_valid_reconstruction/all": 0.29222050309181213, + "eval_valid_reconstruction/end_span": 0.6885176301002502, + "eval_valid_reconstruction/fim": 0.16699479520320892, + "eval_valid_reconstruction/first_seq": 0.1608142852783203, + "eval_valid_reconstruction/last_seq": 0.33065739274024963, + "eval_valid_reconstruction/second_seq": 0.19810178875923157, + "eval_valid_runtime": 441.9805, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 22000 + }, + { + "epoch": 0.08206321851943033, + "eval_train_loss": 2.1982545852661133, + "eval_train_loss/all": 2.03364896774292, + "eval_train_loss/end_span": 1.2650245428085327, + "eval_train_perplexity/batch": 7.641920566558838, + "eval_train_perplexity/end_span": 3.543179750442505, + "eval_train_perplexity/fim": 2.04616379737854, + "eval_train_perplexity/first_seq": 15.655701637268066, + "eval_train_perplexity/last_seq": 8.92896556854248, + "eval_train_perplexity/second_seq": 14.202998161315918, + "eval_train_perplexity/seq": 8.801267623901367, + "eval_train_reconstruction/all": 0.2814522683620453, + "eval_train_reconstruction/end_span": 0.6981149315834045, + "eval_train_reconstruction/fim": 0.14075930416584015, + "eval_train_reconstruction/first_seq": 0.1468166559934616, + "eval_train_reconstruction/last_seq": 0.3261718153953552, + "eval_train_reconstruction/second_seq": 0.18255409598350525, + "eval_train_runtime": 443.7481, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 22000 + }, + { + "epoch": 0.0821005199823937, + "grad_norm": 0.22090283036231995, + "learning_rate": 0.0006, + "loss": 2.2886, + "step": 22010 + }, + { + "epoch": 0.08213782144535708, + "grad_norm": 
0.38346949219703674, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 22020 + }, + { + "epoch": 0.08217512290832046, + "grad_norm": 0.5097378492355347, + "learning_rate": 0.0006, + "loss": 2.2437, + "step": 22030 + }, + { + "epoch": 0.08221242437128384, + "grad_norm": 0.4056205749511719, + "learning_rate": 0.0006, + "loss": 2.3687, + "step": 22040 + }, + { + "epoch": 0.08224972583424722, + "grad_norm": 0.33606818318367004, + "learning_rate": 0.0006, + "loss": 2.0396, + "step": 22050 + }, + { + "epoch": 0.0822870272972106, + "grad_norm": 1.1789820194244385, + "learning_rate": 0.0006, + "loss": 2.1062, + "step": 22060 + }, + { + "epoch": 0.08232432876017397, + "grad_norm": 0.5251872539520264, + "learning_rate": 0.0006, + "loss": 2.0796, + "step": 22070 + }, + { + "epoch": 0.08236163022313735, + "grad_norm": 0.27345308661460876, + "learning_rate": 0.0006, + "loss": 2.17, + "step": 22080 + }, + { + "epoch": 0.08239893168610073, + "grad_norm": 0.3546929359436035, + "learning_rate": 0.0006, + "loss": 2.2806, + "step": 22090 + }, + { + "epoch": 0.0824362331490641, + "grad_norm": 0.25352349877357483, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 22100 + }, + { + "epoch": 0.08247353461202749, + "grad_norm": 0.23863448202610016, + "learning_rate": 0.0006, + "loss": 2.3036, + "step": 22110 + }, + { + "epoch": 0.08251083607499086, + "grad_norm": 0.2962379455566406, + "learning_rate": 0.0006, + "loss": 2.2997, + "step": 22120 + }, + { + "epoch": 0.08254813753795424, + "grad_norm": 0.531206488609314, + "learning_rate": 0.0006, + "loss": 2.3258, + "step": 22130 + }, + { + "epoch": 0.08258543900091761, + "grad_norm": 0.4408222734928131, + "learning_rate": 0.0006, + "loss": 2.336, + "step": 22140 + }, + { + "epoch": 0.08262274046388099, + "grad_norm": 0.4069701135158539, + "learning_rate": 0.0006, + "loss": 2.2871, + "step": 22150 + }, + { + "epoch": 0.08266004192684437, + "grad_norm": 0.43480220437049866, + "learning_rate": 0.0006, + "loss": 2.3294, + "step": 22160 
+ }, + { + "epoch": 0.08269734338980775, + "grad_norm": 0.4440224766731262, + "learning_rate": 0.0006, + "loss": 2.1483, + "step": 22170 + }, + { + "epoch": 0.08273464485277113, + "grad_norm": 0.2760339379310608, + "learning_rate": 0.0006, + "loss": 2.1172, + "step": 22180 + }, + { + "epoch": 0.08277194631573451, + "grad_norm": 0.41439828276634216, + "learning_rate": 0.0006, + "loss": 2.1644, + "step": 22190 + }, + { + "epoch": 0.08280924777869789, + "grad_norm": 0.2889701724052429, + "learning_rate": 0.0006, + "loss": 2.1529, + "step": 22200 + }, + { + "epoch": 0.08284654924166125, + "grad_norm": 0.7872457504272461, + "learning_rate": 0.0006, + "loss": 2.1842, + "step": 22210 + }, + { + "epoch": 0.08288385070462463, + "grad_norm": 0.3082640469074249, + "learning_rate": 0.0006, + "loss": 2.3699, + "step": 22220 + }, + { + "epoch": 0.08292115216758801, + "grad_norm": 0.2478032410144806, + "learning_rate": 0.0006, + "loss": 2.1969, + "step": 22230 + }, + { + "epoch": 0.08295845363055139, + "grad_norm": 0.259365975856781, + "learning_rate": 0.0006, + "loss": 2.2811, + "step": 22240 + }, + { + "epoch": 0.08299575509351477, + "grad_norm": 0.32714223861694336, + "learning_rate": 0.0006, + "loss": 2.1534, + "step": 22250 + }, + { + "epoch": 0.08299575509351477, + "eval_valid_loss": 2.200167179107666, + "eval_valid_loss/all": 2.062335252761841, + "eval_valid_loss/end_span": 1.1757357120513916, + "eval_valid_perplexity/batch": 7.86431360244751, + "eval_valid_perplexity/end_span": 3.2405261993408203, + "eval_valid_perplexity/fim": 2.1445391178131104, + "eval_valid_perplexity/first_seq": 14.563047409057617, + "eval_valid_perplexity/last_seq": 9.424431800842285, + "eval_valid_perplexity/second_seq": 13.744767189025879, + "eval_valid_perplexity/seq": 8.870055198669434, + "eval_valid_reconstruction/all": 0.29172301292419434, + "eval_valid_reconstruction/end_span": 0.7283372282981873, + "eval_valid_reconstruction/fim": 0.15047504007816315, + "eval_valid_reconstruction/first_seq": 
0.17420460283756256, + "eval_valid_reconstruction/last_seq": 0.31261464953422546, + "eval_valid_reconstruction/second_seq": 0.1957181841135025, + "eval_valid_runtime": 442.3091, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 22250 + }, + { + "epoch": 0.08299575509351477, + "eval_train_loss": 2.1968209743499756, + "eval_train_loss/all": 2.032390594482422, + "eval_train_loss/end_span": 1.1419199705123901, + "eval_train_perplexity/batch": 7.632310390472412, + "eval_train_perplexity/end_span": 3.132777452468872, + "eval_train_perplexity/fim": 2.3179967403411865, + "eval_train_perplexity/first_seq": 15.544990539550781, + "eval_train_perplexity/last_seq": 8.866974830627441, + "eval_train_perplexity/second_seq": 14.541397094726562, + "eval_train_perplexity/seq": 8.787991523742676, + "eval_train_reconstruction/all": 0.28202149271965027, + "eval_train_reconstruction/end_span": 0.7421258687973022, + "eval_train_reconstruction/fim": 0.16677039861679077, + "eval_train_reconstruction/first_seq": 0.15097440779209137, + "eval_train_reconstruction/last_seq": 0.32954978942871094, + "eval_train_reconstruction/second_seq": 0.17468568682670593, + "eval_train_runtime": 438.3201, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 22250 + }, + { + "epoch": 0.08303305655647815, + "grad_norm": 0.5398199558258057, + "learning_rate": 0.0006, + "loss": 2.3853, + "step": 22260 + }, + { + "epoch": 0.08307035801944153, + "grad_norm": 0.34841713309288025, + "learning_rate": 0.0006, + "loss": 2.3944, + "step": 22270 + }, + { + "epoch": 0.0831076594824049, + "grad_norm": 0.5181465148925781, + "learning_rate": 0.0006, + "loss": 2.3411, + "step": 22280 + }, + { + "epoch": 0.08314496094536827, + "grad_norm": 0.278865247964859, + "learning_rate": 0.0006, + "loss": 2.1514, + "step": 22290 + }, + { + "epoch": 0.08318226240833165, + "grad_norm": 0.4066673517227173, + "learning_rate": 0.0006, + "loss": 2.2384, + "step": 
22300 + }, + { + "epoch": 0.08321956387129503, + "grad_norm": 0.20855183899402618, + "learning_rate": 0.0006, + "loss": 2.3043, + "step": 22310 + }, + { + "epoch": 0.08325686533425841, + "grad_norm": 0.4617965519428253, + "learning_rate": 0.0006, + "loss": 1.9788, + "step": 22320 + }, + { + "epoch": 0.08329416679722179, + "grad_norm": 0.6306042671203613, + "learning_rate": 0.0006, + "loss": 2.1476, + "step": 22330 + }, + { + "epoch": 0.08333146826018517, + "grad_norm": 0.26590776443481445, + "learning_rate": 0.0006, + "loss": 2.2935, + "step": 22340 + }, + { + "epoch": 0.08336876972314854, + "grad_norm": 0.37193232774734497, + "learning_rate": 0.0006, + "loss": 2.2372, + "step": 22350 + }, + { + "epoch": 0.08340607118611192, + "grad_norm": 0.3352736532688141, + "learning_rate": 0.0006, + "loss": 2.1296, + "step": 22360 + }, + { + "epoch": 0.0834433726490753, + "grad_norm": 0.4609498083591461, + "learning_rate": 0.0006, + "loss": 2.383, + "step": 22370 + }, + { + "epoch": 0.08348067411203867, + "grad_norm": 0.697857677936554, + "learning_rate": 0.0006, + "loss": 2.2588, + "step": 22380 + }, + { + "epoch": 0.08351797557500205, + "grad_norm": 0.41223931312561035, + "learning_rate": 0.0006, + "loss": 2.1501, + "step": 22390 + }, + { + "epoch": 0.08355527703796543, + "grad_norm": 0.32877182960510254, + "learning_rate": 0.0006, + "loss": 2.2916, + "step": 22400 + }, + { + "epoch": 0.08359257850092881, + "grad_norm": 0.5490226149559021, + "learning_rate": 0.0006, + "loss": 2.1169, + "step": 22410 + }, + { + "epoch": 0.08362987996389218, + "grad_norm": 0.35495302081108093, + "learning_rate": 0.0006, + "loss": 2.1819, + "step": 22420 + }, + { + "epoch": 0.08366718142685556, + "grad_norm": 0.7181602716445923, + "learning_rate": 0.0006, + "loss": 2.1099, + "step": 22430 + }, + { + "epoch": 0.08370448288981894, + "grad_norm": 0.3498927354812622, + "learning_rate": 0.0006, + "loss": 2.2568, + "step": 22440 + }, + { + "epoch": 0.08374178435278232, + "grad_norm": 
0.4166022539138794, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 22450 + }, + { + "epoch": 0.0837790858157457, + "grad_norm": 0.4617297947406769, + "learning_rate": 0.0006, + "loss": 2.3342, + "step": 22460 + }, + { + "epoch": 0.08381638727870908, + "grad_norm": 0.47498857975006104, + "learning_rate": 0.0006, + "loss": 2.3381, + "step": 22470 + }, + { + "epoch": 0.08385368874167246, + "grad_norm": 0.43547889590263367, + "learning_rate": 0.0006, + "loss": 2.2455, + "step": 22480 + }, + { + "epoch": 0.08389099020463582, + "grad_norm": 0.3708992600440979, + "learning_rate": 0.0006, + "loss": 2.2621, + "step": 22490 + }, + { + "epoch": 0.0839282916675992, + "grad_norm": 0.3550635874271393, + "learning_rate": 0.0006, + "loss": 2.299, + "step": 22500 + }, + { + "epoch": 0.0839282916675992, + "eval_valid_loss": 2.200228452682495, + "eval_valid_loss/all": 2.0623323917388916, + "eval_valid_loss/end_span": 1.1573220491409302, + "eval_valid_perplexity/batch": 7.864291191101074, + "eval_valid_perplexity/end_span": 3.1814022064208984, + "eval_valid_perplexity/fim": 2.1360859870910645, + "eval_valid_perplexity/first_seq": 14.726728439331055, + "eval_valid_perplexity/last_seq": 8.901649475097656, + "eval_valid_perplexity/second_seq": 13.624157905578613, + "eval_valid_perplexity/seq": 8.8651762008667, + "eval_valid_reconstruction/all": 0.2919599115848541, + "eval_valid_reconstruction/end_span": 0.7300386428833008, + "eval_valid_reconstruction/fim": 0.1507529616355896, + "eval_valid_reconstruction/first_seq": 0.1717856377363205, + "eval_valid_reconstruction/last_seq": 0.3300480842590332, + "eval_valid_reconstruction/second_seq": 0.19886521995067596, + "eval_valid_runtime": 441.5344, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 22500 + }, + { + "epoch": 0.0839282916675992, + "eval_train_loss": 2.196850061416626, + "eval_train_loss/all": 2.0320630073547363, + "eval_train_loss/end_span": 1.1236200332641602, + 
"eval_train_perplexity/batch": 7.629810333251953, + "eval_train_perplexity/end_span": 3.0759692192077637, + "eval_train_perplexity/fim": 2.194894790649414, + "eval_train_perplexity/first_seq": 15.559844017028809, + "eval_train_perplexity/last_seq": 9.682865142822266, + "eval_train_perplexity/second_seq": 14.226287841796875, + "eval_train_perplexity/seq": 8.782071113586426, + "eval_train_reconstruction/all": 0.28207850456237793, + "eval_train_reconstruction/end_span": 0.7406704425811768, + "eval_train_reconstruction/fim": 0.15609265863895416, + "eval_train_reconstruction/first_seq": 0.15060941874980927, + "eval_train_reconstruction/last_seq": 0.3049343228340149, + "eval_train_reconstruction/second_seq": 0.18200096487998962, + "eval_train_runtime": 436.754, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 22500 + }, + { + "epoch": 0.08396559313056258, + "grad_norm": 0.4849705100059509, + "learning_rate": 0.0006, + "loss": 2.1682, + "step": 22510 + }, + { + "epoch": 0.08400289459352596, + "grad_norm": 0.30367523431777954, + "learning_rate": 0.0006, + "loss": 2.266, + "step": 22520 + }, + { + "epoch": 0.08404019605648934, + "grad_norm": 0.3507865369319916, + "learning_rate": 0.0006, + "loss": 2.0929, + "step": 22530 + }, + { + "epoch": 0.08407749751945272, + "grad_norm": 0.2501373291015625, + "learning_rate": 0.0006, + "loss": 2.1288, + "step": 22540 + }, + { + "epoch": 0.08411479898241608, + "grad_norm": 0.414782851934433, + "learning_rate": 0.0006, + "loss": 2.2739, + "step": 22550 + }, + { + "epoch": 0.08415210044537946, + "grad_norm": 0.3389852046966553, + "learning_rate": 0.0006, + "loss": 2.1429, + "step": 22560 + }, + { + "epoch": 0.08418940190834284, + "grad_norm": 0.32728907465934753, + "learning_rate": 0.0006, + "loss": 2.3006, + "step": 22570 + }, + { + "epoch": 0.08422670337130622, + "grad_norm": 0.3701893389225006, + "learning_rate": 0.0006, + "loss": 2.2817, + "step": 22580 + }, + { + "epoch": 0.0842640048342696, + 
"grad_norm": 0.40225282311439514, + "learning_rate": 0.0006, + "loss": 2.1381, + "step": 22590 + }, + { + "epoch": 0.08430130629723298, + "grad_norm": 0.4546735882759094, + "learning_rate": 0.0006, + "loss": 2.177, + "step": 22600 + }, + { + "epoch": 0.08433860776019636, + "grad_norm": 0.37589800357818604, + "learning_rate": 0.0006, + "loss": 2.0982, + "step": 22610 + }, + { + "epoch": 0.08437590922315973, + "grad_norm": 0.43269023299217224, + "learning_rate": 0.0006, + "loss": 2.3358, + "step": 22620 + }, + { + "epoch": 0.0844132106861231, + "grad_norm": 7.292312145233154, + "learning_rate": 0.0006, + "loss": 2.0734, + "step": 22630 + }, + { + "epoch": 0.08445051214908648, + "grad_norm": 0.4428325891494751, + "learning_rate": 0.0006, + "loss": 2.1418, + "step": 22640 + }, + { + "epoch": 0.08448781361204986, + "grad_norm": 0.3028727173805237, + "learning_rate": 0.0006, + "loss": 2.1705, + "step": 22650 + }, + { + "epoch": 0.08452511507501324, + "grad_norm": 0.2293478548526764, + "learning_rate": 0.0006, + "loss": 2.3195, + "step": 22660 + }, + { + "epoch": 0.08456241653797662, + "grad_norm": 0.4312838613986969, + "learning_rate": 0.0006, + "loss": 2.075, + "step": 22670 + }, + { + "epoch": 0.08459971800094, + "grad_norm": 0.38666677474975586, + "learning_rate": 0.0006, + "loss": 2.1884, + "step": 22680 + }, + { + "epoch": 0.08463701946390337, + "grad_norm": 0.43321606516838074, + "learning_rate": 0.0006, + "loss": 2.2632, + "step": 22690 + }, + { + "epoch": 0.08467432092686675, + "grad_norm": 0.2723686099052429, + "learning_rate": 0.0006, + "loss": 2.1572, + "step": 22700 + }, + { + "epoch": 0.08471162238983013, + "grad_norm": 0.27836814522743225, + "learning_rate": 0.0006, + "loss": 2.1784, + "step": 22710 + }, + { + "epoch": 0.0847489238527935, + "grad_norm": 0.3310793340206146, + "learning_rate": 0.0006, + "loss": 2.1159, + "step": 22720 + }, + { + "epoch": 0.08478622531575689, + "grad_norm": 0.3678654134273529, + "learning_rate": 0.0006, + "loss": 2.1752, + 
"step": 22730 + }, + { + "epoch": 0.08482352677872027, + "grad_norm": 0.4455249011516571, + "learning_rate": 0.0006, + "loss": 2.1268, + "step": 22740 + }, + { + "epoch": 0.08486082824168364, + "grad_norm": 0.28719961643218994, + "learning_rate": 0.0006, + "loss": 2.1706, + "step": 22750 + }, + { + "epoch": 0.08486082824168364, + "eval_valid_loss": 2.2008016109466553, + "eval_valid_loss/all": 2.062711238861084, + "eval_valid_loss/end_span": 1.2588286399841309, + "eval_valid_perplexity/batch": 7.8672709465026855, + "eval_valid_perplexity/end_span": 3.521294355392456, + "eval_valid_perplexity/fim": 2.320754289627075, + "eval_valid_perplexity/first_seq": 14.850500106811523, + "eval_valid_perplexity/last_seq": 9.010581016540527, + "eval_valid_perplexity/second_seq": 13.441993713378906, + "eval_valid_perplexity/seq": 8.867573738098145, + "eval_valid_reconstruction/all": 0.2919856309890747, + "eval_valid_reconstruction/end_span": 0.7072916030883789, + "eval_valid_reconstruction/fim": 0.1673809140920639, + "eval_valid_reconstruction/first_seq": 0.1657019406557083, + "eval_valid_reconstruction/last_seq": 0.32940730452537537, + "eval_valid_reconstruction/second_seq": 0.2027151733636856, + "eval_valid_runtime": 444.3959, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 22750 + }, + { + "epoch": 0.08486082824168364, + "eval_train_loss": 2.1980950832366943, + "eval_train_loss/all": 2.0331954956054688, + "eval_train_loss/end_span": 1.2170459032058716, + "eval_train_perplexity/batch": 7.638455867767334, + "eval_train_perplexity/end_span": 3.3771963119506836, + "eval_train_perplexity/fim": 2.163498878479004, + "eval_train_perplexity/first_seq": 15.734237670898438, + "eval_train_perplexity/last_seq": 9.24365234375, + "eval_train_perplexity/second_seq": 14.438934326171875, + "eval_train_perplexity/seq": 8.796285629272461, + "eval_train_reconstruction/all": 0.2818004786968231, + "eval_train_reconstruction/end_span": 0.7199534773826599, + 
"eval_train_reconstruction/fim": 0.1535182148218155, + "eval_train_reconstruction/first_seq": 0.1464405208826065, + "eval_train_reconstruction/last_seq": 0.3157537281513214, + "eval_train_reconstruction/second_seq": 0.17940855026245117, + "eval_train_runtime": 501.1334, + "eval_train_samples_per_second": 0.383, + "eval_train_steps_per_second": 0.383, + "step": 22750 + }, + { + "epoch": 0.08489812970464701, + "grad_norm": 0.38437554240226746, + "learning_rate": 0.0006, + "loss": 2.2529, + "step": 22760 + }, + { + "epoch": 0.08493543116761039, + "grad_norm": 0.3664029836654663, + "learning_rate": 0.0006, + "loss": 2.2248, + "step": 22770 + }, + { + "epoch": 0.08497273263057377, + "grad_norm": 0.39964085817337036, + "learning_rate": 0.0006, + "loss": 2.1351, + "step": 22780 + }, + { + "epoch": 0.08501003409353715, + "grad_norm": 0.36398306488990784, + "learning_rate": 0.0006, + "loss": 2.3024, + "step": 22790 + }, + { + "epoch": 0.08504733555650053, + "grad_norm": 0.3596619963645935, + "learning_rate": 0.0006, + "loss": 2.3188, + "step": 22800 + }, + { + "epoch": 0.08508463701946391, + "grad_norm": 0.37181878089904785, + "learning_rate": 0.0006, + "loss": 2.1597, + "step": 22810 + }, + { + "epoch": 0.08512193848242729, + "grad_norm": 0.37518978118896484, + "learning_rate": 0.0006, + "loss": 2.1924, + "step": 22820 + }, + { + "epoch": 0.08515923994539065, + "grad_norm": 0.37994349002838135, + "learning_rate": 0.0006, + "loss": 2.1824, + "step": 22830 + }, + { + "epoch": 0.08519654140835403, + "grad_norm": 0.382828950881958, + "learning_rate": 0.0006, + "loss": 2.1399, + "step": 22840 + }, + { + "epoch": 0.08523384287131741, + "grad_norm": 0.35797131061553955, + "learning_rate": 0.0006, + "loss": 2.2773, + "step": 22850 + }, + { + "epoch": 0.08527114433428079, + "grad_norm": 0.34620222449302673, + "learning_rate": 0.0006, + "loss": 2.299, + "step": 22860 + }, + { + "epoch": 0.08530844579724417, + "grad_norm": 0.4852699637413025, + "learning_rate": 0.0006, + "loss": 
2.3371, + "step": 22870 + }, + { + "epoch": 0.08534574726020755, + "grad_norm": 0.3226359784603119, + "learning_rate": 0.0006, + "loss": 2.0848, + "step": 22880 + }, + { + "epoch": 0.08538304872317093, + "grad_norm": 0.3452526330947876, + "learning_rate": 0.0006, + "loss": 2.2111, + "step": 22890 + }, + { + "epoch": 0.0854203501861343, + "grad_norm": 0.3801783323287964, + "learning_rate": 0.0006, + "loss": 1.9315, + "step": 22900 + }, + { + "epoch": 0.08545765164909767, + "grad_norm": 0.27001523971557617, + "learning_rate": 0.0006, + "loss": 2.2295, + "step": 22910 + }, + { + "epoch": 0.08549495311206105, + "grad_norm": 0.4367460310459137, + "learning_rate": 0.0006, + "loss": 2.3078, + "step": 22920 + }, + { + "epoch": 0.08553225457502443, + "grad_norm": 0.3608933091163635, + "learning_rate": 0.0006, + "loss": 2.1781, + "step": 22930 + }, + { + "epoch": 0.08556955603798781, + "grad_norm": 0.22184406220912933, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 22940 + }, + { + "epoch": 0.08560685750095119, + "grad_norm": 0.41112470626831055, + "learning_rate": 0.0006, + "loss": 2.1469, + "step": 22950 + }, + { + "epoch": 0.08564415896391457, + "grad_norm": 0.3301678001880646, + "learning_rate": 0.0006, + "loss": 2.1341, + "step": 22960 + }, + { + "epoch": 0.08568146042687794, + "grad_norm": 0.4710603952407837, + "learning_rate": 0.0006, + "loss": 2.25, + "step": 22970 + }, + { + "epoch": 0.08571876188984132, + "grad_norm": 0.4587034583091736, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 22980 + }, + { + "epoch": 0.0857560633528047, + "grad_norm": 0.33414164185523987, + "learning_rate": 0.0006, + "loss": 2.0727, + "step": 22990 + }, + { + "epoch": 0.08579336481576808, + "grad_norm": 0.4974324107170105, + "learning_rate": 0.0006, + "loss": 2.2684, + "step": 23000 + }, + { + "epoch": 0.08579336481576808, + "eval_valid_loss": 2.2018673419952393, + "eval_valid_loss/all": 2.063713788986206, + "eval_valid_loss/end_span": 1.2387851476669312, + 
"eval_valid_perplexity/batch": 7.875162124633789, + "eval_valid_perplexity/end_span": 3.451417922973633, + "eval_valid_perplexity/fim": 2.1495819091796875, + "eval_valid_perplexity/first_seq": 14.850298881530762, + "eval_valid_perplexity/last_seq": 9.059967041015625, + "eval_valid_perplexity/second_seq": 13.914881706237793, + "eval_valid_perplexity/seq": 8.873485565185547, + "eval_valid_reconstruction/all": 0.2913450002670288, + "eval_valid_reconstruction/end_span": 0.711837112903595, + "eval_valid_reconstruction/fim": 0.15068666636943817, + "eval_valid_reconstruction/first_seq": 0.16834338009357452, + "eval_valid_reconstruction/last_seq": 0.3240521550178528, + "eval_valid_reconstruction/second_seq": 0.18988806009292603, + "eval_valid_runtime": 479.0123, + "eval_valid_samples_per_second": 0.401, + "eval_valid_steps_per_second": 0.401, + "step": 23000 + }, + { + "epoch": 0.08579336481576808, + "eval_train_loss": 2.198742389678955, + "eval_train_loss/all": 2.0337281227111816, + "eval_train_loss/end_span": 1.203667402267456, + "eval_train_perplexity/batch": 7.642525672912598, + "eval_train_perplexity/end_span": 3.332315444946289, + "eval_train_perplexity/fim": 2.0306389331817627, + "eval_train_perplexity/first_seq": 15.45191764831543, + "eval_train_perplexity/last_seq": 8.894145965576172, + "eval_train_perplexity/second_seq": 14.27239990234375, + "eval_train_perplexity/seq": 8.796500205993652, + "eval_train_reconstruction/all": 0.28158727288246155, + "eval_train_reconstruction/end_span": 0.722240149974823, + "eval_train_reconstruction/fim": 0.13998988270759583, + "eval_train_reconstruction/first_seq": 0.15263886749744415, + "eval_train_reconstruction/last_seq": 0.32565271854400635, + "eval_train_reconstruction/second_seq": 0.18330422043800354, + "eval_train_runtime": 483.8871, + "eval_train_samples_per_second": 0.397, + "eval_train_steps_per_second": 0.397, + "step": 23000 + }, + { + "epoch": 0.08583066627873145, + "grad_norm": 0.25470101833343506, + "learning_rate": 
0.0006, + "loss": 2.3658, + "step": 23010 + }, + { + "epoch": 0.08586796774169483, + "grad_norm": 0.42597994208335876, + "learning_rate": 0.0006, + "loss": 2.208, + "step": 23020 + }, + { + "epoch": 0.08590526920465821, + "grad_norm": 0.31451061367988586, + "learning_rate": 0.0006, + "loss": 2.2255, + "step": 23030 + }, + { + "epoch": 0.08594257066762158, + "grad_norm": 0.3054519295692444, + "learning_rate": 0.0006, + "loss": 2.195, + "step": 23040 + }, + { + "epoch": 0.08597987213058496, + "grad_norm": 0.34391239285469055, + "learning_rate": 0.0006, + "loss": 2.1791, + "step": 23050 + }, + { + "epoch": 0.08601717359354834, + "grad_norm": 0.383419930934906, + "learning_rate": 0.0006, + "loss": 2.3609, + "step": 23060 + }, + { + "epoch": 0.08605447505651172, + "grad_norm": 0.23904819786548615, + "learning_rate": 0.0006, + "loss": 2.2937, + "step": 23070 + }, + { + "epoch": 0.0860917765194751, + "grad_norm": 0.30270469188690186, + "learning_rate": 0.0006, + "loss": 2.3284, + "step": 23080 + }, + { + "epoch": 0.08612907798243848, + "grad_norm": 7.0523576736450195, + "learning_rate": 0.0006, + "loss": 2.3582, + "step": 23090 + }, + { + "epoch": 0.08616637944540186, + "grad_norm": 0.35257425904273987, + "learning_rate": 0.0006, + "loss": 2.3297, + "step": 23100 + }, + { + "epoch": 0.08620368090836522, + "grad_norm": 0.5501157641410828, + "learning_rate": 0.0006, + "loss": 2.2178, + "step": 23110 + }, + { + "epoch": 0.0862409823713286, + "grad_norm": 0.29959890246391296, + "learning_rate": 0.0006, + "loss": 2.2575, + "step": 23120 + }, + { + "epoch": 0.08627828383429198, + "grad_norm": 0.4330222010612488, + "learning_rate": 0.0006, + "loss": 2.1449, + "step": 23130 + }, + { + "epoch": 0.08631558529725536, + "grad_norm": 0.32238084077835083, + "learning_rate": 0.0006, + "loss": 2.4019, + "step": 23140 + }, + { + "epoch": 0.08635288676021874, + "grad_norm": 0.3361015319824219, + "learning_rate": 0.0006, + "loss": 2.2321, + "step": 23150 + }, + { + "epoch": 
0.08639018822318212, + "grad_norm": 0.32348570227622986, + "learning_rate": 0.0006, + "loss": 2.4001, + "step": 23160 + }, + { + "epoch": 0.08642748968614548, + "grad_norm": 0.1807316392660141, + "learning_rate": 0.0006, + "loss": 2.3068, + "step": 23170 + }, + { + "epoch": 0.08646479114910886, + "grad_norm": 0.5901692509651184, + "learning_rate": 0.0006, + "loss": 2.1435, + "step": 23180 + }, + { + "epoch": 0.08650209261207224, + "grad_norm": 0.34052348136901855, + "learning_rate": 0.0006, + "loss": 2.3493, + "step": 23190 + }, + { + "epoch": 0.08653939407503562, + "grad_norm": 0.373146653175354, + "learning_rate": 0.0006, + "loss": 2.3142, + "step": 23200 + }, + { + "epoch": 0.086576695537999, + "grad_norm": 0.2968055009841919, + "learning_rate": 0.0006, + "loss": 2.4011, + "step": 23210 + }, + { + "epoch": 0.08661399700096238, + "grad_norm": 0.39880937337875366, + "learning_rate": 0.0006, + "loss": 2.2268, + "step": 23220 + }, + { + "epoch": 0.08665129846392576, + "grad_norm": 0.2584717571735382, + "learning_rate": 0.0006, + "loss": 2.1863, + "step": 23230 + }, + { + "epoch": 0.08668859992688913, + "grad_norm": 0.4246843755245209, + "learning_rate": 0.0006, + "loss": 2.2496, + "step": 23240 + }, + { + "epoch": 0.0867259013898525, + "grad_norm": 0.35585689544677734, + "learning_rate": 0.0006, + "loss": 2.3547, + "step": 23250 + }, + { + "epoch": 0.0867259013898525, + "eval_valid_loss": 2.2008161544799805, + "eval_valid_loss/all": 2.06276535987854, + "eval_valid_loss/end_span": 1.3602677583694458, + "eval_valid_perplexity/batch": 7.867696762084961, + "eval_valid_perplexity/end_span": 3.8972365856170654, + "eval_valid_perplexity/fim": 2.456362247467041, + "eval_valid_perplexity/first_seq": 14.4777250289917, + "eval_valid_perplexity/last_seq": 9.00462532043457, + "eval_valid_perplexity/second_seq": 13.714801788330078, + "eval_valid_perplexity/seq": 8.865954399108887, + "eval_valid_reconstruction/all": 0.29181548953056335, + "eval_valid_reconstruction/end_span": 
0.6744361519813538, + "eval_valid_reconstruction/fim": 0.17775598168373108, + "eval_valid_reconstruction/first_seq": 0.17573413252830505, + "eval_valid_reconstruction/last_seq": 0.3235141634941101, + "eval_valid_reconstruction/second_seq": 0.19521862268447876, + "eval_valid_runtime": 441.7745, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 23250 + }, + { + "epoch": 0.0867259013898525, + "eval_train_loss": 2.1997194290161133, + "eval_train_loss/all": 2.0348896980285645, + "eval_train_loss/end_span": 1.3213777542114258, + "eval_train_perplexity/batch": 7.6514081954956055, + "eval_train_perplexity/end_span": 3.748582363128662, + "eval_train_perplexity/fim": 2.0788564682006836, + "eval_train_perplexity/first_seq": 15.637519836425781, + "eval_train_perplexity/last_seq": 8.763166427612305, + "eval_train_perplexity/second_seq": 14.296944618225098, + "eval_train_perplexity/seq": 8.811088562011719, + "eval_train_reconstruction/all": 0.2813841998577118, + "eval_train_reconstruction/end_span": 0.6858958601951599, + "eval_train_reconstruction/fim": 0.14399245381355286, + "eval_train_reconstruction/first_seq": 0.15190111100673676, + "eval_train_reconstruction/last_seq": 0.3294079303741455, + "eval_train_reconstruction/second_seq": 0.18271119892597198, + "eval_train_runtime": 439.8361, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 23250 + }, + { + "epoch": 0.08676320285281588, + "grad_norm": 0.29343268275260925, + "learning_rate": 0.0006, + "loss": 2.2576, + "step": 23260 + }, + { + "epoch": 0.08680050431577926, + "grad_norm": 0.2552952468395233, + "learning_rate": 0.0006, + "loss": 2.1896, + "step": 23270 + }, + { + "epoch": 0.08683780577874264, + "grad_norm": 0.2058487832546234, + "learning_rate": 0.0006, + "loss": 2.2742, + "step": 23280 + }, + { + "epoch": 0.08687510724170602, + "grad_norm": 0.4108065962791443, + "learning_rate": 0.0006, + "loss": 2.2258, + "step": 23290 + }, + { + 
"epoch": 0.0869124087046694, + "grad_norm": 0.5234811902046204, + "learning_rate": 0.0006, + "loss": 2.1652, + "step": 23300 + }, + { + "epoch": 0.08694971016763277, + "grad_norm": 0.25418299436569214, + "learning_rate": 0.0006, + "loss": 2.1242, + "step": 23310 + }, + { + "epoch": 0.08698701163059615, + "grad_norm": 0.3018188178539276, + "learning_rate": 0.0006, + "loss": 2.2841, + "step": 23320 + }, + { + "epoch": 0.08702431309355953, + "grad_norm": 0.43094727396965027, + "learning_rate": 0.0006, + "loss": 2.2003, + "step": 23330 + }, + { + "epoch": 0.0870616145565229, + "grad_norm": 0.5272669196128845, + "learning_rate": 0.0006, + "loss": 2.164, + "step": 23340 + }, + { + "epoch": 0.08709891601948629, + "grad_norm": 0.32997509837150574, + "learning_rate": 0.0006, + "loss": 2.322, + "step": 23350 + }, + { + "epoch": 0.08713621748244967, + "grad_norm": 0.29299360513687134, + "learning_rate": 0.0006, + "loss": 2.1217, + "step": 23360 + }, + { + "epoch": 0.08717351894541304, + "grad_norm": 0.3071252107620239, + "learning_rate": 0.0006, + "loss": 2.1093, + "step": 23370 + }, + { + "epoch": 0.08721082040837641, + "grad_norm": 0.3675433099269867, + "learning_rate": 0.0006, + "loss": 2.1934, + "step": 23380 + }, + { + "epoch": 0.08724812187133979, + "grad_norm": 0.420011967420578, + "learning_rate": 0.0006, + "loss": 2.2943, + "step": 23390 + }, + { + "epoch": 0.08728542333430317, + "grad_norm": 0.32825765013694763, + "learning_rate": 0.0006, + "loss": 2.3444, + "step": 23400 + }, + { + "epoch": 0.08732272479726655, + "grad_norm": 0.6467567086219788, + "learning_rate": 0.0006, + "loss": 2.1783, + "step": 23410 + }, + { + "epoch": 0.08736002626022993, + "grad_norm": 0.41268548369407654, + "learning_rate": 0.0006, + "loss": 2.1901, + "step": 23420 + }, + { + "epoch": 0.08739732772319331, + "grad_norm": 0.346320241689682, + "learning_rate": 0.0006, + "loss": 2.2248, + "step": 23430 + }, + { + "epoch": 0.08743462918615669, + "grad_norm": 0.33749696612358093, + 
"learning_rate": 0.0006, + "loss": 2.1998, + "step": 23440 + }, + { + "epoch": 0.08747193064912005, + "grad_norm": 0.3542497456073761, + "learning_rate": 0.0006, + "loss": 2.1521, + "step": 23450 + }, + { + "epoch": 0.08750923211208343, + "grad_norm": 0.40117794275283813, + "learning_rate": 0.0006, + "loss": 2.2475, + "step": 23460 + }, + { + "epoch": 0.08754653357504681, + "grad_norm": 0.34510537981987, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 23470 + }, + { + "epoch": 0.08758383503801019, + "grad_norm": 0.45261475443840027, + "learning_rate": 0.0006, + "loss": 2.2283, + "step": 23480 + }, + { + "epoch": 0.08762113650097357, + "grad_norm": 0.2933933138847351, + "learning_rate": 0.0006, + "loss": 2.2196, + "step": 23490 + }, + { + "epoch": 0.08765843796393695, + "grad_norm": 0.39049914479255676, + "learning_rate": 0.0006, + "loss": 2.2462, + "step": 23500 + }, + { + "epoch": 0.08765843796393695, + "eval_valid_loss": 2.196892499923706, + "eval_valid_loss/all": 2.059279680252075, + "eval_valid_loss/end_span": 1.2248541116714478, + "eval_valid_perplexity/batch": 7.840320110321045, + "eval_valid_perplexity/end_span": 3.403669595718384, + "eval_valid_perplexity/fim": 2.2087178230285645, + "eval_valid_perplexity/first_seq": 14.28592586517334, + "eval_valid_perplexity/last_seq": 9.505219459533691, + "eval_valid_perplexity/second_seq": 13.875511169433594, + "eval_valid_perplexity/seq": 8.840325355529785, + "eval_valid_reconstruction/all": 0.292746901512146, + "eval_valid_reconstruction/end_span": 0.7152091264724731, + "eval_valid_reconstruction/fim": 0.15774555504322052, + "eval_valid_reconstruction/first_seq": 0.17982135713100433, + "eval_valid_reconstruction/last_seq": 0.30865514278411865, + "eval_valid_reconstruction/second_seq": 0.19660605490207672, + "eval_valid_runtime": 446.1656, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 23500 + }, + { + "epoch": 0.08765843796393695, + "eval_train_loss": 
2.1929891109466553, + "eval_train_loss/all": 2.028921127319336, + "eval_train_loss/end_span": 1.188990592956543, + "eval_train_perplexity/batch": 7.6058759689331055, + "eval_train_perplexity/end_span": 3.2837648391723633, + "eval_train_perplexity/fim": 2.2168631553649902, + "eval_train_perplexity/first_seq": 15.488373756408691, + "eval_train_perplexity/last_seq": 9.331910133361816, + "eval_train_perplexity/second_seq": 14.572185516357422, + "eval_train_perplexity/seq": 8.755047798156738, + "eval_train_reconstruction/all": 0.28296443819999695, + "eval_train_reconstruction/end_span": 0.7260177135467529, + "eval_train_reconstruction/fim": 0.15879099071025848, + "eval_train_reconstruction/first_seq": 0.152952179312706, + "eval_train_reconstruction/last_seq": 0.3123103678226471, + "eval_train_reconstruction/second_seq": 0.1764528900384903, + "eval_train_runtime": 442.6104, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 23500 + }, + { + "epoch": 0.08769573942690033, + "grad_norm": 0.35908037424087524, + "learning_rate": 0.0006, + "loss": 2.2491, + "step": 23510 + }, + { + "epoch": 0.0877330408898637, + "grad_norm": 0.37691089510917664, + "learning_rate": 0.0006, + "loss": 2.1688, + "step": 23520 + }, + { + "epoch": 0.08777034235282707, + "grad_norm": 0.28509950637817383, + "learning_rate": 0.0006, + "loss": 2.2315, + "step": 23530 + }, + { + "epoch": 0.08780764381579045, + "grad_norm": 0.3287522494792938, + "learning_rate": 0.0006, + "loss": 2.1045, + "step": 23540 + }, + { + "epoch": 0.08784494527875383, + "grad_norm": 0.5166968107223511, + "learning_rate": 0.0006, + "loss": 2.1921, + "step": 23550 + }, + { + "epoch": 0.08788224674171721, + "grad_norm": 0.21816474199295044, + "learning_rate": 0.0006, + "loss": 2.3238, + "step": 23560 + }, + { + "epoch": 0.08791954820468059, + "grad_norm": 0.6295764446258545, + "learning_rate": 0.0006, + "loss": 2.2158, + "step": 23570 + }, + { + "epoch": 0.08795684966764397, + "grad_norm": 
0.19394394755363464, + "learning_rate": 0.0006, + "loss": 2.3533, + "step": 23580 + }, + { + "epoch": 0.08799415113060734, + "grad_norm": 0.22416073083877563, + "learning_rate": 0.0006, + "loss": 2.1862, + "step": 23590 + }, + { + "epoch": 0.08803145259357072, + "grad_norm": 0.4875907599925995, + "learning_rate": 0.0006, + "loss": 2.2195, + "step": 23600 + }, + { + "epoch": 0.0880687540565341, + "grad_norm": 0.3428478240966797, + "learning_rate": 0.0006, + "loss": 2.1705, + "step": 23610 + }, + { + "epoch": 0.08810605551949748, + "grad_norm": 0.32608577609062195, + "learning_rate": 0.0006, + "loss": 2.3564, + "step": 23620 + }, + { + "epoch": 0.08814335698246085, + "grad_norm": 0.38229870796203613, + "learning_rate": 0.0006, + "loss": 2.1987, + "step": 23630 + }, + { + "epoch": 0.08818065844542423, + "grad_norm": 0.3147888481616974, + "learning_rate": 0.0006, + "loss": 2.0445, + "step": 23640 + }, + { + "epoch": 0.08821795990838761, + "grad_norm": 0.4736103117465973, + "learning_rate": 0.0006, + "loss": 2.1265, + "step": 23650 + }, + { + "epoch": 0.08825526137135098, + "grad_norm": 0.5556378960609436, + "learning_rate": 0.0006, + "loss": 2.1784, + "step": 23660 + }, + { + "epoch": 0.08829256283431436, + "grad_norm": 0.3379930555820465, + "learning_rate": 0.0006, + "loss": 2.1261, + "step": 23670 + }, + { + "epoch": 0.08832986429727774, + "grad_norm": 0.37659305334091187, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 23680 + }, + { + "epoch": 0.08836716576024112, + "grad_norm": 0.38033944368362427, + "learning_rate": 0.0006, + "loss": 2.2426, + "step": 23690 + }, + { + "epoch": 0.0884044672232045, + "grad_norm": 0.3038197457790375, + "learning_rate": 0.0006, + "loss": 2.3656, + "step": 23700 + }, + { + "epoch": 0.08844176868616788, + "grad_norm": 0.5058729648590088, + "learning_rate": 0.0006, + "loss": 2.1905, + "step": 23710 + }, + { + "epoch": 0.08847907014913124, + "grad_norm": 0.4603695571422577, + "learning_rate": 0.0006, + "loss": 2.2597, + "step": 
23720 + }, + { + "epoch": 0.08851637161209462, + "grad_norm": 0.3962642252445221, + "learning_rate": 0.0006, + "loss": 2.1348, + "step": 23730 + }, + { + "epoch": 0.088553673075058, + "grad_norm": 0.3362116515636444, + "learning_rate": 0.0006, + "loss": 2.1266, + "step": 23740 + }, + { + "epoch": 0.08859097453802138, + "grad_norm": 0.25569725036621094, + "learning_rate": 0.0006, + "loss": 2.2969, + "step": 23750 + }, + { + "epoch": 0.08859097453802138, + "eval_valid_loss": 2.199781656265259, + "eval_valid_loss/all": 2.061934232711792, + "eval_valid_loss/end_span": 1.3175454139709473, + "eval_valid_perplexity/batch": 7.8611602783203125, + "eval_valid_perplexity/end_span": 3.7342441082000732, + "eval_valid_perplexity/fim": 2.36627197265625, + "eval_valid_perplexity/first_seq": 15.001731872558594, + "eval_valid_perplexity/last_seq": 9.242526054382324, + "eval_valid_perplexity/second_seq": 13.747113227844238, + "eval_valid_perplexity/seq": 8.859357833862305, + "eval_valid_reconstruction/all": 0.29152727127075195, + "eval_valid_reconstruction/end_span": 0.6862613558769226, + "eval_valid_reconstruction/fim": 0.17089654505252838, + "eval_valid_reconstruction/first_seq": 0.16430878639221191, + "eval_valid_reconstruction/last_seq": 0.31797894835472107, + "eval_valid_reconstruction/second_seq": 0.1979627013206482, + "eval_valid_runtime": 449.0336, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 23750 + }, + { + "epoch": 0.08859097453802138, + "eval_train_loss": 2.197450637817383, + "eval_train_loss/all": 2.032984495162964, + "eval_train_loss/end_span": 1.2916792631149292, + "eval_train_perplexity/batch": 7.636844635009766, + "eval_train_perplexity/end_span": 3.63889217376709, + "eval_train_perplexity/fim": 2.153265953063965, + "eval_train_perplexity/first_seq": 15.332002639770508, + "eval_train_perplexity/last_seq": 9.356172561645508, + "eval_train_perplexity/second_seq": 14.359798431396484, + "eval_train_perplexity/seq": 
8.795140266418457, + "eval_train_reconstruction/all": 0.2813898026943207, + "eval_train_reconstruction/end_span": 0.694770872592926, + "eval_train_reconstruction/fim": 0.15161485970020294, + "eval_train_reconstruction/first_seq": 0.1536991000175476, + "eval_train_reconstruction/last_seq": 0.311859667301178, + "eval_train_reconstruction/second_seq": 0.1778121441602707, + "eval_train_runtime": 441.6947, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 23750 + }, + { + "epoch": 0.08862827600098476, + "grad_norm": 0.46217918395996094, + "learning_rate": 0.0006, + "loss": 2.3658, + "step": 23760 + }, + { + "epoch": 0.08866557746394814, + "grad_norm": 0.2513478398323059, + "learning_rate": 0.0006, + "loss": 2.2111, + "step": 23770 + }, + { + "epoch": 0.08870287892691152, + "grad_norm": 0.26316845417022705, + "learning_rate": 0.0006, + "loss": 2.3021, + "step": 23780 + }, + { + "epoch": 0.08874018038987488, + "grad_norm": 0.3809851408004761, + "learning_rate": 0.0006, + "loss": 2.2295, + "step": 23790 + }, + { + "epoch": 0.08877748185283826, + "grad_norm": 0.5791058540344238, + "learning_rate": 0.0006, + "loss": 2.1565, + "step": 23800 + }, + { + "epoch": 0.08881478331580164, + "grad_norm": 0.3336416482925415, + "learning_rate": 0.0006, + "loss": 2.005, + "step": 23810 + }, + { + "epoch": 0.08885208477876502, + "grad_norm": 0.42514118552207947, + "learning_rate": 0.0006, + "loss": 2.258, + "step": 23820 + }, + { + "epoch": 0.0888893862417284, + "grad_norm": 0.30683189630508423, + "learning_rate": 0.0006, + "loss": 2.2142, + "step": 23830 + }, + { + "epoch": 0.08892668770469178, + "grad_norm": 0.3296416699886322, + "learning_rate": 0.0006, + "loss": 2.1415, + "step": 23840 + }, + { + "epoch": 0.08896398916765516, + "grad_norm": 0.34046098589897156, + "learning_rate": 0.0006, + "loss": 2.1071, + "step": 23850 + }, + { + "epoch": 0.08900129063061853, + "grad_norm": 0.35148322582244873, + "learning_rate": 0.0006, + "loss": 2.1847, + 
"step": 23860 + }, + { + "epoch": 0.0890385920935819, + "grad_norm": 0.6631432175636292, + "learning_rate": 0.0006, + "loss": 2.2149, + "step": 23870 + }, + { + "epoch": 0.08907589355654529, + "grad_norm": 0.8488190174102783, + "learning_rate": 0.0006, + "loss": 2.231, + "step": 23880 + }, + { + "epoch": 0.08911319501950866, + "grad_norm": 0.43314245343208313, + "learning_rate": 0.0006, + "loss": 2.3236, + "step": 23890 + }, + { + "epoch": 0.08915049648247204, + "grad_norm": 0.3846605122089386, + "learning_rate": 0.0006, + "loss": 2.2479, + "step": 23900 + }, + { + "epoch": 0.08918779794543542, + "grad_norm": 0.5540704131126404, + "learning_rate": 0.0006, + "loss": 2.2558, + "step": 23910 + }, + { + "epoch": 0.0892250994083988, + "grad_norm": 0.3349238634109497, + "learning_rate": 0.0006, + "loss": 2.3028, + "step": 23920 + }, + { + "epoch": 0.08926240087136217, + "grad_norm": 0.3928166925907135, + "learning_rate": 0.0006, + "loss": 2.23, + "step": 23930 + }, + { + "epoch": 0.08929970233432555, + "grad_norm": 0.3528369069099426, + "learning_rate": 0.0006, + "loss": 2.4057, + "step": 23940 + }, + { + "epoch": 0.08933700379728893, + "grad_norm": 0.35886991024017334, + "learning_rate": 0.0006, + "loss": 2.3803, + "step": 23950 + }, + { + "epoch": 0.0893743052602523, + "grad_norm": 0.34848490357398987, + "learning_rate": 0.0006, + "loss": 2.0471, + "step": 23960 + }, + { + "epoch": 0.08941160672321569, + "grad_norm": 0.2829303443431854, + "learning_rate": 0.0006, + "loss": 2.1257, + "step": 23970 + }, + { + "epoch": 0.08944890818617907, + "grad_norm": 0.48922768235206604, + "learning_rate": 0.0006, + "loss": 2.214, + "step": 23980 + }, + { + "epoch": 0.08948620964914245, + "grad_norm": 0.2789973318576813, + "learning_rate": 0.0006, + "loss": 2.4019, + "step": 23990 + }, + { + "epoch": 0.08952351111210581, + "grad_norm": 0.4667893648147583, + "learning_rate": 0.0006, + "loss": 2.2799, + "step": 24000 + }, + { + "epoch": 0.08952351111210581, + "eval_valid_loss": 
2.200570821762085, + "eval_valid_loss/all": 2.0627381801605225, + "eval_valid_loss/end_span": 1.1697450876235962, + "eval_valid_perplexity/batch": 7.867483139038086, + "eval_valid_perplexity/end_span": 3.2211713790893555, + "eval_valid_perplexity/fim": 2.1728320121765137, + "eval_valid_perplexity/first_seq": 14.873404502868652, + "eval_valid_perplexity/last_seq": 8.738134384155273, + "eval_valid_perplexity/second_seq": 13.881765365600586, + "eval_valid_perplexity/seq": 8.865335464477539, + "eval_valid_reconstruction/all": 0.2913528382778168, + "eval_valid_reconstruction/end_span": 0.7253627777099609, + "eval_valid_reconstruction/fim": 0.15399806201457977, + "eval_valid_reconstruction/first_seq": 0.1667734831571579, + "eval_valid_reconstruction/last_seq": 0.3356359004974365, + "eval_valid_reconstruction/second_seq": 0.19240379333496094, + "eval_valid_runtime": 462.4081, + "eval_valid_samples_per_second": 0.415, + "eval_valid_steps_per_second": 0.415, + "step": 24000 + }, + { + "epoch": 0.08952351111210581, + "eval_train_loss": 2.195523977279663, + "eval_train_loss/all": 2.030777931213379, + "eval_train_loss/end_span": 1.1336774826049805, + "eval_train_perplexity/batch": 7.620011806488037, + "eval_train_perplexity/end_span": 3.1070616245269775, + "eval_train_perplexity/fim": 2.170234203338623, + "eval_train_perplexity/first_seq": 15.370572090148926, + "eval_train_perplexity/last_seq": 8.96842098236084, + "eval_train_perplexity/second_seq": 14.210155487060547, + "eval_train_perplexity/seq": 8.768070220947266, + "eval_train_reconstruction/all": 0.28217968344688416, + "eval_train_reconstruction/end_span": 0.7387996912002563, + "eval_train_reconstruction/fim": 0.15329009294509888, + "eval_train_reconstruction/first_seq": 0.15336346626281738, + "eval_train_reconstruction/last_seq": 0.32581979036331177, + "eval_train_reconstruction/second_seq": 0.18363335728645325, + "eval_train_runtime": 461.5793, + "eval_train_samples_per_second": 0.416, + "eval_train_steps_per_second": 
0.416, + "step": 24000 + }, + { + "epoch": 0.08956081257506919, + "grad_norm": 0.33822089433670044, + "learning_rate": 0.0006, + "loss": 2.2445, + "step": 24010 + }, + { + "epoch": 0.08959811403803257, + "grad_norm": 1.6858209371566772, + "learning_rate": 0.0006, + "loss": 2.2036, + "step": 24020 + }, + { + "epoch": 0.08963541550099595, + "grad_norm": 0.3292395770549774, + "learning_rate": 0.0006, + "loss": 2.3372, + "step": 24030 + }, + { + "epoch": 0.08967271696395933, + "grad_norm": 0.3261185586452484, + "learning_rate": 0.0006, + "loss": 2.265, + "step": 24040 + }, + { + "epoch": 0.08971001842692271, + "grad_norm": 1.1041275262832642, + "learning_rate": 0.0006, + "loss": 1.9719, + "step": 24050 + }, + { + "epoch": 0.08974731988988609, + "grad_norm": 0.29810526967048645, + "learning_rate": 0.0006, + "loss": 2.0813, + "step": 24060 + }, + { + "epoch": 0.08978462135284945, + "grad_norm": 0.3502158224582672, + "learning_rate": 0.0006, + "loss": 2.3621, + "step": 24070 + }, + { + "epoch": 0.08982192281581283, + "grad_norm": 0.44113555550575256, + "learning_rate": 0.0006, + "loss": 2.3565, + "step": 24080 + }, + { + "epoch": 0.08985922427877621, + "grad_norm": 0.4964047074317932, + "learning_rate": 0.0006, + "loss": 2.1758, + "step": 24090 + }, + { + "epoch": 0.08989652574173959, + "grad_norm": 0.28436073660850525, + "learning_rate": 0.0006, + "loss": 2.2886, + "step": 24100 + }, + { + "epoch": 0.08993382720470297, + "grad_norm": 0.7692009210586548, + "learning_rate": 0.0006, + "loss": 2.2449, + "step": 24110 + }, + { + "epoch": 0.08997112866766635, + "grad_norm": 0.31572338938713074, + "learning_rate": 0.0006, + "loss": 2.4461, + "step": 24120 + }, + { + "epoch": 0.09000843013062973, + "grad_norm": 0.25105592608451843, + "learning_rate": 0.0006, + "loss": 2.0863, + "step": 24130 + }, + { + "epoch": 0.0900457315935931, + "grad_norm": 0.45005688071250916, + "learning_rate": 0.0006, + "loss": 2.2313, + "step": 24140 + }, + { + "epoch": 0.09008303305655647, + 
"grad_norm": 0.4540651738643646, + "learning_rate": 0.0006, + "loss": 2.0735, + "step": 24150 + }, + { + "epoch": 0.09012033451951985, + "grad_norm": 0.44277939200401306, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 24160 + }, + { + "epoch": 0.09015763598248323, + "grad_norm": 0.3380310535430908, + "learning_rate": 0.0006, + "loss": 2.3256, + "step": 24170 + }, + { + "epoch": 0.09019493744544661, + "grad_norm": 0.3872617483139038, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 24180 + }, + { + "epoch": 0.09023223890840999, + "grad_norm": 0.45661187171936035, + "learning_rate": 0.0006, + "loss": 2.1559, + "step": 24190 + }, + { + "epoch": 0.09026954037137337, + "grad_norm": 0.5256473422050476, + "learning_rate": 0.0006, + "loss": 2.3618, + "step": 24200 + }, + { + "epoch": 0.09030684183433674, + "grad_norm": 0.4046097695827484, + "learning_rate": 0.0006, + "loss": 2.3884, + "step": 24210 + }, + { + "epoch": 0.09034414329730012, + "grad_norm": 0.3343294858932495, + "learning_rate": 0.0006, + "loss": 2.3032, + "step": 24220 + }, + { + "epoch": 0.0903814447602635, + "grad_norm": 0.36459317803382874, + "learning_rate": 0.0006, + "loss": 2.3677, + "step": 24230 + }, + { + "epoch": 0.09041874622322688, + "grad_norm": 0.3258163630962372, + "learning_rate": 0.0006, + "loss": 2.1566, + "step": 24240 + }, + { + "epoch": 0.09045604768619026, + "grad_norm": 0.6225035786628723, + "learning_rate": 0.0006, + "loss": 2.2299, + "step": 24250 + }, + { + "epoch": 0.09045604768619026, + "eval_valid_loss": 2.201075315475464, + "eval_valid_loss/all": 2.0632691383361816, + "eval_valid_loss/end_span": 1.179713249206543, + "eval_valid_perplexity/batch": 7.871661186218262, + "eval_valid_perplexity/end_span": 3.253441095352173, + "eval_valid_perplexity/fim": 2.310088634490967, + "eval_valid_perplexity/first_seq": 14.894025802612305, + "eval_valid_perplexity/last_seq": 9.278022766113281, + "eval_valid_perplexity/second_seq": 14.191726684570312, + "eval_valid_perplexity/seq": 
8.876398086547852, + "eval_valid_reconstruction/all": 0.29145684838294983, + "eval_valid_reconstruction/end_span": 0.7247921228408813, + "eval_valid_reconstruction/fim": 0.16472938656806946, + "eval_valid_reconstruction/first_seq": 0.16842703521251678, + "eval_valid_reconstruction/last_seq": 0.3179069459438324, + "eval_valid_reconstruction/second_seq": 0.1846829652786255, + "eval_valid_runtime": 447.4168, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 24250 + }, + { + "epoch": 0.09045604768619026, + "eval_train_loss": 2.198983907699585, + "eval_train_loss/all": 2.034512519836426, + "eval_train_loss/end_span": 1.1435999870300293, + "eval_train_perplexity/batch": 7.648522853851318, + "eval_train_perplexity/end_span": 3.138045072555542, + "eval_train_perplexity/fim": 2.1071834564208984, + "eval_train_perplexity/first_seq": 15.393563270568848, + "eval_train_perplexity/last_seq": 8.760506629943848, + "eval_train_perplexity/second_seq": 14.213028907775879, + "eval_train_perplexity/seq": 8.809725761413574, + "eval_train_reconstruction/all": 0.28112730383872986, + "eval_train_reconstruction/end_span": 0.7363146543502808, + "eval_train_reconstruction/fim": 0.14770248532295227, + "eval_train_reconstruction/first_seq": 0.1549515575170517, + "eval_train_reconstruction/last_seq": 0.3307977318763733, + "eval_train_reconstruction/second_seq": 0.18407805263996124, + "eval_train_runtime": 433.6577, + "eval_train_samples_per_second": 0.443, + "eval_train_steps_per_second": 0.443, + "step": 24250 + }, + { + "epoch": 0.09049334914915363, + "grad_norm": 0.4049758315086365, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 24260 + }, + { + "epoch": 0.09053065061211701, + "grad_norm": 0.34303799271583557, + "learning_rate": 0.0006, + "loss": 2.1158, + "step": 24270 + }, + { + "epoch": 0.09056795207508038, + "grad_norm": 0.33922526240348816, + "learning_rate": 0.0006, + "loss": 2.0806, + "step": 24280 + }, + { + "epoch": 
0.09060525353804376, + "grad_norm": 0.5258698463439941, + "learning_rate": 0.0006, + "loss": 2.1747, + "step": 24290 + }, + { + "epoch": 0.09064255500100714, + "grad_norm": 0.3759170174598694, + "learning_rate": 0.0006, + "loss": 2.2873, + "step": 24300 + }, + { + "epoch": 0.09067985646397052, + "grad_norm": 0.33726921677589417, + "learning_rate": 0.0006, + "loss": 2.1566, + "step": 24310 + }, + { + "epoch": 0.0907171579269339, + "grad_norm": 0.296013206243515, + "learning_rate": 0.0006, + "loss": 2.1015, + "step": 24320 + }, + { + "epoch": 0.09075445938989728, + "grad_norm": 0.6070913076400757, + "learning_rate": 0.0006, + "loss": 2.081, + "step": 24330 + }, + { + "epoch": 0.09079176085286064, + "grad_norm": 0.3431278169155121, + "learning_rate": 0.0006, + "loss": 2.0813, + "step": 24340 + }, + { + "epoch": 0.09082906231582402, + "grad_norm": 0.3618711233139038, + "learning_rate": 0.0006, + "loss": 2.1345, + "step": 24350 + }, + { + "epoch": 0.0908663637787874, + "grad_norm": 0.4072912931442261, + "learning_rate": 0.0006, + "loss": 2.3146, + "step": 24360 + }, + { + "epoch": 0.09090366524175078, + "grad_norm": 0.34038493037223816, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 24370 + }, + { + "epoch": 0.09094096670471416, + "grad_norm": 0.3210369944572449, + "learning_rate": 0.0006, + "loss": 2.3938, + "step": 24380 + }, + { + "epoch": 0.09097826816767754, + "grad_norm": 0.4051613211631775, + "learning_rate": 0.0006, + "loss": 2.3062, + "step": 24390 + }, + { + "epoch": 0.09101556963064092, + "grad_norm": 0.3378663957118988, + "learning_rate": 0.0006, + "loss": 2.322, + "step": 24400 + }, + { + "epoch": 0.09105287109360428, + "grad_norm": 0.332653671503067, + "learning_rate": 0.0006, + "loss": 2.0882, + "step": 24410 + }, + { + "epoch": 0.09109017255656766, + "grad_norm": 0.44245997071266174, + "learning_rate": 0.0006, + "loss": 2.2252, + "step": 24420 + }, + { + "epoch": 0.09112747401953104, + "grad_norm": 0.630198061466217, + "learning_rate": 0.0006, + 
"loss": 2.2687, + "step": 24430 + }, + { + "epoch": 0.09116477548249442, + "grad_norm": 0.4582229256629944, + "learning_rate": 0.0006, + "loss": 2.3411, + "step": 24440 + }, + { + "epoch": 0.0912020769454578, + "grad_norm": 0.28869137167930603, + "learning_rate": 0.0006, + "loss": 2.0999, + "step": 24450 + }, + { + "epoch": 0.09123937840842118, + "grad_norm": 0.4462984502315521, + "learning_rate": 0.0006, + "loss": 2.2245, + "step": 24460 + }, + { + "epoch": 0.09127667987138456, + "grad_norm": 0.20941735804080963, + "learning_rate": 0.0006, + "loss": 2.2849, + "step": 24470 + }, + { + "epoch": 0.09131398133434793, + "grad_norm": 0.5177513360977173, + "learning_rate": 0.0006, + "loss": 2.1808, + "step": 24480 + }, + { + "epoch": 0.0913512827973113, + "grad_norm": 0.5380834341049194, + "learning_rate": 0.0006, + "loss": 2.1001, + "step": 24490 + }, + { + "epoch": 0.09138858426027469, + "grad_norm": 0.3336670398712158, + "learning_rate": 0.0006, + "loss": 2.3604, + "step": 24500 + }, + { + "epoch": 0.09138858426027469, + "eval_valid_loss": 2.1988813877105713, + "eval_valid_loss/all": 2.0605807304382324, + "eval_valid_loss/end_span": 1.2128331661224365, + "eval_valid_perplexity/batch": 7.850527763366699, + "eval_valid_perplexity/end_span": 3.362999200820923, + "eval_valid_perplexity/fim": 2.53971791267395, + "eval_valid_perplexity/first_seq": 14.92766284942627, + "eval_valid_perplexity/last_seq": 8.827770233154297, + "eval_valid_perplexity/second_seq": 13.798776626586914, + "eval_valid_perplexity/seq": 8.84310531616211, + "eval_valid_reconstruction/all": 0.2924298644065857, + "eval_valid_reconstruction/end_span": 0.7095934748649597, + "eval_valid_reconstruction/fim": 0.18480120599269867, + "eval_valid_reconstruction/first_seq": 0.167632594704628, + "eval_valid_reconstruction/last_seq": 0.33105722069740295, + "eval_valid_reconstruction/second_seq": 0.19440945982933044, + "eval_valid_runtime": 438.8513, + "eval_valid_samples_per_second": 0.438, + 
"eval_valid_steps_per_second": 0.438, + "step": 24500 + }, + { + "epoch": 0.09138858426027469, + "eval_train_loss": 2.199958562850952, + "eval_train_loss/all": 2.034867286682129, + "eval_train_loss/end_span": 1.1841446161270142, + "eval_train_perplexity/batch": 7.651236534118652, + "eval_train_perplexity/end_span": 3.267890214920044, + "eval_train_perplexity/fim": 2.0245559215545654, + "eval_train_perplexity/first_seq": 15.688085556030273, + "eval_train_perplexity/last_seq": 9.407329559326172, + "eval_train_perplexity/second_seq": 14.230528831481934, + "eval_train_perplexity/seq": 8.807229995727539, + "eval_train_reconstruction/all": 0.2811797857284546, + "eval_train_reconstruction/end_span": 0.720075786113739, + "eval_train_reconstruction/fim": 0.13922345638275146, + "eval_train_reconstruction/first_seq": 0.1498716026544571, + "eval_train_reconstruction/last_seq": 0.31183069944381714, + "eval_train_reconstruction/second_seq": 0.18224750459194183, + "eval_train_runtime": 437.9692, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 24500 + }, + { + "epoch": 0.09142588572323807, + "grad_norm": 0.3964593708515167, + "learning_rate": 0.0006, + "loss": 2.2883, + "step": 24510 + }, + { + "epoch": 0.09146318718620144, + "grad_norm": 0.24099846184253693, + "learning_rate": 0.0006, + "loss": 2.2421, + "step": 24520 + }, + { + "epoch": 0.09150048864916482, + "grad_norm": 0.3088171184062958, + "learning_rate": 0.0006, + "loss": 2.2548, + "step": 24530 + }, + { + "epoch": 0.0915377901121282, + "grad_norm": 0.326539009809494, + "learning_rate": 0.0006, + "loss": 2.2716, + "step": 24540 + }, + { + "epoch": 0.09157509157509157, + "grad_norm": 0.3199881613254547, + "learning_rate": 0.0006, + "loss": 2.1318, + "step": 24550 + }, + { + "epoch": 0.09161239303805495, + "grad_norm": 0.3988211154937744, + "learning_rate": 0.0006, + "loss": 2.2014, + "step": 24560 + }, + { + "epoch": 0.09164969450101833, + "grad_norm": 6.473701000213623, + 
"learning_rate": 0.0006, + "loss": 2.1962, + "step": 24570 + }, + { + "epoch": 0.09168699596398171, + "grad_norm": 0.32680875062942505, + "learning_rate": 0.0006, + "loss": 2.2375, + "step": 24580 + }, + { + "epoch": 0.09172429742694509, + "grad_norm": 0.39957645535469055, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 24590 + }, + { + "epoch": 0.09176159888990847, + "grad_norm": 0.27709314227104187, + "learning_rate": 0.0006, + "loss": 2.305, + "step": 24600 + }, + { + "epoch": 0.09179890035287185, + "grad_norm": 0.41029390692710876, + "learning_rate": 0.0006, + "loss": 2.0869, + "step": 24610 + }, + { + "epoch": 0.09183620181583521, + "grad_norm": 0.29893702268600464, + "learning_rate": 0.0006, + "loss": 2.4129, + "step": 24620 + }, + { + "epoch": 0.09187350327879859, + "grad_norm": 0.3451939821243286, + "learning_rate": 0.0006, + "loss": 2.3112, + "step": 24630 + }, + { + "epoch": 0.09191080474176197, + "grad_norm": 0.562200129032135, + "learning_rate": 0.0006, + "loss": 2.3666, + "step": 24640 + }, + { + "epoch": 0.09194810620472535, + "grad_norm": 0.5003570914268494, + "learning_rate": 0.0006, + "loss": 2.3109, + "step": 24650 + }, + { + "epoch": 0.09198540766768873, + "grad_norm": 0.3407922089099884, + "learning_rate": 0.0006, + "loss": 2.1742, + "step": 24660 + }, + { + "epoch": 0.09202270913065211, + "grad_norm": 0.2717753052711487, + "learning_rate": 0.0006, + "loss": 2.1916, + "step": 24670 + }, + { + "epoch": 0.09206001059361549, + "grad_norm": 0.32123616337776184, + "learning_rate": 0.0006, + "loss": 2.267, + "step": 24680 + }, + { + "epoch": 0.09209731205657885, + "grad_norm": 0.33102643489837646, + "learning_rate": 0.0006, + "loss": 2.2288, + "step": 24690 + }, + { + "epoch": 0.09213461351954223, + "grad_norm": 0.45570749044418335, + "learning_rate": 0.0006, + "loss": 2.2821, + "step": 24700 + }, + { + "epoch": 0.09217191498250561, + "grad_norm": 0.2817331552505493, + "learning_rate": 0.0006, + "loss": 2.3837, + "step": 24710 + }, + { + 
"epoch": 0.09220921644546899, + "grad_norm": 0.4859289824962616, + "learning_rate": 0.0006, + "loss": 2.1928, + "step": 24720 + }, + { + "epoch": 0.09224651790843237, + "grad_norm": 152.47866821289062, + "learning_rate": 0.0006, + "loss": 2.2938, + "step": 24730 + }, + { + "epoch": 0.09228381937139575, + "grad_norm": 0.514054536819458, + "learning_rate": 0.0006, + "loss": 2.1345, + "step": 24740 + }, + { + "epoch": 0.09232112083435913, + "grad_norm": 66.7249984741211, + "learning_rate": 0.0006, + "loss": 2.1498, + "step": 24750 + }, + { + "epoch": 0.09232112083435913, + "eval_valid_loss": 2.2014150619506836, + "eval_valid_loss/all": 2.06352162361145, + "eval_valid_loss/end_span": 1.1952916383743286, + "eval_valid_perplexity/batch": 7.8736491203308105, + "eval_valid_perplexity/end_span": 3.304521322250366, + "eval_valid_perplexity/fim": 2.2382354736328125, + "eval_valid_perplexity/first_seq": 15.0785551071167, + "eval_valid_perplexity/last_seq": 8.79272747039795, + "eval_valid_perplexity/second_seq": 13.81174087524414, + "eval_valid_perplexity/seq": 8.877165794372559, + "eval_valid_reconstruction/all": 0.29176250100135803, + "eval_valid_reconstruction/end_span": 0.7143312692642212, + "eval_valid_reconstruction/fim": 0.1592877060174942, + "eval_valid_reconstruction/first_seq": 0.15900690853595734, + "eval_valid_reconstruction/last_seq": 0.3332084119319916, + "eval_valid_reconstruction/second_seq": 0.1946583241224289, + "eval_valid_runtime": 433.7695, + "eval_valid_samples_per_second": 0.443, + "eval_valid_steps_per_second": 0.443, + "step": 24750 + }, + { + "epoch": 0.09232112083435913, + "eval_train_loss": 2.200843572616577, + "eval_train_loss/all": 2.0363731384277344, + "eval_train_loss/end_span": 1.1656957864761353, + "eval_train_perplexity/batch": 7.662766933441162, + "eval_train_perplexity/end_span": 3.2081542015075684, + "eval_train_perplexity/fim": 1.9111844301223755, + "eval_train_perplexity/first_seq": 15.575481414794922, + "eval_train_perplexity/last_seq": 
9.093709945678711, + "eval_train_perplexity/second_seq": 14.095168113708496, + "eval_train_perplexity/seq": 8.82958984375, + "eval_train_reconstruction/all": 0.2811748683452606, + "eval_train_reconstruction/end_span": 0.7258778214454651, + "eval_train_reconstruction/fim": 0.12762939929962158, + "eval_train_reconstruction/first_seq": 0.14930152893066406, + "eval_train_reconstruction/last_seq": 0.32370564341545105, + "eval_train_reconstruction/second_seq": 0.18725484609603882, + "eval_train_runtime": 442.0192, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 24750 + }, + { + "epoch": 0.0923584222973225, + "grad_norm": 0.4425767958164215, + "learning_rate": 0.0006, + "loss": 2.3191, + "step": 24760 + }, + { + "epoch": 0.09239572376028587, + "grad_norm": 0.7700709104537964, + "learning_rate": 0.0006, + "loss": 2.1101, + "step": 24770 + }, + { + "epoch": 0.09243302522324925, + "grad_norm": 0.4670099914073944, + "learning_rate": 0.0006, + "loss": 2.2227, + "step": 24780 + }, + { + "epoch": 0.09247032668621263, + "grad_norm": 0.43901893496513367, + "learning_rate": 0.0006, + "loss": 2.3253, + "step": 24790 + }, + { + "epoch": 0.09250762814917601, + "grad_norm": 0.6079726815223694, + "learning_rate": 0.0006, + "loss": 2.2068, + "step": 24800 + }, + { + "epoch": 0.09254492961213939, + "grad_norm": 0.3439103960990906, + "learning_rate": 0.0006, + "loss": 2.2027, + "step": 24810 + }, + { + "epoch": 0.09258223107510277, + "grad_norm": 0.5716179609298706, + "learning_rate": 0.0006, + "loss": 2.2153, + "step": 24820 + }, + { + "epoch": 0.09261953253806614, + "grad_norm": 0.39248594641685486, + "learning_rate": 0.0006, + "loss": 2.3624, + "step": 24830 + }, + { + "epoch": 0.09265683400102952, + "grad_norm": 0.335062712430954, + "learning_rate": 0.0006, + "loss": 2.3772, + "step": 24840 + }, + { + "epoch": 0.0926941354639929, + "grad_norm": 0.39807215332984924, + "learning_rate": 0.0006, + "loss": 2.2277, + "step": 24850 + }, + { + 
"epoch": 0.09273143692695628, + "grad_norm": 0.2822161316871643, + "learning_rate": 0.0006, + "loss": 2.121, + "step": 24860 + }, + { + "epoch": 0.09276873838991966, + "grad_norm": 0.318215936422348, + "learning_rate": 0.0006, + "loss": 2.2062, + "step": 24870 + }, + { + "epoch": 0.09280603985288303, + "grad_norm": 0.28424328565597534, + "learning_rate": 0.0006, + "loss": 2.3861, + "step": 24880 + }, + { + "epoch": 0.09284334131584641, + "grad_norm": 0.3838539719581604, + "learning_rate": 0.0006, + "loss": 2.1293, + "step": 24890 + }, + { + "epoch": 0.09288064277880978, + "grad_norm": 0.3227640688419342, + "learning_rate": 0.0006, + "loss": 2.3165, + "step": 24900 + }, + { + "epoch": 0.09291794424177316, + "grad_norm": 0.32307589054107666, + "learning_rate": 0.0006, + "loss": 2.1454, + "step": 24910 + }, + { + "epoch": 0.09295524570473654, + "grad_norm": 0.34411442279815674, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 24920 + }, + { + "epoch": 0.09299254716769992, + "grad_norm": 0.46198374032974243, + "learning_rate": 0.0006, + "loss": 2.3972, + "step": 24930 + }, + { + "epoch": 0.0930298486306633, + "grad_norm": 0.48177048563957214, + "learning_rate": 0.0006, + "loss": 2.3932, + "step": 24940 + }, + { + "epoch": 0.09306715009362668, + "grad_norm": 0.3654782176017761, + "learning_rate": 0.0006, + "loss": 2.0902, + "step": 24950 + }, + { + "epoch": 0.09310445155659004, + "grad_norm": 2.671253204345703, + "learning_rate": 0.0006, + "loss": 2.1546, + "step": 24960 + }, + { + "epoch": 0.09314175301955342, + "grad_norm": 0.27970239520072937, + "learning_rate": 0.0006, + "loss": 2.3248, + "step": 24970 + }, + { + "epoch": 0.0931790544825168, + "grad_norm": 0.489754855632782, + "learning_rate": 0.0006, + "loss": 2.1043, + "step": 24980 + }, + { + "epoch": 0.09321635594548018, + "grad_norm": 0.3030097186565399, + "learning_rate": 0.0006, + "loss": 2.3033, + "step": 24990 + }, + { + "epoch": 0.09325365740844356, + "grad_norm": 0.39388322830200195, + 
"learning_rate": 0.0006, + "loss": 2.1938, + "step": 25000 + }, + { + "epoch": 0.09325365740844356, + "eval_valid_loss": 2.2013518810272217, + "eval_valid_loss/all": 2.0628983974456787, + "eval_valid_loss/end_span": 1.2409942150115967, + "eval_valid_perplexity/batch": 7.868743419647217, + "eval_valid_perplexity/end_span": 3.4590508937835693, + "eval_valid_perplexity/fim": 2.400967836380005, + "eval_valid_perplexity/first_seq": 15.031426429748535, + "eval_valid_perplexity/last_seq": 8.978853225708008, + "eval_valid_perplexity/second_seq": 14.125105857849121, + "eval_valid_perplexity/seq": 8.864251136779785, + "eval_valid_reconstruction/all": 0.29135727882385254, + "eval_valid_reconstruction/end_span": 0.7094403505325317, + "eval_valid_reconstruction/fim": 0.17308686673641205, + "eval_valid_reconstruction/first_seq": 0.15965662896633148, + "eval_valid_reconstruction/last_seq": 0.32783469557762146, + "eval_valid_reconstruction/second_seq": 0.18843671679496765, + "eval_valid_runtime": 436.3952, + "eval_valid_samples_per_second": 0.44, + "eval_valid_steps_per_second": 0.44, + "step": 25000 + }, + { + "epoch": 0.09325365740844356, + "eval_train_loss": 2.200878143310547, + "eval_train_loss/all": 2.0355031490325928, + "eval_train_loss/end_span": 1.2198374271392822, + "eval_train_perplexity/batch": 7.656103134155273, + "eval_train_perplexity/end_span": 3.3866372108459473, + "eval_train_perplexity/fim": 2.1137051582336426, + "eval_train_perplexity/first_seq": 15.460762023925781, + "eval_train_perplexity/last_seq": 8.813679695129395, + "eval_train_perplexity/second_seq": 14.15633487701416, + "eval_train_perplexity/seq": 8.811531066894531, + "eval_train_reconstruction/all": 0.28066956996917725, + "eval_train_reconstruction/end_span": 0.7184399366378784, + "eval_train_reconstruction/fim": 0.14766819775104523, + "eval_train_reconstruction/first_seq": 0.1531287133693695, + "eval_train_reconstruction/last_seq": 0.3325038254261017, + "eval_train_reconstruction/second_seq": 
0.18238770961761475, + "eval_train_runtime": 433.6108, + "eval_train_samples_per_second": 0.443, + "eval_train_steps_per_second": 0.443, + "step": 25000 + }, + { + "epoch": 0.09329095887140694, + "grad_norm": 0.26944082975387573, + "learning_rate": 0.0006, + "loss": 2.31, + "step": 25010 + }, + { + "epoch": 0.09332826033437032, + "grad_norm": 0.6248197555541992, + "learning_rate": 0.0006, + "loss": 2.2361, + "step": 25020 + }, + { + "epoch": 0.09336556179733368, + "grad_norm": 0.2774829864501953, + "learning_rate": 0.0006, + "loss": 2.2916, + "step": 25030 + }, + { + "epoch": 0.09340286326029706, + "grad_norm": 0.3775261342525482, + "learning_rate": 0.0006, + "loss": 2.1023, + "step": 25040 + }, + { + "epoch": 0.09344016472326044, + "grad_norm": 0.25583794713020325, + "learning_rate": 0.0006, + "loss": 2.1374, + "step": 25050 + }, + { + "epoch": 0.09347746618622382, + "grad_norm": 0.40090322494506836, + "learning_rate": 0.0006, + "loss": 1.9945, + "step": 25060 + }, + { + "epoch": 0.0935147676491872, + "grad_norm": 0.3454028069972992, + "learning_rate": 0.0006, + "loss": 2.2379, + "step": 25070 + }, + { + "epoch": 0.09355206911215058, + "grad_norm": 0.413065642118454, + "learning_rate": 0.0006, + "loss": 2.1237, + "step": 25080 + }, + { + "epoch": 0.09358937057511396, + "grad_norm": 0.6978026628494263, + "learning_rate": 0.0006, + "loss": 2.2483, + "step": 25090 + }, + { + "epoch": 0.09362667203807733, + "grad_norm": 0.3413706421852112, + "learning_rate": 0.0006, + "loss": 2.2625, + "step": 25100 + }, + { + "epoch": 0.0936639735010407, + "grad_norm": 0.33570238947868347, + "learning_rate": 0.0006, + "loss": 2.148, + "step": 25110 + }, + { + "epoch": 0.09370127496400409, + "grad_norm": 0.41296324133872986, + "learning_rate": 0.0006, + "loss": 2.2266, + "step": 25120 + }, + { + "epoch": 0.09373857642696747, + "grad_norm": 0.4594546854496002, + "learning_rate": 0.0006, + "loss": 2.1979, + "step": 25130 + }, + { + "epoch": 0.09377587788993084, + "grad_norm": 
0.3805590569972992, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 25140 + }, + { + "epoch": 0.09381317935289422, + "grad_norm": 0.311188280582428, + "learning_rate": 0.0006, + "loss": 2.2485, + "step": 25150 + }, + { + "epoch": 0.0938504808158576, + "grad_norm": 0.3581857979297638, + "learning_rate": 0.0006, + "loss": 2.1179, + "step": 25160 + }, + { + "epoch": 0.09388778227882097, + "grad_norm": 0.3148276209831238, + "learning_rate": 0.0006, + "loss": 2.1773, + "step": 25170 + }, + { + "epoch": 0.09392508374178435, + "grad_norm": 0.4385329782962799, + "learning_rate": 0.0006, + "loss": 2.1153, + "step": 25180 + }, + { + "epoch": 0.09396238520474773, + "grad_norm": 0.3188403248786926, + "learning_rate": 0.0006, + "loss": 2.211, + "step": 25190 + }, + { + "epoch": 0.09399968666771111, + "grad_norm": 0.6365643739700317, + "learning_rate": 0.0006, + "loss": 2.2334, + "step": 25200 + }, + { + "epoch": 0.09403698813067449, + "grad_norm": 0.20475004613399506, + "learning_rate": 0.0006, + "loss": 2.1425, + "step": 25210 + }, + { + "epoch": 0.09407428959363787, + "grad_norm": 0.36357492208480835, + "learning_rate": 0.0006, + "loss": 2.3033, + "step": 25220 + }, + { + "epoch": 0.09411159105660125, + "grad_norm": 0.29923880100250244, + "learning_rate": 0.0006, + "loss": 2.2784, + "step": 25230 + }, + { + "epoch": 0.09414889251956461, + "grad_norm": 0.3513795733451843, + "learning_rate": 0.0006, + "loss": 2.2036, + "step": 25240 + }, + { + "epoch": 0.09418619398252799, + "grad_norm": 0.2761458456516266, + "learning_rate": 0.0006, + "loss": 2.4067, + "step": 25250 + }, + { + "epoch": 0.09418619398252799, + "eval_valid_loss": 2.1960480213165283, + "eval_valid_loss/all": 2.0581419467926025, + "eval_valid_loss/end_span": 1.2873276472091675, + "eval_valid_perplexity/batch": 7.831405162811279, + "eval_valid_perplexity/end_span": 3.623091459274292, + "eval_valid_perplexity/fim": 2.0299174785614014, + "eval_valid_perplexity/first_seq": 14.369976997375488, + 
"eval_valid_perplexity/last_seq": 8.717787742614746, + "eval_valid_perplexity/second_seq": 13.264070510864258, + "eval_valid_perplexity/seq": 8.824134826660156, + "eval_valid_reconstruction/all": 0.2931548058986664, + "eval_valid_reconstruction/end_span": 0.7043417692184448, + "eval_valid_reconstruction/fim": 0.1406330019235611, + "eval_valid_reconstruction/first_seq": 0.17907516658306122, + "eval_valid_reconstruction/last_seq": 0.33638402819633484, + "eval_valid_reconstruction/second_seq": 0.2050279974937439, + "eval_valid_runtime": 437.638, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 25250 + }, + { + "epoch": 0.09418619398252799, + "eval_train_loss": 2.1957952976226807, + "eval_train_loss/all": 2.031290292739868, + "eval_train_loss/end_span": 1.262041687965393, + "eval_train_perplexity/batch": 7.623917102813721, + "eval_train_perplexity/end_span": 3.5326266288757324, + "eval_train_perplexity/fim": 2.362499475479126, + "eval_train_perplexity/first_seq": 15.384177207946777, + "eval_train_perplexity/last_seq": 8.589537620544434, + "eval_train_perplexity/second_seq": 14.024134635925293, + "eval_train_perplexity/seq": 8.780648231506348, + "eval_train_reconstruction/all": 0.28252750635147095, + "eval_train_reconstruction/end_span": 0.7126649022102356, + "eval_train_reconstruction/fim": 0.17063434422016144, + "eval_train_reconstruction/first_seq": 0.1511743813753128, + "eval_train_reconstruction/last_seq": 0.3374459445476532, + "eval_train_reconstruction/second_seq": 0.18683913350105286, + "eval_train_runtime": 440.4068, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 25250 + }, + { + "epoch": 0.09422349544549137, + "grad_norm": 0.2872428297996521, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 25260 + }, + { + "epoch": 0.09426079690845475, + "grad_norm": 0.3276542127132416, + "learning_rate": 0.0006, + "loss": 2.272, + "step": 25270 + }, + { + "epoch": 
0.09429809837141813, + "grad_norm": 0.42001524567604065, + "learning_rate": 0.0006, + "loss": 2.142, + "step": 25280 + }, + { + "epoch": 0.09433539983438151, + "grad_norm": 0.375753790140152, + "learning_rate": 0.0006, + "loss": 2.2846, + "step": 25290 + }, + { + "epoch": 0.09437270129734489, + "grad_norm": 0.3564431667327881, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 25300 + }, + { + "epoch": 0.09441000276030825, + "grad_norm": 0.4237607419490814, + "learning_rate": 0.0006, + "loss": 2.0984, + "step": 25310 + }, + { + "epoch": 0.09444730422327163, + "grad_norm": 0.3034175932407379, + "learning_rate": 0.0006, + "loss": 2.2918, + "step": 25320 + }, + { + "epoch": 0.09448460568623501, + "grad_norm": 0.39488691091537476, + "learning_rate": 0.0006, + "loss": 2.2613, + "step": 25330 + }, + { + "epoch": 0.09452190714919839, + "grad_norm": 0.43020889163017273, + "learning_rate": 0.0006, + "loss": 2.2959, + "step": 25340 + }, + { + "epoch": 0.09455920861216177, + "grad_norm": 0.499120831489563, + "learning_rate": 0.0006, + "loss": 2.1463, + "step": 25350 + }, + { + "epoch": 0.09459651007512515, + "grad_norm": 0.29956820607185364, + "learning_rate": 0.0006, + "loss": 2.3058, + "step": 25360 + }, + { + "epoch": 0.09463381153808853, + "grad_norm": 0.4367377758026123, + "learning_rate": 0.0006, + "loss": 2.1873, + "step": 25370 + }, + { + "epoch": 0.0946711130010519, + "grad_norm": 0.5357590913772583, + "learning_rate": 0.0006, + "loss": 2.0854, + "step": 25380 + }, + { + "epoch": 0.09470841446401528, + "grad_norm": 0.24337118864059448, + "learning_rate": 0.0006, + "loss": 2.2769, + "step": 25390 + }, + { + "epoch": 0.09474571592697865, + "grad_norm": 0.3845956027507782, + "learning_rate": 0.0006, + "loss": 2.0803, + "step": 25400 + }, + { + "epoch": 0.09478301738994203, + "grad_norm": 0.3176881968975067, + "learning_rate": 0.0006, + "loss": 2.2513, + "step": 25410 + }, + { + "epoch": 0.09482031885290541, + "grad_norm": 0.3988525867462158, + "learning_rate": 
0.0006, + "loss": 2.2149, + "step": 25420 + }, + { + "epoch": 0.09485762031586879, + "grad_norm": 0.4300619065761566, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 25430 + }, + { + "epoch": 0.09489492177883217, + "grad_norm": 0.3238670527935028, + "learning_rate": 0.0006, + "loss": 2.0432, + "step": 25440 + }, + { + "epoch": 0.09493222324179554, + "grad_norm": 0.24889621138572693, + "learning_rate": 0.0006, + "loss": 2.1815, + "step": 25450 + }, + { + "epoch": 0.09496952470475892, + "grad_norm": 0.3210107088088989, + "learning_rate": 0.0006, + "loss": 2.1993, + "step": 25460 + }, + { + "epoch": 0.0950068261677223, + "grad_norm": 0.4217771887779236, + "learning_rate": 0.0006, + "loss": 2.3333, + "step": 25470 + }, + { + "epoch": 0.09504412763068568, + "grad_norm": 0.4381966292858124, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 25480 + }, + { + "epoch": 0.09508142909364906, + "grad_norm": 0.31966182589530945, + "learning_rate": 0.0006, + "loss": 2.1379, + "step": 25490 + }, + { + "epoch": 0.09511873055661244, + "grad_norm": 0.2976202368736267, + "learning_rate": 0.0006, + "loss": 2.2837, + "step": 25500 + }, + { + "epoch": 0.09511873055661244, + "eval_valid_loss": 2.197573661804199, + "eval_valid_loss/all": 2.0600411891937256, + "eval_valid_loss/end_span": 1.2212722301483154, + "eval_valid_perplexity/batch": 7.846292972564697, + "eval_valid_perplexity/end_span": 3.3914997577667236, + "eval_valid_perplexity/fim": 2.4329793453216553, + "eval_valid_perplexity/first_seq": 14.908516883850098, + "eval_valid_perplexity/last_seq": 9.211788177490234, + "eval_valid_perplexity/second_seq": 13.528289794921875, + "eval_valid_perplexity/seq": 8.849769592285156, + "eval_valid_reconstruction/all": 0.2926262319087982, + "eval_valid_reconstruction/end_span": 0.7150223851203918, + "eval_valid_reconstruction/fim": 0.17705105245113373, + "eval_valid_reconstruction/first_seq": 0.1651102751493454, + "eval_valid_reconstruction/last_seq": 0.31704431772232056, + 
"eval_valid_reconstruction/second_seq": 0.20224395394325256, + "eval_valid_runtime": 440.5194, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 25500 + }, + { + "epoch": 0.09511873055661244, + "eval_train_loss": 2.1953365802764893, + "eval_train_loss/all": 2.0313100814819336, + "eval_train_loss/end_span": 1.1895490884780884, + "eval_train_perplexity/batch": 7.624067783355713, + "eval_train_perplexity/end_span": 3.28559947013855, + "eval_train_perplexity/fim": 2.143672466278076, + "eval_train_perplexity/first_seq": 15.390634536743164, + "eval_train_perplexity/last_seq": 9.430153846740723, + "eval_train_perplexity/second_seq": 14.270858764648438, + "eval_train_perplexity/seq": 8.78233528137207, + "eval_train_reconstruction/all": 0.2822519540786743, + "eval_train_reconstruction/end_span": 0.7245185375213623, + "eval_train_reconstruction/fim": 0.15102335810661316, + "eval_train_reconstruction/first_seq": 0.15631003677845, + "eval_train_reconstruction/last_seq": 0.30816012620925903, + "eval_train_reconstruction/second_seq": 0.18309317529201508, + "eval_train_runtime": 435.6379, + "eval_train_samples_per_second": 0.441, + "eval_train_steps_per_second": 0.441, + "step": 25500 + }, + { + "epoch": 0.09515603201957581, + "grad_norm": 0.31727907061576843, + "learning_rate": 0.0006, + "loss": 2.2568, + "step": 25510 + }, + { + "epoch": 0.09519333348253918, + "grad_norm": 0.45327138900756836, + "learning_rate": 0.0006, + "loss": 2.1196, + "step": 25520 + }, + { + "epoch": 0.09523063494550256, + "grad_norm": 0.36967921257019043, + "learning_rate": 0.0006, + "loss": 2.3491, + "step": 25530 + }, + { + "epoch": 0.09526793640846594, + "grad_norm": 1.1263164281845093, + "learning_rate": 0.0006, + "loss": 2.0203, + "step": 25540 + }, + { + "epoch": 0.09530523787142932, + "grad_norm": 0.3667004704475403, + "learning_rate": 0.0006, + "loss": 2.0341, + "step": 25550 + }, + { + "epoch": 0.0953425393343927, + "grad_norm": 0.30818307399749756, + 
"learning_rate": 0.0006, + "loss": 2.0937, + "step": 25560 + }, + { + "epoch": 0.09537984079735608, + "grad_norm": 0.6617195010185242, + "learning_rate": 0.0006, + "loss": 2.1286, + "step": 25570 + }, + { + "epoch": 0.09541714226031944, + "grad_norm": 0.39776813983917236, + "learning_rate": 0.0006, + "loss": 2.1528, + "step": 25580 + }, + { + "epoch": 0.09545444372328282, + "grad_norm": 0.36796706914901733, + "learning_rate": 0.0006, + "loss": 2.0627, + "step": 25590 + }, + { + "epoch": 0.0954917451862462, + "grad_norm": 1.480796217918396, + "learning_rate": 0.0006, + "loss": 2.2908, + "step": 25600 + }, + { + "epoch": 0.09552904664920958, + "grad_norm": 0.6382247805595398, + "learning_rate": 0.0006, + "loss": 2.3406, + "step": 25610 + }, + { + "epoch": 0.09556634811217296, + "grad_norm": 0.4140372574329376, + "learning_rate": 0.0006, + "loss": 2.1473, + "step": 25620 + }, + { + "epoch": 0.09560364957513634, + "grad_norm": 0.37382546067237854, + "learning_rate": 0.0006, + "loss": 2.141, + "step": 25630 + }, + { + "epoch": 0.09564095103809972, + "grad_norm": 0.27454468607902527, + "learning_rate": 0.0006, + "loss": 2.177, + "step": 25640 + }, + { + "epoch": 0.09567825250106309, + "grad_norm": 0.3534495234489441, + "learning_rate": 0.0006, + "loss": 2.3015, + "step": 25650 + }, + { + "epoch": 0.09571555396402646, + "grad_norm": 0.1967046558856964, + "learning_rate": 0.0006, + "loss": 2.2454, + "step": 25660 + }, + { + "epoch": 0.09575285542698984, + "grad_norm": 0.34211286902427673, + "learning_rate": 0.0006, + "loss": 2.2045, + "step": 25670 + }, + { + "epoch": 0.09579015688995322, + "grad_norm": 0.2535310387611389, + "learning_rate": 0.0006, + "loss": 2.1653, + "step": 25680 + }, + { + "epoch": 0.0958274583529166, + "grad_norm": 0.2522045075893402, + "learning_rate": 0.0006, + "loss": 2.2152, + "step": 25690 + }, + { + "epoch": 0.09586475981587998, + "grad_norm": 0.5034493207931519, + "learning_rate": 0.0006, + "loss": 2.2693, + "step": 25700 + }, + { + "epoch": 
0.09590206127884336, + "grad_norm": 0.3016928732395172, + "learning_rate": 0.0006, + "loss": 2.0264, + "step": 25710 + }, + { + "epoch": 0.09593936274180673, + "grad_norm": 0.3861317038536072, + "learning_rate": 0.0006, + "loss": 2.3195, + "step": 25720 + }, + { + "epoch": 0.0959766642047701, + "grad_norm": 0.20197893679141998, + "learning_rate": 0.0006, + "loss": 2.2162, + "step": 25730 + }, + { + "epoch": 0.09601396566773349, + "grad_norm": 0.5741440653800964, + "learning_rate": 0.0006, + "loss": 2.182, + "step": 25740 + }, + { + "epoch": 0.09605126713069687, + "grad_norm": 0.38413241505622864, + "learning_rate": 0.0006, + "loss": 2.2926, + "step": 25750 + }, + { + "epoch": 0.09605126713069687, + "eval_valid_loss": 2.196957588195801, + "eval_valid_loss/all": 2.0593440532684326, + "eval_valid_loss/end_span": 1.3278592824935913, + "eval_valid_perplexity/batch": 7.840825080871582, + "eval_valid_perplexity/end_span": 3.7729578018188477, + "eval_valid_perplexity/fim": 2.2490575313568115, + "eval_valid_perplexity/first_seq": 14.657267570495605, + "eval_valid_perplexity/last_seq": 9.180389404296875, + "eval_valid_perplexity/second_seq": 13.714550018310547, + "eval_valid_perplexity/seq": 8.839024543762207, + "eval_valid_reconstruction/all": 0.2926133871078491, + "eval_valid_reconstruction/end_span": 0.6915386915206909, + "eval_valid_reconstruction/fim": 0.16142933070659637, + "eval_valid_reconstruction/first_seq": 0.17351700365543365, + "eval_valid_reconstruction/last_seq": 0.3202563226222992, + "eval_valid_reconstruction/second_seq": 0.19565647840499878, + "eval_valid_runtime": 438.7236, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 25750 + }, + { + "epoch": 0.09605126713069687, + "eval_train_loss": 2.1957051753997803, + "eval_train_loss/all": 2.0317015647888184, + "eval_train_loss/end_span": 1.279941201210022, + "eval_train_perplexity/batch": 7.627053260803223, + "eval_train_perplexity/end_span": 3.596428155899048, + 
"eval_train_perplexity/fim": 2.5571701526641846, + "eval_train_perplexity/first_seq": 15.030896186828613, + "eval_train_perplexity/last_seq": 8.95492172241211, + "eval_train_perplexity/second_seq": 14.084260940551758, + "eval_train_perplexity/seq": 8.782447814941406, + "eval_train_reconstruction/all": 0.2821756601333618, + "eval_train_reconstruction/end_span": 0.7055187821388245, + "eval_train_reconstruction/fim": 0.1852610856294632, + "eval_train_reconstruction/first_seq": 0.15969763696193695, + "eval_train_reconstruction/last_seq": 0.3303692936897278, + "eval_train_reconstruction/second_seq": 0.18597577512264252, + "eval_train_runtime": 441.4631, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 25750 + }, + { + "epoch": 0.09608856859366025, + "grad_norm": 0.411504864692688, + "learning_rate": 0.0006, + "loss": 1.9406, + "step": 25760 + }, + { + "epoch": 0.09612587005662362, + "grad_norm": 0.4818795621395111, + "learning_rate": 0.0006, + "loss": 2.2411, + "step": 25770 + }, + { + "epoch": 0.096163171519587, + "grad_norm": 0.43651339411735535, + "learning_rate": 0.0006, + "loss": 1.9768, + "step": 25780 + }, + { + "epoch": 0.09620047298255037, + "grad_norm": 0.2916339039802551, + "learning_rate": 0.0006, + "loss": 2.2736, + "step": 25790 + }, + { + "epoch": 0.09623777444551375, + "grad_norm": 0.48461347818374634, + "learning_rate": 0.0006, + "loss": 2.3057, + "step": 25800 + }, + { + "epoch": 0.09627507590847713, + "grad_norm": 0.3366664946079254, + "learning_rate": 0.0006, + "loss": 2.2166, + "step": 25810 + }, + { + "epoch": 0.09631237737144051, + "grad_norm": 0.41458702087402344, + "learning_rate": 0.0006, + "loss": 2.1602, + "step": 25820 + }, + { + "epoch": 0.09634967883440389, + "grad_norm": 0.4235171675682068, + "learning_rate": 0.0006, + "loss": 2.3288, + "step": 25830 + }, + { + "epoch": 0.09638698029736727, + "grad_norm": 0.4461333751678467, + "learning_rate": 0.0006, + "loss": 2.2308, + "step": 25840 + }, + { + 
"epoch": 0.09642428176033065, + "grad_norm": 0.3805779814720154, + "learning_rate": 0.0006, + "loss": 2.3255, + "step": 25850 + }, + { + "epoch": 0.09646158322329401, + "grad_norm": 0.36780333518981934, + "learning_rate": 0.0006, + "loss": 2.31, + "step": 25860 + }, + { + "epoch": 0.09649888468625739, + "grad_norm": 0.3809477984905243, + "learning_rate": 0.0006, + "loss": 2.1718, + "step": 25870 + }, + { + "epoch": 0.09653618614922077, + "grad_norm": 0.43762320280075073, + "learning_rate": 0.0006, + "loss": 2.2879, + "step": 25880 + }, + { + "epoch": 0.09657348761218415, + "grad_norm": 0.25203636288642883, + "learning_rate": 0.0006, + "loss": 2.1312, + "step": 25890 + }, + { + "epoch": 0.09661078907514753, + "grad_norm": 0.26871928572654724, + "learning_rate": 0.0006, + "loss": 2.3026, + "step": 25900 + }, + { + "epoch": 0.09664809053811091, + "grad_norm": 0.47207337617874146, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 25910 + }, + { + "epoch": 0.09668539200107429, + "grad_norm": 0.3364354074001312, + "learning_rate": 0.0006, + "loss": 2.2254, + "step": 25920 + }, + { + "epoch": 0.09672269346403765, + "grad_norm": 0.2625349760055542, + "learning_rate": 0.0006, + "loss": 2.1607, + "step": 25930 + }, + { + "epoch": 0.09675999492700103, + "grad_norm": 0.33057835698127747, + "learning_rate": 0.0006, + "loss": 2.033, + "step": 25940 + }, + { + "epoch": 0.09679729638996441, + "grad_norm": 0.4283050000667572, + "learning_rate": 0.0006, + "loss": 2.3046, + "step": 25950 + }, + { + "epoch": 0.09683459785292779, + "grad_norm": 0.27761203050613403, + "learning_rate": 0.0006, + "loss": 2.1982, + "step": 25960 + }, + { + "epoch": 0.09687189931589117, + "grad_norm": 0.40157487988471985, + "learning_rate": 0.0006, + "loss": 2.1579, + "step": 25970 + }, + { + "epoch": 0.09690920077885455, + "grad_norm": 0.3398038148880005, + "learning_rate": 0.0006, + "loss": 2.1674, + "step": 25980 + }, + { + "epoch": 0.09694650224181793, + "grad_norm": 0.40102308988571167, + 
"learning_rate": 0.0006, + "loss": 2.1081, + "step": 25990 + }, + { + "epoch": 0.0969838037047813, + "grad_norm": 0.292097270488739, + "learning_rate": 0.0006, + "loss": 2.156, + "step": 26000 + }, + { + "epoch": 0.0969838037047813, + "eval_valid_loss": 2.2019565105438232, + "eval_valid_loss/all": 2.0641512870788574, + "eval_valid_loss/end_span": 1.273452877998352, + "eval_valid_perplexity/batch": 7.878608226776123, + "eval_valid_perplexity/end_span": 3.573168992996216, + "eval_valid_perplexity/fim": 2.4246561527252197, + "eval_valid_perplexity/first_seq": 14.733837127685547, + "eval_valid_perplexity/last_seq": 9.156060218811035, + "eval_valid_perplexity/second_seq": 13.91850471496582, + "eval_valid_perplexity/seq": 8.890152931213379, + "eval_valid_reconstruction/all": 0.29166778922080994, + "eval_valid_reconstruction/end_span": 0.6995549201965332, + "eval_valid_reconstruction/fim": 0.17483320832252502, + "eval_valid_reconstruction/first_seq": 0.16740740835666656, + "eval_valid_reconstruction/last_seq": 0.3210854232311249, + "eval_valid_reconstruction/second_seq": 0.19119198620319366, + "eval_valid_runtime": 437.0005, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 26000 + }, + { + "epoch": 0.0969838037047813, + "eval_train_loss": 2.1980576515197754, + "eval_train_loss/all": 2.033724784851074, + "eval_train_loss/end_span": 1.2332082986831665, + "eval_train_perplexity/batch": 7.642499923706055, + "eval_train_perplexity/end_span": 3.4322235584259033, + "eval_train_perplexity/fim": 1.974111557006836, + "eval_train_perplexity/first_seq": 15.72242546081543, + "eval_train_perplexity/last_seq": 9.02271842956543, + "eval_train_perplexity/second_seq": 14.842734336853027, + "eval_train_perplexity/seq": 8.807456016540527, + "eval_train_reconstruction/all": 0.2817116677761078, + "eval_train_reconstruction/end_span": 0.7127124667167664, + "eval_train_reconstruction/fim": 0.1346345841884613, + "eval_train_reconstruction/first_seq": 
0.14588557183742523, + "eval_train_reconstruction/last_seq": 0.3237876296043396, + "eval_train_reconstruction/second_seq": 0.17326925694942474, + "eval_train_runtime": 439.0058, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 26000 + }, + { + "epoch": 0.09702110516774468, + "grad_norm": 0.590908944606781, + "learning_rate": 0.0006, + "loss": 2.3538, + "step": 26010 + }, + { + "epoch": 0.09705840663070806, + "grad_norm": 0.2778763473033905, + "learning_rate": 0.0006, + "loss": 2.2329, + "step": 26020 + }, + { + "epoch": 0.09709570809367143, + "grad_norm": 0.3620823323726654, + "learning_rate": 0.0006, + "loss": 2.1902, + "step": 26030 + }, + { + "epoch": 0.09713300955663481, + "grad_norm": 0.31820130348205566, + "learning_rate": 0.0006, + "loss": 2.0896, + "step": 26040 + }, + { + "epoch": 0.0971703110195982, + "grad_norm": 0.29232192039489746, + "learning_rate": 0.0006, + "loss": 2.1768, + "step": 26050 + }, + { + "epoch": 0.09720761248256157, + "grad_norm": 0.5322180390357971, + "learning_rate": 0.0006, + "loss": 2.1205, + "step": 26060 + }, + { + "epoch": 0.09724491394552494, + "grad_norm": 0.32955148816108704, + "learning_rate": 0.0006, + "loss": 2.2346, + "step": 26070 + }, + { + "epoch": 0.09728221540848832, + "grad_norm": 0.38338446617126465, + "learning_rate": 0.0006, + "loss": 2.2761, + "step": 26080 + }, + { + "epoch": 0.0973195168714517, + "grad_norm": 0.399667888879776, + "learning_rate": 0.0006, + "loss": 2.2933, + "step": 26090 + }, + { + "epoch": 0.09735681833441508, + "grad_norm": 0.33149561285972595, + "learning_rate": 0.0006, + "loss": 2.0838, + "step": 26100 + }, + { + "epoch": 0.09739411979737846, + "grad_norm": 0.3599260151386261, + "learning_rate": 0.0006, + "loss": 2.266, + "step": 26110 + }, + { + "epoch": 0.09743142126034184, + "grad_norm": 0.3945426046848297, + "learning_rate": 0.0006, + "loss": 2.2748, + "step": 26120 + }, + { + "epoch": 0.0974687227233052, + "grad_norm": 0.3373388350009918, + 
"learning_rate": 0.0006, + "loss": 2.2338, + "step": 26130 + }, + { + "epoch": 0.09750602418626858, + "grad_norm": 0.3943660259246826, + "learning_rate": 0.0006, + "loss": 2.2683, + "step": 26140 + }, + { + "epoch": 0.09754332564923196, + "grad_norm": 0.2839943468570709, + "learning_rate": 0.0006, + "loss": 2.2826, + "step": 26150 + }, + { + "epoch": 0.09758062711219534, + "grad_norm": 0.559692919254303, + "learning_rate": 0.0006, + "loss": 2.0569, + "step": 26160 + }, + { + "epoch": 0.09761792857515872, + "grad_norm": 0.28933149576187134, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 26170 + }, + { + "epoch": 0.0976552300381221, + "grad_norm": 0.1855718344449997, + "learning_rate": 0.0006, + "loss": 2.3363, + "step": 26180 + }, + { + "epoch": 0.09769253150108548, + "grad_norm": 0.37715357542037964, + "learning_rate": 0.0006, + "loss": 2.274, + "step": 26190 + }, + { + "epoch": 0.09772983296404884, + "grad_norm": 0.34484270215034485, + "learning_rate": 0.0006, + "loss": 2.3003, + "step": 26200 + }, + { + "epoch": 0.09776713442701222, + "grad_norm": 0.5156585574150085, + "learning_rate": 0.0006, + "loss": 2.1235, + "step": 26210 + }, + { + "epoch": 0.0978044358899756, + "grad_norm": 0.44618189334869385, + "learning_rate": 0.0006, + "loss": 2.1893, + "step": 26220 + }, + { + "epoch": 0.09784173735293898, + "grad_norm": 0.3692554831504822, + "learning_rate": 0.0006, + "loss": 2.3621, + "step": 26230 + }, + { + "epoch": 0.09787903881590236, + "grad_norm": 1.747628927230835, + "learning_rate": 0.0006, + "loss": 2.2794, + "step": 26240 + }, + { + "epoch": 0.09791634027886574, + "grad_norm": 0.2692986726760864, + "learning_rate": 0.0006, + "loss": 2.2503, + "step": 26250 + }, + { + "epoch": 0.09791634027886574, + "eval_valid_loss": 2.1991727352142334, + "eval_valid_loss/all": 2.061147928237915, + "eval_valid_loss/end_span": 1.2884689569473267, + "eval_valid_perplexity/batch": 7.854981422424316, + "eval_valid_perplexity/end_span": 3.6272289752960205, + 
"eval_valid_perplexity/fim": 2.2510359287261963, + "eval_valid_perplexity/first_seq": 15.193328857421875, + "eval_valid_perplexity/last_seq": 9.324846267700195, + "eval_valid_perplexity/second_seq": 13.720525741577148, + "eval_valid_perplexity/seq": 8.855572700500488, + "eval_valid_reconstruction/all": 0.29239922761917114, + "eval_valid_reconstruction/end_span": 0.6954875588417053, + "eval_valid_reconstruction/fim": 0.16063694655895233, + "eval_valid_reconstruction/first_seq": 0.1626286804676056, + "eval_valid_reconstruction/last_seq": 0.31609994173049927, + "eval_valid_reconstruction/second_seq": 0.19878721237182617, + "eval_valid_runtime": 446.455, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 26250 + }, + { + "epoch": 0.09791634027886574, + "eval_train_loss": 2.1973917484283447, + "eval_train_loss/all": 2.0327839851379395, + "eval_train_loss/end_span": 1.2511390447616577, + "eval_train_perplexity/batch": 7.635313510894775, + "eval_train_perplexity/end_span": 3.494320869445801, + "eval_train_perplexity/fim": 2.056335926055908, + "eval_train_perplexity/first_seq": 15.513670921325684, + "eval_train_perplexity/last_seq": 8.867918014526367, + "eval_train_perplexity/second_seq": 14.110305786132812, + "eval_train_perplexity/seq": 8.78947925567627, + "eval_train_reconstruction/all": 0.28197187185287476, + "eval_train_reconstruction/end_span": 0.7092933058738708, + "eval_train_reconstruction/fim": 0.14248165488243103, + "eval_train_reconstruction/first_seq": 0.14935410022735596, + "eval_train_reconstruction/last_seq": 0.32882624864578247, + "eval_train_reconstruction/second_seq": 0.18717829883098602, + "eval_train_runtime": 442.0324, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 26250 + }, + { + "epoch": 0.09795364174182912, + "grad_norm": 0.23590293526649475, + "learning_rate": 0.0006, + "loss": 2.3555, + "step": 26260 + }, + { + "epoch": 0.09799094320479249, + "grad_norm": 
0.2812768816947937, + "learning_rate": 0.0006, + "loss": 2.4259, + "step": 26270 + }, + { + "epoch": 0.09802824466775586, + "grad_norm": 0.347288578748703, + "learning_rate": 0.0006, + "loss": 2.2502, + "step": 26280 + }, + { + "epoch": 0.09806554613071924, + "grad_norm": 0.5310620665550232, + "learning_rate": 0.0006, + "loss": 2.1679, + "step": 26290 + }, + { + "epoch": 0.09810284759368262, + "grad_norm": 0.38141170144081116, + "learning_rate": 0.0006, + "loss": 2.3806, + "step": 26300 + }, + { + "epoch": 0.098140149056646, + "grad_norm": 0.28891876339912415, + "learning_rate": 0.0006, + "loss": 2.1649, + "step": 26310 + }, + { + "epoch": 0.09817745051960938, + "grad_norm": 0.33677688241004944, + "learning_rate": 0.0006, + "loss": 2.1319, + "step": 26320 + }, + { + "epoch": 0.09821475198257276, + "grad_norm": 0.25594478845596313, + "learning_rate": 0.0006, + "loss": 2.2416, + "step": 26330 + }, + { + "epoch": 0.09825205344553613, + "grad_norm": 0.36770594120025635, + "learning_rate": 0.0006, + "loss": 2.2083, + "step": 26340 + }, + { + "epoch": 0.09828935490849951, + "grad_norm": 0.3752437233924866, + "learning_rate": 0.0006, + "loss": 2.3996, + "step": 26350 + }, + { + "epoch": 0.09832665637146289, + "grad_norm": 0.3736467659473419, + "learning_rate": 0.0006, + "loss": 2.3976, + "step": 26360 + }, + { + "epoch": 0.09836395783442627, + "grad_norm": 0.3885818123817444, + "learning_rate": 0.0006, + "loss": 2.4669, + "step": 26370 + }, + { + "epoch": 0.09840125929738965, + "grad_norm": 0.28161486983299255, + "learning_rate": 0.0006, + "loss": 2.2901, + "step": 26380 + }, + { + "epoch": 0.09843856076035302, + "grad_norm": 0.3718496263027191, + "learning_rate": 0.0006, + "loss": 2.0563, + "step": 26390 + }, + { + "epoch": 0.0984758622233164, + "grad_norm": 0.3761841952800751, + "learning_rate": 0.0006, + "loss": 2.1086, + "step": 26400 + }, + { + "epoch": 0.09851316368627977, + "grad_norm": 0.36505433917045593, + "learning_rate": 0.0006, + "loss": 2.2195, + "step": 
26410 + }, + { + "epoch": 0.09855046514924315, + "grad_norm": 0.3561365008354187, + "learning_rate": 0.0006, + "loss": 2.1647, + "step": 26420 + }, + { + "epoch": 0.09858776661220653, + "grad_norm": 0.49949321150779724, + "learning_rate": 0.0006, + "loss": 2.1919, + "step": 26430 + }, + { + "epoch": 0.09862506807516991, + "grad_norm": 0.316218763589859, + "learning_rate": 0.0006, + "loss": 2.1183, + "step": 26440 + }, + { + "epoch": 0.09866236953813329, + "grad_norm": 0.37057387828826904, + "learning_rate": 0.0006, + "loss": 2.2342, + "step": 26450 + }, + { + "epoch": 0.09869967100109667, + "grad_norm": 0.3140566647052765, + "learning_rate": 0.0006, + "loss": 2.3268, + "step": 26460 + }, + { + "epoch": 0.09873697246406005, + "grad_norm": 0.25952446460723877, + "learning_rate": 0.0006, + "loss": 2.1209, + "step": 26470 + }, + { + "epoch": 0.09877427392702341, + "grad_norm": 0.2784128487110138, + "learning_rate": 0.0006, + "loss": 2.2645, + "step": 26480 + }, + { + "epoch": 0.09881157538998679, + "grad_norm": 0.2895413637161255, + "learning_rate": 0.0006, + "loss": 2.1205, + "step": 26490 + }, + { + "epoch": 0.09884887685295017, + "grad_norm": 0.5206440687179565, + "learning_rate": 0.0006, + "loss": 2.2612, + "step": 26500 + }, + { + "epoch": 0.09884887685295017, + "eval_valid_loss": 2.2001068592071533, + "eval_valid_loss/all": 2.062283754348755, + "eval_valid_loss/end_span": 1.2763453722000122, + "eval_valid_perplexity/batch": 7.863908767700195, + "eval_valid_perplexity/end_span": 3.583519220352173, + "eval_valid_perplexity/fim": 2.391671895980835, + "eval_valid_perplexity/first_seq": 15.042651176452637, + "eval_valid_perplexity/last_seq": 9.261078834533691, + "eval_valid_perplexity/second_seq": 13.712993621826172, + "eval_valid_perplexity/seq": 8.866894721984863, + "eval_valid_reconstruction/all": 0.2922978699207306, + "eval_valid_reconstruction/end_span": 0.7010971307754517, + "eval_valid_reconstruction/fim": 0.17316731810569763, + 
"eval_valid_reconstruction/first_seq": 0.16449767351150513, + "eval_valid_reconstruction/last_seq": 0.3193347752094269, + "eval_valid_reconstruction/second_seq": 0.1967347413301468, + "eval_valid_runtime": 447.478, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 26500 + }, + { + "epoch": 0.09884887685295017, + "eval_train_loss": 2.197455406188965, + "eval_train_loss/all": 2.032820224761963, + "eval_train_loss/end_span": 1.2238354682922363, + "eval_train_perplexity/batch": 7.635590076446533, + "eval_train_perplexity/end_span": 3.4002041816711426, + "eval_train_perplexity/fim": 2.1335365772247314, + "eval_train_perplexity/first_seq": 15.83816909790039, + "eval_train_perplexity/last_seq": 8.978174209594727, + "eval_train_perplexity/second_seq": 14.50376033782959, + "eval_train_perplexity/seq": 8.78947925567627, + "eval_train_reconstruction/all": 0.28216487169265747, + "eval_train_reconstruction/end_span": 0.7167187929153442, + "eval_train_reconstruction/fim": 0.15033046901226044, + "eval_train_reconstruction/first_seq": 0.14402517676353455, + "eval_train_reconstruction/last_seq": 0.3241423964500427, + "eval_train_reconstruction/second_seq": 0.17855289578437805, + "eval_train_runtime": 448.4823, + "eval_train_samples_per_second": 0.428, + "eval_train_steps_per_second": 0.428, + "step": 26500 + }, + { + "epoch": 0.09888617831591355, + "grad_norm": 0.36006954312324524, + "learning_rate": 0.0006, + "loss": 2.3027, + "step": 26510 + }, + { + "epoch": 0.09892347977887693, + "grad_norm": 0.30726194381713867, + "learning_rate": 0.0006, + "loss": 2.0414, + "step": 26520 + }, + { + "epoch": 0.09896078124184031, + "grad_norm": 0.2814301550388336, + "learning_rate": 0.0006, + "loss": 2.132, + "step": 26530 + }, + { + "epoch": 0.09899808270480369, + "grad_norm": 0.26214799284935, + "learning_rate": 0.0006, + "loss": 2.2385, + "step": 26540 + }, + { + "epoch": 0.09903538416776705, + "grad_norm": 0.5144386887550354, + "learning_rate": 
0.0006, + "loss": 1.989, + "step": 26550 + }, + { + "epoch": 0.09907268563073043, + "grad_norm": 0.4094049632549286, + "learning_rate": 0.0006, + "loss": 2.2241, + "step": 26560 + }, + { + "epoch": 0.09910998709369381, + "grad_norm": 1.0545717477798462, + "learning_rate": 0.0006, + "loss": 2.1464, + "step": 26570 + }, + { + "epoch": 0.09914728855665719, + "grad_norm": 0.35360297560691833, + "learning_rate": 0.0006, + "loss": 2.1192, + "step": 26580 + }, + { + "epoch": 0.09918459001962057, + "grad_norm": 0.46747347712516785, + "learning_rate": 0.0006, + "loss": 2.34, + "step": 26590 + }, + { + "epoch": 0.09922189148258395, + "grad_norm": 0.3643774092197418, + "learning_rate": 0.0006, + "loss": 2.3221, + "step": 26600 + }, + { + "epoch": 0.09925919294554733, + "grad_norm": 0.35294201970100403, + "learning_rate": 0.0006, + "loss": 2.1461, + "step": 26610 + }, + { + "epoch": 0.0992964944085107, + "grad_norm": 0.3288007080554962, + "learning_rate": 0.0006, + "loss": 2.1804, + "step": 26620 + }, + { + "epoch": 0.09933379587147408, + "grad_norm": 0.46128973364830017, + "learning_rate": 0.0006, + "loss": 2.1328, + "step": 26630 + }, + { + "epoch": 0.09937109733443746, + "grad_norm": 0.3712892234325409, + "learning_rate": 0.0006, + "loss": 2.3451, + "step": 26640 + }, + { + "epoch": 0.09940839879740083, + "grad_norm": 0.3426896631717682, + "learning_rate": 0.0006, + "loss": 2.3498, + "step": 26650 + }, + { + "epoch": 0.09944570026036421, + "grad_norm": 0.27800416946411133, + "learning_rate": 0.0006, + "loss": 2.1632, + "step": 26660 + }, + { + "epoch": 0.0994830017233276, + "grad_norm": 0.4071045219898224, + "learning_rate": 0.0006, + "loss": 2.075, + "step": 26670 + }, + { + "epoch": 0.09952030318629097, + "grad_norm": 0.39135539531707764, + "learning_rate": 0.0006, + "loss": 2.4078, + "step": 26680 + }, + { + "epoch": 0.09955760464925434, + "grad_norm": 0.31018680334091187, + "learning_rate": 0.0006, + "loss": 2.3224, + "step": 26690 + }, + { + "epoch": 
0.09959490611221772, + "grad_norm": 0.39749541878700256, + "learning_rate": 0.0006, + "loss": 2.2766, + "step": 26700 + }, + { + "epoch": 0.0996322075751811, + "grad_norm": 0.330966591835022, + "learning_rate": 0.0006, + "loss": 2.1696, + "step": 26710 + }, + { + "epoch": 0.09966950903814448, + "grad_norm": 0.3874906897544861, + "learning_rate": 0.0006, + "loss": 2.3334, + "step": 26720 + }, + { + "epoch": 0.09970681050110786, + "grad_norm": 0.552114725112915, + "learning_rate": 0.0006, + "loss": 2.0552, + "step": 26730 + }, + { + "epoch": 0.09974411196407124, + "grad_norm": 0.5346937775611877, + "learning_rate": 0.0006, + "loss": 2.1699, + "step": 26740 + }, + { + "epoch": 0.0997814134270346, + "grad_norm": 0.3976539373397827, + "learning_rate": 0.0006, + "loss": 2.1834, + "step": 26750 + }, + { + "epoch": 0.0997814134270346, + "eval_valid_loss": 2.2055046558380127, + "eval_valid_loss/all": 2.067413568496704, + "eval_valid_loss/end_span": 1.1680678129196167, + "eval_valid_perplexity/batch": 7.90435266494751, + "eval_valid_perplexity/end_span": 3.215773105621338, + "eval_valid_perplexity/fim": 2.2480785846710205, + "eval_valid_perplexity/first_seq": 14.510252952575684, + "eval_valid_perplexity/last_seq": 8.942170143127441, + "eval_valid_perplexity/second_seq": 13.692434310913086, + "eval_valid_perplexity/seq": 8.913374900817871, + "eval_valid_reconstruction/all": 0.2908341884613037, + "eval_valid_reconstruction/end_span": 0.7336822152137756, + "eval_valid_reconstruction/fim": 0.15956184267997742, + "eval_valid_reconstruction/first_seq": 0.17559634149074554, + "eval_valid_reconstruction/last_seq": 0.3323363959789276, + "eval_valid_reconstruction/second_seq": 0.19660870730876923, + "eval_valid_runtime": 444.8651, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 26750 + }, + { + "epoch": 0.0997814134270346, + "eval_train_loss": 2.2017791271209717, + "eval_train_loss/all": 2.036931276321411, + "eval_train_loss/end_span": 
1.1363013982772827, + "eval_train_perplexity/batch": 7.6670451164245605, + "eval_train_perplexity/end_span": 3.115225076675415, + "eval_train_perplexity/fim": 2.4236624240875244, + "eval_train_perplexity/first_seq": 15.402549743652344, + "eval_train_perplexity/last_seq": 9.174094200134277, + "eval_train_perplexity/second_seq": 14.434727668762207, + "eval_train_perplexity/seq": 8.82931900024414, + "eval_train_reconstruction/all": 0.280929297208786, + "eval_train_reconstruction/end_span": 0.7427340149879456, + "eval_train_reconstruction/fim": 0.17432361841201782, + "eval_train_reconstruction/first_seq": 0.15421313047409058, + "eval_train_reconstruction/last_seq": 0.32161974906921387, + "eval_train_reconstruction/second_seq": 0.18120360374450684, + "eval_train_runtime": 440.3686, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 26750 + }, + { + "epoch": 0.09981871488999798, + "grad_norm": 0.4470910429954529, + "learning_rate": 0.0006, + "loss": 2.0815, + "step": 26760 + }, + { + "epoch": 0.09985601635296136, + "grad_norm": 0.5296902656555176, + "learning_rate": 0.0006, + "loss": 2.3093, + "step": 26770 + }, + { + "epoch": 0.09989331781592474, + "grad_norm": 0.2697773277759552, + "learning_rate": 0.0006, + "loss": 2.3086, + "step": 26780 + }, + { + "epoch": 0.09993061927888812, + "grad_norm": 0.3095818758010864, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 26790 + }, + { + "epoch": 0.0999679207418515, + "grad_norm": 0.32954615354537964, + "learning_rate": 0.0006, + "loss": 2.3119, + "step": 26800 + }, + { + "epoch": 0.10000522220481488, + "grad_norm": 0.4575413763523102, + "learning_rate": 0.0006, + "loss": 2.0535, + "step": 26810 + }, + { + "epoch": 0.10004252366777824, + "grad_norm": 0.2824740707874298, + "learning_rate": 0.0006, + "loss": 2.155, + "step": 26820 + }, + { + "epoch": 0.10007982513074162, + "grad_norm": 0.48478248715400696, + "learning_rate": 0.0006, + "loss": 2.3084, + "step": 26830 + }, + { + 
"epoch": 0.100117126593705, + "grad_norm": 0.4848015308380127, + "learning_rate": 0.0006, + "loss": 2.2685, + "step": 26840 + }, + { + "epoch": 0.10015442805666838, + "grad_norm": 0.4370060861110687, + "learning_rate": 0.0006, + "loss": 2.2824, + "step": 26850 + }, + { + "epoch": 0.10019172951963176, + "grad_norm": 0.35447418689727783, + "learning_rate": 0.0006, + "loss": 2.2412, + "step": 26860 + }, + { + "epoch": 0.10022903098259514, + "grad_norm": 0.30166447162628174, + "learning_rate": 0.0006, + "loss": 2.3103, + "step": 26870 + }, + { + "epoch": 0.10026633244555852, + "grad_norm": 0.4310900568962097, + "learning_rate": 0.0006, + "loss": 2.1957, + "step": 26880 + }, + { + "epoch": 0.10030363390852189, + "grad_norm": 0.2866376042366028, + "learning_rate": 0.0006, + "loss": 2.193, + "step": 26890 + }, + { + "epoch": 0.10034093537148527, + "grad_norm": 0.5149939656257629, + "learning_rate": 0.0006, + "loss": 2.0974, + "step": 26900 + }, + { + "epoch": 0.10037823683444864, + "grad_norm": 0.42758265137672424, + "learning_rate": 0.0006, + "loss": 2.2599, + "step": 26910 + }, + { + "epoch": 0.10041553829741202, + "grad_norm": 0.3976931571960449, + "learning_rate": 0.0006, + "loss": 2.2128, + "step": 26920 + }, + { + "epoch": 0.1004528397603754, + "grad_norm": 0.36597877740859985, + "learning_rate": 0.0006, + "loss": 2.3876, + "step": 26930 + }, + { + "epoch": 0.10049014122333878, + "grad_norm": 0.3486170768737793, + "learning_rate": 0.0006, + "loss": 2.2022, + "step": 26940 + }, + { + "epoch": 0.10052744268630216, + "grad_norm": 0.3079548478126526, + "learning_rate": 0.0006, + "loss": 2.0958, + "step": 26950 + }, + { + "epoch": 0.10056474414926553, + "grad_norm": 0.39991018176078796, + "learning_rate": 0.0006, + "loss": 2.3284, + "step": 26960 + }, + { + "epoch": 0.10060204561222891, + "grad_norm": 0.39681753516197205, + "learning_rate": 0.0006, + "loss": 2.2623, + "step": 26970 + }, + { + "epoch": 0.10063934707519229, + "grad_norm": 0.3819544315338135, + 
"learning_rate": 0.0006, + "loss": 2.1903, + "step": 26980 + }, + { + "epoch": 0.10067664853815567, + "grad_norm": 0.475624680519104, + "learning_rate": 0.0006, + "loss": 2.2857, + "step": 26990 + }, + { + "epoch": 0.10071395000111905, + "grad_norm": 0.45471230149269104, + "learning_rate": 0.0006, + "loss": 2.2188, + "step": 27000 + }, + { + "epoch": 0.10071395000111905, + "eval_valid_loss": 2.1972548961639404, + "eval_valid_loss/all": 2.0599091053009033, + "eval_valid_loss/end_span": 1.191144585609436, + "eval_valid_perplexity/batch": 7.845256805419922, + "eval_valid_perplexity/end_span": 3.2908456325531006, + "eval_valid_perplexity/fim": 2.2212576866149902, + "eval_valid_perplexity/first_seq": 14.806439399719238, + "eval_valid_perplexity/last_seq": 9.0185546875, + "eval_valid_perplexity/second_seq": 12.962342262268066, + "eval_valid_perplexity/seq": 8.85043716430664, + "eval_valid_reconstruction/all": 0.2924656569957733, + "eval_valid_reconstruction/end_span": 0.7233386635780334, + "eval_valid_reconstruction/fim": 0.1577453464269638, + "eval_valid_reconstruction/first_seq": 0.16975867748260498, + "eval_valid_reconstruction/last_seq": 0.3275272250175476, + "eval_valid_reconstruction/second_seq": 0.2158035784959793, + "eval_valid_runtime": 443.8065, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 27000 + }, + { + "epoch": 0.10071395000111905, + "eval_train_loss": 2.1934587955474854, + "eval_train_loss/all": 2.0297577381134033, + "eval_train_loss/end_span": 1.1624995470046997, + "eval_train_perplexity/batch": 7.612241744995117, + "eval_train_perplexity/end_span": 3.1979167461395264, + "eval_train_perplexity/fim": 2.241971254348755, + "eval_train_perplexity/first_seq": 15.350506782531738, + "eval_train_perplexity/last_seq": 8.823113441467285, + "eval_train_perplexity/second_seq": 14.439020156860352, + "eval_train_perplexity/seq": 8.770657539367676, + "eval_train_reconstruction/all": 0.2826817035675049, + 
"eval_train_reconstruction/end_span": 0.7334626317024231, + "eval_train_reconstruction/fim": 0.1601077914237976, + "eval_train_reconstruction/first_seq": 0.15346898138523102, + "eval_train_reconstruction/last_seq": 0.32909587025642395, + "eval_train_reconstruction/second_seq": 0.1791718751192093, + "eval_train_runtime": 442.683, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 27000 + }, + { + "epoch": 0.10075125146408243, + "grad_norm": 0.3676218092441559, + "learning_rate": 0.0006, + "loss": 2.3476, + "step": 27010 + }, + { + "epoch": 0.1007885529270458, + "grad_norm": 0.4090222716331482, + "learning_rate": 0.0006, + "loss": 2.2303, + "step": 27020 + }, + { + "epoch": 0.10082585439000917, + "grad_norm": 0.3778429329395294, + "learning_rate": 0.0006, + "loss": 2.3134, + "step": 27030 + }, + { + "epoch": 0.10086315585297255, + "grad_norm": 0.455099493265152, + "learning_rate": 0.0006, + "loss": 2.1148, + "step": 27040 + }, + { + "epoch": 0.10090045731593593, + "grad_norm": 0.23002296686172485, + "learning_rate": 0.0006, + "loss": 2.2715, + "step": 27050 + }, + { + "epoch": 0.10093775877889931, + "grad_norm": 0.5437016487121582, + "learning_rate": 0.0006, + "loss": 2.2985, + "step": 27060 + }, + { + "epoch": 0.10097506024186269, + "grad_norm": 0.32403600215911865, + "learning_rate": 0.0006, + "loss": 2.3546, + "step": 27070 + }, + { + "epoch": 0.10101236170482607, + "grad_norm": 0.37253233790397644, + "learning_rate": 0.0006, + "loss": 2.3653, + "step": 27080 + }, + { + "epoch": 0.10104966316778945, + "grad_norm": 0.23093882203102112, + "learning_rate": 0.0006, + "loss": 2.1192, + "step": 27090 + }, + { + "epoch": 0.10108696463075281, + "grad_norm": 0.3258850574493408, + "learning_rate": 0.0006, + "loss": 2.0909, + "step": 27100 + }, + { + "epoch": 0.10112426609371619, + "grad_norm": 0.4201984405517578, + "learning_rate": 0.0006, + "loss": 2.1986, + "step": 27110 + }, + { + "epoch": 0.10116156755667957, + "grad_norm": 
0.42252328991889954, + "learning_rate": 0.0006, + "loss": 2.095, + "step": 27120 + }, + { + "epoch": 0.10119886901964295, + "grad_norm": 0.468634694814682, + "learning_rate": 0.0006, + "loss": 2.1189, + "step": 27130 + }, + { + "epoch": 0.10123617048260633, + "grad_norm": 0.2923310101032257, + "learning_rate": 0.0006, + "loss": 2.1056, + "step": 27140 + }, + { + "epoch": 0.10127347194556971, + "grad_norm": 0.3354492783546448, + "learning_rate": 0.0006, + "loss": 2.2833, + "step": 27150 + }, + { + "epoch": 0.10131077340853309, + "grad_norm": 0.29145875573158264, + "learning_rate": 0.0006, + "loss": 2.1512, + "step": 27160 + }, + { + "epoch": 0.10134807487149645, + "grad_norm": 0.39014026522636414, + "learning_rate": 0.0006, + "loss": 2.3239, + "step": 27170 + }, + { + "epoch": 0.10138537633445983, + "grad_norm": 0.33353152871131897, + "learning_rate": 0.0006, + "loss": 2.3704, + "step": 27180 + }, + { + "epoch": 0.10142267779742321, + "grad_norm": 0.2671719193458557, + "learning_rate": 0.0006, + "loss": 2.2369, + "step": 27190 + }, + { + "epoch": 0.10145997926038659, + "grad_norm": 0.22683212161064148, + "learning_rate": 0.0006, + "loss": 2.2394, + "step": 27200 + }, + { + "epoch": 0.10149728072334997, + "grad_norm": 0.3314630091190338, + "learning_rate": 0.0006, + "loss": 2.2626, + "step": 27210 + }, + { + "epoch": 0.10153458218631335, + "grad_norm": 0.40592435002326965, + "learning_rate": 0.0006, + "loss": 2.3059, + "step": 27220 + }, + { + "epoch": 0.10157188364927673, + "grad_norm": 0.5629202723503113, + "learning_rate": 0.0006, + "loss": 2.2217, + "step": 27230 + }, + { + "epoch": 0.1016091851122401, + "grad_norm": 0.3339300751686096, + "learning_rate": 0.0006, + "loss": 2.1614, + "step": 27240 + }, + { + "epoch": 0.10164648657520348, + "grad_norm": 0.4116133153438568, + "learning_rate": 0.0006, + "loss": 2.1578, + "step": 27250 + }, + { + "epoch": 0.10164648657520348, + "eval_valid_loss": 2.195249319076538, + "eval_valid_loss/all": 2.057738780975342, + 
"eval_valid_loss/end_span": 1.2008905410766602, + "eval_valid_perplexity/batch": 7.828248500823975, + "eval_valid_perplexity/end_span": 3.32307505607605, + "eval_valid_perplexity/fim": 2.291179895401001, + "eval_valid_perplexity/first_seq": 14.805352210998535, + "eval_valid_perplexity/last_seq": 8.717802047729492, + "eval_valid_perplexity/second_seq": 13.657581329345703, + "eval_valid_perplexity/seq": 8.821375846862793, + "eval_valid_reconstruction/all": 0.2930072546005249, + "eval_valid_reconstruction/end_span": 0.7189689874649048, + "eval_valid_reconstruction/fim": 0.16564030945301056, + "eval_valid_reconstruction/first_seq": 0.16963434219360352, + "eval_valid_reconstruction/last_seq": 0.33530229330062866, + "eval_valid_reconstruction/second_seq": 0.2029103934764862, + "eval_valid_runtime": 442.0403, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 27250 + }, + { + "epoch": 0.10164648657520348, + "eval_train_loss": 2.1916210651397705, + "eval_train_loss/all": 2.0274133682250977, + "eval_train_loss/end_span": 1.1634289026260376, + "eval_train_perplexity/batch": 7.594417095184326, + "eval_train_perplexity/end_span": 3.200890064239502, + "eval_train_perplexity/fim": 1.9021044969558716, + "eval_train_perplexity/first_seq": 15.440180778503418, + "eval_train_perplexity/last_seq": 9.043067932128906, + "eval_train_perplexity/second_seq": 14.422323226928711, + "eval_train_perplexity/seq": 8.738907814025879, + "eval_train_reconstruction/all": 0.2834167778491974, + "eval_train_reconstruction/end_span": 0.7294660806655884, + "eval_train_reconstruction/fim": 0.12743644416332245, + "eval_train_reconstruction/first_seq": 0.15208493173122406, + "eval_train_reconstruction/last_seq": 0.3260824978351593, + "eval_train_reconstruction/second_seq": 0.17939458787441254, + "eval_train_runtime": 442.2719, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 27250 + }, + { + "epoch": 0.10168378803816686, + 
"grad_norm": 0.35011881589889526, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 27260 + }, + { + "epoch": 0.10172108950113024, + "grad_norm": 0.31001484394073486, + "learning_rate": 0.0006, + "loss": 2.1501, + "step": 27270 + }, + { + "epoch": 0.10175839096409361, + "grad_norm": 0.3237469494342804, + "learning_rate": 0.0006, + "loss": 2.2818, + "step": 27280 + }, + { + "epoch": 0.101795692427057, + "grad_norm": 0.2148105651140213, + "learning_rate": 0.0006, + "loss": 2.3114, + "step": 27290 + }, + { + "epoch": 0.10183299389002037, + "grad_norm": 0.39826369285583496, + "learning_rate": 0.0006, + "loss": 2.2512, + "step": 27300 + }, + { + "epoch": 0.10187029535298374, + "grad_norm": 0.2761245667934418, + "learning_rate": 0.0006, + "loss": 2.262, + "step": 27310 + }, + { + "epoch": 0.10190759681594712, + "grad_norm": 0.3587057590484619, + "learning_rate": 0.0006, + "loss": 2.2253, + "step": 27320 + }, + { + "epoch": 0.1019448982789105, + "grad_norm": 0.3476635217666626, + "learning_rate": 0.0006, + "loss": 2.1226, + "step": 27330 + }, + { + "epoch": 0.10198219974187388, + "grad_norm": 0.6561121940612793, + "learning_rate": 0.0006, + "loss": 2.1611, + "step": 27340 + }, + { + "epoch": 0.10201950120483726, + "grad_norm": 0.3096863329410553, + "learning_rate": 0.0006, + "loss": 2.3534, + "step": 27350 + }, + { + "epoch": 0.10205680266780064, + "grad_norm": 0.5114722847938538, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 27360 + }, + { + "epoch": 0.102094104130764, + "grad_norm": 0.37125465273857117, + "learning_rate": 0.0006, + "loss": 2.1751, + "step": 27370 + }, + { + "epoch": 0.10213140559372738, + "grad_norm": 0.3139439523220062, + "learning_rate": 0.0006, + "loss": 2.2556, + "step": 27380 + }, + { + "epoch": 0.10216870705669076, + "grad_norm": 0.39509856700897217, + "learning_rate": 0.0006, + "loss": 2.2377, + "step": 27390 + }, + { + "epoch": 0.10220600851965414, + "grad_norm": 0.7116186618804932, + "learning_rate": 0.0006, + "loss": 2.3073, + 
"step": 27400 + }, + { + "epoch": 0.10224330998261752, + "grad_norm": 0.43431922793388367, + "learning_rate": 0.0006, + "loss": 2.0616, + "step": 27410 + }, + { + "epoch": 0.1022806114455809, + "grad_norm": 1.0669350624084473, + "learning_rate": 0.0006, + "loss": 2.2557, + "step": 27420 + }, + { + "epoch": 0.10231791290854428, + "grad_norm": 0.2454005628824234, + "learning_rate": 0.0006, + "loss": 2.1944, + "step": 27430 + }, + { + "epoch": 0.10235521437150764, + "grad_norm": 0.3424343466758728, + "learning_rate": 0.0006, + "loss": 2.2092, + "step": 27440 + }, + { + "epoch": 0.10239251583447102, + "grad_norm": 0.4036269783973694, + "learning_rate": 0.0006, + "loss": 2.0777, + "step": 27450 + }, + { + "epoch": 0.1024298172974344, + "grad_norm": 0.3414200246334076, + "learning_rate": 0.0006, + "loss": 2.237, + "step": 27460 + }, + { + "epoch": 0.10246711876039778, + "grad_norm": 0.3154274821281433, + "learning_rate": 0.0006, + "loss": 2.35, + "step": 27470 + }, + { + "epoch": 0.10250442022336116, + "grad_norm": 0.4371702969074249, + "learning_rate": 0.0006, + "loss": 2.2455, + "step": 27480 + }, + { + "epoch": 0.10254172168632454, + "grad_norm": 0.4134783446788788, + "learning_rate": 0.0006, + "loss": 2.131, + "step": 27490 + }, + { + "epoch": 0.10257902314928792, + "grad_norm": 0.390781968832016, + "learning_rate": 0.0006, + "loss": 2.2774, + "step": 27500 + }, + { + "epoch": 0.10257902314928792, + "eval_valid_loss": 2.1971747875213623, + "eval_valid_loss/all": 2.0596470832824707, + "eval_valid_loss/end_span": 1.3387067317962646, + "eval_valid_perplexity/batch": 7.843201160430908, + "eval_valid_perplexity/end_span": 3.814107656478882, + "eval_valid_perplexity/fim": 2.6214137077331543, + "eval_valid_perplexity/first_seq": 14.917961120605469, + "eval_valid_perplexity/last_seq": 9.035022735595703, + "eval_valid_perplexity/second_seq": 13.946064949035645, + "eval_valid_perplexity/seq": 8.847983360290527, + "eval_valid_reconstruction/all": 0.29259443283081055, + 
"eval_valid_reconstruction/end_span": 0.6922488212585449, + "eval_valid_reconstruction/fim": 0.1920900046825409, + "eval_valid_reconstruction/first_seq": 0.16738677024841309, + "eval_valid_reconstruction/last_seq": 0.3278651237487793, + "eval_valid_reconstruction/second_seq": 0.19311796128749847, + "eval_valid_runtime": 441.8736, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 27500 + }, + { + "epoch": 0.10257902314928792, + "eval_train_loss": 2.193655014038086, + "eval_train_loss/all": 2.029796838760376, + "eval_train_loss/end_span": 1.2971251010894775, + "eval_train_perplexity/batch": 7.612539768218994, + "eval_train_perplexity/end_span": 3.6587629318237305, + "eval_train_perplexity/fim": 2.1496639251708984, + "eval_train_perplexity/first_seq": 15.428781509399414, + "eval_train_perplexity/last_seq": 8.923772811889648, + "eval_train_perplexity/second_seq": 14.38512134552002, + "eval_train_perplexity/seq": 8.769010543823242, + "eval_train_reconstruction/all": 0.2826666533946991, + "eval_train_reconstruction/end_span": 0.7053354978561401, + "eval_train_reconstruction/fim": 0.15268775820732117, + "eval_train_reconstruction/first_seq": 0.15136265754699707, + "eval_train_reconstruction/last_seq": 0.32637426257133484, + "eval_train_reconstruction/second_seq": 0.17873911559581757, + "eval_train_runtime": 447.3329, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 27500 + }, + { + "epoch": 0.10261632461225129, + "grad_norm": 0.3247007429599762, + "learning_rate": 0.0006, + "loss": 2.2987, + "step": 27510 + }, + { + "epoch": 0.10265362607521467, + "grad_norm": 0.2909890115261078, + "learning_rate": 0.0006, + "loss": 2.2035, + "step": 27520 + }, + { + "epoch": 0.10269092753817805, + "grad_norm": 0.3047213554382324, + "learning_rate": 0.0006, + "loss": 2.0503, + "step": 27530 + }, + { + "epoch": 0.10272822900114142, + "grad_norm": 0.30406832695007324, + "learning_rate": 0.0006, + "loss": 
2.3419, + "step": 27540 + }, + { + "epoch": 0.1027655304641048, + "grad_norm": 0.4739646017551422, + "learning_rate": 0.0006, + "loss": 2.2167, + "step": 27550 + }, + { + "epoch": 0.10280283192706818, + "grad_norm": 0.32217615842819214, + "learning_rate": 0.0006, + "loss": 2.3494, + "step": 27560 + }, + { + "epoch": 0.10284013339003156, + "grad_norm": 0.28573042154312134, + "learning_rate": 0.0006, + "loss": 2.2488, + "step": 27570 + }, + { + "epoch": 0.10287743485299493, + "grad_norm": 0.5073219537734985, + "learning_rate": 0.0006, + "loss": 2.255, + "step": 27580 + }, + { + "epoch": 0.10291473631595831, + "grad_norm": 0.3889826834201813, + "learning_rate": 0.0006, + "loss": 2.2425, + "step": 27590 + }, + { + "epoch": 0.10295203777892169, + "grad_norm": 0.31023770570755005, + "learning_rate": 0.0006, + "loss": 2.2687, + "step": 27600 + }, + { + "epoch": 0.10298933924188507, + "grad_norm": 0.42934122681617737, + "learning_rate": 0.0006, + "loss": 2.1402, + "step": 27610 + }, + { + "epoch": 0.10302664070484845, + "grad_norm": 0.36889463663101196, + "learning_rate": 0.0006, + "loss": 2.2557, + "step": 27620 + }, + { + "epoch": 0.10306394216781183, + "grad_norm": 0.3787262439727783, + "learning_rate": 0.0006, + "loss": 2.2558, + "step": 27630 + }, + { + "epoch": 0.1031012436307752, + "grad_norm": 0.3231115937232971, + "learning_rate": 0.0006, + "loss": 2.3097, + "step": 27640 + }, + { + "epoch": 0.10313854509373857, + "grad_norm": 0.3856291174888611, + "learning_rate": 0.0006, + "loss": 2.341, + "step": 27650 + }, + { + "epoch": 0.10317584655670195, + "grad_norm": 0.36896374821662903, + "learning_rate": 0.0006, + "loss": 2.2671, + "step": 27660 + }, + { + "epoch": 0.10321314801966533, + "grad_norm": 0.3221743106842041, + "learning_rate": 0.0006, + "loss": 2.2863, + "step": 27670 + }, + { + "epoch": 0.10325044948262871, + "grad_norm": 0.41990524530410767, + "learning_rate": 0.0006, + "loss": 2.2198, + "step": 27680 + }, + { + "epoch": 0.10328775094559209, + 
"grad_norm": 0.3597131371498108, + "learning_rate": 0.0006, + "loss": 2.1375, + "step": 27690 + }, + { + "epoch": 0.10332505240855547, + "grad_norm": 0.5255808234214783, + "learning_rate": 0.0006, + "loss": 2.2463, + "step": 27700 + }, + { + "epoch": 0.10336235387151885, + "grad_norm": 0.33636584877967834, + "learning_rate": 0.0006, + "loss": 2.2096, + "step": 27710 + }, + { + "epoch": 0.10339965533448221, + "grad_norm": 0.3525097966194153, + "learning_rate": 0.0006, + "loss": 2.1736, + "step": 27720 + }, + { + "epoch": 0.10343695679744559, + "grad_norm": 0.23813028633594513, + "learning_rate": 0.0006, + "loss": 2.3585, + "step": 27730 + }, + { + "epoch": 0.10347425826040897, + "grad_norm": 0.3064330816268921, + "learning_rate": 0.0006, + "loss": 2.3137, + "step": 27740 + }, + { + "epoch": 0.10351155972337235, + "grad_norm": 0.3257289528846741, + "learning_rate": 0.0006, + "loss": 2.044, + "step": 27750 + }, + { + "epoch": 0.10351155972337235, + "eval_valid_loss": 2.195061683654785, + "eval_valid_loss/all": 2.057380199432373, + "eval_valid_loss/end_span": 1.3622925281524658, + "eval_valid_perplexity/batch": 7.825441837310791, + "eval_valid_perplexity/end_span": 3.9051356315612793, + "eval_valid_perplexity/fim": 2.264561653137207, + "eval_valid_perplexity/first_seq": 14.787480354309082, + "eval_valid_perplexity/last_seq": 8.97586727142334, + "eval_valid_perplexity/second_seq": 13.825776100158691, + "eval_valid_perplexity/seq": 8.821860313415527, + "eval_valid_reconstruction/all": 0.29332980513572693, + "eval_valid_reconstruction/end_span": 0.6820749044418335, + "eval_valid_reconstruction/fim": 0.16185243427753448, + "eval_valid_reconstruction/first_seq": 0.1713990718126297, + "eval_valid_reconstruction/last_seq": 0.32764679193496704, + "eval_valid_reconstruction/second_seq": 0.1944127082824707, + "eval_valid_runtime": 439.4456, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 27750 + }, + { + "epoch": 0.10351155972337235, + 
"eval_train_loss": 2.1920528411865234, + "eval_train_loss/all": 2.028151035308838, + "eval_train_loss/end_span": 1.326572299003601, + "eval_train_perplexity/batch": 7.6000213623046875, + "eval_train_perplexity/end_span": 3.7681052684783936, + "eval_train_perplexity/fim": 2.0792007446289062, + "eval_train_perplexity/first_seq": 15.587900161743164, + "eval_train_perplexity/last_seq": 9.008462905883789, + "eval_train_perplexity/second_seq": 14.113730430603027, + "eval_train_perplexity/seq": 8.749836921691895, + "eval_train_reconstruction/all": 0.28322118520736694, + "eval_train_reconstruction/end_span": 0.6915829181671143, + "eval_train_reconstruction/fim": 0.1454506814479828, + "eval_train_reconstruction/first_seq": 0.14999103546142578, + "eval_train_reconstruction/last_seq": 0.3250427842140198, + "eval_train_reconstruction/second_seq": 0.18717148900032043, + "eval_train_runtime": 439.4782, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 27750 + }, + { + "epoch": 0.10354886118633573, + "grad_norm": 0.5571143627166748, + "learning_rate": 0.0006, + "loss": 2.1451, + "step": 27760 + }, + { + "epoch": 0.10358616264929911, + "grad_norm": 0.23656992614269257, + "learning_rate": 0.0006, + "loss": 2.3338, + "step": 27770 + }, + { + "epoch": 0.10362346411226249, + "grad_norm": 0.2488403618335724, + "learning_rate": 0.0006, + "loss": 2.2295, + "step": 27780 + }, + { + "epoch": 0.10366076557522585, + "grad_norm": 0.3307616710662842, + "learning_rate": 0.0006, + "loss": 2.4122, + "step": 27790 + }, + { + "epoch": 0.10369806703818923, + "grad_norm": 0.3959094285964966, + "learning_rate": 0.0006, + "loss": 2.1223, + "step": 27800 + }, + { + "epoch": 0.10373536850115261, + "grad_norm": 0.3857887089252472, + "learning_rate": 0.0006, + "loss": 2.3659, + "step": 27810 + }, + { + "epoch": 0.103772669964116, + "grad_norm": 0.24710172414779663, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 27820 + }, + { + "epoch": 0.10380997142707937, 
+ "grad_norm": 0.47801777720451355, + "learning_rate": 0.0006, + "loss": 2.2098, + "step": 27830 + }, + { + "epoch": 0.10384727289004275, + "grad_norm": 0.42160722613334656, + "learning_rate": 0.0006, + "loss": 2.1536, + "step": 27840 + }, + { + "epoch": 0.10388457435300613, + "grad_norm": 0.29022935032844543, + "learning_rate": 0.0006, + "loss": 2.14, + "step": 27850 + }, + { + "epoch": 0.1039218758159695, + "grad_norm": 0.3811284303665161, + "learning_rate": 0.0006, + "loss": 2.149, + "step": 27860 + }, + { + "epoch": 0.10395917727893288, + "grad_norm": 0.3163352608680725, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 27870 + }, + { + "epoch": 0.10399647874189626, + "grad_norm": 0.2762415111064911, + "learning_rate": 0.0006, + "loss": 2.3755, + "step": 27880 + }, + { + "epoch": 0.10403378020485964, + "grad_norm": 0.4025452435016632, + "learning_rate": 0.0006, + "loss": 2.2195, + "step": 27890 + }, + { + "epoch": 0.10407108166782301, + "grad_norm": 0.4374993145465851, + "learning_rate": 0.0006, + "loss": 2.2951, + "step": 27900 + }, + { + "epoch": 0.1041083831307864, + "grad_norm": 0.3735571801662445, + "learning_rate": 0.0006, + "loss": 2.157, + "step": 27910 + }, + { + "epoch": 0.10414568459374976, + "grad_norm": 0.45061102509498596, + "learning_rate": 0.0006, + "loss": 2.0118, + "step": 27920 + }, + { + "epoch": 0.10418298605671314, + "grad_norm": 7.0126953125, + "learning_rate": 0.0006, + "loss": 2.2223, + "step": 27930 + }, + { + "epoch": 0.10422028751967652, + "grad_norm": 0.26461610198020935, + "learning_rate": 0.0006, + "loss": 2.2713, + "step": 27940 + }, + { + "epoch": 0.1042575889826399, + "grad_norm": 0.44193923473358154, + "learning_rate": 0.0006, + "loss": 2.2223, + "step": 27950 + }, + { + "epoch": 0.10429489044560328, + "grad_norm": 0.44670769572257996, + "learning_rate": 0.0006, + "loss": 2.1515, + "step": 27960 + }, + { + "epoch": 0.10433219190856666, + "grad_norm": 0.2800580561161041, + "learning_rate": 0.0006, + "loss": 2.3002, + 
"step": 27970 + }, + { + "epoch": 0.10436949337153004, + "grad_norm": 0.29681217670440674, + "learning_rate": 0.0006, + "loss": 2.3515, + "step": 27980 + }, + { + "epoch": 0.1044067948344934, + "grad_norm": 0.460275262594223, + "learning_rate": 0.0006, + "loss": 2.1632, + "step": 27990 + }, + { + "epoch": 0.10444409629745678, + "grad_norm": 0.41485336422920227, + "learning_rate": 0.0006, + "loss": 2.2347, + "step": 28000 + }, + { + "epoch": 0.10444409629745678, + "eval_valid_loss": 2.194765329360962, + "eval_valid_loss/all": 2.0573086738586426, + "eval_valid_loss/end_span": 1.247881293296814, + "eval_valid_perplexity/batch": 7.8248820304870605, + "eval_valid_perplexity/end_span": 3.4829556941986084, + "eval_valid_perplexity/fim": 2.46462345123291, + "eval_valid_perplexity/first_seq": 14.963316917419434, + "eval_valid_perplexity/last_seq": 9.125195503234863, + "eval_valid_perplexity/second_seq": 13.65591049194336, + "eval_valid_perplexity/seq": 8.820394515991211, + "eval_valid_reconstruction/all": 0.2929205298423767, + "eval_valid_reconstruction/end_span": 0.7048164010047913, + "eval_valid_reconstruction/fim": 0.17869597673416138, + "eval_valid_reconstruction/first_seq": 0.16423329710960388, + "eval_valid_reconstruction/last_seq": 0.3182559311389923, + "eval_valid_reconstruction/second_seq": 0.19661445915699005, + "eval_valid_runtime": 441.8852, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 28000 + }, + { + "epoch": 0.10444409629745678, + "eval_train_loss": 2.1914479732513428, + "eval_train_loss/all": 2.0276474952697754, + "eval_train_loss/end_span": 1.215867280960083, + "eval_train_perplexity/batch": 7.596195220947266, + "eval_train_perplexity/end_span": 3.373218297958374, + "eval_train_perplexity/fim": 2.0960304737091064, + "eval_train_perplexity/first_seq": 15.611227035522461, + "eval_train_perplexity/last_seq": 8.998794555664062, + "eval_train_perplexity/second_seq": 13.846027374267578, + "eval_train_perplexity/seq": 
8.74703311920166, + "eval_train_reconstruction/all": 0.2831277549266815, + "eval_train_reconstruction/end_span": 0.7140493392944336, + "eval_train_reconstruction/fim": 0.14760272204875946, + "eval_train_reconstruction/first_seq": 0.14961372315883636, + "eval_train_reconstruction/last_seq": 0.3251195549964905, + "eval_train_reconstruction/second_seq": 0.1949710249900818, + "eval_train_runtime": 442.0471, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 28000 + }, + { + "epoch": 0.10448139776042016, + "grad_norm": 0.45212826132774353, + "learning_rate": 0.0006, + "loss": 2.3477, + "step": 28010 + }, + { + "epoch": 0.10451869922338354, + "grad_norm": 0.4055435061454773, + "learning_rate": 0.0006, + "loss": 2.2666, + "step": 28020 + }, + { + "epoch": 0.10455600068634692, + "grad_norm": 0.2704727053642273, + "learning_rate": 0.0006, + "loss": 2.0861, + "step": 28030 + }, + { + "epoch": 0.1045933021493103, + "grad_norm": 0.46742933988571167, + "learning_rate": 0.0006, + "loss": 2.1287, + "step": 28040 + }, + { + "epoch": 0.10463060361227368, + "grad_norm": 0.3497444987297058, + "learning_rate": 0.0006, + "loss": 2.1399, + "step": 28050 + }, + { + "epoch": 0.10466790507523704, + "grad_norm": 0.26861271262168884, + "learning_rate": 0.0006, + "loss": 2.1934, + "step": 28060 + }, + { + "epoch": 0.10470520653820042, + "grad_norm": 0.4788433015346527, + "learning_rate": 0.0006, + "loss": 2.2158, + "step": 28070 + }, + { + "epoch": 0.1047425080011638, + "grad_norm": 0.4826478958129883, + "learning_rate": 0.0006, + "loss": 2.0921, + "step": 28080 + }, + { + "epoch": 0.10477980946412718, + "grad_norm": 0.3087609112262726, + "learning_rate": 0.0006, + "loss": 2.2839, + "step": 28090 + }, + { + "epoch": 0.10481711092709056, + "grad_norm": 0.33832409977912903, + "learning_rate": 0.0006, + "loss": 2.4022, + "step": 28100 + }, + { + "epoch": 0.10485441239005394, + "grad_norm": 0.5571007132530212, + "learning_rate": 0.0006, + "loss": 2.4237, + 
"step": 28110 + }, + { + "epoch": 0.10489171385301732, + "grad_norm": 0.5758800506591797, + "learning_rate": 0.0006, + "loss": 2.1906, + "step": 28120 + }, + { + "epoch": 0.10492901531598069, + "grad_norm": 0.2519943416118622, + "learning_rate": 0.0006, + "loss": 2.2139, + "step": 28130 + }, + { + "epoch": 0.10496631677894407, + "grad_norm": 0.24192628264427185, + "learning_rate": 0.0006, + "loss": 2.3546, + "step": 28140 + }, + { + "epoch": 0.10500361824190745, + "grad_norm": 0.3489873707294464, + "learning_rate": 0.0006, + "loss": 2.2725, + "step": 28150 + }, + { + "epoch": 0.10504091970487082, + "grad_norm": 0.26521560549736023, + "learning_rate": 0.0006, + "loss": 2.2457, + "step": 28160 + }, + { + "epoch": 0.1050782211678342, + "grad_norm": 0.4211449921131134, + "learning_rate": 0.0006, + "loss": 2.3079, + "step": 28170 + }, + { + "epoch": 0.10511552263079758, + "grad_norm": 0.2212926745414734, + "learning_rate": 0.0006, + "loss": 2.3025, + "step": 28180 + }, + { + "epoch": 0.10515282409376096, + "grad_norm": 0.48224422335624695, + "learning_rate": 0.0006, + "loss": 2.2656, + "step": 28190 + }, + { + "epoch": 0.10519012555672433, + "grad_norm": 0.34217947721481323, + "learning_rate": 0.0006, + "loss": 2.1657, + "step": 28200 + }, + { + "epoch": 0.10522742701968771, + "grad_norm": 0.3018058240413666, + "learning_rate": 0.0006, + "loss": 2.16, + "step": 28210 + }, + { + "epoch": 0.10526472848265109, + "grad_norm": 0.4590214192867279, + "learning_rate": 0.0006, + "loss": 2.3996, + "step": 28220 + }, + { + "epoch": 0.10530202994561447, + "grad_norm": 0.3270617127418518, + "learning_rate": 0.0006, + "loss": 2.2988, + "step": 28230 + }, + { + "epoch": 0.10533933140857785, + "grad_norm": 0.3503461480140686, + "learning_rate": 0.0006, + "loss": 2.3123, + "step": 28240 + }, + { + "epoch": 0.10537663287154123, + "grad_norm": 0.29843035340309143, + "learning_rate": 0.0006, + "loss": 2.2362, + "step": 28250 + }, + { + "epoch": 0.10537663287154123, + "eval_valid_loss": 
2.1953580379486084, + "eval_valid_loss/all": 2.0567286014556885, + "eval_valid_loss/end_span": 1.2583292722702026, + "eval_valid_perplexity/batch": 7.8203444480896, + "eval_valid_perplexity/end_span": 3.5195364952087402, + "eval_valid_perplexity/fim": 2.6772170066833496, + "eval_valid_perplexity/first_seq": 14.879409790039062, + "eval_valid_perplexity/last_seq": 8.754284858703613, + "eval_valid_perplexity/second_seq": 14.301114082336426, + "eval_valid_perplexity/seq": 8.81092643737793, + "eval_valid_reconstruction/all": 0.2934790551662445, + "eval_valid_reconstruction/end_span": 0.7031306624412537, + "eval_valid_reconstruction/fim": 0.19622217118740082, + "eval_valid_reconstruction/first_seq": 0.1708831936120987, + "eval_valid_reconstruction/last_seq": 0.3367294669151306, + "eval_valid_reconstruction/second_seq": 0.17883457243442535, + "eval_valid_runtime": 442.5925, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 28250 + }, + { + "epoch": 0.10537663287154123, + "eval_train_loss": 2.193936586380005, + "eval_train_loss/all": 2.0297787189483643, + "eval_train_loss/end_span": 1.2277593612670898, + "eval_train_perplexity/batch": 7.612401485443115, + "eval_train_perplexity/end_span": 3.413572311401367, + "eval_train_perplexity/fim": 2.051745653152466, + "eval_train_perplexity/first_seq": 15.274992942810059, + "eval_train_perplexity/last_seq": 8.68233585357666, + "eval_train_perplexity/second_seq": 13.94129467010498, + "eval_train_perplexity/seq": 8.76419734954834, + "eval_train_reconstruction/all": 0.28268975019454956, + "eval_train_reconstruction/end_span": 0.7116473913192749, + "eval_train_reconstruction/fim": 0.1426578164100647, + "eval_train_reconstruction/first_seq": 0.15497811138629913, + "eval_train_reconstruction/last_seq": 0.3345572352409363, + "eval_train_reconstruction/second_seq": 0.19043874740600586, + "eval_train_runtime": 444.0992, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, 
+ "step": 28250 + }, + { + "epoch": 0.1054139343345046, + "grad_norm": 0.3252033293247223, + "learning_rate": 0.0006, + "loss": 2.3003, + "step": 28260 + }, + { + "epoch": 0.10545123579746797, + "grad_norm": 0.40976613759994507, + "learning_rate": 0.0006, + "loss": 2.1864, + "step": 28270 + }, + { + "epoch": 0.10548853726043135, + "grad_norm": 0.6237596273422241, + "learning_rate": 0.0006, + "loss": 2.0134, + "step": 28280 + }, + { + "epoch": 0.10552583872339473, + "grad_norm": 0.4428408741950989, + "learning_rate": 0.0006, + "loss": 2.1413, + "step": 28290 + }, + { + "epoch": 0.10556314018635811, + "grad_norm": 0.3151623606681824, + "learning_rate": 0.0006, + "loss": 2.2777, + "step": 28300 + }, + { + "epoch": 0.10560044164932149, + "grad_norm": 0.8705333471298218, + "learning_rate": 0.0006, + "loss": 2.1965, + "step": 28310 + }, + { + "epoch": 0.10563774311228487, + "grad_norm": 0.419382780790329, + "learning_rate": 0.0006, + "loss": 2.2662, + "step": 28320 + }, + { + "epoch": 0.10567504457524825, + "grad_norm": 0.6069053411483765, + "learning_rate": 0.0006, + "loss": 2.205, + "step": 28330 + }, + { + "epoch": 0.10571234603821161, + "grad_norm": 0.2499803751707077, + "learning_rate": 0.0006, + "loss": 2.1638, + "step": 28340 + }, + { + "epoch": 0.10574964750117499, + "grad_norm": 0.3657747507095337, + "learning_rate": 0.0006, + "loss": 2.2757, + "step": 28350 + }, + { + "epoch": 0.10578694896413837, + "grad_norm": 0.3142489194869995, + "learning_rate": 0.0006, + "loss": 2.2517, + "step": 28360 + }, + { + "epoch": 0.10582425042710175, + "grad_norm": 0.37293246388435364, + "learning_rate": 0.0006, + "loss": 2.2983, + "step": 28370 + }, + { + "epoch": 0.10586155189006513, + "grad_norm": 0.3381359279155731, + "learning_rate": 0.0006, + "loss": 2.3495, + "step": 28380 + }, + { + "epoch": 0.10589885335302851, + "grad_norm": 0.25923392176628113, + "learning_rate": 0.0006, + "loss": 2.2208, + "step": 28390 + }, + { + "epoch": 0.10593615481599189, + "grad_norm": 
0.36352798342704773, + "learning_rate": 0.0006, + "loss": 2.2165, + "step": 28400 + }, + { + "epoch": 0.10597345627895526, + "grad_norm": 0.33414873480796814, + "learning_rate": 0.0006, + "loss": 2.1627, + "step": 28410 + }, + { + "epoch": 0.10601075774191863, + "grad_norm": 0.3262239992618561, + "learning_rate": 0.0006, + "loss": 2.2359, + "step": 28420 + }, + { + "epoch": 0.10604805920488201, + "grad_norm": 0.2883182764053345, + "learning_rate": 0.0006, + "loss": 2.3632, + "step": 28430 + }, + { + "epoch": 0.1060853606678454, + "grad_norm": 0.3803117871284485, + "learning_rate": 0.0006, + "loss": 2.2867, + "step": 28440 + }, + { + "epoch": 0.10612266213080877, + "grad_norm": 0.409274697303772, + "learning_rate": 0.0006, + "loss": 2.0097, + "step": 28450 + }, + { + "epoch": 0.10615996359377215, + "grad_norm": 0.4343721866607666, + "learning_rate": 0.0006, + "loss": 2.3617, + "step": 28460 + }, + { + "epoch": 0.10619726505673553, + "grad_norm": 0.34245964884757996, + "learning_rate": 0.0006, + "loss": 2.1459, + "step": 28470 + }, + { + "epoch": 0.1062345665196989, + "grad_norm": 0.2831079661846161, + "learning_rate": 0.0006, + "loss": 2.2332, + "step": 28480 + }, + { + "epoch": 0.10627186798266228, + "grad_norm": 0.3522815704345703, + "learning_rate": 0.0006, + "loss": 2.1962, + "step": 28490 + }, + { + "epoch": 0.10630916944562566, + "grad_norm": 0.36888134479522705, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 28500 + }, + { + "epoch": 0.10630916944562566, + "eval_valid_loss": 2.194746971130371, + "eval_valid_loss/all": 2.0573410987854004, + "eval_valid_loss/end_span": 1.3075722455978394, + "eval_valid_perplexity/batch": 7.825135707855225, + "eval_valid_perplexity/end_span": 3.6971869468688965, + "eval_valid_perplexity/fim": 2.3241610527038574, + "eval_valid_perplexity/first_seq": 15.059215545654297, + "eval_valid_perplexity/last_seq": 9.151542663574219, + "eval_valid_perplexity/second_seq": 13.91821575164795, + "eval_valid_perplexity/seq": 
8.820334434509277, + "eval_valid_reconstruction/all": 0.2929764986038208, + "eval_valid_reconstruction/end_span": 0.6971324682235718, + "eval_valid_reconstruction/fim": 0.16795901954174042, + "eval_valid_reconstruction/first_seq": 0.162258118391037, + "eval_valid_reconstruction/last_seq": 0.32239779829978943, + "eval_valid_reconstruction/second_seq": 0.18662086129188538, + "eval_valid_runtime": 448.7445, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 28500 + }, + { + "epoch": 0.10630916944562566, + "eval_train_loss": 2.193859815597534, + "eval_train_loss/all": 2.0299806594848633, + "eval_train_loss/end_span": 1.2776870727539062, + "eval_train_perplexity/batch": 7.61393928527832, + "eval_train_perplexity/end_span": 3.5883305072784424, + "eval_train_perplexity/fim": 2.1666297912597656, + "eval_train_perplexity/first_seq": 15.593862533569336, + "eval_train_perplexity/last_seq": 9.008819580078125, + "eval_train_perplexity/second_seq": 14.36169147491455, + "eval_train_perplexity/seq": 8.768158912658691, + "eval_train_reconstruction/all": 0.282551109790802, + "eval_train_reconstruction/end_span": 0.7078232765197754, + "eval_train_reconstruction/fim": 0.15292882919311523, + "eval_train_reconstruction/first_seq": 0.1502695381641388, + "eval_train_reconstruction/last_seq": 0.32647407054901123, + "eval_train_reconstruction/second_seq": 0.1801677793264389, + "eval_train_runtime": 439.5136, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 28500 + }, + { + "epoch": 0.10634647090858904, + "grad_norm": 0.3795298635959625, + "learning_rate": 0.0006, + "loss": 2.4084, + "step": 28510 + }, + { + "epoch": 0.10638377237155242, + "grad_norm": 0.46410298347473145, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 28520 + }, + { + "epoch": 0.1064210738345158, + "grad_norm": 0.281143456697464, + "learning_rate": 0.0006, + "loss": 2.2341, + "step": 28530 + }, + { + "epoch": 0.10645837529747916, + 
"grad_norm": 0.4221750497817993, + "learning_rate": 0.0006, + "loss": 2.1793, + "step": 28540 + }, + { + "epoch": 0.10649567676044254, + "grad_norm": 0.2883257567882538, + "learning_rate": 0.0006, + "loss": 2.2365, + "step": 28550 + }, + { + "epoch": 0.10653297822340592, + "grad_norm": 0.3474821150302887, + "learning_rate": 0.0006, + "loss": 2.1778, + "step": 28560 + }, + { + "epoch": 0.1065702796863693, + "grad_norm": 0.29629525542259216, + "learning_rate": 0.0006, + "loss": 2.2659, + "step": 28570 + }, + { + "epoch": 0.10660758114933268, + "grad_norm": 0.38108357787132263, + "learning_rate": 0.0006, + "loss": 2.3492, + "step": 28580 + }, + { + "epoch": 0.10664488261229606, + "grad_norm": 0.5838282108306885, + "learning_rate": 0.0006, + "loss": 2.1383, + "step": 28590 + }, + { + "epoch": 0.10668218407525944, + "grad_norm": 0.5610435009002686, + "learning_rate": 0.0006, + "loss": 2.2897, + "step": 28600 + }, + { + "epoch": 0.1067194855382228, + "grad_norm": 1.1147428750991821, + "learning_rate": 0.0006, + "loss": 2.3554, + "step": 28610 + }, + { + "epoch": 0.10675678700118618, + "grad_norm": 0.2994607090950012, + "learning_rate": 0.0006, + "loss": 2.1552, + "step": 28620 + }, + { + "epoch": 0.10679408846414956, + "grad_norm": 0.35476791858673096, + "learning_rate": 0.0006, + "loss": 2.1656, + "step": 28630 + }, + { + "epoch": 0.10683138992711294, + "grad_norm": 0.3317109942436218, + "learning_rate": 0.0006, + "loss": 2.2017, + "step": 28640 + }, + { + "epoch": 0.10686869139007632, + "grad_norm": 0.5390551686286926, + "learning_rate": 0.0006, + "loss": 2.1547, + "step": 28650 + }, + { + "epoch": 0.1069059928530397, + "grad_norm": 0.3872267007827759, + "learning_rate": 0.0006, + "loss": 2.0972, + "step": 28660 + }, + { + "epoch": 0.10694329431600308, + "grad_norm": 0.37798574566841125, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 28670 + }, + { + "epoch": 0.10698059577896644, + "grad_norm": 0.3920479416847229, + "learning_rate": 0.0006, + "loss": 2.1032, + 
"step": 28680 + }, + { + "epoch": 0.10701789724192982, + "grad_norm": 0.3411839008331299, + "learning_rate": 0.0006, + "loss": 2.1975, + "step": 28690 + }, + { + "epoch": 0.1070551987048932, + "grad_norm": 0.35635581612586975, + "learning_rate": 0.0006, + "loss": 2.3701, + "step": 28700 + }, + { + "epoch": 0.10709250016785658, + "grad_norm": 0.2637510299682617, + "learning_rate": 0.0006, + "loss": 2.2761, + "step": 28710 + }, + { + "epoch": 0.10712980163081996, + "grad_norm": 0.36912283301353455, + "learning_rate": 0.0006, + "loss": 2.2564, + "step": 28720 + }, + { + "epoch": 0.10716710309378334, + "grad_norm": 0.29327207803726196, + "learning_rate": 0.0006, + "loss": 2.3763, + "step": 28730 + }, + { + "epoch": 0.10720440455674672, + "grad_norm": 0.25437891483306885, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 28740 + }, + { + "epoch": 0.10724170601971009, + "grad_norm": 0.378723680973053, + "learning_rate": 0.0006, + "loss": 2.2138, + "step": 28750 + }, + { + "epoch": 0.10724170601971009, + "eval_valid_loss": 2.1943752765655518, + "eval_valid_loss/all": 2.057161808013916, + "eval_valid_loss/end_span": 1.2440494298934937, + "eval_valid_perplexity/batch": 7.823732852935791, + "eval_valid_perplexity/end_span": 3.469635009765625, + "eval_valid_perplexity/fim": 2.4904346466064453, + "eval_valid_perplexity/first_seq": 15.425577163696289, + "eval_valid_perplexity/last_seq": 8.96876335144043, + "eval_valid_perplexity/second_seq": 13.808889389038086, + "eval_valid_perplexity/seq": 8.818536758422852, + "eval_valid_reconstruction/all": 0.29370370507240295, + "eval_valid_reconstruction/end_span": 0.7121164202690125, + "eval_valid_reconstruction/fim": 0.18094561994075775, + "eval_valid_reconstruction/first_seq": 0.15592876076698303, + "eval_valid_reconstruction/last_seq": 0.3314709961414337, + "eval_valid_reconstruction/second_seq": 0.19715295732021332, + "eval_valid_runtime": 437.0028, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 
0.439, + "step": 28750 + }, + { + "epoch": 0.10724170601971009, + "eval_train_loss": 2.193647623062134, + "eval_train_loss/all": 2.0297141075134277, + "eval_train_loss/end_span": 1.195317268371582, + "eval_train_perplexity/batch": 7.611909866333008, + "eval_train_perplexity/end_span": 3.3046059608459473, + "eval_train_perplexity/fim": 2.1489498615264893, + "eval_train_perplexity/first_seq": 15.59926986694336, + "eval_train_perplexity/last_seq": 9.131498336791992, + "eval_train_perplexity/second_seq": 14.01770305633545, + "eval_train_perplexity/seq": 8.762312889099121, + "eval_train_reconstruction/all": 0.2829902768135071, + "eval_train_reconstruction/end_span": 0.7270261645317078, + "eval_train_reconstruction/fim": 0.15243060886859894, + "eval_train_reconstruction/first_seq": 0.15042905509471893, + "eval_train_reconstruction/last_seq": 0.31970345973968506, + "eval_train_reconstruction/second_seq": 0.18851855397224426, + "eval_train_runtime": 440.2231, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 28750 + }, + { + "epoch": 0.10727900748267347, + "grad_norm": 0.5095283389091492, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 28760 + }, + { + "epoch": 0.10731630894563685, + "grad_norm": 0.2721324563026428, + "learning_rate": 0.0006, + "loss": 2.2149, + "step": 28770 + }, + { + "epoch": 0.10735361040860023, + "grad_norm": 0.39277610182762146, + "learning_rate": 0.0006, + "loss": 2.2484, + "step": 28780 + }, + { + "epoch": 0.1073909118715636, + "grad_norm": 0.3561071455478668, + "learning_rate": 0.0006, + "loss": 2.1216, + "step": 28790 + }, + { + "epoch": 0.10742821333452698, + "grad_norm": 0.5587921738624573, + "learning_rate": 0.0006, + "loss": 2.2214, + "step": 28800 + }, + { + "epoch": 0.10746551479749036, + "grad_norm": 0.33233407139778137, + "learning_rate": 0.0006, + "loss": 2.3383, + "step": 28810 + }, + { + "epoch": 0.10750281626045373, + "grad_norm": 0.2231607437133789, + "learning_rate": 0.0006, + "loss": 
2.2949, + "step": 28820 + }, + { + "epoch": 0.10754011772341711, + "grad_norm": 0.26274701952934265, + "learning_rate": 0.0006, + "loss": 2.1192, + "step": 28830 + }, + { + "epoch": 0.10757741918638049, + "grad_norm": 0.2089259922504425, + "learning_rate": 0.0006, + "loss": 2.3589, + "step": 28840 + }, + { + "epoch": 0.10761472064934387, + "grad_norm": 0.3394475281238556, + "learning_rate": 0.0006, + "loss": 2.1256, + "step": 28850 + }, + { + "epoch": 0.10765202211230725, + "grad_norm": 0.37083691358566284, + "learning_rate": 0.0006, + "loss": 2.3108, + "step": 28860 + }, + { + "epoch": 0.10768932357527063, + "grad_norm": 0.3713928163051605, + "learning_rate": 0.0006, + "loss": 2.407, + "step": 28870 + }, + { + "epoch": 0.107726625038234, + "grad_norm": 0.6761617064476013, + "learning_rate": 0.0006, + "loss": 2.0102, + "step": 28880 + }, + { + "epoch": 0.10776392650119737, + "grad_norm": 0.40565183758735657, + "learning_rate": 0.0006, + "loss": 2.2908, + "step": 28890 + }, + { + "epoch": 0.10780122796416075, + "grad_norm": 0.47146672010421753, + "learning_rate": 0.0006, + "loss": 2.26, + "step": 28900 + }, + { + "epoch": 0.10783852942712413, + "grad_norm": 0.4068447947502136, + "learning_rate": 0.0006, + "loss": 2.2813, + "step": 28910 + }, + { + "epoch": 0.10787583089008751, + "grad_norm": 0.3487916588783264, + "learning_rate": 0.0006, + "loss": 2.288, + "step": 28920 + }, + { + "epoch": 0.10791313235305089, + "grad_norm": 0.45053935050964355, + "learning_rate": 0.0006, + "loss": 2.3525, + "step": 28930 + }, + { + "epoch": 0.10795043381601427, + "grad_norm": 0.36008596420288086, + "learning_rate": 0.0006, + "loss": 2.1558, + "step": 28940 + }, + { + "epoch": 0.10798773527897765, + "grad_norm": 0.3958258628845215, + "learning_rate": 0.0006, + "loss": 2.1211, + "step": 28950 + }, + { + "epoch": 0.10802503674194101, + "grad_norm": 0.34928691387176514, + "learning_rate": 0.0006, + "loss": 2.255, + "step": 28960 + }, + { + "epoch": 0.10806233820490439, + "grad_norm": 
0.27577874064445496, + "learning_rate": 0.0006, + "loss": 2.362, + "step": 28970 + }, + { + "epoch": 0.10809963966786777, + "grad_norm": 0.2627269923686981, + "learning_rate": 0.0006, + "loss": 2.1905, + "step": 28980 + }, + { + "epoch": 0.10813694113083115, + "grad_norm": 0.24857597053050995, + "learning_rate": 0.0006, + "loss": 2.4151, + "step": 28990 + }, + { + "epoch": 0.10817424259379453, + "grad_norm": 0.3013582229614258, + "learning_rate": 0.0006, + "loss": 2.244, + "step": 29000 + }, + { + "epoch": 0.10817424259379453, + "eval_valid_loss": 2.1932036876678467, + "eval_valid_loss/all": 2.0559542179107666, + "eval_valid_loss/end_span": 1.1875362396240234, + "eval_valid_perplexity/batch": 7.814291000366211, + "eval_valid_perplexity/end_span": 3.2789926528930664, + "eval_valid_perplexity/fim": 2.43542218208313, + "eval_valid_perplexity/first_seq": 14.63917350769043, + "eval_valid_perplexity/last_seq": 8.958564758300781, + "eval_valid_perplexity/second_seq": 13.649992942810059, + "eval_valid_perplexity/seq": 8.805415153503418, + "eval_valid_reconstruction/all": 0.2936641275882721, + "eval_valid_reconstruction/end_span": 0.7181710600852966, + "eval_valid_reconstruction/fim": 0.17751577496528625, + "eval_valid_reconstruction/first_seq": 0.17233404517173767, + "eval_valid_reconstruction/last_seq": 0.326689749956131, + "eval_valid_reconstruction/second_seq": 0.19956326484680176, + "eval_valid_runtime": 441.6236, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 29000 + }, + { + "epoch": 0.10817424259379453, + "eval_train_loss": 2.1911070346832275, + "eval_train_loss/all": 2.0271151065826416, + "eval_train_loss/end_span": 1.145716905593872, + "eval_train_perplexity/batch": 7.592152118682861, + "eval_train_perplexity/end_span": 3.1446950435638428, + "eval_train_perplexity/fim": 2.0316174030303955, + "eval_train_perplexity/first_seq": 15.320868492126465, + "eval_train_perplexity/last_seq": 9.174636840820312, + 
"eval_train_perplexity/second_seq": 14.370655059814453, + "eval_train_perplexity/seq": 8.737360954284668, + "eval_train_reconstruction/all": 0.28349238634109497, + "eval_train_reconstruction/end_span": 0.7304004430770874, + "eval_train_reconstruction/fim": 0.14234213531017303, + "eval_train_reconstruction/first_seq": 0.1553134322166443, + "eval_train_reconstruction/last_seq": 0.3144112527370453, + "eval_train_reconstruction/second_seq": 0.1771266758441925, + "eval_train_runtime": 441.2857, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 29000 + }, + { + "epoch": 0.10821154405675791, + "grad_norm": 0.34896421432495117, + "learning_rate": 0.0006, + "loss": 2.1855, + "step": 29010 + }, + { + "epoch": 0.10824884551972129, + "grad_norm": 0.3664073348045349, + "learning_rate": 0.0006, + "loss": 2.4106, + "step": 29020 + }, + { + "epoch": 0.10828614698268466, + "grad_norm": 0.23335686326026917, + "learning_rate": 0.0006, + "loss": 2.2972, + "step": 29030 + }, + { + "epoch": 0.10832344844564804, + "grad_norm": 0.3148103654384613, + "learning_rate": 0.0006, + "loss": 2.3138, + "step": 29040 + }, + { + "epoch": 0.10836074990861141, + "grad_norm": 0.6925448179244995, + "learning_rate": 0.0006, + "loss": 2.2272, + "step": 29050 + }, + { + "epoch": 0.1083980513715748, + "grad_norm": 0.3289877474308014, + "learning_rate": 0.0006, + "loss": 2.3516, + "step": 29060 + }, + { + "epoch": 0.10843535283453817, + "grad_norm": 0.4712415933609009, + "learning_rate": 0.0006, + "loss": 2.2903, + "step": 29070 + }, + { + "epoch": 0.10847265429750155, + "grad_norm": 0.3145817816257477, + "learning_rate": 0.0006, + "loss": 2.1313, + "step": 29080 + }, + { + "epoch": 0.10850995576046493, + "grad_norm": 0.7440522313117981, + "learning_rate": 0.0006, + "loss": 2.113, + "step": 29090 + }, + { + "epoch": 0.1085472572234283, + "grad_norm": 0.24300231039524078, + "learning_rate": 0.0006, + "loss": 2.3451, + "step": 29100 + }, + { + "epoch": 
0.10858455868639168, + "grad_norm": 0.4506584405899048, + "learning_rate": 0.0006, + "loss": 2.2921, + "step": 29110 + }, + { + "epoch": 0.10862186014935506, + "grad_norm": 0.3882278501987457, + "learning_rate": 0.0006, + "loss": 2.3169, + "step": 29120 + }, + { + "epoch": 0.10865916161231844, + "grad_norm": 0.3787258267402649, + "learning_rate": 0.0006, + "loss": 2.1596, + "step": 29130 + }, + { + "epoch": 0.10869646307528182, + "grad_norm": 0.3677646815776825, + "learning_rate": 0.0006, + "loss": 2.2593, + "step": 29140 + }, + { + "epoch": 0.1087337645382452, + "grad_norm": 0.3206629455089569, + "learning_rate": 0.0006, + "loss": 2.2446, + "step": 29150 + }, + { + "epoch": 0.10877106600120856, + "grad_norm": 0.3850073516368866, + "learning_rate": 0.0006, + "loss": 2.0964, + "step": 29160 + }, + { + "epoch": 0.10880836746417194, + "grad_norm": 0.3535298705101013, + "learning_rate": 0.0006, + "loss": 2.1651, + "step": 29170 + }, + { + "epoch": 0.10884566892713532, + "grad_norm": 0.3389326333999634, + "learning_rate": 0.0006, + "loss": 2.3223, + "step": 29180 + }, + { + "epoch": 0.1088829703900987, + "grad_norm": 0.2615494728088379, + "learning_rate": 0.0006, + "loss": 2.3577, + "step": 29190 + }, + { + "epoch": 0.10892027185306208, + "grad_norm": 0.26067379117012024, + "learning_rate": 0.0006, + "loss": 2.1898, + "step": 29200 + }, + { + "epoch": 0.10895757331602546, + "grad_norm": 0.40358421206474304, + "learning_rate": 0.0006, + "loss": 2.2952, + "step": 29210 + }, + { + "epoch": 0.10899487477898884, + "grad_norm": 0.31433871388435364, + "learning_rate": 0.0006, + "loss": 2.0823, + "step": 29220 + }, + { + "epoch": 0.1090321762419522, + "grad_norm": 0.3426004648208618, + "learning_rate": 0.0006, + "loss": 2.151, + "step": 29230 + }, + { + "epoch": 0.10906947770491558, + "grad_norm": 0.4615058898925781, + "learning_rate": 0.0006, + "loss": 2.2961, + "step": 29240 + }, + { + "epoch": 0.10910677916787896, + "grad_norm": 0.4491499066352844, + "learning_rate": 0.0006, 
+ "loss": 2.1394, + "step": 29250 + }, + { + "epoch": 0.10910677916787896, + "eval_valid_loss": 2.1911590099334717, + "eval_valid_loss/all": 2.053663969039917, + "eval_valid_loss/end_span": 1.2974404096603394, + "eval_valid_perplexity/batch": 7.796414852142334, + "eval_valid_perplexity/end_span": 3.659916877746582, + "eval_valid_perplexity/fim": 2.3435113430023193, + "eval_valid_perplexity/first_seq": 14.726882934570312, + "eval_valid_perplexity/last_seq": 8.850728988647461, + "eval_valid_perplexity/second_seq": 13.961894035339355, + "eval_valid_perplexity/seq": 8.78580379486084, + "eval_valid_reconstruction/all": 0.29451873898506165, + "eval_valid_reconstruction/end_span": 0.694604754447937, + "eval_valid_reconstruction/fim": 0.1696925312280655, + "eval_valid_reconstruction/first_seq": 0.1702665388584137, + "eval_valid_reconstruction/last_seq": 0.3273520767688751, + "eval_valid_reconstruction/second_seq": 0.19246737658977509, + "eval_valid_runtime": 443.1468, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 29250 + }, + { + "epoch": 0.10910677916787896, + "eval_train_loss": 2.1917543411254883, + "eval_train_loss/all": 2.0278942584991455, + "eval_train_loss/end_span": 1.2600040435791016, + "eval_train_perplexity/batch": 7.59807014465332, + "eval_train_perplexity/end_span": 3.52543568611145, + "eval_train_perplexity/fim": 2.2125654220581055, + "eval_train_perplexity/first_seq": 15.438565254211426, + "eval_train_perplexity/last_seq": 8.835386276245117, + "eval_train_perplexity/second_seq": 14.293191909790039, + "eval_train_perplexity/seq": 8.751021385192871, + "eval_train_reconstruction/all": 0.2833118736743927, + "eval_train_reconstruction/end_span": 0.7038216590881348, + "eval_train_reconstruction/fim": 0.15900814533233643, + "eval_train_reconstruction/first_seq": 0.15173979103565216, + "eval_train_reconstruction/last_seq": 0.3318807780742645, + "eval_train_reconstruction/second_seq": 0.18348482251167297, + 
"eval_train_runtime": 436.3394, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 29250 + }, + { + "epoch": 0.10914408063084234, + "grad_norm": 0.40947914123535156, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 29260 + }, + { + "epoch": 0.10918138209380572, + "grad_norm": 0.5884263515472412, + "learning_rate": 0.0006, + "loss": 2.3351, + "step": 29270 + }, + { + "epoch": 0.1092186835567691, + "grad_norm": 0.4391849637031555, + "learning_rate": 0.0006, + "loss": 2.2969, + "step": 29280 + }, + { + "epoch": 0.10925598501973248, + "grad_norm": 0.5567439198493958, + "learning_rate": 0.0006, + "loss": 2.1266, + "step": 29290 + }, + { + "epoch": 0.10929328648269585, + "grad_norm": 0.5245586037635803, + "learning_rate": 0.0006, + "loss": 2.0977, + "step": 29300 + }, + { + "epoch": 0.10933058794565922, + "grad_norm": 0.43188539147377014, + "learning_rate": 0.0006, + "loss": 2.1729, + "step": 29310 + }, + { + "epoch": 0.1093678894086226, + "grad_norm": 0.3725855052471161, + "learning_rate": 0.0006, + "loss": 2.0313, + "step": 29320 + }, + { + "epoch": 0.10940519087158598, + "grad_norm": 0.2950875163078308, + "learning_rate": 0.0006, + "loss": 2.2043, + "step": 29330 + }, + { + "epoch": 0.10944249233454936, + "grad_norm": 0.3171044886112213, + "learning_rate": 0.0006, + "loss": 2.2245, + "step": 29340 + }, + { + "epoch": 0.10947979379751274, + "grad_norm": 0.31298884749412537, + "learning_rate": 0.0006, + "loss": 2.3175, + "step": 29350 + }, + { + "epoch": 0.10951709526047612, + "grad_norm": 0.32735633850097656, + "learning_rate": 0.0006, + "loss": 2.3331, + "step": 29360 + }, + { + "epoch": 0.10955439672343949, + "grad_norm": 0.3394147753715515, + "learning_rate": 0.0006, + "loss": 2.0515, + "step": 29370 + }, + { + "epoch": 0.10959169818640287, + "grad_norm": 0.3623567819595337, + "learning_rate": 0.0006, + "loss": 2.2219, + "step": 29380 + }, + { + "epoch": 0.10962899964936625, + "grad_norm": 0.2840557098388672, + 
"learning_rate": 0.0006, + "loss": 2.2598, + "step": 29390 + }, + { + "epoch": 0.10966630111232963, + "grad_norm": 0.3155611455440521, + "learning_rate": 0.0006, + "loss": 2.1217, + "step": 29400 + }, + { + "epoch": 0.109703602575293, + "grad_norm": 0.2873480021953583, + "learning_rate": 0.0006, + "loss": 2.2435, + "step": 29410 + }, + { + "epoch": 0.10974090403825638, + "grad_norm": 0.4118991792201996, + "learning_rate": 0.0006, + "loss": 2.046, + "step": 29420 + }, + { + "epoch": 0.10977820550121976, + "grad_norm": 0.34579595923423767, + "learning_rate": 0.0006, + "loss": 2.3454, + "step": 29430 + }, + { + "epoch": 0.10981550696418313, + "grad_norm": 0.3346903622150421, + "learning_rate": 0.0006, + "loss": 2.266, + "step": 29440 + }, + { + "epoch": 0.10985280842714651, + "grad_norm": 0.27961695194244385, + "learning_rate": 0.0006, + "loss": 2.2427, + "step": 29450 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 0.5215656161308289, + "learning_rate": 0.0006, + "loss": 2.1331, + "step": 29460 + }, + { + "epoch": 0.10992741135307327, + "grad_norm": 0.400447279214859, + "learning_rate": 0.0006, + "loss": 2.2349, + "step": 29470 + }, + { + "epoch": 0.10996471281603665, + "grad_norm": 0.3576657772064209, + "learning_rate": 0.0006, + "loss": 2.2384, + "step": 29480 + }, + { + "epoch": 0.11000201427900003, + "grad_norm": 0.39252331852912903, + "learning_rate": 0.0006, + "loss": 2.1548, + "step": 29490 + }, + { + "epoch": 0.1100393157419634, + "grad_norm": 0.4811383783817291, + "learning_rate": 0.0006, + "loss": 2.2899, + "step": 29500 + }, + { + "epoch": 0.1100393157419634, + "eval_valid_loss": 2.19330096244812, + "eval_valid_loss/all": 2.056281805038452, + "eval_valid_loss/end_span": 1.3076744079589844, + "eval_valid_perplexity/batch": 7.8168511390686035, + "eval_valid_perplexity/end_span": 3.6975646018981934, + "eval_valid_perplexity/fim": 2.537273645401001, + "eval_valid_perplexity/first_seq": 14.986502647399902, + "eval_valid_perplexity/last_seq": 
8.702801704406738, + "eval_valid_perplexity/second_seq": 13.220627784729004, + "eval_valid_perplexity/seq": 8.810277938842773, + "eval_valid_reconstruction/all": 0.29348012804985046, + "eval_valid_reconstruction/end_span": 0.6917651891708374, + "eval_valid_reconstruction/fim": 0.18454794585704803, + "eval_valid_reconstruction/first_seq": 0.16471260786056519, + "eval_valid_reconstruction/last_seq": 0.33763787150382996, + "eval_valid_reconstruction/second_seq": 0.21089564263820648, + "eval_valid_runtime": 478.7515, + "eval_valid_samples_per_second": 0.401, + "eval_valid_steps_per_second": 0.401, + "step": 29500 + }, + { + "epoch": 0.1100393157419634, + "eval_train_loss": 2.1915738582611084, + "eval_train_loss/all": 2.0278332233428955, + "eval_train_loss/end_span": 1.265651822090149, + "eval_train_perplexity/batch": 7.597606182098389, + "eval_train_perplexity/end_span": 3.545403003692627, + "eval_train_perplexity/fim": 2.2070424556732178, + "eval_train_perplexity/first_seq": 15.761883735656738, + "eval_train_perplexity/last_seq": 9.141923904418945, + "eval_train_perplexity/second_seq": 14.656694412231445, + "eval_train_perplexity/seq": 8.746589660644531, + "eval_train_reconstruction/all": 0.2831662595272064, + "eval_train_reconstruction/end_span": 0.7024543881416321, + "eval_train_reconstruction/fim": 0.15743836760520935, + "eval_train_reconstruction/first_seq": 0.14605891704559326, + "eval_train_reconstruction/last_seq": 0.31946200132369995, + "eval_train_reconstruction/second_seq": 0.17380039393901825, + "eval_train_runtime": 479.9998, + "eval_train_samples_per_second": 0.4, + "eval_train_steps_per_second": 0.4, + "step": 29500 + }, + { + "epoch": 0.11007661720492677, + "grad_norm": 0.4620858132839203, + "learning_rate": 0.0006, + "loss": 2.2551, + "step": 29510 + }, + { + "epoch": 0.11011391866789015, + "grad_norm": 0.3002927601337433, + "learning_rate": 0.0006, + "loss": 2.2833, + "step": 29520 + }, + { + "epoch": 0.11015122013085353, + "grad_norm": 
0.4937800467014313, + "learning_rate": 0.0006, + "loss": 2.0892, + "step": 29530 + }, + { + "epoch": 0.11018852159381691, + "grad_norm": 0.27075204253196716, + "learning_rate": 0.0006, + "loss": 1.9768, + "step": 29540 + }, + { + "epoch": 0.11022582305678029, + "grad_norm": 0.3360745310783386, + "learning_rate": 0.0006, + "loss": 2.3393, + "step": 29550 + }, + { + "epoch": 0.11026312451974367, + "grad_norm": 0.3984098732471466, + "learning_rate": 0.0006, + "loss": 2.183, + "step": 29560 + }, + { + "epoch": 0.11030042598270705, + "grad_norm": 0.3600631058216095, + "learning_rate": 0.0006, + "loss": 2.1989, + "step": 29570 + }, + { + "epoch": 0.11033772744567041, + "grad_norm": 0.42112043499946594, + "learning_rate": 0.0006, + "loss": 2.2287, + "step": 29580 + }, + { + "epoch": 0.1103750289086338, + "grad_norm": 0.3855866491794586, + "learning_rate": 0.0006, + "loss": 2.1289, + "step": 29590 + }, + { + "epoch": 0.11041233037159717, + "grad_norm": 0.26862993836402893, + "learning_rate": 0.0006, + "loss": 2.3711, + "step": 29600 + }, + { + "epoch": 0.11044963183456055, + "grad_norm": 0.37093880772590637, + "learning_rate": 0.0006, + "loss": 2.3031, + "step": 29610 + }, + { + "epoch": 0.11048693329752393, + "grad_norm": 0.39919716119766235, + "learning_rate": 0.0006, + "loss": 2.2522, + "step": 29620 + }, + { + "epoch": 0.11052423476048731, + "grad_norm": 0.24369068443775177, + "learning_rate": 0.0006, + "loss": 2.1211, + "step": 29630 + }, + { + "epoch": 0.11056153622345069, + "grad_norm": 0.4478280544281006, + "learning_rate": 0.0006, + "loss": 2.3147, + "step": 29640 + }, + { + "epoch": 0.11059883768641406, + "grad_norm": 0.376232773065567, + "learning_rate": 0.0006, + "loss": 2.2814, + "step": 29650 + }, + { + "epoch": 0.11063613914937744, + "grad_norm": 0.41040220856666565, + "learning_rate": 0.0006, + "loss": 2.1357, + "step": 29660 + }, + { + "epoch": 0.11067344061234081, + "grad_norm": 1.5993021726608276, + "learning_rate": 0.0006, + "loss": 2.2268, + "step": 
29670 + }, + { + "epoch": 0.1107107420753042, + "grad_norm": 0.270685613155365, + "learning_rate": 0.0006, + "loss": 2.0567, + "step": 29680 + }, + { + "epoch": 0.11074804353826757, + "grad_norm": 0.34002482891082764, + "learning_rate": 0.0006, + "loss": 2.3387, + "step": 29690 + }, + { + "epoch": 0.11078534500123095, + "grad_norm": 0.4132586121559143, + "learning_rate": 0.0006, + "loss": 2.0375, + "step": 29700 + }, + { + "epoch": 0.11082264646419433, + "grad_norm": 0.3839227557182312, + "learning_rate": 0.0006, + "loss": 2.3126, + "step": 29710 + }, + { + "epoch": 0.1108599479271577, + "grad_norm": 0.2900600731372833, + "learning_rate": 0.0006, + "loss": 2.409, + "step": 29720 + }, + { + "epoch": 0.11089724939012108, + "grad_norm": 0.3729667663574219, + "learning_rate": 0.0006, + "loss": 2.1911, + "step": 29730 + }, + { + "epoch": 0.11093455085308446, + "grad_norm": 0.33890095353126526, + "learning_rate": 0.0006, + "loss": 2.0446, + "step": 29740 + }, + { + "epoch": 0.11097185231604784, + "grad_norm": 0.4427178204059601, + "learning_rate": 0.0006, + "loss": 2.1666, + "step": 29750 + }, + { + "epoch": 0.11097185231604784, + "eval_valid_loss": 2.195460081100464, + "eval_valid_loss/all": 2.058046340942383, + "eval_valid_loss/end_span": 1.292724370956421, + "eval_valid_perplexity/batch": 7.8306565284729, + "eval_valid_perplexity/end_span": 3.6426970958709717, + "eval_valid_perplexity/fim": 2.7534070014953613, + "eval_valid_perplexity/first_seq": 14.742097854614258, + "eval_valid_perplexity/last_seq": 8.985299110412598, + "eval_valid_perplexity/second_seq": 14.083723068237305, + "eval_valid_perplexity/seq": 8.825430870056152, + "eval_valid_reconstruction/all": 0.2928374409675598, + "eval_valid_reconstruction/end_span": 0.7047377824783325, + "eval_valid_reconstruction/fim": 0.20147478580474854, + "eval_valid_reconstruction/first_seq": 0.17376777529716492, + "eval_valid_reconstruction/last_seq": 0.32938483357429504, + "eval_valid_reconstruction/second_seq": 
0.18823498487472534, + "eval_valid_runtime": 442.1961, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 29750 + }, + { + "epoch": 0.11097185231604784, + "eval_train_loss": 2.1938745975494385, + "eval_train_loss/all": 2.0298352241516113, + "eval_train_loss/end_span": 1.2639689445495605, + "eval_train_perplexity/batch": 7.612832069396973, + "eval_train_perplexity/end_span": 3.5394415855407715, + "eval_train_perplexity/fim": 2.041945695877075, + "eval_train_perplexity/first_seq": 15.289581298828125, + "eval_train_perplexity/last_seq": 8.74596881866455, + "eval_train_perplexity/second_seq": 14.36976146697998, + "eval_train_perplexity/seq": 8.765885353088379, + "eval_train_reconstruction/all": 0.2826160788536072, + "eval_train_reconstruction/end_span": 0.7120667099952698, + "eval_train_reconstruction/fim": 0.14176423847675323, + "eval_train_reconstruction/first_seq": 0.15144185721874237, + "eval_train_reconstruction/last_seq": 0.33392441272735596, + "eval_train_reconstruction/second_seq": 0.18013980984687805, + "eval_train_runtime": 442.5342, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 29750 + }, + { + "epoch": 0.11100915377901122, + "grad_norm": 0.2513526976108551, + "learning_rate": 0.0006, + "loss": 2.2414, + "step": 29760 + }, + { + "epoch": 0.1110464552419746, + "grad_norm": 0.25230318307876587, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 29770 + }, + { + "epoch": 0.11108375670493796, + "grad_norm": 0.21319125592708588, + "learning_rate": 0.0006, + "loss": 2.334, + "step": 29780 + }, + { + "epoch": 0.11112105816790134, + "grad_norm": 0.31592532992362976, + "learning_rate": 0.0006, + "loss": 2.1949, + "step": 29790 + }, + { + "epoch": 0.11115835963086472, + "grad_norm": 0.5386735200881958, + "learning_rate": 0.0006, + "loss": 2.0438, + "step": 29800 + }, + { + "epoch": 0.1111956610938281, + "grad_norm": 0.31079205870628357, + "learning_rate": 0.0006, + "loss": 
2.2321, + "step": 29810 + }, + { + "epoch": 0.11123296255679148, + "grad_norm": 0.27538153529167175, + "learning_rate": 0.0006, + "loss": 2.2589, + "step": 29820 + }, + { + "epoch": 0.11127026401975486, + "grad_norm": 0.41302725672721863, + "learning_rate": 0.0006, + "loss": 2.2132, + "step": 29830 + }, + { + "epoch": 0.11130756548271824, + "grad_norm": 0.48886778950691223, + "learning_rate": 0.0006, + "loss": 2.2664, + "step": 29840 + }, + { + "epoch": 0.1113448669456816, + "grad_norm": 0.3338317573070526, + "learning_rate": 0.0006, + "loss": 2.1404, + "step": 29850 + }, + { + "epoch": 0.11138216840864498, + "grad_norm": 0.3156065344810486, + "learning_rate": 0.0006, + "loss": 2.251, + "step": 29860 + }, + { + "epoch": 0.11141946987160836, + "grad_norm": 0.39832955598831177, + "learning_rate": 0.0006, + "loss": 2.3234, + "step": 29870 + }, + { + "epoch": 0.11145677133457174, + "grad_norm": 0.3050325810909271, + "learning_rate": 0.0006, + "loss": 2.1079, + "step": 29880 + }, + { + "epoch": 0.11149407279753512, + "grad_norm": 0.32844147086143494, + "learning_rate": 0.0006, + "loss": 2.1035, + "step": 29890 + }, + { + "epoch": 0.1115313742604985, + "grad_norm": 0.2785356640815735, + "learning_rate": 0.0006, + "loss": 2.1725, + "step": 29900 + }, + { + "epoch": 0.11156867572346188, + "grad_norm": 0.25681838393211365, + "learning_rate": 0.0006, + "loss": 2.2726, + "step": 29910 + }, + { + "epoch": 0.11160597718642525, + "grad_norm": 0.37460678815841675, + "learning_rate": 0.0006, + "loss": 2.1721, + "step": 29920 + }, + { + "epoch": 0.11164327864938862, + "grad_norm": 0.24137914180755615, + "learning_rate": 0.0006, + "loss": 2.1358, + "step": 29930 + }, + { + "epoch": 0.111680580112352, + "grad_norm": 0.322190523147583, + "learning_rate": 0.0006, + "loss": 2.1776, + "step": 29940 + }, + { + "epoch": 0.11171788157531538, + "grad_norm": 0.48912936449050903, + "learning_rate": 0.0006, + "loss": 2.1827, + "step": 29950 + }, + { + "epoch": 0.11175518303827876, + 
"grad_norm": 0.3736894726753235, + "learning_rate": 0.0006, + "loss": 2.161, + "step": 29960 + }, + { + "epoch": 0.11179248450124214, + "grad_norm": 0.3026806712150574, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 29970 + }, + { + "epoch": 0.11182978596420552, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0006, + "loss": 2.1399, + "step": 29980 + }, + { + "epoch": 0.11186708742716889, + "grad_norm": 0.344319611787796, + "learning_rate": 0.0006, + "loss": 2.3552, + "step": 29990 + }, + { + "epoch": 0.11190438889013227, + "grad_norm": 0.30501359701156616, + "learning_rate": 0.0006, + "loss": 2.2904, + "step": 30000 + }, + { + "epoch": 0.11190438889013227, + "eval_valid_loss": 2.192837715148926, + "eval_valid_loss/all": 2.05532169342041, + "eval_valid_loss/end_span": 1.2675392627716064, + "eval_valid_perplexity/batch": 7.809349536895752, + "eval_valid_perplexity/end_span": 3.552100896835327, + "eval_valid_perplexity/fim": 2.2801945209503174, + "eval_valid_perplexity/first_seq": 14.69179916381836, + "eval_valid_perplexity/last_seq": 8.838595390319824, + "eval_valid_perplexity/second_seq": 13.78132438659668, + "eval_valid_perplexity/seq": 8.796629905700684, + "eval_valid_reconstruction/all": 0.2934579849243164, + "eval_valid_reconstruction/end_span": 0.6990243196487427, + "eval_valid_reconstruction/fim": 0.16389822959899902, + "eval_valid_reconstruction/first_seq": 0.16699732840061188, + "eval_valid_reconstruction/last_seq": 0.3287871778011322, + "eval_valid_reconstruction/second_seq": 0.19609209895133972, + "eval_valid_runtime": 472.5312, + "eval_valid_samples_per_second": 0.406, + "eval_valid_steps_per_second": 0.406, + "step": 30000 + }, + { + "epoch": 0.11190438889013227, + "eval_train_loss": 2.1907474994659424, + "eval_train_loss/all": 2.0268898010253906, + "eval_train_loss/end_span": 1.225368857383728, + "eval_train_perplexity/batch": 7.590441703796387, + "eval_train_perplexity/end_span": 3.4054219722747803, + "eval_train_perplexity/fim": 
1.964766025543213, + "eval_train_perplexity/first_seq": 15.232809066772461, + "eval_train_perplexity/last_seq": 8.992536544799805, + "eval_train_perplexity/second_seq": 14.130332946777344, + "eval_train_perplexity/seq": 8.735663414001465, + "eval_train_reconstruction/all": 0.2833492159843445, + "eval_train_reconstruction/end_span": 0.7114848494529724, + "eval_train_reconstruction/fim": 0.13460871577262878, + "eval_train_reconstruction/first_seq": 0.15970547497272491, + "eval_train_reconstruction/last_seq": 0.3255646228790283, + "eval_train_reconstruction/second_seq": 0.1884489357471466, + "eval_train_runtime": 469.9641, + "eval_train_samples_per_second": 0.409, + "eval_train_steps_per_second": 0.409, + "step": 30000 + }, + { + "epoch": 0.11194169035309565, + "grad_norm": 0.27783647179603577, + "learning_rate": 0.0006, + "loss": 2.4404, + "step": 30010 + }, + { + "epoch": 0.11197899181605903, + "grad_norm": 0.6335692405700684, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 30020 + }, + { + "epoch": 0.1120162932790224, + "grad_norm": 0.437248170375824, + "learning_rate": 0.0006, + "loss": 2.2627, + "step": 30030 + }, + { + "epoch": 0.11205359474198578, + "grad_norm": 0.33892062306404114, + "learning_rate": 0.0006, + "loss": 2.2414, + "step": 30040 + }, + { + "epoch": 0.11209089620494916, + "grad_norm": 0.4481928050518036, + "learning_rate": 0.0006, + "loss": 2.1149, + "step": 30050 + }, + { + "epoch": 0.11212819766791253, + "grad_norm": 0.28585025668144226, + "learning_rate": 0.0006, + "loss": 2.1634, + "step": 30060 + }, + { + "epoch": 0.11216549913087591, + "grad_norm": 0.641478955745697, + "learning_rate": 0.0006, + "loss": 2.247, + "step": 30070 + }, + { + "epoch": 0.11220280059383929, + "grad_norm": 0.38692229986190796, + "learning_rate": 0.0006, + "loss": 2.276, + "step": 30080 + }, + { + "epoch": 0.11224010205680267, + "grad_norm": 0.46528396010398865, + "learning_rate": 0.0006, + "loss": 2.24, + "step": 30090 + }, + { + "epoch": 0.11227740351976605, + 
"grad_norm": 0.5174573063850403, + "learning_rate": 0.0006, + "loss": 2.105, + "step": 30100 + }, + { + "epoch": 0.11231470498272943, + "grad_norm": 0.33256691694259644, + "learning_rate": 0.0006, + "loss": 2.2664, + "step": 30110 + }, + { + "epoch": 0.1123520064456928, + "grad_norm": 0.3992208242416382, + "learning_rate": 0.0006, + "loss": 2.0298, + "step": 30120 + }, + { + "epoch": 0.11238930790865617, + "grad_norm": 0.40815457701683044, + "learning_rate": 0.0006, + "loss": 2.2681, + "step": 30130 + }, + { + "epoch": 0.11242660937161955, + "grad_norm": 0.37421342730522156, + "learning_rate": 0.0006, + "loss": 2.3495, + "step": 30140 + }, + { + "epoch": 0.11246391083458293, + "grad_norm": 0.3521215319633484, + "learning_rate": 0.0006, + "loss": 2.123, + "step": 30150 + }, + { + "epoch": 0.11250121229754631, + "grad_norm": 0.2765689790248871, + "learning_rate": 0.0006, + "loss": 2.3585, + "step": 30160 + }, + { + "epoch": 0.11253851376050969, + "grad_norm": 0.2792215049266815, + "learning_rate": 0.0006, + "loss": 2.2734, + "step": 30170 + }, + { + "epoch": 0.11257581522347307, + "grad_norm": 0.29035326838493347, + "learning_rate": 0.0006, + "loss": 2.3589, + "step": 30180 + }, + { + "epoch": 0.11261311668643645, + "grad_norm": 0.35454031825065613, + "learning_rate": 0.0006, + "loss": 2.3125, + "step": 30190 + }, + { + "epoch": 0.11265041814939981, + "grad_norm": 0.20982691645622253, + "learning_rate": 0.0006, + "loss": 2.269, + "step": 30200 + }, + { + "epoch": 0.1126877196123632, + "grad_norm": 0.21556054055690765, + "learning_rate": 0.0006, + "loss": 2.2575, + "step": 30210 + }, + { + "epoch": 0.11272502107532657, + "grad_norm": 0.3209463357925415, + "learning_rate": 0.0006, + "loss": 2.2661, + "step": 30220 + }, + { + "epoch": 0.11276232253828995, + "grad_norm": 0.3688093423843384, + "learning_rate": 0.0006, + "loss": 2.1965, + "step": 30230 + }, + { + "epoch": 0.11279962400125333, + "grad_norm": 0.24389506876468658, + "learning_rate": 0.0006, + "loss": 2.2681, 
+ "step": 30240 + }, + { + "epoch": 0.11283692546421671, + "grad_norm": 0.31856977939605713, + "learning_rate": 0.0006, + "loss": 2.1467, + "step": 30250 + }, + { + "epoch": 0.11283692546421671, + "eval_valid_loss": 2.1914703845977783, + "eval_valid_loss/all": 2.053945541381836, + "eval_valid_loss/end_span": 1.2240355014801025, + "eval_valid_perplexity/batch": 7.798610210418701, + "eval_valid_perplexity/end_span": 3.4008843898773193, + "eval_valid_perplexity/fim": 2.1754117012023926, + "eval_valid_perplexity/first_seq": 14.929072380065918, + "eval_valid_perplexity/last_seq": 8.873828887939453, + "eval_valid_perplexity/second_seq": 14.141036987304688, + "eval_valid_perplexity/seq": 8.785355567932129, + "eval_valid_reconstruction/all": 0.29415586590766907, + "eval_valid_reconstruction/end_span": 0.7095970511436462, + "eval_valid_reconstruction/fim": 0.15500329434871674, + "eval_valid_reconstruction/first_seq": 0.16286669671535492, + "eval_valid_reconstruction/last_seq": 0.3285117745399475, + "eval_valid_reconstruction/second_seq": 0.1854102611541748, + "eval_valid_runtime": 468.8284, + "eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 30250 + }, + { + "epoch": 0.11283692546421671, + "eval_train_loss": 2.1903040409088135, + "eval_train_loss/all": 2.0265049934387207, + "eval_train_loss/end_span": 1.189699649810791, + "eval_train_perplexity/batch": 7.587521553039551, + "eval_train_perplexity/end_span": 3.2860941886901855, + "eval_train_perplexity/fim": 2.3592333793640137, + "eval_train_perplexity/first_seq": 15.283761024475098, + "eval_train_perplexity/last_seq": 9.02059555053711, + "eval_train_perplexity/second_seq": 14.45146369934082, + "eval_train_perplexity/seq": 8.735428810119629, + "eval_train_reconstruction/all": 0.28363746404647827, + "eval_train_reconstruction/end_span": 0.7207819819450378, + "eval_train_reconstruction/fim": 0.171672523021698, + "eval_train_reconstruction/first_seq": 0.15253345668315887, + 
"eval_train_reconstruction/last_seq": 0.3214978277683258, + "eval_train_reconstruction/second_seq": 0.17717351019382477, + "eval_train_runtime": 470.2459, + "eval_train_samples_per_second": 0.408, + "eval_train_steps_per_second": 0.408, + "step": 30250 + }, + { + "epoch": 0.11287422692718009, + "grad_norm": 0.4296451807022095, + "learning_rate": 0.0006, + "loss": 2.1377, + "step": 30260 + }, + { + "epoch": 0.11291152839014346, + "grad_norm": 0.35602232813835144, + "learning_rate": 0.0006, + "loss": 2.1709, + "step": 30270 + }, + { + "epoch": 0.11294882985310684, + "grad_norm": 0.3998909294605255, + "learning_rate": 0.0006, + "loss": 2.2224, + "step": 30280 + }, + { + "epoch": 0.11298613131607022, + "grad_norm": 0.2494225949048996, + "learning_rate": 0.0006, + "loss": 2.2911, + "step": 30290 + }, + { + "epoch": 0.1130234327790336, + "grad_norm": 0.3608250319957733, + "learning_rate": 0.0006, + "loss": 2.2698, + "step": 30300 + }, + { + "epoch": 0.11306073424199697, + "grad_norm": 0.41240957379341125, + "learning_rate": 0.0006, + "loss": 2.4177, + "step": 30310 + }, + { + "epoch": 0.11309803570496035, + "grad_norm": 0.2392866462469101, + "learning_rate": 0.0006, + "loss": 2.1355, + "step": 30320 + }, + { + "epoch": 0.11313533716792372, + "grad_norm": 0.4063909947872162, + "learning_rate": 0.0006, + "loss": 2.2898, + "step": 30330 + }, + { + "epoch": 0.1131726386308871, + "grad_norm": 0.414241224527359, + "learning_rate": 0.0006, + "loss": 2.1254, + "step": 30340 + }, + { + "epoch": 0.11320994009385048, + "grad_norm": 0.33254823088645935, + "learning_rate": 0.0006, + "loss": 2.2574, + "step": 30350 + }, + { + "epoch": 0.11324724155681386, + "grad_norm": 0.41324442625045776, + "learning_rate": 0.0006, + "loss": 2.2939, + "step": 30360 + }, + { + "epoch": 0.11328454301977724, + "grad_norm": 0.24313583970069885, + "learning_rate": 0.0006, + "loss": 2.3618, + "step": 30370 + }, + { + "epoch": 0.11332184448274062, + "grad_norm": 0.29318004846572876, + "learning_rate": 
0.0006, + "loss": 2.2373, + "step": 30380 + }, + { + "epoch": 0.113359145945704, + "grad_norm": 0.2786652743816376, + "learning_rate": 0.0006, + "loss": 2.2247, + "step": 30390 + }, + { + "epoch": 0.11339644740866736, + "grad_norm": 0.26298171281814575, + "learning_rate": 0.0006, + "loss": 2.2089, + "step": 30400 + }, + { + "epoch": 0.11343374887163074, + "grad_norm": 0.3679980933666229, + "learning_rate": 0.0006, + "loss": 2.2625, + "step": 30410 + }, + { + "epoch": 0.11347105033459412, + "grad_norm": 0.30765074491500854, + "learning_rate": 0.0006, + "loss": 2.1136, + "step": 30420 + }, + { + "epoch": 0.1135083517975575, + "grad_norm": 0.32336556911468506, + "learning_rate": 0.0006, + "loss": 2.1243, + "step": 30430 + }, + { + "epoch": 0.11354565326052088, + "grad_norm": 0.5101474523544312, + "learning_rate": 0.0006, + "loss": 2.2113, + "step": 30440 + }, + { + "epoch": 0.11358295472348426, + "grad_norm": 0.5088875889778137, + "learning_rate": 0.0006, + "loss": 2.2181, + "step": 30450 + }, + { + "epoch": 0.11362025618644764, + "grad_norm": 0.5904508829116821, + "learning_rate": 0.0006, + "loss": 2.1999, + "step": 30460 + }, + { + "epoch": 0.113657557649411, + "grad_norm": 0.42532214522361755, + "learning_rate": 0.0006, + "loss": 2.3395, + "step": 30470 + }, + { + "epoch": 0.11369485911237438, + "grad_norm": 0.44339263439178467, + "learning_rate": 0.0006, + "loss": 2.3119, + "step": 30480 + }, + { + "epoch": 0.11373216057533776, + "grad_norm": 0.5675201416015625, + "learning_rate": 0.0006, + "loss": 2.1603, + "step": 30490 + }, + { + "epoch": 0.11376946203830114, + "grad_norm": 0.39473387598991394, + "learning_rate": 0.0006, + "loss": 2.3715, + "step": 30500 + }, + { + "epoch": 0.11376946203830114, + "eval_valid_loss": 2.194580316543579, + "eval_valid_loss/all": 2.0571088790893555, + "eval_valid_loss/end_span": 1.2127833366394043, + "eval_valid_perplexity/batch": 7.823318958282471, + "eval_valid_perplexity/end_span": 3.3628315925598145, + 
"eval_valid_perplexity/fim": 2.468877077102661, + "eval_valid_perplexity/first_seq": 14.561616897583008, + "eval_valid_perplexity/last_seq": 8.892358779907227, + "eval_valid_perplexity/second_seq": 13.650351524353027, + "eval_valid_perplexity/seq": 8.815956115722656, + "eval_valid_reconstruction/all": 0.2933153808116913, + "eval_valid_reconstruction/end_span": 0.7128600478172302, + "eval_valid_reconstruction/fim": 0.17859198153018951, + "eval_valid_reconstruction/first_seq": 0.17202399671077728, + "eval_valid_reconstruction/last_seq": 0.3281986117362976, + "eval_valid_reconstruction/second_seq": 0.1979198157787323, + "eval_valid_runtime": 445.7697, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 30500 + }, + { + "epoch": 0.11376946203830114, + "eval_train_loss": 2.194305658340454, + "eval_train_loss/all": 2.0302395820617676, + "eval_train_loss/end_span": 1.1816034317016602, + "eval_train_perplexity/batch": 7.61591100692749, + "eval_train_perplexity/end_span": 3.259596586227417, + "eval_train_perplexity/fim": 2.1956703662872314, + "eval_train_perplexity/first_seq": 15.667411804199219, + "eval_train_perplexity/last_seq": 9.273847579956055, + "eval_train_perplexity/second_seq": 14.04375171661377, + "eval_train_perplexity/seq": 8.77023696899414, + "eval_train_reconstruction/all": 0.28269028663635254, + "eval_train_reconstruction/end_span": 0.723296046257019, + "eval_train_reconstruction/fim": 0.1560649275779724, + "eval_train_reconstruction/first_seq": 0.14978590607643127, + "eval_train_reconstruction/last_seq": 0.3158336579799652, + "eval_train_reconstruction/second_seq": 0.18742257356643677, + "eval_train_runtime": 442.9931, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 30500 + }, + { + "epoch": 0.11380676350126452, + "grad_norm": 0.2961822748184204, + "learning_rate": 0.0006, + "loss": 2.2386, + "step": 30510 + }, + { + "epoch": 0.1138440649642279, + "grad_norm": 
0.775197446346283, + "learning_rate": 0.0006, + "loss": 2.258, + "step": 30520 + }, + { + "epoch": 0.11388136642719128, + "grad_norm": 0.3501628637313843, + "learning_rate": 0.0006, + "loss": 2.1065, + "step": 30530 + }, + { + "epoch": 0.11391866789015465, + "grad_norm": 0.29861465096473694, + "learning_rate": 0.0006, + "loss": 2.1868, + "step": 30540 + }, + { + "epoch": 0.11395596935311803, + "grad_norm": 0.3488769829273224, + "learning_rate": 0.0006, + "loss": 2.265, + "step": 30550 + }, + { + "epoch": 0.1139932708160814, + "grad_norm": 0.4689563512802124, + "learning_rate": 0.0006, + "loss": 2.1864, + "step": 30560 + }, + { + "epoch": 0.11403057227904478, + "grad_norm": 0.2566947638988495, + "learning_rate": 0.0006, + "loss": 2.2347, + "step": 30570 + }, + { + "epoch": 0.11406787374200816, + "grad_norm": 0.35498008131980896, + "learning_rate": 0.0006, + "loss": 2.3242, + "step": 30580 + }, + { + "epoch": 0.11410517520497154, + "grad_norm": 1.3050158023834229, + "learning_rate": 0.0006, + "loss": 2.0504, + "step": 30590 + }, + { + "epoch": 0.11414247666793492, + "grad_norm": 0.39813172817230225, + "learning_rate": 0.0006, + "loss": 2.1836, + "step": 30600 + }, + { + "epoch": 0.11417977813089829, + "grad_norm": 0.32964006066322327, + "learning_rate": 0.0006, + "loss": 2.1927, + "step": 30610 + }, + { + "epoch": 0.11421707959386167, + "grad_norm": 0.31147271394729614, + "learning_rate": 0.0006, + "loss": 2.241, + "step": 30620 + }, + { + "epoch": 0.11425438105682505, + "grad_norm": 0.23010343313217163, + "learning_rate": 0.0006, + "loss": 2.1314, + "step": 30630 + }, + { + "epoch": 0.11429168251978843, + "grad_norm": 0.3708725571632385, + "learning_rate": 0.0006, + "loss": 2.2213, + "step": 30640 + }, + { + "epoch": 0.1143289839827518, + "grad_norm": 0.35669684410095215, + "learning_rate": 0.0006, + "loss": 2.2895, + "step": 30650 + }, + { + "epoch": 0.11436628544571519, + "grad_norm": 0.26857420802116394, + "learning_rate": 0.0006, + "loss": 2.449, + "step": 30660 
+ }, + { + "epoch": 0.11440358690867856, + "grad_norm": 0.3192959129810333, + "learning_rate": 0.0006, + "loss": 2.1927, + "step": 30670 + }, + { + "epoch": 0.11444088837164193, + "grad_norm": 1.7194721698760986, + "learning_rate": 0.0006, + "loss": 2.2782, + "step": 30680 + }, + { + "epoch": 0.11447818983460531, + "grad_norm": 0.4264073073863983, + "learning_rate": 0.0006, + "loss": 2.2957, + "step": 30690 + }, + { + "epoch": 0.11451549129756869, + "grad_norm": 0.6408981680870056, + "learning_rate": 0.0006, + "loss": 2.3149, + "step": 30700 + }, + { + "epoch": 0.11455279276053207, + "grad_norm": 0.4555889666080475, + "learning_rate": 0.0006, + "loss": 2.1653, + "step": 30710 + }, + { + "epoch": 0.11459009422349545, + "grad_norm": 0.28549203276634216, + "learning_rate": 0.0006, + "loss": 2.0862, + "step": 30720 + }, + { + "epoch": 0.11462739568645883, + "grad_norm": 0.45808085799217224, + "learning_rate": 0.0006, + "loss": 2.1168, + "step": 30730 + }, + { + "epoch": 0.1146646971494222, + "grad_norm": 0.2806899845600128, + "learning_rate": 0.0006, + "loss": 2.2174, + "step": 30740 + }, + { + "epoch": 0.11470199861238557, + "grad_norm": 0.31403565406799316, + "learning_rate": 0.0006, + "loss": 2.1481, + "step": 30750 + }, + { + "epoch": 0.11470199861238557, + "eval_valid_loss": 2.1904001235961914, + "eval_valid_loss/all": 2.053328514099121, + "eval_valid_loss/end_span": 1.266337513923645, + "eval_valid_perplexity/batch": 7.793799877166748, + "eval_valid_perplexity/end_span": 3.547834873199463, + "eval_valid_perplexity/fim": 2.606630563735962, + "eval_valid_perplexity/first_seq": 14.296446800231934, + "eval_valid_perplexity/last_seq": 8.972135543823242, + "eval_valid_perplexity/second_seq": 13.975909233093262, + "eval_valid_perplexity/seq": 8.78376579284668, + "eval_valid_reconstruction/all": 0.29421454668045044, + "eval_valid_reconstruction/end_span": 0.6991656422615051, + "eval_valid_reconstruction/fim": 0.19127462804317474, + "eval_valid_reconstruction/first_seq": 
0.17998774349689484, + "eval_valid_reconstruction/last_seq": 0.32727450132369995, + "eval_valid_reconstruction/second_seq": 0.1864623874425888, + "eval_valid_runtime": 449.7367, + "eval_valid_samples_per_second": 0.427, + "eval_valid_steps_per_second": 0.427, + "step": 30750 + }, + { + "epoch": 0.11470199861238557, + "eval_train_loss": 2.18768572807312, + "eval_train_loss/all": 2.024447202682495, + "eval_train_loss/end_span": 1.2237880229949951, + "eval_train_perplexity/batch": 7.571924209594727, + "eval_train_perplexity/end_span": 3.400042772293091, + "eval_train_perplexity/fim": 1.960408091545105, + "eval_train_perplexity/first_seq": 15.794148445129395, + "eval_train_perplexity/last_seq": 8.841259002685547, + "eval_train_perplexity/second_seq": 14.475581169128418, + "eval_train_perplexity/seq": 8.715462684631348, + "eval_train_reconstruction/all": 0.28419235348701477, + "eval_train_reconstruction/end_span": 0.7096750140190125, + "eval_train_reconstruction/fim": 0.1357397437095642, + "eval_train_reconstruction/first_seq": 0.1462256759405136, + "eval_train_reconstruction/last_seq": 0.3276262879371643, + "eval_train_reconstruction/second_seq": 0.17682881653308868, + "eval_train_runtime": 440.4042, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 30750 + }, + { + "epoch": 0.11473930007534895, + "grad_norm": 0.37511080503463745, + "learning_rate": 0.0006, + "loss": 2.1757, + "step": 30760 + }, + { + "epoch": 0.11477660153831233, + "grad_norm": 1.2532801628112793, + "learning_rate": 0.0006, + "loss": 2.1532, + "step": 30770 + }, + { + "epoch": 0.11481390300127571, + "grad_norm": 0.3424473702907562, + "learning_rate": 0.0006, + "loss": 2.2655, + "step": 30780 + }, + { + "epoch": 0.11485120446423909, + "grad_norm": 0.33373287320137024, + "learning_rate": 0.0006, + "loss": 2.2958, + "step": 30790 + }, + { + "epoch": 0.11488850592720247, + "grad_norm": 0.3666076362133026, + "learning_rate": 0.0006, + "loss": 2.0594, + "step": 
30800 + }, + { + "epoch": 0.11492580739016585, + "grad_norm": 0.2922575771808624, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 30810 + }, + { + "epoch": 0.11496310885312921, + "grad_norm": 0.2985919117927551, + "learning_rate": 0.0006, + "loss": 2.085, + "step": 30820 + }, + { + "epoch": 0.1150004103160926, + "grad_norm": 0.38899925351142883, + "learning_rate": 0.0006, + "loss": 2.2965, + "step": 30830 + }, + { + "epoch": 0.11503771177905597, + "grad_norm": 0.2094178944826126, + "learning_rate": 0.0006, + "loss": 2.1186, + "step": 30840 + }, + { + "epoch": 0.11507501324201935, + "grad_norm": 0.2071063220500946, + "learning_rate": 0.0006, + "loss": 2.2864, + "step": 30850 + }, + { + "epoch": 0.11511231470498273, + "grad_norm": 0.25762036442756653, + "learning_rate": 0.0006, + "loss": 2.1889, + "step": 30860 + }, + { + "epoch": 0.11514961616794611, + "grad_norm": 0.527504026889801, + "learning_rate": 0.0006, + "loss": 2.1512, + "step": 30870 + }, + { + "epoch": 0.11518691763090949, + "grad_norm": 0.28920701146125793, + "learning_rate": 0.0006, + "loss": 2.2847, + "step": 30880 + }, + { + "epoch": 0.11522421909387286, + "grad_norm": 0.42397260665893555, + "learning_rate": 0.0006, + "loss": 2.2519, + "step": 30890 + }, + { + "epoch": 0.11526152055683624, + "grad_norm": 0.3331198990345001, + "learning_rate": 0.0006, + "loss": 2.1922, + "step": 30900 + }, + { + "epoch": 0.11529882201979962, + "grad_norm": 0.3797452747821808, + "learning_rate": 0.0006, + "loss": 2.0566, + "step": 30910 + }, + { + "epoch": 0.115336123482763, + "grad_norm": 0.38608431816101074, + "learning_rate": 0.0006, + "loss": 2.1128, + "step": 30920 + }, + { + "epoch": 0.11537342494572637, + "grad_norm": 0.3685465157032013, + "learning_rate": 0.0006, + "loss": 2.0824, + "step": 30930 + }, + { + "epoch": 0.11541072640868975, + "grad_norm": 0.4263441562652588, + "learning_rate": 0.0006, + "loss": 2.1577, + "step": 30940 + }, + { + "epoch": 0.11544802787165312, + "grad_norm": 0.4151438772678375, 
+ "learning_rate": 0.0006, + "loss": 2.2297, + "step": 30950 + }, + { + "epoch": 0.1154853293346165, + "grad_norm": 0.3284616470336914, + "learning_rate": 0.0006, + "loss": 2.3293, + "step": 30960 + }, + { + "epoch": 0.11552263079757988, + "grad_norm": 0.3889188766479492, + "learning_rate": 0.0006, + "loss": 2.0969, + "step": 30970 + }, + { + "epoch": 0.11555993226054326, + "grad_norm": 0.28066742420196533, + "learning_rate": 0.0006, + "loss": 2.3815, + "step": 30980 + }, + { + "epoch": 0.11559723372350664, + "grad_norm": 0.5871808528900146, + "learning_rate": 0.0006, + "loss": 2.2818, + "step": 30990 + }, + { + "epoch": 0.11563453518647002, + "grad_norm": 1.3274433612823486, + "learning_rate": 0.0006, + "loss": 2.1607, + "step": 31000 + }, + { + "epoch": 0.11563453518647002, + "eval_valid_loss": 2.1903810501098633, + "eval_valid_loss/all": 2.0535757541656494, + "eval_valid_loss/end_span": 1.2645657062530518, + "eval_valid_perplexity/batch": 7.795726776123047, + "eval_valid_perplexity/end_span": 3.5415544509887695, + "eval_valid_perplexity/fim": 2.1829419136047363, + "eval_valid_perplexity/first_seq": 14.70615005493164, + "eval_valid_perplexity/last_seq": 9.001007080078125, + "eval_valid_perplexity/second_seq": 13.71983814239502, + "eval_valid_perplexity/seq": 8.79001235961914, + "eval_valid_reconstruction/all": 0.29440274834632874, + "eval_valid_reconstruction/end_span": 0.7075287699699402, + "eval_valid_reconstruction/fim": 0.15591967105865479, + "eval_valid_reconstruction/first_seq": 0.17247939109802246, + "eval_valid_reconstruction/last_seq": 0.3263940215110779, + "eval_valid_reconstruction/second_seq": 0.19544853270053864, + "eval_valid_runtime": 448.7769, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 31000 + }, + { + "epoch": 0.11563453518647002, + "eval_train_loss": 2.1890714168548584, + "eval_train_loss/all": 2.0259182453155518, + "eval_train_loss/end_span": 1.2223474979400635, + "eval_train_perplexity/batch": 
7.583070755004883, + "eval_train_perplexity/end_span": 3.395148515701294, + "eval_train_perplexity/fim": 2.104227304458618, + "eval_train_perplexity/first_seq": 15.49614143371582, + "eval_train_perplexity/last_seq": 8.7150297164917, + "eval_train_perplexity/second_seq": 14.162958145141602, + "eval_train_perplexity/seq": 8.730870246887207, + "eval_train_reconstruction/all": 0.28383928537368774, + "eval_train_reconstruction/end_span": 0.7177010774612427, + "eval_train_reconstruction/fim": 0.1492166817188263, + "eval_train_reconstruction/first_seq": 0.15219734609127045, + "eval_train_reconstruction/last_seq": 0.33396637439727783, + "eval_train_reconstruction/second_seq": 0.18666395545005798, + "eval_train_runtime": 448.0156, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 31000 + }, + { + "epoch": 0.1156718366494334, + "grad_norm": 0.35053110122680664, + "learning_rate": 0.0006, + "loss": 2.2447, + "step": 31010 + }, + { + "epoch": 0.11570913811239676, + "grad_norm": 0.3040464222431183, + "learning_rate": 0.0006, + "loss": 2.1106, + "step": 31020 + }, + { + "epoch": 0.11574643957536014, + "grad_norm": 0.5406830906867981, + "learning_rate": 0.0006, + "loss": 2.1745, + "step": 31030 + }, + { + "epoch": 0.11578374103832352, + "grad_norm": 0.422683447599411, + "learning_rate": 0.0006, + "loss": 2.3444, + "step": 31040 + }, + { + "epoch": 0.1158210425012869, + "grad_norm": 0.28001606464385986, + "learning_rate": 0.0006, + "loss": 2.1225, + "step": 31050 + }, + { + "epoch": 0.11585834396425028, + "grad_norm": 0.26121705770492554, + "learning_rate": 0.0006, + "loss": 2.2718, + "step": 31060 + }, + { + "epoch": 0.11589564542721366, + "grad_norm": 0.2996496260166168, + "learning_rate": 0.0006, + "loss": 2.1901, + "step": 31070 + }, + { + "epoch": 0.11593294689017704, + "grad_norm": 0.4343946874141693, + "learning_rate": 0.0006, + "loss": 2.2003, + "step": 31080 + }, + { + "epoch": 0.1159702483531404, + "grad_norm": 
0.5358569622039795, + "learning_rate": 0.0006, + "loss": 2.1216, + "step": 31090 + }, + { + "epoch": 0.11600754981610378, + "grad_norm": 0.3973459303379059, + "learning_rate": 0.0006, + "loss": 2.2099, + "step": 31100 + }, + { + "epoch": 0.11604485127906716, + "grad_norm": 0.44428151845932007, + "learning_rate": 0.0006, + "loss": 2.1165, + "step": 31110 + }, + { + "epoch": 0.11608215274203054, + "grad_norm": 0.3463711738586426, + "learning_rate": 0.0006, + "loss": 1.9923, + "step": 31120 + }, + { + "epoch": 0.11611945420499392, + "grad_norm": 0.29919660091400146, + "learning_rate": 0.0006, + "loss": 2.2165, + "step": 31130 + }, + { + "epoch": 0.1161567556679573, + "grad_norm": 0.34469127655029297, + "learning_rate": 0.0006, + "loss": 2.2599, + "step": 31140 + }, + { + "epoch": 0.11619405713092068, + "grad_norm": 0.4666050374507904, + "learning_rate": 0.0006, + "loss": 2.1872, + "step": 31150 + }, + { + "epoch": 0.11623135859388405, + "grad_norm": 0.33235272765159607, + "learning_rate": 0.0006, + "loss": 2.0668, + "step": 31160 + }, + { + "epoch": 0.11626866005684743, + "grad_norm": 0.2772757112979889, + "learning_rate": 0.0006, + "loss": 2.1824, + "step": 31170 + }, + { + "epoch": 0.1163059615198108, + "grad_norm": 0.7000282406806946, + "learning_rate": 0.0006, + "loss": 2.2796, + "step": 31180 + }, + { + "epoch": 0.11634326298277418, + "grad_norm": 0.3797716796398163, + "learning_rate": 0.0006, + "loss": 2.272, + "step": 31190 + }, + { + "epoch": 0.11638056444573756, + "grad_norm": 0.38184598088264465, + "learning_rate": 0.0006, + "loss": 2.2297, + "step": 31200 + }, + { + "epoch": 0.11641786590870094, + "grad_norm": 0.37442463636398315, + "learning_rate": 0.0006, + "loss": 2.1685, + "step": 31210 + }, + { + "epoch": 0.11645516737166432, + "grad_norm": 0.30120396614074707, + "learning_rate": 0.0006, + "loss": 2.0722, + "step": 31220 + }, + { + "epoch": 0.11649246883462769, + "grad_norm": 0.39224377274513245, + "learning_rate": 0.0006, + "loss": 2.1864, + "step": 
31230 + }, + { + "epoch": 0.11652977029759107, + "grad_norm": 12.935247421264648, + "learning_rate": 0.0006, + "loss": 2.0821, + "step": 31240 + }, + { + "epoch": 0.11656707176055445, + "grad_norm": 0.2937852144241333, + "learning_rate": 0.0006, + "loss": 2.2146, + "step": 31250 + }, + { + "epoch": 0.11656707176055445, + "eval_valid_loss": 2.195115804672241, + "eval_valid_loss/all": 2.057403802871704, + "eval_valid_loss/end_span": 1.1599456071853638, + "eval_valid_perplexity/batch": 7.825626373291016, + "eval_valid_perplexity/end_span": 3.1897597312927246, + "eval_valid_perplexity/fim": 2.286343574523926, + "eval_valid_perplexity/first_seq": 14.744038581848145, + "eval_valid_perplexity/last_seq": 8.750946044921875, + "eval_valid_perplexity/second_seq": 13.845667839050293, + "eval_valid_perplexity/seq": 8.821544647216797, + "eval_valid_reconstruction/all": 0.2938651442527771, + "eval_valid_reconstruction/end_span": 0.7416850924491882, + "eval_valid_reconstruction/fim": 0.16478979587554932, + "eval_valid_reconstruction/first_seq": 0.17236225306987762, + "eval_valid_reconstruction/last_seq": 0.3347399830818176, + "eval_valid_reconstruction/second_seq": 0.19394439458847046, + "eval_valid_runtime": 446.335, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 31250 + }, + { + "epoch": 0.11656707176055445, + "eval_train_loss": 2.1946041584014893, + "eval_train_loss/all": 2.0306410789489746, + "eval_train_loss/end_span": 1.126314401626587, + "eval_train_perplexity/batch": 7.618968963623047, + "eval_train_perplexity/end_span": 3.084268093109131, + "eval_train_perplexity/fim": 2.350100517272949, + "eval_train_perplexity/first_seq": 15.323648452758789, + "eval_train_perplexity/last_seq": 9.231422424316406, + "eval_train_perplexity/second_seq": 13.758028984069824, + "eval_train_perplexity/seq": 8.774022102355957, + "eval_train_reconstruction/all": 0.2828928232192993, + "eval_train_reconstruction/end_span": 0.7531352639198303, + 
"eval_train_reconstruction/fim": 0.16850709915161133, + "eval_train_reconstruction/first_seq": 0.1544599086046219, + "eval_train_reconstruction/last_seq": 0.3160001039505005, + "eval_train_reconstruction/second_seq": 0.1964692920446396, + "eval_train_runtime": 446.9561, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 31250 + }, + { + "epoch": 0.11660437322351783, + "grad_norm": 0.6536126732826233, + "learning_rate": 0.0006, + "loss": 2.176, + "step": 31260 + }, + { + "epoch": 0.1166416746864812, + "grad_norm": 0.3699854612350464, + "learning_rate": 0.0006, + "loss": 2.1268, + "step": 31270 + }, + { + "epoch": 0.11667897614944459, + "grad_norm": 0.3298143446445465, + "learning_rate": 0.0006, + "loss": 2.1692, + "step": 31280 + }, + { + "epoch": 0.11671627761240796, + "grad_norm": 0.30188822746276855, + "learning_rate": 0.0006, + "loss": 2.2467, + "step": 31290 + }, + { + "epoch": 0.11675357907537133, + "grad_norm": 0.4936509430408478, + "learning_rate": 0.0006, + "loss": 2.059, + "step": 31300 + }, + { + "epoch": 0.11679088053833471, + "grad_norm": 0.2898666560649872, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 31310 + }, + { + "epoch": 0.11682818200129809, + "grad_norm": 0.5238190293312073, + "learning_rate": 0.0006, + "loss": 2.2459, + "step": 31320 + }, + { + "epoch": 0.11686548346426147, + "grad_norm": 0.30733928084373474, + "learning_rate": 0.0006, + "loss": 2.1559, + "step": 31330 + }, + { + "epoch": 0.11690278492722485, + "grad_norm": 0.3295579254627228, + "learning_rate": 0.0006, + "loss": 2.2201, + "step": 31340 + }, + { + "epoch": 0.11694008639018823, + "grad_norm": 0.3527715504169464, + "learning_rate": 0.0006, + "loss": 2.4162, + "step": 31350 + }, + { + "epoch": 0.11697738785315161, + "grad_norm": 0.5093117356300354, + "learning_rate": 0.0006, + "loss": 2.0651, + "step": 31360 + }, + { + "epoch": 0.11701468931611497, + "grad_norm": 0.29203638434410095, + "learning_rate": 0.0006, + "loss": 2.159, + 
"step": 31370 + }, + { + "epoch": 0.11705199077907835, + "grad_norm": 0.272025465965271, + "learning_rate": 0.0006, + "loss": 2.1466, + "step": 31380 + }, + { + "epoch": 0.11708929224204173, + "grad_norm": 0.36283838748931885, + "learning_rate": 0.0006, + "loss": 2.2962, + "step": 31390 + }, + { + "epoch": 0.11712659370500511, + "grad_norm": 0.3028932809829712, + "learning_rate": 0.0006, + "loss": 2.2137, + "step": 31400 + }, + { + "epoch": 0.11716389516796849, + "grad_norm": 0.3266291618347168, + "learning_rate": 0.0006, + "loss": 2.2596, + "step": 31410 + }, + { + "epoch": 0.11720119663093187, + "grad_norm": 0.28625497221946716, + "learning_rate": 0.0006, + "loss": 2.4012, + "step": 31420 + }, + { + "epoch": 0.11723849809389525, + "grad_norm": 0.36891043186187744, + "learning_rate": 0.0006, + "loss": 2.2747, + "step": 31430 + }, + { + "epoch": 0.11727579955685861, + "grad_norm": 0.2996677756309509, + "learning_rate": 0.0006, + "loss": 2.1861, + "step": 31440 + }, + { + "epoch": 0.117313101019822, + "grad_norm": 0.3801691234111786, + "learning_rate": 0.0006, + "loss": 2.1941, + "step": 31450 + }, + { + "epoch": 0.11735040248278537, + "grad_norm": 0.3482344150543213, + "learning_rate": 0.0006, + "loss": 2.2625, + "step": 31460 + }, + { + "epoch": 0.11738770394574875, + "grad_norm": 0.5826995372772217, + "learning_rate": 0.0006, + "loss": 2.066, + "step": 31470 + }, + { + "epoch": 0.11742500540871213, + "grad_norm": 0.3196304440498352, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 31480 + }, + { + "epoch": 0.11746230687167551, + "grad_norm": 0.43210867047309875, + "learning_rate": 0.0006, + "loss": 1.9502, + "step": 31490 + }, + { + "epoch": 0.11749960833463889, + "grad_norm": 0.273396760225296, + "learning_rate": 0.0006, + "loss": 2.347, + "step": 31500 + }, + { + "epoch": 0.11749960833463889, + "eval_valid_loss": 2.193251848220825, + "eval_valid_loss/all": 2.056110382080078, + "eval_valid_loss/end_span": 1.2764254808425903, + "eval_valid_perplexity/batch": 
7.815511226654053, + "eval_valid_perplexity/end_span": 3.5838065147399902, + "eval_valid_perplexity/fim": 2.3832273483276367, + "eval_valid_perplexity/first_seq": 14.690381050109863, + "eval_valid_perplexity/last_seq": 8.967381477355957, + "eval_valid_perplexity/second_seq": 13.409302711486816, + "eval_valid_perplexity/seq": 8.807305335998535, + "eval_valid_reconstruction/all": 0.2933889329433441, + "eval_valid_reconstruction/end_span": 0.6994830965995789, + "eval_valid_reconstruction/fim": 0.17385601997375488, + "eval_valid_reconstruction/first_seq": 0.17118875682353973, + "eval_valid_reconstruction/last_seq": 0.327269047498703, + "eval_valid_reconstruction/second_seq": 0.206463024020195, + "eval_valid_runtime": 448.4195, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 31500 + }, + { + "epoch": 0.11749960833463889, + "eval_train_loss": 2.1946353912353516, + "eval_train_loss/all": 2.0305514335632324, + "eval_train_loss/end_span": 1.250357747077942, + "eval_train_perplexity/batch": 7.6182861328125, + "eval_train_perplexity/end_span": 3.4915919303894043, + "eval_train_perplexity/fim": 2.1901206970214844, + "eval_train_perplexity/first_seq": 15.484526634216309, + "eval_train_perplexity/last_seq": 8.927042961120605, + "eval_train_perplexity/second_seq": 14.106548309326172, + "eval_train_perplexity/seq": 8.769115447998047, + "eval_train_reconstruction/all": 0.2822292447090149, + "eval_train_reconstruction/end_span": 0.7075698375701904, + "eval_train_reconstruction/fim": 0.15530362725257874, + "eval_train_reconstruction/first_seq": 0.15274538099765778, + "eval_train_reconstruction/last_seq": 0.32738399505615234, + "eval_train_reconstruction/second_seq": 0.1883310079574585, + "eval_train_runtime": 443.9249, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 31500 + }, + { + "epoch": 0.11753690979760226, + "grad_norm": 0.2546112835407257, + "learning_rate": 0.0006, + "loss": 2.2393, + 
"step": 31510 + }, + { + "epoch": 0.11757421126056564, + "grad_norm": 0.37859949469566345, + "learning_rate": 0.0006, + "loss": 2.1993, + "step": 31520 + }, + { + "epoch": 0.11761151272352902, + "grad_norm": 0.4631933271884918, + "learning_rate": 0.0006, + "loss": 2.125, + "step": 31530 + }, + { + "epoch": 0.1176488141864924, + "grad_norm": 0.2875710725784302, + "learning_rate": 0.0006, + "loss": 2.1701, + "step": 31540 + }, + { + "epoch": 0.11768611564945577, + "grad_norm": 0.4509950876235962, + "learning_rate": 0.0006, + "loss": 2.35, + "step": 31550 + }, + { + "epoch": 0.11772341711241915, + "grad_norm": 1.657263159751892, + "learning_rate": 0.0006, + "loss": 2.2225, + "step": 31560 + }, + { + "epoch": 0.11776071857538252, + "grad_norm": 0.3393774628639221, + "learning_rate": 0.0006, + "loss": 2.1734, + "step": 31570 + }, + { + "epoch": 0.1177980200383459, + "grad_norm": 0.34517496824264526, + "learning_rate": 0.0006, + "loss": 2.2818, + "step": 31580 + }, + { + "epoch": 0.11783532150130928, + "grad_norm": 0.4123334586620331, + "learning_rate": 0.0006, + "loss": 2.2802, + "step": 31590 + }, + { + "epoch": 0.11787262296427266, + "grad_norm": 0.30174651741981506, + "learning_rate": 0.0006, + "loss": 2.2358, + "step": 31600 + }, + { + "epoch": 0.11790992442723604, + "grad_norm": 0.3469681143760681, + "learning_rate": 0.0006, + "loss": 2.194, + "step": 31610 + }, + { + "epoch": 0.11794722589019942, + "grad_norm": 0.26319608092308044, + "learning_rate": 0.0006, + "loss": 2.2131, + "step": 31620 + }, + { + "epoch": 0.1179845273531628, + "grad_norm": 0.33392956852912903, + "learning_rate": 0.0006, + "loss": 2.2351, + "step": 31630 + }, + { + "epoch": 0.11802182881612616, + "grad_norm": 0.2524607181549072, + "learning_rate": 0.0006, + "loss": 2.115, + "step": 31640 + }, + { + "epoch": 0.11805913027908954, + "grad_norm": 0.5032316446304321, + "learning_rate": 0.0006, + "loss": 2.0781, + "step": 31650 + }, + { + "epoch": 0.11809643174205292, + "grad_norm": 
0.34302735328674316, + "learning_rate": 0.0006, + "loss": 2.2337, + "step": 31660 + }, + { + "epoch": 0.1181337332050163, + "grad_norm": 0.3341873288154602, + "learning_rate": 0.0006, + "loss": 2.3695, + "step": 31670 + }, + { + "epoch": 0.11817103466797968, + "grad_norm": 0.3987048864364624, + "learning_rate": 0.0006, + "loss": 2.1829, + "step": 31680 + }, + { + "epoch": 0.11820833613094306, + "grad_norm": 0.39078742265701294, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 31690 + }, + { + "epoch": 0.11824563759390644, + "grad_norm": 0.4329628348350525, + "learning_rate": 0.0006, + "loss": 2.3005, + "step": 31700 + }, + { + "epoch": 0.1182829390568698, + "grad_norm": 0.5519291758537292, + "learning_rate": 0.0006, + "loss": 2.1091, + "step": 31710 + }, + { + "epoch": 0.11832024051983318, + "grad_norm": 0.29502567648887634, + "learning_rate": 0.0006, + "loss": 2.3657, + "step": 31720 + }, + { + "epoch": 0.11835754198279656, + "grad_norm": 0.3869779407978058, + "learning_rate": 0.0006, + "loss": 2.3214, + "step": 31730 + }, + { + "epoch": 0.11839484344575994, + "grad_norm": 0.3429450988769531, + "learning_rate": 0.0006, + "loss": 2.3011, + "step": 31740 + }, + { + "epoch": 0.11843214490872332, + "grad_norm": 0.314247727394104, + "learning_rate": 0.0006, + "loss": 2.052, + "step": 31750 + }, + { + "epoch": 0.11843214490872332, + "eval_valid_loss": 2.1921377182006836, + "eval_valid_loss/all": 2.055234432220459, + "eval_valid_loss/end_span": 1.1478450298309326, + "eval_valid_perplexity/batch": 7.80866813659668, + "eval_valid_perplexity/end_span": 3.1513943672180176, + "eval_valid_perplexity/fim": 2.287595272064209, + "eval_valid_perplexity/first_seq": 14.761040687561035, + "eval_valid_perplexity/last_seq": 9.002253532409668, + "eval_valid_perplexity/second_seq": 13.18797492980957, + "eval_valid_perplexity/seq": 8.80561351776123, + "eval_valid_reconstruction/all": 0.29402583837509155, + "eval_valid_reconstruction/end_span": 0.737419605255127, + 
"eval_valid_reconstruction/fim": 0.16504167020320892, + "eval_valid_reconstruction/first_seq": 0.17172089219093323, + "eval_valid_reconstruction/last_seq": 0.32513293623924255, + "eval_valid_reconstruction/second_seq": 0.21056194603443146, + "eval_valid_runtime": 445.6866, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 31750 + }, + { + "epoch": 0.11843214490872332, + "eval_train_loss": 2.1924362182617188, + "eval_train_loss/all": 2.0287411212921143, + "eval_train_loss/end_span": 1.1101489067077637, + "eval_train_perplexity/batch": 7.604506969451904, + "eval_train_perplexity/end_span": 3.0348103046417236, + "eval_train_perplexity/fim": 2.0873019695281982, + "eval_train_perplexity/first_seq": 15.502951622009277, + "eval_train_perplexity/last_seq": 8.674612045288086, + "eval_train_perplexity/second_seq": 14.28614330291748, + "eval_train_perplexity/seq": 8.758740425109863, + "eval_train_reconstruction/all": 0.28303489089012146, + "eval_train_reconstruction/end_span": 0.7490305304527283, + "eval_train_reconstruction/fim": 0.14657925069332123, + "eval_train_reconstruction/first_seq": 0.15150579810142517, + "eval_train_reconstruction/last_seq": 0.33677172660827637, + "eval_train_reconstruction/second_seq": 0.1831386685371399, + "eval_train_runtime": 442.9598, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 31750 + }, + { + "epoch": 0.1184694463716867, + "grad_norm": 0.3414407968521118, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 31760 + }, + { + "epoch": 0.11850674783465008, + "grad_norm": 0.27690523862838745, + "learning_rate": 0.0006, + "loss": 2.1531, + "step": 31770 + }, + { + "epoch": 0.11854404929761345, + "grad_norm": 0.335328072309494, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 31780 + }, + { + "epoch": 0.11858135076057683, + "grad_norm": 0.31297168135643005, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 31790 + }, + { + "epoch": 
0.1186186522235402, + "grad_norm": 0.7311064600944519, + "learning_rate": 0.0006, + "loss": 2.2202, + "step": 31800 + }, + { + "epoch": 0.11865595368650358, + "grad_norm": 0.3991740047931671, + "learning_rate": 0.0006, + "loss": 2.3739, + "step": 31810 + }, + { + "epoch": 0.11869325514946696, + "grad_norm": 0.22500821948051453, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 31820 + }, + { + "epoch": 0.11873055661243034, + "grad_norm": 0.31762179732322693, + "learning_rate": 0.0006, + "loss": 2.1332, + "step": 31830 + }, + { + "epoch": 0.11876785807539372, + "grad_norm": 0.5553642511367798, + "learning_rate": 0.0006, + "loss": 2.2604, + "step": 31840 + }, + { + "epoch": 0.11880515953835709, + "grad_norm": 0.37675029039382935, + "learning_rate": 0.0006, + "loss": 2.1477, + "step": 31850 + }, + { + "epoch": 0.11884246100132047, + "grad_norm": 0.30244532227516174, + "learning_rate": 0.0006, + "loss": 2.1834, + "step": 31860 + }, + { + "epoch": 0.11887976246428385, + "grad_norm": 0.25508201122283936, + "learning_rate": 0.0006, + "loss": 2.2379, + "step": 31870 + }, + { + "epoch": 0.11891706392724723, + "grad_norm": 0.4549157917499542, + "learning_rate": 0.0006, + "loss": 2.0875, + "step": 31880 + }, + { + "epoch": 0.1189543653902106, + "grad_norm": 0.31441545486450195, + "learning_rate": 0.0006, + "loss": 2.2531, + "step": 31890 + }, + { + "epoch": 0.11899166685317399, + "grad_norm": 0.32069990038871765, + "learning_rate": 0.0006, + "loss": 2.2458, + "step": 31900 + }, + { + "epoch": 0.11902896831613737, + "grad_norm": 0.2983745038509369, + "learning_rate": 0.0006, + "loss": 2.3442, + "step": 31910 + }, + { + "epoch": 0.11906626977910073, + "grad_norm": 0.4305446743965149, + "learning_rate": 0.0006, + "loss": 2.0703, + "step": 31920 + }, + { + "epoch": 0.11910357124206411, + "grad_norm": 0.38537678122520447, + "learning_rate": 0.0006, + "loss": 2.0764, + "step": 31930 + }, + { + "epoch": 0.11914087270502749, + "grad_norm": 0.3429096043109894, + "learning_rate": 
0.0006, + "loss": 2.2955, + "step": 31940 + }, + { + "epoch": 0.11917817416799087, + "grad_norm": 0.4511386454105377, + "learning_rate": 0.0006, + "loss": 2.114, + "step": 31950 + }, + { + "epoch": 0.11921547563095425, + "grad_norm": 0.3581865429878235, + "learning_rate": 0.0006, + "loss": 2.3158, + "step": 31960 + }, + { + "epoch": 0.11925277709391763, + "grad_norm": 0.2605074942111969, + "learning_rate": 0.0006, + "loss": 2.1741, + "step": 31970 + }, + { + "epoch": 0.11929007855688101, + "grad_norm": 0.43663984537124634, + "learning_rate": 0.0006, + "loss": 2.2425, + "step": 31980 + }, + { + "epoch": 0.11932738001984437, + "grad_norm": 0.28266265988349915, + "learning_rate": 0.0006, + "loss": 2.2089, + "step": 31990 + }, + { + "epoch": 0.11936468148280775, + "grad_norm": 0.5217158198356628, + "learning_rate": 0.0006, + "loss": 2.1499, + "step": 32000 + }, + { + "epoch": 0.11936468148280775, + "eval_valid_loss": 2.19567608833313, + "eval_valid_loss/all": 2.0587785243988037, + "eval_valid_loss/end_span": 1.2037204504013062, + "eval_valid_perplexity/batch": 7.836391925811768, + "eval_valid_perplexity/end_span": 3.3324923515319824, + "eval_valid_perplexity/fim": 2.257476568222046, + "eval_valid_perplexity/first_seq": 15.259275436401367, + "eval_valid_perplexity/last_seq": 9.047460556030273, + "eval_valid_perplexity/second_seq": 13.75878620147705, + "eval_valid_perplexity/seq": 8.845248222351074, + "eval_valid_reconstruction/all": 0.29282909631729126, + "eval_valid_reconstruction/end_span": 0.717093825340271, + "eval_valid_reconstruction/fim": 0.16172342002391815, + "eval_valid_reconstruction/first_seq": 0.15879705548286438, + "eval_valid_reconstruction/last_seq": 0.32309862971305847, + "eval_valid_reconstruction/second_seq": 0.19501517713069916, + "eval_valid_runtime": 450.0053, + "eval_valid_samples_per_second": 0.427, + "eval_valid_steps_per_second": 0.427, + "step": 32000 + }, + { + "epoch": 0.11936468148280775, + "eval_train_loss": 2.1915767192840576, + 
"eval_train_loss/all": 2.028080940246582, + "eval_train_loss/end_span": 1.1561542749404907, + "eval_train_perplexity/batch": 7.599488735198975, + "eval_train_perplexity/end_span": 3.17768931388855, + "eval_train_perplexity/fim": 2.048281192779541, + "eval_train_perplexity/first_seq": 15.597968101501465, + "eval_train_perplexity/last_seq": 8.945660591125488, + "eval_train_perplexity/second_seq": 14.656083106994629, + "eval_train_perplexity/seq": 8.757935523986816, + "eval_train_reconstruction/all": 0.28334513306617737, + "eval_train_reconstruction/end_span": 0.7311310768127441, + "eval_train_reconstruction/fim": 0.14263322949409485, + "eval_train_reconstruction/first_seq": 0.14959551393985748, + "eval_train_reconstruction/last_seq": 0.3255726099014282, + "eval_train_reconstruction/second_seq": 0.1753426045179367, + "eval_train_runtime": 443.8495, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 32000 + }, + { + "epoch": 0.11940198294577113, + "grad_norm": 0.4013538956642151, + "learning_rate": 0.0006, + "loss": 2.1479, + "step": 32010 + }, + { + "epoch": 0.11943928440873451, + "grad_norm": 0.3257390558719635, + "learning_rate": 0.0006, + "loss": 2.144, + "step": 32020 + }, + { + "epoch": 0.11947658587169789, + "grad_norm": 0.297804594039917, + "learning_rate": 0.0006, + "loss": 2.1445, + "step": 32030 + }, + { + "epoch": 0.11951388733466127, + "grad_norm": 0.30540743470191956, + "learning_rate": 0.0006, + "loss": 2.3185, + "step": 32040 + }, + { + "epoch": 0.11955118879762465, + "grad_norm": 0.35860297083854675, + "learning_rate": 0.0006, + "loss": 2.0962, + "step": 32050 + }, + { + "epoch": 0.11958849026058802, + "grad_norm": 0.5285745859146118, + "learning_rate": 0.0006, + "loss": 2.1483, + "step": 32060 + }, + { + "epoch": 0.1196257917235514, + "grad_norm": 0.332425981760025, + "learning_rate": 0.0006, + "loss": 2.3631, + "step": 32070 + }, + { + "epoch": 0.11966309318651477, + "grad_norm": 0.3275154232978821, + 
"learning_rate": 0.0006, + "loss": 2.182, + "step": 32080 + }, + { + "epoch": 0.11970039464947815, + "grad_norm": 0.38817843794822693, + "learning_rate": 0.0006, + "loss": 2.3039, + "step": 32090 + }, + { + "epoch": 0.11973769611244153, + "grad_norm": 0.45671898126602173, + "learning_rate": 0.0006, + "loss": 2.3824, + "step": 32100 + }, + { + "epoch": 0.11977499757540491, + "grad_norm": 0.3635548949241638, + "learning_rate": 0.0006, + "loss": 2.2317, + "step": 32110 + }, + { + "epoch": 0.11981229903836828, + "grad_norm": 0.30736181139945984, + "learning_rate": 0.0006, + "loss": 2.2973, + "step": 32120 + }, + { + "epoch": 0.11984960050133166, + "grad_norm": 0.2882140576839447, + "learning_rate": 0.0006, + "loss": 2.2092, + "step": 32130 + }, + { + "epoch": 0.11988690196429504, + "grad_norm": 0.3914002478122711, + "learning_rate": 0.0006, + "loss": 2.1531, + "step": 32140 + }, + { + "epoch": 0.11992420342725842, + "grad_norm": 0.3280617892742157, + "learning_rate": 0.0006, + "loss": 2.3533, + "step": 32150 + }, + { + "epoch": 0.1199615048902218, + "grad_norm": 0.34614861011505127, + "learning_rate": 0.0006, + "loss": 2.088, + "step": 32160 + }, + { + "epoch": 0.11999880635318518, + "grad_norm": 0.27127593755722046, + "learning_rate": 0.0006, + "loss": 2.3113, + "step": 32170 + }, + { + "epoch": 0.12003610781614855, + "grad_norm": 0.2669001519680023, + "learning_rate": 0.0006, + "loss": 2.142, + "step": 32180 + }, + { + "epoch": 0.12007340927911192, + "grad_norm": 0.42830130457878113, + "learning_rate": 0.0006, + "loss": 2.228, + "step": 32190 + }, + { + "epoch": 0.1201107107420753, + "grad_norm": 0.29059556126594543, + "learning_rate": 0.0006, + "loss": 2.1862, + "step": 32200 + }, + { + "epoch": 0.12014801220503868, + "grad_norm": 0.301690012216568, + "learning_rate": 0.0006, + "loss": 1.9452, + "step": 32210 + }, + { + "epoch": 0.12018531366800206, + "grad_norm": 0.3524816334247589, + "learning_rate": 0.0006, + "loss": 2.4009, + "step": 32220 + }, + { + "epoch": 
0.12022261513096544, + "grad_norm": 0.3094330132007599, + "learning_rate": 0.0006, + "loss": 2.2253, + "step": 32230 + }, + { + "epoch": 0.12025991659392882, + "grad_norm": 0.4820084273815155, + "learning_rate": 0.0006, + "loss": 2.3612, + "step": 32240 + }, + { + "epoch": 0.1202972180568922, + "grad_norm": 0.26967811584472656, + "learning_rate": 0.0006, + "loss": 2.325, + "step": 32250 + }, + { + "epoch": 0.1202972180568922, + "eval_valid_loss": 2.191404342651367, + "eval_valid_loss/all": 2.0543456077575684, + "eval_valid_loss/end_span": 1.2302470207214355, + "eval_valid_perplexity/batch": 7.801730632781982, + "eval_valid_perplexity/end_span": 3.422074794769287, + "eval_valid_perplexity/fim": 2.0851542949676514, + "eval_valid_perplexity/first_seq": 14.775592803955078, + "eval_valid_perplexity/last_seq": 8.91150951385498, + "eval_valid_perplexity/second_seq": 13.842970848083496, + "eval_valid_perplexity/seq": 8.790465354919434, + "eval_valid_reconstruction/all": 0.293958842754364, + "eval_valid_reconstruction/end_span": 0.7152649760246277, + "eval_valid_reconstruction/fim": 0.14730143547058105, + "eval_valid_reconstruction/first_seq": 0.16896268725395203, + "eval_valid_reconstruction/last_seq": 0.33167675137519836, + "eval_valid_reconstruction/second_seq": 0.18999549746513367, + "eval_valid_runtime": 446.0145, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 32250 + }, + { + "epoch": 0.1202972180568922, + "eval_train_loss": 2.1899290084838867, + "eval_train_loss/all": 2.026228904724121, + "eval_train_loss/end_span": 1.1960169076919556, + "eval_train_perplexity/batch": 7.5854268074035645, + "eval_train_perplexity/end_span": 3.3069188594818115, + "eval_train_perplexity/fim": 2.188857316970825, + "eval_train_perplexity/first_seq": 15.691198348999023, + "eval_train_perplexity/last_seq": 9.276844024658203, + "eval_train_perplexity/second_seq": 14.109444618225098, + "eval_train_perplexity/seq": 8.72901725769043, + 
"eval_train_reconstruction/all": 0.2837955355644226, + "eval_train_reconstruction/end_span": 0.7234505414962769, + "eval_train_reconstruction/fim": 0.15675008296966553, + "eval_train_reconstruction/first_seq": 0.14703431725502014, + "eval_train_reconstruction/last_seq": 0.31309717893600464, + "eval_train_reconstruction/second_seq": 0.19032028317451477, + "eval_train_runtime": 445.9942, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 32250 + }, + { + "epoch": 0.12033451951985556, + "grad_norm": 0.3027627766132355, + "learning_rate": 0.0006, + "loss": 2.3154, + "step": 32260 + }, + { + "epoch": 0.12037182098281894, + "grad_norm": 0.5361747741699219, + "learning_rate": 0.0006, + "loss": 2.028, + "step": 32270 + }, + { + "epoch": 0.12040912244578232, + "grad_norm": 0.34570449590682983, + "learning_rate": 0.0006, + "loss": 2.0099, + "step": 32280 + }, + { + "epoch": 0.1204464239087457, + "grad_norm": 0.24189449846744537, + "learning_rate": 0.0006, + "loss": 2.013, + "step": 32290 + }, + { + "epoch": 0.12048372537170908, + "grad_norm": 0.4061657786369324, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 32300 + }, + { + "epoch": 0.12052102683467246, + "grad_norm": 0.3453228175640106, + "learning_rate": 0.0006, + "loss": 2.2105, + "step": 32310 + }, + { + "epoch": 0.12055832829763584, + "grad_norm": 0.4775119125843048, + "learning_rate": 0.0006, + "loss": 2.2741, + "step": 32320 + }, + { + "epoch": 0.1205956297605992, + "grad_norm": 0.4572742283344269, + "learning_rate": 0.0006, + "loss": 2.344, + "step": 32330 + }, + { + "epoch": 0.12063293122356258, + "grad_norm": 0.33053508400917053, + "learning_rate": 0.0006, + "loss": 2.2729, + "step": 32340 + }, + { + "epoch": 0.12067023268652596, + "grad_norm": 0.35086578130722046, + "learning_rate": 0.0006, + "loss": 2.3246, + "step": 32350 + }, + { + "epoch": 0.12070753414948934, + "grad_norm": 0.30941063165664673, + "learning_rate": 0.0006, + "loss": 2.2486, + "step": 32360 + }, + 
{ + "epoch": 0.12074483561245272, + "grad_norm": 0.3714693486690521, + "learning_rate": 0.0006, + "loss": 2.1143, + "step": 32370 + }, + { + "epoch": 0.1207821370754161, + "grad_norm": 0.34588053822517395, + "learning_rate": 0.0006, + "loss": 2.0537, + "step": 32380 + }, + { + "epoch": 0.12081943853837948, + "grad_norm": 0.3160041570663452, + "learning_rate": 0.0006, + "loss": 2.308, + "step": 32390 + }, + { + "epoch": 0.12085674000134285, + "grad_norm": 0.4229719340801239, + "learning_rate": 0.0006, + "loss": 2.1663, + "step": 32400 + }, + { + "epoch": 0.12089404146430623, + "grad_norm": 0.2884368300437927, + "learning_rate": 0.0006, + "loss": 2.2881, + "step": 32410 + }, + { + "epoch": 0.1209313429272696, + "grad_norm": 0.3245297074317932, + "learning_rate": 0.0006, + "loss": 2.0405, + "step": 32420 + }, + { + "epoch": 0.12096864439023298, + "grad_norm": 0.30956223607063293, + "learning_rate": 0.0006, + "loss": 2.1218, + "step": 32430 + }, + { + "epoch": 0.12100594585319636, + "grad_norm": 0.3910176157951355, + "learning_rate": 0.0006, + "loss": 2.3113, + "step": 32440 + }, + { + "epoch": 0.12104324731615974, + "grad_norm": 0.6522216796875, + "learning_rate": 0.0006, + "loss": 2.0051, + "step": 32450 + }, + { + "epoch": 0.12108054877912312, + "grad_norm": 0.4276267886161804, + "learning_rate": 0.0006, + "loss": 2.2449, + "step": 32460 + }, + { + "epoch": 0.12111785024208649, + "grad_norm": 0.30471861362457275, + "learning_rate": 0.0006, + "loss": 2.3251, + "step": 32470 + }, + { + "epoch": 0.12115515170504987, + "grad_norm": 0.3484245538711548, + "learning_rate": 0.0006, + "loss": 2.2916, + "step": 32480 + }, + { + "epoch": 0.12119245316801325, + "grad_norm": 1.5720767974853516, + "learning_rate": 0.0006, + "loss": 2.061, + "step": 32490 + }, + { + "epoch": 0.12122975463097663, + "grad_norm": 0.3368013799190521, + "learning_rate": 0.0006, + "loss": 2.2166, + "step": 32500 + }, + { + "epoch": 0.12122975463097663, + "eval_valid_loss": 2.1906373500823975, + 
"eval_valid_loss/all": 2.0533480644226074, + "eval_valid_loss/end_span": 1.2706254720687866, + "eval_valid_perplexity/batch": 7.793951988220215, + "eval_valid_perplexity/end_span": 3.5630805492401123, + "eval_valid_perplexity/fim": 2.2214438915252686, + "eval_valid_perplexity/first_seq": 15.163317680358887, + "eval_valid_perplexity/last_seq": 9.25353717803955, + "eval_valid_perplexity/second_seq": 13.581206321716309, + "eval_valid_perplexity/seq": 8.782964706420898, + "eval_valid_reconstruction/all": 0.29464682936668396, + "eval_valid_reconstruction/end_span": 0.7019321918487549, + "eval_valid_reconstruction/fim": 0.15944376587867737, + "eval_valid_reconstruction/first_seq": 0.16140863299369812, + "eval_valid_reconstruction/last_seq": 0.31679946184158325, + "eval_valid_reconstruction/second_seq": 0.19903545081615448, + "eval_valid_runtime": 448.4454, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 32500 + }, + { + "epoch": 0.12122975463097663, + "eval_train_loss": 2.1909584999084473, + "eval_train_loss/all": 2.0270750522613525, + "eval_train_loss/end_span": 1.2345339059829712, + "eval_train_perplexity/batch": 7.591847896575928, + "eval_train_perplexity/end_span": 3.4367763996124268, + "eval_train_perplexity/fim": 2.05222225189209, + "eval_train_perplexity/first_seq": 15.483562469482422, + "eval_train_perplexity/last_seq": 9.44680404663086, + "eval_train_perplexity/second_seq": 14.167139053344727, + "eval_train_perplexity/seq": 8.7405424118042, + "eval_train_reconstruction/all": 0.28362706303596497, + "eval_train_reconstruction/end_span": 0.7128308415412903, + "eval_train_reconstruction/fim": 0.1440264731645584, + "eval_train_reconstruction/first_seq": 0.1525162309408188, + "eval_train_reconstruction/last_seq": 0.31159237027168274, + "eval_train_reconstruction/second_seq": 0.1847502887248993, + "eval_train_runtime": 453.5806, + "eval_train_samples_per_second": 0.423, + "eval_train_steps_per_second": 0.423, + "step": 32500 
+ }, + { + "epoch": 0.12126705609394, + "grad_norm": 0.3667497932910919, + "learning_rate": 0.0006, + "loss": 2.1968, + "step": 32510 + }, + { + "epoch": 0.12130435755690339, + "grad_norm": 0.3251349627971649, + "learning_rate": 0.0006, + "loss": 2.06, + "step": 32520 + }, + { + "epoch": 0.12134165901986677, + "grad_norm": 0.38377806544303894, + "learning_rate": 0.0006, + "loss": 2.0863, + "step": 32530 + }, + { + "epoch": 0.12137896048283013, + "grad_norm": 0.3287907540798187, + "learning_rate": 0.0006, + "loss": 2.2115, + "step": 32540 + }, + { + "epoch": 0.12141626194579351, + "grad_norm": 0.41047561168670654, + "learning_rate": 0.0006, + "loss": 2.2273, + "step": 32550 + }, + { + "epoch": 0.12145356340875689, + "grad_norm": 0.2532126307487488, + "learning_rate": 0.0006, + "loss": 2.2774, + "step": 32560 + }, + { + "epoch": 0.12149086487172027, + "grad_norm": 0.4589654803276062, + "learning_rate": 0.0006, + "loss": 2.3082, + "step": 32570 + }, + { + "epoch": 0.12152816633468365, + "grad_norm": 0.28815993666648865, + "learning_rate": 0.0006, + "loss": 2.21, + "step": 32580 + }, + { + "epoch": 0.12156546779764703, + "grad_norm": 0.44822171330451965, + "learning_rate": 0.0006, + "loss": 2.303, + "step": 32590 + }, + { + "epoch": 0.12160276926061041, + "grad_norm": 0.24307647347450256, + "learning_rate": 0.0006, + "loss": 2.391, + "step": 32600 + }, + { + "epoch": 0.12164007072357377, + "grad_norm": 0.29220181703567505, + "learning_rate": 0.0006, + "loss": 2.253, + "step": 32610 + }, + { + "epoch": 0.12167737218653715, + "grad_norm": 0.33914515376091003, + "learning_rate": 0.0006, + "loss": 2.1907, + "step": 32620 + }, + { + "epoch": 0.12171467364950053, + "grad_norm": 0.2596885561943054, + "learning_rate": 0.0006, + "loss": 2.2072, + "step": 32630 + }, + { + "epoch": 0.12175197511246391, + "grad_norm": 0.4067549705505371, + "learning_rate": 0.0006, + "loss": 2.1533, + "step": 32640 + }, + { + "epoch": 0.12178927657542729, + "grad_norm": 0.38013404607772827, + 
"learning_rate": 0.0006, + "loss": 2.2658, + "step": 32650 + }, + { + "epoch": 0.12182657803839067, + "grad_norm": 0.43311089277267456, + "learning_rate": 0.0006, + "loss": 2.072, + "step": 32660 + }, + { + "epoch": 0.12186387950135405, + "grad_norm": 0.40383055806159973, + "learning_rate": 0.0006, + "loss": 2.2899, + "step": 32670 + }, + { + "epoch": 0.12190118096431742, + "grad_norm": 0.324421226978302, + "learning_rate": 0.0006, + "loss": 2.2619, + "step": 32680 + }, + { + "epoch": 0.1219384824272808, + "grad_norm": 0.3906266689300537, + "learning_rate": 0.0006, + "loss": 2.3532, + "step": 32690 + }, + { + "epoch": 0.12197578389024417, + "grad_norm": 0.36052390933036804, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 32700 + }, + { + "epoch": 0.12201308535320755, + "grad_norm": 0.3891521394252777, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 32710 + }, + { + "epoch": 0.12205038681617093, + "grad_norm": 0.20418891310691833, + "learning_rate": 0.0006, + "loss": 2.2851, + "step": 32720 + }, + { + "epoch": 0.12208768827913431, + "grad_norm": 0.3957444131374359, + "learning_rate": 0.0006, + "loss": 2.1939, + "step": 32730 + }, + { + "epoch": 0.12212498974209768, + "grad_norm": 0.3488689959049225, + "learning_rate": 0.0006, + "loss": 2.1522, + "step": 32740 + }, + { + "epoch": 0.12216229120506106, + "grad_norm": 0.25872379541397095, + "learning_rate": 0.0006, + "loss": 2.2942, + "step": 32750 + }, + { + "epoch": 0.12216229120506106, + "eval_valid_loss": 2.195613145828247, + "eval_valid_loss/all": 2.058063507080078, + "eval_valid_loss/end_span": 1.3605433702468872, + "eval_valid_perplexity/batch": 7.830790996551514, + "eval_valid_perplexity/end_span": 3.898310899734497, + "eval_valid_perplexity/fim": 2.4199624061584473, + "eval_valid_perplexity/first_seq": 14.729819297790527, + "eval_valid_perplexity/last_seq": 8.914628982543945, + "eval_valid_perplexity/second_seq": 13.329989433288574, + "eval_valid_perplexity/seq": 8.822396278381348, + 
"eval_valid_reconstruction/all": 0.29311448335647583, + "eval_valid_reconstruction/end_span": 0.6804417967796326, + "eval_valid_reconstruction/fim": 0.17536906898021698, + "eval_valid_reconstruction/first_seq": 0.17242836952209473, + "eval_valid_reconstruction/last_seq": 0.33139434456825256, + "eval_valid_reconstruction/second_seq": 0.20630018413066864, + "eval_valid_runtime": 443.2325, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 32750 + }, + { + "epoch": 0.12216229120506106, + "eval_train_loss": 2.1933324337005615, + "eval_train_loss/all": 2.0292630195617676, + "eval_train_loss/end_span": 1.3229988813400269, + "eval_train_perplexity/batch": 7.6084771156311035, + "eval_train_perplexity/end_span": 3.754664182662964, + "eval_train_perplexity/fim": 2.2168478965759277, + "eval_train_perplexity/first_seq": 15.664083480834961, + "eval_train_perplexity/last_seq": 9.056009292602539, + "eval_train_perplexity/second_seq": 13.782172203063965, + "eval_train_perplexity/seq": 8.75927734375, + "eval_train_reconstruction/all": 0.2829800844192505, + "eval_train_reconstruction/end_span": 0.692452609539032, + "eval_train_reconstruction/fim": 0.15818332135677338, + "eval_train_reconstruction/first_seq": 0.14812737703323364, + "eval_train_reconstruction/last_seq": 0.3219222128391266, + "eval_train_reconstruction/second_seq": 0.1955893635749817, + "eval_train_runtime": 445.4357, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 32750 + }, + { + "epoch": 0.12219959266802444, + "grad_norm": 0.31166312098503113, + "learning_rate": 0.0006, + "loss": 2.1662, + "step": 32760 + }, + { + "epoch": 0.12223689413098782, + "grad_norm": 0.41255253553390503, + "learning_rate": 0.0006, + "loss": 2.166, + "step": 32770 + }, + { + "epoch": 0.1222741955939512, + "grad_norm": 0.3450455963611603, + "learning_rate": 0.0006, + "loss": 2.0952, + "step": 32780 + }, + { + "epoch": 0.12231149705691458, + "grad_norm": 
0.23957473039627075, + "learning_rate": 0.0006, + "loss": 2.2702, + "step": 32790 + }, + { + "epoch": 0.12234879851987795, + "grad_norm": 0.3002817928791046, + "learning_rate": 0.0006, + "loss": 2.0613, + "step": 32800 + }, + { + "epoch": 0.12238609998284132, + "grad_norm": 0.3109709620475769, + "learning_rate": 0.0006, + "loss": 2.0244, + "step": 32810 + }, + { + "epoch": 0.1224234014458047, + "grad_norm": 0.2539105713367462, + "learning_rate": 0.0006, + "loss": 2.3359, + "step": 32820 + }, + { + "epoch": 0.12246070290876808, + "grad_norm": 0.42010074853897095, + "learning_rate": 0.0006, + "loss": 2.2637, + "step": 32830 + }, + { + "epoch": 0.12249800437173146, + "grad_norm": 0.3652437627315521, + "learning_rate": 0.0006, + "loss": 2.1848, + "step": 32840 + }, + { + "epoch": 0.12253530583469484, + "grad_norm": 0.34733548760414124, + "learning_rate": 0.0006, + "loss": 2.1694, + "step": 32850 + }, + { + "epoch": 0.12257260729765822, + "grad_norm": 0.3641189634799957, + "learning_rate": 0.0006, + "loss": 2.2175, + "step": 32860 + }, + { + "epoch": 0.1226099087606216, + "grad_norm": 0.43706846237182617, + "learning_rate": 0.0006, + "loss": 2.1733, + "step": 32870 + }, + { + "epoch": 0.12264721022358496, + "grad_norm": 0.3789925277233124, + "learning_rate": 0.0006, + "loss": 2.0994, + "step": 32880 + }, + { + "epoch": 0.12268451168654834, + "grad_norm": 0.4619337320327759, + "learning_rate": 0.0006, + "loss": 2.1914, + "step": 32890 + }, + { + "epoch": 0.12272181314951172, + "grad_norm": 0.3176301419734955, + "learning_rate": 0.0006, + "loss": 2.1758, + "step": 32900 + }, + { + "epoch": 0.1227591146124751, + "grad_norm": 0.2683154344558716, + "learning_rate": 0.0006, + "loss": 2.3107, + "step": 32910 + }, + { + "epoch": 0.12279641607543848, + "grad_norm": 0.36664509773254395, + "learning_rate": 0.0006, + "loss": 2.3229, + "step": 32920 + }, + { + "epoch": 0.12283371753840186, + "grad_norm": 0.27490028738975525, + "learning_rate": 0.0006, + "loss": 2.3502, + "step": 
32930 + }, + { + "epoch": 0.12287101900136524, + "grad_norm": 0.2870936691761017, + "learning_rate": 0.0006, + "loss": 2.2009, + "step": 32940 + }, + { + "epoch": 0.1229083204643286, + "grad_norm": 0.34337684512138367, + "learning_rate": 0.0006, + "loss": 2.0445, + "step": 32950 + }, + { + "epoch": 0.12294562192729198, + "grad_norm": 0.3400425612926483, + "learning_rate": 0.0006, + "loss": 2.247, + "step": 32960 + }, + { + "epoch": 0.12298292339025536, + "grad_norm": 0.4059264063835144, + "learning_rate": 0.0006, + "loss": 2.308, + "step": 32970 + }, + { + "epoch": 0.12302022485321874, + "grad_norm": 0.2718907594680786, + "learning_rate": 0.0006, + "loss": 2.2651, + "step": 32980 + }, + { + "epoch": 0.12305752631618212, + "grad_norm": 0.31872695684432983, + "learning_rate": 0.0006, + "loss": 2.3398, + "step": 32990 + }, + { + "epoch": 0.1230948277791455, + "grad_norm": 0.47908926010131836, + "learning_rate": 0.0006, + "loss": 2.3926, + "step": 33000 + }, + { + "epoch": 0.1230948277791455, + "eval_valid_loss": 2.1949663162231445, + "eval_valid_loss/all": 2.057335376739502, + "eval_valid_loss/end_span": 1.3145290613174438, + "eval_valid_perplexity/batch": 7.8250908851623535, + "eval_valid_perplexity/end_span": 3.7229971885681152, + "eval_valid_perplexity/fim": 2.1102304458618164, + "eval_valid_perplexity/first_seq": 14.793919563293457, + "eval_valid_perplexity/last_seq": 9.20531177520752, + "eval_valid_perplexity/second_seq": 13.836457252502441, + "eval_valid_perplexity/seq": 8.818087577819824, + "eval_valid_reconstruction/all": 0.2940504848957062, + "eval_valid_reconstruction/end_span": 0.7005835175514221, + "eval_valid_reconstruction/fim": 0.14872688055038452, + "eval_valid_reconstruction/first_seq": 0.17204922437667847, + "eval_valid_reconstruction/last_seq": 0.3236909806728363, + "eval_valid_reconstruction/second_seq": 0.19320957362651825, + "eval_valid_runtime": 447.9862, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 
33000 + }, + { + "epoch": 0.1230948277791455, + "eval_train_loss": 2.1939852237701416, + "eval_train_loss/all": 2.029722213745117, + "eval_train_loss/end_span": 1.272491216659546, + "eval_train_perplexity/batch": 7.611971378326416, + "eval_train_perplexity/end_span": 3.569734573364258, + "eval_train_perplexity/fim": 2.171820640563965, + "eval_train_perplexity/first_seq": 15.444680213928223, + "eval_train_perplexity/last_seq": 8.744330406188965, + "eval_train_perplexity/second_seq": 14.430505752563477, + "eval_train_perplexity/seq": 8.762395858764648, + "eval_train_reconstruction/all": 0.2833232283592224, + "eval_train_reconstruction/end_span": 0.7115030288696289, + "eval_train_reconstruction/fim": 0.15461604297161102, + "eval_train_reconstruction/first_seq": 0.15344418585300446, + "eval_train_reconstruction/last_seq": 0.3392885625362396, + "eval_train_reconstruction/second_seq": 0.18075263500213623, + "eval_train_runtime": 451.8352, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 33000 + }, + { + "epoch": 0.12313212924210888, + "grad_norm": 0.4386425316333771, + "learning_rate": 0.0006, + "loss": 2.1851, + "step": 33010 + }, + { + "epoch": 0.12316943070507225, + "grad_norm": 0.4001314342021942, + "learning_rate": 0.0006, + "loss": 2.0907, + "step": 33020 + }, + { + "epoch": 0.12320673216803563, + "grad_norm": 0.3793460726737976, + "learning_rate": 0.0006, + "loss": 2.2983, + "step": 33030 + }, + { + "epoch": 0.123244033630999, + "grad_norm": 0.25763750076293945, + "learning_rate": 0.0006, + "loss": 2.3717, + "step": 33040 + }, + { + "epoch": 0.12328133509396239, + "grad_norm": 0.22111867368221283, + "learning_rate": 0.0006, + "loss": 2.4055, + "step": 33050 + }, + { + "epoch": 0.12331863655692576, + "grad_norm": 0.24673384428024292, + "learning_rate": 0.0006, + "loss": 2.1937, + "step": 33060 + }, + { + "epoch": 0.12335593801988914, + "grad_norm": 0.35345837473869324, + "learning_rate": 0.0006, + "loss": 2.2466, + "step": 
33070 + }, + { + "epoch": 0.12339323948285252, + "grad_norm": 0.3573523759841919, + "learning_rate": 0.0006, + "loss": 2.1031, + "step": 33080 + }, + { + "epoch": 0.12343054094581589, + "grad_norm": 0.4166763126850128, + "learning_rate": 0.0006, + "loss": 2.1824, + "step": 33090 + }, + { + "epoch": 0.12346784240877927, + "grad_norm": 0.2957724928855896, + "learning_rate": 0.0006, + "loss": 2.3409, + "step": 33100 + }, + { + "epoch": 0.12350514387174265, + "grad_norm": 0.2589571177959442, + "learning_rate": 0.0006, + "loss": 2.1045, + "step": 33110 + }, + { + "epoch": 0.12354244533470603, + "grad_norm": 0.34398460388183594, + "learning_rate": 0.0006, + "loss": 2.2841, + "step": 33120 + }, + { + "epoch": 0.12357974679766941, + "grad_norm": 0.395946204662323, + "learning_rate": 0.0006, + "loss": 2.1801, + "step": 33130 + }, + { + "epoch": 0.12361704826063279, + "grad_norm": 0.4723063111305237, + "learning_rate": 0.0006, + "loss": 2.1076, + "step": 33140 + }, + { + "epoch": 0.12365434972359617, + "grad_norm": 0.3218773603439331, + "learning_rate": 0.0006, + "loss": 2.2701, + "step": 33150 + }, + { + "epoch": 0.12369165118655953, + "grad_norm": 0.2903996407985687, + "learning_rate": 0.0006, + "loss": 2.2254, + "step": 33160 + }, + { + "epoch": 0.12372895264952291, + "grad_norm": 0.2106863409280777, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 33170 + }, + { + "epoch": 0.12376625411248629, + "grad_norm": 0.3520907163619995, + "learning_rate": 0.0006, + "loss": 2.3317, + "step": 33180 + }, + { + "epoch": 0.12380355557544967, + "grad_norm": 0.37296339869499207, + "learning_rate": 0.0006, + "loss": 2.1377, + "step": 33190 + }, + { + "epoch": 0.12384085703841305, + "grad_norm": 0.2744135856628418, + "learning_rate": 0.0006, + "loss": 2.2395, + "step": 33200 + }, + { + "epoch": 0.12387815850137643, + "grad_norm": 0.33189600706100464, + "learning_rate": 0.0006, + "loss": 2.2989, + "step": 33210 + }, + { + "epoch": 0.12391545996433981, + "grad_norm": 
0.3281570374965668, + "learning_rate": 0.0006, + "loss": 2.1295, + "step": 33220 + }, + { + "epoch": 0.12395276142730317, + "grad_norm": 0.46037906408309937, + "learning_rate": 0.0006, + "loss": 2.1903, + "step": 33230 + }, + { + "epoch": 0.12399006289026655, + "grad_norm": 0.4095107614994049, + "learning_rate": 0.0006, + "loss": 2.1606, + "step": 33240 + }, + { + "epoch": 0.12402736435322993, + "grad_norm": 0.3519834280014038, + "learning_rate": 0.0006, + "loss": 2.0552, + "step": 33250 + }, + { + "epoch": 0.12402736435322993, + "eval_valid_loss": 2.190115213394165, + "eval_valid_loss/all": 2.0530009269714355, + "eval_valid_loss/end_span": 1.115110993385315, + "eval_valid_perplexity/batch": 7.7912468910217285, + "eval_valid_perplexity/end_span": 3.0499067306518555, + "eval_valid_perplexity/fim": 2.0708611011505127, + "eval_valid_perplexity/first_seq": 14.434800148010254, + "eval_valid_perplexity/last_seq": 9.130660057067871, + "eval_valid_perplexity/second_seq": 13.598822593688965, + "eval_valid_perplexity/seq": 8.782453536987305, + "eval_valid_reconstruction/all": 0.2944355905056, + "eval_valid_reconstruction/end_span": 0.7405627369880676, + "eval_valid_reconstruction/fim": 0.1449447125196457, + "eval_valid_reconstruction/first_seq": 0.17633354663848877, + "eval_valid_reconstruction/last_seq": 0.32299524545669556, + "eval_valid_reconstruction/second_seq": 0.19923222064971924, + "eval_valid_runtime": 450.6818, + "eval_valid_samples_per_second": 0.426, + "eval_valid_steps_per_second": 0.426, + "step": 33250 + }, + { + "epoch": 0.12402736435322993, + "eval_train_loss": 2.1898701190948486, + "eval_train_loss/all": 2.0263357162475586, + "eval_train_loss/end_span": 1.0904985666275024, + "eval_train_perplexity/batch": 7.58623743057251, + "eval_train_perplexity/end_span": 2.975757360458374, + "eval_train_perplexity/fim": 2.005490303039551, + "eval_train_perplexity/first_seq": 15.495287895202637, + "eval_train_perplexity/last_seq": 9.661599159240723, + 
"eval_train_perplexity/second_seq": 14.316612243652344, + "eval_train_perplexity/seq": 8.733206748962402, + "eval_train_reconstruction/all": 0.2835982143878937, + "eval_train_reconstruction/end_span": 0.7480886578559875, + "eval_train_reconstruction/fim": 0.1391919106245041, + "eval_train_reconstruction/first_seq": 0.15205496549606323, + "eval_train_reconstruction/last_seq": 0.3004854619503021, + "eval_train_reconstruction/second_seq": 0.17908203601837158, + "eval_train_runtime": 451.5273, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 33250 + }, + { + "epoch": 0.12406466581619331, + "grad_norm": 0.26796427369117737, + "learning_rate": 0.0006, + "loss": 2.2466, + "step": 33260 + }, + { + "epoch": 0.12410196727915669, + "grad_norm": 0.26557546854019165, + "learning_rate": 0.0006, + "loss": 2.2153, + "step": 33270 + }, + { + "epoch": 0.12413926874212007, + "grad_norm": 0.40123867988586426, + "learning_rate": 0.0006, + "loss": 2.3118, + "step": 33280 + }, + { + "epoch": 0.12417657020508345, + "grad_norm": 0.2834693193435669, + "learning_rate": 0.0006, + "loss": 2.257, + "step": 33290 + }, + { + "epoch": 0.12421387166804682, + "grad_norm": 0.3291003406047821, + "learning_rate": 0.0006, + "loss": 2.2473, + "step": 33300 + }, + { + "epoch": 0.1242511731310102, + "grad_norm": 0.36293989419937134, + "learning_rate": 0.0006, + "loss": 2.3005, + "step": 33310 + }, + { + "epoch": 0.12428847459397357, + "grad_norm": 0.4115201532840729, + "learning_rate": 0.0006, + "loss": 2.2848, + "step": 33320 + }, + { + "epoch": 0.12432577605693695, + "grad_norm": 0.7907997965812683, + "learning_rate": 0.0006, + "loss": 2.0953, + "step": 33330 + }, + { + "epoch": 0.12436307751990033, + "grad_norm": 0.4591960608959198, + "learning_rate": 0.0006, + "loss": 2.2768, + "step": 33340 + }, + { + "epoch": 0.12440037898286371, + "grad_norm": 0.33411699533462524, + "learning_rate": 0.0006, + "loss": 2.1633, + "step": 33350 + }, + { + "epoch": 
0.12443768044582708, + "grad_norm": 0.530060350894928, + "learning_rate": 0.0006, + "loss": 2.1157, + "step": 33360 + }, + { + "epoch": 0.12447498190879046, + "grad_norm": 0.5023632049560547, + "learning_rate": 0.0006, + "loss": 2.1007, + "step": 33370 + }, + { + "epoch": 0.12451228337175384, + "grad_norm": 0.4583599865436554, + "learning_rate": 0.0006, + "loss": 2.106, + "step": 33380 + }, + { + "epoch": 0.12454958483471722, + "grad_norm": 1.1900815963745117, + "learning_rate": 0.0006, + "loss": 1.8884, + "step": 33390 + }, + { + "epoch": 0.1245868862976806, + "grad_norm": 0.2175108641386032, + "learning_rate": 0.0006, + "loss": 2.3319, + "step": 33400 + }, + { + "epoch": 0.12462418776064398, + "grad_norm": 0.43711531162261963, + "learning_rate": 0.0006, + "loss": 2.208, + "step": 33410 + }, + { + "epoch": 0.12466148922360736, + "grad_norm": 0.5254966616630554, + "learning_rate": 0.0006, + "loss": 2.3091, + "step": 33420 + }, + { + "epoch": 0.12469879068657072, + "grad_norm": 0.29003003239631653, + "learning_rate": 0.0006, + "loss": 2.3537, + "step": 33430 + }, + { + "epoch": 0.1247360921495341, + "grad_norm": 0.6857041716575623, + "learning_rate": 0.0006, + "loss": 2.2948, + "step": 33440 + }, + { + "epoch": 0.12477339361249748, + "grad_norm": 1.7195159196853638, + "learning_rate": 0.0006, + "loss": 2.0568, + "step": 33450 + }, + { + "epoch": 0.12481069507546086, + "grad_norm": 0.5718810558319092, + "learning_rate": 0.0006, + "loss": 2.0214, + "step": 33460 + }, + { + "epoch": 0.12484799653842424, + "grad_norm": 0.26676657795906067, + "learning_rate": 0.0006, + "loss": 1.9667, + "step": 33470 + }, + { + "epoch": 0.12488529800138762, + "grad_norm": 0.6066029667854309, + "learning_rate": 0.0006, + "loss": 2.3089, + "step": 33480 + }, + { + "epoch": 0.124922599464351, + "grad_norm": 0.3577319085597992, + "learning_rate": 0.0006, + "loss": 2.1593, + "step": 33490 + }, + { + "epoch": 0.12495990092731436, + "grad_norm": 0.5573367476463318, + "learning_rate": 0.0006, + 
"loss": 2.285, + "step": 33500 + }, + { + "epoch": 0.12495990092731436, + "eval_valid_loss": 2.193206787109375, + "eval_valid_loss/all": 2.0559277534484863, + "eval_valid_loss/end_span": 1.2034151554107666, + "eval_valid_perplexity/batch": 7.814084053039551, + "eval_valid_perplexity/end_span": 3.331475019454956, + "eval_valid_perplexity/fim": 2.409546136856079, + "eval_valid_perplexity/first_seq": 14.790536880493164, + "eval_valid_perplexity/last_seq": 8.82256031036377, + "eval_valid_perplexity/second_seq": 14.342321395874023, + "eval_valid_perplexity/seq": 8.80175495147705, + "eval_valid_reconstruction/all": 0.2935567796230316, + "eval_valid_reconstruction/end_span": 0.713994562625885, + "eval_valid_reconstruction/fim": 0.1748577356338501, + "eval_valid_reconstruction/first_seq": 0.16966333985328674, + "eval_valid_reconstruction/last_seq": 0.33336570858955383, + "eval_valid_reconstruction/second_seq": 0.18148787319660187, + "eval_valid_runtime": 452.843, + "eval_valid_samples_per_second": 0.424, + "eval_valid_steps_per_second": 0.424, + "step": 33500 + }, + { + "epoch": 0.12495990092731436, + "eval_train_loss": 2.1925315856933594, + "eval_train_loss/all": 2.02874755859375, + "eval_train_loss/end_span": 1.1796047687530518, + "eval_train_perplexity/batch": 7.604556083679199, + "eval_train_perplexity/end_span": 3.2530882358551025, + "eval_train_perplexity/fim": 2.3309831619262695, + "eval_train_perplexity/first_seq": 15.759143829345703, + "eval_train_perplexity/last_seq": 8.574971199035645, + "eval_train_perplexity/second_seq": 14.601411819458008, + "eval_train_perplexity/seq": 8.75621509552002, + "eval_train_reconstruction/all": 0.2828384339809418, + "eval_train_reconstruction/end_span": 0.7222839593887329, + "eval_train_reconstruction/fim": 0.16854026913642883, + "eval_train_reconstruction/first_seq": 0.146045982837677, + "eval_train_reconstruction/last_seq": 0.3414578437805176, + "eval_train_reconstruction/second_seq": 0.1797126978635788, + "eval_train_runtime": 
451.9217, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 33500 + }, + { + "epoch": 0.12499720239027774, + "grad_norm": 0.5059244632720947, + "learning_rate": 0.0006, + "loss": 2.3363, + "step": 33510 + }, + { + "epoch": 0.12503450385324114, + "grad_norm": 0.38335293531417847, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 33520 + }, + { + "epoch": 0.12507180531620452, + "grad_norm": 0.36979275941848755, + "learning_rate": 0.0006, + "loss": 2.1477, + "step": 33530 + }, + { + "epoch": 0.12510910677916787, + "grad_norm": 0.33413031697273254, + "learning_rate": 0.0006, + "loss": 2.1614, + "step": 33540 + }, + { + "epoch": 0.12514640824213125, + "grad_norm": 0.17344552278518677, + "learning_rate": 0.0006, + "loss": 2.2772, + "step": 33550 + }, + { + "epoch": 0.12518370970509463, + "grad_norm": 0.2758602201938629, + "learning_rate": 0.0006, + "loss": 2.1463, + "step": 33560 + }, + { + "epoch": 0.125221011168058, + "grad_norm": 0.3838820457458496, + "learning_rate": 0.0006, + "loss": 2.218, + "step": 33570 + }, + { + "epoch": 0.12525831263102138, + "grad_norm": 0.32955655455589294, + "learning_rate": 0.0006, + "loss": 2.2403, + "step": 33580 + }, + { + "epoch": 0.12529561409398476, + "grad_norm": 0.30071961879730225, + "learning_rate": 0.0006, + "loss": 2.335, + "step": 33590 + }, + { + "epoch": 0.12533291555694814, + "grad_norm": 0.4332798421382904, + "learning_rate": 0.0006, + "loss": 2.2624, + "step": 33600 + }, + { + "epoch": 0.12537021701991152, + "grad_norm": 0.32362768054008484, + "learning_rate": 0.0006, + "loss": 2.349, + "step": 33610 + }, + { + "epoch": 0.1254075184828749, + "grad_norm": 0.8924639225006104, + "learning_rate": 0.0006, + "loss": 2.2813, + "step": 33620 + }, + { + "epoch": 0.12544481994583828, + "grad_norm": 0.46727943420410156, + "learning_rate": 0.0006, + "loss": 2.3427, + "step": 33630 + }, + { + "epoch": 0.12548212140880166, + "grad_norm": 0.4947614073753357, + "learning_rate": 0.0006, + 
"loss": 2.1666, + "step": 33640 + }, + { + "epoch": 0.12551942287176504, + "grad_norm": 0.4681389033794403, + "learning_rate": 0.0006, + "loss": 2.1942, + "step": 33650 + }, + { + "epoch": 0.12555672433472842, + "grad_norm": 0.3623877465724945, + "learning_rate": 0.0006, + "loss": 2.2159, + "step": 33660 + }, + { + "epoch": 0.12559402579769177, + "grad_norm": 0.2678369879722595, + "learning_rate": 0.0006, + "loss": 2.397, + "step": 33670 + }, + { + "epoch": 0.12563132726065515, + "grad_norm": 0.2579338252544403, + "learning_rate": 0.0006, + "loss": 2.1997, + "step": 33680 + }, + { + "epoch": 0.12566862872361853, + "grad_norm": 0.25223153829574585, + "learning_rate": 0.0006, + "loss": 2.3838, + "step": 33690 + }, + { + "epoch": 0.1257059301865819, + "grad_norm": 0.28028154373168945, + "learning_rate": 0.0006, + "loss": 2.2791, + "step": 33700 + }, + { + "epoch": 0.1257432316495453, + "grad_norm": 0.2877809703350067, + "learning_rate": 0.0006, + "loss": 2.2659, + "step": 33710 + }, + { + "epoch": 0.12578053311250867, + "grad_norm": 0.48412269353866577, + "learning_rate": 0.0006, + "loss": 2.4128, + "step": 33720 + }, + { + "epoch": 0.12581783457547205, + "grad_norm": 0.4365186095237732, + "learning_rate": 0.0006, + "loss": 2.1537, + "step": 33730 + }, + { + "epoch": 0.12585513603843543, + "grad_norm": 0.33537593483924866, + "learning_rate": 0.0006, + "loss": 2.1243, + "step": 33740 + }, + { + "epoch": 0.1258924375013988, + "grad_norm": 0.2875346839427948, + "learning_rate": 0.0006, + "loss": 2.0261, + "step": 33750 + }, + { + "epoch": 0.1258924375013988, + "eval_valid_loss": 2.186046600341797, + "eval_valid_loss/all": 2.0492796897888184, + "eval_valid_loss/end_span": 1.1895781755447388, + "eval_valid_perplexity/batch": 7.762307643890381, + "eval_valid_perplexity/end_span": 3.2856948375701904, + "eval_valid_perplexity/fim": 2.1527082920074463, + "eval_valid_perplexity/first_seq": 14.85245132446289, + "eval_valid_perplexity/last_seq": 8.612085342407227, + 
"eval_valid_perplexity/second_seq": 13.569007873535156, + "eval_valid_perplexity/seq": 8.748042106628418, + "eval_valid_reconstruction/all": 0.2957642078399658, + "eval_valid_reconstruction/end_span": 0.727936327457428, + "eval_valid_reconstruction/fim": 0.15445052087306976, + "eval_valid_reconstruction/first_seq": 0.1688876748085022, + "eval_valid_reconstruction/last_seq": 0.3391565978527069, + "eval_valid_reconstruction/second_seq": 0.20175525546073914, + "eval_valid_runtime": 449.367, + "eval_valid_samples_per_second": 0.427, + "eval_valid_steps_per_second": 0.427, + "step": 33750 + }, + { + "epoch": 0.1258924375013988, + "eval_train_loss": 2.186708450317383, + "eval_train_loss/all": 2.0234360694885254, + "eval_train_loss/end_span": 1.1517049074172974, + "eval_train_perplexity/batch": 7.564271926879883, + "eval_train_perplexity/end_span": 3.1635818481445312, + "eval_train_perplexity/fim": 2.1572625637054443, + "eval_train_perplexity/first_seq": 15.553431510925293, + "eval_train_perplexity/last_seq": 9.107453346252441, + "eval_train_perplexity/second_seq": 14.301939010620117, + "eval_train_perplexity/seq": 8.7085542678833, + "eval_train_reconstruction/all": 0.28479552268981934, + "eval_train_reconstruction/end_span": 0.7386193871498108, + "eval_train_reconstruction/fim": 0.1540120542049408, + "eval_train_reconstruction/first_seq": 0.15242813527584076, + "eval_train_reconstruction/last_seq": 0.3209744989871979, + "eval_train_reconstruction/second_seq": 0.18630535900592804, + "eval_train_runtime": 450.1933, + "eval_train_samples_per_second": 0.426, + "eval_train_steps_per_second": 0.426, + "step": 33750 + }, + { + "epoch": 0.1259297389643622, + "grad_norm": 0.41399869322776794, + "learning_rate": 0.0006, + "loss": 2.295, + "step": 33760 + }, + { + "epoch": 0.12596704042732557, + "grad_norm": 0.33658358454704285, + "learning_rate": 0.0006, + "loss": 2.2438, + "step": 33770 + }, + { + "epoch": 0.12600434189028895, + "grad_norm": 0.2968907952308655, + "learning_rate": 
0.0006, + "loss": 2.3334, + "step": 33780 + }, + { + "epoch": 0.12604164335325233, + "grad_norm": 0.2504579424858093, + "learning_rate": 0.0006, + "loss": 2.3406, + "step": 33790 + }, + { + "epoch": 0.1260789448162157, + "grad_norm": 0.369026780128479, + "learning_rate": 0.0006, + "loss": 1.9864, + "step": 33800 + }, + { + "epoch": 0.12611624627917906, + "grad_norm": 0.4098239839076996, + "learning_rate": 0.0006, + "loss": 2.3129, + "step": 33810 + }, + { + "epoch": 0.12615354774214244, + "grad_norm": 0.31259945034980774, + "learning_rate": 0.0006, + "loss": 2.2995, + "step": 33820 + }, + { + "epoch": 0.12619084920510582, + "grad_norm": 0.2839601933956146, + "learning_rate": 0.0006, + "loss": 2.1941, + "step": 33830 + }, + { + "epoch": 0.1262281506680692, + "grad_norm": 0.34092283248901367, + "learning_rate": 0.0006, + "loss": 2.2763, + "step": 33840 + }, + { + "epoch": 0.12626545213103257, + "grad_norm": 0.365140825510025, + "learning_rate": 0.0006, + "loss": 2.1187, + "step": 33850 + }, + { + "epoch": 0.12630275359399595, + "grad_norm": 0.24714811146259308, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 33860 + }, + { + "epoch": 0.12634005505695933, + "grad_norm": 0.22423478960990906, + "learning_rate": 0.0006, + "loss": 2.2173, + "step": 33870 + }, + { + "epoch": 0.1263773565199227, + "grad_norm": 0.30697837471961975, + "learning_rate": 0.0006, + "loss": 2.229, + "step": 33880 + }, + { + "epoch": 0.1264146579828861, + "grad_norm": 0.3309588134288788, + "learning_rate": 0.0006, + "loss": 2.261, + "step": 33890 + }, + { + "epoch": 0.12645195944584947, + "grad_norm": 0.5012648701667786, + "learning_rate": 0.0006, + "loss": 2.1234, + "step": 33900 + }, + { + "epoch": 0.12648926090881285, + "grad_norm": 0.4407358467578888, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 33910 + }, + { + "epoch": 0.12652656237177623, + "grad_norm": 0.39369410276412964, + "learning_rate": 0.0006, + "loss": 2.2365, + "step": 33920 + }, + { + "epoch": 0.1265638638347396, + 
"grad_norm": 0.4462546706199646, + "learning_rate": 0.0006, + "loss": 2.0665, + "step": 33930 + }, + { + "epoch": 0.126601165297703, + "grad_norm": 0.5789057016372681, + "learning_rate": 0.0006, + "loss": 2.2775, + "step": 33940 + }, + { + "epoch": 0.12663846676066634, + "grad_norm": 0.3960319459438324, + "learning_rate": 0.0006, + "loss": 2.1409, + "step": 33950 + }, + { + "epoch": 0.12667576822362972, + "grad_norm": 0.24613945186138153, + "learning_rate": 0.0006, + "loss": 2.3139, + "step": 33960 + }, + { + "epoch": 0.1267130696865931, + "grad_norm": 0.2755185663700104, + "learning_rate": 0.0006, + "loss": 2.2744, + "step": 33970 + }, + { + "epoch": 0.12675037114955648, + "grad_norm": 0.4115654230117798, + "learning_rate": 0.0006, + "loss": 2.2394, + "step": 33980 + }, + { + "epoch": 0.12678767261251986, + "grad_norm": 0.3206702470779419, + "learning_rate": 0.0006, + "loss": 2.229, + "step": 33990 + }, + { + "epoch": 0.12682497407548324, + "grad_norm": 0.18358276784420013, + "learning_rate": 0.0006, + "loss": 2.2146, + "step": 34000 + }, + { + "epoch": 0.12682497407548324, + "eval_valid_loss": 2.1895735263824463, + "eval_valid_loss/all": 2.0525550842285156, + "eval_valid_loss/end_span": 1.2302347421646118, + "eval_valid_perplexity/batch": 7.787774085998535, + "eval_valid_perplexity/end_span": 3.4220328330993652, + "eval_valid_perplexity/fim": 2.2699451446533203, + "eval_valid_perplexity/first_seq": 14.810972213745117, + "eval_valid_perplexity/last_seq": 9.125049591064453, + "eval_valid_perplexity/second_seq": 13.493072509765625, + "eval_valid_perplexity/seq": 8.775224685668945, + "eval_valid_reconstruction/all": 0.2948216497898102, + "eval_valid_reconstruction/end_span": 0.7135345339775085, + "eval_valid_reconstruction/fim": 0.16423344612121582, + "eval_valid_reconstruction/first_seq": 0.167965829372406, + "eval_valid_reconstruction/last_seq": 0.3223792612552643, + "eval_valid_reconstruction/second_seq": 0.20270399749279022, + "eval_valid_runtime": 453.3192, + 
"eval_valid_samples_per_second": 0.424, + "eval_valid_steps_per_second": 0.424, + "step": 34000 + }, + { + "epoch": 0.12682497407548324, + "eval_train_loss": 2.187711477279663, + "eval_train_loss/all": 2.0240213871002197, + "eval_train_loss/end_span": 1.2041929960250854, + "eval_train_perplexity/batch": 7.568700313568115, + "eval_train_perplexity/end_span": 3.3340673446655273, + "eval_train_perplexity/fim": 2.141744613647461, + "eval_train_perplexity/first_seq": 15.370132446289062, + "eval_train_perplexity/last_seq": 9.041465759277344, + "eval_train_perplexity/second_seq": 14.54407787322998, + "eval_train_perplexity/seq": 8.711688041687012, + "eval_train_reconstruction/all": 0.2846180200576782, + "eval_train_reconstruction/end_span": 0.7220114469528198, + "eval_train_reconstruction/fim": 0.1526460200548172, + "eval_train_reconstruction/first_seq": 0.15642428398132324, + "eval_train_reconstruction/last_seq": 0.3240566849708557, + "eval_train_reconstruction/second_seq": 0.17652671039104462, + "eval_train_runtime": 449.068, + "eval_train_samples_per_second": 0.428, + "eval_train_steps_per_second": 0.428, + "step": 34000 + }, + { + "epoch": 0.12686227553844662, + "grad_norm": 0.28943514823913574, + "learning_rate": 0.0006, + "loss": 2.2391, + "step": 34010 + }, + { + "epoch": 0.12689957700141, + "grad_norm": 0.30693235993385315, + "learning_rate": 0.0006, + "loss": 2.1815, + "step": 34020 + }, + { + "epoch": 0.12693687846437338, + "grad_norm": 0.28176334500312805, + "learning_rate": 0.0006, + "loss": 2.1707, + "step": 34030 + }, + { + "epoch": 0.12697417992733676, + "grad_norm": 0.31317171454429626, + "learning_rate": 0.0006, + "loss": 2.1134, + "step": 34040 + }, + { + "epoch": 0.12701148139030013, + "grad_norm": 0.20990581810474396, + "learning_rate": 0.0006, + "loss": 2.3029, + "step": 34050 + }, + { + "epoch": 0.12704878285326351, + "grad_norm": 0.3444244861602783, + "learning_rate": 0.0006, + "loss": 2.3191, + "step": 34060 + }, + { + "epoch": 0.1270860843162269, 
+ "grad_norm": 0.27485641837120056, + "learning_rate": 0.0006, + "loss": 2.1947, + "step": 34070 + }, + { + "epoch": 0.12712338577919027, + "grad_norm": 0.9641899466514587, + "learning_rate": 0.0006, + "loss": 2.242, + "step": 34080 + }, + { + "epoch": 0.12716068724215362, + "grad_norm": 0.28431835770606995, + "learning_rate": 0.0006, + "loss": 2.2488, + "step": 34090 + }, + { + "epoch": 0.127197988705117, + "grad_norm": 0.3332933783531189, + "learning_rate": 0.0006, + "loss": 2.3334, + "step": 34100 + }, + { + "epoch": 0.12723529016808038, + "grad_norm": 0.4471345841884613, + "learning_rate": 0.0006, + "loss": 2.1957, + "step": 34110 + }, + { + "epoch": 0.12727259163104376, + "grad_norm": 0.21947598457336426, + "learning_rate": 0.0006, + "loss": 2.2747, + "step": 34120 + }, + { + "epoch": 0.12730989309400714, + "grad_norm": 0.3872700333595276, + "learning_rate": 0.0006, + "loss": 2.2256, + "step": 34130 + }, + { + "epoch": 0.12734719455697052, + "grad_norm": 0.2617591619491577, + "learning_rate": 0.0006, + "loss": 2.3249, + "step": 34140 + }, + { + "epoch": 0.1273844960199339, + "grad_norm": 0.4669781029224396, + "learning_rate": 0.0006, + "loss": 2.1867, + "step": 34150 + }, + { + "epoch": 0.12742179748289728, + "grad_norm": 0.49424269795417786, + "learning_rate": 0.0006, + "loss": 2.0723, + "step": 34160 + }, + { + "epoch": 0.12745909894586066, + "grad_norm": 0.2538772225379944, + "learning_rate": 0.0006, + "loss": 2.0085, + "step": 34170 + }, + { + "epoch": 0.12749640040882404, + "grad_norm": 0.4243176579475403, + "learning_rate": 0.0006, + "loss": 2.2092, + "step": 34180 + }, + { + "epoch": 0.12753370187178742, + "grad_norm": 0.3056534230709076, + "learning_rate": 0.0006, + "loss": 2.2732, + "step": 34190 + }, + { + "epoch": 0.1275710033347508, + "grad_norm": 0.33115753531455994, + "learning_rate": 0.0006, + "loss": 2.2432, + "step": 34200 + }, + { + "epoch": 0.12760830479771418, + "grad_norm": 0.301677942276001, + "learning_rate": 0.0006, + "loss": 2.1591, + 
"step": 34210 + }, + { + "epoch": 0.12764560626067753, + "grad_norm": 0.5302786827087402, + "learning_rate": 0.0006, + "loss": 2.345, + "step": 34220 + }, + { + "epoch": 0.1276829077236409, + "grad_norm": 0.3672291040420532, + "learning_rate": 0.0006, + "loss": 2.3767, + "step": 34230 + }, + { + "epoch": 0.1277202091866043, + "grad_norm": 0.2919174134731293, + "learning_rate": 0.0006, + "loss": 2.3621, + "step": 34240 + }, + { + "epoch": 0.12775751064956767, + "grad_norm": 0.48441585898399353, + "learning_rate": 0.0006, + "loss": 2.3121, + "step": 34250 + }, + { + "epoch": 0.12775751064956767, + "eval_valid_loss": 2.186490297317505, + "eval_valid_loss/all": 2.0500941276550293, + "eval_valid_loss/end_span": 1.2466022968292236, + "eval_valid_perplexity/batch": 7.768632411956787, + "eval_valid_perplexity/end_span": 3.478503942489624, + "eval_valid_perplexity/fim": 2.4883503913879395, + "eval_valid_perplexity/first_seq": 14.876486778259277, + "eval_valid_perplexity/last_seq": 9.077256202697754, + "eval_valid_perplexity/second_seq": 13.807266235351562, + "eval_valid_perplexity/seq": 8.759814262390137, + "eval_valid_reconstruction/all": 0.29523399472236633, + "eval_valid_reconstruction/end_span": 0.7071523070335388, + "eval_valid_reconstruction/fim": 0.18238484859466553, + "eval_valid_reconstruction/first_seq": 0.1661691814661026, + "eval_valid_reconstruction/last_seq": 0.3239869773387909, + "eval_valid_reconstruction/second_seq": 0.19149929285049438, + "eval_valid_runtime": 450.6733, + "eval_valid_samples_per_second": 0.426, + "eval_valid_steps_per_second": 0.426, + "step": 34250 + }, + { + "epoch": 0.12775751064956767, + "eval_train_loss": 2.184401750564575, + "eval_train_loss/all": 2.021601438522339, + "eval_train_loss/end_span": 1.2080060243606567, + "eval_train_perplexity/batch": 7.5504069328308105, + "eval_train_perplexity/end_span": 3.346804618835449, + "eval_train_perplexity/fim": 2.222071886062622, + "eval_train_perplexity/first_seq": 15.561120986938477, + 
"eval_train_perplexity/last_seq": 9.206380844116211, + "eval_train_perplexity/second_seq": 14.292912483215332, + "eval_train_perplexity/seq": 8.692307472229004, + "eval_train_reconstruction/all": 0.28498926758766174, + "eval_train_reconstruction/end_span": 0.7194498777389526, + "eval_train_reconstruction/fim": 0.16072408854961395, + "eval_train_reconstruction/first_seq": 0.149198517203331, + "eval_train_reconstruction/last_seq": 0.3161049485206604, + "eval_train_reconstruction/second_seq": 0.1816786527633667, + "eval_train_runtime": 452.0848, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 34250 + }, + { + "epoch": 0.12779481211253105, + "grad_norm": 0.4172574579715729, + "learning_rate": 0.0006, + "loss": 2.0937, + "step": 34260 + }, + { + "epoch": 0.12783211357549443, + "grad_norm": 0.39794233441352844, + "learning_rate": 0.0006, + "loss": 2.0642, + "step": 34270 + }, + { + "epoch": 0.1278694150384578, + "grad_norm": 0.30588141083717346, + "learning_rate": 0.0006, + "loss": 2.2708, + "step": 34280 + }, + { + "epoch": 0.12790671650142119, + "grad_norm": 0.32750430703163147, + "learning_rate": 0.0006, + "loss": 2.0867, + "step": 34290 + }, + { + "epoch": 0.12794401796438457, + "grad_norm": 0.2771610915660858, + "learning_rate": 0.0006, + "loss": 2.3143, + "step": 34300 + }, + { + "epoch": 0.12798131942734794, + "grad_norm": 0.3011327385902405, + "learning_rate": 0.0006, + "loss": 2.1794, + "step": 34310 + }, + { + "epoch": 0.12801862089031132, + "grad_norm": 0.22654783725738525, + "learning_rate": 0.0006, + "loss": 2.3861, + "step": 34320 + }, + { + "epoch": 0.1280559223532747, + "grad_norm": 0.2829095125198364, + "learning_rate": 0.0006, + "loss": 2.2625, + "step": 34330 + }, + { + "epoch": 0.12809322381623808, + "grad_norm": 0.47905921936035156, + "learning_rate": 0.0006, + "loss": 2.3735, + "step": 34340 + }, + { + "epoch": 0.12813052527920146, + "grad_norm": 0.4844723343849182, + "learning_rate": 0.0006, + "loss": 
2.4614, + "step": 34350 + }, + { + "epoch": 0.12816782674216481, + "grad_norm": 0.339567631483078, + "learning_rate": 0.0006, + "loss": 2.195, + "step": 34360 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 0.3697819709777832, + "learning_rate": 0.0006, + "loss": 2.19, + "step": 34370 + }, + { + "epoch": 0.12824242966809157, + "grad_norm": 0.3602645993232727, + "learning_rate": 0.0006, + "loss": 2.1563, + "step": 34380 + }, + { + "epoch": 0.12827973113105495, + "grad_norm": 0.30149537324905396, + "learning_rate": 0.0006, + "loss": 2.1467, + "step": 34390 + }, + { + "epoch": 0.12831703259401833, + "grad_norm": 0.32374951243400574, + "learning_rate": 0.0006, + "loss": 2.0286, + "step": 34400 + }, + { + "epoch": 0.1283543340569817, + "grad_norm": 0.49963662028312683, + "learning_rate": 0.0006, + "loss": 2.2948, + "step": 34410 + }, + { + "epoch": 0.1283916355199451, + "grad_norm": 0.39397361874580383, + "learning_rate": 0.0006, + "loss": 2.2236, + "step": 34420 + }, + { + "epoch": 0.12842893698290847, + "grad_norm": 0.24457089602947235, + "learning_rate": 0.0006, + "loss": 2.1164, + "step": 34430 + }, + { + "epoch": 0.12846623844587185, + "grad_norm": 0.36348506808280945, + "learning_rate": 0.0006, + "loss": 2.1852, + "step": 34440 + }, + { + "epoch": 0.12850353990883523, + "grad_norm": 0.26584118604660034, + "learning_rate": 0.0006, + "loss": 2.0357, + "step": 34450 + }, + { + "epoch": 0.1285408413717986, + "grad_norm": 0.45423901081085205, + "learning_rate": 0.0006, + "loss": 2.2893, + "step": 34460 + }, + { + "epoch": 0.128578142834762, + "grad_norm": 0.3088974952697754, + "learning_rate": 0.0006, + "loss": 2.2517, + "step": 34470 + }, + { + "epoch": 0.12861544429772537, + "grad_norm": 0.3767017722129822, + "learning_rate": 0.0006, + "loss": 2.2344, + "step": 34480 + }, + { + "epoch": 0.12865274576068875, + "grad_norm": 0.47132933139801025, + "learning_rate": 0.0006, + "loss": 2.2781, + "step": 34490 + }, + { + "epoch": 0.1286900472236521, + "grad_norm": 
0.3572888672351837, + "learning_rate": 0.0006, + "loss": 2.2897, + "step": 34500 + }, + { + "epoch": 0.1286900472236521, + "eval_valid_loss": 2.1878185272216797, + "eval_valid_loss/all": 2.0510542392730713, + "eval_valid_loss/end_span": 1.2229453325271606, + "eval_valid_perplexity/batch": 7.776094436645508, + "eval_valid_perplexity/end_span": 3.397178888320923, + "eval_valid_perplexity/fim": 2.178884744644165, + "eval_valid_perplexity/first_seq": 14.95707893371582, + "eval_valid_perplexity/last_seq": 8.900055885314941, + "eval_valid_perplexity/second_seq": 13.572123527526855, + "eval_valid_perplexity/seq": 8.767455101013184, + "eval_valid_reconstruction/all": 0.2952941954135895, + "eval_valid_reconstruction/end_span": 0.7103456854820251, + "eval_valid_reconstruction/fim": 0.1561880260705948, + "eval_valid_reconstruction/first_seq": 0.16615542769432068, + "eval_valid_reconstruction/last_seq": 0.33136186003685, + "eval_valid_reconstruction/second_seq": 0.19989947974681854, + "eval_valid_runtime": 448.9215, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 34500 + }, + { + "epoch": 0.1286900472236521, + "eval_train_loss": 2.1871726512908936, + "eval_train_loss/all": 2.024240732192993, + "eval_train_loss/end_span": 1.1918587684631348, + "eval_train_perplexity/batch": 7.5703606605529785, + "eval_train_perplexity/end_span": 3.2931969165802, + "eval_train_perplexity/fim": 2.0241329669952393, + "eval_train_perplexity/first_seq": 15.388811111450195, + "eval_train_perplexity/last_seq": 9.041205406188965, + "eval_train_perplexity/second_seq": 14.13343334197998, + "eval_train_perplexity/seq": 8.716845512390137, + "eval_train_reconstruction/all": 0.2844085097312927, + "eval_train_reconstruction/end_span": 0.7199708819389343, + "eval_train_reconstruction/fim": 0.14144505560398102, + "eval_train_reconstruction/first_seq": 0.15558001399040222, + "eval_train_reconstruction/last_seq": 0.3233424723148346, + 
"eval_train_reconstruction/second_seq": 0.18631529808044434, + "eval_train_runtime": 452.7475, + "eval_train_samples_per_second": 0.424, + "eval_train_steps_per_second": 0.424, + "step": 34500 + }, + { + "epoch": 0.12872734868661548, + "grad_norm": 0.3169924020767212, + "learning_rate": 0.0006, + "loss": 2.1008, + "step": 34510 + }, + { + "epoch": 0.12876465014957886, + "grad_norm": 0.2921490967273712, + "learning_rate": 0.0006, + "loss": 2.1292, + "step": 34520 + }, + { + "epoch": 0.12880195161254224, + "grad_norm": 0.3975803256034851, + "learning_rate": 0.0006, + "loss": 2.1769, + "step": 34530 + }, + { + "epoch": 0.12883925307550562, + "grad_norm": 0.7023450136184692, + "learning_rate": 0.0006, + "loss": 2.3539, + "step": 34540 + }, + { + "epoch": 0.128876554538469, + "grad_norm": 0.31026092171669006, + "learning_rate": 0.0006, + "loss": 2.0905, + "step": 34550 + }, + { + "epoch": 0.12891385600143238, + "grad_norm": 0.40023839473724365, + "learning_rate": 0.0006, + "loss": 2.1648, + "step": 34560 + }, + { + "epoch": 0.12895115746439575, + "grad_norm": 0.37836572527885437, + "learning_rate": 0.0006, + "loss": 2.0327, + "step": 34570 + }, + { + "epoch": 0.12898845892735913, + "grad_norm": 0.347016304731369, + "learning_rate": 0.0006, + "loss": 2.2783, + "step": 34580 + }, + { + "epoch": 0.1290257603903225, + "grad_norm": 0.26132047176361084, + "learning_rate": 0.0006, + "loss": 2.3544, + "step": 34590 + }, + { + "epoch": 0.1290630618532859, + "grad_norm": 0.4164603054523468, + "learning_rate": 0.0006, + "loss": 2.1559, + "step": 34600 + }, + { + "epoch": 0.12910036331624927, + "grad_norm": 0.25433939695358276, + "learning_rate": 0.0006, + "loss": 2.2858, + "step": 34610 + }, + { + "epoch": 0.12913766477921265, + "grad_norm": 0.3057815432548523, + "learning_rate": 0.0006, + "loss": 2.2877, + "step": 34620 + }, + { + "epoch": 0.12917496624217603, + "grad_norm": 0.2913956642150879, + "learning_rate": 0.0006, + "loss": 2.1486, + "step": 34630 + }, + { + "epoch": 
0.12921226770513938, + "grad_norm": 0.31754013895988464, + "learning_rate": 0.0006, + "loss": 2.3455, + "step": 34640 + }, + { + "epoch": 0.12924956916810276, + "grad_norm": 0.3459484279155731, + "learning_rate": 0.0006, + "loss": 2.3449, + "step": 34650 + }, + { + "epoch": 0.12928687063106614, + "grad_norm": 0.2992917001247406, + "learning_rate": 0.0006, + "loss": 2.2064, + "step": 34660 + }, + { + "epoch": 0.12932417209402952, + "grad_norm": 0.344983845949173, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 34670 + }, + { + "epoch": 0.1293614735569929, + "grad_norm": 0.3180393874645233, + "learning_rate": 0.0006, + "loss": 2.1606, + "step": 34680 + }, + { + "epoch": 0.12939877501995628, + "grad_norm": 0.37394461035728455, + "learning_rate": 0.0006, + "loss": 2.2011, + "step": 34690 + }, + { + "epoch": 0.12943607648291966, + "grad_norm": 0.5341460108757019, + "learning_rate": 0.0006, + "loss": 2.0702, + "step": 34700 + }, + { + "epoch": 0.12947337794588304, + "grad_norm": 0.32723915576934814, + "learning_rate": 0.0006, + "loss": 2.2766, + "step": 34710 + }, + { + "epoch": 0.12951067940884642, + "grad_norm": 0.2661336660385132, + "learning_rate": 0.0006, + "loss": 2.1588, + "step": 34720 + }, + { + "epoch": 0.1295479808718098, + "grad_norm": 0.329249769449234, + "learning_rate": 0.0006, + "loss": 2.2543, + "step": 34730 + }, + { + "epoch": 0.12958528233477318, + "grad_norm": 0.31494787335395813, + "learning_rate": 0.0006, + "loss": 2.295, + "step": 34740 + }, + { + "epoch": 0.12962258379773656, + "grad_norm": 0.33479437232017517, + "learning_rate": 0.0006, + "loss": 2.3326, + "step": 34750 + }, + { + "epoch": 0.12962258379773656, + "eval_valid_loss": 2.190589189529419, + "eval_valid_loss/all": 2.053485631942749, + "eval_valid_loss/end_span": 1.1368534564971924, + "eval_valid_perplexity/batch": 7.795024394989014, + "eval_valid_perplexity/end_span": 3.116945266723633, + "eval_valid_perplexity/fim": 2.1935272216796875, + "eval_valid_perplexity/first_seq": 
14.501416206359863, + "eval_valid_perplexity/last_seq": 8.994744300842285, + "eval_valid_perplexity/second_seq": 13.372625350952148, + "eval_valid_perplexity/seq": 8.78494930267334, + "eval_valid_reconstruction/all": 0.2943442165851593, + "eval_valid_reconstruction/end_span": 0.7323404550552368, + "eval_valid_reconstruction/fim": 0.15778662264347076, + "eval_valid_reconstruction/first_seq": 0.17562758922576904, + "eval_valid_reconstruction/last_seq": 0.3258666396141052, + "eval_valid_reconstruction/second_seq": 0.20377454161643982, + "eval_valid_runtime": 447.2799, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 34750 + }, + { + "epoch": 0.12962258379773656, + "eval_train_loss": 2.188588857650757, + "eval_train_loss/all": 2.0249147415161133, + "eval_train_loss/end_span": 1.1197468042373657, + "eval_train_perplexity/batch": 7.575465202331543, + "eval_train_perplexity/end_span": 3.0640783309936523, + "eval_train_perplexity/fim": 1.9465882778167725, + "eval_train_perplexity/first_seq": 15.479514122009277, + "eval_train_perplexity/last_seq": 8.605460166931152, + "eval_train_perplexity/second_seq": 14.673959732055664, + "eval_train_perplexity/seq": 8.71573257446289, + "eval_train_reconstruction/all": 0.28408461809158325, + "eval_train_reconstruction/end_span": 0.7400270104408264, + "eval_train_reconstruction/fim": 0.1337682455778122, + "eval_train_reconstruction/first_seq": 0.15309761464595795, + "eval_train_reconstruction/last_seq": 0.3358195722103119, + "eval_train_reconstruction/second_seq": 0.17555364966392517, + "eval_train_runtime": 449.8236, + "eval_train_samples_per_second": 0.427, + "eval_train_steps_per_second": 0.427, + "step": 34750 + }, + { + "epoch": 0.12965988526069994, + "grad_norm": 0.4796260595321655, + "learning_rate": 0.0006, + "loss": 2.2881, + "step": 34760 + }, + { + "epoch": 0.12969718672366332, + "grad_norm": 0.24584995210170746, + "learning_rate": 0.0006, + "loss": 2.2085, + "step": 34770 + }, + { + 
"epoch": 0.12973448818662667, + "grad_norm": 0.32220229506492615, + "learning_rate": 0.0006, + "loss": 2.1003, + "step": 34780 + }, + { + "epoch": 0.12977178964959005, + "grad_norm": 0.2854171693325043, + "learning_rate": 0.0006, + "loss": 2.2897, + "step": 34790 + }, + { + "epoch": 0.12980909111255343, + "grad_norm": 0.5107263922691345, + "learning_rate": 0.0006, + "loss": 2.2712, + "step": 34800 + }, + { + "epoch": 0.1298463925755168, + "grad_norm": 0.27247071266174316, + "learning_rate": 0.0006, + "loss": 2.254, + "step": 34810 + }, + { + "epoch": 0.12988369403848019, + "grad_norm": 0.32038450241088867, + "learning_rate": 0.0006, + "loss": 2.071, + "step": 34820 + }, + { + "epoch": 0.12992099550144356, + "grad_norm": 0.21920330822467804, + "learning_rate": 0.0006, + "loss": 2.3578, + "step": 34830 + }, + { + "epoch": 0.12995829696440694, + "grad_norm": 0.3083370625972748, + "learning_rate": 0.0006, + "loss": 2.2185, + "step": 34840 + }, + { + "epoch": 0.12999559842737032, + "grad_norm": 0.3014524579048157, + "learning_rate": 0.0006, + "loss": 2.3556, + "step": 34850 + }, + { + "epoch": 0.1300328998903337, + "grad_norm": 0.41408219933509827, + "learning_rate": 0.0006, + "loss": 2.1058, + "step": 34860 + }, + { + "epoch": 0.13007020135329708, + "grad_norm": 0.2856701910495758, + "learning_rate": 0.0006, + "loss": 2.1503, + "step": 34870 + }, + { + "epoch": 0.13010750281626046, + "grad_norm": 0.4651288688182831, + "learning_rate": 0.0006, + "loss": 2.1733, + "step": 34880 + }, + { + "epoch": 0.13014480427922384, + "grad_norm": 0.22115226089954376, + "learning_rate": 0.0006, + "loss": 2.1699, + "step": 34890 + }, + { + "epoch": 0.13018210574218722, + "grad_norm": 0.2825407087802887, + "learning_rate": 0.0006, + "loss": 2.3892, + "step": 34900 + }, + { + "epoch": 0.13021940720515057, + "grad_norm": 0.5045467615127563, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 34910 + }, + { + "epoch": 0.13025670866811395, + "grad_norm": 0.2733520567417145, + 
"learning_rate": 0.0006, + "loss": 2.1685, + "step": 34920 + }, + { + "epoch": 0.13029401013107733, + "grad_norm": 0.2683226764202118, + "learning_rate": 0.0006, + "loss": 2.2894, + "step": 34930 + }, + { + "epoch": 0.1303313115940407, + "grad_norm": 0.39438289403915405, + "learning_rate": 0.0006, + "loss": 2.1818, + "step": 34940 + }, + { + "epoch": 0.1303686130570041, + "grad_norm": 0.3157723546028137, + "learning_rate": 0.0006, + "loss": 2.0522, + "step": 34950 + }, + { + "epoch": 0.13040591451996747, + "grad_norm": 0.4050779342651367, + "learning_rate": 0.0006, + "loss": 2.2169, + "step": 34960 + }, + { + "epoch": 0.13044321598293085, + "grad_norm": 0.4370315670967102, + "learning_rate": 0.0006, + "loss": 2.2078, + "step": 34970 + }, + { + "epoch": 0.13048051744589423, + "grad_norm": 0.3322743773460388, + "learning_rate": 0.0006, + "loss": 2.3213, + "step": 34980 + }, + { + "epoch": 0.1305178189088576, + "grad_norm": 0.2838217318058014, + "learning_rate": 0.0006, + "loss": 2.3141, + "step": 34990 + }, + { + "epoch": 0.130555120371821, + "grad_norm": 0.36834484338760376, + "learning_rate": 0.0006, + "loss": 2.2747, + "step": 35000 + }, + { + "epoch": 0.130555120371821, + "eval_valid_loss": 2.1866109371185303, + "eval_valid_loss/all": 2.049757480621338, + "eval_valid_loss/end_span": 1.272107481956482, + "eval_valid_perplexity/batch": 7.766017436981201, + "eval_valid_perplexity/end_span": 3.5683648586273193, + "eval_valid_perplexity/fim": 2.1727001667022705, + "eval_valid_perplexity/first_seq": 15.100730895996094, + "eval_valid_perplexity/last_seq": 8.907224655151367, + "eval_valid_perplexity/second_seq": 13.558109283447266, + "eval_valid_perplexity/seq": 8.7511625289917, + "eval_valid_reconstruction/all": 0.29552021622657776, + "eval_valid_reconstruction/end_span": 0.7058150172233582, + "eval_valid_reconstruction/fim": 0.1548721194267273, + "eval_valid_reconstruction/first_seq": 0.1623665988445282, + "eval_valid_reconstruction/last_seq": 0.32841649651527405, + 
"eval_valid_reconstruction/second_seq": 0.2003861367702484, + "eval_valid_runtime": 455.9957, + "eval_valid_samples_per_second": 0.421, + "eval_valid_steps_per_second": 0.421, + "step": 35000 + }, + { + "epoch": 0.130555120371821, + "eval_train_loss": 2.1852829456329346, + "eval_train_loss/all": 2.0219900608062744, + "eval_train_loss/end_span": 1.239477276802063, + "eval_train_perplexity/batch": 7.553341388702393, + "eval_train_perplexity/end_span": 3.4538075923919678, + "eval_train_perplexity/fim": 2.0366599559783936, + "eval_train_perplexity/first_seq": 15.539876937866211, + "eval_train_perplexity/last_seq": 9.106324195861816, + "eval_train_perplexity/second_seq": 14.013348579406738, + "eval_train_perplexity/seq": 8.692046165466309, + "eval_train_reconstruction/all": 0.2850555181503296, + "eval_train_reconstruction/end_span": 0.715125322341919, + "eval_train_reconstruction/fim": 0.1432970017194748, + "eval_train_reconstruction/first_seq": 0.15186457335948944, + "eval_train_reconstruction/last_seq": 0.3192307651042938, + "eval_train_reconstruction/second_seq": 0.18759644031524658, + "eval_train_runtime": 451.2843, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 35000 + }, + { + "epoch": 0.13059242183478437, + "grad_norm": 0.3423159420490265, + "learning_rate": 0.0006, + "loss": 2.2287, + "step": 35010 + }, + { + "epoch": 0.13062972329774775, + "grad_norm": 0.19844526052474976, + "learning_rate": 0.0006, + "loss": 2.3504, + "step": 35020 + }, + { + "epoch": 0.13066702476071113, + "grad_norm": 0.349417507648468, + "learning_rate": 0.0006, + "loss": 2.0366, + "step": 35030 + }, + { + "epoch": 0.1307043262236745, + "grad_norm": 0.25883084535598755, + "learning_rate": 0.0006, + "loss": 2.2947, + "step": 35040 + }, + { + "epoch": 0.13074162768663786, + "grad_norm": 0.36292582750320435, + "learning_rate": 0.0006, + "loss": 2.3491, + "step": 35050 + }, + { + "epoch": 0.13077892914960124, + "grad_norm": 0.3614930808544159, + 
"learning_rate": 0.0006, + "loss": 2.1173, + "step": 35060 + }, + { + "epoch": 0.13081623061256462, + "grad_norm": 0.37793058156967163, + "learning_rate": 0.0006, + "loss": 2.2145, + "step": 35070 + }, + { + "epoch": 0.130853532075528, + "grad_norm": 0.4883798658847809, + "learning_rate": 0.0006, + "loss": 2.2812, + "step": 35080 + }, + { + "epoch": 0.13089083353849137, + "grad_norm": 0.30695611238479614, + "learning_rate": 0.0006, + "loss": 2.1338, + "step": 35090 + }, + { + "epoch": 0.13092813500145475, + "grad_norm": 0.27008622884750366, + "learning_rate": 0.0006, + "loss": 2.2446, + "step": 35100 + }, + { + "epoch": 0.13096543646441813, + "grad_norm": 0.4262482821941376, + "learning_rate": 0.0006, + "loss": 2.1769, + "step": 35110 + }, + { + "epoch": 0.1310027379273815, + "grad_norm": 0.3757408559322357, + "learning_rate": 0.0006, + "loss": 1.919, + "step": 35120 + }, + { + "epoch": 0.1310400393903449, + "grad_norm": 0.405109703540802, + "learning_rate": 0.0006, + "loss": 2.287, + "step": 35130 + }, + { + "epoch": 0.13107734085330827, + "grad_norm": 0.36885756254196167, + "learning_rate": 0.0006, + "loss": 2.2501, + "step": 35140 + }, + { + "epoch": 0.13111464231627165, + "grad_norm": 0.33001795411109924, + "learning_rate": 0.0006, + "loss": 2.1437, + "step": 35150 + }, + { + "epoch": 0.13115194377923503, + "grad_norm": 0.41387268900871277, + "learning_rate": 0.0006, + "loss": 2.1972, + "step": 35160 + }, + { + "epoch": 0.1311892452421984, + "grad_norm": 0.29767322540283203, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 35170 + }, + { + "epoch": 0.1312265467051618, + "grad_norm": 0.43934905529022217, + "learning_rate": 0.0006, + "loss": 2.3063, + "step": 35180 + }, + { + "epoch": 0.13126384816812514, + "grad_norm": 0.3972460627555847, + "learning_rate": 0.0006, + "loss": 2.1468, + "step": 35190 + }, + { + "epoch": 0.13130114963108852, + "grad_norm": 0.2590770423412323, + "learning_rate": 0.0006, + "loss": 2.2263, + "step": 35200 + }, + { + "epoch": 
0.1313384510940519, + "grad_norm": 0.2150246948003769, + "learning_rate": 0.0006, + "loss": 2.2558, + "step": 35210 + }, + { + "epoch": 0.13137575255701528, + "grad_norm": 0.42454230785369873, + "learning_rate": 0.0006, + "loss": 2.2879, + "step": 35220 + }, + { + "epoch": 0.13141305401997866, + "grad_norm": 0.3638055920600891, + "learning_rate": 0.0006, + "loss": 2.2534, + "step": 35230 + }, + { + "epoch": 0.13145035548294204, + "grad_norm": 0.4001339375972748, + "learning_rate": 0.0006, + "loss": 2.0842, + "step": 35240 + }, + { + "epoch": 0.13148765694590542, + "grad_norm": 0.4065704643726349, + "learning_rate": 0.0006, + "loss": 2.3425, + "step": 35250 + }, + { + "epoch": 0.13148765694590542, + "eval_valid_loss": 2.1913535594940186, + "eval_valid_loss/all": 2.0540478229522705, + "eval_valid_loss/end_span": 1.2547932863235474, + "eval_valid_perplexity/batch": 7.799407958984375, + "eval_valid_perplexity/end_span": 3.507113218307495, + "eval_valid_perplexity/fim": 2.3303253650665283, + "eval_valid_perplexity/first_seq": 14.901395797729492, + "eval_valid_perplexity/last_seq": 8.594416618347168, + "eval_valid_perplexity/second_seq": 14.053669929504395, + "eval_valid_perplexity/seq": 8.790556907653809, + "eval_valid_reconstruction/all": 0.29443755745887756, + "eval_valid_reconstruction/end_span": 0.7121530175209045, + "eval_valid_reconstruction/fim": 0.16950808465480804, + "eval_valid_reconstruction/first_seq": 0.16661272943019867, + "eval_valid_reconstruction/last_seq": 0.34208300709724426, + "eval_valid_reconstruction/second_seq": 0.1876075714826584, + "eval_valid_runtime": 437.766, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 35250 + }, + { + "epoch": 0.13148765694590542, + "eval_train_loss": 2.1900975704193115, + "eval_train_loss/all": 2.0260376930236816, + "eval_train_loss/end_span": 1.2260979413986206, + "eval_train_perplexity/batch": 7.583976745605469, + "eval_train_perplexity/end_span": 3.4079058170318604, + 
"eval_train_perplexity/fim": 1.9858920574188232, + "eval_train_perplexity/first_seq": 15.755518913269043, + "eval_train_perplexity/last_seq": 8.811232566833496, + "eval_train_perplexity/second_seq": 14.189209938049316, + "eval_train_perplexity/seq": 8.72796630859375, + "eval_train_reconstruction/all": 0.28394800424575806, + "eval_train_reconstruction/end_span": 0.7183651328086853, + "eval_train_reconstruction/fim": 0.13756334781646729, + "eval_train_reconstruction/first_seq": 0.14273227751255035, + "eval_train_reconstruction/last_seq": 0.32908427715301514, + "eval_train_reconstruction/second_seq": 0.18579165637493134, + "eval_train_runtime": 439.4443, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 35250 + }, + { + "epoch": 0.1315249584088688, + "grad_norm": 0.32558274269104004, + "learning_rate": 0.0006, + "loss": 2.2706, + "step": 35260 + }, + { + "epoch": 0.13156225987183218, + "grad_norm": 0.5433319807052612, + "learning_rate": 0.0006, + "loss": 2.1311, + "step": 35270 + }, + { + "epoch": 0.13159956133479556, + "grad_norm": 0.3756905496120453, + "learning_rate": 0.0006, + "loss": 2.4267, + "step": 35280 + }, + { + "epoch": 0.13163686279775894, + "grad_norm": 0.44574862718582153, + "learning_rate": 0.0006, + "loss": 2.2235, + "step": 35290 + }, + { + "epoch": 0.13167416426072232, + "grad_norm": 0.7565720677375793, + "learning_rate": 0.0006, + "loss": 2.1969, + "step": 35300 + }, + { + "epoch": 0.1317114657236857, + "grad_norm": 0.36488303542137146, + "learning_rate": 0.0006, + "loss": 2.0165, + "step": 35310 + }, + { + "epoch": 0.13174876718664907, + "grad_norm": 0.3070181906223297, + "learning_rate": 0.0006, + "loss": 2.2695, + "step": 35320 + }, + { + "epoch": 0.13178606864961243, + "grad_norm": 0.23663026094436646, + "learning_rate": 0.0006, + "loss": 2.1545, + "step": 35330 + }, + { + "epoch": 0.1318233701125758, + "grad_norm": 0.2343878448009491, + "learning_rate": 0.0006, + "loss": 2.3676, + "step": 35340 + }, + 
{ + "epoch": 0.13186067157553918, + "grad_norm": 0.25035303831100464, + "learning_rate": 0.0006, + "loss": 2.2993, + "step": 35350 + }, + { + "epoch": 0.13189797303850256, + "grad_norm": 0.39775004982948303, + "learning_rate": 0.0006, + "loss": 2.3183, + "step": 35360 + }, + { + "epoch": 0.13193527450146594, + "grad_norm": 0.49771854281425476, + "learning_rate": 0.0006, + "loss": 2.2766, + "step": 35370 + }, + { + "epoch": 0.13197257596442932, + "grad_norm": 0.30824750661849976, + "learning_rate": 0.0006, + "loss": 2.4159, + "step": 35380 + }, + { + "epoch": 0.1320098774273927, + "grad_norm": 0.3152630627155304, + "learning_rate": 0.0006, + "loss": 2.3112, + "step": 35390 + }, + { + "epoch": 0.13204717889035608, + "grad_norm": 0.39282265305519104, + "learning_rate": 0.0006, + "loss": 2.2956, + "step": 35400 + }, + { + "epoch": 0.13208448035331946, + "grad_norm": 0.3346657156944275, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 35410 + }, + { + "epoch": 0.13212178181628284, + "grad_norm": 0.290252149105072, + "learning_rate": 0.0006, + "loss": 2.3228, + "step": 35420 + }, + { + "epoch": 0.13215908327924622, + "grad_norm": 0.39809200167655945, + "learning_rate": 0.0006, + "loss": 1.976, + "step": 35430 + }, + { + "epoch": 0.1321963847422096, + "grad_norm": 0.2402949184179306, + "learning_rate": 0.0006, + "loss": 2.2238, + "step": 35440 + }, + { + "epoch": 0.13223368620517298, + "grad_norm": 0.45268064737319946, + "learning_rate": 0.0006, + "loss": 2.283, + "step": 35450 + }, + { + "epoch": 0.13227098766813633, + "grad_norm": 0.40073564648628235, + "learning_rate": 0.0006, + "loss": 2.0706, + "step": 35460 + }, + { + "epoch": 0.1323082891310997, + "grad_norm": 0.44300955533981323, + "learning_rate": 0.0006, + "loss": 2.3052, + "step": 35470 + }, + { + "epoch": 0.1323455905940631, + "grad_norm": 0.49550145864486694, + "learning_rate": 0.0006, + "loss": 2.2404, + "step": 35480 + }, + { + "epoch": 0.13238289205702647, + "grad_norm": 0.34421706199645996, + 
"learning_rate": 0.0006, + "loss": 2.1859, + "step": 35490 + }, + { + "epoch": 0.13242019351998985, + "grad_norm": 0.3215210437774658, + "learning_rate": 0.0006, + "loss": 2.2516, + "step": 35500 + }, + { + "epoch": 0.13242019351998985, + "eval_valid_loss": 2.185455560684204, + "eval_valid_loss/all": 2.0487046241760254, + "eval_valid_loss/end_span": 1.2014490365982056, + "eval_valid_perplexity/batch": 7.757845401763916, + "eval_valid_perplexity/end_span": 3.3249313831329346, + "eval_valid_perplexity/fim": 2.4138641357421875, + "eval_valid_perplexity/first_seq": 15.023465156555176, + "eval_valid_perplexity/last_seq": 8.55341911315918, + "eval_valid_perplexity/second_seq": 13.805093765258789, + "eval_valid_perplexity/seq": 8.7393159866333, + "eval_valid_reconstruction/all": 0.29621246457099915, + "eval_valid_reconstruction/end_span": 0.7159014344215393, + "eval_valid_reconstruction/fim": 0.1768810898065567, + "eval_valid_reconstruction/first_seq": 0.1617446094751358, + "eval_valid_reconstruction/last_seq": 0.34236690402030945, + "eval_valid_reconstruction/second_seq": 0.195120707154274, + "eval_valid_runtime": 446.7707, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 35500 + }, + { + "epoch": 0.13242019351998985, + "eval_train_loss": 2.186572313308716, + "eval_train_loss/all": 2.0230166912078857, + "eval_train_loss/end_span": 1.1765836477279663, + "eval_train_perplexity/batch": 7.561100006103516, + "eval_train_perplexity/end_span": 3.2432751655578613, + "eval_train_perplexity/fim": 2.225951671600342, + "eval_train_perplexity/first_seq": 15.521140098571777, + "eval_train_perplexity/last_seq": 8.549880981445312, + "eval_train_perplexity/second_seq": 14.208820343017578, + "eval_train_perplexity/seq": 8.699501037597656, + "eval_train_reconstruction/all": 0.2848968803882599, + "eval_train_reconstruction/end_span": 0.7265740633010864, + "eval_train_reconstruction/fim": 0.1605389565229416, + "eval_train_reconstruction/first_seq": 
0.15176482498645782, + "eval_train_reconstruction/last_seq": 0.3409932553768158, + "eval_train_reconstruction/second_seq": 0.1846826821565628, + "eval_train_runtime": 456.3327, + "eval_train_samples_per_second": 0.421, + "eval_train_steps_per_second": 0.421, + "step": 35500 + }, + { + "epoch": 0.13245749498295323, + "grad_norm": 0.3793505132198334, + "learning_rate": 0.0006, + "loss": 2.1918, + "step": 35510 + }, + { + "epoch": 0.1324947964459166, + "grad_norm": 0.5498309135437012, + "learning_rate": 0.0006, + "loss": 2.0765, + "step": 35520 + }, + { + "epoch": 0.13253209790888, + "grad_norm": 0.41647869348526, + "learning_rate": 0.0006, + "loss": 2.1814, + "step": 35530 + }, + { + "epoch": 0.13256939937184337, + "grad_norm": 0.24906234443187714, + "learning_rate": 0.0006, + "loss": 2.2358, + "step": 35540 + }, + { + "epoch": 0.13260670083480675, + "grad_norm": 2.487032413482666, + "learning_rate": 0.0006, + "loss": 2.2496, + "step": 35550 + }, + { + "epoch": 0.13264400229777012, + "grad_norm": 23.607572555541992, + "learning_rate": 0.0006, + "loss": 2.1853, + "step": 35560 + }, + { + "epoch": 0.1326813037607335, + "grad_norm": 0.2903679311275482, + "learning_rate": 0.0006, + "loss": 2.451, + "step": 35570 + }, + { + "epoch": 0.13271860522369688, + "grad_norm": 0.426706463098526, + "learning_rate": 0.0006, + "loss": 2.2096, + "step": 35580 + }, + { + "epoch": 0.13275590668666026, + "grad_norm": 0.3775077164173126, + "learning_rate": 0.0006, + "loss": 2.3224, + "step": 35590 + }, + { + "epoch": 0.13279320814962361, + "grad_norm": 0.2796192765235901, + "learning_rate": 0.0006, + "loss": 2.179, + "step": 35600 + }, + { + "epoch": 0.132830509612587, + "grad_norm": 0.2668815553188324, + "learning_rate": 0.0006, + "loss": 2.2314, + "step": 35610 + }, + { + "epoch": 0.13286781107555037, + "grad_norm": 0.30240142345428467, + "learning_rate": 0.0006, + "loss": 2.1543, + "step": 35620 + }, + { + "epoch": 0.13290511253851375, + "grad_norm": 0.3449801504611969, + 
"learning_rate": 0.0006, + "loss": 2.1684, + "step": 35630 + }, + { + "epoch": 0.13294241400147713, + "grad_norm": 0.3945051431655884, + "learning_rate": 0.0006, + "loss": 2.1415, + "step": 35640 + }, + { + "epoch": 0.1329797154644405, + "grad_norm": 0.2163306176662445, + "learning_rate": 0.0006, + "loss": 2.2384, + "step": 35650 + }, + { + "epoch": 0.1330170169274039, + "grad_norm": 0.3972771465778351, + "learning_rate": 0.0006, + "loss": 2.2941, + "step": 35660 + }, + { + "epoch": 0.13305431839036727, + "grad_norm": 0.33624789118766785, + "learning_rate": 0.0006, + "loss": 2.1996, + "step": 35670 + }, + { + "epoch": 0.13309161985333065, + "grad_norm": 0.2922525405883789, + "learning_rate": 0.0006, + "loss": 2.3133, + "step": 35680 + }, + { + "epoch": 0.13312892131629403, + "grad_norm": 0.4417835474014282, + "learning_rate": 0.0006, + "loss": 2.2527, + "step": 35690 + }, + { + "epoch": 0.1331662227792574, + "grad_norm": 0.451638400554657, + "learning_rate": 0.0006, + "loss": 2.2282, + "step": 35700 + }, + { + "epoch": 0.1332035242422208, + "grad_norm": 0.4602743685245514, + "learning_rate": 0.0006, + "loss": 2.065, + "step": 35710 + }, + { + "epoch": 0.13324082570518417, + "grad_norm": 0.41027846932411194, + "learning_rate": 0.0006, + "loss": 2.0799, + "step": 35720 + }, + { + "epoch": 0.13327812716814755, + "grad_norm": 0.37020739912986755, + "learning_rate": 0.0006, + "loss": 2.3334, + "step": 35730 + }, + { + "epoch": 0.1333154286311109, + "grad_norm": 0.33163630962371826, + "learning_rate": 0.0006, + "loss": 2.1973, + "step": 35740 + }, + { + "epoch": 0.13335273009407428, + "grad_norm": 0.3406131863594055, + "learning_rate": 0.0006, + "loss": 2.3297, + "step": 35750 + }, + { + "epoch": 0.13335273009407428, + "eval_valid_loss": 2.190490484237671, + "eval_valid_loss/all": 2.0531508922576904, + "eval_valid_loss/end_span": 1.2492616176605225, + "eval_valid_perplexity/batch": 7.792415618896484, + "eval_valid_perplexity/end_span": 3.487766742706299, + 
"eval_valid_perplexity/fim": 2.1855201721191406, + "eval_valid_perplexity/first_seq": 14.572789192199707, + "eval_valid_perplexity/last_seq": 8.839238166809082, + "eval_valid_perplexity/second_seq": 13.861352920532227, + "eval_valid_perplexity/seq": 8.779784202575684, + "eval_valid_reconstruction/all": 0.29474756121635437, + "eval_valid_reconstruction/end_span": 0.7136226892471313, + "eval_valid_reconstruction/fim": 0.15640133619308472, + "eval_valid_reconstruction/first_seq": 0.1741505265235901, + "eval_valid_reconstruction/last_seq": 0.33097171783447266, + "eval_valid_reconstruction/second_seq": 0.19079340994358063, + "eval_valid_runtime": 457.4597, + "eval_valid_samples_per_second": 0.42, + "eval_valid_steps_per_second": 0.42, + "step": 35750 + }, + { + "epoch": 0.13335273009407428, + "eval_train_loss": 2.1898090839385986, + "eval_train_loss/all": 2.02581524848938, + "eval_train_loss/end_span": 1.222107172012329, + "eval_train_perplexity/batch": 7.582289695739746, + "eval_train_perplexity/end_span": 3.3943326473236084, + "eval_train_perplexity/fim": 1.888447880744934, + "eval_train_perplexity/first_seq": 15.450989723205566, + "eval_train_perplexity/last_seq": 9.042654037475586, + "eval_train_perplexity/second_seq": 13.860589981079102, + "eval_train_perplexity/seq": 8.723959922790527, + "eval_train_reconstruction/all": 0.2840063273906708, + "eval_train_reconstruction/end_span": 0.7198241353034973, + "eval_train_reconstruction/fim": 0.12769146263599396, + "eval_train_reconstruction/first_seq": 0.15412111580371857, + "eval_train_reconstruction/last_seq": 0.32143425941467285, + "eval_train_reconstruction/second_seq": 0.19070211052894592, + "eval_train_runtime": 456.552, + "eval_train_samples_per_second": 0.421, + "eval_train_steps_per_second": 0.421, + "step": 35750 + }, + { + "epoch": 0.13339003155703766, + "grad_norm": 0.4076254367828369, + "learning_rate": 0.0006, + "loss": 2.3072, + "step": 35760 + }, + { + "epoch": 0.13342733302000104, + "grad_norm": 
0.37984588742256165, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 35770 + }, + { + "epoch": 0.13346463448296442, + "grad_norm": 0.32174015045166016, + "learning_rate": 0.0006, + "loss": 2.1903, + "step": 35780 + }, + { + "epoch": 0.1335019359459278, + "grad_norm": 0.4484248459339142, + "learning_rate": 0.0006, + "loss": 2.017, + "step": 35790 + }, + { + "epoch": 0.13353923740889118, + "grad_norm": 0.3792297840118408, + "learning_rate": 0.0006, + "loss": 1.9322, + "step": 35800 + }, + { + "epoch": 0.13357653887185456, + "grad_norm": 0.40597301721572876, + "learning_rate": 0.0006, + "loss": 2.1677, + "step": 35810 + }, + { + "epoch": 0.13361384033481793, + "grad_norm": 0.31803664565086365, + "learning_rate": 0.0006, + "loss": 2.1765, + "step": 35820 + }, + { + "epoch": 0.13365114179778131, + "grad_norm": 0.3841610848903656, + "learning_rate": 0.0006, + "loss": 2.0427, + "step": 35830 + }, + { + "epoch": 0.1336884432607447, + "grad_norm": 0.38190287351608276, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 35840 + }, + { + "epoch": 0.13372574472370807, + "grad_norm": 0.4008512794971466, + "learning_rate": 0.0006, + "loss": 2.2315, + "step": 35850 + }, + { + "epoch": 0.13376304618667145, + "grad_norm": 0.4150756895542145, + "learning_rate": 0.0006, + "loss": 2.1401, + "step": 35860 + }, + { + "epoch": 0.13380034764963483, + "grad_norm": 0.33926883339881897, + "learning_rate": 0.0006, + "loss": 2.068, + "step": 35870 + }, + { + "epoch": 0.13383764911259818, + "grad_norm": 0.30387452244758606, + "learning_rate": 0.0006, + "loss": 2.2153, + "step": 35880 + }, + { + "epoch": 0.13387495057556156, + "grad_norm": 0.2622429430484772, + "learning_rate": 0.0006, + "loss": 2.2037, + "step": 35890 + }, + { + "epoch": 0.13391225203852494, + "grad_norm": 0.27326342463493347, + "learning_rate": 0.0006, + "loss": 2.1124, + "step": 35900 + }, + { + "epoch": 0.13394955350148832, + "grad_norm": 0.3804779648780823, + "learning_rate": 0.0006, + "loss": 2.3596, + "step": 
35910 + }, + { + "epoch": 0.1339868549644517, + "grad_norm": 0.3727635145187378, + "learning_rate": 0.0006, + "loss": 2.0008, + "step": 35920 + }, + { + "epoch": 0.13402415642741508, + "grad_norm": 0.48239046335220337, + "learning_rate": 0.0006, + "loss": 2.1081, + "step": 35930 + }, + { + "epoch": 0.13406145789037846, + "grad_norm": 0.5427637696266174, + "learning_rate": 0.0006, + "loss": 2.2179, + "step": 35940 + }, + { + "epoch": 0.13409875935334184, + "grad_norm": 0.3662244975566864, + "learning_rate": 0.0006, + "loss": 2.1922, + "step": 35950 + }, + { + "epoch": 0.13413606081630522, + "grad_norm": 0.3510001003742218, + "learning_rate": 0.0006, + "loss": 2.1179, + "step": 35960 + }, + { + "epoch": 0.1341733622792686, + "grad_norm": 0.26119565963745117, + "learning_rate": 0.0006, + "loss": 2.0819, + "step": 35970 + }, + { + "epoch": 0.13421066374223198, + "grad_norm": 0.32680559158325195, + "learning_rate": 0.0006, + "loss": 2.3026, + "step": 35980 + }, + { + "epoch": 0.13424796520519536, + "grad_norm": 0.3370746076107025, + "learning_rate": 0.0006, + "loss": 2.1687, + "step": 35990 + }, + { + "epoch": 0.13428526666815874, + "grad_norm": 0.23378300666809082, + "learning_rate": 0.0006, + "loss": 2.1247, + "step": 36000 + }, + { + "epoch": 0.13428526666815874, + "eval_valid_loss": 2.1846532821655273, + "eval_valid_loss/all": 2.0482993125915527, + "eval_valid_loss/end_span": 1.203818440437317, + "eval_valid_perplexity/batch": 7.754701614379883, + "eval_valid_perplexity/end_span": 3.3328187465667725, + "eval_valid_perplexity/fim": 2.3482625484466553, + "eval_valid_perplexity/first_seq": 14.725099563598633, + "eval_valid_perplexity/last_seq": 8.625514030456543, + "eval_valid_perplexity/second_seq": 13.621916770935059, + "eval_valid_perplexity/seq": 8.748990058898926, + "eval_valid_reconstruction/all": 0.29600170254707336, + "eval_valid_reconstruction/end_span": 0.7164128422737122, + "eval_valid_reconstruction/fim": 0.1721445769071579, + 
"eval_valid_reconstruction/first_seq": 0.17123067378997803, + "eval_valid_reconstruction/last_seq": 0.3422466516494751, + "eval_valid_reconstruction/second_seq": 0.20018230378627777, + "eval_valid_runtime": 451.9069, + "eval_valid_samples_per_second": 0.425, + "eval_valid_steps_per_second": 0.425, + "step": 36000 + }, + { + "epoch": 0.13428526666815874, + "eval_train_loss": 2.1848485469818115, + "eval_train_loss/all": 2.02209734916687, + "eval_train_loss/end_span": 1.1698980331420898, + "eval_train_perplexity/batch": 7.554152011871338, + "eval_train_perplexity/end_span": 3.2216641902923584, + "eval_train_perplexity/fim": 2.246607542037964, + "eval_train_perplexity/first_seq": 15.301663398742676, + "eval_train_perplexity/last_seq": 8.733952522277832, + "eval_train_perplexity/second_seq": 14.248732566833496, + "eval_train_perplexity/seq": 8.701897621154785, + "eval_train_reconstruction/all": 0.2848597764968872, + "eval_train_reconstruction/end_span": 0.7274966835975647, + "eval_train_reconstruction/fim": 0.1630844622850418, + "eval_train_reconstruction/first_seq": 0.15398886799812317, + "eval_train_reconstruction/last_seq": 0.33281680941581726, + "eval_train_reconstruction/second_seq": 0.18258625268936157, + "eval_train_runtime": 450.3549, + "eval_train_samples_per_second": 0.426, + "eval_train_steps_per_second": 0.426, + "step": 36000 + }, + { + "epoch": 0.13432256813112212, + "grad_norm": 0.4038180410861969, + "learning_rate": 0.0006, + "loss": 2.2609, + "step": 36010 + }, + { + "epoch": 0.13435986959408547, + "grad_norm": 0.2663930356502533, + "learning_rate": 0.0006, + "loss": 2.2588, + "step": 36020 + }, + { + "epoch": 0.13439717105704885, + "grad_norm": 0.3121393918991089, + "learning_rate": 0.0006, + "loss": 2.2769, + "step": 36030 + }, + { + "epoch": 0.13443447252001223, + "grad_norm": 0.3896241784095764, + "learning_rate": 0.0006, + "loss": 2.2904, + "step": 36040 + }, + { + "epoch": 0.1344717739829756, + "grad_norm": 0.22115828096866608, + "learning_rate": 
0.0006, + "loss": 2.275, + "step": 36050 + }, + { + "epoch": 0.13450907544593899, + "grad_norm": 0.25070706009864807, + "learning_rate": 0.0006, + "loss": 2.2263, + "step": 36060 + }, + { + "epoch": 0.13454637690890237, + "grad_norm": 0.3159976601600647, + "learning_rate": 0.0006, + "loss": 2.2735, + "step": 36070 + }, + { + "epoch": 0.13458367837186574, + "grad_norm": 0.27909913659095764, + "learning_rate": 0.0006, + "loss": 2.2372, + "step": 36080 + }, + { + "epoch": 0.13462097983482912, + "grad_norm": 1.0184910297393799, + "learning_rate": 0.0006, + "loss": 2.2885, + "step": 36090 + }, + { + "epoch": 0.1346582812977925, + "grad_norm": 0.36194199323654175, + "learning_rate": 0.0006, + "loss": 2.2987, + "step": 36100 + }, + { + "epoch": 0.13469558276075588, + "grad_norm": 0.3739508390426636, + "learning_rate": 0.0006, + "loss": 2.1698, + "step": 36110 + }, + { + "epoch": 0.13473288422371926, + "grad_norm": 0.2954240143299103, + "learning_rate": 0.0006, + "loss": 2.2848, + "step": 36120 + }, + { + "epoch": 0.13477018568668264, + "grad_norm": 0.48633143305778503, + "learning_rate": 0.0006, + "loss": 2.0596, + "step": 36130 + }, + { + "epoch": 0.13480748714964602, + "grad_norm": 0.38600197434425354, + "learning_rate": 0.0006, + "loss": 2.1182, + "step": 36140 + }, + { + "epoch": 0.13484478861260937, + "grad_norm": 0.33015865087509155, + "learning_rate": 0.0006, + "loss": 2.2411, + "step": 36150 + }, + { + "epoch": 0.13488209007557275, + "grad_norm": 0.4337019622325897, + "learning_rate": 0.0006, + "loss": 2.2425, + "step": 36160 + }, + { + "epoch": 0.13491939153853613, + "grad_norm": 0.2286835014820099, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 36170 + }, + { + "epoch": 0.1349566930014995, + "grad_norm": 0.22583827376365662, + "learning_rate": 0.0006, + "loss": 2.0974, + "step": 36180 + }, + { + "epoch": 0.1349939944644629, + "grad_norm": 0.2757816016674042, + "learning_rate": 0.0006, + "loss": 2.1148, + "step": 36190 + }, + { + "epoch": 
0.13503129592742627, + "grad_norm": 0.4130169749259949, + "learning_rate": 0.0006, + "loss": 2.1573, + "step": 36200 + }, + { + "epoch": 0.13506859739038965, + "grad_norm": 0.3744756877422333, + "learning_rate": 0.0006, + "loss": 2.3014, + "step": 36210 + }, + { + "epoch": 0.13510589885335303, + "grad_norm": 0.3492564558982849, + "learning_rate": 0.0006, + "loss": 2.3297, + "step": 36220 + }, + { + "epoch": 0.1351432003163164, + "grad_norm": 0.28457581996917725, + "learning_rate": 0.0006, + "loss": 2.2589, + "step": 36230 + }, + { + "epoch": 0.1351805017792798, + "grad_norm": 0.29157447814941406, + "learning_rate": 0.0006, + "loss": 2.2976, + "step": 36240 + }, + { + "epoch": 0.13521780324224317, + "grad_norm": 0.25741031765937805, + "learning_rate": 0.0006, + "loss": 2.209, + "step": 36250 + }, + { + "epoch": 0.13521780324224317, + "eval_valid_loss": 2.186483860015869, + "eval_valid_loss/all": 2.0495781898498535, + "eval_valid_loss/end_span": 1.1766988039016724, + "eval_valid_perplexity/batch": 7.764625072479248, + "eval_valid_perplexity/end_span": 3.2436485290527344, + "eval_valid_perplexity/fim": 2.4470999240875244, + "eval_valid_perplexity/first_seq": 13.983612060546875, + "eval_valid_perplexity/last_seq": 8.912530899047852, + "eval_valid_perplexity/second_seq": 13.94062614440918, + "eval_valid_perplexity/seq": 8.752274513244629, + "eval_valid_reconstruction/all": 0.29572904109954834, + "eval_valid_reconstruction/end_span": 0.7257448434829712, + "eval_valid_reconstruction/fim": 0.1798790544271469, + "eval_valid_reconstruction/first_seq": 0.1884581446647644, + "eval_valid_reconstruction/last_seq": 0.33094120025634766, + "eval_valid_reconstruction/second_seq": 0.1921021044254303, + "eval_valid_runtime": 490.5125, + "eval_valid_samples_per_second": 0.391, + "eval_valid_steps_per_second": 0.391, + "step": 36250 + }, + { + "epoch": 0.13521780324224317, + "eval_train_loss": 2.1853253841400146, + "eval_train_loss/all": 2.0221590995788574, + "eval_train_loss/end_span": 
1.1405553817749023, + "eval_train_perplexity/batch": 7.5546183586120605, + "eval_train_perplexity/end_span": 3.1285054683685303, + "eval_train_perplexity/fim": 2.0357565879821777, + "eval_train_perplexity/first_seq": 15.761082649230957, + "eval_train_perplexity/last_seq": 8.847424507141113, + "eval_train_perplexity/second_seq": 14.541879653930664, + "eval_train_perplexity/seq": 8.697617530822754, + "eval_train_reconstruction/all": 0.2849540114402771, + "eval_train_reconstruction/end_span": 0.737548828125, + "eval_train_reconstruction/fim": 0.14411672949790955, + "eval_train_reconstruction/first_seq": 0.14692749083042145, + "eval_train_reconstruction/last_seq": 0.3301343023777008, + "eval_train_reconstruction/second_seq": 0.17694099247455597, + "eval_train_runtime": 465.7271, + "eval_train_samples_per_second": 0.412, + "eval_train_steps_per_second": 0.412, + "step": 36250 + }, + { + "epoch": 0.13525510470520655, + "grad_norm": 0.282912015914917, + "learning_rate": 0.0006, + "loss": 2.1925, + "step": 36260 + }, + { + "epoch": 0.13529240616816993, + "grad_norm": 0.3343248963356018, + "learning_rate": 0.0006, + "loss": 2.2525, + "step": 36270 + }, + { + "epoch": 0.1353297076311333, + "grad_norm": 0.34499356150627136, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 36280 + }, + { + "epoch": 0.13536700909409666, + "grad_norm": 0.455357164144516, + "learning_rate": 0.0006, + "loss": 2.184, + "step": 36290 + }, + { + "epoch": 0.13540431055706004, + "grad_norm": 0.40885087847709656, + "learning_rate": 0.0006, + "loss": 2.1918, + "step": 36300 + }, + { + "epoch": 0.13544161202002342, + "grad_norm": 0.21405577659606934, + "learning_rate": 0.0006, + "loss": 2.1959, + "step": 36310 + }, + { + "epoch": 0.1354789134829868, + "grad_norm": 0.36528894305229187, + "learning_rate": 0.0006, + "loss": 2.2837, + "step": 36320 + }, + { + "epoch": 0.13551621494595018, + "grad_norm": 0.3940622806549072, + "learning_rate": 0.0006, + "loss": 2.3211, + "step": 36330 + }, + { + "epoch": 
0.13555351640891355, + "grad_norm": 0.3393820524215698, + "learning_rate": 0.0006, + "loss": 2.1697, + "step": 36340 + }, + { + "epoch": 0.13559081787187693, + "grad_norm": 0.35889682173728943, + "learning_rate": 0.0006, + "loss": 2.2532, + "step": 36350 + }, + { + "epoch": 0.1356281193348403, + "grad_norm": 0.4331860840320587, + "learning_rate": 0.0006, + "loss": 2.1839, + "step": 36360 + }, + { + "epoch": 0.1356654207978037, + "grad_norm": 0.43995729088783264, + "learning_rate": 0.0006, + "loss": 2.2513, + "step": 36370 + }, + { + "epoch": 0.13570272226076707, + "grad_norm": 0.3653767704963684, + "learning_rate": 0.0006, + "loss": 1.8943, + "step": 36380 + }, + { + "epoch": 0.13574002372373045, + "grad_norm": 0.23389266431331635, + "learning_rate": 0.0006, + "loss": 2.1984, + "step": 36390 + }, + { + "epoch": 0.13577732518669383, + "grad_norm": 0.4650367498397827, + "learning_rate": 0.0006, + "loss": 2.2625, + "step": 36400 + }, + { + "epoch": 0.1358146266496572, + "grad_norm": 0.38282835483551025, + "learning_rate": 0.0006, + "loss": 2.1976, + "step": 36410 + }, + { + "epoch": 0.1358519281126206, + "grad_norm": 0.5197153687477112, + "learning_rate": 0.0006, + "loss": 2.1742, + "step": 36420 + }, + { + "epoch": 0.13588922957558394, + "grad_norm": 0.3309674859046936, + "learning_rate": 0.0006, + "loss": 2.2139, + "step": 36430 + }, + { + "epoch": 0.13592653103854732, + "grad_norm": 0.41582173109054565, + "learning_rate": 0.0006, + "loss": 2.2, + "step": 36440 + }, + { + "epoch": 0.1359638325015107, + "grad_norm": 0.47640496492385864, + "learning_rate": 0.0006, + "loss": 2.2268, + "step": 36450 + }, + { + "epoch": 0.13600113396447408, + "grad_norm": 0.4961922764778137, + "learning_rate": 0.0006, + "loss": 2.2285, + "step": 36460 + }, + { + "epoch": 0.13603843542743746, + "grad_norm": 0.2983034551143646, + "learning_rate": 0.0006, + "loss": 2.3303, + "step": 36470 + }, + { + "epoch": 0.13607573689040084, + "grad_norm": 0.2847345769405365, + "learning_rate": 0.0006, 
+ "loss": 2.1969, + "step": 36480 + }, + { + "epoch": 0.13611303835336422, + "grad_norm": 0.3101258873939514, + "learning_rate": 0.0006, + "loss": 2.2501, + "step": 36490 + }, + { + "epoch": 0.1361503398163276, + "grad_norm": 0.42386043071746826, + "learning_rate": 0.0006, + "loss": 2.2222, + "step": 36500 + }, + { + "epoch": 0.1361503398163276, + "eval_valid_loss": 2.1942532062530518, + "eval_valid_loss/all": 2.057232618331909, + "eval_valid_loss/end_span": 1.2557759284973145, + "eval_valid_perplexity/batch": 7.824286937713623, + "eval_valid_perplexity/end_span": 3.510561227798462, + "eval_valid_perplexity/fim": 2.2066662311553955, + "eval_valid_perplexity/first_seq": 14.950140953063965, + "eval_valid_perplexity/last_seq": 9.135160446166992, + "eval_valid_perplexity/second_seq": 13.49181842803955, + "eval_valid_perplexity/seq": 8.831886291503906, + "eval_valid_reconstruction/all": 0.2932300269603729, + "eval_valid_reconstruction/end_span": 0.7058366537094116, + "eval_valid_reconstruction/fim": 0.15785127878189087, + "eval_valid_reconstruction/first_seq": 0.16660857200622559, + "eval_valid_reconstruction/last_seq": 0.32347628474235535, + "eval_valid_reconstruction/second_seq": 0.20156840980052948, + "eval_valid_runtime": 443.4768, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 36500 + }, + { + "epoch": 0.1361503398163276, + "eval_train_loss": 2.1909496784210205, + "eval_train_loss/all": 2.02748966217041, + "eval_train_loss/end_span": 1.2184854745864868, + "eval_train_perplexity/batch": 7.594996452331543, + "eval_train_perplexity/end_span": 3.382061719894409, + "eval_train_perplexity/fim": 2.0082647800445557, + "eval_train_perplexity/first_seq": 15.806353569030762, + "eval_train_perplexity/last_seq": 9.069353103637695, + "eval_train_perplexity/second_seq": 14.537677764892578, + "eval_train_perplexity/seq": 8.753694534301758, + "eval_train_reconstruction/all": 0.28325405716896057, + "eval_train_reconstruction/end_span": 
0.7167930006980896, + "eval_train_reconstruction/fim": 0.13969966769218445, + "eval_train_reconstruction/first_seq": 0.14380362629890442, + "eval_train_reconstruction/last_seq": 0.3214920163154602, + "eval_train_reconstruction/second_seq": 0.1729258894920349, + "eval_train_runtime": 444.5067, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 36500 + }, + { + "epoch": 0.13618764127929098, + "grad_norm": 0.4519098997116089, + "learning_rate": 0.0006, + "loss": 2.2668, + "step": 36510 + }, + { + "epoch": 0.13622494274225436, + "grad_norm": 0.5789509415626526, + "learning_rate": 0.0006, + "loss": 2.0734, + "step": 36520 + }, + { + "epoch": 0.13626224420521774, + "grad_norm": 0.3386200964450836, + "learning_rate": 0.0006, + "loss": 2.2023, + "step": 36530 + }, + { + "epoch": 0.13629954566818112, + "grad_norm": 0.2899700999259949, + "learning_rate": 0.0006, + "loss": 2.1661, + "step": 36540 + }, + { + "epoch": 0.1363368471311445, + "grad_norm": 0.38415762782096863, + "learning_rate": 0.0006, + "loss": 2.312, + "step": 36550 + }, + { + "epoch": 0.13637414859410787, + "grad_norm": 0.3007860481739044, + "learning_rate": 0.0006, + "loss": 2.2502, + "step": 36560 + }, + { + "epoch": 0.13641145005707123, + "grad_norm": 0.34969890117645264, + "learning_rate": 0.0006, + "loss": 2.1529, + "step": 36570 + }, + { + "epoch": 0.1364487515200346, + "grad_norm": 0.2808239758014679, + "learning_rate": 0.0006, + "loss": 2.1636, + "step": 36580 + }, + { + "epoch": 0.13648605298299799, + "grad_norm": 0.39546850323677063, + "learning_rate": 0.0006, + "loss": 2.2259, + "step": 36590 + }, + { + "epoch": 0.13652335444596136, + "grad_norm": 0.25771957635879517, + "learning_rate": 0.0006, + "loss": 2.4131, + "step": 36600 + }, + { + "epoch": 0.13656065590892474, + "grad_norm": 0.3634951114654541, + "learning_rate": 0.0006, + "loss": 2.194, + "step": 36610 + }, + { + "epoch": 0.13659795737188812, + "grad_norm": 0.3809430003166199, + "learning_rate": 
0.0006, + "loss": 2.0962, + "step": 36620 + }, + { + "epoch": 0.1366352588348515, + "grad_norm": 0.7251625061035156, + "learning_rate": 0.0006, + "loss": 2.2696, + "step": 36630 + }, + { + "epoch": 0.13667256029781488, + "grad_norm": 0.335628479719162, + "learning_rate": 0.0006, + "loss": 2.007, + "step": 36640 + }, + { + "epoch": 0.13670986176077826, + "grad_norm": 0.3489452302455902, + "learning_rate": 0.0006, + "loss": 2.3241, + "step": 36650 + }, + { + "epoch": 0.13674716322374164, + "grad_norm": 0.3245968818664551, + "learning_rate": 0.0006, + "loss": 2.3534, + "step": 36660 + }, + { + "epoch": 0.13678446468670502, + "grad_norm": 0.21233011782169342, + "learning_rate": 0.0006, + "loss": 2.1733, + "step": 36670 + }, + { + "epoch": 0.1368217661496684, + "grad_norm": 0.39353999495506287, + "learning_rate": 0.0006, + "loss": 2.117, + "step": 36680 + }, + { + "epoch": 0.13685906761263178, + "grad_norm": 0.424673855304718, + "learning_rate": 0.0006, + "loss": 2.174, + "step": 36690 + }, + { + "epoch": 0.13689636907559513, + "grad_norm": 0.2541026175022125, + "learning_rate": 0.0006, + "loss": 2.2776, + "step": 36700 + }, + { + "epoch": 0.1369336705385585, + "grad_norm": 0.3272351622581482, + "learning_rate": 0.0006, + "loss": 2.1699, + "step": 36710 + }, + { + "epoch": 0.1369709720015219, + "grad_norm": 0.3877384662628174, + "learning_rate": 0.0006, + "loss": 2.1804, + "step": 36720 + }, + { + "epoch": 0.13700827346448527, + "grad_norm": 0.32191166281700134, + "learning_rate": 0.0006, + "loss": 2.1794, + "step": 36730 + }, + { + "epoch": 0.13704557492744865, + "grad_norm": 0.5516616702079773, + "learning_rate": 0.0006, + "loss": 2.1101, + "step": 36740 + }, + { + "epoch": 0.13708287639041203, + "grad_norm": 0.4099642038345337, + "learning_rate": 0.0006, + "loss": 2.1952, + "step": 36750 + }, + { + "epoch": 0.13708287639041203, + "eval_valid_loss": 2.193948984146118, + "eval_valid_loss/all": 2.0565624237060547, + "eval_valid_loss/end_span": 1.2104823589324951, + 
"eval_valid_perplexity/batch": 7.819045066833496, + "eval_valid_perplexity/end_span": 3.3551025390625, + "eval_valid_perplexity/fim": 2.4477837085723877, + "eval_valid_perplexity/first_seq": 14.785805702209473, + "eval_valid_perplexity/last_seq": 8.898475646972656, + "eval_valid_perplexity/second_seq": 13.653182983398438, + "eval_valid_perplexity/seq": 8.81717586517334, + "eval_valid_reconstruction/all": 0.2937876582145691, + "eval_valid_reconstruction/end_span": 0.7140900492668152, + "eval_valid_reconstruction/fim": 0.17798781394958496, + "eval_valid_reconstruction/first_seq": 0.16956022381782532, + "eval_valid_reconstruction/last_seq": 0.3280044496059418, + "eval_valid_reconstruction/second_seq": 0.1961357146501541, + "eval_valid_runtime": 442.7986, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 36750 + }, + { + "epoch": 0.13708287639041203, + "eval_train_loss": 2.1922671794891357, + "eval_train_loss/all": 2.0285747051239014, + "eval_train_loss/end_span": 1.1787927150726318, + "eval_train_perplexity/batch": 7.603241920471191, + "eval_train_perplexity/end_span": 3.2504475116729736, + "eval_train_perplexity/fim": 2.0251080989837646, + "eval_train_perplexity/first_seq": 15.510841369628906, + "eval_train_perplexity/last_seq": 9.239686012268066, + "eval_train_perplexity/second_seq": 14.323546409606934, + "eval_train_perplexity/seq": 8.756068229675293, + "eval_train_reconstruction/all": 0.28316277265548706, + "eval_train_reconstruction/end_span": 0.7247333526611328, + "eval_train_reconstruction/fim": 0.1409551203250885, + "eval_train_reconstruction/first_seq": 0.15307343006134033, + "eval_train_reconstruction/last_seq": 0.31918519735336304, + "eval_train_reconstruction/second_seq": 0.18131983280181885, + "eval_train_runtime": 440.8345, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 36750 + }, + { + "epoch": 0.1371201778533754, + "grad_norm": 0.2982819080352783, + "learning_rate": 
0.0006, + "loss": 2.2813, + "step": 36760 + }, + { + "epoch": 0.1371574793163388, + "grad_norm": 0.3822405934333801, + "learning_rate": 0.0006, + "loss": 2.1268, + "step": 36770 + }, + { + "epoch": 0.13719478077930217, + "grad_norm": 0.3375735580921173, + "learning_rate": 0.0006, + "loss": 2.2074, + "step": 36780 + }, + { + "epoch": 0.13723208224226555, + "grad_norm": 0.3371118903160095, + "learning_rate": 0.0006, + "loss": 2.1408, + "step": 36790 + }, + { + "epoch": 0.13726938370522893, + "grad_norm": 0.2852800488471985, + "learning_rate": 0.0006, + "loss": 2.2188, + "step": 36800 + }, + { + "epoch": 0.1373066851681923, + "grad_norm": 0.48463505506515503, + "learning_rate": 0.0006, + "loss": 2.1866, + "step": 36810 + }, + { + "epoch": 0.13734398663115568, + "grad_norm": 0.37941932678222656, + "learning_rate": 0.0006, + "loss": 2.085, + "step": 36820 + }, + { + "epoch": 0.13738128809411906, + "grad_norm": 0.35189148783683777, + "learning_rate": 0.0006, + "loss": 2.1645, + "step": 36830 + }, + { + "epoch": 0.13741858955708242, + "grad_norm": 0.3278163969516754, + "learning_rate": 0.0006, + "loss": 2.2229, + "step": 36840 + }, + { + "epoch": 0.1374558910200458, + "grad_norm": 0.32433009147644043, + "learning_rate": 0.0006, + "loss": 2.241, + "step": 36850 + }, + { + "epoch": 0.13749319248300917, + "grad_norm": 0.2979591190814972, + "learning_rate": 0.0006, + "loss": 2.2076, + "step": 36860 + }, + { + "epoch": 0.13753049394597255, + "grad_norm": 0.3057061731815338, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 36870 + }, + { + "epoch": 0.13756779540893593, + "grad_norm": 0.45772504806518555, + "learning_rate": 0.0006, + "loss": 2.1507, + "step": 36880 + }, + { + "epoch": 0.1376050968718993, + "grad_norm": 0.34848058223724365, + "learning_rate": 0.0006, + "loss": 2.1949, + "step": 36890 + }, + { + "epoch": 0.1376423983348627, + "grad_norm": 0.47806257009506226, + "learning_rate": 0.0006, + "loss": 2.2284, + "step": 36900 + }, + { + "epoch": 
0.13767969979782607, + "grad_norm": 0.42246773838996887, + "learning_rate": 0.0006, + "loss": 2.1212, + "step": 36910 + }, + { + "epoch": 0.13771700126078945, + "grad_norm": 0.43729159235954285, + "learning_rate": 0.0006, + "loss": 2.312, + "step": 36920 + }, + { + "epoch": 0.13775430272375283, + "grad_norm": 0.3380105495452881, + "learning_rate": 0.0006, + "loss": 2.1854, + "step": 36930 + }, + { + "epoch": 0.1377916041867162, + "grad_norm": 0.3458985984325409, + "learning_rate": 0.0006, + "loss": 2.3291, + "step": 36940 + }, + { + "epoch": 0.1378289056496796, + "grad_norm": 0.40879368782043457, + "learning_rate": 0.0006, + "loss": 2.0788, + "step": 36950 + }, + { + "epoch": 0.13786620711264297, + "grad_norm": 0.45621058344841003, + "learning_rate": 0.0006, + "loss": 2.1806, + "step": 36960 + }, + { + "epoch": 0.13790350857560635, + "grad_norm": 0.25361311435699463, + "learning_rate": 0.0006, + "loss": 2.3083, + "step": 36970 + }, + { + "epoch": 0.1379408100385697, + "grad_norm": 6.427034854888916, + "learning_rate": 0.0006, + "loss": 2.2139, + "step": 36980 + }, + { + "epoch": 0.13797811150153308, + "grad_norm": 0.6184579133987427, + "learning_rate": 0.0006, + "loss": 2.448, + "step": 36990 + }, + { + "epoch": 0.13801541296449646, + "grad_norm": 0.4272187650203705, + "learning_rate": 0.0006, + "loss": 2.2184, + "step": 37000 + }, + { + "epoch": 0.13801541296449646, + "eval_valid_loss": 2.191164255142212, + "eval_valid_loss/all": 2.054304838180542, + "eval_valid_loss/end_span": 1.135163426399231, + "eval_valid_perplexity/batch": 7.801412582397461, + "eval_valid_perplexity/end_span": 3.1116819381713867, + "eval_valid_perplexity/fim": 2.5616509914398193, + "eval_valid_perplexity/first_seq": 14.443689346313477, + "eval_valid_perplexity/last_seq": 8.907171249389648, + "eval_valid_perplexity/second_seq": 13.893777847290039, + "eval_valid_perplexity/seq": 8.800308227539062, + "eval_valid_reconstruction/all": 0.2945583462715149, + "eval_valid_reconstruction/end_span": 
0.7418063879013062, + "eval_valid_reconstruction/fim": 0.18935278058052063, + "eval_valid_reconstruction/first_seq": 0.17531351745128632, + "eval_valid_reconstruction/last_seq": 0.3305138349533081, + "eval_valid_reconstruction/second_seq": 0.19411282241344452, + "eval_valid_runtime": 451.2187, + "eval_valid_samples_per_second": 0.426, + "eval_valid_steps_per_second": 0.426, + "step": 37000 + }, + { + "epoch": 0.13801541296449646, + "eval_train_loss": 2.190392255783081, + "eval_train_loss/all": 2.026921510696411, + "eval_train_loss/end_span": 1.0944488048553467, + "eval_train_perplexity/batch": 7.590682506561279, + "eval_train_perplexity/end_span": 2.9875354766845703, + "eval_train_perplexity/fim": 2.077040672302246, + "eval_train_perplexity/first_seq": 15.417238235473633, + "eval_train_perplexity/last_seq": 8.97800350189209, + "eval_train_perplexity/second_seq": 14.290251731872559, + "eval_train_perplexity/seq": 8.740778923034668, + "eval_train_reconstruction/all": 0.28383156657218933, + "eval_train_reconstruction/end_span": 0.7529206871986389, + "eval_train_reconstruction/fim": 0.14675483107566833, + "eval_train_reconstruction/first_seq": 0.15232689678668976, + "eval_train_reconstruction/last_seq": 0.32601267099380493, + "eval_train_reconstruction/second_seq": 0.18055270612239838, + "eval_train_runtime": 443.4465, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 37000 + }, + { + "epoch": 0.13805271442745984, + "grad_norm": 0.3586404323577881, + "learning_rate": 0.0006, + "loss": 2.0521, + "step": 37010 + }, + { + "epoch": 0.13809001589042322, + "grad_norm": 0.5074976682662964, + "learning_rate": 0.0006, + "loss": 2.346, + "step": 37020 + }, + { + "epoch": 0.1381273173533866, + "grad_norm": 0.3678711950778961, + "learning_rate": 0.0006, + "loss": 2.2573, + "step": 37030 + }, + { + "epoch": 0.13816461881634998, + "grad_norm": 0.3698018193244934, + "learning_rate": 0.0006, + "loss": 2.1882, + "step": 37040 + }, + { + 
"epoch": 0.13820192027931336, + "grad_norm": 0.5773909687995911, + "learning_rate": 0.0006, + "loss": 2.2748, + "step": 37050 + }, + { + "epoch": 0.13823922174227674, + "grad_norm": 0.4307749271392822, + "learning_rate": 0.0006, + "loss": 2.3183, + "step": 37060 + }, + { + "epoch": 0.13827652320524011, + "grad_norm": 0.442703515291214, + "learning_rate": 0.0006, + "loss": 2.1732, + "step": 37070 + }, + { + "epoch": 0.1383138246682035, + "grad_norm": 0.37716513872146606, + "learning_rate": 0.0006, + "loss": 2.2665, + "step": 37080 + }, + { + "epoch": 0.13835112613116687, + "grad_norm": 0.3718627691268921, + "learning_rate": 0.0006, + "loss": 2.0858, + "step": 37090 + }, + { + "epoch": 0.13838842759413025, + "grad_norm": 0.29055407643318176, + "learning_rate": 0.0006, + "loss": 2.1609, + "step": 37100 + }, + { + "epoch": 0.13842572905709363, + "grad_norm": 0.296568363904953, + "learning_rate": 0.0006, + "loss": 2.2666, + "step": 37110 + }, + { + "epoch": 0.13846303052005698, + "grad_norm": 0.27646544575691223, + "learning_rate": 0.0006, + "loss": 2.1723, + "step": 37120 + }, + { + "epoch": 0.13850033198302036, + "grad_norm": 0.5268688797950745, + "learning_rate": 0.0006, + "loss": 2.2791, + "step": 37130 + }, + { + "epoch": 0.13853763344598374, + "grad_norm": 0.5858958959579468, + "learning_rate": 0.0006, + "loss": 2.1206, + "step": 37140 + }, + { + "epoch": 0.13857493490894712, + "grad_norm": 0.419019877910614, + "learning_rate": 0.0006, + "loss": 2.3462, + "step": 37150 + }, + { + "epoch": 0.1386122363719105, + "grad_norm": 0.3269574046134949, + "learning_rate": 0.0006, + "loss": 2.2808, + "step": 37160 + }, + { + "epoch": 0.13864953783487388, + "grad_norm": 0.30331575870513916, + "learning_rate": 0.0006, + "loss": 2.1883, + "step": 37170 + }, + { + "epoch": 0.13868683929783726, + "grad_norm": 0.269971638917923, + "learning_rate": 0.0006, + "loss": 2.2595, + "step": 37180 + }, + { + "epoch": 0.13872414076080064, + "grad_norm": 0.5062397718429565, + "learning_rate": 
0.0006, + "loss": 2.2745, + "step": 37190 + }, + { + "epoch": 0.13876144222376402, + "grad_norm": 0.31714168190956116, + "learning_rate": 0.0006, + "loss": 2.3565, + "step": 37200 + }, + { + "epoch": 0.1387987436867274, + "grad_norm": 0.4187866151332855, + "learning_rate": 0.0006, + "loss": 2.149, + "step": 37210 + }, + { + "epoch": 0.13883604514969078, + "grad_norm": 0.588157594203949, + "learning_rate": 0.0006, + "loss": 2.3438, + "step": 37220 + }, + { + "epoch": 0.13887334661265416, + "grad_norm": 0.24843518435955048, + "learning_rate": 0.0006, + "loss": 2.1608, + "step": 37230 + }, + { + "epoch": 0.13891064807561754, + "grad_norm": 0.31451448798179626, + "learning_rate": 0.0006, + "loss": 2.0597, + "step": 37240 + }, + { + "epoch": 0.1389479495385809, + "grad_norm": 0.5415652990341187, + "learning_rate": 0.0006, + "loss": 2.2312, + "step": 37250 + }, + { + "epoch": 0.1389479495385809, + "eval_valid_loss": 2.193481206893921, + "eval_valid_loss/all": 2.0567879676818848, + "eval_valid_loss/end_span": 1.4522372484207153, + "eval_valid_perplexity/batch": 7.8208088874816895, + "eval_valid_perplexity/end_span": 4.27266263961792, + "eval_valid_perplexity/fim": 2.6227657794952393, + "eval_valid_perplexity/first_seq": 14.819912910461426, + "eval_valid_perplexity/last_seq": 9.189698219299316, + "eval_valid_perplexity/second_seq": 13.640877723693848, + "eval_valid_perplexity/seq": 8.8234224319458, + "eval_valid_reconstruction/all": 0.29395154118537903, + "eval_valid_reconstruction/end_span": 0.6777172088623047, + "eval_valid_reconstruction/fim": 0.18888810276985168, + "eval_valid_reconstruction/first_seq": 0.17052015662193298, + "eval_valid_reconstruction/last_seq": 0.3207370340824127, + "eval_valid_reconstruction/second_seq": 0.19428203999996185, + "eval_valid_runtime": 477.1589, + "eval_valid_samples_per_second": 0.402, + "eval_valid_steps_per_second": 0.402, + "step": 37250 + }, + { + "epoch": 0.1389479495385809, + "eval_train_loss": 2.1897878646850586, + 
"eval_train_loss/all": 2.0264925956726074, + "eval_train_loss/end_span": 1.380228877067566, + "eval_train_perplexity/batch": 7.587427616119385, + "eval_train_perplexity/end_span": 3.97581148147583, + "eval_train_perplexity/fim": 1.9517134428024292, + "eval_train_perplexity/first_seq": 15.465436935424805, + "eval_train_perplexity/last_seq": 8.463299751281738, + "eval_train_perplexity/second_seq": 13.96175479888916, + "eval_train_perplexity/seq": 8.7344331741333, + "eval_train_reconstruction/all": 0.28399357199668884, + "eval_train_reconstruction/end_span": 0.6885212659835815, + "eval_train_reconstruction/fim": 0.13303834199905396, + "eval_train_reconstruction/first_seq": 0.14903220534324646, + "eval_train_reconstruction/last_seq": 0.3438272774219513, + "eval_train_reconstruction/second_seq": 0.1898014098405838, + "eval_train_runtime": 457.6325, + "eval_train_samples_per_second": 0.42, + "eval_train_steps_per_second": 0.42, + "step": 37250 + }, + { + "epoch": 0.13898525100154427, + "grad_norm": 0.30421751737594604, + "learning_rate": 0.0006, + "loss": 2.2597, + "step": 37260 + }, + { + "epoch": 0.13902255246450765, + "grad_norm": 0.2780909240245819, + "learning_rate": 0.0006, + "loss": 2.2215, + "step": 37270 + }, + { + "epoch": 0.13905985392747103, + "grad_norm": 0.371232807636261, + "learning_rate": 0.0006, + "loss": 2.0563, + "step": 37280 + }, + { + "epoch": 0.1390971553904344, + "grad_norm": 0.37605947256088257, + "learning_rate": 0.0006, + "loss": 2.1977, + "step": 37290 + }, + { + "epoch": 0.1391344568533978, + "grad_norm": 0.28731659054756165, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 37300 + }, + { + "epoch": 0.13917175831636117, + "grad_norm": 0.30940088629722595, + "learning_rate": 0.0006, + "loss": 2.2093, + "step": 37310 + }, + { + "epoch": 0.13920905977932455, + "grad_norm": 0.19710780680179596, + "learning_rate": 0.0006, + "loss": 2.3338, + "step": 37320 + }, + { + "epoch": 0.13924636124228792, + "grad_norm": 0.3834051787853241, + 
"learning_rate": 0.0006, + "loss": 2.2329, + "step": 37330 + }, + { + "epoch": 0.1392836627052513, + "grad_norm": 0.3981872498989105, + "learning_rate": 0.0006, + "loss": 2.1186, + "step": 37340 + }, + { + "epoch": 0.13932096416821468, + "grad_norm": 0.28967079520225525, + "learning_rate": 0.0006, + "loss": 2.1897, + "step": 37350 + }, + { + "epoch": 0.13935826563117806, + "grad_norm": 0.4256758689880371, + "learning_rate": 0.0006, + "loss": 2.0873, + "step": 37360 + }, + { + "epoch": 0.13939556709414144, + "grad_norm": 0.25219064950942993, + "learning_rate": 0.0006, + "loss": 2.3756, + "step": 37370 + }, + { + "epoch": 0.13943286855710482, + "grad_norm": 0.37683984637260437, + "learning_rate": 0.0006, + "loss": 2.1215, + "step": 37380 + }, + { + "epoch": 0.13947017002006817, + "grad_norm": 0.27527502179145813, + "learning_rate": 0.0006, + "loss": 2.2364, + "step": 37390 + }, + { + "epoch": 0.13950747148303155, + "grad_norm": 0.2603079378604889, + "learning_rate": 0.0006, + "loss": 2.2582, + "step": 37400 + }, + { + "epoch": 0.13954477294599493, + "grad_norm": 0.4235813319683075, + "learning_rate": 0.0006, + "loss": 2.1134, + "step": 37410 + }, + { + "epoch": 0.1395820744089583, + "grad_norm": 0.27004507184028625, + "learning_rate": 0.0006, + "loss": 2.2901, + "step": 37420 + }, + { + "epoch": 0.1396193758719217, + "grad_norm": 0.42476868629455566, + "learning_rate": 0.0006, + "loss": 2.1986, + "step": 37430 + }, + { + "epoch": 0.13965667733488507, + "grad_norm": 0.28174763917922974, + "learning_rate": 0.0006, + "loss": 2.4021, + "step": 37440 + }, + { + "epoch": 0.13969397879784845, + "grad_norm": 0.2348509579896927, + "learning_rate": 0.0006, + "loss": 2.1539, + "step": 37450 + }, + { + "epoch": 0.13973128026081183, + "grad_norm": 0.45381930470466614, + "learning_rate": 0.0006, + "loss": 2.2119, + "step": 37460 + }, + { + "epoch": 0.1397685817237752, + "grad_norm": 0.26238390803337097, + "learning_rate": 0.0006, + "loss": 2.2247, + "step": 37470 + }, + { + 
"epoch": 0.1398058831867386, + "grad_norm": 0.34934428334236145, + "learning_rate": 0.0006, + "loss": 2.3145, + "step": 37480 + }, + { + "epoch": 0.13984318464970197, + "grad_norm": 0.7319929599761963, + "learning_rate": 0.0006, + "loss": 2.0833, + "step": 37490 + }, + { + "epoch": 0.13988048611266535, + "grad_norm": 0.349892258644104, + "learning_rate": 0.0006, + "loss": 2.2869, + "step": 37500 + }, + { + "epoch": 0.13988048611266535, + "eval_valid_loss": 2.1894142627716064, + "eval_valid_loss/all": 2.0526771545410156, + "eval_valid_loss/end_span": 1.309064269065857, + "eval_valid_perplexity/batch": 7.788724899291992, + "eval_valid_perplexity/end_span": 3.702707290649414, + "eval_valid_perplexity/fim": 2.3993947505950928, + "eval_valid_perplexity/first_seq": 14.901956558227539, + "eval_valid_perplexity/last_seq": 8.664643287658691, + "eval_valid_perplexity/second_seq": 14.094058990478516, + "eval_valid_perplexity/seq": 8.78001880645752, + "eval_valid_reconstruction/all": 0.29504337906837463, + "eval_valid_reconstruction/end_span": 0.6959642767906189, + "eval_valid_reconstruction/fim": 0.1751689612865448, + "eval_valid_reconstruction/first_seq": 0.16809917986392975, + "eval_valid_reconstruction/last_seq": 0.3367300033569336, + "eval_valid_reconstruction/second_seq": 0.1871086061000824, + "eval_valid_runtime": 447.0647, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 37500 + }, + { + "epoch": 0.13988048611266535, + "eval_train_loss": 2.186380624771118, + "eval_train_loss/all": 2.0230963230133057, + "eval_train_loss/end_span": 1.274729609489441, + "eval_train_perplexity/batch": 7.561702251434326, + "eval_train_perplexity/end_span": 3.5777339935302734, + "eval_train_perplexity/fim": 2.0760865211486816, + "eval_train_perplexity/first_seq": 15.564308166503906, + "eval_train_perplexity/last_seq": 9.029070854187012, + "eval_train_perplexity/second_seq": 14.161445617675781, + "eval_train_perplexity/seq": 8.704472541809082, + 
"eval_train_reconstruction/all": 0.28498876094818115, + "eval_train_reconstruction/end_span": 0.7062342166900635, + "eval_train_reconstruction/fim": 0.1469610631465912, + "eval_train_reconstruction/first_seq": 0.15137368440628052, + "eval_train_reconstruction/last_seq": 0.3217551112174988, + "eval_train_reconstruction/second_seq": 0.18659529089927673, + "eval_train_runtime": 439.6355, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 37500 + }, + { + "epoch": 0.13991778757562873, + "grad_norm": 0.398626446723938, + "learning_rate": 0.0006, + "loss": 2.1211, + "step": 37510 + }, + { + "epoch": 0.1399550890385921, + "grad_norm": 0.3307296931743622, + "learning_rate": 0.0006, + "loss": 2.2548, + "step": 37520 + }, + { + "epoch": 0.13999239050155546, + "grad_norm": 0.4284118413925171, + "learning_rate": 0.0006, + "loss": 2.1171, + "step": 37530 + }, + { + "epoch": 0.14002969196451884, + "grad_norm": 0.6169320344924927, + "learning_rate": 0.0006, + "loss": 2.2246, + "step": 37540 + }, + { + "epoch": 0.14006699342748222, + "grad_norm": 0.6513209939002991, + "learning_rate": 0.0006, + "loss": 2.0705, + "step": 37550 + }, + { + "epoch": 0.1401042948904456, + "grad_norm": 0.4731537103652954, + "learning_rate": 0.0006, + "loss": 2.2557, + "step": 37560 + }, + { + "epoch": 0.14014159635340898, + "grad_norm": 0.4221716523170471, + "learning_rate": 0.0006, + "loss": 2.1867, + "step": 37570 + }, + { + "epoch": 0.14017889781637236, + "grad_norm": 0.29802027344703674, + "learning_rate": 0.0006, + "loss": 2.3416, + "step": 37580 + }, + { + "epoch": 0.14021619927933573, + "grad_norm": 0.4363097846508026, + "learning_rate": 0.0006, + "loss": 2.2631, + "step": 37590 + }, + { + "epoch": 0.14025350074229911, + "grad_norm": 0.47037097811698914, + "learning_rate": 0.0006, + "loss": 2.0659, + "step": 37600 + }, + { + "epoch": 0.1402908022052625, + "grad_norm": 0.3291166126728058, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 37610 + }, + { 
+ "epoch": 0.14032810366822587, + "grad_norm": 0.2664014995098114, + "learning_rate": 0.0006, + "loss": 2.2539, + "step": 37620 + }, + { + "epoch": 0.14036540513118925, + "grad_norm": 0.413257360458374, + "learning_rate": 0.0006, + "loss": 2.05, + "step": 37630 + }, + { + "epoch": 0.14040270659415263, + "grad_norm": 0.4351906180381775, + "learning_rate": 0.0006, + "loss": 2.2812, + "step": 37640 + }, + { + "epoch": 0.140440008057116, + "grad_norm": 0.4005521535873413, + "learning_rate": 0.0006, + "loss": 2.309, + "step": 37650 + }, + { + "epoch": 0.1404773095200794, + "grad_norm": 0.28974542021751404, + "learning_rate": 0.0006, + "loss": 2.1474, + "step": 37660 + }, + { + "epoch": 0.14051461098304274, + "grad_norm": 0.40458109974861145, + "learning_rate": 0.0006, + "loss": 2.1971, + "step": 37670 + }, + { + "epoch": 0.14055191244600612, + "grad_norm": 0.3033756911754608, + "learning_rate": 0.0006, + "loss": 2.3326, + "step": 37680 + }, + { + "epoch": 0.1405892139089695, + "grad_norm": 0.2973169684410095, + "learning_rate": 0.0006, + "loss": 2.2129, + "step": 37690 + }, + { + "epoch": 0.14062651537193288, + "grad_norm": 1.414031744003296, + "learning_rate": 0.0006, + "loss": 2.3949, + "step": 37700 + }, + { + "epoch": 0.14066381683489626, + "grad_norm": 0.29613256454467773, + "learning_rate": 0.0006, + "loss": 2.1206, + "step": 37710 + }, + { + "epoch": 0.14070111829785964, + "grad_norm": 0.5524561405181885, + "learning_rate": 0.0006, + "loss": 2.2234, + "step": 37720 + }, + { + "epoch": 0.14073841976082302, + "grad_norm": 0.2966955900192261, + "learning_rate": 0.0006, + "loss": 2.2833, + "step": 37730 + }, + { + "epoch": 0.1407757212237864, + "grad_norm": 0.3470619320869446, + "learning_rate": 0.0006, + "loss": 2.0767, + "step": 37740 + }, + { + "epoch": 0.14081302268674978, + "grad_norm": 0.5276867151260376, + "learning_rate": 0.0006, + "loss": 2.2042, + "step": 37750 + }, + { + "epoch": 0.14081302268674978, + "eval_valid_loss": 2.1855099201202393, + 
"eval_valid_loss/all": 2.049201726913452, + "eval_valid_loss/end_span": 1.2416541576385498, + "eval_valid_perplexity/batch": 7.761702537536621, + "eval_valid_perplexity/end_span": 3.461334228515625, + "eval_valid_perplexity/fim": 2.5775554180145264, + "eval_valid_perplexity/first_seq": 14.989958763122559, + "eval_valid_perplexity/last_seq": 8.888986587524414, + "eval_valid_perplexity/second_seq": 13.50389289855957, + "eval_valid_perplexity/seq": 8.756503105163574, + "eval_valid_reconstruction/all": 0.29580023884773254, + "eval_valid_reconstruction/end_span": 0.7048439979553223, + "eval_valid_reconstruction/fim": 0.19116656482219696, + "eval_valid_reconstruction/first_seq": 0.16415934264659882, + "eval_valid_reconstruction/last_seq": 0.3353293538093567, + "eval_valid_reconstruction/second_seq": 0.19991694390773773, + "eval_valid_runtime": 445.6358, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 37750 + }, + { + "epoch": 0.14081302268674978, + "eval_train_loss": 2.1829607486724854, + "eval_train_loss/all": 2.0205061435699463, + "eval_train_loss/end_span": 1.20808744430542, + "eval_train_perplexity/batch": 7.542141437530518, + "eval_train_perplexity/end_span": 3.3470771312713623, + "eval_train_perplexity/fim": 1.9648380279541016, + "eval_train_perplexity/first_seq": 15.165997505187988, + "eval_train_perplexity/last_seq": 8.874076843261719, + "eval_train_perplexity/second_seq": 14.075630187988281, + "eval_train_perplexity/seq": 8.683123588562012, + "eval_train_reconstruction/all": 0.28546348214149475, + "eval_train_reconstruction/end_span": 0.7152066826820374, + "eval_train_reconstruction/fim": 0.13574406504631042, + "eval_train_reconstruction/first_seq": 0.15827050805091858, + "eval_train_reconstruction/last_seq": 0.3267575204372406, + "eval_train_reconstruction/second_seq": 0.18830057978630066, + "eval_train_runtime": 444.2895, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 
37750 + }, + { + "epoch": 0.14085032414971316, + "grad_norm": 0.41219717264175415, + "learning_rate": 0.0006, + "loss": 2.2028, + "step": 37760 + }, + { + "epoch": 0.14088762561267654, + "grad_norm": 0.3070066571235657, + "learning_rate": 0.0006, + "loss": 2.3433, + "step": 37770 + }, + { + "epoch": 0.14092492707563992, + "grad_norm": 0.35173508524894714, + "learning_rate": 0.0006, + "loss": 2.2711, + "step": 37780 + }, + { + "epoch": 0.1409622285386033, + "grad_norm": 0.3667271137237549, + "learning_rate": 0.0006, + "loss": 2.2404, + "step": 37790 + }, + { + "epoch": 0.14099953000156668, + "grad_norm": 0.3890726566314697, + "learning_rate": 0.0006, + "loss": 2.1645, + "step": 37800 + }, + { + "epoch": 0.14103683146453003, + "grad_norm": 0.2654382288455963, + "learning_rate": 0.0006, + "loss": 2.2497, + "step": 37810 + }, + { + "epoch": 0.1410741329274934, + "grad_norm": 0.32539668679237366, + "learning_rate": 0.0006, + "loss": 2.3609, + "step": 37820 + }, + { + "epoch": 0.14111143439045679, + "grad_norm": 0.4421609044075012, + "learning_rate": 0.0006, + "loss": 2.0216, + "step": 37830 + }, + { + "epoch": 0.14114873585342017, + "grad_norm": 0.47757643461227417, + "learning_rate": 0.0006, + "loss": 2.3427, + "step": 37840 + }, + { + "epoch": 0.14118603731638354, + "grad_norm": 0.4481562077999115, + "learning_rate": 0.0006, + "loss": 2.1711, + "step": 37850 + }, + { + "epoch": 0.14122333877934692, + "grad_norm": 0.39654573798179626, + "learning_rate": 0.0006, + "loss": 2.1789, + "step": 37860 + }, + { + "epoch": 0.1412606402423103, + "grad_norm": 0.3810980021953583, + "learning_rate": 0.0006, + "loss": 2.3056, + "step": 37870 + }, + { + "epoch": 0.14129794170527368, + "grad_norm": 0.3328372836112976, + "learning_rate": 0.0006, + "loss": 2.2711, + "step": 37880 + }, + { + "epoch": 0.14133524316823706, + "grad_norm": 0.28349822759628296, + "learning_rate": 0.0006, + "loss": 2.3089, + "step": 37890 + }, + { + "epoch": 0.14137254463120044, + "grad_norm": 
0.32313019037246704, + "learning_rate": 0.0006, + "loss": 2.2739, + "step": 37900 + }, + { + "epoch": 0.14140984609416382, + "grad_norm": 0.2966515123844147, + "learning_rate": 0.0006, + "loss": 2.1774, + "step": 37910 + }, + { + "epoch": 0.1414471475571272, + "grad_norm": 0.29412075877189636, + "learning_rate": 0.0006, + "loss": 2.2731, + "step": 37920 + }, + { + "epoch": 0.14148444902009058, + "grad_norm": 0.38430649042129517, + "learning_rate": 0.0006, + "loss": 2.2892, + "step": 37930 + }, + { + "epoch": 0.14152175048305393, + "grad_norm": 0.2299536168575287, + "learning_rate": 0.0006, + "loss": 2.2372, + "step": 37940 + }, + { + "epoch": 0.1415590519460173, + "grad_norm": 0.4178551733493805, + "learning_rate": 0.0006, + "loss": 2.1132, + "step": 37950 + }, + { + "epoch": 0.1415963534089807, + "grad_norm": 0.3596700131893158, + "learning_rate": 0.0006, + "loss": 2.2693, + "step": 37960 + }, + { + "epoch": 0.14163365487194407, + "grad_norm": 0.3839450478553772, + "learning_rate": 0.0006, + "loss": 2.1255, + "step": 37970 + }, + { + "epoch": 0.14167095633490745, + "grad_norm": 0.30144158005714417, + "learning_rate": 0.0006, + "loss": 2.0404, + "step": 37980 + }, + { + "epoch": 0.14170825779787083, + "grad_norm": 0.31339845061302185, + "learning_rate": 0.0006, + "loss": 2.1825, + "step": 37990 + }, + { + "epoch": 0.1417455592608342, + "grad_norm": 0.4052589535713196, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 38000 + }, + { + "epoch": 0.1417455592608342, + "eval_valid_loss": 2.189795970916748, + "eval_valid_loss/all": 2.0532727241516113, + "eval_valid_loss/end_span": 1.2837297916412354, + "eval_valid_perplexity/batch": 7.793365001678467, + "eval_valid_perplexity/end_span": 3.610079526901245, + "eval_valid_perplexity/fim": 2.4464688301086426, + "eval_valid_perplexity/first_seq": 14.95216178894043, + "eval_valid_perplexity/last_seq": 9.208413124084473, + "eval_valid_perplexity/second_seq": 13.854787826538086, + "eval_valid_perplexity/seq": 
8.790759086608887, + "eval_valid_reconstruction/all": 0.2948191463947296, + "eval_valid_reconstruction/end_span": 0.7001326680183411, + "eval_valid_reconstruction/fim": 0.17924316227436066, + "eval_valid_reconstruction/first_seq": 0.16298994421958923, + "eval_valid_reconstruction/last_seq": 0.32132667303085327, + "eval_valid_reconstruction/second_seq": 0.19270989298820496, + "eval_valid_runtime": 443.7257, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 38000 + }, + { + "epoch": 0.1417455592608342, + "eval_train_loss": 2.184213876724243, + "eval_train_loss/all": 2.021416425704956, + "eval_train_loss/end_span": 1.2455936670303345, + "eval_train_perplexity/batch": 7.549009799957275, + "eval_train_perplexity/end_span": 3.4749972820281982, + "eval_train_perplexity/fim": 1.9270610809326172, + "eval_train_perplexity/first_seq": 15.300375938415527, + "eval_train_perplexity/last_seq": 8.896334648132324, + "eval_train_perplexity/second_seq": 14.450939178466797, + "eval_train_perplexity/seq": 8.691614151000977, + "eval_train_reconstruction/all": 0.28529834747314453, + "eval_train_reconstruction/end_span": 0.708906352519989, + "eval_train_reconstruction/fim": 0.13197630643844604, + "eval_train_reconstruction/first_seq": 0.15304265916347504, + "eval_train_reconstruction/last_seq": 0.32820364832878113, + "eval_train_reconstruction/second_seq": 0.1774311363697052, + "eval_train_runtime": 439.8871, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 38000 + }, + { + "epoch": 0.1417828607237976, + "grad_norm": 0.33244627714157104, + "learning_rate": 0.0006, + "loss": 2.06, + "step": 38010 + }, + { + "epoch": 0.14182016218676097, + "grad_norm": 0.2743273973464966, + "learning_rate": 0.0006, + "loss": 2.2142, + "step": 38020 + }, + { + "epoch": 0.14185746364972435, + "grad_norm": 0.2925772964954376, + "learning_rate": 0.0006, + "loss": 2.2952, + "step": 38030 + }, + { + "epoch": 0.14189476511268773, 
+ "grad_norm": 0.2849388122558594, + "learning_rate": 0.0006, + "loss": 2.0342, + "step": 38040 + }, + { + "epoch": 0.1419320665756511, + "grad_norm": 0.3199794888496399, + "learning_rate": 0.0006, + "loss": 2.2551, + "step": 38050 + }, + { + "epoch": 0.14196936803861449, + "grad_norm": 0.5026946067810059, + "learning_rate": 0.0006, + "loss": 2.2631, + "step": 38060 + }, + { + "epoch": 0.14200666950157786, + "grad_norm": 0.5329354405403137, + "learning_rate": 0.0006, + "loss": 2.0648, + "step": 38070 + }, + { + "epoch": 0.14204397096454122, + "grad_norm": 0.3590565323829651, + "learning_rate": 0.0006, + "loss": 2.2043, + "step": 38080 + }, + { + "epoch": 0.1420812724275046, + "grad_norm": 0.23944498598575592, + "learning_rate": 0.0006, + "loss": 2.2072, + "step": 38090 + }, + { + "epoch": 0.14211857389046798, + "grad_norm": 0.3348518908023834, + "learning_rate": 0.0006, + "loss": 2.1535, + "step": 38100 + }, + { + "epoch": 0.14215587535343135, + "grad_norm": 0.33910489082336426, + "learning_rate": 0.0006, + "loss": 2.1859, + "step": 38110 + }, + { + "epoch": 0.14219317681639473, + "grad_norm": 0.4271278977394104, + "learning_rate": 0.0006, + "loss": 2.1528, + "step": 38120 + }, + { + "epoch": 0.1422304782793581, + "grad_norm": 0.30429255962371826, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 38130 + }, + { + "epoch": 0.1422677797423215, + "grad_norm": 0.3462347984313965, + "learning_rate": 0.0006, + "loss": 2.2237, + "step": 38140 + }, + { + "epoch": 0.14230508120528487, + "grad_norm": 0.40039196610450745, + "learning_rate": 0.0006, + "loss": 2.2304, + "step": 38150 + }, + { + "epoch": 0.14234238266824825, + "grad_norm": 0.3167526423931122, + "learning_rate": 0.0006, + "loss": 2.1863, + "step": 38160 + }, + { + "epoch": 0.14237968413121163, + "grad_norm": 1.61554753780365, + "learning_rate": 0.0006, + "loss": 2.2012, + "step": 38170 + }, + { + "epoch": 0.142416985594175, + "grad_norm": 0.40438613295555115, + "learning_rate": 0.0006, + "loss": 2.1969, + 
"step": 38180 + }, + { + "epoch": 0.1424542870571384, + "grad_norm": 0.26215964555740356, + "learning_rate": 0.0006, + "loss": 2.0033, + "step": 38190 + }, + { + "epoch": 0.14249158852010177, + "grad_norm": 0.3340311646461487, + "learning_rate": 0.0006, + "loss": 2.174, + "step": 38200 + }, + { + "epoch": 0.14252888998306515, + "grad_norm": 0.8492041826248169, + "learning_rate": 0.0006, + "loss": 2.0705, + "step": 38210 + }, + { + "epoch": 0.1425661914460285, + "grad_norm": 0.4031490385532379, + "learning_rate": 0.0006, + "loss": 2.0008, + "step": 38220 + }, + { + "epoch": 0.14260349290899188, + "grad_norm": 0.24971507489681244, + "learning_rate": 0.0006, + "loss": 2.3453, + "step": 38230 + }, + { + "epoch": 0.14264079437195526, + "grad_norm": 0.2503451406955719, + "learning_rate": 0.0006, + "loss": 2.1949, + "step": 38240 + }, + { + "epoch": 0.14267809583491864, + "grad_norm": 0.32935449481010437, + "learning_rate": 0.0006, + "loss": 2.2242, + "step": 38250 + }, + { + "epoch": 0.14267809583491864, + "eval_valid_loss": 2.1883351802825928, + "eval_valid_loss/all": 2.0515317916870117, + "eval_valid_loss/end_span": 1.2515901327133179, + "eval_valid_perplexity/batch": 7.77980899810791, + "eval_valid_perplexity/end_span": 3.4958975315093994, + "eval_valid_perplexity/fim": 2.6007068157196045, + "eval_valid_perplexity/first_seq": 14.906693458557129, + "eval_valid_perplexity/last_seq": 9.111199378967285, + "eval_valid_perplexity/second_seq": 13.912128448486328, + "eval_valid_perplexity/seq": 8.77025318145752, + "eval_valid_reconstruction/all": 0.29525354504585266, + "eval_valid_reconstruction/end_span": 0.7095208764076233, + "eval_valid_reconstruction/fim": 0.19262340664863586, + "eval_valid_reconstruction/first_seq": 0.16786350309848785, + "eval_valid_reconstruction/last_seq": 0.32386600971221924, + "eval_valid_reconstruction/second_seq": 0.18942970037460327, + "eval_valid_runtime": 435.6207, + "eval_valid_samples_per_second": 0.441, + "eval_valid_steps_per_second": 
0.441, + "step": 38250 + }, + { + "epoch": 0.14267809583491864, + "eval_train_loss": 2.185596227645874, + "eval_train_loss/all": 2.022514820098877, + "eval_train_loss/end_span": 1.203152060508728, + "eval_train_perplexity/batch": 7.557306289672852, + "eval_train_perplexity/end_span": 3.3305985927581787, + "eval_train_perplexity/fim": 2.2026145458221436, + "eval_train_perplexity/first_seq": 15.643232345581055, + "eval_train_perplexity/last_seq": 8.671521186828613, + "eval_train_perplexity/second_seq": 14.051478385925293, + "eval_train_perplexity/seq": 8.695631980895996, + "eval_train_reconstruction/all": 0.2851147949695587, + "eval_train_reconstruction/end_span": 0.7232770323753357, + "eval_train_reconstruction/fim": 0.1601942479610443, + "eval_train_reconstruction/first_seq": 0.14504168927669525, + "eval_train_reconstruction/last_seq": 0.34167855978012085, + "eval_train_reconstruction/second_seq": 0.18636374175548553, + "eval_train_runtime": 445.3835, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 38250 + }, + { + "epoch": 0.14271539729788202, + "grad_norm": 0.4091011583805084, + "learning_rate": 0.0006, + "loss": 2.1084, + "step": 38260 + }, + { + "epoch": 0.1427526987608454, + "grad_norm": 0.4456115663051605, + "learning_rate": 0.0006, + "loss": 2.3342, + "step": 38270 + }, + { + "epoch": 0.14279000022380878, + "grad_norm": 0.3741234540939331, + "learning_rate": 0.0006, + "loss": 2.1631, + "step": 38280 + }, + { + "epoch": 0.14282730168677216, + "grad_norm": 0.3200359344482422, + "learning_rate": 0.0006, + "loss": 2.1927, + "step": 38290 + }, + { + "epoch": 0.14286460314973554, + "grad_norm": 0.40099409222602844, + "learning_rate": 0.0006, + "loss": 2.1765, + "step": 38300 + }, + { + "epoch": 0.14290190461269892, + "grad_norm": 0.3403623104095459, + "learning_rate": 0.0006, + "loss": 2.247, + "step": 38310 + }, + { + "epoch": 0.1429392060756623, + "grad_norm": 0.3942701518535614, + "learning_rate": 0.0006, + "loss": 
2.2947, + "step": 38320 + }, + { + "epoch": 0.14297650753862567, + "grad_norm": 0.2758128046989441, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 38330 + }, + { + "epoch": 0.14301380900158905, + "grad_norm": 0.3339759111404419, + "learning_rate": 0.0006, + "loss": 2.1572, + "step": 38340 + }, + { + "epoch": 0.14305111046455243, + "grad_norm": 0.32291874289512634, + "learning_rate": 0.0006, + "loss": 2.0961, + "step": 38350 + }, + { + "epoch": 0.14308841192751579, + "grad_norm": 0.3638797104358673, + "learning_rate": 0.0006, + "loss": 2.4617, + "step": 38360 + }, + { + "epoch": 0.14312571339047916, + "grad_norm": 0.38849329948425293, + "learning_rate": 0.0006, + "loss": 2.0876, + "step": 38370 + }, + { + "epoch": 0.14316301485344254, + "grad_norm": 0.3055025339126587, + "learning_rate": 0.0006, + "loss": 2.3255, + "step": 38380 + }, + { + "epoch": 0.14320031631640592, + "grad_norm": 0.2644047141075134, + "learning_rate": 0.0006, + "loss": 2.2151, + "step": 38390 + }, + { + "epoch": 0.1432376177793693, + "grad_norm": 0.3217112421989441, + "learning_rate": 0.0006, + "loss": 2.365, + "step": 38400 + }, + { + "epoch": 0.14327491924233268, + "grad_norm": 0.5061597228050232, + "learning_rate": 0.0006, + "loss": 2.0295, + "step": 38410 + }, + { + "epoch": 0.14331222070529606, + "grad_norm": 0.3777892291545868, + "learning_rate": 0.0006, + "loss": 2.2745, + "step": 38420 + }, + { + "epoch": 0.14334952216825944, + "grad_norm": 0.2481248527765274, + "learning_rate": 0.0006, + "loss": 2.3187, + "step": 38430 + }, + { + "epoch": 0.14338682363122282, + "grad_norm": 0.28040385246276855, + "learning_rate": 0.0006, + "loss": 2.2729, + "step": 38440 + }, + { + "epoch": 0.1434241250941862, + "grad_norm": 0.32534343004226685, + "learning_rate": 0.0006, + "loss": 2.3541, + "step": 38450 + }, + { + "epoch": 0.14346142655714958, + "grad_norm": 0.6631630063056946, + "learning_rate": 0.0006, + "loss": 2.0943, + "step": 38460 + }, + { + "epoch": 0.14349872802011296, + "grad_norm": 
0.2824140787124634, + "learning_rate": 0.0006, + "loss": 2.0398, + "step": 38470 + }, + { + "epoch": 0.14353602948307634, + "grad_norm": 0.3122090995311737, + "learning_rate": 0.0006, + "loss": 2.1499, + "step": 38480 + }, + { + "epoch": 0.1435733309460397, + "grad_norm": 0.3372127115726471, + "learning_rate": 0.0006, + "loss": 2.2241, + "step": 38490 + }, + { + "epoch": 0.14361063240900307, + "grad_norm": 0.27564600110054016, + "learning_rate": 0.0006, + "loss": 2.2608, + "step": 38500 + }, + { + "epoch": 0.14361063240900307, + "eval_valid_loss": 2.1854591369628906, + "eval_valid_loss/all": 2.048970937728882, + "eval_valid_loss/end_span": 1.3343883752822876, + "eval_valid_perplexity/batch": 7.75991153717041, + "eval_valid_perplexity/end_span": 3.7976725101470947, + "eval_valid_perplexity/fim": 2.2637298107147217, + "eval_valid_perplexity/first_seq": 15.092729568481445, + "eval_valid_perplexity/last_seq": 8.873753547668457, + "eval_valid_perplexity/second_seq": 13.585999488830566, + "eval_valid_perplexity/seq": 8.748307228088379, + "eval_valid_reconstruction/all": 0.29586294293403625, + "eval_valid_reconstruction/end_span": 0.6848065853118896, + "eval_valid_reconstruction/fim": 0.16376984119415283, + "eval_valid_reconstruction/first_seq": 0.16144177317619324, + "eval_valid_reconstruction/last_seq": 0.33164671063423157, + "eval_valid_reconstruction/second_seq": 0.19929875433444977, + "eval_valid_runtime": 440.4517, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 38500 + }, + { + "epoch": 0.14361063240900307, + "eval_train_loss": 2.1836330890655518, + "eval_train_loss/all": 2.0207033157348633, + "eval_train_loss/end_span": 1.2906827926635742, + "eval_train_perplexity/batch": 7.543628692626953, + "eval_train_perplexity/end_span": 3.635267734527588, + "eval_train_perplexity/fim": 2.062436580657959, + "eval_train_perplexity/first_seq": 15.592844009399414, + "eval_train_perplexity/last_seq": 8.973973274230957, + 
"eval_train_perplexity/second_seq": 14.141785621643066, + "eval_train_perplexity/seq": 8.682692527770996, + "eval_train_reconstruction/all": 0.2855425477027893, + "eval_train_reconstruction/end_span": 0.6965012550354004, + "eval_train_reconstruction/fim": 0.14540955424308777, + "eval_train_reconstruction/first_seq": 0.1484614759683609, + "eval_train_reconstruction/last_seq": 0.32603034377098083, + "eval_train_reconstruction/second_seq": 0.18563538789749146, + "eval_train_runtime": 445.9339, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 38500 + }, + { + "epoch": 0.14364793387196645, + "grad_norm": 0.5510097146034241, + "learning_rate": 0.0006, + "loss": 2.0641, + "step": 38510 + }, + { + "epoch": 0.14368523533492983, + "grad_norm": 0.3468186855316162, + "learning_rate": 0.0006, + "loss": 2.0095, + "step": 38520 + }, + { + "epoch": 0.1437225367978932, + "grad_norm": 0.3485722839832306, + "learning_rate": 0.0006, + "loss": 2.2994, + "step": 38530 + }, + { + "epoch": 0.1437598382608566, + "grad_norm": 0.4013533294200897, + "learning_rate": 0.0006, + "loss": 2.2132, + "step": 38540 + }, + { + "epoch": 0.14379713972381997, + "grad_norm": 0.3990823030471802, + "learning_rate": 0.0006, + "loss": 2.2845, + "step": 38550 + }, + { + "epoch": 0.14383444118678335, + "grad_norm": 0.32364559173583984, + "learning_rate": 0.0006, + "loss": 2.2265, + "step": 38560 + }, + { + "epoch": 0.14387174264974673, + "grad_norm": 0.41795510053634644, + "learning_rate": 0.0006, + "loss": 2.0431, + "step": 38570 + }, + { + "epoch": 0.1439090441127101, + "grad_norm": 0.3959707021713257, + "learning_rate": 0.0006, + "loss": 2.3033, + "step": 38580 + }, + { + "epoch": 0.14394634557567348, + "grad_norm": 0.2640005946159363, + "learning_rate": 0.0006, + "loss": 2.1018, + "step": 38590 + }, + { + "epoch": 0.14398364703863686, + "grad_norm": 0.23073157668113708, + "learning_rate": 0.0006, + "loss": 2.3938, + "step": 38600 + }, + { + "epoch": 
0.14402094850160024, + "grad_norm": 0.47721782326698303, + "learning_rate": 0.0006, + "loss": 2.2749, + "step": 38610 + }, + { + "epoch": 0.14405824996456362, + "grad_norm": 0.32546940445899963, + "learning_rate": 0.0006, + "loss": 2.0966, + "step": 38620 + }, + { + "epoch": 0.14409555142752697, + "grad_norm": 0.3643605411052704, + "learning_rate": 0.0006, + "loss": 2.2859, + "step": 38630 + }, + { + "epoch": 0.14413285289049035, + "grad_norm": 0.38824430108070374, + "learning_rate": 0.0006, + "loss": 2.2588, + "step": 38640 + }, + { + "epoch": 0.14417015435345373, + "grad_norm": 0.2761787176132202, + "learning_rate": 0.0006, + "loss": 2.248, + "step": 38650 + }, + { + "epoch": 0.1442074558164171, + "grad_norm": 0.31200122833251953, + "learning_rate": 0.0006, + "loss": 2.2388, + "step": 38660 + }, + { + "epoch": 0.1442447572793805, + "grad_norm": 0.37417352199554443, + "learning_rate": 0.0006, + "loss": 2.2414, + "step": 38670 + }, + { + "epoch": 0.14428205874234387, + "grad_norm": 0.4663110077381134, + "learning_rate": 0.0006, + "loss": 2.1342, + "step": 38680 + }, + { + "epoch": 0.14431936020530725, + "grad_norm": 0.27537286281585693, + "learning_rate": 0.0006, + "loss": 2.3026, + "step": 38690 + }, + { + "epoch": 0.14435666166827063, + "grad_norm": 0.37812408804893494, + "learning_rate": 0.0006, + "loss": 2.2514, + "step": 38700 + }, + { + "epoch": 0.144393963131234, + "grad_norm": 0.2967618405818939, + "learning_rate": 0.0006, + "loss": 2.3221, + "step": 38710 + }, + { + "epoch": 0.1444312645941974, + "grad_norm": 0.25191357731819153, + "learning_rate": 0.0006, + "loss": 2.3426, + "step": 38720 + }, + { + "epoch": 0.14446856605716077, + "grad_norm": 0.25425469875335693, + "learning_rate": 0.0006, + "loss": 2.3489, + "step": 38730 + }, + { + "epoch": 0.14450586752012415, + "grad_norm": 0.49627089500427246, + "learning_rate": 0.0006, + "loss": 2.1892, + "step": 38740 + }, + { + "epoch": 0.14454316898308753, + "grad_norm": 0.30712929368019104, + "learning_rate": 
0.0006, + "loss": 2.2505, + "step": 38750 + }, + { + "epoch": 0.14454316898308753, + "eval_valid_loss": 2.190830945968628, + "eval_valid_loss/all": 2.0538954734802246, + "eval_valid_loss/end_span": 1.206304907798767, + "eval_valid_perplexity/batch": 7.798219680786133, + "eval_valid_perplexity/end_span": 3.341116189956665, + "eval_valid_perplexity/fim": 2.238670825958252, + "eval_valid_perplexity/first_seq": 14.917533874511719, + "eval_valid_perplexity/last_seq": 8.97546100616455, + "eval_valid_perplexity/second_seq": 13.440288543701172, + "eval_valid_perplexity/seq": 8.789621353149414, + "eval_valid_reconstruction/all": 0.29410818219184875, + "eval_valid_reconstruction/end_span": 0.7131261229515076, + "eval_valid_reconstruction/fim": 0.1615636646747589, + "eval_valid_reconstruction/first_seq": 0.16471129655838013, + "eval_valid_reconstruction/last_seq": 0.32855746150016785, + "eval_valid_reconstruction/second_seq": 0.20692259073257446, + "eval_valid_runtime": 434.8465, + "eval_valid_samples_per_second": 0.442, + "eval_valid_steps_per_second": 0.442, + "step": 38750 + }, + { + "epoch": 0.14454316898308753, + "eval_train_loss": 2.1873223781585693, + "eval_train_loss/all": 2.0239546298980713, + "eval_train_loss/end_span": 1.1698559522628784, + "eval_train_perplexity/batch": 7.568195343017578, + "eval_train_perplexity/end_span": 3.2215285301208496, + "eval_train_perplexity/fim": 2.2659363746643066, + "eval_train_perplexity/first_seq": 15.625843048095703, + "eval_train_perplexity/last_seq": 8.544610023498535, + "eval_train_perplexity/second_seq": 14.434331893920898, + "eval_train_perplexity/seq": 8.711257934570312, + "eval_train_reconstruction/all": 0.2842004895210266, + "eval_train_reconstruction/end_span": 0.7243818640708923, + "eval_train_reconstruction/fim": 0.1627395898103714, + "eval_train_reconstruction/first_seq": 0.14909662306308746, + "eval_train_reconstruction/last_seq": 0.3396732807159424, + "eval_train_reconstruction/second_seq": 0.17834196984767914, + 
"eval_train_runtime": 444.2491, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 38750 + }, + { + "epoch": 0.1445804704460509, + "grad_norm": 0.27121207118034363, + "learning_rate": 0.0006, + "loss": 2.1798, + "step": 38760 + }, + { + "epoch": 0.14461777190901426, + "grad_norm": 0.29749730229377747, + "learning_rate": 0.0006, + "loss": 2.1571, + "step": 38770 + }, + { + "epoch": 0.14465507337197764, + "grad_norm": 0.29210036993026733, + "learning_rate": 0.0006, + "loss": 2.3034, + "step": 38780 + }, + { + "epoch": 0.14469237483494102, + "grad_norm": 0.3505750596523285, + "learning_rate": 0.0006, + "loss": 2.3844, + "step": 38790 + }, + { + "epoch": 0.1447296762979044, + "grad_norm": 0.36754652857780457, + "learning_rate": 0.0006, + "loss": 2.1872, + "step": 38800 + }, + { + "epoch": 0.14476697776086778, + "grad_norm": 0.41839709877967834, + "learning_rate": 0.0006, + "loss": 2.2153, + "step": 38810 + }, + { + "epoch": 0.14480427922383116, + "grad_norm": 0.3659198582172394, + "learning_rate": 0.0006, + "loss": 2.1237, + "step": 38820 + }, + { + "epoch": 0.14484158068679454, + "grad_norm": 0.38807281851768494, + "learning_rate": 0.0006, + "loss": 2.288, + "step": 38830 + }, + { + "epoch": 0.14487888214975791, + "grad_norm": 0.38374584913253784, + "learning_rate": 0.0006, + "loss": 2.1562, + "step": 38840 + }, + { + "epoch": 0.1449161836127213, + "grad_norm": 0.32513394951820374, + "learning_rate": 0.0006, + "loss": 2.2898, + "step": 38850 + }, + { + "epoch": 0.14495348507568467, + "grad_norm": 0.4617779850959778, + "learning_rate": 0.0006, + "loss": 1.998, + "step": 38860 + }, + { + "epoch": 0.14499078653864805, + "grad_norm": 0.271077960729599, + "learning_rate": 0.0006, + "loss": 2.0969, + "step": 38870 + }, + { + "epoch": 0.14502808800161143, + "grad_norm": 0.3819454610347748, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 38880 + }, + { + "epoch": 0.1450653894645748, + "grad_norm": 0.41307303309440613, + 
"learning_rate": 0.0006, + "loss": 2.3066, + "step": 38890 + }, + { + "epoch": 0.1451026909275382, + "grad_norm": 0.2576780319213867, + "learning_rate": 0.0006, + "loss": 2.1237, + "step": 38900 + }, + { + "epoch": 0.14513999239050154, + "grad_norm": 0.2712503969669342, + "learning_rate": 0.0006, + "loss": 2.3092, + "step": 38910 + }, + { + "epoch": 0.14517729385346492, + "grad_norm": 0.2829888164997101, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 38920 + }, + { + "epoch": 0.1452145953164283, + "grad_norm": 0.3592173755168915, + "learning_rate": 0.0006, + "loss": 2.1267, + "step": 38930 + }, + { + "epoch": 0.14525189677939168, + "grad_norm": 0.2606058716773987, + "learning_rate": 0.0006, + "loss": 2.2775, + "step": 38940 + }, + { + "epoch": 0.14528919824235506, + "grad_norm": 0.41921767592430115, + "learning_rate": 0.0006, + "loss": 2.3029, + "step": 38950 + }, + { + "epoch": 0.14532649970531844, + "grad_norm": 0.305867463350296, + "learning_rate": 0.0006, + "loss": 2.2173, + "step": 38960 + }, + { + "epoch": 0.14536380116828182, + "grad_norm": 0.3886246681213379, + "learning_rate": 0.0006, + "loss": 2.0951, + "step": 38970 + }, + { + "epoch": 0.1454011026312452, + "grad_norm": 0.2427569329738617, + "learning_rate": 0.0006, + "loss": 2.1716, + "step": 38980 + }, + { + "epoch": 0.14543840409420858, + "grad_norm": 0.29444101452827454, + "learning_rate": 0.0006, + "loss": 2.2433, + "step": 38990 + }, + { + "epoch": 0.14547570555717196, + "grad_norm": 0.5560967326164246, + "learning_rate": 0.0006, + "loss": 2.0578, + "step": 39000 + }, + { + "epoch": 0.14547570555717196, + "eval_valid_loss": 2.1866719722747803, + "eval_valid_loss/all": 2.0504684448242188, + "eval_valid_loss/end_span": 1.1989119052886963, + "eval_valid_perplexity/batch": 7.771540641784668, + "eval_valid_perplexity/end_span": 3.3165063858032227, + "eval_valid_perplexity/fim": 2.623746156692505, + "eval_valid_perplexity/first_seq": 14.870975494384766, + "eval_valid_perplexity/last_seq": 
8.501554489135742, + "eval_valid_perplexity/second_seq": 13.58463191986084, + "eval_valid_perplexity/seq": 8.769083023071289, + "eval_valid_reconstruction/all": 0.2955191433429718, + "eval_valid_reconstruction/end_span": 0.7218106985092163, + "eval_valid_reconstruction/fim": 0.1943131983280182, + "eval_valid_reconstruction/first_seq": 0.16730734705924988, + "eval_valid_reconstruction/last_seq": 0.34504783153533936, + "eval_valid_reconstruction/second_seq": 0.19830520451068878, + "eval_valid_runtime": 441.4787, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 39000 + }, + { + "epoch": 0.14547570555717196, + "eval_train_loss": 2.182609796524048, + "eval_train_loss/all": 2.0202324390411377, + "eval_train_loss/end_span": 1.1714006662368774, + "eval_train_perplexity/batch": 7.540077209472656, + "eval_train_perplexity/end_span": 3.226508617401123, + "eval_train_perplexity/fim": 2.1901917457580566, + "eval_train_perplexity/first_seq": 15.617474555969238, + "eval_train_perplexity/last_seq": 8.474339485168457, + "eval_train_perplexity/second_seq": 14.282693862915039, + "eval_train_perplexity/seq": 8.686859130859375, + "eval_train_reconstruction/all": 0.2858162820339203, + "eval_train_reconstruction/end_span": 0.7327384948730469, + "eval_train_reconstruction/fim": 0.15900816023349762, + "eval_train_reconstruction/first_seq": 0.1498102992773056, + "eval_train_reconstruction/last_seq": 0.34014594554901123, + "eval_train_reconstruction/second_seq": 0.1832265704870224, + "eval_train_runtime": 440.0988, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 39000 + }, + { + "epoch": 0.14551300702013534, + "grad_norm": 0.266509085893631, + "learning_rate": 0.0006, + "loss": 2.348, + "step": 39010 + }, + { + "epoch": 0.14555030848309872, + "grad_norm": 0.4495362341403961, + "learning_rate": 0.0006, + "loss": 2.0626, + "step": 39020 + }, + { + "epoch": 0.1455876099460621, + "grad_norm": 
0.5045959949493408, + "learning_rate": 0.0006, + "loss": 2.2607, + "step": 39030 + }, + { + "epoch": 0.14562491140902545, + "grad_norm": 0.4316059350967407, + "learning_rate": 0.0006, + "loss": 2.1628, + "step": 39040 + }, + { + "epoch": 0.14566221287198883, + "grad_norm": 0.25471043586730957, + "learning_rate": 0.0006, + "loss": 2.3547, + "step": 39050 + }, + { + "epoch": 0.1456995143349522, + "grad_norm": 0.3954010307788849, + "learning_rate": 0.0006, + "loss": 2.0869, + "step": 39060 + }, + { + "epoch": 0.1457368157979156, + "grad_norm": 0.29907774925231934, + "learning_rate": 0.0006, + "loss": 2.3128, + "step": 39070 + }, + { + "epoch": 0.14577411726087897, + "grad_norm": 0.6175805926322937, + "learning_rate": 0.0006, + "loss": 2.061, + "step": 39080 + }, + { + "epoch": 0.14581141872384235, + "grad_norm": 0.2751348912715912, + "learning_rate": 0.0006, + "loss": 2.2104, + "step": 39090 + }, + { + "epoch": 0.14584872018680572, + "grad_norm": 0.2506461441516876, + "learning_rate": 0.0006, + "loss": 2.2631, + "step": 39100 + }, + { + "epoch": 0.1458860216497691, + "grad_norm": 0.3780500292778015, + "learning_rate": 0.0006, + "loss": 2.1573, + "step": 39110 + }, + { + "epoch": 0.14592332311273248, + "grad_norm": 0.39321884512901306, + "learning_rate": 0.0006, + "loss": 2.0159, + "step": 39120 + }, + { + "epoch": 0.14596062457569586, + "grad_norm": 0.444102942943573, + "learning_rate": 0.0006, + "loss": 2.1607, + "step": 39130 + }, + { + "epoch": 0.14599792603865924, + "grad_norm": 0.3425441384315491, + "learning_rate": 0.0006, + "loss": 2.3821, + "step": 39140 + }, + { + "epoch": 0.14603522750162262, + "grad_norm": 0.3816486895084381, + "learning_rate": 0.0006, + "loss": 2.1364, + "step": 39150 + }, + { + "epoch": 0.146072528964586, + "grad_norm": 0.3293783366680145, + "learning_rate": 0.0006, + "loss": 2.2534, + "step": 39160 + }, + { + "epoch": 0.14610983042754938, + "grad_norm": 0.5144869685173035, + "learning_rate": 0.0006, + "loss": 2.3279, + "step": 39170 + }, 
+ { + "epoch": 0.14614713189051273, + "grad_norm": 0.38333553075790405, + "learning_rate": 0.0006, + "loss": 2.2719, + "step": 39180 + }, + { + "epoch": 0.1461844333534761, + "grad_norm": 0.468468576669693, + "learning_rate": 0.0006, + "loss": 2.2092, + "step": 39190 + }, + { + "epoch": 0.1462217348164395, + "grad_norm": 0.35165566205978394, + "learning_rate": 0.0006, + "loss": 2.1865, + "step": 39200 + }, + { + "epoch": 0.14625903627940287, + "grad_norm": 0.31696879863739014, + "learning_rate": 0.0006, + "loss": 2.3367, + "step": 39210 + }, + { + "epoch": 0.14629633774236625, + "grad_norm": 0.38779890537261963, + "learning_rate": 0.0006, + "loss": 2.0163, + "step": 39220 + }, + { + "epoch": 0.14633363920532963, + "grad_norm": 0.45437824726104736, + "learning_rate": 0.0006, + "loss": 2.1689, + "step": 39230 + }, + { + "epoch": 0.146370940668293, + "grad_norm": 0.4191362261772156, + "learning_rate": 0.0006, + "loss": 2.276, + "step": 39240 + }, + { + "epoch": 0.1464082421312564, + "grad_norm": 0.4021952748298645, + "learning_rate": 0.0006, + "loss": 2.3304, + "step": 39250 + }, + { + "epoch": 0.1464082421312564, + "eval_valid_loss": 2.1881847381591797, + "eval_valid_loss/all": 2.0515146255493164, + "eval_valid_loss/end_span": 1.3196762800216675, + "eval_valid_perplexity/batch": 7.779675483703613, + "eval_valid_perplexity/end_span": 3.7422096729278564, + "eval_valid_perplexity/fim": 2.3394548892974854, + "eval_valid_perplexity/first_seq": 14.83576488494873, + "eval_valid_perplexity/last_seq": 8.707879066467285, + "eval_valid_perplexity/second_seq": 13.39087200164795, + "eval_valid_perplexity/seq": 8.7724609375, + "eval_valid_reconstruction/all": 0.29520323872566223, + "eval_valid_reconstruction/end_span": 0.6910721659660339, + "eval_valid_reconstruction/fim": 0.17122948169708252, + "eval_valid_reconstruction/first_seq": 0.1662321239709854, + "eval_valid_reconstruction/last_seq": 0.33893221616744995, + "eval_valid_reconstruction/second_seq": 0.2084398716688156, + 
"eval_valid_runtime": 441.4692, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 39250 + }, + { + "epoch": 0.1464082421312564, + "eval_train_loss": 2.1838743686676025, + "eval_train_loss/all": 2.021038293838501, + "eval_train_loss/end_span": 1.2983967065811157, + "eval_train_perplexity/batch": 7.54615592956543, + "eval_train_perplexity/end_span": 3.6634185314178467, + "eval_train_perplexity/fim": 2.0834388732910156, + "eval_train_perplexity/first_seq": 15.715058326721191, + "eval_train_perplexity/last_seq": 8.913546562194824, + "eval_train_perplexity/second_seq": 14.344841003417969, + "eval_train_perplexity/seq": 8.688185691833496, + "eval_train_reconstruction/all": 0.28544095158576965, + "eval_train_reconstruction/end_span": 0.6970686912536621, + "eval_train_reconstruction/fim": 0.14808900654315948, + "eval_train_reconstruction/first_seq": 0.14919577538967133, + "eval_train_reconstruction/last_seq": 0.32925698161125183, + "eval_train_reconstruction/second_seq": 0.1826738566160202, + "eval_train_runtime": 446.1737, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 39250 + }, + { + "epoch": 0.14644554359421977, + "grad_norm": 0.3238033354282379, + "learning_rate": 0.0006, + "loss": 2.2563, + "step": 39260 + }, + { + "epoch": 0.14648284505718315, + "grad_norm": 0.2854604125022888, + "learning_rate": 0.0006, + "loss": 2.2796, + "step": 39270 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 0.22050493955612183, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 39280 + }, + { + "epoch": 0.1465574479831099, + "grad_norm": 0.2990943491458893, + "learning_rate": 0.0006, + "loss": 2.2977, + "step": 39290 + }, + { + "epoch": 0.14659474944607329, + "grad_norm": 0.34499120712280273, + "learning_rate": 0.0006, + "loss": 2.1519, + "step": 39300 + }, + { + "epoch": 0.14663205090903667, + "grad_norm": 0.3261229991912842, + "learning_rate": 0.0006, + "loss": 2.2659, + "step": 39310 + }, + 
{ + "epoch": 0.14666935237200002, + "grad_norm": 0.514968991279602, + "learning_rate": 0.0006, + "loss": 2.2046, + "step": 39320 + }, + { + "epoch": 0.1467066538349634, + "grad_norm": 0.299498975276947, + "learning_rate": 0.0006, + "loss": 2.212, + "step": 39330 + }, + { + "epoch": 0.14674395529792678, + "grad_norm": 0.2815449833869934, + "learning_rate": 0.0006, + "loss": 2.1101, + "step": 39340 + }, + { + "epoch": 0.14678125676089016, + "grad_norm": 0.3360259532928467, + "learning_rate": 0.0006, + "loss": 2.2708, + "step": 39350 + }, + { + "epoch": 0.14681855822385353, + "grad_norm": 0.25659874081611633, + "learning_rate": 0.0006, + "loss": 2.2434, + "step": 39360 + }, + { + "epoch": 0.14685585968681691, + "grad_norm": 0.262555867433548, + "learning_rate": 0.0006, + "loss": 2.2507, + "step": 39370 + }, + { + "epoch": 0.1468931611497803, + "grad_norm": 0.3213578164577484, + "learning_rate": 0.0006, + "loss": 2.2799, + "step": 39380 + }, + { + "epoch": 0.14693046261274367, + "grad_norm": 0.306950181722641, + "learning_rate": 0.0006, + "loss": 2.0762, + "step": 39390 + }, + { + "epoch": 0.14696776407570705, + "grad_norm": 0.3997628092765808, + "learning_rate": 0.0006, + "loss": 2.2994, + "step": 39400 + }, + { + "epoch": 0.14700506553867043, + "grad_norm": 0.3466789126396179, + "learning_rate": 0.0006, + "loss": 2.1455, + "step": 39410 + }, + { + "epoch": 0.1470423670016338, + "grad_norm": 1.4289065599441528, + "learning_rate": 0.0006, + "loss": 2.3956, + "step": 39420 + }, + { + "epoch": 0.1470796684645972, + "grad_norm": 0.44516193866729736, + "learning_rate": 0.0006, + "loss": 2.4097, + "step": 39430 + }, + { + "epoch": 0.14711696992756057, + "grad_norm": 0.5780780911445618, + "learning_rate": 0.0006, + "loss": 2.1837, + "step": 39440 + }, + { + "epoch": 0.14715427139052395, + "grad_norm": 0.48369303345680237, + "learning_rate": 0.0006, + "loss": 2.1706, + "step": 39450 + }, + { + "epoch": 0.1471915728534873, + "grad_norm": 0.3235512673854828, + "learning_rate": 
0.0006, + "loss": 2.2543, + "step": 39460 + }, + { + "epoch": 0.14722887431645068, + "grad_norm": 0.36539044976234436, + "learning_rate": 0.0006, + "loss": 2.1695, + "step": 39470 + }, + { + "epoch": 0.14726617577941406, + "grad_norm": 0.4007440507411957, + "learning_rate": 0.0006, + "loss": 2.248, + "step": 39480 + }, + { + "epoch": 0.14730347724237744, + "grad_norm": 0.4171707034111023, + "learning_rate": 0.0006, + "loss": 2.2997, + "step": 39490 + }, + { + "epoch": 0.14734077870534082, + "grad_norm": 0.6816478371620178, + "learning_rate": 0.0006, + "loss": 2.0993, + "step": 39500 + }, + { + "epoch": 0.14734077870534082, + "eval_valid_loss": 2.1903152465820312, + "eval_valid_loss/all": 2.0535824298858643, + "eval_valid_loss/end_span": 1.2257473468780518, + "eval_valid_perplexity/batch": 7.795778751373291, + "eval_valid_perplexity/end_span": 3.4067111015319824, + "eval_valid_perplexity/fim": 2.454179048538208, + "eval_valid_perplexity/first_seq": 15.000426292419434, + "eval_valid_perplexity/last_seq": 9.233695983886719, + "eval_valid_perplexity/second_seq": 13.657144546508789, + "eval_valid_perplexity/seq": 8.79093074798584, + "eval_valid_reconstruction/all": 0.2945864498615265, + "eval_valid_reconstruction/end_span": 0.6987942457199097, + "eval_valid_reconstruction/fim": 0.17870406806468964, + "eval_valid_reconstruction/first_seq": 0.16614899039268494, + "eval_valid_reconstruction/last_seq": 0.31883662939071655, + "eval_valid_reconstruction/second_seq": 0.19991496205329895, + "eval_valid_runtime": 448.4343, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 39500 + }, + { + "epoch": 0.14734077870534082, + "eval_train_loss": 2.188676118850708, + "eval_train_loss/all": 2.0258443355560303, + "eval_train_loss/end_span": 1.197270154953003, + "eval_train_perplexity/batch": 7.582510471343994, + "eval_train_perplexity/end_span": 3.311065912246704, + "eval_train_perplexity/fim": 2.2115345001220703, + 
"eval_train_perplexity/first_seq": 15.552600860595703, + "eval_train_perplexity/last_seq": 9.0507230758667, + "eval_train_perplexity/second_seq": 14.266622543334961, + "eval_train_perplexity/seq": 8.73498821258545, + "eval_train_reconstruction/all": 0.2841694951057434, + "eval_train_reconstruction/end_span": 0.7091097831726074, + "eval_train_reconstruction/fim": 0.15855441987514496, + "eval_train_reconstruction/first_seq": 0.1506689041852951, + "eval_train_reconstruction/last_seq": 0.3242662847042084, + "eval_train_reconstruction/second_seq": 0.18369761109352112, + "eval_train_runtime": 440.4241, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 39500 + }, + { + "epoch": 0.1473780801683042, + "grad_norm": 0.3968047797679901, + "learning_rate": 0.0006, + "loss": 2.3984, + "step": 39510 + }, + { + "epoch": 0.14741538163126758, + "grad_norm": 0.3848569393157959, + "learning_rate": 0.0006, + "loss": 1.9794, + "step": 39520 + }, + { + "epoch": 0.14745268309423096, + "grad_norm": 0.39495596289634705, + "learning_rate": 0.0006, + "loss": 2.3312, + "step": 39530 + }, + { + "epoch": 0.14748998455719434, + "grad_norm": 0.3240102231502533, + "learning_rate": 0.0006, + "loss": 2.3865, + "step": 39540 + }, + { + "epoch": 0.14752728602015772, + "grad_norm": 0.30814146995544434, + "learning_rate": 0.0006, + "loss": 2.2658, + "step": 39550 + }, + { + "epoch": 0.1475645874831211, + "grad_norm": 0.23826222121715546, + "learning_rate": 0.0006, + "loss": 2.2999, + "step": 39560 + }, + { + "epoch": 0.14760188894608448, + "grad_norm": 0.2957969903945923, + "learning_rate": 0.0006, + "loss": 2.1771, + "step": 39570 + }, + { + "epoch": 0.14763919040904785, + "grad_norm": 0.3387509286403656, + "learning_rate": 0.0006, + "loss": 2.2318, + "step": 39580 + }, + { + "epoch": 0.14767649187201123, + "grad_norm": 0.38809478282928467, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 39590 + }, + { + "epoch": 0.14771379333497459, + "grad_norm": 
0.2263656109571457, + "learning_rate": 0.0006, + "loss": 2.2301, + "step": 39600 + }, + { + "epoch": 0.14775109479793797, + "grad_norm": 0.34742799401283264, + "learning_rate": 0.0006, + "loss": 2.1429, + "step": 39610 + }, + { + "epoch": 0.14778839626090134, + "grad_norm": 0.30278828740119934, + "learning_rate": 0.0006, + "loss": 2.3659, + "step": 39620 + }, + { + "epoch": 0.14782569772386472, + "grad_norm": 0.41839149594306946, + "learning_rate": 0.0006, + "loss": 2.2158, + "step": 39630 + }, + { + "epoch": 0.1478629991868281, + "grad_norm": 0.8274522423744202, + "learning_rate": 0.0006, + "loss": 2.1349, + "step": 39640 + }, + { + "epoch": 0.14790030064979148, + "grad_norm": 0.4073423445224762, + "learning_rate": 0.0006, + "loss": 2.211, + "step": 39650 + }, + { + "epoch": 0.14793760211275486, + "grad_norm": 0.2895248532295227, + "learning_rate": 0.0006, + "loss": 2.2948, + "step": 39660 + }, + { + "epoch": 0.14797490357571824, + "grad_norm": 0.4076474606990814, + "learning_rate": 0.0006, + "loss": 2.1797, + "step": 39670 + }, + { + "epoch": 0.14801220503868162, + "grad_norm": 0.3137817084789276, + "learning_rate": 0.0006, + "loss": 2.1401, + "step": 39680 + }, + { + "epoch": 0.148049506501645, + "grad_norm": 0.4331076443195343, + "learning_rate": 0.0006, + "loss": 2.3665, + "step": 39690 + }, + { + "epoch": 0.14808680796460838, + "grad_norm": 0.36810386180877686, + "learning_rate": 0.0006, + "loss": 2.2418, + "step": 39700 + }, + { + "epoch": 0.14812410942757176, + "grad_norm": 0.3958509564399719, + "learning_rate": 0.0006, + "loss": 2.0982, + "step": 39710 + }, + { + "epoch": 0.14816141089053514, + "grad_norm": 0.29265254735946655, + "learning_rate": 0.0006, + "loss": 2.2996, + "step": 39720 + }, + { + "epoch": 0.1481987123534985, + "grad_norm": 0.3184722065925598, + "learning_rate": 0.0006, + "loss": 2.2284, + "step": 39730 + }, + { + "epoch": 0.14823601381646187, + "grad_norm": 0.3602912127971649, + "learning_rate": 0.0006, + "loss": 2.3557, + "step": 39740 
+ }, + { + "epoch": 0.14827331527942525, + "grad_norm": 0.2427869737148285, + "learning_rate": 0.0006, + "loss": 2.0999, + "step": 39750 + }, + { + "epoch": 0.14827331527942525, + "eval_valid_loss": 2.1879758834838867, + "eval_valid_loss/all": 2.0516979694366455, + "eval_valid_loss/end_span": 1.1986109018325806, + "eval_valid_perplexity/batch": 7.781102180480957, + "eval_valid_perplexity/end_span": 3.3155081272125244, + "eval_valid_perplexity/fim": 2.242730140686035, + "eval_valid_perplexity/first_seq": 14.989768981933594, + "eval_valid_perplexity/last_seq": 8.808927536010742, + "eval_valid_perplexity/second_seq": 13.770397186279297, + "eval_valid_perplexity/seq": 8.781707763671875, + "eval_valid_reconstruction/all": 0.295195996761322, + "eval_valid_reconstruction/end_span": 0.7189337611198425, + "eval_valid_reconstruction/fim": 0.16208884119987488, + "eval_valid_reconstruction/first_seq": 0.16664613783359528, + "eval_valid_reconstruction/last_seq": 0.33332502841949463, + "eval_valid_reconstruction/second_seq": 0.1973045915365219, + "eval_valid_runtime": 445.8377, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 39750 + }, + { + "epoch": 0.14827331527942525, + "eval_train_loss": 2.185683012008667, + "eval_train_loss/all": 2.0230231285095215, + "eval_train_loss/end_span": 1.1703110933303833, + "eval_train_perplexity/batch": 7.561148643493652, + "eval_train_perplexity/end_span": 3.2229950428009033, + "eval_train_perplexity/fim": 1.995363473892212, + "eval_train_perplexity/first_seq": 15.810620307922363, + "eval_train_perplexity/last_seq": 8.878338813781738, + "eval_train_perplexity/second_seq": 14.030903816223145, + "eval_train_perplexity/seq": 8.708548545837402, + "eval_train_reconstruction/all": 0.2847329378128052, + "eval_train_reconstruction/end_span": 0.7268703579902649, + "eval_train_reconstruction/fim": 0.1391439586877823, + "eval_train_reconstruction/first_seq": 0.1455528438091278, + 
"eval_train_reconstruction/last_seq": 0.3256136476993561, + "eval_train_reconstruction/second_seq": 0.18938815593719482, + "eval_train_runtime": 446.3599, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 39750 + }, + { + "epoch": 0.14831061674238863, + "grad_norm": 0.2707000970840454, + "learning_rate": 0.0006, + "loss": 2.1515, + "step": 39760 + }, + { + "epoch": 0.148347918205352, + "grad_norm": 0.2618013620376587, + "learning_rate": 0.0006, + "loss": 2.1925, + "step": 39770 + }, + { + "epoch": 0.1483852196683154, + "grad_norm": 0.3575701415538788, + "learning_rate": 0.0006, + "loss": 2.4018, + "step": 39780 + }, + { + "epoch": 0.14842252113127877, + "grad_norm": 0.347710520029068, + "learning_rate": 0.0006, + "loss": 2.3249, + "step": 39790 + }, + { + "epoch": 0.14845982259424215, + "grad_norm": 0.3596550524234772, + "learning_rate": 0.0006, + "loss": 2.0406, + "step": 39800 + }, + { + "epoch": 0.14849712405720553, + "grad_norm": 0.27684125304222107, + "learning_rate": 0.0006, + "loss": 2.2831, + "step": 39810 + }, + { + "epoch": 0.1485344255201689, + "grad_norm": 0.38228002190589905, + "learning_rate": 0.0006, + "loss": 2.1313, + "step": 39820 + }, + { + "epoch": 0.14857172698313229, + "grad_norm": 0.5554470419883728, + "learning_rate": 0.0006, + "loss": 2.1681, + "step": 39830 + }, + { + "epoch": 0.14860902844609566, + "grad_norm": 0.5037282705307007, + "learning_rate": 0.0006, + "loss": 2.0012, + "step": 39840 + }, + { + "epoch": 0.14864632990905904, + "grad_norm": 0.43552857637405396, + "learning_rate": 0.0006, + "loss": 2.126, + "step": 39850 + }, + { + "epoch": 0.14868363137202242, + "grad_norm": 0.3754121661186218, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 39860 + }, + { + "epoch": 0.14872093283498578, + "grad_norm": 0.3653680980205536, + "learning_rate": 0.0006, + "loss": 2.336, + "step": 39870 + }, + { + "epoch": 0.14875823429794915, + "grad_norm": 0.4607110917568207, + "learning_rate": 0.0006, + 
"loss": 2.3253, + "step": 39880 + }, + { + "epoch": 0.14879553576091253, + "grad_norm": 0.3847218453884125, + "learning_rate": 0.0006, + "loss": 2.1169, + "step": 39890 + }, + { + "epoch": 0.1488328372238759, + "grad_norm": 0.31450605392456055, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 39900 + }, + { + "epoch": 0.1488701386868393, + "grad_norm": 0.3833411633968353, + "learning_rate": 0.0006, + "loss": 2.2704, + "step": 39910 + }, + { + "epoch": 0.14890744014980267, + "grad_norm": 0.2898140847682953, + "learning_rate": 0.0006, + "loss": 2.2793, + "step": 39920 + }, + { + "epoch": 0.14894474161276605, + "grad_norm": 0.36171936988830566, + "learning_rate": 0.0006, + "loss": 2.066, + "step": 39930 + }, + { + "epoch": 0.14898204307572943, + "grad_norm": 0.4355625510215759, + "learning_rate": 0.0006, + "loss": 2.0816, + "step": 39940 + }, + { + "epoch": 0.1490193445386928, + "grad_norm": 0.2948443591594696, + "learning_rate": 0.0006, + "loss": 2.2564, + "step": 39950 + }, + { + "epoch": 0.1490566460016562, + "grad_norm": 0.33983227610588074, + "learning_rate": 0.0006, + "loss": 2.2262, + "step": 39960 + }, + { + "epoch": 0.14909394746461957, + "grad_norm": 0.29915139079093933, + "learning_rate": 0.0006, + "loss": 2.3576, + "step": 39970 + }, + { + "epoch": 0.14913124892758295, + "grad_norm": 0.318695604801178, + "learning_rate": 0.0006, + "loss": 2.1817, + "step": 39980 + }, + { + "epoch": 0.14916855039054633, + "grad_norm": 0.3670351505279541, + "learning_rate": 0.0006, + "loss": 2.2927, + "step": 39990 + }, + { + "epoch": 0.1492058518535097, + "grad_norm": 0.2786240577697754, + "learning_rate": 0.0006, + "loss": 2.2815, + "step": 40000 + }, + { + "epoch": 0.1492058518535097, + "eval_valid_loss": 2.1893866062164307, + "eval_valid_loss/all": 2.0524966716766357, + "eval_valid_loss/end_span": 1.3062517642974854, + "eval_valid_perplexity/batch": 7.787319183349609, + "eval_valid_perplexity/end_span": 3.692308187484741, + "eval_valid_perplexity/fim": 
2.222747564315796, + "eval_valid_perplexity/first_seq": 14.802478790283203, + "eval_valid_perplexity/last_seq": 8.772000312805176, + "eval_valid_perplexity/second_seq": 13.6126708984375, + "eval_valid_perplexity/seq": 8.781445503234863, + "eval_valid_reconstruction/all": 0.29486528038978577, + "eval_valid_reconstruction/end_span": 0.6878136992454529, + "eval_valid_reconstruction/fim": 0.16017380356788635, + "eval_valid_reconstruction/first_seq": 0.16656583547592163, + "eval_valid_reconstruction/last_seq": 0.3358757495880127, + "eval_valid_reconstruction/second_seq": 0.2004520297050476, + "eval_valid_runtime": 443.7834, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 40000 + }, + { + "epoch": 0.1492058518535097, + "eval_train_loss": 2.1869423389434814, + "eval_train_loss/all": 2.0237810611724854, + "eval_train_loss/end_span": 1.2699227333068848, + "eval_train_perplexity/batch": 7.5668816566467285, + "eval_train_perplexity/end_span": 3.560577392578125, + "eval_train_perplexity/fim": 2.0326201915740967, + "eval_train_perplexity/first_seq": 15.224465370178223, + "eval_train_perplexity/last_seq": 8.631819725036621, + "eval_train_perplexity/second_seq": 14.268925666809082, + "eval_train_perplexity/seq": 8.715967178344727, + "eval_train_reconstruction/all": 0.2846192717552185, + "eval_train_reconstruction/end_span": 0.6991305947303772, + "eval_train_reconstruction/fim": 0.14213347434997559, + "eval_train_reconstruction/first_seq": 0.15462099015712738, + "eval_train_reconstruction/last_seq": 0.33573436737060547, + "eval_train_reconstruction/second_seq": 0.18139398097991943, + "eval_train_runtime": 463.7715, + "eval_train_samples_per_second": 0.414, + "eval_train_steps_per_second": 0.414, + "step": 40000 + }, + { + "epoch": 0.14924315331647306, + "grad_norm": 0.3583550751209259, + "learning_rate": 0.0006, + "loss": 2.3396, + "step": 40010 + }, + { + "epoch": 0.14928045477943644, + "grad_norm": 0.31716388463974, + "learning_rate": 
0.0006, + "loss": 2.2695, + "step": 40020 + }, + { + "epoch": 0.14931775624239982, + "grad_norm": 0.4195127487182617, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 40030 + }, + { + "epoch": 0.1493550577053632, + "grad_norm": 0.2535831034183502, + "learning_rate": 0.0006, + "loss": 2.1022, + "step": 40040 + }, + { + "epoch": 0.14939235916832658, + "grad_norm": 0.3057853579521179, + "learning_rate": 0.0006, + "loss": 2.3527, + "step": 40050 + }, + { + "epoch": 0.14942966063128996, + "grad_norm": 0.3097497224807739, + "learning_rate": 0.0006, + "loss": 2.182, + "step": 40060 + }, + { + "epoch": 0.14946696209425334, + "grad_norm": 0.35348862409591675, + "learning_rate": 0.0006, + "loss": 1.9478, + "step": 40070 + }, + { + "epoch": 0.14950426355721672, + "grad_norm": 0.28453129529953003, + "learning_rate": 0.0006, + "loss": 2.0838, + "step": 40080 + }, + { + "epoch": 0.1495415650201801, + "grad_norm": 0.40600958466529846, + "learning_rate": 0.0006, + "loss": 2.1573, + "step": 40090 + }, + { + "epoch": 0.14957886648314347, + "grad_norm": 0.4284599721431732, + "learning_rate": 0.0006, + "loss": 2.2204, + "step": 40100 + }, + { + "epoch": 0.14961616794610685, + "grad_norm": 0.30706843733787537, + "learning_rate": 0.0006, + "loss": 2.2041, + "step": 40110 + }, + { + "epoch": 0.14965346940907023, + "grad_norm": 0.8178428411483765, + "learning_rate": 0.0006, + "loss": 2.3253, + "step": 40120 + }, + { + "epoch": 0.1496907708720336, + "grad_norm": 0.4376480281352997, + "learning_rate": 0.0006, + "loss": 2.0752, + "step": 40130 + }, + { + "epoch": 0.149728072334997, + "grad_norm": 0.27890628576278687, + "learning_rate": 0.0006, + "loss": 2.2857, + "step": 40140 + }, + { + "epoch": 0.14976537379796034, + "grad_norm": 0.25193244218826294, + "learning_rate": 0.0006, + "loss": 2.185, + "step": 40150 + }, + { + "epoch": 0.14980267526092372, + "grad_norm": 0.28145164251327515, + "learning_rate": 0.0006, + "loss": 2.2055, + "step": 40160 + }, + { + "epoch": 0.1498399767238871, 
+ "grad_norm": 0.3994036912918091, + "learning_rate": 0.0006, + "loss": 2.3082, + "step": 40170 + }, + { + "epoch": 0.14987727818685048, + "grad_norm": 258.41290283203125, + "learning_rate": 0.0006, + "loss": 2.3222, + "step": 40180 + }, + { + "epoch": 0.14991457964981386, + "grad_norm": 0.520788311958313, + "learning_rate": 0.0006, + "loss": 2.0717, + "step": 40190 + }, + { + "epoch": 0.14995188111277724, + "grad_norm": 0.34398695826530457, + "learning_rate": 0.0006, + "loss": 2.3363, + "step": 40200 + }, + { + "epoch": 0.14998918257574062, + "grad_norm": 0.3316304087638855, + "learning_rate": 0.0006, + "loss": 2.1671, + "step": 40210 + }, + { + "epoch": 0.150026484038704, + "grad_norm": 0.3232782781124115, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 40220 + }, + { + "epoch": 0.15006378550166738, + "grad_norm": 0.37620383501052856, + "learning_rate": 0.0006, + "loss": 2.2759, + "step": 40230 + }, + { + "epoch": 0.15010108696463076, + "grad_norm": 0.2885645627975464, + "learning_rate": 0.0006, + "loss": 2.3725, + "step": 40240 + }, + { + "epoch": 0.15013838842759414, + "grad_norm": 0.2307073324918747, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 40250 + }, + { + "epoch": 0.15013838842759414, + "eval_valid_loss": 2.187687635421753, + "eval_valid_loss/all": 2.0508594512939453, + "eval_valid_loss/end_span": 1.2403531074523926, + "eval_valid_perplexity/batch": 7.774580001831055, + "eval_valid_perplexity/end_span": 3.456833839416504, + "eval_valid_perplexity/fim": 2.0919768810272217, + "eval_valid_perplexity/first_seq": 14.381698608398438, + "eval_valid_perplexity/last_seq": 8.538235664367676, + "eval_valid_perplexity/second_seq": 13.520068168640137, + "eval_valid_perplexity/seq": 8.765022277832031, + "eval_valid_reconstruction/all": 0.2950606644153595, + "eval_valid_reconstruction/end_span": 0.7148457169532776, + "eval_valid_reconstruction/fim": 0.14910952746868134, + "eval_valid_reconstruction/first_seq": 0.1775306761264801, + 
"eval_valid_reconstruction/last_seq": 0.34197214245796204, + "eval_valid_reconstruction/second_seq": 0.19852997362613678, + "eval_valid_runtime": 448.2701, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 40250 + }, + { + "epoch": 0.15013838842759414, + "eval_train_loss": 2.1835687160491943, + "eval_train_loss/all": 2.0205349922180176, + "eval_train_loss/end_span": 1.200961709022522, + "eval_train_perplexity/batch": 7.542358875274658, + "eval_train_perplexity/end_span": 3.3233113288879395, + "eval_train_perplexity/fim": 2.2003543376922607, + "eval_train_perplexity/first_seq": 15.472761154174805, + "eval_train_perplexity/last_seq": 8.480023384094238, + "eval_train_perplexity/second_seq": 14.157769203186035, + "eval_train_perplexity/seq": 8.679364204406738, + "eval_train_reconstruction/all": 0.2853274941444397, + "eval_train_reconstruction/end_span": 0.7271239757537842, + "eval_train_reconstruction/fim": 0.15966269373893738, + "eval_train_reconstruction/first_seq": 0.14947496354579926, + "eval_train_reconstruction/last_seq": 0.34628409147262573, + "eval_train_reconstruction/second_seq": 0.1858721524477005, + "eval_train_runtime": 450.3784, + "eval_train_samples_per_second": 0.426, + "eval_train_steps_per_second": 0.426, + "step": 40250 + }, + { + "epoch": 0.15017568989055752, + "grad_norm": 0.3792381286621094, + "learning_rate": 0.0006, + "loss": 2.3201, + "step": 40260 + }, + { + "epoch": 0.1502129913535209, + "grad_norm": 0.37658974528312683, + "learning_rate": 0.0006, + "loss": 2.3379, + "step": 40270 + }, + { + "epoch": 0.15025029281648425, + "grad_norm": 0.3619530200958252, + "learning_rate": 0.0006, + "loss": 2.3376, + "step": 40280 + }, + { + "epoch": 0.15028759427944763, + "grad_norm": 0.23355014622211456, + "learning_rate": 0.0006, + "loss": 2.1825, + "step": 40290 + }, + { + "epoch": 0.150324895742411, + "grad_norm": 0.3356885313987732, + "learning_rate": 0.0006, + "loss": 2.2079, + "step": 40300 + }, + { + 
"epoch": 0.1503621972053744, + "grad_norm": 0.3426577150821686, + "learning_rate": 0.0006, + "loss": 2.2371, + "step": 40310 + }, + { + "epoch": 0.15039949866833777, + "grad_norm": 0.32172316312789917, + "learning_rate": 0.0006, + "loss": 2.2875, + "step": 40320 + }, + { + "epoch": 0.15043680013130115, + "grad_norm": 0.27450332045555115, + "learning_rate": 0.0006, + "loss": 2.1912, + "step": 40330 + }, + { + "epoch": 0.15047410159426453, + "grad_norm": 0.30965742468833923, + "learning_rate": 0.0006, + "loss": 2.3203, + "step": 40340 + }, + { + "epoch": 0.1505114030572279, + "grad_norm": 0.474470853805542, + "learning_rate": 0.0006, + "loss": 2.3022, + "step": 40350 + }, + { + "epoch": 0.15054870452019128, + "grad_norm": 0.45068466663360596, + "learning_rate": 0.0006, + "loss": 2.2357, + "step": 40360 + }, + { + "epoch": 0.15058600598315466, + "grad_norm": 0.475349485874176, + "learning_rate": 0.0006, + "loss": 2.0785, + "step": 40370 + }, + { + "epoch": 0.15062330744611804, + "grad_norm": 0.27954304218292236, + "learning_rate": 0.0006, + "loss": 2.1501, + "step": 40380 + }, + { + "epoch": 0.15066060890908142, + "grad_norm": 0.3944456875324249, + "learning_rate": 0.0006, + "loss": 2.2809, + "step": 40390 + }, + { + "epoch": 0.1506979103720448, + "grad_norm": 0.5367174744606018, + "learning_rate": 0.0006, + "loss": 2.3312, + "step": 40400 + }, + { + "epoch": 0.15073521183500818, + "grad_norm": 0.4018508791923523, + "learning_rate": 0.0006, + "loss": 2.2184, + "step": 40410 + }, + { + "epoch": 0.15077251329797153, + "grad_norm": 0.2969034016132355, + "learning_rate": 0.0006, + "loss": 2.3146, + "step": 40420 + }, + { + "epoch": 0.1508098147609349, + "grad_norm": 0.2785779535770416, + "learning_rate": 0.0006, + "loss": 2.2859, + "step": 40430 + }, + { + "epoch": 0.1508471162238983, + "grad_norm": 0.3495776951313019, + "learning_rate": 0.0006, + "loss": 2.2443, + "step": 40440 + }, + { + "epoch": 0.15088441768686167, + "grad_norm": 0.3181625008583069, + "learning_rate": 
0.0006, + "loss": 2.2896, + "step": 40450 + }, + { + "epoch": 0.15092171914982505, + "grad_norm": 0.33100149035453796, + "learning_rate": 0.0006, + "loss": 2.1491, + "step": 40460 + }, + { + "epoch": 0.15095902061278843, + "grad_norm": 0.3055623471736908, + "learning_rate": 0.0006, + "loss": 2.2528, + "step": 40470 + }, + { + "epoch": 0.1509963220757518, + "grad_norm": 0.3055887818336487, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 40480 + }, + { + "epoch": 0.1510336235387152, + "grad_norm": 0.357363224029541, + "learning_rate": 0.0006, + "loss": 1.9522, + "step": 40490 + }, + { + "epoch": 0.15107092500167857, + "grad_norm": 0.3430016040802002, + "learning_rate": 0.0006, + "loss": 2.3014, + "step": 40500 + }, + { + "epoch": 0.15107092500167857, + "eval_valid_loss": 2.1959903240203857, + "eval_valid_loss/all": 2.0578901767730713, + "eval_valid_loss/end_span": 1.1668051481246948, + "eval_valid_perplexity/batch": 7.829433441162109, + "eval_valid_perplexity/end_span": 3.2117152214050293, + "eval_valid_perplexity/fim": 2.6258859634399414, + "eval_valid_perplexity/first_seq": 14.777573585510254, + "eval_valid_perplexity/last_seq": 8.880183219909668, + "eval_valid_perplexity/second_seq": 13.69394588470459, + "eval_valid_perplexity/seq": 8.816903114318848, + "eval_valid_reconstruction/all": 0.2936686873435974, + "eval_valid_reconstruction/end_span": 0.7365255355834961, + "eval_valid_reconstruction/fim": 0.19300290942192078, + "eval_valid_reconstruction/first_seq": 0.17147605121135712, + "eval_valid_reconstruction/last_seq": 0.3287327289581299, + "eval_valid_reconstruction/second_seq": 0.19921857118606567, + "eval_valid_runtime": 510.2758, + "eval_valid_samples_per_second": 0.376, + "eval_valid_steps_per_second": 0.376, + "step": 40500 + }, + { + "epoch": 0.15107092500167857, + "eval_train_loss": 2.192089080810547, + "eval_train_loss/all": 2.0273656845092773, + "eval_train_loss/end_span": 1.1259994506835938, + "eval_train_perplexity/batch": 7.594054698944092, + 
"eval_train_perplexity/end_span": 3.08329701423645, + "eval_train_perplexity/fim": 1.91119384765625, + "eval_train_perplexity/first_seq": 15.423749923706055, + "eval_train_perplexity/last_seq": 9.640944480895996, + "eval_train_perplexity/second_seq": 14.233354568481445, + "eval_train_perplexity/seq": 8.735097885131836, + "eval_train_reconstruction/all": 0.2836630642414093, + "eval_train_reconstruction/end_span": 0.7486074566841125, + "eval_train_reconstruction/fim": 0.12941747903823853, + "eval_train_reconstruction/first_seq": 0.15148435533046722, + "eval_train_reconstruction/last_seq": 0.30137163400650024, + "eval_train_reconstruction/second_seq": 0.1814502477645874, + "eval_train_runtime": 505.9709, + "eval_train_samples_per_second": 0.379, + "eval_train_steps_per_second": 0.379, + "step": 40500 + }, + { + "epoch": 0.15110822646464195, + "grad_norm": 0.3072056770324707, + "learning_rate": 0.0006, + "loss": 2.1811, + "step": 40510 + }, + { + "epoch": 0.15114552792760533, + "grad_norm": 0.3074518144130707, + "learning_rate": 0.0006, + "loss": 2.2316, + "step": 40520 + }, + { + "epoch": 0.1511828293905687, + "grad_norm": 0.39832207560539246, + "learning_rate": 0.0006, + "loss": 2.0954, + "step": 40530 + }, + { + "epoch": 0.1512201308535321, + "grad_norm": 0.30444544553756714, + "learning_rate": 0.0006, + "loss": 2.0131, + "step": 40540 + }, + { + "epoch": 0.15125743231649547, + "grad_norm": 0.3181310296058655, + "learning_rate": 0.0006, + "loss": 2.1879, + "step": 40550 + }, + { + "epoch": 0.15129473377945882, + "grad_norm": 0.3683346211910248, + "learning_rate": 0.0006, + "loss": 2.3953, + "step": 40560 + }, + { + "epoch": 0.1513320352424222, + "grad_norm": 0.27592653036117554, + "learning_rate": 0.0006, + "loss": 2.2005, + "step": 40570 + }, + { + "epoch": 0.15136933670538558, + "grad_norm": 0.31085193157196045, + "learning_rate": 0.0006, + "loss": 2.2568, + "step": 40580 + }, + { + "epoch": 0.15140663816834896, + "grad_norm": 0.3760119676589966, + 
"learning_rate": 0.0006, + "loss": 2.0454, + "step": 40590 + }, + { + "epoch": 0.15144393963131234, + "grad_norm": 0.2850039005279541, + "learning_rate": 0.0006, + "loss": 2.3155, + "step": 40600 + }, + { + "epoch": 0.15148124109427571, + "grad_norm": 0.341155081987381, + "learning_rate": 0.0006, + "loss": 2.1233, + "step": 40610 + }, + { + "epoch": 0.1515185425572391, + "grad_norm": 0.48704203963279724, + "learning_rate": 0.0006, + "loss": 2.1843, + "step": 40620 + }, + { + "epoch": 0.15155584402020247, + "grad_norm": 0.2643511891365051, + "learning_rate": 0.0006, + "loss": 2.315, + "step": 40630 + }, + { + "epoch": 0.15159314548316585, + "grad_norm": 0.5150169134140015, + "learning_rate": 0.0006, + "loss": 2.1937, + "step": 40640 + }, + { + "epoch": 0.15163044694612923, + "grad_norm": 0.5672255158424377, + "learning_rate": 0.0006, + "loss": 2.2301, + "step": 40650 + }, + { + "epoch": 0.1516677484090926, + "grad_norm": 0.3367355763912201, + "learning_rate": 0.0006, + "loss": 2.1583, + "step": 40660 + }, + { + "epoch": 0.151705049872056, + "grad_norm": 0.39528852701187134, + "learning_rate": 0.0006, + "loss": 2.3077, + "step": 40670 + }, + { + "epoch": 0.15174235133501937, + "grad_norm": 0.40505269169807434, + "learning_rate": 0.0006, + "loss": 2.1779, + "step": 40680 + }, + { + "epoch": 0.15177965279798275, + "grad_norm": 0.26663875579833984, + "learning_rate": 0.0006, + "loss": 1.9467, + "step": 40690 + }, + { + "epoch": 0.1518169542609461, + "grad_norm": 0.3290478587150574, + "learning_rate": 0.0006, + "loss": 2.1196, + "step": 40700 + }, + { + "epoch": 0.15185425572390948, + "grad_norm": 5.047861099243164, + "learning_rate": 0.0006, + "loss": 1.9503, + "step": 40710 + }, + { + "epoch": 0.15189155718687286, + "grad_norm": 0.363473117351532, + "learning_rate": 0.0006, + "loss": 2.2545, + "step": 40720 + }, + { + "epoch": 0.15192885864983624, + "grad_norm": 0.4169936776161194, + "learning_rate": 0.0006, + "loss": 2.2437, + "step": 40730 + }, + { + "epoch": 
0.15196616011279962, + "grad_norm": 0.23953711986541748, + "learning_rate": 0.0006, + "loss": 2.2434, + "step": 40740 + }, + { + "epoch": 0.152003461575763, + "grad_norm": 0.3862375020980835, + "learning_rate": 0.0006, + "loss": 2.179, + "step": 40750 + }, + { + "epoch": 0.152003461575763, + "eval_valid_loss": 2.198364734649658, + "eval_valid_loss/all": 2.0609843730926514, + "eval_valid_loss/end_span": 1.2104260921478271, + "eval_valid_perplexity/batch": 7.853696823120117, + "eval_valid_perplexity/end_span": 3.3549139499664307, + "eval_valid_perplexity/fim": 2.3066940307617188, + "eval_valid_perplexity/first_seq": 15.225626945495605, + "eval_valid_perplexity/last_seq": 9.226211547851562, + "eval_valid_perplexity/second_seq": 13.663931846618652, + "eval_valid_perplexity/seq": 8.858131408691406, + "eval_valid_reconstruction/all": 0.2920628786087036, + "eval_valid_reconstruction/end_span": 0.7206181287765503, + "eval_valid_reconstruction/fim": 0.16445331275463104, + "eval_valid_reconstruction/first_seq": 0.15977270901203156, + "eval_valid_reconstruction/last_seq": 0.32110318541526794, + "eval_valid_reconstruction/second_seq": 0.1952255368232727, + "eval_valid_runtime": 502.8166, + "eval_valid_samples_per_second": 0.382, + "eval_valid_steps_per_second": 0.382, + "step": 40750 + }, + { + "epoch": 0.152003461575763, + "eval_train_loss": 2.193432092666626, + "eval_train_loss/all": 2.0297117233276367, + "eval_train_loss/end_span": 1.1782290935516357, + "eval_train_perplexity/batch": 7.611891746520996, + "eval_train_perplexity/end_span": 3.2486162185668945, + "eval_train_perplexity/fim": 2.619255304336548, + "eval_train_perplexity/first_seq": 15.63278579711914, + "eval_train_perplexity/last_seq": 8.87124252319336, + "eval_train_perplexity/second_seq": 14.42786693572998, + "eval_train_perplexity/seq": 8.768685340881348, + "eval_train_reconstruction/all": 0.2826380729675293, + "eval_train_reconstruction/end_span": 0.7320203185081482, + "eval_train_reconstruction/fim": 
0.19180086255073547, + "eval_train_reconstruction/first_seq": 0.15168753266334534, + "eval_train_reconstruction/last_seq": 0.3295723497867584, + "eval_train_reconstruction/second_seq": 0.18003438413143158, + "eval_train_runtime": 495.6511, + "eval_train_samples_per_second": 0.387, + "eval_train_steps_per_second": 0.387, + "step": 40750 + }, + { + "epoch": 0.15204076303872638, + "grad_norm": 0.3960123062133789, + "learning_rate": 0.0006, + "loss": 2.1924, + "step": 40760 + }, + { + "epoch": 0.15207806450168976, + "grad_norm": 0.8936513662338257, + "learning_rate": 0.0006, + "loss": 2.1958, + "step": 40770 + }, + { + "epoch": 0.15211536596465314, + "grad_norm": 0.3626154363155365, + "learning_rate": 0.0006, + "loss": 2.1148, + "step": 40780 + }, + { + "epoch": 0.15215266742761652, + "grad_norm": 0.6041672229766846, + "learning_rate": 0.0006, + "loss": 2.2743, + "step": 40790 + }, + { + "epoch": 0.1521899688905799, + "grad_norm": 0.26709145307540894, + "learning_rate": 0.0006, + "loss": 2.2483, + "step": 40800 + }, + { + "epoch": 0.15222727035354328, + "grad_norm": 0.2789766490459442, + "learning_rate": 0.0006, + "loss": 2.2849, + "step": 40810 + }, + { + "epoch": 0.15226457181650666, + "grad_norm": 0.43654680252075195, + "learning_rate": 0.0006, + "loss": 2.1733, + "step": 40820 + }, + { + "epoch": 0.15230187327947, + "grad_norm": 0.304354190826416, + "learning_rate": 0.0006, + "loss": 2.3157, + "step": 40830 + }, + { + "epoch": 0.1523391747424334, + "grad_norm": 1.011583685874939, + "learning_rate": 0.0006, + "loss": 2.3514, + "step": 40840 + }, + { + "epoch": 0.15237647620539677, + "grad_norm": 0.37925949692726135, + "learning_rate": 0.0006, + "loss": 2.1708, + "step": 40850 + }, + { + "epoch": 0.15241377766836015, + "grad_norm": 0.4608461558818817, + "learning_rate": 0.0006, + "loss": 2.1048, + "step": 40860 + }, + { + "epoch": 0.15245107913132352, + "grad_norm": 0.39697256684303284, + "learning_rate": 0.0006, + "loss": 2.2268, + "step": 40870 + }, + { + "epoch": 
0.1524883805942869, + "grad_norm": 0.3014197051525116, + "learning_rate": 0.0006, + "loss": 2.2031, + "step": 40880 + }, + { + "epoch": 0.15252568205725028, + "grad_norm": 0.3494955599308014, + "learning_rate": 0.0006, + "loss": 2.2119, + "step": 40890 + }, + { + "epoch": 0.15256298352021366, + "grad_norm": 0.3227818012237549, + "learning_rate": 0.0006, + "loss": 2.187, + "step": 40900 + }, + { + "epoch": 0.15260028498317704, + "grad_norm": 0.3929116129875183, + "learning_rate": 0.0006, + "loss": 2.2634, + "step": 40910 + }, + { + "epoch": 0.15263758644614042, + "grad_norm": 0.28675004839897156, + "learning_rate": 0.0006, + "loss": 2.0901, + "step": 40920 + }, + { + "epoch": 0.1526748879091038, + "grad_norm": 0.2657923996448517, + "learning_rate": 0.0006, + "loss": 2.2303, + "step": 40930 + }, + { + "epoch": 0.15271218937206718, + "grad_norm": 0.25271204113960266, + "learning_rate": 0.0006, + "loss": 1.8693, + "step": 40940 + }, + { + "epoch": 0.15274949083503056, + "grad_norm": 0.3834085166454315, + "learning_rate": 0.0006, + "loss": 2.1813, + "step": 40950 + }, + { + "epoch": 0.15278679229799394, + "grad_norm": 0.386538565158844, + "learning_rate": 0.0006, + "loss": 2.3126, + "step": 40960 + }, + { + "epoch": 0.1528240937609573, + "grad_norm": 0.25176194310188293, + "learning_rate": 0.0006, + "loss": 2.3059, + "step": 40970 + }, + { + "epoch": 0.15286139522392067, + "grad_norm": 0.2842774987220764, + "learning_rate": 0.0006, + "loss": 2.3512, + "step": 40980 + }, + { + "epoch": 0.15289869668688405, + "grad_norm": 0.3977054953575134, + "learning_rate": 0.0006, + "loss": 2.2298, + "step": 40990 + }, + { + "epoch": 0.15293599814984743, + "grad_norm": 0.3158818781375885, + "learning_rate": 0.0006, + "loss": 2.2078, + "step": 41000 + }, + { + "epoch": 0.15293599814984743, + "eval_valid_loss": 2.185914993286133, + "eval_valid_loss/all": 2.049600601196289, + "eval_valid_loss/end_span": 1.3855212926864624, + "eval_valid_perplexity/batch": 7.764799118041992, + 
"eval_valid_perplexity/end_span": 3.9969089031219482, + "eval_valid_perplexity/fim": 2.2629942893981934, + "eval_valid_perplexity/first_seq": 14.884664535522461, + "eval_valid_perplexity/last_seq": 8.883018493652344, + "eval_valid_perplexity/second_seq": 13.98699951171875, + "eval_valid_perplexity/seq": 8.754822731018066, + "eval_valid_reconstruction/all": 0.2960142195224762, + "eval_valid_reconstruction/end_span": 0.676404595375061, + "eval_valid_reconstruction/fim": 0.16367706656455994, + "eval_valid_reconstruction/first_seq": 0.16402170062065125, + "eval_valid_reconstruction/last_seq": 0.33057892322540283, + "eval_valid_reconstruction/second_seq": 0.19174763560295105, + "eval_valid_runtime": 528.997, + "eval_valid_samples_per_second": 0.363, + "eval_valid_steps_per_second": 0.363, + "step": 41000 + }, + { + "epoch": 0.15293599814984743, + "eval_train_loss": 2.1865718364715576, + "eval_train_loss/all": 2.023656129837036, + "eval_train_loss/end_span": 1.3470088243484497, + "eval_train_perplexity/batch": 7.56593656539917, + "eval_train_perplexity/end_span": 3.845904588699341, + "eval_train_perplexity/fim": 1.957788109779358, + "eval_train_perplexity/first_seq": 15.79112434387207, + "eval_train_perplexity/last_seq": 8.687067985534668, + "eval_train_perplexity/second_seq": 13.49560832977295, + "eval_train_perplexity/seq": 8.71090030670166, + "eval_train_reconstruction/all": 0.2848719358444214, + "eval_train_reconstruction/end_span": 0.6890468597412109, + "eval_train_reconstruction/fim": 0.13553820550441742, + "eval_train_reconstruction/first_seq": 0.14684154093265533, + "eval_train_reconstruction/last_seq": 0.33622244000434875, + "eval_train_reconstruction/second_seq": 0.2031445950269699, + "eval_train_runtime": 527.8924, + "eval_train_samples_per_second": 0.364, + "eval_train_steps_per_second": 0.364, + "step": 41000 + }, + { + "epoch": 0.1529732996128108, + "grad_norm": 0.5656257271766663, + "learning_rate": 0.0006, + "loss": 2.2305, + "step": 41010 + }, + { + 
"epoch": 0.1530106010757742, + "grad_norm": 0.33134448528289795, + "learning_rate": 0.0006, + "loss": 2.3416, + "step": 41020 + }, + { + "epoch": 0.15304790253873757, + "grad_norm": 0.32620638608932495, + "learning_rate": 0.0006, + "loss": 2.2762, + "step": 41030 + }, + { + "epoch": 0.15308520400170095, + "grad_norm": 0.4979653060436249, + "learning_rate": 0.0006, + "loss": 2.097, + "step": 41040 + }, + { + "epoch": 0.15312250546466433, + "grad_norm": 0.4615210294723511, + "learning_rate": 0.0006, + "loss": 2.1823, + "step": 41050 + }, + { + "epoch": 0.1531598069276277, + "grad_norm": 0.3257724642753601, + "learning_rate": 0.0006, + "loss": 2.3882, + "step": 41060 + }, + { + "epoch": 0.15319710839059109, + "grad_norm": 0.3306836485862732, + "learning_rate": 0.0006, + "loss": 2.1569, + "step": 41070 + }, + { + "epoch": 0.15323440985355447, + "grad_norm": 0.4112743139266968, + "learning_rate": 0.0006, + "loss": 2.127, + "step": 41080 + }, + { + "epoch": 0.15327171131651784, + "grad_norm": 0.4081413149833679, + "learning_rate": 0.0006, + "loss": 2.2569, + "step": 41090 + }, + { + "epoch": 0.15330901277948122, + "grad_norm": 0.29601117968559265, + "learning_rate": 0.0006, + "loss": 2.2982, + "step": 41100 + }, + { + "epoch": 0.15334631424244458, + "grad_norm": 0.28392642736434937, + "learning_rate": 0.0006, + "loss": 2.2955, + "step": 41110 + }, + { + "epoch": 0.15338361570540796, + "grad_norm": 0.2697628140449524, + "learning_rate": 0.0006, + "loss": 2.2467, + "step": 41120 + }, + { + "epoch": 0.15342091716837133, + "grad_norm": 0.4995458126068115, + "learning_rate": 0.0006, + "loss": 2.2936, + "step": 41130 + }, + { + "epoch": 0.15345821863133471, + "grad_norm": 0.31033265590667725, + "learning_rate": 0.0006, + "loss": 2.285, + "step": 41140 + }, + { + "epoch": 0.1534955200942981, + "grad_norm": 0.3007839024066925, + "learning_rate": 0.0006, + "loss": 2.2703, + "step": 41150 + }, + { + "epoch": 0.15353282155726147, + "grad_norm": 0.2132665067911148, + 
"learning_rate": 0.0006, + "loss": 2.0963, + "step": 41160 + }, + { + "epoch": 0.15357012302022485, + "grad_norm": 0.4116544723510742, + "learning_rate": 0.0006, + "loss": 2.276, + "step": 41170 + }, + { + "epoch": 0.15360742448318823, + "grad_norm": 0.40800178050994873, + "learning_rate": 0.0006, + "loss": 2.3809, + "step": 41180 + }, + { + "epoch": 0.1536447259461516, + "grad_norm": 0.465323805809021, + "learning_rate": 0.0006, + "loss": 2.0914, + "step": 41190 + }, + { + "epoch": 0.153682027409115, + "grad_norm": 0.4636091887950897, + "learning_rate": 0.0006, + "loss": 2.2212, + "step": 41200 + }, + { + "epoch": 0.15371932887207837, + "grad_norm": 0.27055463194847107, + "learning_rate": 0.0006, + "loss": 2.346, + "step": 41210 + }, + { + "epoch": 0.15375663033504175, + "grad_norm": 0.42654889822006226, + "learning_rate": 0.0006, + "loss": 2.2442, + "step": 41220 + }, + { + "epoch": 0.15379393179800513, + "grad_norm": 0.27743372321128845, + "learning_rate": 0.0006, + "loss": 2.3231, + "step": 41230 + }, + { + "epoch": 0.1538312332609685, + "grad_norm": 0.33545711636543274, + "learning_rate": 0.0006, + "loss": 2.1073, + "step": 41240 + }, + { + "epoch": 0.15386853472393186, + "grad_norm": 0.35671067237854004, + "learning_rate": 0.0006, + "loss": 1.9913, + "step": 41250 + }, + { + "epoch": 0.15386853472393186, + "eval_valid_loss": 2.1848459243774414, + "eval_valid_loss/all": 2.0485730171203613, + "eval_valid_loss/end_span": 1.1361421346664429, + "eval_valid_perplexity/batch": 7.756824493408203, + "eval_valid_perplexity/end_span": 3.1147289276123047, + "eval_valid_perplexity/fim": 2.0928924083709717, + "eval_valid_perplexity/first_seq": 15.280500411987305, + "eval_valid_perplexity/last_seq": 8.448539733886719, + "eval_valid_perplexity/second_seq": 13.637372016906738, + "eval_valid_perplexity/seq": 8.74439811706543, + "eval_valid_reconstruction/all": 0.29595819115638733, + "eval_valid_reconstruction/end_span": 0.7349068522453308, + "eval_valid_reconstruction/fim": 
0.14871567487716675, + "eval_valid_reconstruction/first_seq": 0.15811355412006378, + "eval_valid_reconstruction/last_seq": 0.3478095531463623, + "eval_valid_reconstruction/second_seq": 0.1987571269273758, + "eval_valid_runtime": 521.0255, + "eval_valid_samples_per_second": 0.369, + "eval_valid_steps_per_second": 0.369, + "step": 41250 + }, + { + "epoch": 0.15386853472393186, + "eval_train_loss": 2.1813719272613525, + "eval_train_loss/all": 2.0188333988189697, + "eval_train_loss/end_span": 1.0974323749542236, + "eval_train_perplexity/batch": 7.52953577041626, + "eval_train_perplexity/end_span": 2.996462345123291, + "eval_train_perplexity/fim": 2.126866102218628, + "eval_train_perplexity/first_seq": 15.628533363342285, + "eval_train_perplexity/last_seq": 9.00257396697998, + "eval_train_perplexity/second_seq": 14.1749267578125, + "eval_train_perplexity/seq": 8.669271469116211, + "eval_train_reconstruction/all": 0.28613200783729553, + "eval_train_reconstruction/end_span": 0.7470722198486328, + "eval_train_reconstruction/fim": 0.15254202485084534, + "eval_train_reconstruction/first_seq": 0.14811579883098602, + "eval_train_reconstruction/last_seq": 0.32541075348854065, + "eval_train_reconstruction/second_seq": 0.1860620230436325, + "eval_train_runtime": 528.3223, + "eval_train_samples_per_second": 0.363, + "eval_train_steps_per_second": 0.363, + "step": 41250 + }, + { + "epoch": 0.15390583618689524, + "grad_norm": 0.4151937663555145, + "learning_rate": 0.0006, + "loss": 2.1799, + "step": 41260 + }, + { + "epoch": 0.15394313764985862, + "grad_norm": 0.3793924152851105, + "learning_rate": 0.0006, + "loss": 2.1774, + "step": 41270 + }, + { + "epoch": 0.153980439112822, + "grad_norm": 0.35399380326271057, + "learning_rate": 0.0006, + "loss": 2.1892, + "step": 41280 + }, + { + "epoch": 0.15401774057578538, + "grad_norm": 0.34593141078948975, + "learning_rate": 0.0006, + "loss": 2.1511, + "step": 41290 + }, + { + "epoch": 0.15405504203874876, + "grad_norm": 0.1554938703775406, 
+ "learning_rate": 0.0006, + "loss": 2.2107, + "step": 41300 + }, + { + "epoch": 0.15409234350171214, + "grad_norm": 0.3723108172416687, + "learning_rate": 0.0006, + "loss": 2.3996, + "step": 41310 + }, + { + "epoch": 0.15412964496467552, + "grad_norm": 0.34707745909690857, + "learning_rate": 0.0006, + "loss": 2.3645, + "step": 41320 + }, + { + "epoch": 0.1541669464276389, + "grad_norm": 0.4915243983268738, + "learning_rate": 0.0006, + "loss": 2.19, + "step": 41330 + }, + { + "epoch": 0.15420424789060228, + "grad_norm": 0.3340601325035095, + "learning_rate": 0.0006, + "loss": 2.1183, + "step": 41340 + }, + { + "epoch": 0.15424154935356565, + "grad_norm": 0.3117865025997162, + "learning_rate": 0.0006, + "loss": 2.1562, + "step": 41350 + }, + { + "epoch": 0.15427885081652903, + "grad_norm": 0.41133248805999756, + "learning_rate": 0.0006, + "loss": 2.1839, + "step": 41360 + }, + { + "epoch": 0.1543161522794924, + "grad_norm": 0.2982337772846222, + "learning_rate": 0.0006, + "loss": 2.3238, + "step": 41370 + }, + { + "epoch": 0.1543534537424558, + "grad_norm": 0.29256004095077515, + "learning_rate": 0.0006, + "loss": 2.2675, + "step": 41380 + }, + { + "epoch": 0.15439075520541914, + "grad_norm": 0.26688477396965027, + "learning_rate": 0.0006, + "loss": 2.3012, + "step": 41390 + }, + { + "epoch": 0.15442805666838252, + "grad_norm": 0.5322939157485962, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 41400 + }, + { + "epoch": 0.1544653581313459, + "grad_norm": 0.49724245071411133, + "learning_rate": 0.0006, + "loss": 2.2054, + "step": 41410 + }, + { + "epoch": 0.15450265959430928, + "grad_norm": 0.3297518789768219, + "learning_rate": 0.0006, + "loss": 2.285, + "step": 41420 + }, + { + "epoch": 0.15453996105727266, + "grad_norm": 0.2660132348537445, + "learning_rate": 0.0006, + "loss": 2.1599, + "step": 41430 + }, + { + "epoch": 0.15457726252023604, + "grad_norm": 0.4032997786998749, + "learning_rate": 0.0006, + "loss": 2.245, + "step": 41440 + }, + { + "epoch": 
0.15461456398319942, + "grad_norm": 0.2633526027202606, + "learning_rate": 0.0006, + "loss": 2.083, + "step": 41450 + }, + { + "epoch": 0.1546518654461628, + "grad_norm": 0.4471266567707062, + "learning_rate": 0.0006, + "loss": 2.3208, + "step": 41460 + }, + { + "epoch": 0.15468916690912618, + "grad_norm": 0.46744704246520996, + "learning_rate": 0.0006, + "loss": 2.2943, + "step": 41470 + }, + { + "epoch": 0.15472646837208956, + "grad_norm": 0.42757928371429443, + "learning_rate": 0.0006, + "loss": 2.2528, + "step": 41480 + }, + { + "epoch": 0.15476376983505294, + "grad_norm": 0.3924972414970398, + "learning_rate": 0.0006, + "loss": 2.3254, + "step": 41490 + }, + { + "epoch": 0.15480107129801632, + "grad_norm": 0.4564869999885559, + "learning_rate": 0.0006, + "loss": 2.3018, + "step": 41500 + }, + { + "epoch": 0.15480107129801632, + "eval_valid_loss": 2.189683198928833, + "eval_valid_loss/all": 2.0528433322906494, + "eval_valid_loss/end_span": 1.2528377771377563, + "eval_valid_perplexity/batch": 7.7900190353393555, + "eval_valid_perplexity/end_span": 3.5002617835998535, + "eval_valid_perplexity/fim": 2.759918212890625, + "eval_valid_perplexity/first_seq": 14.730896949768066, + "eval_valid_perplexity/last_seq": 8.787280082702637, + "eval_valid_perplexity/second_seq": 13.930041313171387, + "eval_valid_perplexity/seq": 8.783223152160645, + "eval_valid_reconstruction/all": 0.29460376501083374, + "eval_valid_reconstruction/end_span": 0.703818142414093, + "eval_valid_reconstruction/fim": 0.20334717631340027, + "eval_valid_reconstruction/first_seq": 0.17068813741207123, + "eval_valid_reconstruction/last_seq": 0.33489248156547546, + "eval_valid_reconstruction/second_seq": 0.19006946682929993, + "eval_valid_runtime": 536.0426, + "eval_valid_samples_per_second": 0.358, + "eval_valid_steps_per_second": 0.358, + "step": 41500 + }, + { + "epoch": 0.15480107129801632, + "eval_train_loss": 2.1860334873199463, + "eval_train_loss/all": 2.023022174835205, + 
"eval_train_loss/end_span": 1.2090203762054443, + "eval_train_perplexity/batch": 7.561141490936279, + "eval_train_perplexity/end_span": 3.35020112991333, + "eval_train_perplexity/fim": 2.1223931312561035, + "eval_train_perplexity/first_seq": 15.633169174194336, + "eval_train_perplexity/last_seq": 9.021866798400879, + "eval_train_perplexity/second_seq": 14.415083885192871, + "eval_train_perplexity/seq": 8.704367637634277, + "eval_train_reconstruction/all": 0.2847685217857361, + "eval_train_reconstruction/end_span": 0.7163851857185364, + "eval_train_reconstruction/fim": 0.15086814761161804, + "eval_train_reconstruction/first_seq": 0.14663074910640717, + "eval_train_reconstruction/last_seq": 0.3263695538043976, + "eval_train_reconstruction/second_seq": 0.1757330596446991, + "eval_train_runtime": 512.6916, + "eval_train_samples_per_second": 0.374, + "eval_train_steps_per_second": 0.374, + "step": 41500 + }, + { + "epoch": 0.1548383727609797, + "grad_norm": 0.27676960825920105, + "learning_rate": 0.0006, + "loss": 2.1356, + "step": 41510 + }, + { + "epoch": 0.15487567422394305, + "grad_norm": 0.38823986053466797, + "learning_rate": 0.0006, + "loss": 2.3378, + "step": 41520 + }, + { + "epoch": 0.15491297568690643, + "grad_norm": 0.296860933303833, + "learning_rate": 0.0006, + "loss": 2.162, + "step": 41530 + }, + { + "epoch": 0.1549502771498698, + "grad_norm": 0.27212220430374146, + "learning_rate": 0.0006, + "loss": 2.3581, + "step": 41540 + }, + { + "epoch": 0.1549875786128332, + "grad_norm": 0.4414380192756653, + "learning_rate": 0.0006, + "loss": 2.2248, + "step": 41550 + }, + { + "epoch": 0.15502488007579657, + "grad_norm": 0.40827980637550354, + "learning_rate": 0.0006, + "loss": 2.0187, + "step": 41560 + }, + { + "epoch": 0.15506218153875995, + "grad_norm": 0.37324258685112, + "learning_rate": 0.0006, + "loss": 2.0708, + "step": 41570 + }, + { + "epoch": 0.15509948300172333, + "grad_norm": 0.40335795283317566, + "learning_rate": 0.0006, + "loss": 2.2186, + "step": 
41580 + }, + { + "epoch": 0.1551367844646867, + "grad_norm": 0.4272978901863098, + "learning_rate": 0.0006, + "loss": 2.2745, + "step": 41590 + }, + { + "epoch": 0.15517408592765009, + "grad_norm": 0.3183085024356842, + "learning_rate": 0.0006, + "loss": 1.9818, + "step": 41600 + }, + { + "epoch": 0.15521138739061346, + "grad_norm": 0.3168216943740845, + "learning_rate": 0.0006, + "loss": 1.9918, + "step": 41610 + }, + { + "epoch": 0.15524868885357684, + "grad_norm": 0.36565709114074707, + "learning_rate": 0.0006, + "loss": 2.155, + "step": 41620 + }, + { + "epoch": 0.15528599031654022, + "grad_norm": 0.522169828414917, + "learning_rate": 0.0006, + "loss": 2.2375, + "step": 41630 + }, + { + "epoch": 0.1553232917795036, + "grad_norm": 0.34303587675094604, + "learning_rate": 0.0006, + "loss": 2.174, + "step": 41640 + }, + { + "epoch": 0.15536059324246698, + "grad_norm": 0.4731079936027527, + "learning_rate": 0.0006, + "loss": 2.3465, + "step": 41650 + }, + { + "epoch": 0.15539789470543033, + "grad_norm": 0.41391250491142273, + "learning_rate": 0.0006, + "loss": 2.2301, + "step": 41660 + }, + { + "epoch": 0.1554351961683937, + "grad_norm": 0.2950637638568878, + "learning_rate": 0.0006, + "loss": 2.1854, + "step": 41670 + }, + { + "epoch": 0.1554724976313571, + "grad_norm": 0.36380672454833984, + "learning_rate": 0.0006, + "loss": 2.1902, + "step": 41680 + }, + { + "epoch": 0.15550979909432047, + "grad_norm": 0.2871299982070923, + "learning_rate": 0.0006, + "loss": 2.2895, + "step": 41690 + }, + { + "epoch": 0.15554710055728385, + "grad_norm": 0.28727027773857117, + "learning_rate": 0.0006, + "loss": 2.1602, + "step": 41700 + }, + { + "epoch": 0.15558440202024723, + "grad_norm": 0.41364049911499023, + "learning_rate": 0.0006, + "loss": 2.0285, + "step": 41710 + }, + { + "epoch": 0.1556217034832106, + "grad_norm": 0.3227551579475403, + "learning_rate": 0.0006, + "loss": 2.2906, + "step": 41720 + }, + { + "epoch": 0.155659004946174, + "grad_norm": 0.4560000002384186, + 
"learning_rate": 0.0006, + "loss": 2.1715, + "step": 41730 + }, + { + "epoch": 0.15569630640913737, + "grad_norm": 0.4727163016796112, + "learning_rate": 0.0006, + "loss": 2.0227, + "step": 41740 + }, + { + "epoch": 0.15573360787210075, + "grad_norm": 0.34317371249198914, + "learning_rate": 0.0006, + "loss": 2.2004, + "step": 41750 + }, + { + "epoch": 0.15573360787210075, + "eval_valid_loss": 2.183112144470215, + "eval_valid_loss/all": 2.0467772483825684, + "eval_valid_loss/end_span": 1.2413530349731445, + "eval_valid_perplexity/batch": 7.742907524108887, + "eval_valid_perplexity/end_span": 3.460292100906372, + "eval_valid_perplexity/fim": 2.263935089111328, + "eval_valid_perplexity/first_seq": 15.146404266357422, + "eval_valid_perplexity/last_seq": 8.97713851928711, + "eval_valid_perplexity/second_seq": 13.722458839416504, + "eval_valid_perplexity/seq": 8.726457595825195, + "eval_valid_reconstruction/all": 0.2965145409107208, + "eval_valid_reconstruction/end_span": 0.7071312665939331, + "eval_valid_reconstruction/fim": 0.1647336184978485, + "eval_valid_reconstruction/first_seq": 0.1610247939825058, + "eval_valid_reconstruction/last_seq": 0.32853829860687256, + "eval_valid_reconstruction/second_seq": 0.19842040538787842, + "eval_valid_runtime": 548.9353, + "eval_valid_samples_per_second": 0.35, + "eval_valid_steps_per_second": 0.35, + "step": 41750 + }, + { + "epoch": 0.15573360787210075, + "eval_train_loss": 2.1800389289855957, + "eval_train_loss/all": 2.017659902572632, + "eval_train_loss/end_span": 1.1976454257965088, + "eval_train_perplexity/batch": 7.520705223083496, + "eval_train_perplexity/end_span": 3.3123085498809814, + "eval_train_perplexity/fim": 2.062382936477661, + "eval_train_perplexity/first_seq": 15.77448844909668, + "eval_train_perplexity/last_seq": 8.27871322631836, + "eval_train_perplexity/second_seq": 14.007950782775879, + "eval_train_perplexity/seq": 8.659857749938965, + "eval_train_reconstruction/all": 0.2864578664302826, + 
"eval_train_reconstruction/end_span": 0.7193493843078613, + "eval_train_reconstruction/fim": 0.1458698809146881, + "eval_train_reconstruction/first_seq": 0.14722847938537598, + "eval_train_reconstruction/last_seq": 0.3520723879337311, + "eval_train_reconstruction/second_seq": 0.18743503093719482, + "eval_train_runtime": 512.0987, + "eval_train_samples_per_second": 0.375, + "eval_train_steps_per_second": 0.375, + "step": 41750 + }, + { + "epoch": 0.15577090933506413, + "grad_norm": 0.2650071978569031, + "learning_rate": 0.0006, + "loss": 2.239, + "step": 41760 + }, + { + "epoch": 0.1558082107980275, + "grad_norm": 0.30623549222946167, + "learning_rate": 0.0006, + "loss": 2.2118, + "step": 41770 + }, + { + "epoch": 0.1558455122609909, + "grad_norm": 0.36383023858070374, + "learning_rate": 0.0006, + "loss": 2.1615, + "step": 41780 + }, + { + "epoch": 0.15588281372395427, + "grad_norm": 0.37073060870170593, + "learning_rate": 0.0006, + "loss": 2.1896, + "step": 41790 + }, + { + "epoch": 0.15592011518691762, + "grad_norm": 0.3051542341709137, + "learning_rate": 0.0006, + "loss": 2.0073, + "step": 41800 + }, + { + "epoch": 0.155957416649881, + "grad_norm": 0.5818561315536499, + "learning_rate": 0.0006, + "loss": 2.1508, + "step": 41810 + }, + { + "epoch": 0.15599471811284438, + "grad_norm": 0.40379732847213745, + "learning_rate": 0.0006, + "loss": 2.2002, + "step": 41820 + }, + { + "epoch": 0.15603201957580776, + "grad_norm": 0.34508371353149414, + "learning_rate": 0.0006, + "loss": 1.9082, + "step": 41830 + }, + { + "epoch": 0.15606932103877114, + "grad_norm": 0.29350534081459045, + "learning_rate": 0.0006, + "loss": 2.2219, + "step": 41840 + }, + { + "epoch": 0.15610662250173452, + "grad_norm": 0.22673402726650238, + "learning_rate": 0.0006, + "loss": 2.2356, + "step": 41850 + }, + { + "epoch": 0.1561439239646979, + "grad_norm": 0.3289574384689331, + "learning_rate": 0.0006, + "loss": 2.0647, + "step": 41860 + }, + { + "epoch": 0.15618122542766127, + "grad_norm": 
0.3630244731903076, + "learning_rate": 0.0006, + "loss": 2.2716, + "step": 41870 + }, + { + "epoch": 0.15621852689062465, + "grad_norm": 0.3621048927307129, + "learning_rate": 0.0006, + "loss": 2.1706, + "step": 41880 + }, + { + "epoch": 0.15625582835358803, + "grad_norm": 0.3850362300872803, + "learning_rate": 0.0006, + "loss": 2.1804, + "step": 41890 + }, + { + "epoch": 0.1562931298165514, + "grad_norm": 0.5443671345710754, + "learning_rate": 0.0006, + "loss": 2.3252, + "step": 41900 + }, + { + "epoch": 0.1563304312795148, + "grad_norm": 0.3703918159008026, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 41910 + }, + { + "epoch": 0.15636773274247817, + "grad_norm": 0.3350268304347992, + "learning_rate": 0.0006, + "loss": 2.143, + "step": 41920 + }, + { + "epoch": 0.15640503420544155, + "grad_norm": 0.25246235728263855, + "learning_rate": 0.0006, + "loss": 2.274, + "step": 41930 + }, + { + "epoch": 0.1564423356684049, + "grad_norm": 0.3162323534488678, + "learning_rate": 0.0006, + "loss": 2.0652, + "step": 41940 + }, + { + "epoch": 0.15647963713136828, + "grad_norm": 0.26795947551727295, + "learning_rate": 0.0006, + "loss": 2.3659, + "step": 41950 + }, + { + "epoch": 0.15651693859433166, + "grad_norm": 0.34826669096946716, + "learning_rate": 0.0006, + "loss": 2.2825, + "step": 41960 + }, + { + "epoch": 0.15655424005729504, + "grad_norm": 0.3460218906402588, + "learning_rate": 0.0006, + "loss": 2.3021, + "step": 41970 + }, + { + "epoch": 0.15659154152025842, + "grad_norm": 0.36856362223625183, + "learning_rate": 0.0006, + "loss": 2.2503, + "step": 41980 + }, + { + "epoch": 0.1566288429832218, + "grad_norm": 0.296779602766037, + "learning_rate": 0.0006, + "loss": 2.285, + "step": 41990 + }, + { + "epoch": 0.15666614444618518, + "grad_norm": 0.32281526923179626, + "learning_rate": 0.0006, + "loss": 2.205, + "step": 42000 + }, + { + "epoch": 0.15666614444618518, + "eval_valid_loss": 2.184649705886841, + "eval_valid_loss/all": 2.047959804534912, + 
"eval_valid_loss/end_span": 1.2535027265548706, + "eval_valid_perplexity/batch": 7.752068996429443, + "eval_valid_perplexity/end_span": 3.5025901794433594, + "eval_valid_perplexity/fim": 2.311760663986206, + "eval_valid_perplexity/first_seq": 14.579548835754395, + "eval_valid_perplexity/last_seq": 9.136829376220703, + "eval_valid_perplexity/second_seq": 13.665619850158691, + "eval_valid_perplexity/seq": 8.734871864318848, + "eval_valid_reconstruction/all": 0.2961297333240509, + "eval_valid_reconstruction/end_span": 0.7024281620979309, + "eval_valid_reconstruction/fim": 0.1684025526046753, + "eval_valid_reconstruction/first_seq": 0.17612557113170624, + "eval_valid_reconstruction/last_seq": 0.3203743100166321, + "eval_valid_reconstruction/second_seq": 0.19624102115631104, + "eval_valid_runtime": 560.6581, + "eval_valid_samples_per_second": 0.342, + "eval_valid_steps_per_second": 0.342, + "step": 42000 + }, + { + "epoch": 0.15666614444618518, + "eval_train_loss": 2.1836531162261963, + "eval_train_loss/all": 2.020746946334839, + "eval_train_loss/end_span": 1.2124249935150146, + "eval_train_perplexity/batch": 7.543957710266113, + "eval_train_perplexity/end_span": 3.361626625061035, + "eval_train_perplexity/fim": 2.0722925662994385, + "eval_train_perplexity/first_seq": 15.673333168029785, + "eval_train_perplexity/last_seq": 9.001786231994629, + "eval_train_perplexity/second_seq": 14.05794906616211, + "eval_train_perplexity/seq": 8.686646461486816, + "eval_train_reconstruction/all": 0.2855309247970581, + "eval_train_reconstruction/end_span": 0.7152562737464905, + "eval_train_reconstruction/fim": 0.14672128856182098, + "eval_train_reconstruction/first_seq": 0.1490003764629364, + "eval_train_reconstruction/last_seq": 0.3215515613555908, + "eval_train_reconstruction/second_seq": 0.18599803745746613, + "eval_train_runtime": 539.0559, + "eval_train_samples_per_second": 0.356, + "eval_train_steps_per_second": 0.356, + "step": 42000 + }, + { + "epoch": 0.15670344590914856, + 
"grad_norm": 0.33889907598495483, + "learning_rate": 0.0006, + "loss": 2.1531, + "step": 42010 + }, + { + "epoch": 0.15674074737211194, + "grad_norm": 0.42168065905570984, + "learning_rate": 0.0006, + "loss": 2.0678, + "step": 42020 + }, + { + "epoch": 0.15677804883507532, + "grad_norm": 0.3383558392524719, + "learning_rate": 0.0006, + "loss": 2.0222, + "step": 42030 + }, + { + "epoch": 0.1568153502980387, + "grad_norm": 0.3126757740974426, + "learning_rate": 0.0006, + "loss": 2.2467, + "step": 42040 + }, + { + "epoch": 0.15685265176100208, + "grad_norm": 0.3264944553375244, + "learning_rate": 0.0006, + "loss": 2.2933, + "step": 42050 + }, + { + "epoch": 0.15688995322396546, + "grad_norm": 0.38675540685653687, + "learning_rate": 0.0006, + "loss": 2.1256, + "step": 42060 + }, + { + "epoch": 0.1569272546869288, + "grad_norm": 0.32017090916633606, + "learning_rate": 0.0006, + "loss": 2.1647, + "step": 42070 + }, + { + "epoch": 0.1569645561498922, + "grad_norm": 0.4309694170951843, + "learning_rate": 0.0006, + "loss": 2.1729, + "step": 42080 + }, + { + "epoch": 0.15700185761285557, + "grad_norm": 0.33771881461143494, + "learning_rate": 0.0006, + "loss": 2.1264, + "step": 42090 + }, + { + "epoch": 0.15703915907581895, + "grad_norm": 0.3696604371070862, + "learning_rate": 0.0006, + "loss": 2.1902, + "step": 42100 + }, + { + "epoch": 0.15707646053878233, + "grad_norm": 0.2725246250629425, + "learning_rate": 0.0006, + "loss": 2.3024, + "step": 42110 + }, + { + "epoch": 0.1571137620017457, + "grad_norm": 0.33574357628822327, + "learning_rate": 0.0006, + "loss": 2.1186, + "step": 42120 + }, + { + "epoch": 0.15715106346470908, + "grad_norm": 0.2559226155281067, + "learning_rate": 0.0006, + "loss": 2.1751, + "step": 42130 + }, + { + "epoch": 0.15718836492767246, + "grad_norm": 0.2810719311237335, + "learning_rate": 0.0006, + "loss": 2.2948, + "step": 42140 + }, + { + "epoch": 0.15722566639063584, + "grad_norm": 0.5298264622688293, + "learning_rate": 0.0006, + "loss": 2.2626, + 
"step": 42150 + }, + { + "epoch": 0.15726296785359922, + "grad_norm": 0.38774964213371277, + "learning_rate": 0.0006, + "loss": 2.0077, + "step": 42160 + }, + { + "epoch": 0.1573002693165626, + "grad_norm": 0.2538364827632904, + "learning_rate": 0.0006, + "loss": 2.2886, + "step": 42170 + }, + { + "epoch": 0.15733757077952598, + "grad_norm": 0.3563176691532135, + "learning_rate": 0.0006, + "loss": 2.1819, + "step": 42180 + }, + { + "epoch": 0.15737487224248936, + "grad_norm": 0.5751330256462097, + "learning_rate": 0.0006, + "loss": 2.3333, + "step": 42190 + }, + { + "epoch": 0.15741217370545274, + "grad_norm": 0.2921817898750305, + "learning_rate": 0.0006, + "loss": 2.2541, + "step": 42200 + }, + { + "epoch": 0.1574494751684161, + "grad_norm": 0.5574577450752258, + "learning_rate": 0.0006, + "loss": 2.234, + "step": 42210 + }, + { + "epoch": 0.15748677663137947, + "grad_norm": 0.2662336528301239, + "learning_rate": 0.0006, + "loss": 2.4163, + "step": 42220 + }, + { + "epoch": 0.15752407809434285, + "grad_norm": 0.39027681946754456, + "learning_rate": 0.0006, + "loss": 2.0722, + "step": 42230 + }, + { + "epoch": 0.15756137955730623, + "grad_norm": 0.6572689414024353, + "learning_rate": 0.0006, + "loss": 2.1611, + "step": 42240 + }, + { + "epoch": 0.1575986810202696, + "grad_norm": 0.35468634963035583, + "learning_rate": 0.0006, + "loss": 2.3508, + "step": 42250 + }, + { + "epoch": 0.1575986810202696, + "eval_valid_loss": 2.1871607303619385, + "eval_valid_loss/all": 2.050299882888794, + "eval_valid_loss/end_span": 1.223235011100769, + "eval_valid_perplexity/batch": 7.770230770111084, + "eval_valid_perplexity/end_span": 3.398163080215454, + "eval_valid_perplexity/fim": 2.347658634185791, + "eval_valid_perplexity/first_seq": 14.996896743774414, + "eval_valid_perplexity/last_seq": 8.938827514648438, + "eval_valid_perplexity/second_seq": 13.592955589294434, + "eval_valid_perplexity/seq": 8.756152153015137, + "eval_valid_reconstruction/all": 0.295373797416687, + 
"eval_valid_reconstruction/end_span": 0.7090997099876404, + "eval_valid_reconstruction/fim": 0.17137247323989868, + "eval_valid_reconstruction/first_seq": 0.16470281779766083, + "eval_valid_reconstruction/last_seq": 0.3270029127597809, + "eval_valid_reconstruction/second_seq": 0.20015136897563934, + "eval_valid_runtime": 540.1975, + "eval_valid_samples_per_second": 0.355, + "eval_valid_steps_per_second": 0.355, + "step": 42250 + }, + { + "epoch": 0.1575986810202696, + "eval_train_loss": 2.1872081756591797, + "eval_train_loss/all": 2.0240468978881836, + "eval_train_loss/end_span": 1.1930731534957886, + "eval_train_perplexity/batch": 7.5688934326171875, + "eval_train_perplexity/end_span": 3.297198534011841, + "eval_train_perplexity/fim": 2.305267095565796, + "eval_train_perplexity/first_seq": 15.331571578979492, + "eval_train_perplexity/last_seq": 8.817431449890137, + "eval_train_perplexity/second_seq": 14.149090766906738, + "eval_train_perplexity/seq": 8.715079307556152, + "eval_train_reconstruction/all": 0.2843683660030365, + "eval_train_reconstruction/end_span": 0.7197965979576111, + "eval_train_reconstruction/fim": 0.1672026515007019, + "eval_train_reconstruction/first_seq": 0.15371249616146088, + "eval_train_reconstruction/last_seq": 0.3301471769809723, + "eval_train_reconstruction/second_seq": 0.1862533688545227, + "eval_train_runtime": 518.2066, + "eval_train_samples_per_second": 0.371, + "eval_train_steps_per_second": 0.371, + "step": 42250 + }, + { + "epoch": 0.157635982483233, + "grad_norm": 0.37553462386131287, + "learning_rate": 0.0006, + "loss": 2.2938, + "step": 42260 + }, + { + "epoch": 0.15767328394619637, + "grad_norm": 0.3786643147468567, + "learning_rate": 0.0006, + "loss": 2.1863, + "step": 42270 + }, + { + "epoch": 0.15771058540915975, + "grad_norm": 0.33092397451400757, + "learning_rate": 0.0006, + "loss": 2.1118, + "step": 42280 + }, + { + "epoch": 0.15774788687212313, + "grad_norm": 0.3602079749107361, + "learning_rate": 0.0006, + "loss": 
2.3032, + "step": 42290 + }, + { + "epoch": 0.1577851883350865, + "grad_norm": 0.4559529721736908, + "learning_rate": 0.0006, + "loss": 2.3396, + "step": 42300 + }, + { + "epoch": 0.1578224897980499, + "grad_norm": 0.335424542427063, + "learning_rate": 0.0006, + "loss": 2.1598, + "step": 42310 + }, + { + "epoch": 0.15785979126101327, + "grad_norm": 0.3725166618824005, + "learning_rate": 0.0006, + "loss": 2.1479, + "step": 42320 + }, + { + "epoch": 0.15789709272397665, + "grad_norm": 0.4805164337158203, + "learning_rate": 0.0006, + "loss": 2.2225, + "step": 42330 + }, + { + "epoch": 0.15793439418694002, + "grad_norm": 0.31335216760635376, + "learning_rate": 0.0006, + "loss": 2.2434, + "step": 42340 + }, + { + "epoch": 0.15797169564990338, + "grad_norm": 0.47280171513557434, + "learning_rate": 0.0006, + "loss": 2.2618, + "step": 42350 + }, + { + "epoch": 0.15800899711286676, + "grad_norm": 0.4238443076610565, + "learning_rate": 0.0006, + "loss": 2.1097, + "step": 42360 + }, + { + "epoch": 0.15804629857583014, + "grad_norm": 0.2213970571756363, + "learning_rate": 0.0006, + "loss": 2.338, + "step": 42370 + }, + { + "epoch": 0.15808360003879351, + "grad_norm": 0.33618640899658203, + "learning_rate": 0.0006, + "loss": 2.2929, + "step": 42380 + }, + { + "epoch": 0.1581209015017569, + "grad_norm": 0.2522968351840973, + "learning_rate": 0.0006, + "loss": 2.2485, + "step": 42390 + }, + { + "epoch": 0.15815820296472027, + "grad_norm": 0.3069884479045868, + "learning_rate": 0.0006, + "loss": 2.178, + "step": 42400 + }, + { + "epoch": 0.15819550442768365, + "grad_norm": 0.3587142825126648, + "learning_rate": 0.0006, + "loss": 2.3013, + "step": 42410 + }, + { + "epoch": 0.15823280589064703, + "grad_norm": 0.4319458603858948, + "learning_rate": 0.0006, + "loss": 2.2892, + "step": 42420 + }, + { + "epoch": 0.1582701073536104, + "grad_norm": 0.28777405619621277, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 42430 + }, + { + "epoch": 0.1583074088165738, + "grad_norm": 
0.31718161702156067, + "learning_rate": 0.0006, + "loss": 2.3884, + "step": 42440 + }, + { + "epoch": 0.15834471027953717, + "grad_norm": 0.3237944543361664, + "learning_rate": 0.0006, + "loss": 2.2398, + "step": 42450 + }, + { + "epoch": 0.15838201174250055, + "grad_norm": 0.3387557566165924, + "learning_rate": 0.0006, + "loss": 2.3177, + "step": 42460 + }, + { + "epoch": 0.15841931320546393, + "grad_norm": 0.31674399971961975, + "learning_rate": 0.0006, + "loss": 2.1229, + "step": 42470 + }, + { + "epoch": 0.1584566146684273, + "grad_norm": 0.4485701024532318, + "learning_rate": 0.0006, + "loss": 2.2603, + "step": 42480 + }, + { + "epoch": 0.15849391613139066, + "grad_norm": 0.34192803502082825, + "learning_rate": 0.0006, + "loss": 2.1317, + "step": 42490 + }, + { + "epoch": 0.15853121759435404, + "grad_norm": 0.39750319719314575, + "learning_rate": 0.0006, + "loss": 2.1433, + "step": 42500 + }, + { + "epoch": 0.15853121759435404, + "eval_valid_loss": 2.1874353885650635, + "eval_valid_loss/all": 2.050985336303711, + "eval_valid_loss/end_span": 1.1896345615386963, + "eval_valid_perplexity/batch": 7.775558948516846, + "eval_valid_perplexity/end_span": 3.2858800888061523, + "eval_valid_perplexity/fim": 2.1547417640686035, + "eval_valid_perplexity/first_seq": 15.040603637695312, + "eval_valid_perplexity/last_seq": 8.71989917755127, + "eval_valid_perplexity/second_seq": 14.084179878234863, + "eval_valid_perplexity/seq": 8.770194053649902, + "eval_valid_reconstruction/all": 0.29485127329826355, + "eval_valid_reconstruction/end_span": 0.7175712585449219, + "eval_valid_reconstruction/fim": 0.15272293984889984, + "eval_valid_reconstruction/first_seq": 0.16267850995063782, + "eval_valid_reconstruction/last_seq": 0.3373171389102936, + "eval_valid_reconstruction/second_seq": 0.18926283717155457, + "eval_valid_runtime": 513.9394, + "eval_valid_samples_per_second": 0.374, + "eval_valid_steps_per_second": 0.374, + "step": 42500 + }, + { + "epoch": 0.15853121759435404, + 
"eval_train_loss": 2.1850883960723877, + "eval_train_loss/all": 2.0221712589263916, + "eval_train_loss/end_span": 1.1625304222106934, + "eval_train_perplexity/batch": 7.554710388183594, + "eval_train_perplexity/end_span": 3.1980154514312744, + "eval_train_perplexity/fim": 2.020212173461914, + "eval_train_perplexity/first_seq": 15.368354797363281, + "eval_train_perplexity/last_seq": 9.103155136108398, + "eval_train_perplexity/second_seq": 14.274526596069336, + "eval_train_perplexity/seq": 8.698027610778809, + "eval_train_reconstruction/all": 0.28468960523605347, + "eval_train_reconstruction/end_span": 0.7253577709197998, + "eval_train_reconstruction/fim": 0.14039316773414612, + "eval_train_reconstruction/first_seq": 0.15609107911586761, + "eval_train_reconstruction/last_seq": 0.32342618703842163, + "eval_train_reconstruction/second_seq": 0.18207989633083344, + "eval_train_runtime": 511.6899, + "eval_train_samples_per_second": 0.375, + "eval_train_steps_per_second": 0.375, + "step": 42500 + }, + { + "epoch": 0.15856851905731742, + "grad_norm": 0.31195491552352905, + "learning_rate": 0.0006, + "loss": 2.2225, + "step": 42510 + }, + { + "epoch": 0.1586058205202808, + "grad_norm": 0.374057412147522, + "learning_rate": 0.0006, + "loss": 2.1306, + "step": 42520 + }, + { + "epoch": 0.15864312198324418, + "grad_norm": 0.43992072343826294, + "learning_rate": 0.0006, + "loss": 2.3809, + "step": 42530 + }, + { + "epoch": 0.15868042344620756, + "grad_norm": 0.46816903352737427, + "learning_rate": 0.0006, + "loss": 2.2108, + "step": 42540 + }, + { + "epoch": 0.15871772490917094, + "grad_norm": 0.38276493549346924, + "learning_rate": 0.0006, + "loss": 2.2421, + "step": 42550 + }, + { + "epoch": 0.15875502637213432, + "grad_norm": 0.4873292148113251, + "learning_rate": 0.0006, + "loss": 2.1911, + "step": 42560 + }, + { + "epoch": 0.1587923278350977, + "grad_norm": 0.5287480354309082, + "learning_rate": 0.0006, + "loss": 2.3293, + "step": 42570 + }, + { + "epoch": 
0.15882962929806108, + "grad_norm": 0.5345765948295593, + "learning_rate": 0.0006, + "loss": 2.3645, + "step": 42580 + }, + { + "epoch": 0.15886693076102446, + "grad_norm": 0.3220521807670593, + "learning_rate": 0.0006, + "loss": 2.1863, + "step": 42590 + }, + { + "epoch": 0.15890423222398783, + "grad_norm": 0.34698399901390076, + "learning_rate": 0.0006, + "loss": 2.2252, + "step": 42600 + }, + { + "epoch": 0.15894153368695121, + "grad_norm": 0.37065213918685913, + "learning_rate": 0.0006, + "loss": 2.1336, + "step": 42610 + }, + { + "epoch": 0.1589788351499146, + "grad_norm": 0.29911649227142334, + "learning_rate": 0.0006, + "loss": 2.3673, + "step": 42620 + }, + { + "epoch": 0.15901613661287795, + "grad_norm": 0.3664100170135498, + "learning_rate": 0.0006, + "loss": 2.2216, + "step": 42630 + }, + { + "epoch": 0.15905343807584132, + "grad_norm": 0.2613829970359802, + "learning_rate": 0.0006, + "loss": 2.1209, + "step": 42640 + }, + { + "epoch": 0.1590907395388047, + "grad_norm": 11.437225341796875, + "learning_rate": 0.0006, + "loss": 2.2115, + "step": 42650 + }, + { + "epoch": 0.15912804100176808, + "grad_norm": 0.31999099254608154, + "learning_rate": 0.0006, + "loss": 2.2241, + "step": 42660 + }, + { + "epoch": 0.15916534246473146, + "grad_norm": 0.31486696004867554, + "learning_rate": 0.0006, + "loss": 2.2284, + "step": 42670 + }, + { + "epoch": 0.15920264392769484, + "grad_norm": 0.29370325803756714, + "learning_rate": 0.0006, + "loss": 2.1908, + "step": 42680 + }, + { + "epoch": 0.15923994539065822, + "grad_norm": 0.4589220881462097, + "learning_rate": 0.0006, + "loss": 2.2278, + "step": 42690 + }, + { + "epoch": 0.1592772468536216, + "grad_norm": 0.3651379942893982, + "learning_rate": 0.0006, + "loss": 2.2806, + "step": 42700 + }, + { + "epoch": 0.15931454831658498, + "grad_norm": 0.42434537410736084, + "learning_rate": 0.0006, + "loss": 2.1513, + "step": 42710 + }, + { + "epoch": 0.15935184977954836, + "grad_norm": 0.37100762128829956, + "learning_rate": 
0.0006, + "loss": 2.2715, + "step": 42720 + }, + { + "epoch": 0.15938915124251174, + "grad_norm": 0.30912038683891296, + "learning_rate": 0.0006, + "loss": 2.1663, + "step": 42730 + }, + { + "epoch": 0.15942645270547512, + "grad_norm": 0.2888425588607788, + "learning_rate": 0.0006, + "loss": 2.1924, + "step": 42740 + }, + { + "epoch": 0.1594637541684385, + "grad_norm": 0.33025726675987244, + "learning_rate": 0.0006, + "loss": 2.2262, + "step": 42750 + }, + { + "epoch": 0.1594637541684385, + "eval_valid_loss": 2.185539484024048, + "eval_valid_loss/all": 2.049025535583496, + "eval_valid_loss/end_span": 1.1671488285064697, + "eval_valid_perplexity/batch": 7.760335445404053, + "eval_valid_perplexity/end_span": 3.2128193378448486, + "eval_valid_perplexity/fim": 2.1886677742004395, + "eval_valid_perplexity/first_seq": 14.746094703674316, + "eval_valid_perplexity/last_seq": 8.733799934387207, + "eval_valid_perplexity/second_seq": 13.92962646484375, + "eval_valid_perplexity/seq": 8.750030517578125, + "eval_valid_reconstruction/all": 0.2959563136100769, + "eval_valid_reconstruction/end_span": 0.7254364490509033, + "eval_valid_reconstruction/fim": 0.15719369053840637, + "eval_valid_reconstruction/first_seq": 0.1713528037071228, + "eval_valid_reconstruction/last_seq": 0.3353404402732849, + "eval_valid_reconstruction/second_seq": 0.19178283214569092, + "eval_valid_runtime": 511.9283, + "eval_valid_samples_per_second": 0.375, + "eval_valid_steps_per_second": 0.375, + "step": 42750 + }, + { + "epoch": 0.1594637541684385, + "eval_train_loss": 2.1838550567626953, + "eval_train_loss/all": 2.0210726261138916, + "eval_train_loss/end_span": 1.1351977586746216, + "eval_train_perplexity/batch": 7.546414852142334, + "eval_train_perplexity/end_span": 3.111788749694824, + "eval_train_perplexity/fim": 2.1127378940582275, + "eval_train_perplexity/first_seq": 15.076574325561523, + "eval_train_perplexity/last_seq": 8.838032722473145, + "eval_train_perplexity/second_seq": 14.321063995361328, + 
"eval_train_perplexity/seq": 8.687773704528809, + "eval_train_reconstruction/all": 0.28554609417915344, + "eval_train_reconstruction/end_span": 0.7363064885139465, + "eval_train_reconstruction/fim": 0.1500784009695053, + "eval_train_reconstruction/first_seq": 0.15825653076171875, + "eval_train_reconstruction/last_seq": 0.3319297730922699, + "eval_train_reconstruction/second_seq": 0.18216629326343536, + "eval_train_runtime": 514.8853, + "eval_train_samples_per_second": 0.373, + "eval_train_steps_per_second": 0.373, + "step": 42750 + }, + { + "epoch": 0.15950105563140185, + "grad_norm": 0.4773378074169159, + "learning_rate": 0.0006, + "loss": 2.1507, + "step": 42760 + }, + { + "epoch": 0.15953835709436523, + "grad_norm": 0.5934918522834778, + "learning_rate": 0.0006, + "loss": 2.2753, + "step": 42770 + }, + { + "epoch": 0.1595756585573286, + "grad_norm": 0.3337015211582184, + "learning_rate": 0.0006, + "loss": 2.29, + "step": 42780 + }, + { + "epoch": 0.159612960020292, + "grad_norm": 0.4409749209880829, + "learning_rate": 0.0006, + "loss": 2.2554, + "step": 42790 + }, + { + "epoch": 0.15965026148325537, + "grad_norm": 0.33157631754875183, + "learning_rate": 0.0006, + "loss": 2.2484, + "step": 42800 + }, + { + "epoch": 0.15968756294621875, + "grad_norm": 0.23964378237724304, + "learning_rate": 0.0006, + "loss": 2.3023, + "step": 42810 + }, + { + "epoch": 0.15972486440918213, + "grad_norm": 0.24865122139453888, + "learning_rate": 0.0006, + "loss": 2.1779, + "step": 42820 + }, + { + "epoch": 0.1597621658721455, + "grad_norm": 0.22258450090885162, + "learning_rate": 0.0006, + "loss": 2.2095, + "step": 42830 + }, + { + "epoch": 0.15979946733510889, + "grad_norm": 0.31802499294281006, + "learning_rate": 0.0006, + "loss": 2.18, + "step": 42840 + }, + { + "epoch": 0.15983676879807227, + "grad_norm": 0.36914703249931335, + "learning_rate": 0.0006, + "loss": 2.1998, + "step": 42850 + }, + { + "epoch": 0.15987407026103564, + "grad_norm": 0.34926456212997437, + "learning_rate": 
0.0006, + "loss": 2.1656, + "step": 42860 + }, + { + "epoch": 0.15991137172399902, + "grad_norm": 0.461929589509964, + "learning_rate": 0.0006, + "loss": 2.2102, + "step": 42870 + }, + { + "epoch": 0.1599486731869624, + "grad_norm": 0.3102858066558838, + "learning_rate": 0.0006, + "loss": 2.0945, + "step": 42880 + }, + { + "epoch": 0.15998597464992578, + "grad_norm": 0.25255799293518066, + "learning_rate": 0.0006, + "loss": 2.1998, + "step": 42890 + }, + { + "epoch": 0.16002327611288913, + "grad_norm": 0.38450106978416443, + "learning_rate": 0.0006, + "loss": 2.2714, + "step": 42900 + }, + { + "epoch": 0.16006057757585251, + "grad_norm": 0.43519654870033264, + "learning_rate": 0.0006, + "loss": 2.2132, + "step": 42910 + }, + { + "epoch": 0.1600978790388159, + "grad_norm": 0.29644864797592163, + "learning_rate": 0.0006, + "loss": 2.2383, + "step": 42920 + }, + { + "epoch": 0.16013518050177927, + "grad_norm": 0.4274751543998718, + "learning_rate": 0.0006, + "loss": 2.0329, + "step": 42930 + }, + { + "epoch": 0.16017248196474265, + "grad_norm": 0.22426524758338928, + "learning_rate": 0.0006, + "loss": 2.1063, + "step": 42940 + }, + { + "epoch": 0.16020978342770603, + "grad_norm": 0.23336075246334076, + "learning_rate": 0.0006, + "loss": 2.4173, + "step": 42950 + }, + { + "epoch": 0.1602470848906694, + "grad_norm": 0.7764258980751038, + "learning_rate": 0.0006, + "loss": 2.2181, + "step": 42960 + }, + { + "epoch": 0.1602843863536328, + "grad_norm": 0.3335942327976227, + "learning_rate": 0.0006, + "loss": 2.2216, + "step": 42970 + }, + { + "epoch": 0.16032168781659617, + "grad_norm": 0.364610493183136, + "learning_rate": 0.0006, + "loss": 2.2556, + "step": 42980 + }, + { + "epoch": 0.16035898927955955, + "grad_norm": 0.4228040874004364, + "learning_rate": 0.0006, + "loss": 2.1794, + "step": 42990 + }, + { + "epoch": 0.16039629074252293, + "grad_norm": 0.2537142336368561, + "learning_rate": 0.0006, + "loss": 2.2614, + "step": 43000 + }, + { + "epoch": 
0.16039629074252293, + "eval_valid_loss": 2.1908531188964844, + "eval_valid_loss/all": 2.0539703369140625, + "eval_valid_loss/end_span": 1.3067923784255981, + "eval_valid_perplexity/batch": 7.798803806304932, + "eval_valid_perplexity/end_span": 3.6943047046661377, + "eval_valid_perplexity/fim": 2.4506635665893555, + "eval_valid_perplexity/first_seq": 14.864166259765625, + "eval_valid_perplexity/last_seq": 8.783968925476074, + "eval_valid_perplexity/second_seq": 13.603144645690918, + "eval_valid_perplexity/seq": 8.79345989227295, + "eval_valid_reconstruction/all": 0.2947414219379425, + "eval_valid_reconstruction/end_span": 0.6934090852737427, + "eval_valid_reconstruction/fim": 0.17809301614761353, + "eval_valid_reconstruction/first_seq": 0.16536279022693634, + "eval_valid_reconstruction/last_seq": 0.3353537321090698, + "eval_valid_reconstruction/second_seq": 0.1983853280544281, + "eval_valid_runtime": 517.0911, + "eval_valid_samples_per_second": 0.371, + "eval_valid_steps_per_second": 0.371, + "step": 43000 + }, + { + "epoch": 0.16039629074252293, + "eval_train_loss": 2.1877753734588623, + "eval_train_loss/all": 2.024113655090332, + "eval_train_loss/end_span": 1.2678487300872803, + "eval_train_perplexity/batch": 7.569398880004883, + "eval_train_perplexity/end_span": 3.5532004833221436, + "eval_train_perplexity/fim": 2.0924229621887207, + "eval_train_perplexity/first_seq": 15.55208969116211, + "eval_train_perplexity/last_seq": 8.962557792663574, + "eval_train_perplexity/second_seq": 14.382782936096191, + "eval_train_perplexity/seq": 8.70976734161377, + "eval_train_reconstruction/all": 0.28473466634750366, + "eval_train_reconstruction/end_span": 0.7068904638290405, + "eval_train_reconstruction/fim": 0.14837856590747833, + "eval_train_reconstruction/first_seq": 0.15052485466003418, + "eval_train_reconstruction/last_seq": 0.32416820526123047, + "eval_train_reconstruction/second_seq": 0.18070478737354279, + "eval_train_runtime": 522.7143, + 
"eval_train_samples_per_second": 0.367, + "eval_train_steps_per_second": 0.367, + "step": 43000 + }, + { + "epoch": 0.1604335922054863, + "grad_norm": 0.4284549653530121, + "learning_rate": 0.0006, + "loss": 2.038, + "step": 43010 + }, + { + "epoch": 0.1604708936684497, + "grad_norm": 0.47963476181030273, + "learning_rate": 0.0006, + "loss": 2.1462, + "step": 43020 + }, + { + "epoch": 0.16050819513141307, + "grad_norm": 0.26766496896743774, + "learning_rate": 0.0006, + "loss": 2.157, + "step": 43030 + }, + { + "epoch": 0.16054549659437642, + "grad_norm": 0.2512650787830353, + "learning_rate": 0.0006, + "loss": 2.1646, + "step": 43040 + }, + { + "epoch": 0.1605827980573398, + "grad_norm": 0.3377945125102997, + "learning_rate": 0.0006, + "loss": 2.3043, + "step": 43050 + }, + { + "epoch": 0.16062009952030318, + "grad_norm": 0.2799924612045288, + "learning_rate": 0.0006, + "loss": 2.2733, + "step": 43060 + }, + { + "epoch": 0.16065740098326656, + "grad_norm": 0.384631872177124, + "learning_rate": 0.0006, + "loss": 2.3395, + "step": 43070 + }, + { + "epoch": 0.16069470244622994, + "grad_norm": 1.3753563165664673, + "learning_rate": 0.0006, + "loss": 2.3121, + "step": 43080 + }, + { + "epoch": 0.16073200390919332, + "grad_norm": 0.28676027059555054, + "learning_rate": 0.0006, + "loss": 2.1614, + "step": 43090 + }, + { + "epoch": 0.1607693053721567, + "grad_norm": 0.3439808189868927, + "learning_rate": 0.0006, + "loss": 2.0788, + "step": 43100 + }, + { + "epoch": 0.16080660683512008, + "grad_norm": 0.5042449235916138, + "learning_rate": 0.0006, + "loss": 2.2832, + "step": 43110 + }, + { + "epoch": 0.16084390829808345, + "grad_norm": 0.30318522453308105, + "learning_rate": 0.0006, + "loss": 2.298, + "step": 43120 + }, + { + "epoch": 0.16088120976104683, + "grad_norm": 0.2965073883533478, + "learning_rate": 0.0006, + "loss": 2.0742, + "step": 43130 + }, + { + "epoch": 0.1609185112240102, + "grad_norm": 0.5500946044921875, + "learning_rate": 0.0006, + "loss": 2.2157, + 
"step": 43140 + }, + { + "epoch": 0.1609558126869736, + "grad_norm": 0.25025010108947754, + "learning_rate": 0.0006, + "loss": 2.3214, + "step": 43150 + }, + { + "epoch": 0.16099311414993697, + "grad_norm": 0.33372142910957336, + "learning_rate": 0.0006, + "loss": 2.2617, + "step": 43160 + }, + { + "epoch": 0.16103041561290035, + "grad_norm": 1.346146583557129, + "learning_rate": 0.0006, + "loss": 2.3137, + "step": 43170 + }, + { + "epoch": 0.1610677170758637, + "grad_norm": 0.3087266683578491, + "learning_rate": 0.0006, + "loss": 2.171, + "step": 43180 + }, + { + "epoch": 0.16110501853882708, + "grad_norm": 0.325833797454834, + "learning_rate": 0.0006, + "loss": 2.1637, + "step": 43190 + }, + { + "epoch": 0.16114232000179046, + "grad_norm": 0.2788829803466797, + "learning_rate": 0.0006, + "loss": 2.2377, + "step": 43200 + }, + { + "epoch": 0.16117962146475384, + "grad_norm": 0.27812430262565613, + "learning_rate": 0.0006, + "loss": 2.2561, + "step": 43210 + }, + { + "epoch": 0.16121692292771722, + "grad_norm": 0.4245210289955139, + "learning_rate": 0.0006, + "loss": 2.2473, + "step": 43220 + }, + { + "epoch": 0.1612542243906806, + "grad_norm": 0.2676950991153717, + "learning_rate": 0.0006, + "loss": 2.2994, + "step": 43230 + }, + { + "epoch": 0.16129152585364398, + "grad_norm": 0.45535925030708313, + "learning_rate": 0.0006, + "loss": 2.252, + "step": 43240 + }, + { + "epoch": 0.16132882731660736, + "grad_norm": 0.34523677825927734, + "learning_rate": 0.0006, + "loss": 2.1277, + "step": 43250 + }, + { + "epoch": 0.16132882731660736, + "eval_valid_loss": 2.183983325958252, + "eval_valid_loss/all": 2.0476295948028564, + "eval_valid_loss/end_span": 1.1525988578796387, + "eval_valid_perplexity/batch": 7.749509811401367, + "eval_valid_perplexity/end_span": 3.1664111614227295, + "eval_valid_perplexity/fim": 2.2966666221618652, + "eval_valid_perplexity/first_seq": 14.70217514038086, + "eval_valid_perplexity/last_seq": 8.77558708190918, + 
"eval_valid_perplexity/second_seq": 13.67396354675293, + "eval_valid_perplexity/seq": 8.734963417053223, + "eval_valid_reconstruction/all": 0.29622772336006165, + "eval_valid_reconstruction/end_span": 0.7314925789833069, + "eval_valid_reconstruction/fim": 0.16776877641677856, + "eval_valid_reconstruction/first_seq": 0.169730544090271, + "eval_valid_reconstruction/last_seq": 0.3323057293891907, + "eval_valid_reconstruction/second_seq": 0.19628240168094635, + "eval_valid_runtime": 514.4625, + "eval_valid_samples_per_second": 0.373, + "eval_valid_steps_per_second": 0.373, + "step": 43250 + }, + { + "epoch": 0.16132882731660736, + "eval_train_loss": 2.1820671558380127, + "eval_train_loss/all": 2.0192720890045166, + "eval_train_loss/end_span": 1.1173484325408936, + "eval_train_perplexity/batch": 7.532839775085449, + "eval_train_perplexity/end_span": 3.0567383766174316, + "eval_train_perplexity/fim": 2.1876795291900635, + "eval_train_perplexity/first_seq": 15.594829559326172, + "eval_train_perplexity/last_seq": 9.069310188293457, + "eval_train_perplexity/second_seq": 14.000137329101562, + "eval_train_perplexity/seq": 8.672636985778809, + "eval_train_reconstruction/all": 0.2858790457248688, + "eval_train_reconstruction/end_span": 0.7441301941871643, + "eval_train_reconstruction/fim": 0.15812230110168457, + "eval_train_reconstruction/first_seq": 0.15316857397556305, + "eval_train_reconstruction/last_seq": 0.32246023416519165, + "eval_train_reconstruction/second_seq": 0.19091656804084778, + "eval_train_runtime": 513.8188, + "eval_train_samples_per_second": 0.374, + "eval_train_steps_per_second": 0.374, + "step": 43250 + }, + { + "epoch": 0.16136612877957074, + "grad_norm": 0.39315053820610046, + "learning_rate": 0.0006, + "loss": 2.3114, + "step": 43260 + }, + { + "epoch": 0.16140343024253412, + "grad_norm": 0.26865214109420776, + "learning_rate": 0.0006, + "loss": 2.3389, + "step": 43270 + }, + { + "epoch": 0.1614407317054975, + "grad_norm": 0.42369577288627625, + 
"learning_rate": 0.0006, + "loss": 2.2453, + "step": 43280 + }, + { + "epoch": 0.16147803316846088, + "grad_norm": 0.36895203590393066, + "learning_rate": 0.0006, + "loss": 2.3794, + "step": 43290 + }, + { + "epoch": 0.16151533463142426, + "grad_norm": 0.4844391345977783, + "learning_rate": 0.0006, + "loss": 2.2727, + "step": 43300 + }, + { + "epoch": 0.1615526360943876, + "grad_norm": 0.6915794014930725, + "learning_rate": 0.0006, + "loss": 2.3047, + "step": 43310 + }, + { + "epoch": 0.161589937557351, + "grad_norm": 0.6275699734687805, + "learning_rate": 0.0006, + "loss": 2.1249, + "step": 43320 + }, + { + "epoch": 0.16162723902031437, + "grad_norm": 0.342941015958786, + "learning_rate": 0.0006, + "loss": 2.3116, + "step": 43330 + }, + { + "epoch": 0.16166454048327775, + "grad_norm": 0.48990875482559204, + "learning_rate": 0.0006, + "loss": 2.007, + "step": 43340 + }, + { + "epoch": 0.16170184194624113, + "grad_norm": 0.35838377475738525, + "learning_rate": 0.0006, + "loss": 2.1901, + "step": 43350 + }, + { + "epoch": 0.1617391434092045, + "grad_norm": 0.25133073329925537, + "learning_rate": 0.0006, + "loss": 2.194, + "step": 43360 + }, + { + "epoch": 0.16177644487216788, + "grad_norm": 0.3624676764011383, + "learning_rate": 0.0006, + "loss": 2.2064, + "step": 43370 + }, + { + "epoch": 0.16181374633513126, + "grad_norm": 0.26279860734939575, + "learning_rate": 0.0006, + "loss": 2.2408, + "step": 43380 + }, + { + "epoch": 0.16185104779809464, + "grad_norm": 0.5211269855499268, + "learning_rate": 0.0006, + "loss": 2.2907, + "step": 43390 + }, + { + "epoch": 0.16188834926105802, + "grad_norm": 0.3567313253879547, + "learning_rate": 0.0006, + "loss": 2.3022, + "step": 43400 + }, + { + "epoch": 0.1619256507240214, + "grad_norm": 0.3720013499259949, + "learning_rate": 0.0006, + "loss": 2.1881, + "step": 43410 + }, + { + "epoch": 0.16196295218698478, + "grad_norm": 0.33193838596343994, + "learning_rate": 0.0006, + "loss": 2.1342, + "step": 43420 + }, + { + "epoch": 
0.16200025364994816, + "grad_norm": 0.2154540717601776, + "learning_rate": 0.0006, + "loss": 2.2335, + "step": 43430 + }, + { + "epoch": 0.16203755511291154, + "grad_norm": 0.3838658332824707, + "learning_rate": 0.0006, + "loss": 2.221, + "step": 43440 + }, + { + "epoch": 0.1620748565758749, + "grad_norm": 0.4244902431964874, + "learning_rate": 0.0006, + "loss": 2.1061, + "step": 43450 + }, + { + "epoch": 0.16211215803883827, + "grad_norm": 0.4889724850654602, + "learning_rate": 0.0006, + "loss": 2.1634, + "step": 43460 + }, + { + "epoch": 0.16214945950180165, + "grad_norm": 0.4848657250404358, + "learning_rate": 0.0006, + "loss": 2.2391, + "step": 43470 + }, + { + "epoch": 0.16218676096476503, + "grad_norm": 0.5609893798828125, + "learning_rate": 0.0006, + "loss": 2.3269, + "step": 43480 + }, + { + "epoch": 0.1622240624277284, + "grad_norm": 0.2642020881175995, + "learning_rate": 0.0006, + "loss": 2.1647, + "step": 43490 + }, + { + "epoch": 0.1622613638906918, + "grad_norm": 0.4097767770290375, + "learning_rate": 0.0006, + "loss": 2.0086, + "step": 43500 + }, + { + "epoch": 0.1622613638906918, + "eval_valid_loss": 2.185673236846924, + "eval_valid_loss/all": 2.049144744873047, + "eval_valid_loss/end_span": 1.173296570777893, + "eval_valid_perplexity/batch": 7.761260509490967, + "eval_valid_perplexity/end_span": 3.2326316833496094, + "eval_valid_perplexity/fim": 2.487180233001709, + "eval_valid_perplexity/first_seq": 14.915382385253906, + "eval_valid_perplexity/last_seq": 9.141350746154785, + "eval_valid_perplexity/second_seq": 13.538269996643066, + "eval_valid_perplexity/seq": 8.75331974029541, + "eval_valid_reconstruction/all": 0.29579800367355347, + "eval_valid_reconstruction/end_span": 0.7305282354354858, + "eval_valid_reconstruction/fim": 0.18329410254955292, + "eval_valid_reconstruction/first_seq": 0.16766391694545746, + "eval_valid_reconstruction/last_seq": 0.3244498670101166, + "eval_valid_reconstruction/second_seq": 0.20396746695041656, + 
"eval_valid_runtime": 526.5913, + "eval_valid_samples_per_second": 0.365, + "eval_valid_steps_per_second": 0.365, + "step": 43500 + }, + { + "epoch": 0.1622613638906918, + "eval_train_loss": 2.183326482772827, + "eval_train_loss/all": 2.0206356048583984, + "eval_train_loss/end_span": 1.1279269456863403, + "eval_train_perplexity/batch": 7.543118000030518, + "eval_train_perplexity/end_span": 3.0892457962036133, + "eval_train_perplexity/fim": 2.011463165283203, + "eval_train_perplexity/first_seq": 15.714953422546387, + "eval_train_perplexity/last_seq": 8.869516372680664, + "eval_train_perplexity/second_seq": 14.319948196411133, + "eval_train_perplexity/seq": 8.68510627746582, + "eval_train_reconstruction/all": 0.2855256199836731, + "eval_train_reconstruction/end_span": 0.7458853721618652, + "eval_train_reconstruction/fim": 0.14086470007896423, + "eval_train_reconstruction/first_seq": 0.14584723114967346, + "eval_train_reconstruction/last_seq": 0.32759788632392883, + "eval_train_reconstruction/second_seq": 0.18094412982463837, + "eval_train_runtime": 528.2342, + "eval_train_samples_per_second": 0.363, + "eval_train_steps_per_second": 0.363, + "step": 43500 + }, + { + "epoch": 0.16229866535365517, + "grad_norm": 0.27312523126602173, + "learning_rate": 0.0006, + "loss": 2.2853, + "step": 43510 + }, + { + "epoch": 0.16233596681661855, + "grad_norm": 0.3408607244491577, + "learning_rate": 0.0006, + "loss": 2.1219, + "step": 43520 + }, + { + "epoch": 0.16237326827958193, + "grad_norm": 0.3200668394565582, + "learning_rate": 0.0006, + "loss": 2.2991, + "step": 43530 + }, + { + "epoch": 0.1624105697425453, + "grad_norm": 0.5846495032310486, + "learning_rate": 0.0006, + "loss": 2.1359, + "step": 43540 + }, + { + "epoch": 0.1624478712055087, + "grad_norm": 0.2791212201118469, + "learning_rate": 0.0006, + "loss": 2.2663, + "step": 43550 + }, + { + "epoch": 0.16248517266847207, + "grad_norm": 0.3288963735103607, + "learning_rate": 0.0006, + "loss": 2.2741, + "step": 43560 + }, + 
{ + "epoch": 0.16252247413143545, + "grad_norm": 0.2976641058921814, + "learning_rate": 0.0006, + "loss": 2.1343, + "step": 43570 + }, + { + "epoch": 0.16255977559439883, + "grad_norm": 0.3764709234237671, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 43580 + }, + { + "epoch": 0.16259707705736218, + "grad_norm": 0.320892333984375, + "learning_rate": 0.0006, + "loss": 2.1618, + "step": 43590 + }, + { + "epoch": 0.16263437852032556, + "grad_norm": 0.2776557505130768, + "learning_rate": 0.0006, + "loss": 2.1228, + "step": 43600 + }, + { + "epoch": 0.16267167998328894, + "grad_norm": 0.35083654522895813, + "learning_rate": 0.0006, + "loss": 2.2906, + "step": 43610 + }, + { + "epoch": 0.16270898144625232, + "grad_norm": 0.5075205564498901, + "learning_rate": 0.0006, + "loss": 2.0913, + "step": 43620 + }, + { + "epoch": 0.1627462829092157, + "grad_norm": 0.3294376730918884, + "learning_rate": 0.0006, + "loss": 2.3363, + "step": 43630 + }, + { + "epoch": 0.16278358437217907, + "grad_norm": 0.26986730098724365, + "learning_rate": 0.0006, + "loss": 2.267, + "step": 43640 + }, + { + "epoch": 0.16282088583514245, + "grad_norm": 0.30368897318840027, + "learning_rate": 0.0006, + "loss": 2.0417, + "step": 43650 + }, + { + "epoch": 0.16285818729810583, + "grad_norm": 0.36759260296821594, + "learning_rate": 0.0006, + "loss": 2.2577, + "step": 43660 + }, + { + "epoch": 0.1628954887610692, + "grad_norm": 0.39152392745018005, + "learning_rate": 0.0006, + "loss": 2.0975, + "step": 43670 + }, + { + "epoch": 0.1629327902240326, + "grad_norm": 0.5436105728149414, + "learning_rate": 0.0006, + "loss": 2.3338, + "step": 43680 + }, + { + "epoch": 0.16297009168699597, + "grad_norm": 0.25730425119400024, + "learning_rate": 0.0006, + "loss": 2.2038, + "step": 43690 + }, + { + "epoch": 0.16300739314995935, + "grad_norm": 0.3548682630062103, + "learning_rate": 0.0006, + "loss": 2.3275, + "step": 43700 + }, + { + "epoch": 0.16304469461292273, + "grad_norm": 0.3061622083187103, + 
"learning_rate": 0.0006, + "loss": 2.2084, + "step": 43710 + }, + { + "epoch": 0.1630819960758861, + "grad_norm": 0.4127502739429474, + "learning_rate": 0.0006, + "loss": 2.1998, + "step": 43720 + }, + { + "epoch": 0.16311929753884946, + "grad_norm": 24.25690460205078, + "learning_rate": 0.0006, + "loss": 2.3384, + "step": 43730 + }, + { + "epoch": 0.16315659900181284, + "grad_norm": 2.530625343322754, + "learning_rate": 0.0006, + "loss": 2.3685, + "step": 43740 + }, + { + "epoch": 0.16319390046477622, + "grad_norm": 0.3955136835575104, + "learning_rate": 0.0006, + "loss": 2.2235, + "step": 43750 + }, + { + "epoch": 0.16319390046477622, + "eval_valid_loss": 2.1960625648498535, + "eval_valid_loss/all": 2.0584404468536377, + "eval_valid_loss/end_span": 1.2986690998077393, + "eval_valid_perplexity/batch": 7.833743095397949, + "eval_valid_perplexity/end_span": 3.664416551589966, + "eval_valid_perplexity/fim": 2.252821207046509, + "eval_valid_perplexity/first_seq": 14.846256256103516, + "eval_valid_perplexity/last_seq": 8.87098217010498, + "eval_valid_perplexity/second_seq": 13.806631088256836, + "eval_valid_perplexity/seq": 8.832596778869629, + "eval_valid_reconstruction/all": 0.29293331503868103, + "eval_valid_reconstruction/end_span": 0.6953599452972412, + "eval_valid_reconstruction/fim": 0.1612585037946701, + "eval_valid_reconstruction/first_seq": 0.16968505084514618, + "eval_valid_reconstruction/last_seq": 0.3329846262931824, + "eval_valid_reconstruction/second_seq": 0.19144411385059357, + "eval_valid_runtime": 534.0145, + "eval_valid_samples_per_second": 0.36, + "eval_valid_steps_per_second": 0.36, + "step": 43750 + }, + { + "epoch": 0.16319390046477622, + "eval_train_loss": 2.194458484649658, + "eval_train_loss/all": 2.0308666229248047, + "eval_train_loss/end_span": 1.2724125385284424, + "eval_train_perplexity/batch": 7.620687961578369, + "eval_train_perplexity/end_span": 3.569453716278076, + "eval_train_perplexity/fim": 1.9110217094421387, + 
"eval_train_perplexity/first_seq": 15.638373374938965, + "eval_train_perplexity/last_seq": 8.647193908691406, + "eval_train_perplexity/second_seq": 13.707037925720215, + "eval_train_perplexity/seq": 8.779727935791016, + "eval_train_reconstruction/all": 0.2823913097381592, + "eval_train_reconstruction/end_span": 0.7059988975524902, + "eval_train_reconstruction/fim": 0.1283276379108429, + "eval_train_reconstruction/first_seq": 0.14816845953464508, + "eval_train_reconstruction/last_seq": 0.3370746076107025, + "eval_train_reconstruction/second_seq": 0.1979232281446457, + "eval_train_runtime": 527.9716, + "eval_train_samples_per_second": 0.364, + "eval_train_steps_per_second": 0.364, + "step": 43750 + }, + { + "epoch": 0.1632312019277396, + "grad_norm": 0.28717172145843506, + "learning_rate": 0.0006, + "loss": 2.0562, + "step": 43760 + }, + { + "epoch": 0.16326850339070298, + "grad_norm": 0.23554570972919464, + "learning_rate": 0.0006, + "loss": 2.2047, + "step": 43770 + }, + { + "epoch": 0.16330580485366636, + "grad_norm": 1.0638519525527954, + "learning_rate": 0.0006, + "loss": 2.1324, + "step": 43780 + }, + { + "epoch": 0.16334310631662974, + "grad_norm": 0.4107969105243683, + "learning_rate": 0.0006, + "loss": 2.2873, + "step": 43790 + }, + { + "epoch": 0.16338040777959312, + "grad_norm": 0.34240293502807617, + "learning_rate": 0.0006, + "loss": 2.2395, + "step": 43800 + }, + { + "epoch": 0.1634177092425565, + "grad_norm": 6.378195285797119, + "learning_rate": 0.0006, + "loss": 2.2183, + "step": 43810 + }, + { + "epoch": 0.16345501070551988, + "grad_norm": 0.29458048939704895, + "learning_rate": 0.0006, + "loss": 2.099, + "step": 43820 + }, + { + "epoch": 0.16349231216848326, + "grad_norm": 0.2455892115831375, + "learning_rate": 0.0006, + "loss": 2.4435, + "step": 43830 + }, + { + "epoch": 0.16352961363144664, + "grad_norm": 5.865114688873291, + "learning_rate": 0.0006, + "loss": 2.1385, + "step": 43840 + }, + { + "epoch": 0.16356691509441001, + "grad_norm": 
0.38769587874412537, + "learning_rate": 0.0006, + "loss": 2.1373, + "step": 43850 + }, + { + "epoch": 0.16360421655737337, + "grad_norm": 0.37385109066963196, + "learning_rate": 0.0006, + "loss": 2.3081, + "step": 43860 + }, + { + "epoch": 0.16364151802033675, + "grad_norm": 0.21432574093341827, + "learning_rate": 0.0006, + "loss": 2.2356, + "step": 43870 + }, + { + "epoch": 0.16367881948330013, + "grad_norm": 0.36073657870292664, + "learning_rate": 0.0006, + "loss": 2.3019, + "step": 43880 + }, + { + "epoch": 0.1637161209462635, + "grad_norm": 0.36110132932662964, + "learning_rate": 0.0006, + "loss": 2.3646, + "step": 43890 + }, + { + "epoch": 0.16375342240922688, + "grad_norm": 0.34165695309638977, + "learning_rate": 0.0006, + "loss": 2.1528, + "step": 43900 + }, + { + "epoch": 0.16379072387219026, + "grad_norm": 0.47023865580558777, + "learning_rate": 0.0006, + "loss": 2.2035, + "step": 43910 + }, + { + "epoch": 0.16382802533515364, + "grad_norm": 0.22254666686058044, + "learning_rate": 0.0006, + "loss": 2.1455, + "step": 43920 + }, + { + "epoch": 0.16386532679811702, + "grad_norm": 0.42084798216819763, + "learning_rate": 0.0006, + "loss": 2.2217, + "step": 43930 + }, + { + "epoch": 0.1639026282610804, + "grad_norm": 0.4182269871234894, + "learning_rate": 0.0006, + "loss": 2.071, + "step": 43940 + }, + { + "epoch": 0.16393992972404378, + "grad_norm": 0.33549097180366516, + "learning_rate": 0.0006, + "loss": 2.1987, + "step": 43950 + }, + { + "epoch": 0.16397723118700716, + "grad_norm": 0.2981192469596863, + "learning_rate": 0.0006, + "loss": 2.0379, + "step": 43960 + }, + { + "epoch": 0.16401453264997054, + "grad_norm": 0.3339776396751404, + "learning_rate": 0.0006, + "loss": 2.2096, + "step": 43970 + }, + { + "epoch": 0.16405183411293392, + "grad_norm": 0.42251572012901306, + "learning_rate": 0.0006, + "loss": 2.2519, + "step": 43980 + }, + { + "epoch": 0.1640891355758973, + "grad_norm": 0.5374317765235901, + "learning_rate": 0.0006, + "loss": 2.1686, + "step": 
43990 + }, + { + "epoch": 0.16412643703886065, + "grad_norm": 2.810919761657715, + "learning_rate": 0.0006, + "loss": 2.1434, + "step": 44000 + }, + { + "epoch": 0.16412643703886065, + "eval_valid_loss": 2.211355447769165, + "eval_valid_loss/all": 2.073230028152466, + "eval_valid_loss/end_span": 1.3004051446914673, + "eval_valid_perplexity/batch": 7.9504618644714355, + "eval_valid_perplexity/end_span": 3.670783519744873, + "eval_valid_perplexity/fim": 2.2938859462738037, + "eval_valid_perplexity/first_seq": 14.89283275604248, + "eval_valid_perplexity/last_seq": 9.126981735229492, + "eval_valid_perplexity/second_seq": 13.866843223571777, + "eval_valid_perplexity/seq": 8.979324340820312, + "eval_valid_reconstruction/all": 0.28895923495292664, + "eval_valid_reconstruction/end_span": 0.6899304389953613, + "eval_valid_reconstruction/fim": 0.16145023703575134, + "eval_valid_reconstruction/first_seq": 0.16460229456424713, + "eval_valid_reconstruction/last_seq": 0.32422521710395813, + "eval_valid_reconstruction/second_seq": 0.1914728730916977, + "eval_valid_runtime": 527.0713, + "eval_valid_samples_per_second": 0.364, + "eval_valid_steps_per_second": 0.364, + "step": 44000 + }, + { + "epoch": 0.16412643703886065, + "eval_train_loss": 2.2057158946990967, + "eval_train_loss/all": 2.04130482673645, + "eval_train_loss/end_span": 1.2624484300613403, + "eval_train_perplexity/batch": 7.700650691986084, + "eval_train_perplexity/end_span": 3.5340638160705566, + "eval_train_perplexity/fim": 2.072420835494995, + "eval_train_perplexity/first_seq": 15.743818283081055, + "eval_train_perplexity/last_seq": 9.382055282592773, + "eval_train_perplexity/second_seq": 14.610664367675781, + "eval_train_perplexity/seq": 8.880501747131348, + "eval_train_reconstruction/all": 0.27959826588630676, + "eval_train_reconstruction/end_span": 0.7001505494117737, + "eval_train_reconstruction/fim": 0.14282186329364777, + "eval_train_reconstruction/first_seq": 0.1452123373746872, + 
"eval_train_reconstruction/last_seq": 0.31375741958618164, + "eval_train_reconstruction/second_seq": 0.17719829082489014, + "eval_train_runtime": 520.8233, + "eval_train_samples_per_second": 0.369, + "eval_train_steps_per_second": 0.369, + "step": 44000 + }, + { + "epoch": 0.16416373850182403, + "grad_norm": 0.40243032574653625, + "learning_rate": 0.0006, + "loss": 2.2928, + "step": 44010 + }, + { + "epoch": 0.1642010399647874, + "grad_norm": 0.4470853805541992, + "learning_rate": 0.0006, + "loss": 2.0844, + "step": 44020 + }, + { + "epoch": 0.1642383414277508, + "grad_norm": 1.3313966989517212, + "learning_rate": 0.0006, + "loss": 2.2286, + "step": 44030 + }, + { + "epoch": 0.16427564289071417, + "grad_norm": 2.386127233505249, + "learning_rate": 0.0006, + "loss": 2.3651, + "step": 44040 + }, + { + "epoch": 0.16431294435367755, + "grad_norm": 0.406345933675766, + "learning_rate": 0.0006, + "loss": 2.3143, + "step": 44050 + }, + { + "epoch": 0.16435024581664093, + "grad_norm": 0.2442510724067688, + "learning_rate": 0.0006, + "loss": 2.144, + "step": 44060 + }, + { + "epoch": 0.1643875472796043, + "grad_norm": 0.3421202600002289, + "learning_rate": 0.0006, + "loss": 2.2928, + "step": 44070 + }, + { + "epoch": 0.1644248487425677, + "grad_norm": 1.0100477933883667, + "learning_rate": 0.0006, + "loss": 2.1988, + "step": 44080 + }, + { + "epoch": 0.16446215020553107, + "grad_norm": 0.35781922936439514, + "learning_rate": 0.0006, + "loss": 2.2172, + "step": 44090 + }, + { + "epoch": 0.16449945166849445, + "grad_norm": 0.4610334634780884, + "learning_rate": 0.0006, + "loss": 2.1597, + "step": 44100 + }, + { + "epoch": 0.16453675313145782, + "grad_norm": 0.6509600281715393, + "learning_rate": 0.0006, + "loss": 2.2502, + "step": 44110 + }, + { + "epoch": 0.1645740545944212, + "grad_norm": 0.5235219597816467, + "learning_rate": 0.0006, + "loss": 2.2564, + "step": 44120 + }, + { + "epoch": 0.16461135605738458, + "grad_norm": 0.35118767619132996, + "learning_rate": 0.0006, + 
"loss": 1.951, + "step": 44130 + }, + { + "epoch": 0.16464865752034794, + "grad_norm": 0.33683252334594727, + "learning_rate": 0.0006, + "loss": 2.2384, + "step": 44140 + }, + { + "epoch": 0.16468595898331131, + "grad_norm": 0.27493512630462646, + "learning_rate": 0.0006, + "loss": 2.3388, + "step": 44150 + }, + { + "epoch": 0.1647232604462747, + "grad_norm": 0.35580387711524963, + "learning_rate": 0.0006, + "loss": 2.1224, + "step": 44160 + }, + { + "epoch": 0.16476056190923807, + "grad_norm": 0.5559250712394714, + "learning_rate": 0.0006, + "loss": 1.9908, + "step": 44170 + }, + { + "epoch": 0.16479786337220145, + "grad_norm": 0.34572264552116394, + "learning_rate": 0.0006, + "loss": 2.1558, + "step": 44180 + }, + { + "epoch": 0.16483516483516483, + "grad_norm": 0.36564746499061584, + "learning_rate": 0.0006, + "loss": 2.2432, + "step": 44190 + }, + { + "epoch": 0.1648724662981282, + "grad_norm": 0.29997071623802185, + "learning_rate": 0.0006, + "loss": 2.2699, + "step": 44200 + }, + { + "epoch": 0.1649097677610916, + "grad_norm": 0.2782275974750519, + "learning_rate": 0.0006, + "loss": 2.2162, + "step": 44210 + }, + { + "epoch": 0.16494706922405497, + "grad_norm": 0.31083205342292786, + "learning_rate": 0.0006, + "loss": 2.2546, + "step": 44220 + }, + { + "epoch": 0.16498437068701835, + "grad_norm": 0.3217261731624603, + "learning_rate": 0.0006, + "loss": 2.121, + "step": 44230 + }, + { + "epoch": 0.16502167214998173, + "grad_norm": 0.386825829744339, + "learning_rate": 0.0006, + "loss": 1.9666, + "step": 44240 + }, + { + "epoch": 0.1650589736129451, + "grad_norm": 0.3029041290283203, + "learning_rate": 0.0006, + "loss": 2.3134, + "step": 44250 + }, + { + "epoch": 0.1650589736129451, + "eval_valid_loss": 2.190117835998535, + "eval_valid_loss/all": 2.053595542907715, + "eval_valid_loss/end_span": 1.283660650253296, + "eval_valid_perplexity/batch": 7.795881271362305, + "eval_valid_perplexity/end_span": 3.609829902648926, + "eval_valid_perplexity/fim": 
2.528684616088867, + "eval_valid_perplexity/first_seq": 14.78184700012207, + "eval_valid_perplexity/last_seq": 9.068668365478516, + "eval_valid_perplexity/second_seq": 13.83298397064209, + "eval_valid_perplexity/seq": 8.800334930419922, + "eval_valid_reconstruction/all": 0.29473617672920227, + "eval_valid_reconstruction/end_span": 0.7009366154670715, + "eval_valid_reconstruction/fim": 0.18586072325706482, + "eval_valid_reconstruction/first_seq": 0.17086367309093475, + "eval_valid_reconstruction/last_seq": 0.32631418108940125, + "eval_valid_reconstruction/second_seq": 0.19834773242473602, + "eval_valid_runtime": 522.323, + "eval_valid_samples_per_second": 0.368, + "eval_valid_steps_per_second": 0.368, + "step": 44250 + }, + { + "epoch": 0.1650589736129451, + "eval_train_loss": 2.1868176460266113, + "eval_train_loss/all": 2.0242080688476562, + "eval_train_loss/end_span": 1.2362614870071411, + "eval_train_perplexity/batch": 7.570113658905029, + "eval_train_perplexity/end_span": 3.442718744277954, + "eval_train_perplexity/fim": 1.9300462007522583, + "eval_train_perplexity/first_seq": 15.45639419555664, + "eval_train_perplexity/last_seq": 8.652873992919922, + "eval_train_perplexity/second_seq": 14.4544095993042, + "eval_train_perplexity/seq": 8.724528312683105, + "eval_train_reconstruction/all": 0.28441736102104187, + "eval_train_reconstruction/end_span": 0.7146387100219727, + "eval_train_reconstruction/fim": 0.13202226161956787, + "eval_train_reconstruction/first_seq": 0.15534059703350067, + "eval_train_reconstruction/last_seq": 0.33746519684791565, + "eval_train_reconstruction/second_seq": 0.17803020775318146, + "eval_train_runtime": 518.7506, + "eval_train_samples_per_second": 0.37, + "eval_train_steps_per_second": 0.37, + "step": 44250 + }, + { + "epoch": 0.1650962750759085, + "grad_norm": 0.2925720810890198, + "learning_rate": 0.0006, + "loss": 2.2951, + "step": 44260 + }, + { + "epoch": 0.16513357653887187, + "grad_norm": 0.35787710547447205, + "learning_rate": 
0.0006, + "loss": 2.2786, + "step": 44270 + }, + { + "epoch": 0.16517087800183522, + "grad_norm": 0.41796615719795227, + "learning_rate": 0.0006, + "loss": 2.2738, + "step": 44280 + }, + { + "epoch": 0.1652081794647986, + "grad_norm": 0.4725489914417267, + "learning_rate": 0.0006, + "loss": 2.0971, + "step": 44290 + }, + { + "epoch": 0.16524548092776198, + "grad_norm": 0.46801695227622986, + "learning_rate": 0.0006, + "loss": 2.245, + "step": 44300 + }, + { + "epoch": 0.16528278239072536, + "grad_norm": 0.3791666328907013, + "learning_rate": 0.0006, + "loss": 2.1163, + "step": 44310 + }, + { + "epoch": 0.16532008385368874, + "grad_norm": 0.4808207154273987, + "learning_rate": 0.0006, + "loss": 2.1949, + "step": 44320 + }, + { + "epoch": 0.16535738531665212, + "grad_norm": 0.4196471571922302, + "learning_rate": 0.0006, + "loss": 2.3316, + "step": 44330 + }, + { + "epoch": 0.1653946867796155, + "grad_norm": 0.34845560789108276, + "learning_rate": 0.0006, + "loss": 2.2104, + "step": 44340 + }, + { + "epoch": 0.16543198824257888, + "grad_norm": 0.3097243905067444, + "learning_rate": 0.0006, + "loss": 2.0103, + "step": 44350 + }, + { + "epoch": 0.16546928970554226, + "grad_norm": 0.3346373438835144, + "learning_rate": 0.0006, + "loss": 2.2248, + "step": 44360 + }, + { + "epoch": 0.16550659116850563, + "grad_norm": 0.3849191963672638, + "learning_rate": 0.0006, + "loss": 2.099, + "step": 44370 + }, + { + "epoch": 0.16554389263146901, + "grad_norm": 0.3597859740257263, + "learning_rate": 0.0006, + "loss": 2.0777, + "step": 44380 + }, + { + "epoch": 0.1655811940944324, + "grad_norm": 0.40605804324150085, + "learning_rate": 0.0006, + "loss": 2.3486, + "step": 44390 + }, + { + "epoch": 0.16561849555739577, + "grad_norm": 0.40017321705818176, + "learning_rate": 0.0006, + "loss": 2.1958, + "step": 44400 + }, + { + "epoch": 0.16565579702035915, + "grad_norm": 0.47686758637428284, + "learning_rate": 0.0006, + "loss": 2.3198, + "step": 44410 + }, + { + "epoch": 
0.1656930984833225, + "grad_norm": 0.30497801303863525, + "learning_rate": 0.0006, + "loss": 2.1805, + "step": 44420 + }, + { + "epoch": 0.16573039994628588, + "grad_norm": 0.3173586428165436, + "learning_rate": 0.0006, + "loss": 2.225, + "step": 44430 + }, + { + "epoch": 0.16576770140924926, + "grad_norm": 0.4218086898326874, + "learning_rate": 0.0006, + "loss": 2.3318, + "step": 44440 + }, + { + "epoch": 0.16580500287221264, + "grad_norm": 0.2768164575099945, + "learning_rate": 0.0006, + "loss": 2.2071, + "step": 44450 + }, + { + "epoch": 0.16584230433517602, + "grad_norm": 0.9244266152381897, + "learning_rate": 0.0006, + "loss": 2.0722, + "step": 44460 + }, + { + "epoch": 0.1658796057981394, + "grad_norm": 0.18949870765209198, + "learning_rate": 0.0006, + "loss": 2.3054, + "step": 44470 + }, + { + "epoch": 0.16591690726110278, + "grad_norm": 0.414249986410141, + "learning_rate": 0.0006, + "loss": 2.2461, + "step": 44480 + }, + { + "epoch": 0.16595420872406616, + "grad_norm": 0.24480250477790833, + "learning_rate": 0.0006, + "loss": 2.1665, + "step": 44490 + }, + { + "epoch": 0.16599151018702954, + "grad_norm": 0.21058523654937744, + "learning_rate": 0.0006, + "loss": 2.2566, + "step": 44500 + }, + { + "epoch": 0.16599151018702954, + "eval_valid_loss": 2.1869802474975586, + "eval_valid_loss/all": 2.050044059753418, + "eval_valid_loss/end_span": 1.3030812740325928, + "eval_valid_perplexity/batch": 7.768243312835693, + "eval_valid_perplexity/end_span": 3.6806201934814453, + "eval_valid_perplexity/fim": 2.612574815750122, + "eval_valid_perplexity/first_seq": 15.213489532470703, + "eval_valid_perplexity/last_seq": 8.765212059020996, + "eval_valid_perplexity/second_seq": 14.077919006347656, + "eval_valid_perplexity/seq": 8.75786304473877, + "eval_valid_reconstruction/all": 0.2955585718154907, + "eval_valid_reconstruction/end_span": 0.6886358261108398, + "eval_valid_reconstruction/fim": 0.19319890439510345, + "eval_valid_reconstruction/first_seq": 0.15860934555530548, 
+ "eval_valid_reconstruction/last_seq": 0.3304249942302704, + "eval_valid_reconstruction/second_seq": 0.19059380888938904, + "eval_valid_runtime": 522.2704, + "eval_valid_samples_per_second": 0.368, + "eval_valid_steps_per_second": 0.368, + "step": 44500 + }, + { + "epoch": 0.16599151018702954, + "eval_train_loss": 2.1858291625976562, + "eval_train_loss/all": 2.0228331089019775, + "eval_train_loss/end_span": 1.2632752656936646, + "eval_train_perplexity/batch": 7.559711933135986, + "eval_train_perplexity/end_span": 3.536987066268921, + "eval_train_perplexity/fim": 2.0427374839782715, + "eval_train_perplexity/first_seq": 15.592996597290039, + "eval_train_perplexity/last_seq": 8.475619316101074, + "eval_train_perplexity/second_seq": 14.255858421325684, + "eval_train_perplexity/seq": 8.70773983001709, + "eval_train_reconstruction/all": 0.28499042987823486, + "eval_train_reconstruction/end_span": 0.7004817128181458, + "eval_train_reconstruction/fim": 0.14295858144760132, + "eval_train_reconstruction/first_seq": 0.15048320591449738, + "eval_train_reconstruction/last_seq": 0.3450407385826111, + "eval_train_reconstruction/second_seq": 0.1833483725786209, + "eval_train_runtime": 514.3442, + "eval_train_samples_per_second": 0.373, + "eval_train_steps_per_second": 0.373, + "step": 44500 + }, + { + "epoch": 0.16602881164999292, + "grad_norm": 0.36870378255844116, + "learning_rate": 0.0006, + "loss": 2.2717, + "step": 44510 + }, + { + "epoch": 0.1660661131129563, + "grad_norm": 0.44309064745903015, + "learning_rate": 0.0006, + "loss": 2.3643, + "step": 44520 + }, + { + "epoch": 0.16610341457591968, + "grad_norm": 0.3138805627822876, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 44530 + }, + { + "epoch": 0.16614071603888306, + "grad_norm": 0.6387563347816467, + "learning_rate": 0.0006, + "loss": 2.3135, + "step": 44540 + }, + { + "epoch": 0.1661780175018464, + "grad_norm": 0.6811872720718384, + "learning_rate": 0.0006, + "loss": 2.338, + "step": 44550 + }, + { + 
"epoch": 0.1662153189648098, + "grad_norm": 0.29846978187561035, + "learning_rate": 0.0006, + "loss": 2.3024, + "step": 44560 + }, + { + "epoch": 0.16625262042777317, + "grad_norm": 0.28854116797447205, + "learning_rate": 0.0006, + "loss": 2.1122, + "step": 44570 + }, + { + "epoch": 0.16628992189073655, + "grad_norm": 0.3678998649120331, + "learning_rate": 0.0006, + "loss": 2.031, + "step": 44580 + }, + { + "epoch": 0.16632722335369993, + "grad_norm": 0.391569584608078, + "learning_rate": 0.0006, + "loss": 2.2241, + "step": 44590 + }, + { + "epoch": 0.1663645248166633, + "grad_norm": 0.399095356464386, + "learning_rate": 0.0006, + "loss": 2.2902, + "step": 44600 + }, + { + "epoch": 0.16640182627962669, + "grad_norm": 0.3058154284954071, + "learning_rate": 0.0006, + "loss": 2.1803, + "step": 44610 + }, + { + "epoch": 0.16643912774259007, + "grad_norm": 0.4336487650871277, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 44620 + }, + { + "epoch": 0.16647642920555344, + "grad_norm": 0.34947192668914795, + "learning_rate": 0.0006, + "loss": 2.1843, + "step": 44630 + }, + { + "epoch": 0.16651373066851682, + "grad_norm": 0.34769168496131897, + "learning_rate": 0.0006, + "loss": 1.9994, + "step": 44640 + }, + { + "epoch": 0.1665510321314802, + "grad_norm": 0.2356233447790146, + "learning_rate": 0.0006, + "loss": 2.2834, + "step": 44650 + }, + { + "epoch": 0.16658833359444358, + "grad_norm": 0.44321972131729126, + "learning_rate": 0.0006, + "loss": 2.2991, + "step": 44660 + }, + { + "epoch": 0.16662563505740696, + "grad_norm": 0.34543123841285706, + "learning_rate": 0.0006, + "loss": 2.2366, + "step": 44670 + }, + { + "epoch": 0.16666293652037034, + "grad_norm": 0.2617637813091278, + "learning_rate": 0.0006, + "loss": 2.3905, + "step": 44680 + }, + { + "epoch": 0.1667002379833337, + "grad_norm": 0.32747378945350647, + "learning_rate": 0.0006, + "loss": 1.9775, + "step": 44690 + }, + { + "epoch": 0.16673753944629707, + "grad_norm": 0.4256947636604309, + 
"learning_rate": 0.0006, + "loss": 2.1739, + "step": 44700 + }, + { + "epoch": 0.16677484090926045, + "grad_norm": 0.49412038922309875, + "learning_rate": 0.0006, + "loss": 2.1749, + "step": 44710 + }, + { + "epoch": 0.16681214237222383, + "grad_norm": 0.5434571504592896, + "learning_rate": 0.0006, + "loss": 2.222, + "step": 44720 + }, + { + "epoch": 0.1668494438351872, + "grad_norm": 0.28619125485420227, + "learning_rate": 0.0006, + "loss": 2.1117, + "step": 44730 + }, + { + "epoch": 0.1668867452981506, + "grad_norm": 0.3804386556148529, + "learning_rate": 0.0006, + "loss": 2.1862, + "step": 44740 + }, + { + "epoch": 0.16692404676111397, + "grad_norm": 0.45968154072761536, + "learning_rate": 0.0006, + "loss": 2.1502, + "step": 44750 + }, + { + "epoch": 0.16692404676111397, + "eval_valid_loss": 2.186236619949341, + "eval_valid_loss/all": 2.0499234199523926, + "eval_valid_loss/end_span": 1.225448727607727, + "eval_valid_perplexity/batch": 7.767306327819824, + "eval_valid_perplexity/end_span": 3.405694007873535, + "eval_valid_perplexity/fim": 2.632249355316162, + "eval_valid_perplexity/first_seq": 14.645980834960938, + "eval_valid_perplexity/last_seq": 9.247878074645996, + "eval_valid_perplexity/second_seq": 13.70533561706543, + "eval_valid_perplexity/seq": 8.763406753540039, + "eval_valid_reconstruction/all": 0.2957008481025696, + "eval_valid_reconstruction/end_span": 0.7217251062393188, + "eval_valid_reconstruction/fim": 0.19392907619476318, + "eval_valid_reconstruction/first_seq": 0.17330491542816162, + "eval_valid_reconstruction/last_seq": 0.31783920526504517, + "eval_valid_reconstruction/second_seq": 0.1980585902929306, + "eval_valid_runtime": 516.6736, + "eval_valid_samples_per_second": 0.372, + "eval_valid_steps_per_second": 0.372, + "step": 44750 + }, + { + "epoch": 0.16692404676111397, + "eval_train_loss": 2.185336112976074, + "eval_train_loss/all": 2.0225632190704346, + "eval_train_loss/end_span": 1.190328598022461, + "eval_train_perplexity/batch": 
7.557672023773193, + "eval_train_perplexity/end_span": 3.288161516189575, + "eval_train_perplexity/fim": 2.2454817295074463, + "eval_train_perplexity/first_seq": 15.597707748413086, + "eval_train_perplexity/last_seq": 8.890542030334473, + "eval_train_perplexity/second_seq": 14.556697845458984, + "eval_train_perplexity/seq": 8.70975399017334, + "eval_train_reconstruction/all": 0.2849092483520508, + "eval_train_reconstruction/end_span": 0.7297704219818115, + "eval_train_reconstruction/fim": 0.162730872631073, + "eval_train_reconstruction/first_seq": 0.14898821711540222, + "eval_train_reconstruction/last_seq": 0.3282206356525421, + "eval_train_reconstruction/second_seq": 0.17730148136615753, + "eval_train_runtime": 522.962, + "eval_train_samples_per_second": 0.367, + "eval_train_steps_per_second": 0.367, + "step": 44750 + }, + { + "epoch": 0.16696134822407735, + "grad_norm": 0.4824735224246979, + "learning_rate": 0.0006, + "loss": 2.2895, + "step": 44760 + }, + { + "epoch": 0.16699864968704073, + "grad_norm": 0.5017778277397156, + "learning_rate": 0.0006, + "loss": 2.3454, + "step": 44770 + }, + { + "epoch": 0.1670359511500041, + "grad_norm": 0.33952799439430237, + "learning_rate": 0.0006, + "loss": 2.2603, + "step": 44780 + }, + { + "epoch": 0.1670732526129675, + "grad_norm": 0.28090324997901917, + "learning_rate": 0.0006, + "loss": 2.23, + "step": 44790 + }, + { + "epoch": 0.16711055407593087, + "grad_norm": 0.3103366494178772, + "learning_rate": 0.0006, + "loss": 2.2796, + "step": 44800 + }, + { + "epoch": 0.16714785553889425, + "grad_norm": 0.29185473918914795, + "learning_rate": 0.0006, + "loss": 2.1872, + "step": 44810 + }, + { + "epoch": 0.16718515700185763, + "grad_norm": 0.28141212463378906, + "learning_rate": 0.0006, + "loss": 2.1899, + "step": 44820 + }, + { + "epoch": 0.16722245846482098, + "grad_norm": 0.36685711145401, + "learning_rate": 0.0006, + "loss": 2.1893, + "step": 44830 + }, + { + "epoch": 0.16725975992778436, + "grad_norm": 0.5631895065307617, 
+ "learning_rate": 0.0006, + "loss": 2.2891, + "step": 44840 + }, + { + "epoch": 0.16729706139074774, + "grad_norm": 0.37981483340263367, + "learning_rate": 0.0006, + "loss": 2.3831, + "step": 44850 + }, + { + "epoch": 0.16733436285371112, + "grad_norm": 0.5685669779777527, + "learning_rate": 0.0006, + "loss": 2.2268, + "step": 44860 + }, + { + "epoch": 0.1673716643166745, + "grad_norm": 0.45588430762290955, + "learning_rate": 0.0006, + "loss": 2.0569, + "step": 44870 + }, + { + "epoch": 0.16740896577963787, + "grad_norm": 0.3481634557247162, + "learning_rate": 0.0006, + "loss": 2.0597, + "step": 44880 + }, + { + "epoch": 0.16744626724260125, + "grad_norm": 0.3292698860168457, + "learning_rate": 0.0006, + "loss": 2.2199, + "step": 44890 + }, + { + "epoch": 0.16748356870556463, + "grad_norm": 0.3608386814594269, + "learning_rate": 0.0006, + "loss": 2.298, + "step": 44900 + }, + { + "epoch": 0.167520870168528, + "grad_norm": 0.401458203792572, + "learning_rate": 0.0006, + "loss": 2.1655, + "step": 44910 + }, + { + "epoch": 0.1675581716314914, + "grad_norm": 0.2114928662776947, + "learning_rate": 0.0006, + "loss": 2.2014, + "step": 44920 + }, + { + "epoch": 0.16759547309445477, + "grad_norm": 0.21553094685077667, + "learning_rate": 0.0006, + "loss": 2.3708, + "step": 44930 + }, + { + "epoch": 0.16763277455741815, + "grad_norm": 0.5309237837791443, + "learning_rate": 0.0006, + "loss": 2.2048, + "step": 44940 + }, + { + "epoch": 0.16767007602038153, + "grad_norm": 0.3277752697467804, + "learning_rate": 0.0006, + "loss": 2.3357, + "step": 44950 + }, + { + "epoch": 0.1677073774833449, + "grad_norm": 0.3602464199066162, + "learning_rate": 0.0006, + "loss": 2.3239, + "step": 44960 + }, + { + "epoch": 0.16774467894630826, + "grad_norm": 0.35352623462677, + "learning_rate": 0.0006, + "loss": 2.3073, + "step": 44970 + }, + { + "epoch": 0.16778198040927164, + "grad_norm": 0.2824835479259491, + "learning_rate": 0.0006, + "loss": 2.433, + "step": 44980 + }, + { + "epoch": 
0.16781928187223502, + "grad_norm": 0.302762508392334, + "learning_rate": 0.0006, + "loss": 2.2579, + "step": 44990 + }, + { + "epoch": 0.1678565833351984, + "grad_norm": 0.30244898796081543, + "learning_rate": 0.0006, + "loss": 2.3121, + "step": 45000 + }, + { + "epoch": 0.1678565833351984, + "eval_valid_loss": 2.1855275630950928, + "eval_valid_loss/all": 2.0489730834960938, + "eval_valid_loss/end_span": 1.2165180444717407, + "eval_valid_perplexity/batch": 7.759928226470947, + "eval_valid_perplexity/end_span": 3.3754141330718994, + "eval_valid_perplexity/fim": 2.440978765487671, + "eval_valid_perplexity/first_seq": 15.005544662475586, + "eval_valid_perplexity/last_seq": 8.500885963439941, + "eval_valid_perplexity/second_seq": 13.625938415527344, + "eval_valid_perplexity/seq": 8.749686241149902, + "eval_valid_reconstruction/all": 0.29585540294647217, + "eval_valid_reconstruction/end_span": 0.7198642492294312, + "eval_valid_reconstruction/fim": 0.1812811642885208, + "eval_valid_reconstruction/first_seq": 0.16333512961864471, + "eval_valid_reconstruction/last_seq": 0.3433830440044403, + "eval_valid_reconstruction/second_seq": 0.1958617866039276, + "eval_valid_runtime": 513.9789, + "eval_valid_samples_per_second": 0.374, + "eval_valid_steps_per_second": 0.374, + "step": 45000 + }, + { + "epoch": 0.1678565833351984, + "eval_train_loss": 2.1835107803344727, + "eval_train_loss/all": 2.020888566970825, + "eval_train_loss/end_span": 1.1697455644607544, + "eval_train_perplexity/batch": 7.5450263023376465, + "eval_train_perplexity/end_span": 3.221173048019409, + "eval_train_perplexity/fim": 2.0448808670043945, + "eval_train_perplexity/first_seq": 15.262320518493652, + "eval_train_perplexity/last_seq": 9.251289367675781, + "eval_train_perplexity/second_seq": 14.419438362121582, + "eval_train_perplexity/seq": 8.690552711486816, + "eval_train_reconstruction/all": 0.2853142321109772, + "eval_train_reconstruction/end_span": 0.732511579990387, + "eval_train_reconstruction/fim": 
0.14537689089775085, + "eval_train_reconstruction/first_seq": 0.15825395286083221, + "eval_train_reconstruction/last_seq": 0.31609785556793213, + "eval_train_reconstruction/second_seq": 0.18004480004310608, + "eval_train_runtime": 516.7873, + "eval_train_samples_per_second": 0.372, + "eval_train_steps_per_second": 0.372, + "step": 45000 + }, + { + "epoch": 0.16789388479816178, + "grad_norm": 0.28795406222343445, + "learning_rate": 0.0006, + "loss": 2.3239, + "step": 45010 + }, + { + "epoch": 0.16793118626112516, + "grad_norm": 0.18983903527259827, + "learning_rate": 0.0006, + "loss": 2.4118, + "step": 45020 + }, + { + "epoch": 0.16796848772408854, + "grad_norm": 0.3578048348426819, + "learning_rate": 0.0006, + "loss": 2.3485, + "step": 45030 + }, + { + "epoch": 0.16800578918705192, + "grad_norm": 0.2286122888326645, + "learning_rate": 0.0006, + "loss": 2.1091, + "step": 45040 + }, + { + "epoch": 0.1680430906500153, + "grad_norm": 0.2730460464954376, + "learning_rate": 0.0006, + "loss": 2.2401, + "step": 45050 + }, + { + "epoch": 0.16808039211297868, + "grad_norm": 0.4354710876941681, + "learning_rate": 0.0006, + "loss": 2.1372, + "step": 45060 + }, + { + "epoch": 0.16811769357594206, + "grad_norm": 0.24525928497314453, + "learning_rate": 0.0006, + "loss": 2.2264, + "step": 45070 + }, + { + "epoch": 0.16815499503890544, + "grad_norm": 0.5646826028823853, + "learning_rate": 0.0006, + "loss": 2.1775, + "step": 45080 + }, + { + "epoch": 0.16819229650186882, + "grad_norm": 0.4348395764827728, + "learning_rate": 0.0006, + "loss": 2.2032, + "step": 45090 + }, + { + "epoch": 0.16822959796483217, + "grad_norm": 0.5648279786109924, + "learning_rate": 0.0006, + "loss": 2.0965, + "step": 45100 + }, + { + "epoch": 0.16826689942779555, + "grad_norm": 0.2589111030101776, + "learning_rate": 0.0006, + "loss": 2.1468, + "step": 45110 + }, + { + "epoch": 0.16830420089075893, + "grad_norm": 0.41771265864372253, + "learning_rate": 0.0006, + "loss": 2.2227, + "step": 45120 + }, + { + 
"epoch": 0.1683415023537223, + "grad_norm": 0.32365190982818604, + "learning_rate": 0.0006, + "loss": 2.0481, + "step": 45130 + }, + { + "epoch": 0.16837880381668568, + "grad_norm": 0.3281289041042328, + "learning_rate": 0.0006, + "loss": 2.3325, + "step": 45140 + }, + { + "epoch": 0.16841610527964906, + "grad_norm": 0.32764944434165955, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 45150 + }, + { + "epoch": 0.16845340674261244, + "grad_norm": 0.4820720851421356, + "learning_rate": 0.0006, + "loss": 2.2341, + "step": 45160 + }, + { + "epoch": 0.16849070820557582, + "grad_norm": 0.24824245274066925, + "learning_rate": 0.0006, + "loss": 2.2969, + "step": 45170 + }, + { + "epoch": 0.1685280096685392, + "grad_norm": 0.3451268970966339, + "learning_rate": 0.0006, + "loss": 2.176, + "step": 45180 + }, + { + "epoch": 0.16856531113150258, + "grad_norm": 0.33159586787223816, + "learning_rate": 0.0006, + "loss": 2.1599, + "step": 45190 + }, + { + "epoch": 0.16860261259446596, + "grad_norm": 0.3934187591075897, + "learning_rate": 0.0006, + "loss": 2.0403, + "step": 45200 + }, + { + "epoch": 0.16863991405742934, + "grad_norm": 0.3327541947364807, + "learning_rate": 0.0006, + "loss": 2.2842, + "step": 45210 + }, + { + "epoch": 0.16867721552039272, + "grad_norm": 0.32562926411628723, + "learning_rate": 0.0006, + "loss": 2.2734, + "step": 45220 + }, + { + "epoch": 0.1687145169833561, + "grad_norm": 0.33294180035591125, + "learning_rate": 0.0006, + "loss": 2.2181, + "step": 45230 + }, + { + "epoch": 0.16875181844631945, + "grad_norm": 0.4525696635246277, + "learning_rate": 0.0006, + "loss": 2.2222, + "step": 45240 + }, + { + "epoch": 0.16878911990928283, + "grad_norm": 0.3434986472129822, + "learning_rate": 0.0006, + "loss": 2.1292, + "step": 45250 + }, + { + "epoch": 0.16878911990928283, + "eval_valid_loss": 2.1829354763031006, + "eval_valid_loss/all": 2.04667329788208, + "eval_valid_loss/end_span": 1.2765164375305176, + "eval_valid_perplexity/batch": 7.74210262298584, + 
"eval_valid_perplexity/end_span": 3.584132432937622, + "eval_valid_perplexity/fim": 2.293928623199463, + "eval_valid_perplexity/first_seq": 14.8379545211792, + "eval_valid_perplexity/last_seq": 8.566876411437988, + "eval_valid_perplexity/second_seq": 13.841290473937988, + "eval_valid_perplexity/seq": 8.731328010559082, + "eval_valid_reconstruction/all": 0.2966885268688202, + "eval_valid_reconstruction/end_span": 0.6966232061386108, + "eval_valid_reconstruction/fim": 0.1677679419517517, + "eval_valid_reconstruction/first_seq": 0.16910067200660706, + "eval_valid_reconstruction/last_seq": 0.34506845474243164, + "eval_valid_reconstruction/second_seq": 0.1935690939426422, + "eval_valid_runtime": 520.4205, + "eval_valid_samples_per_second": 0.369, + "eval_valid_steps_per_second": 0.369, + "step": 45250 + }, + { + "epoch": 0.16878911990928283, + "eval_train_loss": 2.18229079246521, + "eval_train_loss/all": 2.019829511642456, + "eval_train_loss/end_span": 1.2438713312149048, + "eval_train_perplexity/batch": 7.537039756774902, + "eval_train_perplexity/end_span": 3.469017267227173, + "eval_train_perplexity/fim": 2.333308219909668, + "eval_train_perplexity/first_seq": 15.607282638549805, + "eval_train_perplexity/last_seq": 8.954106330871582, + "eval_train_perplexity/second_seq": 13.967674255371094, + "eval_train_perplexity/seq": 8.67867660522461, + "eval_train_reconstruction/all": 0.2858107388019562, + "eval_train_reconstruction/end_span": 0.7079501748085022, + "eval_train_reconstruction/fim": 0.17178083956241608, + "eval_train_reconstruction/first_seq": 0.14893405139446259, + "eval_train_reconstruction/last_seq": 0.3220106363296509, + "eval_train_reconstruction/second_seq": 0.18861660361289978, + "eval_train_runtime": 520.5707, + "eval_train_samples_per_second": 0.369, + "eval_train_steps_per_second": 0.369, + "step": 45250 + }, + { + "epoch": 0.1688264213722462, + "grad_norm": 0.27132558822631836, + "learning_rate": 0.0006, + "loss": 2.284, + "step": 45260 + }, + { + 
"epoch": 0.1688637228352096, + "grad_norm": 0.27959150075912476, + "learning_rate": 0.0006, + "loss": 2.3598, + "step": 45270 + }, + { + "epoch": 0.16890102429817297, + "grad_norm": 0.31775012612342834, + "learning_rate": 0.0006, + "loss": 2.1673, + "step": 45280 + }, + { + "epoch": 0.16893832576113635, + "grad_norm": 0.3519335389137268, + "learning_rate": 0.0006, + "loss": 2.2285, + "step": 45290 + }, + { + "epoch": 0.16897562722409973, + "grad_norm": 0.23540133237838745, + "learning_rate": 0.0006, + "loss": 2.3257, + "step": 45300 + }, + { + "epoch": 0.1690129286870631, + "grad_norm": 0.27028948068618774, + "learning_rate": 0.0006, + "loss": 2.2356, + "step": 45310 + }, + { + "epoch": 0.1690502301500265, + "grad_norm": 0.503941535949707, + "learning_rate": 0.0006, + "loss": 2.2153, + "step": 45320 + }, + { + "epoch": 0.16908753161298987, + "grad_norm": 0.19917260110378265, + "learning_rate": 0.0006, + "loss": 2.2825, + "step": 45330 + }, + { + "epoch": 0.16912483307595325, + "grad_norm": 0.5255761742591858, + "learning_rate": 0.0006, + "loss": 2.1429, + "step": 45340 + }, + { + "epoch": 0.16916213453891663, + "grad_norm": 0.3919922411441803, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 45350 + }, + { + "epoch": 0.16919943600188, + "grad_norm": 0.33428481221199036, + "learning_rate": 0.0006, + "loss": 2.1972, + "step": 45360 + }, + { + "epoch": 0.16923673746484338, + "grad_norm": 0.1976984739303589, + "learning_rate": 0.0006, + "loss": 2.2461, + "step": 45370 + }, + { + "epoch": 0.16927403892780674, + "grad_norm": 0.27539026737213135, + "learning_rate": 0.0006, + "loss": 2.3863, + "step": 45380 + }, + { + "epoch": 0.16931134039077012, + "grad_norm": 0.2900658845901489, + "learning_rate": 0.0006, + "loss": 2.2208, + "step": 45390 + }, + { + "epoch": 0.1693486418537335, + "grad_norm": 0.2657117545604706, + "learning_rate": 0.0006, + "loss": 2.3138, + "step": 45400 + }, + { + "epoch": 0.16938594331669687, + "grad_norm": 0.4482201933860779, + 
"learning_rate": 0.0006, + "loss": 2.2205, + "step": 45410 + }, + { + "epoch": 0.16942324477966025, + "grad_norm": 0.30869773030281067, + "learning_rate": 0.0006, + "loss": 2.2009, + "step": 45420 + }, + { + "epoch": 0.16946054624262363, + "grad_norm": 0.3815520107746124, + "learning_rate": 0.0006, + "loss": 2.2523, + "step": 45430 + }, + { + "epoch": 0.169497847705587, + "grad_norm": 0.37375885248184204, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 45440 + }, + { + "epoch": 0.1695351491685504, + "grad_norm": 0.23580008745193481, + "learning_rate": 0.0006, + "loss": 2.2113, + "step": 45450 + }, + { + "epoch": 0.16957245063151377, + "grad_norm": 0.3842746615409851, + "learning_rate": 0.0006, + "loss": 2.1868, + "step": 45460 + }, + { + "epoch": 0.16960975209447715, + "grad_norm": 0.3428361117839813, + "learning_rate": 0.0006, + "loss": 2.0474, + "step": 45470 + }, + { + "epoch": 0.16964705355744053, + "grad_norm": 0.5134523510932922, + "learning_rate": 0.0006, + "loss": 2.2174, + "step": 45480 + }, + { + "epoch": 0.1696843550204039, + "grad_norm": 0.4136003255844116, + "learning_rate": 0.0006, + "loss": 2.2989, + "step": 45490 + }, + { + "epoch": 0.1697216564833673, + "grad_norm": 0.39509114623069763, + "learning_rate": 0.0006, + "loss": 2.2453, + "step": 45500 + }, + { + "epoch": 0.1697216564833673, + "eval_valid_loss": 2.194817304611206, + "eval_valid_loss/all": 2.057352066040039, + "eval_valid_loss/end_span": 1.1980276107788086, + "eval_valid_perplexity/batch": 7.825221538543701, + "eval_valid_perplexity/end_span": 3.31357479095459, + "eval_valid_perplexity/fim": 2.3789141178131104, + "eval_valid_perplexity/first_seq": 14.471771240234375, + "eval_valid_perplexity/last_seq": 9.0255126953125, + "eval_valid_perplexity/second_seq": 13.84896183013916, + "eval_valid_perplexity/seq": 8.822264671325684, + "eval_valid_reconstruction/all": 0.2935760021209717, + "eval_valid_reconstruction/end_span": 0.7184624671936035, + "eval_valid_reconstruction/fim": 
0.17223411798477173, + "eval_valid_reconstruction/first_seq": 0.17415758967399597, + "eval_valid_reconstruction/last_seq": 0.3265594244003296, + "eval_valid_reconstruction/second_seq": 0.19412443041801453, + "eval_valid_runtime": 518.6219, + "eval_valid_samples_per_second": 0.37, + "eval_valid_steps_per_second": 0.37, + "step": 45500 + }, + { + "epoch": 0.1697216564833673, + "eval_train_loss": 2.193786382675171, + "eval_train_loss/all": 2.029486656188965, + "eval_train_loss/end_span": 1.1593124866485596, + "eval_train_perplexity/batch": 7.6101789474487305, + "eval_train_perplexity/end_span": 3.1877408027648926, + "eval_train_perplexity/fim": 1.95164954662323, + "eval_train_perplexity/first_seq": 15.368505477905273, + "eval_train_perplexity/last_seq": 9.533669471740723, + "eval_train_perplexity/second_seq": 14.456367492675781, + "eval_train_perplexity/seq": 8.760266304016113, + "eval_train_reconstruction/all": 0.28297966718673706, + "eval_train_reconstruction/end_span": 0.7346386313438416, + "eval_train_reconstruction/fim": 0.1332322210073471, + "eval_train_reconstruction/first_seq": 0.1553604006767273, + "eval_train_reconstruction/last_seq": 0.31079787015914917, + "eval_train_reconstruction/second_seq": 0.1804688274860382, + "eval_train_runtime": 512.9561, + "eval_train_samples_per_second": 0.374, + "eval_train_steps_per_second": 0.374, + "step": 45500 + }, + { + "epoch": 0.16975895794633067, + "grad_norm": 0.4336894452571869, + "learning_rate": 0.0006, + "loss": 2.1955, + "step": 45510 + }, + { + "epoch": 0.16979625940929402, + "grad_norm": 0.488085001707077, + "learning_rate": 0.0006, + "loss": 2.0796, + "step": 45520 + }, + { + "epoch": 0.1698335608722574, + "grad_norm": 0.5496479868888855, + "learning_rate": 0.0006, + "loss": 2.4051, + "step": 45530 + }, + { + "epoch": 0.16987086233522078, + "grad_norm": 0.3079357147216797, + "learning_rate": 0.0006, + "loss": 2.2335, + "step": 45540 + }, + { + "epoch": 0.16990816379818416, + "grad_norm": 0.21753178536891937, + 
"learning_rate": 0.0006, + "loss": 2.3456, + "step": 45550 + }, + { + "epoch": 0.16994546526114754, + "grad_norm": 0.3718641996383667, + "learning_rate": 0.0006, + "loss": 2.3011, + "step": 45560 + }, + { + "epoch": 0.16998276672411092, + "grad_norm": 0.43548375368118286, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 45570 + }, + { + "epoch": 0.1700200681870743, + "grad_norm": 0.23958349227905273, + "learning_rate": 0.0006, + "loss": 2.1883, + "step": 45580 + }, + { + "epoch": 0.17005736965003768, + "grad_norm": 0.3333842158317566, + "learning_rate": 0.0006, + "loss": 2.1495, + "step": 45590 + }, + { + "epoch": 0.17009467111300106, + "grad_norm": 0.3194049000740051, + "learning_rate": 0.0006, + "loss": 2.3572, + "step": 45600 + }, + { + "epoch": 0.17013197257596444, + "grad_norm": 0.28751567006111145, + "learning_rate": 0.0006, + "loss": 2.257, + "step": 45610 + }, + { + "epoch": 0.17016927403892781, + "grad_norm": 0.428897500038147, + "learning_rate": 0.0006, + "loss": 2.2443, + "step": 45620 + }, + { + "epoch": 0.1702065755018912, + "grad_norm": 0.2859758734703064, + "learning_rate": 0.0006, + "loss": 2.1241, + "step": 45630 + }, + { + "epoch": 0.17024387696485457, + "grad_norm": 0.3083004355430603, + "learning_rate": 0.0006, + "loss": 2.323, + "step": 45640 + }, + { + "epoch": 0.17028117842781793, + "grad_norm": 0.30692118406295776, + "learning_rate": 0.0006, + "loss": 2.1568, + "step": 45650 + }, + { + "epoch": 0.1703184798907813, + "grad_norm": 0.5379927754402161, + "learning_rate": 0.0006, + "loss": 2.3007, + "step": 45660 + }, + { + "epoch": 0.17035578135374468, + "grad_norm": 0.3668426275253296, + "learning_rate": 0.0006, + "loss": 2.2467, + "step": 45670 + }, + { + "epoch": 0.17039308281670806, + "grad_norm": 0.4456403851509094, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 45680 + }, + { + "epoch": 0.17043038427967144, + "grad_norm": 0.3779720962047577, + "learning_rate": 0.0006, + "loss": 2.235, + "step": 45690 + }, + { + "epoch": 
0.17046768574263482, + "grad_norm": 0.43209999799728394, + "learning_rate": 0.0006, + "loss": 2.0357, + "step": 45700 + }, + { + "epoch": 0.1705049872055982, + "grad_norm": 0.4052808880805969, + "learning_rate": 0.0006, + "loss": 2.325, + "step": 45710 + }, + { + "epoch": 0.17054228866856158, + "grad_norm": 0.2536660134792328, + "learning_rate": 0.0006, + "loss": 2.1757, + "step": 45720 + }, + { + "epoch": 0.17057959013152496, + "grad_norm": 0.34606602787971497, + "learning_rate": 0.0006, + "loss": 2.1452, + "step": 45730 + }, + { + "epoch": 0.17061689159448834, + "grad_norm": 0.366613507270813, + "learning_rate": 0.0006, + "loss": 2.2583, + "step": 45740 + }, + { + "epoch": 0.17065419305745172, + "grad_norm": 0.8162715435028076, + "learning_rate": 0.0006, + "loss": 2.2078, + "step": 45750 + }, + { + "epoch": 0.17065419305745172, + "eval_valid_loss": 2.1852304935455322, + "eval_valid_loss/all": 2.049172878265381, + "eval_valid_loss/end_span": 1.2643741369247437, + "eval_valid_perplexity/batch": 7.761478900909424, + "eval_valid_perplexity/end_span": 3.5408759117126465, + "eval_valid_perplexity/fim": 2.0385425090789795, + "eval_valid_perplexity/first_seq": 14.569836616516113, + "eval_valid_perplexity/last_seq": 8.754892349243164, + "eval_valid_perplexity/second_seq": 13.277536392211914, + "eval_valid_perplexity/seq": 8.754971504211426, + "eval_valid_reconstruction/all": 0.29611390829086304, + "eval_valid_reconstruction/end_span": 0.7004135251045227, + "eval_valid_reconstruction/fim": 0.1435733586549759, + "eval_valid_reconstruction/first_seq": 0.17488665878772736, + "eval_valid_reconstruction/last_seq": 0.33755284547805786, + "eval_valid_reconstruction/second_seq": 0.2073374092578888, + "eval_valid_runtime": 510.0079, + "eval_valid_samples_per_second": 0.376, + "eval_valid_steps_per_second": 0.376, + "step": 45750 + }, + { + "epoch": 0.17065419305745172, + "eval_train_loss": 2.183061361312866, + "eval_train_loss/all": 2.0206081867218018, + "eval_train_loss/end_span": 
1.2303873300552368, + "eval_train_perplexity/batch": 7.542911052703857, + "eval_train_perplexity/end_span": 3.4225549697875977, + "eval_train_perplexity/fim": 2.197436809539795, + "eval_train_perplexity/first_seq": 15.335504531860352, + "eval_train_perplexity/last_seq": 8.788732528686523, + "eval_train_perplexity/second_seq": 14.425607681274414, + "eval_train_perplexity/seq": 8.684562683105469, + "eval_train_reconstruction/all": 0.2856276333332062, + "eval_train_reconstruction/end_span": 0.7137731909751892, + "eval_train_reconstruction/fim": 0.1582096666097641, + "eval_train_reconstruction/first_seq": 0.15428541600704193, + "eval_train_reconstruction/last_seq": 0.331514447927475, + "eval_train_reconstruction/second_seq": 0.17890264093875885, + "eval_train_runtime": 516.0201, + "eval_train_samples_per_second": 0.372, + "eval_train_steps_per_second": 0.372, + "step": 45750 + }, + { + "epoch": 0.1706914945204151, + "grad_norm": 0.24748049676418304, + "learning_rate": 0.0006, + "loss": 2.0154, + "step": 45760 + }, + { + "epoch": 0.17072879598337848, + "grad_norm": 1.6656537055969238, + "learning_rate": 0.0006, + "loss": 2.1375, + "step": 45770 + }, + { + "epoch": 0.17076609744634186, + "grad_norm": 0.36933860182762146, + "learning_rate": 0.0006, + "loss": 2.1572, + "step": 45780 + }, + { + "epoch": 0.1708033989093052, + "grad_norm": 0.41705024242401123, + "learning_rate": 0.0006, + "loss": 1.9559, + "step": 45790 + }, + { + "epoch": 0.1708407003722686, + "grad_norm": 0.4498966932296753, + "learning_rate": 0.0006, + "loss": 2.2674, + "step": 45800 + }, + { + "epoch": 0.17087800183523197, + "grad_norm": 0.33005738258361816, + "learning_rate": 0.0006, + "loss": 2.1852, + "step": 45810 + }, + { + "epoch": 0.17091530329819535, + "grad_norm": 0.434479296207428, + "learning_rate": 0.0006, + "loss": 2.1818, + "step": 45820 + }, + { + "epoch": 0.17095260476115873, + "grad_norm": 0.4108823537826538, + "learning_rate": 0.0006, + "loss": 2.0765, + "step": 45830 + }, + { + "epoch": 
0.1709899062241221, + "grad_norm": 0.23638825118541718, + "learning_rate": 0.0006, + "loss": 2.3895, + "step": 45840 + }, + { + "epoch": 0.1710272076870855, + "grad_norm": 0.34586969017982483, + "learning_rate": 0.0006, + "loss": 2.1848, + "step": 45850 + }, + { + "epoch": 0.17106450915004887, + "grad_norm": 0.4075154662132263, + "learning_rate": 0.0006, + "loss": 2.1307, + "step": 45860 + }, + { + "epoch": 0.17110181061301225, + "grad_norm": 0.23942524194717407, + "learning_rate": 0.0006, + "loss": 2.2382, + "step": 45870 + }, + { + "epoch": 0.17113911207597562, + "grad_norm": 0.37828418612480164, + "learning_rate": 0.0006, + "loss": 2.3402, + "step": 45880 + }, + { + "epoch": 0.171176413538939, + "grad_norm": 0.35413622856140137, + "learning_rate": 0.0006, + "loss": 2.113, + "step": 45890 + }, + { + "epoch": 0.17121371500190238, + "grad_norm": 0.4694828391075134, + "learning_rate": 0.0006, + "loss": 2.1262, + "step": 45900 + }, + { + "epoch": 0.17125101646486576, + "grad_norm": 0.3582916855812073, + "learning_rate": 0.0006, + "loss": 2.3017, + "step": 45910 + }, + { + "epoch": 0.17128831792782914, + "grad_norm": 0.4548618197441101, + "learning_rate": 0.0006, + "loss": 2.192, + "step": 45920 + }, + { + "epoch": 0.1713256193907925, + "grad_norm": 0.31596994400024414, + "learning_rate": 0.0006, + "loss": 2.1918, + "step": 45930 + }, + { + "epoch": 0.17136292085375587, + "grad_norm": 0.31942570209503174, + "learning_rate": 0.0006, + "loss": 2.3908, + "step": 45940 + }, + { + "epoch": 0.17140022231671925, + "grad_norm": 0.4575922191143036, + "learning_rate": 0.0006, + "loss": 2.1674, + "step": 45950 + }, + { + "epoch": 0.17143752377968263, + "grad_norm": 0.42285478115081787, + "learning_rate": 0.0006, + "loss": 2.0914, + "step": 45960 + }, + { + "epoch": 0.171474825242646, + "grad_norm": 0.5745246410369873, + "learning_rate": 0.0006, + "loss": 2.2504, + "step": 45970 + }, + { + "epoch": 0.1715121267056094, + "grad_norm": 0.33401328325271606, + "learning_rate": 0.0006, 
+ "loss": 2.1196, + "step": 45980 + }, + { + "epoch": 0.17154942816857277, + "grad_norm": 0.3574930429458618, + "learning_rate": 0.0006, + "loss": 2.1172, + "step": 45990 + }, + { + "epoch": 0.17158672963153615, + "grad_norm": 0.42451244592666626, + "learning_rate": 0.0006, + "loss": 2.3203, + "step": 46000 + }, + { + "epoch": 0.17158672963153615, + "eval_valid_loss": 2.1830313205718994, + "eval_valid_loss/all": 2.046945095062256, + "eval_valid_loss/end_span": 1.225809931755066, + "eval_valid_perplexity/batch": 7.74420690536499, + "eval_valid_perplexity/end_span": 3.406924247741699, + "eval_valid_perplexity/fim": 2.15315580368042, + "eval_valid_perplexity/first_seq": 14.92530345916748, + "eval_valid_perplexity/last_seq": 8.703009605407715, + "eval_valid_perplexity/second_seq": 13.823179244995117, + "eval_valid_perplexity/seq": 8.73393726348877, + "eval_valid_reconstruction/all": 0.296527236700058, + "eval_valid_reconstruction/end_span": 0.7140122056007385, + "eval_valid_reconstruction/fim": 0.15536987781524658, + "eval_valid_reconstruction/first_seq": 0.16485077142715454, + "eval_valid_reconstruction/last_seq": 0.33738934993743896, + "eval_valid_reconstruction/second_seq": 0.19648562371730804, + "eval_valid_runtime": 512.9925, + "eval_valid_samples_per_second": 0.374, + "eval_valid_steps_per_second": 0.374, + "step": 46000 + }, + { + "epoch": 0.17158672963153615, + "eval_train_loss": 2.1815741062164307, + "eval_train_loss/all": 2.0193865299224854, + "eval_train_loss/end_span": 1.19308340549469, + "eval_train_perplexity/batch": 7.5337018966674805, + "eval_train_perplexity/end_span": 3.297232151031494, + "eval_train_perplexity/fim": 2.2544167041778564, + "eval_train_perplexity/first_seq": 15.525226593017578, + "eval_train_perplexity/last_seq": 8.502991676330566, + "eval_train_perplexity/second_seq": 14.000772476196289, + "eval_train_perplexity/seq": 8.675185203552246, + "eval_train_reconstruction/all": 0.2857539653778076, + "eval_train_reconstruction/end_span": 
0.7260294556617737, + "eval_train_reconstruction/fim": 0.1642449051141739, + "eval_train_reconstruction/first_seq": 0.15127955377101898, + "eval_train_reconstruction/last_seq": 0.3431095480918884, + "eval_train_reconstruction/second_seq": 0.18660709261894226, + "eval_train_runtime": 513.8312, + "eval_train_samples_per_second": 0.374, + "eval_train_steps_per_second": 0.374, + "step": 46000 + }, + { + "epoch": 0.17162403109449953, + "grad_norm": 0.3015180826187134, + "learning_rate": 0.0006, + "loss": 2.239, + "step": 46010 + }, + { + "epoch": 0.1716613325574629, + "grad_norm": 0.38440507650375366, + "learning_rate": 0.0006, + "loss": 2.1731, + "step": 46020 + }, + { + "epoch": 0.1716986340204263, + "grad_norm": 0.3492671549320221, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 46030 + }, + { + "epoch": 0.17173593548338967, + "grad_norm": 0.32686299085617065, + "learning_rate": 0.0006, + "loss": 2.1325, + "step": 46040 + }, + { + "epoch": 0.17177323694635305, + "grad_norm": 0.5001857876777649, + "learning_rate": 0.0006, + "loss": 1.9429, + "step": 46050 + }, + { + "epoch": 0.17181053840931643, + "grad_norm": 0.29081201553344727, + "learning_rate": 0.0006, + "loss": 2.2417, + "step": 46060 + }, + { + "epoch": 0.17184783987227978, + "grad_norm": 0.376947820186615, + "learning_rate": 0.0006, + "loss": 2.2733, + "step": 46070 + }, + { + "epoch": 0.17188514133524316, + "grad_norm": 0.29162871837615967, + "learning_rate": 0.0006, + "loss": 2.2146, + "step": 46080 + }, + { + "epoch": 0.17192244279820654, + "grad_norm": 0.3936319947242737, + "learning_rate": 0.0006, + "loss": 2.1941, + "step": 46090 + }, + { + "epoch": 0.17195974426116992, + "grad_norm": 0.2536769211292267, + "learning_rate": 0.0006, + "loss": 2.2149, + "step": 46100 + }, + { + "epoch": 0.1719970457241333, + "grad_norm": 0.4591253399848938, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 46110 + }, + { + "epoch": 0.17203434718709668, + "grad_norm": 0.30037742853164673, + "learning_rate": 
0.0006, + "loss": 2.1431, + "step": 46120 + }, + { + "epoch": 0.17207164865006006, + "grad_norm": 0.3336773216724396, + "learning_rate": 0.0006, + "loss": 2.2963, + "step": 46130 + }, + { + "epoch": 0.17210895011302343, + "grad_norm": 0.2700132131576538, + "learning_rate": 0.0006, + "loss": 2.0527, + "step": 46140 + }, + { + "epoch": 0.17214625157598681, + "grad_norm": 0.32235342264175415, + "learning_rate": 0.0006, + "loss": 2.2336, + "step": 46150 + }, + { + "epoch": 0.1721835530389502, + "grad_norm": 0.4185906946659088, + "learning_rate": 0.0006, + "loss": 2.1626, + "step": 46160 + }, + { + "epoch": 0.17222085450191357, + "grad_norm": 0.4174693822860718, + "learning_rate": 0.0006, + "loss": 2.0367, + "step": 46170 + }, + { + "epoch": 0.17225815596487695, + "grad_norm": 0.23931540548801422, + "learning_rate": 0.0006, + "loss": 2.3084, + "step": 46180 + }, + { + "epoch": 0.17229545742784033, + "grad_norm": 0.3863498866558075, + "learning_rate": 0.0006, + "loss": 2.3274, + "step": 46190 + }, + { + "epoch": 0.1723327588908037, + "grad_norm": 0.3939547538757324, + "learning_rate": 0.0006, + "loss": 2.27, + "step": 46200 + }, + { + "epoch": 0.17237006035376706, + "grad_norm": 0.3407176733016968, + "learning_rate": 0.0006, + "loss": 2.0757, + "step": 46210 + }, + { + "epoch": 0.17240736181673044, + "grad_norm": 0.29520219564437866, + "learning_rate": 0.0006, + "loss": 2.351, + "step": 46220 + }, + { + "epoch": 0.17244466327969382, + "grad_norm": 0.3598504364490509, + "learning_rate": 0.0006, + "loss": 2.2842, + "step": 46230 + }, + { + "epoch": 0.1724819647426572, + "grad_norm": 0.26693010330200195, + "learning_rate": 0.0006, + "loss": 2.2988, + "step": 46240 + }, + { + "epoch": 0.17251926620562058, + "grad_norm": 0.3489798605442047, + "learning_rate": 0.0006, + "loss": 2.2928, + "step": 46250 + }, + { + "epoch": 0.17251926620562058, + "eval_valid_loss": 2.185913324356079, + "eval_valid_loss/all": 2.0492048263549805, + "eval_valid_loss/end_span": 1.3015638589859009, + 
"eval_valid_perplexity/batch": 7.7617268562316895, + "eval_valid_perplexity/end_span": 3.675039529800415, + "eval_valid_perplexity/fim": 2.3456761837005615, + "eval_valid_perplexity/first_seq": 14.75119686126709, + "eval_valid_perplexity/last_seq": 8.377610206604004, + "eval_valid_perplexity/second_seq": 14.060730934143066, + "eval_valid_perplexity/seq": 8.748587608337402, + "eval_valid_reconstruction/all": 0.296078622341156, + "eval_valid_reconstruction/end_span": 0.7017275094985962, + "eval_valid_reconstruction/fim": 0.1713138222694397, + "eval_valid_reconstruction/first_seq": 0.17168322205543518, + "eval_valid_reconstruction/last_seq": 0.34497708082199097, + "eval_valid_reconstruction/second_seq": 0.18886162340641022, + "eval_valid_runtime": 508.8748, + "eval_valid_samples_per_second": 0.377, + "eval_valid_steps_per_second": 0.377, + "step": 46250 + }, + { + "epoch": 0.17251926620562058, + "eval_train_loss": 2.1859352588653564, + "eval_train_loss/all": 2.0224361419677734, + "eval_train_loss/end_span": 1.2574349641799927, + "eval_train_perplexity/batch": 7.556711673736572, + "eval_train_perplexity/end_span": 3.516390323638916, + "eval_train_perplexity/fim": 2.1179087162017822, + "eval_train_perplexity/first_seq": 15.36569881439209, + "eval_train_perplexity/last_seq": 8.600174903869629, + "eval_train_perplexity/second_seq": 14.174497604370117, + "eval_train_perplexity/seq": 8.6975736618042, + "eval_train_reconstruction/all": 0.28521913290023804, + "eval_train_reconstruction/end_span": 0.7134071588516235, + "eval_train_reconstruction/fim": 0.15042591094970703, + "eval_train_reconstruction/first_seq": 0.15492036938667297, + "eval_train_reconstruction/last_seq": 0.34255650639533997, + "eval_train_reconstruction/second_seq": 0.1844351589679718, + "eval_train_runtime": 519.337, + "eval_train_samples_per_second": 0.37, + "eval_train_steps_per_second": 0.37, + "step": 46250 + }, + { + "epoch": 0.17255656766858396, + "grad_norm": 0.35100722312927246, + "learning_rate": 
0.0006, + "loss": 1.9831, + "step": 46260 + }, + { + "epoch": 0.17259386913154734, + "grad_norm": 0.3124428689479828, + "learning_rate": 0.0006, + "loss": 2.3045, + "step": 46270 + }, + { + "epoch": 0.17263117059451072, + "grad_norm": 0.29965078830718994, + "learning_rate": 0.0006, + "loss": 2.1833, + "step": 46280 + }, + { + "epoch": 0.1726684720574741, + "grad_norm": 0.3758871257305145, + "learning_rate": 0.0006, + "loss": 2.3377, + "step": 46290 + }, + { + "epoch": 0.17270577352043748, + "grad_norm": 0.3388274312019348, + "learning_rate": 0.0006, + "loss": 1.9463, + "step": 46300 + }, + { + "epoch": 0.17274307498340086, + "grad_norm": 0.4473312795162201, + "learning_rate": 0.0006, + "loss": 2.1468, + "step": 46310 + }, + { + "epoch": 0.17278037644636424, + "grad_norm": 0.21323272585868835, + "learning_rate": 0.0006, + "loss": 2.2751, + "step": 46320 + }, + { + "epoch": 0.17281767790932762, + "grad_norm": 0.545762836933136, + "learning_rate": 0.0006, + "loss": 2.314, + "step": 46330 + }, + { + "epoch": 0.17285497937229097, + "grad_norm": 0.3604537546634674, + "learning_rate": 0.0006, + "loss": 2.2814, + "step": 46340 + }, + { + "epoch": 0.17289228083525435, + "grad_norm": 0.3109537959098816, + "learning_rate": 0.0006, + "loss": 2.3381, + "step": 46350 + }, + { + "epoch": 0.17292958229821773, + "grad_norm": 0.28285694122314453, + "learning_rate": 0.0006, + "loss": 2.3184, + "step": 46360 + }, + { + "epoch": 0.1729668837611811, + "grad_norm": 0.37841612100601196, + "learning_rate": 0.0006, + "loss": 2.2902, + "step": 46370 + }, + { + "epoch": 0.17300418522414449, + "grad_norm": 0.5416720509529114, + "learning_rate": 0.0006, + "loss": 2.3264, + "step": 46380 + }, + { + "epoch": 0.17304148668710786, + "grad_norm": 0.2345474511384964, + "learning_rate": 0.0006, + "loss": 2.3492, + "step": 46390 + }, + { + "epoch": 0.17307878815007124, + "grad_norm": 0.37017765641212463, + "learning_rate": 0.0006, + "loss": 2.2084, + "step": 46400 + }, + { + "epoch": 
0.17311608961303462, + "grad_norm": 0.28708744049072266, + "learning_rate": 0.0006, + "loss": 2.4084, + "step": 46410 + }, + { + "epoch": 0.173153391075998, + "grad_norm": 0.33514416217803955, + "learning_rate": 0.0006, + "loss": 2.1623, + "step": 46420 + }, + { + "epoch": 0.17319069253896138, + "grad_norm": 0.44400089979171753, + "learning_rate": 0.0006, + "loss": 1.9763, + "step": 46430 + }, + { + "epoch": 0.17322799400192476, + "grad_norm": 0.3620220422744751, + "learning_rate": 0.0006, + "loss": 2.2802, + "step": 46440 + }, + { + "epoch": 0.17326529546488814, + "grad_norm": 0.30068841576576233, + "learning_rate": 0.0006, + "loss": 2.2042, + "step": 46450 + }, + { + "epoch": 0.17330259692785152, + "grad_norm": 0.3428497910499573, + "learning_rate": 0.0006, + "loss": 2.2016, + "step": 46460 + }, + { + "epoch": 0.1733398983908149, + "grad_norm": 0.4052395820617676, + "learning_rate": 0.0006, + "loss": 2.2015, + "step": 46470 + }, + { + "epoch": 0.17337719985377825, + "grad_norm": 0.3405417501926422, + "learning_rate": 0.0006, + "loss": 2.352, + "step": 46480 + }, + { + "epoch": 0.17341450131674163, + "grad_norm": 0.4958297610282898, + "learning_rate": 0.0006, + "loss": 2.1858, + "step": 46490 + }, + { + "epoch": 0.173451802779705, + "grad_norm": 0.1911449134349823, + "learning_rate": 0.0006, + "loss": 2.3454, + "step": 46500 + }, + { + "epoch": 0.173451802779705, + "eval_valid_loss": 2.183469533920288, + "eval_valid_loss/all": 2.047092914581299, + "eval_valid_loss/end_span": 1.302369475364685, + "eval_valid_perplexity/batch": 7.745351791381836, + "eval_valid_perplexity/end_span": 3.6780014038085938, + "eval_valid_perplexity/fim": 2.3177149295806885, + "eval_valid_perplexity/first_seq": 14.634218215942383, + "eval_valid_perplexity/last_seq": 8.760007858276367, + "eval_valid_perplexity/second_seq": 13.686419486999512, + "eval_valid_perplexity/seq": 8.733779907226562, + "eval_valid_reconstruction/all": 0.2963556945323944, + "eval_valid_reconstruction/end_span": 
0.6938889622688293, + "eval_valid_reconstruction/fim": 0.16912440955638885, + "eval_valid_reconstruction/first_seq": 0.17315693199634552, + "eval_valid_reconstruction/last_seq": 0.33525002002716064, + "eval_valid_reconstruction/second_seq": 0.19807936251163483, + "eval_valid_runtime": 511.0933, + "eval_valid_samples_per_second": 0.376, + "eval_valid_steps_per_second": 0.376, + "step": 46500 + }, + { + "epoch": 0.173451802779705, + "eval_train_loss": 2.18204665184021, + "eval_train_loss/all": 2.0192344188690186, + "eval_train_loss/end_span": 1.2630330324172974, + "eval_train_perplexity/batch": 7.532556056976318, + "eval_train_perplexity/end_span": 3.536130428314209, + "eval_train_perplexity/fim": 2.1875956058502197, + "eval_train_perplexity/first_seq": 15.578634262084961, + "eval_train_perplexity/last_seq": 8.725258827209473, + "eval_train_perplexity/second_seq": 14.255939483642578, + "eval_train_perplexity/seq": 8.671292304992676, + "eval_train_reconstruction/all": 0.2857537865638733, + "eval_train_reconstruction/end_span": 0.7067305445671082, + "eval_train_reconstruction/fim": 0.15828582644462585, + "eval_train_reconstruction/first_seq": 0.1487920880317688, + "eval_train_reconstruction/last_seq": 0.3339749574661255, + "eval_train_reconstruction/second_seq": 0.1839521825313568, + "eval_train_runtime": 516.6516, + "eval_train_samples_per_second": 0.372, + "eval_train_steps_per_second": 0.372, + "step": 46500 + }, + { + "epoch": 0.1734891042426684, + "grad_norm": 0.24602974951267242, + "learning_rate": 0.0006, + "loss": 2.2608, + "step": 46510 + }, + { + "epoch": 0.17352640570563177, + "grad_norm": 0.427915096282959, + "learning_rate": 0.0006, + "loss": 2.2451, + "step": 46520 + }, + { + "epoch": 0.17356370716859515, + "grad_norm": 0.5240296721458435, + "learning_rate": 0.0006, + "loss": 2.2257, + "step": 46530 + }, + { + "epoch": 0.17360100863155853, + "grad_norm": 0.37676361203193665, + "learning_rate": 0.0006, + "loss": 2.1321, + "step": 46540 + }, + { + "epoch": 
0.1736383100945219, + "grad_norm": 0.34919893741607666, + "learning_rate": 0.0006, + "loss": 2.1109, + "step": 46550 + }, + { + "epoch": 0.1736756115574853, + "grad_norm": 0.3002404570579529, + "learning_rate": 0.0006, + "loss": 2.1057, + "step": 46560 + }, + { + "epoch": 0.17371291302044867, + "grad_norm": 0.48794543743133545, + "learning_rate": 0.0006, + "loss": 2.2091, + "step": 46570 + }, + { + "epoch": 0.17375021448341205, + "grad_norm": 0.3192950487136841, + "learning_rate": 0.0006, + "loss": 2.1363, + "step": 46580 + }, + { + "epoch": 0.17378751594637543, + "grad_norm": 0.2906842827796936, + "learning_rate": 0.0006, + "loss": 2.2012, + "step": 46590 + }, + { + "epoch": 0.1738248174093388, + "grad_norm": 0.350750207901001, + "learning_rate": 0.0006, + "loss": 2.2322, + "step": 46600 + }, + { + "epoch": 0.17386211887230218, + "grad_norm": 0.4061283767223358, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 46610 + }, + { + "epoch": 0.17389942033526554, + "grad_norm": 0.3718392848968506, + "learning_rate": 0.0006, + "loss": 2.1801, + "step": 46620 + }, + { + "epoch": 0.17393672179822892, + "grad_norm": 0.4175383448600769, + "learning_rate": 0.0006, + "loss": 2.15, + "step": 46630 + }, + { + "epoch": 0.1739740232611923, + "grad_norm": 0.7709935307502747, + "learning_rate": 0.0006, + "loss": 2.2646, + "step": 46640 + }, + { + "epoch": 0.17401132472415567, + "grad_norm": 0.2663812041282654, + "learning_rate": 0.0006, + "loss": 2.3039, + "step": 46650 + }, + { + "epoch": 0.17404862618711905, + "grad_norm": 0.4489781856536865, + "learning_rate": 0.0006, + "loss": 2.079, + "step": 46660 + }, + { + "epoch": 0.17408592765008243, + "grad_norm": 0.44749385118484497, + "learning_rate": 0.0006, + "loss": 2.2428, + "step": 46670 + }, + { + "epoch": 0.1741232291130458, + "grad_norm": 2.5019235610961914, + "learning_rate": 0.0006, + "loss": 2.1881, + "step": 46680 + }, + { + "epoch": 0.1741605305760092, + "grad_norm": 0.2976053059101105, + "learning_rate": 0.0006, + 
"loss": 2.3117, + "step": 46690 + }, + { + "epoch": 0.17419783203897257, + "grad_norm": 0.30633917450904846, + "learning_rate": 0.0006, + "loss": 2.0132, + "step": 46700 + }, + { + "epoch": 0.17423513350193595, + "grad_norm": 0.2686232328414917, + "learning_rate": 0.0006, + "loss": 2.2574, + "step": 46710 + }, + { + "epoch": 0.17427243496489933, + "grad_norm": 0.2927355170249939, + "learning_rate": 0.0006, + "loss": 2.2606, + "step": 46720 + }, + { + "epoch": 0.1743097364278627, + "grad_norm": 0.3506802022457123, + "learning_rate": 0.0006, + "loss": 2.3005, + "step": 46730 + }, + { + "epoch": 0.1743470378908261, + "grad_norm": 0.4249761998653412, + "learning_rate": 0.0006, + "loss": 2.2263, + "step": 46740 + }, + { + "epoch": 0.17438433935378947, + "grad_norm": 0.4038447141647339, + "learning_rate": 0.0006, + "loss": 1.9772, + "step": 46750 + }, + { + "epoch": 0.17438433935378947, + "eval_valid_loss": 2.1869380474090576, + "eval_valid_loss/all": 2.0503764152526855, + "eval_valid_loss/end_span": 1.2132285833358765, + "eval_valid_perplexity/batch": 7.770825386047363, + "eval_valid_perplexity/end_span": 3.3643290996551514, + "eval_valid_perplexity/fim": 2.379988193511963, + "eval_valid_perplexity/first_seq": 15.071449279785156, + "eval_valid_perplexity/last_seq": 8.276956558227539, + "eval_valid_perplexity/second_seq": 13.723623275756836, + "eval_valid_perplexity/seq": 8.76549243927002, + "eval_valid_reconstruction/all": 0.29534775018692017, + "eval_valid_reconstruction/end_span": 0.7172602415084839, + "eval_valid_reconstruction/fim": 0.17374756932258606, + "eval_valid_reconstruction/first_seq": 0.16617538034915924, + "eval_valid_reconstruction/last_seq": 0.3528550863265991, + "eval_valid_reconstruction/second_seq": 0.19646304845809937, + "eval_valid_runtime": 519.6642, + "eval_valid_samples_per_second": 0.369, + "eval_valid_steps_per_second": 0.369, + "step": 46750 + }, + { + "epoch": 0.17438433935378947, + "eval_train_loss": 2.18546462059021, + 
"eval_train_loss/all": 2.0225613117218018, + "eval_train_loss/end_span": 1.1679623126983643, + "eval_train_perplexity/batch": 7.557657718658447, + "eval_train_perplexity/end_span": 3.2154338359832764, + "eval_train_perplexity/fim": 2.0801141262054443, + "eval_train_perplexity/first_seq": 15.442283630371094, + "eval_train_perplexity/last_seq": 8.799702644348145, + "eval_train_perplexity/second_seq": 14.49648666381836, + "eval_train_perplexity/seq": 8.704085350036621, + "eval_train_reconstruction/all": 0.2846391201019287, + "eval_train_reconstruction/end_span": 0.733021080493927, + "eval_train_reconstruction/fim": 0.14606192708015442, + "eval_train_reconstruction/first_seq": 0.15446197986602783, + "eval_train_reconstruction/last_seq": 0.33483970165252686, + "eval_train_reconstruction/second_seq": 0.1753534972667694, + "eval_train_runtime": 521.9456, + "eval_train_samples_per_second": 0.368, + "eval_train_steps_per_second": 0.368, + "step": 46750 + }, + { + "epoch": 0.17442164081675282, + "grad_norm": 0.30024611949920654, + "learning_rate": 0.0006, + "loss": 2.2167, + "step": 46760 + }, + { + "epoch": 0.1744589422797162, + "grad_norm": 1.2563660144805908, + "learning_rate": 0.0006, + "loss": 2.1966, + "step": 46770 + }, + { + "epoch": 0.17449624374267958, + "grad_norm": 0.35894155502319336, + "learning_rate": 0.0006, + "loss": 2.1644, + "step": 46780 + }, + { + "epoch": 0.17453354520564296, + "grad_norm": 0.39749041199684143, + "learning_rate": 0.0006, + "loss": 2.0523, + "step": 46790 + }, + { + "epoch": 0.17457084666860634, + "grad_norm": 0.38815122842788696, + "learning_rate": 0.0006, + "loss": 2.0196, + "step": 46800 + }, + { + "epoch": 0.17460814813156972, + "grad_norm": 0.357036292552948, + "learning_rate": 0.0006, + "loss": 2.2583, + "step": 46810 + }, + { + "epoch": 0.1746454495945331, + "grad_norm": 0.24450115859508514, + "learning_rate": 0.0006, + "loss": 2.2804, + "step": 46820 + }, + { + "epoch": 0.17468275105749648, + "grad_norm": 0.25873786211013794, + 
"learning_rate": 0.0006, + "loss": 2.3787, + "step": 46830 + }, + { + "epoch": 0.17472005252045986, + "grad_norm": 0.5020834803581238, + "learning_rate": 0.0006, + "loss": 2.1399, + "step": 46840 + }, + { + "epoch": 0.17475735398342324, + "grad_norm": 0.437563955783844, + "learning_rate": 0.0006, + "loss": 2.2351, + "step": 46850 + }, + { + "epoch": 0.17479465544638662, + "grad_norm": 0.34815171360969543, + "learning_rate": 0.0006, + "loss": 2.0581, + "step": 46860 + }, + { + "epoch": 0.17483195690935, + "grad_norm": 0.2597368657588959, + "learning_rate": 0.0006, + "loss": 2.185, + "step": 46870 + }, + { + "epoch": 0.17486925837231337, + "grad_norm": 0.19623062014579773, + "learning_rate": 0.0006, + "loss": 2.2823, + "step": 46880 + }, + { + "epoch": 0.17490655983527673, + "grad_norm": 0.38905712962150574, + "learning_rate": 0.0006, + "loss": 1.9448, + "step": 46890 + }, + { + "epoch": 0.1749438612982401, + "grad_norm": 0.3219161927700043, + "learning_rate": 0.0006, + "loss": 2.0895, + "step": 46900 + }, + { + "epoch": 0.17498116276120348, + "grad_norm": 0.4130001366138458, + "learning_rate": 0.0006, + "loss": 2.2946, + "step": 46910 + }, + { + "epoch": 0.17501846422416686, + "grad_norm": 0.42612430453300476, + "learning_rate": 0.0006, + "loss": 2.2409, + "step": 46920 + }, + { + "epoch": 0.17505576568713024, + "grad_norm": 0.437876433134079, + "learning_rate": 0.0006, + "loss": 2.3021, + "step": 46930 + }, + { + "epoch": 0.17509306715009362, + "grad_norm": 0.3012915253639221, + "learning_rate": 0.0006, + "loss": 2.2847, + "step": 46940 + }, + { + "epoch": 0.175130368613057, + "grad_norm": 0.5955538749694824, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 46950 + }, + { + "epoch": 0.17516767007602038, + "grad_norm": 0.34016668796539307, + "learning_rate": 0.0006, + "loss": 2.1217, + "step": 46960 + }, + { + "epoch": 0.17520497153898376, + "grad_norm": 0.2918568253517151, + "learning_rate": 0.0006, + "loss": 2.3407, + "step": 46970 + }, + { + "epoch": 
0.17524227300194714, + "grad_norm": 0.3881615996360779, + "learning_rate": 0.0006, + "loss": 2.0694, + "step": 46980 + }, + { + "epoch": 0.17527957446491052, + "grad_norm": 0.4133944809436798, + "learning_rate": 0.0006, + "loss": 2.321, + "step": 46990 + }, + { + "epoch": 0.1753168759278739, + "grad_norm": 0.28057870268821716, + "learning_rate": 0.0006, + "loss": 2.3287, + "step": 47000 + }, + { + "epoch": 0.1753168759278739, + "eval_valid_loss": 2.1822292804718018, + "eval_valid_loss/all": 2.046208143234253, + "eval_valid_loss/end_span": 1.1974358558654785, + "eval_valid_perplexity/batch": 7.738502025604248, + "eval_valid_perplexity/end_span": 3.311614513397217, + "eval_valid_perplexity/fim": 2.3889167308807373, + "eval_valid_perplexity/first_seq": 14.408445358276367, + "eval_valid_perplexity/last_seq": 8.701849937438965, + "eval_valid_perplexity/second_seq": 13.539916038513184, + "eval_valid_perplexity/seq": 8.727279663085938, + "eval_valid_reconstruction/all": 0.296936959028244, + "eval_valid_reconstruction/end_span": 0.7217732667922974, + "eval_valid_reconstruction/fim": 0.1757933497428894, + "eval_valid_reconstruction/first_seq": 0.17833007872104645, + "eval_valid_reconstruction/last_seq": 0.3353356122970581, + "eval_valid_reconstruction/second_seq": 0.20368996262550354, + "eval_valid_runtime": 526.2068, + "eval_valid_samples_per_second": 0.365, + "eval_valid_steps_per_second": 0.365, + "step": 47000 + }, + { + "epoch": 0.1753168759278739, + "eval_train_loss": 2.1819427013397217, + "eval_train_loss/all": 2.0197036266326904, + "eval_train_loss/end_span": 1.15984308719635, + "eval_train_perplexity/batch": 7.536091327667236, + "eval_train_perplexity/end_span": 3.1894328594207764, + "eval_train_perplexity/fim": 2.350325107574463, + "eval_train_perplexity/first_seq": 15.570558547973633, + "eval_train_perplexity/last_seq": 8.425017356872559, + "eval_train_perplexity/second_seq": 14.05192756652832, + "eval_train_perplexity/seq": 8.67674732208252, + 
"eval_train_reconstruction/all": 0.28585997223854065, + "eval_train_reconstruction/end_span": 0.7327854037284851, + "eval_train_reconstruction/fim": 0.17228755354881287, + "eval_train_reconstruction/first_seq": 0.15168538689613342, + "eval_train_reconstruction/last_seq": 0.34371525049209595, + "eval_train_reconstruction/second_seq": 0.18847794830799103, + "eval_train_runtime": 518.8854, + "eval_train_samples_per_second": 0.37, + "eval_train_steps_per_second": 0.37, + "step": 47000 + }, + { + "epoch": 0.17535417739083728, + "grad_norm": 0.3771151304244995, + "learning_rate": 0.0006, + "loss": 2.0294, + "step": 47010 + }, + { + "epoch": 0.17539147885380066, + "grad_norm": 0.3357031047344208, + "learning_rate": 0.0006, + "loss": 2.112, + "step": 47020 + }, + { + "epoch": 0.175428780316764, + "grad_norm": 0.3500498831272125, + "learning_rate": 0.0006, + "loss": 2.0996, + "step": 47030 + }, + { + "epoch": 0.1754660817797274, + "grad_norm": 0.37558525800704956, + "learning_rate": 0.0006, + "loss": 2.0918, + "step": 47040 + }, + { + "epoch": 0.17550338324269077, + "grad_norm": 0.34323930740356445, + "learning_rate": 0.0006, + "loss": 2.1499, + "step": 47050 + }, + { + "epoch": 0.17554068470565415, + "grad_norm": 0.48832258582115173, + "learning_rate": 0.0006, + "loss": 2.055, + "step": 47060 + }, + { + "epoch": 0.17557798616861753, + "grad_norm": 0.24941937625408173, + "learning_rate": 0.0006, + "loss": 2.0294, + "step": 47070 + }, + { + "epoch": 0.1756152876315809, + "grad_norm": 0.31956520676612854, + "learning_rate": 0.0006, + "loss": 2.2002, + "step": 47080 + }, + { + "epoch": 0.1756525890945443, + "grad_norm": 0.2916072607040405, + "learning_rate": 0.0006, + "loss": 2.2264, + "step": 47090 + }, + { + "epoch": 0.17568989055750767, + "grad_norm": 0.4003258943557739, + "learning_rate": 0.0006, + "loss": 2.2711, + "step": 47100 + }, + { + "epoch": 0.17572719202047105, + "grad_norm": 0.4000511169433594, + "learning_rate": 0.0006, + "loss": 1.9159, + "step": 47110 + }, + { 
+ "epoch": 0.17576449348343443, + "grad_norm": 0.4268791973590851, + "learning_rate": 0.0006, + "loss": 2.1654, + "step": 47120 + }, + { + "epoch": 0.1758017949463978, + "grad_norm": 0.35292547941207886, + "learning_rate": 0.0006, + "loss": 2.0626, + "step": 47130 + }, + { + "epoch": 0.17583909640936118, + "grad_norm": 0.2716134488582611, + "learning_rate": 0.0006, + "loss": 2.2321, + "step": 47140 + }, + { + "epoch": 0.17587639787232456, + "grad_norm": 0.3806495666503906, + "learning_rate": 0.0006, + "loss": 2.2118, + "step": 47150 + }, + { + "epoch": 0.17591369933528794, + "grad_norm": 0.3612562417984009, + "learning_rate": 0.0006, + "loss": 2.1421, + "step": 47160 + }, + { + "epoch": 0.1759510007982513, + "grad_norm": 0.32680267095565796, + "learning_rate": 0.0006, + "loss": 2.1584, + "step": 47170 + }, + { + "epoch": 0.17598830226121467, + "grad_norm": 0.45786458253860474, + "learning_rate": 0.0006, + "loss": 2.2013, + "step": 47180 + }, + { + "epoch": 0.17602560372417805, + "grad_norm": 0.48086273670196533, + "learning_rate": 0.0006, + "loss": 2.2978, + "step": 47190 + }, + { + "epoch": 0.17606290518714143, + "grad_norm": 0.4424624443054199, + "learning_rate": 0.0006, + "loss": 2.0286, + "step": 47200 + }, + { + "epoch": 0.1761002066501048, + "grad_norm": 0.28549015522003174, + "learning_rate": 0.0006, + "loss": 2.1959, + "step": 47210 + }, + { + "epoch": 0.1761375081130682, + "grad_norm": 0.42784032225608826, + "learning_rate": 0.0006, + "loss": 2.3614, + "step": 47220 + }, + { + "epoch": 0.17617480957603157, + "grad_norm": 0.3344692289829254, + "learning_rate": 0.0006, + "loss": 2.1164, + "step": 47230 + }, + { + "epoch": 0.17621211103899495, + "grad_norm": 0.23886020481586456, + "learning_rate": 0.0006, + "loss": 2.2075, + "step": 47240 + }, + { + "epoch": 0.17624941250195833, + "grad_norm": 0.9097710251808167, + "learning_rate": 0.0006, + "loss": 2.0459, + "step": 47250 + }, + { + "epoch": 0.17624941250195833, + "eval_valid_loss": 2.1837549209594727, + 
"eval_valid_loss/all": 2.0475716590881348, + "eval_valid_loss/end_span": 1.2186564207077026, + "eval_valid_perplexity/batch": 7.749061107635498, + "eval_valid_perplexity/end_span": 3.3826398849487305, + "eval_valid_perplexity/fim": 2.505415916442871, + "eval_valid_perplexity/first_seq": 14.931279182434082, + "eval_valid_perplexity/last_seq": 8.72429370880127, + "eval_valid_perplexity/second_seq": 13.75400447845459, + "eval_valid_perplexity/seq": 8.737661361694336, + "eval_valid_reconstruction/all": 0.29622188210487366, + "eval_valid_reconstruction/end_span": 0.7175586223602295, + "eval_valid_reconstruction/fim": 0.18554382026195526, + "eval_valid_reconstruction/first_seq": 0.16594092547893524, + "eval_valid_reconstruction/last_seq": 0.33687713742256165, + "eval_valid_reconstruction/second_seq": 0.19516737759113312, + "eval_valid_runtime": 524.5159, + "eval_valid_samples_per_second": 0.366, + "eval_valid_steps_per_second": 0.366, + "step": 47250 + }, + { + "epoch": 0.17624941250195833, + "eval_train_loss": 2.181324005126953, + "eval_train_loss/all": 2.018730878829956, + "eval_train_loss/end_span": 1.1863282918930054, + "eval_train_perplexity/batch": 7.528763771057129, + "eval_train_perplexity/end_span": 3.275034189224243, + "eval_train_perplexity/fim": 2.0925889015197754, + "eval_train_perplexity/first_seq": 15.63436222076416, + "eval_train_perplexity/last_seq": 8.744580268859863, + "eval_train_perplexity/second_seq": 14.000922203063965, + "eval_train_perplexity/seq": 8.67141056060791, + "eval_train_reconstruction/all": 0.28609153628349304, + "eval_train_reconstruction/end_span": 0.7260218858718872, + "eval_train_reconstruction/fim": 0.14928960800170898, + "eval_train_reconstruction/first_seq": 0.1486676037311554, + "eval_train_reconstruction/last_seq": 0.33382683992385864, + "eval_train_reconstruction/second_seq": 0.19152933359146118, + "eval_train_runtime": 518.0296, + "eval_train_samples_per_second": 0.371, + "eval_train_steps_per_second": 0.371, + "step": 47250 
+ }, + { + "epoch": 0.1762867139649217, + "grad_norm": 0.2438710778951645, + "learning_rate": 0.0006, + "loss": 2.3009, + "step": 47260 + }, + { + "epoch": 0.1763240154278851, + "grad_norm": 0.40961432456970215, + "learning_rate": 0.0006, + "loss": 2.1299, + "step": 47270 + }, + { + "epoch": 0.17636131689084847, + "grad_norm": 0.6059914827346802, + "learning_rate": 0.0006, + "loss": 2.0362, + "step": 47280 + }, + { + "epoch": 0.17639861835381185, + "grad_norm": 0.2508642375469208, + "learning_rate": 0.0006, + "loss": 2.0836, + "step": 47290 + }, + { + "epoch": 0.17643591981677523, + "grad_norm": 0.3961138427257538, + "learning_rate": 0.0006, + "loss": 2.0647, + "step": 47300 + }, + { + "epoch": 0.17647322127973858, + "grad_norm": 0.43407443165779114, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 47310 + }, + { + "epoch": 0.17651052274270196, + "grad_norm": 0.2673807442188263, + "learning_rate": 0.0006, + "loss": 2.1075, + "step": 47320 + }, + { + "epoch": 0.17654782420566534, + "grad_norm": 0.4190669059753418, + "learning_rate": 0.0006, + "loss": 2.1269, + "step": 47330 + }, + { + "epoch": 0.17658512566862872, + "grad_norm": 0.37893131375312805, + "learning_rate": 0.0006, + "loss": 2.1948, + "step": 47340 + }, + { + "epoch": 0.1766224271315921, + "grad_norm": 8.661190032958984, + "learning_rate": 0.0006, + "loss": 2.358, + "step": 47350 + }, + { + "epoch": 0.17665972859455548, + "grad_norm": 0.3515980839729309, + "learning_rate": 0.0006, + "loss": 2.16, + "step": 47360 + }, + { + "epoch": 0.17669703005751886, + "grad_norm": 0.5106039047241211, + "learning_rate": 0.0006, + "loss": 2.2855, + "step": 47370 + }, + { + "epoch": 0.17673433152048224, + "grad_norm": 0.2363581508398056, + "learning_rate": 0.0006, + "loss": 1.9474, + "step": 47380 + }, + { + "epoch": 0.17677163298344561, + "grad_norm": 0.4888714849948883, + "learning_rate": 0.0006, + "loss": 1.8635, + "step": 47390 + }, + { + "epoch": 0.176808934446409, + "grad_norm": 0.36287420988082886, + 
"learning_rate": 0.0006, + "loss": 2.3294, + "step": 47400 + }, + { + "epoch": 0.17684623590937237, + "grad_norm": 0.35978633165359497, + "learning_rate": 0.0006, + "loss": 2.115, + "step": 47410 + }, + { + "epoch": 0.17688353737233575, + "grad_norm": 0.4656985104084015, + "learning_rate": 0.0006, + "loss": 2.1512, + "step": 47420 + }, + { + "epoch": 0.17692083883529913, + "grad_norm": 0.2584339380264282, + "learning_rate": 0.0006, + "loss": 2.2627, + "step": 47430 + }, + { + "epoch": 0.17695814029826248, + "grad_norm": 0.2778404653072357, + "learning_rate": 0.0006, + "loss": 2.1525, + "step": 47440 + }, + { + "epoch": 0.17699544176122586, + "grad_norm": 0.34547483921051025, + "learning_rate": 0.0006, + "loss": 2.3416, + "step": 47450 + }, + { + "epoch": 0.17703274322418924, + "grad_norm": 0.22963832318782806, + "learning_rate": 0.0006, + "loss": 2.2607, + "step": 47460 + }, + { + "epoch": 0.17707004468715262, + "grad_norm": 0.35274624824523926, + "learning_rate": 0.0006, + "loss": 2.1845, + "step": 47470 + }, + { + "epoch": 0.177107346150116, + "grad_norm": 0.4158378541469574, + "learning_rate": 0.0006, + "loss": 2.1337, + "step": 47480 + }, + { + "epoch": 0.17714464761307938, + "grad_norm": 0.37499141693115234, + "learning_rate": 0.0006, + "loss": 2.1545, + "step": 47490 + }, + { + "epoch": 0.17718194907604276, + "grad_norm": 0.3875156342983246, + "learning_rate": 0.0006, + "loss": 2.0837, + "step": 47500 + }, + { + "epoch": 0.17718194907604276, + "eval_valid_loss": 2.1824140548706055, + "eval_valid_loss/all": 2.0466201305389404, + "eval_valid_loss/end_span": 1.225987434387207, + "eval_valid_perplexity/batch": 7.7416911125183105, + "eval_valid_perplexity/end_span": 3.40752911567688, + "eval_valid_perplexity/fim": 2.3264455795288086, + "eval_valid_perplexity/first_seq": 15.149164199829102, + "eval_valid_perplexity/last_seq": 8.46360969543457, + "eval_valid_perplexity/second_seq": 13.597101211547852, + "eval_valid_perplexity/seq": 8.734169960021973, + 
"eval_valid_reconstruction/all": 0.2968207597732544, + "eval_valid_reconstruction/end_span": 0.716706395149231, + "eval_valid_reconstruction/fim": 0.17033787071704865, + "eval_valid_reconstruction/first_seq": 0.15935556590557098, + "eval_valid_reconstruction/last_seq": 0.3483123779296875, + "eval_valid_reconstruction/second_seq": 0.2016497254371643, + "eval_valid_runtime": 519.1664, + "eval_valid_samples_per_second": 0.37, + "eval_valid_steps_per_second": 0.37, + "step": 47500 + }, + { + "epoch": 0.17718194907604276, + "eval_train_loss": 2.182527542114258, + "eval_train_loss/all": 2.0203027725219727, + "eval_train_loss/end_span": 1.1929972171783447, + "eval_train_perplexity/batch": 7.540607452392578, + "eval_train_perplexity/end_span": 3.296948194503784, + "eval_train_perplexity/fim": 1.9777657985687256, + "eval_train_perplexity/first_seq": 15.451733589172363, + "eval_train_perplexity/last_seq": 8.855475425720215, + "eval_train_perplexity/second_seq": 13.990657806396484, + "eval_train_perplexity/seq": 8.68570327758789, + "eval_train_reconstruction/all": 0.2858722507953644, + "eval_train_reconstruction/end_span": 0.7283209562301636, + "eval_train_reconstruction/fim": 0.13794198632240295, + "eval_train_reconstruction/first_seq": 0.15188495814800262, + "eval_train_reconstruction/last_seq": 0.330167293548584, + "eval_train_reconstruction/second_seq": 0.18786853551864624, + "eval_train_runtime": 517.2945, + "eval_train_samples_per_second": 0.371, + "eval_train_steps_per_second": 0.371, + "step": 47500 + }, + { + "epoch": 0.17721925053900614, + "grad_norm": 0.4946529269218445, + "learning_rate": 0.0006, + "loss": 2.0112, + "step": 47510 + }, + { + "epoch": 0.17725655200196952, + "grad_norm": 0.404994934797287, + "learning_rate": 0.0006, + "loss": 2.2669, + "step": 47520 + }, + { + "epoch": 0.1772938534649329, + "grad_norm": 0.4161774218082428, + "learning_rate": 0.0006, + "loss": 2.1461, + "step": 47530 + }, + { + "epoch": 0.17733115492789628, + "grad_norm": 
0.3273473083972931, + "learning_rate": 0.0006, + "loss": 2.2688, + "step": 47540 + }, + { + "epoch": 0.17736845639085966, + "grad_norm": 0.32632583379745483, + "learning_rate": 0.0006, + "loss": 2.3384, + "step": 47550 + }, + { + "epoch": 0.17740575785382304, + "grad_norm": 0.3058311939239502, + "learning_rate": 0.0006, + "loss": 2.2581, + "step": 47560 + }, + { + "epoch": 0.17744305931678642, + "grad_norm": 0.40186992287635803, + "learning_rate": 0.0006, + "loss": 2.1837, + "step": 47570 + }, + { + "epoch": 0.17748036077974977, + "grad_norm": 0.2759837508201599, + "learning_rate": 0.0006, + "loss": 2.3222, + "step": 47580 + }, + { + "epoch": 0.17751766224271315, + "grad_norm": 1.2202249765396118, + "learning_rate": 0.0006, + "loss": 2.2003, + "step": 47590 + }, + { + "epoch": 0.17755496370567653, + "grad_norm": 0.3069111108779907, + "learning_rate": 0.0006, + "loss": 2.3404, + "step": 47600 + }, + { + "epoch": 0.1775922651686399, + "grad_norm": 0.269894540309906, + "learning_rate": 0.0006, + "loss": 1.9749, + "step": 47610 + }, + { + "epoch": 0.1776295666316033, + "grad_norm": 0.641748309135437, + "learning_rate": 0.0006, + "loss": 1.9636, + "step": 47620 + }, + { + "epoch": 0.17766686809456667, + "grad_norm": 0.2761753797531128, + "learning_rate": 0.0006, + "loss": 2.1912, + "step": 47630 + }, + { + "epoch": 0.17770416955753005, + "grad_norm": 0.27474090456962585, + "learning_rate": 0.0006, + "loss": 2.0949, + "step": 47640 + }, + { + "epoch": 0.17774147102049342, + "grad_norm": 0.36864182353019714, + "learning_rate": 0.0006, + "loss": 2.1226, + "step": 47650 + }, + { + "epoch": 0.1777787724834568, + "grad_norm": 0.33958953619003296, + "learning_rate": 0.0006, + "loss": 2.2473, + "step": 47660 + }, + { + "epoch": 0.17781607394642018, + "grad_norm": 0.44704920053482056, + "learning_rate": 0.0006, + "loss": 2.1848, + "step": 47670 + }, + { + "epoch": 0.17785337540938356, + "grad_norm": 0.2628449499607086, + "learning_rate": 0.0006, + "loss": 2.417, + "step": 47680 
+ }, + { + "epoch": 0.17789067687234694, + "grad_norm": 0.35733169317245483, + "learning_rate": 0.0006, + "loss": 2.225, + "step": 47690 + }, + { + "epoch": 0.17792797833531032, + "grad_norm": 0.47115999460220337, + "learning_rate": 0.0006, + "loss": 2.2367, + "step": 47700 + }, + { + "epoch": 0.1779652797982737, + "grad_norm": 0.2835077941417694, + "learning_rate": 0.0006, + "loss": 2.2886, + "step": 47710 + }, + { + "epoch": 0.17800258126123705, + "grad_norm": 0.3798123300075531, + "learning_rate": 0.0006, + "loss": 2.3007, + "step": 47720 + }, + { + "epoch": 0.17803988272420043, + "grad_norm": 0.31477615237236023, + "learning_rate": 0.0006, + "loss": 2.176, + "step": 47730 + }, + { + "epoch": 0.1780771841871638, + "grad_norm": 0.3711263835430145, + "learning_rate": 0.0006, + "loss": 2.2587, + "step": 47740 + }, + { + "epoch": 0.1781144856501272, + "grad_norm": 0.3454609811306, + "learning_rate": 0.0006, + "loss": 2.2674, + "step": 47750 + }, + { + "epoch": 0.1781144856501272, + "eval_valid_loss": 2.186163902282715, + "eval_valid_loss/all": 2.049984931945801, + "eval_valid_loss/end_span": 1.2512927055358887, + "eval_valid_perplexity/batch": 7.767784118652344, + "eval_valid_perplexity/end_span": 3.4948577880859375, + "eval_valid_perplexity/fim": 2.066767930984497, + "eval_valid_perplexity/first_seq": 15.19382095336914, + "eval_valid_perplexity/last_seq": 8.828544616699219, + "eval_valid_perplexity/second_seq": 13.65820598602295, + "eval_valid_perplexity/seq": 8.757081031799316, + "eval_valid_reconstruction/all": 0.295624315738678, + "eval_valid_reconstruction/end_span": 0.7146793603897095, + "eval_valid_reconstruction/fim": 0.14577658474445343, + "eval_valid_reconstruction/first_seq": 0.1614164113998413, + "eval_valid_reconstruction/last_seq": 0.33299127221107483, + "eval_valid_reconstruction/second_seq": 0.19624803960323334, + "eval_valid_runtime": 517.3705, + "eval_valid_samples_per_second": 0.371, + "eval_valid_steps_per_second": 0.371, + "step": 47750 + }, + { 
+ "epoch": 0.1781144856501272, + "eval_train_loss": 2.1825172901153564, + "eval_train_loss/all": 2.0200231075286865, + "eval_train_loss/end_span": 1.217686653137207, + "eval_train_perplexity/batch": 7.538499355316162, + "eval_train_perplexity/end_span": 3.379361152648926, + "eval_train_perplexity/fim": 2.25823974609375, + "eval_train_perplexity/first_seq": 15.4373140335083, + "eval_train_perplexity/last_seq": 8.674726486206055, + "eval_train_perplexity/second_seq": 14.485161781311035, + "eval_train_perplexity/seq": 8.683789253234863, + "eval_train_reconstruction/all": 0.28562307357788086, + "eval_train_reconstruction/end_span": 0.7222080230712891, + "eval_train_reconstruction/fim": 0.16349001228809357, + "eval_train_reconstruction/first_seq": 0.15297068655490875, + "eval_train_reconstruction/last_seq": 0.3358062505722046, + "eval_train_reconstruction/second_seq": 0.1759590357542038, + "eval_train_runtime": 525.3717, + "eval_train_samples_per_second": 0.365, + "eval_train_steps_per_second": 0.365, + "step": 47750 + }, + { + "epoch": 0.17815178711309057, + "grad_norm": 0.39205488562583923, + "learning_rate": 0.0006, + "loss": 2.1329, + "step": 47760 + }, + { + "epoch": 0.17818908857605395, + "grad_norm": 0.28482505679130554, + "learning_rate": 0.0006, + "loss": 2.2361, + "step": 47770 + }, + { + "epoch": 0.17822639003901733, + "grad_norm": 0.2862361967563629, + "learning_rate": 0.0006, + "loss": 2.1529, + "step": 47780 + }, + { + "epoch": 0.1782636915019807, + "grad_norm": 0.2759186029434204, + "learning_rate": 0.0006, + "loss": 2.3693, + "step": 47790 + }, + { + "epoch": 0.1783009929649441, + "grad_norm": 0.27648553252220154, + "learning_rate": 0.0006, + "loss": 2.2474, + "step": 47800 + }, + { + "epoch": 0.17833829442790747, + "grad_norm": 0.3645535409450531, + "learning_rate": 0.0006, + "loss": 2.1018, + "step": 47810 + }, + { + "epoch": 0.17837559589087085, + "grad_norm": 0.2735760807991028, + "learning_rate": 0.0006, + "loss": 2.2379, + "step": 47820 + }, + { + 
"epoch": 0.17841289735383423, + "grad_norm": 0.23292216658592224, + "learning_rate": 0.0006, + "loss": 2.3823, + "step": 47830 + }, + { + "epoch": 0.1784501988167976, + "grad_norm": 0.27813851833343506, + "learning_rate": 0.0006, + "loss": 2.3457, + "step": 47840 + }, + { + "epoch": 0.17848750027976099, + "grad_norm": 0.2863854467868805, + "learning_rate": 0.0006, + "loss": 2.0955, + "step": 47850 + }, + { + "epoch": 0.17852480174272434, + "grad_norm": 0.2925645709037781, + "learning_rate": 0.0006, + "loss": 2.2638, + "step": 47860 + }, + { + "epoch": 0.17856210320568772, + "grad_norm": 0.27616527676582336, + "learning_rate": 0.0006, + "loss": 2.1553, + "step": 47870 + }, + { + "epoch": 0.1785994046686511, + "grad_norm": 0.26143836975097656, + "learning_rate": 0.0006, + "loss": 2.2999, + "step": 47880 + }, + { + "epoch": 0.17863670613161448, + "grad_norm": 0.32881420850753784, + "learning_rate": 0.0006, + "loss": 2.2381, + "step": 47890 + }, + { + "epoch": 0.17867400759457785, + "grad_norm": 0.31185200810432434, + "learning_rate": 0.0006, + "loss": 2.2453, + "step": 47900 + }, + { + "epoch": 0.17871130905754123, + "grad_norm": 0.45096081495285034, + "learning_rate": 0.0006, + "loss": 2.2944, + "step": 47910 + }, + { + "epoch": 0.1787486105205046, + "grad_norm": 0.4509238600730896, + "learning_rate": 0.0006, + "loss": 2.2521, + "step": 47920 + }, + { + "epoch": 0.178785911983468, + "grad_norm": 0.2802223563194275, + "learning_rate": 0.0006, + "loss": 2.0645, + "step": 47930 + }, + { + "epoch": 0.17882321344643137, + "grad_norm": 0.31433260440826416, + "learning_rate": 0.0006, + "loss": 2.2706, + "step": 47940 + }, + { + "epoch": 0.17886051490939475, + "grad_norm": 0.3247743844985962, + "learning_rate": 0.0006, + "loss": 2.294, + "step": 47950 + }, + { + "epoch": 0.17889781637235813, + "grad_norm": 0.2576138973236084, + "learning_rate": 0.0006, + "loss": 2.1262, + "step": 47960 + }, + { + "epoch": 0.1789351178353215, + "grad_norm": 0.36294931173324585, + 
"learning_rate": 0.0006, + "loss": 2.2311, + "step": 47970 + }, + { + "epoch": 0.1789724192982849, + "grad_norm": 0.3114989697933197, + "learning_rate": 0.0006, + "loss": 2.1977, + "step": 47980 + }, + { + "epoch": 0.17900972076124827, + "grad_norm": 0.46019694209098816, + "learning_rate": 0.0006, + "loss": 2.0832, + "step": 47990 + }, + { + "epoch": 0.17904702222421162, + "grad_norm": 0.40756046772003174, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 48000 + }, + { + "epoch": 0.17904702222421162, + "eval_valid_loss": 2.1819169521331787, + "eval_valid_loss/all": 2.045752763748169, + "eval_valid_loss/end_span": 1.2846171855926514, + "eval_valid_perplexity/batch": 7.734979152679443, + "eval_valid_perplexity/end_span": 3.6132845878601074, + "eval_valid_perplexity/fim": 2.279207468032837, + "eval_valid_perplexity/first_seq": 14.72753620147705, + "eval_valid_perplexity/last_seq": 8.43801212310791, + "eval_valid_perplexity/second_seq": 14.045586585998535, + "eval_valid_perplexity/seq": 8.716297149658203, + "eval_valid_reconstruction/all": 0.29661309719085693, + "eval_valid_reconstruction/end_span": 0.6966955065727234, + "eval_valid_reconstruction/fim": 0.1676202416419983, + "eval_valid_reconstruction/first_seq": 0.17029443383216858, + "eval_valid_reconstruction/last_seq": 0.3461567461490631, + "eval_valid_reconstruction/second_seq": 0.1841506063938141, + "eval_valid_runtime": 518.2451, + "eval_valid_samples_per_second": 0.37, + "eval_valid_steps_per_second": 0.37, + "step": 48000 + }, + { + "epoch": 0.17904702222421162, + "eval_train_loss": 2.1789491176605225, + "eval_train_loss/all": 2.0168697834014893, + "eval_train_loss/end_span": 1.2506093978881836, + "eval_train_perplexity/batch": 7.51476526260376, + "eval_train_perplexity/end_span": 3.4924705028533936, + "eval_train_perplexity/fim": 2.1171839237213135, + "eval_train_perplexity/first_seq": 15.349126815795898, + "eval_train_perplexity/last_seq": 8.811931610107422, + "eval_train_perplexity/second_seq": 
14.217194557189941, + "eval_train_perplexity/seq": 8.653393745422363, + "eval_train_reconstruction/all": 0.28645002841949463, + "eval_train_reconstruction/end_span": 0.7068955302238464, + "eval_train_reconstruction/fim": 0.15125809609889984, + "eval_train_reconstruction/first_seq": 0.15572503209114075, + "eval_train_reconstruction/last_seq": 0.3338446319103241, + "eval_train_reconstruction/second_seq": 0.18669022619724274, + "eval_train_runtime": 519.7243, + "eval_train_samples_per_second": 0.369, + "eval_train_steps_per_second": 0.369, + "step": 48000 + }, + { + "epoch": 0.179084323687175, + "grad_norm": 0.28797417879104614, + "learning_rate": 0.0006, + "loss": 2.0937, + "step": 48010 + }, + { + "epoch": 0.17912162515013838, + "grad_norm": 0.31132611632347107, + "learning_rate": 0.0006, + "loss": 2.192, + "step": 48020 + }, + { + "epoch": 0.17915892661310176, + "grad_norm": 0.5201295018196106, + "learning_rate": 0.0006, + "loss": 2.1583, + "step": 48030 + }, + { + "epoch": 0.17919622807606514, + "grad_norm": 0.5826014280319214, + "learning_rate": 0.0006, + "loss": 2.1859, + "step": 48040 + }, + { + "epoch": 0.17923352953902852, + "grad_norm": 0.5822812914848328, + "learning_rate": 0.0006, + "loss": 2.1563, + "step": 48050 + }, + { + "epoch": 0.1792708310019919, + "grad_norm": 0.31135910749435425, + "learning_rate": 0.0006, + "loss": 2.4131, + "step": 48060 + }, + { + "epoch": 0.17930813246495528, + "grad_norm": 0.4444440007209778, + "learning_rate": 0.0006, + "loss": 2.1763, + "step": 48070 + }, + { + "epoch": 0.17934543392791866, + "grad_norm": 0.687171995639801, + "learning_rate": 0.0006, + "loss": 2.0073, + "step": 48080 + }, + { + "epoch": 0.17938273539088204, + "grad_norm": 0.3054075837135315, + "learning_rate": 0.0006, + "loss": 2.2363, + "step": 48090 + }, + { + "epoch": 0.17942003685384542, + "grad_norm": 0.47343552112579346, + "learning_rate": 0.0006, + "loss": 2.0115, + "step": 48100 + }, + { + "epoch": 0.1794573383168088, + "grad_norm": 
0.25276556611061096, + "learning_rate": 0.0006, + "loss": 2.3213, + "step": 48110 + }, + { + "epoch": 0.17949463977977217, + "grad_norm": 0.45099520683288574, + "learning_rate": 0.0006, + "loss": 1.9969, + "step": 48120 + }, + { + "epoch": 0.17953194124273553, + "grad_norm": 0.25869038701057434, + "learning_rate": 0.0006, + "loss": 2.3808, + "step": 48130 + }, + { + "epoch": 0.1795692427056989, + "grad_norm": 0.3739531636238098, + "learning_rate": 0.0006, + "loss": 2.3476, + "step": 48140 + }, + { + "epoch": 0.17960654416866229, + "grad_norm": 0.4610814154148102, + "learning_rate": 0.0006, + "loss": 2.308, + "step": 48150 + }, + { + "epoch": 0.17964384563162566, + "grad_norm": 0.2591072618961334, + "learning_rate": 0.0006, + "loss": 2.1221, + "step": 48160 + }, + { + "epoch": 0.17968114709458904, + "grad_norm": 0.4578026533126831, + "learning_rate": 0.0006, + "loss": 2.1738, + "step": 48170 + }, + { + "epoch": 0.17971844855755242, + "grad_norm": 0.40266233682632446, + "learning_rate": 0.0006, + "loss": 2.1176, + "step": 48180 + }, + { + "epoch": 0.1797557500205158, + "grad_norm": 0.3880186676979065, + "learning_rate": 0.0006, + "loss": 2.2875, + "step": 48190 + }, + { + "epoch": 0.17979305148347918, + "grad_norm": 0.4621662199497223, + "learning_rate": 0.0006, + "loss": 2.1686, + "step": 48200 + }, + { + "epoch": 0.17983035294644256, + "grad_norm": 0.34520024061203003, + "learning_rate": 0.0006, + "loss": 2.2259, + "step": 48210 + }, + { + "epoch": 0.17986765440940594, + "grad_norm": 0.26666873693466187, + "learning_rate": 0.0006, + "loss": 2.0822, + "step": 48220 + }, + { + "epoch": 0.17990495587236932, + "grad_norm": 0.6929119825363159, + "learning_rate": 0.0006, + "loss": 2.0491, + "step": 48230 + }, + { + "epoch": 0.1799422573353327, + "grad_norm": 0.44340217113494873, + "learning_rate": 0.0006, + "loss": 2.2298, + "step": 48240 + }, + { + "epoch": 0.17997955879829608, + "grad_norm": 0.8747690916061401, + "learning_rate": 0.0006, + "loss": 2.3395, + "step": 
48250 + }, + { + "epoch": 0.17997955879829608, + "eval_valid_loss": 2.187880277633667, + "eval_valid_loss/all": 2.0512912273406982, + "eval_valid_loss/end_span": 1.237638235092163, + "eval_valid_perplexity/batch": 7.777937889099121, + "eval_valid_perplexity/end_span": 3.4474618434906006, + "eval_valid_perplexity/fim": 2.234126567840576, + "eval_valid_perplexity/first_seq": 15.066265106201172, + "eval_valid_perplexity/last_seq": 9.21311092376709, + "eval_valid_perplexity/second_seq": 13.735993385314941, + "eval_valid_perplexity/seq": 8.770363807678223, + "eval_valid_reconstruction/all": 0.29537466168403625, + "eval_valid_reconstruction/end_span": 0.7049818634986877, + "eval_valid_reconstruction/fim": 0.16185414791107178, + "eval_valid_reconstruction/first_seq": 0.1631470024585724, + "eval_valid_reconstruction/last_seq": 0.3199625611305237, + "eval_valid_reconstruction/second_seq": 0.197060689330101, + "eval_valid_runtime": 508.6256, + "eval_valid_samples_per_second": 0.377, + "eval_valid_steps_per_second": 0.377, + "step": 48250 + }, + { + "epoch": 0.17997955879829608, + "eval_train_loss": 2.187566041946411, + "eval_train_loss/all": 2.0241611003875732, + "eval_train_loss/end_span": 1.201980710029602, + "eval_train_perplexity/batch": 7.56975793838501, + "eval_train_perplexity/end_span": 3.326699733734131, + "eval_train_perplexity/fim": 1.9491558074951172, + "eval_train_perplexity/first_seq": 15.36584186553955, + "eval_train_perplexity/last_seq": 8.877985954284668, + "eval_train_perplexity/second_seq": 14.45762825012207, + "eval_train_perplexity/seq": 8.715703010559082, + "eval_train_reconstruction/all": 0.2844821512699127, + "eval_train_reconstruction/end_span": 0.7159624099731445, + "eval_train_reconstruction/fim": 0.1337263137102127, + "eval_train_reconstruction/first_seq": 0.1548212468624115, + "eval_train_reconstruction/last_seq": 0.33425796031951904, + "eval_train_reconstruction/second_seq": 0.17690931260585785, + "eval_train_runtime": 514.7197, + 
"eval_train_samples_per_second": 0.373, + "eval_train_steps_per_second": 0.373, + "step": 48250 + }, + { + "epoch": 0.18001686026125946, + "grad_norm": 0.4697090685367584, + "learning_rate": 0.0006, + "loss": 2.2126, + "step": 48260 + }, + { + "epoch": 0.1800541617242228, + "grad_norm": 0.35707148909568787, + "learning_rate": 0.0006, + "loss": 2.3113, + "step": 48270 + }, + { + "epoch": 0.1800914631871862, + "grad_norm": 0.4004633128643036, + "learning_rate": 0.0006, + "loss": 2.1585, + "step": 48280 + }, + { + "epoch": 0.18012876465014957, + "grad_norm": 0.41196709871292114, + "learning_rate": 0.0006, + "loss": 2.2074, + "step": 48290 + }, + { + "epoch": 0.18016606611311295, + "grad_norm": 0.2657153606414795, + "learning_rate": 0.0006, + "loss": 2.1545, + "step": 48300 + }, + { + "epoch": 0.18020336757607633, + "grad_norm": 0.31108370423316956, + "learning_rate": 0.0006, + "loss": 2.1649, + "step": 48310 + }, + { + "epoch": 0.1802406690390397, + "grad_norm": 0.32589393854141235, + "learning_rate": 0.0006, + "loss": 2.2811, + "step": 48320 + }, + { + "epoch": 0.1802779705020031, + "grad_norm": 0.5149913430213928, + "learning_rate": 0.0006, + "loss": 2.1823, + "step": 48330 + }, + { + "epoch": 0.18031527196496647, + "grad_norm": 0.31573760509490967, + "learning_rate": 0.0006, + "loss": 2.2219, + "step": 48340 + }, + { + "epoch": 0.18035257342792985, + "grad_norm": 0.30930912494659424, + "learning_rate": 0.0006, + "loss": 2.2456, + "step": 48350 + }, + { + "epoch": 0.18038987489089323, + "grad_norm": 0.23074600100517273, + "learning_rate": 0.0006, + "loss": 2.2494, + "step": 48360 + }, + { + "epoch": 0.1804271763538566, + "grad_norm": 0.3017536401748657, + "learning_rate": 0.0006, + "loss": 2.4114, + "step": 48370 + }, + { + "epoch": 0.18046447781681998, + "grad_norm": 0.3195408582687378, + "learning_rate": 0.0006, + "loss": 2.2578, + "step": 48380 + }, + { + "epoch": 0.18050177927978336, + "grad_norm": 0.4165717661380768, + "learning_rate": 0.0006, + "loss": 2.1026, 
+ "step": 48390 + }, + { + "epoch": 0.18053908074274674, + "grad_norm": 0.561040997505188, + "learning_rate": 0.0006, + "loss": 2.1744, + "step": 48400 + }, + { + "epoch": 0.1805763822057101, + "grad_norm": 0.34170445799827576, + "learning_rate": 0.0006, + "loss": 2.1188, + "step": 48410 + }, + { + "epoch": 0.18061368366867347, + "grad_norm": 0.22400227189064026, + "learning_rate": 0.0006, + "loss": 2.2324, + "step": 48420 + }, + { + "epoch": 0.18065098513163685, + "grad_norm": 0.27741119265556335, + "learning_rate": 0.0006, + "loss": 2.2533, + "step": 48430 + }, + { + "epoch": 0.18068828659460023, + "grad_norm": 0.33669260144233704, + "learning_rate": 0.0006, + "loss": 1.996, + "step": 48440 + }, + { + "epoch": 0.1807255880575636, + "grad_norm": 0.2774890065193176, + "learning_rate": 0.0006, + "loss": 2.3753, + "step": 48450 + }, + { + "epoch": 0.180762889520527, + "grad_norm": 0.3854230046272278, + "learning_rate": 0.0006, + "loss": 2.1042, + "step": 48460 + }, + { + "epoch": 0.18080019098349037, + "grad_norm": 0.2977631092071533, + "learning_rate": 0.0006, + "loss": 2.1849, + "step": 48470 + }, + { + "epoch": 0.18083749244645375, + "grad_norm": 0.3738870322704315, + "learning_rate": 0.0006, + "loss": 1.9974, + "step": 48480 + }, + { + "epoch": 0.18087479390941713, + "grad_norm": 0.3249952495098114, + "learning_rate": 0.0006, + "loss": 1.9328, + "step": 48490 + }, + { + "epoch": 0.1809120953723805, + "grad_norm": 0.3198281228542328, + "learning_rate": 0.0006, + "loss": 2.0719, + "step": 48500 + }, + { + "epoch": 0.1809120953723805, + "eval_valid_loss": 2.1832051277160645, + "eval_valid_loss/all": 2.046994686126709, + "eval_valid_loss/end_span": 1.2191766500473022, + "eval_valid_perplexity/batch": 7.744591236114502, + "eval_valid_perplexity/end_span": 3.3844001293182373, + "eval_valid_perplexity/fim": 2.365783452987671, + "eval_valid_perplexity/first_seq": 14.740147590637207, + "eval_valid_perplexity/last_seq": 9.146360397338867, + 
"eval_valid_perplexity/second_seq": 13.980215072631836, + "eval_valid_perplexity/seq": 8.736708641052246, + "eval_valid_reconstruction/all": 0.29653987288475037, + "eval_valid_reconstruction/end_span": 0.71642005443573, + "eval_valid_reconstruction/fim": 0.1729540377855301, + "eval_valid_reconstruction/first_seq": 0.16755908727645874, + "eval_valid_reconstruction/last_seq": 0.3208508789539337, + "eval_valid_reconstruction/second_seq": 0.18852318823337555, + "eval_valid_runtime": 501.8296, + "eval_valid_samples_per_second": 0.383, + "eval_valid_steps_per_second": 0.383, + "step": 48500 + }, + { + "epoch": 0.1809120953723805, + "eval_train_loss": 2.179736375808716, + "eval_train_loss/all": 2.0175695419311523, + "eval_train_loss/end_span": 1.185056209564209, + "eval_train_perplexity/batch": 7.520025730133057, + "eval_train_perplexity/end_span": 3.2708706855773926, + "eval_train_perplexity/fim": 2.455641984939575, + "eval_train_perplexity/first_seq": 15.464640617370605, + "eval_train_perplexity/last_seq": 8.612537384033203, + "eval_train_perplexity/second_seq": 14.487796783447266, + "eval_train_perplexity/seq": 8.661050796508789, + "eval_train_reconstruction/all": 0.28639575839042664, + "eval_train_reconstruction/end_span": 0.7279810309410095, + "eval_train_reconstruction/fim": 0.1810247004032135, + "eval_train_reconstruction/first_seq": 0.1506623923778534, + "eval_train_reconstruction/last_seq": 0.3395962715148926, + "eval_train_reconstruction/second_seq": 0.17891784012317657, + "eval_train_runtime": 518.81, + "eval_train_samples_per_second": 0.37, + "eval_train_steps_per_second": 0.37, + "step": 48500 + }, + { + "epoch": 0.1809493968353439, + "grad_norm": 0.244635671377182, + "learning_rate": 0.0006, + "loss": 2.1626, + "step": 48510 + }, + { + "epoch": 0.18098669829830727, + "grad_norm": 0.3433055281639099, + "learning_rate": 0.0006, + "loss": 2.1248, + "step": 48520 + }, + { + "epoch": 0.18102399976127065, + "grad_norm": 0.3158920407295227, + "learning_rate": 
0.0006, + "loss": 2.2141, + "step": 48530 + }, + { + "epoch": 0.18106130122423403, + "grad_norm": 0.29473596811294556, + "learning_rate": 0.0006, + "loss": 2.3293, + "step": 48540 + }, + { + "epoch": 0.18109860268719738, + "grad_norm": 0.3637133538722992, + "learning_rate": 0.0006, + "loss": 2.0438, + "step": 48550 + }, + { + "epoch": 0.18113590415016076, + "grad_norm": 0.6254370212554932, + "learning_rate": 0.0006, + "loss": 2.1071, + "step": 48560 + }, + { + "epoch": 0.18117320561312414, + "grad_norm": 0.26309719681739807, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 48570 + }, + { + "epoch": 0.18121050707608752, + "grad_norm": 0.2686316668987274, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 48580 + }, + { + "epoch": 0.1812478085390509, + "grad_norm": 0.3153771758079529, + "learning_rate": 0.0006, + "loss": 2.2844, + "step": 48590 + }, + { + "epoch": 0.18128511000201428, + "grad_norm": 0.3428932726383209, + "learning_rate": 0.0006, + "loss": 2.2851, + "step": 48600 + }, + { + "epoch": 0.18132241146497766, + "grad_norm": 0.3214545249938965, + "learning_rate": 0.0006, + "loss": 2.0599, + "step": 48610 + }, + { + "epoch": 0.18135971292794104, + "grad_norm": 0.40566515922546387, + "learning_rate": 0.0006, + "loss": 2.1956, + "step": 48620 + }, + { + "epoch": 0.18139701439090442, + "grad_norm": 0.38953980803489685, + "learning_rate": 0.0006, + "loss": 2.1839, + "step": 48630 + }, + { + "epoch": 0.1814343158538678, + "grad_norm": 0.24002905189990997, + "learning_rate": 0.0006, + "loss": 2.3664, + "step": 48640 + }, + { + "epoch": 0.18147161731683117, + "grad_norm": 0.31252214312553406, + "learning_rate": 0.0006, + "loss": 2.0871, + "step": 48650 + }, + { + "epoch": 0.18150891877979455, + "grad_norm": 0.3459746241569519, + "learning_rate": 0.0006, + "loss": 2.2939, + "step": 48660 + }, + { + "epoch": 0.18154622024275793, + "grad_norm": 0.460504949092865, + "learning_rate": 0.0006, + "loss": 2.2544, + "step": 48670 + }, + { + "epoch": 
0.18158352170572128, + "grad_norm": 0.2598676383495331, + "learning_rate": 0.0006, + "loss": 2.2483, + "step": 48680 + }, + { + "epoch": 0.18162082316868466, + "grad_norm": 0.30478161573410034, + "learning_rate": 0.0006, + "loss": 2.2098, + "step": 48690 + }, + { + "epoch": 0.18165812463164804, + "grad_norm": 0.4550066590309143, + "learning_rate": 0.0006, + "loss": 2.0929, + "step": 48700 + }, + { + "epoch": 0.18169542609461142, + "grad_norm": 0.4238313138484955, + "learning_rate": 0.0006, + "loss": 2.1301, + "step": 48710 + }, + { + "epoch": 0.1817327275575748, + "grad_norm": 0.4338102638721466, + "learning_rate": 0.0006, + "loss": 2.2288, + "step": 48720 + }, + { + "epoch": 0.18177002902053818, + "grad_norm": 0.3623291850090027, + "learning_rate": 0.0006, + "loss": 2.077, + "step": 48730 + }, + { + "epoch": 0.18180733048350156, + "grad_norm": 0.28533458709716797, + "learning_rate": 0.0006, + "loss": 2.2478, + "step": 48740 + }, + { + "epoch": 0.18184463194646494, + "grad_norm": 0.34654393792152405, + "learning_rate": 0.0006, + "loss": 2.0581, + "step": 48750 + }, + { + "epoch": 0.18184463194646494, + "eval_valid_loss": 2.1847469806671143, + "eval_valid_loss/all": 2.048799991607666, + "eval_valid_loss/end_span": 1.3649407625198364, + "eval_valid_perplexity/batch": 7.758584976196289, + "eval_valid_perplexity/end_span": 3.9154911041259766, + "eval_valid_perplexity/fim": 2.347322702407837, + "eval_valid_perplexity/first_seq": 15.101591110229492, + "eval_valid_perplexity/last_seq": 8.754831314086914, + "eval_valid_perplexity/second_seq": 13.5686616897583, + "eval_valid_perplexity/seq": 8.757282257080078, + "eval_valid_reconstruction/all": 0.29617637395858765, + "eval_valid_reconstruction/end_span": 0.686431348323822, + "eval_valid_reconstruction/fim": 0.1721058189868927, + "eval_valid_reconstruction/first_seq": 0.16253486275672913, + "eval_valid_reconstruction/last_seq": 0.33697813749313354, + "eval_valid_reconstruction/second_seq": 0.19899669289588928, + 
"eval_valid_runtime": 512.4711, + "eval_valid_samples_per_second": 0.375, + "eval_valid_steps_per_second": 0.375, + "step": 48750 + }, + { + "epoch": 0.18184463194646494, + "eval_train_loss": 2.1840217113494873, + "eval_train_loss/all": 2.0221354961395264, + "eval_train_loss/end_span": 1.3222905397415161, + "eval_train_perplexity/batch": 7.554440021514893, + "eval_train_perplexity/end_span": 3.7520055770874023, + "eval_train_perplexity/fim": 2.1455554962158203, + "eval_train_perplexity/first_seq": 15.517251968383789, + "eval_train_perplexity/last_seq": 8.736166000366211, + "eval_train_perplexity/second_seq": 14.29467487335205, + "eval_train_perplexity/seq": 8.70718765258789, + "eval_train_reconstruction/all": 0.28521883487701416, + "eval_train_reconstruction/end_span": 0.694409966468811, + "eval_train_reconstruction/fim": 0.15316876769065857, + "eval_train_reconstruction/first_seq": 0.14998190104961395, + "eval_train_reconstruction/last_seq": 0.33489730954170227, + "eval_train_reconstruction/second_seq": 0.17998461425304413, + "eval_train_runtime": 508.2806, + "eval_train_samples_per_second": 0.378, + "eval_train_steps_per_second": 0.378, + "step": 48750 + }, + { + "epoch": 0.18188193340942832, + "grad_norm": 0.2238171398639679, + "learning_rate": 0.0006, + "loss": 2.2536, + "step": 48760 + }, + { + "epoch": 0.1819192348723917, + "grad_norm": 0.3597430884838104, + "learning_rate": 0.0006, + "loss": 2.3211, + "step": 48770 + }, + { + "epoch": 0.18195653633535508, + "grad_norm": 0.5357645750045776, + "learning_rate": 0.0006, + "loss": 2.1882, + "step": 48780 + }, + { + "epoch": 0.18199383779831846, + "grad_norm": 0.21719439327716827, + "learning_rate": 0.0006, + "loss": 2.2039, + "step": 48790 + }, + { + "epoch": 0.18203113926128184, + "grad_norm": 0.3249138593673706, + "learning_rate": 0.0006, + "loss": 2.2705, + "step": 48800 + }, + { + "epoch": 0.18206844072424522, + "grad_norm": 0.5006373524665833, + "learning_rate": 0.0006, + "loss": 2.2888, + "step": 48810 + }, 
+ { + "epoch": 0.18210574218720857, + "grad_norm": 0.3214944005012512, + "learning_rate": 0.0006, + "loss": 2.1527, + "step": 48820 + }, + { + "epoch": 0.18214304365017195, + "grad_norm": 0.28698861598968506, + "learning_rate": 0.0006, + "loss": 2.1896, + "step": 48830 + }, + { + "epoch": 0.18218034511313533, + "grad_norm": 0.3036311864852905, + "learning_rate": 0.0006, + "loss": 2.2298, + "step": 48840 + }, + { + "epoch": 0.1822176465760987, + "grad_norm": 0.3002713620662689, + "learning_rate": 0.0006, + "loss": 2.25, + "step": 48850 + }, + { + "epoch": 0.1822549480390621, + "grad_norm": 0.36340653896331787, + "learning_rate": 0.0006, + "loss": 2.1341, + "step": 48860 + }, + { + "epoch": 0.18229224950202547, + "grad_norm": 0.28717461228370667, + "learning_rate": 0.0006, + "loss": 2.233, + "step": 48870 + }, + { + "epoch": 0.18232955096498885, + "grad_norm": 0.3075501024723053, + "learning_rate": 0.0006, + "loss": 2.2208, + "step": 48880 + }, + { + "epoch": 0.18236685242795223, + "grad_norm": 0.7755941152572632, + "learning_rate": 0.0006, + "loss": 2.1778, + "step": 48890 + }, + { + "epoch": 0.1824041538909156, + "grad_norm": 0.35913288593292236, + "learning_rate": 0.0006, + "loss": 2.3953, + "step": 48900 + }, + { + "epoch": 0.18244145535387898, + "grad_norm": 0.2715131640434265, + "learning_rate": 0.0006, + "loss": 2.3178, + "step": 48910 + }, + { + "epoch": 0.18247875681684236, + "grad_norm": 0.3396020531654358, + "learning_rate": 0.0006, + "loss": 2.2896, + "step": 48920 + }, + { + "epoch": 0.18251605827980574, + "grad_norm": 0.33839085698127747, + "learning_rate": 0.0006, + "loss": 2.2291, + "step": 48930 + }, + { + "epoch": 0.18255335974276912, + "grad_norm": 0.31776919960975647, + "learning_rate": 0.0006, + "loss": 2.1895, + "step": 48940 + }, + { + "epoch": 0.1825906612057325, + "grad_norm": 0.3206128478050232, + "learning_rate": 0.0006, + "loss": 2.3103, + "step": 48950 + }, + { + "epoch": 0.18262796266869585, + "grad_norm": 0.3659849166870117, + 
"learning_rate": 0.0006, + "loss": 2.2831, + "step": 48960 + }, + { + "epoch": 0.18266526413165923, + "grad_norm": 0.321240097284317, + "learning_rate": 0.0006, + "loss": 2.1782, + "step": 48970 + }, + { + "epoch": 0.1827025655946226, + "grad_norm": 0.295206755399704, + "learning_rate": 0.0006, + "loss": 2.3385, + "step": 48980 + }, + { + "epoch": 0.182739867057586, + "grad_norm": 0.29451021552085876, + "learning_rate": 0.0006, + "loss": 2.3201, + "step": 48990 + }, + { + "epoch": 0.18277716852054937, + "grad_norm": 0.41055870056152344, + "learning_rate": 0.0006, + "loss": 2.157, + "step": 49000 + }, + { + "epoch": 0.18277716852054937, + "eval_valid_loss": 2.1783266067504883, + "eval_valid_loss/all": 2.042555093765259, + "eval_valid_loss/end_span": 1.2402678728103638, + "eval_valid_perplexity/batch": 7.71028470993042, + "eval_valid_perplexity/end_span": 3.4565391540527344, + "eval_valid_perplexity/fim": 2.776848554611206, + "eval_valid_perplexity/first_seq": 14.619369506835938, + "eval_valid_perplexity/last_seq": 8.711373329162598, + "eval_valid_perplexity/second_seq": 13.599600791931152, + "eval_valid_perplexity/seq": 8.697041511535645, + "eval_valid_reconstruction/all": 0.2978273034095764, + "eval_valid_reconstruction/end_span": 0.7082632780075073, + "eval_valid_reconstruction/fim": 0.20706214010715485, + "eval_valid_reconstruction/first_seq": 0.17527256906032562, + "eval_valid_reconstruction/last_seq": 0.33609312772750854, + "eval_valid_reconstruction/second_seq": 0.20138905942440033, + "eval_valid_runtime": 505.8001, + "eval_valid_samples_per_second": 0.38, + "eval_valid_steps_per_second": 0.38, + "step": 49000 + }, + { + "epoch": 0.18277716852054937, + "eval_train_loss": 2.1779799461364746, + "eval_train_loss/all": 2.0162293910980225, + "eval_train_loss/end_span": 1.1913150548934937, + "eval_train_perplexity/batch": 7.509954452514648, + "eval_train_perplexity/end_span": 3.2914066314697266, + "eval_train_perplexity/fim": 1.9844635725021362, + 
"eval_train_perplexity/first_seq": 15.594996452331543, + "eval_train_perplexity/last_seq": 8.743321418762207, + "eval_train_perplexity/second_seq": 14.337688446044922, + "eval_train_perplexity/seq": 8.650654792785645, + "eval_train_reconstruction/all": 0.2869842052459717, + "eval_train_reconstruction/end_span": 0.7202284336090088, + "eval_train_reconstruction/fim": 0.13881151378154755, + "eval_train_reconstruction/first_seq": 0.1495198905467987, + "eval_train_reconstruction/last_seq": 0.3333681523799896, + "eval_train_reconstruction/second_seq": 0.1822427660226822, + "eval_train_runtime": 506.815, + "eval_train_samples_per_second": 0.379, + "eval_train_steps_per_second": 0.379, + "step": 49000 + }, + { + "epoch": 0.18281446998351275, + "grad_norm": 0.402970552444458, + "learning_rate": 0.0006, + "loss": 2.1385, + "step": 49010 + }, + { + "epoch": 0.18285177144647613, + "grad_norm": 0.3525312542915344, + "learning_rate": 0.0006, + "loss": 2.1469, + "step": 49020 + }, + { + "epoch": 0.1828890729094395, + "grad_norm": 0.4521220624446869, + "learning_rate": 0.0006, + "loss": 2.2427, + "step": 49030 + }, + { + "epoch": 0.1829263743724029, + "grad_norm": 0.6345458030700684, + "learning_rate": 0.0006, + "loss": 2.0587, + "step": 49040 + }, + { + "epoch": 0.18296367583536627, + "grad_norm": 0.40587177872657776, + "learning_rate": 0.0006, + "loss": 2.1839, + "step": 49050 + }, + { + "epoch": 0.18300097729832965, + "grad_norm": 0.3895140588283539, + "learning_rate": 0.0006, + "loss": 2.0014, + "step": 49060 + }, + { + "epoch": 0.18303827876129303, + "grad_norm": 0.20736972987651825, + "learning_rate": 0.0006, + "loss": 2.3554, + "step": 49070 + }, + { + "epoch": 0.1830755802242564, + "grad_norm": 0.2854832410812378, + "learning_rate": 0.0006, + "loss": 2.3059, + "step": 49080 + }, + { + "epoch": 0.1831128816872198, + "grad_norm": 0.4146565794944763, + "learning_rate": 0.0006, + "loss": 2.2355, + "step": 49090 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 
0.2956407070159912, + "learning_rate": 0.0006, + "loss": 2.1986, + "step": 49100 + }, + { + "epoch": 0.18318748461314652, + "grad_norm": 0.3188014328479767, + "learning_rate": 0.0006, + "loss": 2.3971, + "step": 49110 + }, + { + "epoch": 0.1832247860761099, + "grad_norm": 0.3076457381248474, + "learning_rate": 0.0006, + "loss": 1.9347, + "step": 49120 + }, + { + "epoch": 0.18326208753907328, + "grad_norm": 0.33833813667297363, + "learning_rate": 0.0006, + "loss": 2.3884, + "step": 49130 + }, + { + "epoch": 0.18329938900203666, + "grad_norm": 0.26971858739852905, + "learning_rate": 0.0006, + "loss": 2.1761, + "step": 49140 + }, + { + "epoch": 0.18333669046500004, + "grad_norm": 0.46376603841781616, + "learning_rate": 0.0006, + "loss": 2.2044, + "step": 49150 + }, + { + "epoch": 0.18337399192796341, + "grad_norm": 0.30236631631851196, + "learning_rate": 0.0006, + "loss": 2.261, + "step": 49160 + }, + { + "epoch": 0.1834112933909268, + "grad_norm": 0.44772621989250183, + "learning_rate": 0.0006, + "loss": 2.1706, + "step": 49170 + }, + { + "epoch": 0.18344859485389017, + "grad_norm": 0.32834604382514954, + "learning_rate": 0.0006, + "loss": 2.2064, + "step": 49180 + }, + { + "epoch": 0.18348589631685355, + "grad_norm": 0.2818068563938141, + "learning_rate": 0.0006, + "loss": 2.234, + "step": 49190 + }, + { + "epoch": 0.18352319777981693, + "grad_norm": 0.4236081540584564, + "learning_rate": 0.0006, + "loss": 2.1427, + "step": 49200 + }, + { + "epoch": 0.1835604992427803, + "grad_norm": 0.40421241521835327, + "learning_rate": 0.0006, + "loss": 2.2151, + "step": 49210 + }, + { + "epoch": 0.1835978007057437, + "grad_norm": 0.2870214581489563, + "learning_rate": 0.0006, + "loss": 2.0361, + "step": 49220 + }, + { + "epoch": 0.18363510216870704, + "grad_norm": 0.30527541041374207, + "learning_rate": 0.0006, + "loss": 2.1946, + "step": 49230 + }, + { + "epoch": 0.18367240363167042, + "grad_norm": 0.4189262092113495, + "learning_rate": 0.0006, + "loss": 2.2341, + "step": 
49240 + }, + { + "epoch": 0.1837097050946338, + "grad_norm": 0.28272899985313416, + "learning_rate": 0.0006, + "loss": 2.4105, + "step": 49250 + }, + { + "epoch": 0.1837097050946338, + "eval_valid_loss": 2.17989444732666, + "eval_valid_loss/all": 2.043806791305542, + "eval_valid_loss/end_span": 1.2100012302398682, + "eval_valid_perplexity/batch": 7.71994161605835, + "eval_valid_perplexity/end_span": 3.3534886837005615, + "eval_valid_perplexity/fim": 2.1914291381835938, + "eval_valid_perplexity/first_seq": 15.05294418334961, + "eval_valid_perplexity/last_seq": 8.715819358825684, + "eval_valid_perplexity/second_seq": 13.69288158416748, + "eval_valid_perplexity/seq": 8.70499324798584, + "eval_valid_reconstruction/all": 0.2976216971874237, + "eval_valid_reconstruction/end_span": 0.721024215221405, + "eval_valid_reconstruction/fim": 0.1589762270450592, + "eval_valid_reconstruction/first_seq": 0.16487614810466766, + "eval_valid_reconstruction/last_seq": 0.3350045680999756, + "eval_valid_reconstruction/second_seq": 0.2001471072435379, + "eval_valid_runtime": 514.4357, + "eval_valid_samples_per_second": 0.373, + "eval_valid_steps_per_second": 0.373, + "step": 49250 + }, + { + "epoch": 0.1837097050946338, + "eval_train_loss": 2.1794586181640625, + "eval_train_loss/all": 2.0172088146209717, + "eval_train_loss/end_span": 1.1710963249206543, + "eval_train_perplexity/batch": 7.517313480377197, + "eval_train_perplexity/end_span": 3.225526809692383, + "eval_train_perplexity/fim": 2.0304806232452393, + "eval_train_perplexity/first_seq": 15.803791046142578, + "eval_train_perplexity/last_seq": 9.175822257995605, + "eval_train_perplexity/second_seq": 14.250505447387695, + "eval_train_perplexity/seq": 8.656530380249023, + "eval_train_reconstruction/all": 0.28665891289711, + "eval_train_reconstruction/end_span": 0.7315454483032227, + "eval_train_reconstruction/fim": 0.1441010683774948, + "eval_train_reconstruction/first_seq": 0.14199571311473846, + "eval_train_reconstruction/last_seq": 
0.31836965680122375, + "eval_train_reconstruction/second_seq": 0.18715475499629974, + "eval_train_runtime": 505.3141, + "eval_train_samples_per_second": 0.38, + "eval_train_steps_per_second": 0.38, + "step": 49250 + }, + { + "epoch": 0.18374700655759718, + "grad_norm": 0.37634849548339844, + "learning_rate": 0.0006, + "loss": 2.2117, + "step": 49260 + }, + { + "epoch": 0.18378430802056056, + "grad_norm": 0.4088083505630493, + "learning_rate": 0.0006, + "loss": 2.1088, + "step": 49270 + }, + { + "epoch": 0.18382160948352394, + "grad_norm": 0.3731492757797241, + "learning_rate": 0.0006, + "loss": 2.2552, + "step": 49280 + }, + { + "epoch": 0.18385891094648732, + "grad_norm": 0.2923642098903656, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 49290 + }, + { + "epoch": 0.1838962124094507, + "grad_norm": 0.3981491029262543, + "learning_rate": 0.0006, + "loss": 2.1473, + "step": 49300 + }, + { + "epoch": 0.18393351387241408, + "grad_norm": 0.3614998161792755, + "learning_rate": 0.0006, + "loss": 2.0466, + "step": 49310 + }, + { + "epoch": 0.18397081533537746, + "grad_norm": 0.5402523279190063, + "learning_rate": 0.0006, + "loss": 2.1447, + "step": 49320 + }, + { + "epoch": 0.18400811679834084, + "grad_norm": 2.0642004013061523, + "learning_rate": 0.0006, + "loss": 2.1793, + "step": 49330 + }, + { + "epoch": 0.18404541826130422, + "grad_norm": 0.45436960458755493, + "learning_rate": 0.0006, + "loss": 2.1426, + "step": 49340 + }, + { + "epoch": 0.1840827197242676, + "grad_norm": 0.3146677315235138, + "learning_rate": 0.0006, + "loss": 2.1248, + "step": 49350 + }, + { + "epoch": 0.18412002118723098, + "grad_norm": 0.3543142080307007, + "learning_rate": 0.0006, + "loss": 1.9575, + "step": 49360 + }, + { + "epoch": 0.18415732265019433, + "grad_norm": 0.2662213444709778, + "learning_rate": 0.0006, + "loss": 2.1876, + "step": 49370 + }, + { + "epoch": 0.1841946241131577, + "grad_norm": 1.676282525062561, + "learning_rate": 0.0006, + "loss": 2.1298, + "step": 49380 + }, + 
{ + "epoch": 0.18423192557612109, + "grad_norm": 0.2788110375404358, + "learning_rate": 0.0006, + "loss": 2.332, + "step": 49390 + }, + { + "epoch": 0.18426922703908447, + "grad_norm": 0.41195639967918396, + "learning_rate": 0.0006, + "loss": 2.0729, + "step": 49400 + }, + { + "epoch": 0.18430652850204784, + "grad_norm": 0.26438191533088684, + "learning_rate": 0.0006, + "loss": 2.2401, + "step": 49410 + }, + { + "epoch": 0.18434382996501122, + "grad_norm": 0.29742512106895447, + "learning_rate": 0.0006, + "loss": 2.389, + "step": 49420 + }, + { + "epoch": 0.1843811314279746, + "grad_norm": 0.4402758479118347, + "learning_rate": 0.0006, + "loss": 2.1399, + "step": 49430 + }, + { + "epoch": 0.18441843289093798, + "grad_norm": 0.33304160833358765, + "learning_rate": 0.0006, + "loss": 2.1505, + "step": 49440 + }, + { + "epoch": 0.18445573435390136, + "grad_norm": 0.3460690677165985, + "learning_rate": 0.0006, + "loss": 2.2253, + "step": 49450 + }, + { + "epoch": 0.18449303581686474, + "grad_norm": 0.3484037220478058, + "learning_rate": 0.0006, + "loss": 2.3064, + "step": 49460 + }, + { + "epoch": 0.18453033727982812, + "grad_norm": 0.49295574426651, + "learning_rate": 0.0006, + "loss": 2.0244, + "step": 49470 + }, + { + "epoch": 0.1845676387427915, + "grad_norm": 0.24430134892463684, + "learning_rate": 0.0006, + "loss": 2.0711, + "step": 49480 + }, + { + "epoch": 0.18460494020575488, + "grad_norm": 0.2835167348384857, + "learning_rate": 0.0006, + "loss": 2.3582, + "step": 49490 + }, + { + "epoch": 0.18464224166871826, + "grad_norm": 0.3788309395313263, + "learning_rate": 0.0006, + "loss": 2.2699, + "step": 49500 + }, + { + "epoch": 0.18464224166871826, + "eval_valid_loss": 2.183591604232788, + "eval_valid_loss/all": 2.047377586364746, + "eval_valid_loss/end_span": 1.1795463562011719, + "eval_valid_perplexity/batch": 7.747557163238525, + "eval_valid_perplexity/end_span": 3.2528982162475586, + "eval_valid_perplexity/fim": 2.1122167110443115, + 
"eval_valid_perplexity/first_seq": 14.97033977508545, + "eval_valid_perplexity/last_seq": 9.267413139343262, + "eval_valid_perplexity/second_seq": 14.043068885803223, + "eval_valid_perplexity/seq": 8.737654685974121, + "eval_valid_reconstruction/all": 0.296417236328125, + "eval_valid_reconstruction/end_span": 0.7247470617294312, + "eval_valid_reconstruction/fim": 0.15177489817142487, + "eval_valid_reconstruction/first_seq": 0.1655253767967224, + "eval_valid_reconstruction/last_seq": 0.3177030682563782, + "eval_valid_reconstruction/second_seq": 0.1909131556749344, + "eval_valid_runtime": 512.4272, + "eval_valid_samples_per_second": 0.375, + "eval_valid_steps_per_second": 0.375, + "step": 49500 + }, + { + "epoch": 0.18464224166871826, + "eval_train_loss": 2.180697441101074, + "eval_train_loss/all": 2.0183491706848145, + "eval_train_loss/end_span": 1.1433418989181519, + "eval_train_perplexity/batch": 7.525890827178955, + "eval_train_perplexity/end_span": 3.137235164642334, + "eval_train_perplexity/fim": 2.1766815185546875, + "eval_train_perplexity/first_seq": 15.602200508117676, + "eval_train_perplexity/last_seq": 8.824639320373535, + "eval_train_perplexity/second_seq": 14.412825584411621, + "eval_train_perplexity/seq": 8.665972709655762, + "eval_train_reconstruction/all": 0.28620004653930664, + "eval_train_reconstruction/end_span": 0.7353572845458984, + "eval_train_reconstruction/fim": 0.15676167607307434, + "eval_train_reconstruction/first_seq": 0.15238738059997559, + "eval_train_reconstruction/last_seq": 0.32935670018196106, + "eval_train_reconstruction/second_seq": 0.1807629019021988, + "eval_train_runtime": 512.7994, + "eval_train_samples_per_second": 0.374, + "eval_train_steps_per_second": 0.374, + "step": 49500 + }, + { + "epoch": 0.1846795431316816, + "grad_norm": 0.36972805857658386, + "learning_rate": 0.0006, + "loss": 2.3124, + "step": 49510 + }, + { + "epoch": 0.184716844594645, + "grad_norm": 0.3140261471271515, + "learning_rate": 0.0006, + "loss": 
2.0149, + "step": 49520 + }, + { + "epoch": 0.18475414605760837, + "grad_norm": 0.2946452796459198, + "learning_rate": 0.0006, + "loss": 2.2558, + "step": 49530 + }, + { + "epoch": 0.18479144752057175, + "grad_norm": 0.27297309041023254, + "learning_rate": 0.0006, + "loss": 2.1608, + "step": 49540 + }, + { + "epoch": 0.18482874898353513, + "grad_norm": 0.32102373242378235, + "learning_rate": 0.0006, + "loss": 2.1261, + "step": 49550 + }, + { + "epoch": 0.1848660504464985, + "grad_norm": 0.24816806614398956, + "learning_rate": 0.0006, + "loss": 2.2366, + "step": 49560 + }, + { + "epoch": 0.1849033519094619, + "grad_norm": 0.3162383437156677, + "learning_rate": 0.0006, + "loss": 2.133, + "step": 49570 + }, + { + "epoch": 0.18494065337242527, + "grad_norm": 0.35263192653656006, + "learning_rate": 0.0006, + "loss": 2.2034, + "step": 49580 + }, + { + "epoch": 0.18497795483538865, + "grad_norm": 0.3347272574901581, + "learning_rate": 0.0006, + "loss": 2.2548, + "step": 49590 + }, + { + "epoch": 0.18501525629835203, + "grad_norm": 0.5698261260986328, + "learning_rate": 0.0006, + "loss": 2.2333, + "step": 49600 + }, + { + "epoch": 0.1850525577613154, + "grad_norm": 0.2981284260749817, + "learning_rate": 0.0006, + "loss": 2.3445, + "step": 49610 + }, + { + "epoch": 0.18508985922427879, + "grad_norm": 0.4016222655773163, + "learning_rate": 0.0006, + "loss": 2.3208, + "step": 49620 + }, + { + "epoch": 0.18512716068724216, + "grad_norm": 0.3014146387577057, + "learning_rate": 0.0006, + "loss": 2.1205, + "step": 49630 + }, + { + "epoch": 0.18516446215020554, + "grad_norm": 0.30725792050361633, + "learning_rate": 0.0006, + "loss": 2.2542, + "step": 49640 + }, + { + "epoch": 0.1852017636131689, + "grad_norm": 0.33997663855552673, + "learning_rate": 0.0006, + "loss": 2.1171, + "step": 49650 + }, + { + "epoch": 0.18523906507613228, + "grad_norm": 0.3125193417072296, + "learning_rate": 0.0006, + "loss": 2.2203, + "step": 49660 + }, + { + "epoch": 0.18527636653909565, + "grad_norm": 
1.9264143705368042, + "learning_rate": 0.0006, + "loss": 2.0861, + "step": 49670 + }, + { + "epoch": 0.18531366800205903, + "grad_norm": 0.4095751643180847, + "learning_rate": 0.0006, + "loss": 2.2977, + "step": 49680 + }, + { + "epoch": 0.1853509694650224, + "grad_norm": 0.46690917015075684, + "learning_rate": 0.0006, + "loss": 2.3339, + "step": 49690 + }, + { + "epoch": 0.1853882709279858, + "grad_norm": 0.39575666189193726, + "learning_rate": 0.0006, + "loss": 2.1579, + "step": 49700 + }, + { + "epoch": 0.18542557239094917, + "grad_norm": 0.3468663990497589, + "learning_rate": 0.0006, + "loss": 2.1156, + "step": 49710 + }, + { + "epoch": 0.18546287385391255, + "grad_norm": 0.48278534412384033, + "learning_rate": 0.0006, + "loss": 2.1228, + "step": 49720 + }, + { + "epoch": 0.18550017531687593, + "grad_norm": 0.7114423513412476, + "learning_rate": 0.0006, + "loss": 2.2211, + "step": 49730 + }, + { + "epoch": 0.1855374767798393, + "grad_norm": 0.3983613848686218, + "learning_rate": 0.0006, + "loss": 2.2014, + "step": 49740 + }, + { + "epoch": 0.1855747782428027, + "grad_norm": 0.36210498213768005, + "learning_rate": 0.0006, + "loss": 2.2896, + "step": 49750 + }, + { + "epoch": 0.1855747782428027, + "eval_valid_loss": 2.1874170303344727, + "eval_valid_loss/all": 2.050708055496216, + "eval_valid_loss/end_span": 1.2541131973266602, + "eval_valid_perplexity/batch": 7.773403167724609, + "eval_valid_perplexity/end_span": 3.5047290325164795, + "eval_valid_perplexity/fim": 2.0673880577087402, + "eval_valid_perplexity/first_seq": 14.529898643493652, + "eval_valid_perplexity/last_seq": 9.14041805267334, + "eval_valid_perplexity/second_seq": 13.800338745117188, + "eval_valid_perplexity/seq": 8.766253471374512, + "eval_valid_reconstruction/all": 0.2955075800418854, + "eval_valid_reconstruction/end_span": 0.7074323892593384, + "eval_valid_reconstruction/fim": 0.14590813219547272, + "eval_valid_reconstruction/first_seq": 0.17315922677516937, + 
"eval_valid_reconstruction/last_seq": 0.3199438154697418, + "eval_valid_reconstruction/second_seq": 0.1942521631717682, + "eval_valid_runtime": 514.7653, + "eval_valid_samples_per_second": 0.373, + "eval_valid_steps_per_second": 0.373, + "step": 49750 + }, + { + "epoch": 0.1855747782428027, + "eval_train_loss": 2.1834497451782227, + "eval_train_loss/all": 2.0204761028289795, + "eval_train_loss/end_span": 1.224044919013977, + "eval_train_perplexity/batch": 7.541914939880371, + "eval_train_perplexity/end_span": 3.400916337966919, + "eval_train_perplexity/fim": 2.1905407905578613, + "eval_train_perplexity/first_seq": 15.535516738891602, + "eval_train_perplexity/last_seq": 8.737038612365723, + "eval_train_perplexity/second_seq": 13.954293251037598, + "eval_train_perplexity/seq": 8.679808616638184, + "eval_train_reconstruction/all": 0.2856209874153137, + "eval_train_reconstruction/end_span": 0.7172505259513855, + "eval_train_reconstruction/fim": 0.15784448385238647, + "eval_train_reconstruction/first_seq": 0.14882917702198029, + "eval_train_reconstruction/last_seq": 0.3326092064380646, + "eval_train_reconstruction/second_seq": 0.19251899421215057, + "eval_train_runtime": 506.5154, + "eval_train_samples_per_second": 0.379, + "eval_train_steps_per_second": 0.379, + "step": 49750 + }, + { + "epoch": 0.18561207970576607, + "grad_norm": 0.33273717761039734, + "learning_rate": 0.0006, + "loss": 2.3283, + "step": 49760 + }, + { + "epoch": 0.18564938116872945, + "grad_norm": 0.9690778255462646, + "learning_rate": 0.0006, + "loss": 2.1742, + "step": 49770 + }, + { + "epoch": 0.18568668263169283, + "grad_norm": 0.3494618535041809, + "learning_rate": 0.0006, + "loss": 2.2133, + "step": 49780 + }, + { + "epoch": 0.18572398409465618, + "grad_norm": 0.3171655833721161, + "learning_rate": 0.0006, + "loss": 2.3121, + "step": 49790 + }, + { + "epoch": 0.18576128555761956, + "grad_norm": 0.297606885433197, + "learning_rate": 0.0006, + "loss": 2.2198, + "step": 49800 + }, + { + "epoch": 
0.18579858702058294, + "grad_norm": 0.4046955704689026, + "learning_rate": 0.0006, + "loss": 2.136, + "step": 49810 + }, + { + "epoch": 0.18583588848354632, + "grad_norm": 0.35889461636543274, + "learning_rate": 0.0006, + "loss": 2.2415, + "step": 49820 + }, + { + "epoch": 0.1858731899465097, + "grad_norm": 0.33097442984580994, + "learning_rate": 0.0006, + "loss": 2.1381, + "step": 49830 + }, + { + "epoch": 0.18591049140947308, + "grad_norm": 0.40742483735084534, + "learning_rate": 0.0006, + "loss": 2.1615, + "step": 49840 + }, + { + "epoch": 0.18594779287243646, + "grad_norm": 0.3375520706176758, + "learning_rate": 0.0006, + "loss": 2.2719, + "step": 49850 + }, + { + "epoch": 0.18598509433539984, + "grad_norm": 0.3083322048187256, + "learning_rate": 0.0006, + "loss": 2.2592, + "step": 49860 + }, + { + "epoch": 0.18602239579836322, + "grad_norm": 0.23966486752033234, + "learning_rate": 0.0006, + "loss": 2.2562, + "step": 49870 + }, + { + "epoch": 0.1860596972613266, + "grad_norm": 0.2990809977054596, + "learning_rate": 0.0006, + "loss": 2.1584, + "step": 49880 + }, + { + "epoch": 0.18609699872428997, + "grad_norm": 0.22842805087566376, + "learning_rate": 0.0006, + "loss": 2.2797, + "step": 49890 + }, + { + "epoch": 0.18613430018725335, + "grad_norm": 0.31746605038642883, + "learning_rate": 0.0006, + "loss": 2.1897, + "step": 49900 + }, + { + "epoch": 0.18617160165021673, + "grad_norm": 0.2765961289405823, + "learning_rate": 0.0006, + "loss": 2.4189, + "step": 49910 + }, + { + "epoch": 0.18620890311318009, + "grad_norm": 0.3001217246055603, + "learning_rate": 0.0006, + "loss": 2.373, + "step": 49920 + }, + { + "epoch": 0.18624620457614346, + "grad_norm": 0.3174630403518677, + "learning_rate": 0.0006, + "loss": 2.1575, + "step": 49930 + }, + { + "epoch": 0.18628350603910684, + "grad_norm": 0.3450741469860077, + "learning_rate": 0.0006, + "loss": 2.2422, + "step": 49940 + }, + { + "epoch": 0.18632080750207022, + "grad_norm": 0.3003854751586914, + "learning_rate": 
0.0006, + "loss": 2.208, + "step": 49950 + }, + { + "epoch": 0.1863581089650336, + "grad_norm": 0.47766339778900146, + "learning_rate": 0.0006, + "loss": 2.1124, + "step": 49960 + }, + { + "epoch": 0.18639541042799698, + "grad_norm": 0.4044462740421295, + "learning_rate": 0.0006, + "loss": 2.2376, + "step": 49970 + }, + { + "epoch": 0.18643271189096036, + "grad_norm": 0.3792949914932251, + "learning_rate": 0.0006, + "loss": 2.1562, + "step": 49980 + }, + { + "epoch": 0.18647001335392374, + "grad_norm": 0.3560364246368408, + "learning_rate": 0.0006, + "loss": 2.2519, + "step": 49990 + }, + { + "epoch": 0.18650731481688712, + "grad_norm": 0.560382604598999, + "learning_rate": 0.0006, + "loss": 2.0787, + "step": 50000 + }, + { + "epoch": 0.18650731481688712, + "eval_valid_loss": 2.183008909225464, + "eval_valid_loss/all": 2.046926975250244, + "eval_valid_loss/end_span": 1.192683458328247, + "eval_valid_perplexity/batch": 7.7440667152404785, + "eval_valid_perplexity/end_span": 3.2959136962890625, + "eval_valid_perplexity/fim": 2.511120557785034, + "eval_valid_perplexity/first_seq": 14.703934669494629, + "eval_valid_perplexity/last_seq": 8.784517288208008, + "eval_valid_perplexity/second_seq": 13.880176544189453, + "eval_valid_perplexity/seq": 8.73324966430664, + "eval_valid_reconstruction/all": 0.29648053646087646, + "eval_valid_reconstruction/end_span": 0.7163052558898926, + "eval_valid_reconstruction/fim": 0.1856219619512558, + "eval_valid_reconstruction/first_seq": 0.17088842391967773, + "eval_valid_reconstruction/last_seq": 0.33371099829673767, + "eval_valid_reconstruction/second_seq": 0.1915271282196045, + "eval_valid_runtime": 438.4807, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 50000 + }, + { + "epoch": 0.18650731481688712, + "eval_train_loss": 2.180774211883545, + "eval_train_loss/all": 2.018491506576538, + "eval_train_loss/end_span": 1.1559852361679077, + "eval_train_perplexity/batch": 7.526961803436279, + 
"eval_train_perplexity/end_span": 3.177152156829834, + "eval_train_perplexity/fim": 2.253117561340332, + "eval_train_perplexity/first_seq": 15.579214096069336, + "eval_train_perplexity/last_seq": 8.66987133026123, + "eval_train_perplexity/second_seq": 13.991945266723633, + "eval_train_perplexity/seq": 8.667943000793457, + "eval_train_reconstruction/all": 0.28604352474212646, + "eval_train_reconstruction/end_span": 0.7267919182777405, + "eval_train_reconstruction/fim": 0.16389882564544678, + "eval_train_reconstruction/first_seq": 0.14895135164260864, + "eval_train_reconstruction/last_seq": 0.33624303340911865, + "eval_train_reconstruction/second_seq": 0.18973203003406525, + "eval_train_runtime": 437.1708, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 50000 + }, + { + "epoch": 0.1865446162798505, + "grad_norm": 0.3567264676094055, + "learning_rate": 0.0006, + "loss": 2.2234, + "step": 50010 + }, + { + "epoch": 0.18658191774281388, + "grad_norm": 0.3299293518066406, + "learning_rate": 0.0006, + "loss": 2.316, + "step": 50020 + }, + { + "epoch": 0.18661921920577726, + "grad_norm": 0.32786282896995544, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 50030 + }, + { + "epoch": 0.18665652066874064, + "grad_norm": 0.3444758355617523, + "learning_rate": 0.0006, + "loss": 2.1641, + "step": 50040 + }, + { + "epoch": 0.18669382213170402, + "grad_norm": 0.4550645351409912, + "learning_rate": 0.0006, + "loss": 2.1372, + "step": 50050 + }, + { + "epoch": 0.18673112359466737, + "grad_norm": 0.4585898220539093, + "learning_rate": 0.0006, + "loss": 1.9774, + "step": 50060 + }, + { + "epoch": 0.18676842505763075, + "grad_norm": 0.4167327284812927, + "learning_rate": 0.0006, + "loss": 2.0285, + "step": 50070 + }, + { + "epoch": 0.18680572652059413, + "grad_norm": 0.4693422019481659, + "learning_rate": 0.0006, + "loss": 1.9651, + "step": 50080 + }, + { + "epoch": 0.1868430279835575, + "grad_norm": 0.2819894850254059, + 
"learning_rate": 0.0006, + "loss": 2.1655, + "step": 50090 + }, + { + "epoch": 0.1868803294465209, + "grad_norm": 0.3138754367828369, + "learning_rate": 0.0006, + "loss": 2.2188, + "step": 50100 + }, + { + "epoch": 0.18691763090948427, + "grad_norm": 0.5279825329780579, + "learning_rate": 0.0006, + "loss": 2.2635, + "step": 50110 + }, + { + "epoch": 0.18695493237244765, + "grad_norm": 0.32199859619140625, + "learning_rate": 0.0006, + "loss": 2.1767, + "step": 50120 + }, + { + "epoch": 0.18699223383541103, + "grad_norm": 0.4006772041320801, + "learning_rate": 0.0006, + "loss": 2.1695, + "step": 50130 + }, + { + "epoch": 0.1870295352983744, + "grad_norm": 0.35431715846061707, + "learning_rate": 0.0006, + "loss": 2.2827, + "step": 50140 + }, + { + "epoch": 0.18706683676133778, + "grad_norm": 0.28048592805862427, + "learning_rate": 0.0006, + "loss": 2.2198, + "step": 50150 + }, + { + "epoch": 0.18710413822430116, + "grad_norm": 0.33079585433006287, + "learning_rate": 0.0006, + "loss": 2.1624, + "step": 50160 + }, + { + "epoch": 0.18714143968726454, + "grad_norm": 0.31579557061195374, + "learning_rate": 0.0006, + "loss": 2.0682, + "step": 50170 + }, + { + "epoch": 0.18717874115022792, + "grad_norm": 0.40272256731987, + "learning_rate": 0.0006, + "loss": 2.2596, + "step": 50180 + }, + { + "epoch": 0.1872160426131913, + "grad_norm": 0.32611194252967834, + "learning_rate": 0.0006, + "loss": 2.2371, + "step": 50190 + }, + { + "epoch": 0.18725334407615465, + "grad_norm": 0.32246989011764526, + "learning_rate": 0.0006, + "loss": 2.1248, + "step": 50200 + }, + { + "epoch": 0.18729064553911803, + "grad_norm": 0.37627601623535156, + "learning_rate": 0.0006, + "loss": 2.2606, + "step": 50210 + }, + { + "epoch": 0.1873279470020814, + "grad_norm": 0.2666715979576111, + "learning_rate": 0.0006, + "loss": 2.1725, + "step": 50220 + }, + { + "epoch": 0.1873652484650448, + "grad_norm": 0.3561416566371918, + "learning_rate": 0.0006, + "loss": 2.2199, + "step": 50230 + }, + { + "epoch": 
0.18740254992800817, + "grad_norm": 0.4558974802494049, + "learning_rate": 0.0006, + "loss": 2.0661, + "step": 50240 + }, + { + "epoch": 0.18743985139097155, + "grad_norm": 0.23682957887649536, + "learning_rate": 0.0006, + "loss": 2.2343, + "step": 50250 + }, + { + "epoch": 0.18743985139097155, + "eval_valid_loss": 2.1796882152557373, + "eval_valid_loss/all": 2.0437698364257812, + "eval_valid_loss/end_span": 1.2004255056381226, + "eval_valid_perplexity/batch": 7.719656467437744, + "eval_valid_perplexity/end_span": 3.3215298652648926, + "eval_valid_perplexity/fim": 2.448885917663574, + "eval_valid_perplexity/first_seq": 14.720097541809082, + "eval_valid_perplexity/last_seq": 8.98416519165039, + "eval_valid_perplexity/second_seq": 13.814523696899414, + "eval_valid_perplexity/seq": 8.706100463867188, + "eval_valid_reconstruction/all": 0.2974293529987335, + "eval_valid_reconstruction/end_span": 0.7179556488990784, + "eval_valid_reconstruction/fim": 0.18219399452209473, + "eval_valid_reconstruction/first_seq": 0.16949981451034546, + "eval_valid_reconstruction/last_seq": 0.3248029053211212, + "eval_valid_reconstruction/second_seq": 0.1942123919725418, + "eval_valid_runtime": 441.192, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 50250 + }, + { + "epoch": 0.18743985139097155, + "eval_train_loss": 2.1783530712127686, + "eval_train_loss/all": 2.0165598392486572, + "eval_train_loss/end_span": 1.1700228452682495, + "eval_train_perplexity/batch": 7.512436389923096, + "eval_train_perplexity/end_span": 3.2220661640167236, + "eval_train_perplexity/fim": 2.0964128971099854, + "eval_train_perplexity/first_seq": 15.724184036254883, + "eval_train_perplexity/last_seq": 8.745522499084473, + "eval_train_perplexity/second_seq": 13.979437828063965, + "eval_train_perplexity/seq": 8.652203559875488, + "eval_train_reconstruction/all": 0.28667089343070984, + "eval_train_reconstruction/end_span": 0.7280279397964478, + 
"eval_train_reconstruction/fim": 0.1500372588634491, + "eval_train_reconstruction/first_seq": 0.1461600661277771, + "eval_train_reconstruction/last_seq": 0.33330851793289185, + "eval_train_reconstruction/second_seq": 0.19029942154884338, + "eval_train_runtime": 441.8164, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 50250 + }, + { + "epoch": 0.18747715285393493, + "grad_norm": 0.2840842604637146, + "learning_rate": 0.0006, + "loss": 2.3237, + "step": 50260 + }, + { + "epoch": 0.1875144543168983, + "grad_norm": 0.3594757318496704, + "learning_rate": 0.0006, + "loss": 2.2441, + "step": 50270 + }, + { + "epoch": 0.1875517557798617, + "grad_norm": 0.4192587733268738, + "learning_rate": 0.0006, + "loss": 2.04, + "step": 50280 + }, + { + "epoch": 0.18758905724282507, + "grad_norm": 0.4005625247955322, + "learning_rate": 0.0006, + "loss": 2.1006, + "step": 50290 + }, + { + "epoch": 0.18762635870578845, + "grad_norm": 0.29656723141670227, + "learning_rate": 0.0006, + "loss": 2.1283, + "step": 50300 + }, + { + "epoch": 0.18766366016875183, + "grad_norm": 0.3856143355369568, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 50310 + }, + { + "epoch": 0.1877009616317152, + "grad_norm": 0.3385031223297119, + "learning_rate": 0.0006, + "loss": 2.23, + "step": 50320 + }, + { + "epoch": 0.1877382630946786, + "grad_norm": 0.3133629560470581, + "learning_rate": 0.0006, + "loss": 2.0305, + "step": 50330 + }, + { + "epoch": 0.18777556455764194, + "grad_norm": 0.29117387533187866, + "learning_rate": 0.0006, + "loss": 2.3448, + "step": 50340 + }, + { + "epoch": 0.18781286602060532, + "grad_norm": 0.2842309772968292, + "learning_rate": 0.0006, + "loss": 2.228, + "step": 50350 + }, + { + "epoch": 0.1878501674835687, + "grad_norm": 0.3599432110786438, + "learning_rate": 0.0006, + "loss": 2.2492, + "step": 50360 + }, + { + "epoch": 0.18788746894653208, + "grad_norm": 0.3402864336967468, + "learning_rate": 0.0006, + "loss": 2.2699, + "step": 
50370 + }, + { + "epoch": 0.18792477040949546, + "grad_norm": 0.26613178849220276, + "learning_rate": 0.0006, + "loss": 2.2228, + "step": 50380 + }, + { + "epoch": 0.18796207187245884, + "grad_norm": 1.1001777648925781, + "learning_rate": 0.0006, + "loss": 2.2401, + "step": 50390 + }, + { + "epoch": 0.18799937333542222, + "grad_norm": 0.40505367517471313, + "learning_rate": 0.0006, + "loss": 2.2817, + "step": 50400 + }, + { + "epoch": 0.1880366747983856, + "grad_norm": 0.3562977910041809, + "learning_rate": 0.0006, + "loss": 2.2214, + "step": 50410 + }, + { + "epoch": 0.18807397626134897, + "grad_norm": 0.40367716550827026, + "learning_rate": 0.0006, + "loss": 2.2459, + "step": 50420 + }, + { + "epoch": 0.18811127772431235, + "grad_norm": 0.27876073122024536, + "learning_rate": 0.0006, + "loss": 2.2787, + "step": 50430 + }, + { + "epoch": 0.18814857918727573, + "grad_norm": 0.2867765724658966, + "learning_rate": 0.0006, + "loss": 2.2045, + "step": 50440 + }, + { + "epoch": 0.1881858806502391, + "grad_norm": 1.1510597467422485, + "learning_rate": 0.0006, + "loss": 2.1011, + "step": 50450 + }, + { + "epoch": 0.1882231821132025, + "grad_norm": 0.3856930434703827, + "learning_rate": 0.0006, + "loss": 2.265, + "step": 50460 + }, + { + "epoch": 0.18826048357616584, + "grad_norm": 0.32816269993782043, + "learning_rate": 0.0006, + "loss": 2.2653, + "step": 50470 + }, + { + "epoch": 0.18829778503912922, + "grad_norm": 0.3119485080242157, + "learning_rate": 0.0006, + "loss": 2.2398, + "step": 50480 + }, + { + "epoch": 0.1883350865020926, + "grad_norm": 0.49383553862571716, + "learning_rate": 0.0006, + "loss": 2.2324, + "step": 50490 + }, + { + "epoch": 0.18837238796505598, + "grad_norm": 0.4011939764022827, + "learning_rate": 0.0006, + "loss": 2.244, + "step": 50500 + }, + { + "epoch": 0.18837238796505598, + "eval_valid_loss": 2.1821327209472656, + "eval_valid_loss/all": 2.0460991859436035, + "eval_valid_loss/end_span": 1.199981689453125, + "eval_valid_perplexity/batch": 
7.737658977508545, + "eval_valid_perplexity/end_span": 3.320056200027466, + "eval_valid_perplexity/fim": 2.312747001647949, + "eval_valid_perplexity/first_seq": 14.837119102478027, + "eval_valid_perplexity/last_seq": 8.406516075134277, + "eval_valid_perplexity/second_seq": 13.484721183776855, + "eval_valid_perplexity/seq": 8.728236198425293, + "eval_valid_reconstruction/all": 0.29656580090522766, + "eval_valid_reconstruction/end_span": 0.7138679623603821, + "eval_valid_reconstruction/fim": 0.170033261179924, + "eval_valid_reconstruction/first_seq": 0.16446687281131744, + "eval_valid_reconstruction/last_seq": 0.3491402864456177, + "eval_valid_reconstruction/second_seq": 0.20358715951442719, + "eval_valid_runtime": 437.6619, + "eval_valid_samples_per_second": 0.439, + "eval_valid_steps_per_second": 0.439, + "step": 50500 + }, + { + "epoch": 0.18837238796505598, + "eval_train_loss": 2.180097818374634, + "eval_train_loss/all": 2.0180888175964355, + "eval_train_loss/end_span": 1.1735438108444214, + "eval_train_perplexity/batch": 7.523931503295898, + "eval_train_perplexity/end_span": 3.233431100845337, + "eval_train_perplexity/fim": 1.9989060163497925, + "eval_train_perplexity/first_seq": 15.63819408416748, + "eval_train_perplexity/last_seq": 8.730371475219727, + "eval_train_perplexity/second_seq": 14.319404602050781, + "eval_train_perplexity/seq": 8.663810729980469, + "eval_train_reconstruction/all": 0.28595301508903503, + "eval_train_reconstruction/end_span": 0.7227523922920227, + "eval_train_reconstruction/fim": 0.1400790959596634, + "eval_train_reconstruction/first_seq": 0.15094929933547974, + "eval_train_reconstruction/last_seq": 0.33147090673446655, + "eval_train_reconstruction/second_seq": 0.1805720329284668, + "eval_train_runtime": 440.396, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 50500 + }, + { + "epoch": 0.18840968942801936, + "grad_norm": 0.3073486089706421, + "learning_rate": 0.0006, + "loss": 2.0793, + 
"step": 50510 + }, + { + "epoch": 0.18844699089098274, + "grad_norm": 0.4702248275279999, + "learning_rate": 0.0006, + "loss": 2.1637, + "step": 50520 + }, + { + "epoch": 0.18848429235394612, + "grad_norm": 0.31744518876075745, + "learning_rate": 0.0006, + "loss": 2.1959, + "step": 50530 + }, + { + "epoch": 0.1885215938169095, + "grad_norm": 1.2180498838424683, + "learning_rate": 0.0006, + "loss": 2.2625, + "step": 50540 + }, + { + "epoch": 0.18855889527987288, + "grad_norm": 0.39192578196525574, + "learning_rate": 0.0006, + "loss": 2.101, + "step": 50550 + }, + { + "epoch": 0.18859619674283626, + "grad_norm": 0.9200299978256226, + "learning_rate": 0.0006, + "loss": 2.3078, + "step": 50560 + }, + { + "epoch": 0.18863349820579964, + "grad_norm": 0.4348708689212799, + "learning_rate": 0.0006, + "loss": 2.1797, + "step": 50570 + }, + { + "epoch": 0.18867079966876302, + "grad_norm": 0.274026095867157, + "learning_rate": 0.0006, + "loss": 2.2316, + "step": 50580 + }, + { + "epoch": 0.1887081011317264, + "grad_norm": 0.39313608407974243, + "learning_rate": 0.0006, + "loss": 2.247, + "step": 50590 + }, + { + "epoch": 0.18874540259468978, + "grad_norm": 0.38768360018730164, + "learning_rate": 0.0006, + "loss": 2.17, + "step": 50600 + }, + { + "epoch": 0.18878270405765313, + "grad_norm": 0.4439745843410492, + "learning_rate": 0.0006, + "loss": 2.3424, + "step": 50610 + }, + { + "epoch": 0.1888200055206165, + "grad_norm": 0.33707195520401, + "learning_rate": 0.0006, + "loss": 2.2455, + "step": 50620 + }, + { + "epoch": 0.1888573069835799, + "grad_norm": 0.3035410940647125, + "learning_rate": 0.0006, + "loss": 2.3724, + "step": 50630 + }, + { + "epoch": 0.18889460844654327, + "grad_norm": 0.2620484232902527, + "learning_rate": 0.0006, + "loss": 2.2679, + "step": 50640 + }, + { + "epoch": 0.18893190990950665, + "grad_norm": 0.26795250177383423, + "learning_rate": 0.0006, + "loss": 2.21, + "step": 50650 + }, + { + "epoch": 0.18896921137247003, + "grad_norm": 0.2603134512901306, 
+ "learning_rate": 0.0006, + "loss": 2.3556, + "step": 50660 + }, + { + "epoch": 0.1890065128354334, + "grad_norm": 0.349816232919693, + "learning_rate": 0.0006, + "loss": 1.9273, + "step": 50670 + }, + { + "epoch": 0.18904381429839678, + "grad_norm": 0.24663814902305603, + "learning_rate": 0.0006, + "loss": 2.2014, + "step": 50680 + }, + { + "epoch": 0.18908111576136016, + "grad_norm": 0.38308823108673096, + "learning_rate": 0.0006, + "loss": 2.1169, + "step": 50690 + }, + { + "epoch": 0.18911841722432354, + "grad_norm": 0.4167357385158539, + "learning_rate": 0.0006, + "loss": 2.1945, + "step": 50700 + }, + { + "epoch": 0.18915571868728692, + "grad_norm": 1.0911083221435547, + "learning_rate": 0.0006, + "loss": 2.2541, + "step": 50710 + }, + { + "epoch": 0.1891930201502503, + "grad_norm": 0.4464857280254364, + "learning_rate": 0.0006, + "loss": 2.1087, + "step": 50720 + }, + { + "epoch": 0.18923032161321368, + "grad_norm": 0.40395450592041016, + "learning_rate": 0.0006, + "loss": 2.1597, + "step": 50730 + }, + { + "epoch": 0.18926762307617706, + "grad_norm": 0.39881160855293274, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 50740 + }, + { + "epoch": 0.1893049245391404, + "grad_norm": 0.37136369943618774, + "learning_rate": 0.0006, + "loss": 2.2704, + "step": 50750 + }, + { + "epoch": 0.1893049245391404, + "eval_valid_loss": 2.1842944622039795, + "eval_valid_loss/all": 2.047947406768799, + "eval_valid_loss/end_span": 1.2928000688552856, + "eval_valid_perplexity/batch": 7.7519731521606445, + "eval_valid_perplexity/end_span": 3.642972946166992, + "eval_valid_perplexity/fim": 2.477688789367676, + "eval_valid_perplexity/first_seq": 14.311704635620117, + "eval_valid_perplexity/last_seq": 8.77005672454834, + "eval_valid_perplexity/second_seq": 13.684657096862793, + "eval_valid_perplexity/seq": 8.741841316223145, + "eval_valid_reconstruction/all": 0.29623743891716003, + "eval_valid_reconstruction/end_span": 0.6851432919502258, + "eval_valid_reconstruction/fim": 
0.1821012943983078, + "eval_valid_reconstruction/first_seq": 0.1768665909767151, + "eval_valid_reconstruction/last_seq": 0.33367425203323364, + "eval_valid_reconstruction/second_seq": 0.1987992823123932, + "eval_valid_runtime": 440.5081, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 50750 + }, + { + "epoch": 0.1893049245391404, + "eval_train_loss": 2.1814653873443604, + "eval_train_loss/all": 2.0189943313598633, + "eval_train_loss/end_span": 1.263763189315796, + "eval_train_perplexity/batch": 7.530747890472412, + "eval_train_perplexity/end_span": 3.538713216781616, + "eval_train_perplexity/fim": 2.15065860748291, + "eval_train_perplexity/first_seq": 15.24660587310791, + "eval_train_perplexity/last_seq": 8.683491706848145, + "eval_train_perplexity/second_seq": 14.226278305053711, + "eval_train_perplexity/seq": 8.670424461364746, + "eval_train_reconstruction/all": 0.28598475456237793, + "eval_train_reconstruction/end_span": 0.6954641342163086, + "eval_train_reconstruction/fim": 0.1537182331085205, + "eval_train_reconstruction/first_seq": 0.15620248019695282, + "eval_train_reconstruction/last_seq": 0.3351747691631317, + "eval_train_reconstruction/second_seq": 0.18448254466056824, + "eval_train_runtime": 442.8534, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 50750 + }, + { + "epoch": 0.1893422260021038, + "grad_norm": 0.3090338110923767, + "learning_rate": 0.0006, + "loss": 2.3199, + "step": 50760 + }, + { + "epoch": 0.18937952746506717, + "grad_norm": 0.3414296507835388, + "learning_rate": 0.0006, + "loss": 2.0178, + "step": 50770 + }, + { + "epoch": 0.18941682892803055, + "grad_norm": 0.35721340775489807, + "learning_rate": 0.0006, + "loss": 2.3024, + "step": 50780 + }, + { + "epoch": 0.18945413039099393, + "grad_norm": 0.3058365285396576, + "learning_rate": 0.0006, + "loss": 2.2181, + "step": 50790 + }, + { + "epoch": 0.1894914318539573, + "grad_norm": 0.5467208623886108, + 
"learning_rate": 0.0006, + "loss": 2.1363, + "step": 50800 + }, + { + "epoch": 0.1895287333169207, + "grad_norm": 0.3665601909160614, + "learning_rate": 0.0006, + "loss": 2.3386, + "step": 50810 + }, + { + "epoch": 0.18956603477988407, + "grad_norm": 0.26499778032302856, + "learning_rate": 0.0006, + "loss": 2.3381, + "step": 50820 + }, + { + "epoch": 0.18960333624284745, + "grad_norm": 0.36429837346076965, + "learning_rate": 0.0006, + "loss": 2.056, + "step": 50830 + }, + { + "epoch": 0.18964063770581083, + "grad_norm": 0.31205999851226807, + "learning_rate": 0.0006, + "loss": 2.2563, + "step": 50840 + }, + { + "epoch": 0.1896779391687742, + "grad_norm": 0.4376172423362732, + "learning_rate": 0.0006, + "loss": 2.2086, + "step": 50850 + }, + { + "epoch": 0.18971524063173759, + "grad_norm": 0.2829352021217346, + "learning_rate": 0.0006, + "loss": 2.2969, + "step": 50860 + }, + { + "epoch": 0.18975254209470097, + "grad_norm": 0.3156704306602478, + "learning_rate": 0.0006, + "loss": 2.3593, + "step": 50870 + }, + { + "epoch": 0.18978984355766434, + "grad_norm": 0.36555516719818115, + "learning_rate": 0.0006, + "loss": 2.296, + "step": 50880 + }, + { + "epoch": 0.1898271450206277, + "grad_norm": 0.47550562024116516, + "learning_rate": 0.0006, + "loss": 2.1359, + "step": 50890 + }, + { + "epoch": 0.18986444648359108, + "grad_norm": 0.43170884251594543, + "learning_rate": 0.0006, + "loss": 2.3141, + "step": 50900 + }, + { + "epoch": 0.18990174794655446, + "grad_norm": 0.4220486581325531, + "learning_rate": 0.0006, + "loss": 2.3076, + "step": 50910 + }, + { + "epoch": 0.18993904940951783, + "grad_norm": 0.4374246597290039, + "learning_rate": 0.0006, + "loss": 2.1261, + "step": 50920 + }, + { + "epoch": 0.18997635087248121, + "grad_norm": 0.4068562686443329, + "learning_rate": 0.0006, + "loss": 2.1356, + "step": 50930 + }, + { + "epoch": 0.1900136523354446, + "grad_norm": 0.5711377859115601, + "learning_rate": 0.0006, + "loss": 2.2697, + "step": 50940 + }, + { + "epoch": 
0.19005095379840797, + "grad_norm": 0.36956852674484253, + "learning_rate": 0.0006, + "loss": 2.0981, + "step": 50950 + }, + { + "epoch": 0.19008825526137135, + "grad_norm": 0.2949256896972656, + "learning_rate": 0.0006, + "loss": 2.344, + "step": 50960 + }, + { + "epoch": 0.19012555672433473, + "grad_norm": 0.516538143157959, + "learning_rate": 0.0006, + "loss": 2.2993, + "step": 50970 + }, + { + "epoch": 0.1901628581872981, + "grad_norm": 0.34502819180488586, + "learning_rate": 0.0006, + "loss": 2.2327, + "step": 50980 + }, + { + "epoch": 0.1902001596502615, + "grad_norm": 0.28266435861587524, + "learning_rate": 0.0006, + "loss": 2.417, + "step": 50990 + }, + { + "epoch": 0.19023746111322487, + "grad_norm": 0.39980411529541016, + "learning_rate": 0.0006, + "loss": 2.2259, + "step": 51000 + }, + { + "epoch": 0.19023746111322487, + "eval_valid_loss": 2.1820991039276123, + "eval_valid_loss/all": 2.045872926712036, + "eval_valid_loss/end_span": 1.2605690956115723, + "eval_valid_perplexity/batch": 7.735908508300781, + "eval_valid_perplexity/end_span": 3.527428388595581, + "eval_valid_perplexity/fim": 2.2034664154052734, + "eval_valid_perplexity/first_seq": 14.572310447692871, + "eval_valid_perplexity/last_seq": 9.226556777954102, + "eval_valid_perplexity/second_seq": 13.882804870605469, + "eval_valid_perplexity/seq": 8.724479675292969, + "eval_valid_reconstruction/all": 0.29683056473731995, + "eval_valid_reconstruction/end_span": 0.7070438861846924, + "eval_valid_reconstruction/fim": 0.15940715372562408, + "eval_valid_reconstruction/first_seq": 0.1738797426223755, + "eval_valid_reconstruction/last_seq": 0.31614020466804504, + "eval_valid_reconstruction/second_seq": 0.18737727403640747, + "eval_valid_runtime": 440.8777, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 51000 + }, + { + "epoch": 0.19023746111322487, + "eval_train_loss": 2.1808016300201416, + "eval_train_loss/all": 2.018285036087036, + "eval_train_loss/end_span": 
1.2224342823028564, + "eval_train_perplexity/batch": 7.5254082679748535, + "eval_train_perplexity/end_span": 3.3954432010650635, + "eval_train_perplexity/fim": 1.9431960582733154, + "eval_train_perplexity/first_seq": 15.760135650634766, + "eval_train_perplexity/last_seq": 8.957484245300293, + "eval_train_perplexity/second_seq": 13.86968994140625, + "eval_train_perplexity/seq": 8.664519309997559, + "eval_train_reconstruction/all": 0.2860976755619049, + "eval_train_reconstruction/end_span": 0.7195804119110107, + "eval_train_reconstruction/fim": 0.1342957317829132, + "eval_train_reconstruction/first_seq": 0.14272679388523102, + "eval_train_reconstruction/last_seq": 0.3260365128517151, + "eval_train_reconstruction/second_seq": 0.19325841963291168, + "eval_train_runtime": 439.4792, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 51000 + }, + { + "epoch": 0.19027476257618825, + "grad_norm": 0.2521446645259857, + "learning_rate": 0.0006, + "loss": 2.21, + "step": 51010 + }, + { + "epoch": 0.19031206403915163, + "grad_norm": 0.3429035544395447, + "learning_rate": 0.0006, + "loss": 1.9816, + "step": 51020 + }, + { + "epoch": 0.19034936550211498, + "grad_norm": 0.30514970421791077, + "learning_rate": 0.0006, + "loss": 2.0158, + "step": 51030 + }, + { + "epoch": 0.19038666696507836, + "grad_norm": 0.35138505697250366, + "learning_rate": 0.0006, + "loss": 2.2507, + "step": 51040 + }, + { + "epoch": 0.19042396842804174, + "grad_norm": 0.3627309799194336, + "learning_rate": 0.0006, + "loss": 2.1785, + "step": 51050 + }, + { + "epoch": 0.19046126989100512, + "grad_norm": 0.33502674102783203, + "learning_rate": 0.0006, + "loss": 2.1678, + "step": 51060 + }, + { + "epoch": 0.1904985713539685, + "grad_norm": 0.2267342060804367, + "learning_rate": 0.0006, + "loss": 2.3061, + "step": 51070 + }, + { + "epoch": 0.19053587281693188, + "grad_norm": 0.26105085015296936, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 51080 + }, + { + 
"epoch": 0.19057317427989526, + "grad_norm": 0.25652173161506653, + "learning_rate": 0.0006, + "loss": 2.1362, + "step": 51090 + }, + { + "epoch": 0.19061047574285864, + "grad_norm": 0.5011621713638306, + "learning_rate": 0.0006, + "loss": 2.175, + "step": 51100 + }, + { + "epoch": 0.19064777720582202, + "grad_norm": 0.28158918023109436, + "learning_rate": 0.0006, + "loss": 2.2418, + "step": 51110 + }, + { + "epoch": 0.1906850786687854, + "grad_norm": 0.3213143050670624, + "learning_rate": 0.0006, + "loss": 2.2621, + "step": 51120 + }, + { + "epoch": 0.19072238013174878, + "grad_norm": 0.26668626070022583, + "learning_rate": 0.0006, + "loss": 2.311, + "step": 51130 + }, + { + "epoch": 0.19075968159471215, + "grad_norm": 0.3694637715816498, + "learning_rate": 0.0006, + "loss": 1.9303, + "step": 51140 + }, + { + "epoch": 0.19079698305767553, + "grad_norm": 0.3419964909553528, + "learning_rate": 0.0006, + "loss": 2.0997, + "step": 51150 + }, + { + "epoch": 0.19083428452063889, + "grad_norm": 0.32000458240509033, + "learning_rate": 0.0006, + "loss": 2.138, + "step": 51160 + }, + { + "epoch": 0.19087158598360227, + "grad_norm": 0.4034086763858795, + "learning_rate": 0.0006, + "loss": 2.0757, + "step": 51170 + }, + { + "epoch": 0.19090888744656564, + "grad_norm": 0.3210839629173279, + "learning_rate": 0.0006, + "loss": 2.1712, + "step": 51180 + }, + { + "epoch": 0.19094618890952902, + "grad_norm": 0.3893907964229584, + "learning_rate": 0.0006, + "loss": 2.2547, + "step": 51190 + }, + { + "epoch": 0.1909834903724924, + "grad_norm": 0.35214024782180786, + "learning_rate": 0.0006, + "loss": 2.3323, + "step": 51200 + }, + { + "epoch": 0.19102079183545578, + "grad_norm": 0.3043161630630493, + "learning_rate": 0.0006, + "loss": 2.2798, + "step": 51210 + }, + { + "epoch": 0.19105809329841916, + "grad_norm": 0.26923826336860657, + "learning_rate": 0.0006, + "loss": 2.2093, + "step": 51220 + }, + { + "epoch": 0.19109539476138254, + "grad_norm": 0.33271124958992004, + 
"learning_rate": 0.0006, + "loss": 2.3846, + "step": 51230 + }, + { + "epoch": 0.19113269622434592, + "grad_norm": 0.38246023654937744, + "learning_rate": 0.0006, + "loss": 2.2437, + "step": 51240 + }, + { + "epoch": 0.1911699976873093, + "grad_norm": 0.37618401646614075, + "learning_rate": 0.0006, + "loss": 2.147, + "step": 51250 + }, + { + "epoch": 0.1911699976873093, + "eval_valid_loss": 2.176701784133911, + "eval_valid_loss/all": 2.04048228263855, + "eval_valid_loss/end_span": 1.1853398084640503, + "eval_valid_perplexity/batch": 7.694319248199463, + "eval_valid_perplexity/end_span": 3.2717983722686768, + "eval_valid_perplexity/fim": 2.2602345943450928, + "eval_valid_perplexity/first_seq": 14.801405906677246, + "eval_valid_perplexity/last_seq": 8.589244842529297, + "eval_valid_perplexity/second_seq": 13.91434097290039, + "eval_valid_perplexity/seq": 8.674731254577637, + "eval_valid_reconstruction/all": 0.2985726594924927, + "eval_valid_reconstruction/end_span": 0.7156184911727905, + "eval_valid_reconstruction/fim": 0.16599954664707184, + "eval_valid_reconstruction/first_seq": 0.1701325923204422, + "eval_valid_reconstruction/last_seq": 0.34204933047294617, + "eval_valid_reconstruction/second_seq": 0.19110411405563354, + "eval_valid_runtime": 444.1652, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 51250 + }, + { + "epoch": 0.1911699976873093, + "eval_train_loss": 2.1773033142089844, + "eval_train_loss/all": 2.015336751937866, + "eval_train_loss/end_span": 1.153120756149292, + "eval_train_perplexity/batch": 7.50325345993042, + "eval_train_perplexity/end_span": 3.1680643558502197, + "eval_train_perplexity/fim": 2.1249678134918213, + "eval_train_perplexity/first_seq": 15.40291690826416, + "eval_train_perplexity/last_seq": 8.610451698303223, + "eval_train_perplexity/second_seq": 14.389389038085938, + "eval_train_perplexity/seq": 8.63943862915039, + "eval_train_reconstruction/all": 0.2873016595840454, + 
"eval_train_reconstruction/end_span": 0.7281184792518616, + "eval_train_reconstruction/fim": 0.15262609720230103, + "eval_train_reconstruction/first_seq": 0.1555778980255127, + "eval_train_reconstruction/last_seq": 0.3377038836479187, + "eval_train_reconstruction/second_seq": 0.17954544723033905, + "eval_train_runtime": 443.7009, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 51250 + }, + { + "epoch": 0.19120729915027268, + "grad_norm": 0.2143143266439438, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 51260 + }, + { + "epoch": 0.19124460061323606, + "grad_norm": 0.4050350785255432, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 51270 + }, + { + "epoch": 0.19128190207619944, + "grad_norm": 0.35310158133506775, + "learning_rate": 0.0006, + "loss": 2.2048, + "step": 51280 + }, + { + "epoch": 0.19131920353916282, + "grad_norm": 0.23452578485012054, + "learning_rate": 0.0006, + "loss": 2.2935, + "step": 51290 + }, + { + "epoch": 0.19135650500212617, + "grad_norm": 0.2610922157764435, + "learning_rate": 0.0006, + "loss": 2.2445, + "step": 51300 + }, + { + "epoch": 0.19139380646508955, + "grad_norm": 0.23687702417373657, + "learning_rate": 0.0006, + "loss": 2.2849, + "step": 51310 + }, + { + "epoch": 0.19143110792805293, + "grad_norm": 0.32778000831604004, + "learning_rate": 0.0006, + "loss": 2.2169, + "step": 51320 + }, + { + "epoch": 0.1914684093910163, + "grad_norm": 0.40833520889282227, + "learning_rate": 0.0006, + "loss": 2.0553, + "step": 51330 + }, + { + "epoch": 0.1915057108539797, + "grad_norm": 0.32509398460388184, + "learning_rate": 0.0006, + "loss": 2.1762, + "step": 51340 + }, + { + "epoch": 0.19154301231694307, + "grad_norm": 0.3549800515174866, + "learning_rate": 0.0006, + "loss": 2.3134, + "step": 51350 + }, + { + "epoch": 0.19158031377990645, + "grad_norm": 0.22461822628974915, + "learning_rate": 0.0006, + "loss": 2.408, + "step": 51360 + }, + { + "epoch": 0.19161761524286983, + "grad_norm": 
0.5433110594749451, + "learning_rate": 0.0006, + "loss": 2.2773, + "step": 51370 + }, + { + "epoch": 0.1916549167058332, + "grad_norm": 0.22259676456451416, + "learning_rate": 0.0006, + "loss": 2.311, + "step": 51380 + }, + { + "epoch": 0.19169221816879659, + "grad_norm": 0.25176340341567993, + "learning_rate": 0.0006, + "loss": 2.0952, + "step": 51390 + }, + { + "epoch": 0.19172951963175996, + "grad_norm": 0.3031318187713623, + "learning_rate": 0.0006, + "loss": 2.2396, + "step": 51400 + }, + { + "epoch": 0.19176682109472334, + "grad_norm": 0.29729101061820984, + "learning_rate": 0.0006, + "loss": 2.1497, + "step": 51410 + }, + { + "epoch": 0.19180412255768672, + "grad_norm": 0.3494732975959778, + "learning_rate": 0.0006, + "loss": 2.28, + "step": 51420 + }, + { + "epoch": 0.1918414240206501, + "grad_norm": 0.4254058301448822, + "learning_rate": 0.0006, + "loss": 2.192, + "step": 51430 + }, + { + "epoch": 0.19187872548361345, + "grad_norm": 0.23618273437023163, + "learning_rate": 0.0006, + "loss": 2.1901, + "step": 51440 + }, + { + "epoch": 0.19191602694657683, + "grad_norm": 0.25325366854667664, + "learning_rate": 0.0006, + "loss": 2.3285, + "step": 51450 + }, + { + "epoch": 0.1919533284095402, + "grad_norm": 0.23429013788700104, + "learning_rate": 0.0006, + "loss": 2.2258, + "step": 51460 + }, + { + "epoch": 0.1919906298725036, + "grad_norm": 0.42972317337989807, + "learning_rate": 0.0006, + "loss": 2.1648, + "step": 51470 + }, + { + "epoch": 0.19202793133546697, + "grad_norm": 0.3504788279533386, + "learning_rate": 0.0006, + "loss": 2.015, + "step": 51480 + }, + { + "epoch": 0.19206523279843035, + "grad_norm": 0.32394951581954956, + "learning_rate": 0.0006, + "loss": 2.1869, + "step": 51490 + }, + { + "epoch": 0.19210253426139373, + "grad_norm": 0.3593686819076538, + "learning_rate": 0.0006, + "loss": 2.336, + "step": 51500 + }, + { + "epoch": 0.19210253426139373, + "eval_valid_loss": 2.1815755367279053, + "eval_valid_loss/all": 2.0452938079833984, + 
"eval_valid_loss/end_span": 1.199392557144165, + "eval_valid_perplexity/batch": 7.731429576873779, + "eval_valid_perplexity/end_span": 3.318100690841675, + "eval_valid_perplexity/fim": 2.3395869731903076, + "eval_valid_perplexity/first_seq": 14.455778121948242, + "eval_valid_perplexity/last_seq": 9.08198356628418, + "eval_valid_perplexity/second_seq": 13.50753116607666, + "eval_valid_perplexity/seq": 8.717427253723145, + "eval_valid_reconstruction/all": 0.2968599498271942, + "eval_valid_reconstruction/end_span": 0.7190896272659302, + "eval_valid_reconstruction/fim": 0.1727568507194519, + "eval_valid_reconstruction/first_seq": 0.1753443032503128, + "eval_valid_reconstruction/last_seq": 0.3233596086502075, + "eval_valid_reconstruction/second_seq": 0.20282579958438873, + "eval_valid_runtime": 442.8628, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 51500 + }, + { + "epoch": 0.19210253426139373, + "eval_train_loss": 2.1792943477630615, + "eval_train_loss/all": 2.016805410385132, + "eval_train_loss/end_span": 1.1654514074325562, + "eval_train_perplexity/batch": 7.514281272888184, + "eval_train_perplexity/end_span": 3.2073702812194824, + "eval_train_perplexity/fim": 2.151400566101074, + "eval_train_perplexity/first_seq": 15.42205810546875, + "eval_train_perplexity/last_seq": 8.604385375976562, + "eval_train_perplexity/second_seq": 13.920482635498047, + "eval_train_perplexity/seq": 8.647396087646484, + "eval_train_reconstruction/all": 0.28649404644966125, + "eval_train_reconstruction/end_span": 0.73031085729599, + "eval_train_reconstruction/fim": 0.15553545951843262, + "eval_train_reconstruction/first_seq": 0.15146854519844055, + "eval_train_reconstruction/last_seq": 0.3421872556209564, + "eval_train_reconstruction/second_seq": 0.19103549420833588, + "eval_train_runtime": 439.9028, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 51500 + }, + { + "epoch": 0.1921398357243571, + 
"grad_norm": 0.3135716915130615, + "learning_rate": 0.0006, + "loss": 2.3808, + "step": 51510 + }, + { + "epoch": 0.1921771371873205, + "grad_norm": 0.4231254756450653, + "learning_rate": 0.0006, + "loss": 2.1698, + "step": 51520 + }, + { + "epoch": 0.19221443865028387, + "grad_norm": 0.4641605019569397, + "learning_rate": 0.0006, + "loss": 2.1241, + "step": 51530 + }, + { + "epoch": 0.19225174011324725, + "grad_norm": 0.38355371356010437, + "learning_rate": 0.0006, + "loss": 2.1878, + "step": 51540 + }, + { + "epoch": 0.19228904157621063, + "grad_norm": 0.44489017128944397, + "learning_rate": 0.0006, + "loss": 2.2038, + "step": 51550 + }, + { + "epoch": 0.192326343039174, + "grad_norm": 0.4379982352256775, + "learning_rate": 0.0006, + "loss": 2.2285, + "step": 51560 + }, + { + "epoch": 0.1923636445021374, + "grad_norm": 0.3144233524799347, + "learning_rate": 0.0006, + "loss": 2.2148, + "step": 51570 + }, + { + "epoch": 0.19240094596510074, + "grad_norm": 0.5189852714538574, + "learning_rate": 0.0006, + "loss": 2.1952, + "step": 51580 + }, + { + "epoch": 0.19243824742806412, + "grad_norm": 0.3876047730445862, + "learning_rate": 0.0006, + "loss": 2.3261, + "step": 51590 + }, + { + "epoch": 0.1924755488910275, + "grad_norm": 0.36930063366889954, + "learning_rate": 0.0006, + "loss": 2.2462, + "step": 51600 + }, + { + "epoch": 0.19251285035399088, + "grad_norm": 0.36199143528938293, + "learning_rate": 0.0006, + "loss": 2.294, + "step": 51610 + }, + { + "epoch": 0.19255015181695426, + "grad_norm": 0.276292622089386, + "learning_rate": 0.0006, + "loss": 2.311, + "step": 51620 + }, + { + "epoch": 0.19258745327991764, + "grad_norm": 0.35009390115737915, + "learning_rate": 0.0006, + "loss": 2.3152, + "step": 51630 + }, + { + "epoch": 0.19262475474288102, + "grad_norm": 0.34693753719329834, + "learning_rate": 0.0006, + "loss": 2.295, + "step": 51640 + }, + { + "epoch": 0.1926620562058444, + "grad_norm": 0.28304523229599, + "learning_rate": 0.0006, + "loss": 2.1718, + "step": 
51650 + }, + { + "epoch": 0.19269935766880777, + "grad_norm": 0.3615265190601349, + "learning_rate": 0.0006, + "loss": 2.1794, + "step": 51660 + }, + { + "epoch": 0.19273665913177115, + "grad_norm": 0.3912058472633362, + "learning_rate": 0.0006, + "loss": 2.27, + "step": 51670 + }, + { + "epoch": 0.19277396059473453, + "grad_norm": 0.32866907119750977, + "learning_rate": 0.0006, + "loss": 2.3777, + "step": 51680 + }, + { + "epoch": 0.1928112620576979, + "grad_norm": 0.2685398459434509, + "learning_rate": 0.0006, + "loss": 2.3948, + "step": 51690 + }, + { + "epoch": 0.1928485635206613, + "grad_norm": 0.2726791501045227, + "learning_rate": 0.0006, + "loss": 2.3212, + "step": 51700 + }, + { + "epoch": 0.19288586498362464, + "grad_norm": 0.3479410707950592, + "learning_rate": 0.0006, + "loss": 2.2023, + "step": 51710 + }, + { + "epoch": 0.19292316644658802, + "grad_norm": 0.3715308606624603, + "learning_rate": 0.0006, + "loss": 2.3061, + "step": 51720 + }, + { + "epoch": 0.1929604679095514, + "grad_norm": 0.3342021405696869, + "learning_rate": 0.0006, + "loss": 2.1335, + "step": 51730 + }, + { + "epoch": 0.19299776937251478, + "grad_norm": 0.31373581290245056, + "learning_rate": 0.0006, + "loss": 2.2477, + "step": 51740 + }, + { + "epoch": 0.19303507083547816, + "grad_norm": 0.44046705961227417, + "learning_rate": 0.0006, + "loss": 2.0574, + "step": 51750 + }, + { + "epoch": 0.19303507083547816, + "eval_valid_loss": 2.1799254417419434, + "eval_valid_loss/all": 2.0439701080322266, + "eval_valid_loss/end_span": 1.267906665802002, + "eval_valid_perplexity/batch": 7.721202373504639, + "eval_valid_perplexity/end_span": 3.553406238555908, + "eval_valid_perplexity/fim": 2.298367738723755, + "eval_valid_perplexity/first_seq": 14.660909652709961, + "eval_valid_perplexity/last_seq": 9.192198753356934, + "eval_valid_perplexity/second_seq": 13.828518867492676, + "eval_valid_perplexity/seq": 8.709047317504883, + "eval_valid_reconstruction/all": 0.29743123054504395, + 
"eval_valid_reconstruction/end_span": 0.6951220631599426, + "eval_valid_reconstruction/fim": 0.16813814640045166, + "eval_valid_reconstruction/first_seq": 0.17212069034576416, + "eval_valid_reconstruction/last_seq": 0.321842223405838, + "eval_valid_reconstruction/second_seq": 0.19218571484088898, + "eval_valid_runtime": 447.7748, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 51750 + }, + { + "epoch": 0.19303507083547816, + "eval_train_loss": 2.179295778274536, + "eval_train_loss/all": 2.017054557800293, + "eval_train_loss/end_span": 1.2232224941253662, + "eval_train_perplexity/batch": 7.516153812408447, + "eval_train_perplexity/end_span": 3.398120641708374, + "eval_train_perplexity/fim": 2.32029128074646, + "eval_train_perplexity/first_seq": 15.090002059936523, + "eval_train_perplexity/last_seq": 8.62966537475586, + "eval_train_perplexity/second_seq": 14.556705474853516, + "eval_train_perplexity/seq": 8.657317161560059, + "eval_train_reconstruction/all": 0.28642842173576355, + "eval_train_reconstruction/end_span": 0.7062413692474365, + "eval_train_reconstruction/fim": 0.17038460075855255, + "eval_train_reconstruction/first_seq": 0.16292504966259003, + "eval_train_reconstruction/last_seq": 0.34200721979141235, + "eval_train_reconstruction/second_seq": 0.17918865382671356, + "eval_train_runtime": 442.1419, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 51750 + }, + { + "epoch": 0.19307237229844154, + "grad_norm": 0.3348371982574463, + "learning_rate": 0.0006, + "loss": 2.193, + "step": 51760 + }, + { + "epoch": 0.19310967376140492, + "grad_norm": 0.8241416811943054, + "learning_rate": 0.0006, + "loss": 2.0251, + "step": 51770 + }, + { + "epoch": 0.1931469752243683, + "grad_norm": 6.863491535186768, + "learning_rate": 0.0006, + "loss": 2.4016, + "step": 51780 + }, + { + "epoch": 0.19318427668733168, + "grad_norm": 0.4908263087272644, + "learning_rate": 0.0006, + "loss": 2.2797, 
+ "step": 51790 + }, + { + "epoch": 0.19322157815029506, + "grad_norm": 0.32128649950027466, + "learning_rate": 0.0006, + "loss": 2.0231, + "step": 51800 + }, + { + "epoch": 0.19325887961325844, + "grad_norm": 0.4261067509651184, + "learning_rate": 0.0006, + "loss": 2.2037, + "step": 51810 + }, + { + "epoch": 0.19329618107622182, + "grad_norm": 0.3847268521785736, + "learning_rate": 0.0006, + "loss": 2.1228, + "step": 51820 + }, + { + "epoch": 0.1933334825391852, + "grad_norm": 0.3882249593734741, + "learning_rate": 0.0006, + "loss": 2.0899, + "step": 51830 + }, + { + "epoch": 0.19337078400214858, + "grad_norm": 0.3291654586791992, + "learning_rate": 0.0006, + "loss": 2.1233, + "step": 51840 + }, + { + "epoch": 0.19340808546511193, + "grad_norm": 0.2735764682292938, + "learning_rate": 0.0006, + "loss": 2.2902, + "step": 51850 + }, + { + "epoch": 0.1934453869280753, + "grad_norm": 0.3591601848602295, + "learning_rate": 0.0006, + "loss": 2.2748, + "step": 51860 + }, + { + "epoch": 0.1934826883910387, + "grad_norm": 0.33002185821533203, + "learning_rate": 0.0006, + "loss": 2.116, + "step": 51870 + }, + { + "epoch": 0.19351998985400207, + "grad_norm": 0.33235490322113037, + "learning_rate": 0.0006, + "loss": 1.9792, + "step": 51880 + }, + { + "epoch": 0.19355729131696545, + "grad_norm": 0.43426403403282166, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 51890 + }, + { + "epoch": 0.19359459277992883, + "grad_norm": 0.48848143219947815, + "learning_rate": 0.0006, + "loss": 2.2191, + "step": 51900 + }, + { + "epoch": 0.1936318942428922, + "grad_norm": 0.29612451791763306, + "learning_rate": 0.0006, + "loss": 2.2711, + "step": 51910 + }, + { + "epoch": 0.19366919570585558, + "grad_norm": 0.4009115695953369, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 51920 + }, + { + "epoch": 0.19370649716881896, + "grad_norm": 0.2579828202724457, + "learning_rate": 0.0006, + "loss": 2.4589, + "step": 51930 + }, + { + "epoch": 0.19374379863178234, + "grad_norm": 
0.4669746458530426, + "learning_rate": 0.0006, + "loss": 2.1496, + "step": 51940 + }, + { + "epoch": 0.19378110009474572, + "grad_norm": 0.4188954532146454, + "learning_rate": 0.0006, + "loss": 2.1443, + "step": 51950 + }, + { + "epoch": 0.1938184015577091, + "grad_norm": 0.248526930809021, + "learning_rate": 0.0006, + "loss": 2.229, + "step": 51960 + }, + { + "epoch": 0.19385570302067248, + "grad_norm": 0.5054819583892822, + "learning_rate": 0.0006, + "loss": 1.9034, + "step": 51970 + }, + { + "epoch": 0.19389300448363586, + "grad_norm": 0.3692564070224762, + "learning_rate": 0.0006, + "loss": 2.3164, + "step": 51980 + }, + { + "epoch": 0.1939303059465992, + "grad_norm": 0.24271774291992188, + "learning_rate": 0.0006, + "loss": 2.2635, + "step": 51990 + }, + { + "epoch": 0.1939676074095626, + "grad_norm": 0.3737662732601166, + "learning_rate": 0.0006, + "loss": 2.2122, + "step": 52000 + }, + { + "epoch": 0.1939676074095626, + "eval_valid_loss": 2.1768150329589844, + "eval_valid_loss/all": 2.040942668914795, + "eval_valid_loss/end_span": 1.2136034965515137, + "eval_valid_perplexity/batch": 7.697862148284912, + "eval_valid_perplexity/end_span": 3.365590810775757, + "eval_valid_perplexity/fim": 2.7977540493011475, + "eval_valid_perplexity/first_seq": 15.20698070526123, + "eval_valid_perplexity/last_seq": 9.015459060668945, + "eval_valid_perplexity/second_seq": 13.767091751098633, + "eval_valid_perplexity/seq": 8.679044723510742, + "eval_valid_reconstruction/all": 0.2984660863876343, + "eval_valid_reconstruction/end_span": 0.7161259651184082, + "eval_valid_reconstruction/fim": 0.21016369760036469, + "eval_valid_reconstruction/first_seq": 0.16042223572731018, + "eval_valid_reconstruction/last_seq": 0.32660114765167236, + "eval_valid_reconstruction/second_seq": 0.19295378029346466, + "eval_valid_runtime": 440.534, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 52000 + }, + { + "epoch": 0.1939676074095626, + "eval_train_loss": 
2.1777098178863525, + "eval_train_loss/all": 2.015622854232788, + "eval_train_loss/end_span": 1.17939293384552, + "eval_train_perplexity/batch": 7.505400657653809, + "eval_train_perplexity/end_span": 3.252399206161499, + "eval_train_perplexity/fim": 2.149109363555908, + "eval_train_perplexity/first_seq": 15.481890678405762, + "eval_train_perplexity/last_seq": 9.193009376525879, + "eval_train_perplexity/second_seq": 13.930489540100098, + "eval_train_perplexity/seq": 8.641005516052246, + "eval_train_reconstruction/all": 0.2870807349681854, + "eval_train_reconstruction/end_span": 0.7251294255256653, + "eval_train_reconstruction/fim": 0.1544012576341629, + "eval_train_reconstruction/first_seq": 0.1536126285791397, + "eval_train_reconstruction/last_seq": 0.3209446668624878, + "eval_train_reconstruction/second_seq": 0.1901717633008957, + "eval_train_runtime": 439.3718, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 52000 + }, + { + "epoch": 0.19400490887252597, + "grad_norm": 0.383378803730011, + "learning_rate": 0.0006, + "loss": 2.0415, + "step": 52010 + }, + { + "epoch": 0.19404221033548935, + "grad_norm": 0.3145599365234375, + "learning_rate": 0.0006, + "loss": 2.1599, + "step": 52020 + }, + { + "epoch": 0.19407951179845273, + "grad_norm": 0.350801020860672, + "learning_rate": 0.0006, + "loss": 2.2516, + "step": 52030 + }, + { + "epoch": 0.1941168132614161, + "grad_norm": 0.2865169942378998, + "learning_rate": 0.0006, + "loss": 2.132, + "step": 52040 + }, + { + "epoch": 0.1941541147243795, + "grad_norm": 0.28455790877342224, + "learning_rate": 0.0006, + "loss": 2.1541, + "step": 52050 + }, + { + "epoch": 0.19419141618734287, + "grad_norm": 0.327877402305603, + "learning_rate": 0.0006, + "loss": 2.1753, + "step": 52060 + }, + { + "epoch": 0.19422871765030625, + "grad_norm": 0.3523751199245453, + "learning_rate": 0.0006, + "loss": 2.157, + "step": 52070 + }, + { + "epoch": 0.19426601911326963, + "grad_norm": 
0.3865252733230591, + "learning_rate": 0.0006, + "loss": 2.1998, + "step": 52080 + }, + { + "epoch": 0.194303320576233, + "grad_norm": 0.30115994811058044, + "learning_rate": 0.0006, + "loss": 2.0814, + "step": 52090 + }, + { + "epoch": 0.1943406220391964, + "grad_norm": 0.2839377820491791, + "learning_rate": 0.0006, + "loss": 2.2833, + "step": 52100 + }, + { + "epoch": 0.19437792350215977, + "grad_norm": 0.30737748742103577, + "learning_rate": 0.0006, + "loss": 2.3624, + "step": 52110 + }, + { + "epoch": 0.19441522496512315, + "grad_norm": 0.28548476099967957, + "learning_rate": 0.0006, + "loss": 2.2172, + "step": 52120 + }, + { + "epoch": 0.1944525264280865, + "grad_norm": 0.32563015818595886, + "learning_rate": 0.0006, + "loss": 2.0695, + "step": 52130 + }, + { + "epoch": 0.19448982789104988, + "grad_norm": 0.2849358916282654, + "learning_rate": 0.0006, + "loss": 2.3473, + "step": 52140 + }, + { + "epoch": 0.19452712935401326, + "grad_norm": 0.3815254271030426, + "learning_rate": 0.0006, + "loss": 2.1469, + "step": 52150 + }, + { + "epoch": 0.19456443081697664, + "grad_norm": 0.273779034614563, + "learning_rate": 0.0006, + "loss": 2.3344, + "step": 52160 + }, + { + "epoch": 0.19460173227994002, + "grad_norm": 0.43278077244758606, + "learning_rate": 0.0006, + "loss": 2.2681, + "step": 52170 + }, + { + "epoch": 0.1946390337429034, + "grad_norm": 0.37698858976364136, + "learning_rate": 0.0006, + "loss": 2.1096, + "step": 52180 + }, + { + "epoch": 0.19467633520586677, + "grad_norm": 0.4162612855434418, + "learning_rate": 0.0006, + "loss": 2.1744, + "step": 52190 + }, + { + "epoch": 0.19471363666883015, + "grad_norm": 0.4060821831226349, + "learning_rate": 0.0006, + "loss": 2.0309, + "step": 52200 + }, + { + "epoch": 0.19475093813179353, + "grad_norm": 0.3178881108760834, + "learning_rate": 0.0006, + "loss": 2.2892, + "step": 52210 + }, + { + "epoch": 0.1947882395947569, + "grad_norm": 0.31934627890586853, + "learning_rate": 0.0006, + "loss": 2.2138, + "step": 52220 
+ }, + { + "epoch": 0.1948255410577203, + "grad_norm": 0.3373700678348541, + "learning_rate": 0.0006, + "loss": 2.2947, + "step": 52230 + }, + { + "epoch": 0.19486284252068367, + "grad_norm": 0.29446861147880554, + "learning_rate": 0.0006, + "loss": 2.3479, + "step": 52240 + }, + { + "epoch": 0.19490014398364705, + "grad_norm": 0.3428214490413666, + "learning_rate": 0.0006, + "loss": 2.0974, + "step": 52250 + }, + { + "epoch": 0.19490014398364705, + "eval_valid_loss": 2.1803531646728516, + "eval_valid_loss/all": 2.044297695159912, + "eval_valid_loss/end_span": 1.1788129806518555, + "eval_valid_perplexity/batch": 7.7237324714660645, + "eval_valid_perplexity/end_span": 3.2505135536193848, + "eval_valid_perplexity/fim": 2.040086030960083, + "eval_valid_perplexity/first_seq": 14.971253395080566, + "eval_valid_perplexity/last_seq": 9.221782684326172, + "eval_valid_perplexity/second_seq": 13.350698471069336, + "eval_valid_perplexity/seq": 8.709723472595215, + "eval_valid_reconstruction/all": 0.29751715064048767, + "eval_valid_reconstruction/end_span": 0.7255121469497681, + "eval_valid_reconstruction/fim": 0.14465318620204926, + "eval_valid_reconstruction/first_seq": 0.16634134948253632, + "eval_valid_reconstruction/last_seq": 0.32383260130882263, + "eval_valid_reconstruction/second_seq": 0.20613369345664978, + "eval_valid_runtime": 438.345, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 52250 + }, + { + "epoch": 0.19490014398364705, + "eval_train_loss": 2.1782636642456055, + "eval_train_loss/all": 2.016033411026001, + "eval_train_loss/end_span": 1.1387007236480713, + "eval_train_perplexity/batch": 7.508482933044434, + "eval_train_perplexity/end_span": 3.122708559036255, + "eval_train_perplexity/fim": 2.199655055999756, + "eval_train_perplexity/first_seq": 15.615300178527832, + "eval_train_perplexity/last_seq": 8.813406944274902, + "eval_train_perplexity/second_seq": 14.615915298461914, + "eval_train_perplexity/seq": 
8.647814750671387, + "eval_train_reconstruction/all": 0.2869476079940796, + "eval_train_reconstruction/end_span": 0.7366690635681152, + "eval_train_reconstruction/fim": 0.16015031933784485, + "eval_train_reconstruction/first_seq": 0.14874467253684998, + "eval_train_reconstruction/last_seq": 0.32939326763153076, + "eval_train_reconstruction/second_seq": 0.17255103588104248, + "eval_train_runtime": 437.0503, + "eval_train_samples_per_second": 0.439, + "eval_train_steps_per_second": 0.439, + "step": 52250 + }, + { + "epoch": 0.1949374454466104, + "grad_norm": 0.3927913308143616, + "learning_rate": 0.0006, + "loss": 2.0978, + "step": 52260 + }, + { + "epoch": 0.19497474690957378, + "grad_norm": 0.43836936354637146, + "learning_rate": 0.0006, + "loss": 2.3389, + "step": 52270 + }, + { + "epoch": 0.19501204837253716, + "grad_norm": 0.38501057028770447, + "learning_rate": 0.0006, + "loss": 2.1579, + "step": 52280 + }, + { + "epoch": 0.19504934983550054, + "grad_norm": 0.3372001349925995, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 52290 + }, + { + "epoch": 0.19508665129846392, + "grad_norm": 0.27110767364501953, + "learning_rate": 0.0006, + "loss": 2.1273, + "step": 52300 + }, + { + "epoch": 0.1951239527614273, + "grad_norm": 0.41003328561782837, + "learning_rate": 0.0006, + "loss": 2.1311, + "step": 52310 + }, + { + "epoch": 0.19516125422439068, + "grad_norm": 0.5172410011291504, + "learning_rate": 0.0006, + "loss": 2.308, + "step": 52320 + }, + { + "epoch": 0.19519855568735406, + "grad_norm": 0.24189627170562744, + "learning_rate": 0.0006, + "loss": 2.2652, + "step": 52330 + }, + { + "epoch": 0.19523585715031744, + "grad_norm": 0.2840029299259186, + "learning_rate": 0.0006, + "loss": 2.0892, + "step": 52340 + }, + { + "epoch": 0.19527315861328082, + "grad_norm": 0.5760120749473572, + "learning_rate": 0.0006, + "loss": 2.0681, + "step": 52350 + }, + { + "epoch": 0.1953104600762442, + "grad_norm": 0.3585349917411804, + "learning_rate": 0.0006, + "loss": 2.2613, 
+ "step": 52360 + }, + { + "epoch": 0.19534776153920758, + "grad_norm": 0.22567608952522278, + "learning_rate": 0.0006, + "loss": 2.152, + "step": 52370 + }, + { + "epoch": 0.19538506300217096, + "grad_norm": 0.41235101222991943, + "learning_rate": 0.0006, + "loss": 2.3732, + "step": 52380 + }, + { + "epoch": 0.19542236446513434, + "grad_norm": 0.39477604627609253, + "learning_rate": 0.0006, + "loss": 2.0721, + "step": 52390 + }, + { + "epoch": 0.1954596659280977, + "grad_norm": 0.30469658970832825, + "learning_rate": 0.0006, + "loss": 2.2741, + "step": 52400 + }, + { + "epoch": 0.19549696739106107, + "grad_norm": 0.370606005191803, + "learning_rate": 0.0006, + "loss": 2.2915, + "step": 52410 + }, + { + "epoch": 0.19553426885402445, + "grad_norm": 0.27080604434013367, + "learning_rate": 0.0006, + "loss": 2.2305, + "step": 52420 + }, + { + "epoch": 0.19557157031698783, + "grad_norm": 0.26348283886909485, + "learning_rate": 0.0006, + "loss": 2.1925, + "step": 52430 + }, + { + "epoch": 0.1956088717799512, + "grad_norm": 0.3427712917327881, + "learning_rate": 0.0006, + "loss": 2.3705, + "step": 52440 + }, + { + "epoch": 0.19564617324291458, + "grad_norm": 0.4658307433128357, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 52450 + }, + { + "epoch": 0.19568347470587796, + "grad_norm": 0.8359739780426025, + "learning_rate": 0.0006, + "loss": 2.1769, + "step": 52460 + }, + { + "epoch": 0.19572077616884134, + "grad_norm": 0.29424992203712463, + "learning_rate": 0.0006, + "loss": 2.1417, + "step": 52470 + }, + { + "epoch": 0.19575807763180472, + "grad_norm": 0.4137101471424103, + "learning_rate": 0.0006, + "loss": 2.2019, + "step": 52480 + }, + { + "epoch": 0.1957953790947681, + "grad_norm": 4.549399375915527, + "learning_rate": 0.0006, + "loss": 2.0506, + "step": 52490 + }, + { + "epoch": 0.19583268055773148, + "grad_norm": 0.38949263095855713, + "learning_rate": 0.0006, + "loss": 2.1887, + "step": 52500 + }, + { + "epoch": 0.19583268055773148, + "eval_valid_loss": 
2.1832683086395264, + "eval_valid_loss/all": 2.047215461730957, + "eval_valid_loss/end_span": 1.2680730819702148, + "eval_valid_perplexity/batch": 7.746301174163818, + "eval_valid_perplexity/end_span": 3.553997755050659, + "eval_valid_perplexity/fim": 2.5349278450012207, + "eval_valid_perplexity/first_seq": 14.921795845031738, + "eval_valid_perplexity/last_seq": 8.772056579589844, + "eval_valid_perplexity/second_seq": 13.844730377197266, + "eval_valid_perplexity/seq": 8.73845100402832, + "eval_valid_reconstruction/all": 0.2966507375240326, + "eval_valid_reconstruction/end_span": 0.6985074877738953, + "eval_valid_reconstruction/fim": 0.18773430585861206, + "eval_valid_reconstruction/first_seq": 0.16519680619239807, + "eval_valid_reconstruction/last_seq": 0.33641037344932556, + "eval_valid_reconstruction/second_seq": 0.1901787668466568, + "eval_valid_runtime": 440.0532, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 52500 + }, + { + "epoch": 0.19583268055773148, + "eval_train_loss": 2.183405637741089, + "eval_train_loss/all": 2.021012544631958, + "eval_train_loss/end_span": 1.2346000671386719, + "eval_train_perplexity/batch": 7.545961856842041, + "eval_train_perplexity/end_span": 3.4370036125183105, + "eval_train_perplexity/fim": 2.044588565826416, + "eval_train_perplexity/first_seq": 15.68668270111084, + "eval_train_perplexity/last_seq": 8.60461711883545, + "eval_train_perplexity/second_seq": 13.744009971618652, + "eval_train_perplexity/seq": 8.693452835083008, + "eval_train_reconstruction/all": 0.28562670946121216, + "eval_train_reconstruction/end_span": 0.7086974382400513, + "eval_train_reconstruction/fim": 0.14394381642341614, + "eval_train_reconstruction/first_seq": 0.14820070564746857, + "eval_train_reconstruction/last_seq": 0.33646416664123535, + "eval_train_reconstruction/second_seq": 0.19567525386810303, + "eval_train_runtime": 439.6398, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 
0.437, + "step": 52500 + }, + { + "epoch": 0.19586998202069486, + "grad_norm": 0.2824617028236389, + "learning_rate": 0.0006, + "loss": 2.3439, + "step": 52510 + }, + { + "epoch": 0.19590728348365824, + "grad_norm": 0.42770916223526, + "learning_rate": 0.0006, + "loss": 2.4267, + "step": 52520 + }, + { + "epoch": 0.19594458494662162, + "grad_norm": 0.4792165160179138, + "learning_rate": 0.0006, + "loss": 2.2221, + "step": 52530 + }, + { + "epoch": 0.19598188640958497, + "grad_norm": 0.24708421528339386, + "learning_rate": 0.0006, + "loss": 2.149, + "step": 52540 + }, + { + "epoch": 0.19601918787254835, + "grad_norm": 0.3231409788131714, + "learning_rate": 0.0006, + "loss": 2.1691, + "step": 52550 + }, + { + "epoch": 0.19605648933551173, + "grad_norm": 0.3874198794364929, + "learning_rate": 0.0006, + "loss": 2.3266, + "step": 52560 + }, + { + "epoch": 0.1960937907984751, + "grad_norm": 0.2897821366786957, + "learning_rate": 0.0006, + "loss": 2.2936, + "step": 52570 + }, + { + "epoch": 0.1961310922614385, + "grad_norm": 0.4401852786540985, + "learning_rate": 0.0006, + "loss": 2.1941, + "step": 52580 + }, + { + "epoch": 0.19616839372440187, + "grad_norm": 0.4484097361564636, + "learning_rate": 0.0006, + "loss": 2.0233, + "step": 52590 + }, + { + "epoch": 0.19620569518736525, + "grad_norm": 0.29045844078063965, + "learning_rate": 0.0006, + "loss": 2.2889, + "step": 52600 + }, + { + "epoch": 0.19624299665032863, + "grad_norm": 0.35356390476226807, + "learning_rate": 0.0006, + "loss": 2.1436, + "step": 52610 + }, + { + "epoch": 0.196280298113292, + "grad_norm": 0.33522355556488037, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 52620 + }, + { + "epoch": 0.19631759957625539, + "grad_norm": 0.4039767384529114, + "learning_rate": 0.0006, + "loss": 2.0818, + "step": 52630 + }, + { + "epoch": 0.19635490103921877, + "grad_norm": 0.37185239791870117, + "learning_rate": 0.0006, + "loss": 2.2398, + "step": 52640 + }, + { + "epoch": 0.19639220250218214, + "grad_norm": 
0.5081436634063721, + "learning_rate": 0.0006, + "loss": 2.3162, + "step": 52650 + }, + { + "epoch": 0.19642950396514552, + "grad_norm": 0.2722405791282654, + "learning_rate": 0.0006, + "loss": 2.206, + "step": 52660 + }, + { + "epoch": 0.1964668054281089, + "grad_norm": 0.48758378624916077, + "learning_rate": 0.0006, + "loss": 2.2822, + "step": 52670 + }, + { + "epoch": 0.19650410689107226, + "grad_norm": 0.3747575581073761, + "learning_rate": 0.0006, + "loss": 2.3024, + "step": 52680 + }, + { + "epoch": 0.19654140835403563, + "grad_norm": 0.31919756531715393, + "learning_rate": 0.0006, + "loss": 2.1158, + "step": 52690 + }, + { + "epoch": 0.19657870981699901, + "grad_norm": 0.3595989942550659, + "learning_rate": 0.0006, + "loss": 2.1891, + "step": 52700 + }, + { + "epoch": 0.1966160112799624, + "grad_norm": 0.359625905752182, + "learning_rate": 0.0006, + "loss": 2.165, + "step": 52710 + }, + { + "epoch": 0.19665331274292577, + "grad_norm": 0.25579264760017395, + "learning_rate": 0.0006, + "loss": 2.2845, + "step": 52720 + }, + { + "epoch": 0.19669061420588915, + "grad_norm": 0.28755179047584534, + "learning_rate": 0.0006, + "loss": 2.2746, + "step": 52730 + }, + { + "epoch": 0.19672791566885253, + "grad_norm": 0.32257792353630066, + "learning_rate": 0.0006, + "loss": 2.2937, + "step": 52740 + }, + { + "epoch": 0.1967652171318159, + "grad_norm": 0.38619065284729004, + "learning_rate": 0.0006, + "loss": 2.185, + "step": 52750 + }, + { + "epoch": 0.1967652171318159, + "eval_valid_loss": 2.1827945709228516, + "eval_valid_loss/all": 2.046769380569458, + "eval_valid_loss/end_span": 1.2051371335983276, + "eval_valid_perplexity/batch": 7.742846488952637, + "eval_valid_perplexity/end_span": 3.33721661567688, + "eval_valid_perplexity/fim": 2.514650821685791, + "eval_valid_perplexity/first_seq": 15.124789237976074, + "eval_valid_perplexity/last_seq": 8.65900707244873, + "eval_valid_perplexity/second_seq": 13.580788612365723, + "eval_valid_perplexity/seq": 8.733074188232422, 
+ "eval_valid_reconstruction/all": 0.296626478433609, + "eval_valid_reconstruction/end_span": 0.7177839875221252, + "eval_valid_reconstruction/fim": 0.18566450476646423, + "eval_valid_reconstruction/first_seq": 0.16482095420360565, + "eval_valid_reconstruction/last_seq": 0.33803999423980713, + "eval_valid_reconstruction/second_seq": 0.19972392916679382, + "eval_valid_runtime": 440.0095, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 52750 + }, + { + "epoch": 0.1967652171318159, + "eval_train_loss": 2.179689645767212, + "eval_train_loss/all": 2.0172975063323975, + "eval_train_loss/end_span": 1.1670805215835571, + "eval_train_perplexity/batch": 7.517980098724365, + "eval_train_perplexity/end_span": 3.212599754333496, + "eval_train_perplexity/fim": 2.0887632369995117, + "eval_train_perplexity/first_seq": 15.723861694335938, + "eval_train_perplexity/last_seq": 9.075563430786133, + "eval_train_perplexity/second_seq": 14.232132911682129, + "eval_train_perplexity/seq": 8.65804386138916, + "eval_train_reconstruction/all": 0.28653931617736816, + "eval_train_reconstruction/end_span": 0.729853093624115, + "eval_train_reconstruction/fim": 0.14900638163089752, + "eval_train_reconstruction/first_seq": 0.14702387154102325, + "eval_train_reconstruction/last_seq": 0.32145777344703674, + "eval_train_reconstruction/second_seq": 0.1827697604894638, + "eval_train_runtime": 439.291, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 52750 + }, + { + "epoch": 0.1968025185947793, + "grad_norm": 0.32048362493515015, + "learning_rate": 0.0006, + "loss": 2.1537, + "step": 52760 + }, + { + "epoch": 0.19683982005774267, + "grad_norm": 0.36742907762527466, + "learning_rate": 0.0006, + "loss": 2.2806, + "step": 52770 + }, + { + "epoch": 0.19687712152070605, + "grad_norm": 0.3708847463130951, + "learning_rate": 0.0006, + "loss": 2.1984, + "step": 52780 + }, + { + "epoch": 0.19691442298366943, + "grad_norm": 
0.30296435952186584, + "learning_rate": 0.0006, + "loss": 2.2402, + "step": 52790 + }, + { + "epoch": 0.1969517244466328, + "grad_norm": 0.3220214247703552, + "learning_rate": 0.0006, + "loss": 2.1708, + "step": 52800 + }, + { + "epoch": 0.1969890259095962, + "grad_norm": 0.3032721281051636, + "learning_rate": 0.0006, + "loss": 2.0568, + "step": 52810 + }, + { + "epoch": 0.19702632737255954, + "grad_norm": 0.44977641105651855, + "learning_rate": 0.0006, + "loss": 2.2319, + "step": 52820 + }, + { + "epoch": 0.19706362883552292, + "grad_norm": 0.27255070209503174, + "learning_rate": 0.0006, + "loss": 2.2145, + "step": 52830 + }, + { + "epoch": 0.1971009302984863, + "grad_norm": 0.3147827982902527, + "learning_rate": 0.0006, + "loss": 1.9983, + "step": 52840 + }, + { + "epoch": 0.19713823176144968, + "grad_norm": 0.2958422601222992, + "learning_rate": 0.0006, + "loss": 2.3438, + "step": 52850 + }, + { + "epoch": 0.19717553322441306, + "grad_norm": 0.33469510078430176, + "learning_rate": 0.0006, + "loss": 2.3593, + "step": 52860 + }, + { + "epoch": 0.19721283468737644, + "grad_norm": 0.3128630518913269, + "learning_rate": 0.0006, + "loss": 2.2826, + "step": 52870 + }, + { + "epoch": 0.19725013615033982, + "grad_norm": 0.20012858510017395, + "learning_rate": 0.0006, + "loss": 2.3301, + "step": 52880 + }, + { + "epoch": 0.1972874376133032, + "grad_norm": 0.29387712478637695, + "learning_rate": 0.0006, + "loss": 2.0352, + "step": 52890 + }, + { + "epoch": 0.19732473907626658, + "grad_norm": 0.34073999524116516, + "learning_rate": 0.0006, + "loss": 2.2993, + "step": 52900 + }, + { + "epoch": 0.19736204053922995, + "grad_norm": 0.4613955616950989, + "learning_rate": 0.0006, + "loss": 2.0678, + "step": 52910 + }, + { + "epoch": 0.19739934200219333, + "grad_norm": 0.27975913882255554, + "learning_rate": 0.0006, + "loss": 2.1901, + "step": 52920 + }, + { + "epoch": 0.1974366434651567, + "grad_norm": 0.3332703709602356, + "learning_rate": 0.0006, + "loss": 2.0856, + "step": 
52930 + }, + { + "epoch": 0.1974739449281201, + "grad_norm": 0.6966163516044617, + "learning_rate": 0.0006, + "loss": 2.3164, + "step": 52940 + }, + { + "epoch": 0.19751124639108344, + "grad_norm": 0.3497284948825836, + "learning_rate": 0.0006, + "loss": 2.2769, + "step": 52950 + }, + { + "epoch": 0.19754854785404682, + "grad_norm": 0.4565528631210327, + "learning_rate": 0.0006, + "loss": 2.1738, + "step": 52960 + }, + { + "epoch": 0.1975858493170102, + "grad_norm": 0.3769976794719696, + "learning_rate": 0.0006, + "loss": 2.2372, + "step": 52970 + }, + { + "epoch": 0.19762315077997358, + "grad_norm": 0.3299129903316498, + "learning_rate": 0.0006, + "loss": 2.2545, + "step": 52980 + }, + { + "epoch": 0.19766045224293696, + "grad_norm": 0.30122658610343933, + "learning_rate": 0.0006, + "loss": 2.3284, + "step": 52990 + }, + { + "epoch": 0.19769775370590034, + "grad_norm": 0.406206876039505, + "learning_rate": 0.0006, + "loss": 2.2101, + "step": 53000 + }, + { + "epoch": 0.19769775370590034, + "eval_valid_loss": 2.1928136348724365, + "eval_valid_loss/all": 2.0557096004486084, + "eval_valid_loss/end_span": 1.269608497619629, + "eval_valid_perplexity/batch": 7.812379360198975, + "eval_valid_perplexity/end_span": 3.5594587326049805, + "eval_valid_perplexity/fim": 2.3179867267608643, + "eval_valid_perplexity/first_seq": 14.995949745178223, + "eval_valid_perplexity/last_seq": 8.948444366455078, + "eval_valid_perplexity/second_seq": 13.511985778808594, + "eval_valid_perplexity/seq": 8.80945110321045, + "eval_valid_reconstruction/all": 0.2939367890357971, + "eval_valid_reconstruction/end_span": 0.7029043436050415, + "eval_valid_reconstruction/fim": 0.16877613961696625, + "eval_valid_reconstruction/first_seq": 0.1661582589149475, + "eval_valid_reconstruction/last_seq": 0.32862603664398193, + "eval_valid_reconstruction/second_seq": 0.20139241218566895, + "eval_valid_runtime": 441.5674, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 
53000 + }, + { + "epoch": 0.19769775370590034, + "eval_train_loss": 2.192737340927124, + "eval_train_loss/all": 2.029397487640381, + "eval_train_loss/end_span": 1.220300555229187, + "eval_train_perplexity/batch": 7.609499931335449, + "eval_train_perplexity/end_span": 3.3882060050964355, + "eval_train_perplexity/fim": 2.23179292678833, + "eval_train_perplexity/first_seq": 15.538572311401367, + "eval_train_perplexity/last_seq": 9.157182693481445, + "eval_train_perplexity/second_seq": 14.035477638244629, + "eval_train_perplexity/seq": 8.767173767089844, + "eval_train_reconstruction/all": 0.2828352451324463, + "eval_train_reconstruction/end_span": 0.7171873450279236, + "eval_train_reconstruction/fim": 0.16000229120254517, + "eval_train_reconstruction/first_seq": 0.15064078569412231, + "eval_train_reconstruction/last_seq": 0.322099506855011, + "eval_train_reconstruction/second_seq": 0.18825921416282654, + "eval_train_runtime": 450.3198, + "eval_train_samples_per_second": 0.426, + "eval_train_steps_per_second": 0.426, + "step": 53000 + }, + { + "epoch": 0.19773505516886372, + "grad_norm": 0.2852279841899872, + "learning_rate": 0.0006, + "loss": 2.0228, + "step": 53010 + }, + { + "epoch": 0.1977723566318271, + "grad_norm": 0.43697190284729004, + "learning_rate": 0.0006, + "loss": 2.1221, + "step": 53020 + }, + { + "epoch": 0.19780965809479048, + "grad_norm": 0.42107149958610535, + "learning_rate": 0.0006, + "loss": 2.2064, + "step": 53030 + }, + { + "epoch": 0.19784695955775386, + "grad_norm": 0.31543442606925964, + "learning_rate": 0.0006, + "loss": 2.2177, + "step": 53040 + }, + { + "epoch": 0.19788426102071724, + "grad_norm": 0.37474381923675537, + "learning_rate": 0.0006, + "loss": 2.0664, + "step": 53050 + }, + { + "epoch": 0.19792156248368062, + "grad_norm": 0.3695387542247772, + "learning_rate": 0.0006, + "loss": 2.1695, + "step": 53060 + }, + { + "epoch": 0.197958863946644, + "grad_norm": 0.32543519139289856, + "learning_rate": 0.0006, + "loss": 2.2507, + "step": 
53070 + }, + { + "epoch": 0.19799616540960738, + "grad_norm": 0.3755052387714386, + "learning_rate": 0.0006, + "loss": 2.1449, + "step": 53080 + }, + { + "epoch": 0.19803346687257073, + "grad_norm": 0.31380537152290344, + "learning_rate": 0.0006, + "loss": 2.1417, + "step": 53090 + }, + { + "epoch": 0.1980707683355341, + "grad_norm": 0.4348211884498596, + "learning_rate": 0.0006, + "loss": 2.2807, + "step": 53100 + }, + { + "epoch": 0.1981080697984975, + "grad_norm": 0.3247743546962738, + "learning_rate": 0.0006, + "loss": 2.1043, + "step": 53110 + }, + { + "epoch": 0.19814537126146087, + "grad_norm": 0.2955935299396515, + "learning_rate": 0.0006, + "loss": 2.2163, + "step": 53120 + }, + { + "epoch": 0.19818267272442425, + "grad_norm": 0.39491671323776245, + "learning_rate": 0.0006, + "loss": 2.1501, + "step": 53130 + }, + { + "epoch": 0.19821997418738763, + "grad_norm": 0.3272331655025482, + "learning_rate": 0.0006, + "loss": 2.4246, + "step": 53140 + }, + { + "epoch": 0.198257275650351, + "grad_norm": 0.23502439260482788, + "learning_rate": 0.0006, + "loss": 2.3596, + "step": 53150 + }, + { + "epoch": 0.19829457711331439, + "grad_norm": 0.3558988869190216, + "learning_rate": 0.0006, + "loss": 2.1671, + "step": 53160 + }, + { + "epoch": 0.19833187857627776, + "grad_norm": 0.26019006967544556, + "learning_rate": 0.0006, + "loss": 2.2495, + "step": 53170 + }, + { + "epoch": 0.19836918003924114, + "grad_norm": 0.6249595880508423, + "learning_rate": 0.0006, + "loss": 2.2226, + "step": 53180 + }, + { + "epoch": 0.19840648150220452, + "grad_norm": 0.30621442198753357, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 53190 + }, + { + "epoch": 0.1984437829651679, + "grad_norm": 0.40028688311576843, + "learning_rate": 0.0006, + "loss": 2.2461, + "step": 53200 + }, + { + "epoch": 0.19848108442813128, + "grad_norm": 0.3354368209838867, + "learning_rate": 0.0006, + "loss": 2.226, + "step": 53210 + }, + { + "epoch": 0.19851838589109466, + "grad_norm": 0.3998297154903412, 
+ "learning_rate": 0.0006, + "loss": 2.2106, + "step": 53220 + }, + { + "epoch": 0.198555687354058, + "grad_norm": 0.37628868222236633, + "learning_rate": 0.0006, + "loss": 2.2318, + "step": 53230 + }, + { + "epoch": 0.1985929888170214, + "grad_norm": 0.3915208876132965, + "learning_rate": 0.0006, + "loss": 2.1273, + "step": 53240 + }, + { + "epoch": 0.19863029027998477, + "grad_norm": 0.3358965516090393, + "learning_rate": 0.0006, + "loss": 2.1829, + "step": 53250 + }, + { + "epoch": 0.19863029027998477, + "eval_valid_loss": 2.1835737228393555, + "eval_valid_loss/all": 2.04763126373291, + "eval_valid_loss/end_span": 1.2437021732330322, + "eval_valid_perplexity/batch": 7.749522686004639, + "eval_valid_perplexity/end_span": 3.468430519104004, + "eval_valid_perplexity/fim": 2.3266074657440186, + "eval_valid_perplexity/first_seq": 14.951064109802246, + "eval_valid_perplexity/last_seq": 8.782266616821289, + "eval_valid_perplexity/second_seq": 13.947551727294922, + "eval_valid_perplexity/seq": 8.743731498718262, + "eval_valid_reconstruction/all": 0.29549092054367065, + "eval_valid_reconstruction/end_span": 0.7044897079467773, + "eval_valid_reconstruction/fim": 0.16931667923927307, + "eval_valid_reconstruction/first_seq": 0.1628122329711914, + "eval_valid_reconstruction/last_seq": 0.336563378572464, + "eval_valid_reconstruction/second_seq": 0.18964053690433502, + "eval_valid_runtime": 446.1399, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 53250 + }, + { + "epoch": 0.19863029027998477, + "eval_train_loss": 2.181922674179077, + "eval_train_loss/all": 2.0193071365356445, + "eval_train_loss/end_span": 1.2020792961120605, + "eval_train_perplexity/batch": 7.533103942871094, + "eval_train_perplexity/end_span": 3.3270275592803955, + "eval_train_perplexity/fim": 1.934326410293579, + "eval_train_perplexity/first_seq": 15.243752479553223, + "eval_train_perplexity/last_seq": 8.802606582641602, + "eval_train_perplexity/second_seq": 
14.304258346557617, + "eval_train_perplexity/seq": 8.669621467590332, + "eval_train_reconstruction/all": 0.2854158878326416, + "eval_train_reconstruction/end_span": 0.7185633182525635, + "eval_train_reconstruction/fim": 0.13321033120155334, + "eval_train_reconstruction/first_seq": 0.15737177431583405, + "eval_train_reconstruction/last_seq": 0.3316100239753723, + "eval_train_reconstruction/second_seq": 0.1811148226261139, + "eval_train_runtime": 446.2461, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 53250 + }, + { + "epoch": 0.19866759174294815, + "grad_norm": 0.42953959107398987, + "learning_rate": 0.0006, + "loss": 2.2483, + "step": 53260 + }, + { + "epoch": 0.19870489320591153, + "grad_norm": 0.3188724219799042, + "learning_rate": 0.0006, + "loss": 2.2347, + "step": 53270 + }, + { + "epoch": 0.1987421946688749, + "grad_norm": 0.34746602177619934, + "learning_rate": 0.0006, + "loss": 2.1808, + "step": 53280 + }, + { + "epoch": 0.1987794961318383, + "grad_norm": 0.26895251870155334, + "learning_rate": 0.0006, + "loss": 2.2878, + "step": 53290 + }, + { + "epoch": 0.19881679759480167, + "grad_norm": 0.3409137725830078, + "learning_rate": 0.0006, + "loss": 2.1964, + "step": 53300 + }, + { + "epoch": 0.19885409905776505, + "grad_norm": 0.356431782245636, + "learning_rate": 0.0006, + "loss": 2.2506, + "step": 53310 + }, + { + "epoch": 0.19889140052072843, + "grad_norm": 0.43122929334640503, + "learning_rate": 0.0006, + "loss": 2.1078, + "step": 53320 + }, + { + "epoch": 0.1989287019836918, + "grad_norm": 0.4804039001464844, + "learning_rate": 0.0006, + "loss": 2.2972, + "step": 53330 + }, + { + "epoch": 0.1989660034466552, + "grad_norm": 0.3115079700946808, + "learning_rate": 0.0006, + "loss": 2.0996, + "step": 53340 + }, + { + "epoch": 0.19900330490961857, + "grad_norm": 0.45114681124687195, + "learning_rate": 0.0006, + "loss": 2.2429, + "step": 53350 + }, + { + "epoch": 0.19904060637258195, + "grad_norm": 
0.2706945538520813, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 53360 + }, + { + "epoch": 0.1990779078355453, + "grad_norm": 0.28276094794273376, + "learning_rate": 0.0006, + "loss": 2.1372, + "step": 53370 + }, + { + "epoch": 0.19911520929850868, + "grad_norm": 1.0869559049606323, + "learning_rate": 0.0006, + "loss": 2.1135, + "step": 53380 + }, + { + "epoch": 0.19915251076147206, + "grad_norm": 0.3309819996356964, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 53390 + }, + { + "epoch": 0.19918981222443544, + "grad_norm": 0.4571499526500702, + "learning_rate": 0.0006, + "loss": 2.3878, + "step": 53400 + }, + { + "epoch": 0.19922711368739882, + "grad_norm": 0.29533886909484863, + "learning_rate": 0.0006, + "loss": 2.1886, + "step": 53410 + }, + { + "epoch": 0.1992644151503622, + "grad_norm": 0.4050650894641876, + "learning_rate": 0.0006, + "loss": 2.3025, + "step": 53420 + }, + { + "epoch": 0.19930171661332557, + "grad_norm": 0.24906504154205322, + "learning_rate": 0.0006, + "loss": 2.1641, + "step": 53430 + }, + { + "epoch": 0.19933901807628895, + "grad_norm": 0.35151681303977966, + "learning_rate": 0.0006, + "loss": 2.1377, + "step": 53440 + }, + { + "epoch": 0.19937631953925233, + "grad_norm": 0.3611591160297394, + "learning_rate": 0.0006, + "loss": 2.1808, + "step": 53450 + }, + { + "epoch": 0.1994136210022157, + "grad_norm": 0.34918102622032166, + "learning_rate": 0.0006, + "loss": 2.0966, + "step": 53460 + }, + { + "epoch": 0.1994509224651791, + "grad_norm": 0.2837616801261902, + "learning_rate": 0.0006, + "loss": 2.1916, + "step": 53470 + }, + { + "epoch": 0.19948822392814247, + "grad_norm": 0.26985424757003784, + "learning_rate": 0.0006, + "loss": 2.184, + "step": 53480 + }, + { + "epoch": 0.19952552539110585, + "grad_norm": 0.3054952621459961, + "learning_rate": 0.0006, + "loss": 2.3666, + "step": 53490 + }, + { + "epoch": 0.1995628268540692, + "grad_norm": 0.2928348481655121, + "learning_rate": 0.0006, + "loss": 2.3304, + "step": 53500 
+ }, + { + "epoch": 0.1995628268540692, + "eval_valid_loss": 2.1764261722564697, + "eval_valid_loss/all": 2.0406734943389893, + "eval_valid_loss/end_span": 1.198355793952942, + "eval_valid_perplexity/batch": 7.6957902908325195, + "eval_valid_perplexity/end_span": 3.314662456512451, + "eval_valid_perplexity/fim": 2.7621610164642334, + "eval_valid_perplexity/first_seq": 14.605639457702637, + "eval_valid_perplexity/last_seq": 8.753253936767578, + "eval_valid_perplexity/second_seq": 13.160262107849121, + "eval_valid_perplexity/seq": 8.67587947845459, + "eval_valid_reconstruction/all": 0.2984170913696289, + "eval_valid_reconstruction/end_span": 0.7170358300209045, + "eval_valid_reconstruction/fim": 0.20630362629890442, + "eval_valid_reconstruction/first_seq": 0.17388036847114563, + "eval_valid_reconstruction/last_seq": 0.3333089351654053, + "eval_valid_reconstruction/second_seq": 0.21061643958091736, + "eval_valid_runtime": 444.1356, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 53500 + }, + { + "epoch": 0.1995628268540692, + "eval_train_loss": 2.1773440837860107, + "eval_train_loss/all": 2.015195608139038, + "eval_train_loss/end_span": 1.1585947275161743, + "eval_train_perplexity/batch": 7.502194881439209, + "eval_train_perplexity/end_span": 3.1854536533355713, + "eval_train_perplexity/fim": 2.177999496459961, + "eval_train_perplexity/first_seq": 15.426129341125488, + "eval_train_perplexity/last_seq": 8.76156997680664, + "eval_train_perplexity/second_seq": 14.00898551940918, + "eval_train_perplexity/seq": 8.640881538391113, + "eval_train_reconstruction/all": 0.2872559428215027, + "eval_train_reconstruction/end_span": 0.729982316493988, + "eval_train_reconstruction/fim": 0.15821750462055206, + "eval_train_reconstruction/first_seq": 0.15468399226665497, + "eval_train_reconstruction/last_seq": 0.3266316056251526, + "eval_train_reconstruction/second_seq": 0.18787811696529388, + "eval_train_runtime": 436.6035, + 
"eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 53500 + }, + { + "epoch": 0.19960012831703258, + "grad_norm": 0.2682049870491028, + "learning_rate": 0.0006, + "loss": 2.2182, + "step": 53510 + }, + { + "epoch": 0.19963742977999596, + "grad_norm": 0.5394496917724609, + "learning_rate": 0.0006, + "loss": 2.2721, + "step": 53520 + }, + { + "epoch": 0.19967473124295934, + "grad_norm": 0.30722352862358093, + "learning_rate": 0.0006, + "loss": 2.1334, + "step": 53530 + }, + { + "epoch": 0.19971203270592272, + "grad_norm": 0.38351237773895264, + "learning_rate": 0.0006, + "loss": 2.1308, + "step": 53540 + }, + { + "epoch": 0.1997493341688861, + "grad_norm": 0.25544658303260803, + "learning_rate": 0.0006, + "loss": 2.2505, + "step": 53550 + }, + { + "epoch": 0.19978663563184948, + "grad_norm": 0.4261239767074585, + "learning_rate": 0.0006, + "loss": 2.2426, + "step": 53560 + }, + { + "epoch": 0.19982393709481286, + "grad_norm": 0.3747783303260803, + "learning_rate": 0.0006, + "loss": 2.1663, + "step": 53570 + }, + { + "epoch": 0.19986123855777624, + "grad_norm": 0.49263235926628113, + "learning_rate": 0.0006, + "loss": 2.1662, + "step": 53580 + }, + { + "epoch": 0.19989854002073962, + "grad_norm": 0.30766236782073975, + "learning_rate": 0.0006, + "loss": 2.313, + "step": 53590 + }, + { + "epoch": 0.199935841483703, + "grad_norm": 0.27877694368362427, + "learning_rate": 0.0006, + "loss": 2.1747, + "step": 53600 + }, + { + "epoch": 0.19997314294666638, + "grad_norm": 0.5099806785583496, + "learning_rate": 0.0006, + "loss": 2.0916, + "step": 53610 + }, + { + "epoch": 0.20001044440962976, + "grad_norm": 0.30646127462387085, + "learning_rate": 0.0006, + "loss": 2.1074, + "step": 53620 + }, + { + "epoch": 0.20004774587259314, + "grad_norm": 0.42200517654418945, + "learning_rate": 0.0006, + "loss": 2.2458, + "step": 53630 + }, + { + "epoch": 0.2000850473355565, + "grad_norm": 0.3143894374370575, + "learning_rate": 0.0006, + "loss": 2.2386, 
+ "step": 53640 + }, + { + "epoch": 0.20012234879851987, + "grad_norm": 0.5317075848579407, + "learning_rate": 0.0006, + "loss": 2.151, + "step": 53650 + }, + { + "epoch": 0.20015965026148325, + "grad_norm": 0.40617606043815613, + "learning_rate": 0.0006, + "loss": 2.2706, + "step": 53660 + }, + { + "epoch": 0.20019695172444663, + "grad_norm": 0.3302329480648041, + "learning_rate": 0.0006, + "loss": 2.2757, + "step": 53670 + }, + { + "epoch": 0.20023425318741, + "grad_norm": 0.3415656089782715, + "learning_rate": 0.0006, + "loss": 2.0812, + "step": 53680 + }, + { + "epoch": 0.20027155465037338, + "grad_norm": 0.25528475642204285, + "learning_rate": 0.0006, + "loss": 2.1789, + "step": 53690 + }, + { + "epoch": 0.20030885611333676, + "grad_norm": 0.2575214207172394, + "learning_rate": 0.0006, + "loss": 2.326, + "step": 53700 + }, + { + "epoch": 0.20034615757630014, + "grad_norm": 0.39173004031181335, + "learning_rate": 0.0006, + "loss": 2.0493, + "step": 53710 + }, + { + "epoch": 0.20038345903926352, + "grad_norm": 0.40590518712997437, + "learning_rate": 0.0006, + "loss": 2.1497, + "step": 53720 + }, + { + "epoch": 0.2004207605022269, + "grad_norm": 0.3198484480381012, + "learning_rate": 0.0006, + "loss": 2.1842, + "step": 53730 + }, + { + "epoch": 0.20045806196519028, + "grad_norm": 0.361364483833313, + "learning_rate": 0.0006, + "loss": 2.1941, + "step": 53740 + }, + { + "epoch": 0.20049536342815366, + "grad_norm": 0.27521175146102905, + "learning_rate": 0.0006, + "loss": 2.2482, + "step": 53750 + }, + { + "epoch": 0.20049536342815366, + "eval_valid_loss": 2.182955503463745, + "eval_valid_loss/all": 2.0468008518218994, + "eval_valid_loss/end_span": 1.1220492124557495, + "eval_valid_perplexity/batch": 7.7430901527404785, + "eval_valid_perplexity/end_span": 3.071141242980957, + "eval_valid_perplexity/fim": 2.1895291805267334, + "eval_valid_perplexity/first_seq": 14.812091827392578, + "eval_valid_perplexity/last_seq": 8.748586654663086, + 
"eval_valid_perplexity/second_seq": 14.116809844970703, + "eval_valid_perplexity/seq": 8.7310152053833, + "eval_valid_reconstruction/all": 0.2966415584087372, + "eval_valid_reconstruction/end_span": 0.7379588484764099, + "eval_valid_reconstruction/fim": 0.15807051956653595, + "eval_valid_reconstruction/first_seq": 0.17045724391937256, + "eval_valid_reconstruction/last_seq": 0.3347979187965393, + "eval_valid_reconstruction/second_seq": 0.18911701440811157, + "eval_valid_runtime": 446.828, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 53750 + }, + { + "epoch": 0.20049536342815366, + "eval_train_loss": 2.1814515590667725, + "eval_train_loss/all": 2.018803358078003, + "eval_train_loss/end_span": 1.0947917699813843, + "eval_train_perplexity/batch": 7.5293097496032715, + "eval_train_perplexity/end_span": 2.988560199737549, + "eval_train_perplexity/fim": 2.431884288787842, + "eval_train_perplexity/first_seq": 15.80590534210205, + "eval_train_perplexity/last_seq": 8.584013938903809, + "eval_train_perplexity/second_seq": 14.188939094543457, + "eval_train_perplexity/seq": 8.668286323547363, + "eval_train_reconstruction/all": 0.2860482931137085, + "eval_train_reconstruction/end_span": 0.7476136684417725, + "eval_train_reconstruction/fim": 0.18009190261363983, + "eval_train_reconstruction/first_seq": 0.1471969038248062, + "eval_train_reconstruction/last_seq": 0.33793869614601135, + "eval_train_reconstruction/second_seq": 0.18357177078723907, + "eval_train_runtime": 446.0317, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 53750 + }, + { + "epoch": 0.20053266489111704, + "grad_norm": 0.2771800756454468, + "learning_rate": 0.0006, + "loss": 2.2539, + "step": 53760 + }, + { + "epoch": 0.20056996635408042, + "grad_norm": 0.29174619913101196, + "learning_rate": 0.0006, + "loss": 2.3041, + "step": 53770 + }, + { + "epoch": 0.20060726781704377, + "grad_norm": 0.2998107373714447, + "learning_rate": 
0.0006, + "loss": 2.2055, + "step": 53780 + }, + { + "epoch": 0.20064456928000715, + "grad_norm": 0.35869792103767395, + "learning_rate": 0.0006, + "loss": 2.1772, + "step": 53790 + }, + { + "epoch": 0.20068187074297053, + "grad_norm": 0.32547369599342346, + "learning_rate": 0.0006, + "loss": 2.2017, + "step": 53800 + }, + { + "epoch": 0.2007191722059339, + "grad_norm": 0.4287964701652527, + "learning_rate": 0.0006, + "loss": 2.0842, + "step": 53810 + }, + { + "epoch": 0.2007564736688973, + "grad_norm": 0.3471231758594513, + "learning_rate": 0.0006, + "loss": 2.3163, + "step": 53820 + }, + { + "epoch": 0.20079377513186067, + "grad_norm": 0.3469502627849579, + "learning_rate": 0.0006, + "loss": 2.1139, + "step": 53830 + }, + { + "epoch": 0.20083107659482405, + "grad_norm": 0.5372893810272217, + "learning_rate": 0.0006, + "loss": 2.12, + "step": 53840 + }, + { + "epoch": 0.20086837805778743, + "grad_norm": 0.3582940995693207, + "learning_rate": 0.0006, + "loss": 2.1915, + "step": 53850 + }, + { + "epoch": 0.2009056795207508, + "grad_norm": 0.6034200191497803, + "learning_rate": 0.0006, + "loss": 2.2007, + "step": 53860 + }, + { + "epoch": 0.2009429809837142, + "grad_norm": 0.3340396285057068, + "learning_rate": 0.0006, + "loss": 2.0547, + "step": 53870 + }, + { + "epoch": 0.20098028244667757, + "grad_norm": 0.3551633358001709, + "learning_rate": 0.0006, + "loss": 2.283, + "step": 53880 + }, + { + "epoch": 0.20101758390964095, + "grad_norm": 0.36636123061180115, + "learning_rate": 0.0006, + "loss": 2.3089, + "step": 53890 + }, + { + "epoch": 0.20105488537260433, + "grad_norm": 0.37241581082344055, + "learning_rate": 0.0006, + "loss": 2.1297, + "step": 53900 + }, + { + "epoch": 0.2010921868355677, + "grad_norm": 0.3352634310722351, + "learning_rate": 0.0006, + "loss": 2.1704, + "step": 53910 + }, + { + "epoch": 0.20112948829853106, + "grad_norm": 0.3890755772590637, + "learning_rate": 0.0006, + "loss": 2.0874, + "step": 53920 + }, + { + "epoch": 0.20116678976149444, + 
"grad_norm": 0.3509451746940613, + "learning_rate": 0.0006, + "loss": 2.3223, + "step": 53930 + }, + { + "epoch": 0.20120409122445782, + "grad_norm": 0.3242673873901367, + "learning_rate": 0.0006, + "loss": 2.288, + "step": 53940 + }, + { + "epoch": 0.2012413926874212, + "grad_norm": 0.3235304057598114, + "learning_rate": 0.0006, + "loss": 2.2678, + "step": 53950 + }, + { + "epoch": 0.20127869415038457, + "grad_norm": 0.3397161066532135, + "learning_rate": 0.0006, + "loss": 2.2429, + "step": 53960 + }, + { + "epoch": 0.20131599561334795, + "grad_norm": 0.3529932498931885, + "learning_rate": 0.0006, + "loss": 2.3256, + "step": 53970 + }, + { + "epoch": 0.20135329707631133, + "grad_norm": 0.35403695702552795, + "learning_rate": 0.0006, + "loss": 2.1057, + "step": 53980 + }, + { + "epoch": 0.2013905985392747, + "grad_norm": 0.429228812456131, + "learning_rate": 0.0006, + "loss": 2.221, + "step": 53990 + }, + { + "epoch": 0.2014279000022381, + "grad_norm": 0.2693326771259308, + "learning_rate": 0.0006, + "loss": 2.3755, + "step": 54000 + }, + { + "epoch": 0.2014279000022381, + "eval_valid_loss": 2.1822609901428223, + "eval_valid_loss/all": 2.0461814403533936, + "eval_valid_loss/end_span": 1.1554559469223022, + "eval_valid_perplexity/batch": 7.738295555114746, + "eval_valid_perplexity/end_span": 3.1754708290100098, + "eval_valid_perplexity/fim": 2.1951284408569336, + "eval_valid_perplexity/first_seq": 14.669671058654785, + "eval_valid_perplexity/last_seq": 8.476649284362793, + "eval_valid_perplexity/second_seq": 13.85047435760498, + "eval_valid_perplexity/seq": 8.729400634765625, + "eval_valid_reconstruction/all": 0.29701635241508484, + "eval_valid_reconstruction/end_span": 0.7251898050308228, + "eval_valid_reconstruction/fim": 0.1582607924938202, + "eval_valid_reconstruction/first_seq": 0.17433500289916992, + "eval_valid_reconstruction/last_seq": 0.3478292226791382, + "eval_valid_reconstruction/second_seq": 0.2001808136701584, + "eval_valid_runtime": 441.2874, + 
"eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 54000 + }, + { + "epoch": 0.2014279000022381, + "eval_train_loss": 2.1801538467407227, + "eval_train_loss/all": 2.0180089473724365, + "eval_train_loss/end_span": 1.1118448972702026, + "eval_train_perplexity/batch": 7.5233306884765625, + "eval_train_perplexity/end_span": 3.039961576461792, + "eval_train_perplexity/fim": 2.024444580078125, + "eval_train_perplexity/first_seq": 15.364954948425293, + "eval_train_perplexity/last_seq": 8.884517669677734, + "eval_train_perplexity/second_seq": 14.358049392700195, + "eval_train_perplexity/seq": 8.664676666259766, + "eval_train_reconstruction/all": 0.2864725887775421, + "eval_train_reconstruction/end_span": 0.7366781830787659, + "eval_train_reconstruction/fim": 0.14279542863368988, + "eval_train_reconstruction/first_seq": 0.1546688973903656, + "eval_train_reconstruction/last_seq": 0.3300960659980774, + "eval_train_reconstruction/second_seq": 0.1799081712961197, + "eval_train_runtime": 444.3657, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 54000 + }, + { + "epoch": 0.20146520146520147, + "grad_norm": 0.33414891362190247, + "learning_rate": 0.0006, + "loss": 2.3759, + "step": 54010 + }, + { + "epoch": 0.20150250292816485, + "grad_norm": 0.4102835953235626, + "learning_rate": 0.0006, + "loss": 2.2537, + "step": 54020 + }, + { + "epoch": 0.20153980439112823, + "grad_norm": 0.27828267216682434, + "learning_rate": 0.0006, + "loss": 2.3903, + "step": 54030 + }, + { + "epoch": 0.2015771058540916, + "grad_norm": 0.3699483573436737, + "learning_rate": 0.0006, + "loss": 2.3369, + "step": 54040 + }, + { + "epoch": 0.20161440731705496, + "grad_norm": 0.3237399160861969, + "learning_rate": 0.0006, + "loss": 2.3995, + "step": 54050 + }, + { + "epoch": 0.20165170878001834, + "grad_norm": 0.46066394448280334, + "learning_rate": 0.0006, + "loss": 2.2349, + "step": 54060 + }, + { + "epoch": 
0.20168901024298172, + "grad_norm": 0.2833770513534546, + "learning_rate": 0.0006, + "loss": 2.2087, + "step": 54070 + }, + { + "epoch": 0.2017263117059451, + "grad_norm": 0.34982380270957947, + "learning_rate": 0.0006, + "loss": 2.1646, + "step": 54080 + }, + { + "epoch": 0.20176361316890848, + "grad_norm": 0.8483486175537109, + "learning_rate": 0.0006, + "loss": 2.1055, + "step": 54090 + }, + { + "epoch": 0.20180091463187186, + "grad_norm": 0.6539369821548462, + "learning_rate": 0.0006, + "loss": 2.0749, + "step": 54100 + }, + { + "epoch": 0.20183821609483524, + "grad_norm": 0.18097418546676636, + "learning_rate": 0.0006, + "loss": 2.2444, + "step": 54110 + }, + { + "epoch": 0.20187551755779862, + "grad_norm": 4.201321125030518, + "learning_rate": 0.0006, + "loss": 2.0407, + "step": 54120 + }, + { + "epoch": 0.201912819020762, + "grad_norm": 0.30793678760528564, + "learning_rate": 0.0006, + "loss": 2.3426, + "step": 54130 + }, + { + "epoch": 0.20195012048372538, + "grad_norm": 0.691378116607666, + "learning_rate": 0.0006, + "loss": 1.8892, + "step": 54140 + }, + { + "epoch": 0.20198742194668876, + "grad_norm": 0.3193868398666382, + "learning_rate": 0.0006, + "loss": 2.2432, + "step": 54150 + }, + { + "epoch": 0.20202472340965213, + "grad_norm": 0.2968359589576721, + "learning_rate": 0.0006, + "loss": 2.2995, + "step": 54160 + }, + { + "epoch": 0.20206202487261551, + "grad_norm": 0.4639488160610199, + "learning_rate": 0.0006, + "loss": 2.1139, + "step": 54170 + }, + { + "epoch": 0.2020993263355789, + "grad_norm": 0.270540326833725, + "learning_rate": 0.0006, + "loss": 2.3287, + "step": 54180 + }, + { + "epoch": 0.20213662779854225, + "grad_norm": 0.3838210105895996, + "learning_rate": 0.0006, + "loss": 2.1954, + "step": 54190 + }, + { + "epoch": 0.20217392926150562, + "grad_norm": 0.3311252295970917, + "learning_rate": 0.0006, + "loss": 2.1758, + "step": 54200 + }, + { + "epoch": 0.202211230724469, + "grad_norm": 0.29533183574676514, + "learning_rate": 0.0006, + 
"loss": 2.2091, + "step": 54210 + }, + { + "epoch": 0.20224853218743238, + "grad_norm": 0.3818018138408661, + "learning_rate": 0.0006, + "loss": 2.2022, + "step": 54220 + }, + { + "epoch": 0.20228583365039576, + "grad_norm": 0.3311423361301422, + "learning_rate": 0.0006, + "loss": 2.2894, + "step": 54230 + }, + { + "epoch": 0.20232313511335914, + "grad_norm": 0.3375598192214966, + "learning_rate": 0.0006, + "loss": 2.1688, + "step": 54240 + }, + { + "epoch": 0.20236043657632252, + "grad_norm": 0.44267717003822327, + "learning_rate": 0.0006, + "loss": 2.2229, + "step": 54250 + }, + { + "epoch": 0.20236043657632252, + "eval_valid_loss": 2.177704095840454, + "eval_valid_loss/all": 2.0418379306793213, + "eval_valid_loss/end_span": 1.2532591819763184, + "eval_valid_perplexity/batch": 7.704757213592529, + "eval_valid_perplexity/end_span": 3.501737117767334, + "eval_valid_perplexity/fim": 2.3877789974212646, + "eval_valid_perplexity/first_seq": 15.006786346435547, + "eval_valid_perplexity/last_seq": 8.825072288513184, + "eval_valid_perplexity/second_seq": 13.351000785827637, + "eval_valid_perplexity/seq": 8.687067031860352, + "eval_valid_reconstruction/all": 0.29797202348709106, + "eval_valid_reconstruction/end_span": 0.7025567293167114, + "eval_valid_reconstruction/fim": 0.17647409439086914, + "eval_valid_reconstruction/first_seq": 0.1623339056968689, + "eval_valid_reconstruction/last_seq": 0.3306848406791687, + "eval_valid_reconstruction/second_seq": 0.20527398586273193, + "eval_valid_runtime": 440.6101, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 54250 + }, + { + "epoch": 0.20236043657632252, + "eval_train_loss": 2.1786186695098877, + "eval_train_loss/all": 2.0163371562957764, + "eval_train_loss/end_span": 1.2089108228683472, + "eval_train_perplexity/batch": 7.510763645172119, + "eval_train_perplexity/end_span": 3.3498342037200928, + "eval_train_perplexity/fim": 2.0685019493103027, + "eval_train_perplexity/first_seq": 
15.753682136535645, + "eval_train_perplexity/last_seq": 8.981409072875977, + "eval_train_perplexity/second_seq": 14.065877914428711, + "eval_train_perplexity/seq": 8.644781112670898, + "eval_train_reconstruction/all": 0.28683605790138245, + "eval_train_reconstruction/end_span": 0.7164949774742126, + "eval_train_reconstruction/fim": 0.1487530767917633, + "eval_train_reconstruction/first_seq": 0.14527735114097595, + "eval_train_reconstruction/last_seq": 0.3273542523384094, + "eval_train_reconstruction/second_seq": 0.18936766684055328, + "eval_train_runtime": 439.8141, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 54250 + }, + { + "epoch": 0.2023977380392859, + "grad_norm": 0.29744938015937805, + "learning_rate": 0.0006, + "loss": 2.2336, + "step": 54260 + }, + { + "epoch": 0.20243503950224928, + "grad_norm": 0.32899779081344604, + "learning_rate": 0.0006, + "loss": 2.1818, + "step": 54270 + }, + { + "epoch": 0.20247234096521266, + "grad_norm": 0.5241740942001343, + "learning_rate": 0.0006, + "loss": 2.125, + "step": 54280 + }, + { + "epoch": 0.20250964242817604, + "grad_norm": 0.39946994185447693, + "learning_rate": 0.0006, + "loss": 2.208, + "step": 54290 + }, + { + "epoch": 0.20254694389113942, + "grad_norm": 0.36686891317367554, + "learning_rate": 0.0006, + "loss": 2.091, + "step": 54300 + }, + { + "epoch": 0.2025842453541028, + "grad_norm": 0.36290597915649414, + "learning_rate": 0.0006, + "loss": 2.3289, + "step": 54310 + }, + { + "epoch": 0.20262154681706618, + "grad_norm": 0.40078169107437134, + "learning_rate": 0.0006, + "loss": 2.1049, + "step": 54320 + }, + { + "epoch": 0.20265884828002953, + "grad_norm": 0.38226208090782166, + "learning_rate": 0.0006, + "loss": 2.38, + "step": 54330 + }, + { + "epoch": 0.2026961497429929, + "grad_norm": 0.3876839280128479, + "learning_rate": 0.0006, + "loss": 2.3568, + "step": 54340 + }, + { + "epoch": 0.2027334512059563, + "grad_norm": 0.41083404421806335, + "learning_rate": 
0.0006, + "loss": 2.2255, + "step": 54350 + }, + { + "epoch": 0.20277075266891967, + "grad_norm": 0.4862637221813202, + "learning_rate": 0.0006, + "loss": 2.1847, + "step": 54360 + }, + { + "epoch": 0.20280805413188305, + "grad_norm": 0.3114193379878998, + "learning_rate": 0.0006, + "loss": 2.0743, + "step": 54370 + }, + { + "epoch": 0.20284535559484643, + "grad_norm": 0.34589534997940063, + "learning_rate": 0.0006, + "loss": 2.1312, + "step": 54380 + }, + { + "epoch": 0.2028826570578098, + "grad_norm": 0.3409474790096283, + "learning_rate": 0.0006, + "loss": 2.2344, + "step": 54390 + }, + { + "epoch": 0.20291995852077319, + "grad_norm": 0.31943440437316895, + "learning_rate": 0.0006, + "loss": 2.1006, + "step": 54400 + }, + { + "epoch": 0.20295725998373657, + "grad_norm": 0.3286251425743103, + "learning_rate": 0.0006, + "loss": 2.3683, + "step": 54410 + }, + { + "epoch": 0.20299456144669994, + "grad_norm": 0.3325121998786926, + "learning_rate": 0.0006, + "loss": 2.058, + "step": 54420 + }, + { + "epoch": 0.20303186290966332, + "grad_norm": 0.3877923786640167, + "learning_rate": 0.0006, + "loss": 2.1541, + "step": 54430 + }, + { + "epoch": 0.2030691643726267, + "grad_norm": 0.37180468440055847, + "learning_rate": 0.0006, + "loss": 2.0967, + "step": 54440 + }, + { + "epoch": 0.20310646583559008, + "grad_norm": 0.39126843214035034, + "learning_rate": 0.0006, + "loss": 2.2787, + "step": 54450 + }, + { + "epoch": 0.20314376729855346, + "grad_norm": 0.35320720076560974, + "learning_rate": 0.0006, + "loss": 2.1841, + "step": 54460 + }, + { + "epoch": 0.20318106876151681, + "grad_norm": 0.2677922248840332, + "learning_rate": 0.0006, + "loss": 2.3655, + "step": 54470 + }, + { + "epoch": 0.2032183702244802, + "grad_norm": 0.328238844871521, + "learning_rate": 0.0006, + "loss": 2.2368, + "step": 54480 + }, + { + "epoch": 0.20325567168744357, + "grad_norm": 0.649256706237793, + "learning_rate": 0.0006, + "loss": 2.0759, + "step": 54490 + }, + { + "epoch": 0.20329297315040695, 
+ "grad_norm": 0.2656821012496948, + "learning_rate": 0.0006, + "loss": 2.3889, + "step": 54500 + }, + { + "epoch": 0.20329297315040695, + "eval_valid_loss": 2.179863452911377, + "eval_valid_loss/all": 2.0437662601470947, + "eval_valid_loss/end_span": 1.1853216886520386, + "eval_valid_perplexity/batch": 7.719628810882568, + "eval_valid_perplexity/end_span": 3.2717392444610596, + "eval_valid_perplexity/fim": 2.169403314590454, + "eval_valid_perplexity/first_seq": 14.8383150100708, + "eval_valid_perplexity/last_seq": 8.547830581665039, + "eval_valid_perplexity/second_seq": 13.708005905151367, + "eval_valid_perplexity/seq": 8.703829765319824, + "eval_valid_reconstruction/all": 0.2974853515625, + "eval_valid_reconstruction/end_span": 0.7289039492607117, + "eval_valid_reconstruction/fim": 0.15701745450496674, + "eval_valid_reconstruction/first_seq": 0.16934412717819214, + "eval_valid_reconstruction/last_seq": 0.33852773904800415, + "eval_valid_reconstruction/second_seq": 0.19645991921424866, + "eval_valid_runtime": 440.5895, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 54500 + }, + { + "epoch": 0.20329297315040695, + "eval_train_loss": 2.180255174636841, + "eval_train_loss/all": 2.017899513244629, + "eval_train_loss/end_span": 1.1426585912704468, + "eval_train_perplexity/batch": 7.522507190704346, + "eval_train_perplexity/end_span": 3.135092258453369, + "eval_train_perplexity/fim": 2.174713373184204, + "eval_train_perplexity/first_seq": 15.457802772521973, + "eval_train_perplexity/last_seq": 8.837773323059082, + "eval_train_perplexity/second_seq": 14.528935432434082, + "eval_train_perplexity/seq": 8.659356117248535, + "eval_train_reconstruction/all": 0.28641214966773987, + "eval_train_reconstruction/end_span": 0.7420097589492798, + "eval_train_reconstruction/fim": 0.15699845552444458, + "eval_train_reconstruction/first_seq": 0.15688017010688782, + "eval_train_reconstruction/last_seq": 0.32936131954193115, + 
"eval_train_reconstruction/second_seq": 0.1732192188501358, + "eval_train_runtime": 440.4511, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 54500 + }, + { + "epoch": 0.20333027461337033, + "grad_norm": 0.9219073057174683, + "learning_rate": 0.0006, + "loss": 2.0411, + "step": 54510 + }, + { + "epoch": 0.2033675760763337, + "grad_norm": 0.4243004620075226, + "learning_rate": 0.0006, + "loss": 2.2737, + "step": 54520 + }, + { + "epoch": 0.2034048775392971, + "grad_norm": 0.30707812309265137, + "learning_rate": 0.0006, + "loss": 2.2427, + "step": 54530 + }, + { + "epoch": 0.20344217900226047, + "grad_norm": 0.4264020621776581, + "learning_rate": 0.0006, + "loss": 2.106, + "step": 54540 + }, + { + "epoch": 0.20347948046522385, + "grad_norm": 0.3660132586956024, + "learning_rate": 0.0006, + "loss": 2.1303, + "step": 54550 + }, + { + "epoch": 0.20351678192818723, + "grad_norm": 0.2842302620410919, + "learning_rate": 0.0006, + "loss": 2.3298, + "step": 54560 + }, + { + "epoch": 0.2035540833911506, + "grad_norm": 0.3168628215789795, + "learning_rate": 0.0006, + "loss": 1.8564, + "step": 54570 + }, + { + "epoch": 0.203591384854114, + "grad_norm": 0.32742804288864136, + "learning_rate": 0.0006, + "loss": 2.0965, + "step": 54580 + }, + { + "epoch": 0.20362868631707737, + "grad_norm": 0.35934001207351685, + "learning_rate": 0.0006, + "loss": 2.2793, + "step": 54590 + }, + { + "epoch": 0.20366598778004075, + "grad_norm": 0.36119508743286133, + "learning_rate": 0.0006, + "loss": 2.118, + "step": 54600 + }, + { + "epoch": 0.2037032892430041, + "grad_norm": 0.31836217641830444, + "learning_rate": 0.0006, + "loss": 2.3219, + "step": 54610 + }, + { + "epoch": 0.20374059070596748, + "grad_norm": 0.30507805943489075, + "learning_rate": 0.0006, + "loss": 2.2016, + "step": 54620 + }, + { + "epoch": 0.20377789216893086, + "grad_norm": 0.30197542905807495, + "learning_rate": 0.0006, + "loss": 2.0051, + "step": 54630 + }, + { + "epoch": 
0.20381519363189424, + "grad_norm": 0.25365644693374634, + "learning_rate": 0.0006, + "loss": 2.3464, + "step": 54640 + }, + { + "epoch": 0.20385249509485762, + "grad_norm": 0.3397630751132965, + "learning_rate": 0.0006, + "loss": 2.1047, + "step": 54650 + }, + { + "epoch": 0.203889796557821, + "grad_norm": 0.3533913791179657, + "learning_rate": 0.0006, + "loss": 2.2004, + "step": 54660 + }, + { + "epoch": 0.20392709802078438, + "grad_norm": 0.3250032961368561, + "learning_rate": 0.0006, + "loss": 2.3817, + "step": 54670 + }, + { + "epoch": 0.20396439948374775, + "grad_norm": 0.4013751745223999, + "learning_rate": 0.0006, + "loss": 2.3006, + "step": 54680 + }, + { + "epoch": 0.20400170094671113, + "grad_norm": 0.3984266221523285, + "learning_rate": 0.0006, + "loss": 2.2549, + "step": 54690 + }, + { + "epoch": 0.2040390024096745, + "grad_norm": 0.38768821954727173, + "learning_rate": 0.0006, + "loss": 2.1117, + "step": 54700 + }, + { + "epoch": 0.2040763038726379, + "grad_norm": 0.32857584953308105, + "learning_rate": 0.0006, + "loss": 2.2096, + "step": 54710 + }, + { + "epoch": 0.20411360533560127, + "grad_norm": 0.3400741219520569, + "learning_rate": 0.0006, + "loss": 2.2737, + "step": 54720 + }, + { + "epoch": 0.20415090679856465, + "grad_norm": 0.3390045464038849, + "learning_rate": 0.0006, + "loss": 2.227, + "step": 54730 + }, + { + "epoch": 0.204188208261528, + "grad_norm": 0.44071707129478455, + "learning_rate": 0.0006, + "loss": 2.2241, + "step": 54740 + }, + { + "epoch": 0.20422550972449138, + "grad_norm": 0.35458675026893616, + "learning_rate": 0.0006, + "loss": 2.26, + "step": 54750 + }, + { + "epoch": 0.20422550972449138, + "eval_valid_loss": 2.1848526000976562, + "eval_valid_loss/all": 2.048117160797119, + "eval_valid_loss/end_span": 1.2017563581466675, + "eval_valid_perplexity/batch": 7.753289222717285, + "eval_valid_perplexity/end_span": 3.325953245162964, + "eval_valid_perplexity/fim": 2.3216843605041504, + "eval_valid_perplexity/first_seq": 
14.807790756225586, + "eval_valid_perplexity/last_seq": 8.775504112243652, + "eval_valid_perplexity/second_seq": 13.572407722473145, + "eval_valid_perplexity/seq": 8.736287117004395, + "eval_valid_reconstruction/all": 0.29604190587997437, + "eval_valid_reconstruction/end_span": 0.72165846824646, + "eval_valid_reconstruction/fim": 0.16982349753379822, + "eval_valid_reconstruction/first_seq": 0.16859690845012665, + "eval_valid_reconstruction/last_seq": 0.3349147140979767, + "eval_valid_reconstruction/second_seq": 0.19952796399593353, + "eval_valid_runtime": 441.7934, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 54750 + }, + { + "epoch": 0.20422550972449138, + "eval_train_loss": 2.181688070297241, + "eval_train_loss/all": 2.018552541732788, + "eval_train_loss/end_span": 1.1602253913879395, + "eval_train_perplexity/batch": 7.527421474456787, + "eval_train_perplexity/end_span": 3.190652370452881, + "eval_train_perplexity/fim": 1.9925156831741333, + "eval_train_perplexity/first_seq": 15.688055992126465, + "eval_train_perplexity/last_seq": 8.889037132263184, + "eval_train_perplexity/second_seq": 14.53506851196289, + "eval_train_perplexity/seq": 8.662007331848145, + "eval_train_reconstruction/all": 0.28600504994392395, + "eval_train_reconstruction/end_span": 0.7336496114730835, + "eval_train_reconstruction/fim": 0.13923999667167664, + "eval_train_reconstruction/first_seq": 0.1463850885629654, + "eval_train_reconstruction/last_seq": 0.3298642635345459, + "eval_train_reconstruction/second_seq": 0.17618627846240997, + "eval_train_runtime": 444.9423, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 54750 + }, + { + "epoch": 0.20426281118745476, + "grad_norm": 0.43524500727653503, + "learning_rate": 0.0006, + "loss": 2.2946, + "step": 54760 + }, + { + "epoch": 0.20430011265041814, + "grad_norm": 0.3794512450695038, + "learning_rate": 0.0006, + "loss": 2.1579, + "step": 54770 + }, + { + 
"epoch": 0.20433741411338152, + "grad_norm": 0.34354376792907715, + "learning_rate": 0.0006, + "loss": 2.3255, + "step": 54780 + }, + { + "epoch": 0.2043747155763449, + "grad_norm": 0.36338645219802856, + "learning_rate": 0.0006, + "loss": 2.118, + "step": 54790 + }, + { + "epoch": 0.20441201703930828, + "grad_norm": 0.2931148111820221, + "learning_rate": 0.0006, + "loss": 2.2783, + "step": 54800 + }, + { + "epoch": 0.20444931850227166, + "grad_norm": 0.36876341700553894, + "learning_rate": 0.0006, + "loss": 2.233, + "step": 54810 + }, + { + "epoch": 0.20448661996523504, + "grad_norm": 0.36601826548576355, + "learning_rate": 0.0006, + "loss": 2.1914, + "step": 54820 + }, + { + "epoch": 0.20452392142819842, + "grad_norm": 0.33226409554481506, + "learning_rate": 0.0006, + "loss": 2.153, + "step": 54830 + }, + { + "epoch": 0.2045612228911618, + "grad_norm": 0.3074072599411011, + "learning_rate": 0.0006, + "loss": 2.1697, + "step": 54840 + }, + { + "epoch": 0.20459852435412518, + "grad_norm": 0.32999566197395325, + "learning_rate": 0.0006, + "loss": 2.1129, + "step": 54850 + }, + { + "epoch": 0.20463582581708856, + "grad_norm": 0.27559566497802734, + "learning_rate": 0.0006, + "loss": 2.1086, + "step": 54860 + }, + { + "epoch": 0.20467312728005194, + "grad_norm": 0.29399779438972473, + "learning_rate": 0.0006, + "loss": 2.0973, + "step": 54870 + }, + { + "epoch": 0.2047104287430153, + "grad_norm": 0.21973200142383575, + "learning_rate": 0.0006, + "loss": 2.2174, + "step": 54880 + }, + { + "epoch": 0.20474773020597867, + "grad_norm": 0.40420764684677124, + "learning_rate": 0.0006, + "loss": 2.343, + "step": 54890 + }, + { + "epoch": 0.20478503166894205, + "grad_norm": 0.26102301478385925, + "learning_rate": 0.0006, + "loss": 2.2214, + "step": 54900 + }, + { + "epoch": 0.20482233313190543, + "grad_norm": 0.3743969798088074, + "learning_rate": 0.0006, + "loss": 2.1759, + "step": 54910 + }, + { + "epoch": 0.2048596345948688, + "grad_norm": 0.43121933937072754, + 
"learning_rate": 0.0006, + "loss": 2.133, + "step": 54920 + }, + { + "epoch": 0.20489693605783219, + "grad_norm": 0.3527165651321411, + "learning_rate": 0.0006, + "loss": 2.2976, + "step": 54930 + }, + { + "epoch": 0.20493423752079556, + "grad_norm": 0.3134365677833557, + "learning_rate": 0.0006, + "loss": 2.2143, + "step": 54940 + }, + { + "epoch": 0.20497153898375894, + "grad_norm": 0.2586596608161926, + "learning_rate": 0.0006, + "loss": 2.3921, + "step": 54950 + }, + { + "epoch": 0.20500884044672232, + "grad_norm": 0.27956143021583557, + "learning_rate": 0.0006, + "loss": 2.174, + "step": 54960 + }, + { + "epoch": 0.2050461419096857, + "grad_norm": 0.2746502161026001, + "learning_rate": 0.0006, + "loss": 2.0117, + "step": 54970 + }, + { + "epoch": 0.20508344337264908, + "grad_norm": 0.3261515498161316, + "learning_rate": 0.0006, + "loss": 2.2648, + "step": 54980 + }, + { + "epoch": 0.20512074483561246, + "grad_norm": 0.28617462515830994, + "learning_rate": 0.0006, + "loss": 2.0999, + "step": 54990 + }, + { + "epoch": 0.20515804629857584, + "grad_norm": 0.3547711670398712, + "learning_rate": 0.0006, + "loss": 2.2589, + "step": 55000 + }, + { + "epoch": 0.20515804629857584, + "eval_valid_loss": 2.1794941425323486, + "eval_valid_loss/all": 2.0438191890716553, + "eval_valid_loss/end_span": 1.2081516981124878, + "eval_valid_perplexity/batch": 7.720037460327148, + "eval_valid_perplexity/end_span": 3.347292184829712, + "eval_valid_perplexity/fim": 2.3034534454345703, + "eval_valid_perplexity/first_seq": 14.824047088623047, + "eval_valid_perplexity/last_seq": 8.916123390197754, + "eval_valid_perplexity/second_seq": 13.779428482055664, + "eval_valid_perplexity/seq": 8.71038818359375, + "eval_valid_reconstruction/all": 0.297405481338501, + "eval_valid_reconstruction/end_span": 0.7135427594184875, + "eval_valid_reconstruction/fim": 0.16911831498146057, + "eval_valid_reconstruction/first_seq": 0.1673458367586136, + "eval_valid_reconstruction/last_seq": 0.32709765434265137, 
+ "eval_valid_reconstruction/second_seq": 0.19586649537086487, + "eval_valid_runtime": 440.1374, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 55000 + }, + { + "epoch": 0.20515804629857584, + "eval_train_loss": 2.178668975830078, + "eval_train_loss/all": 2.0166943073272705, + "eval_train_loss/end_span": 1.175580620765686, + "eval_train_perplexity/batch": 7.513446807861328, + "eval_train_perplexity/end_span": 3.240023612976074, + "eval_train_perplexity/fim": 2.057996988296509, + "eval_train_perplexity/first_seq": 15.632491111755371, + "eval_train_perplexity/last_seq": 9.035785675048828, + "eval_train_perplexity/second_seq": 14.01255989074707, + "eval_train_perplexity/seq": 8.6575288772583, + "eval_train_reconstruction/all": 0.286677747964859, + "eval_train_reconstruction/end_span": 0.7229623198509216, + "eval_train_reconstruction/fim": 0.14665630459785461, + "eval_train_reconstruction/first_seq": 0.14879964292049408, + "eval_train_reconstruction/last_seq": 0.32322263717651367, + "eval_train_reconstruction/second_seq": 0.18592391908168793, + "eval_train_runtime": 443.1497, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 55000 + }, + { + "epoch": 0.20519534776153922, + "grad_norm": 0.5078951716423035, + "learning_rate": 0.0006, + "loss": 2.2608, + "step": 55010 + }, + { + "epoch": 0.20523264922450257, + "grad_norm": 0.30351895093917847, + "learning_rate": 0.0006, + "loss": 2.2623, + "step": 55020 + }, + { + "epoch": 0.20526995068746595, + "grad_norm": 0.24858303368091583, + "learning_rate": 0.0006, + "loss": 2.0413, + "step": 55030 + }, + { + "epoch": 0.20530725215042933, + "grad_norm": 0.23757779598236084, + "learning_rate": 0.0006, + "loss": 2.2726, + "step": 55040 + }, + { + "epoch": 0.2053445536133927, + "grad_norm": 0.32173144817352295, + "learning_rate": 0.0006, + "loss": 2.1742, + "step": 55050 + }, + { + "epoch": 0.2053818550763561, + "grad_norm": 0.23359790444374084, + 
"learning_rate": 0.0006, + "loss": 2.1149, + "step": 55060 + }, + { + "epoch": 0.20541915653931947, + "grad_norm": 0.44186294078826904, + "learning_rate": 0.0006, + "loss": 2.2491, + "step": 55070 + }, + { + "epoch": 0.20545645800228285, + "grad_norm": 0.3723406493663788, + "learning_rate": 0.0006, + "loss": 2.2941, + "step": 55080 + }, + { + "epoch": 0.20549375946524623, + "grad_norm": 0.40449258685112, + "learning_rate": 0.0006, + "loss": 2.3021, + "step": 55090 + }, + { + "epoch": 0.2055310609282096, + "grad_norm": 0.3536790609359741, + "learning_rate": 0.0006, + "loss": 2.4052, + "step": 55100 + }, + { + "epoch": 0.205568362391173, + "grad_norm": 0.369706928730011, + "learning_rate": 0.0006, + "loss": 2.1933, + "step": 55110 + }, + { + "epoch": 0.20560566385413637, + "grad_norm": 0.30936112999916077, + "learning_rate": 0.0006, + "loss": 2.2931, + "step": 55120 + }, + { + "epoch": 0.20564296531709975, + "grad_norm": 0.3328295052051544, + "learning_rate": 0.0006, + "loss": 2.3631, + "step": 55130 + }, + { + "epoch": 0.20568026678006313, + "grad_norm": 0.41726449131965637, + "learning_rate": 0.0006, + "loss": 2.2154, + "step": 55140 + }, + { + "epoch": 0.2057175682430265, + "grad_norm": 0.3090302348136902, + "learning_rate": 0.0006, + "loss": 2.2884, + "step": 55150 + }, + { + "epoch": 0.20575486970598986, + "grad_norm": 0.3059858977794647, + "learning_rate": 0.0006, + "loss": 2.2848, + "step": 55160 + }, + { + "epoch": 0.20579217116895324, + "grad_norm": 0.24757739901542664, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 55170 + }, + { + "epoch": 0.20582947263191662, + "grad_norm": 0.2806149125099182, + "learning_rate": 0.0006, + "loss": 2.1416, + "step": 55180 + }, + { + "epoch": 0.20586677409488, + "grad_norm": 0.23725293576717377, + "learning_rate": 0.0006, + "loss": 2.2053, + "step": 55190 + }, + { + "epoch": 0.20590407555784337, + "grad_norm": 0.36625391244888306, + "learning_rate": 0.0006, + "loss": 2.1505, + "step": 55200 + }, + { + "epoch": 
0.20594137702080675, + "grad_norm": 0.4984239935874939, + "learning_rate": 0.0006, + "loss": 2.1186, + "step": 55210 + }, + { + "epoch": 0.20597867848377013, + "grad_norm": 0.44061610102653503, + "learning_rate": 0.0006, + "loss": 2.0601, + "step": 55220 + }, + { + "epoch": 0.2060159799467335, + "grad_norm": 0.28944113850593567, + "learning_rate": 0.0006, + "loss": 2.2737, + "step": 55230 + }, + { + "epoch": 0.2060532814096969, + "grad_norm": 0.3567514419555664, + "learning_rate": 0.0006, + "loss": 2.2644, + "step": 55240 + }, + { + "epoch": 0.20609058287266027, + "grad_norm": 0.3782895803451538, + "learning_rate": 0.0006, + "loss": 2.1445, + "step": 55250 + }, + { + "epoch": 0.20609058287266027, + "eval_valid_loss": 2.177166223526001, + "eval_valid_loss/all": 2.0412814617156982, + "eval_valid_loss/end_span": 1.2786731719970703, + "eval_valid_perplexity/batch": 7.700470924377441, + "eval_valid_perplexity/end_span": 3.5918707847595215, + "eval_valid_perplexity/fim": 2.332669734954834, + "eval_valid_perplexity/first_seq": 15.007362365722656, + "eval_valid_perplexity/last_seq": 8.812898635864258, + "eval_valid_perplexity/second_seq": 13.954976081848145, + "eval_valid_perplexity/seq": 8.684101104736328, + "eval_valid_reconstruction/all": 0.2981921136379242, + "eval_valid_reconstruction/end_span": 0.698593258857727, + "eval_valid_reconstruction/fim": 0.17236150801181793, + "eval_valid_reconstruction/first_seq": 0.16615059971809387, + "eval_valid_reconstruction/last_seq": 0.33190175890922546, + "eval_valid_reconstruction/second_seq": 0.19446662068367004, + "eval_valid_runtime": 444.0571, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 55250 + }, + { + "epoch": 0.20609058287266027, + "eval_train_loss": 2.1784451007843018, + "eval_train_loss/all": 2.0164856910705566, + "eval_train_loss/end_span": 1.2443552017211914, + "eval_train_perplexity/batch": 7.5118794441223145, + "eval_train_perplexity/end_span": 3.470696210861206, + 
"eval_train_perplexity/fim": 2.238959312438965, + "eval_train_perplexity/first_seq": 15.478494644165039, + "eval_train_perplexity/last_seq": 8.547767639160156, + "eval_train_perplexity/second_seq": 14.469922065734863, + "eval_train_perplexity/seq": 8.650843620300293, + "eval_train_reconstruction/all": 0.28674373030662537, + "eval_train_reconstruction/end_span": 0.7074264883995056, + "eval_train_reconstruction/fim": 0.16342417895793915, + "eval_train_reconstruction/first_seq": 0.153774231672287, + "eval_train_reconstruction/last_seq": 0.34323447942733765, + "eval_train_reconstruction/second_seq": 0.1758776307106018, + "eval_train_runtime": 443.8653, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 55250 + }, + { + "epoch": 0.20612788433562365, + "grad_norm": 0.3210705518722534, + "learning_rate": 0.0006, + "loss": 2.2489, + "step": 55260 + }, + { + "epoch": 0.20616518579858703, + "grad_norm": 0.5110362768173218, + "learning_rate": 0.0006, + "loss": 2.3331, + "step": 55270 + }, + { + "epoch": 0.2062024872615504, + "grad_norm": 0.4025651216506958, + "learning_rate": 0.0006, + "loss": 1.9909, + "step": 55280 + }, + { + "epoch": 0.20623978872451376, + "grad_norm": 0.39614561200141907, + "learning_rate": 0.0006, + "loss": 2.0214, + "step": 55290 + }, + { + "epoch": 0.20627709018747714, + "grad_norm": 0.24664440751075745, + "learning_rate": 0.0006, + "loss": 2.3552, + "step": 55300 + }, + { + "epoch": 0.20631439165044052, + "grad_norm": 0.3284165859222412, + "learning_rate": 0.0006, + "loss": 2.3585, + "step": 55310 + }, + { + "epoch": 0.2063516931134039, + "grad_norm": 0.3988805115222931, + "learning_rate": 0.0006, + "loss": 2.1921, + "step": 55320 + }, + { + "epoch": 0.20638899457636728, + "grad_norm": 0.2566421926021576, + "learning_rate": 0.0006, + "loss": 2.2977, + "step": 55330 + }, + { + "epoch": 0.20642629603933066, + "grad_norm": 0.2983056902885437, + "learning_rate": 0.0006, + "loss": 2.2567, + "step": 55340 + }, + { + 
"epoch": 0.20646359750229404, + "grad_norm": 0.37748733162879944, + "learning_rate": 0.0006, + "loss": 2.1067, + "step": 55350 + }, + { + "epoch": 0.20650089896525742, + "grad_norm": 0.3277397155761719, + "learning_rate": 0.0006, + "loss": 2.2093, + "step": 55360 + }, + { + "epoch": 0.2065382004282208, + "grad_norm": 0.31706541776657104, + "learning_rate": 0.0006, + "loss": 2.1145, + "step": 55370 + }, + { + "epoch": 0.20657550189118418, + "grad_norm": 0.30292290449142456, + "learning_rate": 0.0006, + "loss": 2.1732, + "step": 55380 + }, + { + "epoch": 0.20661280335414756, + "grad_norm": 0.2546093165874481, + "learning_rate": 0.0006, + "loss": 2.2616, + "step": 55390 + }, + { + "epoch": 0.20665010481711094, + "grad_norm": 0.2874845266342163, + "learning_rate": 0.0006, + "loss": 2.2297, + "step": 55400 + }, + { + "epoch": 0.20668740628007432, + "grad_norm": 0.36553576588630676, + "learning_rate": 0.0006, + "loss": 2.1319, + "step": 55410 + }, + { + "epoch": 0.2067247077430377, + "grad_norm": 0.4528818726539612, + "learning_rate": 0.0006, + "loss": 2.3997, + "step": 55420 + }, + { + "epoch": 0.20676200920600105, + "grad_norm": 0.22112303972244263, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 55430 + }, + { + "epoch": 0.20679931066896443, + "grad_norm": 0.3699561059474945, + "learning_rate": 0.0006, + "loss": 2.2951, + "step": 55440 + }, + { + "epoch": 0.2068366121319278, + "grad_norm": 0.30310359597206116, + "learning_rate": 0.0006, + "loss": 2.0639, + "step": 55450 + }, + { + "epoch": 0.20687391359489118, + "grad_norm": 0.2585335671901703, + "learning_rate": 0.0006, + "loss": 2.1946, + "step": 55460 + }, + { + "epoch": 0.20691121505785456, + "grad_norm": 0.2606963813304901, + "learning_rate": 0.0006, + "loss": 2.225, + "step": 55470 + }, + { + "epoch": 0.20694851652081794, + "grad_norm": 0.39504143595695496, + "learning_rate": 0.0006, + "loss": 2.173, + "step": 55480 + }, + { + "epoch": 0.20698581798378132, + "grad_norm": 0.2605018615722656, + 
"learning_rate": 0.0006, + "loss": 2.2306, + "step": 55490 + }, + { + "epoch": 0.2070231194467447, + "grad_norm": 0.3538898825645447, + "learning_rate": 0.0006, + "loss": 2.3058, + "step": 55500 + }, + { + "epoch": 0.2070231194467447, + "eval_valid_loss": 2.17537522315979, + "eval_valid_loss/all": 2.0397531986236572, + "eval_valid_loss/end_span": 1.2114982604980469, + "eval_valid_perplexity/batch": 7.688711166381836, + "eval_valid_perplexity/end_span": 3.3585128784179688, + "eval_valid_perplexity/fim": 2.178274393081665, + "eval_valid_perplexity/first_seq": 14.880431175231934, + "eval_valid_perplexity/last_seq": 8.685070991516113, + "eval_valid_perplexity/second_seq": 14.012396812438965, + "eval_valid_perplexity/seq": 8.670465469360352, + "eval_valid_reconstruction/all": 0.2986797094345093, + "eval_valid_reconstruction/end_span": 0.7135322690010071, + "eval_valid_reconstruction/fim": 0.15803349018096924, + "eval_valid_reconstruction/first_seq": 0.1671207845211029, + "eval_valid_reconstruction/last_seq": 0.3374118208885193, + "eval_valid_reconstruction/second_seq": 0.19121822714805603, + "eval_valid_runtime": 440.4852, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 55500 + }, + { + "epoch": 0.2070231194467447, + "eval_train_loss": 2.1748082637786865, + "eval_train_loss/all": 2.013009786605835, + "eval_train_loss/end_span": 1.1805635690689087, + "eval_train_perplexity/batch": 7.485814094543457, + "eval_train_perplexity/end_span": 3.256208896636963, + "eval_train_perplexity/fim": 1.914475679397583, + "eval_train_perplexity/first_seq": 15.388205528259277, + "eval_train_perplexity/last_seq": 9.078673362731934, + "eval_train_perplexity/second_seq": 14.083561897277832, + "eval_train_perplexity/seq": 8.617992401123047, + "eval_train_reconstruction/all": 0.28772592544555664, + "eval_train_reconstruction/end_span": 0.7239623665809631, + "eval_train_reconstruction/fim": 0.13141082227230072, + "eval_train_reconstruction/first_seq": 
0.15487943589687347, + "eval_train_reconstruction/last_seq": 0.3211461007595062, + "eval_train_reconstruction/second_seq": 0.18691225349903107, + "eval_train_runtime": 440.4227, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 55500 + }, + { + "epoch": 0.20706042090970808, + "grad_norm": 0.3646049499511719, + "learning_rate": 0.0006, + "loss": 2.1212, + "step": 55510 + }, + { + "epoch": 0.20709772237267146, + "grad_norm": 0.4735606908798218, + "learning_rate": 0.0006, + "loss": 2.1688, + "step": 55520 + }, + { + "epoch": 0.20713502383563484, + "grad_norm": 0.24440285563468933, + "learning_rate": 0.0006, + "loss": 2.1618, + "step": 55530 + }, + { + "epoch": 0.20717232529859822, + "grad_norm": 0.4392983615398407, + "learning_rate": 0.0006, + "loss": 2.2755, + "step": 55540 + }, + { + "epoch": 0.2072096267615616, + "grad_norm": 0.37922370433807373, + "learning_rate": 0.0006, + "loss": 2.2451, + "step": 55550 + }, + { + "epoch": 0.20724692822452498, + "grad_norm": 0.3461383879184723, + "learning_rate": 0.0006, + "loss": 2.1562, + "step": 55560 + }, + { + "epoch": 0.20728422968748833, + "grad_norm": 0.33950644731521606, + "learning_rate": 0.0006, + "loss": 2.2834, + "step": 55570 + }, + { + "epoch": 0.2073215311504517, + "grad_norm": 0.30801868438720703, + "learning_rate": 0.0006, + "loss": 2.1821, + "step": 55580 + }, + { + "epoch": 0.2073588326134151, + "grad_norm": 0.39866262674331665, + "learning_rate": 0.0006, + "loss": 2.337, + "step": 55590 + }, + { + "epoch": 0.20739613407637847, + "grad_norm": 2.0423882007598877, + "learning_rate": 0.0006, + "loss": 1.9054, + "step": 55600 + }, + { + "epoch": 0.20743343553934185, + "grad_norm": 0.3128279745578766, + "learning_rate": 0.0006, + "loss": 2.2503, + "step": 55610 + }, + { + "epoch": 0.20747073700230523, + "grad_norm": 0.3663748502731323, + "learning_rate": 0.0006, + "loss": 2.1948, + "step": 55620 + }, + { + "epoch": 0.2075080384652686, + "grad_norm": 0.3836769461631775, + 
"learning_rate": 0.0006, + "loss": 2.0939, + "step": 55630 + }, + { + "epoch": 0.207545339928232, + "grad_norm": 0.6511574387550354, + "learning_rate": 0.0006, + "loss": 2.1638, + "step": 55640 + }, + { + "epoch": 0.20758264139119537, + "grad_norm": 0.42864343523979187, + "learning_rate": 0.0006, + "loss": 2.1354, + "step": 55650 + }, + { + "epoch": 0.20761994285415875, + "grad_norm": 0.3116806745529175, + "learning_rate": 0.0006, + "loss": 2.2224, + "step": 55660 + }, + { + "epoch": 0.20765724431712212, + "grad_norm": 0.24729323387145996, + "learning_rate": 0.0006, + "loss": 2.3037, + "step": 55670 + }, + { + "epoch": 0.2076945457800855, + "grad_norm": 0.22471295297145844, + "learning_rate": 0.0006, + "loss": 2.3036, + "step": 55680 + }, + { + "epoch": 0.20773184724304888, + "grad_norm": 0.3646088242530823, + "learning_rate": 0.0006, + "loss": 2.1792, + "step": 55690 + }, + { + "epoch": 0.20776914870601226, + "grad_norm": 0.26629236340522766, + "learning_rate": 0.0006, + "loss": 2.1239, + "step": 55700 + }, + { + "epoch": 0.20780645016897561, + "grad_norm": 0.3670457601547241, + "learning_rate": 0.0006, + "loss": 2.1552, + "step": 55710 + }, + { + "epoch": 0.207843751631939, + "grad_norm": 0.41176077723503113, + "learning_rate": 0.0006, + "loss": 2.2028, + "step": 55720 + }, + { + "epoch": 0.20788105309490237, + "grad_norm": 0.3462364673614502, + "learning_rate": 0.0006, + "loss": 2.3059, + "step": 55730 + }, + { + "epoch": 0.20791835455786575, + "grad_norm": 0.2533428966999054, + "learning_rate": 0.0006, + "loss": 2.3068, + "step": 55740 + }, + { + "epoch": 0.20795565602082913, + "grad_norm": 0.5509130358695984, + "learning_rate": 0.0006, + "loss": 2.2321, + "step": 55750 + }, + { + "epoch": 0.20795565602082913, + "eval_valid_loss": 2.177504539489746, + "eval_valid_loss/all": 2.0416605472564697, + "eval_valid_loss/end_span": 1.2193478345870972, + "eval_valid_perplexity/batch": 7.703390598297119, + "eval_valid_perplexity/end_span": 3.384979486465454, + 
"eval_valid_perplexity/fim": 2.6936612129211426, + "eval_valid_perplexity/first_seq": 14.788622856140137, + "eval_valid_perplexity/last_seq": 8.666239738464355, + "eval_valid_perplexity/second_seq": 13.716204643249512, + "eval_valid_perplexity/seq": 8.68313980102539, + "eval_valid_reconstruction/all": 0.2978941798210144, + "eval_valid_reconstruction/end_span": 0.7145382165908813, + "eval_valid_reconstruction/fim": 0.20035915076732635, + "eval_valid_reconstruction/first_seq": 0.1722642183303833, + "eval_valid_reconstruction/last_seq": 0.34063491225242615, + "eval_valid_reconstruction/second_seq": 0.19491928815841675, + "eval_valid_runtime": 443.8131, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 55750 + }, + { + "epoch": 0.20795565602082913, + "eval_train_loss": 2.1760916709899902, + "eval_train_loss/all": 2.0140905380249023, + "eval_train_loss/end_span": 1.186436653137207, + "eval_train_perplexity/batch": 7.493908882141113, + "eval_train_perplexity/end_span": 3.2753889560699463, + "eval_train_perplexity/fim": 2.0395150184631348, + "eval_train_perplexity/first_seq": 15.401712417602539, + "eval_train_perplexity/last_seq": 8.510388374328613, + "eval_train_perplexity/second_seq": 13.78048038482666, + "eval_train_perplexity/seq": 8.625619888305664, + "eval_train_reconstruction/all": 0.2872962951660156, + "eval_train_reconstruction/end_span": 0.7233785390853882, + "eval_train_reconstruction/fim": 0.1446002721786499, + "eval_train_reconstruction/first_seq": 0.15468156337738037, + "eval_train_reconstruction/last_seq": 0.3402956426143646, + "eval_train_reconstruction/second_seq": 0.1962471604347229, + "eval_train_runtime": 441.1973, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 55750 + }, + { + "epoch": 0.2079929574837925, + "grad_norm": 0.47166913747787476, + "learning_rate": 0.0006, + "loss": 2.1662, + "step": 55760 + }, + { + "epoch": 0.2080302589467559, + "grad_norm": 
0.4897453188896179, + "learning_rate": 0.0006, + "loss": 2.1219, + "step": 55770 + }, + { + "epoch": 0.20806756040971927, + "grad_norm": 0.43817266821861267, + "learning_rate": 0.0006, + "loss": 2.0275, + "step": 55780 + }, + { + "epoch": 0.20810486187268265, + "grad_norm": 0.42079463601112366, + "learning_rate": 0.0006, + "loss": 2.0964, + "step": 55790 + }, + { + "epoch": 0.20814216333564603, + "grad_norm": 0.32552745938301086, + "learning_rate": 0.0006, + "loss": 2.1502, + "step": 55800 + }, + { + "epoch": 0.2081794647986094, + "grad_norm": 0.25174808502197266, + "learning_rate": 0.0006, + "loss": 2.2692, + "step": 55810 + }, + { + "epoch": 0.2082167662615728, + "grad_norm": 0.5540134906768799, + "learning_rate": 0.0006, + "loss": 2.1866, + "step": 55820 + }, + { + "epoch": 0.20825406772453617, + "grad_norm": 0.3739074766635895, + "learning_rate": 0.0006, + "loss": 2.1634, + "step": 55830 + }, + { + "epoch": 0.20829136918749952, + "grad_norm": 0.22320100665092468, + "learning_rate": 0.0006, + "loss": 2.1536, + "step": 55840 + }, + { + "epoch": 0.2083286706504629, + "grad_norm": 0.32174715399742126, + "learning_rate": 0.0006, + "loss": 2.2175, + "step": 55850 + }, + { + "epoch": 0.20836597211342628, + "grad_norm": 0.5281128883361816, + "learning_rate": 0.0006, + "loss": 2.2263, + "step": 55860 + }, + { + "epoch": 0.20840327357638966, + "grad_norm": 0.22788523137569427, + "learning_rate": 0.0006, + "loss": 2.2764, + "step": 55870 + }, + { + "epoch": 0.20844057503935304, + "grad_norm": 0.3799770176410675, + "learning_rate": 0.0006, + "loss": 2.2741, + "step": 55880 + }, + { + "epoch": 0.20847787650231642, + "grad_norm": 0.4089408814907074, + "learning_rate": 0.0006, + "loss": 1.9954, + "step": 55890 + }, + { + "epoch": 0.2085151779652798, + "grad_norm": 0.31036505103111267, + "learning_rate": 0.0006, + "loss": 2.2129, + "step": 55900 + }, + { + "epoch": 0.20855247942824318, + "grad_norm": 0.3365548253059387, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 
55910 + }, + { + "epoch": 0.20858978089120656, + "grad_norm": 0.25895825028419495, + "learning_rate": 0.0006, + "loss": 2.0596, + "step": 55920 + }, + { + "epoch": 0.20862708235416993, + "grad_norm": 0.3457549810409546, + "learning_rate": 0.0006, + "loss": 2.0281, + "step": 55930 + }, + { + "epoch": 0.20866438381713331, + "grad_norm": 0.5311142802238464, + "learning_rate": 0.0006, + "loss": 2.3148, + "step": 55940 + }, + { + "epoch": 0.2087016852800967, + "grad_norm": 0.33807170391082764, + "learning_rate": 0.0006, + "loss": 2.2901, + "step": 55950 + }, + { + "epoch": 0.20873898674306007, + "grad_norm": 0.3653275966644287, + "learning_rate": 0.0006, + "loss": 2.1956, + "step": 55960 + }, + { + "epoch": 0.20877628820602345, + "grad_norm": 0.2541240155696869, + "learning_rate": 0.0006, + "loss": 2.2257, + "step": 55970 + }, + { + "epoch": 0.2088135896689868, + "grad_norm": 0.5229631066322327, + "learning_rate": 0.0006, + "loss": 2.1359, + "step": 55980 + }, + { + "epoch": 0.20885089113195018, + "grad_norm": 0.24947217106819153, + "learning_rate": 0.0006, + "loss": 2.3265, + "step": 55990 + }, + { + "epoch": 0.20888819259491356, + "grad_norm": 0.28267210721969604, + "learning_rate": 0.0006, + "loss": 2.1967, + "step": 56000 + }, + { + "epoch": 0.20888819259491356, + "eval_valid_loss": 2.177217483520508, + "eval_valid_loss/all": 2.0417470932006836, + "eval_valid_loss/end_span": 1.259060025215149, + "eval_valid_perplexity/batch": 7.704057216644287, + "eval_valid_perplexity/end_span": 3.522109270095825, + "eval_valid_perplexity/fim": 2.157947063446045, + "eval_valid_perplexity/first_seq": 14.016820907592773, + "eval_valid_perplexity/last_seq": 8.705161094665527, + "eval_valid_perplexity/second_seq": 13.756139755249023, + "eval_valid_perplexity/seq": 8.691021919250488, + "eval_valid_reconstruction/all": 0.29804766178131104, + "eval_valid_reconstruction/end_span": 0.7047647833824158, + "eval_valid_reconstruction/fim": 0.1560671329498291, + 
"eval_valid_reconstruction/first_seq": 0.18599171936511993, + "eval_valid_reconstruction/last_seq": 0.3352336287498474, + "eval_valid_reconstruction/second_seq": 0.1946183741092682, + "eval_valid_runtime": 438.6076, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 56000 + }, + { + "epoch": 0.20888819259491356, + "eval_train_loss": 2.175811529159546, + "eval_train_loss/all": 2.014185905456543, + "eval_train_loss/end_span": 1.2263604402542114, + "eval_train_perplexity/batch": 7.49462366104126, + "eval_train_perplexity/end_span": 3.4088003635406494, + "eval_train_perplexity/fim": 2.111495018005371, + "eval_train_perplexity/first_seq": 15.356964111328125, + "eval_train_perplexity/last_seq": 8.833791732788086, + "eval_train_perplexity/second_seq": 13.85441780090332, + "eval_train_perplexity/seq": 8.632195472717285, + "eval_train_reconstruction/all": 0.28743991255760193, + "eval_train_reconstruction/end_span": 0.7148851752281189, + "eval_train_reconstruction/fim": 0.15130221843719482, + "eval_train_reconstruction/first_seq": 0.1567503958940506, + "eval_train_reconstruction/last_seq": 0.3321973383426666, + "eval_train_reconstruction/second_seq": 0.19074232876300812, + "eval_train_runtime": 442.2265, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 56000 + }, + { + "epoch": 0.20892549405787694, + "grad_norm": 0.4547506868839264, + "learning_rate": 0.0006, + "loss": 2.0248, + "step": 56010 + }, + { + "epoch": 0.20896279552084032, + "grad_norm": 1.716042160987854, + "learning_rate": 0.0006, + "loss": 2.0575, + "step": 56020 + }, + { + "epoch": 0.2090000969838037, + "grad_norm": 0.3170698285102844, + "learning_rate": 0.0006, + "loss": 2.224, + "step": 56030 + }, + { + "epoch": 0.20903739844676708, + "grad_norm": 0.25163593888282776, + "learning_rate": 0.0006, + "loss": 2.1393, + "step": 56040 + }, + { + "epoch": 0.20907469990973046, + "grad_norm": 0.4523600935935974, + "learning_rate": 
0.0006, + "loss": 2.3808, + "step": 56050 + }, + { + "epoch": 0.20911200137269384, + "grad_norm": 0.27600735425949097, + "learning_rate": 0.0006, + "loss": 2.2803, + "step": 56060 + }, + { + "epoch": 0.20914930283565722, + "grad_norm": 0.4858933687210083, + "learning_rate": 0.0006, + "loss": 2.2699, + "step": 56070 + }, + { + "epoch": 0.2091866042986206, + "grad_norm": 0.3571852147579193, + "learning_rate": 0.0006, + "loss": 2.1786, + "step": 56080 + }, + { + "epoch": 0.20922390576158398, + "grad_norm": 0.29971882700920105, + "learning_rate": 0.0006, + "loss": 2.2869, + "step": 56090 + }, + { + "epoch": 0.20926120722454736, + "grad_norm": 0.358707070350647, + "learning_rate": 0.0006, + "loss": 2.1469, + "step": 56100 + }, + { + "epoch": 0.20929850868751074, + "grad_norm": 0.3903743624687195, + "learning_rate": 0.0006, + "loss": 2.0039, + "step": 56110 + }, + { + "epoch": 0.2093358101504741, + "grad_norm": 0.414621502161026, + "learning_rate": 0.0006, + "loss": 2.3051, + "step": 56120 + }, + { + "epoch": 0.20937311161343747, + "grad_norm": 0.2894119918346405, + "learning_rate": 0.0006, + "loss": 2.2831, + "step": 56130 + }, + { + "epoch": 0.20941041307640085, + "grad_norm": 0.48330390453338623, + "learning_rate": 0.0006, + "loss": 2.0432, + "step": 56140 + }, + { + "epoch": 0.20944771453936423, + "grad_norm": 0.405452162027359, + "learning_rate": 0.0006, + "loss": 2.2333, + "step": 56150 + }, + { + "epoch": 0.2094850160023276, + "grad_norm": 0.2861316502094269, + "learning_rate": 0.0006, + "loss": 2.3812, + "step": 56160 + }, + { + "epoch": 0.20952231746529099, + "grad_norm": 0.38080063462257385, + "learning_rate": 0.0006, + "loss": 2.2422, + "step": 56170 + }, + { + "epoch": 0.20955961892825437, + "grad_norm": 0.43906643986701965, + "learning_rate": 0.0006, + "loss": 2.1317, + "step": 56180 + }, + { + "epoch": 0.20959692039121774, + "grad_norm": 0.3275911808013916, + "learning_rate": 0.0006, + "loss": 2.0824, + "step": 56190 + }, + { + "epoch": 0.20963422185418112, 
+ "grad_norm": 0.38171207904815674, + "learning_rate": 0.0006, + "loss": 2.245, + "step": 56200 + }, + { + "epoch": 0.2096715233171445, + "grad_norm": 0.3112732768058777, + "learning_rate": 0.0006, + "loss": 2.2396, + "step": 56210 + }, + { + "epoch": 0.20970882478010788, + "grad_norm": 0.29373276233673096, + "learning_rate": 0.0006, + "loss": 2.3046, + "step": 56220 + }, + { + "epoch": 0.20974612624307126, + "grad_norm": 0.3442123234272003, + "learning_rate": 0.0006, + "loss": 2.1984, + "step": 56230 + }, + { + "epoch": 0.20978342770603464, + "grad_norm": 0.35870930552482605, + "learning_rate": 0.0006, + "loss": 2.1153, + "step": 56240 + }, + { + "epoch": 0.20982072916899802, + "grad_norm": 0.3285418748855591, + "learning_rate": 0.0006, + "loss": 2.1855, + "step": 56250 + }, + { + "epoch": 0.20982072916899802, + "eval_valid_loss": 2.1767215728759766, + "eval_valid_loss/all": 2.0411503314971924, + "eval_valid_loss/end_span": 1.1916624307632446, + "eval_valid_perplexity/batch": 7.699460983276367, + "eval_valid_perplexity/end_span": 3.2925503253936768, + "eval_valid_perplexity/fim": 2.2607147693634033, + "eval_valid_perplexity/first_seq": 14.380139350891113, + "eval_valid_perplexity/last_seq": 8.925907135009766, + "eval_valid_perplexity/second_seq": 13.431239128112793, + "eval_valid_perplexity/seq": 8.683831214904785, + "eval_valid_reconstruction/all": 0.29813122749328613, + "eval_valid_reconstruction/end_span": 0.7236192226409912, + "eval_valid_reconstruction/fim": 0.1650300920009613, + "eval_valid_reconstruction/first_seq": 0.1776757389307022, + "eval_valid_reconstruction/last_seq": 0.3280380070209503, + "eval_valid_reconstruction/second_seq": 0.2039758861064911, + "eval_valid_runtime": 442.3133, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 56250 + }, + { + "epoch": 0.20982072916899802, + "eval_train_loss": 2.176264524459839, + "eval_train_loss/all": 2.0146260261535645, + "eval_train_loss/end_span": 1.1621246337890625, 
+ "eval_train_perplexity/batch": 7.497922897338867, + "eval_train_perplexity/end_span": 3.1967179775238037, + "eval_train_perplexity/fim": 2.23634934425354, + "eval_train_perplexity/first_seq": 15.502135276794434, + "eval_train_perplexity/last_seq": 9.11294174194336, + "eval_train_perplexity/second_seq": 13.695908546447754, + "eval_train_perplexity/seq": 8.632570266723633, + "eval_train_reconstruction/all": 0.287160724401474, + "eval_train_reconstruction/end_span": 0.7315735220909119, + "eval_train_reconstruction/fim": 0.16210761666297913, + "eval_train_reconstruction/first_seq": 0.14888548851013184, + "eval_train_reconstruction/last_seq": 0.32169342041015625, + "eval_train_reconstruction/second_seq": 0.1995389610528946, + "eval_train_runtime": 442.6785, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 56250 + }, + { + "epoch": 0.20985803063196137, + "grad_norm": 0.6223123669624329, + "learning_rate": 0.0006, + "loss": 2.1592, + "step": 56260 + }, + { + "epoch": 0.20989533209492475, + "grad_norm": 0.3558013141155243, + "learning_rate": 0.0006, + "loss": 2.1737, + "step": 56270 + }, + { + "epoch": 0.20993263355788813, + "grad_norm": 0.6480910181999207, + "learning_rate": 0.0006, + "loss": 2.0242, + "step": 56280 + }, + { + "epoch": 0.2099699350208515, + "grad_norm": 0.4146832227706909, + "learning_rate": 0.0006, + "loss": 2.171, + "step": 56290 + }, + { + "epoch": 0.2100072364838149, + "grad_norm": 0.39135271310806274, + "learning_rate": 0.0006, + "loss": 2.1343, + "step": 56300 + }, + { + "epoch": 0.21004453794677827, + "grad_norm": 0.40045395493507385, + "learning_rate": 0.0006, + "loss": 2.1364, + "step": 56310 + }, + { + "epoch": 0.21008183940974165, + "grad_norm": 0.44320520758628845, + "learning_rate": 0.0006, + "loss": 2.2077, + "step": 56320 + }, + { + "epoch": 0.21011914087270503, + "grad_norm": 0.34439194202423096, + "learning_rate": 0.0006, + "loss": 2.2875, + "step": 56330 + }, + { + "epoch": 0.2101564423356684, 
+ "grad_norm": 0.26833125948905945, + "learning_rate": 0.0006, + "loss": 2.081, + "step": 56340 + }, + { + "epoch": 0.2101937437986318, + "grad_norm": 0.4457394778728485, + "learning_rate": 0.0006, + "loss": 2.2145, + "step": 56350 + }, + { + "epoch": 0.21023104526159517, + "grad_norm": 0.3991950452327728, + "learning_rate": 0.0006, + "loss": 2.2602, + "step": 56360 + }, + { + "epoch": 0.21026834672455855, + "grad_norm": 0.33525851368904114, + "learning_rate": 0.0006, + "loss": 2.3668, + "step": 56370 + }, + { + "epoch": 0.21030564818752193, + "grad_norm": 0.3474522531032562, + "learning_rate": 0.0006, + "loss": 2.2887, + "step": 56380 + }, + { + "epoch": 0.2103429496504853, + "grad_norm": 0.22462835907936096, + "learning_rate": 0.0006, + "loss": 2.3528, + "step": 56390 + }, + { + "epoch": 0.21038025111344866, + "grad_norm": 0.28424519300460815, + "learning_rate": 0.0006, + "loss": 2.1968, + "step": 56400 + }, + { + "epoch": 0.21041755257641204, + "grad_norm": 0.3637447953224182, + "learning_rate": 0.0006, + "loss": 1.9967, + "step": 56410 + }, + { + "epoch": 0.21045485403937542, + "grad_norm": 0.42919689416885376, + "learning_rate": 0.0006, + "loss": 2.1114, + "step": 56420 + }, + { + "epoch": 0.2104921555023388, + "grad_norm": 0.23954744637012482, + "learning_rate": 0.0006, + "loss": 2.2662, + "step": 56430 + }, + { + "epoch": 0.21052945696530218, + "grad_norm": 0.24089296162128448, + "learning_rate": 0.0006, + "loss": 2.4224, + "step": 56440 + }, + { + "epoch": 0.21056675842826555, + "grad_norm": 0.2867920696735382, + "learning_rate": 0.0006, + "loss": 2.2644, + "step": 56450 + }, + { + "epoch": 0.21060405989122893, + "grad_norm": 0.33009225130081177, + "learning_rate": 0.0006, + "loss": 2.1167, + "step": 56460 + }, + { + "epoch": 0.2106413613541923, + "grad_norm": 0.3270392417907715, + "learning_rate": 0.0006, + "loss": 2.347, + "step": 56470 + }, + { + "epoch": 0.2106786628171557, + "grad_norm": 0.34037092328071594, + "learning_rate": 0.0006, + "loss": 2.2025, 
+ "step": 56480 + }, + { + "epoch": 0.21071596428011907, + "grad_norm": 0.44380080699920654, + "learning_rate": 0.0006, + "loss": 2.0632, + "step": 56490 + }, + { + "epoch": 0.21075326574308245, + "grad_norm": 0.4142859876155853, + "learning_rate": 0.0006, + "loss": 2.1942, + "step": 56500 + }, + { + "epoch": 0.21075326574308245, + "eval_valid_loss": 2.178879976272583, + "eval_valid_loss/all": 2.043086528778076, + "eval_valid_loss/end_span": 1.2065175771713257, + "eval_valid_perplexity/batch": 7.714383125305176, + "eval_valid_perplexity/end_span": 3.3418266773223877, + "eval_valid_perplexity/fim": 2.6208951473236084, + "eval_valid_perplexity/first_seq": 14.645153045654297, + "eval_valid_perplexity/last_seq": 8.771577835083008, + "eval_valid_perplexity/second_seq": 13.744599342346191, + "eval_valid_perplexity/seq": 8.704928398132324, + "eval_valid_reconstruction/all": 0.29775023460388184, + "eval_valid_reconstruction/end_span": 0.7166526317596436, + "eval_valid_reconstruction/fim": 0.19557252526283264, + "eval_valid_reconstruction/first_seq": 0.16955061256885529, + "eval_valid_reconstruction/last_seq": 0.3324034512042999, + "eval_valid_reconstruction/second_seq": 0.1994195133447647, + "eval_valid_runtime": 440.3236, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 56500 + }, + { + "epoch": 0.21075326574308245, + "eval_train_loss": 2.174827814102173, + "eval_train_loss/all": 2.01375150680542, + "eval_train_loss/end_span": 1.1791727542877197, + "eval_train_perplexity/batch": 7.491368770599365, + "eval_train_perplexity/end_span": 3.251683235168457, + "eval_train_perplexity/fim": 1.9403297901153564, + "eval_train_perplexity/first_seq": 15.365135192871094, + "eval_train_perplexity/last_seq": 8.655533790588379, + "eval_train_perplexity/second_seq": 13.761526107788086, + "eval_train_perplexity/seq": 8.633172988891602, + "eval_train_reconstruction/all": 0.2877013683319092, + "eval_train_reconstruction/end_span": 0.7253457307815552, 
+ "eval_train_reconstruction/fim": 0.135358065366745, + "eval_train_reconstruction/first_seq": 0.1550315022468567, + "eval_train_reconstruction/last_seq": 0.3328748047351837, + "eval_train_reconstruction/second_seq": 0.1962108314037323, + "eval_train_runtime": 439.5857, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 56500 + }, + { + "epoch": 0.21079056720604583, + "grad_norm": 0.26914381980895996, + "learning_rate": 0.0006, + "loss": 2.1586, + "step": 56510 + }, + { + "epoch": 0.2108278686690092, + "grad_norm": 0.2554413080215454, + "learning_rate": 0.0006, + "loss": 2.3593, + "step": 56520 + }, + { + "epoch": 0.21086517013197256, + "grad_norm": 0.3339211344718933, + "learning_rate": 0.0006, + "loss": 2.205, + "step": 56530 + }, + { + "epoch": 0.21090247159493594, + "grad_norm": 0.2552342116832733, + "learning_rate": 0.0006, + "loss": 2.0189, + "step": 56540 + }, + { + "epoch": 0.21093977305789932, + "grad_norm": 0.43402212858200073, + "learning_rate": 0.0006, + "loss": 2.1076, + "step": 56550 + }, + { + "epoch": 0.2109770745208627, + "grad_norm": 0.4607846140861511, + "learning_rate": 0.0006, + "loss": 2.1805, + "step": 56560 + }, + { + "epoch": 0.21101437598382608, + "grad_norm": 0.27422383427619934, + "learning_rate": 0.0006, + "loss": 2.292, + "step": 56570 + }, + { + "epoch": 0.21105167744678946, + "grad_norm": 0.5222145915031433, + "learning_rate": 0.0006, + "loss": 2.197, + "step": 56580 + }, + { + "epoch": 0.21108897890975284, + "grad_norm": 0.31254348158836365, + "learning_rate": 0.0006, + "loss": 2.2005, + "step": 56590 + }, + { + "epoch": 0.21112628037271622, + "grad_norm": 0.341429740190506, + "learning_rate": 0.0006, + "loss": 2.3178, + "step": 56600 + }, + { + "epoch": 0.2111635818356796, + "grad_norm": 0.4382973313331604, + "learning_rate": 0.0006, + "loss": 2.1585, + "step": 56610 + }, + { + "epoch": 0.21120088329864298, + "grad_norm": 0.5719639658927917, + "learning_rate": 0.0006, + "loss": 2.1667, + 
"step": 56620 + }, + { + "epoch": 0.21123818476160636, + "grad_norm": 0.41167500615119934, + "learning_rate": 0.0006, + "loss": 2.1278, + "step": 56630 + }, + { + "epoch": 0.21127548622456974, + "grad_norm": 0.2394084930419922, + "learning_rate": 0.0006, + "loss": 2.2708, + "step": 56640 + }, + { + "epoch": 0.21131278768753312, + "grad_norm": 0.3281674385070801, + "learning_rate": 0.0006, + "loss": 2.0556, + "step": 56650 + }, + { + "epoch": 0.2113500891504965, + "grad_norm": 0.4938162565231323, + "learning_rate": 0.0006, + "loss": 2.0194, + "step": 56660 + }, + { + "epoch": 0.21138739061345985, + "grad_norm": 0.4014081060886383, + "learning_rate": 0.0006, + "loss": 2.2365, + "step": 56670 + }, + { + "epoch": 0.21142469207642323, + "grad_norm": 0.37478846311569214, + "learning_rate": 0.0006, + "loss": 2.1432, + "step": 56680 + }, + { + "epoch": 0.2114619935393866, + "grad_norm": 0.3806969225406647, + "learning_rate": 0.0006, + "loss": 2.2883, + "step": 56690 + }, + { + "epoch": 0.21149929500234999, + "grad_norm": 0.2963026762008667, + "learning_rate": 0.0006, + "loss": 2.1784, + "step": 56700 + }, + { + "epoch": 0.21153659646531336, + "grad_norm": 0.33987438678741455, + "learning_rate": 0.0006, + "loss": 2.3089, + "step": 56710 + }, + { + "epoch": 0.21157389792827674, + "grad_norm": 0.3044522702693939, + "learning_rate": 0.0006, + "loss": 2.4092, + "step": 56720 + }, + { + "epoch": 0.21161119939124012, + "grad_norm": 0.37588372826576233, + "learning_rate": 0.0006, + "loss": 2.232, + "step": 56730 + }, + { + "epoch": 0.2116485008542035, + "grad_norm": 0.3573823571205139, + "learning_rate": 0.0006, + "loss": 2.2245, + "step": 56740 + }, + { + "epoch": 0.21168580231716688, + "grad_norm": 0.2622274160385132, + "learning_rate": 0.0006, + "loss": 2.2686, + "step": 56750 + }, + { + "epoch": 0.21168580231716688, + "eval_valid_loss": 2.177502393722534, + "eval_valid_loss/all": 2.0417611598968506, + "eval_valid_loss/end_span": 1.2495259046554565, + 
"eval_valid_perplexity/batch": 7.704165458679199, + "eval_valid_perplexity/end_span": 3.4886887073516846, + "eval_valid_perplexity/fim": 2.491753339767456, + "eval_valid_perplexity/first_seq": 14.608975410461426, + "eval_valid_perplexity/last_seq": 8.880462646484375, + "eval_valid_perplexity/second_seq": 13.204020500183105, + "eval_valid_perplexity/seq": 8.688889503479004, + "eval_valid_reconstruction/all": 0.29838308691978455, + "eval_valid_reconstruction/end_span": 0.7014991641044617, + "eval_valid_reconstruction/fim": 0.18559738993644714, + "eval_valid_reconstruction/first_seq": 0.1736244261264801, + "eval_valid_reconstruction/last_seq": 0.3332127332687378, + "eval_valid_reconstruction/second_seq": 0.2081068605184555, + "eval_valid_runtime": 444.9747, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 56750 + }, + { + "epoch": 0.21168580231716688, + "eval_train_loss": 2.1746034622192383, + "eval_train_loss/all": 2.0130324363708496, + "eval_train_loss/end_span": 1.2138835191726685, + "eval_train_perplexity/batch": 7.485983848571777, + "eval_train_perplexity/end_span": 3.3665332794189453, + "eval_train_perplexity/fim": 2.0769059658050537, + "eval_train_perplexity/first_seq": 15.432000160217285, + "eval_train_perplexity/last_seq": 8.977020263671875, + "eval_train_perplexity/second_seq": 14.182476043701172, + "eval_train_perplexity/seq": 8.617636680603027, + "eval_train_reconstruction/all": 0.2880265712738037, + "eval_train_reconstruction/end_span": 0.7129228115081787, + "eval_train_reconstruction/fim": 0.14980818331241608, + "eval_train_reconstruction/first_seq": 0.1541888415813446, + "eval_train_reconstruction/last_seq": 0.32150352001190186, + "eval_train_reconstruction/second_seq": 0.18351022899150848, + "eval_train_runtime": 440.3405, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 56750 + }, + { + "epoch": 0.21172310378013026, + "grad_norm": 0.39941009879112244, + 
"learning_rate": 0.0006, + "loss": 2.1845, + "step": 56760 + }, + { + "epoch": 0.21176040524309364, + "grad_norm": 0.3743520975112915, + "learning_rate": 0.0006, + "loss": 2.2127, + "step": 56770 + }, + { + "epoch": 0.21179770670605702, + "grad_norm": 0.44643276929855347, + "learning_rate": 0.0006, + "loss": 2.3143, + "step": 56780 + }, + { + "epoch": 0.2118350081690204, + "grad_norm": 0.328815221786499, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 56790 + }, + { + "epoch": 0.21187230963198378, + "grad_norm": 0.3352814316749573, + "learning_rate": 0.0006, + "loss": 2.319, + "step": 56800 + }, + { + "epoch": 0.21190961109494713, + "grad_norm": 0.25528812408447266, + "learning_rate": 0.0006, + "loss": 2.0744, + "step": 56810 + }, + { + "epoch": 0.2119469125579105, + "grad_norm": 0.3300462067127228, + "learning_rate": 0.0006, + "loss": 2.216, + "step": 56820 + }, + { + "epoch": 0.2119842140208739, + "grad_norm": 0.5384141802787781, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 56830 + }, + { + "epoch": 0.21202151548383727, + "grad_norm": 0.3652390241622925, + "learning_rate": 0.0006, + "loss": 2.097, + "step": 56840 + }, + { + "epoch": 0.21205881694680065, + "grad_norm": 0.3138430416584015, + "learning_rate": 0.0006, + "loss": 2.2104, + "step": 56850 + }, + { + "epoch": 0.21209611840976403, + "grad_norm": 0.2668927013874054, + "learning_rate": 0.0006, + "loss": 2.0438, + "step": 56860 + }, + { + "epoch": 0.2121334198727274, + "grad_norm": 0.4174995422363281, + "learning_rate": 0.0006, + "loss": 2.1854, + "step": 56870 + }, + { + "epoch": 0.2121707213356908, + "grad_norm": 3.464639902114868, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 56880 + }, + { + "epoch": 0.21220802279865417, + "grad_norm": 0.30690932273864746, + "learning_rate": 0.0006, + "loss": 2.1055, + "step": 56890 + }, + { + "epoch": 0.21224532426161755, + "grad_norm": 0.24379639327526093, + "learning_rate": 0.0006, + "loss": 1.9284, + "step": 56900 + }, + { + "epoch": 
0.21228262572458093, + "grad_norm": 0.35320377349853516, + "learning_rate": 0.0006, + "loss": 2.1249, + "step": 56910 + }, + { + "epoch": 0.2123199271875443, + "grad_norm": 0.3418390154838562, + "learning_rate": 0.0006, + "loss": 2.3129, + "step": 56920 + }, + { + "epoch": 0.21235722865050768, + "grad_norm": 0.29721832275390625, + "learning_rate": 0.0006, + "loss": 2.2339, + "step": 56930 + }, + { + "epoch": 0.21239453011347106, + "grad_norm": 0.24577482044696808, + "learning_rate": 0.0006, + "loss": 2.1495, + "step": 56940 + }, + { + "epoch": 0.21243183157643442, + "grad_norm": 0.38089197874069214, + "learning_rate": 0.0006, + "loss": 2.1636, + "step": 56950 + }, + { + "epoch": 0.2124691330393978, + "grad_norm": 0.2293151617050171, + "learning_rate": 0.0006, + "loss": 2.0862, + "step": 56960 + }, + { + "epoch": 0.21250643450236117, + "grad_norm": 0.25699159502983093, + "learning_rate": 0.0006, + "loss": 2.2603, + "step": 56970 + }, + { + "epoch": 0.21254373596532455, + "grad_norm": 0.2567465901374817, + "learning_rate": 0.0006, + "loss": 2.1117, + "step": 56980 + }, + { + "epoch": 0.21258103742828793, + "grad_norm": 0.4103713035583496, + "learning_rate": 0.0006, + "loss": 2.4153, + "step": 56990 + }, + { + "epoch": 0.2126183388912513, + "grad_norm": 0.4003928005695343, + "learning_rate": 0.0006, + "loss": 2.1137, + "step": 57000 + }, + { + "epoch": 0.2126183388912513, + "eval_valid_loss": 2.177947759628296, + "eval_valid_loss/all": 2.042147397994995, + "eval_valid_loss/end_span": 1.2425079345703125, + "eval_valid_perplexity/batch": 7.707141876220703, + "eval_valid_perplexity/end_span": 3.4642908573150635, + "eval_valid_perplexity/fim": 2.2128849029541016, + "eval_valid_perplexity/first_seq": 15.311432838439941, + "eval_valid_perplexity/last_seq": 8.52223014831543, + "eval_valid_perplexity/second_seq": 14.205355644226074, + "eval_valid_perplexity/seq": 8.692933082580566, + "eval_valid_reconstruction/all": 0.29838329553604126, + "eval_valid_reconstruction/end_span": 
0.7026579976081848, + "eval_valid_reconstruction/fim": 0.16050158441066742, + "eval_valid_reconstruction/first_seq": 0.15652795135974884, + "eval_valid_reconstruction/last_seq": 0.3388524353504181, + "eval_valid_reconstruction/second_seq": 0.18718113005161285, + "eval_valid_runtime": 441.576, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 57000 + }, + { + "epoch": 0.2126183388912513, + "eval_train_loss": 2.1764659881591797, + "eval_train_loss/all": 2.015106201171875, + "eval_train_loss/end_span": 1.2045214176177979, + "eval_train_perplexity/batch": 7.501523971557617, + "eval_train_perplexity/end_span": 3.33516263961792, + "eval_train_perplexity/fim": 2.04970383644104, + "eval_train_perplexity/first_seq": 15.602010726928711, + "eval_train_perplexity/last_seq": 8.186229705810547, + "eval_train_perplexity/second_seq": 14.109222412109375, + "eval_train_perplexity/seq": 8.63736629486084, + "eval_train_reconstruction/all": 0.28761130571365356, + "eval_train_reconstruction/end_span": 0.7127285599708557, + "eval_train_reconstruction/fim": 0.14529190957546234, + "eval_train_reconstruction/first_seq": 0.15161964297294617, + "eval_train_reconstruction/last_seq": 0.3534456789493561, + "eval_train_reconstruction/second_seq": 0.1869295984506607, + "eval_train_runtime": 445.9644, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 57000 + }, + { + "epoch": 0.2126556403542147, + "grad_norm": 0.23975253105163574, + "learning_rate": 0.0006, + "loss": 2.3185, + "step": 57010 + }, + { + "epoch": 0.21269294181717807, + "grad_norm": 0.30171287059783936, + "learning_rate": 0.0006, + "loss": 2.2422, + "step": 57020 + }, + { + "epoch": 0.21273024328014145, + "grad_norm": 0.43898165225982666, + "learning_rate": 0.0006, + "loss": 2.2847, + "step": 57030 + }, + { + "epoch": 0.21276754474310483, + "grad_norm": 0.5267592668533325, + "learning_rate": 0.0006, + "loss": 2.2477, + "step": 57040 + }, + { + "epoch": 
0.2128048462060682, + "grad_norm": 0.42628464102745056, + "learning_rate": 0.0006, + "loss": 2.1034, + "step": 57050 + }, + { + "epoch": 0.2128421476690316, + "grad_norm": 0.3796719014644623, + "learning_rate": 0.0006, + "loss": 2.0942, + "step": 57060 + }, + { + "epoch": 0.21287944913199497, + "grad_norm": 0.33818039298057556, + "learning_rate": 0.0006, + "loss": 2.1719, + "step": 57070 + }, + { + "epoch": 0.21291675059495832, + "grad_norm": 0.39962199330329895, + "learning_rate": 0.0006, + "loss": 2.353, + "step": 57080 + }, + { + "epoch": 0.2129540520579217, + "grad_norm": 0.23421208560466766, + "learning_rate": 0.0006, + "loss": 2.1989, + "step": 57090 + }, + { + "epoch": 0.21299135352088508, + "grad_norm": 0.39004766941070557, + "learning_rate": 0.0006, + "loss": 2.306, + "step": 57100 + }, + { + "epoch": 0.21302865498384846, + "grad_norm": 0.2578071355819702, + "learning_rate": 0.0006, + "loss": 2.1467, + "step": 57110 + }, + { + "epoch": 0.21306595644681184, + "grad_norm": 0.4821344316005707, + "learning_rate": 0.0006, + "loss": 2.0395, + "step": 57120 + }, + { + "epoch": 0.21310325790977522, + "grad_norm": 0.3904949724674225, + "learning_rate": 0.0006, + "loss": 2.2028, + "step": 57130 + }, + { + "epoch": 0.2131405593727386, + "grad_norm": 0.3117394745349884, + "learning_rate": 0.0006, + "loss": 2.1682, + "step": 57140 + }, + { + "epoch": 0.21317786083570198, + "grad_norm": 0.3322374224662781, + "learning_rate": 0.0006, + "loss": 1.955, + "step": 57150 + }, + { + "epoch": 0.21321516229866536, + "grad_norm": 0.22820574045181274, + "learning_rate": 0.0006, + "loss": 2.2359, + "step": 57160 + }, + { + "epoch": 0.21325246376162874, + "grad_norm": 0.35658401250839233, + "learning_rate": 0.0006, + "loss": 2.2953, + "step": 57170 + }, + { + "epoch": 0.21328976522459211, + "grad_norm": 0.328020304441452, + "learning_rate": 0.0006, + "loss": 2.4054, + "step": 57180 + }, + { + "epoch": 0.2133270666875555, + "grad_norm": 0.2951064705848694, + "learning_rate": 0.0006, 
+ "loss": 2.1349, + "step": 57190 + }, + { + "epoch": 0.21336436815051887, + "grad_norm": 0.36343705654144287, + "learning_rate": 0.0006, + "loss": 2.1765, + "step": 57200 + }, + { + "epoch": 0.21340166961348225, + "grad_norm": 0.317597359418869, + "learning_rate": 0.0006, + "loss": 1.9853, + "step": 57210 + }, + { + "epoch": 0.2134389710764456, + "grad_norm": 0.4235652983188629, + "learning_rate": 0.0006, + "loss": 2.1806, + "step": 57220 + }, + { + "epoch": 0.21347627253940898, + "grad_norm": 0.21755583584308624, + "learning_rate": 0.0006, + "loss": 2.2237, + "step": 57230 + }, + { + "epoch": 0.21351357400237236, + "grad_norm": 0.25042882561683655, + "learning_rate": 0.0006, + "loss": 2.2615, + "step": 57240 + }, + { + "epoch": 0.21355087546533574, + "grad_norm": 5.173686504364014, + "learning_rate": 0.0006, + "loss": 2.1242, + "step": 57250 + }, + { + "epoch": 0.21355087546533574, + "eval_valid_loss": 2.1794397830963135, + "eval_valid_loss/all": 2.0438196659088135, + "eval_valid_loss/end_span": 1.247975468635559, + "eval_valid_perplexity/batch": 7.720040798187256, + "eval_valid_perplexity/end_span": 3.483283758163452, + "eval_valid_perplexity/fim": 2.3321218490600586, + "eval_valid_perplexity/first_seq": 15.294726371765137, + "eval_valid_perplexity/last_seq": 8.369370460510254, + "eval_valid_perplexity/second_seq": 14.021890640258789, + "eval_valid_perplexity/seq": 8.708921432495117, + "eval_valid_reconstruction/all": 0.29751265048980713, + "eval_valid_reconstruction/end_span": 0.7033137083053589, + "eval_valid_reconstruction/fim": 0.171931654214859, + "eval_valid_reconstruction/first_seq": 0.15749230980873108, + "eval_valid_reconstruction/last_seq": 0.34613481163978577, + "eval_valid_reconstruction/second_seq": 0.19353041052818298, + "eval_valid_runtime": 438.1658, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 57250 + }, + { + "epoch": 0.21355087546533574, + "eval_train_loss": 2.177884340286255, + 
"eval_train_loss/all": 2.016197443008423, + "eval_train_loss/end_span": 1.2026296854019165, + "eval_train_perplexity/batch": 7.509714603424072, + "eval_train_perplexity/end_span": 3.328859329223633, + "eval_train_perplexity/fim": 1.975778341293335, + "eval_train_perplexity/first_seq": 15.586528778076172, + "eval_train_perplexity/last_seq": 9.086603164672852, + "eval_train_perplexity/second_seq": 14.250224113464355, + "eval_train_perplexity/seq": 8.649067878723145, + "eval_train_reconstruction/all": 0.28689834475517273, + "eval_train_reconstruction/end_span": 0.7169795632362366, + "eval_train_reconstruction/fim": 0.1379239559173584, + "eval_train_reconstruction/first_seq": 0.1504507064819336, + "eval_train_reconstruction/last_seq": 0.32251784205436707, + "eval_train_reconstruction/second_seq": 0.18211790919303894, + "eval_train_runtime": 441.4976, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 57250 + }, + { + "epoch": 0.21358817692829912, + "grad_norm": 0.2822608947753906, + "learning_rate": 0.0006, + "loss": 2.1694, + "step": 57260 + }, + { + "epoch": 0.2136254783912625, + "grad_norm": 0.32159385085105896, + "learning_rate": 0.0006, + "loss": 2.2127, + "step": 57270 + }, + { + "epoch": 0.21366277985422588, + "grad_norm": 0.30767935514450073, + "learning_rate": 0.0006, + "loss": 2.3022, + "step": 57280 + }, + { + "epoch": 0.21370008131718926, + "grad_norm": 20.469514846801758, + "learning_rate": 0.0006, + "loss": 2.1637, + "step": 57290 + }, + { + "epoch": 0.21373738278015264, + "grad_norm": 0.4966380298137665, + "learning_rate": 0.0006, + "loss": 2.1068, + "step": 57300 + }, + { + "epoch": 0.21377468424311602, + "grad_norm": 0.4815598130226135, + "learning_rate": 0.0006, + "loss": 2.0165, + "step": 57310 + }, + { + "epoch": 0.2138119857060794, + "grad_norm": 0.4017059803009033, + "learning_rate": 0.0006, + "loss": 2.3067, + "step": 57320 + }, + { + "epoch": 0.21384928716904278, + "grad_norm": 0.3865620195865631, + 
"learning_rate": 0.0006, + "loss": 2.1921, + "step": 57330 + }, + { + "epoch": 0.21388658863200616, + "grad_norm": 0.3336637318134308, + "learning_rate": 0.0006, + "loss": 2.2231, + "step": 57340 + }, + { + "epoch": 0.21392389009496954, + "grad_norm": 0.4329932630062103, + "learning_rate": 0.0006, + "loss": 2.1355, + "step": 57350 + }, + { + "epoch": 0.2139611915579329, + "grad_norm": 0.3160308003425598, + "learning_rate": 0.0006, + "loss": 2.2304, + "step": 57360 + }, + { + "epoch": 0.21399849302089627, + "grad_norm": 0.3216007947921753, + "learning_rate": 0.0006, + "loss": 2.1054, + "step": 57370 + }, + { + "epoch": 0.21403579448385965, + "grad_norm": 0.25850269198417664, + "learning_rate": 0.0006, + "loss": 2.227, + "step": 57380 + }, + { + "epoch": 0.21407309594682303, + "grad_norm": 0.2656196057796478, + "learning_rate": 0.0006, + "loss": 2.1364, + "step": 57390 + }, + { + "epoch": 0.2141103974097864, + "grad_norm": 0.2847604751586914, + "learning_rate": 0.0006, + "loss": 2.331, + "step": 57400 + }, + { + "epoch": 0.2141476988727498, + "grad_norm": 0.40395933389663696, + "learning_rate": 0.0006, + "loss": 2.3534, + "step": 57410 + }, + { + "epoch": 0.21418500033571317, + "grad_norm": 0.2938227653503418, + "learning_rate": 0.0006, + "loss": 2.1115, + "step": 57420 + }, + { + "epoch": 0.21422230179867655, + "grad_norm": 0.3980991244316101, + "learning_rate": 0.0006, + "loss": 2.1573, + "step": 57430 + }, + { + "epoch": 0.21425960326163992, + "grad_norm": 0.3889647126197815, + "learning_rate": 0.0006, + "loss": 2.3449, + "step": 57440 + }, + { + "epoch": 0.2142969047246033, + "grad_norm": 0.23552146553993225, + "learning_rate": 0.0006, + "loss": 2.158, + "step": 57450 + }, + { + "epoch": 0.21433420618756668, + "grad_norm": 0.32603201270103455, + "learning_rate": 0.0006, + "loss": 2.1476, + "step": 57460 + }, + { + "epoch": 0.21437150765053006, + "grad_norm": 0.7074960470199585, + "learning_rate": 0.0006, + "loss": 2.2278, + "step": 57470 + }, + { + "epoch": 
0.21440880911349344, + "grad_norm": 0.28303056955337524, + "learning_rate": 0.0006, + "loss": 2.2203, + "step": 57480 + }, + { + "epoch": 0.21444611057645682, + "grad_norm": 0.4862563908100128, + "learning_rate": 0.0006, + "loss": 2.1438, + "step": 57490 + }, + { + "epoch": 0.21448341203942017, + "grad_norm": 0.2334991842508316, + "learning_rate": 0.0006, + "loss": 2.3189, + "step": 57500 + }, + { + "epoch": 0.21448341203942017, + "eval_valid_loss": 2.1822421550750732, + "eval_valid_loss/all": 2.0461630821228027, + "eval_valid_loss/end_span": 1.2389329671859741, + "eval_valid_perplexity/batch": 7.738153457641602, + "eval_valid_perplexity/end_span": 3.45192813873291, + "eval_valid_perplexity/fim": 2.4351072311401367, + "eval_valid_perplexity/first_seq": 14.939565658569336, + "eval_valid_perplexity/last_seq": 8.751893043518066, + "eval_valid_perplexity/second_seq": 13.810792922973633, + "eval_valid_perplexity/seq": 8.72712230682373, + "eval_valid_reconstruction/all": 0.29680442810058594, + "eval_valid_reconstruction/end_span": 0.7207806706428528, + "eval_valid_reconstruction/fim": 0.17871254682540894, + "eval_valid_reconstruction/first_seq": 0.16482292115688324, + "eval_valid_reconstruction/last_seq": 0.33405473828315735, + "eval_valid_reconstruction/second_seq": 0.19287604093551636, + "eval_valid_runtime": 443.4366, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 57500 + }, + { + "epoch": 0.21448341203942017, + "eval_train_loss": 2.180159330368042, + "eval_train_loss/all": 2.017789363861084, + "eval_train_loss/end_span": 1.1941378116607666, + "eval_train_perplexity/batch": 7.521678924560547, + "eval_train_perplexity/end_span": 3.300710678100586, + "eval_train_perplexity/fim": 2.1423470973968506, + "eval_train_perplexity/first_seq": 15.697278022766113, + "eval_train_perplexity/last_seq": 8.576226234436035, + "eval_train_perplexity/second_seq": 14.013833045959473, + "eval_train_perplexity/seq": 8.659219741821289, + 
"eval_train_reconstruction/all": 0.2864120304584503, + "eval_train_reconstruction/end_span": 0.7330384254455566, + "eval_train_reconstruction/fim": 0.1526826173067093, + "eval_train_reconstruction/first_seq": 0.14945310354232788, + "eval_train_reconstruction/last_seq": 0.3403257131576538, + "eval_train_reconstruction/second_seq": 0.18738485872745514, + "eval_train_runtime": 443.6618, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 57500 + }, + { + "epoch": 0.21452071350238355, + "grad_norm": 0.5139169096946716, + "learning_rate": 0.0006, + "loss": 2.1136, + "step": 57510 + }, + { + "epoch": 0.21455801496534693, + "grad_norm": 0.717876672744751, + "learning_rate": 0.0006, + "loss": 2.0057, + "step": 57520 + }, + { + "epoch": 0.2145953164283103, + "grad_norm": 0.35277873277664185, + "learning_rate": 0.0006, + "loss": 2.2407, + "step": 57530 + }, + { + "epoch": 0.2146326178912737, + "grad_norm": 0.41861051321029663, + "learning_rate": 0.0006, + "loss": 2.037, + "step": 57540 + }, + { + "epoch": 0.21466991935423707, + "grad_norm": 1.5224655866622925, + "learning_rate": 0.0006, + "loss": 2.0494, + "step": 57550 + }, + { + "epoch": 0.21470722081720045, + "grad_norm": 0.4469328224658966, + "learning_rate": 0.0006, + "loss": 2.211, + "step": 57560 + }, + { + "epoch": 0.21474452228016383, + "grad_norm": 0.4371592104434967, + "learning_rate": 0.0006, + "loss": 2.136, + "step": 57570 + }, + { + "epoch": 0.2147818237431272, + "grad_norm": 0.38895219564437866, + "learning_rate": 0.0006, + "loss": 2.1525, + "step": 57580 + }, + { + "epoch": 0.2148191252060906, + "grad_norm": 0.3164790868759155, + "learning_rate": 0.0006, + "loss": 2.1814, + "step": 57590 + }, + { + "epoch": 0.21485642666905397, + "grad_norm": 0.3701966404914856, + "learning_rate": 0.0006, + "loss": 2.1219, + "step": 57600 + }, + { + "epoch": 0.21489372813201735, + "grad_norm": 0.36312925815582275, + "learning_rate": 0.0006, + "loss": 2.2218, + "step": 57610 + }, + { + 
"epoch": 0.21493102959498073, + "grad_norm": 0.3441680073738098, + "learning_rate": 0.0006, + "loss": 2.3101, + "step": 57620 + }, + { + "epoch": 0.21496833105794408, + "grad_norm": 0.4881911873817444, + "learning_rate": 0.0006, + "loss": 2.2463, + "step": 57630 + }, + { + "epoch": 0.21500563252090746, + "grad_norm": 0.35696178674697876, + "learning_rate": 0.0006, + "loss": 2.2916, + "step": 57640 + }, + { + "epoch": 0.21504293398387084, + "grad_norm": 0.39704588055610657, + "learning_rate": 0.0006, + "loss": 2.1532, + "step": 57650 + }, + { + "epoch": 0.21508023544683422, + "grad_norm": 0.30509263277053833, + "learning_rate": 0.0006, + "loss": 2.2437, + "step": 57660 + }, + { + "epoch": 0.2151175369097976, + "grad_norm": 0.33106479048728943, + "learning_rate": 0.0006, + "loss": 2.1795, + "step": 57670 + }, + { + "epoch": 0.21515483837276098, + "grad_norm": 0.45127072930336, + "learning_rate": 0.0006, + "loss": 2.2303, + "step": 57680 + }, + { + "epoch": 0.21519213983572436, + "grad_norm": 0.35132861137390137, + "learning_rate": 0.0006, + "loss": 2.102, + "step": 57690 + }, + { + "epoch": 0.21522944129868773, + "grad_norm": 0.3235408663749695, + "learning_rate": 0.0006, + "loss": 2.2146, + "step": 57700 + }, + { + "epoch": 0.21526674276165111, + "grad_norm": 0.2909853756427765, + "learning_rate": 0.0006, + "loss": 2.3625, + "step": 57710 + }, + { + "epoch": 0.2153040442246145, + "grad_norm": 0.24198107421398163, + "learning_rate": 0.0006, + "loss": 2.3645, + "step": 57720 + }, + { + "epoch": 0.21534134568757787, + "grad_norm": 0.32593709230422974, + "learning_rate": 0.0006, + "loss": 2.2123, + "step": 57730 + }, + { + "epoch": 0.21537864715054125, + "grad_norm": 4.322092533111572, + "learning_rate": 0.0006, + "loss": 2.3526, + "step": 57740 + }, + { + "epoch": 0.21541594861350463, + "grad_norm": 0.6432812809944153, + "learning_rate": 0.0006, + "loss": 2.1789, + "step": 57750 + }, + { + "epoch": 0.21541594861350463, + "eval_valid_loss": 2.2145986557006836, + 
"eval_valid_loss/all": 2.076658248901367, + "eval_valid_loss/end_span": 1.46124267578125, + "eval_valid_perplexity/batch": 7.97776460647583, + "eval_valid_perplexity/end_span": 4.311313629150391, + "eval_valid_perplexity/fim": 2.2541236877441406, + "eval_valid_perplexity/first_seq": 14.965372085571289, + "eval_valid_perplexity/last_seq": 8.84152889251709, + "eval_valid_perplexity/second_seq": 13.921368598937988, + "eval_valid_perplexity/seq": 9.022393226623535, + "eval_valid_reconstruction/all": 0.2890613377094269, + "eval_valid_reconstruction/end_span": 0.6855697631835938, + "eval_valid_reconstruction/fim": 0.15801189839839935, + "eval_valid_reconstruction/first_seq": 0.1656458079814911, + "eval_valid_reconstruction/last_seq": 0.3312843143939972, + "eval_valid_reconstruction/second_seq": 0.1945505142211914, + "eval_valid_runtime": 441.6487, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 57750 + }, + { + "epoch": 0.21541594861350463, + "eval_train_loss": 2.2070820331573486, + "eval_train_loss/all": 2.0431156158447266, + "eval_train_loss/end_span": 1.391647219657898, + "eval_train_perplexity/batch": 7.7146077156066895, + "eval_train_perplexity/end_span": 4.021468639373779, + "eval_train_perplexity/fim": 2.1180896759033203, + "eval_train_perplexity/first_seq": 15.34665298461914, + "eval_train_perplexity/last_seq": 9.306982040405273, + "eval_train_perplexity/second_seq": 14.26524829864502, + "eval_train_perplexity/seq": 8.911124229431152, + "eval_train_reconstruction/all": 0.2799943685531616, + "eval_train_reconstruction/end_span": 0.6979819536209106, + "eval_train_reconstruction/fim": 0.1471191942691803, + "eval_train_reconstruction/first_seq": 0.1565542072057724, + "eval_train_reconstruction/last_seq": 0.31637415289878845, + "eval_train_reconstruction/second_seq": 0.18354327976703644, + "eval_train_runtime": 443.8514, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 57750 + }, + 
{ + "epoch": 0.215453250076468, + "grad_norm": 0.3510143756866455, + "learning_rate": 0.0006, + "loss": 2.3502, + "step": 57760 + }, + { + "epoch": 0.21549055153943136, + "grad_norm": 0.3636143207550049, + "learning_rate": 0.0006, + "loss": 2.1751, + "step": 57770 + }, + { + "epoch": 0.21552785300239474, + "grad_norm": 0.2909983694553375, + "learning_rate": 0.0006, + "loss": 2.2204, + "step": 57780 + }, + { + "epoch": 0.21556515446535812, + "grad_norm": 0.5131400227546692, + "learning_rate": 0.0006, + "loss": 2.1713, + "step": 57790 + }, + { + "epoch": 0.2156024559283215, + "grad_norm": 0.48041725158691406, + "learning_rate": 0.0006, + "loss": 2.155, + "step": 57800 + }, + { + "epoch": 0.21563975739128488, + "grad_norm": 0.3085395395755768, + "learning_rate": 0.0006, + "loss": 2.284, + "step": 57810 + }, + { + "epoch": 0.21567705885424826, + "grad_norm": 25.763263702392578, + "learning_rate": 0.0006, + "loss": 2.3119, + "step": 57820 + }, + { + "epoch": 0.21571436031721164, + "grad_norm": 0.5510742664337158, + "learning_rate": 0.0006, + "loss": 2.1662, + "step": 57830 + }, + { + "epoch": 0.21575166178017502, + "grad_norm": 0.6466108560562134, + "learning_rate": 0.0006, + "loss": 2.3376, + "step": 57840 + }, + { + "epoch": 0.2157889632431384, + "grad_norm": 0.35810521245002747, + "learning_rate": 0.0006, + "loss": 2.1835, + "step": 57850 + }, + { + "epoch": 0.21582626470610178, + "grad_norm": 105.35580444335938, + "learning_rate": 0.0006, + "loss": 2.3157, + "step": 57860 + }, + { + "epoch": 0.21586356616906516, + "grad_norm": 0.3869578540325165, + "learning_rate": 0.0006, + "loss": 2.0358, + "step": 57870 + }, + { + "epoch": 0.21590086763202854, + "grad_norm": 0.31630080938339233, + "learning_rate": 0.0006, + "loss": 2.2871, + "step": 57880 + }, + { + "epoch": 0.21593816909499192, + "grad_norm": 0.4092036783695221, + "learning_rate": 0.0006, + "loss": 2.3865, + "step": 57890 + }, + { + "epoch": 0.2159754705579553, + "grad_norm": 0.43851137161254883, + 
"learning_rate": 0.0006, + "loss": 2.0977, + "step": 57900 + }, + { + "epoch": 0.21601277202091865, + "grad_norm": 0.45180338621139526, + "learning_rate": 0.0006, + "loss": 2.0409, + "step": 57910 + }, + { + "epoch": 0.21605007348388203, + "grad_norm": 0.3520795404911041, + "learning_rate": 0.0006, + "loss": 2.4003, + "step": 57920 + }, + { + "epoch": 0.2160873749468454, + "grad_norm": 1.2138044834136963, + "learning_rate": 0.0006, + "loss": 2.2443, + "step": 57930 + }, + { + "epoch": 0.21612467640980879, + "grad_norm": 0.36971861124038696, + "learning_rate": 0.0006, + "loss": 2.2792, + "step": 57940 + }, + { + "epoch": 0.21616197787277217, + "grad_norm": 0.31142738461494446, + "learning_rate": 0.0006, + "loss": 2.1467, + "step": 57950 + }, + { + "epoch": 0.21619927933573554, + "grad_norm": 0.3130701184272766, + "learning_rate": 0.0006, + "loss": 2.1573, + "step": 57960 + }, + { + "epoch": 0.21623658079869892, + "grad_norm": 0.3212928771972656, + "learning_rate": 0.0006, + "loss": 2.3396, + "step": 57970 + }, + { + "epoch": 0.2162738822616623, + "grad_norm": 0.24756313860416412, + "learning_rate": 0.0006, + "loss": 2.3068, + "step": 57980 + }, + { + "epoch": 0.21631118372462568, + "grad_norm": 0.2542846202850342, + "learning_rate": 0.0006, + "loss": 2.3196, + "step": 57990 + }, + { + "epoch": 0.21634848518758906, + "grad_norm": 0.2599810063838959, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 58000 + }, + { + "epoch": 0.21634848518758906, + "eval_valid_loss": 2.1817479133605957, + "eval_valid_loss/all": 2.0456578731536865, + "eval_valid_loss/end_span": 1.241840124130249, + "eval_valid_perplexity/batch": 7.7342448234558105, + "eval_valid_perplexity/end_span": 3.461977958679199, + "eval_valid_perplexity/fim": 2.4872188568115234, + "eval_valid_perplexity/first_seq": 15.103413581848145, + "eval_valid_perplexity/last_seq": 9.127253532409668, + "eval_valid_perplexity/second_seq": 14.212300300598145, + "eval_valid_perplexity/seq": 8.72376537322998, + 
"eval_valid_reconstruction/all": 0.2970758378505707, + "eval_valid_reconstruction/end_span": 0.7142070531845093, + "eval_valid_reconstruction/fim": 0.1844656616449356, + "eval_valid_reconstruction/first_seq": 0.16335618495941162, + "eval_valid_reconstruction/last_seq": 0.3162349462509155, + "eval_valid_reconstruction/second_seq": 0.18717451393604279, + "eval_valid_runtime": 444.6152, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 58000 + }, + { + "epoch": 0.21634848518758906, + "eval_train_loss": 2.181023120880127, + "eval_train_loss/all": 2.018791913986206, + "eval_train_loss/end_span": 1.2076579332351685, + "eval_train_perplexity/batch": 7.529223442077637, + "eval_train_perplexity/end_span": 3.345639705657959, + "eval_train_perplexity/fim": 1.9326716661453247, + "eval_train_perplexity/first_seq": 15.660412788391113, + "eval_train_perplexity/last_seq": 8.849384307861328, + "eval_train_perplexity/second_seq": 14.267004013061523, + "eval_train_perplexity/seq": 8.673973083496094, + "eval_train_reconstruction/all": 0.28605321049690247, + "eval_train_reconstruction/end_span": 0.7264649868011475, + "eval_train_reconstruction/fim": 0.13358648121356964, + "eval_train_reconstruction/first_seq": 0.15197442471981049, + "eval_train_reconstruction/last_seq": 0.328235387802124, + "eval_train_reconstruction/second_seq": 0.18597111105918884, + "eval_train_runtime": 439.8911, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 58000 + }, + { + "epoch": 0.21638578665055244, + "grad_norm": 0.24727553129196167, + "learning_rate": 0.0006, + "loss": 2.1722, + "step": 58010 + }, + { + "epoch": 0.21642308811351582, + "grad_norm": 2.343123197555542, + "learning_rate": 0.0006, + "loss": 2.1085, + "step": 58020 + }, + { + "epoch": 0.2164603895764792, + "grad_norm": 0.34990033507347107, + "learning_rate": 0.0006, + "loss": 2.1449, + "step": 58030 + }, + { + "epoch": 0.21649769103944258, + "grad_norm": 
0.2725059390068054, + "learning_rate": 0.0006, + "loss": 2.1452, + "step": 58040 + }, + { + "epoch": 0.21653499250240593, + "grad_norm": 0.33437687158584595, + "learning_rate": 0.0006, + "loss": 2.134, + "step": 58050 + }, + { + "epoch": 0.2165722939653693, + "grad_norm": 0.4534415602684021, + "learning_rate": 0.0006, + "loss": 2.1579, + "step": 58060 + }, + { + "epoch": 0.2166095954283327, + "grad_norm": 0.29756996035575867, + "learning_rate": 0.0006, + "loss": 2.2479, + "step": 58070 + }, + { + "epoch": 0.21664689689129607, + "grad_norm": 0.2945084869861603, + "learning_rate": 0.0006, + "loss": 2.3798, + "step": 58080 + }, + { + "epoch": 0.21668419835425945, + "grad_norm": 0.3666843771934509, + "learning_rate": 0.0006, + "loss": 2.3894, + "step": 58090 + }, + { + "epoch": 0.21672149981722283, + "grad_norm": 0.255823016166687, + "learning_rate": 0.0006, + "loss": 2.2991, + "step": 58100 + }, + { + "epoch": 0.2167588012801862, + "grad_norm": 0.2610294222831726, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 58110 + }, + { + "epoch": 0.2167961027431496, + "grad_norm": 0.5044459700584412, + "learning_rate": 0.0006, + "loss": 2.0606, + "step": 58120 + }, + { + "epoch": 0.21683340420611297, + "grad_norm": 0.3069612681865692, + "learning_rate": 0.0006, + "loss": 2.1277, + "step": 58130 + }, + { + "epoch": 0.21687070566907635, + "grad_norm": 0.33455803990364075, + "learning_rate": 0.0006, + "loss": 2.2824, + "step": 58140 + }, + { + "epoch": 0.21690800713203973, + "grad_norm": 0.4064924120903015, + "learning_rate": 0.0006, + "loss": 2.1817, + "step": 58150 + }, + { + "epoch": 0.2169453085950031, + "grad_norm": 0.21037711203098297, + "learning_rate": 0.0006, + "loss": 2.1791, + "step": 58160 + }, + { + "epoch": 0.21698261005796649, + "grad_norm": 0.4885220229625702, + "learning_rate": 0.0006, + "loss": 2.301, + "step": 58170 + }, + { + "epoch": 0.21701991152092986, + "grad_norm": 0.3344346582889557, + "learning_rate": 0.0006, + "loss": 2.2421, + "step": 58180 + }, 
+ { + "epoch": 0.21705721298389322, + "grad_norm": 0.2870485186576843, + "learning_rate": 0.0006, + "loss": 2.2619, + "step": 58190 + }, + { + "epoch": 0.2170945144468566, + "grad_norm": 0.39137473702430725, + "learning_rate": 0.0006, + "loss": 2.3342, + "step": 58200 + }, + { + "epoch": 0.21713181590981998, + "grad_norm": 0.39838844537734985, + "learning_rate": 0.0006, + "loss": 2.0818, + "step": 58210 + }, + { + "epoch": 0.21716911737278335, + "grad_norm": 0.41184043884277344, + "learning_rate": 0.0006, + "loss": 2.1127, + "step": 58220 + }, + { + "epoch": 0.21720641883574673, + "grad_norm": 0.4407745599746704, + "learning_rate": 0.0006, + "loss": 2.3013, + "step": 58230 + }, + { + "epoch": 0.2172437202987101, + "grad_norm": 0.24490675330162048, + "learning_rate": 0.0006, + "loss": 2.2424, + "step": 58240 + }, + { + "epoch": 0.2172810217616735, + "grad_norm": 0.29679012298583984, + "learning_rate": 0.0006, + "loss": 2.1894, + "step": 58250 + }, + { + "epoch": 0.2172810217616735, + "eval_valid_loss": 2.177027940750122, + "eval_valid_loss/all": 2.0414085388183594, + "eval_valid_loss/end_span": 1.270602822303772, + "eval_valid_perplexity/batch": 7.701449394226074, + "eval_valid_perplexity/end_span": 3.562999725341797, + "eval_valid_perplexity/fim": 2.237982749938965, + "eval_valid_perplexity/first_seq": 14.633174896240234, + "eval_valid_perplexity/last_seq": 8.782217979431152, + "eval_valid_perplexity/second_seq": 14.060208320617676, + "eval_valid_perplexity/seq": 8.685196876525879, + "eval_valid_reconstruction/all": 0.2981334626674652, + "eval_valid_reconstruction/end_span": 0.7018213272094727, + "eval_valid_reconstruction/fim": 0.16346624493598938, + "eval_valid_reconstruction/first_seq": 0.1716187298297882, + "eval_valid_reconstruction/last_seq": 0.33397185802459717, + "eval_valid_reconstruction/second_seq": 0.19104020297527313, + "eval_valid_runtime": 451.8866, + "eval_valid_samples_per_second": 0.425, + "eval_valid_steps_per_second": 0.425, + "step": 58250 + }, 
+ { + "epoch": 0.2172810217616735, + "eval_train_loss": 2.176727294921875, + "eval_train_loss/all": 2.0151901245117188, + "eval_train_loss/end_span": 1.2296106815338135, + "eval_train_perplexity/batch": 7.502153396606445, + "eval_train_perplexity/end_span": 3.4198977947235107, + "eval_train_perplexity/fim": 2.5327370166778564, + "eval_train_perplexity/first_seq": 15.620096206665039, + "eval_train_perplexity/last_seq": 8.741336822509766, + "eval_train_perplexity/second_seq": 14.258686065673828, + "eval_train_perplexity/seq": 8.643649101257324, + "eval_train_reconstruction/all": 0.2872227430343628, + "eval_train_reconstruction/end_span": 0.7149637341499329, + "eval_train_reconstruction/fim": 0.18922144174575806, + "eval_train_reconstruction/first_seq": 0.15025103092193604, + "eval_train_reconstruction/last_seq": 0.3337198495864868, + "eval_train_reconstruction/second_seq": 0.18110688030719757, + "eval_train_runtime": 442.4739, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 58250 + }, + { + "epoch": 0.21731832322463687, + "grad_norm": 0.3559796214103699, + "learning_rate": 0.0006, + "loss": 2.1363, + "step": 58260 + }, + { + "epoch": 0.21735562468760025, + "grad_norm": 0.3920062184333801, + "learning_rate": 0.0006, + "loss": 2.343, + "step": 58270 + }, + { + "epoch": 0.21739292615056363, + "grad_norm": 0.4106892943382263, + "learning_rate": 0.0006, + "loss": 2.2822, + "step": 58280 + }, + { + "epoch": 0.217430227613527, + "grad_norm": 0.45362889766693115, + "learning_rate": 0.0006, + "loss": 2.0376, + "step": 58290 + }, + { + "epoch": 0.2174675290764904, + "grad_norm": 0.40296363830566406, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 58300 + }, + { + "epoch": 0.21750483053945377, + "grad_norm": 0.26528653502464294, + "learning_rate": 0.0006, + "loss": 2.1641, + "step": 58310 + }, + { + "epoch": 0.21754213200241712, + "grad_norm": 0.3728531002998352, + "learning_rate": 0.0006, + "loss": 2.1972, + "step": 58320 + }, 
+ { + "epoch": 0.2175794334653805, + "grad_norm": 0.22007639706134796, + "learning_rate": 0.0006, + "loss": 2.1236, + "step": 58330 + }, + { + "epoch": 0.21761673492834388, + "grad_norm": 0.35251742601394653, + "learning_rate": 0.0006, + "loss": 2.1396, + "step": 58340 + }, + { + "epoch": 0.21765403639130726, + "grad_norm": 0.3452438414096832, + "learning_rate": 0.0006, + "loss": 2.1045, + "step": 58350 + }, + { + "epoch": 0.21769133785427064, + "grad_norm": 0.3276715874671936, + "learning_rate": 0.0006, + "loss": 2.1576, + "step": 58360 + }, + { + "epoch": 0.21772863931723402, + "grad_norm": 0.2896016538143158, + "learning_rate": 0.0006, + "loss": 2.1354, + "step": 58370 + }, + { + "epoch": 0.2177659407801974, + "grad_norm": 0.31545817852020264, + "learning_rate": 0.0006, + "loss": 1.9875, + "step": 58380 + }, + { + "epoch": 0.21780324224316078, + "grad_norm": 0.3607616126537323, + "learning_rate": 0.0006, + "loss": 2.2476, + "step": 58390 + }, + { + "epoch": 0.21784054370612416, + "grad_norm": 0.41189274191856384, + "learning_rate": 0.0006, + "loss": 2.1816, + "step": 58400 + }, + { + "epoch": 0.21787784516908754, + "grad_norm": 0.3248021900653839, + "learning_rate": 0.0006, + "loss": 2.3905, + "step": 58410 + }, + { + "epoch": 0.21791514663205092, + "grad_norm": 0.7761409878730774, + "learning_rate": 0.0006, + "loss": 2.3008, + "step": 58420 + }, + { + "epoch": 0.2179524480950143, + "grad_norm": 0.4374864995479584, + "learning_rate": 0.0006, + "loss": 2.276, + "step": 58430 + }, + { + "epoch": 0.21798974955797767, + "grad_norm": 0.3035765588283539, + "learning_rate": 0.0006, + "loss": 2.1204, + "step": 58440 + }, + { + "epoch": 0.21802705102094105, + "grad_norm": 0.52498859167099, + "learning_rate": 0.0006, + "loss": 2.2354, + "step": 58450 + }, + { + "epoch": 0.2180643524839044, + "grad_norm": 0.3275212049484253, + "learning_rate": 0.0006, + "loss": 2.204, + "step": 58460 + }, + { + "epoch": 0.21810165394686779, + "grad_norm": 0.30883848667144775, + 
"learning_rate": 0.0006, + "loss": 2.1772, + "step": 58470 + }, + { + "epoch": 0.21813895540983116, + "grad_norm": 0.26653608679771423, + "learning_rate": 0.0006, + "loss": 2.1484, + "step": 58480 + }, + { + "epoch": 0.21817625687279454, + "grad_norm": 0.32520556449890137, + "learning_rate": 0.0006, + "loss": 2.1006, + "step": 58490 + }, + { + "epoch": 0.21821355833575792, + "grad_norm": 0.4366457760334015, + "learning_rate": 0.0006, + "loss": 2.1647, + "step": 58500 + }, + { + "epoch": 0.21821355833575792, + "eval_valid_loss": 2.1784703731536865, + "eval_valid_loss/all": 2.042707681655884, + "eval_valid_loss/end_span": 1.16417396068573, + "eval_valid_perplexity/batch": 7.711461067199707, + "eval_valid_perplexity/end_span": 3.203275680541992, + "eval_valid_perplexity/fim": 2.380039691925049, + "eval_valid_perplexity/first_seq": 15.373053550720215, + "eval_valid_perplexity/last_seq": 8.81977367401123, + "eval_valid_perplexity/second_seq": 13.93860912322998, + "eval_valid_perplexity/seq": 8.698639869689941, + "eval_valid_reconstruction/all": 0.2980262041091919, + "eval_valid_reconstruction/end_span": 0.7190968990325928, + "eval_valid_reconstruction/fim": 0.17620234191417694, + "eval_valid_reconstruction/first_seq": 0.15617825090885162, + "eval_valid_reconstruction/last_seq": 0.33357807993888855, + "eval_valid_reconstruction/second_seq": 0.1903853565454483, + "eval_valid_runtime": 443.6792, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 58500 + }, + { + "epoch": 0.21821355833575792, + "eval_train_loss": 2.1772592067718506, + "eval_train_loss/all": 2.0154953002929688, + "eval_train_loss/end_span": 1.1427316665649414, + "eval_train_perplexity/batch": 7.504443645477295, + "eval_train_perplexity/end_span": 3.1353213787078857, + "eval_train_perplexity/fim": 1.994415044784546, + "eval_train_perplexity/first_seq": 15.661334991455078, + "eval_train_perplexity/last_seq": 8.939801216125488, + "eval_train_perplexity/second_seq": 
14.100112915039062, + "eval_train_perplexity/seq": 8.64138412475586, + "eval_train_reconstruction/all": 0.28724342584609985, + "eval_train_reconstruction/end_span": 0.7286121845245361, + "eval_train_reconstruction/fim": 0.1405717432498932, + "eval_train_reconstruction/first_seq": 0.1512015461921692, + "eval_train_reconstruction/last_seq": 0.3244621455669403, + "eval_train_reconstruction/second_seq": 0.18822036683559418, + "eval_train_runtime": 438.359, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 58500 + }, + { + "epoch": 0.2182508597987213, + "grad_norm": 0.2869076132774353, + "learning_rate": 0.0006, + "loss": 2.0316, + "step": 58510 + }, + { + "epoch": 0.21828816126168468, + "grad_norm": 0.31400465965270996, + "learning_rate": 0.0006, + "loss": 2.3362, + "step": 58520 + }, + { + "epoch": 0.21832546272464806, + "grad_norm": 0.23188528418540955, + "learning_rate": 0.0006, + "loss": 2.1402, + "step": 58530 + }, + { + "epoch": 0.21836276418761144, + "grad_norm": 0.586855411529541, + "learning_rate": 0.0006, + "loss": 1.8764, + "step": 58540 + }, + { + "epoch": 0.21840006565057482, + "grad_norm": 0.3800943195819855, + "learning_rate": 0.0006, + "loss": 2.2102, + "step": 58550 + }, + { + "epoch": 0.2184373671135382, + "grad_norm": 0.3591919243335724, + "learning_rate": 0.0006, + "loss": 2.2574, + "step": 58560 + }, + { + "epoch": 0.21847466857650158, + "grad_norm": 0.28977763652801514, + "learning_rate": 0.0006, + "loss": 2.2913, + "step": 58570 + }, + { + "epoch": 0.21851197003946496, + "grad_norm": 0.25234147906303406, + "learning_rate": 0.0006, + "loss": 2.1393, + "step": 58580 + }, + { + "epoch": 0.21854927150242834, + "grad_norm": 0.3992477357387543, + "learning_rate": 0.0006, + "loss": 2.2255, + "step": 58590 + }, + { + "epoch": 0.2185865729653917, + "grad_norm": 0.2888175845146179, + "learning_rate": 0.0006, + "loss": 2.3204, + "step": 58600 + }, + { + "epoch": 0.21862387442835507, + "grad_norm": 
0.28254321217536926, + "learning_rate": 0.0006, + "loss": 2.2372, + "step": 58610 + }, + { + "epoch": 0.21866117589131845, + "grad_norm": 0.4353562891483307, + "learning_rate": 0.0006, + "loss": 2.1904, + "step": 58620 + }, + { + "epoch": 0.21869847735428183, + "grad_norm": 0.38385507464408875, + "learning_rate": 0.0006, + "loss": 2.0764, + "step": 58630 + }, + { + "epoch": 0.2187357788172452, + "grad_norm": 0.34854656457901, + "learning_rate": 0.0006, + "loss": 2.0816, + "step": 58640 + }, + { + "epoch": 0.2187730802802086, + "grad_norm": 0.347678542137146, + "learning_rate": 0.0006, + "loss": 2.1811, + "step": 58650 + }, + { + "epoch": 0.21881038174317197, + "grad_norm": 0.31831690669059753, + "learning_rate": 0.0006, + "loss": 2.2672, + "step": 58660 + }, + { + "epoch": 0.21884768320613535, + "grad_norm": 0.26109179854393005, + "learning_rate": 0.0006, + "loss": 2.2006, + "step": 58670 + }, + { + "epoch": 0.21888498466909873, + "grad_norm": 0.3379114866256714, + "learning_rate": 0.0006, + "loss": 2.0218, + "step": 58680 + }, + { + "epoch": 0.2189222861320621, + "grad_norm": 0.33649277687072754, + "learning_rate": 0.0006, + "loss": 2.3513, + "step": 58690 + }, + { + "epoch": 0.21895958759502548, + "grad_norm": 0.3306572139263153, + "learning_rate": 0.0006, + "loss": 2.2519, + "step": 58700 + }, + { + "epoch": 0.21899688905798886, + "grad_norm": 0.3597598969936371, + "learning_rate": 0.0006, + "loss": 2.0096, + "step": 58710 + }, + { + "epoch": 0.21903419052095224, + "grad_norm": 0.4889586567878723, + "learning_rate": 0.0006, + "loss": 2.252, + "step": 58720 + }, + { + "epoch": 0.21907149198391562, + "grad_norm": 0.4256656765937805, + "learning_rate": 0.0006, + "loss": 2.2999, + "step": 58730 + }, + { + "epoch": 0.21910879344687897, + "grad_norm": 0.314176082611084, + "learning_rate": 0.0006, + "loss": 2.1524, + "step": 58740 + }, + { + "epoch": 0.21914609490984235, + "grad_norm": 0.3565950393676758, + "learning_rate": 0.0006, + "loss": 2.0909, + "step": 58750 + 
}, + { + "epoch": 0.21914609490984235, + "eval_valid_loss": 2.1793720722198486, + "eval_valid_loss/all": 2.044071674346924, + "eval_valid_loss/end_span": 1.2246270179748535, + "eval_valid_perplexity/batch": 7.721986770629883, + "eval_valid_perplexity/end_span": 3.4028966426849365, + "eval_valid_perplexity/fim": 2.1850392818450928, + "eval_valid_perplexity/first_seq": 14.724773406982422, + "eval_valid_perplexity/last_seq": 9.117027282714844, + "eval_valid_perplexity/second_seq": 13.684350967407227, + "eval_valid_perplexity/seq": 8.717021942138672, + "eval_valid_reconstruction/all": 0.2974758446216583, + "eval_valid_reconstruction/end_span": 0.7176272869110107, + "eval_valid_reconstruction/fim": 0.15890541672706604, + "eval_valid_reconstruction/first_seq": 0.1725829541683197, + "eval_valid_reconstruction/last_seq": 0.32252421975135803, + "eval_valid_reconstruction/second_seq": 0.19733478128910065, + "eval_valid_runtime": 444.1119, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 58750 + }, + { + "epoch": 0.21914609490984235, + "eval_train_loss": 2.177830934524536, + "eval_train_loss/all": 2.0162813663482666, + "eval_train_loss/end_span": 1.203768253326416, + "eval_train_perplexity/batch": 7.510344505310059, + "eval_train_perplexity/end_span": 3.3326516151428223, + "eval_train_perplexity/fim": 2.303170680999756, + "eval_train_perplexity/first_seq": 15.562545776367188, + "eval_train_perplexity/last_seq": 9.093364715576172, + "eval_train_perplexity/second_seq": 14.34765625, + "eval_train_perplexity/seq": 8.653096199035645, + "eval_train_reconstruction/all": 0.2869381010532379, + "eval_train_reconstruction/end_span": 0.7264702320098877, + "eval_train_reconstruction/fim": 0.1698242723941803, + "eval_train_reconstruction/first_seq": 0.15125398337841034, + "eval_train_reconstruction/last_seq": 0.32015591859817505, + "eval_train_reconstruction/second_seq": 0.18113404512405396, + "eval_train_runtime": 445.3042, + 
"eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 58750 + }, + { + "epoch": 0.21918339637280573, + "grad_norm": 0.41739290952682495, + "learning_rate": 0.0006, + "loss": 2.2599, + "step": 58760 + }, + { + "epoch": 0.2192206978357691, + "grad_norm": 0.41909512877464294, + "learning_rate": 0.0006, + "loss": 2.3399, + "step": 58770 + }, + { + "epoch": 0.2192579992987325, + "grad_norm": 0.29832056164741516, + "learning_rate": 0.0006, + "loss": 2.2222, + "step": 58780 + }, + { + "epoch": 0.21929530076169587, + "grad_norm": 0.19605706632137299, + "learning_rate": 0.0006, + "loss": 2.3947, + "step": 58790 + }, + { + "epoch": 0.21933260222465925, + "grad_norm": 0.2702596187591553, + "learning_rate": 0.0006, + "loss": 2.2999, + "step": 58800 + }, + { + "epoch": 0.21936990368762263, + "grad_norm": 0.32732656598091125, + "learning_rate": 0.0006, + "loss": 2.2351, + "step": 58810 + }, + { + "epoch": 0.219407205150586, + "grad_norm": 0.35603567957878113, + "learning_rate": 0.0006, + "loss": 2.2754, + "step": 58820 + }, + { + "epoch": 0.2194445066135494, + "grad_norm": 0.49805063009262085, + "learning_rate": 0.0006, + "loss": 2.2783, + "step": 58830 + }, + { + "epoch": 0.21948180807651277, + "grad_norm": 0.36027124524116516, + "learning_rate": 0.0006, + "loss": 2.2555, + "step": 58840 + }, + { + "epoch": 0.21951910953947615, + "grad_norm": 0.369676798582077, + "learning_rate": 0.0006, + "loss": 2.1953, + "step": 58850 + }, + { + "epoch": 0.21955641100243953, + "grad_norm": 0.333459734916687, + "learning_rate": 0.0006, + "loss": 2.1659, + "step": 58860 + }, + { + "epoch": 0.21959371246540288, + "grad_norm": 0.2923629879951477, + "learning_rate": 0.0006, + "loss": 2.3042, + "step": 58870 + }, + { + "epoch": 0.21963101392836626, + "grad_norm": 1.923524260520935, + "learning_rate": 0.0006, + "loss": 2.4034, + "step": 58880 + }, + { + "epoch": 0.21966831539132964, + "grad_norm": 0.3136419355869293, + "learning_rate": 0.0006, + "loss": 2.2859, + 
"step": 58890 + }, + { + "epoch": 0.21970561685429302, + "grad_norm": 0.309696227312088, + "learning_rate": 0.0006, + "loss": 2.0703, + "step": 58900 + }, + { + "epoch": 0.2197429183172564, + "grad_norm": 0.43153849244117737, + "learning_rate": 0.0006, + "loss": 2.1535, + "step": 58910 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 0.2744959890842438, + "learning_rate": 0.0006, + "loss": 2.2724, + "step": 58920 + }, + { + "epoch": 0.21981752124318316, + "grad_norm": 0.3402911126613617, + "learning_rate": 0.0006, + "loss": 2.2471, + "step": 58930 + }, + { + "epoch": 0.21985482270614654, + "grad_norm": 0.6220993399620056, + "learning_rate": 0.0006, + "loss": 2.2622, + "step": 58940 + }, + { + "epoch": 0.21989212416910991, + "grad_norm": 0.3145897388458252, + "learning_rate": 0.0006, + "loss": 2.2816, + "step": 58950 + }, + { + "epoch": 0.2199294256320733, + "grad_norm": 0.3084612190723419, + "learning_rate": 0.0006, + "loss": 2.0011, + "step": 58960 + }, + { + "epoch": 0.21996672709503667, + "grad_norm": 0.23119544982910156, + "learning_rate": 0.0006, + "loss": 2.1498, + "step": 58970 + }, + { + "epoch": 0.22000402855800005, + "grad_norm": 0.2907674014568329, + "learning_rate": 0.0006, + "loss": 2.1769, + "step": 58980 + }, + { + "epoch": 0.22004133002096343, + "grad_norm": 0.5807960033416748, + "learning_rate": 0.0006, + "loss": 2.1691, + "step": 58990 + }, + { + "epoch": 0.2200786314839268, + "grad_norm": 0.3240935802459717, + "learning_rate": 0.0006, + "loss": 2.3119, + "step": 59000 + }, + { + "epoch": 0.2200786314839268, + "eval_valid_loss": 2.181551456451416, + "eval_valid_loss/all": 2.04555082321167, + "eval_valid_loss/end_span": 1.276772141456604, + "eval_valid_perplexity/batch": 7.73341703414917, + "eval_valid_perplexity/end_span": 3.5850489139556885, + "eval_valid_perplexity/fim": 2.2906131744384766, + "eval_valid_perplexity/first_seq": 14.595321655273438, + "eval_valid_perplexity/last_seq": 9.16301155090332, + "eval_valid_perplexity/second_seq": 
14.04749584197998, + "eval_valid_perplexity/seq": 8.716475486755371, + "eval_valid_reconstruction/all": 0.2969684600830078, + "eval_valid_reconstruction/end_span": 0.7003291249275208, + "eval_valid_reconstruction/fim": 0.1662246584892273, + "eval_valid_reconstruction/first_seq": 0.17440921068191528, + "eval_valid_reconstruction/last_seq": 0.3199917674064636, + "eval_valid_reconstruction/second_seq": 0.1880263239145279, + "eval_valid_runtime": 441.1675, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 59000 + }, + { + "epoch": 0.2200786314839268, + "eval_train_loss": 2.179940938949585, + "eval_train_loss/all": 2.017529249191284, + "eval_train_loss/end_span": 1.243135929107666, + "eval_train_perplexity/batch": 7.5197224617004395, + "eval_train_perplexity/end_span": 3.4664671421051025, + "eval_train_perplexity/fim": 2.0968270301818848, + "eval_train_perplexity/first_seq": 15.119485855102539, + "eval_train_perplexity/last_seq": 8.885771751403809, + "eval_train_perplexity/second_seq": 14.128052711486816, + "eval_train_perplexity/seq": 8.655698776245117, + "eval_train_reconstruction/all": 0.28638699650764465, + "eval_train_reconstruction/end_span": 0.7095810770988464, + "eval_train_reconstruction/fim": 0.14831316471099854, + "eval_train_reconstruction/first_seq": 0.1572585105895996, + "eval_train_reconstruction/last_seq": 0.3297511041164398, + "eval_train_reconstruction/second_seq": 0.18518517911434174, + "eval_train_runtime": 442.7425, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 59000 + }, + { + "epoch": 0.22011593294689016, + "grad_norm": 0.24713939428329468, + "learning_rate": 0.0006, + "loss": 2.2861, + "step": 59010 + }, + { + "epoch": 0.22015323440985354, + "grad_norm": 0.23993618786334991, + "learning_rate": 0.0006, + "loss": 2.1112, + "step": 59020 + }, + { + "epoch": 0.22019053587281692, + "grad_norm": 0.28494590520858765, + "learning_rate": 0.0006, + "loss": 2.2506, + 
"step": 59030 + }, + { + "epoch": 0.2202278373357803, + "grad_norm": 0.3701571524143219, + "learning_rate": 0.0006, + "loss": 2.0765, + "step": 59040 + }, + { + "epoch": 0.22026513879874368, + "grad_norm": 0.49432700872421265, + "learning_rate": 0.0006, + "loss": 2.1912, + "step": 59050 + }, + { + "epoch": 0.22030244026170706, + "grad_norm": 0.40262600779533386, + "learning_rate": 0.0006, + "loss": 2.2878, + "step": 59060 + }, + { + "epoch": 0.22033974172467044, + "grad_norm": 0.6339056491851807, + "learning_rate": 0.0006, + "loss": 2.1917, + "step": 59070 + }, + { + "epoch": 0.22037704318763382, + "grad_norm": 0.37727880477905273, + "learning_rate": 0.0006, + "loss": 2.1762, + "step": 59080 + }, + { + "epoch": 0.2204143446505972, + "grad_norm": 0.29180026054382324, + "learning_rate": 0.0006, + "loss": 2.2627, + "step": 59090 + }, + { + "epoch": 0.22045164611356058, + "grad_norm": 0.2935919165611267, + "learning_rate": 0.0006, + "loss": 2.2408, + "step": 59100 + }, + { + "epoch": 0.22048894757652396, + "grad_norm": 0.5426825881004333, + "learning_rate": 0.0006, + "loss": 2.1295, + "step": 59110 + }, + { + "epoch": 0.22052624903948734, + "grad_norm": 0.2865634858608246, + "learning_rate": 0.0006, + "loss": 2.0858, + "step": 59120 + }, + { + "epoch": 0.22056355050245072, + "grad_norm": 0.23734399676322937, + "learning_rate": 0.0006, + "loss": 2.3613, + "step": 59130 + }, + { + "epoch": 0.2206008519654141, + "grad_norm": 0.3078114092350006, + "learning_rate": 0.0006, + "loss": 2.1823, + "step": 59140 + }, + { + "epoch": 0.22063815342837745, + "grad_norm": 0.29012152552604675, + "learning_rate": 0.0006, + "loss": 2.2684, + "step": 59150 + }, + { + "epoch": 0.22067545489134083, + "grad_norm": 0.24067901074886322, + "learning_rate": 0.0006, + "loss": 2.1146, + "step": 59160 + }, + { + "epoch": 0.2207127563543042, + "grad_norm": 0.3226079046726227, + "learning_rate": 0.0006, + "loss": 2.1786, + "step": 59170 + }, + { + "epoch": 0.2207500578172676, + "grad_norm": 
0.456787109375, + "learning_rate": 0.0006, + "loss": 2.1042, + "step": 59180 + }, + { + "epoch": 0.22078735928023097, + "grad_norm": 0.38887444138526917, + "learning_rate": 0.0006, + "loss": 2.1416, + "step": 59190 + }, + { + "epoch": 0.22082466074319435, + "grad_norm": 0.43044617772102356, + "learning_rate": 0.0006, + "loss": 2.0787, + "step": 59200 + }, + { + "epoch": 0.22086196220615772, + "grad_norm": 0.3960256278514862, + "learning_rate": 0.0006, + "loss": 2.2178, + "step": 59210 + }, + { + "epoch": 0.2208992636691211, + "grad_norm": 0.26860013604164124, + "learning_rate": 0.0006, + "loss": 2.3905, + "step": 59220 + }, + { + "epoch": 0.22093656513208448, + "grad_norm": 0.24637633562088013, + "learning_rate": 0.0006, + "loss": 2.0514, + "step": 59230 + }, + { + "epoch": 0.22097386659504786, + "grad_norm": 0.29226940870285034, + "learning_rate": 0.0006, + "loss": 2.2727, + "step": 59240 + }, + { + "epoch": 0.22101116805801124, + "grad_norm": 0.31409549713134766, + "learning_rate": 0.0006, + "loss": 2.1577, + "step": 59250 + }, + { + "epoch": 0.22101116805801124, + "eval_valid_loss": 2.1835453510284424, + "eval_valid_loss/all": 2.0479204654693604, + "eval_valid_loss/end_span": 1.2821903228759766, + "eval_valid_perplexity/batch": 7.751764297485352, + "eval_valid_perplexity/end_span": 3.6045260429382324, + "eval_valid_perplexity/fim": 2.313479423522949, + "eval_valid_perplexity/first_seq": 14.736053466796875, + "eval_valid_perplexity/last_seq": 8.90112590789795, + "eval_valid_perplexity/second_seq": 13.780312538146973, + "eval_valid_perplexity/seq": 8.754670143127441, + "eval_valid_reconstruction/all": 0.2965486943721771, + "eval_valid_reconstruction/end_span": 0.6983031630516052, + "eval_valid_reconstruction/fim": 0.16928431391716003, + "eval_valid_reconstruction/first_seq": 0.16993990540504456, + "eval_valid_reconstruction/last_seq": 0.33259117603302, + "eval_valid_reconstruction/second_seq": 0.19496935606002808, + "eval_valid_runtime": 445.8421, + 
"eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 59250 + }, + { + "epoch": 0.22101116805801124, + "eval_train_loss": 2.179866075515747, + "eval_train_loss/all": 2.018193483352661, + "eval_train_loss/end_span": 1.2475924491882324, + "eval_train_perplexity/batch": 7.52471923828125, + "eval_train_perplexity/end_span": 3.481949806213379, + "eval_train_perplexity/fim": 2.092484712600708, + "eval_train_perplexity/first_seq": 15.652873039245605, + "eval_train_perplexity/last_seq": 8.843036651611328, + "eval_train_perplexity/second_seq": 14.04942512512207, + "eval_train_perplexity/seq": 8.676251411437988, + "eval_train_reconstruction/all": 0.2864696979522705, + "eval_train_reconstruction/end_span": 0.7083134055137634, + "eval_train_reconstruction/fim": 0.14962337911128998, + "eval_train_reconstruction/first_seq": 0.15138322114944458, + "eval_train_reconstruction/last_seq": 0.33190488815307617, + "eval_train_reconstruction/second_seq": 0.18505844473838806, + "eval_train_runtime": 444.8118, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 59250 + }, + { + "epoch": 0.22104846952097462, + "grad_norm": 0.38679537177085876, + "learning_rate": 0.0006, + "loss": 2.158, + "step": 59260 + }, + { + "epoch": 0.221085770983938, + "grad_norm": 0.3025723397731781, + "learning_rate": 0.0006, + "loss": 2.277, + "step": 59270 + }, + { + "epoch": 0.22112307244690138, + "grad_norm": 0.32087287306785583, + "learning_rate": 0.0006, + "loss": 2.1673, + "step": 59280 + }, + { + "epoch": 0.22116037390986473, + "grad_norm": 0.3141511082649231, + "learning_rate": 0.0006, + "loss": 2.2656, + "step": 59290 + }, + { + "epoch": 0.2211976753728281, + "grad_norm": 0.3428746163845062, + "learning_rate": 0.0006, + "loss": 2.3277, + "step": 59300 + }, + { + "epoch": 0.2212349768357915, + "grad_norm": 0.25531765818595886, + "learning_rate": 0.0006, + "loss": 2.2626, + "step": 59310 + }, + { + "epoch": 0.22127227829875487, + 
"grad_norm": 0.23439225554466248, + "learning_rate": 0.0006, + "loss": 2.2244, + "step": 59320 + }, + { + "epoch": 0.22130957976171825, + "grad_norm": 0.2608374357223511, + "learning_rate": 0.0006, + "loss": 2.165, + "step": 59330 + }, + { + "epoch": 0.22134688122468163, + "grad_norm": 0.327511727809906, + "learning_rate": 0.0006, + "loss": 2.3261, + "step": 59340 + }, + { + "epoch": 0.221384182687645, + "grad_norm": 0.4535297751426697, + "learning_rate": 0.0006, + "loss": 2.1963, + "step": 59350 + }, + { + "epoch": 0.2214214841506084, + "grad_norm": 0.3163894712924957, + "learning_rate": 0.0006, + "loss": 2.196, + "step": 59360 + }, + { + "epoch": 0.22145878561357177, + "grad_norm": 0.34255340695381165, + "learning_rate": 0.0006, + "loss": 2.2969, + "step": 59370 + }, + { + "epoch": 0.22149608707653515, + "grad_norm": 0.41709446907043457, + "learning_rate": 0.0006, + "loss": 2.242, + "step": 59380 + }, + { + "epoch": 0.22153338853949853, + "grad_norm": 0.25307416915893555, + "learning_rate": 0.0006, + "loss": 2.1743, + "step": 59390 + }, + { + "epoch": 0.2215706900024619, + "grad_norm": 0.2841636836528778, + "learning_rate": 0.0006, + "loss": 2.3211, + "step": 59400 + }, + { + "epoch": 0.22160799146542529, + "grad_norm": 0.31171292066574097, + "learning_rate": 0.0006, + "loss": 2.3669, + "step": 59410 + }, + { + "epoch": 0.22164529292838867, + "grad_norm": 0.3659217059612274, + "learning_rate": 0.0006, + "loss": 2.3426, + "step": 59420 + }, + { + "epoch": 0.22168259439135202, + "grad_norm": 0.2937544584274292, + "learning_rate": 0.0006, + "loss": 2.0617, + "step": 59430 + }, + { + "epoch": 0.2217198958543154, + "grad_norm": 0.5713806748390198, + "learning_rate": 0.0006, + "loss": 2.2276, + "step": 59440 + }, + { + "epoch": 0.22175719731727878, + "grad_norm": 0.38096746802330017, + "learning_rate": 0.0006, + "loss": 2.2077, + "step": 59450 + }, + { + "epoch": 0.22179449878024216, + "grad_norm": 0.4460981488227844, + "learning_rate": 0.0006, + "loss": 2.145, + 
"step": 59460 + }, + { + "epoch": 0.22183180024320553, + "grad_norm": 0.6464139819145203, + "learning_rate": 0.0006, + "loss": 2.3844, + "step": 59470 + }, + { + "epoch": 0.22186910170616891, + "grad_norm": 0.3293011784553528, + "learning_rate": 0.0006, + "loss": 2.2045, + "step": 59480 + }, + { + "epoch": 0.2219064031691323, + "grad_norm": 0.30113258957862854, + "learning_rate": 0.0006, + "loss": 2.2609, + "step": 59490 + }, + { + "epoch": 0.22194370463209567, + "grad_norm": 0.8643463253974915, + "learning_rate": 0.0006, + "loss": 1.9479, + "step": 59500 + }, + { + "epoch": 0.22194370463209567, + "eval_valid_loss": 2.179067373275757, + "eval_valid_loss/all": 2.0435285568237305, + "eval_valid_loss/end_span": 1.2099641561508179, + "eval_valid_perplexity/batch": 7.717793941497803, + "eval_valid_perplexity/end_span": 3.3533644676208496, + "eval_valid_perplexity/fim": 2.1685633659362793, + "eval_valid_perplexity/first_seq": 14.686615943908691, + "eval_valid_perplexity/last_seq": 8.758177757263184, + "eval_valid_perplexity/second_seq": 13.638903617858887, + "eval_valid_perplexity/seq": 8.709811210632324, + "eval_valid_reconstruction/all": 0.29759401082992554, + "eval_valid_reconstruction/end_span": 0.7206265926361084, + "eval_valid_reconstruction/fim": 0.15816162526607513, + "eval_valid_reconstruction/first_seq": 0.1717272847890854, + "eval_valid_reconstruction/last_seq": 0.3355565071105957, + "eval_valid_reconstruction/second_seq": 0.1989295482635498, + "eval_valid_runtime": 443.8577, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 59500 + }, + { + "epoch": 0.22194370463209567, + "eval_train_loss": 2.1761763095855713, + "eval_train_loss/all": 2.0146172046661377, + "eval_train_loss/end_span": 1.1713054180145264, + "eval_train_perplexity/batch": 7.497856616973877, + "eval_train_perplexity/end_span": 3.2262015342712402, + "eval_train_perplexity/fim": 2.2329602241516113, + "eval_train_perplexity/first_seq": 15.526777267456055, + 
"eval_train_perplexity/last_seq": 8.413858413696289, + "eval_train_perplexity/second_seq": 14.086389541625977, + "eval_train_perplexity/seq": 8.640364646911621, + "eval_train_reconstruction/all": 0.28724539279937744, + "eval_train_reconstruction/end_span": 0.7325689196586609, + "eval_train_reconstruction/fim": 0.16312894225120544, + "eval_train_reconstruction/first_seq": 0.1539163440465927, + "eval_train_reconstruction/last_seq": 0.34436675906181335, + "eval_train_reconstruction/second_seq": 0.18817909061908722, + "eval_train_runtime": 440.3467, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 59500 + }, + { + "epoch": 0.22198100609505905, + "grad_norm": 0.3715599477291107, + "learning_rate": 0.0006, + "loss": 2.0811, + "step": 59510 + }, + { + "epoch": 0.22201830755802243, + "grad_norm": 0.3578355312347412, + "learning_rate": 0.0006, + "loss": 2.3712, + "step": 59520 + }, + { + "epoch": 0.2220556090209858, + "grad_norm": 0.21212559938430786, + "learning_rate": 0.0006, + "loss": 2.2981, + "step": 59530 + }, + { + "epoch": 0.2220929104839492, + "grad_norm": 0.40988627076148987, + "learning_rate": 0.0006, + "loss": 2.118, + "step": 59540 + }, + { + "epoch": 0.22213021194691257, + "grad_norm": 0.3366442918777466, + "learning_rate": 0.0006, + "loss": 2.2463, + "step": 59550 + }, + { + "epoch": 0.22216751340987592, + "grad_norm": 0.3596676290035248, + "learning_rate": 0.0006, + "loss": 2.1812, + "step": 59560 + }, + { + "epoch": 0.2222048148728393, + "grad_norm": 0.3603481352329254, + "learning_rate": 0.0006, + "loss": 2.2178, + "step": 59570 + }, + { + "epoch": 0.22224211633580268, + "grad_norm": 0.6816163063049316, + "learning_rate": 0.0006, + "loss": 1.9639, + "step": 59580 + }, + { + "epoch": 0.22227941779876606, + "grad_norm": 0.36146122217178345, + "learning_rate": 0.0006, + "loss": 2.1258, + "step": 59590 + }, + { + "epoch": 0.22231671926172944, + "grad_norm": 0.2512730062007904, + "learning_rate": 0.0006, + "loss": 
2.2787, + "step": 59600 + }, + { + "epoch": 0.22235402072469282, + "grad_norm": 0.33759868144989014, + "learning_rate": 0.0006, + "loss": 2.1714, + "step": 59610 + }, + { + "epoch": 0.2223913221876562, + "grad_norm": 0.35111942887306213, + "learning_rate": 0.0006, + "loss": 2.1358, + "step": 59620 + }, + { + "epoch": 0.22242862365061958, + "grad_norm": 0.4104025363922119, + "learning_rate": 0.0006, + "loss": 2.1654, + "step": 59630 + }, + { + "epoch": 0.22246592511358296, + "grad_norm": 0.3693980276584625, + "learning_rate": 0.0006, + "loss": 2.329, + "step": 59640 + }, + { + "epoch": 0.22250322657654634, + "grad_norm": 0.25924959778785706, + "learning_rate": 0.0006, + "loss": 2.1621, + "step": 59650 + }, + { + "epoch": 0.22254052803950972, + "grad_norm": 0.33966583013534546, + "learning_rate": 0.0006, + "loss": 2.1399, + "step": 59660 + }, + { + "epoch": 0.2225778295024731, + "grad_norm": 0.33723539113998413, + "learning_rate": 0.0006, + "loss": 2.156, + "step": 59670 + }, + { + "epoch": 0.22261513096543648, + "grad_norm": 0.7403684854507446, + "learning_rate": 0.0006, + "loss": 2.1999, + "step": 59680 + }, + { + "epoch": 0.22265243242839985, + "grad_norm": 0.2931486666202545, + "learning_rate": 0.0006, + "loss": 2.066, + "step": 59690 + }, + { + "epoch": 0.2226897338913632, + "grad_norm": 0.24629414081573486, + "learning_rate": 0.0006, + "loss": 2.2917, + "step": 59700 + }, + { + "epoch": 0.22272703535432659, + "grad_norm": 0.3354191482067108, + "learning_rate": 0.0006, + "loss": 2.1799, + "step": 59710 + }, + { + "epoch": 0.22276433681728997, + "grad_norm": 0.3920144736766815, + "learning_rate": 0.0006, + "loss": 2.0358, + "step": 59720 + }, + { + "epoch": 0.22280163828025334, + "grad_norm": 0.460401713848114, + "learning_rate": 0.0006, + "loss": 2.2397, + "step": 59730 + }, + { + "epoch": 0.22283893974321672, + "grad_norm": 0.35929715633392334, + "learning_rate": 0.0006, + "loss": 2.1264, + "step": 59740 + }, + { + "epoch": 0.2228762412061801, + "grad_norm": 
0.3856881558895111, + "learning_rate": 0.0006, + "loss": 2.2606, + "step": 59750 + }, + { + "epoch": 0.2228762412061801, + "eval_valid_loss": 2.180669069290161, + "eval_valid_loss/all": 2.0447447299957275, + "eval_valid_loss/end_span": 1.2482548952102661, + "eval_valid_perplexity/batch": 7.7271857261657715, + "eval_valid_perplexity/end_span": 3.484257221221924, + "eval_valid_perplexity/fim": 2.3930444717407227, + "eval_valid_perplexity/first_seq": 15.040492057800293, + "eval_valid_perplexity/last_seq": 8.797697067260742, + "eval_valid_perplexity/second_seq": 13.549946784973145, + "eval_valid_perplexity/seq": 8.717806816101074, + "eval_valid_reconstruction/all": 0.2971544563770294, + "eval_valid_reconstruction/end_span": 0.7121357917785645, + "eval_valid_reconstruction/fim": 0.17643095552921295, + "eval_valid_reconstruction/first_seq": 0.16651186347007751, + "eval_valid_reconstruction/last_seq": 0.3332233726978302, + "eval_valid_reconstruction/second_seq": 0.20409534871578217, + "eval_valid_runtime": 438.6069, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 59750 + }, + { + "epoch": 0.2228762412061801, + "eval_train_loss": 2.176892042160034, + "eval_train_loss/all": 2.015169620513916, + "eval_train_loss/end_span": 1.2060309648513794, + "eval_train_perplexity/batch": 7.501999855041504, + "eval_train_perplexity/end_span": 3.340200901031494, + "eval_train_perplexity/fim": 2.0477330684661865, + "eval_train_perplexity/first_seq": 15.856293678283691, + "eval_train_perplexity/last_seq": 8.628440856933594, + "eval_train_perplexity/second_seq": 14.249187469482422, + "eval_train_perplexity/seq": 8.639348030090332, + "eval_train_reconstruction/all": 0.28693974018096924, + "eval_train_reconstruction/end_span": 0.7248731851577759, + "eval_train_reconstruction/fim": 0.1460539996623993, + "eval_train_reconstruction/first_seq": 0.14718060195446014, + "eval_train_reconstruction/last_seq": 0.3381993770599365, + 
"eval_train_reconstruction/second_seq": 0.18209917843341827, + "eval_train_runtime": 439.0427, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 59750 + }, + { + "epoch": 0.22291354266914348, + "grad_norm": 0.34046095609664917, + "learning_rate": 0.0006, + "loss": 2.0775, + "step": 59760 + }, + { + "epoch": 0.22295084413210686, + "grad_norm": 0.3584985136985779, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 59770 + }, + { + "epoch": 0.22298814559507024, + "grad_norm": 0.26020753383636475, + "learning_rate": 0.0006, + "loss": 2.1986, + "step": 59780 + }, + { + "epoch": 0.22302544705803362, + "grad_norm": 0.33238735795021057, + "learning_rate": 0.0006, + "loss": 2.3025, + "step": 59790 + }, + { + "epoch": 0.223062748520997, + "grad_norm": 0.27017518877983093, + "learning_rate": 0.0006, + "loss": 2.242, + "step": 59800 + }, + { + "epoch": 0.22310004998396038, + "grad_norm": 0.22363673150539398, + "learning_rate": 0.0006, + "loss": 2.3792, + "step": 59810 + }, + { + "epoch": 0.22313735144692376, + "grad_norm": 0.3537141680717468, + "learning_rate": 0.0006, + "loss": 2.3197, + "step": 59820 + }, + { + "epoch": 0.22317465290988714, + "grad_norm": 0.39177629351615906, + "learning_rate": 0.0006, + "loss": 2.1276, + "step": 59830 + }, + { + "epoch": 0.2232119543728505, + "grad_norm": 0.3539277911186218, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 59840 + }, + { + "epoch": 0.22324925583581387, + "grad_norm": 0.3167603313922882, + "learning_rate": 0.0006, + "loss": 2.1955, + "step": 59850 + }, + { + "epoch": 0.22328655729877725, + "grad_norm": 0.45725253224372864, + "learning_rate": 0.0006, + "loss": 2.1552, + "step": 59860 + }, + { + "epoch": 0.22332385876174063, + "grad_norm": 0.3575396239757538, + "learning_rate": 0.0006, + "loss": 2.0856, + "step": 59870 + }, + { + "epoch": 0.223361160224704, + "grad_norm": 0.31242579221725464, + "learning_rate": 0.0006, + "loss": 2.1444, + "step": 59880 + }, + { + "epoch": 
0.2233984616876674, + "grad_norm": 0.24359701573848724, + "learning_rate": 0.0006, + "loss": 2.346, + "step": 59890 + }, + { + "epoch": 0.22343576315063077, + "grad_norm": 0.35575342178344727, + "learning_rate": 0.0006, + "loss": 1.9945, + "step": 59900 + }, + { + "epoch": 0.22347306461359415, + "grad_norm": 0.3536507487297058, + "learning_rate": 0.0006, + "loss": 2.0366, + "step": 59910 + }, + { + "epoch": 0.22351036607655753, + "grad_norm": 0.34965354204177856, + "learning_rate": 0.0006, + "loss": 2.1508, + "step": 59920 + }, + { + "epoch": 0.2235476675395209, + "grad_norm": 0.34153205156326294, + "learning_rate": 0.0006, + "loss": 2.2062, + "step": 59930 + }, + { + "epoch": 0.22358496900248429, + "grad_norm": 0.2802770733833313, + "learning_rate": 0.0006, + "loss": 2.1239, + "step": 59940 + }, + { + "epoch": 0.22362227046544766, + "grad_norm": 0.39758554100990295, + "learning_rate": 0.0006, + "loss": 2.233, + "step": 59950 + }, + { + "epoch": 0.22365957192841104, + "grad_norm": 0.33091771602630615, + "learning_rate": 0.0006, + "loss": 2.2788, + "step": 59960 + }, + { + "epoch": 0.22369687339137442, + "grad_norm": 0.45378491282463074, + "learning_rate": 0.0006, + "loss": 2.106, + "step": 59970 + }, + { + "epoch": 0.22373417485433778, + "grad_norm": 0.31733211874961853, + "learning_rate": 0.0006, + "loss": 2.2883, + "step": 59980 + }, + { + "epoch": 0.22377147631730115, + "grad_norm": 0.33575841784477234, + "learning_rate": 0.0006, + "loss": 2.1749, + "step": 59990 + }, + { + "epoch": 0.22380877778026453, + "grad_norm": 0.4069451689720154, + "learning_rate": 0.0006, + "loss": 2.1551, + "step": 60000 + }, + { + "epoch": 0.22380877778026453, + "eval_valid_loss": 2.1759655475616455, + "eval_valid_loss/all": 2.0404961109161377, + "eval_valid_loss/end_span": 1.2437355518341064, + "eval_valid_perplexity/batch": 7.694425582885742, + "eval_valid_perplexity/end_span": 3.468546152114868, + "eval_valid_perplexity/fim": 2.2408692836761475, + "eval_valid_perplexity/first_seq": 
15.114136695861816, + "eval_valid_perplexity/last_seq": 8.273929595947266, + "eval_valid_perplexity/second_seq": 13.529535293579102, + "eval_valid_perplexity/seq": 8.682639122009277, + "eval_valid_reconstruction/all": 0.29818758368492126, + "eval_valid_reconstruction/end_span": 0.7105174660682678, + "eval_valid_reconstruction/fim": 0.16355352103710175, + "eval_valid_reconstruction/first_seq": 0.16178762912750244, + "eval_valid_reconstruction/last_seq": 0.35762766003608704, + "eval_valid_reconstruction/second_seq": 0.20129424333572388, + "eval_valid_runtime": 440.5378, + "eval_valid_samples_per_second": 0.436, + "eval_valid_steps_per_second": 0.436, + "step": 60000 + }, + { + "epoch": 0.22380877778026453, + "eval_train_loss": 2.173905849456787, + "eval_train_loss/all": 2.012575149536133, + "eval_train_loss/end_span": 1.2128517627716064, + "eval_train_perplexity/batch": 7.482561111450195, + "eval_train_perplexity/end_span": 3.3630616664886475, + "eval_train_perplexity/fim": 1.9744839668273926, + "eval_train_perplexity/first_seq": 15.513237953186035, + "eval_train_perplexity/last_seq": 9.260007858276367, + "eval_train_perplexity/second_seq": 14.290112495422363, + "eval_train_perplexity/seq": 8.619925498962402, + "eval_train_reconstruction/all": 0.2876565456390381, + "eval_train_reconstruction/end_span": 0.7209842205047607, + "eval_train_reconstruction/fim": 0.13876979053020477, + "eval_train_reconstruction/first_seq": 0.15218135714530945, + "eval_train_reconstruction/last_seq": 0.3170412480831146, + "eval_train_reconstruction/second_seq": 0.18269889056682587, + "eval_train_runtime": 442.6829, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 60000 + }, + { + "epoch": 0.2238460792432279, + "grad_norm": 0.5540803670883179, + "learning_rate": 0.0006, + "loss": 2.1772, + "step": 60010 + }, + { + "epoch": 0.2238833807061913, + "grad_norm": 0.3152272701263428, + "learning_rate": 0.0006, + "loss": 2.2145, + "step": 60020 + }, + { + 
"epoch": 0.22392068216915467, + "grad_norm": 0.27836573123931885, + "learning_rate": 0.0006, + "loss": 2.2682, + "step": 60030 + }, + { + "epoch": 0.22395798363211805, + "grad_norm": 0.3964206576347351, + "learning_rate": 0.0006, + "loss": 2.0663, + "step": 60040 + }, + { + "epoch": 0.22399528509508143, + "grad_norm": 0.38473132252693176, + "learning_rate": 0.0006, + "loss": 2.2025, + "step": 60050 + }, + { + "epoch": 0.2240325865580448, + "grad_norm": 0.4385097324848175, + "learning_rate": 0.0006, + "loss": 2.2325, + "step": 60060 + }, + { + "epoch": 0.2240698880210082, + "grad_norm": 0.392406702041626, + "learning_rate": 0.0006, + "loss": 2.3773, + "step": 60070 + }, + { + "epoch": 0.22410718948397157, + "grad_norm": 0.2963588535785675, + "learning_rate": 0.0006, + "loss": 2.2042, + "step": 60080 + }, + { + "epoch": 0.22414449094693495, + "grad_norm": 0.508838951587677, + "learning_rate": 0.0006, + "loss": 2.2208, + "step": 60090 + }, + { + "epoch": 0.22418179240989833, + "grad_norm": 0.26328492164611816, + "learning_rate": 0.0006, + "loss": 2.1943, + "step": 60100 + }, + { + "epoch": 0.22421909387286168, + "grad_norm": 0.358167827129364, + "learning_rate": 0.0006, + "loss": 2.1689, + "step": 60110 + }, + { + "epoch": 0.22425639533582506, + "grad_norm": 0.2704303562641144, + "learning_rate": 0.0006, + "loss": 2.3668, + "step": 60120 + }, + { + "epoch": 0.22429369679878844, + "grad_norm": 0.3795493543148041, + "learning_rate": 0.0006, + "loss": 2.1062, + "step": 60130 + }, + { + "epoch": 0.22433099826175182, + "grad_norm": 0.34529295563697815, + "learning_rate": 0.0006, + "loss": 2.2723, + "step": 60140 + }, + { + "epoch": 0.2243682997247152, + "grad_norm": 0.31779831647872925, + "learning_rate": 0.0006, + "loss": 2.0299, + "step": 60150 + }, + { + "epoch": 0.22440560118767858, + "grad_norm": 0.28744208812713623, + "learning_rate": 0.0006, + "loss": 2.1071, + "step": 60160 + }, + { + "epoch": 0.22444290265064196, + "grad_norm": 0.3513859212398529, + 
"learning_rate": 0.0006, + "loss": 2.2512, + "step": 60170 + }, + { + "epoch": 0.22448020411360534, + "grad_norm": 0.3391997516155243, + "learning_rate": 0.0006, + "loss": 2.12, + "step": 60180 + }, + { + "epoch": 0.22451750557656872, + "grad_norm": 0.32269176840782166, + "learning_rate": 0.0006, + "loss": 2.2786, + "step": 60190 + }, + { + "epoch": 0.2245548070395321, + "grad_norm": 0.3315141201019287, + "learning_rate": 0.0006, + "loss": 2.2093, + "step": 60200 + }, + { + "epoch": 0.22459210850249547, + "grad_norm": 0.3638896644115448, + "learning_rate": 0.0006, + "loss": 2.1556, + "step": 60210 + }, + { + "epoch": 0.22462940996545885, + "grad_norm": 0.41211384534835815, + "learning_rate": 0.0006, + "loss": 2.3151, + "step": 60220 + }, + { + "epoch": 0.22466671142842223, + "grad_norm": 0.24839462339878082, + "learning_rate": 0.0006, + "loss": 2.2874, + "step": 60230 + }, + { + "epoch": 0.2247040128913856, + "grad_norm": 0.33220773935317993, + "learning_rate": 0.0006, + "loss": 2.2065, + "step": 60240 + }, + { + "epoch": 0.22474131435434896, + "grad_norm": 0.28005585074424744, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 60250 + }, + { + "epoch": 0.22474131435434896, + "eval_valid_loss": 2.1804983615875244, + "eval_valid_loss/all": 2.0451064109802246, + "eval_valid_loss/end_span": 1.2626723051071167, + "eval_valid_perplexity/batch": 7.729980945587158, + "eval_valid_perplexity/end_span": 3.5348551273345947, + "eval_valid_perplexity/fim": 2.35001277923584, + "eval_valid_perplexity/first_seq": 14.901179313659668, + "eval_valid_perplexity/last_seq": 8.821609497070312, + "eval_valid_perplexity/second_seq": 13.91525650024414, + "eval_valid_perplexity/seq": 8.723347663879395, + "eval_valid_reconstruction/all": 0.2969713509082794, + "eval_valid_reconstruction/end_span": 0.7039353847503662, + "eval_valid_reconstruction/fim": 0.17297953367233276, + "eval_valid_reconstruction/first_seq": 0.16678103804588318, + "eval_valid_reconstruction/last_seq": 
0.333324670791626, + "eval_valid_reconstruction/second_seq": 0.19207772612571716, + "eval_valid_runtime": 442.9427, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 60250 + }, + { + "epoch": 0.22474131435434896, + "eval_train_loss": 2.178054094314575, + "eval_train_loss/all": 2.016451597213745, + "eval_train_loss/end_span": 1.225301742553711, + "eval_train_perplexity/batch": 7.511623382568359, + "eval_train_perplexity/end_span": 3.405193328857422, + "eval_train_perplexity/fim": 1.906447172164917, + "eval_train_perplexity/first_seq": 15.556591033935547, + "eval_train_perplexity/last_seq": 9.170248031616211, + "eval_train_perplexity/second_seq": 14.310104370117188, + "eval_train_perplexity/seq": 8.650652885437012, + "eval_train_reconstruction/all": 0.28654584288597107, + "eval_train_reconstruction/end_span": 0.7135195136070251, + "eval_train_reconstruction/fim": 0.13042646646499634, + "eval_train_reconstruction/first_seq": 0.15225613117218018, + "eval_train_reconstruction/last_seq": 0.32217124104499817, + "eval_train_reconstruction/second_seq": 0.1811511367559433, + "eval_train_runtime": 444.7528, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 60250 + }, + { + "epoch": 0.22477861581731234, + "grad_norm": 0.43281567096710205, + "learning_rate": 0.0006, + "loss": 2.2814, + "step": 60260 + }, + { + "epoch": 0.22481591728027572, + "grad_norm": 0.3839688003063202, + "learning_rate": 0.0006, + "loss": 2.168, + "step": 60270 + }, + { + "epoch": 0.2248532187432391, + "grad_norm": 0.3300624489784241, + "learning_rate": 0.0006, + "loss": 2.0443, + "step": 60280 + }, + { + "epoch": 0.22489052020620248, + "grad_norm": 0.4253198206424713, + "learning_rate": 0.0006, + "loss": 2.195, + "step": 60290 + }, + { + "epoch": 0.22492782166916586, + "grad_norm": 0.528233528137207, + "learning_rate": 0.0006, + "loss": 2.2225, + "step": 60300 + }, + { + "epoch": 0.22496512313212924, + "grad_norm": 
0.5541827082633972, + "learning_rate": 0.0006, + "loss": 2.2703, + "step": 60310 + }, + { + "epoch": 0.22500242459509262, + "grad_norm": 0.45079874992370605, + "learning_rate": 0.0006, + "loss": 2.273, + "step": 60320 + }, + { + "epoch": 0.225039726058056, + "grad_norm": 0.38102054595947266, + "learning_rate": 0.0006, + "loss": 2.1898, + "step": 60330 + }, + { + "epoch": 0.22507702752101938, + "grad_norm": 0.3955475687980652, + "learning_rate": 0.0006, + "loss": 2.2281, + "step": 60340 + }, + { + "epoch": 0.22511432898398276, + "grad_norm": 0.33458060026168823, + "learning_rate": 0.0006, + "loss": 2.122, + "step": 60350 + }, + { + "epoch": 0.22515163044694614, + "grad_norm": 0.2660609483718872, + "learning_rate": 0.0006, + "loss": 2.2016, + "step": 60360 + }, + { + "epoch": 0.22518893190990952, + "grad_norm": 0.453977108001709, + "learning_rate": 0.0006, + "loss": 2.0562, + "step": 60370 + }, + { + "epoch": 0.2252262333728729, + "grad_norm": 0.3720099627971649, + "learning_rate": 0.0006, + "loss": 2.3133, + "step": 60380 + }, + { + "epoch": 0.22526353483583625, + "grad_norm": 0.22203843295574188, + "learning_rate": 0.0006, + "loss": 2.116, + "step": 60390 + }, + { + "epoch": 0.22530083629879963, + "grad_norm": 0.43814507126808167, + "learning_rate": 0.0006, + "loss": 2.046, + "step": 60400 + }, + { + "epoch": 0.225338137761763, + "grad_norm": 0.294930636882782, + "learning_rate": 0.0006, + "loss": 2.1753, + "step": 60410 + }, + { + "epoch": 0.2253754392247264, + "grad_norm": 0.3611011505126953, + "learning_rate": 0.0006, + "loss": 2.1803, + "step": 60420 + }, + { + "epoch": 0.22541274068768977, + "grad_norm": 0.2562084197998047, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 60430 + }, + { + "epoch": 0.22545004215065315, + "grad_norm": 0.28355810046195984, + "learning_rate": 0.0006, + "loss": 2.3367, + "step": 60440 + }, + { + "epoch": 0.22548734361361653, + "grad_norm": 0.2603522539138794, + "learning_rate": 0.0006, + "loss": 2.135, + "step": 60450 + }, + 
{ + "epoch": 0.2255246450765799, + "grad_norm": 0.40100377798080444, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 60460 + }, + { + "epoch": 0.22556194653954328, + "grad_norm": 0.4974694848060608, + "learning_rate": 0.0006, + "loss": 2.1663, + "step": 60470 + }, + { + "epoch": 0.22559924800250666, + "grad_norm": 0.3275262117385864, + "learning_rate": 0.0006, + "loss": 2.1515, + "step": 60480 + }, + { + "epoch": 0.22563654946547004, + "grad_norm": 0.33098745346069336, + "learning_rate": 0.0006, + "loss": 2.1904, + "step": 60490 + }, + { + "epoch": 0.22567385092843342, + "grad_norm": 0.44011175632476807, + "learning_rate": 0.0006, + "loss": 2.1139, + "step": 60500 + }, + { + "epoch": 0.22567385092843342, + "eval_valid_loss": 2.1778171062469482, + "eval_valid_loss/all": 2.0422518253326416, + "eval_valid_loss/end_span": 1.2017607688903809, + "eval_valid_perplexity/batch": 7.70794677734375, + "eval_valid_perplexity/end_span": 3.325968027114868, + "eval_valid_perplexity/fim": 2.2407774925231934, + "eval_valid_perplexity/first_seq": 14.980579376220703, + "eval_valid_perplexity/last_seq": 8.959681510925293, + "eval_valid_perplexity/second_seq": 13.843092918395996, + "eval_valid_perplexity/seq": 8.698481559753418, + "eval_valid_reconstruction/all": 0.29796192049980164, + "eval_valid_reconstruction/end_span": 0.7238022089004517, + "eval_valid_reconstruction/fim": 0.16377292573451996, + "eval_valid_reconstruction/first_seq": 0.16765613853931427, + "eval_valid_reconstruction/last_seq": 0.329443097114563, + "eval_valid_reconstruction/second_seq": 0.19756793975830078, + "eval_valid_runtime": 442.1032, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 60500 + }, + { + "epoch": 0.22567385092843342, + "eval_train_loss": 2.176223039627075, + "eval_train_loss/all": 2.0146613121032715, + "eval_train_loss/end_span": 1.1695396900177002, + "eval_train_perplexity/batch": 7.49818754196167, + "eval_train_perplexity/end_span": 
3.2205097675323486, + "eval_train_perplexity/fim": 1.9782450199127197, + "eval_train_perplexity/first_seq": 15.342142105102539, + "eval_train_perplexity/last_seq": 8.759490013122559, + "eval_train_perplexity/second_seq": 14.385636329650879, + "eval_train_perplexity/seq": 8.635860443115234, + "eval_train_reconstruction/all": 0.2873104214668274, + "eval_train_reconstruction/end_span": 0.734854519367218, + "eval_train_reconstruction/fim": 0.1386716067790985, + "eval_train_reconstruction/first_seq": 0.15527485311031342, + "eval_train_reconstruction/last_seq": 0.3329981565475464, + "eval_train_reconstruction/second_seq": 0.17946600914001465, + "eval_train_runtime": 443.3431, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 60500 + }, + { + "epoch": 0.2257111523913968, + "grad_norm": 0.4060854911804199, + "learning_rate": 0.0006, + "loss": 2.1881, + "step": 60510 + }, + { + "epoch": 0.22574845385436018, + "grad_norm": 0.3309633433818817, + "learning_rate": 0.0006, + "loss": 2.3329, + "step": 60520 + }, + { + "epoch": 0.22578575531732353, + "grad_norm": 0.44397303462028503, + "learning_rate": 0.0006, + "loss": 2.0578, + "step": 60530 + }, + { + "epoch": 0.2258230567802869, + "grad_norm": 0.42786189913749695, + "learning_rate": 0.0006, + "loss": 2.1182, + "step": 60540 + }, + { + "epoch": 0.2258603582432503, + "grad_norm": 0.3892337381839752, + "learning_rate": 0.0006, + "loss": 1.9717, + "step": 60550 + }, + { + "epoch": 0.22589765970621367, + "grad_norm": 0.29205313324928284, + "learning_rate": 0.0006, + "loss": 2.1353, + "step": 60560 + }, + { + "epoch": 0.22593496116917705, + "grad_norm": 0.4090615212917328, + "learning_rate": 0.0006, + "loss": 2.1635, + "step": 60570 + }, + { + "epoch": 0.22597226263214043, + "grad_norm": 0.25204604864120483, + "learning_rate": 0.0006, + "loss": 2.3341, + "step": 60580 + }, + { + "epoch": 0.2260095640951038, + "grad_norm": 0.42941275238990784, + "learning_rate": 0.0006, + "loss": 2.1682, + 
"step": 60590 + }, + { + "epoch": 0.2260468655580672, + "grad_norm": 0.4300697147846222, + "learning_rate": 0.0006, + "loss": 2.1724, + "step": 60600 + }, + { + "epoch": 0.22608416702103057, + "grad_norm": 0.49970442056655884, + "learning_rate": 0.0006, + "loss": 2.2281, + "step": 60610 + }, + { + "epoch": 0.22612146848399395, + "grad_norm": 0.3379232883453369, + "learning_rate": 0.0006, + "loss": 2.1791, + "step": 60620 + }, + { + "epoch": 0.22615876994695733, + "grad_norm": 0.31675654649734497, + "learning_rate": 0.0006, + "loss": 2.1243, + "step": 60630 + }, + { + "epoch": 0.2261960714099207, + "grad_norm": 0.364261656999588, + "learning_rate": 0.0006, + "loss": 2.0287, + "step": 60640 + }, + { + "epoch": 0.2262333728728841, + "grad_norm": 0.8743095993995667, + "learning_rate": 0.0006, + "loss": 2.2793, + "step": 60650 + }, + { + "epoch": 0.22627067433584744, + "grad_norm": 0.6176365613937378, + "learning_rate": 0.0006, + "loss": 2.0835, + "step": 60660 + }, + { + "epoch": 0.22630797579881082, + "grad_norm": 0.30975914001464844, + "learning_rate": 0.0006, + "loss": 2.2683, + "step": 60670 + }, + { + "epoch": 0.2263452772617742, + "grad_norm": 0.41699621081352234, + "learning_rate": 0.0006, + "loss": 1.9936, + "step": 60680 + }, + { + "epoch": 0.22638257872473758, + "grad_norm": 0.3304349482059479, + "learning_rate": 0.0006, + "loss": 2.2884, + "step": 60690 + }, + { + "epoch": 0.22641988018770096, + "grad_norm": 0.39587175846099854, + "learning_rate": 0.0006, + "loss": 2.2098, + "step": 60700 + }, + { + "epoch": 0.22645718165066434, + "grad_norm": 0.24868449568748474, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 60710 + }, + { + "epoch": 0.22649448311362771, + "grad_norm": 0.5193856954574585, + "learning_rate": 0.0006, + "loss": 2.1513, + "step": 60720 + }, + { + "epoch": 0.2265317845765911, + "grad_norm": 0.27429062128067017, + "learning_rate": 0.0006, + "loss": 2.3445, + "step": 60730 + }, + { + "epoch": 0.22656908603955447, + "grad_norm": 
0.416689395904541, + "learning_rate": 0.0006, + "loss": 2.0947, + "step": 60740 + }, + { + "epoch": 0.22660638750251785, + "grad_norm": 0.26051053404808044, + "learning_rate": 0.0006, + "loss": 2.143, + "step": 60750 + }, + { + "epoch": 0.22660638750251785, + "eval_valid_loss": 2.178703546524048, + "eval_valid_loss/all": 2.042694091796875, + "eval_valid_loss/end_span": 1.1927074193954468, + "eval_valid_perplexity/batch": 7.711356163024902, + "eval_valid_perplexity/end_span": 3.295992851257324, + "eval_valid_perplexity/fim": 2.3133625984191895, + "eval_valid_perplexity/first_seq": 14.7952241897583, + "eval_valid_perplexity/last_seq": 8.610740661621094, + "eval_valid_perplexity/second_seq": 14.399303436279297, + "eval_valid_perplexity/seq": 8.697707176208496, + "eval_valid_reconstruction/all": 0.2978208065032959, + "eval_valid_reconstruction/end_span": 0.7247962951660156, + "eval_valid_reconstruction/fim": 0.16998690366744995, + "eval_valid_reconstruction/first_seq": 0.16992823779582977, + "eval_valid_reconstruction/last_seq": 0.33822041749954224, + "eval_valid_reconstruction/second_seq": 0.18150769174098969, + "eval_valid_runtime": 445.4515, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 60750 + }, + { + "epoch": 0.22660638750251785, + "eval_train_loss": 2.1767051219940186, + "eval_train_loss/all": 2.014596462249756, + "eval_train_loss/end_span": 1.14840567111969, + "eval_train_perplexity/batch": 7.497701168060303, + "eval_train_perplexity/end_span": 3.1531617641448975, + "eval_train_perplexity/fim": 1.945475459098816, + "eval_train_perplexity/first_seq": 15.774635314941406, + "eval_train_perplexity/last_seq": 8.549749374389648, + "eval_train_perplexity/second_seq": 14.228577613830566, + "eval_train_perplexity/seq": 8.630703926086426, + "eval_train_reconstruction/all": 0.2874455153942108, + "eval_train_reconstruction/end_span": 0.7373431324958801, + "eval_train_reconstruction/fim": 0.13517555594444275, + 
"eval_train_reconstruction/first_seq": 0.1437789499759674, + "eval_train_reconstruction/last_seq": 0.3417348563671112, + "eval_train_reconstruction/second_seq": 0.18587619066238403, + "eval_train_runtime": 445.6598, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 60750 + }, + { + "epoch": 0.22664368896548123, + "grad_norm": 0.24726448953151703, + "learning_rate": 0.0006, + "loss": 2.3224, + "step": 60760 + }, + { + "epoch": 0.2266809904284446, + "grad_norm": 0.36041128635406494, + "learning_rate": 0.0006, + "loss": 2.3422, + "step": 60770 + }, + { + "epoch": 0.226718291891408, + "grad_norm": 0.43343213200569153, + "learning_rate": 0.0006, + "loss": 2.2952, + "step": 60780 + }, + { + "epoch": 0.22675559335437137, + "grad_norm": 0.4234311580657959, + "learning_rate": 0.0006, + "loss": 2.2549, + "step": 60790 + }, + { + "epoch": 0.22679289481733472, + "grad_norm": 0.3553224802017212, + "learning_rate": 0.0006, + "loss": 2.2693, + "step": 60800 + }, + { + "epoch": 0.2268301962802981, + "grad_norm": 0.3415451645851135, + "learning_rate": 0.0006, + "loss": 2.1616, + "step": 60810 + }, + { + "epoch": 0.22686749774326148, + "grad_norm": 0.4874408543109894, + "learning_rate": 0.0006, + "loss": 2.1158, + "step": 60820 + }, + { + "epoch": 0.22690479920622486, + "grad_norm": 0.4131222665309906, + "learning_rate": 0.0006, + "loss": 2.3151, + "step": 60830 + }, + { + "epoch": 0.22694210066918824, + "grad_norm": 0.4321015179157257, + "learning_rate": 0.0006, + "loss": 2.0532, + "step": 60840 + }, + { + "epoch": 0.22697940213215162, + "grad_norm": 0.3628830015659332, + "learning_rate": 0.0006, + "loss": 2.377, + "step": 60850 + }, + { + "epoch": 0.227016703595115, + "grad_norm": 0.395230770111084, + "learning_rate": 0.0006, + "loss": 2.3206, + "step": 60860 + }, + { + "epoch": 0.22705400505807838, + "grad_norm": 0.3159671127796173, + "learning_rate": 0.0006, + "loss": 2.2118, + "step": 60870 + }, + { + "epoch": 0.22709130652104176, + 
"grad_norm": 0.3520965874195099, + "learning_rate": 0.0006, + "loss": 2.1484, + "step": 60880 + }, + { + "epoch": 0.22712860798400514, + "grad_norm": 0.4024384617805481, + "learning_rate": 0.0006, + "loss": 2.3563, + "step": 60890 + }, + { + "epoch": 0.22716590944696852, + "grad_norm": 0.8154388070106506, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 60900 + }, + { + "epoch": 0.2272032109099319, + "grad_norm": 0.46555545926094055, + "learning_rate": 0.0006, + "loss": 2.2289, + "step": 60910 + }, + { + "epoch": 0.22724051237289528, + "grad_norm": 0.6242625117301941, + "learning_rate": 0.0006, + "loss": 2.1634, + "step": 60920 + }, + { + "epoch": 0.22727781383585866, + "grad_norm": 0.3453178107738495, + "learning_rate": 0.0006, + "loss": 2.3087, + "step": 60930 + }, + { + "epoch": 0.227315115298822, + "grad_norm": 0.27761125564575195, + "learning_rate": 0.0006, + "loss": 2.2497, + "step": 60940 + }, + { + "epoch": 0.2273524167617854, + "grad_norm": 0.37474194169044495, + "learning_rate": 0.0006, + "loss": 2.2236, + "step": 60950 + }, + { + "epoch": 0.22738971822474877, + "grad_norm": 0.29081082344055176, + "learning_rate": 0.0006, + "loss": 2.2519, + "step": 60960 + }, + { + "epoch": 0.22742701968771215, + "grad_norm": 0.30079591274261475, + "learning_rate": 0.0006, + "loss": 2.2611, + "step": 60970 + }, + { + "epoch": 0.22746432115067552, + "grad_norm": 0.3279067277908325, + "learning_rate": 0.0006, + "loss": 2.1616, + "step": 60980 + }, + { + "epoch": 0.2275016226136389, + "grad_norm": 0.3101675808429718, + "learning_rate": 0.0006, + "loss": 2.2113, + "step": 60990 + }, + { + "epoch": 0.22753892407660228, + "grad_norm": 0.3001912832260132, + "learning_rate": 0.0006, + "loss": 2.22, + "step": 61000 + }, + { + "epoch": 0.22753892407660228, + "eval_valid_loss": 2.1739957332611084, + "eval_valid_loss/all": 2.038588523864746, + "eval_valid_loss/end_span": 1.226662278175354, + "eval_valid_perplexity/batch": 7.67976188659668, + "eval_valid_perplexity/end_span": 
3.4098293781280518, + "eval_valid_perplexity/fim": 2.234630823135376, + "eval_valid_perplexity/first_seq": 14.962311744689941, + "eval_valid_perplexity/last_seq": 8.911016464233398, + "eval_valid_perplexity/second_seq": 13.875713348388672, + "eval_valid_perplexity/seq": 8.661330223083496, + "eval_valid_reconstruction/all": 0.2990168631076813, + "eval_valid_reconstruction/end_span": 0.7097976207733154, + "eval_valid_reconstruction/fim": 0.16390466690063477, + "eval_valid_reconstruction/first_seq": 0.16788502037525177, + "eval_valid_reconstruction/last_seq": 0.3290615379810333, + "eval_valid_reconstruction/second_seq": 0.19183172285556793, + "eval_valid_runtime": 447.6142, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 61000 + }, + { + "epoch": 0.22753892407660228, + "eval_train_loss": 2.1727101802825928, + "eval_train_loss/all": 2.01143741607666, + "eval_train_loss/end_span": 1.1907950639724731, + "eval_train_perplexity/batch": 7.474052906036377, + "eval_train_perplexity/end_span": 3.2896957397460938, + "eval_train_perplexity/fim": 2.023470878601074, + "eval_train_perplexity/first_seq": 15.601456642150879, + "eval_train_perplexity/last_seq": 8.611823081970215, + "eval_train_perplexity/second_seq": 14.215851783752441, + "eval_train_perplexity/seq": 8.604960441589355, + "eval_train_reconstruction/all": 0.2883053421974182, + "eval_train_reconstruction/end_span": 0.7206069231033325, + "eval_train_reconstruction/fim": 0.14367057383060455, + "eval_train_reconstruction/first_seq": 0.1469297707080841, + "eval_train_reconstruction/last_seq": 0.337948203086853, + "eval_train_reconstruction/second_seq": 0.18123263120651245, + "eval_train_runtime": 441.6128, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 61000 + }, + { + "epoch": 0.22757622553956566, + "grad_norm": 0.27474743127822876, + "learning_rate": 0.0006, + "loss": 2.3759, + "step": 61010 + }, + { + "epoch": 0.22761352700252904, + 
"grad_norm": 0.31646478176116943, + "learning_rate": 0.0006, + "loss": 2.196, + "step": 61020 + }, + { + "epoch": 0.22765082846549242, + "grad_norm": 0.24248002469539642, + "learning_rate": 0.0006, + "loss": 2.2787, + "step": 61030 + }, + { + "epoch": 0.2276881299284558, + "grad_norm": 0.28518593311309814, + "learning_rate": 0.0006, + "loss": 2.3379, + "step": 61040 + }, + { + "epoch": 0.22772543139141918, + "grad_norm": 0.2553219497203827, + "learning_rate": 0.0006, + "loss": 2.3123, + "step": 61050 + }, + { + "epoch": 0.22776273285438256, + "grad_norm": 0.24684850871562958, + "learning_rate": 0.0006, + "loss": 2.2049, + "step": 61060 + }, + { + "epoch": 0.22780003431734594, + "grad_norm": 0.3279916048049927, + "learning_rate": 0.0006, + "loss": 2.3407, + "step": 61070 + }, + { + "epoch": 0.2278373357803093, + "grad_norm": 0.6909355521202087, + "learning_rate": 0.0006, + "loss": 2.2584, + "step": 61080 + }, + { + "epoch": 0.22787463724327267, + "grad_norm": 0.3222281336784363, + "learning_rate": 0.0006, + "loss": 2.0581, + "step": 61090 + }, + { + "epoch": 0.22791193870623605, + "grad_norm": 0.4365600049495697, + "learning_rate": 0.0006, + "loss": 2.264, + "step": 61100 + }, + { + "epoch": 0.22794924016919943, + "grad_norm": 0.25954851508140564, + "learning_rate": 0.0006, + "loss": 2.2105, + "step": 61110 + }, + { + "epoch": 0.2279865416321628, + "grad_norm": 0.3500085175037384, + "learning_rate": 0.0006, + "loss": 2.3167, + "step": 61120 + }, + { + "epoch": 0.2280238430951262, + "grad_norm": 0.42747777700424194, + "learning_rate": 0.0006, + "loss": 2.259, + "step": 61130 + }, + { + "epoch": 0.22806114455808957, + "grad_norm": 0.6745630502700806, + "learning_rate": 0.0006, + "loss": 2.1965, + "step": 61140 + }, + { + "epoch": 0.22809844602105295, + "grad_norm": 0.35712891817092896, + "learning_rate": 0.0006, + "loss": 2.2184, + "step": 61150 + }, + { + "epoch": 0.22813574748401633, + "grad_norm": 0.47870299220085144, + "learning_rate": 0.0006, + "loss": 2.2004, + 
"step": 61160 + }, + { + "epoch": 0.2281730489469797, + "grad_norm": 0.4862626791000366, + "learning_rate": 0.0006, + "loss": 2.2742, + "step": 61170 + }, + { + "epoch": 0.22821035040994309, + "grad_norm": 0.30477607250213623, + "learning_rate": 0.0006, + "loss": 2.1361, + "step": 61180 + }, + { + "epoch": 0.22824765187290647, + "grad_norm": 0.3558554947376251, + "learning_rate": 0.0006, + "loss": 2.3033, + "step": 61190 + }, + { + "epoch": 0.22828495333586984, + "grad_norm": 0.36496502161026, + "learning_rate": 0.0006, + "loss": 2.1101, + "step": 61200 + }, + { + "epoch": 0.22832225479883322, + "grad_norm": 0.3404332995414734, + "learning_rate": 0.0006, + "loss": 2.0928, + "step": 61210 + }, + { + "epoch": 0.22835955626179658, + "grad_norm": 1.1043641567230225, + "learning_rate": 0.0006, + "loss": 2.1191, + "step": 61220 + }, + { + "epoch": 0.22839685772475996, + "grad_norm": 0.2927128076553345, + "learning_rate": 0.0006, + "loss": 2.2292, + "step": 61230 + }, + { + "epoch": 0.22843415918772333, + "grad_norm": 0.412607342004776, + "learning_rate": 0.0006, + "loss": 2.185, + "step": 61240 + }, + { + "epoch": 0.22847146065068671, + "grad_norm": 0.7497919797897339, + "learning_rate": 0.0006, + "loss": 2.3104, + "step": 61250 + }, + { + "epoch": 0.22847146065068671, + "eval_valid_loss": 2.1776554584503174, + "eval_valid_loss/all": 2.041555166244507, + "eval_valid_loss/end_span": 1.2577447891235352, + "eval_valid_perplexity/batch": 7.702578544616699, + "eval_valid_perplexity/end_span": 3.51747989654541, + "eval_valid_perplexity/fim": 2.2242343425750732, + "eval_valid_perplexity/first_seq": 15.008418083190918, + "eval_valid_perplexity/last_seq": 8.639691352844238, + "eval_valid_perplexity/second_seq": 13.674155235290527, + "eval_valid_perplexity/seq": 8.685529708862305, + "eval_valid_reconstruction/all": 0.2982351779937744, + "eval_valid_reconstruction/end_span": 0.7053247690200806, + "eval_valid_reconstruction/fim": 0.16356578469276428, + 
"eval_valid_reconstruction/first_seq": 0.16387951374053955, + "eval_valid_reconstruction/last_seq": 0.339092493057251, + "eval_valid_reconstruction/second_seq": 0.19692353904247284, + "eval_valid_runtime": 442.8492, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 61250 + }, + { + "epoch": 0.22847146065068671, + "eval_train_loss": 2.1775801181793213, + "eval_train_loss/all": 2.0156729221343994, + "eval_train_loss/end_span": 1.2263072729110718, + "eval_train_perplexity/batch": 7.505776405334473, + "eval_train_perplexity/end_span": 3.4086191654205322, + "eval_train_perplexity/fim": 2.1164276599884033, + "eval_train_perplexity/first_seq": 15.517976760864258, + "eval_train_perplexity/last_seq": 8.582953453063965, + "eval_train_perplexity/second_seq": 13.677713394165039, + "eval_train_perplexity/seq": 8.63979434967041, + "eval_train_reconstruction/all": 0.28703829646110535, + "eval_train_reconstruction/end_span": 0.7159687280654907, + "eval_train_reconstruction/fim": 0.15265409648418427, + "eval_train_reconstruction/first_seq": 0.1527516096830368, + "eval_train_reconstruction/last_seq": 0.3373570740222931, + "eval_train_reconstruction/second_seq": 0.19554109871387482, + "eval_train_runtime": 441.2977, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 61250 + }, + { + "epoch": 0.2285087621136501, + "grad_norm": 0.36577361822128296, + "learning_rate": 0.0006, + "loss": 2.3375, + "step": 61260 + }, + { + "epoch": 0.22854606357661347, + "grad_norm": 0.367436021566391, + "learning_rate": 0.0006, + "loss": 2.3049, + "step": 61270 + }, + { + "epoch": 0.22858336503957685, + "grad_norm": 0.24086526036262512, + "learning_rate": 0.0006, + "loss": 2.162, + "step": 61280 + }, + { + "epoch": 0.22862066650254023, + "grad_norm": 0.28579744696617126, + "learning_rate": 0.0006, + "loss": 2.0606, + "step": 61290 + }, + { + "epoch": 0.2286579679655036, + "grad_norm": 0.4032588601112366, + "learning_rate": 
0.0006, + "loss": 2.0638, + "step": 61300 + }, + { + "epoch": 0.228695269428467, + "grad_norm": 0.30233514308929443, + "learning_rate": 0.0006, + "loss": 2.227, + "step": 61310 + }, + { + "epoch": 0.22873257089143037, + "grad_norm": 0.24958862364292145, + "learning_rate": 0.0006, + "loss": 2.2635, + "step": 61320 + }, + { + "epoch": 0.22876987235439375, + "grad_norm": 0.35025814175605774, + "learning_rate": 0.0006, + "loss": 2.379, + "step": 61330 + }, + { + "epoch": 0.22880717381735713, + "grad_norm": 0.5312410593032837, + "learning_rate": 0.0006, + "loss": 2.3121, + "step": 61340 + }, + { + "epoch": 0.22884447528032048, + "grad_norm": 0.332935094833374, + "learning_rate": 0.0006, + "loss": 2.0583, + "step": 61350 + }, + { + "epoch": 0.22888177674328386, + "grad_norm": 0.3473617434501648, + "learning_rate": 0.0006, + "loss": 2.2055, + "step": 61360 + }, + { + "epoch": 0.22891907820624724, + "grad_norm": 0.2159401923418045, + "learning_rate": 0.0006, + "loss": 2.3416, + "step": 61370 + }, + { + "epoch": 0.22895637966921062, + "grad_norm": 0.36905840039253235, + "learning_rate": 0.0006, + "loss": 2.331, + "step": 61380 + }, + { + "epoch": 0.228993681132174, + "grad_norm": 0.5784881114959717, + "learning_rate": 0.0006, + "loss": 2.2119, + "step": 61390 + }, + { + "epoch": 0.22903098259513738, + "grad_norm": 0.2901619076728821, + "learning_rate": 0.0006, + "loss": 2.2144, + "step": 61400 + }, + { + "epoch": 0.22906828405810076, + "grad_norm": 0.2266281694173813, + "learning_rate": 0.0006, + "loss": 2.0946, + "step": 61410 + }, + { + "epoch": 0.22910558552106414, + "grad_norm": 0.37199822068214417, + "learning_rate": 0.0006, + "loss": 2.3665, + "step": 61420 + }, + { + "epoch": 0.22914288698402752, + "grad_norm": 0.2355591058731079, + "learning_rate": 0.0006, + "loss": 2.2224, + "step": 61430 + }, + { + "epoch": 0.2291801884469909, + "grad_norm": 0.22676345705986023, + "learning_rate": 0.0006, + "loss": 2.3394, + "step": 61440 + }, + { + "epoch": 0.22921748990995428, + 
"grad_norm": 0.35574498772621155, + "learning_rate": 0.0006, + "loss": 2.0217, + "step": 61450 + }, + { + "epoch": 0.22925479137291765, + "grad_norm": 0.2645534574985504, + "learning_rate": 0.0006, + "loss": 2.2237, + "step": 61460 + }, + { + "epoch": 0.22929209283588103, + "grad_norm": 0.35465171933174133, + "learning_rate": 0.0006, + "loss": 2.2303, + "step": 61470 + }, + { + "epoch": 0.2293293942988444, + "grad_norm": 0.28558218479156494, + "learning_rate": 0.0006, + "loss": 2.3074, + "step": 61480 + }, + { + "epoch": 0.22936669576180777, + "grad_norm": 0.2755430340766907, + "learning_rate": 0.0006, + "loss": 2.3289, + "step": 61490 + }, + { + "epoch": 0.22940399722477114, + "grad_norm": 0.28476089239120483, + "learning_rate": 0.0006, + "loss": 2.2083, + "step": 61500 + }, + { + "epoch": 0.22940399722477114, + "eval_valid_loss": 2.174046754837036, + "eval_valid_loss/all": 2.0385725498199463, + "eval_valid_loss/end_span": 1.2011806964874268, + "eval_valid_perplexity/batch": 7.679638862609863, + "eval_valid_perplexity/end_span": 3.3240392208099365, + "eval_valid_perplexity/fim": 2.5727367401123047, + "eval_valid_perplexity/first_seq": 14.8839693069458, + "eval_valid_perplexity/last_seq": 8.643770217895508, + "eval_valid_perplexity/second_seq": 13.732804298400879, + "eval_valid_perplexity/seq": 8.660368919372559, + "eval_valid_reconstruction/all": 0.29899686574935913, + "eval_valid_reconstruction/end_span": 0.7172021865844727, + "eval_valid_reconstruction/fim": 0.19365787506103516, + "eval_valid_reconstruction/first_seq": 0.16531719267368317, + "eval_valid_reconstruction/last_seq": 0.337368369102478, + "eval_valid_reconstruction/second_seq": 0.19441205263137817, + "eval_valid_runtime": 445.9006, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 61500 + }, + { + "epoch": 0.22940399722477114, + "eval_train_loss": 2.1739768981933594, + "eval_train_loss/all": 2.0123419761657715, + "eval_train_loss/end_span": 1.1621712446212769, 
+ "eval_train_perplexity/batch": 7.480816841125488, + "eval_train_perplexity/end_span": 3.196866989135742, + "eval_train_perplexity/fim": 2.0189268589019775, + "eval_train_perplexity/first_seq": 15.351835250854492, + "eval_train_perplexity/last_seq": 8.981771469116211, + "eval_train_perplexity/second_seq": 14.071080207824707, + "eval_train_perplexity/seq": 8.610031127929688, + "eval_train_reconstruction/all": 0.28778743743896484, + "eval_train_reconstruction/end_span": 0.7292189002037048, + "eval_train_reconstruction/fim": 0.1430705189704895, + "eval_train_reconstruction/first_seq": 0.15442487597465515, + "eval_train_reconstruction/last_seq": 0.32635435461997986, + "eval_train_reconstruction/second_seq": 0.18645435571670532, + "eval_train_runtime": 439.2559, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 61500 + }, + { + "epoch": 0.22944129868773452, + "grad_norm": 0.5693792700767517, + "learning_rate": 0.0006, + "loss": 2.1189, + "step": 61510 + }, + { + "epoch": 0.2294786001506979, + "grad_norm": 0.2951396703720093, + "learning_rate": 0.0006, + "loss": 2.2536, + "step": 61520 + }, + { + "epoch": 0.22951590161366128, + "grad_norm": 0.5885015726089478, + "learning_rate": 0.0006, + "loss": 2.2882, + "step": 61530 + }, + { + "epoch": 0.22955320307662466, + "grad_norm": 0.31543174386024475, + "learning_rate": 0.0006, + "loss": 2.2407, + "step": 61540 + }, + { + "epoch": 0.22959050453958804, + "grad_norm": 0.38847842812538147, + "learning_rate": 0.0006, + "loss": 2.1475, + "step": 61550 + }, + { + "epoch": 0.22962780600255142, + "grad_norm": 0.40877851843833923, + "learning_rate": 0.0006, + "loss": 2.163, + "step": 61560 + }, + { + "epoch": 0.2296651074655148, + "grad_norm": 0.5398955345153809, + "learning_rate": 0.0006, + "loss": 2.3119, + "step": 61570 + }, + { + "epoch": 0.22970240892847818, + "grad_norm": 0.2806704640388489, + "learning_rate": 0.0006, + "loss": 2.1909, + "step": 61580 + }, + { + "epoch": 
0.22973971039144156, + "grad_norm": 0.2586823105812073, + "learning_rate": 0.0006, + "loss": 2.376, + "step": 61590 + }, + { + "epoch": 0.22977701185440494, + "grad_norm": 0.298851877450943, + "learning_rate": 0.0006, + "loss": 2.047, + "step": 61600 + }, + { + "epoch": 0.22981431331736832, + "grad_norm": 0.30468738079071045, + "learning_rate": 0.0006, + "loss": 2.1043, + "step": 61610 + }, + { + "epoch": 0.2298516147803317, + "grad_norm": 0.5651707053184509, + "learning_rate": 0.0006, + "loss": 2.3025, + "step": 61620 + }, + { + "epoch": 0.22988891624329505, + "grad_norm": 0.30753251910209656, + "learning_rate": 0.0006, + "loss": 2.3713, + "step": 61630 + }, + { + "epoch": 0.22992621770625843, + "grad_norm": 0.32950156927108765, + "learning_rate": 0.0006, + "loss": 2.2438, + "step": 61640 + }, + { + "epoch": 0.2299635191692218, + "grad_norm": 0.27913087606430054, + "learning_rate": 0.0006, + "loss": 2.3133, + "step": 61650 + }, + { + "epoch": 0.2300008206321852, + "grad_norm": 0.3698449730873108, + "learning_rate": 0.0006, + "loss": 2.085, + "step": 61660 + }, + { + "epoch": 0.23003812209514857, + "grad_norm": 0.2351548969745636, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 61670 + }, + { + "epoch": 0.23007542355811195, + "grad_norm": 0.3355870246887207, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 61680 + }, + { + "epoch": 0.23011272502107533, + "grad_norm": 0.21865278482437134, + "learning_rate": 0.0006, + "loss": 2.2379, + "step": 61690 + }, + { + "epoch": 0.2301500264840387, + "grad_norm": 0.2645665407180786, + "learning_rate": 0.0006, + "loss": 2.3479, + "step": 61700 + }, + { + "epoch": 0.23018732794700208, + "grad_norm": 0.2726399004459381, + "learning_rate": 0.0006, + "loss": 2.2718, + "step": 61710 + }, + { + "epoch": 0.23022462940996546, + "grad_norm": 0.19782483577728271, + "learning_rate": 0.0006, + "loss": 2.3899, + "step": 61720 + }, + { + "epoch": 0.23026193087292884, + "grad_norm": 0.38156816363334656, + "learning_rate": 0.0006, 
+ "loss": 2.2129, + "step": 61730 + }, + { + "epoch": 0.23029923233589222, + "grad_norm": 0.31620094180107117, + "learning_rate": 0.0006, + "loss": 2.0902, + "step": 61740 + }, + { + "epoch": 0.2303365337988556, + "grad_norm": 0.3978903889656067, + "learning_rate": 0.0006, + "loss": 2.0707, + "step": 61750 + }, + { + "epoch": 0.2303365337988556, + "eval_valid_loss": 2.1788716316223145, + "eval_valid_loss/all": 2.0429956912994385, + "eval_valid_loss/end_span": 1.2094783782958984, + "eval_valid_perplexity/batch": 7.713682651519775, + "eval_valid_perplexity/end_span": 3.351735830307007, + "eval_valid_perplexity/fim": 2.4810376167297363, + "eval_valid_perplexity/first_seq": 14.676563262939453, + "eval_valid_perplexity/last_seq": 8.38742446899414, + "eval_valid_perplexity/second_seq": 13.588085174560547, + "eval_valid_perplexity/seq": 8.698834419250488, + "eval_valid_reconstruction/all": 0.2980829179286957, + "eval_valid_reconstruction/end_span": 0.71832674741745, + "eval_valid_reconstruction/fim": 0.18626077473163605, + "eval_valid_reconstruction/first_seq": 0.17163191735744476, + "eval_valid_reconstruction/last_seq": 0.34873509407043457, + "eval_valid_reconstruction/second_seq": 0.19530490040779114, + "eval_valid_runtime": 447.2607, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 61750 + }, + { + "epoch": 0.2303365337988556, + "eval_train_loss": 2.176621437072754, + "eval_train_loss/all": 2.014529228210449, + "eval_train_loss/end_span": 1.181901454925537, + "eval_train_perplexity/batch": 7.497197151184082, + "eval_train_perplexity/end_span": 3.260568141937256, + "eval_train_perplexity/fim": 2.010877847671509, + "eval_train_perplexity/first_seq": 15.461753845214844, + "eval_train_perplexity/last_seq": 8.332402229309082, + "eval_train_perplexity/second_seq": 14.130023002624512, + "eval_train_perplexity/seq": 8.632607460021973, + "eval_train_reconstruction/all": 0.28763994574546814, + "eval_train_reconstruction/end_span": 
0.7279807329177856, + "eval_train_reconstruction/fim": 0.14100632071495056, + "eval_train_reconstruction/first_seq": 0.15447048842906952, + "eval_train_reconstruction/last_seq": 0.34837958216667175, + "eval_train_reconstruction/second_seq": 0.18759985268115997, + "eval_train_runtime": 444.7235, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 61750 + }, + { + "epoch": 0.23037383526181898, + "grad_norm": 0.38653042912483215, + "learning_rate": 0.0006, + "loss": 2.1005, + "step": 61760 + }, + { + "epoch": 0.23041113672478233, + "grad_norm": 0.29660990834236145, + "learning_rate": 0.0006, + "loss": 2.2884, + "step": 61770 + }, + { + "epoch": 0.2304484381877457, + "grad_norm": 0.22736458480358124, + "learning_rate": 0.0006, + "loss": 2.2222, + "step": 61780 + }, + { + "epoch": 0.2304857396507091, + "grad_norm": 0.38529446721076965, + "learning_rate": 0.0006, + "loss": 2.1809, + "step": 61790 + }, + { + "epoch": 0.23052304111367247, + "grad_norm": 0.2677445411682129, + "learning_rate": 0.0006, + "loss": 1.9951, + "step": 61800 + }, + { + "epoch": 0.23056034257663585, + "grad_norm": 0.32164427638053894, + "learning_rate": 0.0006, + "loss": 2.1368, + "step": 61810 + }, + { + "epoch": 0.23059764403959923, + "grad_norm": 0.3433118164539337, + "learning_rate": 0.0006, + "loss": 2.3566, + "step": 61820 + }, + { + "epoch": 0.2306349455025626, + "grad_norm": 0.4449862837791443, + "learning_rate": 0.0006, + "loss": 2.0748, + "step": 61830 + }, + { + "epoch": 0.230672246965526, + "grad_norm": 0.6722298860549927, + "learning_rate": 0.0006, + "loss": 2.0925, + "step": 61840 + }, + { + "epoch": 0.23070954842848937, + "grad_norm": 0.2756972014904022, + "learning_rate": 0.0006, + "loss": 2.2344, + "step": 61850 + }, + { + "epoch": 0.23074684989145275, + "grad_norm": 0.5049120187759399, + "learning_rate": 0.0006, + "loss": 2.3061, + "step": 61860 + }, + { + "epoch": 0.23078415135441613, + "grad_norm": 0.30798542499542236, + "learning_rate": 
0.0006, + "loss": 2.2736, + "step": 61870 + }, + { + "epoch": 0.2308214528173795, + "grad_norm": 0.3838421404361725, + "learning_rate": 0.0006, + "loss": 1.9882, + "step": 61880 + }, + { + "epoch": 0.2308587542803429, + "grad_norm": 0.34427231550216675, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 61890 + }, + { + "epoch": 0.23089605574330624, + "grad_norm": 0.25222519040107727, + "learning_rate": 0.0006, + "loss": 2.2267, + "step": 61900 + }, + { + "epoch": 0.23093335720626962, + "grad_norm": 0.296596884727478, + "learning_rate": 0.0006, + "loss": 2.3056, + "step": 61910 + }, + { + "epoch": 0.230970658669233, + "grad_norm": 0.32276666164398193, + "learning_rate": 0.0006, + "loss": 2.1079, + "step": 61920 + }, + { + "epoch": 0.23100796013219638, + "grad_norm": 0.3387462794780731, + "learning_rate": 0.0006, + "loss": 2.3096, + "step": 61930 + }, + { + "epoch": 0.23104526159515976, + "grad_norm": 0.3770594596862793, + "learning_rate": 0.0006, + "loss": 2.1902, + "step": 61940 + }, + { + "epoch": 0.23108256305812314, + "grad_norm": 0.3160400986671448, + "learning_rate": 0.0006, + "loss": 2.3578, + "step": 61950 + }, + { + "epoch": 0.23111986452108652, + "grad_norm": 0.42879578471183777, + "learning_rate": 0.0006, + "loss": 2.0711, + "step": 61960 + }, + { + "epoch": 0.2311571659840499, + "grad_norm": 0.3064481317996979, + "learning_rate": 0.0006, + "loss": 2.3896, + "step": 61970 + }, + { + "epoch": 0.23119446744701327, + "grad_norm": 0.3220112919807434, + "learning_rate": 0.0006, + "loss": 2.2458, + "step": 61980 + }, + { + "epoch": 0.23123176890997665, + "grad_norm": 0.3826386332511902, + "learning_rate": 0.0006, + "loss": 2.1623, + "step": 61990 + }, + { + "epoch": 0.23126907037294003, + "grad_norm": 0.29417166113853455, + "learning_rate": 0.0006, + "loss": 2.2138, + "step": 62000 + }, + { + "epoch": 0.23126907037294003, + "eval_valid_loss": 2.1733558177948, + "eval_valid_loss/all": 2.038046360015869, + "eval_valid_loss/end_span": 1.1704028844833374, + 
"eval_valid_perplexity/batch": 7.675599098205566, + "eval_valid_perplexity/end_span": 3.2232909202575684, + "eval_valid_perplexity/fim": 2.2915046215057373, + "eval_valid_perplexity/first_seq": 14.485897064208984, + "eval_valid_perplexity/last_seq": 8.825440406799316, + "eval_valid_perplexity/second_seq": 13.933332443237305, + "eval_valid_perplexity/seq": 8.654980659484863, + "eval_valid_reconstruction/all": 0.2992962896823883, + "eval_valid_reconstruction/end_span": 0.7307802438735962, + "eval_valid_reconstruction/fim": 0.16901983320713043, + "eval_valid_reconstruction/first_seq": 0.17570042610168457, + "eval_valid_reconstruction/last_seq": 0.33245205879211426, + "eval_valid_reconstruction/second_seq": 0.1886032521724701, + "eval_valid_runtime": 442.6945, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 62000 + }, + { + "epoch": 0.23126907037294003, + "eval_train_loss": 2.1736526489257812, + "eval_train_loss/all": 2.0120842456817627, + "eval_train_loss/end_span": 1.1441240310668945, + "eval_train_perplexity/batch": 7.478888988494873, + "eval_train_perplexity/end_span": 3.1396899223327637, + "eval_train_perplexity/fim": 2.1738808155059814, + "eval_train_perplexity/first_seq": 15.224273681640625, + "eval_train_perplexity/last_seq": 8.862868309020996, + "eval_train_perplexity/second_seq": 14.196928024291992, + "eval_train_perplexity/seq": 8.610345840454102, + "eval_train_reconstruction/all": 0.2882079482078552, + "eval_train_reconstruction/end_span": 0.7398465275764465, + "eval_train_reconstruction/fim": 0.15820950269699097, + "eval_train_reconstruction/first_seq": 0.16132843494415283, + "eval_train_reconstruction/last_seq": 0.3286786675453186, + "eval_train_reconstruction/second_seq": 0.18247072398662567, + "eval_train_runtime": 444.9668, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 62000 + }, + { + "epoch": 0.2313063718359034, + "grad_norm": 0.454894095659256, + 
"learning_rate": 0.0006, + "loss": 2.1865, + "step": 62010 + }, + { + "epoch": 0.2313436732988668, + "grad_norm": 0.3762790858745575, + "learning_rate": 0.0006, + "loss": 2.2477, + "step": 62020 + }, + { + "epoch": 0.23138097476183017, + "grad_norm": 0.2550736963748932, + "learning_rate": 0.0006, + "loss": 2.0207, + "step": 62030 + }, + { + "epoch": 0.23141827622479352, + "grad_norm": 0.42376765608787537, + "learning_rate": 0.0006, + "loss": 2.3203, + "step": 62040 + }, + { + "epoch": 0.2314555776877569, + "grad_norm": 0.27173903584480286, + "learning_rate": 0.0006, + "loss": 2.1698, + "step": 62050 + }, + { + "epoch": 0.23149287915072028, + "grad_norm": 0.26074278354644775, + "learning_rate": 0.0006, + "loss": 2.3136, + "step": 62060 + }, + { + "epoch": 0.23153018061368366, + "grad_norm": 0.28008559346199036, + "learning_rate": 0.0006, + "loss": 2.1899, + "step": 62070 + }, + { + "epoch": 0.23156748207664704, + "grad_norm": 0.24944286048412323, + "learning_rate": 0.0006, + "loss": 2.1464, + "step": 62080 + }, + { + "epoch": 0.23160478353961042, + "grad_norm": 0.474660724401474, + "learning_rate": 0.0006, + "loss": 2.2055, + "step": 62090 + }, + { + "epoch": 0.2316420850025738, + "grad_norm": 0.3584505617618561, + "learning_rate": 0.0006, + "loss": 2.1696, + "step": 62100 + }, + { + "epoch": 0.23167938646553718, + "grad_norm": 0.2512160539627075, + "learning_rate": 0.0006, + "loss": 2.2006, + "step": 62110 + }, + { + "epoch": 0.23171668792850056, + "grad_norm": 0.3192983567714691, + "learning_rate": 0.0006, + "loss": 2.2234, + "step": 62120 + }, + { + "epoch": 0.23175398939146394, + "grad_norm": 1.4844794273376465, + "learning_rate": 0.0006, + "loss": 2.2335, + "step": 62130 + }, + { + "epoch": 0.23179129085442732, + "grad_norm": 0.445882111787796, + "learning_rate": 0.0006, + "loss": 2.2204, + "step": 62140 + }, + { + "epoch": 0.2318285923173907, + "grad_norm": 0.3870212137699127, + "learning_rate": 0.0006, + "loss": 2.1606, + "step": 62150 + }, + { + "epoch": 
0.23186589378035408, + "grad_norm": 0.31207528710365295, + "learning_rate": 0.0006, + "loss": 2.2952, + "step": 62160 + }, + { + "epoch": 0.23190319524331746, + "grad_norm": 0.5082203149795532, + "learning_rate": 0.0006, + "loss": 2.1709, + "step": 62170 + }, + { + "epoch": 0.2319404967062808, + "grad_norm": 0.30443286895751953, + "learning_rate": 0.0006, + "loss": 2.3385, + "step": 62180 + }, + { + "epoch": 0.2319777981692442, + "grad_norm": 0.2928764522075653, + "learning_rate": 0.0006, + "loss": 2.2919, + "step": 62190 + }, + { + "epoch": 0.23201509963220757, + "grad_norm": 0.4776228964328766, + "learning_rate": 0.0006, + "loss": 2.0712, + "step": 62200 + }, + { + "epoch": 0.23205240109517095, + "grad_norm": 0.4523400664329529, + "learning_rate": 0.0006, + "loss": 1.94, + "step": 62210 + }, + { + "epoch": 0.23208970255813433, + "grad_norm": 0.2998485565185547, + "learning_rate": 0.0006, + "loss": 2.1322, + "step": 62220 + }, + { + "epoch": 0.2321270040210977, + "grad_norm": 0.29169413447380066, + "learning_rate": 0.0006, + "loss": 2.0394, + "step": 62230 + }, + { + "epoch": 0.23216430548406108, + "grad_norm": 0.37128013372421265, + "learning_rate": 0.0006, + "loss": 2.1912, + "step": 62240 + }, + { + "epoch": 0.23220160694702446, + "grad_norm": 0.2150883823633194, + "learning_rate": 0.0006, + "loss": 2.3077, + "step": 62250 + }, + { + "epoch": 0.23220160694702446, + "eval_valid_loss": 2.1771175861358643, + "eval_valid_loss/all": 2.0410518646240234, + "eval_valid_loss/end_span": 1.2086117267608643, + "eval_valid_perplexity/batch": 7.698702812194824, + "eval_valid_perplexity/end_span": 3.348832368850708, + "eval_valid_perplexity/fim": 2.587045192718506, + "eval_valid_perplexity/first_seq": 14.885891914367676, + "eval_valid_perplexity/last_seq": 9.110384941101074, + "eval_valid_perplexity/second_seq": 13.482817649841309, + "eval_valid_perplexity/seq": 8.679030418395996, + "eval_valid_reconstruction/all": 0.29836592078208923, + "eval_valid_reconstruction/end_span": 
0.7217188477516174, + "eval_valid_reconstruction/fim": 0.19413205981254578, + "eval_valid_reconstruction/first_seq": 0.17024441063404083, + "eval_valid_reconstruction/last_seq": 0.3228195011615753, + "eval_valid_reconstruction/second_seq": 0.2031535655260086, + "eval_valid_runtime": 444.1427, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 62250 + }, + { + "epoch": 0.23220160694702446, + "eval_train_loss": 2.177037000656128, + "eval_train_loss/all": 2.014890193939209, + "eval_train_loss/end_span": 1.1710160970687866, + "eval_train_perplexity/batch": 7.499903678894043, + "eval_train_perplexity/end_span": 3.2252681255340576, + "eval_train_perplexity/fim": 2.104109525680542, + "eval_train_perplexity/first_seq": 15.373244285583496, + "eval_train_perplexity/last_seq": 8.908099174499512, + "eval_train_perplexity/second_seq": 14.08623218536377, + "eval_train_perplexity/seq": 8.630867958068848, + "eval_train_reconstruction/all": 0.28736039996147156, + "eval_train_reconstruction/end_span": 0.7304965853691101, + "eval_train_reconstruction/fim": 0.15208730101585388, + "eval_train_reconstruction/first_seq": 0.15321286022663116, + "eval_train_reconstruction/last_seq": 0.3313159644603729, + "eval_train_reconstruction/second_seq": 0.18558494746685028, + "eval_train_runtime": 440.1365, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 62250 + }, + { + "epoch": 0.23223890840998784, + "grad_norm": 0.42200589179992676, + "learning_rate": 0.0006, + "loss": 2.085, + "step": 62260 + }, + { + "epoch": 0.23227620987295122, + "grad_norm": 0.3682933449745178, + "learning_rate": 0.0006, + "loss": 2.2081, + "step": 62270 + }, + { + "epoch": 0.2323135113359146, + "grad_norm": 0.302198588848114, + "learning_rate": 0.0006, + "loss": 2.2666, + "step": 62280 + }, + { + "epoch": 0.23235081279887798, + "grad_norm": 0.31730741262435913, + "learning_rate": 0.0006, + "loss": 2.1125, + "step": 62290 + }, + { + "epoch": 
0.23238811426184136, + "grad_norm": 0.6200101375579834, + "learning_rate": 0.0006, + "loss": 2.1512, + "step": 62300 + }, + { + "epoch": 0.23242541572480474, + "grad_norm": 0.3704640865325928, + "learning_rate": 0.0006, + "loss": 2.2576, + "step": 62310 + }, + { + "epoch": 0.2324627171877681, + "grad_norm": 0.3480890393257141, + "learning_rate": 0.0006, + "loss": 2.1907, + "step": 62320 + }, + { + "epoch": 0.23250001865073147, + "grad_norm": 0.37764662504196167, + "learning_rate": 0.0006, + "loss": 2.1102, + "step": 62330 + }, + { + "epoch": 0.23253732011369485, + "grad_norm": 0.35246166586875916, + "learning_rate": 0.0006, + "loss": 2.317, + "step": 62340 + }, + { + "epoch": 0.23257462157665823, + "grad_norm": 0.4918510317802429, + "learning_rate": 0.0006, + "loss": 2.2487, + "step": 62350 + }, + { + "epoch": 0.2326119230396216, + "grad_norm": 0.34003588557243347, + "learning_rate": 0.0006, + "loss": 2.2349, + "step": 62360 + }, + { + "epoch": 0.232649224502585, + "grad_norm": 0.2161533087491989, + "learning_rate": 0.0006, + "loss": 2.3438, + "step": 62370 + }, + { + "epoch": 0.23268652596554837, + "grad_norm": 0.35330042243003845, + "learning_rate": 0.0006, + "loss": 2.3597, + "step": 62380 + }, + { + "epoch": 0.23272382742851175, + "grad_norm": 0.2987760007381439, + "learning_rate": 0.0006, + "loss": 2.3299, + "step": 62390 + }, + { + "epoch": 0.23276112889147513, + "grad_norm": 0.4335017502307892, + "learning_rate": 0.0006, + "loss": 2.1743, + "step": 62400 + }, + { + "epoch": 0.2327984303544385, + "grad_norm": 1.2350596189498901, + "learning_rate": 0.0006, + "loss": 2.1932, + "step": 62410 + }, + { + "epoch": 0.2328357318174019, + "grad_norm": 1.793879508972168, + "learning_rate": 0.0006, + "loss": 2.1577, + "step": 62420 + }, + { + "epoch": 0.23287303328036527, + "grad_norm": 0.29743143916130066, + "learning_rate": 0.0006, + "loss": 2.1427, + "step": 62430 + }, + { + "epoch": 0.23291033474332865, + "grad_norm": 0.36728212237358093, + "learning_rate": 0.0006, 
+ "loss": 2.3658, + "step": 62440 + }, + { + "epoch": 0.232947636206292, + "grad_norm": 0.20147620141506195, + "learning_rate": 0.0006, + "loss": 2.1985, + "step": 62450 + }, + { + "epoch": 0.23298493766925538, + "grad_norm": 0.6158781051635742, + "learning_rate": 0.0006, + "loss": 2.1209, + "step": 62460 + }, + { + "epoch": 0.23302223913221876, + "grad_norm": 0.236701101064682, + "learning_rate": 0.0006, + "loss": 2.289, + "step": 62470 + }, + { + "epoch": 0.23305954059518214, + "grad_norm": 0.30784112215042114, + "learning_rate": 0.0006, + "loss": 2.3549, + "step": 62480 + }, + { + "epoch": 0.23309684205814551, + "grad_norm": 0.26601094007492065, + "learning_rate": 0.0006, + "loss": 2.3556, + "step": 62490 + }, + { + "epoch": 0.2331341435211089, + "grad_norm": 0.3364754617214203, + "learning_rate": 0.0006, + "loss": 1.9451, + "step": 62500 + }, + { + "epoch": 0.2331341435211089, + "eval_valid_loss": 2.1791834831237793, + "eval_valid_loss/all": 2.043707847595215, + "eval_valid_loss/end_span": 1.2177196741104126, + "eval_valid_perplexity/batch": 7.719177722930908, + "eval_valid_perplexity/end_span": 3.3794727325439453, + "eval_valid_perplexity/fim": 2.1552577018737793, + "eval_valid_perplexity/first_seq": 15.037885665893555, + "eval_valid_perplexity/last_seq": 8.571277618408203, + "eval_valid_perplexity/second_seq": 13.649110794067383, + "eval_valid_perplexity/seq": 8.704972267150879, + "eval_valid_reconstruction/all": 0.29770541191101074, + "eval_valid_reconstruction/end_span": 0.715954601764679, + "eval_valid_reconstruction/fim": 0.15571050345897675, + "eval_valid_reconstruction/first_seq": 0.16585569083690643, + "eval_valid_reconstruction/last_seq": 0.3410293161869049, + "eval_valid_reconstruction/second_seq": 0.19427549839019775, + "eval_valid_runtime": 443.6696, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 62500 + }, + { + "epoch": 0.2331341435211089, + "eval_train_loss": 2.1768887042999268, + 
"eval_train_loss/all": 2.0151422023773193, + "eval_train_loss/end_span": 1.1777015924453735, + "eval_train_perplexity/batch": 7.50179386138916, + "eval_train_perplexity/end_span": 3.2469029426574707, + "eval_train_perplexity/fim": 1.9407497644424438, + "eval_train_perplexity/first_seq": 15.531320571899414, + "eval_train_perplexity/last_seq": 8.842683792114258, + "eval_train_perplexity/second_seq": 14.120845794677734, + "eval_train_perplexity/seq": 8.64140510559082, + "eval_train_reconstruction/all": 0.2872632145881653, + "eval_train_reconstruction/end_span": 0.7277683019638062, + "eval_train_reconstruction/fim": 0.1338861584663391, + "eval_train_reconstruction/first_seq": 0.14956146478652954, + "eval_train_reconstruction/last_seq": 0.32981300354003906, + "eval_train_reconstruction/second_seq": 0.18498188257217407, + "eval_train_runtime": 436.7502, + "eval_train_samples_per_second": 0.44, + "eval_train_steps_per_second": 0.44, + "step": 62500 + }, + { + "epoch": 0.23317144498407227, + "grad_norm": 0.4315929710865021, + "learning_rate": 0.0006, + "loss": 2.2143, + "step": 62510 + }, + { + "epoch": 0.23320874644703565, + "grad_norm": 0.40601304173469543, + "learning_rate": 0.0006, + "loss": 2.1046, + "step": 62520 + }, + { + "epoch": 0.23324604790999903, + "grad_norm": 0.2743324935436249, + "learning_rate": 0.0006, + "loss": 2.1282, + "step": 62530 + }, + { + "epoch": 0.2332833493729624, + "grad_norm": 0.4536973834037781, + "learning_rate": 0.0006, + "loss": 2.0351, + "step": 62540 + }, + { + "epoch": 0.2333206508359258, + "grad_norm": 0.24683745205402374, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 62550 + }, + { + "epoch": 0.23335795229888917, + "grad_norm": 0.21147285401821136, + "learning_rate": 0.0006, + "loss": 1.9543, + "step": 62560 + }, + { + "epoch": 0.23339525376185255, + "grad_norm": 0.2599770128726959, + "learning_rate": 0.0006, + "loss": 2.3815, + "step": 62570 + }, + { + "epoch": 0.23343255522481593, + "grad_norm": 0.34491994976997375, + 
"learning_rate": 0.0006, + "loss": 2.1006, + "step": 62580 + }, + { + "epoch": 0.23346985668777928, + "grad_norm": 0.3171428442001343, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 62590 + }, + { + "epoch": 0.23350715815074266, + "grad_norm": 0.22311170399188995, + "learning_rate": 0.0006, + "loss": 2.192, + "step": 62600 + }, + { + "epoch": 0.23354445961370604, + "grad_norm": 0.3838101327419281, + "learning_rate": 0.0006, + "loss": 2.2615, + "step": 62610 + }, + { + "epoch": 0.23358176107666942, + "grad_norm": 0.42850908637046814, + "learning_rate": 0.0006, + "loss": 2.2541, + "step": 62620 + }, + { + "epoch": 0.2336190625396328, + "grad_norm": 0.4276118278503418, + "learning_rate": 0.0006, + "loss": 2.2434, + "step": 62630 + }, + { + "epoch": 0.23365636400259618, + "grad_norm": 0.40120741724967957, + "learning_rate": 0.0006, + "loss": 1.9858, + "step": 62640 + }, + { + "epoch": 0.23369366546555956, + "grad_norm": 0.37117674946784973, + "learning_rate": 0.0006, + "loss": 1.9241, + "step": 62650 + }, + { + "epoch": 0.23373096692852294, + "grad_norm": 0.38669702410697937, + "learning_rate": 0.0006, + "loss": 2.3292, + "step": 62660 + }, + { + "epoch": 0.23376826839148632, + "grad_norm": 0.3773235082626343, + "learning_rate": 0.0006, + "loss": 2.1622, + "step": 62670 + }, + { + "epoch": 0.2338055698544497, + "grad_norm": 0.28948912024497986, + "learning_rate": 0.0006, + "loss": 2.0166, + "step": 62680 + }, + { + "epoch": 0.23384287131741308, + "grad_norm": 0.4517103135585785, + "learning_rate": 0.0006, + "loss": 2.2328, + "step": 62690 + }, + { + "epoch": 0.23388017278037646, + "grad_norm": 0.29888519644737244, + "learning_rate": 0.0006, + "loss": 2.2098, + "step": 62700 + }, + { + "epoch": 0.23391747424333983, + "grad_norm": 0.2591049373149872, + "learning_rate": 0.0006, + "loss": 2.2509, + "step": 62710 + }, + { + "epoch": 0.23395477570630321, + "grad_norm": 0.38815563917160034, + "learning_rate": 0.0006, + "loss": 2.2263, + "step": 62720 + }, + { + 
"epoch": 0.23399207716926657, + "grad_norm": 0.31519508361816406, + "learning_rate": 0.0006, + "loss": 2.1894, + "step": 62730 + }, + { + "epoch": 0.23402937863222995, + "grad_norm": 0.3278812766075134, + "learning_rate": 0.0006, + "loss": 2.2016, + "step": 62740 + }, + { + "epoch": 0.23406668009519332, + "grad_norm": 0.3333055377006531, + "learning_rate": 0.0006, + "loss": 2.3316, + "step": 62750 + }, + { + "epoch": 0.23406668009519332, + "eval_valid_loss": 2.177018404006958, + "eval_valid_loss/all": 2.0416958332061768, + "eval_valid_loss/end_span": 1.1878865957260132, + "eval_valid_perplexity/batch": 7.703662395477295, + "eval_valid_perplexity/end_span": 3.280141592025757, + "eval_valid_perplexity/fim": 2.2465410232543945, + "eval_valid_perplexity/first_seq": 14.933704376220703, + "eval_valid_perplexity/last_seq": 8.33486557006836, + "eval_valid_perplexity/second_seq": 13.400491714477539, + "eval_valid_perplexity/seq": 8.690938949584961, + "eval_valid_reconstruction/all": 0.2981789708137512, + "eval_valid_reconstruction/end_span": 0.7239491939544678, + "eval_valid_reconstruction/fim": 0.16461683809757233, + "eval_valid_reconstruction/first_seq": 0.16604232788085938, + "eval_valid_reconstruction/last_seq": 0.35212278366088867, + "eval_valid_reconstruction/second_seq": 0.20541325211524963, + "eval_valid_runtime": 441.8428, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 62750 + }, + { + "epoch": 0.23406668009519332, + "eval_train_loss": 2.1752331256866455, + "eval_train_loss/all": 2.0136799812316895, + "eval_train_loss/end_span": 1.1534476280212402, + "eval_train_perplexity/batch": 7.490832805633545, + "eval_train_perplexity/end_span": 3.169100046157837, + "eval_train_perplexity/fim": 2.1376938819885254, + "eval_train_perplexity/first_seq": 15.489584922790527, + "eval_train_perplexity/last_seq": 8.627116203308105, + "eval_train_perplexity/second_seq": 14.29904842376709, + "eval_train_perplexity/seq": 8.62536907196045, + 
"eval_train_reconstruction/all": 0.2878035008907318, + "eval_train_reconstruction/end_span": 0.7358083724975586, + "eval_train_reconstruction/fim": 0.1548829823732376, + "eval_train_reconstruction/first_seq": 0.15148580074310303, + "eval_train_reconstruction/last_seq": 0.3360171318054199, + "eval_train_reconstruction/second_seq": 0.1814989149570465, + "eval_train_runtime": 441.9757, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 62750 + }, + { + "epoch": 0.2341039815581567, + "grad_norm": 0.31260013580322266, + "learning_rate": 0.0006, + "loss": 1.9967, + "step": 62760 + }, + { + "epoch": 0.23414128302112008, + "grad_norm": 0.2665095925331116, + "learning_rate": 0.0006, + "loss": 2.293, + "step": 62770 + }, + { + "epoch": 0.23417858448408346, + "grad_norm": 0.29392316937446594, + "learning_rate": 0.0006, + "loss": 2.1038, + "step": 62780 + }, + { + "epoch": 0.23421588594704684, + "grad_norm": 0.43916836380958557, + "learning_rate": 0.0006, + "loss": 2.219, + "step": 62790 + }, + { + "epoch": 0.23425318741001022, + "grad_norm": 0.21454332768917084, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 62800 + }, + { + "epoch": 0.2342904888729736, + "grad_norm": 0.2428930401802063, + "learning_rate": 0.0006, + "loss": 2.2677, + "step": 62810 + }, + { + "epoch": 0.23432779033593698, + "grad_norm": 0.40764763951301575, + "learning_rate": 0.0006, + "loss": 2.3187, + "step": 62820 + }, + { + "epoch": 0.23436509179890036, + "grad_norm": 0.22838354110717773, + "learning_rate": 0.0006, + "loss": 2.2802, + "step": 62830 + }, + { + "epoch": 0.23440239326186374, + "grad_norm": 0.46321144700050354, + "learning_rate": 0.0006, + "loss": 2.1563, + "step": 62840 + }, + { + "epoch": 0.23443969472482712, + "grad_norm": 0.37949085235595703, + "learning_rate": 0.0006, + "loss": 2.2853, + "step": 62850 + }, + { + "epoch": 0.2344769961877905, + "grad_norm": 0.4100196957588196, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 62860 + }, 
+ { + "epoch": 0.23451429765075385, + "grad_norm": 0.29399871826171875, + "learning_rate": 0.0006, + "loss": 2.0604, + "step": 62870 + }, + { + "epoch": 0.23455159911371723, + "grad_norm": 0.24498102068901062, + "learning_rate": 0.0006, + "loss": 2.2015, + "step": 62880 + }, + { + "epoch": 0.2345889005766806, + "grad_norm": 0.3481140434741974, + "learning_rate": 0.0006, + "loss": 2.255, + "step": 62890 + }, + { + "epoch": 0.234626202039644, + "grad_norm": 0.3278155028820038, + "learning_rate": 0.0006, + "loss": 2.4356, + "step": 62900 + }, + { + "epoch": 0.23466350350260737, + "grad_norm": 0.40350979566574097, + "learning_rate": 0.0006, + "loss": 2.2023, + "step": 62910 + }, + { + "epoch": 0.23470080496557075, + "grad_norm": 0.5426140427589417, + "learning_rate": 0.0006, + "loss": 2.2709, + "step": 62920 + }, + { + "epoch": 0.23473810642853413, + "grad_norm": 0.48930060863494873, + "learning_rate": 0.0006, + "loss": 2.0935, + "step": 62930 + }, + { + "epoch": 0.2347754078914975, + "grad_norm": 0.4373598098754883, + "learning_rate": 0.0006, + "loss": 2.2682, + "step": 62940 + }, + { + "epoch": 0.23481270935446089, + "grad_norm": 0.4946548044681549, + "learning_rate": 0.0006, + "loss": 2.3136, + "step": 62950 + }, + { + "epoch": 0.23485001081742427, + "grad_norm": 0.2681277394294739, + "learning_rate": 0.0006, + "loss": 2.0779, + "step": 62960 + }, + { + "epoch": 0.23488731228038764, + "grad_norm": 0.5310236215591431, + "learning_rate": 0.0006, + "loss": 2.01, + "step": 62970 + }, + { + "epoch": 0.23492461374335102, + "grad_norm": 0.46325209736824036, + "learning_rate": 0.0006, + "loss": 2.1872, + "step": 62980 + }, + { + "epoch": 0.2349619152063144, + "grad_norm": 0.32293274998664856, + "learning_rate": 0.0006, + "loss": 1.9817, + "step": 62990 + }, + { + "epoch": 0.23499921666927778, + "grad_norm": 0.3126661479473114, + "learning_rate": 0.0006, + "loss": 2.2708, + "step": 63000 + }, + { + "epoch": 0.23499921666927778, + "eval_valid_loss": 2.1806557178497314, + 
"eval_valid_loss/all": 2.045177698135376, + "eval_valid_loss/end_span": 1.3919843435287476, + "eval_valid_perplexity/batch": 7.730532169342041, + "eval_valid_perplexity/end_span": 4.022824764251709, + "eval_valid_perplexity/fim": 2.737995147705078, + "eval_valid_perplexity/first_seq": 15.063796997070312, + "eval_valid_perplexity/last_seq": 8.760415077209473, + "eval_valid_perplexity/second_seq": 13.83757209777832, + "eval_valid_perplexity/seq": 8.723557472229004, + "eval_valid_reconstruction/all": 0.2973720133304596, + "eval_valid_reconstruction/end_span": 0.6778067946434021, + "eval_valid_reconstruction/fim": 0.20438869297504425, + "eval_valid_reconstruction/first_seq": 0.16544197499752045, + "eval_valid_reconstruction/last_seq": 0.33452674746513367, + "eval_valid_reconstruction/second_seq": 0.1921735256910324, + "eval_valid_runtime": 441.3465, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 63000 + }, + { + "epoch": 0.23499921666927778, + "eval_train_loss": 2.1792242527008057, + "eval_train_loss/all": 2.0174834728240967, + "eval_train_loss/end_span": 1.3546618223190308, + "eval_train_perplexity/batch": 7.519378185272217, + "eval_train_perplexity/end_span": 3.8754501342773438, + "eval_train_perplexity/fim": 2.103231191635132, + "eval_train_perplexity/first_seq": 15.609541893005371, + "eval_train_perplexity/last_seq": 8.883325576782227, + "eval_train_perplexity/second_seq": 14.23742389678955, + "eval_train_perplexity/seq": 8.66295051574707, + "eval_train_reconstruction/all": 0.28654906153678894, + "eval_train_reconstruction/end_span": 0.6862994432449341, + "eval_train_reconstruction/fim": 0.14975188672542572, + "eval_train_reconstruction/first_seq": 0.14975975453853607, + "eval_train_reconstruction/last_seq": 0.3275999426841736, + "eval_train_reconstruction/second_seq": 0.18019019067287445, + "eval_train_runtime": 441.5386, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 63000 + 
}, + { + "epoch": 0.23503651813224113, + "grad_norm": 0.2983573079109192, + "learning_rate": 0.0006, + "loss": 2.3307, + "step": 63010 + }, + { + "epoch": 0.23507381959520451, + "grad_norm": 0.26606321334838867, + "learning_rate": 0.0006, + "loss": 2.2457, + "step": 63020 + }, + { + "epoch": 0.2351111210581679, + "grad_norm": 0.37503886222839355, + "learning_rate": 0.0006, + "loss": 2.1489, + "step": 63030 + }, + { + "epoch": 0.23514842252113127, + "grad_norm": 0.27783453464508057, + "learning_rate": 0.0006, + "loss": 2.0806, + "step": 63040 + }, + { + "epoch": 0.23518572398409465, + "grad_norm": 0.2853241264820099, + "learning_rate": 0.0006, + "loss": 2.0783, + "step": 63050 + }, + { + "epoch": 0.23522302544705803, + "grad_norm": 0.3773101270198822, + "learning_rate": 0.0006, + "loss": 2.2086, + "step": 63060 + }, + { + "epoch": 0.2352603269100214, + "grad_norm": 0.5586997270584106, + "learning_rate": 0.0006, + "loss": 2.2546, + "step": 63070 + }, + { + "epoch": 0.2352976283729848, + "grad_norm": 0.8193877935409546, + "learning_rate": 0.0006, + "loss": 2.2932, + "step": 63080 + }, + { + "epoch": 0.23533492983594817, + "grad_norm": 0.27559563517570496, + "learning_rate": 0.0006, + "loss": 2.0803, + "step": 63090 + }, + { + "epoch": 0.23537223129891155, + "grad_norm": 0.3172043263912201, + "learning_rate": 0.0006, + "loss": 2.1144, + "step": 63100 + }, + { + "epoch": 0.23540953276187493, + "grad_norm": 0.49613603949546814, + "learning_rate": 0.0006, + "loss": 2.2094, + "step": 63110 + }, + { + "epoch": 0.2354468342248383, + "grad_norm": 0.3760833442211151, + "learning_rate": 0.0006, + "loss": 2.1849, + "step": 63120 + }, + { + "epoch": 0.2354841356878017, + "grad_norm": 0.27665597200393677, + "learning_rate": 0.0006, + "loss": 2.3141, + "step": 63130 + }, + { + "epoch": 0.23552143715076504, + "grad_norm": 0.6092215180397034, + "learning_rate": 0.0006, + "loss": 2.1905, + "step": 63140 + }, + { + "epoch": 0.23555873861372842, + "grad_norm": 0.30365344882011414, + 
"learning_rate": 0.0006, + "loss": 2.2342, + "step": 63150 + }, + { + "epoch": 0.2355960400766918, + "grad_norm": 0.3782223165035248, + "learning_rate": 0.0006, + "loss": 2.0483, + "step": 63160 + }, + { + "epoch": 0.23563334153965518, + "grad_norm": 0.329730749130249, + "learning_rate": 0.0006, + "loss": 2.201, + "step": 63170 + }, + { + "epoch": 0.23567064300261856, + "grad_norm": 0.41510385274887085, + "learning_rate": 0.0006, + "loss": 2.2334, + "step": 63180 + }, + { + "epoch": 0.23570794446558194, + "grad_norm": 0.4033784866333008, + "learning_rate": 0.0006, + "loss": 2.2407, + "step": 63190 + }, + { + "epoch": 0.23574524592854532, + "grad_norm": 0.4344046413898468, + "learning_rate": 0.0006, + "loss": 2.1631, + "step": 63200 + }, + { + "epoch": 0.2357825473915087, + "grad_norm": 0.46203187108039856, + "learning_rate": 0.0006, + "loss": 2.1274, + "step": 63210 + }, + { + "epoch": 0.23581984885447208, + "grad_norm": 0.32508882880210876, + "learning_rate": 0.0006, + "loss": 2.1798, + "step": 63220 + }, + { + "epoch": 0.23585715031743545, + "grad_norm": 0.28116607666015625, + "learning_rate": 0.0006, + "loss": 2.137, + "step": 63230 + }, + { + "epoch": 0.23589445178039883, + "grad_norm": 0.6382726430892944, + "learning_rate": 0.0006, + "loss": 2.2111, + "step": 63240 + }, + { + "epoch": 0.2359317532433622, + "grad_norm": 0.23830489814281464, + "learning_rate": 0.0006, + "loss": 2.2723, + "step": 63250 + }, + { + "epoch": 0.2359317532433622, + "eval_valid_loss": 2.178274631500244, + "eval_valid_loss/all": 2.0425593852996826, + "eval_valid_loss/end_span": 1.1543234586715698, + "eval_valid_perplexity/batch": 7.710317611694336, + "eval_valid_perplexity/end_span": 3.1718766689300537, + "eval_valid_perplexity/fim": 2.4171059131622314, + "eval_valid_perplexity/first_seq": 14.800241470336914, + "eval_valid_perplexity/last_seq": 8.7246675491333, + "eval_valid_perplexity/second_seq": 13.62243938446045, + "eval_valid_perplexity/seq": 8.688836097717285, + 
"eval_valid_reconstruction/all": 0.2977592349052429, + "eval_valid_reconstruction/end_span": 0.7223653197288513, + "eval_valid_reconstruction/fim": 0.17922252416610718, + "eval_valid_reconstruction/first_seq": 0.1692110300064087, + "eval_valid_reconstruction/last_seq": 0.33572062849998474, + "eval_valid_reconstruction/second_seq": 0.20023846626281738, + "eval_valid_runtime": 446.7118, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 63250 + }, + { + "epoch": 0.2359317532433622, + "eval_train_loss": 2.177766799926758, + "eval_train_loss/all": 2.015821933746338, + "eval_train_loss/end_span": 1.1174715757369995, + "eval_train_perplexity/batch": 7.506895065307617, + "eval_train_perplexity/end_span": 3.057114839553833, + "eval_train_perplexity/fim": 2.0726773738861084, + "eval_train_perplexity/first_seq": 15.269243240356445, + "eval_train_perplexity/last_seq": 8.801253318786621, + "eval_train_perplexity/second_seq": 14.223343849182129, + "eval_train_perplexity/seq": 8.640958786010742, + "eval_train_reconstruction/all": 0.28683042526245117, + "eval_train_reconstruction/end_span": 0.7328974008560181, + "eval_train_reconstruction/fim": 0.14757288992404938, + "eval_train_reconstruction/first_seq": 0.15523578226566315, + "eval_train_reconstruction/last_seq": 0.32983893156051636, + "eval_train_reconstruction/second_seq": 0.182723730802536, + "eval_train_runtime": 440.0237, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 63250 + }, + { + "epoch": 0.2359690547063256, + "grad_norm": 0.28444913029670715, + "learning_rate": 0.0006, + "loss": 2.3294, + "step": 63260 + }, + { + "epoch": 0.23600635616928897, + "grad_norm": 0.33230409026145935, + "learning_rate": 0.0006, + "loss": 2.3169, + "step": 63270 + }, + { + "epoch": 0.23604365763225232, + "grad_norm": 0.33272048830986023, + "learning_rate": 0.0006, + "loss": 2.2945, + "step": 63280 + }, + { + "epoch": 0.2360809590952157, + "grad_norm": 
0.3616626560688019, + "learning_rate": 0.0006, + "loss": 2.0869, + "step": 63290 + }, + { + "epoch": 0.23611826055817908, + "grad_norm": 0.32743462920188904, + "learning_rate": 0.0006, + "loss": 2.3649, + "step": 63300 + }, + { + "epoch": 0.23615556202114246, + "grad_norm": 0.29652464389801025, + "learning_rate": 0.0006, + "loss": 2.3615, + "step": 63310 + }, + { + "epoch": 0.23619286348410584, + "grad_norm": 0.39683112502098083, + "learning_rate": 0.0006, + "loss": 1.9969, + "step": 63320 + }, + { + "epoch": 0.23623016494706922, + "grad_norm": 0.2717517018318176, + "learning_rate": 0.0006, + "loss": 2.1888, + "step": 63330 + }, + { + "epoch": 0.2362674664100326, + "grad_norm": 0.37937071919441223, + "learning_rate": 0.0006, + "loss": 2.2557, + "step": 63340 + }, + { + "epoch": 0.23630476787299598, + "grad_norm": 0.2406400591135025, + "learning_rate": 0.0006, + "loss": 2.3504, + "step": 63350 + }, + { + "epoch": 0.23634206933595936, + "grad_norm": 0.4379315972328186, + "learning_rate": 0.0006, + "loss": 2.1703, + "step": 63360 + }, + { + "epoch": 0.23637937079892274, + "grad_norm": 0.37941598892211914, + "learning_rate": 0.0006, + "loss": 2.139, + "step": 63370 + }, + { + "epoch": 0.23641667226188612, + "grad_norm": 0.26176583766937256, + "learning_rate": 0.0006, + "loss": 2.145, + "step": 63380 + }, + { + "epoch": 0.2364539737248495, + "grad_norm": 0.9387753009796143, + "learning_rate": 0.0006, + "loss": 2.2414, + "step": 63390 + }, + { + "epoch": 0.23649127518781288, + "grad_norm": 0.31812381744384766, + "learning_rate": 0.0006, + "loss": 2.3082, + "step": 63400 + }, + { + "epoch": 0.23652857665077626, + "grad_norm": 0.277596652507782, + "learning_rate": 0.0006, + "loss": 2.0629, + "step": 63410 + }, + { + "epoch": 0.2365658781137396, + "grad_norm": 0.26693350076675415, + "learning_rate": 0.0006, + "loss": 2.2486, + "step": 63420 + }, + { + "epoch": 0.236603179576703, + "grad_norm": 0.25049352645874023, + "learning_rate": 0.0006, + "loss": 2.2669, + "step": 63430 
+ }, + { + "epoch": 0.23664048103966637, + "grad_norm": 0.3472107946872711, + "learning_rate": 0.0006, + "loss": 2.2002, + "step": 63440 + }, + { + "epoch": 0.23667778250262975, + "grad_norm": 0.505631148815155, + "learning_rate": 0.0006, + "loss": 2.0881, + "step": 63450 + }, + { + "epoch": 0.23671508396559313, + "grad_norm": 0.3943856358528137, + "learning_rate": 0.0006, + "loss": 2.0824, + "step": 63460 + }, + { + "epoch": 0.2367523854285565, + "grad_norm": 0.4714726209640503, + "learning_rate": 0.0006, + "loss": 2.0701, + "step": 63470 + }, + { + "epoch": 0.23678968689151988, + "grad_norm": 0.2649427056312561, + "learning_rate": 0.0006, + "loss": 2.3142, + "step": 63480 + }, + { + "epoch": 0.23682698835448326, + "grad_norm": 0.2739051580429077, + "learning_rate": 0.0006, + "loss": 2.2027, + "step": 63490 + }, + { + "epoch": 0.23686428981744664, + "grad_norm": 0.3284844160079956, + "learning_rate": 0.0006, + "loss": 2.2911, + "step": 63500 + }, + { + "epoch": 0.23686428981744664, + "eval_valid_loss": 2.176116704940796, + "eval_valid_loss/all": 2.0406172275543213, + "eval_valid_loss/end_span": 1.187175989151001, + "eval_valid_perplexity/batch": 7.695357322692871, + "eval_valid_perplexity/end_span": 3.2778115272521973, + "eval_valid_perplexity/fim": 2.094224214553833, + "eval_valid_perplexity/first_seq": 14.41403865814209, + "eval_valid_perplexity/last_seq": 9.00261402130127, + "eval_valid_perplexity/second_seq": 13.224832534790039, + "eval_valid_perplexity/seq": 8.677008628845215, + "eval_valid_reconstruction/all": 0.2984836995601654, + "eval_valid_reconstruction/end_span": 0.7196982502937317, + "eval_valid_reconstruction/fim": 0.14952120184898376, + "eval_valid_reconstruction/first_seq": 0.17782938480377197, + "eval_valid_reconstruction/last_seq": 0.3278115689754486, + "eval_valid_reconstruction/second_seq": 0.21070803701877594, + "eval_valid_runtime": 441.9005, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 63500 + 
}, + { + "epoch": 0.23686428981744664, + "eval_train_loss": 2.174351453781128, + "eval_train_loss/all": 2.0127100944519043, + "eval_train_loss/end_span": 1.1565231084823608, + "eval_train_perplexity/batch": 7.4835710525512695, + "eval_train_perplexity/end_span": 3.178861379623413, + "eval_train_perplexity/fim": 2.2106120586395264, + "eval_train_perplexity/first_seq": 15.612269401550293, + "eval_train_perplexity/last_seq": 8.425389289855957, + "eval_train_perplexity/second_seq": 14.171906471252441, + "eval_train_perplexity/seq": 8.615294456481934, + "eval_train_reconstruction/all": 0.28779709339141846, + "eval_train_reconstruction/end_span": 0.729590117931366, + "eval_train_reconstruction/fim": 0.1613757610321045, + "eval_train_reconstruction/first_seq": 0.15260615944862366, + "eval_train_reconstruction/last_seq": 0.34457701444625854, + "eval_train_reconstruction/second_seq": 0.18404938280582428, + "eval_train_runtime": 442.4249, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 63500 + }, + { + "epoch": 0.23690159128041002, + "grad_norm": 0.461891770362854, + "learning_rate": 0.0006, + "loss": 2.1304, + "step": 63510 + }, + { + "epoch": 0.2369388927433734, + "grad_norm": 0.41245830059051514, + "learning_rate": 0.0006, + "loss": 2.2239, + "step": 63520 + }, + { + "epoch": 0.23697619420633678, + "grad_norm": 0.3525557518005371, + "learning_rate": 0.0006, + "loss": 2.1267, + "step": 63530 + }, + { + "epoch": 0.23701349566930016, + "grad_norm": 0.866901159286499, + "learning_rate": 0.0006, + "loss": 2.3455, + "step": 63540 + }, + { + "epoch": 0.23705079713226354, + "grad_norm": 0.28988245129585266, + "learning_rate": 0.0006, + "loss": 2.2439, + "step": 63550 + }, + { + "epoch": 0.2370880985952269, + "grad_norm": 0.3602098524570465, + "learning_rate": 0.0006, + "loss": 2.1478, + "step": 63560 + }, + { + "epoch": 0.23712540005819027, + "grad_norm": 0.2409995198249817, + "learning_rate": 0.0006, + "loss": 2.1209, + "step": 63570 + 
}, + { + "epoch": 0.23716270152115365, + "grad_norm": 0.25908905267715454, + "learning_rate": 0.0006, + "loss": 2.0752, + "step": 63580 + }, + { + "epoch": 0.23720000298411703, + "grad_norm": 0.3248814046382904, + "learning_rate": 0.0006, + "loss": 2.1406, + "step": 63590 + }, + { + "epoch": 0.2372373044470804, + "grad_norm": 0.25628894567489624, + "learning_rate": 0.0006, + "loss": 2.3576, + "step": 63600 + }, + { + "epoch": 0.2372746059100438, + "grad_norm": 0.3677913546562195, + "learning_rate": 0.0006, + "loss": 2.2066, + "step": 63610 + }, + { + "epoch": 0.23731190737300717, + "grad_norm": 0.28748178482055664, + "learning_rate": 0.0006, + "loss": 2.3477, + "step": 63620 + }, + { + "epoch": 0.23734920883597055, + "grad_norm": 0.357333779335022, + "learning_rate": 0.0006, + "loss": 2.1727, + "step": 63630 + }, + { + "epoch": 0.23738651029893393, + "grad_norm": 0.2602652907371521, + "learning_rate": 0.0006, + "loss": 2.2143, + "step": 63640 + }, + { + "epoch": 0.2374238117618973, + "grad_norm": 0.2524639070034027, + "learning_rate": 0.0006, + "loss": 2.0473, + "step": 63650 + }, + { + "epoch": 0.2374611132248607, + "grad_norm": 0.2502305209636688, + "learning_rate": 0.0006, + "loss": 2.42, + "step": 63660 + }, + { + "epoch": 0.23749841468782407, + "grad_norm": 0.29808351397514343, + "learning_rate": 0.0006, + "loss": 2.1309, + "step": 63670 + }, + { + "epoch": 0.23753571615078745, + "grad_norm": 0.2519678771495819, + "learning_rate": 0.0006, + "loss": 2.2186, + "step": 63680 + }, + { + "epoch": 0.2375730176137508, + "grad_norm": 0.29877376556396484, + "learning_rate": 0.0006, + "loss": 2.2058, + "step": 63690 + }, + { + "epoch": 0.23761031907671418, + "grad_norm": 0.36054331064224243, + "learning_rate": 0.0006, + "loss": 2.2913, + "step": 63700 + }, + { + "epoch": 0.23764762053967756, + "grad_norm": 0.39026200771331787, + "learning_rate": 0.0006, + "loss": 2.243, + "step": 63710 + }, + { + "epoch": 0.23768492200264094, + "grad_norm": 0.23260918259620667, + 
"learning_rate": 0.0006, + "loss": 2.3969, + "step": 63720 + }, + { + "epoch": 0.23772222346560432, + "grad_norm": 0.28039807081222534, + "learning_rate": 0.0006, + "loss": 2.2395, + "step": 63730 + }, + { + "epoch": 0.2377595249285677, + "grad_norm": 0.27420535683631897, + "learning_rate": 0.0006, + "loss": 2.2585, + "step": 63740 + }, + { + "epoch": 0.23779682639153107, + "grad_norm": 0.30979612469673157, + "learning_rate": 0.0006, + "loss": 2.2676, + "step": 63750 + }, + { + "epoch": 0.23779682639153107, + "eval_valid_loss": 2.1745831966400146, + "eval_valid_loss/all": 2.0394246578216553, + "eval_valid_loss/end_span": 1.208949089050293, + "eval_valid_perplexity/batch": 7.686185836791992, + "eval_valid_perplexity/end_span": 3.3499622344970703, + "eval_valid_perplexity/fim": 2.3548240661621094, + "eval_valid_perplexity/first_seq": 14.944147109985352, + "eval_valid_perplexity/last_seq": 8.583792686462402, + "eval_valid_perplexity/second_seq": 14.05499267578125, + "eval_valid_perplexity/seq": 8.667851448059082, + "eval_valid_reconstruction/all": 0.29857736825942993, + "eval_valid_reconstruction/end_span": 0.7225587368011475, + "eval_valid_reconstruction/fim": 0.17526580393314362, + "eval_valid_reconstruction/first_seq": 0.16392812132835388, + "eval_valid_reconstruction/last_seq": 0.3399912714958191, + "eval_valid_reconstruction/second_seq": 0.1894766092300415, + "eval_valid_runtime": 450.7848, + "eval_valid_samples_per_second": 0.426, + "eval_valid_steps_per_second": 0.426, + "step": 63750 + }, + { + "epoch": 0.23779682639153107, + "eval_train_loss": 2.17269229888916, + "eval_train_loss/all": 2.0112359523773193, + "eval_train_loss/end_span": 1.171015977859497, + "eval_train_perplexity/batch": 7.47254753112793, + "eval_train_perplexity/end_span": 3.2252678871154785, + "eval_train_perplexity/fim": 2.0821030139923096, + "eval_train_perplexity/first_seq": 15.320898056030273, + "eval_train_perplexity/last_seq": 8.82546615600586, + "eval_train_perplexity/second_seq": 
13.894463539123535, + "eval_train_perplexity/seq": 8.603160858154297, + "eval_train_reconstruction/all": 0.2881205081939697, + "eval_train_reconstruction/end_span": 0.730987012386322, + "eval_train_reconstruction/fim": 0.14906741678714752, + "eval_train_reconstruction/first_seq": 0.15505985915660858, + "eval_train_reconstruction/last_seq": 0.3302757143974304, + "eval_train_reconstruction/second_seq": 0.1907997578382492, + "eval_train_runtime": 438.6558, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, + "step": 63750 + }, + { + "epoch": 0.23783412785449445, + "grad_norm": 0.38150665163993835, + "learning_rate": 0.0006, + "loss": 2.1227, + "step": 63760 + }, + { + "epoch": 0.23787142931745783, + "grad_norm": 0.5601966381072998, + "learning_rate": 0.0006, + "loss": 2.3287, + "step": 63770 + }, + { + "epoch": 0.2379087307804212, + "grad_norm": 0.30137506127357483, + "learning_rate": 0.0006, + "loss": 2.1743, + "step": 63780 + }, + { + "epoch": 0.2379460322433846, + "grad_norm": 0.3806573450565338, + "learning_rate": 0.0006, + "loss": 2.1573, + "step": 63790 + }, + { + "epoch": 0.23798333370634797, + "grad_norm": 0.4263801872730255, + "learning_rate": 0.0006, + "loss": 2.1579, + "step": 63800 + }, + { + "epoch": 0.23802063516931135, + "grad_norm": 0.5262773633003235, + "learning_rate": 0.0006, + "loss": 2.187, + "step": 63810 + }, + { + "epoch": 0.23805793663227473, + "grad_norm": 0.27644917368888855, + "learning_rate": 0.0006, + "loss": 2.1598, + "step": 63820 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 0.4308328926563263, + "learning_rate": 0.0006, + "loss": 2.1944, + "step": 63830 + }, + { + "epoch": 0.23813253955820146, + "grad_norm": 0.4223896265029907, + "learning_rate": 0.0006, + "loss": 2.0879, + "step": 63840 + }, + { + "epoch": 0.23816984102116484, + "grad_norm": 0.34045183658599854, + "learning_rate": 0.0006, + "loss": 2.2626, + "step": 63850 + }, + { + "epoch": 0.23820714248412822, + "grad_norm": 
0.305583655834198, + "learning_rate": 0.0006, + "loss": 2.118, + "step": 63860 + }, + { + "epoch": 0.2382444439470916, + "grad_norm": 0.5636866688728333, + "learning_rate": 0.0006, + "loss": 2.1205, + "step": 63870 + }, + { + "epoch": 0.23828174541005498, + "grad_norm": 0.41590964794158936, + "learning_rate": 0.0006, + "loss": 2.2514, + "step": 63880 + }, + { + "epoch": 0.23831904687301836, + "grad_norm": 0.3198423683643341, + "learning_rate": 0.0006, + "loss": 2.23, + "step": 63890 + }, + { + "epoch": 0.23835634833598174, + "grad_norm": 0.27514955401420593, + "learning_rate": 0.0006, + "loss": 2.1949, + "step": 63900 + }, + { + "epoch": 0.23839364979894512, + "grad_norm": 0.3483380675315857, + "learning_rate": 0.0006, + "loss": 2.2437, + "step": 63910 + }, + { + "epoch": 0.2384309512619085, + "grad_norm": 0.25560811161994934, + "learning_rate": 0.0006, + "loss": 2.3282, + "step": 63920 + }, + { + "epoch": 0.23846825272487188, + "grad_norm": 0.3482379615306854, + "learning_rate": 0.0006, + "loss": 2.3568, + "step": 63930 + }, + { + "epoch": 0.23850555418783526, + "grad_norm": 0.279997855424881, + "learning_rate": 0.0006, + "loss": 2.2958, + "step": 63940 + }, + { + "epoch": 0.23854285565079864, + "grad_norm": 0.30098891258239746, + "learning_rate": 0.0006, + "loss": 2.0959, + "step": 63950 + }, + { + "epoch": 0.23858015711376201, + "grad_norm": 0.2846153974533081, + "learning_rate": 0.0006, + "loss": 2.1136, + "step": 63960 + }, + { + "epoch": 0.23861745857672537, + "grad_norm": 0.35202130675315857, + "learning_rate": 0.0006, + "loss": 2.2553, + "step": 63970 + }, + { + "epoch": 0.23865476003968875, + "grad_norm": 0.31503525376319885, + "learning_rate": 0.0006, + "loss": 2.3002, + "step": 63980 + }, + { + "epoch": 0.23869206150265213, + "grad_norm": 0.27028802037239075, + "learning_rate": 0.0006, + "loss": 2.2729, + "step": 63990 + }, + { + "epoch": 0.2387293629656155, + "grad_norm": 0.41654494404792786, + "learning_rate": 0.0006, + "loss": 2.0583, + "step": 64000 
+ }, + { + "epoch": 0.2387293629656155, + "eval_valid_loss": 2.1813619136810303, + "eval_valid_loss/all": 2.0456573963165283, + "eval_valid_loss/end_span": 1.2394260168075562, + "eval_valid_perplexity/batch": 7.734241485595703, + "eval_valid_perplexity/end_span": 3.4536306858062744, + "eval_valid_perplexity/fim": 2.3379080295562744, + "eval_valid_perplexity/first_seq": 14.99880313873291, + "eval_valid_perplexity/last_seq": 8.585694313049316, + "eval_valid_perplexity/second_seq": 13.948765754699707, + "eval_valid_perplexity/seq": 8.725935935974121, + "eval_valid_reconstruction/all": 0.2970253527164459, + "eval_valid_reconstruction/end_span": 0.7143948674201965, + "eval_valid_reconstruction/fim": 0.1718769073486328, + "eval_valid_reconstruction/first_seq": 0.16384166479110718, + "eval_valid_reconstruction/last_seq": 0.33755189180374146, + "eval_valid_reconstruction/second_seq": 0.1908227950334549, + "eval_valid_runtime": 447.0235, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 64000 + }, + { + "epoch": 0.2387293629656155, + "eval_train_loss": 2.1792218685150146, + "eval_train_loss/all": 2.017284393310547, + "eval_train_loss/end_span": 1.1966240406036377, + "eval_train_perplexity/batch": 7.517881393432617, + "eval_train_perplexity/end_span": 3.308927297592163, + "eval_train_perplexity/fim": 2.1701815128326416, + "eval_train_perplexity/first_seq": 15.283994674682617, + "eval_train_perplexity/last_seq": 8.875693321228027, + "eval_train_perplexity/second_seq": 14.089123725891113, + "eval_train_perplexity/seq": 8.65805721282959, + "eval_train_reconstruction/all": 0.28649988770484924, + "eval_train_reconstruction/end_span": 0.7279579043388367, + "eval_train_reconstruction/fim": 0.15751148760318756, + "eval_train_reconstruction/first_seq": 0.15531523525714874, + "eval_train_reconstruction/last_seq": 0.32923251390457153, + "eval_train_reconstruction/second_seq": 0.1880410760641098, + "eval_train_runtime": 453.0133, + 
"eval_train_samples_per_second": 0.424, + "eval_train_steps_per_second": 0.424, + "step": 64000 + }, + { + "epoch": 0.23876666442857888, + "grad_norm": 0.3024800419807434, + "learning_rate": 0.0006, + "loss": 2.2105, + "step": 64010 + }, + { + "epoch": 0.23880396589154226, + "grad_norm": 0.2806491255760193, + "learning_rate": 0.0006, + "loss": 2.1249, + "step": 64020 + }, + { + "epoch": 0.23884126735450564, + "grad_norm": 0.3247227072715759, + "learning_rate": 0.0006, + "loss": 2.0553, + "step": 64030 + }, + { + "epoch": 0.23887856881746902, + "grad_norm": 0.348581999540329, + "learning_rate": 0.0006, + "loss": 2.3571, + "step": 64040 + }, + { + "epoch": 0.2389158702804324, + "grad_norm": 0.26146450638771057, + "learning_rate": 0.0006, + "loss": 2.256, + "step": 64050 + }, + { + "epoch": 0.23895317174339578, + "grad_norm": 0.31739935278892517, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 64060 + }, + { + "epoch": 0.23899047320635916, + "grad_norm": 0.4059697091579437, + "learning_rate": 0.0006, + "loss": 2.1369, + "step": 64070 + }, + { + "epoch": 0.23902777466932254, + "grad_norm": 0.2765035033226013, + "learning_rate": 0.0006, + "loss": 2.3018, + "step": 64080 + }, + { + "epoch": 0.23906507613228592, + "grad_norm": 0.45651936531066895, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 64090 + }, + { + "epoch": 0.2391023775952493, + "grad_norm": 0.39655324816703796, + "learning_rate": 0.0006, + "loss": 2.2005, + "step": 64100 + }, + { + "epoch": 0.23913967905821265, + "grad_norm": 0.327576607465744, + "learning_rate": 0.0006, + "loss": 2.1653, + "step": 64110 + }, + { + "epoch": 0.23917698052117603, + "grad_norm": 0.38396790623664856, + "learning_rate": 0.0006, + "loss": 2.3789, + "step": 64120 + }, + { + "epoch": 0.2392142819841394, + "grad_norm": 0.20438086986541748, + "learning_rate": 0.0006, + "loss": 2.341, + "step": 64130 + }, + { + "epoch": 0.2392515834471028, + "grad_norm": 0.4735596776008606, + "learning_rate": 0.0006, + "loss": 2.0689, + 
"step": 64140 + }, + { + "epoch": 0.23928888491006617, + "grad_norm": 0.22247709333896637, + "learning_rate": 0.0006, + "loss": 2.2251, + "step": 64150 + }, + { + "epoch": 0.23932618637302955, + "grad_norm": 0.26781848073005676, + "learning_rate": 0.0006, + "loss": 2.1908, + "step": 64160 + }, + { + "epoch": 0.23936348783599293, + "grad_norm": 0.5175946950912476, + "learning_rate": 0.0006, + "loss": 2.1379, + "step": 64170 + }, + { + "epoch": 0.2394007892989563, + "grad_norm": 0.37512513995170593, + "learning_rate": 0.0006, + "loss": 2.2619, + "step": 64180 + }, + { + "epoch": 0.2394380907619197, + "grad_norm": 0.3275900185108185, + "learning_rate": 0.0006, + "loss": 2.11, + "step": 64190 + }, + { + "epoch": 0.23947539222488307, + "grad_norm": 0.24808348715305328, + "learning_rate": 0.0006, + "loss": 2.3152, + "step": 64200 + }, + { + "epoch": 0.23951269368784645, + "grad_norm": 0.3083451986312866, + "learning_rate": 0.0006, + "loss": 2.2114, + "step": 64210 + }, + { + "epoch": 0.23954999515080982, + "grad_norm": 0.48490646481513977, + "learning_rate": 0.0006, + "loss": 2.1085, + "step": 64220 + }, + { + "epoch": 0.2395872966137732, + "grad_norm": 0.5282449722290039, + "learning_rate": 0.0006, + "loss": 2.3094, + "step": 64230 + }, + { + "epoch": 0.23962459807673656, + "grad_norm": 0.30966830253601074, + "learning_rate": 0.0006, + "loss": 2.2644, + "step": 64240 + }, + { + "epoch": 0.23966189953969994, + "grad_norm": 0.3636534512042999, + "learning_rate": 0.0006, + "loss": 2.242, + "step": 64250 + }, + { + "epoch": 0.23966189953969994, + "eval_valid_loss": 2.1738197803497314, + "eval_valid_loss/all": 2.0386531352996826, + "eval_valid_loss/end_span": 1.182127594947815, + "eval_valid_perplexity/batch": 7.680257797241211, + "eval_valid_perplexity/end_span": 3.261305570602417, + "eval_valid_perplexity/fim": 2.0712921619415283, + "eval_valid_perplexity/first_seq": 15.17318344116211, + "eval_valid_perplexity/last_seq": 8.686607360839844, + 
"eval_valid_perplexity/second_seq": 13.67168140411377, + "eval_valid_perplexity/seq": 8.664077758789062, + "eval_valid_reconstruction/all": 0.298959344625473, + "eval_valid_reconstruction/end_span": 0.722375214099884, + "eval_valid_reconstruction/fim": 0.14852364361286163, + "eval_valid_reconstruction/first_seq": 0.16396695375442505, + "eval_valid_reconstruction/last_seq": 0.3357618749141693, + "eval_valid_reconstruction/second_seq": 0.19583824276924133, + "eval_valid_runtime": 438.7949, + "eval_valid_samples_per_second": 0.438, + "eval_valid_steps_per_second": 0.438, + "step": 64250 + }, + { + "epoch": 0.23966189953969994, + "eval_train_loss": 2.1724603176116943, + "eval_train_loss/all": 2.0112977027893066, + "eval_train_loss/end_span": 1.1437816619873047, + "eval_train_perplexity/batch": 7.473008632659912, + "eval_train_perplexity/end_span": 3.138615131378174, + "eval_train_perplexity/fim": 2.070805072784424, + "eval_train_perplexity/first_seq": 15.249553680419922, + "eval_train_perplexity/last_seq": 8.566545486450195, + "eval_train_perplexity/second_seq": 14.07728099822998, + "eval_train_perplexity/seq": 8.609102249145508, + "eval_train_reconstruction/all": 0.2881741225719452, + "eval_train_reconstruction/end_span": 0.7354918718338013, + "eval_train_reconstruction/fim": 0.148683100938797, + "eval_train_reconstruction/first_seq": 0.15568341314792633, + "eval_train_reconstruction/last_seq": 0.3395310342311859, + "eval_train_reconstruction/second_seq": 0.18875233829021454, + "eval_train_runtime": 444.2521, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 64250 + }, + { + "epoch": 0.23969920100266331, + "grad_norm": 0.34063631296157837, + "learning_rate": 0.0006, + "loss": 2.1721, + "step": 64260 + }, + { + "epoch": 0.2397365024656267, + "grad_norm": 0.4026123583316803, + "learning_rate": 0.0006, + "loss": 2.061, + "step": 64270 + }, + { + "epoch": 0.23977380392859007, + "grad_norm": 0.49588146805763245, + "learning_rate": 
0.0006, + "loss": 2.2089, + "step": 64280 + }, + { + "epoch": 0.23981110539155345, + "grad_norm": 0.8689521551132202, + "learning_rate": 0.0006, + "loss": 2.0779, + "step": 64290 + }, + { + "epoch": 0.23984840685451683, + "grad_norm": 0.2271794229745865, + "learning_rate": 0.0006, + "loss": 2.2672, + "step": 64300 + }, + { + "epoch": 0.2398857083174802, + "grad_norm": 0.41614723205566406, + "learning_rate": 0.0006, + "loss": 2.28, + "step": 64310 + }, + { + "epoch": 0.2399230097804436, + "grad_norm": 0.39472517371177673, + "learning_rate": 0.0006, + "loss": 2.2835, + "step": 64320 + }, + { + "epoch": 0.23996031124340697, + "grad_norm": 0.3506198227405548, + "learning_rate": 0.0006, + "loss": 2.1645, + "step": 64330 + }, + { + "epoch": 0.23999761270637035, + "grad_norm": 0.2985769808292389, + "learning_rate": 0.0006, + "loss": 2.2734, + "step": 64340 + }, + { + "epoch": 0.24003491416933373, + "grad_norm": 0.5588632225990295, + "learning_rate": 0.0006, + "loss": 1.9796, + "step": 64350 + }, + { + "epoch": 0.2400722156322971, + "grad_norm": 0.36752939224243164, + "learning_rate": 0.0006, + "loss": 2.1597, + "step": 64360 + }, + { + "epoch": 0.2401095170952605, + "grad_norm": 0.346260666847229, + "learning_rate": 0.0006, + "loss": 2.2057, + "step": 64370 + }, + { + "epoch": 0.24014681855822384, + "grad_norm": 0.4268495738506317, + "learning_rate": 0.0006, + "loss": 2.1048, + "step": 64380 + }, + { + "epoch": 0.24018412002118722, + "grad_norm": 0.5865180492401123, + "learning_rate": 0.0006, + "loss": 2.1368, + "step": 64390 + }, + { + "epoch": 0.2402214214841506, + "grad_norm": 0.8803560733795166, + "learning_rate": 0.0006, + "loss": 2.1079, + "step": 64400 + }, + { + "epoch": 0.24025872294711398, + "grad_norm": 0.35371133685112, + "learning_rate": 0.0006, + "loss": 2.1699, + "step": 64410 + }, + { + "epoch": 0.24029602441007736, + "grad_norm": 0.34831592440605164, + "learning_rate": 0.0006, + "loss": 2.1525, + "step": 64420 + }, + { + "epoch": 0.24033332587304074, + 
"grad_norm": 0.27057793736457825, + "learning_rate": 0.0006, + "loss": 2.1117, + "step": 64430 + }, + { + "epoch": 0.24037062733600412, + "grad_norm": 0.29003334045410156, + "learning_rate": 0.0006, + "loss": 2.1348, + "step": 64440 + }, + { + "epoch": 0.2404079287989675, + "grad_norm": 0.3766759932041168, + "learning_rate": 0.0006, + "loss": 2.1386, + "step": 64450 + }, + { + "epoch": 0.24044523026193088, + "grad_norm": 7.960123062133789, + "learning_rate": 0.0006, + "loss": 2.3746, + "step": 64460 + }, + { + "epoch": 0.24048253172489426, + "grad_norm": 0.3371344804763794, + "learning_rate": 0.0006, + "loss": 2.1571, + "step": 64470 + }, + { + "epoch": 0.24051983318785763, + "grad_norm": 0.3551732003688812, + "learning_rate": 0.0006, + "loss": 2.2156, + "step": 64480 + }, + { + "epoch": 0.24055713465082101, + "grad_norm": 0.4003690481185913, + "learning_rate": 0.0006, + "loss": 2.1699, + "step": 64490 + }, + { + "epoch": 0.2405944361137844, + "grad_norm": 0.3175189793109894, + "learning_rate": 0.0006, + "loss": 2.3341, + "step": 64500 + }, + { + "epoch": 0.2405944361137844, + "eval_valid_loss": 2.1789464950561523, + "eval_valid_loss/all": 2.043581247329712, + "eval_valid_loss/end_span": 1.2958085536956787, + "eval_valid_perplexity/batch": 7.71820068359375, + "eval_valid_perplexity/end_span": 3.65394926071167, + "eval_valid_perplexity/fim": 2.2122926712036133, + "eval_valid_perplexity/first_seq": 15.007795333862305, + "eval_valid_perplexity/last_seq": 8.518065452575684, + "eval_valid_perplexity/second_seq": 13.670547485351562, + "eval_valid_perplexity/seq": 8.711064338684082, + "eval_valid_reconstruction/all": 0.2973616123199463, + "eval_valid_reconstruction/end_span": 0.6947085857391357, + "eval_valid_reconstruction/fim": 0.16143792867660522, + "eval_valid_reconstruction/first_seq": 0.16708101332187653, + "eval_valid_reconstruction/last_seq": 0.34428200125694275, + "eval_valid_reconstruction/second_seq": 0.19716733694076538, + "eval_valid_runtime": 440.9111, + 
"eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 64500 + }, + { + "epoch": 0.2405944361137844, + "eval_train_loss": 2.1754496097564697, + "eval_train_loss/all": 2.014270305633545, + "eval_train_loss/end_span": 1.2643564939498901, + "eval_train_perplexity/batch": 7.495255947113037, + "eval_train_perplexity/end_span": 3.540813446044922, + "eval_train_perplexity/fim": 1.965148687362671, + "eval_train_perplexity/first_seq": 15.71733283996582, + "eval_train_perplexity/last_seq": 8.736763954162598, + "eval_train_perplexity/second_seq": 14.211568832397461, + "eval_train_perplexity/seq": 8.636969566345215, + "eval_train_reconstruction/all": 0.28739020228385925, + "eval_train_reconstruction/end_span": 0.7034434676170349, + "eval_train_reconstruction/fim": 0.1369733065366745, + "eval_train_reconstruction/first_seq": 0.14858810603618622, + "eval_train_reconstruction/last_seq": 0.3345300257205963, + "eval_train_reconstruction/second_seq": 0.1863759160041809, + "eval_train_runtime": 448.8208, + "eval_train_samples_per_second": 0.428, + "eval_train_steps_per_second": 0.428, + "step": 64500 + }, + { + "epoch": 0.24063173757674777, + "grad_norm": 0.6889896988868713, + "learning_rate": 0.0006, + "loss": 2.2002, + "step": 64510 + }, + { + "epoch": 0.24066903903971112, + "grad_norm": 0.30913302302360535, + "learning_rate": 0.0006, + "loss": 2.221, + "step": 64520 + }, + { + "epoch": 0.2407063405026745, + "grad_norm": 0.3029591739177704, + "learning_rate": 0.0006, + "loss": 2.2166, + "step": 64530 + }, + { + "epoch": 0.24074364196563788, + "grad_norm": 0.2945133149623871, + "learning_rate": 0.0006, + "loss": 2.1721, + "step": 64540 + }, + { + "epoch": 0.24078094342860126, + "grad_norm": 0.31269577145576477, + "learning_rate": 0.0006, + "loss": 2.0423, + "step": 64550 + }, + { + "epoch": 0.24081824489156464, + "grad_norm": 0.39428967237472534, + "learning_rate": 0.0006, + "loss": 2.1539, + "step": 64560 + }, + { + "epoch": 0.24085554635452802, + 
"grad_norm": 0.2929314076900482, + "learning_rate": 0.0006, + "loss": 2.1072, + "step": 64570 + }, + { + "epoch": 0.2408928478174914, + "grad_norm": 0.32419419288635254, + "learning_rate": 0.0006, + "loss": 2.2927, + "step": 64580 + }, + { + "epoch": 0.24093014928045478, + "grad_norm": 0.30607864260673523, + "learning_rate": 0.0006, + "loss": 2.1417, + "step": 64590 + }, + { + "epoch": 0.24096745074341816, + "grad_norm": 0.4812691807746887, + "learning_rate": 0.0006, + "loss": 2.1179, + "step": 64600 + }, + { + "epoch": 0.24100475220638154, + "grad_norm": 0.3031587600708008, + "learning_rate": 0.0006, + "loss": 2.2622, + "step": 64610 + }, + { + "epoch": 0.24104205366934492, + "grad_norm": 0.3865151107311249, + "learning_rate": 0.0006, + "loss": 2.2345, + "step": 64620 + }, + { + "epoch": 0.2410793551323083, + "grad_norm": 0.44195786118507385, + "learning_rate": 0.0006, + "loss": 2.1186, + "step": 64630 + }, + { + "epoch": 0.24111665659527168, + "grad_norm": 0.33333978056907654, + "learning_rate": 0.0006, + "loss": 2.3133, + "step": 64640 + }, + { + "epoch": 0.24115395805823506, + "grad_norm": 0.27118203043937683, + "learning_rate": 0.0006, + "loss": 2.2299, + "step": 64650 + }, + { + "epoch": 0.2411912595211984, + "grad_norm": 0.2860923707485199, + "learning_rate": 0.0006, + "loss": 2.166, + "step": 64660 + }, + { + "epoch": 0.2412285609841618, + "grad_norm": 0.3769072890281677, + "learning_rate": 0.0006, + "loss": 2.1836, + "step": 64670 + }, + { + "epoch": 0.24126586244712517, + "grad_norm": 0.2784774601459503, + "learning_rate": 0.0006, + "loss": 2.1972, + "step": 64680 + }, + { + "epoch": 0.24130316391008855, + "grad_norm": 0.23433932662010193, + "learning_rate": 0.0006, + "loss": 2.2751, + "step": 64690 + }, + { + "epoch": 0.24134046537305193, + "grad_norm": 0.4249230921268463, + "learning_rate": 0.0006, + "loss": 2.2501, + "step": 64700 + }, + { + "epoch": 0.2413777668360153, + "grad_norm": 0.29216137528419495, + "learning_rate": 0.0006, + "loss": 2.3268, + 
"step": 64710 + }, + { + "epoch": 0.24141506829897869, + "grad_norm": 0.16195246577262878, + "learning_rate": 0.0006, + "loss": 2.4167, + "step": 64720 + }, + { + "epoch": 0.24145236976194207, + "grad_norm": 0.3079032897949219, + "learning_rate": 0.0006, + "loss": 2.2488, + "step": 64730 + }, + { + "epoch": 0.24148967122490544, + "grad_norm": 0.39963069558143616, + "learning_rate": 0.0006, + "loss": 2.1896, + "step": 64740 + }, + { + "epoch": 0.24152697268786882, + "grad_norm": 0.460781067609787, + "learning_rate": 0.0006, + "loss": 2.3412, + "step": 64750 + }, + { + "epoch": 0.24152697268786882, + "eval_valid_loss": 2.176422357559204, + "eval_valid_loss/all": 2.041271924972534, + "eval_valid_loss/end_span": 1.2115767002105713, + "eval_valid_perplexity/batch": 7.700397491455078, + "eval_valid_perplexity/end_span": 3.358776330947876, + "eval_valid_perplexity/fim": 2.528247833251953, + "eval_valid_perplexity/first_seq": 15.123805046081543, + "eval_valid_perplexity/last_seq": 8.744641304016113, + "eval_valid_perplexity/second_seq": 14.274346351623535, + "eval_valid_perplexity/seq": 8.690024375915527, + "eval_valid_reconstruction/all": 0.2983859181404114, + "eval_valid_reconstruction/end_span": 0.7181290984153748, + "eval_valid_reconstruction/fim": 0.18834342062473297, + "eval_valid_reconstruction/first_seq": 0.16082052886486053, + "eval_valid_reconstruction/last_seq": 0.3381807506084442, + "eval_valid_reconstruction/second_seq": 0.18657192587852478, + "eval_valid_runtime": 441.8979, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 64750 + }, + { + "epoch": 0.24152697268786882, + "eval_train_loss": 2.173490524291992, + "eval_train_loss/all": 2.011984348297119, + "eval_train_loss/end_span": 1.1784218549728394, + "eval_train_perplexity/batch": 7.478141784667969, + "eval_train_perplexity/end_span": 3.2492423057556152, + "eval_train_perplexity/fim": 2.1232802867889404, + "eval_train_perplexity/first_seq": 15.494818687438965, + 
"eval_train_perplexity/last_seq": 8.839764595031738, + "eval_train_perplexity/second_seq": 14.41405963897705, + "eval_train_perplexity/seq": 8.611248016357422, + "eval_train_reconstruction/all": 0.28833526372909546, + "eval_train_reconstruction/end_span": 0.7264699339866638, + "eval_train_reconstruction/fim": 0.15391391515731812, + "eval_train_reconstruction/first_seq": 0.14751079678535461, + "eval_train_reconstruction/last_seq": 0.32693207263946533, + "eval_train_reconstruction/second_seq": 0.1764819175004959, + "eval_train_runtime": 444.1739, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 64750 + }, + { + "epoch": 0.2415642741508322, + "grad_norm": 0.41286700963974, + "learning_rate": 0.0006, + "loss": 2.2532, + "step": 64760 + }, + { + "epoch": 0.24160157561379558, + "grad_norm": 0.32645609974861145, + "learning_rate": 0.0006, + "loss": 2.3326, + "step": 64770 + }, + { + "epoch": 0.24163887707675896, + "grad_norm": 0.45683443546295166, + "learning_rate": 0.0006, + "loss": 2.3428, + "step": 64780 + }, + { + "epoch": 0.24167617853972234, + "grad_norm": 0.31915807723999023, + "learning_rate": 0.0006, + "loss": 2.2567, + "step": 64790 + }, + { + "epoch": 0.2417134800026857, + "grad_norm": 0.41817566752433777, + "learning_rate": 0.0006, + "loss": 2.2488, + "step": 64800 + }, + { + "epoch": 0.24175078146564907, + "grad_norm": 0.2503199279308319, + "learning_rate": 0.0006, + "loss": 2.1758, + "step": 64810 + }, + { + "epoch": 0.24178808292861245, + "grad_norm": 0.3222777545452118, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 64820 + }, + { + "epoch": 0.24182538439157583, + "grad_norm": 0.2865277826786041, + "learning_rate": 0.0006, + "loss": 2.1042, + "step": 64830 + }, + { + "epoch": 0.2418626858545392, + "grad_norm": 0.3351757228374481, + "learning_rate": 0.0006, + "loss": 2.3912, + "step": 64840 + }, + { + "epoch": 0.2418999873175026, + "grad_norm": 0.22719289362430573, + "learning_rate": 0.0006, + "loss": 
2.1716, + "step": 64850 + }, + { + "epoch": 0.24193728878046597, + "grad_norm": 0.34308740496635437, + "learning_rate": 0.0006, + "loss": 2.2678, + "step": 64860 + }, + { + "epoch": 0.24197459024342935, + "grad_norm": 0.36071881651878357, + "learning_rate": 0.0006, + "loss": 2.0663, + "step": 64870 + }, + { + "epoch": 0.24201189170639273, + "grad_norm": 0.4766163229942322, + "learning_rate": 0.0006, + "loss": 2.1327, + "step": 64880 + }, + { + "epoch": 0.2420491931693561, + "grad_norm": 0.2948518991470337, + "learning_rate": 0.0006, + "loss": 2.2121, + "step": 64890 + }, + { + "epoch": 0.2420864946323195, + "grad_norm": 0.18835397064685822, + "learning_rate": 0.0006, + "loss": 1.9973, + "step": 64900 + }, + { + "epoch": 0.24212379609528287, + "grad_norm": 0.33130085468292236, + "learning_rate": 0.0006, + "loss": 2.3106, + "step": 64910 + }, + { + "epoch": 0.24216109755824625, + "grad_norm": 0.2831662595272064, + "learning_rate": 0.0006, + "loss": 2.3839, + "step": 64920 + }, + { + "epoch": 0.2421983990212096, + "grad_norm": 0.31879815459251404, + "learning_rate": 0.0006, + "loss": 2.3338, + "step": 64930 + }, + { + "epoch": 0.24223570048417298, + "grad_norm": 0.49666595458984375, + "learning_rate": 0.0006, + "loss": 2.1069, + "step": 64940 + }, + { + "epoch": 0.24227300194713636, + "grad_norm": 0.3953230679035187, + "learning_rate": 0.0006, + "loss": 2.3222, + "step": 64950 + }, + { + "epoch": 0.24231030341009974, + "grad_norm": 0.47849398851394653, + "learning_rate": 0.0006, + "loss": 2.1798, + "step": 64960 + }, + { + "epoch": 0.24234760487306312, + "grad_norm": 0.4421854317188263, + "learning_rate": 0.0006, + "loss": 1.7936, + "step": 64970 + }, + { + "epoch": 0.2423849063360265, + "grad_norm": 0.42613211274147034, + "learning_rate": 0.0006, + "loss": 2.212, + "step": 64980 + }, + { + "epoch": 0.24242220779898987, + "grad_norm": 0.449965238571167, + "learning_rate": 0.0006, + "loss": 2.301, + "step": 64990 + }, + { + "epoch": 0.24245950926195325, + "grad_norm": 
0.39150699973106384, + "learning_rate": 0.0006, + "loss": 2.1026, + "step": 65000 + }, + { + "epoch": 0.24245950926195325, + "eval_valid_loss": 2.177813768386841, + "eval_valid_loss/all": 2.042358875274658, + "eval_valid_loss/end_span": 1.1306601762771606, + "eval_valid_perplexity/batch": 7.708771705627441, + "eval_valid_perplexity/end_span": 3.097700834274292, + "eval_valid_perplexity/fim": 2.553805351257324, + "eval_valid_perplexity/first_seq": 14.537757873535156, + "eval_valid_perplexity/last_seq": 8.772249221801758, + "eval_valid_perplexity/second_seq": 13.229723930358887, + "eval_valid_perplexity/seq": 8.701088905334473, + "eval_valid_reconstruction/all": 0.29788562655448914, + "eval_valid_reconstruction/end_span": 0.7432374358177185, + "eval_valid_reconstruction/fim": 0.18938089907169342, + "eval_valid_reconstruction/first_seq": 0.17652568221092224, + "eval_valid_reconstruction/last_seq": 0.3332226276397705, + "eval_valid_reconstruction/second_seq": 0.20954866707324982, + "eval_valid_runtime": 446.8022, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 65000 + }, + { + "epoch": 0.24245950926195325, + "eval_train_loss": 2.1748602390289307, + "eval_train_loss/all": 2.0133490562438965, + "eval_train_loss/end_span": 1.090988039970398, + "eval_train_perplexity/batch": 7.488354206085205, + "eval_train_perplexity/end_span": 2.9772143363952637, + "eval_train_perplexity/fim": 1.9439506530761719, + "eval_train_perplexity/first_seq": 15.559539794921875, + "eval_train_perplexity/last_seq": 8.972033500671387, + "eval_train_perplexity/second_seq": 14.255888938903809, + "eval_train_perplexity/seq": 8.626870155334473, + "eval_train_reconstruction/all": 0.2876474857330322, + "eval_train_reconstruction/end_span": 0.7564126253128052, + "eval_train_reconstruction/fim": 0.1352224200963974, + "eval_train_reconstruction/first_seq": 0.15524545311927795, + "eval_train_reconstruction/last_seq": 0.3252072036266327, + 
"eval_train_reconstruction/second_seq": 0.18195614218711853, + "eval_train_runtime": 451.922, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 65000 + }, + { + "epoch": 0.24249681072491663, + "grad_norm": 0.3966468870639801, + "learning_rate": 0.0006, + "loss": 2.416, + "step": 65010 + }, + { + "epoch": 0.24253411218788, + "grad_norm": 0.26898613572120667, + "learning_rate": 0.0006, + "loss": 2.2074, + "step": 65020 + }, + { + "epoch": 0.2425714136508434, + "grad_norm": 0.41804268956184387, + "learning_rate": 0.0006, + "loss": 2.3118, + "step": 65030 + }, + { + "epoch": 0.24260871511380677, + "grad_norm": 0.36438530683517456, + "learning_rate": 0.0006, + "loss": 2.1887, + "step": 65040 + }, + { + "epoch": 0.24264601657677015, + "grad_norm": 0.32416418194770813, + "learning_rate": 0.0006, + "loss": 2.222, + "step": 65050 + }, + { + "epoch": 0.24268331803973353, + "grad_norm": 0.30569663643836975, + "learning_rate": 0.0006, + "loss": 2.2233, + "step": 65060 + }, + { + "epoch": 0.24272061950269688, + "grad_norm": 0.38624125719070435, + "learning_rate": 0.0006, + "loss": 2.2669, + "step": 65070 + }, + { + "epoch": 0.24275792096566026, + "grad_norm": 0.35172712802886963, + "learning_rate": 0.0006, + "loss": 2.2086, + "step": 65080 + }, + { + "epoch": 0.24279522242862364, + "grad_norm": 0.27962052822113037, + "learning_rate": 0.0006, + "loss": 2.249, + "step": 65090 + }, + { + "epoch": 0.24283252389158702, + "grad_norm": 0.4215196669101715, + "learning_rate": 0.0006, + "loss": 2.263, + "step": 65100 + }, + { + "epoch": 0.2428698253545504, + "grad_norm": 0.3667326867580414, + "learning_rate": 0.0006, + "loss": 2.0946, + "step": 65110 + }, + { + "epoch": 0.24290712681751378, + "grad_norm": 0.37657850980758667, + "learning_rate": 0.0006, + "loss": 2.2713, + "step": 65120 + }, + { + "epoch": 0.24294442828047716, + "grad_norm": 0.3628084659576416, + "learning_rate": 0.0006, + "loss": 2.2571, + "step": 65130 + }, + { + "epoch": 
0.24298172974344054, + "grad_norm": 0.38307616114616394, + "learning_rate": 0.0006, + "loss": 2.2427, + "step": 65140 + }, + { + "epoch": 0.24301903120640392, + "grad_norm": 0.34268495440483093, + "learning_rate": 0.0006, + "loss": 2.1524, + "step": 65150 + }, + { + "epoch": 0.2430563326693673, + "grad_norm": 0.29790472984313965, + "learning_rate": 0.0006, + "loss": 2.1686, + "step": 65160 + }, + { + "epoch": 0.24309363413233068, + "grad_norm": 0.25793153047561646, + "learning_rate": 0.0006, + "loss": 2.3791, + "step": 65170 + }, + { + "epoch": 0.24313093559529406, + "grad_norm": 0.32953211665153503, + "learning_rate": 0.0006, + "loss": 2.228, + "step": 65180 + }, + { + "epoch": 0.24316823705825744, + "grad_norm": 0.2919797897338867, + "learning_rate": 0.0006, + "loss": 2.1428, + "step": 65190 + }, + { + "epoch": 0.24320553852122082, + "grad_norm": 0.2676002085208893, + "learning_rate": 0.0006, + "loss": 2.0668, + "step": 65200 + }, + { + "epoch": 0.24324283998418417, + "grad_norm": 0.33679190278053284, + "learning_rate": 0.0006, + "loss": 2.2889, + "step": 65210 + }, + { + "epoch": 0.24328014144714755, + "grad_norm": 0.284624844789505, + "learning_rate": 0.0006, + "loss": 2.3004, + "step": 65220 + }, + { + "epoch": 0.24331744291011093, + "grad_norm": 0.3139248192310333, + "learning_rate": 0.0006, + "loss": 2.0352, + "step": 65230 + }, + { + "epoch": 0.2433547443730743, + "grad_norm": 0.23859643936157227, + "learning_rate": 0.0006, + "loss": 2.1549, + "step": 65240 + }, + { + "epoch": 0.24339204583603768, + "grad_norm": 0.2755318880081177, + "learning_rate": 0.0006, + "loss": 2.2158, + "step": 65250 + }, + { + "epoch": 0.24339204583603768, + "eval_valid_loss": 2.1793367862701416, + "eval_valid_loss/all": 2.0433781147003174, + "eval_valid_loss/end_span": 1.214194416999817, + "eval_valid_perplexity/batch": 7.716632843017578, + "eval_valid_perplexity/end_span": 3.3675801753997803, + "eval_valid_perplexity/fim": 2.315706968307495, + "eval_valid_perplexity/first_seq": 
14.897683143615723, + "eval_valid_perplexity/last_seq": 8.924398422241211, + "eval_valid_perplexity/second_seq": 13.911444664001465, + "eval_valid_perplexity/seq": 8.700864791870117, + "eval_valid_reconstruction/all": 0.2975190579891205, + "eval_valid_reconstruction/end_span": 0.7129493355751038, + "eval_valid_reconstruction/fim": 0.17056968808174133, + "eval_valid_reconstruction/first_seq": 0.16738729178905487, + "eval_valid_reconstruction/last_seq": 0.32878658175468445, + "eval_valid_reconstruction/second_seq": 0.19179877638816833, + "eval_valid_runtime": 441.385, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 65250 + }, + { + "epoch": 0.24339204583603768, + "eval_train_loss": 2.176612615585327, + "eval_train_loss/all": 2.014716148376465, + "eval_train_loss/end_span": 1.1680573225021362, + "eval_train_perplexity/batch": 7.498598575592041, + "eval_train_perplexity/end_span": 3.2157394886016846, + "eval_train_perplexity/fim": 2.2799618244171143, + "eval_train_perplexity/first_seq": 15.637672424316406, + "eval_train_perplexity/last_seq": 8.582853317260742, + "eval_train_perplexity/second_seq": 14.053860664367676, + "eval_train_perplexity/seq": 8.633830070495605, + "eval_train_reconstruction/all": 0.28700193762779236, + "eval_train_reconstruction/end_span": 0.7269026637077332, + "eval_train_reconstruction/fim": 0.1666417121887207, + "eval_train_reconstruction/first_seq": 0.14949220418930054, + "eval_train_reconstruction/last_seq": 0.33635732531547546, + "eval_train_reconstruction/second_seq": 0.19019246101379395, + "eval_train_runtime": 447.2699, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 65250 + }, + { + "epoch": 0.24342934729900106, + "grad_norm": 0.2953198254108429, + "learning_rate": 0.0006, + "loss": 2.3315, + "step": 65260 + }, + { + "epoch": 0.24346664876196444, + "grad_norm": 0.36808037757873535, + "learning_rate": 0.0006, + "loss": 2.0345, + "step": 65270 + }, + { + 
"epoch": 0.24350395022492782, + "grad_norm": 0.5151561498641968, + "learning_rate": 0.0006, + "loss": 2.278, + "step": 65280 + }, + { + "epoch": 0.2435412516878912, + "grad_norm": 0.2510530948638916, + "learning_rate": 0.0006, + "loss": 2.3431, + "step": 65290 + }, + { + "epoch": 0.24357855315085458, + "grad_norm": 0.40632644295692444, + "learning_rate": 0.0006, + "loss": 1.9727, + "step": 65300 + }, + { + "epoch": 0.24361585461381796, + "grad_norm": 0.3412076234817505, + "learning_rate": 0.0006, + "loss": 2.2099, + "step": 65310 + }, + { + "epoch": 0.24365315607678134, + "grad_norm": 0.4429258108139038, + "learning_rate": 0.0006, + "loss": 2.2042, + "step": 65320 + }, + { + "epoch": 0.24369045753974472, + "grad_norm": 0.25826936960220337, + "learning_rate": 0.0006, + "loss": 2.3303, + "step": 65330 + }, + { + "epoch": 0.2437277590027081, + "grad_norm": 0.29476258158683777, + "learning_rate": 0.0006, + "loss": 2.273, + "step": 65340 + }, + { + "epoch": 0.24376506046567145, + "grad_norm": 0.45467430353164673, + "learning_rate": 0.0006, + "loss": 2.1658, + "step": 65350 + }, + { + "epoch": 0.24380236192863483, + "grad_norm": 0.43841472268104553, + "learning_rate": 0.0006, + "loss": 2.2234, + "step": 65360 + }, + { + "epoch": 0.2438396633915982, + "grad_norm": 0.3089059889316559, + "learning_rate": 0.0006, + "loss": 2.1638, + "step": 65370 + }, + { + "epoch": 0.2438769648545616, + "grad_norm": 0.34795308113098145, + "learning_rate": 0.0006, + "loss": 2.2426, + "step": 65380 + }, + { + "epoch": 0.24391426631752497, + "grad_norm": 0.2917163670063019, + "learning_rate": 0.0006, + "loss": 2.344, + "step": 65390 + }, + { + "epoch": 0.24395156778048835, + "grad_norm": 0.3484167754650116, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 65400 + }, + { + "epoch": 0.24398886924345173, + "grad_norm": 0.2868759036064148, + "learning_rate": 0.0006, + "loss": 2.2271, + "step": 65410 + }, + { + "epoch": 0.2440261707064151, + "grad_norm": 0.2966592013835907, + "learning_rate": 
0.0006, + "loss": 1.9842, + "step": 65420 + }, + { + "epoch": 0.2440634721693785, + "grad_norm": 0.29614779353141785, + "learning_rate": 0.0006, + "loss": 2.1008, + "step": 65430 + }, + { + "epoch": 0.24410077363234187, + "grad_norm": 0.304031103849411, + "learning_rate": 0.0006, + "loss": 2.2646, + "step": 65440 + }, + { + "epoch": 0.24413807509530525, + "grad_norm": 0.2403869777917862, + "learning_rate": 0.0006, + "loss": 1.9979, + "step": 65450 + }, + { + "epoch": 0.24417537655826863, + "grad_norm": 0.2776937186717987, + "learning_rate": 0.0006, + "loss": 2.1664, + "step": 65460 + }, + { + "epoch": 0.244212678021232, + "grad_norm": 0.3393367528915405, + "learning_rate": 0.0006, + "loss": 2.1034, + "step": 65470 + }, + { + "epoch": 0.24424997948419536, + "grad_norm": 0.37364670634269714, + "learning_rate": 0.0006, + "loss": 2.0971, + "step": 65480 + }, + { + "epoch": 0.24428728094715874, + "grad_norm": 0.26587173342704773, + "learning_rate": 0.0006, + "loss": 2.3211, + "step": 65490 + }, + { + "epoch": 0.24432458241012212, + "grad_norm": 0.3748851716518402, + "learning_rate": 0.0006, + "loss": 1.9906, + "step": 65500 + }, + { + "epoch": 0.24432458241012212, + "eval_valid_loss": 2.173372507095337, + "eval_valid_loss/all": 2.038665533065796, + "eval_valid_loss/end_span": 1.2621656656265259, + "eval_valid_perplexity/batch": 7.680353164672852, + "eval_valid_perplexity/end_span": 3.533064603805542, + "eval_valid_perplexity/fim": 2.5438389778137207, + "eval_valid_perplexity/first_seq": 14.612117767333984, + "eval_valid_perplexity/last_seq": 8.693297386169434, + "eval_valid_perplexity/second_seq": 13.60897159576416, + "eval_valid_perplexity/seq": 8.675257682800293, + "eval_valid_reconstruction/all": 0.29897481203079224, + "eval_valid_reconstruction/end_span": 0.7053094506263733, + "eval_valid_reconstruction/fim": 0.1902349591255188, + "eval_valid_reconstruction/first_seq": 0.17178429663181305, + "eval_valid_reconstruction/last_seq": 0.3357865810394287, + 
"eval_valid_reconstruction/second_seq": 0.19976282119750977, + "eval_valid_runtime": 443.076, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 65500 + }, + { + "epoch": 0.24432458241012212, + "eval_train_loss": 2.171935796737671, + "eval_train_loss/all": 2.011198043823242, + "eval_train_loss/end_span": 1.2292180061340332, + "eval_train_perplexity/batch": 7.472264289855957, + "eval_train_perplexity/end_span": 3.41855525970459, + "eval_train_perplexity/fim": 2.136573076248169, + "eval_train_perplexity/first_seq": 15.604194641113281, + "eval_train_perplexity/last_seq": 8.938275337219238, + "eval_train_perplexity/second_seq": 14.123536109924316, + "eval_train_perplexity/seq": 8.613822937011719, + "eval_train_reconstruction/all": 0.2882953882217407, + "eval_train_reconstruction/end_span": 0.7144930362701416, + "eval_train_reconstruction/fim": 0.15507876873016357, + "eval_train_reconstruction/first_seq": 0.149953231215477, + "eval_train_reconstruction/last_seq": 0.3263131082057953, + "eval_train_reconstruction/second_seq": 0.18485428392887115, + "eval_train_runtime": 446.0996, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 65500 + }, + { + "epoch": 0.2443618838730855, + "grad_norm": 0.27768829464912415, + "learning_rate": 0.0006, + "loss": 2.114, + "step": 65510 + }, + { + "epoch": 0.24439918533604887, + "grad_norm": 0.4253733456134796, + "learning_rate": 0.0006, + "loss": 2.1136, + "step": 65520 + }, + { + "epoch": 0.24443648679901225, + "grad_norm": 0.34464240074157715, + "learning_rate": 0.0006, + "loss": 2.2296, + "step": 65530 + }, + { + "epoch": 0.24447378826197563, + "grad_norm": 0.3357136845588684, + "learning_rate": 0.0006, + "loss": 2.1806, + "step": 65540 + }, + { + "epoch": 0.244511089724939, + "grad_norm": 0.4595266580581665, + "learning_rate": 0.0006, + "loss": 2.3832, + "step": 65550 + }, + { + "epoch": 0.2445483911879024, + "grad_norm": 0.3243086040019989, + 
"learning_rate": 0.0006, + "loss": 2.1813, + "step": 65560 + }, + { + "epoch": 0.24458569265086577, + "grad_norm": 0.4757358133792877, + "learning_rate": 0.0006, + "loss": 2.2727, + "step": 65570 + }, + { + "epoch": 0.24462299411382915, + "grad_norm": 0.34812131524086, + "learning_rate": 0.0006, + "loss": 2.206, + "step": 65580 + }, + { + "epoch": 0.24466029557679253, + "grad_norm": 0.3017381727695465, + "learning_rate": 0.0006, + "loss": 2.2923, + "step": 65590 + }, + { + "epoch": 0.2446975970397559, + "grad_norm": 0.317882776260376, + "learning_rate": 0.0006, + "loss": 2.3149, + "step": 65600 + }, + { + "epoch": 0.2447348985027193, + "grad_norm": 0.3427346348762512, + "learning_rate": 0.0006, + "loss": 2.1883, + "step": 65610 + }, + { + "epoch": 0.24477219996568264, + "grad_norm": 0.3715161681175232, + "learning_rate": 0.0006, + "loss": 2.173, + "step": 65620 + }, + { + "epoch": 0.24480950142864602, + "grad_norm": 0.46525847911834717, + "learning_rate": 0.0006, + "loss": 2.2109, + "step": 65630 + }, + { + "epoch": 0.2448468028916094, + "grad_norm": 0.3464865982532501, + "learning_rate": 0.0006, + "loss": 2.1973, + "step": 65640 + }, + { + "epoch": 0.24488410435457278, + "grad_norm": 0.2613174021244049, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 65650 + }, + { + "epoch": 0.24492140581753616, + "grad_norm": 0.24773834645748138, + "learning_rate": 0.0006, + "loss": 2.2627, + "step": 65660 + }, + { + "epoch": 0.24495870728049954, + "grad_norm": 0.34854647517204285, + "learning_rate": 0.0006, + "loss": 2.1732, + "step": 65670 + }, + { + "epoch": 0.24499600874346292, + "grad_norm": 0.23651044070720673, + "learning_rate": 0.0006, + "loss": 2.3215, + "step": 65680 + }, + { + "epoch": 0.2450333102064263, + "grad_norm": 0.2780376672744751, + "learning_rate": 0.0006, + "loss": 2.1424, + "step": 65690 + }, + { + "epoch": 0.24507061166938968, + "grad_norm": 0.3129616379737854, + "learning_rate": 0.0006, + "loss": 2.1366, + "step": 65700 + }, + { + "epoch": 
0.24510791313235306, + "grad_norm": 0.3184647560119629, + "learning_rate": 0.0006, + "loss": 2.1392, + "step": 65710 + }, + { + "epoch": 0.24514521459531644, + "grad_norm": 0.33469876646995544, + "learning_rate": 0.0006, + "loss": 2.3188, + "step": 65720 + }, + { + "epoch": 0.24518251605827981, + "grad_norm": 0.2815888822078705, + "learning_rate": 0.0006, + "loss": 2.0413, + "step": 65730 + }, + { + "epoch": 0.2452198175212432, + "grad_norm": 0.3360435664653778, + "learning_rate": 0.0006, + "loss": 2.1663, + "step": 65740 + }, + { + "epoch": 0.24525711898420657, + "grad_norm": 0.35362789034843445, + "learning_rate": 0.0006, + "loss": 2.1429, + "step": 65750 + }, + { + "epoch": 0.24525711898420657, + "eval_valid_loss": 2.1707892417907715, + "eval_valid_loss/all": 2.035884141921997, + "eval_valid_loss/end_span": 1.216348648071289, + "eval_valid_perplexity/batch": 7.659020900726318, + "eval_valid_perplexity/end_span": 3.374842405319214, + "eval_valid_perplexity/fim": 2.4034969806671143, + "eval_valid_perplexity/first_seq": 14.713747024536133, + "eval_valid_perplexity/last_seq": 8.882390975952148, + "eval_valid_perplexity/second_seq": 13.792898178100586, + "eval_valid_perplexity/seq": 8.641386985778809, + "eval_valid_reconstruction/all": 0.29954400658607483, + "eval_valid_reconstruction/end_span": 0.7079185843467712, + "eval_valid_reconstruction/fim": 0.17839278280735016, + "eval_valid_reconstruction/first_seq": 0.16794909536838531, + "eval_valid_reconstruction/last_seq": 0.3318643569946289, + "eval_valid_reconstruction/second_seq": 0.19536596536636353, + "eval_valid_runtime": 453.5959, + "eval_valid_samples_per_second": 0.423, + "eval_valid_steps_per_second": 0.423, + "step": 65750 + }, + { + "epoch": 0.24525711898420657, + "eval_train_loss": 2.1708014011383057, + "eval_train_loss/all": 2.009653091430664, + "eval_train_loss/end_span": 1.1845121383666992, + "eval_train_perplexity/batch": 7.460728645324707, + "eval_train_perplexity/end_span": 3.2690916061401367, + 
"eval_train_perplexity/fim": 2.1042609214782715, + "eval_train_perplexity/first_seq": 15.374860763549805, + "eval_train_perplexity/last_seq": 8.79084300994873, + "eval_train_perplexity/second_seq": 13.87546157836914, + "eval_train_perplexity/seq": 8.592523574829102, + "eval_train_reconstruction/all": 0.2885821759700775, + "eval_train_reconstruction/end_span": 0.7188193202018738, + "eval_train_reconstruction/fim": 0.15218417346477509, + "eval_train_reconstruction/first_seq": 0.1584470123052597, + "eval_train_reconstruction/last_seq": 0.3314693868160248, + "eval_train_reconstruction/second_seq": 0.19356903433799744, + "eval_train_runtime": 442.8512, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 65750 + }, + { + "epoch": 0.24529442044716993, + "grad_norm": 0.35135361552238464, + "learning_rate": 0.0006, + "loss": 2.3939, + "step": 65760 + }, + { + "epoch": 0.2453317219101333, + "grad_norm": 0.3834339678287506, + "learning_rate": 0.0006, + "loss": 2.36, + "step": 65770 + }, + { + "epoch": 0.24536902337309668, + "grad_norm": 0.26229119300842285, + "learning_rate": 0.0006, + "loss": 2.2168, + "step": 65780 + }, + { + "epoch": 0.24540632483606006, + "grad_norm": 0.4796030521392822, + "learning_rate": 0.0006, + "loss": 2.1048, + "step": 65790 + }, + { + "epoch": 0.24544362629902344, + "grad_norm": 0.3146594166755676, + "learning_rate": 0.0006, + "loss": 2.3213, + "step": 65800 + }, + { + "epoch": 0.24548092776198682, + "grad_norm": 0.49156010150909424, + "learning_rate": 0.0006, + "loss": 2.2031, + "step": 65810 + }, + { + "epoch": 0.2455182292249502, + "grad_norm": 0.33689507842063904, + "learning_rate": 0.0006, + "loss": 2.3351, + "step": 65820 + }, + { + "epoch": 0.24555553068791358, + "grad_norm": 0.35858526825904846, + "learning_rate": 0.0006, + "loss": 2.1992, + "step": 65830 + }, + { + "epoch": 0.24559283215087696, + "grad_norm": 0.48018863797187805, + "learning_rate": 0.0006, + "loss": 2.103, + "step": 65840 + }, + { + 
"epoch": 0.24563013361384034, + "grad_norm": 0.2715890407562256, + "learning_rate": 0.0006, + "loss": 2.3189, + "step": 65850 + }, + { + "epoch": 0.24566743507680372, + "grad_norm": 0.40420883893966675, + "learning_rate": 0.0006, + "loss": 2.0996, + "step": 65860 + }, + { + "epoch": 0.2457047365397671, + "grad_norm": 0.440645307302475, + "learning_rate": 0.0006, + "loss": 2.2603, + "step": 65870 + }, + { + "epoch": 0.24574203800273048, + "grad_norm": 0.35571905970573425, + "learning_rate": 0.0006, + "loss": 2.2506, + "step": 65880 + }, + { + "epoch": 0.24577933946569386, + "grad_norm": 0.23123055696487427, + "learning_rate": 0.0006, + "loss": 2.2015, + "step": 65890 + }, + { + "epoch": 0.2458166409286572, + "grad_norm": 0.38625669479370117, + "learning_rate": 0.0006, + "loss": 2.235, + "step": 65900 + }, + { + "epoch": 0.2458539423916206, + "grad_norm": 0.2500864267349243, + "learning_rate": 0.0006, + "loss": 2.2049, + "step": 65910 + }, + { + "epoch": 0.24589124385458397, + "grad_norm": 0.2599717080593109, + "learning_rate": 0.0006, + "loss": 2.0423, + "step": 65920 + }, + { + "epoch": 0.24592854531754735, + "grad_norm": 0.33777716755867004, + "learning_rate": 0.0006, + "loss": 2.2999, + "step": 65930 + }, + { + "epoch": 0.24596584678051073, + "grad_norm": 0.34295937418937683, + "learning_rate": 0.0006, + "loss": 2.2403, + "step": 65940 + }, + { + "epoch": 0.2460031482434741, + "grad_norm": 0.36794301867485046, + "learning_rate": 0.0006, + "loss": 2.1436, + "step": 65950 + }, + { + "epoch": 0.2460404497064375, + "grad_norm": 0.304818332195282, + "learning_rate": 0.0006, + "loss": 2.1945, + "step": 65960 + }, + { + "epoch": 0.24607775116940087, + "grad_norm": 0.3183918595314026, + "learning_rate": 0.0006, + "loss": 2.2566, + "step": 65970 + }, + { + "epoch": 0.24611505263236425, + "grad_norm": 0.21960867941379547, + "learning_rate": 0.0006, + "loss": 2.3086, + "step": 65980 + }, + { + "epoch": 0.24615235409532762, + "grad_norm": 0.3073136806488037, + 
"learning_rate": 0.0006, + "loss": 2.1164, + "step": 65990 + }, + { + "epoch": 0.246189655558291, + "grad_norm": 0.28985872864723206, + "learning_rate": 0.0006, + "loss": 2.2425, + "step": 66000 + }, + { + "epoch": 0.246189655558291, + "eval_valid_loss": 2.1767578125, + "eval_valid_loss/all": 2.0411975383758545, + "eval_valid_loss/end_span": 1.2651978731155396, + "eval_valid_perplexity/batch": 7.699824333190918, + "eval_valid_perplexity/end_span": 3.5437939167022705, + "eval_valid_perplexity/fim": 2.590550422668457, + "eval_valid_perplexity/first_seq": 14.606537818908691, + "eval_valid_perplexity/last_seq": 9.146397590637207, + "eval_valid_perplexity/second_seq": 13.443567276000977, + "eval_valid_perplexity/seq": 8.683643341064453, + "eval_valid_reconstruction/all": 0.298128604888916, + "eval_valid_reconstruction/end_span": 0.7100511789321899, + "eval_valid_reconstruction/fim": 0.1932712197303772, + "eval_valid_reconstruction/first_seq": 0.17651522159576416, + "eval_valid_reconstruction/last_seq": 0.3217562735080719, + "eval_valid_reconstruction/second_seq": 0.2047569304704666, + "eval_valid_runtime": 468.0715, + "eval_valid_samples_per_second": 0.41, + "eval_valid_steps_per_second": 0.41, + "step": 66000 + }, + { + "epoch": 0.246189655558291, + "eval_train_loss": 2.1744678020477295, + "eval_train_loss/all": 2.0125842094421387, + "eval_train_loss/end_span": 1.231375813484192, + "eval_train_perplexity/batch": 7.482629299163818, + "eval_train_perplexity/end_span": 3.4259397983551025, + "eval_train_perplexity/fim": 2.0214924812316895, + "eval_train_perplexity/first_seq": 15.80455207824707, + "eval_train_perplexity/last_seq": 8.4434814453125, + "eval_train_perplexity/second_seq": 14.299927711486816, + "eval_train_perplexity/seq": 8.613534927368164, + "eval_train_reconstruction/all": 0.2878928780555725, + "eval_train_reconstruction/end_span": 0.7198744416236877, + "eval_train_reconstruction/fim": 0.1433241069316864, + "eval_train_reconstruction/first_seq": 
0.1485610008239746, + "eval_train_reconstruction/last_seq": 0.3450974225997925, + "eval_train_reconstruction/second_seq": 0.18027471005916595, + "eval_train_runtime": 469.9694, + "eval_train_samples_per_second": 0.409, + "eval_train_steps_per_second": 0.409, + "step": 66000 + }, + { + "epoch": 0.24622695702125438, + "grad_norm": 0.2951451241970062, + "learning_rate": 0.0006, + "loss": 2.2756, + "step": 66010 + }, + { + "epoch": 0.24626425848421776, + "grad_norm": 0.28519028425216675, + "learning_rate": 0.0006, + "loss": 2.1124, + "step": 66020 + }, + { + "epoch": 0.24630155994718114, + "grad_norm": 0.4021660089492798, + "learning_rate": 0.0006, + "loss": 2.1249, + "step": 66030 + }, + { + "epoch": 0.2463388614101445, + "grad_norm": 0.3629074692726135, + "learning_rate": 0.0006, + "loss": 2.1217, + "step": 66040 + }, + { + "epoch": 0.24637616287310787, + "grad_norm": 0.26494696736335754, + "learning_rate": 0.0006, + "loss": 2.2165, + "step": 66050 + }, + { + "epoch": 0.24641346433607125, + "grad_norm": 0.44106030464172363, + "learning_rate": 0.0006, + "loss": 2.1027, + "step": 66060 + }, + { + "epoch": 0.24645076579903463, + "grad_norm": 0.22102347016334534, + "learning_rate": 0.0006, + "loss": 2.1108, + "step": 66070 + }, + { + "epoch": 0.246488067261998, + "grad_norm": 0.44970759749412537, + "learning_rate": 0.0006, + "loss": 2.1663, + "step": 66080 + }, + { + "epoch": 0.2465253687249614, + "grad_norm": 0.3320525288581848, + "learning_rate": 0.0006, + "loss": 2.2337, + "step": 66090 + }, + { + "epoch": 0.24656267018792477, + "grad_norm": 0.45963484048843384, + "learning_rate": 0.0006, + "loss": 2.0661, + "step": 66100 + }, + { + "epoch": 0.24659997165088815, + "grad_norm": 0.3351456820964813, + "learning_rate": 0.0006, + "loss": 2.2278, + "step": 66110 + }, + { + "epoch": 0.24663727311385153, + "grad_norm": 0.2630910873413086, + "learning_rate": 0.0006, + "loss": 2.1451, + "step": 66120 + }, + { + "epoch": 0.2466745745768149, + "grad_norm": 0.4890919029712677, + 
"learning_rate": 0.0006, + "loss": 2.0363, + "step": 66130 + }, + { + "epoch": 0.2467118760397783, + "grad_norm": 0.2062341570854187, + "learning_rate": 0.0006, + "loss": 2.1808, + "step": 66140 + }, + { + "epoch": 0.24674917750274167, + "grad_norm": 0.2777858078479767, + "learning_rate": 0.0006, + "loss": 2.1946, + "step": 66150 + }, + { + "epoch": 0.24678647896570505, + "grad_norm": 0.3774212896823883, + "learning_rate": 0.0006, + "loss": 2.3287, + "step": 66160 + }, + { + "epoch": 0.2468237804286684, + "grad_norm": 0.3391781151294708, + "learning_rate": 0.0006, + "loss": 2.3126, + "step": 66170 + }, + { + "epoch": 0.24686108189163178, + "grad_norm": 0.6168404817581177, + "learning_rate": 0.0006, + "loss": 2.2893, + "step": 66180 + }, + { + "epoch": 0.24689838335459516, + "grad_norm": 0.39924749732017517, + "learning_rate": 0.0006, + "loss": 1.9087, + "step": 66190 + }, + { + "epoch": 0.24693568481755854, + "grad_norm": 0.3207603693008423, + "learning_rate": 0.0006, + "loss": 2.1389, + "step": 66200 + }, + { + "epoch": 0.24697298628052192, + "grad_norm": 0.3990356922149658, + "learning_rate": 0.0006, + "loss": 2.3287, + "step": 66210 + }, + { + "epoch": 0.2470102877434853, + "grad_norm": 2.2306368350982666, + "learning_rate": 0.0006, + "loss": 2.3847, + "step": 66220 + }, + { + "epoch": 0.24704758920644868, + "grad_norm": 0.4786871373653412, + "learning_rate": 0.0006, + "loss": 2.264, + "step": 66230 + }, + { + "epoch": 0.24708489066941206, + "grad_norm": 0.4630492925643921, + "learning_rate": 0.0006, + "loss": 2.2322, + "step": 66240 + }, + { + "epoch": 0.24712219213237543, + "grad_norm": 0.3175380229949951, + "learning_rate": 0.0006, + "loss": 2.4827, + "step": 66250 + }, + { + "epoch": 0.24712219213237543, + "eval_valid_loss": 2.1779778003692627, + "eval_valid_loss/all": 2.042283058166504, + "eval_valid_loss/end_span": 1.1448991298675537, + "eval_valid_perplexity/batch": 7.708187580108643, + "eval_valid_perplexity/end_span": 3.1421244144439697, + 
"eval_valid_perplexity/fim": 2.523237943649292, + "eval_valid_perplexity/first_seq": 14.56517219543457, + "eval_valid_perplexity/last_seq": 8.805830955505371, + "eval_valid_perplexity/second_seq": 13.575944900512695, + "eval_valid_perplexity/seq": 8.690939903259277, + "eval_valid_reconstruction/all": 0.2979247570037842, + "eval_valid_reconstruction/end_span": 0.734731137752533, + "eval_valid_reconstruction/fim": 0.18750376999378204, + "eval_valid_reconstruction/first_seq": 0.17383117973804474, + "eval_valid_reconstruction/last_seq": 0.3329210877418518, + "eval_valid_reconstruction/second_seq": 0.20147426426410675, + "eval_valid_runtime": 486.6599, + "eval_valid_samples_per_second": 0.395, + "eval_valid_steps_per_second": 0.395, + "step": 66250 + }, + { + "epoch": 0.24712219213237543, + "eval_train_loss": 2.1766202449798584, + "eval_train_loss/all": 2.0148661136627197, + "eval_train_loss/end_span": 1.1169400215148926, + "eval_train_perplexity/batch": 7.499723434448242, + "eval_train_perplexity/end_span": 3.055490255355835, + "eval_train_perplexity/fim": 2.0747456550598145, + "eval_train_perplexity/first_seq": 15.635457992553711, + "eval_train_perplexity/last_seq": 9.002311706542969, + "eval_train_perplexity/second_seq": 14.06777572631836, + "eval_train_perplexity/seq": 8.635523796081543, + "eval_train_reconstruction/all": 0.28724128007888794, + "eval_train_reconstruction/end_span": 0.7435125112533569, + "eval_train_reconstruction/fim": 0.14811451733112335, + "eval_train_reconstruction/first_seq": 0.14674198627471924, + "eval_train_reconstruction/last_seq": 0.3216765224933624, + "eval_train_reconstruction/second_seq": 0.1901688128709793, + "eval_train_runtime": 465.6077, + "eval_train_samples_per_second": 0.412, + "eval_train_steps_per_second": 0.412, + "step": 66250 + }, + { + "epoch": 0.24715949359533881, + "grad_norm": 0.24864540994167328, + "learning_rate": 0.0006, + "loss": 2.2382, + "step": 66260 + }, + { + "epoch": 0.2471967950583022, + "grad_norm": 
0.2818039655685425, + "learning_rate": 0.0006, + "loss": 2.3094, + "step": 66270 + }, + { + "epoch": 0.24723409652126557, + "grad_norm": 0.31651467084884644, + "learning_rate": 0.0006, + "loss": 2.2182, + "step": 66280 + }, + { + "epoch": 0.24727139798422895, + "grad_norm": 0.37251928448677063, + "learning_rate": 0.0006, + "loss": 2.2046, + "step": 66290 + }, + { + "epoch": 0.24730869944719233, + "grad_norm": 0.36825644969940186, + "learning_rate": 0.0006, + "loss": 2.2927, + "step": 66300 + }, + { + "epoch": 0.24734600091015568, + "grad_norm": 0.32348504662513733, + "learning_rate": 0.0006, + "loss": 2.3155, + "step": 66310 + }, + { + "epoch": 0.24738330237311906, + "grad_norm": 0.35784292221069336, + "learning_rate": 0.0006, + "loss": 2.2318, + "step": 66320 + }, + { + "epoch": 0.24742060383608244, + "grad_norm": 0.3036428689956665, + "learning_rate": 0.0006, + "loss": 2.2029, + "step": 66330 + }, + { + "epoch": 0.24745790529904582, + "grad_norm": 0.38758984208106995, + "learning_rate": 0.0006, + "loss": 2.2463, + "step": 66340 + }, + { + "epoch": 0.2474952067620092, + "grad_norm": 0.25517046451568604, + "learning_rate": 0.0006, + "loss": 2.2764, + "step": 66350 + }, + { + "epoch": 0.24753250822497258, + "grad_norm": 0.5234713554382324, + "learning_rate": 0.0006, + "loss": 2.1309, + "step": 66360 + }, + { + "epoch": 0.24756980968793596, + "grad_norm": 0.36090293526649475, + "learning_rate": 0.0006, + "loss": 2.194, + "step": 66370 + }, + { + "epoch": 0.24760711115089934, + "grad_norm": 0.3759387135505676, + "learning_rate": 0.0006, + "loss": 2.2663, + "step": 66380 + }, + { + "epoch": 0.24764441261386272, + "grad_norm": 0.24252866208553314, + "learning_rate": 0.0006, + "loss": 1.9522, + "step": 66390 + }, + { + "epoch": 0.2476817140768261, + "grad_norm": 0.2762337327003479, + "learning_rate": 0.0006, + "loss": 2.1268, + "step": 66400 + }, + { + "epoch": 0.24771901553978948, + "grad_norm": 0.39094552397727966, + "learning_rate": 0.0006, + "loss": 2.2693, + "step": 
66410 + }, + { + "epoch": 0.24775631700275286, + "grad_norm": 0.3662552237510681, + "learning_rate": 0.0006, + "loss": 2.0769, + "step": 66420 + }, + { + "epoch": 0.24779361846571624, + "grad_norm": 2.060542345046997, + "learning_rate": 0.0006, + "loss": 2.2273, + "step": 66430 + }, + { + "epoch": 0.24783091992867962, + "grad_norm": 0.4331837594509125, + "learning_rate": 0.0006, + "loss": 2.2686, + "step": 66440 + }, + { + "epoch": 0.24786822139164297, + "grad_norm": 0.3488579988479614, + "learning_rate": 0.0006, + "loss": 2.2375, + "step": 66450 + }, + { + "epoch": 0.24790552285460635, + "grad_norm": 0.6757824420928955, + "learning_rate": 0.0006, + "loss": 2.2135, + "step": 66460 + }, + { + "epoch": 0.24794282431756973, + "grad_norm": 0.35093000531196594, + "learning_rate": 0.0006, + "loss": 2.0961, + "step": 66470 + }, + { + "epoch": 0.2479801257805331, + "grad_norm": 0.27947965264320374, + "learning_rate": 0.0006, + "loss": 2.1266, + "step": 66480 + }, + { + "epoch": 0.24801742724349649, + "grad_norm": 0.4325806200504303, + "learning_rate": 0.0006, + "loss": 2.1206, + "step": 66490 + }, + { + "epoch": 0.24805472870645986, + "grad_norm": 0.2412937879562378, + "learning_rate": 0.0006, + "loss": 2.112, + "step": 66500 + }, + { + "epoch": 0.24805472870645986, + "eval_valid_loss": 2.1802821159362793, + "eval_valid_loss/all": 2.0449020862579346, + "eval_valid_loss/end_span": 1.2874313592910767, + "eval_valid_perplexity/batch": 7.7284016609191895, + "eval_valid_perplexity/end_span": 3.623467206954956, + "eval_valid_perplexity/fim": 2.3507299423217773, + "eval_valid_perplexity/first_seq": 15.127287864685059, + "eval_valid_perplexity/last_seq": 8.832467079162598, + "eval_valid_perplexity/second_seq": 13.885436058044434, + "eval_valid_perplexity/seq": 8.721144676208496, + "eval_valid_reconstruction/all": 0.2972956895828247, + "eval_valid_reconstruction/end_span": 0.7005248069763184, + "eval_valid_reconstruction/fim": 0.1729786992073059, + 
"eval_valid_reconstruction/first_seq": 0.1621820479631424, + "eval_valid_reconstruction/last_seq": 0.3320794701576233, + "eval_valid_reconstruction/second_seq": 0.19568242132663727, + "eval_valid_runtime": 448.0527, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 66500 + }, + { + "epoch": 0.24805472870645986, + "eval_train_loss": 2.178375244140625, + "eval_train_loss/all": 2.0169169902801514, + "eval_train_loss/end_span": 1.2408126592636108, + "eval_train_perplexity/batch": 7.515120029449463, + "eval_train_perplexity/end_span": 3.458422899246216, + "eval_train_perplexity/fim": 2.165475368499756, + "eval_train_perplexity/first_seq": 15.642754554748535, + "eval_train_perplexity/last_seq": 9.048940658569336, + "eval_train_perplexity/second_seq": 14.44253158569336, + "eval_train_perplexity/seq": 8.660225868225098, + "eval_train_reconstruction/all": 0.2868153154850006, + "eval_train_reconstruction/end_span": 0.7109177708625793, + "eval_train_reconstruction/fim": 0.15611709654331207, + "eval_train_reconstruction/first_seq": 0.15097202360630035, + "eval_train_reconstruction/last_seq": 0.3258872330188751, + "eval_train_reconstruction/second_seq": 0.17637184262275696, + "eval_train_runtime": 445.4417, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 66500 + }, + { + "epoch": 0.24809203016942324, + "grad_norm": 0.4160011410713196, + "learning_rate": 0.0006, + "loss": 2.1474, + "step": 66510 + }, + { + "epoch": 0.24812933163238662, + "grad_norm": 0.23557709157466888, + "learning_rate": 0.0006, + "loss": 2.4499, + "step": 66520 + }, + { + "epoch": 0.24816663309535, + "grad_norm": 0.3998684585094452, + "learning_rate": 0.0006, + "loss": 2.17, + "step": 66530 + }, + { + "epoch": 0.24820393455831338, + "grad_norm": 0.22832390666007996, + "learning_rate": 0.0006, + "loss": 2.338, + "step": 66540 + }, + { + "epoch": 0.24824123602127676, + "grad_norm": 9.442256927490234, + "learning_rate": 0.0006, 
+ "loss": 2.1836, + "step": 66550 + }, + { + "epoch": 0.24827853748424014, + "grad_norm": 0.24434305727481842, + "learning_rate": 0.0006, + "loss": 2.2899, + "step": 66560 + }, + { + "epoch": 0.24831583894720352, + "grad_norm": 0.4896937906742096, + "learning_rate": 0.0006, + "loss": 2.1478, + "step": 66570 + }, + { + "epoch": 0.2483531404101669, + "grad_norm": 0.4485887587070465, + "learning_rate": 0.0006, + "loss": 2.2564, + "step": 66580 + }, + { + "epoch": 0.24839044187313025, + "grad_norm": 0.31494390964508057, + "learning_rate": 0.0006, + "loss": 2.0151, + "step": 66590 + }, + { + "epoch": 0.24842774333609363, + "grad_norm": 0.4846585690975189, + "learning_rate": 0.0006, + "loss": 2.1989, + "step": 66600 + }, + { + "epoch": 0.248465044799057, + "grad_norm": 0.48502206802368164, + "learning_rate": 0.0006, + "loss": 2.0878, + "step": 66610 + }, + { + "epoch": 0.2485023462620204, + "grad_norm": 0.38118302822113037, + "learning_rate": 0.0006, + "loss": 2.1851, + "step": 66620 + }, + { + "epoch": 0.24853964772498377, + "grad_norm": 0.30213868618011475, + "learning_rate": 0.0006, + "loss": 2.2403, + "step": 66630 + }, + { + "epoch": 0.24857694918794715, + "grad_norm": 0.28792616724967957, + "learning_rate": 0.0006, + "loss": 2.1204, + "step": 66640 + }, + { + "epoch": 0.24861425065091053, + "grad_norm": 0.41683429479599, + "learning_rate": 0.0006, + "loss": 2.2448, + "step": 66650 + }, + { + "epoch": 0.2486515521138739, + "grad_norm": 0.353012353181839, + "learning_rate": 0.0006, + "loss": 2.2934, + "step": 66660 + }, + { + "epoch": 0.2486888535768373, + "grad_norm": 0.40433964133262634, + "learning_rate": 0.0006, + "loss": 2.3885, + "step": 66670 + }, + { + "epoch": 0.24872615503980067, + "grad_norm": 0.7524428367614746, + "learning_rate": 0.0006, + "loss": 2.3146, + "step": 66680 + }, + { + "epoch": 0.24876345650276405, + "grad_norm": 0.44668787717819214, + "learning_rate": 0.0006, + "loss": 2.2034, + "step": 66690 + }, + { + "epoch": 0.24880075796572743, + 
"grad_norm": 0.39175331592559814, + "learning_rate": 0.0006, + "loss": 2.0389, + "step": 66700 + }, + { + "epoch": 0.2488380594286908, + "grad_norm": 0.2619231641292572, + "learning_rate": 0.0006, + "loss": 2.1324, + "step": 66710 + }, + { + "epoch": 0.24887536089165416, + "grad_norm": 0.4047219753265381, + "learning_rate": 0.0006, + "loss": 2.3325, + "step": 66720 + }, + { + "epoch": 0.24891266235461754, + "grad_norm": 0.3489759564399719, + "learning_rate": 0.0006, + "loss": 2.1451, + "step": 66730 + }, + { + "epoch": 0.24894996381758092, + "grad_norm": 0.29798850417137146, + "learning_rate": 0.0006, + "loss": 2.3539, + "step": 66740 + }, + { + "epoch": 0.2489872652805443, + "grad_norm": 0.30579763650894165, + "learning_rate": 0.0006, + "loss": 2.2927, + "step": 66750 + }, + { + "epoch": 0.2489872652805443, + "eval_valid_loss": 2.1767425537109375, + "eval_valid_loss/all": 2.0411903858184814, + "eval_valid_loss/end_span": 1.23957359790802, + "eval_valid_perplexity/batch": 7.699769496917725, + "eval_valid_perplexity/end_span": 3.4541401863098145, + "eval_valid_perplexity/fim": 2.1074936389923096, + "eval_valid_perplexity/first_seq": 14.855716705322266, + "eval_valid_perplexity/last_seq": 8.859243392944336, + "eval_valid_perplexity/second_seq": 13.694289207458496, + "eval_valid_perplexity/seq": 8.683403015136719, + "eval_valid_reconstruction/all": 0.29841166734695435, + "eval_valid_reconstruction/end_span": 0.7044258117675781, + "eval_valid_reconstruction/fim": 0.15050248801708221, + "eval_valid_reconstruction/first_seq": 0.17125701904296875, + "eval_valid_reconstruction/last_seq": 0.3313591778278351, + "eval_valid_reconstruction/second_seq": 0.19669750332832336, + "eval_valid_runtime": 444.7849, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 66750 + }, + { + "epoch": 0.2489872652805443, + "eval_train_loss": 2.1761457920074463, + "eval_train_loss/all": 2.014744281768799, + "eval_train_loss/end_span": 1.208139419555664, + 
"eval_train_perplexity/batch": 7.498809337615967, + "eval_train_perplexity/end_span": 3.3472509384155273, + "eval_train_perplexity/fim": 2.0515763759613037, + "eval_train_perplexity/first_seq": 15.654458999633789, + "eval_train_perplexity/last_seq": 8.644966125488281, + "eval_train_perplexity/second_seq": 13.852826118469238, + "eval_train_perplexity/seq": 8.637588500976562, + "eval_train_reconstruction/all": 0.2873416543006897, + "eval_train_reconstruction/end_span": 0.7141163349151611, + "eval_train_reconstruction/fim": 0.1464848518371582, + "eval_train_reconstruction/first_seq": 0.14867648482322693, + "eval_train_reconstruction/last_seq": 0.3350861966609955, + "eval_train_reconstruction/second_seq": 0.19543658196926117, + "eval_train_runtime": 492.8197, + "eval_train_samples_per_second": 0.39, + "eval_train_steps_per_second": 0.39, + "step": 66750 + }, + { + "epoch": 0.24902456674350767, + "grad_norm": 0.2515851855278015, + "learning_rate": 0.0006, + "loss": 2.1937, + "step": 66760 + }, + { + "epoch": 0.24906186820647105, + "grad_norm": 0.34395599365234375, + "learning_rate": 0.0006, + "loss": 2.1992, + "step": 66770 + }, + { + "epoch": 0.24909916966943443, + "grad_norm": 0.33249619603157043, + "learning_rate": 0.0006, + "loss": 2.3064, + "step": 66780 + }, + { + "epoch": 0.2491364711323978, + "grad_norm": 0.33166342973709106, + "learning_rate": 0.0006, + "loss": 2.3246, + "step": 66790 + }, + { + "epoch": 0.2491737725953612, + "grad_norm": 0.3354625105857849, + "learning_rate": 0.0006, + "loss": 2.2098, + "step": 66800 + }, + { + "epoch": 0.24921107405832457, + "grad_norm": 0.30778446793556213, + "learning_rate": 0.0006, + "loss": 2.249, + "step": 66810 + }, + { + "epoch": 0.24924837552128795, + "grad_norm": 0.23457497358322144, + "learning_rate": 0.0006, + "loss": 2.1091, + "step": 66820 + }, + { + "epoch": 0.24928567698425133, + "grad_norm": 0.27151522040367126, + "learning_rate": 0.0006, + "loss": 2.3416, + "step": 66830 + }, + { + "epoch": 
0.2493229784472147, + "grad_norm": 0.6088809967041016, + "learning_rate": 0.0006, + "loss": 2.2256, + "step": 66840 + }, + { + "epoch": 0.2493602799101781, + "grad_norm": 0.3811464011669159, + "learning_rate": 0.0006, + "loss": 2.2088, + "step": 66850 + }, + { + "epoch": 0.24939758137314144, + "grad_norm": 0.3012966811656952, + "learning_rate": 0.0006, + "loss": 2.2274, + "step": 66860 + }, + { + "epoch": 0.24943488283610482, + "grad_norm": 0.37719646096229553, + "learning_rate": 0.0006, + "loss": 2.2685, + "step": 66870 + }, + { + "epoch": 0.2494721842990682, + "grad_norm": 0.5644843578338623, + "learning_rate": 0.0006, + "loss": 2.195, + "step": 66880 + }, + { + "epoch": 0.24950948576203158, + "grad_norm": 0.45791131258010864, + "learning_rate": 0.0006, + "loss": 2.1888, + "step": 66890 + }, + { + "epoch": 0.24954678722499496, + "grad_norm": 0.2534179091453552, + "learning_rate": 0.0006, + "loss": 2.093, + "step": 66900 + }, + { + "epoch": 0.24958408868795834, + "grad_norm": 0.35072144865989685, + "learning_rate": 0.0006, + "loss": 2.2164, + "step": 66910 + }, + { + "epoch": 0.24962139015092172, + "grad_norm": 0.3633664846420288, + "learning_rate": 0.0006, + "loss": 2.186, + "step": 66920 + }, + { + "epoch": 0.2496586916138851, + "grad_norm": 5.323957443237305, + "learning_rate": 0.0006, + "loss": 2.2718, + "step": 66930 + }, + { + "epoch": 0.24969599307684848, + "grad_norm": 0.3656434714794159, + "learning_rate": 0.0006, + "loss": 2.2776, + "step": 66940 + }, + { + "epoch": 0.24973329453981186, + "grad_norm": 0.3224060535430908, + "learning_rate": 0.0006, + "loss": 2.251, + "step": 66950 + }, + { + "epoch": 0.24977059600277524, + "grad_norm": 0.28819525241851807, + "learning_rate": 0.0006, + "loss": 2.2619, + "step": 66960 + }, + { + "epoch": 0.24980789746573862, + "grad_norm": 0.4952159821987152, + "learning_rate": 0.0006, + "loss": 2.2017, + "step": 66970 + }, + { + "epoch": 0.249845198928702, + "grad_norm": 0.3114677965641022, + "learning_rate": 0.0006, + 
"loss": 2.0725, + "step": 66980 + }, + { + "epoch": 0.24988250039166537, + "grad_norm": 0.2889624834060669, + "learning_rate": 0.0006, + "loss": 2.3053, + "step": 66990 + }, + { + "epoch": 0.24991980185462873, + "grad_norm": 0.43218398094177246, + "learning_rate": 0.0006, + "loss": 2.1446, + "step": 67000 + }, + { + "epoch": 0.24991980185462873, + "eval_valid_loss": 2.17387318611145, + "eval_valid_loss/all": 2.0387604236602783, + "eval_valid_loss/end_span": 1.1953686475753784, + "eval_valid_perplexity/batch": 7.681081771850586, + "eval_valid_perplexity/end_span": 3.3047759532928467, + "eval_valid_perplexity/fim": 2.3575665950775146, + "eval_valid_perplexity/first_seq": 14.590422630310059, + "eval_valid_perplexity/last_seq": 8.686493873596191, + "eval_valid_perplexity/second_seq": 13.969922065734863, + "eval_valid_perplexity/seq": 8.666570663452148, + "eval_valid_reconstruction/all": 0.29900631308555603, + "eval_valid_reconstruction/end_span": 0.7225713729858398, + "eval_valid_reconstruction/fim": 0.17471787333488464, + "eval_valid_reconstruction/first_seq": 0.17455701529979706, + "eval_valid_reconstruction/last_seq": 0.3369501531124115, + "eval_valid_reconstruction/second_seq": 0.18935570120811462, + "eval_valid_runtime": 458.9517, + "eval_valid_samples_per_second": 0.418, + "eval_valid_steps_per_second": 0.418, + "step": 67000 + }, + { + "epoch": 0.24991980185462873, + "eval_train_loss": 2.1733314990997314, + "eval_train_loss/all": 2.0122456550598145, + "eval_train_loss/end_span": 1.1730704307556152, + "eval_train_perplexity/batch": 7.480096340179443, + "eval_train_perplexity/end_span": 3.231900691986084, + "eval_train_perplexity/fim": 2.1912147998809814, + "eval_train_perplexity/first_seq": 15.844351768493652, + "eval_train_perplexity/last_seq": 9.0728120803833, + "eval_train_perplexity/second_seq": 14.069241523742676, + "eval_train_perplexity/seq": 8.620060920715332, + "eval_train_reconstruction/all": 0.2880854606628418, + "eval_train_reconstruction/end_span": 
0.7316877841949463, + "eval_train_reconstruction/fim": 0.15950433909893036, + "eval_train_reconstruction/first_seq": 0.14523230493068695, + "eval_train_reconstruction/last_seq": 0.3228289783000946, + "eval_train_reconstruction/second_seq": 0.18768607079982758, + "eval_train_runtime": 442.5164, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 67000 + }, + { + "epoch": 0.2499571033175921, + "grad_norm": 0.176578089594841, + "learning_rate": 0.0006, + "loss": 2.3741, + "step": 67010 + }, + { + "epoch": 0.24999440478055548, + "grad_norm": 0.26939505338668823, + "learning_rate": 0.0006, + "loss": 2.2469, + "step": 67020 + }, + { + "epoch": 0.2500317062435189, + "grad_norm": 0.4745226502418518, + "learning_rate": 0.0006, + "loss": 2.1622, + "step": 67030 + }, + { + "epoch": 0.25006900770648227, + "grad_norm": 0.4060978889465332, + "learning_rate": 0.0006, + "loss": 2.3099, + "step": 67040 + }, + { + "epoch": 0.25010630916944565, + "grad_norm": 0.24327772855758667, + "learning_rate": 0.0006, + "loss": 2.3862, + "step": 67050 + }, + { + "epoch": 0.25014361063240903, + "grad_norm": 0.20203335583209991, + "learning_rate": 0.0006, + "loss": 2.2534, + "step": 67060 + }, + { + "epoch": 0.25018091209537235, + "grad_norm": 0.297366738319397, + "learning_rate": 0.0006, + "loss": 2.1282, + "step": 67070 + }, + { + "epoch": 0.25021821355833573, + "grad_norm": 0.28111934661865234, + "learning_rate": 0.0006, + "loss": 2.2855, + "step": 67080 + }, + { + "epoch": 0.2502555150212991, + "grad_norm": 0.24400076270103455, + "learning_rate": 0.0006, + "loss": 2.3619, + "step": 67090 + }, + { + "epoch": 0.2502928164842625, + "grad_norm": 0.9605894684791565, + "learning_rate": 0.0006, + "loss": 2.2821, + "step": 67100 + }, + { + "epoch": 0.25033011794722587, + "grad_norm": 0.4245823919773102, + "learning_rate": 0.0006, + "loss": 2.1548, + "step": 67110 + }, + { + "epoch": 0.25036741941018925, + "grad_norm": 0.3725559413433075, + "learning_rate": 
0.0006, + "loss": 2.3512, + "step": 67120 + }, + { + "epoch": 0.25040472087315263, + "grad_norm": 0.3052908182144165, + "learning_rate": 0.0006, + "loss": 2.2032, + "step": 67130 + }, + { + "epoch": 0.250442022336116, + "grad_norm": 0.250712126493454, + "learning_rate": 0.0006, + "loss": 2.0806, + "step": 67140 + }, + { + "epoch": 0.2504793237990794, + "grad_norm": 0.30366218090057373, + "learning_rate": 0.0006, + "loss": 2.2957, + "step": 67150 + }, + { + "epoch": 0.25051662526204277, + "grad_norm": 0.2234264761209488, + "learning_rate": 0.0006, + "loss": 2.24, + "step": 67160 + }, + { + "epoch": 0.25055392672500615, + "grad_norm": 0.37824052572250366, + "learning_rate": 0.0006, + "loss": 2.3711, + "step": 67170 + }, + { + "epoch": 0.25059122818796953, + "grad_norm": 0.36166027188301086, + "learning_rate": 0.0006, + "loss": 2.2058, + "step": 67180 + }, + { + "epoch": 0.2506285296509329, + "grad_norm": 0.6756989359855652, + "learning_rate": 0.0006, + "loss": 2.1736, + "step": 67190 + }, + { + "epoch": 0.2506658311138963, + "grad_norm": 0.35005196928977966, + "learning_rate": 0.0006, + "loss": 2.1952, + "step": 67200 + }, + { + "epoch": 0.25070313257685967, + "grad_norm": 0.3953125476837158, + "learning_rate": 0.0006, + "loss": 2.1862, + "step": 67210 + }, + { + "epoch": 0.25074043403982305, + "grad_norm": 0.3966676592826843, + "learning_rate": 0.0006, + "loss": 2.1331, + "step": 67220 + }, + { + "epoch": 0.2507777355027864, + "grad_norm": 0.2982276976108551, + "learning_rate": 0.0006, + "loss": 2.1186, + "step": 67230 + }, + { + "epoch": 0.2508150369657498, + "grad_norm": 0.3288394808769226, + "learning_rate": 0.0006, + "loss": 2.2441, + "step": 67240 + }, + { + "epoch": 0.2508523384287132, + "grad_norm": 0.3489663898944855, + "learning_rate": 0.0006, + "loss": 2.2077, + "step": 67250 + }, + { + "epoch": 0.2508523384287132, + "eval_valid_loss": 2.1809186935424805, + "eval_valid_loss/all": 2.0449512004852295, + "eval_valid_loss/end_span": 1.310641884803772, + 
"eval_valid_perplexity/batch": 7.728781223297119, + "eval_valid_perplexity/end_span": 3.7085533142089844, + "eval_valid_perplexity/fim": 2.229818105697632, + "eval_valid_perplexity/first_seq": 14.811187744140625, + "eval_valid_perplexity/last_seq": 8.99716854095459, + "eval_valid_perplexity/second_seq": 13.769540786743164, + "eval_valid_perplexity/seq": 8.717353820800781, + "eval_valid_reconstruction/all": 0.29728150367736816, + "eval_valid_reconstruction/end_span": 0.6984121203422546, + "eval_valid_reconstruction/fim": 0.1630524843931198, + "eval_valid_reconstruction/first_seq": 0.16814447939395905, + "eval_valid_reconstruction/last_seq": 0.32684507966041565, + "eval_valid_reconstruction/second_seq": 0.19414947926998138, + "eval_valid_runtime": 519.6639, + "eval_valid_samples_per_second": 0.369, + "eval_valid_steps_per_second": 0.369, + "step": 67250 + }, + { + "epoch": 0.2508523384287132, + "eval_train_loss": 2.179054021835327, + "eval_train_loss/all": 2.016814708709717, + "eval_train_loss/end_span": 1.2826682329177856, + "eval_train_perplexity/batch": 7.5143513679504395, + "eval_train_perplexity/end_span": 3.6062490940093994, + "eval_train_perplexity/fim": 2.262766122817993, + "eval_train_perplexity/first_seq": 15.372115135192871, + "eval_train_perplexity/last_seq": 8.092649459838867, + "eval_train_perplexity/second_seq": 14.356042861938477, + "eval_train_perplexity/seq": 8.652268409729004, + "eval_train_reconstruction/all": 0.2868765890598297, + "eval_train_reconstruction/end_span": 0.7066983580589294, + "eval_train_reconstruction/fim": 0.16537566483020782, + "eval_train_reconstruction/first_seq": 0.15367700159549713, + "eval_train_reconstruction/last_seq": 0.3605671525001526, + "eval_train_reconstruction/second_seq": 0.1806081086397171, + "eval_train_runtime": 468.0448, + "eval_train_samples_per_second": 0.41, + "eval_train_steps_per_second": 0.41, + "step": 67250 + }, + { + "epoch": 0.25088963989167656, + "grad_norm": 0.2880297899246216, + "learning_rate": 
0.0006, + "loss": 2.2314, + "step": 67260 + }, + { + "epoch": 0.25092694135463994, + "grad_norm": 0.4070432186126709, + "learning_rate": 0.0006, + "loss": 2.2746, + "step": 67270 + }, + { + "epoch": 0.2509642428176033, + "grad_norm": 0.27151060104370117, + "learning_rate": 0.0006, + "loss": 2.2749, + "step": 67280 + }, + { + "epoch": 0.2510015442805667, + "grad_norm": 0.36802300810813904, + "learning_rate": 0.0006, + "loss": 2.0649, + "step": 67290 + }, + { + "epoch": 0.2510388457435301, + "grad_norm": 0.3261270523071289, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 67300 + }, + { + "epoch": 0.25107614720649346, + "grad_norm": 0.27832913398742676, + "learning_rate": 0.0006, + "loss": 2.2533, + "step": 67310 + }, + { + "epoch": 0.25111344866945684, + "grad_norm": 0.39656007289886475, + "learning_rate": 0.0006, + "loss": 1.9978, + "step": 67320 + }, + { + "epoch": 0.2511507501324202, + "grad_norm": 0.25031915307044983, + "learning_rate": 0.0006, + "loss": 2.2525, + "step": 67330 + }, + { + "epoch": 0.25118805159538354, + "grad_norm": 0.38694751262664795, + "learning_rate": 0.0006, + "loss": 2.1018, + "step": 67340 + }, + { + "epoch": 0.2512253530583469, + "grad_norm": 0.46146005392074585, + "learning_rate": 0.0006, + "loss": 2.2656, + "step": 67350 + }, + { + "epoch": 0.2512626545213103, + "grad_norm": 0.4651353061199188, + "learning_rate": 0.0006, + "loss": 2.2382, + "step": 67360 + }, + { + "epoch": 0.2512999559842737, + "grad_norm": 0.42012959718704224, + "learning_rate": 0.0006, + "loss": 2.2635, + "step": 67370 + }, + { + "epoch": 0.25133725744723706, + "grad_norm": 0.31031349301338196, + "learning_rate": 0.0006, + "loss": 2.1039, + "step": 67380 + }, + { + "epoch": 0.25137455891020044, + "grad_norm": 0.35829126834869385, + "learning_rate": 0.0006, + "loss": 2.2366, + "step": 67390 + }, + { + "epoch": 0.2514118603731638, + "grad_norm": 0.29896241426467896, + "learning_rate": 0.0006, + "loss": 2.1715, + "step": 67400 + }, + { + "epoch": 
0.2514491618361272, + "grad_norm": 0.3610314428806305, + "learning_rate": 0.0006, + "loss": 2.062, + "step": 67410 + }, + { + "epoch": 0.2514864632990906, + "grad_norm": 0.4300301969051361, + "learning_rate": 0.0006, + "loss": 2.0762, + "step": 67420 + }, + { + "epoch": 0.25152376476205396, + "grad_norm": 0.26338472962379456, + "learning_rate": 0.0006, + "loss": 2.2194, + "step": 67430 + }, + { + "epoch": 0.25156106622501734, + "grad_norm": 0.44304168224334717, + "learning_rate": 0.0006, + "loss": 2.2277, + "step": 67440 + }, + { + "epoch": 0.2515983676879807, + "grad_norm": 0.4460444748401642, + "learning_rate": 0.0006, + "loss": 2.2104, + "step": 67450 + }, + { + "epoch": 0.2516356691509441, + "grad_norm": 0.3809276521205902, + "learning_rate": 0.0006, + "loss": 2.1855, + "step": 67460 + }, + { + "epoch": 0.2516729706139075, + "grad_norm": 0.2947288453578949, + "learning_rate": 0.0006, + "loss": 2.2245, + "step": 67470 + }, + { + "epoch": 0.25171027207687086, + "grad_norm": 0.288993239402771, + "learning_rate": 0.0006, + "loss": 2.4222, + "step": 67480 + }, + { + "epoch": 0.25174757353983424, + "grad_norm": 0.3073076605796814, + "learning_rate": 0.0006, + "loss": 2.2375, + "step": 67490 + }, + { + "epoch": 0.2517848750027976, + "grad_norm": 0.21451640129089355, + "learning_rate": 0.0006, + "loss": 2.3754, + "step": 67500 + }, + { + "epoch": 0.2517848750027976, + "eval_valid_loss": 2.1780526638031006, + "eval_valid_loss/all": 2.0423882007598877, + "eval_valid_loss/end_span": 1.2002954483032227, + "eval_valid_perplexity/batch": 7.70899772644043, + "eval_valid_perplexity/end_span": 3.3210980892181396, + "eval_valid_perplexity/fim": 2.3530986309051514, + "eval_valid_perplexity/first_seq": 15.02242660522461, + "eval_valid_perplexity/last_seq": 8.903504371643066, + "eval_valid_perplexity/second_seq": 13.68478775024414, + "eval_valid_perplexity/seq": 8.691779136657715, + "eval_valid_reconstruction/all": 0.2979224622249603, + "eval_valid_reconstruction/end_span": 
0.7166708111763, + "eval_valid_reconstruction/fim": 0.17379291355609894, + "eval_valid_reconstruction/first_seq": 0.16121740639209747, + "eval_valid_reconstruction/last_seq": 0.3318882882595062, + "eval_valid_reconstruction/second_seq": 0.19785557687282562, + "eval_valid_runtime": 446.1219, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 67500 + }, + { + "epoch": 0.2517848750027976, + "eval_train_loss": 2.1763339042663574, + "eval_train_loss/all": 2.014596939086914, + "eval_train_loss/end_span": 1.167262315750122, + "eval_train_perplexity/batch": 7.49770450592041, + "eval_train_perplexity/end_span": 3.213183879852295, + "eval_train_perplexity/fim": 2.1743149757385254, + "eval_train_perplexity/first_seq": 15.462995529174805, + "eval_train_perplexity/last_seq": 8.686202049255371, + "eval_train_perplexity/second_seq": 14.06439208984375, + "eval_train_perplexity/seq": 8.63414192199707, + "eval_train_reconstruction/all": 0.2872885465621948, + "eval_train_reconstruction/end_span": 0.7271923422813416, + "eval_train_reconstruction/fim": 0.15845060348510742, + "eval_train_reconstruction/first_seq": 0.15180079638957977, + "eval_train_reconstruction/last_seq": 0.3341498076915741, + "eval_train_reconstruction/second_seq": 0.18888439238071442, + "eval_train_runtime": 441.3608, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 67500 + }, + { + "epoch": 0.251822176465761, + "grad_norm": 0.4035382866859436, + "learning_rate": 0.0006, + "loss": 2.0907, + "step": 67510 + }, + { + "epoch": 0.2518594779287244, + "grad_norm": 0.2991175651550293, + "learning_rate": 0.0006, + "loss": 2.3255, + "step": 67520 + }, + { + "epoch": 0.25189677939168775, + "grad_norm": 0.23384113609790802, + "learning_rate": 0.0006, + "loss": 2.2129, + "step": 67530 + }, + { + "epoch": 0.25193408085465113, + "grad_norm": 0.34351465106010437, + "learning_rate": 0.0006, + "loss": 2.2286, + "step": 67540 + }, + { + "epoch": 
0.2519713823176145, + "grad_norm": 0.35520249605178833, + "learning_rate": 0.0006, + "loss": 2.1978, + "step": 67550 + }, + { + "epoch": 0.2520086837805779, + "grad_norm": 0.5199990272521973, + "learning_rate": 0.0006, + "loss": 2.2107, + "step": 67560 + }, + { + "epoch": 0.25204598524354127, + "grad_norm": 0.27840903401374817, + "learning_rate": 0.0006, + "loss": 2.0072, + "step": 67570 + }, + { + "epoch": 0.25208328670650465, + "grad_norm": 0.30849629640579224, + "learning_rate": 0.0006, + "loss": 2.231, + "step": 67580 + }, + { + "epoch": 0.25212058816946803, + "grad_norm": 0.24431723356246948, + "learning_rate": 0.0006, + "loss": 2.1142, + "step": 67590 + }, + { + "epoch": 0.2521578896324314, + "grad_norm": 0.2916739881038666, + "learning_rate": 0.0006, + "loss": 2.2222, + "step": 67600 + }, + { + "epoch": 0.2521951910953948, + "grad_norm": 0.37549033761024475, + "learning_rate": 0.0006, + "loss": 2.221, + "step": 67610 + }, + { + "epoch": 0.2522324925583581, + "grad_norm": 0.21914905309677124, + "learning_rate": 0.0006, + "loss": 2.3155, + "step": 67620 + }, + { + "epoch": 0.2522697940213215, + "grad_norm": 0.4375993311405182, + "learning_rate": 0.0006, + "loss": 2.1729, + "step": 67630 + }, + { + "epoch": 0.25230709548428487, + "grad_norm": 0.3981236517429352, + "learning_rate": 0.0006, + "loss": 2.0573, + "step": 67640 + }, + { + "epoch": 0.25234439694724825, + "grad_norm": 0.33478519320487976, + "learning_rate": 0.0006, + "loss": 2.2723, + "step": 67650 + }, + { + "epoch": 0.25238169841021163, + "grad_norm": 0.24225778877735138, + "learning_rate": 0.0006, + "loss": 2.2662, + "step": 67660 + }, + { + "epoch": 0.252418999873175, + "grad_norm": 0.3209642767906189, + "learning_rate": 0.0006, + "loss": 2.2037, + "step": 67670 + }, + { + "epoch": 0.2524563013361384, + "grad_norm": 0.3570922315120697, + "learning_rate": 0.0006, + "loss": 2.1347, + "step": 67680 + }, + { + "epoch": 0.25249360279910177, + "grad_norm": 0.2631409764289856, + "learning_rate": 0.0006, + 
"loss": 2.2796, + "step": 67690 + }, + { + "epoch": 0.25253090426206515, + "grad_norm": 0.2847030460834503, + "learning_rate": 0.0006, + "loss": 2.1867, + "step": 67700 + }, + { + "epoch": 0.2525682057250285, + "grad_norm": 0.3622114062309265, + "learning_rate": 0.0006, + "loss": 2.1233, + "step": 67710 + }, + { + "epoch": 0.2526055071879919, + "grad_norm": 0.31128278374671936, + "learning_rate": 0.0006, + "loss": 2.0066, + "step": 67720 + }, + { + "epoch": 0.2526428086509553, + "grad_norm": 0.2894008159637451, + "learning_rate": 0.0006, + "loss": 2.3228, + "step": 67730 + }, + { + "epoch": 0.25268011011391867, + "grad_norm": 0.33525756001472473, + "learning_rate": 0.0006, + "loss": 2.2226, + "step": 67740 + }, + { + "epoch": 0.25271741157688205, + "grad_norm": 0.3586629033088684, + "learning_rate": 0.0006, + "loss": 2.2197, + "step": 67750 + }, + { + "epoch": 0.25271741157688205, + "eval_valid_loss": 2.173999547958374, + "eval_valid_loss/all": 2.0386126041412354, + "eval_valid_loss/end_span": 1.2529282569885254, + "eval_valid_perplexity/batch": 7.679946422576904, + "eval_valid_perplexity/end_span": 3.5005786418914795, + "eval_valid_perplexity/fim": 2.105043649673462, + "eval_valid_perplexity/first_seq": 14.977876663208008, + "eval_valid_perplexity/last_seq": 8.993314743041992, + "eval_valid_perplexity/second_seq": 13.61434555053711, + "eval_valid_perplexity/seq": 8.663467407226562, + "eval_valid_reconstruction/all": 0.29898229241371155, + "eval_valid_reconstruction/end_span": 0.7094368934631348, + "eval_valid_reconstruction/fim": 0.1511852741241455, + "eval_valid_reconstruction/first_seq": 0.16459563374519348, + "eval_valid_reconstruction/last_seq": 0.32576221227645874, + "eval_valid_reconstruction/second_seq": 0.197151318192482, + "eval_valid_runtime": 448.8966, + "eval_valid_samples_per_second": 0.428, + "eval_valid_steps_per_second": 0.428, + "step": 67750 + }, + { + "epoch": 0.25271741157688205, + "eval_train_loss": 2.172065496444702, + "eval_train_loss/all": 
2.0108284950256348, + "eval_train_loss/end_span": 1.2004848718643188, + "eval_train_perplexity/batch": 7.469503402709961, + "eval_train_perplexity/end_span": 3.3217270374298096, + "eval_train_perplexity/fim": 1.9859249591827393, + "eval_train_perplexity/first_seq": 15.41952896118164, + "eval_train_perplexity/last_seq": 8.567337989807129, + "eval_train_perplexity/second_seq": 13.886981964111328, + "eval_train_perplexity/seq": 8.603405952453613, + "eval_train_reconstruction/all": 0.2882978320121765, + "eval_train_reconstruction/end_span": 0.7247335314750671, + "eval_train_reconstruction/fim": 0.1403285562992096, + "eval_train_reconstruction/first_seq": 0.1520891934633255, + "eval_train_reconstruction/last_seq": 0.33907198905944824, + "eval_train_reconstruction/second_seq": 0.19121290743350983, + "eval_train_runtime": 444.0134, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 67750 + }, + { + "epoch": 0.2527547130398454, + "grad_norm": 0.3145832121372223, + "learning_rate": 0.0006, + "loss": 2.2867, + "step": 67760 + }, + { + "epoch": 0.2527920145028088, + "grad_norm": 0.2926054298877716, + "learning_rate": 0.0006, + "loss": 2.1636, + "step": 67770 + }, + { + "epoch": 0.2528293159657722, + "grad_norm": 0.2818651497364044, + "learning_rate": 0.0006, + "loss": 2.2364, + "step": 67780 + }, + { + "epoch": 0.25286661742873556, + "grad_norm": 0.5899767875671387, + "learning_rate": 0.0006, + "loss": 2.2733, + "step": 67790 + }, + { + "epoch": 0.25290391889169894, + "grad_norm": 0.4301680326461792, + "learning_rate": 0.0006, + "loss": 2.1658, + "step": 67800 + }, + { + "epoch": 0.2529412203546623, + "grad_norm": 0.33478638529777527, + "learning_rate": 0.0006, + "loss": 2.2633, + "step": 67810 + }, + { + "epoch": 0.2529785218176257, + "grad_norm": 0.39671751856803894, + "learning_rate": 0.0006, + "loss": 2.1474, + "step": 67820 + }, + { + "epoch": 0.2530158232805891, + "grad_norm": 0.2735438942909241, + "learning_rate": 0.0006, + 
"loss": 2.2495, + "step": 67830 + }, + { + "epoch": 0.25305312474355246, + "grad_norm": 0.36923468112945557, + "learning_rate": 0.0006, + "loss": 2.2798, + "step": 67840 + }, + { + "epoch": 0.25309042620651584, + "grad_norm": 0.24395698308944702, + "learning_rate": 0.0006, + "loss": 2.149, + "step": 67850 + }, + { + "epoch": 0.2531277276694792, + "grad_norm": 0.3488730490207672, + "learning_rate": 0.0006, + "loss": 2.1194, + "step": 67860 + }, + { + "epoch": 0.2531650291324426, + "grad_norm": 0.35477226972579956, + "learning_rate": 0.0006, + "loss": 2.2359, + "step": 67870 + }, + { + "epoch": 0.253202330595406, + "grad_norm": 0.34857192635536194, + "learning_rate": 0.0006, + "loss": 2.3101, + "step": 67880 + }, + { + "epoch": 0.2532396320583693, + "grad_norm": 0.26119792461395264, + "learning_rate": 0.0006, + "loss": 2.2873, + "step": 67890 + }, + { + "epoch": 0.2532769335213327, + "grad_norm": 0.3671715557575226, + "learning_rate": 0.0006, + "loss": 1.9218, + "step": 67900 + }, + { + "epoch": 0.25331423498429606, + "grad_norm": 0.5500767230987549, + "learning_rate": 0.0006, + "loss": 2.3036, + "step": 67910 + }, + { + "epoch": 0.25335153644725944, + "grad_norm": 0.3065463602542877, + "learning_rate": 0.0006, + "loss": 2.1803, + "step": 67920 + }, + { + "epoch": 0.2533888379102228, + "grad_norm": 0.40420299768447876, + "learning_rate": 0.0006, + "loss": 2.153, + "step": 67930 + }, + { + "epoch": 0.2534261393731862, + "grad_norm": 0.3047483563423157, + "learning_rate": 0.0006, + "loss": 2.1729, + "step": 67940 + }, + { + "epoch": 0.2534634408361496, + "grad_norm": 0.3506884276866913, + "learning_rate": 0.0006, + "loss": 2.2468, + "step": 67950 + }, + { + "epoch": 0.25350074229911296, + "grad_norm": 0.4455946087837219, + "learning_rate": 0.0006, + "loss": 2.2935, + "step": 67960 + }, + { + "epoch": 0.25353804376207634, + "grad_norm": 0.26665687561035156, + "learning_rate": 0.0006, + "loss": 2.2379, + "step": 67970 + }, + { + "epoch": 0.2535753452250397, + 
"grad_norm": 0.33231863379478455, + "learning_rate": 0.0006, + "loss": 2.1558, + "step": 67980 + }, + { + "epoch": 0.2536126466880031, + "grad_norm": 0.2866859436035156, + "learning_rate": 0.0006, + "loss": 2.2736, + "step": 67990 + }, + { + "epoch": 0.2536499481509665, + "grad_norm": 0.41450992226600647, + "learning_rate": 0.0006, + "loss": 2.2484, + "step": 68000 + }, + { + "epoch": 0.2536499481509665, + "eval_valid_loss": 2.1749143600463867, + "eval_valid_loss/all": 2.039661407470703, + "eval_valid_loss/end_span": 1.2559646368026733, + "eval_valid_perplexity/batch": 7.688005447387695, + "eval_valid_perplexity/end_span": 3.511223793029785, + "eval_valid_perplexity/fim": 2.0965516567230225, + "eval_valid_perplexity/first_seq": 14.951552391052246, + "eval_valid_perplexity/last_seq": 8.550044059753418, + "eval_valid_perplexity/second_seq": 13.757589340209961, + "eval_valid_perplexity/seq": 8.678879737854004, + "eval_valid_reconstruction/all": 0.2987285256385803, + "eval_valid_reconstruction/end_span": 0.7030106782913208, + "eval_valid_reconstruction/fim": 0.15005455911159515, + "eval_valid_reconstruction/first_seq": 0.16803975403308868, + "eval_valid_reconstruction/last_seq": 0.3407159745693207, + "eval_valid_reconstruction/second_seq": 0.19500452280044556, + "eval_valid_runtime": 445.4228, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 68000 + }, + { + "epoch": 0.2536499481509665, + "eval_train_loss": 2.1713573932647705, + "eval_train_loss/all": 2.010183095932007, + "eval_train_loss/end_span": 1.2157920598983765, + "eval_train_perplexity/batch": 7.464684009552002, + "eval_train_perplexity/end_span": 3.37296462059021, + "eval_train_perplexity/fim": 2.16977858543396, + "eval_train_perplexity/first_seq": 15.551492691040039, + "eval_train_perplexity/last_seq": 8.415032386779785, + "eval_train_perplexity/second_seq": 13.921515464782715, + "eval_train_perplexity/seq": 8.59581184387207, + "eval_train_reconstruction/all": 
0.28862228989601135, + "eval_train_reconstruction/end_span": 0.7147364020347595, + "eval_train_reconstruction/fim": 0.157415509223938, + "eval_train_reconstruction/first_seq": 0.1514717936515808, + "eval_train_reconstruction/last_seq": 0.34329959750175476, + "eval_train_reconstruction/second_seq": 0.19330719113349915, + "eval_train_runtime": 447.0686, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 68000 + }, + { + "epoch": 0.25368724961392985, + "grad_norm": 0.33424457907676697, + "learning_rate": 0.0006, + "loss": 2.2867, + "step": 68010 + }, + { + "epoch": 0.25372455107689323, + "grad_norm": 0.3765949606895447, + "learning_rate": 0.0006, + "loss": 2.3494, + "step": 68020 + }, + { + "epoch": 0.2537618525398566, + "grad_norm": 0.4184131622314453, + "learning_rate": 0.0006, + "loss": 2.3938, + "step": 68030 + }, + { + "epoch": 0.25379915400282, + "grad_norm": 0.3156387209892273, + "learning_rate": 0.0006, + "loss": 2.2909, + "step": 68040 + }, + { + "epoch": 0.2538364554657834, + "grad_norm": 0.37652620673179626, + "learning_rate": 0.0006, + "loss": 2.3017, + "step": 68050 + }, + { + "epoch": 0.25387375692874675, + "grad_norm": 0.3288861811161041, + "learning_rate": 0.0006, + "loss": 2.1541, + "step": 68060 + }, + { + "epoch": 0.25391105839171013, + "grad_norm": 0.2722855806350708, + "learning_rate": 0.0006, + "loss": 2.2828, + "step": 68070 + }, + { + "epoch": 0.2539483598546735, + "grad_norm": 0.44976070523262024, + "learning_rate": 0.0006, + "loss": 2.0776, + "step": 68080 + }, + { + "epoch": 0.2539856613176369, + "grad_norm": 0.33618173003196716, + "learning_rate": 0.0006, + "loss": 2.1931, + "step": 68090 + }, + { + "epoch": 0.25402296278060027, + "grad_norm": 0.2580469250679016, + "learning_rate": 0.0006, + "loss": 2.3308, + "step": 68100 + }, + { + "epoch": 0.25406026424356365, + "grad_norm": 0.40188246965408325, + "learning_rate": 0.0006, + "loss": 2.2298, + "step": 68110 + }, + { + "epoch": 0.25409756570652703, + 
"grad_norm": 0.3117847442626953, + "learning_rate": 0.0006, + "loss": 2.2828, + "step": 68120 + }, + { + "epoch": 0.2541348671694904, + "grad_norm": 0.49458053708076477, + "learning_rate": 0.0006, + "loss": 2.3354, + "step": 68130 + }, + { + "epoch": 0.2541721686324538, + "grad_norm": 0.6464273929595947, + "learning_rate": 0.0006, + "loss": 2.0517, + "step": 68140 + }, + { + "epoch": 0.25420947009541717, + "grad_norm": 16.3996639251709, + "learning_rate": 0.0006, + "loss": 2.1871, + "step": 68150 + }, + { + "epoch": 0.25424677155838055, + "grad_norm": 0.38156241178512573, + "learning_rate": 0.0006, + "loss": 2.3943, + "step": 68160 + }, + { + "epoch": 0.25428407302134387, + "grad_norm": 0.4347265064716339, + "learning_rate": 0.0006, + "loss": 2.0281, + "step": 68170 + }, + { + "epoch": 0.25432137448430725, + "grad_norm": 0.3383677303791046, + "learning_rate": 0.0006, + "loss": 2.2523, + "step": 68180 + }, + { + "epoch": 0.25435867594727063, + "grad_norm": 2.7608673572540283, + "learning_rate": 0.0006, + "loss": 2.0479, + "step": 68190 + }, + { + "epoch": 0.254395977410234, + "grad_norm": 0.3107404410839081, + "learning_rate": 0.0006, + "loss": 2.2611, + "step": 68200 + }, + { + "epoch": 0.2544332788731974, + "grad_norm": 0.5975131392478943, + "learning_rate": 0.0006, + "loss": 2.2211, + "step": 68210 + }, + { + "epoch": 0.25447058033616077, + "grad_norm": 0.39048969745635986, + "learning_rate": 0.0006, + "loss": 2.2056, + "step": 68220 + }, + { + "epoch": 0.25450788179912415, + "grad_norm": 0.18255801498889923, + "learning_rate": 0.0006, + "loss": 2.3313, + "step": 68230 + }, + { + "epoch": 0.2545451832620875, + "grad_norm": 0.29921501874923706, + "learning_rate": 0.0006, + "loss": 2.2092, + "step": 68240 + }, + { + "epoch": 0.2545824847250509, + "grad_norm": 0.35138845443725586, + "learning_rate": 0.0006, + "loss": 2.1619, + "step": 68250 + }, + { + "epoch": 0.2545824847250509, + "eval_valid_loss": 2.179271697998047, + "eval_valid_loss/all": 2.0437417030334473, + 
"eval_valid_loss/end_span": 1.1483256816864014, + "eval_valid_perplexity/batch": 7.7194390296936035, + "eval_valid_perplexity/end_span": 3.152909517288208, + "eval_valid_perplexity/fim": 2.164914131164551, + "eval_valid_perplexity/first_seq": 15.014348030090332, + "eval_valid_perplexity/last_seq": 8.793352127075195, + "eval_valid_perplexity/second_seq": 13.741061210632324, + "eval_valid_perplexity/seq": 8.710951805114746, + "eval_valid_reconstruction/all": 0.29749083518981934, + "eval_valid_reconstruction/end_span": 0.726174533367157, + "eval_valid_reconstruction/fim": 0.15643088519573212, + "eval_valid_reconstruction/first_seq": 0.16546952724456787, + "eval_valid_reconstruction/last_seq": 0.3308422267436981, + "eval_valid_reconstruction/second_seq": 0.19737350940704346, + "eval_valid_runtime": 503.1147, + "eval_valid_samples_per_second": 0.382, + "eval_valid_steps_per_second": 0.382, + "step": 68250 + }, + { + "epoch": 0.2545824847250509, + "eval_train_loss": 2.1770832538604736, + "eval_train_loss/all": 2.0154056549072266, + "eval_train_loss/end_span": 1.1146881580352783, + "eval_train_perplexity/batch": 7.50377082824707, + "eval_train_perplexity/end_span": 3.048617362976074, + "eval_train_perplexity/fim": 1.9497014284133911, + "eval_train_perplexity/first_seq": 15.766608238220215, + "eval_train_perplexity/last_seq": 8.845250129699707, + "eval_train_perplexity/second_seq": 14.171294212341309, + "eval_train_perplexity/seq": 8.649173736572266, + "eval_train_reconstruction/all": 0.28698766231536865, + "eval_train_reconstruction/end_span": 0.7373826503753662, + "eval_train_reconstruction/fim": 0.13499659299850464, + "eval_train_reconstruction/first_seq": 0.14834631979465485, + "eval_train_reconstruction/last_seq": 0.3262639045715332, + "eval_train_reconstruction/second_seq": 0.18327218294143677, + "eval_train_runtime": 446.8559, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 68250 + }, + { + "epoch": 0.2546197861880143, + 
"grad_norm": 0.36029261350631714, + "learning_rate": 0.0006, + "loss": 2.2673, + "step": 68260 + }, + { + "epoch": 0.25465708765097766, + "grad_norm": 0.39229312539100647, + "learning_rate": 0.0006, + "loss": 2.2027, + "step": 68270 + }, + { + "epoch": 0.25469438911394104, + "grad_norm": 0.3651009798049927, + "learning_rate": 0.0006, + "loss": 2.0728, + "step": 68280 + }, + { + "epoch": 0.2547316905769044, + "grad_norm": 0.22782227396965027, + "learning_rate": 0.0006, + "loss": 2.2638, + "step": 68290 + }, + { + "epoch": 0.2547689920398678, + "grad_norm": 0.3499019742012024, + "learning_rate": 0.0006, + "loss": 2.06, + "step": 68300 + }, + { + "epoch": 0.2548062935028312, + "grad_norm": 0.30250027775764465, + "learning_rate": 0.0006, + "loss": 2.2363, + "step": 68310 + }, + { + "epoch": 0.25484359496579456, + "grad_norm": 0.29030588269233704, + "learning_rate": 0.0006, + "loss": 2.3433, + "step": 68320 + }, + { + "epoch": 0.25488089642875794, + "grad_norm": 0.2883817255496979, + "learning_rate": 0.0006, + "loss": 2.3585, + "step": 68330 + }, + { + "epoch": 0.2549181978917213, + "grad_norm": 0.5837058424949646, + "learning_rate": 0.0006, + "loss": 2.1232, + "step": 68340 + }, + { + "epoch": 0.2549554993546847, + "grad_norm": 0.30069273710250854, + "learning_rate": 0.0006, + "loss": 2.3094, + "step": 68350 + }, + { + "epoch": 0.2549928008176481, + "grad_norm": 0.31506091356277466, + "learning_rate": 0.0006, + "loss": 2.1654, + "step": 68360 + }, + { + "epoch": 0.25503010228061146, + "grad_norm": 0.35755693912506104, + "learning_rate": 0.0006, + "loss": 2.2012, + "step": 68370 + }, + { + "epoch": 0.25506740374357484, + "grad_norm": 0.45862939953804016, + "learning_rate": 0.0006, + "loss": 2.1601, + "step": 68380 + }, + { + "epoch": 0.2551047052065382, + "grad_norm": 0.3281719982624054, + "learning_rate": 0.0006, + "loss": 2.3418, + "step": 68390 + }, + { + "epoch": 0.2551420066695016, + "grad_norm": 0.4674643874168396, + "learning_rate": 0.0006, + "loss": 2.1124, + 
"step": 68400 + }, + { + "epoch": 0.255179308132465, + "grad_norm": 0.3499080538749695, + "learning_rate": 0.0006, + "loss": 2.1914, + "step": 68410 + }, + { + "epoch": 0.25521660959542836, + "grad_norm": 0.30187711119651794, + "learning_rate": 0.0006, + "loss": 2.2249, + "step": 68420 + }, + { + "epoch": 0.25525391105839174, + "grad_norm": 0.32091024518013, + "learning_rate": 0.0006, + "loss": 2.2209, + "step": 68430 + }, + { + "epoch": 0.25529121252135506, + "grad_norm": 0.2366010993719101, + "learning_rate": 0.0006, + "loss": 2.3085, + "step": 68440 + }, + { + "epoch": 0.25532851398431844, + "grad_norm": 0.4183851182460785, + "learning_rate": 0.0006, + "loss": 2.1963, + "step": 68450 + }, + { + "epoch": 0.2553658154472818, + "grad_norm": 0.30082517862319946, + "learning_rate": 0.0006, + "loss": 2.3228, + "step": 68460 + }, + { + "epoch": 0.2554031169102452, + "grad_norm": 0.5737460255622864, + "learning_rate": 0.0006, + "loss": 2.1807, + "step": 68470 + }, + { + "epoch": 0.2554404183732086, + "grad_norm": 0.5466446280479431, + "learning_rate": 0.0006, + "loss": 2.3323, + "step": 68480 + }, + { + "epoch": 0.25547771983617196, + "grad_norm": 0.2932433784008026, + "learning_rate": 0.0006, + "loss": 2.1498, + "step": 68490 + }, + { + "epoch": 0.25551502129913534, + "grad_norm": 0.2825542986392975, + "learning_rate": 0.0006, + "loss": 2.3036, + "step": 68500 + }, + { + "epoch": 0.25551502129913534, + "eval_valid_loss": 2.1775810718536377, + "eval_valid_loss/all": 2.041959285736084, + "eval_valid_loss/end_span": 1.2427568435668945, + "eval_valid_perplexity/batch": 7.705692291259766, + "eval_valid_perplexity/end_span": 3.465153217315674, + "eval_valid_perplexity/fim": 2.465902090072632, + "eval_valid_perplexity/first_seq": 15.111971855163574, + "eval_valid_perplexity/last_seq": 8.883975982666016, + "eval_valid_perplexity/second_seq": 12.783503532409668, + "eval_valid_perplexity/seq": 8.690536499023438, + "eval_valid_reconstruction/all": 0.2979307174682617, + 
"eval_valid_reconstruction/end_span": 0.7047423720359802, + "eval_valid_reconstruction/fim": 0.18260496854782104, + "eval_valid_reconstruction/first_seq": 0.16185785830020905, + "eval_valid_reconstruction/last_seq": 0.33197879791259766, + "eval_valid_reconstruction/second_seq": 0.22161586582660675, + "eval_valid_runtime": 453.231, + "eval_valid_samples_per_second": 0.424, + "eval_valid_steps_per_second": 0.424, + "step": 68500 + }, + { + "epoch": 0.25551502129913534, + "eval_train_loss": 2.1780407428741455, + "eval_train_loss/all": 2.0162782669067383, + "eval_train_loss/end_span": 1.207038164138794, + "eval_train_perplexity/batch": 7.510321617126465, + "eval_train_perplexity/end_span": 3.34356689453125, + "eval_train_perplexity/fim": 2.0536508560180664, + "eval_train_perplexity/first_seq": 15.610289573669434, + "eval_train_perplexity/last_seq": 8.693328857421875, + "eval_train_perplexity/second_seq": 14.408699035644531, + "eval_train_perplexity/seq": 8.651848793029785, + "eval_train_reconstruction/all": 0.28666308522224426, + "eval_train_reconstruction/end_span": 0.7148545384407043, + "eval_train_reconstruction/fim": 0.14618517458438873, + "eval_train_reconstruction/first_seq": 0.15027250349521637, + "eval_train_reconstruction/last_seq": 0.3356996476650238, + "eval_train_reconstruction/second_seq": 0.17790530622005463, + "eval_train_runtime": 439.4863, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 68500 + }, + { + "epoch": 0.2555523227620987, + "grad_norm": 0.2283155918121338, + "learning_rate": 0.0006, + "loss": 2.2195, + "step": 68510 + }, + { + "epoch": 0.2555896242250621, + "grad_norm": 0.3569042980670929, + "learning_rate": 0.0006, + "loss": 2.1685, + "step": 68520 + }, + { + "epoch": 0.2556269256880255, + "grad_norm": 0.20299218595027924, + "learning_rate": 0.0006, + "loss": 2.1845, + "step": 68530 + }, + { + "epoch": 0.25566422715098885, + "grad_norm": 0.41205570101737976, + "learning_rate": 0.0006, + "loss": 
2.3558, + "step": 68540 + }, + { + "epoch": 0.25570152861395223, + "grad_norm": 0.4658833146095276, + "learning_rate": 0.0006, + "loss": 2.0493, + "step": 68550 + }, + { + "epoch": 0.2557388300769156, + "grad_norm": 0.3698042333126068, + "learning_rate": 0.0006, + "loss": 2.1286, + "step": 68560 + }, + { + "epoch": 0.255776131539879, + "grad_norm": 0.5289434790611267, + "learning_rate": 0.0006, + "loss": 2.1707, + "step": 68570 + }, + { + "epoch": 0.25581343300284237, + "grad_norm": 0.40913859009742737, + "learning_rate": 0.0006, + "loss": 2.0942, + "step": 68580 + }, + { + "epoch": 0.25585073446580575, + "grad_norm": 0.35581034421920776, + "learning_rate": 0.0006, + "loss": 2.1078, + "step": 68590 + }, + { + "epoch": 0.25588803592876913, + "grad_norm": 0.32836490869522095, + "learning_rate": 0.0006, + "loss": 2.1553, + "step": 68600 + }, + { + "epoch": 0.2559253373917325, + "grad_norm": 0.3652757406234741, + "learning_rate": 0.0006, + "loss": 2.1879, + "step": 68610 + }, + { + "epoch": 0.2559626388546959, + "grad_norm": 0.32371726632118225, + "learning_rate": 0.0006, + "loss": 2.224, + "step": 68620 + }, + { + "epoch": 0.25599994031765927, + "grad_norm": 0.36226803064346313, + "learning_rate": 0.0006, + "loss": 2.2068, + "step": 68630 + }, + { + "epoch": 0.25603724178062265, + "grad_norm": 0.5402891039848328, + "learning_rate": 0.0006, + "loss": 2.246, + "step": 68640 + }, + { + "epoch": 0.25607454324358603, + "grad_norm": 2.434738874435425, + "learning_rate": 0.0006, + "loss": 2.186, + "step": 68650 + }, + { + "epoch": 0.2561118447065494, + "grad_norm": 0.3123502731323242, + "learning_rate": 0.0006, + "loss": 2.2572, + "step": 68660 + }, + { + "epoch": 0.2561491461695128, + "grad_norm": 0.3727930188179016, + "learning_rate": 0.0006, + "loss": 2.3568, + "step": 68670 + }, + { + "epoch": 0.25618644763247617, + "grad_norm": 0.2893980145454407, + "learning_rate": 0.0006, + "loss": 2.1642, + "step": 68680 + }, + { + "epoch": 0.25622374909543955, + "grad_norm": 
1.1828391551971436, + "learning_rate": 0.0006, + "loss": 2.2879, + "step": 68690 + }, + { + "epoch": 0.2562610505584029, + "grad_norm": 0.526322603225708, + "learning_rate": 0.0006, + "loss": 2.0802, + "step": 68700 + }, + { + "epoch": 0.2562983520213663, + "grad_norm": 0.24138985574245453, + "learning_rate": 0.0006, + "loss": 2.2099, + "step": 68710 + }, + { + "epoch": 0.25633565348432963, + "grad_norm": 0.45168936252593994, + "learning_rate": 0.0006, + "loss": 1.9539, + "step": 68720 + }, + { + "epoch": 0.256372954947293, + "grad_norm": 0.5405563712120056, + "learning_rate": 0.0006, + "loss": 1.9659, + "step": 68730 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 0.35188406705856323, + "learning_rate": 0.0006, + "loss": 2.214, + "step": 68740 + }, + { + "epoch": 0.25644755787321977, + "grad_norm": 0.3226656913757324, + "learning_rate": 0.0006, + "loss": 2.1541, + "step": 68750 + }, + { + "epoch": 0.25644755787321977, + "eval_valid_loss": 2.172172784805298, + "eval_valid_loss/all": 2.037074089050293, + "eval_valid_loss/end_span": 1.2618621587753296, + "eval_valid_perplexity/batch": 7.668139934539795, + "eval_valid_perplexity/end_span": 3.5319924354553223, + "eval_valid_perplexity/fim": 2.2263741493225098, + "eval_valid_perplexity/first_seq": 14.884235382080078, + "eval_valid_perplexity/last_seq": 8.903462409973145, + "eval_valid_perplexity/second_seq": 13.557488441467285, + "eval_valid_perplexity/seq": 8.655760765075684, + "eval_valid_reconstruction/all": 0.29933956265449524, + "eval_valid_reconstruction/end_span": 0.7027130126953125, + "eval_valid_reconstruction/fim": 0.16424314677715302, + "eval_valid_reconstruction/first_seq": 0.16589677333831787, + "eval_valid_reconstruction/last_seq": 0.33007219433784485, + "eval_valid_reconstruction/second_seq": 0.20376448333263397, + "eval_valid_runtime": 441.7493, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 68750 + }, + { + "epoch": 0.25644755787321977, + 
"eval_train_loss": 2.171330213546753, + "eval_train_loss/all": 2.010093927383423, + "eval_train_loss/end_span": 1.2216531038284302, + "eval_train_perplexity/batch": 7.46401834487915, + "eval_train_perplexity/end_span": 3.392791748046875, + "eval_train_perplexity/fim": 2.1012401580810547, + "eval_train_perplexity/first_seq": 15.490286827087402, + "eval_train_perplexity/last_seq": 8.207072257995605, + "eval_train_perplexity/second_seq": 14.255602836608887, + "eval_train_perplexity/seq": 8.596604347229004, + "eval_train_reconstruction/all": 0.2883727550506592, + "eval_train_reconstruction/end_span": 0.7154097557067871, + "eval_train_reconstruction/fim": 0.1512426882982254, + "eval_train_reconstruction/first_seq": 0.15096424520015717, + "eval_train_reconstruction/last_seq": 0.35215842723846436, + "eval_train_reconstruction/second_seq": 0.18399348855018616, + "eval_train_runtime": 447.3336, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 68750 + }, + { + "epoch": 0.25648485933618315, + "grad_norm": 0.359343945980072, + "learning_rate": 0.0006, + "loss": 2.0845, + "step": 68760 + }, + { + "epoch": 0.2565221607991465, + "grad_norm": 0.36971601843833923, + "learning_rate": 0.0006, + "loss": 2.1808, + "step": 68770 + }, + { + "epoch": 0.2565594622621099, + "grad_norm": 0.32773882150650024, + "learning_rate": 0.0006, + "loss": 2.1677, + "step": 68780 + }, + { + "epoch": 0.2565967637250733, + "grad_norm": 0.295166015625, + "learning_rate": 0.0006, + "loss": 2.1596, + "step": 68790 + }, + { + "epoch": 0.25663406518803666, + "grad_norm": 0.4102863371372223, + "learning_rate": 0.0006, + "loss": 2.1176, + "step": 68800 + }, + { + "epoch": 0.25667136665100004, + "grad_norm": 0.34981444478034973, + "learning_rate": 0.0006, + "loss": 2.268, + "step": 68810 + }, + { + "epoch": 0.2567086681139634, + "grad_norm": 0.41105321049690247, + "learning_rate": 0.0006, + "loss": 2.081, + "step": 68820 + }, + { + "epoch": 0.2567459695769268, + 
"grad_norm": 0.3065412640571594, + "learning_rate": 0.0006, + "loss": 2.3286, + "step": 68830 + }, + { + "epoch": 0.2567832710398902, + "grad_norm": 0.6280194520950317, + "learning_rate": 0.0006, + "loss": 2.1368, + "step": 68840 + }, + { + "epoch": 0.25682057250285356, + "grad_norm": 0.4275001585483551, + "learning_rate": 0.0006, + "loss": 2.1639, + "step": 68850 + }, + { + "epoch": 0.25685787396581694, + "grad_norm": 0.7353479862213135, + "learning_rate": 0.0006, + "loss": 1.9671, + "step": 68860 + }, + { + "epoch": 0.2568951754287803, + "grad_norm": 0.341703325510025, + "learning_rate": 0.0006, + "loss": 2.3505, + "step": 68870 + }, + { + "epoch": 0.2569324768917437, + "grad_norm": 0.2593001127243042, + "learning_rate": 0.0006, + "loss": 2.1836, + "step": 68880 + }, + { + "epoch": 0.2569697783547071, + "grad_norm": 0.38678669929504395, + "learning_rate": 0.0006, + "loss": 2.2178, + "step": 68890 + }, + { + "epoch": 0.25700707981767046, + "grad_norm": 0.34692618250846863, + "learning_rate": 0.0006, + "loss": 2.1244, + "step": 68900 + }, + { + "epoch": 0.25704438128063384, + "grad_norm": 0.516563892364502, + "learning_rate": 0.0006, + "loss": 2.2585, + "step": 68910 + }, + { + "epoch": 0.2570816827435972, + "grad_norm": 4.600094318389893, + "learning_rate": 0.0006, + "loss": 2.1787, + "step": 68920 + }, + { + "epoch": 0.2571189842065606, + "grad_norm": 0.3736876845359802, + "learning_rate": 0.0006, + "loss": 2.3141, + "step": 68930 + }, + { + "epoch": 0.257156285669524, + "grad_norm": 0.24783596396446228, + "learning_rate": 0.0006, + "loss": 2.2627, + "step": 68940 + }, + { + "epoch": 0.25719358713248736, + "grad_norm": 0.25866246223449707, + "learning_rate": 0.0006, + "loss": 2.0738, + "step": 68950 + }, + { + "epoch": 0.25723088859545074, + "grad_norm": 0.28039371967315674, + "learning_rate": 0.0006, + "loss": 2.2716, + "step": 68960 + }, + { + "epoch": 0.2572681900584141, + "grad_norm": 0.308277428150177, + "learning_rate": 0.0006, + "loss": 2.3073, + "step": 
68970 + }, + { + "epoch": 0.2573054915213775, + "grad_norm": 0.24974387884140015, + "learning_rate": 0.0006, + "loss": 2.2332, + "step": 68980 + }, + { + "epoch": 0.2573427929843409, + "grad_norm": 0.40777668356895447, + "learning_rate": 0.0006, + "loss": 2.1569, + "step": 68990 + }, + { + "epoch": 0.2573800944473042, + "grad_norm": 0.3399016261100769, + "learning_rate": 0.0006, + "loss": 2.2543, + "step": 69000 + }, + { + "epoch": 0.2573800944473042, + "eval_valid_loss": 2.172041177749634, + "eval_valid_loss/all": 2.03706431388855, + "eval_valid_loss/end_span": 1.2360618114471436, + "eval_valid_perplexity/batch": 7.668065071105957, + "eval_valid_perplexity/end_span": 3.4420313835144043, + "eval_valid_perplexity/fim": 2.385140895843506, + "eval_valid_perplexity/first_seq": 14.926801681518555, + "eval_valid_perplexity/last_seq": 9.14093017578125, + "eval_valid_perplexity/second_seq": 13.72712516784668, + "eval_valid_perplexity/seq": 8.650422096252441, + "eval_valid_reconstruction/all": 0.2993718087673187, + "eval_valid_reconstruction/end_span": 0.7079063057899475, + "eval_valid_reconstruction/fim": 0.17764835059642792, + "eval_valid_reconstruction/first_seq": 0.16470499336719513, + "eval_valid_reconstruction/last_seq": 0.32398244738578796, + "eval_valid_reconstruction/second_seq": 0.1966705024242401, + "eval_valid_runtime": 444.6975, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 69000 + }, + { + "epoch": 0.2573800944473042, + "eval_train_loss": 2.1728599071502686, + "eval_train_loss/all": 2.0117902755737305, + "eval_train_loss/end_span": 1.203246831893921, + "eval_train_perplexity/batch": 7.476690769195557, + "eval_train_perplexity/end_span": 3.330914258956909, + "eval_train_perplexity/fim": 1.9972602128982544, + "eval_train_perplexity/first_seq": 15.748147010803223, + "eval_train_perplexity/last_seq": 8.726673126220703, + "eval_train_perplexity/second_seq": 14.419864654541016, + "eval_train_perplexity/seq": 
8.614553451538086, + "eval_train_reconstruction/all": 0.2881616950035095, + "eval_train_reconstruction/end_span": 0.718267023563385, + "eval_train_reconstruction/fim": 0.14067895710468292, + "eval_train_reconstruction/first_seq": 0.14722616970539093, + "eval_train_reconstruction/last_seq": 0.3327223062515259, + "eval_train_reconstruction/second_seq": 0.17954286932945251, + "eval_train_runtime": 452.6232, + "eval_train_samples_per_second": 0.424, + "eval_train_steps_per_second": 0.424, + "step": 69000 + }, + { + "epoch": 0.2574173959102676, + "grad_norm": 0.5211663842201233, + "learning_rate": 0.0006, + "loss": 2.2326, + "step": 69010 + }, + { + "epoch": 0.25745469737323096, + "grad_norm": 0.2938636541366577, + "learning_rate": 0.0006, + "loss": 2.1968, + "step": 69020 + }, + { + "epoch": 0.25749199883619434, + "grad_norm": 0.19762378931045532, + "learning_rate": 0.0006, + "loss": 2.2886, + "step": 69030 + }, + { + "epoch": 0.2575293002991577, + "grad_norm": 0.49471133947372437, + "learning_rate": 0.0006, + "loss": 2.2823, + "step": 69040 + }, + { + "epoch": 0.2575666017621211, + "grad_norm": 0.25640571117401123, + "learning_rate": 0.0006, + "loss": 1.9674, + "step": 69050 + }, + { + "epoch": 0.2576039032250845, + "grad_norm": 0.30203133821487427, + "learning_rate": 0.0006, + "loss": 2.0323, + "step": 69060 + }, + { + "epoch": 0.25764120468804785, + "grad_norm": 0.38744479417800903, + "learning_rate": 0.0006, + "loss": 2.2758, + "step": 69070 + }, + { + "epoch": 0.25767850615101123, + "grad_norm": 0.4170753061771393, + "learning_rate": 0.0006, + "loss": 2.1459, + "step": 69080 + }, + { + "epoch": 0.2577158076139746, + "grad_norm": 0.338164746761322, + "learning_rate": 0.0006, + "loss": 2.101, + "step": 69090 + }, + { + "epoch": 0.257753109076938, + "grad_norm": 0.3839610815048218, + "learning_rate": 0.0006, + "loss": 2.1036, + "step": 69100 + }, + { + "epoch": 0.25779041053990137, + "grad_norm": 0.2708706259727478, + "learning_rate": 0.0006, + "loss": 2.0902, + 
"step": 69110 + }, + { + "epoch": 0.25782771200286475, + "grad_norm": 0.3202308118343353, + "learning_rate": 0.0006, + "loss": 2.148, + "step": 69120 + }, + { + "epoch": 0.25786501346582813, + "grad_norm": 0.4114714562892914, + "learning_rate": 0.0006, + "loss": 2.1898, + "step": 69130 + }, + { + "epoch": 0.2579023149287915, + "grad_norm": 0.39789247512817383, + "learning_rate": 0.0006, + "loss": 2.3552, + "step": 69140 + }, + { + "epoch": 0.2579396163917549, + "grad_norm": 0.33925196528434753, + "learning_rate": 0.0006, + "loss": 2.231, + "step": 69150 + }, + { + "epoch": 0.25797691785471827, + "grad_norm": 0.2837834060192108, + "learning_rate": 0.0006, + "loss": 2.1436, + "step": 69160 + }, + { + "epoch": 0.25801421931768165, + "grad_norm": 0.3948037028312683, + "learning_rate": 0.0006, + "loss": 2.1381, + "step": 69170 + }, + { + "epoch": 0.258051520780645, + "grad_norm": 0.384765088558197, + "learning_rate": 0.0006, + "loss": 2.2448, + "step": 69180 + }, + { + "epoch": 0.2580888222436084, + "grad_norm": 0.39348700642585754, + "learning_rate": 0.0006, + "loss": 2.2114, + "step": 69190 + }, + { + "epoch": 0.2581261237065718, + "grad_norm": 0.43290409445762634, + "learning_rate": 0.0006, + "loss": 2.1646, + "step": 69200 + }, + { + "epoch": 0.25816342516953517, + "grad_norm": 0.2865796685218811, + "learning_rate": 0.0006, + "loss": 2.3557, + "step": 69210 + }, + { + "epoch": 0.25820072663249855, + "grad_norm": 0.34151491522789, + "learning_rate": 0.0006, + "loss": 2.0702, + "step": 69220 + }, + { + "epoch": 0.2582380280954619, + "grad_norm": 0.3853791356086731, + "learning_rate": 0.0006, + "loss": 2.2811, + "step": 69230 + }, + { + "epoch": 0.2582753295584253, + "grad_norm": 0.7462456226348877, + "learning_rate": 0.0006, + "loss": 2.2789, + "step": 69240 + }, + { + "epoch": 0.2583126310213887, + "grad_norm": 0.304538369178772, + "learning_rate": 0.0006, + "loss": 2.1545, + "step": 69250 + }, + { + "epoch": 0.2583126310213887, + "eval_valid_loss": 
2.1740236282348633, + "eval_valid_loss/all": 2.0387635231018066, + "eval_valid_loss/end_span": 1.2356981039047241, + "eval_valid_perplexity/batch": 7.681105613708496, + "eval_valid_perplexity/end_span": 3.440779685974121, + "eval_valid_perplexity/fim": 2.459341049194336, + "eval_valid_perplexity/first_seq": 15.041335105895996, + "eval_valid_perplexity/last_seq": 8.595162391662598, + "eval_valid_perplexity/second_seq": 14.045761108398438, + "eval_valid_perplexity/seq": 8.668724060058594, + "eval_valid_reconstruction/all": 0.2988196313381195, + "eval_valid_reconstruction/end_span": 0.7141804695129395, + "eval_valid_reconstruction/fim": 0.1831131875514984, + "eval_valid_reconstruction/first_seq": 0.162941113114357, + "eval_valid_reconstruction/last_seq": 0.3401348292827606, + "eval_valid_reconstruction/second_seq": 0.18913616240024567, + "eval_valid_runtime": 441.2086, + "eval_valid_samples_per_second": 0.435, + "eval_valid_steps_per_second": 0.435, + "step": 69250 + }, + { + "epoch": 0.2583126310213887, + "eval_train_loss": 2.173398971557617, + "eval_train_loss/all": 2.012220859527588, + "eval_train_loss/end_span": 1.2041789293289185, + "eval_train_perplexity/batch": 7.479910850524902, + "eval_train_perplexity/end_span": 3.3340203762054443, + "eval_train_perplexity/fim": 2.1424691677093506, + "eval_train_perplexity/first_seq": 15.56221866607666, + "eval_train_perplexity/last_seq": 8.465222358703613, + "eval_train_perplexity/second_seq": 14.121846199035645, + "eval_train_perplexity/seq": 8.614590644836426, + "eval_train_reconstruction/all": 0.2879422903060913, + "eval_train_reconstruction/end_span": 0.7224674820899963, + "eval_train_reconstruction/fim": 0.15531614422798157, + "eval_train_reconstruction/first_seq": 0.1529371738433838, + "eval_train_reconstruction/last_seq": 0.3385331332683563, + "eval_train_reconstruction/second_seq": 0.18748117983341217, + "eval_train_runtime": 437.8642, + "eval_train_samples_per_second": 0.438, + "eval_train_steps_per_second": 0.438, 
+ "step": 69250 + }, + { + "epoch": 0.25834993248435206, + "grad_norm": 0.3209875822067261, + "learning_rate": 0.0006, + "loss": 2.1458, + "step": 69260 + }, + { + "epoch": 0.2583872339473154, + "grad_norm": 0.3500621020793915, + "learning_rate": 0.0006, + "loss": 2.1337, + "step": 69270 + }, + { + "epoch": 0.25842453541027877, + "grad_norm": 0.2278168499469757, + "learning_rate": 0.0006, + "loss": 2.1758, + "step": 69280 + }, + { + "epoch": 0.25846183687324215, + "grad_norm": 0.3556830883026123, + "learning_rate": 0.0006, + "loss": 2.1506, + "step": 69290 + }, + { + "epoch": 0.2584991383362055, + "grad_norm": 0.4548273980617523, + "learning_rate": 0.0006, + "loss": 2.3188, + "step": 69300 + }, + { + "epoch": 0.2585364397991689, + "grad_norm": 0.35688701272010803, + "learning_rate": 0.0006, + "loss": 2.0191, + "step": 69310 + }, + { + "epoch": 0.2585737412621323, + "grad_norm": 0.3763044774532318, + "learning_rate": 0.0006, + "loss": 2.2024, + "step": 69320 + }, + { + "epoch": 0.25861104272509566, + "grad_norm": 0.7324116230010986, + "learning_rate": 0.0006, + "loss": 2.0045, + "step": 69330 + }, + { + "epoch": 0.25864834418805904, + "grad_norm": 0.28157272934913635, + "learning_rate": 0.0006, + "loss": 2.1841, + "step": 69340 + }, + { + "epoch": 0.2586856456510224, + "grad_norm": 0.35089367628097534, + "learning_rate": 0.0006, + "loss": 2.139, + "step": 69350 + }, + { + "epoch": 0.2587229471139858, + "grad_norm": 0.5083507299423218, + "learning_rate": 0.0006, + "loss": 2.3464, + "step": 69360 + }, + { + "epoch": 0.2587602485769492, + "grad_norm": 0.30368515849113464, + "learning_rate": 0.0006, + "loss": 2.1312, + "step": 69370 + }, + { + "epoch": 0.25879755003991256, + "grad_norm": 0.43587568402290344, + "learning_rate": 0.0006, + "loss": 2.2119, + "step": 69380 + }, + { + "epoch": 0.25883485150287594, + "grad_norm": 0.4535144567489624, + "learning_rate": 0.0006, + "loss": 2.1499, + "step": 69390 + }, + { + "epoch": 0.2588721529658393, + "grad_norm": 
0.2546617090702057, + "learning_rate": 0.0006, + "loss": 2.2216, + "step": 69400 + }, + { + "epoch": 0.2589094544288027, + "grad_norm": 0.36176934838294983, + "learning_rate": 0.0006, + "loss": 2.3478, + "step": 69410 + }, + { + "epoch": 0.2589467558917661, + "grad_norm": 0.31548842787742615, + "learning_rate": 0.0006, + "loss": 2.2312, + "step": 69420 + }, + { + "epoch": 0.25898405735472946, + "grad_norm": 0.3770500719547272, + "learning_rate": 0.0006, + "loss": 2.237, + "step": 69430 + }, + { + "epoch": 0.25902135881769284, + "grad_norm": 0.3058531880378723, + "learning_rate": 0.0006, + "loss": 2.0014, + "step": 69440 + }, + { + "epoch": 0.2590586602806562, + "grad_norm": 0.2673278748989105, + "learning_rate": 0.0006, + "loss": 2.1712, + "step": 69450 + }, + { + "epoch": 0.2590959617436196, + "grad_norm": 0.3263632357120514, + "learning_rate": 0.0006, + "loss": 2.2036, + "step": 69460 + }, + { + "epoch": 0.259133263206583, + "grad_norm": 0.32123610377311707, + "learning_rate": 0.0006, + "loss": 2.0355, + "step": 69470 + }, + { + "epoch": 0.25917056466954635, + "grad_norm": 0.39569905400276184, + "learning_rate": 0.0006, + "loss": 2.0381, + "step": 69480 + }, + { + "epoch": 0.25920786613250973, + "grad_norm": 0.38556042313575745, + "learning_rate": 0.0006, + "loss": 2.2378, + "step": 69490 + }, + { + "epoch": 0.2592451675954731, + "grad_norm": 0.3305649161338806, + "learning_rate": 0.0006, + "loss": 2.3926, + "step": 69500 + }, + { + "epoch": 0.2592451675954731, + "eval_valid_loss": 2.173684597015381, + "eval_valid_loss/all": 2.0383379459381104, + "eval_valid_loss/end_span": 1.1890051364898682, + "eval_valid_perplexity/batch": 7.677837371826172, + "eval_valid_perplexity/end_span": 3.2838127613067627, + "eval_valid_perplexity/fim": 2.220777988433838, + "eval_valid_perplexity/first_seq": 15.029082298278809, + "eval_valid_perplexity/last_seq": 8.677911758422852, + "eval_valid_perplexity/second_seq": 13.686155319213867, + "eval_valid_perplexity/seq": 
8.662219047546387, + "eval_valid_reconstruction/all": 0.2991957366466522, + "eval_valid_reconstruction/end_span": 0.7237911224365234, + "eval_valid_reconstruction/fim": 0.16339245438575745, + "eval_valid_reconstruction/first_seq": 0.16473287343978882, + "eval_valid_reconstruction/last_seq": 0.3356972932815552, + "eval_valid_reconstruction/second_seq": 0.19850802421569824, + "eval_valid_runtime": 442.313, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 69500 + }, + { + "epoch": 0.2592451675954731, + "eval_train_loss": 2.1712262630462646, + "eval_train_loss/all": 2.0102813243865967, + "eval_train_loss/end_span": 1.1503844261169434, + "eval_train_perplexity/batch": 7.465417385101318, + "eval_train_perplexity/end_span": 3.159407138824463, + "eval_train_perplexity/fim": 2.329862117767334, + "eval_train_perplexity/first_seq": 15.175440788269043, + "eval_train_perplexity/last_seq": 8.596749305725098, + "eval_train_perplexity/second_seq": 14.607356071472168, + "eval_train_perplexity/seq": 8.59548568725586, + "eval_train_reconstruction/all": 0.2886776626110077, + "eval_train_reconstruction/end_span": 0.7364149689674377, + "eval_train_reconstruction/fim": 0.17350006103515625, + "eval_train_reconstruction/first_seq": 0.1617620438337326, + "eval_train_reconstruction/last_seq": 0.3395230174064636, + "eval_train_reconstruction/second_seq": 0.1761721968650818, + "eval_train_runtime": 440.7001, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 69500 + }, + { + "epoch": 0.2592824690584365, + "grad_norm": 0.3155306875705719, + "learning_rate": 0.0006, + "loss": 2.1107, + "step": 69510 + }, + { + "epoch": 0.2593197705213999, + "grad_norm": 0.29621848464012146, + "learning_rate": 0.0006, + "loss": 2.2707, + "step": 69520 + }, + { + "epoch": 0.25935707198436325, + "grad_norm": 0.40441030263900757, + "learning_rate": 0.0006, + "loss": 2.1892, + "step": 69530 + }, + { + "epoch": 0.25939437344732663, + 
"grad_norm": 0.2690236270427704, + "learning_rate": 0.0006, + "loss": 2.2818, + "step": 69540 + }, + { + "epoch": 0.25943167491028996, + "grad_norm": 0.27445995807647705, + "learning_rate": 0.0006, + "loss": 2.3118, + "step": 69550 + }, + { + "epoch": 0.25946897637325333, + "grad_norm": 0.32355862855911255, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 69560 + }, + { + "epoch": 0.2595062778362167, + "grad_norm": 0.38833853602409363, + "learning_rate": 0.0006, + "loss": 2.0541, + "step": 69570 + }, + { + "epoch": 0.2595435792991801, + "grad_norm": 0.373622328042984, + "learning_rate": 0.0006, + "loss": 2.2862, + "step": 69580 + }, + { + "epoch": 0.2595808807621435, + "grad_norm": 7.156891822814941, + "learning_rate": 0.0006, + "loss": 2.2484, + "step": 69590 + }, + { + "epoch": 0.25961818222510685, + "grad_norm": 0.48298749327659607, + "learning_rate": 0.0006, + "loss": 2.2524, + "step": 69600 + }, + { + "epoch": 0.25965548368807023, + "grad_norm": 0.262930303812027, + "learning_rate": 0.0006, + "loss": 2.4309, + "step": 69610 + }, + { + "epoch": 0.2596927851510336, + "grad_norm": 0.3692421615123749, + "learning_rate": 0.0006, + "loss": 2.1856, + "step": 69620 + }, + { + "epoch": 0.259730086613997, + "grad_norm": 0.2766210436820984, + "learning_rate": 0.0006, + "loss": 2.1028, + "step": 69630 + }, + { + "epoch": 0.25976738807696037, + "grad_norm": 0.3066149652004242, + "learning_rate": 0.0006, + "loss": 2.1455, + "step": 69640 + }, + { + "epoch": 0.25980468953992375, + "grad_norm": 0.3022131323814392, + "learning_rate": 0.0006, + "loss": 2.195, + "step": 69650 + }, + { + "epoch": 0.25984199100288713, + "grad_norm": 0.2822599411010742, + "learning_rate": 0.0006, + "loss": 2.2288, + "step": 69660 + }, + { + "epoch": 0.2598792924658505, + "grad_norm": 0.3493376672267914, + "learning_rate": 0.0006, + "loss": 2.2148, + "step": 69670 + }, + { + "epoch": 0.2599165939288139, + "grad_norm": 0.6055523157119751, + "learning_rate": 0.0006, + "loss": 2.2629, + "step": 
69680 + }, + { + "epoch": 0.25995389539177727, + "grad_norm": 0.310552716255188, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 69690 + }, + { + "epoch": 0.25999119685474065, + "grad_norm": 0.4051542282104492, + "learning_rate": 0.0006, + "loss": 2.1593, + "step": 69700 + }, + { + "epoch": 0.260028498317704, + "grad_norm": 0.311389297246933, + "learning_rate": 0.0006, + "loss": 2.231, + "step": 69710 + }, + { + "epoch": 0.2600657997806674, + "grad_norm": 0.4079144299030304, + "learning_rate": 0.0006, + "loss": 2.1336, + "step": 69720 + }, + { + "epoch": 0.2601031012436308, + "grad_norm": 0.8404681086540222, + "learning_rate": 0.0006, + "loss": 2.1358, + "step": 69730 + }, + { + "epoch": 0.26014040270659416, + "grad_norm": 0.33709749579429626, + "learning_rate": 0.0006, + "loss": 2.2258, + "step": 69740 + }, + { + "epoch": 0.26017770416955754, + "grad_norm": 0.30034318566322327, + "learning_rate": 0.0006, + "loss": 2.0855, + "step": 69750 + }, + { + "epoch": 0.26017770416955754, + "eval_valid_loss": 2.174180746078491, + "eval_valid_loss/all": 2.039039134979248, + "eval_valid_loss/end_span": 1.2374449968338013, + "eval_valid_perplexity/batch": 7.683223247528076, + "eval_valid_perplexity/end_span": 3.446795701980591, + "eval_valid_perplexity/fim": 2.5642473697662354, + "eval_valid_perplexity/first_seq": 14.811286926269531, + "eval_valid_perplexity/last_seq": 8.872227668762207, + "eval_valid_perplexity/second_seq": 13.766461372375488, + "eval_valid_perplexity/seq": 8.669622421264648, + "eval_valid_reconstruction/all": 0.2989513576030731, + "eval_valid_reconstruction/end_span": 0.7098358869552612, + "eval_valid_reconstruction/fim": 0.19183355569839478, + "eval_valid_reconstruction/first_seq": 0.16609464585781097, + "eval_valid_reconstruction/last_seq": 0.330147922039032, + "eval_valid_reconstruction/second_seq": 0.19598376750946045, + "eval_valid_runtime": 442.6576, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 69750 
+ }, + { + "epoch": 0.26017770416955754, + "eval_train_loss": 2.172498941421509, + "eval_train_loss/all": 2.011310577392578, + "eval_train_loss/end_span": 1.1997884511947632, + "eval_train_perplexity/batch": 7.473104953765869, + "eval_train_perplexity/end_span": 3.3194146156311035, + "eval_train_perplexity/fim": 1.9533367156982422, + "eval_train_perplexity/first_seq": 15.748139381408691, + "eval_train_perplexity/last_seq": 8.545886993408203, + "eval_train_perplexity/second_seq": 14.065133094787598, + "eval_train_perplexity/seq": 8.60969066619873, + "eval_train_reconstruction/all": 0.288163423538208, + "eval_train_reconstruction/end_span": 0.7208293676376343, + "eval_train_reconstruction/fim": 0.13647505640983582, + "eval_train_reconstruction/first_seq": 0.14832977950572968, + "eval_train_reconstruction/last_seq": 0.33878517150878906, + "eval_train_reconstruction/second_seq": 0.18640661239624023, + "eval_train_runtime": 441.3436, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 69750 + }, + { + "epoch": 0.2602150056325209, + "grad_norm": 0.3881726562976837, + "learning_rate": 0.0006, + "loss": 2.1515, + "step": 69760 + }, + { + "epoch": 0.2602523070954843, + "grad_norm": 0.23149509727954865, + "learning_rate": 0.0006, + "loss": 2.2287, + "step": 69770 + }, + { + "epoch": 0.2602896085584477, + "grad_norm": 0.3465954661369324, + "learning_rate": 0.0006, + "loss": 2.009, + "step": 69780 + }, + { + "epoch": 0.26032691002141106, + "grad_norm": 0.31684646010398865, + "learning_rate": 0.0006, + "loss": 2.0482, + "step": 69790 + }, + { + "epoch": 0.26036421148437444, + "grad_norm": 0.34797126054763794, + "learning_rate": 0.0006, + "loss": 2.1274, + "step": 69800 + }, + { + "epoch": 0.2604015129473378, + "grad_norm": 15.150010108947754, + "learning_rate": 0.0006, + "loss": 2.1027, + "step": 69810 + }, + { + "epoch": 0.26043881441030114, + "grad_norm": 0.5422221422195435, + "learning_rate": 0.0006, + "loss": 2.1974, + "step": 69820 + 
}, + { + "epoch": 0.2604761158732645, + "grad_norm": 0.28047534823417664, + "learning_rate": 0.0006, + "loss": 2.1366, + "step": 69830 + }, + { + "epoch": 0.2605134173362279, + "grad_norm": 0.29830917716026306, + "learning_rate": 0.0006, + "loss": 2.0848, + "step": 69840 + }, + { + "epoch": 0.2605507187991913, + "grad_norm": 0.39669865369796753, + "learning_rate": 0.0006, + "loss": 2.2618, + "step": 69850 + }, + { + "epoch": 0.26058802026215466, + "grad_norm": 0.19334633648395538, + "learning_rate": 0.0006, + "loss": 2.1198, + "step": 69860 + }, + { + "epoch": 0.26062532172511804, + "grad_norm": 0.24065329134464264, + "learning_rate": 0.0006, + "loss": 2.0801, + "step": 69870 + }, + { + "epoch": 0.2606626231880814, + "grad_norm": 0.23842404782772064, + "learning_rate": 0.0006, + "loss": 2.2506, + "step": 69880 + }, + { + "epoch": 0.2606999246510448, + "grad_norm": 0.3849699795246124, + "learning_rate": 0.0006, + "loss": 2.2857, + "step": 69890 + }, + { + "epoch": 0.2607372261140082, + "grad_norm": 0.339061439037323, + "learning_rate": 0.0006, + "loss": 2.096, + "step": 69900 + }, + { + "epoch": 0.26077452757697156, + "grad_norm": 0.33712440729141235, + "learning_rate": 0.0006, + "loss": 2.0886, + "step": 69910 + }, + { + "epoch": 0.26081182903993494, + "grad_norm": 0.31797271966934204, + "learning_rate": 0.0006, + "loss": 2.168, + "step": 69920 + }, + { + "epoch": 0.2608491305028983, + "grad_norm": 0.26548802852630615, + "learning_rate": 0.0006, + "loss": 2.2855, + "step": 69930 + }, + { + "epoch": 0.2608864319658617, + "grad_norm": 0.3010738492012024, + "learning_rate": 0.0006, + "loss": 2.2642, + "step": 69940 + }, + { + "epoch": 0.2609237334288251, + "grad_norm": 0.39593297243118286, + "learning_rate": 0.0006, + "loss": 2.1359, + "step": 69950 + }, + { + "epoch": 0.26096103489178846, + "grad_norm": 0.36289358139038086, + "learning_rate": 0.0006, + "loss": 2.1355, + "step": 69960 + }, + { + "epoch": 0.26099833635475184, + "grad_norm": 0.2872433364391327, + 
"learning_rate": 0.0006, + "loss": 2.1228, + "step": 69970 + }, + { + "epoch": 0.2610356378177152, + "grad_norm": 0.24371351301670074, + "learning_rate": 0.0006, + "loss": 2.3174, + "step": 69980 + }, + { + "epoch": 0.2610729392806786, + "grad_norm": 0.2744345963001251, + "learning_rate": 0.0006, + "loss": 2.1679, + "step": 69990 + }, + { + "epoch": 0.261110240743642, + "grad_norm": 0.36441826820373535, + "learning_rate": 0.0006, + "loss": 2.2702, + "step": 70000 + }, + { + "epoch": 0.261110240743642, + "eval_valid_loss": 2.17525053024292, + "eval_valid_loss/all": 2.039719343185425, + "eval_valid_loss/end_span": 1.2497225999832153, + "eval_valid_perplexity/batch": 7.688451290130615, + "eval_valid_perplexity/end_span": 3.489374876022339, + "eval_valid_perplexity/fim": 2.3238611221313477, + "eval_valid_perplexity/first_seq": 14.955018043518066, + "eval_valid_perplexity/last_seq": 8.9224214553833, + "eval_valid_perplexity/second_seq": 13.724448204040527, + "eval_valid_perplexity/seq": 8.671939849853516, + "eval_valid_reconstruction/all": 0.29871711134910583, + "eval_valid_reconstruction/end_span": 0.7062385678291321, + "eval_valid_reconstruction/fim": 0.171255424618721, + "eval_valid_reconstruction/first_seq": 0.16303372383117676, + "eval_valid_reconstruction/last_seq": 0.3265743553638458, + "eval_valid_reconstruction/second_seq": 0.1994773894548416, + "eval_valid_runtime": 444.1035, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 70000 + }, + { + "epoch": 0.261110240743642, + "eval_train_loss": 2.175004720687866, + "eval_train_loss/all": 2.0137948989868164, + "eval_train_loss/end_span": 1.2028523683547974, + "eval_train_perplexity/batch": 7.491693496704102, + "eval_train_perplexity/end_span": 3.3296005725860596, + "eval_train_perplexity/fim": 2.02938175201416, + "eval_train_perplexity/first_seq": 15.280488967895508, + "eval_train_perplexity/last_seq": 8.698843955993652, + "eval_train_perplexity/second_seq": 
14.068047523498535, + "eval_train_perplexity/seq": 8.628769874572754, + "eval_train_reconstruction/all": 0.2875477969646454, + "eval_train_reconstruction/end_span": 0.7204757332801819, + "eval_train_reconstruction/fim": 0.14375941455364227, + "eval_train_reconstruction/first_seq": 0.15610064566135406, + "eval_train_reconstruction/last_seq": 0.33174052834510803, + "eval_train_reconstruction/second_seq": 0.18691660463809967, + "eval_train_runtime": 440.3254, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 70000 + }, + { + "epoch": 0.26114754220660535, + "grad_norm": 0.5129245519638062, + "learning_rate": 0.0006, + "loss": 2.1241, + "step": 70010 + }, + { + "epoch": 0.26118484366956873, + "grad_norm": 0.19310788810253143, + "learning_rate": 0.0006, + "loss": 2.3083, + "step": 70020 + }, + { + "epoch": 0.2612221451325321, + "grad_norm": 0.5195063948631287, + "learning_rate": 0.0006, + "loss": 2.1809, + "step": 70030 + }, + { + "epoch": 0.2612594465954955, + "grad_norm": 0.41057515144348145, + "learning_rate": 0.0006, + "loss": 2.1384, + "step": 70040 + }, + { + "epoch": 0.26129674805845887, + "grad_norm": 12.224466323852539, + "learning_rate": 0.0006, + "loss": 2.3124, + "step": 70050 + }, + { + "epoch": 0.26133404952142225, + "grad_norm": 0.3610404133796692, + "learning_rate": 0.0006, + "loss": 2.0407, + "step": 70060 + }, + { + "epoch": 0.26137135098438563, + "grad_norm": 0.39367225766181946, + "learning_rate": 0.0006, + "loss": 2.3064, + "step": 70070 + }, + { + "epoch": 0.261408652447349, + "grad_norm": 0.4310288429260254, + "learning_rate": 0.0006, + "loss": 2.3628, + "step": 70080 + }, + { + "epoch": 0.2614459539103124, + "grad_norm": 0.9238101840019226, + "learning_rate": 0.0006, + "loss": 2.1426, + "step": 70090 + }, + { + "epoch": 0.2614832553732757, + "grad_norm": 0.48947611451148987, + "learning_rate": 0.0006, + "loss": 2.2582, + "step": 70100 + }, + { + "epoch": 0.2615205568362391, + "grad_norm": 
0.40298911929130554, + "learning_rate": 0.0006, + "loss": 2.2535, + "step": 70110 + }, + { + "epoch": 0.2615578582992025, + "grad_norm": 0.2559771239757538, + "learning_rate": 0.0006, + "loss": 2.2221, + "step": 70120 + }, + { + "epoch": 0.26159515976216585, + "grad_norm": 0.42671531438827515, + "learning_rate": 0.0006, + "loss": 2.2128, + "step": 70130 + }, + { + "epoch": 0.26163246122512923, + "grad_norm": 0.433657705783844, + "learning_rate": 0.0006, + "loss": 2.153, + "step": 70140 + }, + { + "epoch": 0.2616697626880926, + "grad_norm": 0.25182414054870605, + "learning_rate": 0.0006, + "loss": 2.3293, + "step": 70150 + }, + { + "epoch": 0.261707064151056, + "grad_norm": 0.3910868167877197, + "learning_rate": 0.0006, + "loss": 2.2388, + "step": 70160 + }, + { + "epoch": 0.26174436561401937, + "grad_norm": 0.31224948167800903, + "learning_rate": 0.0006, + "loss": 2.2485, + "step": 70170 + }, + { + "epoch": 0.26178166707698275, + "grad_norm": 0.38469552993774414, + "learning_rate": 0.0006, + "loss": 2.3155, + "step": 70180 + }, + { + "epoch": 0.26181896853994613, + "grad_norm": 0.3261687457561493, + "learning_rate": 0.0006, + "loss": 2.232, + "step": 70190 + }, + { + "epoch": 0.2618562700029095, + "grad_norm": 0.27963507175445557, + "learning_rate": 0.0006, + "loss": 2.1723, + "step": 70200 + }, + { + "epoch": 0.2618935714658729, + "grad_norm": 0.2787613272666931, + "learning_rate": 0.0006, + "loss": 2.2736, + "step": 70210 + }, + { + "epoch": 0.26193087292883627, + "grad_norm": 0.40789923071861267, + "learning_rate": 0.0006, + "loss": 2.2197, + "step": 70220 + }, + { + "epoch": 0.26196817439179965, + "grad_norm": 0.4228828549385071, + "learning_rate": 0.0006, + "loss": 2.3224, + "step": 70230 + }, + { + "epoch": 0.262005475854763, + "grad_norm": 0.4585232436656952, + "learning_rate": 0.0006, + "loss": 2.0415, + "step": 70240 + }, + { + "epoch": 0.2620427773177264, + "grad_norm": 0.4273920953273773, + "learning_rate": 0.0006, + "loss": 2.2106, + "step": 70250 + }, 
+ { + "epoch": 0.2620427773177264, + "eval_valid_loss": 2.177008628845215, + "eval_valid_loss/all": 2.0418426990509033, + "eval_valid_loss/end_span": 1.1824586391448975, + "eval_valid_perplexity/batch": 7.704793930053711, + "eval_valid_perplexity/end_span": 3.262385368347168, + "eval_valid_perplexity/fim": 2.29190731048584, + "eval_valid_perplexity/first_seq": 14.518325805664062, + "eval_valid_perplexity/last_seq": 8.683764457702637, + "eval_valid_perplexity/second_seq": 14.076966285705566, + "eval_valid_perplexity/seq": 8.692063331604004, + "eval_valid_reconstruction/all": 0.2980682849884033, + "eval_valid_reconstruction/end_span": 0.7251846790313721, + "eval_valid_reconstruction/fim": 0.16832105815410614, + "eval_valid_reconstruction/first_seq": 0.17681440711021423, + "eval_valid_reconstruction/last_seq": 0.3388378918170929, + "eval_valid_reconstruction/second_seq": 0.18907728791236877, + "eval_valid_runtime": 439.832, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 70250 + }, + { + "epoch": 0.2620427773177264, + "eval_train_loss": 2.176536798477173, + "eval_train_loss/all": 2.0149760246276855, + "eval_train_loss/end_span": 1.1468653678894043, + "eval_train_perplexity/batch": 7.500547409057617, + "eval_train_perplexity/end_span": 3.148308753967285, + "eval_train_perplexity/fim": 1.9622405767440796, + "eval_train_perplexity/first_seq": 15.514761924743652, + "eval_train_perplexity/last_seq": 8.948768615722656, + "eval_train_perplexity/second_seq": 14.153847694396973, + "eval_train_perplexity/seq": 8.63541030883789, + "eval_train_reconstruction/all": 0.2870703637599945, + "eval_train_reconstruction/end_span": 0.7371185421943665, + "eval_train_reconstruction/fim": 0.13811412453651428, + "eval_train_reconstruction/first_seq": 0.1515771746635437, + "eval_train_reconstruction/last_seq": 0.3248847424983978, + "eval_train_reconstruction/second_seq": 0.1867869347333908, + "eval_train_runtime": 439.7, + 
"eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 70250 + }, + { + "epoch": 0.2620800787806898, + "grad_norm": 0.30516043305397034, + "learning_rate": 0.0006, + "loss": 1.8833, + "step": 70260 + }, + { + "epoch": 0.26211738024365316, + "grad_norm": 0.22604654729366302, + "learning_rate": 0.0006, + "loss": 2.1011, + "step": 70270 + }, + { + "epoch": 0.26215468170661654, + "grad_norm": 0.28378188610076904, + "learning_rate": 0.0006, + "loss": 2.3029, + "step": 70280 + }, + { + "epoch": 0.2621919831695799, + "grad_norm": 0.39116379618644714, + "learning_rate": 0.0006, + "loss": 2.2472, + "step": 70290 + }, + { + "epoch": 0.2622292846325433, + "grad_norm": 0.3174382150173187, + "learning_rate": 0.0006, + "loss": 2.2112, + "step": 70300 + }, + { + "epoch": 0.2622665860955067, + "grad_norm": 0.2313239574432373, + "learning_rate": 0.0006, + "loss": 2.3492, + "step": 70310 + }, + { + "epoch": 0.26230388755847006, + "grad_norm": 0.40612339973449707, + "learning_rate": 0.0006, + "loss": 2.0916, + "step": 70320 + }, + { + "epoch": 0.26234118902143344, + "grad_norm": 0.36341047286987305, + "learning_rate": 0.0006, + "loss": 2.3165, + "step": 70330 + }, + { + "epoch": 0.2623784904843968, + "grad_norm": 0.3781681954860687, + "learning_rate": 0.0006, + "loss": 2.2086, + "step": 70340 + }, + { + "epoch": 0.2624157919473602, + "grad_norm": 0.3095043897628784, + "learning_rate": 0.0006, + "loss": 2.149, + "step": 70350 + }, + { + "epoch": 0.2624530934103236, + "grad_norm": 3.239800214767456, + "learning_rate": 0.0006, + "loss": 2.1244, + "step": 70360 + }, + { + "epoch": 0.2624903948732869, + "grad_norm": 0.32190605998039246, + "learning_rate": 0.0006, + "loss": 2.2485, + "step": 70370 + }, + { + "epoch": 0.2625276963362503, + "grad_norm": 1.3087761402130127, + "learning_rate": 0.0006, + "loss": 2.2956, + "step": 70380 + }, + { + "epoch": 0.26256499779921366, + "grad_norm": 0.4796418845653534, + "learning_rate": 0.0006, + "loss": 2.236, + 
"step": 70390 + }, + { + "epoch": 0.26260229926217704, + "grad_norm": 0.4191867411136627, + "learning_rate": 0.0006, + "loss": 2.2412, + "step": 70400 + }, + { + "epoch": 0.2626396007251404, + "grad_norm": 0.3343067765235901, + "learning_rate": 0.0006, + "loss": 2.271, + "step": 70410 + }, + { + "epoch": 0.2626769021881038, + "grad_norm": 0.3531298041343689, + "learning_rate": 0.0006, + "loss": 2.065, + "step": 70420 + }, + { + "epoch": 0.2627142036510672, + "grad_norm": 0.2169564962387085, + "learning_rate": 0.0006, + "loss": 2.1132, + "step": 70430 + }, + { + "epoch": 0.26275150511403056, + "grad_norm": 0.395227313041687, + "learning_rate": 0.0006, + "loss": 2.0619, + "step": 70440 + }, + { + "epoch": 0.26278880657699394, + "grad_norm": 0.7383837103843689, + "learning_rate": 0.0006, + "loss": 2.2121, + "step": 70450 + }, + { + "epoch": 0.2628261080399573, + "grad_norm": 0.2646317183971405, + "learning_rate": 0.0006, + "loss": 2.2846, + "step": 70460 + }, + { + "epoch": 0.2628634095029207, + "grad_norm": 0.4520869553089142, + "learning_rate": 0.0006, + "loss": 2.131, + "step": 70470 + }, + { + "epoch": 0.2629007109658841, + "grad_norm": 0.5522571206092834, + "learning_rate": 0.0006, + "loss": 2.3431, + "step": 70480 + }, + { + "epoch": 0.26293801242884746, + "grad_norm": 0.367950975894928, + "learning_rate": 0.0006, + "loss": 2.3291, + "step": 70490 + }, + { + "epoch": 0.26297531389181084, + "grad_norm": 0.42714717984199524, + "learning_rate": 0.0006, + "loss": 1.9772, + "step": 70500 + }, + { + "epoch": 0.26297531389181084, + "eval_valid_loss": 2.17325496673584, + "eval_valid_loss/all": 2.0383172035217285, + "eval_valid_loss/end_span": 1.3046071529388428, + "eval_valid_perplexity/batch": 7.677678108215332, + "eval_valid_perplexity/end_span": 3.6862406730651855, + "eval_valid_perplexity/fim": 2.3588180541992188, + "eval_valid_perplexity/first_seq": 14.869150161743164, + "eval_valid_perplexity/last_seq": 9.260125160217285, + "eval_valid_perplexity/second_seq": 
13.881355285644531, + "eval_valid_perplexity/seq": 8.672874450683594, + "eval_valid_reconstruction/all": 0.29921403527259827, + "eval_valid_reconstruction/end_span": 0.6959015727043152, + "eval_valid_reconstruction/fim": 0.1756524294614792, + "eval_valid_reconstruction/first_seq": 0.167583629488945, + "eval_valid_reconstruction/last_seq": 0.31616243720054626, + "eval_valid_reconstruction/second_seq": 0.1940581053495407, + "eval_valid_runtime": 443.6548, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 70500 + }, + { + "epoch": 0.26297531389181084, + "eval_train_loss": 2.171715497970581, + "eval_train_loss/all": 2.0108866691589355, + "eval_train_loss/end_span": 1.2582978010177612, + "eval_train_perplexity/batch": 7.469937801361084, + "eval_train_perplexity/end_span": 3.519425630569458, + "eval_train_perplexity/fim": 2.082066535949707, + "eval_train_perplexity/first_seq": 15.41983413696289, + "eval_train_perplexity/last_seq": 8.55129623413086, + "eval_train_perplexity/second_seq": 14.021306037902832, + "eval_train_perplexity/seq": 8.605775833129883, + "eval_train_reconstruction/all": 0.28848835825920105, + "eval_train_reconstruction/end_span": 0.7086471915245056, + "eval_train_reconstruction/fim": 0.14979195594787598, + "eval_train_reconstruction/first_seq": 0.1510038673877716, + "eval_train_reconstruction/last_seq": 0.3410453498363495, + "eval_train_reconstruction/second_seq": 0.18726405501365662, + "eval_train_runtime": 442.0849, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 70500 + }, + { + "epoch": 0.2630126153547742, + "grad_norm": 0.5303149819374084, + "learning_rate": 0.0006, + "loss": 2.0447, + "step": 70510 + }, + { + "epoch": 0.2630499168177376, + "grad_norm": 0.31071388721466064, + "learning_rate": 0.0006, + "loss": 2.0621, + "step": 70520 + }, + { + "epoch": 0.263087218280701, + "grad_norm": 0.4306846857070923, + "learning_rate": 0.0006, + "loss": 2.2537, + "step": 
70530 + }, + { + "epoch": 0.26312451974366435, + "grad_norm": 0.2793137729167938, + "learning_rate": 0.0006, + "loss": 2.3467, + "step": 70540 + }, + { + "epoch": 0.26316182120662773, + "grad_norm": 0.33777719736099243, + "learning_rate": 0.0006, + "loss": 2.1364, + "step": 70550 + }, + { + "epoch": 0.2631991226695911, + "grad_norm": 0.2924858033657074, + "learning_rate": 0.0006, + "loss": 2.3168, + "step": 70560 + }, + { + "epoch": 0.2632364241325545, + "grad_norm": 0.28496673703193665, + "learning_rate": 0.0006, + "loss": 2.1978, + "step": 70570 + }, + { + "epoch": 0.26327372559551787, + "grad_norm": 0.2582199275493622, + "learning_rate": 0.0006, + "loss": 2.3162, + "step": 70580 + }, + { + "epoch": 0.26331102705848125, + "grad_norm": 0.4096781313419342, + "learning_rate": 0.0006, + "loss": 1.9946, + "step": 70590 + }, + { + "epoch": 0.26334832852144463, + "grad_norm": 0.25247853994369507, + "learning_rate": 0.0006, + "loss": 2.3231, + "step": 70600 + }, + { + "epoch": 0.263385629984408, + "grad_norm": 0.4077681303024292, + "learning_rate": 0.0006, + "loss": 2.078, + "step": 70610 + }, + { + "epoch": 0.2634229314473714, + "grad_norm": 0.25435328483581543, + "learning_rate": 0.0006, + "loss": 2.3182, + "step": 70620 + }, + { + "epoch": 0.26346023291033477, + "grad_norm": 0.36743229627609253, + "learning_rate": 0.0006, + "loss": 2.3754, + "step": 70630 + }, + { + "epoch": 0.26349753437329815, + "grad_norm": 0.29292798042297363, + "learning_rate": 0.0006, + "loss": 2.2172, + "step": 70640 + }, + { + "epoch": 0.26353483583626147, + "grad_norm": 0.3632417917251587, + "learning_rate": 0.0006, + "loss": 2.3347, + "step": 70650 + }, + { + "epoch": 0.26357213729922485, + "grad_norm": 0.23597502708435059, + "learning_rate": 0.0006, + "loss": 2.0777, + "step": 70660 + }, + { + "epoch": 0.26360943876218823, + "grad_norm": 0.3221082389354706, + "learning_rate": 0.0006, + "loss": 2.2203, + "step": 70670 + }, + { + "epoch": 0.2636467402251516, + "grad_norm": 
0.42619678378105164, + "learning_rate": 0.0006, + "loss": 2.2663, + "step": 70680 + }, + { + "epoch": 0.263684041688115, + "grad_norm": 0.30030539631843567, + "learning_rate": 0.0006, + "loss": 2.1416, + "step": 70690 + }, + { + "epoch": 0.26372134315107837, + "grad_norm": 0.2266189306974411, + "learning_rate": 0.0006, + "loss": 2.3501, + "step": 70700 + }, + { + "epoch": 0.26375864461404175, + "grad_norm": 0.40321746468544006, + "learning_rate": 0.0006, + "loss": 2.2343, + "step": 70710 + }, + { + "epoch": 0.26379594607700513, + "grad_norm": 0.5948221683502197, + "learning_rate": 0.0006, + "loss": 2.0696, + "step": 70720 + }, + { + "epoch": 0.2638332475399685, + "grad_norm": 0.44807732105255127, + "learning_rate": 0.0006, + "loss": 2.1009, + "step": 70730 + }, + { + "epoch": 0.2638705490029319, + "grad_norm": 0.22612442076206207, + "learning_rate": 0.0006, + "loss": 2.1295, + "step": 70740 + }, + { + "epoch": 0.26390785046589527, + "grad_norm": 0.3283025026321411, + "learning_rate": 0.0006, + "loss": 2.1289, + "step": 70750 + }, + { + "epoch": 0.26390785046589527, + "eval_valid_loss": 2.177398920059204, + "eval_valid_loss/all": 2.0420567989349365, + "eval_valid_loss/end_span": 1.261631727218628, + "eval_valid_perplexity/batch": 7.7064433097839355, + "eval_valid_perplexity/end_span": 3.5311787128448486, + "eval_valid_perplexity/fim": 2.172248363494873, + "eval_valid_perplexity/first_seq": 14.761251449584961, + "eval_valid_perplexity/last_seq": 9.247121810913086, + "eval_valid_perplexity/second_seq": 13.48827075958252, + "eval_valid_perplexity/seq": 8.696822166442871, + "eval_valid_reconstruction/all": 0.2981431484222412, + "eval_valid_reconstruction/end_span": 0.7063668966293335, + "eval_valid_reconstruction/fim": 0.1579350084066391, + "eval_valid_reconstruction/first_seq": 0.16962163150310516, + "eval_valid_reconstruction/last_seq": 0.3210279941558838, + "eval_valid_reconstruction/second_seq": 0.20406189560890198, + "eval_valid_runtime": 502.049, + 
"eval_valid_samples_per_second": 0.382, + "eval_valid_steps_per_second": 0.382, + "step": 70750 + }, + { + "epoch": 0.26390785046589527, + "eval_train_loss": 2.17490553855896, + "eval_train_loss/all": 2.0132181644439697, + "eval_train_loss/end_span": 1.2234768867492676, + "eval_train_perplexity/batch": 7.487374305725098, + "eval_train_perplexity/end_span": 3.3989851474761963, + "eval_train_perplexity/fim": 2.0501270294189453, + "eval_train_perplexity/first_seq": 15.82113265991211, + "eval_train_perplexity/last_seq": 8.678391456604004, + "eval_train_perplexity/second_seq": 13.95440673828125, + "eval_train_perplexity/seq": 8.624754905700684, + "eval_train_reconstruction/all": 0.2878495454788208, + "eval_train_reconstruction/end_span": 0.7168136835098267, + "eval_train_reconstruction/fim": 0.14614510536193848, + "eval_train_reconstruction/first_seq": 0.1449112743139267, + "eval_train_reconstruction/last_seq": 0.3376804292201996, + "eval_train_reconstruction/second_seq": 0.18877451121807098, + "eval_train_runtime": 440.0711, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 70750 + }, + { + "epoch": 0.26394515192885865, + "grad_norm": 0.505945086479187, + "learning_rate": 0.0006, + "loss": 2.1758, + "step": 70760 + }, + { + "epoch": 0.263982453391822, + "grad_norm": 0.3079427480697632, + "learning_rate": 0.0006, + "loss": 2.2888, + "step": 70770 + }, + { + "epoch": 0.2640197548547854, + "grad_norm": 0.2943854331970215, + "learning_rate": 0.0006, + "loss": 2.1517, + "step": 70780 + }, + { + "epoch": 0.2640570563177488, + "grad_norm": 0.26557978987693787, + "learning_rate": 0.0006, + "loss": 2.2082, + "step": 70790 + }, + { + "epoch": 0.26409435778071216, + "grad_norm": 0.5534644722938538, + "learning_rate": 0.0006, + "loss": 2.4819, + "step": 70800 + }, + { + "epoch": 0.26413165924367554, + "grad_norm": 0.4037012755870819, + "learning_rate": 0.0006, + "loss": 2.1165, + "step": 70810 + }, + { + "epoch": 0.2641689607066389, + 
"grad_norm": 0.4069984257221222, + "learning_rate": 0.0006, + "loss": 2.1793, + "step": 70820 + }, + { + "epoch": 0.2642062621696023, + "grad_norm": 0.34280696511268616, + "learning_rate": 0.0006, + "loss": 2.2521, + "step": 70830 + }, + { + "epoch": 0.2642435636325657, + "grad_norm": 0.5179033279418945, + "learning_rate": 0.0006, + "loss": 2.1582, + "step": 70840 + }, + { + "epoch": 0.26428086509552906, + "grad_norm": 0.4179510176181793, + "learning_rate": 0.0006, + "loss": 2.2264, + "step": 70850 + }, + { + "epoch": 0.26431816655849244, + "grad_norm": 0.2914011478424072, + "learning_rate": 0.0006, + "loss": 2.2296, + "step": 70860 + }, + { + "epoch": 0.2643554680214558, + "grad_norm": 0.5115554332733154, + "learning_rate": 0.0006, + "loss": 2.1208, + "step": 70870 + }, + { + "epoch": 0.2643927694844192, + "grad_norm": 0.4475714862346649, + "learning_rate": 0.0006, + "loss": 2.1409, + "step": 70880 + }, + { + "epoch": 0.2644300709473826, + "grad_norm": 0.34954866766929626, + "learning_rate": 0.0006, + "loss": 2.1618, + "step": 70890 + }, + { + "epoch": 0.26446737241034596, + "grad_norm": 0.4395585060119629, + "learning_rate": 0.0006, + "loss": 2.1755, + "step": 70900 + }, + { + "epoch": 0.26450467387330934, + "grad_norm": 0.352631539106369, + "learning_rate": 0.0006, + "loss": 2.1591, + "step": 70910 + }, + { + "epoch": 0.26454197533627266, + "grad_norm": 0.475788414478302, + "learning_rate": 0.0006, + "loss": 2.1347, + "step": 70920 + }, + { + "epoch": 0.26457927679923604, + "grad_norm": 0.39959174394607544, + "learning_rate": 0.0006, + "loss": 2.1496, + "step": 70930 + }, + { + "epoch": 0.2646165782621994, + "grad_norm": 0.37746021151542664, + "learning_rate": 0.0006, + "loss": 2.1943, + "step": 70940 + }, + { + "epoch": 0.2646538797251628, + "grad_norm": 0.3699350357055664, + "learning_rate": 0.0006, + "loss": 2.0744, + "step": 70950 + }, + { + "epoch": 0.2646911811881262, + "grad_norm": 0.2748631238937378, + "learning_rate": 0.0006, + "loss": 2.2763, + "step": 
70960 + }, + { + "epoch": 0.26472848265108956, + "grad_norm": 0.2730596959590912, + "learning_rate": 0.0006, + "loss": 2.2583, + "step": 70970 + }, + { + "epoch": 0.26476578411405294, + "grad_norm": 0.3004848062992096, + "learning_rate": 0.0006, + "loss": 2.21, + "step": 70980 + }, + { + "epoch": 0.2648030855770163, + "grad_norm": 0.4007299244403839, + "learning_rate": 0.0006, + "loss": 2.0292, + "step": 70990 + }, + { + "epoch": 0.2648403870399797, + "grad_norm": 0.3583895266056061, + "learning_rate": 0.0006, + "loss": 2.2028, + "step": 71000 + }, + { + "epoch": 0.2648403870399797, + "eval_valid_loss": 2.177942991256714, + "eval_valid_loss/all": 2.0423812866210938, + "eval_valid_loss/end_span": 1.2287523746490479, + "eval_valid_perplexity/batch": 7.708944797515869, + "eval_valid_perplexity/end_span": 3.416963815689087, + "eval_valid_perplexity/fim": 2.4142606258392334, + "eval_valid_perplexity/first_seq": 15.028874397277832, + "eval_valid_perplexity/last_seq": 8.979534149169922, + "eval_valid_perplexity/second_seq": 13.681333541870117, + "eval_valid_perplexity/seq": 8.701610565185547, + "eval_valid_reconstruction/all": 0.29777443408966064, + "eval_valid_reconstruction/end_span": 0.717440664768219, + "eval_valid_reconstruction/fim": 0.17773501574993134, + "eval_valid_reconstruction/first_seq": 0.16483040153980255, + "eval_valid_reconstruction/last_seq": 0.3274606168270111, + "eval_valid_reconstruction/second_seq": 0.1967117041349411, + "eval_valid_runtime": 439.7673, + "eval_valid_samples_per_second": 0.437, + "eval_valid_steps_per_second": 0.437, + "step": 71000 + }, + { + "epoch": 0.2648403870399797, + "eval_train_loss": 2.176375389099121, + "eval_train_loss/all": 2.014721393585205, + "eval_train_loss/end_span": 1.199843406677246, + "eval_train_perplexity/batch": 7.498638153076172, + "eval_train_perplexity/end_span": 3.319597005844116, + "eval_train_perplexity/fim": 2.1229665279388428, + "eval_train_perplexity/first_seq": 15.408496856689453, + 
"eval_train_perplexity/last_seq": 9.072299003601074, + "eval_train_perplexity/second_seq": 14.123566627502441, + "eval_train_perplexity/seq": 8.639204978942871, + "eval_train_reconstruction/all": 0.28715163469314575, + "eval_train_reconstruction/end_span": 0.7266381978988647, + "eval_train_reconstruction/fim": 0.152304545044899, + "eval_train_reconstruction/first_seq": 0.1543097198009491, + "eval_train_reconstruction/last_seq": 0.3209468722343445, + "eval_train_reconstruction/second_seq": 0.18488337099552155, + "eval_train_runtime": 443.3253, + "eval_train_samples_per_second": 0.433, + "eval_train_steps_per_second": 0.433, + "step": 71000 + }, + { + "epoch": 0.2648776885029431, + "grad_norm": 0.4232008457183838, + "learning_rate": 0.0006, + "loss": 2.2477, + "step": 71010 + }, + { + "epoch": 0.26491498996590646, + "grad_norm": 0.3337348699569702, + "learning_rate": 0.0006, + "loss": 2.1013, + "step": 71020 + }, + { + "epoch": 0.26495229142886983, + "grad_norm": 0.31950843334198, + "learning_rate": 0.0006, + "loss": 2.2743, + "step": 71030 + }, + { + "epoch": 0.2649895928918332, + "grad_norm": 2.4208106994628906, + "learning_rate": 0.0006, + "loss": 2.239, + "step": 71040 + }, + { + "epoch": 0.2650268943547966, + "grad_norm": 0.30203837156295776, + "learning_rate": 0.0006, + "loss": 2.3586, + "step": 71050 + }, + { + "epoch": 0.26506419581776, + "grad_norm": 0.32248827815055847, + "learning_rate": 0.0006, + "loss": 2.1197, + "step": 71060 + }, + { + "epoch": 0.26510149728072335, + "grad_norm": 0.2634012699127197, + "learning_rate": 0.0006, + "loss": 2.3234, + "step": 71070 + }, + { + "epoch": 0.26513879874368673, + "grad_norm": 0.35917583107948303, + "learning_rate": 0.0006, + "loss": 2.3166, + "step": 71080 + }, + { + "epoch": 0.2651761002066501, + "grad_norm": 0.3524962067604065, + "learning_rate": 0.0006, + "loss": 2.1466, + "step": 71090 + }, + { + "epoch": 0.2652134016696135, + "grad_norm": 0.46472176909446716, + "learning_rate": 0.0006, + "loss": 1.9189, + 
"step": 71100 + }, + { + "epoch": 0.26525070313257687, + "grad_norm": 0.40646493434906006, + "learning_rate": 0.0006, + "loss": 2.1952, + "step": 71110 + }, + { + "epoch": 0.26528800459554025, + "grad_norm": 0.3592829704284668, + "learning_rate": 0.0006, + "loss": 2.2881, + "step": 71120 + }, + { + "epoch": 0.26532530605850363, + "grad_norm": 0.38191381096839905, + "learning_rate": 0.0006, + "loss": 2.0952, + "step": 71130 + }, + { + "epoch": 0.265362607521467, + "grad_norm": 0.31819549202919006, + "learning_rate": 0.0006, + "loss": 2.1101, + "step": 71140 + }, + { + "epoch": 0.2653999089844304, + "grad_norm": 0.42818453907966614, + "learning_rate": 0.0006, + "loss": 2.1482, + "step": 71150 + }, + { + "epoch": 0.26543721044739377, + "grad_norm": 1.327642560005188, + "learning_rate": 0.0006, + "loss": 2.1483, + "step": 71160 + }, + { + "epoch": 0.26547451191035715, + "grad_norm": 0.32540032267570496, + "learning_rate": 0.0006, + "loss": 2.1686, + "step": 71170 + }, + { + "epoch": 0.2655118133733205, + "grad_norm": 0.40791359543800354, + "learning_rate": 0.0006, + "loss": 2.2137, + "step": 71180 + }, + { + "epoch": 0.2655491148362839, + "grad_norm": 0.3865613043308258, + "learning_rate": 0.0006, + "loss": 2.1152, + "step": 71190 + }, + { + "epoch": 0.26558641629924723, + "grad_norm": 0.464129775762558, + "learning_rate": 0.0006, + "loss": 2.1272, + "step": 71200 + }, + { + "epoch": 0.2656237177622106, + "grad_norm": 0.4576452374458313, + "learning_rate": 0.0006, + "loss": 2.152, + "step": 71210 + }, + { + "epoch": 0.265661019225174, + "grad_norm": 0.4665803015232086, + "learning_rate": 0.0006, + "loss": 2.1415, + "step": 71220 + }, + { + "epoch": 0.26569832068813737, + "grad_norm": 0.2506738603115082, + "learning_rate": 0.0006, + "loss": 2.2879, + "step": 71230 + }, + { + "epoch": 0.26573562215110075, + "grad_norm": 0.36720219254493713, + "learning_rate": 0.0006, + "loss": 2.0917, + "step": 71240 + }, + { + "epoch": 0.2657729236140641, + "grad_norm": 
0.36964094638824463, + "learning_rate": 0.0006, + "loss": 2.1833, + "step": 71250 + }, + { + "epoch": 0.2657729236140641, + "eval_valid_loss": 2.172461748123169, + "eval_valid_loss/all": 2.037660837173462, + "eval_valid_loss/end_span": 1.2362010478973389, + "eval_valid_perplexity/batch": 7.672640800476074, + "eval_valid_perplexity/end_span": 3.4425106048583984, + "eval_valid_perplexity/fim": 2.165436029434204, + "eval_valid_perplexity/first_seq": 14.919760704040527, + "eval_valid_perplexity/last_seq": 8.75615119934082, + "eval_valid_perplexity/second_seq": 13.876008033752441, + "eval_valid_perplexity/seq": 8.660021781921387, + "eval_valid_reconstruction/all": 0.2994311451911926, + "eval_valid_reconstruction/end_span": 0.7097600102424622, + "eval_valid_reconstruction/fim": 0.1586417257785797, + "eval_valid_reconstruction/first_seq": 0.1663506031036377, + "eval_valid_reconstruction/last_seq": 0.33316004276275635, + "eval_valid_reconstruction/second_seq": 0.19031432271003723, + "eval_valid_runtime": 442.5322, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 71250 + }, + { + "epoch": 0.2657729236140641, + "eval_train_loss": 2.172701835632324, + "eval_train_loss/all": 2.0121188163757324, + "eval_train_loss/end_span": 1.198869228363037, + "eval_train_perplexity/batch": 7.479147434234619, + "eval_train_perplexity/end_span": 3.3163647651672363, + "eval_train_perplexity/fim": 2.090475559234619, + "eval_train_perplexity/first_seq": 15.502290725708008, + "eval_train_perplexity/last_seq": 8.160091400146484, + "eval_train_perplexity/second_seq": 14.084388732910156, + "eval_train_perplexity/seq": 8.618834495544434, + "eval_train_reconstruction/all": 0.28800779581069946, + "eval_train_reconstruction/end_span": 0.7205458283424377, + "eval_train_reconstruction/fim": 0.15021927654743195, + "eval_train_reconstruction/first_seq": 0.15165038406848907, + "eval_train_reconstruction/last_seq": 0.35337233543395996, + 
"eval_train_reconstruction/second_seq": 0.18628625571727753, + "eval_train_runtime": 440.4872, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 71250 + }, + { + "epoch": 0.2658102250770275, + "grad_norm": 0.628415048122406, + "learning_rate": 0.0006, + "loss": 2.255, + "step": 71260 + }, + { + "epoch": 0.2658475265399909, + "grad_norm": 0.3683498203754425, + "learning_rate": 0.0006, + "loss": 2.2439, + "step": 71270 + }, + { + "epoch": 0.26588482800295427, + "grad_norm": 0.3527843654155731, + "learning_rate": 0.0006, + "loss": 2.1363, + "step": 71280 + }, + { + "epoch": 0.26592212946591764, + "grad_norm": 0.34301888942718506, + "learning_rate": 0.0006, + "loss": 2.3733, + "step": 71290 + }, + { + "epoch": 0.265959430928881, + "grad_norm": 0.3558467626571655, + "learning_rate": 0.0006, + "loss": 2.1312, + "step": 71300 + }, + { + "epoch": 0.2659967323918444, + "grad_norm": 0.4773295521736145, + "learning_rate": 0.0006, + "loss": 2.2452, + "step": 71310 + }, + { + "epoch": 0.2660340338548078, + "grad_norm": 0.45735082030296326, + "learning_rate": 0.0006, + "loss": 2.2274, + "step": 71320 + }, + { + "epoch": 0.26607133531777116, + "grad_norm": 0.3010883331298828, + "learning_rate": 0.0006, + "loss": 2.2217, + "step": 71330 + }, + { + "epoch": 0.26610863678073454, + "grad_norm": 0.2634093165397644, + "learning_rate": 0.0006, + "loss": 2.3493, + "step": 71340 + }, + { + "epoch": 0.2661459382436979, + "grad_norm": 1.4214320182800293, + "learning_rate": 0.0006, + "loss": 2.1995, + "step": 71350 + }, + { + "epoch": 0.2661832397066613, + "grad_norm": 0.32335910201072693, + "learning_rate": 0.0006, + "loss": 2.3553, + "step": 71360 + }, + { + "epoch": 0.2662205411696247, + "grad_norm": 0.32087260484695435, + "learning_rate": 0.0006, + "loss": 2.2536, + "step": 71370 + }, + { + "epoch": 0.26625784263258806, + "grad_norm": 0.47774749994277954, + "learning_rate": 0.0006, + "loss": 2.1122, + "step": 71380 + }, + { + "epoch": 
0.26629514409555144, + "grad_norm": 0.3333744406700134, + "learning_rate": 0.0006, + "loss": 2.209, + "step": 71390 + }, + { + "epoch": 0.2663324455585148, + "grad_norm": 0.39903897047042847, + "learning_rate": 0.0006, + "loss": 1.9239, + "step": 71400 + }, + { + "epoch": 0.2663697470214782, + "grad_norm": 0.3243035674095154, + "learning_rate": 0.0006, + "loss": 2.1776, + "step": 71410 + }, + { + "epoch": 0.2664070484844416, + "grad_norm": 0.33725371956825256, + "learning_rate": 0.0006, + "loss": 2.3045, + "step": 71420 + }, + { + "epoch": 0.26644434994740496, + "grad_norm": 0.30836737155914307, + "learning_rate": 0.0006, + "loss": 2.1115, + "step": 71430 + }, + { + "epoch": 0.26648165141036834, + "grad_norm": 0.4598879814147949, + "learning_rate": 0.0006, + "loss": 2.2619, + "step": 71440 + }, + { + "epoch": 0.2665189528733317, + "grad_norm": 0.5415644645690918, + "learning_rate": 0.0006, + "loss": 2.1284, + "step": 71450 + }, + { + "epoch": 0.2665562543362951, + "grad_norm": 0.4242587387561798, + "learning_rate": 0.0006, + "loss": 2.2919, + "step": 71460 + }, + { + "epoch": 0.2665935557992584, + "grad_norm": 0.39414742588996887, + "learning_rate": 0.0006, + "loss": 2.2478, + "step": 71470 + }, + { + "epoch": 0.2666308572622218, + "grad_norm": 0.28839150071144104, + "learning_rate": 0.0006, + "loss": 2.2885, + "step": 71480 + }, + { + "epoch": 0.2666681587251852, + "grad_norm": 0.29349175095558167, + "learning_rate": 0.0006, + "loss": 2.1751, + "step": 71490 + }, + { + "epoch": 0.26670546018814856, + "grad_norm": 0.4390137791633606, + "learning_rate": 0.0006, + "loss": 2.1901, + "step": 71500 + }, + { + "epoch": 0.26670546018814856, + "eval_valid_loss": 2.1869235038757324, + "eval_valid_loss/all": 2.0515146255493164, + "eval_valid_loss/end_span": 1.3134493827819824, + "eval_valid_perplexity/batch": 7.779675483703613, + "eval_valid_perplexity/end_span": 3.718979835510254, + "eval_valid_perplexity/fim": 2.2852728366851807, + "eval_valid_perplexity/first_seq": 
14.536880493164062, + "eval_valid_perplexity/last_seq": 8.823826789855957, + "eval_valid_perplexity/second_seq": 13.749836921691895, + "eval_valid_perplexity/seq": 8.795414924621582, + "eval_valid_reconstruction/all": 0.2953147292137146, + "eval_valid_reconstruction/end_span": 0.6989870071411133, + "eval_valid_reconstruction/fim": 0.16452151536941528, + "eval_valid_reconstruction/first_seq": 0.17458932101726532, + "eval_valid_reconstruction/last_seq": 0.33049479126930237, + "eval_valid_reconstruction/second_seq": 0.1969793736934662, + "eval_valid_runtime": 443.4151, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 71500 + }, + { + "epoch": 0.26670546018814856, + "eval_train_loss": 2.182051658630371, + "eval_train_loss/all": 2.020542860031128, + "eval_train_loss/end_span": 1.2666997909545898, + "eval_train_perplexity/batch": 7.542418479919434, + "eval_train_perplexity/end_span": 3.5491204261779785, + "eval_train_perplexity/fim": 2.3489809036254883, + "eval_train_perplexity/first_seq": 15.206157684326172, + "eval_train_perplexity/last_seq": 9.266474723815918, + "eval_train_perplexity/second_seq": 14.222726821899414, + "eval_train_perplexity/seq": 8.699945449829102, + "eval_train_reconstruction/all": 0.2856082022190094, + "eval_train_reconstruction/end_span": 0.7142057418823242, + "eval_train_reconstruction/fim": 0.17052488029003143, + "eval_train_reconstruction/first_seq": 0.15556393563747406, + "eval_train_reconstruction/last_seq": 0.31289204955101013, + "eval_train_reconstruction/second_seq": 0.18434841930866241, + "eval_train_runtime": 440.7981, + "eval_train_samples_per_second": 0.436, + "eval_train_steps_per_second": 0.436, + "step": 71500 + }, + { + "epoch": 0.26674276165111194, + "grad_norm": 0.7434736490249634, + "learning_rate": 0.0006, + "loss": 2.1037, + "step": 71510 + }, + { + "epoch": 0.2667800631140753, + "grad_norm": 0.31584662199020386, + "learning_rate": 0.0006, + "loss": 2.4125, + "step": 71520 + }, + { + 
"epoch": 0.2668173645770387, + "grad_norm": 0.34290945529937744, + "learning_rate": 0.0006, + "loss": 2.3017, + "step": 71530 + }, + { + "epoch": 0.2668546660400021, + "grad_norm": 0.31134340167045593, + "learning_rate": 0.0006, + "loss": 2.2656, + "step": 71540 + }, + { + "epoch": 0.26689196750296545, + "grad_norm": 0.29418545961380005, + "learning_rate": 0.0006, + "loss": 2.2305, + "step": 71550 + }, + { + "epoch": 0.26692926896592883, + "grad_norm": 0.3824256658554077, + "learning_rate": 0.0006, + "loss": 2.1001, + "step": 71560 + }, + { + "epoch": 0.2669665704288922, + "grad_norm": 0.2309129536151886, + "learning_rate": 0.0006, + "loss": 2.2907, + "step": 71570 + }, + { + "epoch": 0.2670038718918556, + "grad_norm": 0.31684184074401855, + "learning_rate": 0.0006, + "loss": 2.2141, + "step": 71580 + }, + { + "epoch": 0.267041173354819, + "grad_norm": 0.3676535189151764, + "learning_rate": 0.0006, + "loss": 2.3487, + "step": 71590 + }, + { + "epoch": 0.26707847481778235, + "grad_norm": 0.4502810537815094, + "learning_rate": 0.0006, + "loss": 2.2056, + "step": 71600 + }, + { + "epoch": 0.26711577628074573, + "grad_norm": 0.26903557777404785, + "learning_rate": 0.0006, + "loss": 2.3897, + "step": 71610 + }, + { + "epoch": 0.2671530777437091, + "grad_norm": 0.2732763886451721, + "learning_rate": 0.0006, + "loss": 2.1507, + "step": 71620 + }, + { + "epoch": 0.2671903792066725, + "grad_norm": 0.2450244128704071, + "learning_rate": 0.0006, + "loss": 2.3148, + "step": 71630 + }, + { + "epoch": 0.26722768066963587, + "grad_norm": 0.414916455745697, + "learning_rate": 0.0006, + "loss": 2.1372, + "step": 71640 + }, + { + "epoch": 0.26726498213259925, + "grad_norm": 0.35898783802986145, + "learning_rate": 0.0006, + "loss": 2.1212, + "step": 71650 + }, + { + "epoch": 0.26730228359556263, + "grad_norm": 0.34975478053092957, + "learning_rate": 0.0006, + "loss": 2.2067, + "step": 71660 + }, + { + "epoch": 0.267339585058526, + "grad_norm": 0.28128790855407715, + "learning_rate": 
0.0006, + "loss": 2.3324, + "step": 71670 + }, + { + "epoch": 0.2673768865214894, + "grad_norm": 0.33120089769363403, + "learning_rate": 0.0006, + "loss": 2.2618, + "step": 71680 + }, + { + "epoch": 0.26741418798445277, + "grad_norm": 0.36734676361083984, + "learning_rate": 0.0006, + "loss": 2.1166, + "step": 71690 + }, + { + "epoch": 0.26745148944741615, + "grad_norm": 0.27211931347846985, + "learning_rate": 0.0006, + "loss": 2.1178, + "step": 71700 + }, + { + "epoch": 0.2674887909103795, + "grad_norm": 0.21470700204372406, + "learning_rate": 0.0006, + "loss": 2.2444, + "step": 71710 + }, + { + "epoch": 0.2675260923733429, + "grad_norm": 0.33252137899398804, + "learning_rate": 0.0006, + "loss": 2.2321, + "step": 71720 + }, + { + "epoch": 0.2675633938363063, + "grad_norm": 0.34499990940093994, + "learning_rate": 0.0006, + "loss": 2.268, + "step": 71730 + }, + { + "epoch": 0.26760069529926966, + "grad_norm": 0.32599642872810364, + "learning_rate": 0.0006, + "loss": 2.2043, + "step": 71740 + }, + { + "epoch": 0.267637996762233, + "grad_norm": 0.3441823720932007, + "learning_rate": 0.0006, + "loss": 2.3331, + "step": 71750 + }, + { + "epoch": 0.267637996762233, + "eval_valid_loss": 2.173961877822876, + "eval_valid_loss/all": 2.0386106967926025, + "eval_valid_loss/end_span": 1.153823733329773, + "eval_valid_perplexity/batch": 7.679932117462158, + "eval_valid_perplexity/end_span": 3.1702921390533447, + "eval_valid_perplexity/fim": 2.901603937149048, + "eval_valid_perplexity/first_seq": 14.647464752197266, + "eval_valid_perplexity/last_seq": 8.749533653259277, + "eval_valid_perplexity/second_seq": 13.662421226501465, + "eval_valid_perplexity/seq": 8.659116744995117, + "eval_valid_reconstruction/all": 0.29917365312576294, + "eval_valid_reconstruction/end_span": 0.7331089973449707, + "eval_valid_reconstruction/fim": 0.21820995211601257, + "eval_valid_reconstruction/first_seq": 0.16887855529785156, + "eval_valid_reconstruction/last_seq": 0.33600693941116333, + 
"eval_valid_reconstruction/second_seq": 0.1968211829662323, + "eval_valid_runtime": 442.7474, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 71750 + }, + { + "epoch": 0.267637996762233, + "eval_train_loss": 2.173415422439575, + "eval_train_loss/all": 2.0122439861297607, + "eval_train_loss/end_span": 1.1175543069839478, + "eval_train_perplexity/batch": 7.48008394241333, + "eval_train_perplexity/end_span": 3.0573675632476807, + "eval_train_perplexity/fim": 2.3447184562683105, + "eval_train_perplexity/first_seq": 15.444190979003906, + "eval_train_perplexity/last_seq": 8.618020057678223, + "eval_train_perplexity/second_seq": 14.351272583007812, + "eval_train_perplexity/seq": 8.61066722869873, + "eval_train_reconstruction/all": 0.2882564663887024, + "eval_train_reconstruction/end_span": 0.746184229850769, + "eval_train_reconstruction/fim": 0.17369897663593292, + "eval_train_reconstruction/first_seq": 0.15158255398273468, + "eval_train_reconstruction/last_seq": 0.34025776386260986, + "eval_train_reconstruction/second_seq": 0.17778947949409485, + "eval_train_runtime": 439.5007, + "eval_train_samples_per_second": 0.437, + "eval_train_steps_per_second": 0.437, + "step": 71750 + }, + { + "epoch": 0.26767529822519637, + "grad_norm": 0.34792637825012207, + "learning_rate": 0.0006, + "loss": 2.2019, + "step": 71760 + }, + { + "epoch": 0.26771259968815975, + "grad_norm": 0.35016921162605286, + "learning_rate": 0.0006, + "loss": 2.0897, + "step": 71770 + }, + { + "epoch": 0.2677499011511231, + "grad_norm": 0.4109892249107361, + "learning_rate": 0.0006, + "loss": 2.0999, + "step": 71780 + }, + { + "epoch": 0.2677872026140865, + "grad_norm": 0.2954612076282501, + "learning_rate": 0.0006, + "loss": 2.1305, + "step": 71790 + }, + { + "epoch": 0.2678245040770499, + "grad_norm": 0.35285332798957825, + "learning_rate": 0.0006, + "loss": 2.1704, + "step": 71800 + }, + { + "epoch": 0.26786180554001326, + "grad_norm": 0.32532796263694763, + 
"learning_rate": 0.0006, + "loss": 2.1588, + "step": 71810 + }, + { + "epoch": 0.26789910700297664, + "grad_norm": 0.28645408153533936, + "learning_rate": 0.0006, + "loss": 2.1031, + "step": 71820 + }, + { + "epoch": 0.26793640846594, + "grad_norm": 0.3674977123737335, + "learning_rate": 0.0006, + "loss": 2.3274, + "step": 71830 + }, + { + "epoch": 0.2679737099289034, + "grad_norm": 0.35167625546455383, + "learning_rate": 0.0006, + "loss": 1.9775, + "step": 71840 + }, + { + "epoch": 0.2680110113918668, + "grad_norm": 0.4636824429035187, + "learning_rate": 0.0006, + "loss": 2.2759, + "step": 71850 + }, + { + "epoch": 0.26804831285483016, + "grad_norm": 0.2982134521007538, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 71860 + }, + { + "epoch": 0.26808561431779354, + "grad_norm": 0.45291388034820557, + "learning_rate": 0.0006, + "loss": 2.1223, + "step": 71870 + }, + { + "epoch": 0.2681229157807569, + "grad_norm": 0.31167179346084595, + "learning_rate": 0.0006, + "loss": 2.3241, + "step": 71880 + }, + { + "epoch": 0.2681602172437203, + "grad_norm": 0.46173420548439026, + "learning_rate": 0.0006, + "loss": 2.1111, + "step": 71890 + }, + { + "epoch": 0.2681975187066837, + "grad_norm": 0.3817271292209625, + "learning_rate": 0.0006, + "loss": 2.2998, + "step": 71900 + }, + { + "epoch": 0.26823482016964706, + "grad_norm": 0.3505696654319763, + "learning_rate": 0.0006, + "loss": 2.0696, + "step": 71910 + }, + { + "epoch": 0.26827212163261044, + "grad_norm": 0.3440290689468384, + "learning_rate": 0.0006, + "loss": 2.2659, + "step": 71920 + }, + { + "epoch": 0.2683094230955738, + "grad_norm": 0.2471424639225006, + "learning_rate": 0.0006, + "loss": 2.2828, + "step": 71930 + }, + { + "epoch": 0.2683467245585372, + "grad_norm": 0.24377326667308807, + "learning_rate": 0.0006, + "loss": 2.1038, + "step": 71940 + }, + { + "epoch": 0.2683840260215006, + "grad_norm": 0.2915897071361542, + "learning_rate": 0.0006, + "loss": 2.1892, + "step": 71950 + }, + { + "epoch": 
0.26842132748446396, + "grad_norm": 1.1722906827926636, + "learning_rate": 0.0006, + "loss": 2.2041, + "step": 71960 + }, + { + "epoch": 0.26845862894742734, + "grad_norm": 0.32430869340896606, + "learning_rate": 0.0006, + "loss": 2.159, + "step": 71970 + }, + { + "epoch": 0.2684959304103907, + "grad_norm": 0.28741979598999023, + "learning_rate": 0.0006, + "loss": 2.2306, + "step": 71980 + }, + { + "epoch": 0.2685332318733541, + "grad_norm": 0.20920328795909882, + "learning_rate": 0.0006, + "loss": 2.388, + "step": 71990 + }, + { + "epoch": 0.2685705333363175, + "grad_norm": 0.42014965415000916, + "learning_rate": 0.0006, + "loss": 2.2071, + "step": 72000 + }, + { + "epoch": 0.2685705333363175, + "eval_valid_loss": 2.174372434616089, + "eval_valid_loss/all": 2.0394794940948486, + "eval_valid_loss/end_span": 1.1732476949691772, + "eval_valid_perplexity/batch": 7.686607360839844, + "eval_valid_perplexity/end_span": 3.232473611831665, + "eval_valid_perplexity/fim": 2.3014323711395264, + "eval_valid_perplexity/first_seq": 14.855362892150879, + "eval_valid_perplexity/last_seq": 8.73400592803955, + "eval_valid_perplexity/second_seq": 13.895153045654297, + "eval_valid_perplexity/seq": 8.672462463378906, + "eval_valid_reconstruction/all": 0.2988453507423401, + "eval_valid_reconstruction/end_span": 0.7240703105926514, + "eval_valid_reconstruction/fim": 0.1701180785894394, + "eval_valid_reconstruction/first_seq": 0.1685260534286499, + "eval_valid_reconstruction/last_seq": 0.33499646186828613, + "eval_valid_reconstruction/second_seq": 0.1942072957754135, + "eval_valid_runtime": 441.8876, + "eval_valid_samples_per_second": 0.434, + "eval_valid_steps_per_second": 0.434, + "step": 72000 + }, + { + "epoch": 0.2685705333363175, + "eval_train_loss": 2.1725735664367676, + "eval_train_loss/all": 2.0112483501434326, + "eval_train_loss/end_span": 1.1405004262924194, + "eval_train_perplexity/batch": 7.472640037536621, + "eval_train_perplexity/end_span": 3.128333568572998, + 
"eval_train_perplexity/fim": 2.0318918228149414, + "eval_train_perplexity/first_seq": 15.28567123413086, + "eval_train_perplexity/last_seq": 8.660795211791992, + "eval_train_perplexity/second_seq": 14.023582458496094, + "eval_train_perplexity/seq": 8.606852531433105, + "eval_train_reconstruction/all": 0.28840792179107666, + "eval_train_reconstruction/end_span": 0.734438419342041, + "eval_train_reconstruction/fim": 0.14468567073345184, + "eval_train_reconstruction/first_seq": 0.15578635036945343, + "eval_train_reconstruction/last_seq": 0.33227089047431946, + "eval_train_reconstruction/second_seq": 0.18978050351142883, + "eval_train_runtime": 441.8691, + "eval_train_samples_per_second": 0.435, + "eval_train_steps_per_second": 0.435, + "step": 72000 + }, + { + "epoch": 0.26860783479928085, + "grad_norm": 0.3794051706790924, + "learning_rate": 0.0006, + "loss": 2.2382, + "step": 72010 + }, + { + "epoch": 0.26864513626224423, + "grad_norm": 0.3498188853263855, + "learning_rate": 0.0006, + "loss": 2.2314, + "step": 72020 + }, + { + "epoch": 0.26868243772520756, + "grad_norm": 0.27875202894210815, + "learning_rate": 0.0006, + "loss": 2.2798, + "step": 72030 + }, + { + "epoch": 0.26871973918817094, + "grad_norm": 0.42012256383895874, + "learning_rate": 0.0006, + "loss": 2.2512, + "step": 72040 + }, + { + "epoch": 0.2687570406511343, + "grad_norm": 0.48804718255996704, + "learning_rate": 0.0006, + "loss": 2.1006, + "step": 72050 + }, + { + "epoch": 0.2687943421140977, + "grad_norm": 0.5539289116859436, + "learning_rate": 0.0006, + "loss": 2.216, + "step": 72060 + }, + { + "epoch": 0.2688316435770611, + "grad_norm": 0.40056225657463074, + "learning_rate": 0.0006, + "loss": 2.1396, + "step": 72070 + }, + { + "epoch": 0.26886894504002445, + "grad_norm": 0.310433566570282, + "learning_rate": 0.0006, + "loss": 2.1679, + "step": 72080 + }, + { + "epoch": 0.26890624650298783, + "grad_norm": 0.31726714968681335, + "learning_rate": 0.0006, + "loss": 2.3375, + "step": 72090 + }, + { 
+ "epoch": 0.2689435479659512, + "grad_norm": 0.31957685947418213, + "learning_rate": 0.0006, + "loss": 2.1443, + "step": 72100 + }, + { + "epoch": 0.2689808494289146, + "grad_norm": 0.3312925100326538, + "learning_rate": 0.0006, + "loss": 2.2343, + "step": 72110 + }, + { + "epoch": 0.26901815089187797, + "grad_norm": 0.3154027760028839, + "learning_rate": 0.0006, + "loss": 2.365, + "step": 72120 + }, + { + "epoch": 0.26905545235484135, + "grad_norm": 0.2989743947982788, + "learning_rate": 0.0006, + "loss": 2.2583, + "step": 72130 + }, + { + "epoch": 0.26909275381780473, + "grad_norm": 0.4152958393096924, + "learning_rate": 0.0006, + "loss": 2.0787, + "step": 72140 + }, + { + "epoch": 0.2691300552807681, + "grad_norm": 0.3219113051891327, + "learning_rate": 0.0006, + "loss": 2.072, + "step": 72150 + }, + { + "epoch": 0.2691673567437315, + "grad_norm": 0.3434145748615265, + "learning_rate": 0.0006, + "loss": 2.2068, + "step": 72160 + }, + { + "epoch": 0.26920465820669487, + "grad_norm": 0.3899513781070709, + "learning_rate": 0.0006, + "loss": 2.2321, + "step": 72170 + }, + { + "epoch": 0.26924195966965825, + "grad_norm": 0.28698617219924927, + "learning_rate": 0.0006, + "loss": 2.3114, + "step": 72180 + }, + { + "epoch": 0.26927926113262163, + "grad_norm": 0.347339391708374, + "learning_rate": 0.0006, + "loss": 2.3862, + "step": 72190 + }, + { + "epoch": 0.269316562595585, + "grad_norm": 0.21309931576251984, + "learning_rate": 0.0006, + "loss": 2.0169, + "step": 72200 + }, + { + "epoch": 0.2693538640585484, + "grad_norm": 0.3696986138820648, + "learning_rate": 0.0006, + "loss": 2.1713, + "step": 72210 + }, + { + "epoch": 0.26939116552151177, + "grad_norm": 0.3359403908252716, + "learning_rate": 0.0006, + "loss": 2.2755, + "step": 72220 + }, + { + "epoch": 0.26942846698447515, + "grad_norm": 0.3492668569087982, + "learning_rate": 0.0006, + "loss": 2.1523, + "step": 72230 + }, + { + "epoch": 0.2694657684474385, + "grad_norm": 0.5066770911216736, + "learning_rate": 
0.0006, + "loss": 2.3259, + "step": 72240 + }, + { + "epoch": 0.2695030699104019, + "grad_norm": 0.3834320902824402, + "learning_rate": 0.0006, + "loss": 2.2693, + "step": 72250 + }, + { + "epoch": 0.2695030699104019, + "eval_valid_loss": 2.176182270050049, + "eval_valid_loss/all": 2.0408542156219482, + "eval_valid_loss/end_span": 1.180593490600586, + "eval_valid_perplexity/batch": 7.697181224822998, + "eval_valid_perplexity/end_span": 3.2563061714172363, + "eval_valid_perplexity/fim": 2.267301082611084, + "eval_valid_perplexity/first_seq": 14.552367210388184, + "eval_valid_perplexity/last_seq": 8.931631088256836, + "eval_valid_perplexity/second_seq": 13.79576587677002, + "eval_valid_perplexity/seq": 8.683823585510254, + "eval_valid_reconstruction/all": 0.2982942461967468, + "eval_valid_reconstruction/end_span": 0.7335109114646912, + "eval_valid_reconstruction/fim": 0.16649729013442993, + "eval_valid_reconstruction/first_seq": 0.1733391135931015, + "eval_valid_reconstruction/last_seq": 0.3259474039077759, + "eval_valid_reconstruction/second_seq": 0.19489571452140808, + "eval_valid_runtime": 461.3181, + "eval_valid_samples_per_second": 0.416, + "eval_valid_steps_per_second": 0.416, + "step": 72250 + }, + { + "epoch": 0.2695030699104019, + "eval_train_loss": 2.17529034614563, + "eval_train_loss/all": 2.0140488147735596, + "eval_train_loss/end_span": 1.1480059623718262, + "eval_train_perplexity/batch": 7.493596076965332, + "eval_train_perplexity/end_span": 3.1519017219543457, + "eval_train_perplexity/fim": 2.0528509616851807, + "eval_train_perplexity/first_seq": 15.606887817382812, + "eval_train_perplexity/last_seq": 8.87503719329834, + "eval_train_perplexity/second_seq": 13.828987121582031, + "eval_train_perplexity/seq": 8.631723403930664, + "eval_train_reconstruction/all": 0.28751951456069946, + "eval_train_reconstruction/end_span": 0.7428790330886841, + "eval_train_reconstruction/fim": 0.14583800733089447, + "eval_train_reconstruction/first_seq": 
0.15072131156921387, + "eval_train_reconstruction/last_seq": 0.3305262625217438, + "eval_train_reconstruction/second_seq": 0.19145654141902924, + "eval_train_runtime": 519.5671, + "eval_train_samples_per_second": 0.37, + "eval_train_steps_per_second": 0.37, + "step": 72250 + }, + { + "epoch": 0.2695403713733653, + "grad_norm": 0.3985598385334015, + "learning_rate": 0.0006, + "loss": 2.2535, + "step": 72260 + }, + { + "epoch": 0.26957767283632866, + "grad_norm": 0.25363337993621826, + "learning_rate": 0.0006, + "loss": 2.1851, + "step": 72270 + }, + { + "epoch": 0.26961497429929204, + "grad_norm": 0.3498002290725708, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 72280 + }, + { + "epoch": 0.2696522757622554, + "grad_norm": 0.3032186031341553, + "learning_rate": 0.0006, + "loss": 2.2193, + "step": 72290 + }, + { + "epoch": 0.26968957722521875, + "grad_norm": 0.27618157863616943, + "learning_rate": 0.0006, + "loss": 2.2171, + "step": 72300 + }, + { + "epoch": 0.2697268786881821, + "grad_norm": 0.3004341721534729, + "learning_rate": 0.0006, + "loss": 2.2785, + "step": 72310 + }, + { + "epoch": 0.2697641801511455, + "grad_norm": 0.39046743512153625, + "learning_rate": 0.0006, + "loss": 2.3781, + "step": 72320 + }, + { + "epoch": 0.2698014816141089, + "grad_norm": 0.29987436532974243, + "learning_rate": 0.0006, + "loss": 2.0383, + "step": 72330 + }, + { + "epoch": 0.26983878307707226, + "grad_norm": 0.2782708406448364, + "learning_rate": 0.0006, + "loss": 2.3937, + "step": 72340 + }, + { + "epoch": 0.26987608454003564, + "grad_norm": 0.4484589397907257, + "learning_rate": 0.0006, + "loss": 2.0756, + "step": 72350 + }, + { + "epoch": 0.269913386002999, + "grad_norm": 0.319737046957016, + "learning_rate": 0.0006, + "loss": 2.2121, + "step": 72360 + }, + { + "epoch": 0.2699506874659624, + "grad_norm": 0.19176355004310608, + "learning_rate": 0.0006, + "loss": 2.3699, + "step": 72370 + }, + { + "epoch": 0.2699879889289258, + "grad_norm": 0.28696608543395996, + 
"learning_rate": 0.0006, + "loss": 2.2369, + "step": 72380 + }, + { + "epoch": 0.27002529039188916, + "grad_norm": 0.29878300428390503, + "learning_rate": 0.0006, + "loss": 2.3282, + "step": 72390 + }, + { + "epoch": 0.27006259185485254, + "grad_norm": 0.2591858208179474, + "learning_rate": 0.0006, + "loss": 2.2808, + "step": 72400 + }, + { + "epoch": 0.2700998933178159, + "grad_norm": 0.35127362608909607, + "learning_rate": 0.0006, + "loss": 2.3699, + "step": 72410 + }, + { + "epoch": 0.2701371947807793, + "grad_norm": 0.2836741805076599, + "learning_rate": 0.0006, + "loss": 2.1281, + "step": 72420 + }, + { + "epoch": 0.2701744962437427, + "grad_norm": 0.3580758273601532, + "learning_rate": 0.0006, + "loss": 2.2724, + "step": 72430 + }, + { + "epoch": 0.27021179770670606, + "grad_norm": 0.32959309220314026, + "learning_rate": 0.0006, + "loss": 2.1002, + "step": 72440 + }, + { + "epoch": 0.27024909916966944, + "grad_norm": 0.3405182361602783, + "learning_rate": 0.0006, + "loss": 2.2653, + "step": 72450 + }, + { + "epoch": 0.2702864006326328, + "grad_norm": 0.2970857322216034, + "learning_rate": 0.0006, + "loss": 2.2231, + "step": 72460 + }, + { + "epoch": 0.2703237020955962, + "grad_norm": 0.26306572556495667, + "learning_rate": 0.0006, + "loss": 2.2001, + "step": 72470 + }, + { + "epoch": 0.2703610035585596, + "grad_norm": 0.33439892530441284, + "learning_rate": 0.0006, + "loss": 2.3165, + "step": 72480 + }, + { + "epoch": 0.27039830502152296, + "grad_norm": 0.3910636007785797, + "learning_rate": 0.0006, + "loss": 2.0669, + "step": 72490 + }, + { + "epoch": 0.27043560648448633, + "grad_norm": 0.3993254601955414, + "learning_rate": 0.0006, + "loss": 2.2508, + "step": 72500 + }, + { + "epoch": 0.27043560648448633, + "eval_valid_loss": 2.1729586124420166, + "eval_valid_loss/all": 2.0378048419952393, + "eval_valid_loss/end_span": 1.2763521671295166, + "eval_valid_perplexity/batch": 7.673745632171631, + "eval_valid_perplexity/end_span": 3.5835437774658203, + 
"eval_valid_perplexity/fim": 2.978442907333374, + "eval_valid_perplexity/first_seq": 15.06083869934082, + "eval_valid_perplexity/last_seq": 8.803153991699219, + "eval_valid_perplexity/second_seq": 13.799506187438965, + "eval_valid_perplexity/seq": 8.656464576721191, + "eval_valid_reconstruction/all": 0.2991110384464264, + "eval_valid_reconstruction/end_span": 0.693695604801178, + "eval_valid_reconstruction/fim": 0.2233538180589676, + "eval_valid_reconstruction/first_seq": 0.161537304520607, + "eval_valid_reconstruction/last_seq": 0.33020931482315063, + "eval_valid_reconstruction/second_seq": 0.19557400047779083, + "eval_valid_runtime": 447.0578, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 72500 + }, + { + "epoch": 0.27043560648448633, + "eval_train_loss": 2.1720542907714844, + "eval_train_loss/all": 2.010773181915283, + "eval_train_loss/end_span": 1.238412857055664, + "eval_train_perplexity/batch": 7.469089984893799, + "eval_train_perplexity/end_span": 3.4501333236694336, + "eval_train_perplexity/fim": 2.0826914310455322, + "eval_train_perplexity/first_seq": 15.43399429321289, + "eval_train_perplexity/last_seq": 8.561657905578613, + "eval_train_perplexity/second_seq": 14.487994194030762, + "eval_train_perplexity/seq": 8.598713874816895, + "eval_train_reconstruction/all": 0.2883833050727844, + "eval_train_reconstruction/end_span": 0.7055066823959351, + "eval_train_reconstruction/fim": 0.14957748353481293, + "eval_train_reconstruction/first_seq": 0.15053899586200714, + "eval_train_reconstruction/last_seq": 0.3400583565235138, + "eval_train_reconstruction/second_seq": 0.17420902848243713, + "eval_train_runtime": 445.8838, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 72500 + }, + { + "epoch": 0.2704729079474497, + "grad_norm": 0.6580089926719666, + "learning_rate": 0.0006, + "loss": 2.2939, + "step": 72510 + }, + { + "epoch": 0.2705102094104131, + "grad_norm": 
0.468295693397522, + "learning_rate": 0.0006, + "loss": 2.26, + "step": 72520 + }, + { + "epoch": 0.2705475108733765, + "grad_norm": 0.3015368580818176, + "learning_rate": 0.0006, + "loss": 2.1773, + "step": 72530 + }, + { + "epoch": 0.27058481233633985, + "grad_norm": 0.2964109778404236, + "learning_rate": 0.0006, + "loss": 2.3234, + "step": 72540 + }, + { + "epoch": 0.27062211379930323, + "grad_norm": 0.41130462288856506, + "learning_rate": 0.0006, + "loss": 2.2645, + "step": 72550 + }, + { + "epoch": 0.2706594152622666, + "grad_norm": 0.42608633637428284, + "learning_rate": 0.0006, + "loss": 2.3461, + "step": 72560 + }, + { + "epoch": 0.27069671672523, + "grad_norm": 0.31289130449295044, + "learning_rate": 0.0006, + "loss": 2.1225, + "step": 72570 + }, + { + "epoch": 0.2707340181881933, + "grad_norm": 0.3658079206943512, + "learning_rate": 0.0006, + "loss": 2.3788, + "step": 72580 + }, + { + "epoch": 0.2707713196511567, + "grad_norm": 0.3946497440338135, + "learning_rate": 0.0006, + "loss": 2.1379, + "step": 72590 + }, + { + "epoch": 0.2708086211141201, + "grad_norm": 0.3902820348739624, + "learning_rate": 0.0006, + "loss": 2.2912, + "step": 72600 + }, + { + "epoch": 0.27084592257708345, + "grad_norm": 0.30275407433509827, + "learning_rate": 0.0006, + "loss": 2.2979, + "step": 72610 + }, + { + "epoch": 0.27088322404004683, + "grad_norm": 0.28674808144569397, + "learning_rate": 0.0006, + "loss": 2.1812, + "step": 72620 + }, + { + "epoch": 0.2709205255030102, + "grad_norm": 0.35606491565704346, + "learning_rate": 0.0006, + "loss": 2.1581, + "step": 72630 + }, + { + "epoch": 0.2709578269659736, + "grad_norm": 0.3565541207790375, + "learning_rate": 0.0006, + "loss": 2.1254, + "step": 72640 + }, + { + "epoch": 0.27099512842893697, + "grad_norm": 0.40270182490348816, + "learning_rate": 0.0006, + "loss": 2.3236, + "step": 72650 + }, + { + "epoch": 0.27103242989190035, + "grad_norm": 0.2814704179763794, + "learning_rate": 0.0006, + "loss": 2.1594, + "step": 72660 + }, + 
{ + "epoch": 0.27106973135486373, + "grad_norm": 0.32268407940864563, + "learning_rate": 0.0006, + "loss": 2.0502, + "step": 72670 + }, + { + "epoch": 0.2711070328178271, + "grad_norm": 0.1978541612625122, + "learning_rate": 0.0006, + "loss": 2.1881, + "step": 72680 + }, + { + "epoch": 0.2711443342807905, + "grad_norm": 0.5874109864234924, + "learning_rate": 0.0006, + "loss": 2.2157, + "step": 72690 + }, + { + "epoch": 0.27118163574375387, + "grad_norm": 0.2948969602584839, + "learning_rate": 0.0006, + "loss": 2.251, + "step": 72700 + }, + { + "epoch": 0.27121893720671725, + "grad_norm": 0.34756532311439514, + "learning_rate": 0.0006, + "loss": 2.0809, + "step": 72710 + }, + { + "epoch": 0.2712562386696806, + "grad_norm": 0.45877528190612793, + "learning_rate": 0.0006, + "loss": 2.2245, + "step": 72720 + }, + { + "epoch": 0.271293540132644, + "grad_norm": 0.334123432636261, + "learning_rate": 0.0006, + "loss": 2.2259, + "step": 72730 + }, + { + "epoch": 0.2713308415956074, + "grad_norm": 0.36423924565315247, + "learning_rate": 0.0006, + "loss": 2.2117, + "step": 72740 + }, + { + "epoch": 0.27136814305857077, + "grad_norm": 0.4326832592487335, + "learning_rate": 0.0006, + "loss": 2.1585, + "step": 72750 + }, + { + "epoch": 0.27136814305857077, + "eval_valid_loss": 2.167672872543335, + "eval_valid_loss/all": 2.0329110622406006, + "eval_valid_loss/end_span": 1.2267205715179443, + "eval_valid_perplexity/batch": 7.636283874511719, + "eval_valid_perplexity/end_span": 3.4100282192230225, + "eval_valid_perplexity/fim": 2.5068087577819824, + "eval_valid_perplexity/first_seq": 15.133940696716309, + "eval_valid_perplexity/last_seq": 8.858527183532715, + "eval_valid_perplexity/second_seq": 13.559205055236816, + "eval_valid_perplexity/seq": 8.61282730102539, + "eval_valid_reconstruction/all": 0.3008420765399933, + "eval_valid_reconstruction/end_span": 0.7076792120933533, + "eval_valid_reconstruction/fim": 0.19070962071418762, + "eval_valid_reconstruction/first_seq": 
0.16023831069469452, + "eval_valid_reconstruction/last_seq": 0.3303530514240265, + "eval_valid_reconstruction/second_seq": 0.19910159707069397, + "eval_valid_runtime": 449.6162, + "eval_valid_samples_per_second": 0.427, + "eval_valid_steps_per_second": 0.427, + "step": 72750 + }, + { + "epoch": 0.27136814305857077, + "eval_train_loss": 2.167856454849243, + "eval_train_loss/all": 2.007053852081299, + "eval_train_loss/end_span": 1.1873339414596558, + "eval_train_perplexity/batch": 7.441361904144287, + "eval_train_perplexity/end_span": 3.278329372406006, + "eval_train_perplexity/fim": 2.303070306777954, + "eval_train_perplexity/first_seq": 15.66845417022705, + "eval_train_perplexity/last_seq": 8.726631164550781, + "eval_train_perplexity/second_seq": 13.97596549987793, + "eval_train_perplexity/seq": 8.568081855773926, + "eval_train_reconstruction/all": 0.2898240089416504, + "eval_train_reconstruction/end_span": 0.7195532917976379, + "eval_train_reconstruction/fim": 0.17222760617733002, + "eval_train_reconstruction/first_seq": 0.1510808914899826, + "eval_train_reconstruction/last_seq": 0.3339575231075287, + "eval_train_reconstruction/second_seq": 0.1875135451555252, + "eval_train_runtime": 445.738, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 72750 + }, + { + "epoch": 0.27140544452153414, + "grad_norm": 0.40041932463645935, + "learning_rate": 0.0006, + "loss": 2.2206, + "step": 72760 + }, + { + "epoch": 0.2714427459844975, + "grad_norm": 0.3591804504394531, + "learning_rate": 0.0006, + "loss": 2.2801, + "step": 72770 + }, + { + "epoch": 0.2714800474474609, + "grad_norm": 0.21042296290397644, + "learning_rate": 0.0006, + "loss": 2.3218, + "step": 72780 + }, + { + "epoch": 0.2715173489104243, + "grad_norm": 0.3254069983959198, + "learning_rate": 0.0006, + "loss": 2.1541, + "step": 72790 + }, + { + "epoch": 0.27155465037338766, + "grad_norm": 0.2933943271636963, + "learning_rate": 0.0006, + "loss": 2.1145, + "step": 72800 + }, 
+ { + "epoch": 0.27159195183635104, + "grad_norm": 0.30363476276397705, + "learning_rate": 0.0006, + "loss": 2.3029, + "step": 72810 + }, + { + "epoch": 0.2716292532993144, + "grad_norm": 0.27159202098846436, + "learning_rate": 0.0006, + "loss": 2.3765, + "step": 72820 + }, + { + "epoch": 0.2716665547622778, + "grad_norm": 0.47068196535110474, + "learning_rate": 0.0006, + "loss": 2.2315, + "step": 72830 + }, + { + "epoch": 0.2717038562252412, + "grad_norm": 0.2977624833583832, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 72840 + }, + { + "epoch": 0.2717411576882045, + "grad_norm": 0.3599001467227936, + "learning_rate": 0.0006, + "loss": 2.2316, + "step": 72850 + }, + { + "epoch": 0.2717784591511679, + "grad_norm": 0.48025622963905334, + "learning_rate": 0.0006, + "loss": 2.1724, + "step": 72860 + }, + { + "epoch": 0.27181576061413126, + "grad_norm": 0.35448014736175537, + "learning_rate": 0.0006, + "loss": 2.1863, + "step": 72870 + }, + { + "epoch": 0.27185306207709464, + "grad_norm": 0.29568955302238464, + "learning_rate": 0.0006, + "loss": 2.1989, + "step": 72880 + }, + { + "epoch": 0.271890363540058, + "grad_norm": 0.3251614272594452, + "learning_rate": 0.0006, + "loss": 2.1337, + "step": 72890 + }, + { + "epoch": 0.2719276650030214, + "grad_norm": 0.29184478521347046, + "learning_rate": 0.0006, + "loss": 2.1406, + "step": 72900 + }, + { + "epoch": 0.2719649664659848, + "grad_norm": 0.3174465596675873, + "learning_rate": 0.0006, + "loss": 2.2127, + "step": 72910 + }, + { + "epoch": 0.27200226792894816, + "grad_norm": 0.29586151242256165, + "learning_rate": 0.0006, + "loss": 2.3392, + "step": 72920 + }, + { + "epoch": 0.27203956939191154, + "grad_norm": 0.26219815015792847, + "learning_rate": 0.0006, + "loss": 2.1626, + "step": 72930 + }, + { + "epoch": 0.2720768708548749, + "grad_norm": 0.49266812205314636, + "learning_rate": 0.0006, + "loss": 2.0414, + "step": 72940 + }, + { + "epoch": 0.2721141723178383, + "grad_norm": 0.3312556743621826, + 
"learning_rate": 0.0006, + "loss": 2.2006, + "step": 72950 + }, + { + "epoch": 0.2721514737808017, + "grad_norm": 0.4902256429195404, + "learning_rate": 0.0006, + "loss": 2.2191, + "step": 72960 + }, + { + "epoch": 0.27218877524376506, + "grad_norm": 0.4771040081977844, + "learning_rate": 0.0006, + "loss": 2.2673, + "step": 72970 + }, + { + "epoch": 0.27222607670672844, + "grad_norm": 0.37191274762153625, + "learning_rate": 0.0006, + "loss": 2.2424, + "step": 72980 + }, + { + "epoch": 0.2722633781696918, + "grad_norm": 0.4135366678237915, + "learning_rate": 0.0006, + "loss": 2.169, + "step": 72990 + }, + { + "epoch": 0.2723006796326552, + "grad_norm": 0.473685085773468, + "learning_rate": 0.0006, + "loss": 1.8941, + "step": 73000 + }, + { + "epoch": 0.2723006796326552, + "eval_valid_loss": 2.1751151084899902, + "eval_valid_loss/all": 2.03987455368042, + "eval_valid_loss/end_span": 1.32237708568573, + "eval_valid_perplexity/batch": 7.6896443367004395, + "eval_valid_perplexity/end_span": 3.7523303031921387, + "eval_valid_perplexity/fim": 2.1493756771087646, + "eval_valid_perplexity/first_seq": 14.922649383544922, + "eval_valid_perplexity/last_seq": 8.417552947998047, + "eval_valid_perplexity/second_seq": 13.989627838134766, + "eval_valid_perplexity/seq": 8.676669120788574, + "eval_valid_reconstruction/all": 0.29867836833000183, + "eval_valid_reconstruction/end_span": 0.6825957894325256, + "eval_valid_reconstruction/fim": 0.1567068099975586, + "eval_valid_reconstruction/first_seq": 0.16477787494659424, + "eval_valid_reconstruction/last_seq": 0.34671953320503235, + "eval_valid_reconstruction/second_seq": 0.18793606758117676, + "eval_valid_runtime": 444.0645, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 73000 + }, + { + "epoch": 0.2723006796326552, + "eval_train_loss": 2.1727938652038574, + "eval_train_loss/all": 2.011582851409912, + "eval_train_loss/end_span": 1.2833398580551147, + "eval_train_perplexity/batch": 
7.47514009475708, + "eval_train_perplexity/end_span": 3.6086721420288086, + "eval_train_perplexity/fim": 2.0587544441223145, + "eval_train_perplexity/first_seq": 15.575154304504395, + "eval_train_perplexity/last_seq": 8.737698554992676, + "eval_train_perplexity/second_seq": 14.356361389160156, + "eval_train_perplexity/seq": 8.608967781066895, + "eval_train_reconstruction/all": 0.28844377398490906, + "eval_train_reconstruction/end_span": 0.6970247030258179, + "eval_train_reconstruction/fim": 0.14788316190242767, + "eval_train_reconstruction/first_seq": 0.15153351426124573, + "eval_train_reconstruction/last_seq": 0.3348236382007599, + "eval_train_reconstruction/second_seq": 0.1772012561559677, + "eval_train_runtime": 447.502, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 73000 + }, + { + "epoch": 0.2723379810956186, + "grad_norm": 0.4683516323566437, + "learning_rate": 0.0006, + "loss": 2.1509, + "step": 73010 + }, + { + "epoch": 0.27237528255858195, + "grad_norm": 0.3700971305370331, + "learning_rate": 0.0006, + "loss": 2.1928, + "step": 73020 + }, + { + "epoch": 0.27241258402154533, + "grad_norm": 0.3304467797279358, + "learning_rate": 0.0006, + "loss": 2.2972, + "step": 73030 + }, + { + "epoch": 0.2724498854845087, + "grad_norm": 0.2759745717048645, + "learning_rate": 0.0006, + "loss": 2.3573, + "step": 73040 + }, + { + "epoch": 0.2724871869474721, + "grad_norm": 0.3277765214443207, + "learning_rate": 0.0006, + "loss": 1.9844, + "step": 73050 + }, + { + "epoch": 0.2725244884104355, + "grad_norm": 0.38547655940055847, + "learning_rate": 0.0006, + "loss": 2.2428, + "step": 73060 + }, + { + "epoch": 0.27256178987339885, + "grad_norm": 0.2669929265975952, + "learning_rate": 0.0006, + "loss": 2.1147, + "step": 73070 + }, + { + "epoch": 0.27259909133636223, + "grad_norm": 0.4548290967941284, + "learning_rate": 0.0006, + "loss": 2.2608, + "step": 73080 + }, + { + "epoch": 0.2726363927993256, + "grad_norm": 
0.29052770137786865, + "learning_rate": 0.0006, + "loss": 2.2413, + "step": 73090 + }, + { + "epoch": 0.272673694262289, + "grad_norm": 0.33146902918815613, + "learning_rate": 0.0006, + "loss": 2.1073, + "step": 73100 + }, + { + "epoch": 0.27271099572525237, + "grad_norm": 0.3660481870174408, + "learning_rate": 0.0006, + "loss": 2.2152, + "step": 73110 + }, + { + "epoch": 0.27274829718821575, + "grad_norm": 0.36779239773750305, + "learning_rate": 0.0006, + "loss": 2.28, + "step": 73120 + }, + { + "epoch": 0.2727855986511791, + "grad_norm": 0.3647671043872833, + "learning_rate": 0.0006, + "loss": 2.3069, + "step": 73130 + }, + { + "epoch": 0.27282290011414245, + "grad_norm": 0.2512661814689636, + "learning_rate": 0.0006, + "loss": 2.368, + "step": 73140 + }, + { + "epoch": 0.27286020157710583, + "grad_norm": 0.5014669299125671, + "learning_rate": 0.0006, + "loss": 2.2687, + "step": 73150 + }, + { + "epoch": 0.2728975030400692, + "grad_norm": 0.29690513014793396, + "learning_rate": 0.0006, + "loss": 2.154, + "step": 73160 + }, + { + "epoch": 0.2729348045030326, + "grad_norm": 0.22328084707260132, + "learning_rate": 0.0006, + "loss": 2.2117, + "step": 73170 + }, + { + "epoch": 0.27297210596599597, + "grad_norm": 0.8518648147583008, + "learning_rate": 0.0006, + "loss": 2.1833, + "step": 73180 + }, + { + "epoch": 0.27300940742895935, + "grad_norm": 0.22720836102962494, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 73190 + }, + { + "epoch": 0.27304670889192273, + "grad_norm": 0.34139832854270935, + "learning_rate": 0.0006, + "loss": 2.281, + "step": 73200 + }, + { + "epoch": 0.2730840103548861, + "grad_norm": 0.5120810270309448, + "learning_rate": 0.0006, + "loss": 2.184, + "step": 73210 + }, + { + "epoch": 0.2731213118178495, + "grad_norm": 0.36050742864608765, + "learning_rate": 0.0006, + "loss": 2.0236, + "step": 73220 + }, + { + "epoch": 0.27315861328081287, + "grad_norm": 0.3052944242954254, + "learning_rate": 0.0006, + "loss": 2.2515, + "step": 73230 + }, 
+ { + "epoch": 0.27319591474377625, + "grad_norm": 0.2517072558403015, + "learning_rate": 0.0006, + "loss": 2.2287, + "step": 73240 + }, + { + "epoch": 0.2732332162067396, + "grad_norm": 0.32433080673217773, + "learning_rate": 0.0006, + "loss": 2.2069, + "step": 73250 + }, + { + "epoch": 0.2732332162067396, + "eval_valid_loss": 2.1705844402313232, + "eval_valid_loss/all": 2.0357112884521484, + "eval_valid_loss/end_span": 1.2818102836608887, + "eval_valid_perplexity/batch": 7.6576972007751465, + "eval_valid_perplexity/end_span": 3.603156566619873, + "eval_valid_perplexity/fim": 2.087700128555298, + "eval_valid_perplexity/first_seq": 15.141277313232422, + "eval_valid_perplexity/last_seq": 8.26984977722168, + "eval_valid_perplexity/second_seq": 13.787410736083984, + "eval_valid_perplexity/seq": 8.63598346710205, + "eval_valid_reconstruction/all": 0.3001599907875061, + "eval_valid_reconstruction/end_span": 0.7074048519134521, + "eval_valid_reconstruction/fim": 0.15132999420166016, + "eval_valid_reconstruction/first_seq": 0.16002462804317474, + "eval_valid_reconstruction/last_seq": 0.35369789600372314, + "eval_valid_reconstruction/second_seq": 0.1953953504562378, + "eval_valid_runtime": 444.4544, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 73250 + }, + { + "epoch": 0.2732332162067396, + "eval_train_loss": 2.170313596725464, + "eval_train_loss/all": 2.0092532634735107, + "eval_train_loss/end_span": 1.2477754354476929, + "eval_train_perplexity/batch": 7.457746505737305, + "eval_train_perplexity/end_span": 3.4825870990753174, + "eval_train_perplexity/fim": 2.582735776901245, + "eval_train_perplexity/first_seq": 15.648932456970215, + "eval_train_perplexity/last_seq": 8.72481918334961, + "eval_train_perplexity/second_seq": 14.192989349365234, + "eval_train_perplexity/seq": 8.584328651428223, + "eval_train_reconstruction/all": 0.28906649351119995, + "eval_train_reconstruction/end_span": 0.714722216129303, + 
"eval_train_reconstruction/fim": 0.19496607780456543, + "eval_train_reconstruction/first_seq": 0.14891955256462097, + "eval_train_reconstruction/last_seq": 0.33548569679260254, + "eval_train_reconstruction/second_seq": 0.18521764874458313, + "eval_train_runtime": 451.5777, + "eval_train_samples_per_second": 0.425, + "eval_train_steps_per_second": 0.425, + "step": 73250 + }, + { + "epoch": 0.273270517669703, + "grad_norm": 0.3181588649749756, + "learning_rate": 0.0006, + "loss": 2.2554, + "step": 73260 + }, + { + "epoch": 0.2733078191326664, + "grad_norm": 0.4701465666294098, + "learning_rate": 0.0006, + "loss": 2.1535, + "step": 73270 + }, + { + "epoch": 0.27334512059562976, + "grad_norm": 0.4112912714481354, + "learning_rate": 0.0006, + "loss": 2.2748, + "step": 73280 + }, + { + "epoch": 0.27338242205859314, + "grad_norm": 0.3017160892486572, + "learning_rate": 0.0006, + "loss": 2.3742, + "step": 73290 + }, + { + "epoch": 0.2734197235215565, + "grad_norm": 0.2090490609407425, + "learning_rate": 0.0006, + "loss": 2.2288, + "step": 73300 + }, + { + "epoch": 0.2734570249845199, + "grad_norm": 0.3288210332393646, + "learning_rate": 0.0006, + "loss": 1.9619, + "step": 73310 + }, + { + "epoch": 0.2734943264474833, + "grad_norm": 0.2604547441005707, + "learning_rate": 0.0006, + "loss": 2.246, + "step": 73320 + }, + { + "epoch": 0.27353162791044666, + "grad_norm": 0.40298792719841003, + "learning_rate": 0.0006, + "loss": 2.331, + "step": 73330 + }, + { + "epoch": 0.27356892937341004, + "grad_norm": 0.25402024388313293, + "learning_rate": 0.0006, + "loss": 2.3148, + "step": 73340 + }, + { + "epoch": 0.2736062308363734, + "grad_norm": 0.22154290974140167, + "learning_rate": 0.0006, + "loss": 2.2478, + "step": 73350 + }, + { + "epoch": 0.2736435322993368, + "grad_norm": 0.29321378469467163, + "learning_rate": 0.0006, + "loss": 2.2421, + "step": 73360 + }, + { + "epoch": 0.2736808337623002, + "grad_norm": 0.31261059641838074, + "learning_rate": 0.0006, + "loss": 2.1321, + 
"step": 73370 + }, + { + "epoch": 0.27371813522526356, + "grad_norm": 0.304656445980072, + "learning_rate": 0.0006, + "loss": 2.274, + "step": 73380 + }, + { + "epoch": 0.27375543668822694, + "grad_norm": 0.32280999422073364, + "learning_rate": 0.0006, + "loss": 2.1499, + "step": 73390 + }, + { + "epoch": 0.27379273815119026, + "grad_norm": 0.24027420580387115, + "learning_rate": 0.0006, + "loss": 2.1778, + "step": 73400 + }, + { + "epoch": 0.27383003961415364, + "grad_norm": 0.28386953473091125, + "learning_rate": 0.0006, + "loss": 2.1357, + "step": 73410 + }, + { + "epoch": 0.273867341077117, + "grad_norm": 0.5291633009910583, + "learning_rate": 0.0006, + "loss": 2.3448, + "step": 73420 + }, + { + "epoch": 0.2739046425400804, + "grad_norm": 0.4290693998336792, + "learning_rate": 0.0006, + "loss": 2.1451, + "step": 73430 + }, + { + "epoch": 0.2739419440030438, + "grad_norm": 0.2519432306289673, + "learning_rate": 0.0006, + "loss": 2.2525, + "step": 73440 + }, + { + "epoch": 0.27397924546600716, + "grad_norm": 0.49465420842170715, + "learning_rate": 0.0006, + "loss": 2.0736, + "step": 73450 + }, + { + "epoch": 0.27401654692897054, + "grad_norm": 0.24785536527633667, + "learning_rate": 0.0006, + "loss": 2.14, + "step": 73460 + }, + { + "epoch": 0.2740538483919339, + "grad_norm": 0.30458638072013855, + "learning_rate": 0.0006, + "loss": 2.1785, + "step": 73470 + }, + { + "epoch": 0.2740911498548973, + "grad_norm": 0.2284933477640152, + "learning_rate": 0.0006, + "loss": 2.1117, + "step": 73480 + }, + { + "epoch": 0.2741284513178607, + "grad_norm": 0.2641538083553314, + "learning_rate": 0.0006, + "loss": 2.3269, + "step": 73490 + }, + { + "epoch": 0.27416575278082406, + "grad_norm": 0.31528282165527344, + "learning_rate": 0.0006, + "loss": 2.2493, + "step": 73500 + }, + { + "epoch": 0.27416575278082406, + "eval_valid_loss": 2.1751551628112793, + "eval_valid_loss/all": 2.0399036407470703, + "eval_valid_loss/end_span": 1.178372859954834, + "eval_valid_perplexity/batch": 
7.689867973327637, + "eval_valid_perplexity/end_span": 3.2490832805633545, + "eval_valid_perplexity/fim": 2.0182478427886963, + "eval_valid_perplexity/first_seq": 15.003055572509766, + "eval_valid_perplexity/last_seq": 8.559585571289062, + "eval_valid_perplexity/second_seq": 13.49489688873291, + "eval_valid_perplexity/seq": 8.67530345916748, + "eval_valid_reconstruction/all": 0.2986146807670593, + "eval_valid_reconstruction/end_span": 0.7226594686508179, + "eval_valid_reconstruction/fim": 0.14338240027427673, + "eval_valid_reconstruction/first_seq": 0.1648857742547989, + "eval_valid_reconstruction/last_seq": 0.34005382657051086, + "eval_valid_reconstruction/second_seq": 0.20290841162204742, + "eval_valid_runtime": 446.0987, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 73500 + }, + { + "epoch": 0.27416575278082406, + "eval_train_loss": 2.17276668548584, + "eval_train_loss/all": 2.0110960006713867, + "eval_train_loss/end_span": 1.1413980722427368, + "eval_train_perplexity/batch": 7.47150182723999, + "eval_train_perplexity/end_span": 3.1311428546905518, + "eval_train_perplexity/fim": 2.067681074142456, + "eval_train_perplexity/first_seq": 15.395217895507812, + "eval_train_perplexity/last_seq": 8.828591346740723, + "eval_train_perplexity/second_seq": 14.289294242858887, + "eval_train_perplexity/seq": 8.603652000427246, + "eval_train_reconstruction/all": 0.2885746359825134, + "eval_train_reconstruction/end_span": 0.7349753975868225, + "eval_train_reconstruction/fim": 0.14863383769989014, + "eval_train_reconstruction/first_seq": 0.15469257533550262, + "eval_train_reconstruction/last_seq": 0.33242398500442505, + "eval_train_reconstruction/second_seq": 0.1819368451833725, + "eval_train_runtime": 445.7062, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 73500 + }, + { + "epoch": 0.27420305424378744, + "grad_norm": 3.8439512252807617, + "learning_rate": 0.0006, + "loss": 2.319, + "step": 
73510 + }, + { + "epoch": 0.2742403557067508, + "grad_norm": 0.22594906389713287, + "learning_rate": 0.0006, + "loss": 2.0008, + "step": 73520 + }, + { + "epoch": 0.2742776571697142, + "grad_norm": 0.3294510543346405, + "learning_rate": 0.0006, + "loss": 2.3453, + "step": 73530 + }, + { + "epoch": 0.2743149586326776, + "grad_norm": 1.0886116027832031, + "learning_rate": 0.0006, + "loss": 2.1461, + "step": 73540 + }, + { + "epoch": 0.27435226009564095, + "grad_norm": 0.34480318427085876, + "learning_rate": 0.0006, + "loss": 2.264, + "step": 73550 + }, + { + "epoch": 0.27438956155860433, + "grad_norm": 0.34966933727264404, + "learning_rate": 0.0006, + "loss": 2.1363, + "step": 73560 + }, + { + "epoch": 0.2744268630215677, + "grad_norm": 1.3274110555648804, + "learning_rate": 0.0006, + "loss": 2.2215, + "step": 73570 + }, + { + "epoch": 0.2744641644845311, + "grad_norm": 0.39433515071868896, + "learning_rate": 0.0006, + "loss": 2.2257, + "step": 73580 + }, + { + "epoch": 0.27450146594749447, + "grad_norm": 0.3229996860027313, + "learning_rate": 0.0006, + "loss": 2.0719, + "step": 73590 + }, + { + "epoch": 0.27453876741045785, + "grad_norm": 0.4135199189186096, + "learning_rate": 0.0006, + "loss": 2.2365, + "step": 73600 + }, + { + "epoch": 0.27457606887342123, + "grad_norm": 0.29664692282676697, + "learning_rate": 0.0006, + "loss": 2.2783, + "step": 73610 + }, + { + "epoch": 0.2746133703363846, + "grad_norm": 0.4053618609905243, + "learning_rate": 0.0006, + "loss": 2.2966, + "step": 73620 + }, + { + "epoch": 0.274650671799348, + "grad_norm": 0.26225706934928894, + "learning_rate": 0.0006, + "loss": 2.3276, + "step": 73630 + }, + { + "epoch": 0.27468797326231137, + "grad_norm": 0.27485546469688416, + "learning_rate": 0.0006, + "loss": 2.3181, + "step": 73640 + }, + { + "epoch": 0.27472527472527475, + "grad_norm": 0.4235851466655731, + "learning_rate": 0.0006, + "loss": 2.1705, + "step": 73650 + }, + { + "epoch": 0.27476257618823813, + "grad_norm": 0.3250933289527893, + 
"learning_rate": 0.0006, + "loss": 2.1589, + "step": 73660 + }, + { + "epoch": 0.2747998776512015, + "grad_norm": 0.3405730128288269, + "learning_rate": 0.0006, + "loss": 2.1076, + "step": 73670 + }, + { + "epoch": 0.27483717911416483, + "grad_norm": 0.3271826207637787, + "learning_rate": 0.0006, + "loss": 1.9616, + "step": 73680 + }, + { + "epoch": 0.2748744805771282, + "grad_norm": 0.42971205711364746, + "learning_rate": 0.0006, + "loss": 2.0875, + "step": 73690 + }, + { + "epoch": 0.2749117820400916, + "grad_norm": 0.2965017557144165, + "learning_rate": 0.0006, + "loss": 2.0759, + "step": 73700 + }, + { + "epoch": 0.27494908350305497, + "grad_norm": 4.904737949371338, + "learning_rate": 0.0006, + "loss": 2.2651, + "step": 73710 + }, + { + "epoch": 0.27498638496601835, + "grad_norm": 0.22409150004386902, + "learning_rate": 0.0006, + "loss": 2.1572, + "step": 73720 + }, + { + "epoch": 0.27502368642898173, + "grad_norm": 0.36555466055870056, + "learning_rate": 0.0006, + "loss": 2.0914, + "step": 73730 + }, + { + "epoch": 0.2750609878919451, + "grad_norm": 0.31730756163597107, + "learning_rate": 0.0006, + "loss": 2.2372, + "step": 73740 + }, + { + "epoch": 0.2750982893549085, + "grad_norm": 0.22477620840072632, + "learning_rate": 0.0006, + "loss": 2.1054, + "step": 73750 + }, + { + "epoch": 0.2750982893549085, + "eval_valid_loss": 2.172475814819336, + "eval_valid_loss/all": 2.0376641750335693, + "eval_valid_loss/end_span": 1.238178014755249, + "eval_valid_perplexity/batch": 7.672666072845459, + "eval_valid_perplexity/end_span": 3.4493231773376465, + "eval_valid_perplexity/fim": 2.7631616592407227, + "eval_valid_perplexity/first_seq": 15.395801544189453, + "eval_valid_perplexity/last_seq": 8.821300506591797, + "eval_valid_perplexity/second_seq": 13.538431167602539, + "eval_valid_perplexity/seq": 8.6609468460083, + "eval_valid_reconstruction/all": 0.2992725968360901, + "eval_valid_reconstruction/end_span": 0.7096816301345825, + "eval_valid_reconstruction/fim": 
0.20811884105205536, + "eval_valid_reconstruction/first_seq": 0.15559829771518707, + "eval_valid_reconstruction/last_seq": 0.33341091871261597, + "eval_valid_reconstruction/second_seq": 0.20291905105113983, + "eval_valid_runtime": 444.053, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 73750 + }, + { + "epoch": 0.2750982893549085, + "eval_train_loss": 2.17031192779541, + "eval_train_loss/all": 2.009549140930176, + "eval_train_loss/end_span": 1.2050063610076904, + "eval_train_perplexity/batch": 7.459953308105469, + "eval_train_perplexity/end_span": 3.336780309677124, + "eval_train_perplexity/fim": 1.9864082336425781, + "eval_train_perplexity/first_seq": 15.355971336364746, + "eval_train_perplexity/last_seq": 8.791081428527832, + "eval_train_perplexity/second_seq": 14.462141036987305, + "eval_train_perplexity/seq": 8.594128608703613, + "eval_train_reconstruction/all": 0.28875496983528137, + "eval_train_reconstruction/end_span": 0.7180012464523315, + "eval_train_reconstruction/fim": 0.14055165648460388, + "eval_train_reconstruction/first_seq": 0.15322892367839813, + "eval_train_reconstruction/last_seq": 0.3255976438522339, + "eval_train_reconstruction/second_seq": 0.17959751188755035, + "eval_train_runtime": 447.8172, + "eval_train_samples_per_second": 0.429, + "eval_train_steps_per_second": 0.429, + "step": 73750 + }, + { + "epoch": 0.27513559081787187, + "grad_norm": 0.28171440958976746, + "learning_rate": 0.0006, + "loss": 2.2938, + "step": 73760 + }, + { + "epoch": 0.27517289228083525, + "grad_norm": 0.43896791338920593, + "learning_rate": 0.0006, + "loss": 2.1571, + "step": 73770 + }, + { + "epoch": 0.2752101937437986, + "grad_norm": 2.2759034633636475, + "learning_rate": 0.0006, + "loss": 2.2413, + "step": 73780 + }, + { + "epoch": 0.275247495206762, + "grad_norm": 0.3102523684501648, + "learning_rate": 0.0006, + "loss": 2.3049, + "step": 73790 + }, + { + "epoch": 0.2752847966697254, + "grad_norm": 0.3964422345161438, 
+ "learning_rate": 0.0006, + "loss": 1.9844, + "step": 73800 + }, + { + "epoch": 0.27532209813268876, + "grad_norm": 0.28040188550949097, + "learning_rate": 0.0006, + "loss": 2.316, + "step": 73810 + }, + { + "epoch": 0.27535939959565214, + "grad_norm": 1.5804866552352905, + "learning_rate": 0.0006, + "loss": 2.2225, + "step": 73820 + }, + { + "epoch": 0.2753967010586155, + "grad_norm": 0.41356220841407776, + "learning_rate": 0.0006, + "loss": 2.274, + "step": 73830 + }, + { + "epoch": 0.2754340025215789, + "grad_norm": 0.36007651686668396, + "learning_rate": 0.0006, + "loss": 2.0884, + "step": 73840 + }, + { + "epoch": 0.2754713039845423, + "grad_norm": 0.41850197315216064, + "learning_rate": 0.0006, + "loss": 2.143, + "step": 73850 + }, + { + "epoch": 0.27550860544750566, + "grad_norm": 0.27965307235717773, + "learning_rate": 0.0006, + "loss": 2.1094, + "step": 73860 + }, + { + "epoch": 0.27554590691046904, + "grad_norm": 0.3202695846557617, + "learning_rate": 0.0006, + "loss": 2.3196, + "step": 73870 + }, + { + "epoch": 0.2755832083734324, + "grad_norm": 0.2657283842563629, + "learning_rate": 0.0006, + "loss": 2.2354, + "step": 73880 + }, + { + "epoch": 0.2756205098363958, + "grad_norm": 0.3609267473220825, + "learning_rate": 0.0006, + "loss": 2.1702, + "step": 73890 + }, + { + "epoch": 0.2756578112993592, + "grad_norm": 0.4999746084213257, + "learning_rate": 0.0006, + "loss": 2.0291, + "step": 73900 + }, + { + "epoch": 0.27569511276232256, + "grad_norm": 0.5732814073562622, + "learning_rate": 0.0006, + "loss": 2.2776, + "step": 73910 + }, + { + "epoch": 0.27573241422528594, + "grad_norm": 0.30241939425468445, + "learning_rate": 0.0006, + "loss": 2.2058, + "step": 73920 + }, + { + "epoch": 0.2757697156882493, + "grad_norm": 0.275706946849823, + "learning_rate": 0.0006, + "loss": 2.2703, + "step": 73930 + }, + { + "epoch": 0.2758070171512127, + "grad_norm": 0.43934834003448486, + "learning_rate": 0.0006, + "loss": 2.2657, + "step": 73940 + }, + { + "epoch": 
0.275844318614176, + "grad_norm": 0.32628029584884644, + "learning_rate": 0.0006, + "loss": 2.3098, + "step": 73950 + }, + { + "epoch": 0.2758816200771394, + "grad_norm": 0.2817137837409973, + "learning_rate": 0.0006, + "loss": 2.1044, + "step": 73960 + }, + { + "epoch": 0.2759189215401028, + "grad_norm": 0.4237329661846161, + "learning_rate": 0.0006, + "loss": 2.1054, + "step": 73970 + }, + { + "epoch": 0.27595622300306616, + "grad_norm": 0.3919408619403839, + "learning_rate": 0.0006, + "loss": 2.362, + "step": 73980 + }, + { + "epoch": 0.27599352446602954, + "grad_norm": 0.43478724360466003, + "learning_rate": 0.0006, + "loss": 2.1819, + "step": 73990 + }, + { + "epoch": 0.2760308259289929, + "grad_norm": 0.2695055305957794, + "learning_rate": 0.0006, + "loss": 2.2811, + "step": 74000 + }, + { + "epoch": 0.2760308259289929, + "eval_valid_loss": 2.173694610595703, + "eval_valid_loss/all": 2.038524627685547, + "eval_valid_loss/end_span": 1.1904804706573486, + "eval_valid_perplexity/batch": 7.679271221160889, + "eval_valid_perplexity/end_span": 3.288661003112793, + "eval_valid_perplexity/fim": 2.5525856018066406, + "eval_valid_perplexity/first_seq": 15.035748481750488, + "eval_valid_perplexity/last_seq": 8.996990203857422, + "eval_valid_perplexity/second_seq": 12.883639335632324, + "eval_valid_perplexity/seq": 8.663287162780762, + "eval_valid_reconstruction/all": 0.2988955080509186, + "eval_valid_reconstruction/end_span": 0.7211082577705383, + "eval_valid_reconstruction/fim": 0.19187307357788086, + "eval_valid_reconstruction/first_seq": 0.16491135954856873, + "eval_valid_reconstruction/last_seq": 0.32840999960899353, + "eval_valid_reconstruction/second_seq": 0.2198163866996765, + "eval_valid_runtime": 445.4815, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 74000 + }, + { + "epoch": 0.2760308259289929, + "eval_train_loss": 2.170945644378662, + "eval_train_loss/all": 2.0098226070404053, + "eval_train_loss/end_span": 
1.153947353363037, + "eval_train_perplexity/batch": 7.46199369430542, + "eval_train_perplexity/end_span": 3.1706840991973877, + "eval_train_perplexity/fim": 2.1272478103637695, + "eval_train_perplexity/first_seq": 15.670893669128418, + "eval_train_perplexity/last_seq": 8.716605186462402, + "eval_train_perplexity/second_seq": 14.412323951721191, + "eval_train_perplexity/seq": 8.59372615814209, + "eval_train_reconstruction/all": 0.2886277735233307, + "eval_train_reconstruction/end_span": 0.7330309152603149, + "eval_train_reconstruction/fim": 0.1555069237947464, + "eval_train_reconstruction/first_seq": 0.14640262722969055, + "eval_train_reconstruction/last_seq": 0.33302950859069824, + "eval_train_reconstruction/second_seq": 0.17771916091442108, + "eval_train_runtime": 445.525, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 74000 + }, + { + "epoch": 0.2760681273919563, + "grad_norm": 0.6899027824401855, + "learning_rate": 0.0006, + "loss": 2.1498, + "step": 74010 + }, + { + "epoch": 0.2761054288549197, + "grad_norm": 0.2838856875896454, + "learning_rate": 0.0006, + "loss": 2.2815, + "step": 74020 + }, + { + "epoch": 0.27614273031788306, + "grad_norm": 0.29656797647476196, + "learning_rate": 0.0006, + "loss": 2.2591, + "step": 74030 + }, + { + "epoch": 0.27618003178084644, + "grad_norm": 0.30813050270080566, + "learning_rate": 0.0006, + "loss": 1.9663, + "step": 74040 + }, + { + "epoch": 0.2762173332438098, + "grad_norm": 0.358619749546051, + "learning_rate": 0.0006, + "loss": 2.1917, + "step": 74050 + }, + { + "epoch": 0.2762546347067732, + "grad_norm": 0.3466036319732666, + "learning_rate": 0.0006, + "loss": 2.1208, + "step": 74060 + }, + { + "epoch": 0.2762919361697366, + "grad_norm": 0.3453007638454437, + "learning_rate": 0.0006, + "loss": 2.0913, + "step": 74070 + }, + { + "epoch": 0.27632923763269995, + "grad_norm": 0.25096482038497925, + "learning_rate": 0.0006, + "loss": 2.3257, + "step": 74080 + }, + { + "epoch": 
0.27636653909566333, + "grad_norm": 0.32108986377716064, + "learning_rate": 0.0006, + "loss": 2.389, + "step": 74090 + }, + { + "epoch": 0.2764038405586267, + "grad_norm": 0.3445339500904083, + "learning_rate": 0.0006, + "loss": 2.3261, + "step": 74100 + }, + { + "epoch": 0.2764411420215901, + "grad_norm": 0.2986460328102112, + "learning_rate": 0.0006, + "loss": 2.3335, + "step": 74110 + }, + { + "epoch": 0.27647844348455347, + "grad_norm": 0.29969412088394165, + "learning_rate": 0.0006, + "loss": 2.2302, + "step": 74120 + }, + { + "epoch": 0.27651574494751685, + "grad_norm": 0.31114864349365234, + "learning_rate": 0.0006, + "loss": 2.1816, + "step": 74130 + }, + { + "epoch": 0.27655304641048023, + "grad_norm": 0.3224421739578247, + "learning_rate": 0.0006, + "loss": 2.1658, + "step": 74140 + }, + { + "epoch": 0.2765903478734436, + "grad_norm": 0.333607017993927, + "learning_rate": 0.0006, + "loss": 2.3271, + "step": 74150 + }, + { + "epoch": 0.276627649336407, + "grad_norm": 0.3042415678501129, + "learning_rate": 0.0006, + "loss": 2.1775, + "step": 74160 + }, + { + "epoch": 0.27666495079937037, + "grad_norm": 0.2943926453590393, + "learning_rate": 0.0006, + "loss": 2.2413, + "step": 74170 + }, + { + "epoch": 0.27670225226233375, + "grad_norm": 0.2737027406692505, + "learning_rate": 0.0006, + "loss": 2.2663, + "step": 74180 + }, + { + "epoch": 0.2767395537252971, + "grad_norm": 0.2683679461479187, + "learning_rate": 0.0006, + "loss": 2.161, + "step": 74190 + }, + { + "epoch": 0.2767768551882605, + "grad_norm": 0.2968566417694092, + "learning_rate": 0.0006, + "loss": 2.1197, + "step": 74200 + }, + { + "epoch": 0.2768141566512239, + "grad_norm": 0.3452818989753723, + "learning_rate": 0.0006, + "loss": 2.204, + "step": 74210 + }, + { + "epoch": 0.27685145811418727, + "grad_norm": 0.342287540435791, + "learning_rate": 0.0006, + "loss": 2.061, + "step": 74220 + }, + { + "epoch": 0.2768887595771506, + "grad_norm": 0.2702382802963257, + "learning_rate": 0.0006, + "loss": 
2.2468, + "step": 74230 + }, + { + "epoch": 0.27692606104011397, + "grad_norm": 0.25371429324150085, + "learning_rate": 0.0006, + "loss": 2.2165, + "step": 74240 + }, + { + "epoch": 0.27696336250307735, + "grad_norm": 0.3041655123233795, + "learning_rate": 0.0006, + "loss": 2.076, + "step": 74250 + }, + { + "epoch": 0.27696336250307735, + "eval_valid_loss": 2.178429365158081, + "eval_valid_loss/all": 2.043541193008423, + "eval_valid_loss/end_span": 1.3164823055267334, + "eval_valid_perplexity/batch": 7.717891216278076, + "eval_valid_perplexity/end_span": 3.730276346206665, + "eval_valid_perplexity/fim": 2.065422534942627, + "eval_valid_perplexity/first_seq": 14.709863662719727, + "eval_valid_perplexity/last_seq": 8.632539749145508, + "eval_valid_perplexity/second_seq": 13.345081329345703, + "eval_valid_perplexity/seq": 8.725584030151367, + "eval_valid_reconstruction/all": 0.29712381958961487, + "eval_valid_reconstruction/end_span": 0.6986109018325806, + "eval_valid_reconstruction/fim": 0.14760516583919525, + "eval_valid_reconstruction/first_seq": 0.17280122637748718, + "eval_valid_reconstruction/last_seq": 0.33955448865890503, + "eval_valid_reconstruction/second_seq": 0.20675241947174072, + "eval_valid_runtime": 451.5866, + "eval_valid_samples_per_second": 0.425, + "eval_valid_steps_per_second": 0.425, + "step": 74250 + }, + { + "epoch": 0.27696336250307735, + "eval_train_loss": 2.174989938735962, + "eval_train_loss/all": 2.012355327606201, + "eval_train_loss/end_span": 1.2798645496368408, + "eval_train_perplexity/batch": 7.480916500091553, + "eval_train_perplexity/end_span": 3.5961525440216064, + "eval_train_perplexity/fim": 2.101060628890991, + "eval_train_perplexity/first_seq": 15.424084663391113, + "eval_train_perplexity/last_seq": 8.687870979309082, + "eval_train_perplexity/second_seq": 14.371745109558105, + "eval_train_perplexity/seq": 8.601598739624023, + "eval_train_reconstruction/all": 0.2879115045070648, + "eval_train_reconstruction/end_span": 
0.7086433172225952, + "eval_train_reconstruction/fim": 0.15128281712532043, + "eval_train_reconstruction/first_seq": 0.15011219680309296, + "eval_train_reconstruction/last_seq": 0.33435243368148804, + "eval_train_reconstruction/second_seq": 0.17952756583690643, + "eval_train_runtime": 445.0115, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 74250 + }, + { + "epoch": 0.2770006639660407, + "grad_norm": 0.28726375102996826, + "learning_rate": 0.0006, + "loss": 2.1016, + "step": 74260 + }, + { + "epoch": 0.2770379654290041, + "grad_norm": 0.27529558539390564, + "learning_rate": 0.0006, + "loss": 2.3302, + "step": 74270 + }, + { + "epoch": 0.2770752668919675, + "grad_norm": 0.29642972350120544, + "learning_rate": 0.0006, + "loss": 2.327, + "step": 74280 + }, + { + "epoch": 0.27711256835493087, + "grad_norm": 0.659103512763977, + "learning_rate": 0.0006, + "loss": 2.33, + "step": 74290 + }, + { + "epoch": 0.27714986981789425, + "grad_norm": 0.33237770199775696, + "learning_rate": 0.0006, + "loss": 2.2139, + "step": 74300 + }, + { + "epoch": 0.2771871712808576, + "grad_norm": 0.4162044823169708, + "learning_rate": 0.0006, + "loss": 2.2487, + "step": 74310 + }, + { + "epoch": 0.277224472743821, + "grad_norm": 0.4948231875896454, + "learning_rate": 0.0006, + "loss": 2.1736, + "step": 74320 + }, + { + "epoch": 0.2772617742067844, + "grad_norm": 0.3173467218875885, + "learning_rate": 0.0006, + "loss": 2.1622, + "step": 74330 + }, + { + "epoch": 0.27729907566974776, + "grad_norm": 0.4163627624511719, + "learning_rate": 0.0006, + "loss": 2.2046, + "step": 74340 + }, + { + "epoch": 0.27733637713271114, + "grad_norm": 0.37166523933410645, + "learning_rate": 0.0006, + "loss": 2.2114, + "step": 74350 + }, + { + "epoch": 0.2773736785956745, + "grad_norm": 0.2907128632068634, + "learning_rate": 0.0006, + "loss": 2.2848, + "step": 74360 + }, + { + "epoch": 0.2774109800586379, + "grad_norm": 0.2706395089626312, + "learning_rate": 0.0006, + 
"loss": 2.1885, + "step": 74370 + }, + { + "epoch": 0.2774482815216013, + "grad_norm": 0.31767380237579346, + "learning_rate": 0.0006, + "loss": 2.0855, + "step": 74380 + }, + { + "epoch": 0.27748558298456466, + "grad_norm": 0.5821526646614075, + "learning_rate": 0.0006, + "loss": 2.0161, + "step": 74390 + }, + { + "epoch": 0.27752288444752804, + "grad_norm": 0.2629508078098297, + "learning_rate": 0.0006, + "loss": 2.2703, + "step": 74400 + }, + { + "epoch": 0.2775601859104914, + "grad_norm": 0.37413620948791504, + "learning_rate": 0.0006, + "loss": 2.2401, + "step": 74410 + }, + { + "epoch": 0.2775974873734548, + "grad_norm": 0.3495366871356964, + "learning_rate": 0.0006, + "loss": 2.2095, + "step": 74420 + }, + { + "epoch": 0.2776347888364182, + "grad_norm": 0.39046597480773926, + "learning_rate": 0.0006, + "loss": 2.245, + "step": 74430 + }, + { + "epoch": 0.27767209029938156, + "grad_norm": 0.3114532232284546, + "learning_rate": 0.0006, + "loss": 2.3534, + "step": 74440 + }, + { + "epoch": 0.27770939176234494, + "grad_norm": 0.46329012513160706, + "learning_rate": 0.0006, + "loss": 2.1934, + "step": 74450 + }, + { + "epoch": 0.2777466932253083, + "grad_norm": 0.5839083194732666, + "learning_rate": 0.0006, + "loss": 2.2464, + "step": 74460 + }, + { + "epoch": 0.2777839946882717, + "grad_norm": 0.43849679827690125, + "learning_rate": 0.0006, + "loss": 2.2885, + "step": 74470 + }, + { + "epoch": 0.2778212961512351, + "grad_norm": 0.3215157091617584, + "learning_rate": 0.0006, + "loss": 2.3275, + "step": 74480 + }, + { + "epoch": 0.27785859761419845, + "grad_norm": 0.3870348036289215, + "learning_rate": 0.0006, + "loss": 2.2564, + "step": 74490 + }, + { + "epoch": 0.2778958990771618, + "grad_norm": 0.22930744290351868, + "learning_rate": 0.0006, + "loss": 2.1906, + "step": 74500 + }, + { + "epoch": 0.2778958990771618, + "eval_valid_loss": 2.1703379154205322, + "eval_valid_loss/all": 2.0354788303375244, + "eval_valid_loss/end_span": 1.202729344367981, + 
"eval_valid_perplexity/batch": 7.655917167663574, + "eval_valid_perplexity/end_span": 3.329190969467163, + "eval_valid_perplexity/fim": 2.726393699645996, + "eval_valid_perplexity/first_seq": 14.836609840393066, + "eval_valid_perplexity/last_seq": 8.568103790283203, + "eval_valid_perplexity/second_seq": 13.730711936950684, + "eval_valid_perplexity/seq": 8.63312816619873, + "eval_valid_reconstruction/all": 0.29962053894996643, + "eval_valid_reconstruction/end_span": 0.7158915400505066, + "eval_valid_reconstruction/fim": 0.20574302971363068, + "eval_valid_reconstruction/first_seq": 0.16547170281410217, + "eval_valid_reconstruction/last_seq": 0.34108060598373413, + "eval_valid_reconstruction/second_seq": 0.20100295543670654, + "eval_valid_runtime": 447.9885, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 74500 + }, + { + "epoch": 0.2778958990771618, + "eval_train_loss": 2.1665985584259033, + "eval_train_loss/all": 2.005671262741089, + "eval_train_loss/end_span": 1.1645301580429077, + "eval_train_perplexity/batch": 7.431080341339111, + "eval_train_perplexity/end_span": 3.2044169902801514, + "eval_train_perplexity/fim": 2.049928903579712, + "eval_train_perplexity/first_seq": 15.37572956085205, + "eval_train_perplexity/last_seq": 8.09463119506836, + "eval_train_perplexity/second_seq": 14.009987831115723, + "eval_train_perplexity/seq": 8.551240921020508, + "eval_train_reconstruction/all": 0.2899014353752136, + "eval_train_reconstruction/end_span": 0.7291910648345947, + "eval_train_reconstruction/fim": 0.14736157655715942, + "eval_train_reconstruction/first_seq": 0.1553840935230255, + "eval_train_reconstruction/last_seq": 0.3545723855495453, + "eval_train_reconstruction/second_seq": 0.18651613593101501, + "eval_train_runtime": 446.1679, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 74500 + }, + { + "epoch": 0.27793320054012516, + "grad_norm": 0.38194936513900757, + "learning_rate": 
0.0006, + "loss": 2.0144, + "step": 74510 + }, + { + "epoch": 0.27797050200308854, + "grad_norm": 0.24631938338279724, + "learning_rate": 0.0006, + "loss": 2.3132, + "step": 74520 + }, + { + "epoch": 0.2780078034660519, + "grad_norm": 0.3126868009567261, + "learning_rate": 0.0006, + "loss": 2.224, + "step": 74530 + }, + { + "epoch": 0.2780451049290153, + "grad_norm": 0.4142298996448517, + "learning_rate": 0.0006, + "loss": 2.4647, + "step": 74540 + }, + { + "epoch": 0.2780824063919787, + "grad_norm": 0.36814025044441223, + "learning_rate": 0.0006, + "loss": 2.2691, + "step": 74550 + }, + { + "epoch": 0.27811970785494206, + "grad_norm": 0.36388418078422546, + "learning_rate": 0.0006, + "loss": 2.1717, + "step": 74560 + }, + { + "epoch": 0.27815700931790543, + "grad_norm": 0.37681540846824646, + "learning_rate": 0.0006, + "loss": 2.3126, + "step": 74570 + }, + { + "epoch": 0.2781943107808688, + "grad_norm": 0.301016241312027, + "learning_rate": 0.0006, + "loss": 2.4662, + "step": 74580 + }, + { + "epoch": 0.2782316122438322, + "grad_norm": 0.34114232659339905, + "learning_rate": 0.0006, + "loss": 2.1859, + "step": 74590 + }, + { + "epoch": 0.2782689137067956, + "grad_norm": 0.3044298589229584, + "learning_rate": 0.0006, + "loss": 2.2645, + "step": 74600 + }, + { + "epoch": 0.27830621516975895, + "grad_norm": 0.44104060530662537, + "learning_rate": 0.0006, + "loss": 2.1232, + "step": 74610 + }, + { + "epoch": 0.27834351663272233, + "grad_norm": 0.31532034277915955, + "learning_rate": 0.0006, + "loss": 2.1182, + "step": 74620 + }, + { + "epoch": 0.2783808180956857, + "grad_norm": 0.28038331866264343, + "learning_rate": 0.0006, + "loss": 2.1355, + "step": 74630 + }, + { + "epoch": 0.2784181195586491, + "grad_norm": 0.34122931957244873, + "learning_rate": 0.0006, + "loss": 2.0351, + "step": 74640 + }, + { + "epoch": 0.27845542102161247, + "grad_norm": 0.25261953473091125, + "learning_rate": 0.0006, + "loss": 2.2055, + "step": 74650 + }, + { + "epoch": 
0.27849272248457585, + "grad_norm": 0.2828962802886963, + "learning_rate": 0.0006, + "loss": 2.0995, + "step": 74660 + }, + { + "epoch": 0.27853002394753923, + "grad_norm": 0.4401315450668335, + "learning_rate": 0.0006, + "loss": 2.2208, + "step": 74670 + }, + { + "epoch": 0.2785673254105026, + "grad_norm": 0.2827627956867218, + "learning_rate": 0.0006, + "loss": 2.2769, + "step": 74680 + }, + { + "epoch": 0.278604626873466, + "grad_norm": 0.414140522480011, + "learning_rate": 0.0006, + "loss": 2.2218, + "step": 74690 + }, + { + "epoch": 0.27864192833642937, + "grad_norm": 0.3745887875556946, + "learning_rate": 0.0006, + "loss": 2.2628, + "step": 74700 + }, + { + "epoch": 0.27867922979939275, + "grad_norm": 0.40842872858047485, + "learning_rate": 0.0006, + "loss": 2.2992, + "step": 74710 + }, + { + "epoch": 0.2787165312623561, + "grad_norm": 0.38990113139152527, + "learning_rate": 0.0006, + "loss": 2.1125, + "step": 74720 + }, + { + "epoch": 0.2787538327253195, + "grad_norm": 0.318654865026474, + "learning_rate": 0.0006, + "loss": 2.2463, + "step": 74730 + }, + { + "epoch": 0.2787911341882829, + "grad_norm": 0.24291275441646576, + "learning_rate": 0.0006, + "loss": 2.2895, + "step": 74740 + }, + { + "epoch": 0.27882843565124626, + "grad_norm": 0.46920710802078247, + "learning_rate": 0.0006, + "loss": 2.3932, + "step": 74750 + }, + { + "epoch": 0.27882843565124626, + "eval_valid_loss": 2.167370080947876, + "eval_valid_loss/all": 2.0325489044189453, + "eval_valid_loss/end_span": 1.2932273149490356, + "eval_valid_perplexity/batch": 7.633518695831299, + "eval_valid_perplexity/end_span": 3.6445295810699463, + "eval_valid_perplexity/fim": 2.5271286964416504, + "eval_valid_perplexity/first_seq": 14.70777702331543, + "eval_valid_perplexity/last_seq": 8.784031867980957, + "eval_valid_perplexity/second_seq": 13.170451164245605, + "eval_valid_perplexity/seq": 8.60783863067627, + "eval_valid_reconstruction/all": 0.3007667362689972, + "eval_valid_reconstruction/end_span": 
0.69487065076828, + "eval_valid_reconstruction/fim": 0.18989112973213196, + "eval_valid_reconstruction/first_seq": 0.1706581562757492, + "eval_valid_reconstruction/last_seq": 0.3316781520843506, + "eval_valid_reconstruction/second_seq": 0.20777934789657593, + "eval_valid_runtime": 445.4525, + "eval_valid_samples_per_second": 0.431, + "eval_valid_steps_per_second": 0.431, + "step": 74750 + }, + { + "epoch": 0.27882843565124626, + "eval_train_loss": 2.166733980178833, + "eval_train_loss/all": 2.0058321952819824, + "eval_train_loss/end_span": 1.255335807800293, + "eval_train_perplexity/batch": 7.432276248931885, + "eval_train_perplexity/end_span": 3.509016513824463, + "eval_train_perplexity/fim": 1.8580204248428345, + "eval_train_perplexity/first_seq": 15.337636947631836, + "eval_train_perplexity/last_seq": 8.439887046813965, + "eval_train_perplexity/second_seq": 14.376529693603516, + "eval_train_perplexity/seq": 8.555428504943848, + "eval_train_reconstruction/all": 0.289798378944397, + "eval_train_reconstruction/end_span": 0.7055888772010803, + "eval_train_reconstruction/fim": 0.126679927110672, + "eval_train_reconstruction/first_seq": 0.1572616994380951, + "eval_train_reconstruction/last_seq": 0.34747380018234253, + "eval_train_reconstruction/second_seq": 0.17808827757835388, + "eval_train_runtime": 449.8671, + "eval_train_samples_per_second": 0.427, + "eval_train_steps_per_second": 0.427, + "step": 74750 + }, + { + "epoch": 0.27886573711420964, + "grad_norm": 0.3466240167617798, + "learning_rate": 0.0006, + "loss": 2.1129, + "step": 74760 + }, + { + "epoch": 0.278903038577173, + "grad_norm": 0.324133962392807, + "learning_rate": 0.0006, + "loss": 2.1964, + "step": 74770 + }, + { + "epoch": 0.27894034004013635, + "grad_norm": 0.40218281745910645, + "learning_rate": 0.0006, + "loss": 2.3078, + "step": 74780 + }, + { + "epoch": 0.2789776415030997, + "grad_norm": 0.31216302514076233, + "learning_rate": 0.0006, + "loss": 2.022, + "step": 74790 + }, + { + "epoch": 
0.2790149429660631, + "grad_norm": 0.45754101872444153, + "learning_rate": 0.0006, + "loss": 2.2334, + "step": 74800 + }, + { + "epoch": 0.2790522444290265, + "grad_norm": 0.39443299174308777, + "learning_rate": 0.0006, + "loss": 2.0618, + "step": 74810 + }, + { + "epoch": 0.27908954589198987, + "grad_norm": 0.3055379092693329, + "learning_rate": 0.0006, + "loss": 2.0255, + "step": 74820 + }, + { + "epoch": 0.27912684735495324, + "grad_norm": 0.23015175759792328, + "learning_rate": 0.0006, + "loss": 2.3126, + "step": 74830 + }, + { + "epoch": 0.2791641488179166, + "grad_norm": 0.2886829972267151, + "learning_rate": 0.0006, + "loss": 2.1131, + "step": 74840 + }, + { + "epoch": 0.27920145028088, + "grad_norm": 0.31328123807907104, + "learning_rate": 0.0006, + "loss": 2.1763, + "step": 74850 + }, + { + "epoch": 0.2792387517438434, + "grad_norm": 0.5746935606002808, + "learning_rate": 0.0006, + "loss": 2.2443, + "step": 74860 + }, + { + "epoch": 0.27927605320680676, + "grad_norm": 0.2969142496585846, + "learning_rate": 0.0006, + "loss": 2.2058, + "step": 74870 + }, + { + "epoch": 0.27931335466977014, + "grad_norm": 0.26575902104377747, + "learning_rate": 0.0006, + "loss": 2.0593, + "step": 74880 + }, + { + "epoch": 0.2793506561327335, + "grad_norm": 0.2833717167377472, + "learning_rate": 0.0006, + "loss": 2.2742, + "step": 74890 + }, + { + "epoch": 0.2793879575956969, + "grad_norm": 0.41897764801979065, + "learning_rate": 0.0006, + "loss": 2.1193, + "step": 74900 + }, + { + "epoch": 0.2794252590586603, + "grad_norm": 0.29566526412963867, + "learning_rate": 0.0006, + "loss": 2.1855, + "step": 74910 + }, + { + "epoch": 0.27946256052162366, + "grad_norm": 0.4721492826938629, + "learning_rate": 0.0006, + "loss": 2.005, + "step": 74920 + }, + { + "epoch": 0.27949986198458704, + "grad_norm": 0.28234925866127014, + "learning_rate": 0.0006, + "loss": 2.3483, + "step": 74930 + }, + { + "epoch": 0.2795371634475504, + "grad_norm": 0.3039003014564514, + "learning_rate": 0.0006, + 
"loss": 2.1452, + "step": 74940 + }, + { + "epoch": 0.2795744649105138, + "grad_norm": 0.23657085001468658, + "learning_rate": 0.0006, + "loss": 2.182, + "step": 74950 + }, + { + "epoch": 0.2796117663734772, + "grad_norm": 0.40961286425590515, + "learning_rate": 0.0006, + "loss": 2.0015, + "step": 74960 + }, + { + "epoch": 0.27964906783644056, + "grad_norm": 0.29173386096954346, + "learning_rate": 0.0006, + "loss": 2.1292, + "step": 74970 + }, + { + "epoch": 0.27968636929940394, + "grad_norm": 0.23706109821796417, + "learning_rate": 0.0006, + "loss": 2.1497, + "step": 74980 + }, + { + "epoch": 0.2797236707623673, + "grad_norm": 0.4233017563819885, + "learning_rate": 0.0006, + "loss": 2.0681, + "step": 74990 + }, + { + "epoch": 0.2797609722253307, + "grad_norm": 0.32116612792015076, + "learning_rate": 0.0006, + "loss": 2.275, + "step": 75000 + }, + { + "epoch": 0.2797609722253307, + "eval_valid_loss": 2.175104856491089, + "eval_valid_loss/all": 2.039428949356079, + "eval_valid_loss/end_span": 1.1858974695205688, + "eval_valid_perplexity/batch": 7.686218738555908, + "eval_valid_perplexity/end_span": 3.273623466491699, + "eval_valid_perplexity/fim": 2.1604456901550293, + "eval_valid_perplexity/first_seq": 15.03695011138916, + "eval_valid_perplexity/last_seq": 8.847909927368164, + "eval_valid_perplexity/second_seq": 13.949832916259766, + "eval_valid_perplexity/seq": 8.664531707763672, + "eval_valid_reconstruction/all": 0.2986067235469818, + "eval_valid_reconstruction/end_span": 0.7277700901031494, + "eval_valid_reconstruction/fim": 0.1569186896085739, + "eval_valid_reconstruction/first_seq": 0.1626872569322586, + "eval_valid_reconstruction/last_seq": 0.3333994448184967, + "eval_valid_reconstruction/second_seq": 0.19309237599372864, + "eval_valid_runtime": 447.547, + "eval_valid_samples_per_second": 0.429, + "eval_valid_steps_per_second": 0.429, + "step": 75000 + }, + { + "epoch": 0.2797609722253307, + "eval_train_loss": 2.1749448776245117, + "eval_train_loss/all": 
2.0132932662963867, + "eval_train_loss/end_span": 1.1465981006622314, + "eval_train_perplexity/batch": 7.487936496734619, + "eval_train_perplexity/end_span": 3.1474673748016357, + "eval_train_perplexity/fim": 2.2129104137420654, + "eval_train_perplexity/first_seq": 15.567440032958984, + "eval_train_perplexity/last_seq": 8.607172012329102, + "eval_train_perplexity/second_seq": 14.367986679077148, + "eval_train_perplexity/seq": 8.622620582580566, + "eval_train_reconstruction/all": 0.2874797582626343, + "eval_train_reconstruction/end_span": 0.7385661602020264, + "eval_train_reconstruction/fim": 0.16053897142410278, + "eval_train_reconstruction/first_seq": 0.1520141214132309, + "eval_train_reconstruction/last_seq": 0.3426550328731537, + "eval_train_reconstruction/second_seq": 0.1811733841896057, + "eval_train_runtime": 445.9793, + "eval_train_samples_per_second": 0.431, + "eval_train_steps_per_second": 0.431, + "step": 75000 + }, + { + "epoch": 0.2797982736882941, + "grad_norm": 0.36213183403015137, + "learning_rate": 0.0006, + "loss": 2.184, + "step": 75010 + }, + { + "epoch": 0.27983557515125745, + "grad_norm": 0.8453875780105591, + "learning_rate": 0.0006, + "loss": 2.0468, + "step": 75020 + }, + { + "epoch": 0.27987287661422083, + "grad_norm": 0.38316741585731506, + "learning_rate": 0.0006, + "loss": 2.2063, + "step": 75030 + }, + { + "epoch": 0.2799101780771842, + "grad_norm": 0.4784882068634033, + "learning_rate": 0.0006, + "loss": 2.2526, + "step": 75040 + }, + { + "epoch": 0.27994747954014754, + "grad_norm": 0.32719504833221436, + "learning_rate": 0.0006, + "loss": 2.1864, + "step": 75050 + }, + { + "epoch": 0.2799847810031109, + "grad_norm": 0.2472158670425415, + "learning_rate": 0.0006, + "loss": 2.1734, + "step": 75060 + }, + { + "epoch": 0.2800220824660743, + "grad_norm": 0.34939247369766235, + "learning_rate": 0.0006, + "loss": 2.0732, + "step": 75070 + }, + { + "epoch": 0.2800593839290377, + "grad_norm": 0.28129103779792786, + "learning_rate": 0.0006, + 
"loss": 2.3293, + "step": 75080 + }, + { + "epoch": 0.28009668539200105, + "grad_norm": 0.42249780893325806, + "learning_rate": 0.0006, + "loss": 2.1795, + "step": 75090 + }, + { + "epoch": 0.28013398685496443, + "grad_norm": 0.29121091961860657, + "learning_rate": 0.0006, + "loss": 2.1923, + "step": 75100 + }, + { + "epoch": 0.2801712883179278, + "grad_norm": 0.3445837199687958, + "learning_rate": 0.0006, + "loss": 2.2078, + "step": 75110 + }, + { + "epoch": 0.2802085897808912, + "grad_norm": 0.32858845591545105, + "learning_rate": 0.0006, + "loss": 2.2258, + "step": 75120 + }, + { + "epoch": 0.2802458912438546, + "grad_norm": 0.3888324201107025, + "learning_rate": 0.0006, + "loss": 2.3066, + "step": 75130 + }, + { + "epoch": 0.28028319270681795, + "grad_norm": 0.5713692903518677, + "learning_rate": 0.0006, + "loss": 2.1182, + "step": 75140 + }, + { + "epoch": 0.28032049416978133, + "grad_norm": 0.3714461922645569, + "learning_rate": 0.0006, + "loss": 2.1291, + "step": 75150 + }, + { + "epoch": 0.2803577956327447, + "grad_norm": 0.29889100790023804, + "learning_rate": 0.0006, + "loss": 2.1111, + "step": 75160 + }, + { + "epoch": 0.2803950970957081, + "grad_norm": 0.29435503482818604, + "learning_rate": 0.0006, + "loss": 2.266, + "step": 75170 + }, + { + "epoch": 0.28043239855867147, + "grad_norm": 0.2889629602432251, + "learning_rate": 0.0006, + "loss": 2.1712, + "step": 75180 + }, + { + "epoch": 0.28046970002163485, + "grad_norm": 0.4235837161540985, + "learning_rate": 0.0006, + "loss": 2.2414, + "step": 75190 + }, + { + "epoch": 0.28050700148459823, + "grad_norm": 0.2165633887052536, + "learning_rate": 0.0006, + "loss": 2.0792, + "step": 75200 + }, + { + "epoch": 0.2805443029475616, + "grad_norm": 0.3906039893627167, + "learning_rate": 0.0006, + "loss": 2.131, + "step": 75210 + }, + { + "epoch": 0.280581604410525, + "grad_norm": 0.3634793758392334, + "learning_rate": 0.0006, + "loss": 2.2667, + "step": 75220 + }, + { + "epoch": 0.28061890587348837, + 
"grad_norm": 0.33030465245246887, + "learning_rate": 0.0006, + "loss": 2.2715, + "step": 75230 + }, + { + "epoch": 0.28065620733645175, + "grad_norm": 0.3279762864112854, + "learning_rate": 0.0006, + "loss": 2.2046, + "step": 75240 + }, + { + "epoch": 0.2806935087994151, + "grad_norm": 0.4318578839302063, + "learning_rate": 0.0006, + "loss": 2.1175, + "step": 75250 + }, + { + "epoch": 0.2806935087994151, + "eval_valid_loss": 2.169276237487793, + "eval_valid_loss/all": 2.034627914428711, + "eval_valid_loss/end_span": 1.1203759908676147, + "eval_valid_perplexity/batch": 7.649405479431152, + "eval_valid_perplexity/end_span": 3.066006660461426, + "eval_valid_perplexity/fim": 2.5553698539733887, + "eval_valid_perplexity/first_seq": 14.951752662658691, + "eval_valid_perplexity/last_seq": 8.815924644470215, + "eval_valid_perplexity/second_seq": 13.638860702514648, + "eval_valid_perplexity/seq": 8.634575843811035, + "eval_valid_reconstruction/all": 0.3003618121147156, + "eval_valid_reconstruction/end_span": 0.7372848391532898, + "eval_valid_reconstruction/fim": 0.19184963405132294, + "eval_valid_reconstruction/first_seq": 0.16519546508789062, + "eval_valid_reconstruction/last_seq": 0.3309241831302643, + "eval_valid_reconstruction/second_seq": 0.20004154741764069, + "eval_valid_runtime": 444.4268, + "eval_valid_samples_per_second": 0.432, + "eval_valid_steps_per_second": 0.432, + "step": 75250 + }, + { + "epoch": 0.2806935087994151, + "eval_train_loss": 2.169414758682251, + "eval_train_loss/all": 2.0088095664978027, + "eval_train_loss/end_span": 1.0918182134628296, + "eval_train_perplexity/batch": 7.454438209533691, + "eval_train_perplexity/end_span": 2.979686975479126, + "eval_train_perplexity/fim": 1.8670639991760254, + "eval_train_perplexity/first_seq": 15.567421913146973, + "eval_train_perplexity/last_seq": 8.801255226135254, + "eval_train_perplexity/second_seq": 14.796018600463867, + "eval_train_perplexity/seq": 8.590758323669434, + "eval_train_reconstruction/all": 
0.2892763316631317, + "eval_train_reconstruction/end_span": 0.7468417286872864, + "eval_train_reconstruction/fim": 0.12789595127105713, + "eval_train_reconstruction/first_seq": 0.14717762172222137, + "eval_train_reconstruction/last_seq": 0.3291650116443634, + "eval_train_reconstruction/second_seq": 0.17114393413066864, + "eval_train_runtime": 442.8434, + "eval_train_samples_per_second": 0.434, + "eval_train_steps_per_second": 0.434, + "step": 75250 + }, + { + "epoch": 0.2807308102623785, + "grad_norm": 0.41051968932151794, + "learning_rate": 0.0006, + "loss": 2.255, + "step": 75260 + }, + { + "epoch": 0.2807681117253419, + "grad_norm": 0.33167511224746704, + "learning_rate": 0.0006, + "loss": 2.1201, + "step": 75270 + }, + { + "epoch": 0.28080541318830526, + "grad_norm": 0.34792640805244446, + "learning_rate": 0.0006, + "loss": 2.2288, + "step": 75280 + }, + { + "epoch": 0.28084271465126864, + "grad_norm": 0.30625253915786743, + "learning_rate": 0.0006, + "loss": 2.2971, + "step": 75290 + }, + { + "epoch": 0.280880016114232, + "grad_norm": 0.3013812005519867, + "learning_rate": 0.0006, + "loss": 2.2624, + "step": 75300 + }, + { + "epoch": 0.2809173175771954, + "grad_norm": 0.30414679646492004, + "learning_rate": 0.0006, + "loss": 2.3866, + "step": 75310 + }, + { + "epoch": 0.2809546190401588, + "grad_norm": 0.3073890805244446, + "learning_rate": 0.0006, + "loss": 2.0845, + "step": 75320 + }, + { + "epoch": 0.2809919205031221, + "grad_norm": 0.3742433190345764, + "learning_rate": 0.0006, + "loss": 2.0814, + "step": 75330 + }, + { + "epoch": 0.2810292219660855, + "grad_norm": 0.2752669155597687, + "learning_rate": 0.0006, + "loss": 2.0944, + "step": 75340 + }, + { + "epoch": 0.28106652342904886, + "grad_norm": 0.3098616600036621, + "learning_rate": 0.0006, + "loss": 2.2353, + "step": 75350 + }, + { + "epoch": 0.28110382489201224, + "grad_norm": 0.26786568760871887, + "learning_rate": 0.0006, + "loss": 2.346, + "step": 75360 + }, + { + "epoch": 0.2811411263549756, + 
"grad_norm": 0.4763182997703552, + "learning_rate": 0.0006, + "loss": 2.2018, + "step": 75370 + }, + { + "epoch": 0.281178427817939, + "grad_norm": 0.5341889262199402, + "learning_rate": 0.0006, + "loss": 2.1952, + "step": 75380 + }, + { + "epoch": 0.2812157292809024, + "grad_norm": 0.2940075695514679, + "learning_rate": 0.0006, + "loss": 2.094, + "step": 75390 + }, + { + "epoch": 0.28125303074386576, + "grad_norm": 0.2823386490345001, + "learning_rate": 0.0006, + "loss": 2.1366, + "step": 75400 + }, + { + "epoch": 0.28129033220682914, + "grad_norm": 0.5424323678016663, + "learning_rate": 0.0006, + "loss": 2.2354, + "step": 75410 + }, + { + "epoch": 0.2813276336697925, + "grad_norm": 0.34047597646713257, + "learning_rate": 0.0006, + "loss": 2.2525, + "step": 75420 + }, + { + "epoch": 0.2813649351327559, + "grad_norm": 0.4934256970882416, + "learning_rate": 0.0006, + "loss": 2.2022, + "step": 75430 + }, + { + "epoch": 0.2814022365957193, + "grad_norm": 0.5251213908195496, + "learning_rate": 0.0006, + "loss": 2.3641, + "step": 75440 + }, + { + "epoch": 0.28143953805868266, + "grad_norm": 0.4289340674877167, + "learning_rate": 0.0006, + "loss": 2.3471, + "step": 75450 + }, + { + "epoch": 0.28147683952164604, + "grad_norm": 0.5917035341262817, + "learning_rate": 0.0006, + "loss": 2.1993, + "step": 75460 + }, + { + "epoch": 0.2815141409846094, + "grad_norm": 0.33262014389038086, + "learning_rate": 0.0006, + "loss": 2.3713, + "step": 75470 + }, + { + "epoch": 0.2815514424475728, + "grad_norm": 0.45707520842552185, + "learning_rate": 0.0006, + "loss": 2.3207, + "step": 75480 + }, + { + "epoch": 0.2815887439105362, + "grad_norm": 0.3034512400627136, + "learning_rate": 0.0006, + "loss": 2.2749, + "step": 75490 + }, + { + "epoch": 0.28162604537349956, + "grad_norm": 0.5077193975448608, + "learning_rate": 0.0006, + "loss": 2.2809, + "step": 75500 + }, + { + "epoch": 0.28162604537349956, + "eval_valid_loss": 2.1768338680267334, + "eval_valid_loss/all": 2.041712999343872, + 
"eval_valid_loss/end_span": 1.3034459352493286, + "eval_valid_perplexity/batch": 7.703794479370117, + "eval_valid_perplexity/end_span": 3.681962728500366, + "eval_valid_perplexity/fim": 2.3643510341644287, + "eval_valid_perplexity/first_seq": 15.011058807373047, + "eval_valid_perplexity/last_seq": 8.373141288757324, + "eval_valid_perplexity/second_seq": 14.187010765075684, + "eval_valid_perplexity/seq": 8.696707725524902, + "eval_valid_reconstruction/all": 0.2979932725429535, + "eval_valid_reconstruction/end_span": 0.6911466717720032, + "eval_valid_reconstruction/fim": 0.17536690831184387, + "eval_valid_reconstruction/first_seq": 0.16359646618366241, + "eval_valid_reconstruction/last_seq": 0.3461824357509613, + "eval_valid_reconstruction/second_seq": 0.18708206713199615, + "eval_valid_runtime": 446.1464, + "eval_valid_samples_per_second": 0.43, + "eval_valid_steps_per_second": 0.43, + "step": 75500 + }, + { + "epoch": 0.28162604537349956, + "eval_train_loss": 2.172076940536499, + "eval_train_loss/all": 2.0109593868255615, + "eval_train_loss/end_span": 1.2786633968353271, + "eval_train_perplexity/batch": 7.470480918884277, + "eval_train_perplexity/end_span": 3.5918357372283936, + "eval_train_perplexity/fim": 2.114293336868286, + "eval_train_perplexity/first_seq": 15.663990020751953, + "eval_train_perplexity/last_seq": 9.009507179260254, + "eval_train_perplexity/second_seq": 13.964581489562988, + "eval_train_perplexity/seq": 8.605696678161621, + "eval_train_reconstruction/all": 0.2883189618587494, + "eval_train_reconstruction/end_span": 0.7006653547286987, + "eval_train_reconstruction/fim": 0.1531553864479065, + "eval_train_reconstruction/first_seq": 0.14915882050991058, + "eval_train_reconstruction/last_seq": 0.32514482736587524, + "eval_train_reconstruction/second_seq": 0.19012364745140076, + "eval_train_runtime": 446.39, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 75500 + }, + { + "epoch": 0.28166334683646294, + 
"grad_norm": 0.3712429702281952, + "learning_rate": 0.0006, + "loss": 2.1925, + "step": 75510 + }, + { + "epoch": 0.2817006482994263, + "grad_norm": 0.3328739106655121, + "learning_rate": 0.0006, + "loss": 2.3555, + "step": 75520 + }, + { + "epoch": 0.2817379497623897, + "grad_norm": 0.29463857412338257, + "learning_rate": 0.0006, + "loss": 2.1031, + "step": 75530 + }, + { + "epoch": 0.2817752512253531, + "grad_norm": 0.381165474653244, + "learning_rate": 0.0006, + "loss": 2.254, + "step": 75540 + }, + { + "epoch": 0.28181255268831645, + "grad_norm": 0.3161940276622772, + "learning_rate": 0.0006, + "loss": 2.1164, + "step": 75550 + }, + { + "epoch": 0.28184985415127983, + "grad_norm": 0.25379592180252075, + "learning_rate": 0.0006, + "loss": 2.1215, + "step": 75560 + }, + { + "epoch": 0.2818871556142432, + "grad_norm": 0.2584163546562195, + "learning_rate": 0.0006, + "loss": 2.169, + "step": 75570 + }, + { + "epoch": 0.2819244570772066, + "grad_norm": 0.40557020902633667, + "learning_rate": 0.0006, + "loss": 2.1786, + "step": 75580 + }, + { + "epoch": 0.28196175854016997, + "grad_norm": 0.2809443771839142, + "learning_rate": 0.0006, + "loss": 2.2664, + "step": 75590 + }, + { + "epoch": 0.28199906000313335, + "grad_norm": 0.41998329758644104, + "learning_rate": 0.0006, + "loss": 2.2122, + "step": 75600 + }, + { + "epoch": 0.2820363614660967, + "grad_norm": 0.34298455715179443, + "learning_rate": 0.0006, + "loss": 2.3895, + "step": 75610 + }, + { + "epoch": 0.28207366292906005, + "grad_norm": 0.26696738600730896, + "learning_rate": 0.0006, + "loss": 2.1039, + "step": 75620 + }, + { + "epoch": 0.28211096439202343, + "grad_norm": 0.33793094754219055, + "learning_rate": 0.0006, + "loss": 2.2516, + "step": 75630 + }, + { + "epoch": 0.2821482658549868, + "grad_norm": 0.3099914491176605, + "learning_rate": 0.0006, + "loss": 2.0683, + "step": 75640 + }, + { + "epoch": 0.2821855673179502, + "grad_norm": 0.43295150995254517, + "learning_rate": 0.0006, + "loss": 2.1372, + 
"step": 75650 + }, + { + "epoch": 0.28222286878091357, + "grad_norm": 0.31853094696998596, + "learning_rate": 0.0006, + "loss": 2.2759, + "step": 75660 + }, + { + "epoch": 0.28226017024387695, + "grad_norm": 0.35702329874038696, + "learning_rate": 0.0006, + "loss": 2.1132, + "step": 75670 + }, + { + "epoch": 0.28229747170684033, + "grad_norm": 0.37555986642837524, + "learning_rate": 0.0006, + "loss": 2.2364, + "step": 75680 + }, + { + "epoch": 0.2823347731698037, + "grad_norm": 0.4125838279724121, + "learning_rate": 0.0006, + "loss": 2.1372, + "step": 75690 + }, + { + "epoch": 0.2823720746327671, + "grad_norm": 0.2868840992450714, + "learning_rate": 0.0006, + "loss": 2.2353, + "step": 75700 + }, + { + "epoch": 0.28240937609573047, + "grad_norm": 0.2407359629869461, + "learning_rate": 0.0006, + "loss": 2.2124, + "step": 75710 + }, + { + "epoch": 0.28244667755869385, + "grad_norm": 0.3133198618888855, + "learning_rate": 0.0006, + "loss": 2.2178, + "step": 75720 + }, + { + "epoch": 0.2824839790216572, + "grad_norm": 0.30383384227752686, + "learning_rate": 0.0006, + "loss": 2.29, + "step": 75730 + }, + { + "epoch": 0.2825212804846206, + "grad_norm": 0.32927387952804565, + "learning_rate": 0.0006, + "loss": 2.2221, + "step": 75740 + }, + { + "epoch": 0.282558581947584, + "grad_norm": 0.5105395317077637, + "learning_rate": 0.0006, + "loss": 2.0855, + "step": 75750 + }, + { + "epoch": 0.282558581947584, + "eval_valid_loss": 2.170318603515625, + "eval_valid_loss/all": 2.035487651824951, + "eval_valid_loss/end_span": 1.262284755706787, + "eval_valid_perplexity/batch": 7.655984878540039, + "eval_valid_perplexity/end_span": 3.5334854125976562, + "eval_valid_perplexity/fim": 2.0716254711151123, + "eval_valid_perplexity/first_seq": 14.85738468170166, + "eval_valid_perplexity/last_seq": 8.513193130493164, + "eval_valid_perplexity/second_seq": 13.757969856262207, + "eval_valid_perplexity/seq": 8.638903617858887, + "eval_valid_reconstruction/all": 0.30004411935806274, + 
"eval_valid_reconstruction/end_span": 0.7069327235221863, + "eval_valid_reconstruction/fim": 0.1489056944847107, + "eval_valid_reconstruction/first_seq": 0.16630147397518158, + "eval_valid_reconstruction/last_seq": 0.3441596031188965, + "eval_valid_reconstruction/second_seq": 0.192609965801239, + "eval_valid_runtime": 450.9158, + "eval_valid_samples_per_second": 0.426, + "eval_valid_steps_per_second": 0.426, + "step": 75750 + }, + { + "epoch": 0.282558581947584, + "eval_train_loss": 2.1701276302337646, + "eval_train_loss/all": 2.009322166442871, + "eval_train_loss/end_span": 1.2274900674819946, + "eval_train_perplexity/batch": 7.4582600593566895, + "eval_train_perplexity/end_span": 3.4126532077789307, + "eval_train_perplexity/fim": 2.0722055435180664, + "eval_train_perplexity/first_seq": 15.62826156616211, + "eval_train_perplexity/last_seq": 8.599321365356445, + "eval_train_perplexity/second_seq": 14.43779468536377, + "eval_train_perplexity/seq": 8.591960906982422, + "eval_train_reconstruction/all": 0.2889956533908844, + "eval_train_reconstruction/end_span": 0.7168272137641907, + "eval_train_reconstruction/fim": 0.1501486450433731, + "eval_train_reconstruction/first_seq": 0.1474902480840683, + "eval_train_reconstruction/last_seq": 0.33799973130226135, + "eval_train_reconstruction/second_seq": 0.18095047771930695, + "eval_train_runtime": 444.9188, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 75750 + }, + { + "epoch": 0.28259588341054737, + "grad_norm": 0.3439565598964691, + "learning_rate": 0.0006, + "loss": 2.0872, + "step": 75760 + }, + { + "epoch": 0.28263318487351075, + "grad_norm": 0.3462529480457306, + "learning_rate": 0.0006, + "loss": 2.2514, + "step": 75770 + }, + { + "epoch": 0.2826704863364741, + "grad_norm": 0.35514378547668457, + "learning_rate": 0.0006, + "loss": 2.0535, + "step": 75780 + }, + { + "epoch": 0.2827077877994375, + "grad_norm": 0.22555014491081238, + "learning_rate": 0.0006, + "loss": 2.2669, 
+ "step": 75790 + }, + { + "epoch": 0.2827450892624009, + "grad_norm": 0.3472558557987213, + "learning_rate": 0.0006, + "loss": 2.1632, + "step": 75800 + }, + { + "epoch": 0.28278239072536426, + "grad_norm": 0.38792333006858826, + "learning_rate": 0.0006, + "loss": 2.065, + "step": 75810 + }, + { + "epoch": 0.28281969218832764, + "grad_norm": 0.368511825799942, + "learning_rate": 0.0006, + "loss": 2.1777, + "step": 75820 + }, + { + "epoch": 0.282856993651291, + "grad_norm": 0.37510591745376587, + "learning_rate": 0.0006, + "loss": 2.2712, + "step": 75830 + }, + { + "epoch": 0.2828942951142544, + "grad_norm": 0.39705294370651245, + "learning_rate": 0.0006, + "loss": 2.2004, + "step": 75840 + }, + { + "epoch": 0.2829315965772178, + "grad_norm": 0.287596195936203, + "learning_rate": 0.0006, + "loss": 2.2416, + "step": 75850 + }, + { + "epoch": 0.28296889804018116, + "grad_norm": 0.32104969024658203, + "learning_rate": 0.0006, + "loss": 2.2725, + "step": 75860 + }, + { + "epoch": 0.28300619950314454, + "grad_norm": 0.35363245010375977, + "learning_rate": 0.0006, + "loss": 2.2167, + "step": 75870 + }, + { + "epoch": 0.28304350096610786, + "grad_norm": 0.35631898045539856, + "learning_rate": 0.0006, + "loss": 1.9449, + "step": 75880 + }, + { + "epoch": 0.28308080242907124, + "grad_norm": 0.2691686451435089, + "learning_rate": 0.0006, + "loss": 2.1555, + "step": 75890 + }, + { + "epoch": 0.2831181038920346, + "grad_norm": 0.3195740580558777, + "learning_rate": 0.0006, + "loss": 2.3995, + "step": 75900 + }, + { + "epoch": 0.283155405354998, + "grad_norm": 0.24454709887504578, + "learning_rate": 0.0006, + "loss": 2.3595, + "step": 75910 + }, + { + "epoch": 0.2831927068179614, + "grad_norm": 0.3086003363132477, + "learning_rate": 0.0006, + "loss": 2.1929, + "step": 75920 + }, + { + "epoch": 0.28323000828092476, + "grad_norm": 0.23271290957927704, + "learning_rate": 0.0006, + "loss": 2.2053, + "step": 75930 + }, + { + "epoch": 0.28326730974388814, + "grad_norm": 
0.34192079305648804, + "learning_rate": 0.0006, + "loss": 2.2329, + "step": 75940 + }, + { + "epoch": 0.2833046112068515, + "grad_norm": 0.2642180621623993, + "learning_rate": 0.0006, + "loss": 1.972, + "step": 75950 + }, + { + "epoch": 0.2833419126698149, + "grad_norm": 0.361190527677536, + "learning_rate": 0.0006, + "loss": 2.1142, + "step": 75960 + }, + { + "epoch": 0.2833792141327783, + "grad_norm": 0.2633317708969116, + "learning_rate": 0.0006, + "loss": 2.161, + "step": 75970 + }, + { + "epoch": 0.28341651559574166, + "grad_norm": 0.3998386561870575, + "learning_rate": 0.0006, + "loss": 2.1646, + "step": 75980 + }, + { + "epoch": 0.28345381705870504, + "grad_norm": 0.32863086462020874, + "learning_rate": 0.0006, + "loss": 2.0912, + "step": 75990 + }, + { + "epoch": 0.2834911185216684, + "grad_norm": 0.3007068634033203, + "learning_rate": 0.0006, + "loss": 2.1344, + "step": 76000 + }, + { + "epoch": 0.2834911185216684, + "eval_valid_loss": 2.1729161739349365, + "eval_valid_loss/all": 2.0380961894989014, + "eval_valid_loss/end_span": 1.2484912872314453, + "eval_valid_perplexity/batch": 7.675981521606445, + "eval_valid_perplexity/end_span": 3.4850809574127197, + "eval_valid_perplexity/fim": 2.2186429500579834, + "eval_valid_perplexity/first_seq": 15.158756256103516, + "eval_valid_perplexity/last_seq": 8.770243644714355, + "eval_valid_perplexity/second_seq": 13.691699981689453, + "eval_valid_perplexity/seq": 8.665358543395996, + "eval_valid_reconstruction/all": 0.29901042580604553, + "eval_valid_reconstruction/end_span": 0.7121679186820984, + "eval_valid_reconstruction/fim": 0.16213884949684143, + "eval_valid_reconstruction/first_seq": 0.16204771399497986, + "eval_valid_reconstruction/last_seq": 0.33310002088546753, + "eval_valid_reconstruction/second_seq": 0.1966993808746338, + "eval_valid_runtime": 443.3799, + "eval_valid_samples_per_second": 0.433, + "eval_valid_steps_per_second": 0.433, + "step": 76000 + }, + { + "epoch": 0.2834911185216684, + 
"eval_train_loss": 2.170051097869873, + "eval_train_loss/all": 2.0091605186462402, + "eval_train_loss/end_span": 1.2144097089767456, + "eval_train_perplexity/batch": 7.457054615020752, + "eval_train_perplexity/end_span": 3.368305206298828, + "eval_train_perplexity/fim": 2.213419198989868, + "eval_train_perplexity/first_seq": 15.550305366516113, + "eval_train_perplexity/last_seq": 8.693330764770508, + "eval_train_perplexity/second_seq": 14.486919403076172, + "eval_train_perplexity/seq": 8.588676452636719, + "eval_train_reconstruction/all": 0.28886887431144714, + "eval_train_reconstruction/end_span": 0.7233846187591553, + "eval_train_reconstruction/fim": 0.1631736010313034, + "eval_train_reconstruction/first_seq": 0.1514371633529663, + "eval_train_reconstruction/last_seq": 0.3310673236846924, + "eval_train_reconstruction/second_seq": 0.17829833924770355, + "eval_train_runtime": 443.9751, + "eval_train_samples_per_second": 0.432, + "eval_train_steps_per_second": 0.432, + "step": 76000 + }, + { + "epoch": 0.2835284199846318, + "grad_norm": 0.3525699973106384, + "learning_rate": 0.0006, + "loss": 2.159, + "step": 76010 + }, + { + "epoch": 0.2835657214475952, + "grad_norm": 0.43438759446144104, + "learning_rate": 0.0006, + "loss": 2.0351, + "step": 76020 + }, + { + "epoch": 0.28360302291055856, + "grad_norm": 0.25788938999176025, + "learning_rate": 0.0006, + "loss": 2.1053, + "step": 76030 + }, + { + "epoch": 0.28364032437352193, + "grad_norm": 0.317695289850235, + "learning_rate": 0.0006, + "loss": 2.239, + "step": 76040 + }, + { + "epoch": 0.2836776258364853, + "grad_norm": 0.472495973110199, + "learning_rate": 0.0006, + "loss": 2.2351, + "step": 76050 + }, + { + "epoch": 0.2837149272994487, + "grad_norm": 0.4434443414211273, + "learning_rate": 0.0006, + "loss": 2.1997, + "step": 76060 + }, + { + "epoch": 0.2837522287624121, + "grad_norm": 0.35220035910606384, + "learning_rate": 0.0006, + "loss": 2.1676, + "step": 76070 + }, + { + "epoch": 0.28378953022537545, + 
"grad_norm": 0.4516620337963104, + "learning_rate": 0.0006, + "loss": 2.0871, + "step": 76080 + }, + { + "epoch": 0.28382683168833883, + "grad_norm": 0.4613349437713623, + "learning_rate": 0.0006, + "loss": 2.0513, + "step": 76090 + }, + { + "epoch": 0.2838641331513022, + "grad_norm": 0.4238419830799103, + "learning_rate": 0.0006, + "loss": 2.2161, + "step": 76100 + }, + { + "epoch": 0.2839014346142656, + "grad_norm": 0.34369751811027527, + "learning_rate": 0.0006, + "loss": 2.1704, + "step": 76110 + }, + { + "epoch": 0.28393873607722897, + "grad_norm": 0.2828458845615387, + "learning_rate": 0.0006, + "loss": 2.1595, + "step": 76120 + }, + { + "epoch": 0.28397603754019235, + "grad_norm": 0.44437965750694275, + "learning_rate": 0.0006, + "loss": 2.2495, + "step": 76130 + }, + { + "epoch": 0.28401333900315573, + "grad_norm": 0.48439157009124756, + "learning_rate": 0.0006, + "loss": 2.1253, + "step": 76140 + }, + { + "epoch": 0.2840506404661191, + "grad_norm": 0.30863112211227417, + "learning_rate": 0.0006, + "loss": 1.9979, + "step": 76150 + }, + { + "epoch": 0.28408794192908243, + "grad_norm": 0.31124067306518555, + "learning_rate": 0.0006, + "loss": 2.4423, + "step": 76160 + }, + { + "epoch": 0.2841252433920458, + "grad_norm": 0.3300701677799225, + "learning_rate": 0.0006, + "loss": 2.2399, + "step": 76170 + }, + { + "epoch": 0.2841625448550092, + "grad_norm": 0.3011423647403717, + "learning_rate": 0.0006, + "loss": 2.1103, + "step": 76180 + }, + { + "epoch": 0.28419984631797257, + "grad_norm": 0.3439212739467621, + "learning_rate": 0.0006, + "loss": 2.2664, + "step": 76190 + }, + { + "epoch": 0.28423714778093595, + "grad_norm": 0.39205750823020935, + "learning_rate": 0.0006, + "loss": 2.2209, + "step": 76200 + }, + { + "epoch": 0.28427444924389933, + "grad_norm": 0.3536302447319031, + "learning_rate": 0.0006, + "loss": 2.4023, + "step": 76210 + }, + { + "epoch": 0.2843117507068627, + "grad_norm": 0.2582565248012543, + "learning_rate": 0.0006, + "loss": 2.104, + 
"step": 76220 + }, + { + "epoch": 0.2843490521698261, + "grad_norm": 0.42445674538612366, + "learning_rate": 0.0006, + "loss": 2.2382, + "step": 76230 + }, + { + "epoch": 0.28438635363278947, + "grad_norm": 0.41732916235923767, + "learning_rate": 0.0006, + "loss": 2.2232, + "step": 76240 + }, + { + "epoch": 0.28442365509575285, + "grad_norm": 0.24454964697360992, + "learning_rate": 0.0006, + "loss": 2.1391, + "step": 76250 + }, + { + "epoch": 0.28442365509575285, + "eval_valid_loss": 2.1745033264160156, + "eval_valid_loss/all": 2.0394492149353027, + "eval_valid_loss/end_span": 1.2605010271072388, + "eval_valid_perplexity/batch": 7.686374664306641, + "eval_valid_perplexity/end_span": 3.527188301086426, + "eval_valid_perplexity/fim": 2.6171417236328125, + "eval_valid_perplexity/first_seq": 14.731757164001465, + "eval_valid_perplexity/last_seq": 8.790367126464844, + "eval_valid_perplexity/second_seq": 13.727298736572266, + "eval_valid_perplexity/seq": 8.67956256866455, + "eval_valid_reconstruction/all": 0.29879701137542725, + "eval_valid_reconstruction/end_span": 0.7063522934913635, + "eval_valid_reconstruction/fim": 0.19578638672828674, + "eval_valid_reconstruction/first_seq": 0.1709340512752533, + "eval_valid_reconstruction/last_seq": 0.3338199257850647, + "eval_valid_reconstruction/second_seq": 0.1999795138835907, + "eval_valid_runtime": 449.7401, + "eval_valid_samples_per_second": 0.427, + "eval_valid_steps_per_second": 0.427, + "step": 76250 + }, + { + "epoch": 0.28442365509575285, + "eval_train_loss": 2.172313928604126, + "eval_train_loss/all": 2.011277675628662, + "eval_train_loss/end_span": 1.21461021900177, + "eval_train_perplexity/batch": 7.472858905792236, + "eval_train_perplexity/end_span": 3.368980646133423, + "eval_train_perplexity/fim": 1.9906104803085327, + "eval_train_perplexity/first_seq": 15.384177207946777, + "eval_train_perplexity/last_seq": 8.817089080810547, + "eval_train_perplexity/second_seq": 14.308150291442871, + "eval_train_perplexity/seq": 
8.61098861694336, + "eval_train_reconstruction/all": 0.2881842255592346, + "eval_train_reconstruction/end_span": 0.7188572883605957, + "eval_train_reconstruction/fim": 0.1407361626625061, + "eval_train_reconstruction/first_seq": 0.1547790765762329, + "eval_train_reconstruction/last_seq": 0.33009809255599976, + "eval_train_reconstruction/second_seq": 0.18260616064071655, + "eval_train_runtime": 446.7739, + "eval_train_samples_per_second": 0.43, + "eval_train_steps_per_second": 0.43, + "step": 76250 + } + ], + "logging_steps": 10, + "max_steps": 26808600, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 250, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/protxlstm/data.py b/protxlstm/data.py new file mode 100644 index 0000000000000000000000000000000000000000..c1cc20cd144c7d5d44aed80ba755c2b5022f6d4a --- /dev/null +++ b/protxlstm/data.py @@ -0,0 +1,60 @@ +import csv +import os + +import numpy as np +from tqdm import tqdm + +from protxlstm.utils import load_sequences_from_msa_file, tokenizer + +def process_msa(msa_item): + msa_name, msa_path = msa_item + # Load an a3m file with all the context sequences + msa = load_sequences_from_msa_file(msa_path) + # Tokenize the sequences and concatenate them into a single array + tokens = tokenizer(msa, concatenate=True) + tokens = tokens.numpy()[0] + return msa_name, tokens + +def main(data_dir, output_dir): + msa_paths = {k: os.path.join(data_dir, k, 'a3m/uniclust30.a3m') for k in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, k))} + msa_items = list(msa_paths.items()) + + dataset_dictionary = {} + total_length = 0 + + # First pass: calculate total length of all concatenated arrays + for item in tqdm(msa_items): + try: + k, v = process_msa(item) + dataset_dictionary[k] = v + total_length += len(v) + except: + print(f"Error processing {item}") + + # Initialize the memmap array with the calculated total length + memmap_path = 
os.path.join(output_dir, 'open_protein_set_memmap.dat') + concatenated_array = np.memmap(memmap_path, dtype='int8', mode='w+', shape=(total_length,)) + + with open(f'{output_dir}/open_protein_set_memmap_indices.csv', 'w', newline='') as csvfile: + csvwriter = csv.writer(csvfile) + + csvwriter.writerow(['msa_id', 'Start', 'End']) + + start_index = 0 + for key, array in dataset_dictionary.items(): + end_index = start_index + len(array) - 1 + concatenated_array[start_index:end_index + 1] = array # Write to memmap + csvwriter.writerow([key, start_index, end_index]) + start_index = end_index + 1 + + # Ensure the data is written to disk + concatenated_array.flush() + + +if __name__ == "__main__": + data_dir = 'data/a3m_files' + output_dir = 'data/' + main(data_dir, output_dir) + + + diff --git a/protxlstm/dataloaders.py b/protxlstm/dataloaders.py new file mode 100644 index 0000000000000000000000000000000000000000..29f4d7efe26977a9a4d0ec7f3772ab9fabde4d26 --- /dev/null +++ b/protxlstm/dataloaders.py @@ -0,0 +1,249 @@ +# Original code from ProtMamba under Apache License 2.0. +# +# Modifications made by Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen +# - Uniclust30_Dataset renamed to ProteinMemmapDataset +# - Dataset input file format changed for more efficient dataloading +# - Option to use only a subset +# - DataCollatorForUniclust30Dataset renamed to ProteinDataCollator +# - Add sequence padding + +import numpy as np +import pandas as pd +import torch +from torch.utils.data import DataLoader, Dataset +from typing import Dict, Optional, Sequence + +from protxlstm.fim import MultipleSpanFIM, NoFIM, SingleSpanFIM +from protxlstm.utils import AA_TO_ID + + +# Make dataset +class ProteinMemmapDataset(Dataset): + """ + ProteinMemmapDataset is a PyTorch Dataset class for handling memory-mapped datasets of protein multiple sequence alignments (MSAs). + + This class imports MSA data stored in memmap format and associated metadata CSVs. 
It supports flexible + data sampling strategies and inpainting methods for sequence manipulation and training purposes. + + Args: + msa_memmap_path (str): Path to the memory-mapped file containing the MSA clusters. + msa_memmap_meta_path (str): Path to the CSV file with metadata linking MSA Cluster IDs and indices in the memmap array. + subset_path (str, optional): Path to a CSV file specifying a subset of cluster IDs to use. + sample (bool, optional): If True, randomly samples sequences from each cluster; otherwise, loads all sequences and shuffles them. + max_msa_len (int, optional): Maximum length of the MSA sequences to include. Defaults to -1 (no limit). + reverse (bool, optional): If True, reverses sequences with a probability of 0.5 and moves the last token to the front. + seed (int, optional): Random seed for reproducibility. Defaults to 42. + troubleshoot (bool, optional): If True, prints debugging information. Defaults to False. + fim_strategy (str, optional): Strategy for inpainting ("no-scramble", "one_span", or "multiple_span"). + max_patches (int, optional): Number of patches for inpainting. Used when fim_strategy is "multiple_span". + mask_fraction (float, optional): Fraction of the patches to mask. Used when fim_strategy is "multiple_span". + always_mask (bool, optional): If True, ensures masking is applied in the inpainting process. + max_position_embeddings (int, optional): Maximum position embeddings. Defaults to 2048. + max_seq_position_embeddings (int, optional): Maximum sequence position embeddings for 2D positional IDs. Defaults to 512. + add_position_ids (str, optional): Type of position IDs to add ("none", "1d", or "2d"). Defaults to "1d". 
def __getitem__(self, idx):
    """Build one example from the MSA cluster stored at row ``idx``.

    The cluster is read from the memmap, a number of its sequences is
    selected (a random number when ``self.sample`` is set, otherwise all of
    them), the selection is passed through the configured FIM strategy and,
    if ``self.reverse`` is set, reversed with probability 0.5.  The result is
    truncated to ``self.max_msa_len`` tokens (when positive) and converted to
    ``int64`` tensors.

    Args:
        idx: Row index into ``self.dataset_meta``.

    Returns:
        dict: Always contains ``input_ids`` and ``labels`` (the same tensor).
        With ``add_position_ids == "1d"`` it also contains ``position_ids``;
        with ``"2d"`` it additionally contains ``seq_position_ids``.
    """
    # All tokens of the cluster.
    sequences = self.get_sequences(idx)

    # Number of sequences in the cluster and how many of them to keep.
    orig_num_sequences = len(self.get_index_start_of_sequences(sequences))
    if self.sample:
        num_sequences = np.random.randint(1, orig_num_sequences + 1)
    else:
        num_sequences = orig_num_sequences

    sequences, position_ids = self.sample_sequences(sequences, num_sequences)

    # With probability 0.5, reverse the sequences and move the last token to
    # the front.  This must be a statement: written as
    # ``a, b = f() if cond else a, b`` the conditional binds tighter than the
    # comma, so ``a`` would receive the whole tuple returned by ``f()``.
    if self.reverse and np.random.rand() > 0.5:
        sequences, position_ids = self.reverse_sequences(sequences, position_ids)

    uses_position_ids = self.add_position_ids != "none"

    # Limit the length of the MSA.
    if self.max_msa_len > 0:
        sequences = sequences[:self.max_msa_len]
        if uses_position_ids:
            position_ids = position_ids[:self.max_msa_len]

    # Convert to tensors; position ids are clamped to the embedding table size.
    sequences = torch.asarray(sequences, dtype=torch.int64)
    if uses_position_ids:
        position_ids = torch.asarray(position_ids, dtype=torch.int64).clamp(
            0, self.max_position_embeddings - 1)
    else:
        position_ids = None

    if self.troubleshoot:
        print(
            f"Cluster {idx} has {orig_num_sequences} sequences, of which {num_sequences} sampled now. Total MSA length: {len(sequences)}")

    if self.add_position_ids == "1d":
        return dict(input_ids=sequences, position_ids=position_ids, labels=sequences)
    if self.add_position_ids == "2d":
        # Running count of the sequence-start token gives a per-sequence index.
        # NOTE(review): the AA_TO_ID key is reproduced exactly as it appears in
        # this view of the file; confirm it against protxlstm.utils.AA_TO_ID.
        seq_position_ids = (sequences == AA_TO_ID[""]).int().cumsum(-1).clamp(
            0, self.max_seq_position_embeddings - 1).contiguous()
        return dict(input_ids=sequences, position_ids=position_ids, seq_position_ids=seq_position_ids,
                    labels=sequences)
    return dict(input_ids=sequences, labels=sequences)
class ProteinDataCollator(object):
    """
    Collate examples into a batch, and pad batch to a specified maximum sequence length,
    or to the longest sequence in the batch if max_sequence_length is None.
    """

    def __init__(self, max_sequence_length: Optional[int] = None):
        """
        Initialize the collator with an optional max_sequence_length.

        Args:
            max_sequence_length (Optional[int]): The maximum sequence length to pad/truncate to.
                If None, pad to the longest sequence in the batch.
        """
        self.max_sequence_length = max_sequence_length

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """
        Turn a list of per-example dicts into one padded batch.

        Args:
            instances: Dicts holding an ``input_ids`` tensor and optionally
                ``labels``, ``position_ids`` and ``seq_position_ids`` tensors.

        Returns:
            Dict with ``input_ids``, ``labels`` and ``attention_mask``, plus
            ``position_ids`` / ``seq_position_ids`` when the first instance has them.
        """
        input_ids = [instance["input_ids"] for instance in instances]
        # Use the labels supplied with each instance; fall back to the inputs
        # for instances that carry none.
        labels = [instance.get("labels", instance["input_ids"]) for instance in instances]

        longest_seq = max(len(seq) for seq in input_ids)
        if self.max_sequence_length is None:
            max_len = longest_seq
        else:
            max_len = self.max_sequence_length

        # NOTE(review): the AA_TO_ID key below is reproduced exactly as it
        # appears in this view of the file; confirm it against protxlstm.utils.
        input_ids = self.pad_sequences(input_ids, max_len, padding_value=AA_TO_ID[""])

        # Labels get the padding token up to the longest sequence in the batch
        # and -100 for any remaining positions up to max_len.
        labels = self.pad_sequences(labels, longest_seq, padding_value=AA_TO_ID[""])
        labels = self.pad_sequences(labels, max_len, padding_value=-100)

        return_dict = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(AA_TO_ID[""])
        )

        if "position_ids" in instances[0]:
            position_ids = [instance["position_ids"] for instance in instances]
            return_dict["position_ids"] = self.pad_sequences(position_ids, max_len, padding_value=0)

        if "seq_position_ids" in instances[0]:
            seq_position_ids = [instance["seq_position_ids"] for instance in instances]
            return_dict["seq_position_ids"] = self.pad_sequences(seq_position_ids, max_len, padding_value=0)

        return return_dict

    def pad_sequences(self, seqs, max_length, padding_value):
        """
        Truncate every tensor in ``seqs`` to ``max_length`` and right-pad the
        stacked result with ``padding_value`` to exactly ``max_length`` columns.
        """
        # truncate long sequences
        seqs = [seq[:max_length] for seq in seqs]

        # pad to same length
        seqs = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=padding_value)

        # pad to max length
        padding = max_length - seqs.size(1)
        seqs = torch.nn.functional.pad(seqs, (0, padding), value=padding_value)

        return seqs
class SingleSpanFIM(AbstractFIM):
    """Fill-in-the-middle strategy that moves one random span to the end."""

    def __init__(self,
                 max_patches=5,
                 mask_fraction=0.2,
                 always_mask=False,
                 mask_tokens=MASK_TO_ID,
                 eos_token=AA_TO_ID[""],
                 add_position_ids=False,
                 troubleshoot=False):
        super().__init__(max_patches, mask_fraction, always_mask, mask_tokens, eos_token, add_position_ids, troubleshoot)

    def fim(self, sequences, t):
        """
        Split the sequence delimited by ``t`` at two random cut points and
        return ``part1 + [mask] + part3 + [mask] + part2``.

        Args:
            sequences: Token array of the whole cluster.
            t (tuple): ``(start, end)`` offsets of one sequence inside ``sequences``.

        Returns:
            tuple: ``(sequence, position_ids)``; ``position_ids`` is ``None``
            unless ``self.add_position_ids`` is set.  Both mask tokens carry
            the position id of the first token of the moved span.

        Raises:
            ValueError: Raised by ``np.random.choice`` when fewer than two
                cut points are available between ``t[0] + 1`` and ``t[1]``.
        """
        new_tuple = tuple(np.sort(np.random.choice(np.arange(t[0] + 1, t[1]), 2, replace=False)))
        part1 = sequences[t[0]:new_tuple[0]]
        part2 = sequences[new_tuple[0]:new_tuple[1]]
        part3 = sequences[new_tuple[1]:t[1]]
        # NOTE(review): the mask-token key is reproduced exactly as it appears
        # in this view of the file; confirm it against MASK_TO_ID.
        sequence = np.concatenate([part1, [self.mask_tokens[""]], part3, [self.mask_tokens[""]], part2])
        position_ids_sequence = None
        if self.add_position_ids:
            # position_ids is zero-based within this sequence, so the cut
            # points (absolute offsets into ``sequences``) must be shifted by
            # t[0] before they are used to slice it.
            position_ids = np.arange(t[0], t[1]) - t[0]
            first_cut = new_tuple[0] - t[0]
            second_cut = new_tuple[1] - t[0]
            position_ids_part1 = position_ids[:first_cut]
            position_ids_part2 = position_ids[first_cut:second_cut]
            position_ids_part3 = position_ids[second_cut:]
            position_ids_sequence = np.concatenate(
                [position_ids_part1, [position_ids_part2[0]], position_ids_part3, [position_ids_part2[0]],
                 position_ids_part2])

        return sequence, position_ids_sequence
At the end of the unmasked parts, a special token is added + to indicate the end of the unmasked parts, and at the end of the masked parts, a special token is added + to indicate the end of the masked parts. + """ + # sample num_patches from a discrete poisson distribution with upper limit L + def sample_lengths(start, end): + """ + Sample a length uniformly from 1 to max_L*self.mask_fraction (must be bigger than 1). + If the length is larger than max_L, return max_L. + """ + max_L = end - start + length = np.random.randint(1, max(int(max_L * self.mask_fraction), 2)) + return min(length, max_L) + + # sample num_patches from a discrete poisson distribution with upper limit max_patches + num_patches = 1000 + while num_patches > self.max_patches: + num_patches = np.random.poisson(1) + if self.always_mask: + num_patches = max(num_patches, 1) + # sample num_patches starting points for the masked positions (+ final position) + start_patches = list(np.sort(np.random.choice(np.arange(t[0] + 1, t[1]), + num_patches, + replace=False))) + [t[1]] + # sample num_patches lengths of the patches + len_patches = [sample_lengths(start_patches[i], start_patches[i + 1]) + for i in range(len(start_patches) - 1)] + # create masked tuples with start and end indices of the patches + masked_tuples = [(start_patches[i], start_patches[i] + len_patches[i]) for i in range(len(start_patches) - 1)] + # split the sequences into unmasked and masked parts + unmasked_sequence, masked_sequence, unmasked_position_ids, masked_position_ids = self.split_sequences(sequences, + t, + masked_tuples) + + if self.troubleshoot: + print(f"For sequence in {t}: sampled {num_patches=}, {start_patches=}, {len_patches=}, {masked_tuples=}") + # concatenate the unmasked and masked parts + return unmasked_sequence + masked_sequence, unmasked_position_ids + masked_position_ids if self.add_position_ids else None + + def split_sequences(self, sequences, t, masked_tuples): + """ + This function splits the sequences into 
unmasked and masked parts based on the given tuples. + Args: + t (tuple): The start and end index of each sequence. + masked_tuples (list): A list of tuples specifying the indices for masked regions. + Returns: + unmasked_parts (list): The unmasked parts of the sequences interleaved with mask_tokens. + masked_parts (list): The masked parts of the sequences interleaved with mask_tokens. + """ + unmasked_parts, masked_parts = [], [] + unmasked_positions, masked_positions = [], [] + position_ids = None + start, end = t + if self.add_position_ids: + position_ids = np.arange(start, end) - start + for i, region in enumerate(masked_tuples): + mask_token = self.mask_tokens[f""] + unmasked_parts.extend(sequences[start:region[0]]) + unmasked_parts.append(mask_token) + masked_parts.append(mask_token) + masked_parts.extend(sequences[region[0]:region[1]]) + if self.add_position_ids: + unmasked_positions.extend(position_ids[start-t[0]:region[0]-t[0]]) + unmasked_positions.append(position_ids[region[0]-t[0]]) + masked_positions.append(position_ids[region[0]-t[0]]) + masked_positions.extend(position_ids[region[0]-t[0]:region[1]-t[0]]) + + start = region[1] + unmasked_parts.extend(sequences[start:end]) + if self.add_position_ids: + unmasked_positions.extend(position_ids[start-t[0]:end-t[0]]) + if len(masked_tuples) > 0: + unmasked_parts.append(self.eos_token) + if self.add_position_ids: + unmasked_positions.append(0) + return unmasked_parts, masked_parts, unmasked_positions, masked_positions diff --git a/protxlstm/generation.py b/protxlstm/generation.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea6f25a5e56ef3e2738dec323281eadf2058f36 --- /dev/null +++ b/protxlstm/generation.py @@ -0,0 +1,384 @@ +# Original code from ProtMamba under Apache License 2.0. 
def sample_safe(logits, top_k=1, top_p=0.0, min_p=0.0, temperature=1.0):
    """Sample one token id per row of ``logits``.

    Arguments:
        logits: Tensor of shape (batch_size, vocab_size)
        top_k: 1 selects the argmax (greedy); a value > 0 restricts sampling
            to the ``top_k`` largest logits; 0 applies no top-k restriction.
        top_p: passed to ``modify_logits_for_top_p_filtering``.
        min_p: when > 0 and ``top_k == 0``, passed to
            ``modify_logits_for_min_p_filtering`` instead of top-p filtering.
        temperature: logits are divided by this value when it differs from 1.

    Returns:
        Tensor of shape (batch_size,) with the chosen token ids.
    """
    if top_k == 1:  # Short-circuit for greedy decoding
        return logits.argmax(dim=-1)

    if top_p > 0.0:
        assert top_p <= 1.0, "top-p should be in (0, 1]."

    if top_k > 0:
        top_k = min(top_k, logits.size(-1))  # Safety check
        logits_top, indices = torch.topk(logits, top_k, dim=-1)
        if temperature != 1.0:
            logits_top /= temperature
        modify_logits_for_top_p_filtering(logits_top, top_p)
        # Map the position sampled inside the top-k slice back to a vocab id.
        return indices[
            torch.arange(indices.shape[0], device=indices.device),
            torch.multinomial(
                torch.softmax(logits_top, dim=-1), num_samples=1
            ).squeeze(dim=-1),
        ]

    if min_p > 0.0:
        # Clone so the caller's logits are not modified in place.  No scalar
        # is extracted from the logits here: the filter receives ``min_p``
        # directly, and ``.item()`` would fail for batch sizes above one.
        logits_top = logits.clone()
        modify_logits_for_min_p_filtering(logits_top, min_p)
        if temperature != 1.0:
            logits_top /= temperature
        return torch.multinomial(
            torch.softmax(logits_top, dim=-1), num_samples=1
        ).squeeze(dim=-1)

    # Clone so that when we modify for top_p we don't change the original logits
    logits_top = logits / temperature if temperature != 1.0 else logits.clone()
    modify_logits_for_top_p_filtering(logits_top, top_p)
    return torch.multinomial(
        torch.softmax(logits_top, dim=-1), num_samples=1
    ).squeeze(dim=-1)
decode_safe( + input_ids, + position_ids, + seq_position_ids, + is_fim, + model, + max_length, + state=None, + top_k=1, + top_p=0.0, + min_p=0.0, + temperature=1.0, + repetition_penalty=1.0, + eos_token_id=None, + teacher_outputs=None, + vocab_size=None, + cg=False, + enable_timing=False, + streamer = None, + chunk_chunk_size = 2**15, +): + """Decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. + + Arguments: + input_ids: (batch, seq_len) + max_length: int + is_fim: dictionary with mask indices and associated position indices + teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the + logits, the next token is taken from the teacher_outputs. Useful for testing. + Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields: + sequences: (batch, max_length) + scores: tuples of (batch, vocab_size) + """ + if streamer is not None: + streamer.put(input_ids.cpu()) + + batch_size, seqlen_og = input_ids.shape + teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0 + if cg: + if not hasattr(model, "_decoding_cache"): + model._decoding_cache = None + model._decoding_cache = update_graph_cache( + model, + model._decoding_cache, + batch_size, + seqlen_og, + max_length, + ) + inference_params = model._decoding_cache.inference_params + inference_params.reset(max_length, batch_size) + else: + inference_params = InferenceParams( + max_seqlen=max_length, max_batch_size=batch_size + ) + + def get_logits(input_ids, position_ids, seq_position_ids, inference_params): + decoding = inference_params.seqlen_offset > 0 + if not cg or not decoding: + logits = model( + input_ids, + position_ids=position_ids, + seq_position_ids=seq_position_ids, + 
inference_params=inference_params, + num_last_tokens=1, + ).logits.squeeze(dim=1) + else: + logits = model._decoding_cache.run( + input_ids, + position_ids, + inference_params.seqlen_offset, + seq_position_ids=seq_position_ids, + ).squeeze(dim=1) + return logits[..., :vocab_size] if vocab_size is not None else logits + + def get_xlstm_logits_step(input_ids, position_ids, seq_position_ids, state): + + if not input_ids.shape[1] == 1: + + for i in range(input_ids.shape[1]): + if position_ids != None: + token_position_ids = position_ids[:,i:(i+1)] + else: + token_position_ids = None + if seq_position_ids != None: + token_seq_position_ids = seq_position_ids[:,i:(i+1)] + else: + token_seq_position_ids = None + logits, state = model.step(input_ids[:,i:(i+1)], state, position_ids=token_position_ids, seq_position_ids=token_seq_position_ids) + + else: + logits, state = model.step(input_ids, state, position_ids=position_ids, seq_position_ids=seq_position_ids) + + logits = logits.squeeze(dim=1) + if vocab_size is not None: + logits = logits[..., :vocab_size] + + return logits, state + + def get_xlstm_logits_chunkwise(input_ids, position_ids, seq_position_ids, chunk_chunk_size=2**15, state=None): + + assert model.config.config_dataclass.mlstm_block.mlstm.backend == "chunkwise_variable" + + for chunk in range(input_ids.shape[1]//chunk_chunk_size+1): + + start_idx = chunk*chunk_chunk_size + end_idx = min((chunk+1)*chunk_chunk_size, input_ids.shape[1]) + + if start_idx == end_idx: + pass + + else: + input_ids_chunk = input_ids[:, start_idx:end_idx] + + if not position_ids == None: + position_ids_chunk = position_ids[:, start_idx:end_idx] + else: + position_ids_chunk = None + + if not seq_position_ids == None: + seq_position_ids_chunk = seq_position_ids[:, start_idx:end_idx] + else: + seq_position_ids_chunk = None + + outputs = model(input_ids_chunk, position_ids=position_ids_chunk, seq_position_ids=seq_position_ids_chunk, state=state) + logits, state = outputs.logits, 
outputs.state + + logits = logits[:,-1,:] + logits = logits.squeeze(dim=1) + if vocab_size is not None: + logits = logits[..., :vocab_size] + + return logits, state + + def sample_tokens(logits, inference_params): + if ( + teacher_outputs is None + or teacher_output_len <= inference_params.seqlen_offset + ): + token = sample_safe( + logits, top_k=top_k, top_p=top_p, min_p=min_p, temperature=temperature + ) + else: + token = teacher_outputs[:, inference_params.seqlen_offset] + # return rearrange(token, "b -> b 1") + return token.unsqueeze(1) + + def get_fim_position_id( + last_position_ids, sampled_tokens, is_fim, repeat_next=False + ): + if type(is_fim) is dict: + val = int(last_position_ids) + 1 + should_repeat_next = False + if is_fim and int(sampled_tokens) in is_fim: + val = is_fim[int(sampled_tokens)] + should_repeat_next = True + elif repeat_next: + val = int(last_position_ids) + return torch.full_like(last_position_ids, fill_value=val), should_repeat_next + else: + t = [get_fim_position_id(last_position_ids_, sampled_tokens_, is_fim_dict, repeat_next) for + (last_position_ids_, sampled_tokens_, is_fim_dict) in + zip(last_position_ids, sampled_tokens, is_fim)] + return torch.stack([t_[0] for t_ in t], dim=0), t[0][1] + + def should_stop(current_token, inference_params): + if inference_params.seqlen_offset == 0: + return False + if eos_token_id is not None and (current_token == eos_token_id).any(): + if current_token.shape[1] > 1: + raise NotImplementedError("Batched eos_token_id not supported") + return True + if inference_params.seqlen_offset >= max_length - 1: + return True + return False + + start = torch.cuda.Event(enable_timing=enable_timing) + end = torch.cuda.Event(enable_timing=enable_timing) + + if enable_timing: + start.record() + scores, sequences = [], [input_ids] + new_position_ids, new_seq_position_ids = [position_ids], [seq_position_ids] + sequences_cat = input_ids + repeat_next = False + if position_ids.shape[0] > 1: + raise 
NotImplementedError("Batched generation with position_ids not supported") + + encode_context=True + while not should_stop(sequences[-1], inference_params): + + from protxlstm.models.xlstm import xLSTMLMHeadModel + if isinstance(model, xLSTMLMHeadModel): + if encode_context: + with torch.no_grad(): + logits, state = get_xlstm_logits_chunkwise(sequences[-1], new_position_ids[-1], new_seq_position_ids[-1], state=state, chunk_chunk_size=chunk_chunk_size) + encode_context = False + else: + logits, state = get_xlstm_logits_step(sequences[-1], new_position_ids[-1], new_seq_position_ids[-1], state=state) + else: + logits = get_logits(sequences[-1], new_position_ids[-1], new_seq_position_ids[-1], inference_params) + + scores.append(logits) + + inference_params.seqlen_offset += sequences[-1].shape[1] + if repetition_penalty == 1.0: + sampled_tokens = sample_tokens(scores[-1], inference_params) + else: + logits = modify_logit_for_repetition_penalty( + scores[-1].clone(), sequences_cat, repetition_penalty + ) + sampled_tokens = sample_tokens(logits, inference_params) + sequences_cat = torch.cat([sequences_cat, sampled_tokens], dim=1) + sequences.append(sampled_tokens) + # Update position_ids + if position_ids is not None: + last_position_ids, repeat_next = get_fim_position_id( + new_position_ids[-1][:, -1:], sampled_tokens, is_fim, repeat_next + ) + new_position_ids.append(last_position_ids) + # Update seq_position_ids + if seq_position_ids is not None: + new_seq_position_ids.append(new_seq_position_ids[-1][:, -1:]) + + if streamer is not None: + streamer.put(sampled_tokens.cpu()) + if streamer is not None: + streamer.end() + if enable_timing: + end.record() + torch.cuda.synchronize() + print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms") + output_cls = ( + GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput + ) + return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores)) + + +class 
def generate_sequence(model, tokens, position_ids=None, seq_position_ids=None, state=None, is_fim=False, max_length=2000, temperature=1., top_p=0.0, top_k=1,
                      return_dict_in_generate=False, output_scores=False, eos_token_id=AA_TO_ID[""], device="cuda", chunk_chunk_size=2**15):
    """Generate with ``model.generate`` and decode prompt and continuation.

    Generating, either greedy or with top-k or top-p sampling.
    If top-k = 0, don't limit the number of candidates (pure sampling).
    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
    then top-p. We assume that all sequences in the same batch have the same length.

    Args:
        model: Object exposing ``generate(...)`` with the keyword arguments used below.
        tokens: Prompt token ids; moved to ``device`` before generation.
        position_ids, seq_position_ids: Optional tensors, moved to ``device`` when given.
        return_dict_in_generate: Accepted for backward compatibility. The
            structured output is always requested because this function reads
            ``.sequences`` (and ``.scores``) from it.
        output_scores: When True, a ``"scores"`` array is added to the result.

    Returns:
        dict with keys ``input``, ``generated``, ``input_tokens``,
        ``generated_tokens`` and, if ``output_scores``, ``scores``.
    """
    input_ids = tokens.to(device)
    position_ids = position_ids.to(device) if position_ids is not None else None
    seq_position_ids = seq_position_ids.to(device) if seq_position_ids is not None else None
    # generate sequence
    out = model.generate(input_ids=input_ids,
                         position_ids=position_ids,
                         seq_position_ids=seq_position_ids,
                         is_fim=is_fim,
                         state=state,
                         max_length=max_length,
                         temperature=temperature,
                         top_p=top_p,
                         top_k=top_k,
                         # Always ask for the output object: with False the
                         # model returns a bare tensor without ``.sequences``.
                         return_dict_in_generate=True,
                         output_scores=output_scores,
                         eos_token_id=eos_token_id,
                         chunk_chunk_size=chunk_chunk_size,
                         )
    sequences = out.sequences
    prompt_len = input_ids.shape[-1]
    prompt_tokens = sequences[:, :prompt_len].cpu().numpy()
    new_tokens = sequences[:, prompt_len:].cpu().numpy()
    dic = {"input": [decode_sequence(seq) for seq in prompt_tokens],
           "generated": [decode_sequence(seq) for seq in new_tokens],
           "input_tokens": list(prompt_tokens),
           "generated_tokens": list(new_tokens)}
    if output_scores:
        # (steps, batch, vocab) -> (batch, steps, vocab)
        dic["scores"] = np.array([el.to(torch.float32).cpu().numpy() for el in out.scores]).transpose(1, 0, 2)
    return dic

Index of /research/Bio-xLSTM/downloads/Prot-xLSTM/checkpoints/protxlstm_26M_30B

+
Icon  Name                                                  Last modified      Size  Description
[PARENTDIR] Parent Directory - +[   ] config.json 2024-11-04 14:36 1.8K +[   ] optimizer.pt 2024-11-04 14:36 198M +[   ] pytorch_model.bin 2024-11-04 14:36 99M +[   ] rng_state.pth 2024-11-04 14:36 14K +[   ] scheduler.pt 2024-11-04 14:36 1.0K +[   ] trainer_state.json 2024-11-04 14:36 2.4M +
+ diff --git a/protxlstm/mamba_utils_generation.py b/protxlstm/mamba_utils_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..96f74cfefd91983dbf386ebb7b20bbbfa90e300b --- /dev/null +++ b/protxlstm/mamba_utils_generation.py @@ -0,0 +1,382 @@ +# From: https://github.com/state-spaces/mamba/blob/main/mamba_ssm/utils/generation.py +# Copyright (c) 2023, Albert Gu, Tri Dao. +import gc +from dataclasses import dataclass, field +from typing import Callable, Optional + +import torch +from torch import Tensor +from transformers.generation import GreedySearchDecoderOnlyOutput, SampleDecoderOnlyOutput, TextStreamer + + +@dataclass +class InferenceParams: + """Inference parameters that are passed to the main model in order + to efficienly calculate and store the context during inference.""" + + max_seqlen: int + max_batch_size: int + seqlen_offset: int = 0 + batch_size_offset: int = 0 + key_value_memory_dict: dict = field(default_factory=dict) + lengths_per_sample: Optional[Tensor] = None + + def reset(self, max_seqlen, max_batch_size): + self.max_seqlen = max_seqlen + self.max_batch_size = max_batch_size + self.seqlen_offset = 0 + if self.lengths_per_sample is not None: + self.lengths_per_sample.zero_() + + +def modify_logits_for_min_p_filtering(logits, min_p): + """Set the logits for none min_p values to -inf. Done in-place.""" + if min_p <= 0.0 or min_p >= 1.0: + return + indices_to_remove = logits < min_p + logits.masked_fill_(indices_to_remove, float("-Inf")) +# https://github.com/NVIDIA/Megatron-LM/blob/0bb597b42c53355a567aba2a1357cc34b9d99ddd/megatron/text_generation/sampling.py +# https://github.com/huggingface/transformers/blob/a44985b41cfa2de48a5e1de7f1f93b7483da25d1/src/transformers/generation/logits_process.py#L231 +def modify_logits_for_top_k_filtering(logits, top_k): + """Set the logits for none top-k values to -inf. 
def modify_logits_for_top_p_filtering(logits, top_p):
    """Set the logits for none top-p values to -inf. Done in-place.

    Arguments:
        logits: Tensor whose last dimension indexes the vocabulary. It is
            modified in place.
        top_p: Cumulative probability to keep. Values <= 0 or >= 1 leave
            ``logits`` untouched.

    Returns:
        None.
    """
    if top_p <= 0.0 or top_p >= 1.0:
        return
    # First sort and calculate cumulative sum of probabilities.
    sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=False)
    cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
    # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
    sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
    # Scatter the mask back to the original ordering along the same (last)
    # dimension that was sorted, so inputs of any rank are handled.
    indices_to_remove = sorted_indices_to_remove.scatter(
        -1, sorted_indices, sorted_indices_to_remove
    )
    logits.masked_fill_(indices_to_remove, float("-inf"))
+ Arguments: + logits: Tensor of shape (batch_size, vocab_size) + """ + if top_k == 1: # Short-circuit for greedy decoding + return logits.argmax(dim=-1) + else: + if top_p > 0.0: + assert top_p <= 1.0, "top-p should be in (0, 1]." + if top_k > 0: + top_k = min(top_k, logits.size(-1)) # Safety check + logits_top, indices = torch.topk(logits, top_k, dim=-1) + if temperature != 1.0: + logits_top /= temperature + modify_logits_for_top_p_filtering(logits_top, top_p) + return indices[ + torch.arange(indices.shape[0], device=indices.device), + torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1), + ] + else: + if min_p > 0.0: + logits_top = logits.clone() + max_prob = logits_top[..., 0].item() + min_prob = max_prob * min_p + modify_logits_for_min_p_filtering(logits_top, min_prob) + if temperature != 1.0: + logits_top /= temperature + return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze(dim=-1) + # Clone so that when we modify for top_p we don't change the original logits + logits_top = logits / temperature if temperature != 1.0 else logits.clone() + modify_logits_for_top_p_filtering(logits_top, top_p) + return torch.multinomial(torch.softmax(logits_top, dim=-1), num_samples=1).squeeze( + dim=-1 + ) + + +@torch.inference_mode() +def decode( + input_ids, + model, + max_length, + top_k=1, + top_p=0.0, + min_p=0.0, + temperature=1.0, + repetition_penalty=1.0, + eos_token_id=None, + teacher_outputs=None, + vocab_size=None, + cg=False, + enable_timing=False, + streamer: Optional[TextStreamer] = None +): + """Decoding, either greedy or with top-k or top-p sampling. + If top-k = 0, don't limit the number of candidates (pure sampling). + Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first, + then top-p. + We assume that all sequences in the same batch have the same length. 
@torch.inference_mode()
def decode(
    input_ids,
    model,
    max_length,
    top_k=1,
    top_p=0.0,
    min_p=0.0,
    temperature=1.0,
    repetition_penalty=1.0,
    eos_token_id=None,
    teacher_outputs=None,
    vocab_size=None,
    cg=False,
    enable_timing=False,
    streamer: Optional[TextStreamer] = None
):
    """Decoding, either greedy or with top-k or top-p sampling.
    If top-k = 0, don't limit the number of candidates (pure sampling).
    Top-k and top-p can be used together. If top_k > 0 and top_p > 0, then top-k is applied first,
    then top-p.
    We assume that all sequences in the same batch have the same length.

    Arguments:
        input_ids: (batch, seq_len)
        max_length: int
        teacher_outputs (optional): (batch, seq_len). If provided, instead of sampling from the
            logits, the next token is taken from the teacher_outputs. Useful for testing.
        vocab_size (optional): if given, logits are truncated to the first ``vocab_size`` entries.
        cg: if True, decoding steps after the prompt are run through the callable stored in
            ``model._decoding_cache`` (built by ``update_graph_cache``).
        enable_timing: if True, measure the run with CUDA events and print the elapsed time.
        streamer (optional): receives the prompt and every sampled token, then ``end()``.
    Returns: GreedySearchDecoderOnlyOutput or SampleDecoderOnlyOutput, with the following fields:
        sequences: (batch, max_length)
        scores: tuples of (batch, vocab_size)
    """
    if streamer is not None:
        streamer.put(input_ids.cpu())

    batch_size, seqlen_og = input_ids.shape
    teacher_output_len = teacher_outputs.shape[1] if teacher_outputs is not None else 0
    if cg:
        if not hasattr(model, "_decoding_cache"):
            model._decoding_cache = None
        model._decoding_cache = update_graph_cache(
            model,
            model._decoding_cache,
            batch_size,
            seqlen_og,
            max_length,
        )
        inference_params = model._decoding_cache.inference_params
        inference_params.reset(max_length, batch_size)
    else:
        inference_params = InferenceParams(max_seqlen=max_length, max_batch_size=batch_size)

    def get_logits(input_ids, inference_params):
        # seqlen_offset == 0 means the prompt has not been processed yet.
        decoding = inference_params.seqlen_offset > 0
        if decoding:
            position_ids = torch.full(
                (batch_size, 1),
                inference_params.seqlen_offset,
                dtype=torch.long,
                device=input_ids.device,
            )
        else:
            position_ids = None
        if not cg or not decoding:
            logits = model(
                input_ids,
                position_ids=position_ids,
                inference_params=inference_params,
                num_last_tokens=1,
            ).logits.squeeze(dim=1)
        else:
            logits = model._decoding_cache.run(
                input_ids, position_ids, inference_params.seqlen_offset
            ).squeeze(dim=1)
        return logits[..., :vocab_size] if vocab_size is not None else logits

    def sample_tokens(logits, inference_params):
        if teacher_outputs is None or teacher_output_len <= inference_params.seqlen_offset:
            token = sample(logits, top_k=top_k, top_p=top_p, min_p=min_p, temperature=temperature)
        else:
            token = teacher_outputs[:, inference_params.seqlen_offset]
        # return rearrange(token, "b -> b 1")
        return token.unsqueeze(1)

    def should_stop(current_token, inference_params):
        # The first step always runs, so at least one token is produced.
        if inference_params.seqlen_offset == 0:
            return False
        if eos_token_id is not None and (current_token == eos_token_id).all():
            return True
        if inference_params.seqlen_offset >= max_length - 1:
            return True
        return False

    # Build the CUDA events only when timing was requested: they are unused
    # otherwise, and constructing them fails on torch builds without CUDA.
    start = end = None
    if enable_timing:
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
    scores, sequences = [], [input_ids]
    sequences_cat = input_ids
    while not should_stop(sequences[-1], inference_params):
        scores.append(get_logits(sequences[-1], inference_params))
        inference_params.seqlen_offset += sequences[-1].shape[1]
        if repetition_penalty == 1.0:
            sampled_tokens = sample_tokens(scores[-1], inference_params)
        else:
            logits = modify_logit_for_repetition_penalty(
                scores[-1].clone(), sequences_cat, repetition_penalty
            )
            sampled_tokens = sample_tokens(logits, inference_params)
            # The running concatenation is only read by the penalty above.
            sequences_cat = torch.cat([sequences_cat, sampled_tokens], dim=1)
        sequences.append(sampled_tokens)
        if streamer is not None:
            streamer.put(sampled_tokens.cpu())
    if streamer is not None:
        streamer.end()
    if enable_timing:
        end.record()
        torch.cuda.synchronize()
        print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms")
    output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput
    return output_cls(sequences=torch.cat(sequences, dim=1), scores=tuple(scores))
class GenerationMixin:
    """Mixin that adds ``generate`` on top of the module-level ``decode``."""

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Hook for concrete models; this base version always raises."""
        raise NotImplementedError

    def generate(
        self,
        input_ids,
        max_length,
        top_k=1,
        top_p=0.0,
        min_p=0.0,
        temperature=1.0,
        return_dict_in_generate=False,
        output_scores=False,
        **kwargs,
    ):
        """Run ``decode`` with ``self`` as the model.

        Extra keyword arguments are forwarded to ``decode`` untouched. The
        scores are dropped unless ``output_scores`` is set, and only the
        ``sequences`` field is returned unless ``return_dict_in_generate`` is
        set.
        """
        result = decode(
            input_ids,
            self,
            max_length,
            top_k=top_k,
            top_p=top_p,
            min_p=min_p,
            temperature=temperature,
            **kwargs,
        )
        if not output_scores:
            result.scores = None
        if return_dict_in_generate:
            return result
        return result.sequences


@dataclass
class DecodingCGCache:
    """Book-keeping object filled in by ``update_graph_cache``."""

    max_batch_size: int = 0
    max_seqlen: int = 0
    # No annotation on the next two names, so they are plain class attributes
    # rather than dataclass fields / constructor parameters.
    device = None
    dtype = None
    callables: dict = field(default_factory=dict)
    mempool = None  # also an un-annotated class attribute
    inference_params: Optional[InferenceParams] = None
    run: Optional[Callable] = None
@torch.inference_mode()
def update_graph_cache(
    model,
    cache,
    batch_size,
    seqlen_og,
    max_seqlen,
    decoding_seqlens=(1,),
    dtype=None,
    n_warmups=2,
):
    """Create or refresh the ``DecodingCGCache`` used for graph-based decoding.

    The cache is rebuilt from scratch when the model's device/dtype differs
    from the cached one or when ``batch_size`` / ``max_seqlen`` exceed the
    cached limits. Afterwards one callable per requested decoding length is
    captured (if missing) and ``cache.run`` is pointed at a dispatcher that
    selects the callable from the first two dimensions of its input.

    ``dtype`` defaults to the dtype of the model's first parameter. Returns the
    (possibly newly created) cache.
    """
    if cache is None:
        cache = DecodingCGCache()
    first_param = next(iter(model.parameters()))
    device = first_param.device
    if dtype is None:
        dtype = first_param.dtype

    placement_changed = (device, dtype) != (cache.device, cache.dtype)
    outgrown = batch_size > cache.max_batch_size or max_seqlen > cache.max_seqlen
    if placement_changed or outgrown:  # Invalidate the cache
        cache.callables = {}
        cache.mempool = None
        cache.inference_params = None
        gc.collect()
        cache.device, cache.dtype = device, dtype
        cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen
        assert hasattr(model, "allocate_inference_cache"), "CUDA graph decoding requires that the model has a method allocate_inference_cache"
        inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
        lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device)
        cache.inference_params = InferenceParams(
            max_seqlen=max_seqlen,
            max_batch_size=batch_size,
            seqlen_offset=seqlen_og,
            key_value_memory_dict=inf_cache,
            lengths_per_sample=lengths_per_sample,
        )
        cache.mempool = torch.cuda.graphs.graph_pool_handle()

    for decoding_seqlen in decoding_seqlens:
        key = (batch_size, decoding_seqlen)
        if key in cache.callables:
            continue
        cache.callables[key] = capture_graph(
            model,
            cache.inference_params,
            batch_size,
            max_seqlen,
            decoding_seqlen=decoding_seqlen,
            mempool=cache.mempool,
            n_warmups=n_warmups,
        )

    def dispatch(input_ids, position_ids, seqlen):
        # Look the callable up by the (batch, decoding length) of the input.
        batch_size, decoding_seqlen = input_ids.shape[:2]
        return cache.callables[batch_size, decoding_seqlen](input_ids, position_ids, seqlen)

    cache.run = dispatch
    cache.inference_params.seqlen_offset = 0  # Reset so it's not confusing
    return cache
def capture_graph(
    model, inference_params, batch_size, max_seqlen, decoding_seqlen=1, mempool=None, n_warmups=2
):
    """Capture one model forward pass in a CUDA graph and return a replay function.

    The returned ``run(new_input_ids, new_position_ids, seqlen)`` copies its
    inputs into the buffers used during capture, writes ``seqlen`` into
    ``inference_params.lengths_per_sample``, replays the graph and returns a
    clone of the captured logits. ``inference_params.seqlen_offset`` is restored
    to its incoming value before returning.
    """
    device = next(iter(model.parameters())).device
    # Fixed buffers: the graph is captured against these exact tensors.
    input_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
    position_ids = torch.full((batch_size, decoding_seqlen), 0, dtype=torch.long, device=device)
    seqlen_offset_og = inference_params.seqlen_offset
    inference_params.seqlen_offset = max_seqlen - decoding_seqlen
    inference_params.lengths_per_sample[:] = inference_params.seqlen_offset

    # Warmup before capture
    # NOTE(review): indentation was lost in this dump; the synchronize/barrier
    # placement inside the stream context is assumed -- confirm against the file.
    warmup_stream = torch.cuda.Stream()
    warmup_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(warmup_stream):
        for _ in range(n_warmups):
            logits = model(
                input_ids,
                position_ids=position_ids,
                inference_params=inference_params,
                num_last_tokens=decoding_seqlen,
            ).logits
        warmup_stream.synchronize()
        # This might be needed for correctness if we run with NCCL_GRAPH_MIXING_SUPPORT=0,
        # which requires that graph launch and non-captured launch to not overlap (I think,
        # that's how I interpret the documentation). I'm not sure if this is required.
        if torch.distributed.is_initialized():
            torch.distributed.barrier()
    torch.cuda.current_stream().wait_stream(warmup_stream)
    # Captures the graph
    # To allow capture, automatically sets a side stream as the current stream in the context
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, pool=mempool):
        logits = model(
            input_ids,
            position_ids=position_ids,
            inference_params=inference_params,
            num_last_tokens=decoding_seqlen,
        ).logits

    def run(new_input_ids, new_position_ids, seqlen):
        inference_params.lengths_per_sample[:] = seqlen
        input_ids.copy_(new_input_ids)
        position_ids.copy_(new_position_ids)
        graph.replay()
        return logits.clone()

    inference_params.seqlen_offset = seqlen_offset_og
    return run


# --- protxlstm/models/llama.py ---
# Note: generation capabilities are not implemented for the transformer

class TransformerConfig(PretrainedConfig):
    """Configuration for the Llama-style transformer defined in this module.

    The constructor arguments are stored under the attribute names used by the
    model code (e.g. ``d_model`` -> ``dim``, ``n_layer`` -> ``n_layers``,
    ``max_length`` -> ``max_seq_len``); ``bidirectional`` is stored inverted as
    ``causal_attention``. Remaining keyword arguments go to ``PretrainedConfig``.
    """

    model_type = "llama"

    def __init__(
        self,
        d_model,
        n_layer,
        n_heads,
        n_kv_heads,
        bidirectional,
        vocab_size,
        hidden_dim,
        multiple_of,  # MLP hidden layer size will be multiple of
        norm_eps,
        max_length,
        dropout,
        max_position_embeddings,
        rope_base_frequency,
        **kwargs
    ):
        super().__init__(**kwargs)

        # default hyperparameters for the Llama 7B model
        self.dim = d_model
        self.n_layers = n_layer
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.causal_attention = not bidirectional
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.multiple_of = multiple_of
        self.norm_eps = norm_eps
        self.max_seq_len = max_length
        self.dropout = dropout
        self.max_position_embeddings = max_position_embeddings
        self.rope_base_frequency = rope_base_frequency
class RMSNorm_transformer(torch.nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned scale."""

    def __init__(self, dim: int, eps: float):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """Divide ``x`` by the RMS of its last dimension (``eps`` added under the root)."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        # Normalise in float32, cast back to the input dtype, then scale.
        normed = self._norm(x.float()).type_as(x)
        return normed * self.weight

def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
    """Return the cosine and sine rotary tables, each of shape (end, dim // 2)."""
    exponents = torch.arange(0, dim, 2)[: (dim // 2)].float() / dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(end, device=inv_freq.device)  # type: ignore
    angles = torch.outer(positions, inv_freq).float()  # type: ignore
    # cosine is the real part, sine the imaginary part of the rotation
    return torch.cos(angles), torch.sin(angles)

def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    """View ``freqs_cis`` so it broadcasts against ``x`` on dims 1 and -1.

    ``freqs_cis`` must have shape ``(x.shape[1], x.shape[-1])``; every other
    dimension of the result has size 1.
    """
    ndim = x.ndim
    assert 0 <= 1 < ndim
    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
    last = ndim - 1
    shape = [size if axis in (1, last) else 1 for axis, size in enumerate(x.shape)]
    return freqs_cis.view(shape)

def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cos: torch.Tensor,
    freqs_sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Rotate consecutive value pairs of ``xq`` and ``xk`` by the given angles.

    The last dimension is read as (real, imaginary) pairs. The arithmetic is
    done in float32 and the results are cast back to the input dtypes.
    """

    def split_pairs(t):
        # (..., d) -> two tensors (..., d / 2): even and odd positions
        return t.float().reshape(t.shape[:-1] + (-1, 2)).unbind(-1)

    q_re, q_im = split_pairs(xq)
    k_re, k_im = split_pairs(xk)

    # both tables are shaped against the query halves
    cos = reshape_for_broadcast(freqs_cos, q_re)
    sin = reshape_for_broadcast(freqs_sin, q_re)

    def rotate(re, im):
        out_re = re * cos - im * sin
        out_im = re * sin + im * cos
        # interleave the pairs again and merge the last two dimensions
        return torch.stack([out_re, out_im], dim=-1).flatten(3)

    return rotate(q_re, q_im).type_as(xq), rotate(k_re, k_im).type_as(xk)

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, slen, n_kv_heads, head_dim = x.shape
    if n_rep == 1:
        return x
    expanded = x[:, :, :, None, :].expand(bs, slen, n_kv_heads, n_rep, head_dim)
    return expanded.reshape(bs, slen, n_kv_heads * n_rep, head_dim)
class Attention(nn.Module):
    """Multi-head attention with grouped key/value heads and rotary embeddings."""

    def __init__(self, args: TransformerConfig):
        super().__init__()
        n_heads = args.n_heads
        self.n_kv_heads = n_heads if args.n_kv_heads is None else args.n_kv_heads
        assert n_heads % self.n_kv_heads == 0
        model_parallel_size = 1
        self.n_local_heads = n_heads // model_parallel_size
        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
        # how many query heads share one key/value head
        self.n_rep = self.n_local_heads // self.n_local_kv_heads
        self.head_dim = args.dim // n_heads
        q_width = n_heads * self.head_dim
        kv_width = self.n_kv_heads * self.head_dim
        self.wq = nn.Linear(args.dim, q_width, bias=False)
        self.wk = nn.Linear(args.dim, kv_width, bias=False)
        self.wv = nn.Linear(args.dim, kv_width, bias=False)
        self.wo = nn.Linear(q_width, args.dim, bias=False)
        self.attn_dropout = nn.Dropout(args.dropout)
        self.resid_dropout = nn.Dropout(args.dropout)
        self.dropout = args.dropout
        self.causal_attention = args.causal_attention

        # use flash attention or a manual implementation?
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash and self.causal_attention:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # additive mask: -inf strictly above the diagonal, 0 elsewhere
            full = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
            self.register_buffer("mask", torch.triu(full, diagonal=1))

    def forward(
        self,
        x: torch.Tensor,
        freqs_cos: torch.Tensor,
        freqs_sin: torch.Tensor,
    ):
        """Attend over ``x`` of shape (batch, seqlen, dim); the output has the same shape."""
        bsz, seqlen, _ = x.shape

        # QKV projections, split into heads
        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

        # RoPE relative positional embeddings
        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)

        # grouped multiquery attention: expand out keys and values, then
        # make heads into a batch dimension: (bs, n_local_heads, seqlen, head_dim)
        xq = xq.transpose(1, 2)
        xk = repeat_kv(xk, self.n_rep).transpose(1, 2)
        xv = repeat_kv(xv, self.n_rep).transpose(1, 2)

        if self.flash:
            # fused implementation; dropout is only active in training mode
            dropout_p = self.dropout if self.training else 0.0
            output = torch.nn.functional.scaled_dot_product_attention(
                xq, xk, xv, attn_mask=None, dropout_p=dropout_p, is_causal=self.causal_attention
            )
        else:
            # manual implementation
            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
            if self.causal_attention:
                scores = scores + self.mask[:, :, :seqlen, :seqlen]
            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
            scores = self.attn_dropout(scores)
            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)

        # restore time as batch dimension and concat heads
        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)

        # final projection into the residual stream
        return self.resid_dropout(self.wo(output))
class FeedForward(nn.Module):
    """Gated MLP: ``w2(silu(w1(x)) * w3(x))`` followed by dropout."""

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        super().__init__()
        # NOTE(review): indentation was lost in this dump. It is assumed that
        # all three hidden_dim statements belong to the ``is None`` branch, so
        # an explicit hidden_dim is used unchanged -- confirm against the file.
        if hidden_dim is None:
            hidden_dim = int(2 * (4 * dim) / 3)
            # round up to the next multiple of ``multiple_of``
            hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gate = F.silu(self.w1(x))
        return self.dropout(self.w2(gate * self.w3(x)))

class TransformerBlock(nn.Module):
    """Pre-norm block: attention and feed-forward, each with a residual connection."""

    def __init__(self, layer_id: int, args: TransformerConfig):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm_transformer(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm_transformer(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        # ``.forward`` is called directly on the sub-modules, as in the original.
        attended = self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        h = x + attended
        return h + self.feed_forward.forward(self.ffn_norm(h))
class Transformer(nn.Module):
    """Stack of ``TransformerBlock`` layers with tied input/output embeddings."""

    last_loss: Optional[torch.Tensor]

    def __init__(self, params: TransformerConfig):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.layer_head_dim = self.layers[0].head_dim

        self.norm = RMSNorm_transformer(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # share the unembedding parameters with the embedding parameters
        self.tok_embeddings.weight = self.output.weight # https://paperswithcode.com/method/weight-tying

        # some useful precompute for the RoPE relative positional embeddings
        # freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
        # self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        # self.register_buffer("freqs_sin", freqs_sin, persistent=False)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        scaled_std = 0.02/math.sqrt(2 * params.n_layers)
        for name, param in self.named_parameters():
            if name.endswith(('w3.weight', 'wo.weight')):
                torch.nn.init.normal_(param, mean=0.0, std=scaled_std)

        # Loss of the most recent forward call; only set when targets were given.
        self.last_loss = None

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding weights; Linear biases are zeroed."""
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None, **kwargs) -> torch.Tensor:
        """Return the logits for ``tokens`` of shape (batch, seqlen).

        ``position_ids`` must be passed as a keyword argument, otherwise a
        ValueError is raised. When ``targets`` is given, the cross-entropy loss
        (ignoring target value -1) is stored in ``self.last_loss``; otherwise
        ``self.last_loss`` is reset to None.
        """
        _bsz, seqlen = tokens.shape
        h = self.dropout(self.tok_embeddings(tokens))
        # freqs_cos = self.freqs_cos[:seqlen]
        # freqs_sin = self.freqs_sin[:seqlen]

        if 'position_ids' not in kwargs:
            raise ValueError('Llama model only implemented with RoPEs')
        freqs_cos, freqs_sin = compute_freqs_cis(kwargs.pop("position_ids"), self.layer_head_dim, theta=self.params.rope_base_frequency)

        # NOTE(review): squeeze() drops every size-1 dimension; verify the
        # shapes returned by compute_freqs_cis for batch or sequence size 1.
        freqs_cos = freqs_cos.squeeze()
        freqs_sin = freqs_sin.squeeze()

        for layer in self.layers:
            h = layer(h, freqs_cos, freqs_sin)
        logits = self.output(self.norm(h))

        if targets is None:
            self.last_loss = None
        else:
            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits

class TransformerLMHeadModel(nn.Module):
    """Thin wrapper exposing ``Transformer`` through a (loss, logits) output."""

    def __init__(
        self,
        config: TransformerConfig,
    ) -> None:

        super().__init__()

        self.config = config

        self.backbone = Transformer(config)

    def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
        """Run the backbone and wrap its logits.

        ``inference_params`` and ``num_last_tokens`` are accepted but not used
        by this implementation: logits for all positions are returned and
        ``loss`` is always None.
        """
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        lm_logits = self.backbone(input_ids, position_ids=position_ids)
        return CausalLMOutput(loss=None, logits=lm_logits)

    def save_pretrained(self, save_directory):
        """
        Save the model and its configuration file to a directory.

        Writes ``pytorch_model.bin`` (the state dict) and ``config.json``; the
        directory is created if needed.
        """
        os.makedirs(save_directory, exist_ok=True)

        weights_file = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), weights_file)

        config_file = os.path.join(save_directory, "config.json")
        with open(config_file, "w") as f:
            json.dump(self.config.to_dict(), f)
# NOTE: this dataclass shadows the ``MambaConfig`` imported from
# ``mamba_ssm.models.config_mamba`` at the top of the file.
@dataclass
class MambaConfig(PretrainedConfig):
    d_model: int = 2560
    n_layer: int = 64
    vocab_size: int = 50277
    ssm_cfg: dict = field(default_factory=dict)
    rms_norm: bool = True
    residual_in_fp32: bool = True
    fused_add_norm: bool = True
    pad_vocab_size_multiple: int = 8
    max_position_embeddings: int = 2048

def create_block(
    d_model,
    ssm_cfg=None,
    norm_epsilon=1e-5,
    rms_norm=False,
    residual_in_fp32=False,
    fused_add_norm=False,
    layer_idx=None,
    device=None,
    dtype=None,
    checkpoint_mixer=False,
):
    """Build one ``Block`` with a ``Mamba`` mixer and LayerNorm or RMSNorm.

    ``ssm_cfg`` entries are passed to the mixer constructor. With
    ``checkpoint_mixer`` the mixer is wrapped in ``CheckpointedModule``. The
    block's ``layer_idx`` attribute is set to ``layer_idx``.
    """
    if ssm_cfg is None:
        ssm_cfg = {}
    factory_kwargs = {"device": device, "dtype": dtype}
    mixer_cls = partial(Mamba, layer_idx=layer_idx, **ssm_cfg, **factory_kwargs)
    norm_cls = partial(
        nn.LayerNorm if not rms_norm else RMSNorm, eps=norm_epsilon, **factory_kwargs
    )
    block = Block(
        d_model,
        mixer_cls,
        norm_cls=norm_cls,
        fused_add_norm=fused_add_norm,
        residual_in_fp32=residual_in_fp32,
    )
    block.layer_idx = layer_idx
    if checkpoint_mixer:
        block.mixer = CheckpointedModule(block.mixer)
    return block

class CheckpointedModule(torch.nn.Module):
    """Wrap a module so its forward pass runs under activation checkpointing."""

    def __init__(self, layer):
        super().__init__()
        self.ckpt_layer = layer

    def forward(self, x, *args, **kwargs):
        # Forward the extra arguments as well; they used to be dropped
        # silently, so e.g. ``inference_params`` never reached the wrapped
        # layer. Non-reentrant checkpointing supports keyword arguments.
        return checkpoint(self.ckpt_layer, x, *args, use_reentrant=False, **kwargs)

    # def state_dict(self, **kwargs):
    #     # Get the state dict of the underlying layer
    #     layer_state_dict = self.ckpt_layer.state_dict(**kwargs)
    #     # Create a new state dict with the original keys
    #     state_dict = {k.replace('ckpt_layer.', ''): v for k, v in layer_state_dict.items()}
    #     return state_dict
class MixerModelSafe(MixerModel):
    """
    Overwrite the forward method to allow saving intermediate layers.
    """

    def forward(self, input_ids, inference_params=None, save_layer=()):
        """Run the layer stack.

        Arguments:
            input_ids: token ids passed to ``self.embedding``.
            inference_params: forwarded to every layer.
            save_layer: 1-based layer numbers whose outputs should be captured.
                The default is an immutable empty tuple (no capture).
        Returns:
            When ``save_layer`` is non-empty: a dict mapping each requested
            layer number to that layer's output as a float NumPy array; the
            final norm is skipped. Otherwise the normalised hidden states.
        """
        hidden_states = self.embedding(input_ids)
        residual = None
        if len(save_layer) > 0:
            hidden_states_dict = {}
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=inference_params
            )
            if i + 1 in save_layer:
                hidden_states_dict[i + 1] = (
                    hidden_states.detach().cpu().to(torch.float).numpy()
                )
        if len(save_layer) > 0:
            return hidden_states_dict

        if not self.fused_add_norm:
            residual = (
                (hidden_states + residual) if residual is not None else hidden_states
            )
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            # Set prenorm=False here since we don't need the residual
            fused_add_norm_fn = (
                rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            )
            hidden_states = fused_add_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
            )
        return hidden_states
class MixerModelWithPosids(nn.Module):
    r"""Mixer model for Mamba but we add positional encodings to the input embeddings.

    Token and position embeddings are concatenated along the last dimension:
    the token embedding has width ``d_model // 2`` and the position embedding
    the remaining ``d_model - d_model // 2``.
    """

    def __init__(
        self,
        d_model: int,
        n_layer: int,
        vocab_size: int,
        max_position_embeddings: int,
        ssm_cfg=None,
        norm_epsilon: float = 1e-5,
        rms_norm: bool = False,
        initializer_cfg=None,
        fused_add_norm=False,
        residual_in_fp32=False,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.residual_in_fp32 = residual_in_fp32

        self.embedding = nn.Embedding(vocab_size, d_model // 2, **factory_kwargs)
        self.position_embedding = nn.Embedding(
            max_position_embeddings, d_model - d_model // 2, **factory_kwargs
        )

        # We change the order of residual and layer norm:
        # Instead of LN -> Attn / MLP -> Add, we do:
        # Add -> LN -> Attn / MLP / Mixer, returning both the residual branch (output of Add) and
        # the main branch (output of MLP / Mixer). The model definition is unchanged.
        # This is for performance reason: we can fuse add + layer_norm.
        self.fused_add_norm = fused_add_norm
        if self.fused_add_norm:
            if layer_norm_fn is None or rms_norm_fn is None:
                raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels")

        self.layers = nn.ModuleList(
            [
                create_block(
                    d_model,
                    ssm_cfg=ssm_cfg,
                    norm_epsilon=norm_epsilon,
                    rms_norm=rms_norm,
                    residual_in_fp32=residual_in_fp32,
                    fused_add_norm=fused_add_norm,
                    layer_idx=i,
                    checkpoint_mixer=checkpoint_mixer,
                    **factory_kwargs,
                )
                for i in range(n_layer)
            ]
        )

        self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)(
            d_model, eps=norm_epsilon, **factory_kwargs
        )

        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Return a dict mapping each layer index to that layer's inference cache."""
        return {
            i: layer.allocate_inference_cache(
                batch_size, max_seqlen, dtype=dtype, **kwargs
            )
            for i, layer in enumerate(self.layers)
        }

    def forward(self, input_ids, position_ids, inference_params=None, save_layer=()):
        """Run the layer stack on concatenated token and position embeddings.

        ``save_layer`` holds 1-based layer numbers to capture (default: an
        immutable empty tuple). When it is non-empty, a dict mapping each
        requested layer number to its output as a float NumPy array is
        returned and the final norm is skipped; otherwise the normalised
        hidden states are returned.
        """
        hidden_states = torch.cat(
            [
                self.embedding(input_ids),
                self.position_embedding(position_ids),
            ],
            -1,
        )
        residual = None
        if len(save_layer) > 0:
            hidden_states_dict = {}
        for i, layer in enumerate(self.layers):
            hidden_states, residual = layer(
                hidden_states, residual, inference_params=inference_params
            )
            if i + 1 in save_layer:
                hidden_states_dict[i + 1] = (
                    hidden_states.detach().cpu().to(torch.float).numpy()
                )
        if len(save_layer) > 0:
            return hidden_states_dict

        if not self.fused_add_norm:
            residual = (
                (hidden_states + residual) if residual is not None else hidden_states
            )
            hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype))
        else:
            # prenorm=False: only the normalised output is needed here
            fused_add_norm_fn = (
                rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn
            )
            hidden_states = fused_add_norm_fn(
                hidden_states,
                self.norm_f.weight,
                self.norm_f.bias,
                eps=self.norm_f.eps,
                residual=residual,
                prenorm=False,
                residual_in_fp32=self.residual_in_fp32,
            )
        return hidden_states
+ self.fused_add_norm = fused_add_norm + if self.fused_add_norm: + if layer_norm_fn is None or rms_norm_fn is None: + raise ImportError("Failed to import Triton LayerNorm / RMSNorm kernels") + + self.layers = nn.ModuleList( + [ + create_block( + d_model, + ssm_cfg=ssm_cfg, + norm_epsilon=norm_epsilon, + rms_norm=rms_norm, + residual_in_fp32=residual_in_fp32, + fused_add_norm=fused_add_norm, + layer_idx=i, + checkpoint_mixer=checkpoint_mixer, + **factory_kwargs, + ) + for i in range(n_layer) + ] + ) + + self.norm_f = (nn.LayerNorm if not rms_norm else RMSNorm)( + d_model, eps=norm_epsilon, **factory_kwargs + ) + + self.apply( + partial( + _init_weights, + n_layer=n_layer, + **(initializer_cfg if initializer_cfg is not None else {}), + ) + ) + + def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs): + return { + i: layer.allocate_inference_cache( + batch_size, max_seqlen, dtype=dtype, **kwargs + ) + for i, layer in enumerate(self.layers) + } + + def forward( + self, + input_ids, + position_ids, + seq_position_ids, + inference_params=None, + save_layer=[], + ): + hidden_states = torch.cat( + [ + self.embedding(input_ids), + self.position_embedding(position_ids), + self.seq_position_embedding(seq_position_ids), + ], + -1, + ) + residual = None + if len(save_layer) > 0: + hidden_states_dict = {} + for i, layer in enumerate(self.layers): + hidden_states, residual = layer( + hidden_states, residual, inference_params=inference_params + ) + if i + 1 in save_layer: + hidden_states_dict[i + 1] = ( + hidden_states.detach().cpu().to(torch.float).numpy() + ) + if len(save_layer) > 0: + return hidden_states_dict + + if not self.fused_add_norm: + residual = ( + (hidden_states + residual) if residual is not None else hidden_states + ) + hidden_states = self.norm_f(residual.to(dtype=self.norm_f.weight.dtype)) + else: + fused_add_norm_fn = ( + rms_norm_fn if isinstance(self.norm_f, RMSNorm) else layer_norm_fn + ) + hidden_states = fused_add_norm_fn( + 
class MambaLMHeadModelSafe(nn.Module, GenerationMixinSafe):
    """Mamba language model: a ``MixerModelSafe`` backbone with a weight-tied LM head.

    The vocabulary size is rounded up to a multiple of
    ``config.pad_vocab_size_multiple`` and the output projection shares its
    weight with the backbone's token embedding.
    """

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        """Build the backbone and LM head from ``config``.

        Args:
            config: Model hyper-parameters.
            initializer_cfg: Optional extra keyword arguments for ``_init_weights``.
            device: Device passed to every created parameter.
            dtype: Dtype passed to every created parameter.
            checkpoint_mixer: Must be falsy; checkpointing is not supported here.

        Raises:
            NotImplementedError: If ``checkpoint_mixer`` is truthy.
        """
        self.config = config
        d_model = config.d_model
        n_layer = config.n_layer
        vocab_size = config.vocab_size
        ssm_cfg = config.ssm_cfg
        rms_norm = config.rms_norm
        residual_in_fp32 = config.residual_in_fp32
        fused_add_norm = config.fused_add_norm
        pad_vocab_size_multiple = config.pad_vocab_size_multiple
        factory_kwargs = {"device": device, "dtype": dtype}
        if checkpoint_mixer:
            raise NotImplementedError(
                "Checkpointing is not yet supported for MambaLMHeadModelSafe"
            )

        super().__init__()
        # Round the vocabulary up to the next multiple of pad_vocab_size_multiple.
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.backbone = MixerModelSafe(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            ssm_cfg=ssm_cfg,
            rms_norm=rms_norm,
            initializer_cfg=initializer_cfg,
            fused_add_norm=fused_add_norm,
            residual_in_fp32=residual_in_fp32,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        """Share the token-embedding weight with the output projection."""
        self.lm_head.weight = self.backbone.embedding.weight

    def clip_grad_norm_(self, max_norm, norm_type=2.0):
        r"""Clip the norm of the gradients for the model.

        Args:
            max_norm (float or int): The maximum norm of the gradients.
                The gradients are modified in-place.
            norm_type (float or int): The type of the used p-norm. Can be 'inf' for infinity norm.

        Returns:
            Total norm of the parameters (viewed as a single vector).
        """
        # The previous implementation called ``clip_grad_value_``, which clamps
        # each gradient element, ignores ``norm_type`` and returns ``None`` --
        # contradicting this method's name and documented return value.
        return torch.nn.utils.clip_grad_norm_(
            self.parameters(), max_norm, norm_type=norm_type
        )

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Delegate inference-cache allocation to the backbone."""
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=(),
        *args,
        **kwargs,
    ):
        """
        "position_ids" is just to be compatible with Transformer generation. We don't use it.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        save_layer: if non-empty, the backbone's ``save_layer`` output is returned
            instead of logits. Extra positional/keyword arguments are ignored.
        """
        return self.protected_forward(
            input_ids, position_ids, inference_params, num_last_tokens, save_layer
        )

    def protected_forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=(),
    ):
        """Run the backbone and project to vocabulary logits.

        Returns the backbone output directly when ``save_layer`` is non-empty,
        otherwise a ``CausalLMOutput(loss=None, logits=...)`` namedtuple.
        ``position_ids`` is accepted but not forwarded to the backbone.
        """
        hidden_states = self.backbone(
            input_ids, inference_params=inference_params, save_layer=save_layer
        )
        if len(save_layer) > 0:
            return hidden_states
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        """Instantiate the model from a saved config and load its weights.

        Weights are loaded with ``strict=False``, so missing or unexpected keys
        do not raise.
        """
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)
        model = cls(config, device=device, dtype=dtype, **kwargs)
        model.load_state_dict(
            load_state_dict_hf(pretrained_model_name, device=device, dtype=dtype),
            strict=False,
        )
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.__dict__, f)
**kwargs): + return self.backbone.allocate_inference_cache( + batch_size, max_seqlen, dtype=dtype, **kwargs + ) + + def forward( + self, + input_ids, + position_ids=None, + inference_params=None, + num_last_tokens=0, + save_layer=[], + *args, + **kwargs, + ): + """ + "position_ids" is just to be compatible with Transformer generation. We don't use it. + num_last_tokens: if > 0, only return the logits for the last n tokens + """ + return self.protected_forward( + input_ids, position_ids, inference_params, num_last_tokens, save_layer + ) + + def protected_forward( + self, + input_ids, + position_ids=None, + inference_params=None, + num_last_tokens=0, + save_layer=[], + ): + hidden_states = self.backbone( + input_ids, + position_ids=position_ids, + inference_params=inference_params, + save_layer=save_layer, + ) + if len(save_layer) > 0: + return hidden_states + hidden_states = hidden_states[:, :, : self.config.d_model // 2] + if num_last_tokens > 0: + hidden_states = hidden_states[:, -num_last_tokens:] + lm_logits = self.lm_head(hidden_states) + CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"]) + return CausalLMOutput(loss=None, logits=lm_logits) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name, + device=None, + dtype=None, + checkpoint_mixer=False, + **kwargs, + ): + config_data = load_config_hf(pretrained_model_name) + config = MambaConfig(**config_data) + model = cls( + config, + device=device, + dtype=dtype, + checkpoint_mixer=checkpoint_mixer, + **kwargs, + ) + state_dict = load_state_dict_hf( + pretrained_model_name, device=device, dtype=dtype + ) + if state_dict.keys() != model.state_dict().keys(): + if checkpoint_mixer: + for key in model.state_dict().keys(): + if "ckpt_layer" in key: + state_dict[key] = state_dict.pop(key.replace("ckpt_layer.", "")) + print( + "Using a model that was pretrained without gradient checkpointing and now want to use it. Changed the keys of the state_dict to match the model's keys." 
class MambaLMHeadModelwith2DPosids(nn.Module, GenerationMixinSafe):
    """Mamba language model whose backbone consumes 2D position ids.

    Wraps ``MixerModelWith2DPosids`` and a weight-tied LM head. Only the first
    ``backbone.d_embeddings`` features of the final hidden state are projected
    to logits.
    """

    def __init__(
        self,
        config: MambaConfig,
        initializer_cfg=None,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
    ) -> None:
        """Build the backbone and LM head from ``config``.

        Args:
            config: Model hyper-parameters (including ``max_position_embeddings``).
            initializer_cfg: Optional extra keyword arguments for ``_init_weights``.
            device: Device passed to every created parameter.
            dtype: Dtype passed to every created parameter.
            checkpoint_mixer: Forwarded to the backbone.
        """
        self.config = config
        d_model = config.d_model
        n_layer = config.n_layer
        vocab_size = config.vocab_size
        max_position_embeddings = config.max_position_embeddings
        ssm_cfg = config.ssm_cfg
        rms_norm = config.rms_norm
        residual_in_fp32 = config.residual_in_fp32
        fused_add_norm = config.fused_add_norm
        pad_vocab_size_multiple = config.pad_vocab_size_multiple
        factory_kwargs = {"device": device, "dtype": dtype}

        super().__init__()
        # Round the vocabulary up to the next multiple of pad_vocab_size_multiple.
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.backbone = MixerModelWith2DPosids(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            max_position_embeddings=max_position_embeddings,
            ssm_cfg=ssm_cfg,
            rms_norm=rms_norm,
            initializer_cfg=initializer_cfg,
            fused_add_norm=fused_add_norm,
            residual_in_fp32=residual_in_fp32,
            checkpoint_mixer=checkpoint_mixer,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        """Share the token-embedding weight with the output projection."""
        self.lm_head.weight = self.backbone.embedding.weight

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        """Delegate inference-cache allocation to the backbone."""
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        seq_position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=(),
        *args,
        **kwargs,
    ):
        """
        position_ids / seq_position_ids: forwarded to the backbone, which embeds
            them alongside the tokens.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        save_layer: if non-empty, the backbone's saved hidden states are returned
            instead of logits. Extra positional/keyword arguments are ignored.
        """
        return self.protected_forward(
            input_ids,
            position_ids,
            seq_position_ids,
            inference_params,
            num_last_tokens,
            save_layer,
        )

    def protected_forward(
        self,
        input_ids,
        position_ids=None,
        seq_position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=(),
    ):
        """Run the backbone and project to vocabulary logits.

        Returns the backbone output directly when ``save_layer`` is non-empty,
        otherwise a ``CausalLMOutput(loss=None, logits=...)`` namedtuple.
        """
        hidden_states = self.backbone(
            input_ids,
            position_ids=position_ids,
            seq_position_ids=seq_position_ids,
            inference_params=inference_params,
            save_layer=save_layer,
        )
        if len(save_layer) > 0:
            return hidden_states
        # Keep only the leading features; the tied LM head weight has that width.
        hidden_states = hidden_states[:, :, : self.backbone.d_embeddings]
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name,
        device=None,
        dtype=None,
        checkpoint_mixer=False,
        **kwargs,
    ):
        """Instantiate the model from a saved config and load its weights.

        If the checkpoint and the model disagree on key names, ``ckpt_layer.``
        is inserted into or stripped from the checkpoint keys depending on
        ``checkpoint_mixer``. The keys must then match exactly.
        """
        config_data = load_config_hf(pretrained_model_name)
        config = MambaConfig(**config_data)
        model = cls(
            config,
            device=device,
            dtype=dtype,
            checkpoint_mixer=checkpoint_mixer,
            **kwargs,
        )
        state_dict = load_state_dict_hf(
            pretrained_model_name, device=device, dtype=dtype
        )
        # Build the model's key view once instead of on every comparison/loop.
        model_keys = model.state_dict().keys()
        if state_dict.keys() != model_keys:
            if checkpoint_mixer:
                for key in model_keys:
                    if "ckpt_layer" in key:
                        state_dict[key] = state_dict.pop(key.replace("ckpt_layer.", ""))
                print(
                    "Using a model that was pretrained without gradient checkpointing and now want to use it. Changed the keys of the state_dict to match the model's keys."
                )
            else:
                for key in list(state_dict.keys()):
                    if "ckpt_layer" in key:
                        state_dict[key.replace("ckpt_layer.", "")] = state_dict.pop(key)
                print(
                    "Using a model that was pretrained with gradient checkpointing but now do not want to use it. Changed the keys of the state_dict to match the model's keys."
                )
            assert (
                state_dict.keys() == model_keys
            ), "The keys of the state_dict do not match the model's keys."
        model.load_state_dict(state_dict)
        return model

    def save_pretrained(self, save_directory):
        """
        Minimal implementation of save_pretrained for MambaLMHeadModel.
        Save the model and its configuration file to a directory.
        """
        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.__dict__, f)
class xLSTMLMHeadModel(nn.Module, GenerationMixinSafe):
    """Language-model wrapper around an ``xLSTMLMModel`` backbone."""

    def __init__(self, config: xLSTMConfig) -> None:
        """Create the backbone from ``config.config_dataclass`` and reset its parameters."""
        super().__init__()

        self.config = config
        self.backbone = xLSTMLMModel(self.config.config_dataclass)
        self.backbone.reset_parameters()

        self.setup()

    def setup(self):
        """Post-construction hook; currently does nothing.

        The previous body derived a local device index from ``LOCAL_RANK`` /
        ``SLURM_LOCALID`` but never used it (the device placement that consumed
        it was commented out), so the dead computation was removed.
        """

    def forward(
        self,
        input_ids,
        state=None,
        position_ids=None,
        seq_position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=(),
        **kwargs,
    ):
        """Run the backbone over ``input_ids``.

        ``inference_params``, ``num_last_tokens``, ``save_layer`` and extra
        keyword arguments are accepted but ignored.

        Returns:
            A ``CausalLMOutput`` namedtuple with ``loss=None`` and ``logits``.
            It also carries ``state`` when the config's
            ``mlstm_block.mlstm.return_last_state`` is truthy.
        """
        backbone_kwargs = {
            "position_ids": position_ids,
            "seq_position_ids": seq_position_ids,
            "state": state,
        }
        if self.config.config_dataclass.mlstm_block.mlstm.return_last_state:
            lm_logits, state = self.backbone(input_ids, **backbone_kwargs)
            CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits", "state"])
            return CausalLMOutput(loss=None, logits=lm_logits, state=state)

        lm_logits = self.backbone(input_ids, **backbone_kwargs)
        CausalLMOutput = namedtuple("CausalLMOutput", ["loss", "logits"])
        return CausalLMOutput(loss=None, logits=lm_logits)

    def step(
        self,
        input_ids,
        state=None,
        position_ids=None,
        seq_position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        save_layer=(),
        **kwargs,
    ):
        """Run one backbone ``step`` and return ``(logits, state)``.

        ``inference_params``, ``num_last_tokens``, ``save_layer`` and extra
        keyword arguments are accepted but ignored.
        """
        lm_logits, state = self.backbone.step(
            input_ids, state=state, position_ids=position_ids, seq_position_ids=seq_position_ids
        )

        return lm_logits, state

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name,
        device=None,
        dtype=None,
        mlstm_backend=None,
        mlstm_chunksize=None,
        checkpoint_blocks=None,
        rope_base_frequency=None,
        mlstm_return_last_state=None,
    ):
        """Load a checkpoint, optionally overriding parts of its saved config.

        Every override that is not ``None`` replaces the corresponding entry of
        the checkpoint config before the model is built. ``slstm_block`` and
        ``slstm_at`` entries are dropped. ``device`` and ``dtype`` are only
        passed to the state-dict loader.

        Raises:
            AssertionError: If ``mlstm_backend`` differs from the checkpoint's
                and is not a supported backend, or if the checkpoint keys do
                not match the model's keys.
        """
        # Load the checkpoint config
        config_dict = load_config_hf(pretrained_model_name)
        mlstm_cfg = config_dict["mlstm_block"]["mlstm"]

        # update rope base frequency
        if rope_base_frequency is not None and config_dict.get("rope_base_frequency", None) != rope_base_frequency:
            config_dict["rope_base_frequency"] = rope_base_frequency

        # update mlstm backend
        if mlstm_backend is not None and mlstm_cfg.get("backend", None) != mlstm_backend:
            assert mlstm_backend in ["chunkwise", "chunkwise_variable", "parallel"], "invalid mlstm backend."
            mlstm_cfg["backend"] = mlstm_backend

        # update mlstm chunksize
        if mlstm_chunksize is not None and mlstm_cfg.get("chunk_size", None) != mlstm_chunksize:
            mlstm_cfg["chunk_size"] = mlstm_chunksize

        # update activation checkpointing
        if checkpoint_blocks is not None:
            config_dict["checkpoint_blocks"] = checkpoint_blocks

        if mlstm_return_last_state is not None:
            mlstm_cfg["return_last_state"] = mlstm_return_last_state

        # sLSTM settings are discarded before building the config.
        config_dict.pop("slstm_block", None)
        config_dict.pop("slstm_at", None)

        config = xLSTMConfig().init_from_dict(config_dict)

        model = cls(config)

        state_dict = load_state_dict_hf(
            pretrained_model_name, device=device, dtype=dtype
        )
        assert (
            state_dict.keys() == model.state_dict().keys()
        ), "The keys of the state_dict do not match the model's keys."

        model.load_state_dict(state_dict)

        return model

    def save_pretrained(self, save_directory):
        """
        Save the model and its configuration file to a directory.
        """

        # Ensure save_directory exists
        os.makedirs(save_directory, exist_ok=True)

        # Save the model's state_dict
        model_path = os.path.join(save_directory, "pytorch_model.bin")
        torch.save(self.state_dict(), model_path)

        # Save the configuration of the model
        config_path = os.path.join(save_directory, "config.json")
        with open(config_path, "w") as f:
            json.dump(self.config.to_dict(), f)
def run(config):
    """
    Run training loop.

    Builds the datasets, model, optimizer and ``ProtTrainer`` described by
    ``config``, then trains. Training restarts from the checkpoint chosen by
    the early-stopping callback whenever that callback requests a restart.

    Args:
        config: configuration object supporting both attribute access
            (``config.model_type``) and key access (``config["model"]``).

    Returns:
        The ``ProtTrainer`` instance after training has finished.
    """

    if config.model_type == 'llama':
        pe_kwargs = {
            'max_position_embeddings' : config["model"]["max_position_embeddings"],
            'add_position_ids' : '1d',
        }
    elif config.model_type == 'mamba':
        from protxlstm.models.mamba import MambaConfig, MambaLMHeadModelSafe, MambaLMHeadModelwithPosids, MambaLMHeadModelwith2DPosids
        pe_kwargs = {
            'max_position_embeddings' : config["model"]["max_position_embeddings"],
            'max_seq_position_embeddings' : config["model"]["max_seq_position_embeddings"],
            'add_position_ids' : config["model"]["add_position_ids"]
        }
    else:
        position_embeddings = config["model"]["position_embeddings"]
        assert position_embeddings in ["none", "abs_1d", "abs_2d", "rot_1d", "rot_2d"]
        # "abs_1d" / "rot_2d" etc. reduce to the dimensionality suffix ("1d" / "2d").
        if position_embeddings != "none":
            position_embeddings = position_embeddings.split("_")[-1]
        pe_kwargs = {
            'max_position_embeddings' : config["model"]["max_position_embeddings"],
            'max_seq_position_embeddings' : config["model"]["max_seq_position_embeddings"],
            'add_position_ids' : position_embeddings
        }

    # Setup WandB
    wandb_run_name = setup_wandb(config)

    # Load datasets
    dataset_params = {
        "msa_memmap_path": config["msa_memmap_path"],
        "msa_memmap_meta_path": config["msa_memmap_meta_path"],
        "sample": config["sample_sequences"],
        "max_msa_len": config["max_msa_len"],
        "reverse": False,
        "seed": config["seed_sequence_sampling"],
        "troubleshoot": False,
        "fim_strategy": config["fim_strategy"],
        "always_mask": config["always_mask"],
        **pe_kwargs,
    }
    train_dataset = ProteinMemmapDataset(subset_path=config["train_set"], **dataset_params)
    valid_dataset = ProteinMemmapDataset(subset_path=config["valid_set"], **dataset_params)
    train_eval_dataset = ProteinMemmapDataset(subset_path=config["train_eval_set"], **dataset_params)

    print(f'Train set size: {len(train_dataset)} Train eval set size: {len(train_eval_dataset)} Valid set size: {len(valid_dataset)}')

    assert (
        len(AA_TO_ID) == config["model"]["vocab_size"]
    ), f"Vocab size in the config file does not match the one in the code. It should be {len(AA_TO_ID)}"

    # Create data collator for batched training
    data_collator = ProteinDataCollator(max_sequence_length=config["max_msa_len"])

    # Check datatypes
    if config["dtype"] == "float32":
        dtype = torch.float32
    elif config["dtype"] == "bfloat16":
        dtype = torch.bfloat16
    else:
        raise ValueError("dtype must be either float32 or bfloat16")

    # Initialize model
    if config.model_type == 'xlstm':

        # Load model for finetuning
        if config.finetune_model_path:
            # These fields are updated in the config loaded from the checkpoint
            config_update_kwargs = {
                "mlstm_backend": config["model"]["mlstm_block"]["mlstm"]["backend"],
                "mlstm_chunksize": config["model"]["mlstm_block"]["mlstm"]["chunk_size"],
                "checkpoint_blocks": config["model"]["checkpoint_blocks"],
                "rope_base_frequency": config["model"]["rope_base_frequency"]
            }
            model = load_model(
                config.finetune_model_path,
                model_class=xLSTMLMHeadModel,
                device="cuda",
                dtype=dtype,
                **config_update_kwargs
            )
        else:
            # Create new model
            xlstm_config = xLSTMConfig().init_from_dict(config["model"])
            model = xLSTMLMHeadModel(xlstm_config)

    elif config.model_type == 'mamba':

        _mamba_model = {
            "none": MambaLMHeadModelSafe,
            "1d": MambaLMHeadModelwithPosids,
            "2d": MambaLMHeadModelwith2DPosids,
        }
        Mamba = _mamba_model[config['model']["add_position_ids"]]

        # Load model for finetuning
        if config.finetune_model_path:
            # NOTE(review): this reads config["checkpoint_mixer"] while the
            # from-scratch branch below reads config["model"]["checkpoint_mixer"];
            # confirm which location is intended.
            model = load_model(
                config.finetune_model_path,
                model_class=Mamba,
                device="cuda",
                dtype=dtype,
                checkpoint_mixer=config["checkpoint_mixer"],
            )
        else:
            # Create new model
            mamba_config = MambaConfig(d_model=config['model']["d_model"],
                                       n_layer=config['model']["n_layer"],
                                       vocab_size=config['model']["vocab_size"],
                                       residual_in_fp32=config['model']["residual_in_fp32"])
            model = Mamba(mamba_config, dtype=dtype, checkpoint_mixer=config['model']["checkpoint_mixer"])

    elif config.model_type == 'llama':

        llama_config = TransformerConfig(
            d_model=config["model"]["d_model"],
            n_layer=config["model"]["n_layer"],
            n_heads=config["model"]["n_heads"],
            n_kv_heads=config["model"]["n_kv_heads"],
            bidirectional=config["model"]["bidirectional"],
            hidden_dim=config["model"]["hidden_dim"],
            multiple_of=config["model"]["multiple_of"],
            norm_eps=config["model"]["norm_eps"],
            max_length=config["model"]["max_length"],
            vocab_size=config["model"]["vocab_size"],
            dropout=config["model"]["dropout"],
            max_position_embeddings=config["model"]["max_position_embeddings"],
            rope_base_frequency=config["model"]["rope_base_frequency"],

        )

        model = TransformerLMHeadModel(llama_config)

    else:
        raise ValueError(f"Unsupported model_type: {config.model_type}. Expected 'xlstm', 'mamba', or 'llama'.")


    # TODO: Improve what we want print
    if is_zero_rank():
        print_number_of_parameters(model)
    print_zero_rank(f"dtype: {config['dtype']}")
    print_zero_rank(f"Epochs: {config['num_epochs']}")
    print_zero_rank(f"Batch size per GPU: {config['batch_size']}")
    print_zero_rank(f"Gradient accumulation steps: {config['gradient_accumulation_steps']}")
    eff_batch_size = config["batch_size"] * config["gradient_accumulation_steps"]
    nr_gpus = torch.cuda.device_count()
    print_zero_rank(f"GPUS: {nr_gpus}")
    eff_batch_size *= nr_gpus
    print_zero_rank(f"Effective batch size: {eff_batch_size}")
    print_zero_rank(
        f"Steps per training epoch: {len(train_dataset) // config['batch_size']}, eff. steps: {len(train_dataset) // eff_batch_size}"
    )
    print_zero_rank(f"Steps per evaluation epoch: {len(valid_dataset) // config['batch_size']}")
    print_zero_rank(f"Max MSA length: {config['max_msa_len']}")
    ev_epochs = round(
        config["eval_steps"] * config["batch_size"] / len(train_dataset), 3
    )
    print_zero_rank(
        f"Evaluation every {config['eval_steps']} steps, i.e. {ev_epochs} epochs. Effectively every {config['eval_steps']*config['gradient_accumulation_steps']} steps, i.e. {ev_epochs*config['gradient_accumulation_steps']} epochs."
    )
    if config.model_type == 'xlstm' and config["model"]["checkpoint_blocks"]:
        print_zero_rank("Using gradient checkpointing")
    if config["compute_only_fim_loss"]:
        print_zero_rank("Computing only FIM loss for training")

    # Training callbacks
    es_callback = EarlyStoppingCallback(
        train_path=config["output_dir"] + '/' + wandb_run_name, config=config
    )
    callbacks = [es_callback]

    # Optimizer and Schedulers
    optimizer, scheduler = set_optimizer_and_scheduler(
        config,
        len(train_dataset),
        model.parameters()
    )

    # Find checkpoint if available
    last_checkpoint = None
    if config.finetune_model_path is None:
        path = os.path.join(config["output_dir"], wandb_run_name)
        if os.path.exists(path):
            last_checkpoint = get_last_checkpoint(path)
            if last_checkpoint is None:
                print_zero_rank("No checkpoint found, starting training from scratch.")
            else:
                print_zero_rank(f"Resuming training from the last checkpoint: {last_checkpoint}")

    # Create trainer
    trainer = ProtTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset={"valid": valid_dataset, "train": train_eval_dataset},
        optimizers=(optimizer, scheduler),
        args=TrainingArguments(
            run_name=wandb_run_name,
            local_rank=int(os.getenv('LOCAL_RANK', '0')),
            learning_rate=config["learning_rate"],
            num_train_epochs=config["num_epochs"],
            per_device_train_batch_size=config["batch_size"],
            per_device_eval_batch_size=config["batch_size"],
            gradient_accumulation_steps=config["gradient_accumulation_steps"],
            eval_accumulation_steps=config["eval_accumulation_steps"],
            eval_strategy="steps",
            max_grad_norm=config["max_grad_norm"],
            bf16=config["dtype"] == "bfloat16",
            dataloader_num_workers=32,
            logging_steps=config["logging_steps"],
            eval_steps=config["eval_steps"],
            save_steps=config["save_steps"],
            output_dir=config["output_dir"] + '/' + wandb_run_name,
            logging_dir=config["output_dir"] + '/' + wandb_run_name,
            # TrainingArguments treats report_to=None as "all" installed
            # integrations; the string "none" is what disables reporting.
            report_to="wandb" if is_zero_rank() else "none",
            log_on_each_node=False,
            overwrite_output_dir=False,
            push_to_hub=False,
            label_names=["labels"],
        ),
        compute_only_fim_loss=config["compute_only_fim_loss"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=callbacks,
    )

    # Train model
    while True:
        if last_checkpoint is None and trainer.state.global_step == 0:
            eval_results = trainer.evaluate()
            print_zero_rank(
                f">>> Initial validation perplexity: {eval_results['eval_valid_perplexity/batch']:.2f}"
            )
        else:
            print_zero_rank(f"Resuming training from the last checkpoint: {last_checkpoint}")
        # Train
        trainer.train(resume_from_checkpoint=last_checkpoint)

        # Break training when the number of epochs is reached
        if (
            not es_callback.should_restart
            or trainer.state.epoch >= config["num_epochs"]
        ):
            eval_results = trainer.evaluate()
            print_zero_rank(
                f">>> Final Perplexity: {eval_results['eval_valid_perplexity/batch']:.2f}"
            )
            break
        # If the training was interrupted because of a loss spike, restart from the last checkpoint
        last_checkpoint = es_callback.checkpoint_path

    return trainer
+ ) + parser.add_argument( + "--model_config_path", + type=str, + default=default_model_config, + help=f"Path to the model configuration file (default: {default_model_config})" + ) + parser.add_argument( + "--train_config_path", + type=str, + default=default_train_config, + help=f"Path to the training and dataset configuration file (default: {default_train_config})" + ) + parser.add_argument( + "overrides", + nargs=argparse.REMAINDER, + help="Override configuration values using key=value format.", + ) + + args = parser.parse_args() + + # Check if the default config files exist, or raise an error + if not os.path.exists(args.model_config_path): + raise FileNotFoundError(f"Model config file not found: {args.model_config_path}") + if not os.path.exists(args.train_config_path): + raise FileNotFoundError(f"Train config file not found: {args.train_config_path}") + + # Load the model and training configurations + model_config = OmegaConf.load(args.model_config_path) + train_config = OmegaConf.load(args.train_config_path) + + # Merge the model and training configurations + config = OmegaConf.merge(model_config, train_config) + + # Parse overrides + if args.overrides: + overrides = parse_override_args(args.overrides) + config.merge_with(OmegaConf.create(overrides)) + + # Run the training/finetuning process + run(config) \ No newline at end of file diff --git a/protxlstm/trainer.py b/protxlstm/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..5f9759cb545a844d041a2f775bb8c928b3e3e0e4 --- /dev/null +++ b/protxlstm/trainer.py @@ -0,0 +1,123 @@ +# Original code from ProtMamba under Apache License 2.0. 
+# +# Modifications made by Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen +# - MambaTrainer renamed to ProtTrainer + +import os +import re + +import torch +from transformers import Trainer, TrainerCallback + +from protxlstm.utils import AA_TO_ID, find_fim_indices + +class ProtTrainer(Trainer): + """ + Base HuggingFace Trainer used for training. + + from https://github.com/havenhq/mamba-chat/blob/main/trainer/mamba_trainer.py""" + def __init__(self, compute_only_fim_loss, **kwargs,): + super().__init__(**kwargs) + self.compute_only_fim_loss = compute_only_fim_loss + + + def compute_loss(self, model, inputs, return_outputs=False): + input_ids = inputs.pop("input_ids") + labels = inputs.pop("labels") + if "seq_position_ids" in inputs and "position_ids" in inputs: + position_ids = inputs.pop("position_ids") + seq_position_ids = inputs.pop("seq_position_ids") + output = model(input_ids, position_ids=position_ids, seq_position_ids=seq_position_ids) + elif "position_ids" in inputs: + position_ids = inputs.pop("position_ids") + output = model(input_ids, position_ids=position_ids) + else: + output = model(input_ids) + lm_logits = output.logits + + labels = labels.to(lm_logits.device) + shift_logits = lm_logits[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + + loss_fct = torch.nn.CrossEntropyLoss() + if self.compute_only_fim_loss: + # start and end tokens + is_cls_tokens = (labels == AA_TO_ID[""]) + is_eos_tokens = (labels == AA_TO_ID[""]) + bool_fim = find_fim_indices(is_cls_tokens, is_eos_tokens) + # include also the cls token + bool_fim = bool_fim | is_cls_tokens + inds = torch.where(bool_fim) + lm_loss = loss_fct(shift_logits[inds[0], inds[1], :], labels[bool_fim]) + else: + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)) + + return (lm_loss, output) if return_outputs else lm_loss + + def save_model(self, output_dir, _internal_call): + if int(os.getenv('LOCAL_RANK', '0')) == 0: + 
PREFIX_CHECKPOINT_DIR = "checkpoint"
_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$")


def get_last_checkpoint(folder, max_steps=None):
    """Return the newest ``checkpoint-<step>`` directory in ``folder``.

    Args:
        folder: Directory whose direct children are scanned.
        max_steps: Exclusive upper bound on the checkpoint step. ``None``
            (default) means no bound.

    Returns:
        ``os.path.join(folder, name)`` for the checkpoint directory with the
        highest step strictly below ``max_steps``, or ``None`` when no
        checkpoint directory satisfies the bound.
    """
    limit = float("inf") if max_steps is None else max_steps

    best_step, best_name = -1, None
    for name in os.listdir(folder):
        match = _re_checkpoint.search(name)
        # Only real directories named exactly "checkpoint-<digits>" count.
        if match is None or not os.path.isdir(os.path.join(folder, name)):
            continue
        step = int(match.group(1))
        if best_step < step < limit:
            best_step, best_name = step, name

    # Previously, when every checkpoint was at/after ``max_steps`` an arbitrary
    # (too recent) checkpoint was returned; report "nothing eligible" instead.
    if best_name is None:
        return None
    return os.path.join(folder, best_name)
control.should_training_stop = True + self.checkpoint_path = checkpoint_path + self.should_restart = True + else: + self.step_counter = 0 + self.best_loss = min(self.best_loss, metrics[self.metric_name]) + self.should_restart = False + + def on_train_begin(self, args, state, control, **kwargs): + self.step_counter = 0 + self.best_loss = None + self.should_restart = False diff --git a/protxlstm/utils.py b/protxlstm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9603d013daa8f16d995b73f6939ec97d38f4f11d --- /dev/null +++ b/protxlstm/utils.py @@ -0,0 +1,482 @@ +# Some of the objects in this file come from ProtMamba and mamba both under Apache License 2.0. + +import json +import os + +import numpy as np +import rich +import torch +from Bio import SeqIO +from omegaconf import DictConfig, OmegaConf +from torch.optim import AdamW +import wandb +from transformers import ( + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, +) +from transformers.utils import WEIGHTS_NAME, CONFIG_NAME +from transformers.utils.hub import cached_file + +__all__ = ['AA_TO_ID', 'MASK_TO_ID', 'ID_TO_AA', 'load_model', 'encode_sequence', 'decode_sequence', 'clean_sequence', 'tokenizer', + 'reorder_masked_sequence', 'load_sequences_from_msa_file', 'prepare_dataset_for_fim_generation', + 'prepare_tokens', 'prepare_target', 'print_number_of_parameters', 'find_fim_indices', + 'compute_metrics', 'compute_metrics_with_std', 'print_config', 'print_zero_rank', 'is_zero_rank'] + +# Constants +AA_TO_ID = {'': 0, + '': 1, + '': 2, + '': 3, + 'L': 4, + 'A': 5, + 'G': 6, + 'V': 7, + 'S': 8, + 'E': 9, + 'R': 10, + 'T': 11, + 'I': 12, + 'D': 13, + 'P': 14, + 'K': 15, + 'Q': 16, + 'N': 17, + 'F': 18, + 'Y': 19, + 'M': 20, + 'H': 21, + 'W': 22, + 'C': 23, + 'X': 24, + 'B': 25, + 'U': 26, + 'Z': 27, + 'O': 28, + '.': 29, + '-': 30, + '': 31, + '': 32} + +MASK_TO_ID = {"": 33, + "": 34, + "": 35, + "": 36, + "": 
# Logging & prints
def setup_wandb(config):
    """Configure Weights & Biases and build the run name for this training run.

    Exports ``WANDB_PROJECT`` / ``WANDB_ENTITY`` / ``WANDB_MODE`` to the
    environment, derives a run name from the model type and its
    hyper-parameters, and (on the zero-rank process only) calls ``wandb.init``
    and uploads the resolved configuration.

    Args:
        config: Mapping-like configuration. Reads ``wandb_project``,
            ``wandb_entity``, ``wandb_mode``, ``model_type``, ``model``,
            ``max_msa_len``, ``learning_rate``, ``name_prefix``,
            ``name_suffix`` and, for ``llama``, ``scheduler``.

    Returns:
        The run name string.

    Raises:
        ValueError: If ``model_type`` is not ``xlstm``, ``mamba`` or ``llama``.
    """
    # WandB setup
    os.environ["WANDB_PROJECT"] = config["wandb_project"]
    os.environ["WANDB_ENTITY"] = config["wandb_entity"]
    os.environ["WANDB_MODE"] = config["wandb_mode"]

    model_type = config['model_type']
    model_cfg = config['model']
    common_tail = f"s{config['max_msa_len']}_lr{config['learning_rate']}"

    if model_type == 'xlstm':
        # The old chained conditional ended in ``pe == 'rot_2d'`` and therefore
        # put ``True``/``False`` in the name; unknown values now pass through.
        pe_labels = {'none': 'None', 'abs_1d': 'AbsPE', 'abs_2d': 'AbsPE2',
                     'rot_1d': 'RoPE', 'rot_2d': 'RoPE2'}
        pe = model_cfg['add_position_ids']
        pe = pe_labels.get(pe, pe)
        wandb_run_name = (
            f"{model_type}_l{model_cfg['num_blocks']}_d{model_cfg['embedding_dim']}_{pe}_{common_tail}"
        )
    elif model_type == 'mamba':
        pe_labels = {'none': 'None', '1d': 'AbsPE', '2d': 'AbsPE2'}
        pe = model_cfg['add_position_ids']
        pe = pe_labels.get(pe, pe)
        wandb_run_name = (
            f"{model_type}_l{model_cfg['n_layer']}_d{model_cfg['d_model']}_{pe}_{common_tail}"
        )
    elif model_type == 'llama':
        pe = 'RoPE'
        # The name used to interpolate an unrelated function object here.
        wandb_run_name = (
            f"{model_type}_l{model_cfg['n_layer']}_d{model_cfg['d_model']}"
            f"_dh{model_cfg['hidden_dim']}_{pe}_{common_tail}_sched-{config['scheduler']}"
        )
    else:
        raise ValueError(f"Unknown model_type: {model_type!r}")

    if config['name_prefix']:
        wandb_run_name = str(config['name_prefix']) + '_' + wandb_run_name
    if config['name_suffix']:
        wandb_run_name = wandb_run_name + '_' + str(config['name_suffix'])

    # Only one process per node talks to wandb.
    if is_zero_rank():
        wandb.init(
            project=config["wandb_project"],
            entity=config["wandb_entity"],
            mode=config["wandb_mode"],
            name=wandb_run_name)
        config_dict = OmegaConf.to_container(config, resolve=True)
        wandb.config.update(config_dict)
    return wandb_run_name


def is_zero_rank():
    """Return True when the ``LOCAL_RANK`` env variable is unset or equals 0."""
    return int(os.getenv('LOCAL_RANK', '0')) == 0


def print_zero_rank(var):
    """Print ``var`` on the zero-rank process only."""
    if is_zero_rank():
        print(var)
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + formatted_num_params = f"{num_params:_}" + print("Number of trainable parameters: ", formatted_num_params) + +# Sequence tools +def encode_sequence(sequence): + """Tokenize a sequence of amino acids and add a cls token at the beginning.""" + tokenized_sequence = [AA_TO_ID[aa] if aa in AA_TO_ID else AA_TO_ID[''] for aa in sequence] + return [AA_TO_ID['']] + tokenized_sequence + +def decode_sequence(sequence): + """Decode a sequence of tokens.""" + return "".join([ID_TO_AA[token] if token in ID_TO_AA else "" for token in sequence]) + +def clean_sequence(sequence): + """Remove gaps and convert all residues to upper case.""" + return sequence.replace("-", "").upper() + +def tokenizer(sequence_list, concatenate=True): + """Tokenize a collection of sequences. If the sequences are aligned, the gaps will be removed + and the insertions (lower case) will be promoted to upper case.""" + # clean and encode all sequences + sequence_list = [encode_sequence(clean_sequence(sequence)) for sequence in sequence_list] + if concatenate: + # concatenate all sequences + sequences = np.concatenate(sequence_list) + # convert to tensor and add batch dimension + return torch.asarray(sequences, dtype=torch.int8)[None,:] + else: + return [torch.asarray(sequence, dtype=torch.int8) for sequence in sequence_list] + +def reorder_masked_sequence(mask_seq, return_ids=False): + """ + Reorder a masked sequence to fill the masked positions with the tokens + that should be there but are positioned after the token. 
+ """ + mask_seq = mask_seq.split("")[0] + try: + # Split the sequence and masks + seq, masks = mask_seq.split("") + except: + return mask_seq + full_seq = "" + ids_mask = [] + # Iterate over each mask tag + for mm in ["", "", "", "", "",""]: + try: + # Split the sequence in before and after the mask tag + seq1, seq2 = seq.split(mm) + if mm=="": + # If the mask is the first one, add the sequence before the mask and update the masks + masks = masks.split("")[1] + full_seq += seq1 + else: + # If the mask is not the first one, insert the mask between the two sequence parts + masks1, masks2 = masks.split(mm) + ids_mask += [(len(full_seq), len(full_seq)+len(masks1))] + full_seq += masks1 + seq1 + # Update the masks + masks = masks2 + # Update the sequence with the part after the mask + seq = seq2 + except: + # If the mask is not found, add the remaining sequence + ids_mask += [(len(full_seq), len(full_seq)+len(masks))] + full_seq += masks + seq + break + if return_ids: + return full_seq, ids_mask + return full_seq + +def load_sequences_from_msa_file(file_path): + """Load a collection of sequences from an a3m file.""" + with open(file_path, "r") as f: + sequences = [str(record.seq) for record in SeqIO.parse(f, "fasta")] + return sequences + +def prepare_dataset_for_fim_generation(tokens, pos_ids): + """ + Function to transform the tokenized training dataset into a format that can be used for FIM generation. + Splits the input tokens and pos_ids into the FIM part (of the last sequence) and the context part (all + the previous sequences and the masked part of the last sequence). + Also returns a dictionary with the positions of the mask tokens in the FIM part. + """ + def find_mask_positions(tokens_fim): + """ + Function to find the positions of the mask tokens in the FIM part of the last sequence. 
+ """ + bool_mask = None + inds_masks = [] + for ind in MASK_TO_ID.values(): + tmp_bool = tokens_fim[0].cpu().numpy() == ind + bool_mask = tmp_bool if bool_mask is None else bool_mask | tmp_bool + inds_masks += [ind] + return bool_mask, inds_masks + # find where the FIM part of the last sequence starts + start_last_fim = np.where(tokens[0].cpu().numpy() == AA_TO_ID[""])[0][-1] + start_next_seqs = np.where(tokens[0,start_last_fim+1:].cpu().numpy() == AA_TO_ID[""])[0] + end_last_fim = start_last_fim+ 1 +start_next_seqs[0] if len(start_next_seqs) > 0 else tokens.shape[1] + # split tokens and pos_ids into FIM part and context part + tokens_to_fim = tokens[:,:start_last_fim+1] + pos_ids_to_fim = pos_ids[:,:start_last_fim+1] + tokens_fim = tokens[:,start_last_fim+1:end_last_fim] + pos_ids_fim = pos_ids[:,start_last_fim+1:end_last_fim] + # find positions of mask tokens + bool_mask, inds_masks = find_mask_positions(tokens_fim) + masked_positions = pos_ids_fim[0,bool_mask] + mask_dict = {ind: int(pos) for ind, pos in zip(inds_masks, masked_positions)} + return tokens_to_fim, pos_ids_to_fim, tokens_fim, pos_ids_fim, mask_dict + +# Metrics +def find_fim_indices(is_cls_tokens, is_eos_tokens): + """Function to find the indices of the FIM tokens in the sequences. 
+ """ + # add a cls token at the beginning + is_cls_tokens = torch.cat([torch.ones_like(is_cls_tokens[:, :1]), is_cls_tokens], dim=1) + is_eos_tokens = torch.cat([torch.zeros_like(is_eos_tokens[:, :1]), is_eos_tokens], dim=1) + # both eos and cls tokens + bol = is_cls_tokens | is_eos_tokens + tmp = torch.zeros_like(is_cls_tokens, dtype=torch.int) + tmp[torch.nonzero(is_cls_tokens, as_tuple=True)] = 1 + tmp[torch.nonzero(is_eos_tokens, as_tuple=True)] = -1 + bol1 = torch.clone(bol) + for batch_ind in range(tmp.size(0)): + tmp1 = tmp[batch_ind,bol[batch_ind]] + # find all positions where a 1 if preceeded by a -1 + tmp1 = tmp1[:-1]*tmp1[1:] + # add the first element to make the sequence start with a 1 + tmp1 = torch.cat([torch.ones_like(tmp1[:1]).to(tmp1.device), tmp1]) + new_bol = tmp1<0 + # bool array True only in the positions where a 1 is preceeded by a -1 + bol1[batch_ind,bol[batch_ind]] = False if new_bol.size(0) == 0 else new_bol + cumulative_sum = torch.cumsum(bol1, dim=1) + # Use modulo operation to get the desired tensor + bol2 = cumulative_sum % 2 == 1 + bol2[is_eos_tokens]= False + return bol2[:,1:] + +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = torch.tensor(predictions).permute(0, 2, 1) + labels = torch.tensor(labels) + # shift labels to align them with predictions and remove last prediction to match the length + predictions = predictions[:, :, :-1].contiguous() + labels = labels[:, 1:].contiguous() + # compute unreduced elementwise loss + unreduced_loss = torch.nn.functional.cross_entropy(predictions, labels, reduction="none") + # compute reconstruction accuracy + reconstruction = (predictions.argmax(1) == labels) + + # start and end tokens + is_cls_tokens = (labels == AA_TO_ID[""]) + is_eos_tokens = (labels == AA_TO_ID[""]) + # fill in the middle tokens + if False: + fim_tokens = torch.zeros(is_cls_tokens.size(0), is_cls_tokens.size(1), dtype=torch.bool) + in_mask_vector = torch.zeros(is_cls_tokens.size(0), 
dtype=torch.bool) + for j in range(is_cls_tokens.size(1)): + in_mask_vector = in_mask_vector & ~is_cls_tokens[:, j] + fim_tokens[:, j] = in_mask_vector + in_mask_vector = in_mask_vector | is_eos_tokens[:, j] + fim_tokens = find_fim_indices(is_cls_tokens, is_eos_tokens) + + number_sequences = torch.cumsum(torch.cat([torch.zeros(is_cls_tokens.size(0),1, dtype=torch.int32), is_cls_tokens[:,:-1]],1), -1) + # fist, second and last sequence tokens + first_sequence_tokens = ((~fim_tokens & (labels < 33)) | fim_tokens) & (number_sequences == 0) + second_sequence_tokens = ((~fim_tokens & (labels < 33)) | fim_tokens) & (number_sequences == 1) + last_sequence_tokens = ((~fim_tokens & (labels < 33)) | fim_tokens) & (number_sequences == (number_sequences.max(1).values[:, None] - 1)) + # end of mask tokens + end_of_masks = (fim_tokens & (labels > 33)) | is_cls_tokens | is_eos_tokens + + return { + "loss/all": torch.mean(unreduced_loss).item(), + "loss/end_span": torch.mean(unreduced_loss[end_of_masks]).item(), + "perplexity/seq": torch.mean(torch.exp(torch.mean(unreduced_loss, dim=1))).item(), + "perplexity/end_span": torch.exp(torch.mean(unreduced_loss[end_of_masks])).item(), + "perplexity/batch": torch.exp(torch.mean(unreduced_loss)).item(), + "perplexity/first_seq": torch.exp(torch.mean(unreduced_loss[first_sequence_tokens])).item(), + "perplexity/second_seq": torch.exp(torch.mean(unreduced_loss[second_sequence_tokens])).item(), + "perplexity/last_seq": torch.exp(torch.mean(unreduced_loss[last_sequence_tokens])).item(), + "perplexity/fim": torch.exp(torch.mean(unreduced_loss[fim_tokens])).item(), + "reconstruction/all": torch.mean(reconstruction.float()).item(), + "reconstruction/end_span": torch.mean(reconstruction[end_of_masks].float()).item(), + "reconstruction/first_seq": torch.mean(reconstruction[first_sequence_tokens].float()).item(), + "reconstruction/second_seq": torch.mean(reconstruction[second_sequence_tokens].float()).item(), + "reconstruction/last_seq": 
torch.mean(reconstruction[last_sequence_tokens].float()).item(), + "reconstruction/fim": torch.mean(reconstruction[fim_tokens].float()).item(), + } + +def compute_metrics_with_std(eval_pred): + predictions, labels = eval_pred + predictions = torch.tensor(predictions).permute(0, 2, 1) + labels = torch.tensor(labels) + # shift labels to align them with predictions and remove last prediction to match the length + predictions = predictions[:, :, :-1].contiguous() + labels = labels[:, 1:].contiguous() + # compute unreduced elementwise loss + unreduced_loss = torch.nn.functional.cross_entropy(predictions, labels, reduction="none") + # compute reconstruction accuracy + reconstruction = (predictions.argmax(1) == labels) + + # start and end tokens + is_cls_tokens = (labels == AA_TO_ID[""]) + is_eos_tokens = (labels == AA_TO_ID[""]) + # fill in the middle tokens + if False: + fim_tokens = torch.zeros(is_cls_tokens.size(0), is_cls_tokens.size(1), dtype=torch.bool) + in_mask_vector = torch.zeros(is_cls_tokens.size(0), dtype=torch.bool) + for j in range(is_cls_tokens.size(1)): + in_mask_vector = in_mask_vector & ~is_cls_tokens[:, j] + fim_tokens[:, j] = in_mask_vector + in_mask_vector = in_mask_vector | is_eos_tokens[:, j] + fim_tokens = find_fim_indices(is_cls_tokens, is_eos_tokens) + + number_sequences = torch.cumsum(torch.cat([torch.zeros(is_cls_tokens.size(0),1, dtype=torch.int32), is_cls_tokens[:,:-1]],1), -1) + # fist, second and last sequence tokens + first_sequence_tokens = ((~fim_tokens & (labels < 33)) | fim_tokens) & (number_sequences == 0) + second_sequence_tokens = ((~fim_tokens & (labels < 33)) | fim_tokens) & (number_sequences == 1) + last_sequence_tokens = ((~fim_tokens & (labels < 33)) | fim_tokens) & (number_sequences == (number_sequences.max(1).values[:, None] - 1)) + # end of mask tokens + end_of_masks = (fim_tokens & (labels > 33)) | is_cls_tokens | is_eos_tokens + + def perplexities_per_seq_for_subset(unreduced_loss, subset): + return 
torch.exp(torch.nanmean(torch.where(subset, unreduced_loss, torch.tensor(float('nan'))), dim=1)) + + return{ + # Loss + "loss/all": torch.mean(unreduced_loss).item(), + "loss/std": torch.std(unreduced_loss).item(), + "loss/end_span": torch.mean(unreduced_loss[end_of_masks]).item(), + "loss/end_span_std": torch.std(unreduced_loss[end_of_masks]).item(), + + # Perplexity of all tokens + "perplexity/batch": torch.exp(torch.mean(unreduced_loss)).item(), + "perplexity/batch_std": torch.exp(torch.std(unreduced_loss)).item(), # Fix + + # Perplexity per sequence + "perplexity/seq": torch.mean(torch.exp(torch.mean(unreduced_loss, dim=1))).item(), + "perplexity/seq_std": torch.std(torch.exp(torch.mean(unreduced_loss, dim=1))).item(), + "perplexity/end_span": torch.exp(torch.mean(unreduced_loss[end_of_masks])).item(), + "perplexity/end_span_std": torch.std(torch.exp(unreduced_loss[end_of_masks])).item(), + + "perplexity/first_seq": torch.mean(perplexities_per_seq_for_subset(unreduced_loss, first_sequence_tokens)).item(), + "perplexity/first_seq_std": torch.std(perplexities_per_seq_for_subset(unreduced_loss, first_sequence_tokens)).item(), + "perplexity/second_seq": torch.mean(perplexities_per_seq_for_subset(unreduced_loss, second_sequence_tokens)).item(), + "perplexity/second_seq_std": torch.std(perplexities_per_seq_for_subset(unreduced_loss, second_sequence_tokens)).item(), + "perplexity/last_seq": torch.mean(perplexities_per_seq_for_subset(unreduced_loss, last_sequence_tokens)).item(), + "perplexity/last_seq_std": torch.std(perplexities_per_seq_for_subset(unreduced_loss, last_sequence_tokens)).item(), + "perplexity/fim": torch.mean(perplexities_per_seq_for_subset(unreduced_loss, fim_tokens)).item(), + "perplexity/fim_std": torch.std(perplexities_per_seq_for_subset(unreduced_loss, fim_tokens)).item(), + "reconstruction/all": torch.mean(reconstruction.float()).item(), + "reconstruction/std": torch.std(reconstruction.float()).item(), + "reconstruction/end_span": 
# Others
def set_optimizer_and_scheduler(config, ntrain, parameters):
    """Create the AdamW optimizer and the LR scheduler selected by ``config``.

    Args:
        config: Training configuration. Reads ``learning_rate``, ``beta1``,
            ``beta2``, ``weight_decay``, ``batch_size``,
            ``gradient_accumulation_steps``, ``scheduler``, ``warmup_steps``
            and, depending on the scheduler, ``num_epochs`` / ``num_cycles``.
            ``finetune_model_path`` and ``restart_optimizer_and_scheduler``
            are read as attributes.
        ntrain: Used as ``num_epochs * ntrain // effective_batch_size`` to get
            the number of training steps (presumably the number of training
            samples; confirm against the caller).
        parameters: Parameters handed to ``AdamW``.

    Returns:
        Tuple ``(optimizer, scheduler)``.

    Raises:
        ValueError: If ``config["scheduler"]`` is not one of ``cosine``,
            ``cosine-restarts`` or ``constant``.
    """
    # Set optimizer
    optimizer = AdamW(
        parameters,
        lr=config["learning_rate"],
        betas=(config["beta1"], config["beta2"]),
        weight_decay=config["weight_decay"],
    )

    # device_count() is 0 on CPU-only hosts; clamp so the step computation
    # below cannot divide by zero.
    n_devices = max(1, torch.cuda.device_count())
    eff_batch_size = config["batch_size"] * config["gradient_accumulation_steps"] * n_devices

    # Set scheduler. This must be a single if/elif chain: with two separate
    # ``if`` statements, "cosine" fell through to the final ``else`` and raised.
    scheduler_name = config["scheduler"]
    if scheduler_name == "cosine":
        print_zero_rank("Using cosine scheduler")
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=config["warmup_steps"],
            num_training_steps=config["num_epochs"] * ntrain // eff_batch_size,
        )
    elif scheduler_name == "cosine-restarts":
        scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(
            optimizer,
            num_warmup_steps=config["warmup_steps"],
            num_training_steps=config["num_epochs"] * ntrain // eff_batch_size,
            num_cycles=config["num_cycles"],
        )
    elif scheduler_name == "constant":
        print_zero_rank("Using constant scheduler with warmup")
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=config["warmup_steps"]
        )
    else:
        raise ValueError(
            f"Scheduler must be one of cosine, cosine-restarts or constant, got {scheduler_name!r}"
        )

    # Finetuning and no optimizer/scheduler reset: restore the saved states but
    # force the learning rate from the current config.
    if config.finetune_model_path and not config.restart_optimizer_and_scheduler:
        optimizer.load_state_dict(torch.load(config.finetune_model_path + "/optimizer.pt"))
        for param_group in optimizer.param_groups:
            param_group['initial_lr'] = config['learning_rate']
            param_group['lr'] = config['learning_rate']

        scheduler.load_state_dict(torch.load(config.finetune_model_path + "/scheduler.pt"))
        scheduler.base_lrs = [config['learning_rate']]
        scheduler._last_lr = [config['learning_rate']]

    return optimizer, scheduler


def _parse_override_value(value):
    """Convert a command-line string to bool / None / int / float when possible."""
    literals = {'True': True, 'False': False, 'None': None}
    if value in literals:
        return literals[value]
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            continue
    return value


def parse_override_args(override_args):
    """Turn ``["a.b=1", "c=True"]`` into a nested dict ``{"a": {"b": 1}, "c": True}``.

    Args:
        override_args: Iterable of ``key=value`` strings; dots in ``key``
            create nested dictionaries. Only the first ``=`` separates key and
            value, so values may themselves contain ``=``.

    Returns:
        Nested dict of overrides with values converted to ``bool``, ``None``,
        ``int`` or ``float`` where the text allows, otherwise left as ``str``.

    Raises:
        ValueError: If an argument has no ``=`` or an empty key.
    """
    overrides = {}
    for arg in override_args:
        key, sep, value = arg.partition("=")
        if not sep or not key:
            raise ValueError(f"Override must have the form key=value, got {arg!r}")
        *parents, leaf = key.split(".")
        sub_dict = overrides
        for sub_key in parents:
            sub_dict = sub_dict.setdefault(sub_key, {})
        sub_dict[leaf] = _parse_override_value(value)
    return overrides


def load_model(
    model_path,
    device,
    model_class,
    dtype=torch.bfloat16,
    **kwargs
):
    """Instantiate ``model_class`` via its ``from_pretrained`` constructor.

    ``model_path``, ``device``, ``dtype`` and any extra keyword arguments are
    forwarded unchanged to ``model_class.from_pretrained``.
    """
    model = model_class.from_pretrained(
        model_path, device=device, dtype=dtype, **kwargs
    )
    return model


# https://github.com/state-spaces/mamba/blob/main/mamba_ssm/utils/hf.py
def load_config_hf(model_name):
    """Resolve the config file of ``model_name`` and return it as parsed JSON."""
    resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False)
    # Context manager so the file handle is closed deterministically.
    with open(resolved_archive_file, encoding="utf-8") as config_file:
        return json.load(config_file)


# https://github.com/state-spaces/mamba/blob/main/mamba_ssm/utils/hf.py
def load_state_dict_hf(model_name, device=None, dtype=None):
    """Load the weights file of ``model_name`` as a state dict.

    Args:
        model_name: Identifier passed to ``cached_file`` to resolve the weights.
        device: Target device for the returned tensors (``None`` leaves them
            where ``torch.load`` put them).
        dtype: Optional dtype every tensor is converted to.

    Returns:
        Dict mapping parameter names to tensors, converted to ``dtype`` (when
        given) and moved to ``device``.
    """
    # If not fp32, then we don't want to load directly to the GPU
    mapped_device = "cpu" if dtype not in [torch.float32, None] else device
    resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
    # The result used to be returned right here, which skipped the conversion
    # below and left ``state_dict`` undefined.
    state_dict = torch.load(resolved_archive_file, map_location=mapped_device)
    # Convert dtype before moving to GPU to save memory
    if dtype is not None:
        state_dict = {k: v.to(dtype=dtype) for k, v in state_dict.items()}
    state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
    return state_dict
def parallel_stabilized_simple(
    queries: torch.Tensor,
    keys: torch.Tensor,
    values: torch.Tensor,
    igate_preact: torch.Tensor,
    fgate_preact: torch.Tensor,
    lower_triangular_matrix: Optional[torch.Tensor] = None,
    stabilize_rowwise: bool = True,
    eps: float = 1e-6,
    **kwargs,
) -> torch.Tensor:
    """This is the mLSTM cell in parallel form.
    This version is stabilized. We control the range of exp() arguments by
    ensuring that they are always smaller than 0.0 by subtracting the maximum.

    Args:
        queries (torch.Tensor): (B, NH, S, DH)
        keys (torch.Tensor): (B, NH, S, DH)
        values (torch.Tensor): (B, NH, S, DH)
        igate_preact (torch.Tensor): (B, NH, S, 1)
        fgate_preact (torch.Tensor): (B, NH, S, 1)
        lower_triangular_matrix (torch.Tensor, optional): boolean causal mask.
            It is used only when its last dimension equals S; for any other
            size a fresh (S, S) mask is built. Defaults to None.
        stabilize_rowwise (bool, optional): Whether to stabilize the combination matrix C rowwise (take maximum per row).
            Alternative: Subtract the maximum over all rows. Defaults to True.
        eps (float, optional): Added to the normalizer before dividing. Defaults to 1e-6.
        **kwargs: Accepted and ignored.

    Returns:
        torch.Tensor: (B, NH, S, DH), h_tilde_state
    """

    B, NH, S, DH = queries.shape
    _device = queries.device

    # forget gate matrix
    log_fgates = torch.nn.functional.logsigmoid(fgate_preact)  # (B, NH, S, 1)
    # A mask whose size differs from S cannot be applied as-is: a smaller one
    # either fails to broadcast or (1x1) silently disables causal masking.
    if lower_triangular_matrix is None or lower_triangular_matrix.size(-1) != S:
        ltr = torch.tril(torch.ones((S, S), dtype=torch.bool, device=_device))
    else:
        ltr = lower_triangular_matrix
    assert (
        ltr.dtype == torch.bool
    ), f"lower_triangular_matrix must be of dtype bool, got {ltr.dtype}"

    log_f_mat = torch.tril(log_fgates.repeat(1, 1, 1, S), diagonal=-1)
    log_prod_f_mat = torch.cumsum(log_f_mat, dim=-2)
    # Causal masking & selection of the correct submatrix, such that forgetgate at timestep t is not applied
    # to the input at timestep t
    log_fg_matrix = torch.where(ltr, log_prod_f_mat, -float("inf"))  # (B, NH, S, S)

    # gate decay matrix D (combination of forget gate and input gate)
    log_D_matrix = log_fg_matrix + igate_preact.transpose(-2, -1)  # (B, NH, S, S)
    # D matrix stabilization
    if stabilize_rowwise:
        max_log_D, _ = torch.max(log_D_matrix, dim=-1, keepdim=True)  # (B, NH, S, 1)
    else:
        max_log_D = torch.max(log_D_matrix.view(B, NH, -1), dim=-1, keepdim=True)[
            0
        ].unsqueeze(-1)
        # (B, NH, 1, 1)
    log_D_matrix_stabilized = log_D_matrix - max_log_D  # (B, NH, S, S)
    D_matrix = torch.exp(log_D_matrix_stabilized)  # (B, NH, S, S)

    keys_scaled = keys / math.sqrt(DH)

    # combination matrix C
    qk_matrix = queries @ keys_scaled.transpose(-2, -1)  # (B, NH, S, S)
    C_matrix = qk_matrix * D_matrix  # (B, NH, S, S)
    normalizer = torch.maximum(
        C_matrix.sum(dim=-1, keepdim=True).abs(), torch.exp(-max_log_D)
    )  # (B, NH, S, 1)
    # (B, NH, S, S)
    C_matrix_normalized = C_matrix / (normalizer + eps)

    # retrieved values
    h_tilde_state = C_matrix_normalized @ values  # (B, NH, S, DH)

    return h_tilde_state
DH + values: torch.Tensor, # B, NH, S, DH + igate_preact: torch.Tensor, # B, NH, S + fgate_preact: torch.Tensor, # B, NH, S + initial_C: Optional[torch.Tensor] = None, # B, NH, DH, DH + initial_n: Optional[torch.Tensor] = None, # B, NH, DH, 1 + initial_m: Optional[torch.Tensor] = None, # B, NH, 1, 1 + chunk_size: int = 64, # optimize this + return_last_state: bool = False, + eps: float = 1e-6, + **kwargs, +) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + B, NH, S, DH = queries.shape + NS, CS = S // chunk_size, chunk_size + _dtype, _device = queries.dtype, queries.device + + # form chunks + q = queries.view(B, NH, NS, CS, DH) + k = keys.view(B, NH, NS, CS, DH) / math.sqrt(DH) + v = values.view(B, NH, NS, CS, DH) + + # forget gates + log_fgates = torch.nn.functional.logsigmoid(fgate_preact).view(B, NH, NS, CS) + log_fgates_acc = log_fgates.cumsum(dim=3) + igate_preact = igate_preact.view(B, NH, NS, CS) + + log_fgates_rep = log_fgates[:, :, :, :, None].repeat(1, 1, 1, 1, CS) + log_fg_matrix = torch.tril(log_fgates_rep, diagonal=-1) + log_prod_fg_matrix = torch.cumsum(log_fg_matrix, dim=3) + + loggates = (igate_preact + log_prod_fg_matrix[:, :, :, -1]).unsqueeze(-1) + m_loc, _ = torch.max(loggates, dim=3, keepdim=True) + loggates = loggates - m_loc + + kv = k.transpose(-1, -2) @ (v * (loggates).exp()) + ksum = (k * (loggates).exp()).sum(dim=-2) + C = torch.zeros((B, NH, NS + 1, DH, DH), device=kv.device, dtype=kv.dtype) + n = torch.zeros((B, NH, NS + 1, DH, 1), device=kv.device, dtype=kv.dtype) + if initial_C is not None: + C[:, :, 0] = initial_C + if initial_n is not None: + n[:, :, 0] = initial_n + + m = torch.zeros((B, NH, NS + 1, 1, 1), device=kv.device, dtype=kv.dtype) + if initial_m is not None: + m[:, :, 0] = initial_m + + for i in range(1, NS + 1): + m[:, :, i] = torch.maximum( + log_fgates_acc[:, :, i - 1, -1, None, None] + m[:, :, i - 1], + m_loc[:, :, i - 1], + ) + C[:, :, i] = ( + C[:, :, i - 1].clone() + * ( + log_fgates_acc[:, 
:, i - 1, -1, None, None] + + m[:, :, i - 1] + - m[:, :, i] + ).exp() + + kv[:, :, i - 1] * (m_loc[:, :, i - 1] - m[:, :, i]).exp() + ) + n[:, :, i] = ( + n[:, :, i - 1].clone() + * ( + log_fgates_acc[:, :, i - 1, None, -1:] + + m[:, :, i - 1] + - m[:, :, i] + ).exp() + + ksum[:, :, i - 1, :, None] * (m_loc[:, :, i - 1] - m[:, :, i]).exp() + ) + + log_fg_matrix = log_prod_fg_matrix - torch.triu( + torch.full([1, 1, 1, CS, CS], float("inf")).to(q), diagonal=1 + ) + + # gate decay matrix D (combination of forget gate and input gate) + log_D_matrix = log_fg_matrix + igate_preact[:, :, :, :, None].transpose( + -2, -1 + ) # (B, NH, NS, CS, CS) + D_max, _ = torch.max(log_D_matrix, dim=-1, keepdim=True) + + stab = torch.maximum(D_max, m[:, :, :-1, :] + log_fgates_acc[:, :, :, :, None]) + inter_C = ( + q * (m[:, :, :-1, :] + log_fgates_acc[:, :, :, :, None] - stab).exp() + ) @ C[:, :, :-1] + inter_n = ( + q * (m[:, :, :-1, :] + log_fgates_acc[:, :, :, :, None] - stab).exp() + ) @ n[:, :, :-1, :] + + # D matrix stabilization + log_D_matrix_stabilized = log_D_matrix - stab # (B, NH, NS, CS, CS) + D_matrix = torch.exp(log_D_matrix_stabilized) # (B, NH, NS, CS, CS) + + # combination matrix C + qk_matrix = q @ k.transpose(-2, -1) # (B, NH, NS, CS, CS) + E_matrix = qk_matrix * D_matrix # (B, NH, NS, CS, CS) + + normalizer = torch.maximum( + (E_matrix.sum(dim=-1, keepdim=True) + inter_n).abs(), + torch.exp(-stab), + ) # (B, NH, NS, CS, 1) + + E_matrix_normalized = E_matrix / (normalizer + eps) + + # retrieved values + intra = E_matrix_normalized @ v # (B, NH, S, DH) + inter = inter_C / (normalizer + eps) + + if return_last_state: + return (intra + inter).view((B, NH, S, DH)), (C[:, :, -1], n[:, :, -1], m[:, :, -1]) + else: + return (intra + inter).view((B, NH, S, DH)) + + +# chunkwise backend adapted to handle inputs which are not cleanly divisible by chunk_size +def chunkwise_variable( + queries: torch.Tensor, + keys: torch.Tensor, + values: torch.Tensor, + igate_preact: 
def chunkwise_variable(
    queries: torch.Tensor,
    keys: torch.Tensor,
    values: torch.Tensor,
    igate_preact: torch.Tensor,
    fgate_preact: torch.Tensor,
    initial_C: Optional[torch.Tensor] = None,
    initial_n: Optional[torch.Tensor] = None,
    initial_m: Optional[torch.Tensor] = None,
    chunk_size: int = 64,
    return_last_state: bool = False,
    eps: float = 1e-6,
    **kwargs,
) -> Union[
    torch.Tensor, Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]
]:
    """Wrapper around ``chunkwise_simple`` to allow sequences with arbitrary lengths.

    The sequence axis is ``dim=-2`` of every tensor argument. If its length is
    a multiple of ``chunk_size`` the call is forwarded unchanged. If the
    sequence is shorter than one chunk it is processed as a single chunk of
    its own length. Otherwise the inputs are split into a head whose length is
    a multiple of ``chunk_size`` and a shorter tail; the head's final state is
    used as the initial state of the tail and the outputs are concatenated.

    Args:
        queries, keys, values, igate_preact, fgate_preact: inputs forwarded to
            ``chunkwise_simple``.
        initial_C, initial_n, initial_m: optional initial state.
        chunk_size: chunk length used for the head part.
        return_last_state: if True, also return the final ``(C, n, m)`` state.
        eps: forwarded to ``chunkwise_simple``.
        **kwargs: forwarded to ``chunkwise_simple``.

    Returns:
        The output tensor, or ``(output, (C, n, m))`` when
        ``return_last_state`` is True.
    """
    seq_len = queries.shape[-2]
    tail_size = seq_len % chunk_size

    # Nothing to split: the length is either a multiple of chunk_size or
    # smaller than a single chunk (then the whole sequence is one chunk).
    if tail_size == 0 or seq_len < chunk_size:
        return chunkwise_simple(
            queries, keys, values, igate_preact, fgate_preact,
            initial_C, initial_n, initial_m,
            chunk_size if tail_size == 0 else tail_size,
            return_last_state, eps, **kwargs
        )

    sections = [seq_len - tail_size, tail_size]
    head_args, tail_args = zip(*(
        torch.split(x, sections, dim=-2)
        for x in (queries, keys, values, igate_preact, fgate_preact)
    ))

    # The head must always return its state: it seeds the tail computation.
    head_out, state = chunkwise_simple(
        *head_args, initial_C, initial_n, initial_m,
        chunk_size=chunk_size, return_last_state=True, eps=eps, **kwargs
    )
    tail_out = chunkwise_simple(
        *tail_args, *state, chunk_size=tail_size,
        return_last_state=return_last_state, eps=eps, **kwargs
    )

    if return_last_state:
        # tail_out is (output, state) in this case.
        return torch.cat([head_out, tail_out[0]], dim=-2), tail_out[-1]
    return torch.cat([head_out, tail_out], dim=-2)


def recurrent_step_stabilized_simple(
    c_state: torch.Tensor,
    n_state: torch.Tensor,
    m_state: torch.Tensor,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    igate_preact: torch.Tensor,
    fgate_preact: torch.Tensor,
    eps: float = 1e-6,
    **kwargs,
) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
    """This is a single step of the mLSTM operation in recurrent form.

    The inputs are not modified: ``q``, ``k`` and ``v`` are reshaped with
    out-of-place operations, so the caller's tensors keep their shape.

    Args:
        c_state (torch.Tensor): (B, NH, DH, DH)
        n_state (torch.Tensor): (B, NH, DH, 1)
        m_state (torch.Tensor): (B, NH, 1, 1)
        q (torch.Tensor): (B, NH, 1, DH)
        k (torch.Tensor): (B, NH, 1, DH)
        v (torch.Tensor): (B, NH, 1, DH)
        igate_preact (torch.Tensor): (B, NH, 1, 1)
        fgate_preact (torch.Tensor): (B, NH, 1, 1)
        eps (float): added to the normalizer to avoid division by zero.

    Returns:
        tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
            (hidden_state [B, NH, 1, DH],
             (c_state_new [B, NH, DH, DH], n_state_new [B, NH, DH, 1], m_state_new [B, NH, 1, 1]))
    """
    _, _, _, DH = q.shape

    # projections: (B, NH, 1, DH) -> (B, NH, DH, 1).
    # Out-of-place squeeze on purpose: the in-place variant would silently
    # change the shape of the tensors owned by the caller.
    q, k, v = (t.squeeze(2).unsqueeze(-1) for t in (q, k, v))

    # gates
    log_fg_act = torch.nn.functional.logsigmoid(fgate_preact)  # (B, NH, 1, 1)

    # update rule: m is the running maximum used to stabilize the exponentials
    m_state_new = torch.max(log_fg_act + m_state, igate_preact)  # (B, NH, 1, 1)

    fg_act = torch.exp(log_fg_act + m_state - m_state_new)  # (B, NH, 1, 1)
    ig_act = torch.exp(igate_preact - m_state_new)  # (B, NH, 1, 1)

    k_scaled = k / math.sqrt(DH)

    c_state_new = fg_act * c_state + ig_act * (
        k_scaled @ v.transpose(-1, -2)
    )  # (B, NH, DH, DH)
    n_state_new = fg_act * n_state + ig_act * k_scaled  # (B, NH, DH, 1)

    h_num = q.transpose(-1, -2) @ c_state_new  # (B, NH, 1, DH)

    qn_dotproduct = q.transpose(-1, -2) @ n_state_new  # (B, NH, 1, 1)
    max_val = torch.exp(-m_state_new)  # (B, NH, 1, 1)
    h_denom = torch.maximum(qn_dotproduct.abs(), max_val) + eps
    h = h_num / h_denom  # (B, NH, 1, DH) / (B, NH, 1, 1) = (B, NH, 1, DH)

    return h, (c_state_new, n_state_new, m_state_new)
@dataclass
class mLSTMCellConfig:
    context_length: int = -1
    embedding_dim: int = -1
    num_heads: int = -1
    backend: str = "parallel"  # "parallel", "chunkwise" or "chunkwise_variable"
    chunk_size: int = 64
    return_last_state: bool = False


class mLSTMCell(nn.Module):
    """mLSTM cell: computes the gate pre-activations and runs the selected backend.

    ``forward`` processes a whole sequence with the configured backend
    ("parallel", "chunkwise" or "chunkwise_variable"); ``step`` processes a
    single time step with the recurrent backend. Only the chunkwise backends
    can consume and return a recurrent state.
    """

    config_class = mLSTMCellConfig

    def __init__(self, config: mLSTMCellConfig):
        super().__init__()
        self.config = config

        if self.config.return_last_state:
            assert config.backend != "parallel", "Parallel backend cannot return state - set return_last_state to False or use a chunkwise backend."

        if config.backend == "parallel":
            self.backend_fn = parallel_stabilized_simple
        elif config.backend == "chunkwise":
            self.backend_fn = partial(
                chunkwise_simple,
                chunk_size=config.chunk_size,
                return_last_state=config.return_last_state,
            )
        elif config.backend == "chunkwise_variable":
            self.backend_fn = partial(
                chunkwise_variable,
                chunk_size=config.chunk_size,
                return_last_state=config.return_last_state,
            )
        else:
            raise ValueError(f"Unknown mLSTM backend: {config.backend}")
        self.backend_fn_step = recurrent_step_stabilized_simple

        # The gates see the concatenation of q, k and v, hence 3 * embedding_dim.
        self.igate = nn.Linear(3 * config.embedding_dim, config.num_heads)
        self.fgate = nn.Linear(3 * config.embedding_dim, config.num_heads)

        self.outnorm = MultiHeadLayerNorm(ndim=config.embedding_dim, weight=True, bias=False)

        if config.backend == "parallel":
            self.register_buffer(
                "causal_mask",
                torch.tril(torch.ones(config.context_length, config.context_length, dtype=torch.bool)),
                persistent=False,
            )
        else:
            # Chunkwise backends receive ``None`` for the mask argument.
            self.causal_mask = None

        self.reset_parameters()

    def _prepare_inputs(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
        """Split q/k/v into heads and compute the input/forget gate pre-activations."""
        B, S, _ = q.shape  # (B, S, H)
        num_heads = self.config.num_heads

        if_gate_input = torch.cat([q, k, v], dim=-1)

        # (B, S, H) -> (B, S, NH, DH) -> (B, NH, S, DH)
        q, k, v = (t.view(B, S, num_heads, -1).transpose(1, 2) for t in (q, k, v))

        # (B, S, NH) -> (B, NH, S, 1)
        igate_preact = self.igate(if_gate_input).transpose(-1, -2).unsqueeze(-1)
        fgate_preact = self.fgate(if_gate_input).transpose(-1, -2).unsqueeze(-1)
        return q, k, v, igate_preact, fgate_preact

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, state=None, **kwargs):
        """Run the configured backend over a full sequence.

        Args:
            q, k, v: tensors of shape (B, S, H).
            state: optional ``(C, n, m)`` tuple. It is only used by the
                chunkwise backends and ignored by the parallel backend.
            **kwargs: accepted for interface compatibility; not used.

        Returns:
            The normalized hidden state of shape (B, S, H), or
            ``(hidden_state, mlstm_state)`` if ``config.return_last_state``.
        """
        B, S, _ = q.shape  # (B, S, H)
        q, k, v, igate_preact, fgate_preact = self._prepare_inputs(q, k, v)

        backend_kwargs = dict(
            queries=q,
            keys=k,
            values=v,
            igate_preact=igate_preact,
            fgate_preact=fgate_preact,
            lower_triangular_matrix=self.causal_mask,
        )
        if state is not None and self.config.backend in ("chunkwise", "chunkwise_variable"):
            initial_C, initial_n, initial_m = state
            backend_kwargs.update(initial_C=initial_C, initial_n=initial_n, initial_m=initial_m)

        result = self.backend_fn(**backend_kwargs)
        if self.config.return_last_state:
            h_state, mlstm_state = result
        else:
            h_state = result  # (B, NH, S, DH)

        h_state_norm = self.outnorm(h_state)  # (B, NH, S, DH)
        h_state_norm = h_state_norm.transpose(1, 2).reshape(B, S, -1)  # (B, NH, S, DH) -> (B, S, NH, DH) -> (B, S, H)

        if self.config.return_last_state:
            return h_state_norm, mlstm_state
        return h_state_norm

    def step(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        mlstm_state: tuple[torch.Tensor, torch.Tensor, torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
        """Process a single time step with the recurrent backend.

        Args:
            q, k, v: tensors of shape (B, 1, H).
            mlstm_state: optional ``(c_state, n_state, m_state)``; zeros are
                used when it is None.
            **kwargs: accepted for interface compatibility; not used.

        Returns:
            ``(hidden_state (B, 1, H), (c_state, n_state, m_state))``.
        """
        B, S, _ = q.shape  # (B, S, H)
        assert S == 1, f"mLSTMCell.step only supports sequence length S=1, but got S={S}."

        q, k, v, igate_preact, fgate_preact = self._prepare_inputs(q, k, v)
        NH, DH = q.shape[1], q.shape[3]  # q is (B, NH, S, DH)

        if mlstm_state is None:
            c_state = torch.zeros(size=(B, NH, DH, DH), device=q.device, dtype=q.dtype)
            n_state = torch.zeros(size=(B, NH, DH, 1), device=q.device, dtype=q.dtype)
            m_state = torch.zeros(size=(B, NH, 1, 1), device=q.device, dtype=q.dtype)
        else:
            c_state, n_state, m_state = (s.to(device=q.device, dtype=q.dtype) for s in mlstm_state)

        assert c_state.shape == (B, NH, DH, DH), f"Expected c_state shape {(B, NH, DH, DH)}, but got {c_state.shape}."
        assert n_state.shape == (B, NH, DH, 1), f"Expected n_state shape {(B, NH, DH, 1)}, but got {n_state.shape}."
        assert m_state.shape == (B, NH, 1, 1), f"Expected m_state shape {(B, NH, 1, 1)}, but got {m_state.shape}."

        h_state, mlstm_state = self.backend_fn_step(
            c_state=c_state,
            n_state=n_state,
            m_state=m_state,
            q=q,
            k=k,
            v=v,
            igate_preact=igate_preact,
            fgate_preact=fgate_preact,
        )  # (B, NH, 1 DH), ((B, NH, DH, DH), (B, NH, DH, 1), (B, NH, 1, 1))

        h_state_norm = self.outnorm(h_state)  # (B, NH, S, DH)
        h_state_norm = h_state_norm.transpose(1, 2).reshape(B, S, -1)  # (B, NH, S, DH) -> (B, S, NH, DH) -> (B, S, H)

        return h_state_norm, mlstm_state

    def reset_parameters(self):
        """Re-initialize the output norm and both gate projections."""
        self.outnorm.reset_parameters()
        # forget gate initialization
        torch.nn.init.zeros_(self.fgate.weight)
        bias_linspace_init_(self.fgate.bias, start=3.0, end=6.0)
        # input gate initialization
        torch.nn.init.zeros_(self.igate.weight)
        torch.nn.init.normal_(self.igate.bias, mean=0.0, std=0.1)
@dataclass
class mLSTMLayerConfig(UpProjConfigMixin):
    conv1d_kernel_size: int = 4
    qkv_proj_blocksize: int = 4
    num_heads: int = 4
    proj_factor: float = 2.0

    # will be set toplevel config
    embedding_dim: int = -1
    bias: bool = False
    dropout: float = 0.0
    context_length: int = -1
    backend: str = "parallel"  # "chunkwise"
    chunk_size: int = 64
    return_last_state: bool = False

    _num_blocks: int = 1
    _inner_embedding_dim: int = None

    def __post_init__(self):
        # The inner (up-projected) width is derived from embedding_dim by the mixin.
        self._set_proj_up_dim(embedding_dim=self.embedding_dim)
        self._inner_embedding_dim = self._proj_up_dim


class mLSTMLayer(nn.Module):
    """mLSTM layer: up-projection, causal conv + mLSTM cell branch, gated output, down-projection.

    ``forward`` processes a full sequence and, if ``config.return_last_state``
    is set, also returns a state dict with the keys ``"mlstm_state"`` and
    ``"conv_state"``. ``step`` processes one time step and always returns the
    state dict.
    """

    config_class = mLSTMLayerConfig

    def __init__(self, config: mLSTMLayerConfig):
        super().__init__()
        self.config = config
        inner_dim = self.config._inner_embedding_dim

        # One projection produces both the mLSTM branch input and the gate branch z.
        self.proj_up = nn.Linear(
            in_features=self.config.embedding_dim,
            out_features=2 * inner_dim,
            bias=self.config.bias,
        )

        num_proj_heads = round(inner_dim // self.config.qkv_proj_blocksize)

        def _make_qkv_proj() -> LinearHeadwiseExpand:
            # q, k and v use identically configured block-diagonal projections.
            return LinearHeadwiseExpand(
                config=LinearHeadwiseExpandConfig(
                    in_features=inner_dim,
                    num_heads=num_proj_heads,
                    bias=self.config.bias,
                )
            )

        self.q_proj = _make_qkv_proj()
        self.k_proj = _make_qkv_proj()
        self.v_proj = _make_qkv_proj()

        self.conv1d = CausalConv1d(
            config=CausalConv1dConfig(
                feature_dim=inner_dim,
                kernel_size=self.config.conv1d_kernel_size,
            )
        )
        self.conv_act_fn = nn.SiLU()
        self.mlstm_cell = mLSTMCell(
            config=mLSTMCellConfig(
                context_length=self.config.context_length,
                embedding_dim=inner_dim,
                num_heads=self.config.num_heads,
                backend=self.config.backend,
                chunk_size=self.config.chunk_size,
                return_last_state=self.config.return_last_state,
            )
        )
        self.ogate_act_fn = nn.SiLU()

        self.learnable_skip = nn.Parameter(torch.ones(inner_dim, requires_grad=True))

        self.proj_down = nn.Linear(
            in_features=inner_dim,
            out_features=self.config.embedding_dim,
            bias=self.config.bias,
        )
        self.dropout = nn.Dropout(self.config.dropout)
        self.reset_parameters()

    def _qkv(self, x_mlstm, x_mlstm_conv_act, freqs_cos, freqs_sin):
        """Project to q/k (from the conv branch) and v (from the raw branch); apply rotary embedding if given."""
        q = self.q_proj(x_mlstm_conv_act)
        k = self.k_proj(x_mlstm_conv_act)
        v = self.v_proj(x_mlstm)

        if freqs_cos is not None and freqs_sin is not None:
            q, k = apply_rotary_emb(q, k, freqs_cos, freqs_sin)
        return q, k, v

    def _gate_and_project(self, h_tilde_state, x_mlstm_conv_act, z):
        """Add the learnable skip, apply the output gate and project back down."""
        h_tilde_state_skip = h_tilde_state + (self.learnable_skip * x_mlstm_conv_act)

        # output / z branch
        h_state = h_tilde_state_skip * self.ogate_act_fn(z)

        # down-projection
        return self.dropout(self.proj_down(h_state))

    def forward(self, x: torch.Tensor, freqs_cos=None, freqs_sin=None, state=None, **kwargs):
        """Process a full sequence.

        Args:
            x: input of shape (B, S, embedding_dim).
            freqs_cos, freqs_sin: optional rotary-embedding tables; rotary
                embedding is applied to q and k only if both are given.
            state: optional dict with ``"mlstm_state"`` and ``"conv_state"``
                (a 1-tuple) as returned by a previous call.
            **kwargs: forwarded to the mLSTM cell.

        Returns:
            ``y``, or ``(y, state_dict)`` if ``config.return_last_state``.
        """
        B, S, _ = x.shape  # also enforces a rank-3 input

        # up-projection
        x_inner = self.proj_up(x)
        x_mlstm, z = torch.split(x_inner, split_size_or_sections=self.config._inner_embedding_dim, dim=-1)

        # mlstm branch
        if state is not None:
            mlstm_state = state["mlstm_state"]
            conv_state = state["conv_state"][0]
        else:
            mlstm_state, conv_state = None, None

        if self.config.return_last_state:
            x_mlstm_conv, conv_state = self.conv1d(x_mlstm, conv_state=conv_state, return_last_state=True)
        else:
            x_mlstm_conv = self.conv1d(x_mlstm, conv_state=conv_state)

        x_mlstm_conv_act = self.conv_act_fn(x_mlstm_conv)
        q, k, v = self._qkv(x_mlstm, x_mlstm_conv_act, freqs_cos, freqs_sin)

        if self.config.return_last_state:
            h_tilde_state, mlstm_state = self.mlstm_cell(q=q, k=k, v=v, state=mlstm_state, **kwargs)
        else:
            h_tilde_state = self.mlstm_cell(q=q, k=k, v=v, state=mlstm_state, **kwargs)

        y = self._gate_and_project(h_tilde_state, x_mlstm_conv_act, z)

        if self.config.return_last_state:
            return y, {"mlstm_state": mlstm_state, "conv_state": (conv_state,)}
        return y

    def step(
        self,
        x: torch.Tensor,
        freqs_cos=None,
        freqs_sin=None,
        mlstm_state: tuple[torch.Tensor, torch.Tensor, torch.Tensor] = None,
        conv_state: tuple[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, dict[str, tuple[torch.Tensor, ...]]]:
        """Process a single time step and return the output together with the updated state dict."""
        B, S, _ = x.shape  # also enforces a rank-3 input

        # up-projection
        x_inner = self.proj_up(x)
        x_mlstm, z = torch.split(x_inner, split_size_or_sections=self.config._inner_embedding_dim, dim=-1)

        # mlstm branch
        x_mlstm_conv, conv_state = self.conv1d.step(x_mlstm, conv_state=conv_state)
        x_mlstm_conv_act = self.conv_act_fn(x_mlstm_conv)
        q, k, v = self._qkv(x_mlstm, x_mlstm_conv_act, freqs_cos, freqs_sin)

        h_tilde_state, mlstm_state = self.mlstm_cell.step(q=q, k=k, v=v, mlstm_state=mlstm_state)

        y = self._gate_and_project(h_tilde_state, x_mlstm_conv_act, z)
        return y, {"mlstm_state": mlstm_state, "conv_state": conv_state}

    def reset_parameters(self):
        """Re-initialize all projections, the learnable skip and the mLSTM cell."""
        # init inproj
        small_init_init_(self.proj_up.weight, dim=self.config.embedding_dim)
        if self.proj_up.bias is not None:
            nn.init.zeros_(self.proj_up.bias)
        # init outproj
        wang_init_(self.proj_down.weight, dim=self.config.embedding_dim, num_blocks=self.config._num_blocks)
        if self.proj_down.bias is not None:
            nn.init.zeros_(self.proj_down.bias)

        nn.init.ones_(self.learnable_skip)

        for qkv_proj in (self.q_proj, self.k_proj, self.v_proj):
            # use the embedding dim instead of the inner embedding dim
            small_init_init_(qkv_proj.weight, dim=self.config.embedding_dim)
            if qkv_proj.bias is not None:
                nn.init.zeros_(qkv_proj.bias)

        self.mlstm_cell.reset_parameters()
@dataclass
class xLSTMBlockConfig:
    mlstm: Optional[mLSTMLayerConfig] = None

    feedforward: Optional[FeedForwardConfig] = None

    _num_blocks: int = 1
    _block_idx: int = 0

    def __post_init__(self):
        assert (
            self.mlstm is not None
        ), "mlstm config must be provided"

        embedding_dim = self.mlstm.embedding_dim

        # Propagate the block bookkeeping to the sub-configs.
        self.mlstm._num_blocks = self._num_blocks
        self.mlstm._block_idx = self._block_idx

        if self.feedforward:
            self.feedforward.embedding_dim = embedding_dim
            self.feedforward._num_blocks = self._num_blocks
            self.feedforward.__post_init__()


class xLSTMBlock(nn.Module):
    """An xLSTM block; in this repository only the mLSTM variant is implemented.

    It contains the pre-LayerNorms and the skip connections around the mLSTM
    layer and the optional feedforward layer.
    """

    config_class = xLSTMBlockConfig

    def __init__(self, config: xLSTMBlockConfig) -> None:
        super().__init__()
        self.config = config

        # Validate before touching config.mlstm; otherwise a missing config
        # surfaces as an AttributeError and this error is never reached.
        if self.config.mlstm is None:
            raise ValueError("mlstm must be provided")

        embedding_dim = self.config.mlstm.embedding_dim

        self.xlstm_norm = LayerNorm(ndim=embedding_dim, weight=True, bias=False)
        self.xlstm = mLSTMLayer(config=self.config.mlstm)

        if self.config.feedforward is not None:
            self.ffn_norm = LayerNorm(
                ndim=self.config.feedforward.embedding_dim, weight=True, bias=False
            )
            self.ffn = create_feedforward(config=self.config.feedforward)
        else:
            self.ffn_norm = None
            self.ffn = None

        self.reset_parameters()

    def forward(self, x: torch.Tensor, state=None, **kwargs):
        """Apply the pre-norm mLSTM layer and the optional feedforward, each with a skip connection.

        Returns ``x``, or ``(x, xlstm_state)`` if
        ``config.mlstm.return_last_state`` is set.
        """
        return_state = self.config.mlstm.return_last_state

        if return_state:
            x_xlstm, xlstm_state = self.xlstm(self.xlstm_norm(x), state=state, **kwargs)
            x = x + x_xlstm
        else:
            x = x + self.xlstm(self.xlstm_norm(x), state=state, **kwargs)

        if self.ffn is not None:
            x = x + self.ffn(self.ffn_norm(x), **kwargs)

        if return_state:
            return x, xlstm_state
        return x

    def step(self, x: torch.Tensor, **kwargs) -> tuple[torch.Tensor, dict[str, tuple[torch.Tensor, ...]]]:
        """Single-step variant of ``forward``; always returns the layer state."""
        x_xlstm, xlstm_state = self.xlstm.step(self.xlstm_norm(x), **kwargs)
        x = x + x_xlstm
        if self.ffn is not None:
            x = x + self.ffn(self.ffn_norm(x), **kwargs)
        return x, xlstm_state

    def reset_parameters(self) -> None:
        """Re-initialize the mLSTM layer, its norm and, if present, the feedforward and its norm."""
        self.xlstm.reset_parameters()
        self.xlstm_norm.reset_parameters()

        if self.ffn is not None:
            self.ffn.reset_parameters()
            self.ffn_norm.reset_parameters()
@dataclass
class CausalConv1dConfig:
    feature_dim: int = None  # F
    kernel_size: int = 4
    causal_conv_bias: bool = True
    channel_mixing: bool = False
    conv1d_kwargs: dict = field(default_factory=dict)

    def __post_init__(self):
        # kernel_size == 0 is allowed and turns the convolution into a no-op.
        assert self.kernel_size >= 0, "kernel_size must be >= 0"


def conv1d_step(
    x: torch.Tensor,
    conv_state: torch.Tensor,
    conv1d_weight: torch.Tensor,
    conv1d_bias: torch.Tensor = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Advance a depthwise causal convolution by one time step.

    ``conv_state`` is updated IN PLACE: it is shifted by one position along
    the time axis and ``x`` is written into its last slot.

    B: batch size
    S: sequence length
    D: feature dimension
    KS: kernel size
    Args:
        x (torch.Tensor): (B, S, D), with S == 1
        conv_state (torch.Tensor): (B, KS, D)
        conv1d_weight (torch.Tensor): (KS, D)
        conv1d_bias (torch.Tensor, optional): added to the output if given.

    Returns:
        ``(y, conv_state)`` with ``y`` of shape (B, 1, D).
    """
    assert (
        x.shape[0] == conv_state.shape[0]
    ), f"x has batch size {x.shape[0]} but conv_state has batch size {conv_state.shape[0]}"
    assert (
        x.shape[2] == conv_state.shape[2]
    ), f"x has feature dimension {x.shape[2]} but conv_state has feature dimension {conv_state.shape[2]}"
    assert x.shape[1] == 1, f"x has sequence length {x.shape[1]} but it should be 1"
    conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=1))
    conv_state[:, -1:, :] = x
    y = torch.sum(conv_state * conv1d_weight, dim=1, keepdim=True)
    if conv1d_bias is not None:
        y += conv1d_bias
    return y, conv_state


class CausalConv1d(nn.Module):
    """
    Implements causal depthwise convolution of a time series tensor.
    Input:  Tensor of shape (B,T,F), i.e. (batch, time, feature)
    Output: Tensor of shape (B,T,F)

    Args:
        feature_dim: number of features in the input tensor
        kernel_size: size of the kernel for the depthwise convolution;
                     0 disables the convolution (the layer is a no-op)
        causal_conv_bias: whether to use bias in the depthwise convolution
        channel_mixing: whether to use channel mixing (i.e. groups=1) or not (i.e. groups=feature_dim)
                        If True, it mixes the convolved features across channels.
                        If False, all the features are convolved independently.
    """

    config_class = CausalConv1dConfig

    def __init__(self, config: CausalConv1dConfig):
        super().__init__()
        self.config = config
        self.groups = self.config.feature_dim
        if self.config.channel_mixing:
            self.groups = 1
        # padding of this size assures temporal causality; always defined so
        # that forward() can rely on it even for the no-op configuration.
        self.pad = max(self.config.kernel_size - 1, 0)
        if self.config.kernel_size == 0:
            self.conv = None  # Noop
        else:
            self.conv = nn.Conv1d(
                in_channels=self.config.feature_dim,
                out_channels=self.config.feature_dim,
                kernel_size=self.config.kernel_size,
                padding=self.pad,
                groups=self.groups,
                bias=self.config.causal_conv_bias,
                **self.config.conv1d_kwargs,
            )
        # B, C, L
        self.reset_parameters()

    def reset_parameters(self, **kwargs):
        # Nothing to reset for the no-op configuration (kernel_size == 0).
        if self.conv is not None:
            self.conv.reset_parameters()

    def _create_weight_decay_optim_groups(
        self,
    ) -> tuple[set[nn.Parameter], set[nn.Parameter]]:
        if self.config.kernel_size == 0:
            return (), ()
        else:
            weight_decay = (self.conv.weight,)
            no_weight_decay = ()
            if self.config.causal_conv_bias:
                no_weight_decay += (self.conv.bias,)
            return weight_decay, no_weight_decay

    def forward(
        self,
        x: torch.Tensor,
        conv_state: Optional[torch.Tensor] = None,
        return_last_state: bool = False,
    ):
        """Convolve a (B, T, F) sequence causally.

        Args:
            x: input of shape (B, T, F).
            conv_state: optional trailing inputs of a previous segment; its
                last ``kernel_size - 1`` steps are prepended as history.
            return_last_state: if True, also return the last ``kernel_size``
                inputs (history included) for use as the next ``conv_state``.

        Returns:
            ``y`` of shape (B, T, F), or ``(y, last_state)`` when
            ``return_last_state`` is True. With ``kernel_size == 0`` the input
            is returned unchanged and the state is passed through untouched.
        """
        if self.config.kernel_size == 0:
            return (x, conv_state) if return_last_state else x

        history_len = 0
        if conv_state is not None:
            # Only the last ``pad`` steps can influence the new outputs.
            # ``[-0:]`` would select everything, so pad == 0 is handled explicitly.
            conv_state = conv_state[:, -self.pad:] if self.pad > 0 else conv_state[:, :0]
            history_len = conv_state.shape[1]
            x = torch.cat([conv_state, x], dim=1)

        y = x.transpose(2, 1)  # (B,F,T) tensor - now in the right shape for conv layer.
        y = self.conv(y)  # (B,F,T+pad) tensor

        # Drop the outputs belonging to the prepended history and the ``pad``
        # trailing outputs that would look into the (zero-padded) future.
        # An explicit end index is used because ``[:-0]`` is an empty slice.
        y = y[:, :, history_len : y.shape[-1] - self.pad].transpose(2, 1)

        if return_last_state:
            return y, x[:, -self.config.kernel_size:]
        return y

    def step(
        self,
        x: torch.Tensor,
        conv_state: tuple[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor]]:
        """Process one time step; ``conv_state`` is a 1-tuple holding a (B, KS, D) buffer.

        The weight slice used here takes input channel 0 of every output
        channel, which matches the depthwise (non channel-mixing) layout.
        """
        if self.config.kernel_size == 0:
            return x, conv_state

        B, S, D = x.shape

        if conv_state is None:
            conv_state = (
                torch.zeros(
                    size=(B, self.config.kernel_size, D),
                    device=self.conv.weight.device,
                    dtype=self.conv.weight.dtype,
                ),
            )

        y, conv_state = conv1d_step(
            x,
            conv_state[0],
            self.conv.weight[:, 0, :].transpose(0, 1),  # rearrange(, "D 1 KS -> KS D")
            conv1d_bias=self.conv.bias if self.config.causal_conv_bias else None,
        )
        return y, (conv_state,)
def get_act_fn(act_fn_name: str) -> Callable[[torch.Tensor], torch.Tensor]:
    """Look up an activation function by name in ``_act_fn_registry``.

    Raises:
        AssertionError: if ``act_fn_name`` is not a registered name. The
            message lists the available names.
    """
    if act_fn_name in _act_fn_registry:
        return _act_fn_registry[act_fn_name]
    # Raised explicitly so the check also fires when Python runs with -O.
    # The message must reference the registry that actually exists; a
    # misspelled name here turns the error into a NameError.
    raise AssertionError(
        f'Unknown activation function name "{act_fn_name}". Available activation functions are: {str(_act_fn_registry.keys())}'
    )


@dataclass
class FeedForwardConfig(UpProjConfigMixin):
    proj_factor: float = 1.3
    act_fn: str = "gelu"
    embedding_dim: int = -1
    dropout: float = 0.0
    bias: bool = False
    ff_type: Literal["ffn_gated"] = "ffn_gated"

    _num_blocks: int = 1

    def __post_init__(self):
        # The up-projected width is derived from embedding_dim by the mixin.
        self._set_proj_up_dim(embedding_dim=self.embedding_dim)
        assert self.act_fn in _act_fn_registry, f"Unknown activation function {self.act_fn}"


class GatedFeedForward(nn.Module):
    """Gated feedforward layer: ``proj_down(act_fn(gate) * up)`` followed by dropout.

    A single up-projection produces both the gate pre-activation and the
    up-projected features (hence ``2 * _proj_up_dim`` output features).
    """

    config_class = FeedForwardConfig

    def __init__(self, config: FeedForwardConfig):
        super().__init__()
        self.config = config

        self.proj_up = nn.Linear(
            in_features=self.config.embedding_dim,
            out_features=2 * self.config._proj_up_dim,
            bias=self.config.bias,
        )
        self.proj_down = nn.Linear(
            in_features=self.config._proj_up_dim,
            out_features=self.config.embedding_dim,
            bias=self.config.bias,
        )

        self.act_fn = get_act_fn(self.config.act_fn)

        self.dropout = nn.Dropout(self.config.dropout)
        self.reset_parameters()

    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
        """Apply the gated feedforward; extra keyword arguments are ignored."""
        gate_preact, up_proj = self.proj_up(x).split(self.config._proj_up_dim, dim=-1)
        gated = self.act_fn(gate_preact) * up_proj
        return self.dropout(self.proj_down(gated))

    def reset_parameters(self):
        """Re-initialize both projections; biases, if present, are zeroed."""
        small_init_init_(self.proj_up.weight, dim=self.config.embedding_dim)
        if self.proj_up.bias is not None:
            nn.init.zeros_(self.proj_up.bias)
        wang_init_(self.proj_down.weight, dim=self.config.embedding_dim, num_blocks=self.config._num_blocks)
        if self.proj_down.bias is not None:
            nn.init.zeros_(self.proj_down.bias)


def create_feedforward(config: FeedForwardConfig) -> nn.Module:
    """Build the feedforward module selected by ``config.ff_type``.

    Raises:
        ValueError: if ``config.ff_type`` is not a known type.
    """
    if config.ff_type == "ffn_gated":
        return GatedFeedForward(config)
    raise ValueError(f"Unknown feedforward type {config.ff_type}")
def bias_linspace_init_(param: torch.Tensor, start: float = 3.4, end: float = 6.0) -> torch.Tensor:
    """Fill a 1-D parameter in place with values linearly spaced from ``start`` to ``end``.

    Returns the same tensor that was passed in.
    """
    assert param.dim() == 1, f"param must be 1-dimensional (typically a bias), got {param.dim()}"
    num_entries = param.shape[0]
    with torch.no_grad():
        param.copy_(torch.linspace(start, end, num_entries))
    return param


def small_init_init_(param: torch.Tensor, dim: int) -> torch.Tensor:
    """Fills the input Tensor with values according to the method described in Transformers without Tears: Improving
    the Normalization of Self-Attention - Nguyen, T. & Salazar, J. (2019), using a normal distribution.
    Adopted from https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/init_functions.py.

    The standard deviation is ``sqrt(2 / (5 * dim))``; the tensor is modified
    in place and returned.
    """
    torch.nn.init.normal_(param, mean=0.0, std=math.sqrt(2 / (5 * dim)))
    return param


def wang_init_(param: torch.Tensor, dim: int, num_blocks: int):
    """Adopted from https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/init_functions.py.

    Normal init with standard deviation ``2 / num_blocks / sqrt(dim)``; the
    tensor is modified in place and returned.
    """
    torch.nn.init.normal_(param, mean=0.0, std=2 / num_blocks / math.sqrt(dim))
    return param
is a structured projection layer that projects the input to a higher dimension. + It only allows integer up-projection factors, i.e. the output dimension is a multiple of the input dimension. + """ + + config_class = LinearHeadwiseExpandConfig + + def __init__(self, config: LinearHeadwiseExpandConfig): + super().__init__() + self.config = config + in_features = self.config.in_features + num_heads = self.config.num_heads + out_features_per_head = config._out_features // num_heads + self.weight = nn.Parameter( + torch.empty(num_heads, out_features_per_head, in_features // num_heads), + requires_grad=config.trainable_weight, + ) + if config.bias: + self.bias = nn.Parameter( + torch.empty(config._out_features), requires_grad=config.trainable_bias + ) + else: + self.bias = None + self.reset_parameters() + + def reset_parameters(self, **kwargs): + # small init + nn.init.normal_( + self.weight.data, mean=0.0, std=sqrt(2 / 5 / self.weight.shape[-1]) + ) + if self.bias is not None: + nn.init.zeros_(self.bias.data) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shape = x.shape + x = x.view(*shape[:-1], self.config.num_heads, -1) + x = torch.einsum("...hd,hod->...ho", x, self.weight) + x = x.reshape(*shape[:-1], -1) + if self.bias is not None: + x = x + self.bias + return x + + def extra_repr(self): + return ( + f"in_features={self.config.in_features}, " + f"num_heads={self.config.num_heads}, " + f"expand_factor_up={self.config.expand_factor_up}, " + f"bias={self.config.bias}, " + f"trainable_weight={self.config.trainable_weight}, " + f"trainable_bias={self.config.trainable_bias}, " + ) diff --git a/protxlstm/xlstm/components/ln.py b/protxlstm/xlstm/components/ln.py new file mode 100644 index 0000000000000000000000000000000000000000..8e9d2f25f18a827018aa7d62e09cd4a17118890c --- /dev/null +++ b/protxlstm/xlstm/components/ln.py @@ -0,0 +1,68 @@ +# Copyright (c) NXAI GmbH and its affiliates 2024 +# Maximilian Beck, Korbinian Pöppel +import torch +import 
torch.nn.functional as F +from torch import nn + + +class LayerNorm(nn.Module): + """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False.""" + + def __init__( + self, + ndim: int = -1, + weight: bool = True, + bias: bool = False, + eps: float = 1e-5, + residual_weight: bool = True, + ): + super().__init__() + self.weight = nn.Parameter(torch.zeros(ndim)) if weight else None + self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None + self.eps = eps + self.residual_weight = residual_weight + self.ndim = ndim + self.reset_parameters() + + @property + def weight_proxy(self) -> torch.Tensor: + if self.weight is None: + return None + if self.residual_weight: + return 1.0 + self.weight + else: + return self.weight + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.layer_norm( + input, normalized_shape=(self.ndim,), weight=self.weight_proxy, bias=self.bias, eps=self.eps + ) + + def reset_parameters(self): + if self.weight_proxy is not None: + if self.residual_weight: + nn.init.zeros_(self.weight) + else: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + +class MultiHeadLayerNorm(LayerNorm): + + def forward(self, input: torch.Tensor) -> torch.Tensor: + assert input.dim() == 4, "Input must be 4D tensor (B, NH, S, DH)" + B, NH, S, DH = input.shape + + gn_in_1 = input.transpose(1, 2) # (B, S, NH, DH) + gn_in_2 = gn_in_1.reshape(B * S, NH * DH) # (B * S, NH * DH) + out = F.group_norm( + gn_in_2, + num_groups=NH, + weight=self.weight_proxy, + bias=self.bias, + eps=self.eps, + ) + # (B * S), (NH * DH) -> (B, S, NH, DH) -> (B, NH, S, DH) + out = out.view(B, S, NH, DH).transpose(1, 2) + return out diff --git a/protxlstm/xlstm/components/rotary_position.py b/protxlstm/xlstm/components/rotary_position.py new file mode 100644 index 0000000000000000000000000000000000000000..40d22b9522a7cf970c1442656ff49f6ababdf94a --- /dev/null +++ b/protxlstm/xlstm/components/rotary_position.py @@ -0,0 +1,35 @@ 
+import torch +from typing import Tuple +import math + + +def compute_freqs_cis(t: torch.Tensor, head_dim: int, theta: float = 10_000.0): + freqs = theta ** (-torch.arange(0, head_dim, 2).float() / head_dim) + freqs = t.unsqueeze(-1) * freqs.to(t.device) # type: ignore + freqs_cos = torch.cos(freqs) # real part + freqs_sin = torch.sin(freqs) # imaginary part + return freqs_cos, freqs_sin + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor]: + + # reshape xq and xk to match the complex representation + xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1) + xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) + + # apply rotation using real numbers + xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin + xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos + xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin + xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos + + # flatten last two dimensions + xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(-2) + xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(-2) + + return xq_out.type_as(xq), xk_out.type_as(xk) diff --git a/protxlstm/xlstm/components/util.py b/protxlstm/xlstm/components/util.py new file mode 100644 index 0000000000000000000000000000000000000000..2c6bc61911d9b81cd7b3b28c798a1168694585a5 --- /dev/null +++ b/protxlstm/xlstm/components/util.py @@ -0,0 +1,74 @@ +# Copyright (c) NXAI GmbH and its affiliates 2024 +# Korbininan Pöppel +import torch +from typing import Callable + + +def round_to_multiple(n, m=8): + return ((n + m - 1) // m) * m + + +def conditional_decorator(condition, decorator): + """A higher-order decorator that applies 'decorator' only if 'condition' is True.""" + + def dummy_decorator(func): + """A dummy decorator that does nothing.""" + return func + + if condition: + # If condition is True, return the actual decorator + return decorator + else: + # If condition 
is False, return the dummy decorator + return dummy_decorator + + +class ParameterProxy: + """ + This class helps keeping parameters in a specialized internal structure to be optimal for + computation speed, while having a proxied version to be called externally that is backend-agnostic. + It takes a module and a parameter name of a parameter in that module it represents. + Via __setitem__ and __getitem__ the "external" + """ + + def __init__( + self, + module, + parameter_name, + internal_to_external: Callable[[torch.Tensor], torch.Tensor], + external_to_internal: Callable[[torch.Tensor], torch.Tensor], + ): + self.module = module + self.parameter_name = parameter_name + self.internal_to_external = internal_to_external + self.external_to_internal = external_to_internal + + def __getitem__(self, key): + # Transform and then apply the slice to the external shape + external_param = self.internal_to_external(getattr(self.module, self.parameter_name)).detach() + return external_param[key] + + def __setitem__(self, key, value): + # Apply the slice on the external shape, then transform back + with torch.no_grad(): + external_param = self.internal_to_external(getattr(self.module, self.parameter_name)) + external_param[key] = value + getattr(self.module, self.parameter_name).data = self.external_to_internal(external_param).contiguous() + + def clone(self): + return self.internal_to_external(getattr(self.module, self.parameter_name)).clone() + + @property + def shape(self): + return self.internal_to_external(getattr(self.module, self.parameter_name)).shape + + @property + def ndim(self): + return self.internal_to_external(getattr(self.module, self.parameter_name)).ndim + + @property + def grad(self): + return self.internal_to_external(getattr(self.module, self.parameter_name).grad) + + def __getattr__(self, name: str): + return getattr(getattr(self.module, self.parameter_name), name) diff --git a/protxlstm/xlstm/utils.py b/protxlstm/xlstm/utils.py new file mode 100644 index 
0000000000000000000000000000000000000000..a15702a1497e5288a3e73be08531828873fa70b0 --- /dev/null +++ b/protxlstm/xlstm/utils.py @@ -0,0 +1,105 @@ +# Copyright (c) NXAI GmbH and its affiliates 2024 +# Maximilian Beck +import math +from abc import ABC +from dataclasses import dataclass +from typing import Sequence + +from torch import nn + + +@dataclass +class UpProjConfigMixin: + proj_factor: float = None # will be overridden by subclasses + round_proj_up_dim_up: bool = True + round_proj_up_to_multiple_of: int = 64 + + # internal + _proj_up_dim: int = None # will be computed from embedding_dim and proj_factor + + def _set_proj_up_dim(self, embedding_dim: int) -> None: + if self.proj_factor is not None and embedding_dim is not None: + proj_up_dim = self.proj_factor * embedding_dim + multiple_of_multiplier = proj_up_dim / self.round_proj_up_to_multiple_of + if self.round_proj_up_dim_up: + multiple_of_multiplier = math.ceil(multiple_of_multiplier) + else: + multiple_of_multiplier = math.floor(multiple_of_multiplier) + + self._proj_up_dim = int(multiple_of_multiplier * self.round_proj_up_to_multiple_of) + + +class WeightDecayOptimGroupMixin(nn.Module, ABC): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def get_weight_decay_optim_groups(self, **kwargs) -> tuple[Sequence[nn.Parameter], Sequence[nn.Parameter]]: + """Return a tuple of two sequences, one for parameters with weight decay and one for parameters without weight decay. + Performs checks to ensure that each parameter is only in one of the two sequences. + """ + weight_decay, no_weight_decay = self._create_weight_decay_optim_groups(**kwargs) + + # Check that parameters have been assigned correctly. + # Each parameter can only be in one optim group. + intersection_params = set(weight_decay).intersection(set(no_weight_decay)) + assert ( + len(intersection_params) == 0 + ), f"parameters {[pn for pn, p in self.named_parameters() if p in intersection_params]} made it into both decay/no_decay sets!" 
+ + union_params = set(weight_decay).union(set(no_weight_decay)) + param_dict = {pn: p for pn, p in self.named_parameters()} + unassigned_params = set(param_dict.values()) - union_params + unassigned_params = [up for up in unassigned_params if not hasattr(up, "requires_grad") or up.requires_grad] + # We have parameters that were not assigned to either weight decay or no weight decay. + # Find the parameter names and raise an error. + assert ( + len(unassigned_params) == 0 + ), f"Parameters {[pn for pn, p in self.named_parameters() if all([p is not q for q in unassigned_params])]} were not separated into either decay/no_decay set!" + + return weight_decay, no_weight_decay + + def get_weight_decay_optim_group_param_names(self, **kwargs) -> tuple[Sequence[str], Sequence[str]]: + """Return a tuple of two sequences, one for parameter names with weight decay and one for parameter names without weight decay. + Performs checks to ensure that each parameter is only in one of the two sequences. + """ + + def _is_in_sequence(param: nn.Parameter, sequence: Sequence[nn.Parameter]) -> bool: + for p in sequence: + if param is p: + return True + return False + + weight_decay, no_weight_decay = self.get_weight_decay_optim_groups(**kwargs) + names_weight_decay = [pn for pn, p in self.named_parameters() if _is_in_sequence(p, weight_decay)] + names_no_weight_decay = [pn for pn, p in self.named_parameters() if _is_in_sequence(p, no_weight_decay)] + return names_weight_decay, names_no_weight_decay + + def _create_weight_decay_optim_groups(self, **kwargs) -> tuple[Sequence[nn.Parameter], Sequence[nn.Parameter]]: + """Return a tuple of two sequences, one for parameters with weight decay and one for parameters without weight decay. + Default separation: + - weight decay: all parameters which have > 1 dimensions. + - no weight decay: all parameters which have = 1 dimension, e.g. biases. 
+ """ + + decay = set() + no_decay = set() + for name, param in self.named_parameters(): + if param.requires_grad: + if param.ndim > 1: + decay.add(param) + elif param.ndim == 1: + no_decay.add(param) + else: + raise ValueError(f"Unsupported parameter shape: {param.shape}") + + return tuple(decay), tuple(no_decay) + + def _get_weight_decay_optim_groups_for_modules( + self, modules: list["WeightDecayOptimGroupMixin"], **kwargs + ) -> tuple[Sequence[nn.Parameter], Sequence[nn.Parameter]]: + weight_decay, no_weight_decay = (), () + for module in modules: + wd, nwd = module.get_weight_decay_optim_groups(**kwargs) + weight_decay += wd + no_weight_decay += nwd + return weight_decay, no_weight_decay diff --git a/protxlstm/xlstm/xlstm_block_stack.py b/protxlstm/xlstm/xlstm_block_stack.py new file mode 100644 index 0000000000000000000000000000000000000000..5bea0c40f27b7525101d72648d9e4d684a33a934 --- /dev/null +++ b/protxlstm/xlstm/xlstm_block_stack.py @@ -0,0 +1,149 @@ +# Copyright (c) NXAI GmbH and its affiliates 2024 +# Maximilian Beck + +# Modified by Pieter-Jan Hoedt, Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen +# - Remove sLSTM +# - Modify forward to take and return state + + +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional + +import torch +from torch import nn +from torch.utils.checkpoint import checkpoint + +from .blocks.mlstm.block import mLSTMBlock, mLSTMBlockConfig +from .components.ln import LayerNorm + + +@dataclass +class xLSTMBlockStackConfig: + mlstm_block: Optional[mLSTMBlockConfig] = None + + context_length: int = -1 + num_blocks: int = 1 + embedding_dim: int = 128 + add_post_blocks_norm: bool = True + bias: bool = False + dropout: float = 0.0 + + checkpoint_blocks: bool = False + + # _block_map is a string that specifies which block is used at which position + # 0: use the mLSTM block + # 1: use the sLSTM block (not available in this repository) + _block_map: str = None + + @property + def 
block_map(self) -> list[int]: + return list(map(int, self._block_map.split(","))) + + def _create_block_map(self) -> str: + """Creates the block map, that specifies which block is used at which position.""" + block_map = [0] * self.num_blocks + block_map_str = ",".join(map(str, block_map)) + + return block_map_str + + def __post_init__(self): + + if self.mlstm_block is not None: + + self.mlstm_block.mlstm.embedding_dim = self.embedding_dim + self.mlstm_block.mlstm.bias = self.bias + self.mlstm_block.mlstm.dropout = self.dropout + self.mlstm_block.mlstm.context_length = self.context_length + self.mlstm_block.mlstm._num_blocks = self.num_blocks + # call post init, for setting inner_embedding_dim + self.mlstm_block.__post_init__() + + self._block_map = self._create_block_map() + + +class xLSTMBlockStack(nn.Module): + config_class = xLSTMBlockStackConfig + + def __init__(self, config: xLSTMBlockStackConfig): + super().__init__() + self.config = config + + self.blocks = self._create_blocks(config=config) + if config.add_post_blocks_norm: + self.post_blocks_norm = LayerNorm(ndim=config.embedding_dim) + else: + self.post_blocks_norm = nn.Identity() + + def _create_blocks(self, config: xLSTMBlockStackConfig): + + blocks = [] + for block_idx, block_type_int in enumerate(config.block_map): + if block_type_int == 0: + config = deepcopy(self.config.mlstm_block) + if hasattr(config, "_block_idx"): + config._block_idx = block_idx + config.__post_init__() + blocks.append(mLSTMBlock(config=config)) + else: + raise ValueError(f"Invalid block type {block_type_int}") + + return nn.ModuleList(blocks) + + def reset_parameters(self) -> None: + for block in self.blocks: + block.reset_parameters() + if not isinstance(self.post_blocks_norm, nn.Identity): + self.post_blocks_norm.reset_parameters() + + def forward(self, x: torch.Tensor, state=None, **kwargs) -> torch.Tensor: + + if self.config.mlstm_block.mlstm.backend not in ["chunkwise", "chunkwise_variable"]: + state=None + + new_state = 
{} + + for block_idx, block in enumerate(self.blocks): + if state != None: + block_state = state[f"block_{block_idx}"] + else: + block_state = None + + if self.config.mlstm_block.mlstm.return_last_state: + + if self.config.checkpoint_blocks: + x, new_state[f"block_{block_idx}"] = checkpoint(block, x, state=block_state, use_reentrant=False, **kwargs) + else: + x, new_state[f"block_{block_idx}"] = block(x, state=block_state, **kwargs) + + else: + + if self.config.checkpoint_blocks: + x = checkpoint(block, x, state=block_state, **kwargs, use_reentrant=False) + else: + x = block(x, state=block_state, **kwargs) + + x = self.post_blocks_norm(x) + + if self.config.mlstm_block.mlstm.return_last_state: + return x, new_state + else: + return x + + def step( + self, + x: torch.Tensor, + state: dict[str, dict[str, tuple[torch.Tensor, ...]]] = None, + **kwargs + ) -> tuple[torch.Tensor, dict[str, dict[str, tuple[torch.Tensor, ...]]]]: + if state is None: + state = {} + + for block_idx, block in enumerate(self.blocks): + x, state[f"block_{block_idx}"] = block.step( + x, **state.get(f"block_{block_idx}", {}), **kwargs + ) + + x = self.post_blocks_norm(x) + + return x, state diff --git a/protxlstm/xlstm/xlstm_lm_model.py b/protxlstm/xlstm/xlstm_lm_model.py new file mode 100644 index 0000000000000000000000000000000000000000..89d9cbc92299bee55e5bd8609374dd3b0ca2e205 --- /dev/null +++ b/protxlstm/xlstm/xlstm_lm_model.py @@ -0,0 +1,263 @@ +# Copyright (c) NXAI GmbH and its affiliates 2024 +# Maximilian Beck + +# Modified by Pieter-Jan Hoedt, Niklas Schmidinger, Lisa Schneckenreiter and Sohvi Luukkonen +# - Remove sLSTM +# - Add positional embeddings +# - Modify forward to take and return state + + +from dataclasses import dataclass +from typing import Sequence + +import torch +from torch import nn + +from .components.init import small_init_init_ +from .components.rotary_position import compute_freqs_cis +from .utils import WeightDecayOptimGroupMixin +from .xlstm_block_stack import 
xLSTMBlockStack, xLSTMBlockStackConfig + + +@dataclass +class xLSTMLMModelConfig(xLSTMBlockStackConfig): + vocab_size: int = -1 + tie_weights: bool = False + weight_decay_on_embedding: bool = False + add_embedding_dropout: bool = False + position_embeddings: str = "none" + max_position_embeddings: int = 2048 + max_seq_position_embeddings: int = 512 + rope_base_frequency: int = 10_000 + + +class xLSTMLMModel(WeightDecayOptimGroupMixin, nn.Module): + config_class = xLSTMLMModelConfig + + def __init__(self, config: xLSTMLMModelConfig, **kwargs): + super().__init__() + self.config = config + + self.xlstm_block_stack = xLSTMBlockStack(config=config) + + assert config.position_embeddings in [ + "abs_1d", + "abs_2d", + "rot", + "rot_1d", + "rot_2d", + "none", + ], f"Unknown position embeddings: {config.position_embeddings}" + + if config.position_embeddings == "abs_1d": + assert ( + config.embedding_dim % 2 == 0 + ), "for abs_1d embedding_dim must be divisible by 2." + self.token_embedding = nn.Embedding( + config.vocab_size, config.embedding_dim // 2 + ) + self.position_embedding = nn.Embedding( + config.max_position_embeddings, + config.embedding_dim - config.embedding_dim // 2, + ) + elif config.position_embeddings == "abs_2d": + assert ( + config.embedding_dim % 4 == 0 + ), "for abs_1d embedding_dim must be divisible by 4." 
+ self.token_embedding = nn.Embedding( + config.vocab_size, config.embedding_dim - 2 * config.embedding_dim // 4 + ) + self.position_embedding = nn.Embedding( + config.max_position_embeddings, config.embedding_dim // 4 + ) + self.seq_position_embedding = nn.Embedding( + config.max_seq_position_embeddings, config.embedding_dim // 4 + ) + elif config.position_embeddings.startswith("rot"): + + head_dim = config.mlstm_block.mlstm._inner_embedding_dim + assert head_dim % 2 == 0, "RoPE requires even head dimension" + self.token_embedding = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.embedding_dim + ) + + if config.position_embeddings == "rot": + max_positions = config.max_position_embeddings * config.max_seq_position_embeddings + freqs_cos, freqs_sin = compute_freqs_cis(torch.arange(max_positions), head_dim) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + else: + self.token_embedding = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.embedding_dim + ) + + self.emb_dropout = ( + nn.Dropout(config.dropout) + if config.add_embedding_dropout + else nn.Identity() + ) + + self.lm_head = nn.Linear( + in_features=config.embedding_dim, + out_features=config.vocab_size, + bias=False, + ) + if config.tie_weights: + self.lm_head.weight = self.token_embedding.weight + + + def reset_parameters(self): + self.xlstm_block_stack.reset_parameters() + + small_init_init_( + self.token_embedding.weight, dim=self.token_embedding.embedding_dim + ) + + if not self.config.tie_weights: + small_init_init_(self.lm_head.weight, dim=self.config.embedding_dim) + + if hasattr(self, "position_embedding"): + small_init_init_( + self.position_embedding.weight, dim=self.position_embedding.embedding_dim + ) + + if hasattr(self, "seq_position_embedding"): + small_init_init_(self.seq_position_embedding.weight, dim=self.seq_position_embedding.embedding_dim) + + def forward(self, 
input_ids: torch.Tensor, state=None, **kwargs) -> torch.Tensor: + + x = self.token_embedding(input_ids) + + # absolute position embeddings + if self.config.position_embeddings.startswith("abs"): + position_ids = kwargs.pop("position_ids", None) + position_embeddings = self.position_embedding(position_ids) + + seq_position_ids = kwargs.pop("seq_position_ids", None) # check if abs_2d + if seq_position_ids is not None: + seq_position_embeddings = self.seq_position_embedding(seq_position_ids) + position_embeddings = torch.cat( + [position_embeddings, seq_position_embeddings], dim=-1 + ) + + x = torch.cat([x, position_embeddings], dim=-1) + + # rotary postion embeddings + elif self.config.position_embeddings.startswith("rot"): + if self.config.position_embeddings.endswith("1d"): + assert "position_ids" in kwargs, "1d RoPE requires 'position_ids' argument" + head_dim = self.config.mlstm_block.mlstm._inner_embedding_dim + freqs_cos, freqs_sin = compute_freqs_cis(kwargs.pop("position_ids"), head_dim, theta=self.config.rope_base_frequency) + elif self.config.position_embeddings.endswith("2d"): + assert ( + "position_ids" in kwargs and "seq_position_ids" in kwargs + ), "2d RoPE requires 'position_ids' and 'seq_position_ids' arguments" + head_dim = self.config.mlstm_block.mlstm._inner_embedding_dim + total_emb = self.config.max_position_embeddings + self.config.max_seq_position_embeddings + pos_dim = head_dim * self.config.max_position_embeddings // total_emb + pos_dim -= pos_dim % 2 # assure pos_dim is even + seq_dim = head_dim - pos_dim + freqs_cos1, freqs_sin1 = compute_freqs_cis(kwargs.pop("position_ids"), pos_dim, theta=self.config.rope_base_frequency) + freqs_cos2, freqs_sin2 = compute_freqs_cis(kwargs.pop("seq_position_ids"), seq_dim, theta=self.config.rope_base_frequency) + freqs_cos = torch.cat([freqs_cos1, freqs_cos2], dim=-1) + freqs_sin = torch.cat([freqs_sin1, freqs_sin2], dim=-1) + else: + assert hasattr(self, "freqs_cos"), "model was not configured for general 
RoPE" + assert len(self.freqs_cos) >= x.shape[1], "input sequence longer than max_seq_positions" + freqs_cos, freqs_sin = self.freqs_cos[:x.shape[1]], self.freqs_sin[:x.shape[1]] + + kwargs["freqs_cos"] = freqs_cos + kwargs["freqs_sin"] = freqs_sin + + x = self.emb_dropout(x) + + if self.config.mlstm_block.mlstm.return_last_state: + x, state = self.xlstm_block_stack(x, state=state, **kwargs) + else: + x = self.xlstm_block_stack(x, state=state, **kwargs) + + logits = self.lm_head(x) + + if self.config.mlstm_block.mlstm.return_last_state: + return logits, state + else: + return logits + + + def step( + self, + input_ids: torch.Tensor, + state: dict[str, dict[str, tuple[torch.Tensor, ...]]] = None, + **kwargs, + ) -> tuple[torch.Tensor, dict[str, dict[str, tuple[torch.Tensor, ...]]]]: + + x = self.token_embedding(input_ids) + + # absolute position embeddings + if self.config.position_embeddings.startswith("abs"): + position_ids = kwargs.pop("position_ids", None) + position_embeddings = self.position_embedding(position_ids) + + seq_position_ids = kwargs.pop("seq_position_ids", None) # check if abs_2d + if seq_position_ids is not None: + seq_position_embeddings = self.seq_position_embedding(seq_position_ids) + position_embeddings = torch.cat( + [position_embeddings, seq_position_embeddings], dim=-1 + ) + + x = torch.cat([x, position_embeddings], dim=-1) + + # rotary postion embeddings + elif self.config.position_embeddings.startswith("rot"): + if self.config.position_embeddings.endswith("1d"): + assert "position_ids" in kwargs, "1d RoPE requires 'position_ids' argument" + head_dim = self.config.mlstm_block.mlstm._inner_embedding_dim + freqs_cos, freqs_sin = compute_freqs_cis(kwargs.pop("position_ids"), head_dim, theta=self.config.rope_base_frequency) + kwargs.pop("seq_position_ids", None) + + elif self.config.position_embeddings.endswith("2d"): + assert ( + "position_ids" in kwargs and "seq_position_ids" in kwargs + ), "2d RoPE requires 'position_ids' and 
'seq_position_ids' arguments" + head_dim = self.config.mlstm_block.mlstm._inner_embedding_dim + total_emb = self.config.max_position_embeddings + self.config.max_seq_position_embeddings + pos_dim = head_dim * self.config.max_position_embeddings // total_emb + pos_dim -= pos_dim % 2 # assure pos_dim is even + seq_dim = head_dim - pos_dim + freqs_cos1, freqs_sin1 = compute_freqs_cis(kwargs.pop("position_ids"), pos_dim, theta=self.config.rope_base_frequency) + freqs_cos2, freqs_sin2 = compute_freqs_cis(kwargs.pop("seq_position_ids"), seq_dim, theta=self.config.rope_base_frequency) + freqs_cos = torch.cat([freqs_cos1, freqs_cos2], dim=-1) + freqs_sin = torch.cat([freqs_sin1, freqs_sin2], dim=-1) + else: + assert hasattr(self, "freqs_cos"), "model was not configured for general RoPE" + assert len(self.freqs_cos) >= x.shape[1], "input sequence longer than max_seq_positions" + freqs_cos, freqs_sin = self.freqs_cos[:x.shape[1]], self.freqs_sin[:x.shape[1]] + + kwargs["freqs_cos"] = freqs_cos + kwargs["freqs_sin"] = freqs_sin + + x = self.emb_dropout(x) + x, state = self.xlstm_block_stack.step(x, state=state, **kwargs) + logits = self.lm_head(x) + return logits, state + + def _create_weight_decay_optim_groups( + self, **kwargs + ) -> tuple[Sequence[nn.Parameter], Sequence[nn.Parameter]]: + weight_decay, no_weight_decay = super()._create_weight_decay_optim_groups( + **kwargs + ) + # remove token embedding and add it to the correct group, accrording to the config + weight_decay = list(weight_decay) + removed = 0 + for idx in range(len(weight_decay)): + if weight_decay[idx - removed] is self.token_embedding.weight: + weight_decay.pop(idx - removed) + removed += 1 + weight_decay = tuple(weight_decay) + if self.config.weight_decay_on_embedding: + weight_decay += (self.token_embedding.weight,) + else: + no_weight_decay += (self.token_embedding.weight,) + + return weight_decay, no_weight_decay diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..020a4929d3ab60cce4848672997b93174679e083 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,30 @@ +cuda=12.1 +cuda-nvcc=12.1 +gxx_linux-64=11.2.0 +python=3.11 +pytorch=2.2.0 +pytorch-cuda=12.1 +transformers==4.44.2 +mamba_ssm==1.2.0 +cmake +ninja +accelerate +biopython +bottleneck +dacite +ipykernel +matplotlib +numpy +omegaconf +pandas +pyhmmer +rich +scipy +seaborn +torchmetrics +tqdm +tueplots +wandb +streamlit + +