Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	Añadido modelo sin submódulo
Browse files- models/all-MiniLM-L6-v2 +0 -1
- models/all-MiniLM-L6-v2/.gitattributes +28 -0
- models/all-MiniLM-L6-v2/1_Pooling/config.json +7 -0
- models/all-MiniLM-L6-v2/README.md +173 -0
- models/all-MiniLM-L6-v2/config.json +24 -0
- models/all-MiniLM-L6-v2/config_sentence_transformers.json +7 -0
- models/all-MiniLM-L6-v2/data_config.json +1452 -0
- models/all-MiniLM-L6-v2/model.safetensors +3 -0
- models/all-MiniLM-L6-v2/modules.json +20 -0
- models/all-MiniLM-L6-v2/onnx/model.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_O1.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_O2.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_O3.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_O4.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +3 -0
- models/all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx +3 -0
- models/all-MiniLM-L6-v2/openvino/openvino_model.bin +3 -0
- models/all-MiniLM-L6-v2/openvino/openvino_model.xml +0 -0
- models/all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin +3 -0
- models/all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml +0 -0
- models/all-MiniLM-L6-v2/pytorch_model.bin +3 -0
- models/all-MiniLM-L6-v2/rust_model.ot +3 -0
- models/all-MiniLM-L6-v2/sentence_bert_config.json +4 -0
- models/all-MiniLM-L6-v2/special_tokens_map.json +1 -0
- models/all-MiniLM-L6-v2/tf_model.h5 +3 -0
- models/all-MiniLM-L6-v2/tokenizer.json +0 -0
- models/all-MiniLM-L6-v2/tokenizer_config.json +1 -0
- models/all-MiniLM-L6-v2/train_script.py +344 -0
- models/all-MiniLM-L6-v2/vocab.txt +0 -0
    	
        models/all-MiniLM-L6-v2
    DELETED
    
    | @@ -1 +0,0 @@ | |
| 1 | 
            -
            Subproject commit c9745ed1d9f207416be6d2e6f8de32d1f16199bf
         | 
|  | |
|  | 
    	
        models/all-MiniLM-L6-v2/.gitattributes
    ADDED
    
    | @@ -0,0 +1,28 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            *.7z filter=lfs diff=lfs merge=lfs -text
         | 
| 2 | 
            +
            *.arrow filter=lfs diff=lfs merge=lfs -text
         | 
| 3 | 
            +
            *.bin filter=lfs diff=lfs merge=lfs -text
         | 
| 4 | 
            +
            *.bin.* filter=lfs diff=lfs merge=lfs -text
         | 
| 5 | 
            +
            *.bz2 filter=lfs diff=lfs merge=lfs -text
         | 
| 6 | 
            +
            *.ftz filter=lfs diff=lfs merge=lfs -text
         | 
| 7 | 
            +
            *.gz filter=lfs diff=lfs merge=lfs -text
         | 
| 8 | 
            +
            *.h5 filter=lfs diff=lfs merge=lfs -text
         | 
| 9 | 
            +
            *.joblib filter=lfs diff=lfs merge=lfs -text
         | 
| 10 | 
            +
            *.lfs.* filter=lfs diff=lfs merge=lfs -text
         | 
| 11 | 
            +
            *.model filter=lfs diff=lfs merge=lfs -text
         | 
| 12 | 
            +
            *.msgpack filter=lfs diff=lfs merge=lfs -text
         | 
| 13 | 
            +
            *.onnx filter=lfs diff=lfs merge=lfs -text
         | 
| 14 | 
            +
            *.ot filter=lfs diff=lfs merge=lfs -text
         | 
| 15 | 
            +
            *.parquet filter=lfs diff=lfs merge=lfs -text
         | 
| 16 | 
            +
            *.pb filter=lfs diff=lfs merge=lfs -text
         | 
| 17 | 
            +
            *.pt filter=lfs diff=lfs merge=lfs -text
         | 
| 18 | 
            +
            *.pth filter=lfs diff=lfs merge=lfs -text
         | 
| 19 | 
            +
            *.rar filter=lfs diff=lfs merge=lfs -text
         | 
| 20 | 
            +
            saved_model/**/* filter=lfs diff=lfs merge=lfs -text 
         | 
| 21 | 
            +
            *.tar.* filter=lfs diff=lfs merge=lfs -text
         | 
| 22 | 
            +
            *.tflite filter=lfs diff=lfs merge=lfs -text
         | 
| 23 | 
            +
            *.tgz filter=lfs diff=lfs merge=lfs -text
         | 
| 24 | 
            +
            *.xz filter=lfs diff=lfs merge=lfs -text
         | 
| 25 | 
            +
            *.zip filter=lfs diff=lfs merge=lfs -text
         | 
| 26 | 
            +
            *.zstandard filter=lfs diff=lfs merge=lfs -text
         | 
| 27 | 
            +
            *tfevents* filter=lfs diff=lfs merge=lfs -text
         | 
| 28 | 
            +
            model.safetensors filter=lfs diff=lfs merge=lfs -text
         | 
    	
        models/all-MiniLM-L6-v2/1_Pooling/config.json
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "word_embedding_dimension": 384,
         | 
| 3 | 
            +
              "pooling_mode_cls_token": false,
         | 
| 4 | 
            +
              "pooling_mode_mean_tokens": true,
         | 
| 5 | 
            +
              "pooling_mode_max_tokens": false,
         | 
| 6 | 
            +
              "pooling_mode_mean_sqrt_len_tokens": false
         | 
| 7 | 
            +
            }
         | 
    	
        models/all-MiniLM-L6-v2/README.md
    ADDED
    
    | @@ -0,0 +1,173 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            language: en
         | 
| 3 | 
            +
            license: apache-2.0
         | 
| 4 | 
            +
            library_name: sentence-transformers
         | 
| 5 | 
            +
            tags:
         | 
| 6 | 
            +
            - sentence-transformers
         | 
| 7 | 
            +
            - feature-extraction
         | 
| 8 | 
            +
            - sentence-similarity
         | 
| 9 | 
            +
            - transformers
         | 
| 10 | 
            +
            datasets:
         | 
| 11 | 
            +
            - s2orc
         | 
| 12 | 
            +
            - flax-sentence-embeddings/stackexchange_xml
         | 
| 13 | 
            +
            - ms_marco
         | 
| 14 | 
            +
            - gooaq
         | 
| 15 | 
            +
            - yahoo_answers_topics
         | 
| 16 | 
            +
            - code_search_net
         | 
| 17 | 
            +
            - search_qa
         | 
| 18 | 
            +
            - eli5
         | 
| 19 | 
            +
            - snli
         | 
| 20 | 
            +
            - multi_nli
         | 
| 21 | 
            +
            - wikihow
         | 
| 22 | 
            +
            - natural_questions
         | 
| 23 | 
            +
            - trivia_qa
         | 
| 24 | 
            +
            - embedding-data/sentence-compression
         | 
| 25 | 
            +
            - embedding-data/flickr30k-captions
         | 
| 26 | 
            +
            - embedding-data/altlex
         | 
| 27 | 
            +
            - embedding-data/simple-wiki
         | 
| 28 | 
            +
            - embedding-data/QQP
         | 
| 29 | 
            +
            - embedding-data/SPECTER
         | 
| 30 | 
            +
            - embedding-data/PAQ_pairs
         | 
| 31 | 
            +
            - embedding-data/WikiAnswers
         | 
| 32 | 
            +
            pipeline_tag: sentence-similarity
         | 
| 33 | 
            +
            ---
         | 
| 34 | 
            +
             | 
| 35 | 
            +
             | 
| 36 | 
            +
            # all-MiniLM-L6-v2
         | 
| 37 | 
            +
            This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
         | 
| 38 | 
            +
             | 
| 39 | 
            +
            ## Usage (Sentence-Transformers)
         | 
| 40 | 
            +
            Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
         | 
| 41 | 
            +
             | 
| 42 | 
            +
            ```
         | 
| 43 | 
            +
            pip install -U sentence-transformers
         | 
| 44 | 
            +
            ```
         | 
| 45 | 
            +
             | 
| 46 | 
            +
            Then you can use the model like this:
         | 
| 47 | 
            +
            ```python
         | 
| 48 | 
            +
            from sentence_transformers import SentenceTransformer
         | 
| 49 | 
            +
            sentences = ["This is an example sentence", "Each sentence is converted"]
         | 
| 50 | 
            +
             | 
| 51 | 
            +
            model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
         | 
| 52 | 
            +
            embeddings = model.encode(sentences)
         | 
| 53 | 
            +
            print(embeddings)
         | 
| 54 | 
            +
            ```
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            ## Usage (HuggingFace Transformers)
         | 
| 57 | 
            +
            Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
         | 
| 58 | 
            +
             | 
| 59 | 
            +
            ```python
         | 
| 60 | 
            +
            from transformers import AutoTokenizer, AutoModel
         | 
| 61 | 
            +
            import torch
         | 
| 62 | 
            +
            import torch.nn.functional as F
         | 
| 63 | 
            +
             | 
| 64 | 
            +
            #Mean Pooling - Take attention mask into account for correct averaging
         | 
| 65 | 
            +
            def mean_pooling(model_output, attention_mask):
         | 
| 66 | 
            +
                token_embeddings = model_output[0] #First element of model_output contains all token embeddings
         | 
| 67 | 
            +
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
         | 
| 68 | 
            +
                return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
         | 
| 69 | 
            +
             | 
| 70 | 
            +
             | 
| 71 | 
            +
            # Sentences we want sentence embeddings for
         | 
| 72 | 
            +
            sentences = ['This is an example sentence', 'Each sentence is converted']
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            # Load model from HuggingFace Hub
         | 
| 75 | 
            +
            tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
         | 
| 76 | 
            +
            model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
         | 
| 77 | 
            +
             | 
| 78 | 
            +
            # Tokenize sentences
         | 
| 79 | 
            +
            encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
         | 
| 80 | 
            +
             | 
| 81 | 
            +
            # Compute token embeddings
         | 
| 82 | 
            +
            with torch.no_grad():
         | 
| 83 | 
            +
                model_output = model(**encoded_input)
         | 
| 84 | 
            +
             | 
| 85 | 
            +
            # Perform pooling
         | 
| 86 | 
            +
            sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
         | 
| 87 | 
            +
             | 
| 88 | 
            +
            # Normalize embeddings
         | 
| 89 | 
            +
            sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
         | 
| 90 | 
            +
             | 
| 91 | 
            +
            print("Sentence embeddings:")
         | 
| 92 | 
            +
            print(sentence_embeddings)
         | 
| 93 | 
            +
            ```
         | 
| 94 | 
            +
             | 
| 95 | 
            +
            ------
         | 
| 96 | 
            +
             | 
| 97 | 
            +
            ## Background
         | 
| 98 | 
            +
             | 
| 99 | 
            +
            The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised 
         | 
| 100 | 
            +
            contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a 
         | 
| 101 | 
            +
            1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which, out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
         | 
| 102 | 
            +
             | 
| 103 | 
            +
            We developed this model during the 
         | 
| 104 | 
            +
            [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104), 
         | 
| 105 | 
            +
            organized by Hugging Face. We developed this model as part of the project:
         | 
| 106 | 
            +
            [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Google's Flax, JAX, and Cloud team members about efficient deep learning frameworks.
         | 
| 107 | 
            +
             | 
| 108 | 
            +
            ## Intended uses
         | 
| 109 | 
            +
             | 
| 110 | 
            +
            Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures 
         | 
| 111 | 
            +
            the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
         | 
| 112 | 
            +
             | 
| 113 | 
            +
            By default, input text longer than 256 word pieces is truncated.
         | 
| 114 | 
            +
             | 
| 115 | 
            +
             | 
| 116 | 
            +
            ## Training procedure
         | 
| 117 | 
            +
             | 
| 118 | 
            +
            ### Pre-training 
         | 
| 119 | 
            +
             | 
| 120 | 
            +
            We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
         | 
| 121 | 
            +
             | 
| 122 | 
            +
            ### Fine-tuning 
         | 
| 123 | 
            +
             | 
| 124 | 
            +
            We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity for each possible sentence pair in the batch.
         | 
| 125 | 
            +
            We then apply the cross entropy loss by comparing with true pairs.
         | 
| 126 | 
            +
             | 
| 127 | 
            +
            #### Hyper parameters
         | 
| 128 | 
            +
             | 
| 129 | 
            +
            We trained our model on a TPU v3-8. We train the model for 100k steps using a batch size of 1024 (128 per TPU core).
         | 
| 130 | 
            +
            We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
         | 
| 131 | 
            +
            a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
         | 
| 132 | 
            +
             | 
| 133 | 
            +
            #### Training data
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            We use a concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
         | 
| 136 | 
            +
            We sampled each dataset given a weighted probability whose configuration is detailed in the `data_config.json` file.
         | 
| 137 | 
            +
             | 
| 138 | 
            +
             | 
| 139 | 
            +
            | Dataset                                                  | Paper                                    | Number of training tuples  |
         | 
| 140 | 
            +
            |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
         | 
| 141 | 
            +
            | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
         | 
| 142 | 
            +
            | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
         | 
| 143 | 
            +
            | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
         | 
| 144 | 
            +
            | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
         | 
| 145 | 
            +
            | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
         | 
| 146 | 
            +
            | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
         | 
| 147 | 
            +
            | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs  | - | 25,316,456 |
         | 
| 148 | 
            +
            | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs  | - | 21,396,559 |
         | 
| 149 | 
            +
            | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs  | - | 21,396,559 |
         | 
| 150 | 
            +
            | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
         | 
| 151 | 
            +
            | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
         | 
| 152 | 
            +
            | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
         | 
| 153 | 
            +
            | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
         | 
| 154 | 
            +
            | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
         | 
| 155 | 
            +
            | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
         | 
| 156 | 
            +
            | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
         | 
| 157 | 
            +
            | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
         | 
| 158 | 
            +
            | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
         | 
| 159 | 
            +
            | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
         | 
| 160 | 
            +
            | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
         | 
| 161 | 
            +
            | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
         | 
| 162 | 
            +
            | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 | 
         | 
| 163 | 
            +
            | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
         | 
| 164 | 
            +
            | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
         | 
| 165 | 
            +
            | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
         | 
| 166 | 
            +
            | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
         | 
| 167 | 
            +
            | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
         | 
| 168 | 
            +
            | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
         | 
| 169 | 
            +
            | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
         | 
| 170 | 
            +
            | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
         | 
| 171 | 
            +
            | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
         | 
| 172 | 
            +
            | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
         | 
| 173 | 
            +
            | **Total** | | **1,170,060,424** |
         | 
    	
        models/all-MiniLM-L6-v2/config.json
    ADDED
    
    | @@ -0,0 +1,24 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
         | 
| 3 | 
            +
              "architectures": [
         | 
| 4 | 
            +
                "BertModel"
         | 
| 5 | 
            +
              ],
         | 
| 6 | 
            +
              "attention_probs_dropout_prob": 0.1,
         | 
| 7 | 
            +
              "gradient_checkpointing": false,
         | 
| 8 | 
            +
              "hidden_act": "gelu",
         | 
| 9 | 
            +
              "hidden_dropout_prob": 0.1,
         | 
| 10 | 
            +
              "hidden_size": 384,
         | 
| 11 | 
            +
              "initializer_range": 0.02,
         | 
| 12 | 
            +
              "intermediate_size": 1536,
         | 
| 13 | 
            +
              "layer_norm_eps": 1e-12,
         | 
| 14 | 
            +
              "max_position_embeddings": 512,
         | 
| 15 | 
            +
              "model_type": "bert",
         | 
| 16 | 
            +
              "num_attention_heads": 12,
         | 
| 17 | 
            +
              "num_hidden_layers": 6,
         | 
| 18 | 
            +
              "pad_token_id": 0,
         | 
| 19 | 
            +
              "position_embedding_type": "absolute",
         | 
| 20 | 
            +
              "transformers_version": "4.8.2",
         | 
| 21 | 
            +
              "type_vocab_size": 2,
         | 
| 22 | 
            +
              "use_cache": true,
         | 
| 23 | 
            +
              "vocab_size": 30522
         | 
| 24 | 
            +
            }
         | 
    	
        models/all-MiniLM-L6-v2/config_sentence_transformers.json
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "__version__": {
         | 
| 3 | 
            +
                "sentence_transformers": "2.0.0",
         | 
| 4 | 
            +
                "transformers": "4.6.1",
         | 
| 5 | 
            +
                "pytorch": "1.8.1"
         | 
| 6 | 
            +
              }
         | 
| 7 | 
            +
            }
         | 
    	
        models/all-MiniLM-L6-v2/data_config.json
    ADDED
    
    | @@ -0,0 +1,1452 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            [
         | 
| 2 | 
            +
                {
         | 
| 3 | 
            +
                    "name": "stackexchange_title_body/skeptics.stackexchange.com.jsonl.gz",
         | 
| 4 | 
            +
                    "lines": 10009,
         | 
| 5 | 
            +
                    "weight": 1
         | 
| 6 | 
            +
                },
         | 
| 7 | 
            +
                {
         | 
| 8 | 
            +
                    "name": "stackexchange_TitleBody_Answer/islam.stackexchange.com.jsonl.gz",
         | 
| 9 | 
            +
                    "lines": 10052,
         | 
| 10 | 
            +
                    "weight": 1
         | 
| 11 | 
            +
                },
         | 
| 12 | 
            +
                {
         | 
| 13 | 
            +
                    "name": "stackexchange_Title_Answer/islam.stackexchange.com.jsonl.gz",
         | 
| 14 | 
            +
                    "lines": 10052,
         | 
| 15 | 
            +
                    "weight": 1
         | 
| 16 | 
            +
                },
         | 
| 17 | 
            +
                {
         | 
| 18 | 
            +
                    "name": "stackexchange_TitleBody_Answer/anime.stackexchange.com.jsonl.gz",
         | 
| 19 | 
            +
                    "lines": 10131,
         | 
| 20 | 
            +
                    "weight": 1
         | 
| 21 | 
            +
                },
         | 
| 22 | 
            +
                {
         | 
| 23 | 
            +
                    "name": "stackexchange_Title_Answer/anime.stackexchange.com.jsonl.gz",
         | 
| 24 | 
            +
                    "lines": 10131,
         | 
| 25 | 
            +
                    "weight": 1
         | 
| 26 | 
            +
                },
         | 
| 27 | 
            +
                {
         | 
| 28 | 
            +
                    "name": "stackexchange_title_body/writers.stackexchange.com.jsonl.gz",
         | 
| 29 | 
            +
                    "lines": 10157,
         | 
| 30 | 
            +
                    "weight": 1
         | 
| 31 | 
            +
                },
         | 
| 32 | 
            +
                {
         | 
| 33 | 
            +
                    "name": "stackexchange_title_body/astronomy.stackexchange.com.jsonl.gz",
         | 
| 34 | 
            +
                    "lines": 10462,
         | 
| 35 | 
            +
                    "weight": 1
         | 
| 36 | 
            +
                },
         | 
| 37 | 
            +
                {
         | 
| 38 | 
            +
                    "name": "stackexchange_title_body/vi.stackexchange.com.jsonl.gz",
         | 
| 39 | 
            +
                    "lines": 10551,
         | 
| 40 | 
            +
                    "weight": 1
         | 
| 41 | 
            +
                },
         | 
| 42 | 
            +
                {
         | 
| 43 | 
            +
                    "name": "stackexchange_TitleBody_Answer/french.stackexchange.com.jsonl.gz",
         | 
| 44 | 
            +
                    "lines": 10578,
         | 
| 45 | 
            +
                    "weight": 1
         | 
| 46 | 
            +
                },
         | 
| 47 | 
            +
                {
         | 
| 48 | 
            +
                    "name": "stackexchange_Title_Answer/french.stackexchange.com.jsonl.gz",
         | 
| 49 | 
            +
                    "lines": 10578,
         | 
| 50 | 
            +
                    "weight": 1
         | 
| 51 | 
            +
                },
         | 
| 52 | 
            +
                {
         | 
| 53 | 
            +
                    "name": "stackexchange_title_body/cstheory.stackexchange.com.jsonl.gz",
         | 
| 54 | 
            +
                    "lines": 10642,
         | 
| 55 | 
            +
                    "weight": 1
         | 
| 56 | 
            +
                },
         | 
| 57 | 
            +
                {
         | 
| 58 | 
            +
                    "name": "stackexchange_TitleBody_Answer/civicrm.stackexchange.com.jsonl.gz",
         | 
| 59 | 
            +
                    "lines": 10648,
         | 
| 60 | 
            +
                    "weight": 1
         | 
| 61 | 
            +
                },
         | 
| 62 | 
            +
                {
         | 
| 63 | 
            +
                    "name": "stackexchange_Title_Answer/civicrm.stackexchange.com.jsonl.gz",
         | 
| 64 | 
            +
                    "lines": 10648,
         | 
| 65 | 
            +
                    "weight": 1
         | 
| 66 | 
            +
                },
         | 
| 67 | 
            +
                {
         | 
| 68 | 
            +
                    "name": "stackexchange_TitleBody_Answer/expressionengine.stackexchange.com.jsonl.gz",
         | 
| 69 | 
            +
                    "lines": 10742,
         | 
| 70 | 
            +
                    "weight": 1
         | 
| 71 | 
            +
                },
         | 
| 72 | 
            +
                {
         | 
| 73 | 
            +
                    "name": "stackexchange_Title_Answer/expressionengine.stackexchange.com.jsonl.gz",
         | 
| 74 | 
            +
                    "lines": 10742,
         | 
| 75 | 
            +
                    "weight": 1
         | 
| 76 | 
            +
                },
         | 
| 77 | 
            +
                {
         | 
| 78 | 
            +
                    "name": "stackexchange_title_body/engineering.stackexchange.com.jsonl.gz",
         | 
| 79 | 
            +
                    "lines": 10753,
         | 
| 80 | 
            +
                    "weight": 1
         | 
| 81 | 
            +
                },
         | 
| 82 | 
            +
                {
         | 
| 83 | 
            +
                    "name": "stackexchange_TitleBody_Answer/history.stackexchange.com.jsonl.gz",
         | 
| 84 | 
            +
                    "lines": 10766,
         | 
| 85 | 
            +
                    "weight": 1
         | 
| 86 | 
            +
                },
         | 
| 87 | 
            +
                {
         | 
| 88 | 
            +
                    "name": "stackexchange_Title_Answer/history.stackexchange.com.jsonl.gz",
         | 
| 89 | 
            +
                    "lines": 10766,
         | 
| 90 | 
            +
                    "weight": 1
         | 
| 91 | 
            +
                },
         | 
| 92 | 
            +
                {
         | 
| 93 | 
            +
                    "name": "stackexchange_title_body/french.stackexchange.com.jsonl.gz",
         | 
| 94 | 
            +
                    "lines": 10794,
         | 
| 95 | 
            +
                    "weight": 1
         | 
| 96 | 
            +
                },
         | 
| 97 | 
            +
                {
         | 
| 98 | 
            +
                    "name": "stackexchange_TitleBody_Answer/politics.stackexchange.com.jsonl.gz",
         | 
| 99 | 
            +
                    "lines": 11047,
         | 
| 100 | 
            +
                    "weight": 1
         | 
| 101 | 
            +
                },
         | 
| 102 | 
            +
                {
         | 
| 103 | 
            +
                    "name": "stackexchange_Title_Answer/politics.stackexchange.com.jsonl.gz",
         | 
| 104 | 
            +
                    "lines": 11047,
         | 
| 105 | 
            +
                    "weight": 1
         | 
| 106 | 
            +
                },
         | 
| 107 | 
            +
                {
         | 
| 108 | 
            +
                    "name": "stackexchange_title_body/economics.stackexchange.com.jsonl.gz",
         | 
| 109 | 
            +
                    "lines": 11115,
         | 
| 110 | 
            +
                    "weight": 1
         | 
| 111 | 
            +
                },
         | 
| 112 | 
            +
                {
         | 
| 113 | 
            +
                    "name": "stackexchange_TitleBody_Answer/craftcms.stackexchange.com.jsonl.gz",
         | 
| 114 | 
            +
                    "lines": 11236,
         | 
| 115 | 
            +
                    "weight": 1
         | 
| 116 | 
            +
                },
         | 
| 117 | 
            +
                {
         | 
| 118 | 
            +
                    "name": "stackexchange_Title_Answer/craftcms.stackexchange.com.jsonl.gz",
         | 
| 119 | 
            +
                    "lines": 11236,
         | 
| 120 | 
            +
                    "weight": 1
         | 
| 121 | 
            +
                },
         | 
| 122 | 
            +
                {
         | 
| 123 | 
            +
                    "name": "stackexchange_title_body/anime.stackexchange.com.jsonl.gz",
         | 
| 124 | 
            +
                    "lines": 11444,
         | 
| 125 | 
            +
                    "weight": 1
         | 
| 126 | 
            +
                },
         | 
| 127 | 
            +
                {
         | 
| 128 | 
            +
                    "name": "stackexchange_TitleBody_Answer/christianity.stackexchange.com.jsonl.gz",
         | 
| 129 | 
            +
                    "lines": 11498,
         | 
| 130 | 
            +
                    "weight": 1
         | 
| 131 | 
            +
                },
         | 
| 132 | 
            +
                {
         | 
| 133 | 
            +
                    "name": "stackexchange_Title_Answer/christianity.stackexchange.com.jsonl.gz",
         | 
| 134 | 
            +
                    "lines": 11498,
         | 
| 135 | 
            +
                    "weight": 1
         | 
| 136 | 
            +
                },
         | 
| 137 | 
            +
                {
         | 
| 138 | 
            +
                    "name": "stackexchange_TitleBody_Answer/softwarerecs.stackexchange.com.jsonl.gz",
         | 
| 139 | 
            +
                    "lines": 11761,
         | 
| 140 | 
            +
                    "weight": 1
         | 
| 141 | 
            +
                },
         | 
| 142 | 
            +
                {
         | 
| 143 | 
            +
                    "name": "stackexchange_Title_Answer/softwarerecs.stackexchange.com.jsonl.gz",
         | 
| 144 | 
            +
                    "lines": 11761,
         | 
| 145 | 
            +
                    "weight": 1
         | 
| 146 | 
            +
                },
         | 
| 147 | 
            +
                {
         | 
| 148 | 
            +
                    "name": "stackexchange_TitleBody_Answer/boardgames.stackexchange.com.jsonl.gz",
         | 
| 149 | 
            +
                    "lines": 11805,
         | 
| 150 | 
            +
                    "weight": 1
         | 
| 151 | 
            +
                },
         | 
| 152 | 
            +
                {
         | 
| 153 | 
            +
                    "name": "stackexchange_Title_Answer/boardgames.stackexchange.com.jsonl.gz",
         | 
| 154 | 
            +
                    "lines": 11805,
         | 
| 155 | 
            +
                    "weight": 1
         | 
| 156 | 
            +
                },
         | 
| 157 | 
            +
                {
         | 
| 158 | 
            +
                    "name": "stackexchange_title_body/islam.stackexchange.com.jsonl.gz",
         | 
| 159 | 
            +
                    "lines": 11853,
         | 
| 160 | 
            +
                    "weight": 1
         | 
| 161 | 
            +
                },
         | 
| 162 | 
            +
                {
         | 
| 163 | 
            +
                    "name": "stackexchange_title_body/expressionengine.stackexchange.com.jsonl.gz",
         | 
| 164 | 
            +
                    "lines": 11866,
         | 
| 165 | 
            +
                    "weight": 1
         | 
| 166 | 
            +
                },
         | 
| 167 | 
            +
                {
         | 
| 168 | 
            +
                    "name": "stackexchange_title_body/politics.stackexchange.com.jsonl.gz",
         | 
| 169 | 
            +
                    "lines": 11894,
         | 
| 170 | 
            +
                    "weight": 1
         | 
| 171 | 
            +
                },
         | 
| 172 | 
            +
                {
         | 
| 173 | 
            +
                    "name": "stackexchange_title_body/history.stackexchange.com.jsonl.gz",
         | 
| 174 | 
            +
                    "lines": 12021,
         | 
| 175 | 
            +
                    "weight": 1
         | 
| 176 | 
            +
                },
         | 
| 177 | 
            +
                {
         | 
| 178 | 
            +
                    "name": "stackexchange_title_body/christianity.stackexchange.com.jsonl.gz",
         | 
| 179 | 
            +
                    "lines": 12108,
         | 
| 180 | 
            +
                    "weight": 1
         | 
| 181 | 
            +
                },
         | 
| 182 | 
            +
                {
         | 
| 183 | 
            +
                    "name": "stackexchange_title_body/boardgames.stackexchange.com.jsonl.gz",
         | 
| 184 | 
            +
                    "lines": 12149,
         | 
| 185 | 
            +
                    "weight": 1
         | 
| 186 | 
            +
                },
         | 
| 187 | 
            +
                {
         | 
| 188 | 
            +
                    "name": "flickr30k_captions.jsonl.gz",
         | 
| 189 | 
            +
                    "lines": 317695,
         | 
| 190 | 
            +
                    "weight": 1
         | 
| 191 | 
            +
                },
         | 
| 192 | 
            +
                {
         | 
| 193 | 
            +
                    "name": "coco_captions.jsonl.gz",
         | 
| 194 | 
            +
                    "lines": 828395,
         | 
| 195 | 
            +
                    "weight": 1
         | 
| 196 | 
            +
                },
         | 
| 197 | 
            +
                {
         | 
| 198 | 
            +
                    "name": "codesearchnet.jsonl.gz",
         | 
| 199 | 
            +
                    "lines": 1151414,
         | 
| 200 | 
            +
                    "weight": 1
         | 
| 201 | 
            +
                },
         | 
| 202 | 
            +
                {
         | 
| 203 | 
            +
                    "name": "stackexchange_title_body/civicrm.stackexchange.com.jsonl.gz",
         | 
| 204 | 
            +
                    "lines": 12543,
         | 
| 205 | 
            +
                    "weight": 2
         | 
| 206 | 
            +
                },
         | 
| 207 | 
            +
                {
         | 
| 208 | 
            +
                    "name": "stackexchange_title_body/craftcms.stackexchange.com.jsonl.gz",
         | 
| 209 | 
            +
                    "lines": 12574,
         | 
| 210 | 
            +
                    "weight": 2
         | 
| 211 | 
            +
                },
         | 
| 212 | 
            +
                {
         | 
| 213 | 
            +
                    "name": "stackexchange_TitleBody_Answer/networkengineering.stackexchange.com.jsonl.gz",
         | 
| 214 | 
            +
                    "lines": 12590,
         | 
| 215 | 
            +
                    "weight": 2
         | 
| 216 | 
            +
                },
         | 
| 217 | 
            +
                {
         | 
| 218 | 
            +
                    "name": "stackexchange_Title_Answer/networkengineering.stackexchange.com.jsonl.gz",
         | 
| 219 | 
            +
                    "lines": 12590,
         | 
| 220 | 
            +
                    "weight": 2
         | 
| 221 | 
            +
                },
         | 
| 222 | 
            +
                {
         | 
| 223 | 
            +
                    "name": "stackexchange_TitleBody_Answer/space.stackexchange.com.jsonl.gz",
         | 
| 224 | 
            +
                    "lines": 12893,
         | 
| 225 | 
            +
                    "weight": 2
         | 
| 226 | 
            +
                },
         | 
| 227 | 
            +
                {
         | 
| 228 | 
            +
                    "name": "stackexchange_Title_Answer/space.stackexchange.com.jsonl.gz",
         | 
| 229 | 
            +
                    "lines": 12893,
         | 
| 230 | 
            +
                    "weight": 2
         | 
| 231 | 
            +
                },
         | 
| 232 | 
            +
                {
         | 
| 233 | 
            +
                    "name": "stackexchange_TitleBody_Answer/quant.stackexchange.com.jsonl.gz",
         | 
| 234 | 
            +
                    "lines": 12933,
         | 
| 235 | 
            +
                    "weight": 2
         | 
| 236 | 
            +
                },
         | 
| 237 | 
            +
                {
         | 
| 238 | 
            +
                    "name": "stackexchange_Title_Answer/quant.stackexchange.com.jsonl.gz",
         | 
| 239 | 
            +
                    "lines": 12933,
         | 
| 240 | 
            +
                    "weight": 2
         | 
| 241 | 
            +
                },
         | 
| 242 | 
            +
                {
         | 
| 243 | 
            +
                    "name": "stackexchange_TitleBody_Answer/philosophy.stackexchange.com.jsonl.gz",
         | 
| 244 | 
            +
                    "lines": 13114,
         | 
| 245 | 
            +
                    "weight": 2
         | 
| 246 | 
            +
                },
         | 
| 247 | 
            +
                {
         | 
| 248 | 
            +
                    "name": "stackexchange_Title_Answer/philosophy.stackexchange.com.jsonl.gz",
         | 
| 249 | 
            +
                    "lines": 13114,
         | 
| 250 | 
            +
                    "weight": 2
         | 
| 251 | 
            +
                },
         | 
| 252 | 
            +
                {
         | 
| 253 | 
            +
                    "name": "stackexchange_TitleBody_Answer/gardening.stackexchange.com.jsonl.gz",
         | 
| 254 | 
            +
                    "lines": 13246,
         | 
| 255 | 
            +
                    "weight": 2
         | 
| 256 | 
            +
                },
         | 
| 257 | 
            +
                {
         | 
| 258 | 
            +
                    "name": "stackexchange_Title_Answer/gardening.stackexchange.com.jsonl.gz",
         | 
| 259 | 
            +
                    "lines": 13246,
         | 
| 260 | 
            +
                    "weight": 2
         | 
| 261 | 
            +
                },
         | 
| 262 | 
            +
                {
         | 
| 263 | 
            +
                    "name": "stackexchange_title_body/hinduism.stackexchange.com.jsonl.gz",
         | 
| 264 | 
            +
                    "lines": 13450,
         | 
| 265 | 
            +
                    "weight": 2
         | 
| 266 | 
            +
                },
         | 
| 267 | 
            +
                {
         | 
| 268 | 
            +
                    "name": "stackexchange_title_body/networkengineering.stackexchange.com.jsonl.gz",
         | 
| 269 | 
            +
                    "lines": 13454,
         | 
| 270 | 
            +
                    "weight": 2
         | 
| 271 | 
            +
                },
         | 
| 272 | 
            +
                {
         | 
| 273 | 
            +
                    "name": "stackexchange_TitleBody_Answer/german.stackexchange.com.jsonl.gz",
         | 
| 274 | 
            +
                    "lines": 13733,
         | 
| 275 | 
            +
                    "weight": 2
         | 
| 276 | 
            +
                },
         | 
| 277 | 
            +
                {
         | 
| 278 | 
            +
                    "name": "stackexchange_Title_Answer/german.stackexchange.com.jsonl.gz",
         | 
| 279 | 
            +
                    "lines": 13733,
         | 
| 280 | 
            +
                    "weight": 2
         | 
| 281 | 
            +
                },
         | 
| 282 | 
            +
                {
         | 
| 283 | 
            +
                    "name": "stackexchange_title_body/german.stackexchange.com.jsonl.gz",
         | 
| 284 | 
            +
                    "lines": 13950,
         | 
| 285 | 
            +
                    "weight": 2
         | 
| 286 | 
            +
                },
         | 
| 287 | 
            +
                {
         | 
| 288 | 
            +
                    "name": "stackexchange_title_body/philosophy.stackexchange.com.jsonl.gz",
         | 
| 289 | 
            +
                    "lines": 14829,
         | 
| 290 | 
            +
                    "weight": 2
         | 
| 291 | 
            +
                },
         | 
| 292 | 
            +
                {
         | 
| 293 | 
            +
                    "name": "stackexchange_title_body/gardening.stackexchange.com.jsonl.gz",
         | 
| 294 | 
            +
                    "lines": 15136,
         | 
| 295 | 
            +
                    "weight": 2
         | 
| 296 | 
            +
                },
         | 
| 297 | 
            +
                {
         | 
| 298 | 
            +
                    "name": "stackexchange_title_body/space.stackexchange.com.jsonl.gz",
         | 
| 299 | 
            +
                    "lines": 15142,
         | 
| 300 | 
            +
                    "weight": 2
         | 
| 301 | 
            +
                },
         | 
| 302 | 
            +
                {
         | 
| 303 | 
            +
                    "name": "stackexchange_TitleBody_Answer/bicycles.stackexchange.com.jsonl.gz",
         | 
| 304 | 
            +
                    "lines": 15708,
         | 
| 305 | 
            +
                    "weight": 2
         | 
| 306 | 
            +
                },
         | 
| 307 | 
            +
                {
         | 
| 308 | 
            +
                    "name": "stackexchange_Title_Answer/bicycles.stackexchange.com.jsonl.gz",
         | 
| 309 | 
            +
                    "lines": 15708,
         | 
| 310 | 
            +
                    "weight": 2
         | 
| 311 | 
            +
                },
         | 
| 312 | 
            +
                {
         | 
| 313 | 
            +
                    "name": "stackexchange_TitleBody_Answer/law.stackexchange.com.jsonl.gz",
         | 
| 314 | 
            +
                    "lines": 16133,
         | 
| 315 | 
            +
                    "weight": 2
         | 
| 316 | 
            +
                },
         | 
| 317 | 
            +
                {
         | 
| 318 | 
            +
                    "name": "stackexchange_Title_Answer/law.stackexchange.com.jsonl.gz",
         | 
| 319 | 
            +
                    "lines": 16133,
         | 
| 320 | 
            +
                    "weight": 2
         | 
| 321 | 
            +
                },
         | 
| 322 | 
            +
                {
         | 
| 323 | 
            +
                    "name": "stackexchange_TitleBody_Answer/arduino.stackexchange.com.jsonl.gz",
         | 
| 324 | 
            +
                    "lines": 16281,
         | 
| 325 | 
            +
                    "weight": 2
         | 
| 326 | 
            +
                },
         | 
| 327 | 
            +
                {
         | 
| 328 | 
            +
                    "name": "stackexchange_Title_Answer/arduino.stackexchange.com.jsonl.gz",
         | 
| 329 | 
            +
                    "lines": 16281,
         | 
| 330 | 
            +
                    "weight": 2
         | 
| 331 | 
            +
                },
         | 
| 332 | 
            +
                {
         | 
| 333 | 
            +
                    "name": "stackexchange_title_body/bicycles.stackexchange.com.jsonl.gz",
         | 
| 334 | 
            +
                    "lines": 16353,
         | 
| 335 | 
            +
                    "weight": 2
         | 
| 336 | 
            +
                },
         | 
| 337 | 
            +
                {
         | 
| 338 | 
            +
                    "name": "stackexchange_TitleBody_Answer/emacs.stackexchange.com.jsonl.gz",
         | 
| 339 | 
            +
                    "lines": 16830,
         | 
| 340 | 
            +
                    "weight": 2
         | 
| 341 | 
            +
                },
         | 
| 342 | 
            +
                {
         | 
| 343 | 
            +
                    "name": "stackexchange_Title_Answer/emacs.stackexchange.com.jsonl.gz",
         | 
| 344 | 
            +
                    "lines": 16830,
         | 
| 345 | 
            +
                    "weight": 2
         | 
| 346 | 
            +
                },
         | 
| 347 | 
            +
                {
         | 
| 348 | 
            +
                    "name": "stackexchange_title_body/quant.stackexchange.com.jsonl.gz",
         | 
| 349 | 
            +
                    "lines": 17261,
         | 
| 350 | 
            +
                    "weight": 2
         | 
| 351 | 
            +
                },
         | 
| 352 | 
            +
                {
         | 
| 353 | 
            +
                    "name": "stackexchange_TitleBody_Answer/dsp.stackexchange.com.jsonl.gz",
         | 
| 354 | 
            +
                    "lines": 17430,
         | 
| 355 | 
            +
                    "weight": 2
         | 
| 356 | 
            +
                },
         | 
| 357 | 
            +
                {
         | 
| 358 | 
            +
                    "name": "stackexchange_Title_Answer/dsp.stackexchange.com.jsonl.gz",
         | 
| 359 | 
            +
                    "lines": 17430,
         | 
| 360 | 
            +
                    "weight": 2
         | 
| 361 | 
            +
                },
         | 
| 362 | 
            +
                {
         | 
| 363 | 
            +
                    "name": "stackexchange_TitleBody_Answer/puzzling.stackexchange.com.jsonl.gz",
         | 
| 364 | 
            +
                    "lines": 17448,
         | 
| 365 | 
            +
                    "weight": 2
         | 
| 366 | 
            +
                },
         | 
| 367 | 
            +
                {
         | 
| 368 | 
            +
                    "name": "stackexchange_Title_Answer/puzzling.stackexchange.com.jsonl.gz",
         | 
| 369 | 
            +
                    "lines": 17448,
         | 
| 370 | 
            +
                    "weight": 2
         | 
| 371 | 
            +
                },
         | 
| 372 | 
            +
                {
         | 
| 373 | 
            +
                    "name": "stackexchange_title_body/puzzling.stackexchange.com.jsonl.gz",
         | 
| 374 | 
            +
                    "lines": 17851,
         | 
| 375 | 
            +
                    "weight": 2
         | 
| 376 | 
            +
                },
         | 
| 377 | 
            +
                {
         | 
| 378 | 
            +
                    "name": "stackexchange_title_body/law.stackexchange.com.jsonl.gz",
         | 
| 379 | 
            +
                    "lines": 17941,
         | 
| 380 | 
            +
                    "weight": 2
         | 
| 381 | 
            +
                },
         | 
| 382 | 
            +
                {
         | 
| 383 | 
            +
                    "name": "stackexchange_TitleBody_Answer/movies.stackexchange.com.jsonl.gz",
         | 
| 384 | 
            +
                    "lines": 18243,
         | 
| 385 | 
            +
                    "weight": 2
         | 
| 386 | 
            +
                },
         | 
| 387 | 
            +
                {
         | 
| 388 | 
            +
                    "name": "stackexchange_Title_Answer/movies.stackexchange.com.jsonl.gz",
         | 
| 389 | 
            +
                    "lines": 18243,
         | 
| 390 | 
            +
                    "weight": 2
         | 
| 391 | 
            +
                },
         | 
| 392 | 
            +
                {
         | 
| 393 | 
            +
                    "name": "stackexchange_TitleBody_Answer/mechanics.stackexchange.com.jsonl.gz",
         | 
| 394 | 
            +
                    "lines": 18613,
         | 
| 395 | 
            +
                    "weight": 2
         | 
| 396 | 
            +
                },
         | 
| 397 | 
            +
                {
         | 
| 398 | 
            +
                    "name": "stackexchange_Title_Answer/mechanics.stackexchange.com.jsonl.gz",
         | 
| 399 | 
            +
                    "lines": 18613,
         | 
| 400 | 
            +
                    "weight": 2
         | 
| 401 | 
            +
                },
         | 
| 402 | 
            +
                {
         | 
| 403 | 
            +
                    "name": "stackexchange_TitleBody_Answer/aviation.stackexchange.com.jsonl.gz",
         | 
| 404 | 
            +
                    "lines": 18755,
         | 
| 405 | 
            +
                    "weight": 2
         | 
| 406 | 
            +
                },
         | 
| 407 | 
            +
                {
         | 
| 408 | 
            +
                    "name": "stackexchange_Title_Answer/aviation.stackexchange.com.jsonl.gz",
         | 
| 409 | 
            +
                    "lines": 18755,
         | 
| 410 | 
            +
                    "weight": 2
         | 
| 411 | 
            +
                },
         | 
| 412 | 
            +
                {
         | 
| 413 | 
            +
                    "name": "stackexchange_TitleBody_Answer/biology.stackexchange.com.jsonl.gz",
         | 
| 414 | 
            +
                    "lines": 19277,
         | 
| 415 | 
            +
                    "weight": 2
         | 
| 416 | 
            +
                },
         | 
| 417 | 
            +
                {
         | 
| 418 | 
            +
                    "name": "stackexchange_Title_Answer/biology.stackexchange.com.jsonl.gz",
         | 
| 419 | 
            +
                    "lines": 19277,
         | 
| 420 | 
            +
                    "weight": 2
         | 
| 421 | 
            +
                },
         | 
| 422 | 
            +
                {
         | 
| 423 | 
            +
                    "name": "stackexchange_TitleBody_Answer/crypto.stackexchange.com.jsonl.gz",
         | 
| 424 | 
            +
                    "lines": 19404,
         | 
| 425 | 
            +
                    "weight": 2
         | 
| 426 | 
            +
                },
         | 
| 427 | 
            +
                {
         | 
| 428 | 
            +
                    "name": "stackexchange_Title_Answer/crypto.stackexchange.com.jsonl.gz",
         | 
| 429 | 
            +
                    "lines": 19404,
         | 
| 430 | 
            +
                    "weight": 2
         | 
| 431 | 
            +
                },
         | 
| 432 | 
            +
                {
         | 
| 433 | 
            +
                    "name": "stackexchange_title_body/arduino.stackexchange.com.jsonl.gz",
         | 
| 434 | 
            +
                    "lines": 19553,
         | 
| 435 | 
            +
                    "weight": 2
         | 
| 436 | 
            +
                },
         | 
| 437 | 
            +
                {
         | 
| 438 | 
            +
                    "name": "stackexchange_TitleBody_Answer/music.stackexchange.com.jsonl.gz",
         | 
| 439 | 
            +
                    "lines": 19936,
         | 
| 440 | 
            +
                    "weight": 2
         | 
| 441 | 
            +
                },
         | 
| 442 | 
            +
                {
         | 
| 443 | 
            +
                    "name": "stackexchange_Title_Answer/music.stackexchange.com.jsonl.gz",
         | 
| 444 | 
            +
                    "lines": 19936,
         | 
| 445 | 
            +
                    "weight": 2
         | 
| 446 | 
            +
                },
         | 
| 447 | 
            +
                {
         | 
| 448 | 
            +
                    "name": "stackexchange_title_body/aviation.stackexchange.com.jsonl.gz",
         | 
| 449 | 
            +
                    "lines": 20139,
         | 
| 450 | 
            +
                    "weight": 2
         | 
| 451 | 
            +
                },
         | 
| 452 | 
            +
                {
         | 
| 453 | 
            +
                    "name": "stackexchange_title_body/softwarerecs.stackexchange.com.jsonl.gz",
         | 
| 454 | 
            +
                    "lines": 20142,
         | 
| 455 | 
            +
                    "weight": 2
         | 
| 456 | 
            +
                },
         | 
| 457 | 
            +
                {
         | 
| 458 | 
            +
                    "name": "stackexchange_title_body/movies.stackexchange.com.jsonl.gz",
         | 
| 459 | 
            +
                    "lines": 20181,
         | 
| 460 | 
            +
                    "weight": 2
         | 
| 461 | 
            +
                },
         | 
| 462 | 
            +
                {
         | 
| 463 | 
            +
                    "name": "stackexchange_TitleBody_Answer/datascience.stackexchange.com.jsonl.gz",
         | 
| 464 | 
            +
                    "lines": 20503,
         | 
| 465 | 
            +
                    "weight": 2
         | 
| 466 | 
            +
                },
         | 
| 467 | 
            +
                {
         | 
| 468 | 
            +
                    "name": "stackexchange_Title_Answer/datascience.stackexchange.com.jsonl.gz",
         | 
| 469 | 
            +
                    "lines": 20503,
         | 
| 470 | 
            +
                    "weight": 2
         | 
| 471 | 
            +
                },
         | 
| 472 | 
            +
                {
         | 
| 473 | 
            +
                    "name": "stackexchange_title_body/music.stackexchange.com.jsonl.gz",
         | 
| 474 | 
            +
                    "lines": 20636,
         | 
| 475 | 
            +
                    "weight": 2
         | 
| 476 | 
            +
                },
         | 
| 477 | 
            +
                {
         | 
| 478 | 
            +
                    "name": "stackexchange_TitleBody_Answer/japanese.stackexchange.com.jsonl.gz",
         | 
| 479 | 
            +
                    "lines": 20948,
         | 
| 480 | 
            +
                    "weight": 2
         | 
| 481 | 
            +
                },
         | 
| 482 | 
            +
                {
         | 
| 483 | 
            +
                    "name": "stackexchange_Title_Answer/japanese.stackexchange.com.jsonl.gz",
         | 
| 484 | 
            +
                    "lines": 20948,
         | 
| 485 | 
            +
                    "weight": 2
         | 
| 486 | 
            +
                },
         | 
| 487 | 
            +
                {
         | 
| 488 | 
            +
                    "name": "stackexchange_title_body/emacs.stackexchange.com.jsonl.gz",
         | 
| 489 | 
            +
                    "lines": 21055,
         | 
| 490 | 
            +
                    "weight": 2
         | 
| 491 | 
            +
                },
         | 
| 492 | 
            +
                {
         | 
| 493 | 
            +
                    "name": "stackexchange_title_body/dsp.stackexchange.com.jsonl.gz",
         | 
| 494 | 
            +
                    "lines": 21252,
         | 
| 495 | 
            +
                    "weight": 2
         | 
| 496 | 
            +
                },
         | 
| 497 | 
            +
                {
         | 
| 498 | 
            +
                    "name": "stackexchange_title_body/japanese.stackexchange.com.jsonl.gz",
         | 
| 499 | 
            +
                    "lines": 22056,
         | 
| 500 | 
            +
                    "weight": 2
         | 
| 501 | 
            +
                },
         | 
| 502 | 
            +
                {
         | 
| 503 | 
            +
                    "name": "stackexchange_TitleBody_Answer/bitcoin.stackexchange.com.jsonl.gz",
         | 
| 504 | 
            +
                    "lines": 22474,
         | 
| 505 | 
            +
                    "weight": 2
         | 
| 506 | 
            +
                },
         | 
| 507 | 
            +
                {
         | 
| 508 | 
            +
                    "name": "stackexchange_Title_Answer/bitcoin.stackexchange.com.jsonl.gz",
         | 
| 509 | 
            +
                    "lines": 22474,
         | 
| 510 | 
            +
                    "weight": 2
         | 
| 511 | 
            +
                },
         | 
| 512 | 
            +
                {
         | 
| 513 | 
            +
                    "name": "stackexchange_TitleBody_Answer/cooking.stackexchange.com.jsonl.gz",
         | 
| 514 | 
            +
                    "lines": 22641,
         | 
| 515 | 
            +
                    "weight": 2
         | 
| 516 | 
            +
                },
         | 
| 517 | 
            +
                {
         | 
| 518 | 
            +
                    "name": "stackexchange_Title_Answer/cooking.stackexchange.com.jsonl.gz",
         | 
| 519 | 
            +
                    "lines": 22641,
         | 
| 520 | 
            +
                    "weight": 2
         | 
| 521 | 
            +
                },
         | 
| 522 | 
            +
                {
         | 
| 523 | 
            +
                    "name": "stackexchange_title_body/mechanics.stackexchange.com.jsonl.gz",
         | 
| 524 | 
            +
                    "lines": 22868,
         | 
| 525 | 
            +
                    "weight": 2
         | 
| 526 | 
            +
                },
         | 
| 527 | 
            +
                {
         | 
| 528 | 
            +
                    "name": "stackexchange_TitleBody_Answer/photo.stackexchange.com.jsonl.gz",
         | 
| 529 | 
            +
                    "lines": 23204,
         | 
| 530 | 
            +
                    "weight": 2
         | 
| 531 | 
            +
                },
         | 
| 532 | 
            +
                {
         | 
| 533 | 
            +
                    "name": "stackexchange_Title_Answer/photo.stackexchange.com.jsonl.gz",
         | 
| 534 | 
            +
                    "lines": 23204,
         | 
| 535 | 
            +
                    "weight": 2
         | 
| 536 | 
            +
                },
         | 
| 537 | 
            +
                {
         | 
| 538 | 
            +
                    "name": "stackexchange_title_body/crypto.stackexchange.com.jsonl.gz",
         | 
| 539 | 
            +
                    "lines": 23231,
         | 
| 540 | 
            +
                    "weight": 2
         | 
| 541 | 
            +
                },
         | 
| 542 | 
            +
                {
         | 
| 543 | 
            +
                    "name": "stackexchange_title_body/cooking.stackexchange.com.jsonl.gz",
         | 
| 544 | 
            +
                    "lines": 23705,
         | 
| 545 | 
            +
                    "weight": 2
         | 
| 546 | 
            +
                },
         | 
| 547 | 
            +
                {
         | 
| 548 | 
            +
                    "name": "stackexchange_title_body/photo.stackexchange.com.jsonl.gz",
         | 
| 549 | 
            +
                    "lines": 23753,
         | 
| 550 | 
            +
                    "weight": 2
         | 
| 551 | 
            +
                },
         | 
| 552 | 
            +
                {
         | 
| 553 | 
            +
                    "name": "stackexchange_TitleBody_Answer/workplace.stackexchange.com.jsonl.gz",
         | 
| 554 | 
            +
                    "lines": 24012,
         | 
| 555 | 
            +
                    "weight": 2
         | 
| 556 | 
            +
                },
         | 
| 557 | 
            +
                {
         | 
| 558 | 
            +
                    "name": "stackexchange_Title_Answer/workplace.stackexchange.com.jsonl.gz",
         | 
| 559 | 
            +
                    "lines": 24012,
         | 
| 560 | 
            +
                    "weight": 2
         | 
| 561 | 
            +
                },
         | 
| 562 | 
            +
                {
         | 
| 563 | 
            +
                    "name": "stackexchange_TitleBody_Answer/meta.stackoverflow.com.jsonl.gz",
         | 
| 564 | 
            +
                    "lines": 24044,
         | 
| 565 | 
            +
                    "weight": 2
         | 
| 566 | 
            +
                },
         | 
| 567 | 
            +
                {
         | 
| 568 | 
            +
                    "name": "stackexchange_Title_Answer/meta.stackoverflow.com.jsonl.gz",
         | 
| 569 | 
            +
                    "lines": 24044,
         | 
| 570 | 
            +
                    "weight": 2
         | 
| 571 | 
            +
                },
         | 
| 572 | 
            +
                {
         | 
| 573 | 
            +
                    "name": "stackexchange_TitleBody_Answer/raspberrypi.stackexchange.com.jsonl.gz",
         | 
| 574 | 
            +
                    "lines": 24143,
         | 
| 575 | 
            +
                    "weight": 2
         | 
| 576 | 
            +
                },
         | 
| 577 | 
            +
                {
         | 
| 578 | 
            +
                    "name": "stackexchange_Title_Answer/raspberrypi.stackexchange.com.jsonl.gz",
         | 
| 579 | 
            +
                    "lines": 24143,
         | 
| 580 | 
            +
                    "weight": 2
         | 
| 581 | 
            +
                },
         | 
| 582 | 
            +
                {
         | 
| 583 | 
            +
                    "name": "stackexchange_title_body/workplace.stackexchange.com.jsonl.gz",
         | 
| 584 | 
            +
                    "lines": 24189,
         | 
| 585 | 
            +
                    "weight": 2
         | 
| 586 | 
            +
                },
         | 
| 587 | 
            +
                {
         | 
| 588 | 
            +
                    "name": "stackexchange_title_body/biology.stackexchange.com.jsonl.gz",
         | 
| 589 | 
            +
                    "lines": 24447,
         | 
| 590 | 
            +
                    "weight": 3
         | 
| 591 | 
            +
                },
         | 
| 592 | 
            +
                {
         | 
| 593 | 
            +
                    "name": "stackexchange_TitleBody_Answer/webapps.stackexchange.com.jsonl.gz",
         | 
| 594 | 
            +
                    "lines": 24867,
         | 
| 595 | 
            +
                    "weight": 3
         | 
| 596 | 
            +
                },
         | 
| 597 | 
            +
                {
         | 
| 598 | 
            +
                    "name": "stackexchange_Title_Answer/webapps.stackexchange.com.jsonl.gz",
         | 
| 599 | 
            +
                    "lines": 24867,
         | 
| 600 | 
            +
                    "weight": 3
         | 
| 601 | 
            +
                },
         | 
| 602 | 
            +
                {
         | 
| 603 | 
            +
                    "name": "stackexchange_title_body/bitcoin.stackexchange.com.jsonl.gz",
         | 
| 604 | 
            +
                    "lines": 25374,
         | 
| 605 | 
            +
                    "weight": 3
         | 
| 606 | 
            +
                },
         | 
| 607 | 
            +
                {
         | 
| 608 | 
            +
                    "name": "stackexchange_TitleBody_Answer/judaism.stackexchange.com.jsonl.gz",
         | 
| 609 | 
            +
                    "lines": 26085,
         | 
| 610 | 
            +
                    "weight": 3
         | 
| 611 | 
            +
                },
         | 
| 612 | 
            +
                {
         | 
| 613 | 
            +
                    "name": "stackexchange_Title_Answer/judaism.stackexchange.com.jsonl.gz",
         | 
| 614 | 
            +
                    "lines": 26085,
         | 
| 615 | 
            +
                    "weight": 3
         | 
| 616 | 
            +
                },
         | 
| 617 | 
            +
                {
         | 
| 618 | 
            +
                    "name": "stackexchange_TitleBody_Answer/ethereum.stackexchange.com.jsonl.gz",
         | 
| 619 | 
            +
                    "lines": 26124,
         | 
| 620 | 
            +
                    "weight": 3
         | 
| 621 | 
            +
                },
         | 
| 622 | 
            +
                {
         | 
| 623 | 
            +
                    "name": "stackexchange_Title_Answer/ethereum.stackexchange.com.jsonl.gz",
         | 
| 624 | 
            +
                    "lines": 26124,
         | 
| 625 | 
            +
                    "weight": 3
         | 
| 626 | 
            +
                },
         | 
| 627 | 
            +
                {
         | 
| 628 | 
            +
                    "name": "stackexchange_TitleBody_Answer/worldbuilding.stackexchange.com.jsonl.gz",
         | 
| 629 | 
            +
                    "lines": 26210,
         | 
| 630 | 
            +
                    "weight": 3
         | 
| 631 | 
            +
                },
         | 
| 632 | 
            +
                {
         | 
| 633 | 
            +
                    "name": "stackexchange_Title_Answer/worldbuilding.stackexchange.com.jsonl.gz",
         | 
| 634 | 
            +
                    "lines": 26210,
         | 
| 635 | 
            +
                    "weight": 3
         | 
| 636 | 
            +
                },
         | 
| 637 | 
            +
                {
         | 
| 638 | 
            +
                    "name": "stackexchange_title_body/worldbuilding.stackexchange.com.jsonl.gz",
         | 
| 639 | 
            +
                    "lines": 26763,
         | 
| 640 | 
            +
                    "weight": 3
         | 
| 641 | 
            +
                },
         | 
| 642 | 
            +
                {
         | 
| 643 | 
            +
                    "name": "stackexchange_TitleBody_Answer/chemistry.stackexchange.com.jsonl.gz",
         | 
| 644 | 
            +
                    "lines": 27061,
         | 
| 645 | 
            +
                    "weight": 3
         | 
| 646 | 
            +
                },
         | 
| 647 | 
            +
                {
         | 
| 648 | 
            +
                    "name": "stackexchange_Title_Answer/chemistry.stackexchange.com.jsonl.gz",
         | 
| 649 | 
            +
                    "lines": 27061,
         | 
| 650 | 
            +
                    "weight": 3
         | 
| 651 | 
            +
                },
         | 
| 652 | 
            +
                {
         | 
| 653 | 
            +
                    "name": "stackexchange_title_body/datascience.stackexchange.com.jsonl.gz",
         | 
| 654 | 
            +
                    "lines": 27397,
         | 
| 655 | 
            +
                    "weight": 3
         | 
| 656 | 
            +
                },
         | 
| 657 | 
            +
                {
         | 
| 658 | 
            +
                    "name": "stackexchange_TitleBody_Answer/graphicdesign.stackexchange.com.jsonl.gz",
         | 
| 659 | 
            +
                    "lines": 28083,
         | 
| 660 | 
            +
                    "weight": 3
         | 
| 661 | 
            +
                },
         | 
| 662 | 
            +
                {
         | 
| 663 | 
            +
                    "name": "stackexchange_Title_Answer/graphicdesign.stackexchange.com.jsonl.gz",
         | 
| 664 | 
            +
                    "lines": 28083,
         | 
| 665 | 
            +
                    "weight": 3
         | 
| 666 | 
            +
                },
         | 
| 667 | 
            +
                {
         | 
| 668 | 
            +
                    "name": "stackexchange_TitleBody_Answer/ux.stackexchange.com.jsonl.gz",
         | 
| 669 | 
            +
                    "lines": 28901,
         | 
| 670 | 
            +
                    "weight": 3
         | 
| 671 | 
            +
                },
         | 
| 672 | 
            +
                {
         | 
| 673 | 
            +
                    "name": "stackexchange_Title_Answer/ux.stackexchange.com.jsonl.gz",
         | 
| 674 | 
            +
                    "lines": 28901,
         | 
| 675 | 
            +
                    "weight": 3
         | 
| 676 | 
            +
                },
         | 
| 677 | 
            +
                {
         | 
| 678 | 
            +
                    "name": "stackexchange_title_body/ux.stackexchange.com.jsonl.gz",
         | 
| 679 | 
            +
                    "lines": 29403,
         | 
| 680 | 
            +
                    "weight": 3
         | 
| 681 | 
            +
                },
         | 
| 682 | 
            +
                {
         | 
| 683 | 
            +
                    "name": "stackexchange_TitleBody_Answer/money.stackexchange.com.jsonl.gz",
         | 
| 684 | 
            +
                    "lines": 29404,
         | 
| 685 | 
            +
                    "weight": 3
         | 
| 686 | 
            +
                },
         | 
| 687 | 
            +
                {
         | 
| 688 | 
            +
                    "name": "stackexchange_Title_Answer/money.stackexchange.com.jsonl.gz",
         | 
| 689 | 
            +
                    "lines": 29404,
         | 
| 690 | 
            +
                    "weight": 3
         | 
| 691 | 
            +
                },
         | 
| 692 | 
            +
                {
         | 
| 693 | 
            +
                    "name": "stackexchange_title_body/webapps.stackexchange.com.jsonl.gz",
         | 
| 694 | 
            +
                    "lines": 29697,
         | 
| 695 | 
            +
                    "weight": 3
         | 
| 696 | 
            +
                },
         | 
| 697 | 
            +
                {
         | 
| 698 | 
            +
                    "name": "stackexchange_TitleBody_Answer/cs.stackexchange.com.jsonl.gz",
         | 
| 699 | 
            +
                    "lines": 30010,
         | 
| 700 | 
            +
                    "weight": 3
         | 
| 701 | 
            +
                },
         | 
| 702 | 
            +
                {
         | 
| 703 | 
            +
                    "name": "stackexchange_Title_Answer/cs.stackexchange.com.jsonl.gz",
         | 
| 704 | 
            +
                    "lines": 30010,
         | 
| 705 | 
            +
                    "weight": 3
         | 
| 706 | 
            +
                },
         | 
| 707 | 
            +
                {
         | 
| 708 | 
            +
                    "name": "stackexchange_title_body/graphicdesign.stackexchange.com.jsonl.gz",
         | 
| 709 | 
            +
                    "lines": 30233,
         | 
| 710 | 
            +
                    "weight": 3
         | 
| 711 | 
            +
                },
         | 
| 712 | 
            +
                {
         | 
| 713 | 
            +
                    "name": "stackexchange_TitleBody_Answer/webmasters.stackexchange.com.jsonl.gz",
         | 
| 714 | 
            +
                    "lines": 30370,
         | 
| 715 | 
            +
                    "weight": 3
         | 
| 716 | 
            +
                },
         | 
| 717 | 
            +
                {
         | 
| 718 | 
            +
                    "name": "stackexchange_Title_Answer/webmasters.stackexchange.com.jsonl.gz",
         | 
| 719 | 
            +
                    "lines": 30370,
         | 
| 720 | 
            +
                    "weight": 3
         | 
| 721 | 
            +
                },
         | 
| 722 | 
            +
                {
         | 
| 723 | 
            +
                    "name": "stackexchange_title_body/raspberrypi.stackexchange.com.jsonl.gz",
         | 
| 724 | 
            +
                    "lines": 30625,
         | 
| 725 | 
            +
                    "weight": 3
         | 
| 726 | 
            +
                },
         | 
| 727 | 
            +
                {
         | 
| 728 | 
            +
                    "name": "stackexchange_title_body/money.stackexchange.com.jsonl.gz",
         | 
| 729 | 
            +
                    "lines": 32021,
         | 
| 730 | 
            +
                    "weight": 3
         | 
| 731 | 
            +
                },
         | 
| 732 | 
            +
                {
         | 
| 733 | 
            +
                    "name": "stackexchange_title_body/judaism.stackexchange.com.jsonl.gz",
         | 
| 734 | 
            +
                    "lines": 32028,
         | 
| 735 | 
            +
                    "weight": 3
         | 
| 736 | 
            +
                },
         | 
| 737 | 
            +
                {
         | 
| 738 | 
            +
                    "name": "stackexchange_TitleBody_Answer/academia.stackexchange.com.jsonl.gz",
         | 
| 739 | 
            +
                    "lines": 32137,
         | 
| 740 | 
            +
                    "weight": 3
         | 
| 741 | 
            +
                },
         | 
| 742 | 
            +
                {
         | 
| 743 | 
            +
                    "name": "stackexchange_Title_Answer/academia.stackexchange.com.jsonl.gz",
         | 
| 744 | 
            +
                    "lines": 32137,
         | 
| 745 | 
            +
                    "weight": 3
         | 
| 746 | 
            +
                },
         | 
| 747 | 
            +
                {
         | 
| 748 | 
            +
                    "name": "stackexchange_title_body/ethereum.stackexchange.com.jsonl.gz",
         | 
| 749 | 
            +
                    "lines": 32760,
         | 
| 750 | 
            +
                    "weight": 3
         | 
| 751 | 
            +
                },
         | 
| 752 | 
            +
                {
         | 
| 753 | 
            +
                    "name": "stackexchange_title_body/academia.stackexchange.com.jsonl.gz",
         | 
| 754 | 
            +
                    "lines": 34331,
         | 
| 755 | 
            +
                    "weight": 3
         | 
| 756 | 
            +
                },
         | 
| 757 | 
            +
                {
         | 
| 758 | 
            +
                    "name": "stackexchange_title_body/chemistry.stackexchange.com.jsonl.gz",
         | 
| 759 | 
            +
                    "lines": 34506,
         | 
| 760 | 
            +
                    "weight": 3
         | 
| 761 | 
            +
                },
         | 
| 762 | 
            +
                {
         | 
| 763 | 
            +
                    "name": "stackexchange_title_body/webmasters.stackexchange.com.jsonl.gz",
         | 
| 764 | 
            +
                    "lines": 34559,
         | 
| 765 | 
            +
                    "weight": 3
         | 
| 766 | 
            +
                },
         | 
| 767 | 
            +
                {
         | 
| 768 | 
            +
                    "name": "stackexchange_title_body/meta.stackoverflow.com.jsonl.gz",
         | 
| 769 | 
            +
                    "lines": 36456,
         | 
| 770 | 
            +
                    "weight": 3
         | 
| 771 | 
            +
                },
         | 
| 772 | 
            +
                {
         | 
| 773 | 
            +
                    "name": "stackexchange_TitleBody_Answer/travel.stackexchange.com.jsonl.gz",
         | 
| 774 | 
            +
                    "lines": 36533,
         | 
| 775 | 
            +
                    "weight": 4
         | 
| 776 | 
            +
                },
         | 
| 777 | 
            +
                {
         | 
| 778 | 
            +
                    "name": "stackexchange_Title_Answer/travel.stackexchange.com.jsonl.gz",
         | 
| 779 | 
            +
                    "lines": 36533,
         | 
| 780 | 
            +
                    "weight": 4
         | 
| 781 | 
            +
                },
         | 
| 782 | 
            +
                {
         | 
| 783 | 
            +
                    "name": "stackexchange_TitleBody_Answer/android.stackexchange.com.jsonl.gz",
         | 
| 784 | 
            +
                    "lines": 38077,
         | 
| 785 | 
            +
                    "weight": 4
         | 
| 786 | 
            +
                },
         | 
| 787 | 
            +
                {
         | 
| 788 | 
            +
                    "name": "stackexchange_Title_Answer/android.stackexchange.com.jsonl.gz",
         | 
| 789 | 
            +
                    "lines": 38077,
         | 
| 790 | 
            +
                    "weight": 4
         | 
| 791 | 
            +
                },
         | 
| 792 | 
            +
                {
         | 
| 793 | 
            +
                    "name": "stackexchange_title_body/cs.stackexchange.com.jsonl.gz",
         | 
| 794 | 
            +
                    "lines": 38314,
         | 
| 795 | 
            +
                    "weight": 4
         | 
| 796 | 
            +
                },
         | 
| 797 | 
            +
                {
         | 
| 798 | 
            +
                    "name": "stackexchange_TitleBody_Answer/gamedev.stackexchange.com.jsonl.gz",
         | 
| 799 | 
            +
                    "lines": 40154,
         | 
| 800 | 
            +
                    "weight": 4
         | 
| 801 | 
            +
                },
         | 
| 802 | 
            +
                {
         | 
| 803 | 
            +
                    "name": "stackexchange_Title_Answer/gamedev.stackexchange.com.jsonl.gz",
         | 
| 804 | 
            +
                    "lines": 40154,
         | 
| 805 | 
            +
                    "weight": 4
         | 
| 806 | 
            +
                },
         | 
| 807 | 
            +
                {
         | 
| 808 | 
            +
                    "name": "stackexchange_TitleBody_Answer/rpg.stackexchange.com.jsonl.gz",
         | 
| 809 | 
            +
                    "lines": 40435,
         | 
| 810 | 
            +
                    "weight": 4
         | 
| 811 | 
            +
                },
         | 
| 812 | 
            +
                {
         | 
| 813 | 
            +
                    "name": "stackexchange_Title_Answer/rpg.stackexchange.com.jsonl.gz",
         | 
| 814 | 
            +
                    "lines": 40435,
         | 
| 815 | 
            +
                    "weight": 4
         | 
| 816 | 
            +
                },
         | 
| 817 | 
            +
                {
         | 
| 818 | 
            +
                    "name": "stackexchange_title_body/travel.stackexchange.com.jsonl.gz",
         | 
| 819 | 
            +
                    "lines": 41227,
         | 
| 820 | 
            +
                    "weight": 4
         | 
| 821 | 
            +
                },
         | 
| 822 | 
            +
                {
         | 
| 823 | 
            +
                    "name": "stackexchange_TitleBody_Answer/codereview.stackexchange.com.jsonl.gz",
         | 
| 824 | 
            +
                    "lines": 41748,
         | 
| 825 | 
            +
                    "weight": 4
         | 
| 826 | 
            +
                },
         | 
| 827 | 
            +
                {
         | 
| 828 | 
            +
                    "name": "stackexchange_Title_Answer/codereview.stackexchange.com.jsonl.gz",
         | 
| 829 | 
            +
                    "lines": 41748,
         | 
| 830 | 
            +
                    "weight": 4
         | 
| 831 | 
            +
                },
         | 
| 832 | 
            +
                {
         | 
| 833 | 
            +
                    "name": "stackexchange_title_body/rpg.stackexchange.com.jsonl.gz",
         | 
| 834 | 
            +
                    "lines": 42303,
         | 
| 835 | 
            +
                    "weight": 4
         | 
| 836 | 
            +
                },
         | 
| 837 | 
            +
                {
         | 
| 838 | 
            +
                    "name": "stackexchange_title_body/codereview.stackexchange.com.jsonl.gz",
         | 
| 839 | 
            +
                    "lines": 45765,
         | 
| 840 | 
            +
                    "weight": 4
         | 
| 841 | 
            +
                },
         | 
| 842 | 
            +
                {
         | 
| 843 | 
            +
                    "name": "stackexchange_title_body/gamedev.stackexchange.com.jsonl.gz",
         | 
| 844 | 
            +
                    "lines": 46485,
         | 
| 845 | 
            +
                    "weight": 4
         | 
| 846 | 
            +
                },
         | 
| 847 | 
            +
                {
         | 
| 848 | 
            +
                    "name": "stackexchange_TitleBody_Answer/softwareengineering.stackexchange.com.jsonl.gz",
         | 
| 849 | 
            +
                    "lines": 51326,
         | 
| 850 | 
            +
                    "weight": 5
         | 
| 851 | 
            +
                },
         | 
| 852 | 
            +
                {
         | 
| 853 | 
            +
                    "name": "stackexchange_Title_Answer/softwareengineering.stackexchange.com.jsonl.gz",
         | 
| 854 | 
            +
                    "lines": 51326,
         | 
| 855 | 
            +
                    "weight": 5
         | 
| 856 | 
            +
                },
         | 
| 857 | 
            +
                {
         | 
| 858 | 
            +
                    "name": "stackexchange_TitleBody_Answer/security.stackexchange.com.jsonl.gz",
         | 
| 859 | 
            +
                    "lines": 51355,
         | 
| 860 | 
            +
                    "weight": 5
         | 
| 861 | 
            +
                },
         | 
| 862 | 
            +
                {
         | 
| 863 | 
            +
                    "name": "stackexchange_Title_Answer/security.stackexchange.com.jsonl.gz",
         | 
| 864 | 
            +
                    "lines": 51355,
         | 
| 865 | 
            +
                    "weight": 5
         | 
| 866 | 
            +
                },
         | 
| 867 | 
            +
                {
         | 
| 868 | 
            +
                    "name": "stackexchange_title_body/android.stackexchange.com.jsonl.gz",
         | 
| 869 | 
            +
                    "lines": 51608,
         | 
| 870 | 
            +
                    "weight": 5
         | 
| 871 | 
            +
                },
         | 
| 872 | 
            +
                {
         | 
| 873 | 
            +
                    "name": "stackexchange_TitleBody_Answer/diy.stackexchange.com.jsonl.gz",
         | 
| 874 | 
            +
                    "lines": 52896,
         | 
| 875 | 
            +
                    "weight": 5
         | 
| 876 | 
            +
                },
         | 
| 877 | 
            +
                {
         | 
| 878 | 
            +
                    "name": "stackexchange_Title_Answer/diy.stackexchange.com.jsonl.gz",
         | 
| 879 | 
            +
                    "lines": 52896,
         | 
| 880 | 
            +
                    "weight": 5
         | 
| 881 | 
            +
                },
         | 
| 882 | 
            +
                {
         | 
| 883 | 
            +
                    "name": "stackexchange_title_body/softwareengineering.stackexchange.com.jsonl.gz",
         | 
| 884 | 
            +
                    "lines": 53942,
         | 
| 885 | 
            +
                    "weight": 5
         | 
| 886 | 
            +
                },
         | 
| 887 | 
            +
                {
         | 
| 888 | 
            +
                    "name": "stackexchange_TitleBody_Answer/blender.stackexchange.com.jsonl.gz",
         | 
| 889 | 
            +
                    "lines": 54153,
         | 
| 890 | 
            +
                    "weight": 5
         | 
| 891 | 
            +
                },
         | 
| 892 | 
            +
                {
         | 
| 893 | 
            +
                    "name": "stackexchange_Title_Answer/blender.stackexchange.com.jsonl.gz",
         | 
| 894 | 
            +
                    "lines": 54153,
         | 
| 895 | 
            +
                    "weight": 5
         | 
| 896 | 
            +
                },
         | 
| 897 | 
            +
                {
         | 
| 898 | 
            +
                    "name": "stackexchange_TitleBody_Answer/scifi.stackexchange.com.jsonl.gz",
         | 
| 899 | 
            +
                    "lines": 54805,
         | 
| 900 | 
            +
                    "weight": 5
         | 
| 901 | 
            +
                },
         | 
| 902 | 
            +
                {
         | 
| 903 | 
            +
                    "name": "stackexchange_Title_Answer/scifi.stackexchange.com.jsonl.gz",
         | 
| 904 | 
            +
                    "lines": 54805,
         | 
| 905 | 
            +
                    "weight": 5
         | 
| 906 | 
            +
                },
         | 
| 907 | 
            +
                {
         | 
| 908 | 
            +
                    "name": "stackexchange_title_body/security.stackexchange.com.jsonl.gz",
         | 
| 909 | 
            +
                    "lines": 58000,
         | 
| 910 | 
            +
                    "weight": 5
         | 
| 911 | 
            +
                },
         | 
| 912 | 
            +
                {
         | 
| 913 | 
            +
                    "name": "stackexchange_TitleBody_Answer/mathematica.stackexchange.com.jsonl.gz",
         | 
| 914 | 
            +
                    "lines": 59895,
         | 
| 915 | 
            +
                    "weight": 5
         | 
| 916 | 
            +
                },
         | 
| 917 | 
            +
                {
         | 
| 918 | 
            +
                    "name": "stackexchange_Title_Answer/mathematica.stackexchange.com.jsonl.gz",
         | 
| 919 | 
            +
                    "lines": 59895,
         | 
| 920 | 
            +
                    "weight": 5
         | 
| 921 | 
            +
                },
         | 
| 922 | 
            +
                {
         | 
| 923 | 
            +
                    "name": "stackexchange_title_body/diy.stackexchange.com.jsonl.gz",
         | 
| 924 | 
            +
                    "lines": 60083,
         | 
| 925 | 
            +
                    "weight": 5
         | 
| 926 | 
            +
                },
         | 
| 927 | 
            +
                {
         | 
| 928 | 
            +
                    "name": "stackexchange_TitleBody_Answer/meta.stackexchange.com.jsonl.gz",
         | 
| 929 | 
            +
                    "lines": 60744,
         | 
| 930 | 
            +
                    "weight": 5
         | 
| 931 | 
            +
                },
         | 
| 932 | 
            +
                {
         | 
| 933 | 
            +
                    "name": "stackexchange_Title_Answer/meta.stackexchange.com.jsonl.gz",
         | 
| 934 | 
            +
                    "lines": 60744,
         | 
| 935 | 
            +
                    "weight": 5
         | 
| 936 | 
            +
                },
         | 
| 937 | 
            +
                {
         | 
| 938 | 
            +
                    "name": "stackexchange_title_body/scifi.stackexchange.com.jsonl.gz",
         | 
| 939 | 
            +
                    "lines": 61528,
         | 
| 940 | 
            +
                    "weight": 6
         | 
| 941 | 
            +
                },
         | 
| 942 | 
            +
                {
         | 
| 943 | 
            +
                    "name": "stackexchange_TitleBody_Answer/drupal.stackexchange.com.jsonl.gz",
         | 
| 944 | 
            +
                    "lines": 67817,
         | 
| 945 | 
            +
                    "weight": 6
         | 
| 946 | 
            +
                },
         | 
| 947 | 
            +
                {
         | 
| 948 | 
            +
                    "name": "stackexchange_Title_Answer/drupal.stackexchange.com.jsonl.gz",
         | 
| 949 | 
            +
                    "lines": 67817,
         | 
| 950 | 
            +
                    "weight": 6
         | 
| 951 | 
            +
                },
         | 
| 952 | 
            +
                {
         | 
| 953 | 
            +
                    "name": "stackexchange_TitleBody_Answer/dba.stackexchange.com.jsonl.gz",
         | 
| 954 | 
            +
                    "lines": 71449,
         | 
| 955 | 
            +
                    "weight": 6
         | 
| 956 | 
            +
                },
         | 
| 957 | 
            +
                {
         | 
| 958 | 
            +
                    "name": "stackexchange_Title_Answer/dba.stackexchange.com.jsonl.gz",
         | 
| 959 | 
            +
                    "lines": 71449,
         | 
| 960 | 
            +
                    "weight": 6
         | 
| 961 | 
            +
                },
         | 
| 962 | 
            +
                {
         | 
| 963 | 
            +
                    "name": "stackexchange_title_body/mathematica.stackexchange.com.jsonl.gz",
         | 
| 964 | 
            +
                    "lines": 73131,
         | 
| 965 | 
            +
                    "weight": 7
         | 
| 966 | 
            +
                },
         | 
| 967 | 
            +
                {
         | 
| 968 | 
            +
                    "name": "stackexchange_TitleBody_Answer/ell.stackexchange.com.jsonl.gz",
         | 
| 969 | 
            +
                    "lines": 77892,
         | 
| 970 | 
            +
                    "weight": 7
         | 
| 971 | 
            +
                },
         | 
| 972 | 
            +
                {
         | 
| 973 | 
            +
                    "name": "stackexchange_Title_Answer/ell.stackexchange.com.jsonl.gz",
         | 
| 974 | 
            +
                    "lines": 77892,
         | 
| 975 | 
            +
                    "weight": 7
         | 
| 976 | 
            +
                },
         | 
| 977 | 
            +
                {
         | 
| 978 | 
            +
                    "name": "stackexchange_TitleBody_Answer/magento.stackexchange.com.jsonl.gz",
         | 
| 979 | 
            +
                    "lines": 79241,
         | 
| 980 | 
            +
                    "weight": 7
         | 
| 981 | 
            +
                },
         | 
| 982 | 
            +
                {
         | 
| 983 | 
            +
                    "name": "stackexchange_Title_Answer/magento.stackexchange.com.jsonl.gz",
         | 
| 984 | 
            +
                    "lines": 79241,
         | 
| 985 | 
            +
                    "weight": 7
         | 
| 986 | 
            +
                },
         | 
| 987 | 
            +
                {
         | 
| 988 | 
            +
                    "name": "stackexchange_title_body/drupal.stackexchange.com.jsonl.gz",
         | 
| 989 | 
            +
                    "lines": 79717,
         | 
| 990 | 
            +
                    "weight": 7
         | 
| 991 | 
            +
                },
         | 
| 992 | 
            +
                {
         | 
| 993 | 
            +
                    "name": "stackexchange_TitleBody_Answer/sharepoint.stackexchange.com.jsonl.gz",
         | 
| 994 | 
            +
                    "lines": 80420,
         | 
| 995 | 
            +
                    "weight": 7
         | 
| 996 | 
            +
                },
         | 
| 997 | 
            +
                {
         | 
| 998 | 
            +
                    "name": "stackexchange_Title_Answer/sharepoint.stackexchange.com.jsonl.gz",
         | 
| 999 | 
            +
                    "lines": 80420,
         | 
| 1000 | 
            +
                    "weight": 7
         | 
| 1001 | 
            +
                },
         | 
| 1002 | 
            +
                {
         | 
| 1003 | 
            +
                    "name": "stackexchange_title_body/blender.stackexchange.com.jsonl.gz",
         | 
| 1004 | 
            +
                    "lines": 80766,
         | 
| 1005 | 
            +
                    "weight": 7
         | 
| 1006 | 
            +
                },
         | 
| 1007 | 
            +
                {
         | 
| 1008 | 
            +
                    "name": "stackexchange_title_body/dba.stackexchange.com.jsonl.gz",
         | 
| 1009 | 
            +
                    "lines": 81871,
         | 
| 1010 | 
            +
                    "weight": 7
         | 
| 1011 | 
            +
                },
         | 
| 1012 | 
            +
                {
         | 
| 1013 | 
            +
                    "name": "stackexchange_TitleBody_Answer/gaming.stackexchange.com.jsonl.gz",
         | 
| 1014 | 
            +
                    "lines": 82887,
         | 
| 1015 | 
            +
                    "weight": 7
         | 
| 1016 | 
            +
                },
         | 
| 1017 | 
            +
                {
         | 
| 1018 | 
            +
                    "name": "stackexchange_Title_Answer/gaming.stackexchange.com.jsonl.gz",
         | 
| 1019 | 
            +
                    "lines": 82887,
         | 
| 1020 | 
            +
                    "weight": 7
         | 
| 1021 | 
            +
                },
         | 
| 1022 | 
            +
                {
         | 
| 1023 | 
            +
                    "name": "stackexchange_title_body/ell.stackexchange.com.jsonl.gz",
         | 
| 1024 | 
            +
                    "lines": 83271,
         | 
| 1025 | 
            +
                    "weight": 7
         | 
| 1026 | 
            +
                },
         | 
| 1027 | 
            +
                {
         | 
| 1028 | 
            +
                    "name": "stackexchange_title_body/meta.stackexchange.com.jsonl.gz",
         | 
| 1029 | 
            +
                    "lines": 83510,
         | 
| 1030 | 
            +
                    "weight": 7
         | 
| 1031 | 
            +
                },
         | 
| 1032 | 
            +
                {
         | 
| 1033 | 
            +
                    "name": "stackexchange_TitleBody_Answer/wordpress.stackexchange.com.jsonl.gz",
         | 
| 1034 | 
            +
                    "lines": 83621,
         | 
| 1035 | 
            +
                    "weight": 7
         | 
| 1036 | 
            +
                },
         | 
| 1037 | 
            +
                {
         | 
| 1038 | 
            +
                    "name": "stackexchange_Title_Answer/wordpress.stackexchange.com.jsonl.gz",
         | 
| 1039 | 
            +
                    "lines": 83621,
         | 
| 1040 | 
            +
                    "weight": 7
         | 
| 1041 | 
            +
                },
         | 
| 1042 | 
            +
                {
         | 
| 1043 | 
            +
                    "name": "stackexchange_TitleBody_Answer/mathoverflow.net.jsonl.gz",
         | 
| 1044 | 
            +
                    "lines": 85289,
         | 
| 1045 | 
            +
                    "weight": 8
         | 
| 1046 | 
            +
                },
         | 
| 1047 | 
            +
                {
         | 
| 1048 | 
            +
                    "name": "stackexchange_Title_Answer/mathoverflow.net.jsonl.gz",
         | 
| 1049 | 
            +
                    "lines": 85289,
         | 
| 1050 | 
            +
                    "weight": 8
         | 
| 1051 | 
            +
                },
         | 
| 1052 | 
            +
                {
         | 
| 1053 | 
            +
                    "name": "stackexchange_TitleBody_Answer/salesforce.stackexchange.com.jsonl.gz",
         | 
| 1054 | 
            +
                    "lines": 87272,
         | 
| 1055 | 
            +
                    "weight": 8
         | 
| 1056 | 
            +
                },
         | 
| 1057 | 
            +
                {
         | 
| 1058 | 
            +
                    "name": "stackexchange_Title_Answer/salesforce.stackexchange.com.jsonl.gz",
         | 
| 1059 | 
            +
                    "lines": 87272,
         | 
| 1060 | 
            +
                    "weight": 8
         | 
| 1061 | 
            +
                },
         | 
| 1062 | 
            +
                {
         | 
| 1063 | 
            +
                    "name": "stackexchange_title_body/gaming.stackexchange.com.jsonl.gz",
         | 
| 1064 | 
            +
                    "lines": 88912,
         | 
| 1065 | 
            +
                    "weight": 8
         | 
| 1066 | 
            +
                },
         | 
| 1067 | 
            +
                {
         | 
| 1068 | 
            +
                    "name": "stackexchange_TitleBody_Answer/apple.stackexchange.com.jsonl.gz",
         | 
| 1069 | 
            +
                    "lines": 92487,
         | 
| 1070 | 
            +
                    "weight": 8
         | 
| 1071 | 
            +
                },
         | 
| 1072 | 
            +
                {
         | 
| 1073 | 
            +
                    "name": "stackexchange_Title_Answer/apple.stackexchange.com.jsonl.gz",
         | 
| 1074 | 
            +
                    "lines": 92487,
         | 
| 1075 | 
            +
                    "weight": 8
         | 
| 1076 | 
            +
                },
         | 
| 1077 | 
            +
                {
         | 
| 1078 | 
            +
                    "name": "stackexchange_title_body/sharepoint.stackexchange.com.jsonl.gz",
         | 
| 1079 | 
            +
                    "lines": 94011,
         | 
| 1080 | 
            +
                    "weight": 8
         | 
| 1081 | 
            +
                },
         | 
| 1082 | 
            +
                {
         | 
| 1083 | 
            +
                    "name": "stackexchange_title_body/magento.stackexchange.com.jsonl.gz",
         | 
| 1084 | 
            +
                    "lines": 99991,
         | 
| 1085 | 
            +
                    "weight": 9
         | 
| 1086 | 
            +
                },
         | 
| 1087 | 
            +
                {
         | 
| 1088 | 
            +
                    "name": "stackexchange_TitleBody_Answer/gis.stackexchange.com.jsonl.gz",
         | 
| 1089 | 
            +
                    "lines": 100254,
         | 
| 1090 | 
            +
                    "weight": 9
         | 
| 1091 | 
            +
                },
         | 
| 1092 | 
            +
                {
         | 
| 1093 | 
            +
                    "name": "stackexchange_Title_Answer/gis.stackexchange.com.jsonl.gz",
         | 
| 1094 | 
            +
                    "lines": 100254,
         | 
| 1095 | 
            +
                    "weight": 9
         | 
| 1096 | 
            +
                },
         | 
| 1097 | 
            +
                {
         | 
| 1098 | 
            +
                    "name": "stackexchange_title_body/wordpress.stackexchange.com.jsonl.gz",
         | 
| 1099 | 
            +
                    "lines": 100474,
         | 
| 1100 | 
            +
                    "weight": 9
         | 
| 1101 | 
            +
                },
         | 
| 1102 | 
            +
                {
         | 
| 1103 | 
            +
                    "name": "stackexchange_TitleBody_Answer/english.stackexchange.com.jsonl.gz",
         | 
| 1104 | 
            +
                    "lines": 100640,
         | 
| 1105 | 
            +
                    "weight": 9
         | 
| 1106 | 
            +
                },
         | 
| 1107 | 
            +
                {
         | 
| 1108 | 
            +
                    "name": "stackexchange_Title_Answer/english.stackexchange.com.jsonl.gz",
         | 
| 1109 | 
            +
                    "lines": 100640,
         | 
| 1110 | 
            +
                    "weight": 9
         | 
| 1111 | 
            +
                },
         | 
| 1112 | 
            +
                {
         | 
| 1113 | 
            +
                    "name": "stackexchange_title_body/salesforce.stackexchange.com.jsonl.gz",
         | 
| 1114 | 
            +
                    "lines": 105260,
         | 
| 1115 | 
            +
                    "weight": 9
         | 
| 1116 | 
            +
                },
         | 
| 1117 | 
            +
                {
         | 
| 1118 | 
            +
                    "name": "stackexchange_title_body/english.stackexchange.com.jsonl.gz",
         | 
| 1119 | 
            +
                    "lines": 109522,
         | 
| 1120 | 
            +
                    "weight": 10
         | 
| 1121 | 
            +
                },
         | 
| 1122 | 
            +
                {
         | 
| 1123 | 
            +
                    "name": "stackexchange_title_body/apple.stackexchange.com.jsonl.gz",
         | 
| 1124 | 
            +
                    "lines": 110622,
         | 
| 1125 | 
            +
                    "weight": 10
         | 
| 1126 | 
            +
                },
         | 
| 1127 | 
            +
                {
         | 
| 1128 | 
            +
                    "name": "stackexchange_TitleBody_Answer/stats.stackexchange.com.jsonl.gz",
         | 
| 1129 | 
            +
                    "lines": 115679,
         | 
| 1130 | 
            +
                    "weight": 10
         | 
| 1131 | 
            +
                },
         | 
| 1132 | 
            +
                {
         | 
| 1133 | 
            +
                    "name": "stackexchange_Title_Answer/stats.stackexchange.com.jsonl.gz",
         | 
| 1134 | 
            +
                    "lines": 115679,
         | 
| 1135 | 
            +
                    "weight": 10
         | 
| 1136 | 
            +
                },
         | 
| 1137 | 
            +
                {
         | 
| 1138 | 
            +
                    "name": "stackexchange_title_body/mathoverflow.net.jsonl.gz",
         | 
| 1139 | 
            +
                    "lines": 120851,
         | 
| 1140 | 
            +
                    "weight": 10
         | 
| 1141 | 
            +
                },
         | 
| 1142 | 
            +
                {
         | 
| 1143 | 
            +
                    "name": "stackexchange_TitleBody_Answer/electronics.stackexchange.com.jsonl.gz",
         | 
| 1144 | 
            +
                    "lines": 129494,
         | 
| 1145 | 
            +
                    "weight": 11
         | 
| 1146 | 
            +
                },
         | 
| 1147 | 
            +
                {
         | 
| 1148 | 
            +
                    "name": "stackexchange_Title_Answer/electronics.stackexchange.com.jsonl.gz",
         | 
| 1149 | 
            +
                    "lines": 129494,
         | 
| 1150 | 
            +
                    "weight": 11
         | 
| 1151 | 
            +
                },
         | 
| 1152 | 
            +
                {
         | 
| 1153 | 
            +
                    "name": "stackexchange_title_body/gis.stackexchange.com.jsonl.gz",
         | 
| 1154 | 
            +
                    "lines": 131000,
         | 
| 1155 | 
            +
                    "weight": 11
         | 
| 1156 | 
            +
                },
         | 
| 1157 | 
            +
                {
         | 
| 1158 | 
            +
                    "name": "stackexchange_TitleBody_Answer/physics.stackexchange.com.jsonl.gz",
         | 
| 1159 | 
            +
                    "lines": 141230,
         | 
| 1160 | 
            +
                    "weight": 12
         | 
| 1161 | 
            +
                },
         | 
| 1162 | 
            +
                {
         | 
| 1163 | 
            +
                    "name": "stackexchange_Title_Answer/physics.stackexchange.com.jsonl.gz",
         | 
| 1164 | 
            +
                    "lines": 141230,
         | 
| 1165 | 
            +
                    "weight": 12
         | 
| 1166 | 
            +
                },
         | 
| 1167 | 
            +
                {
         | 
| 1168 | 
            +
                    "name": "stackexchange_title_body/electronics.stackexchange.com.jsonl.gz",
         | 
| 1169 | 
            +
                    "lines": 143582,
         | 
| 1170 | 
            +
                    "weight": 12
         | 
| 1171 | 
            +
                },
         | 
| 1172 | 
            +
                {
         | 
| 1173 | 
            +
                    "name": "stackexchange_TitleBody_Answer/unix.stackexchange.com.jsonl.gz",
         | 
| 1174 | 
            +
                    "lines": 155414,
         | 
| 1175 | 
            +
                    "weight": 13
         | 
| 1176 | 
            +
                },
         | 
| 1177 | 
            +
                {
         | 
| 1178 | 
            +
                    "name": "stackexchange_Title_Answer/unix.stackexchange.com.jsonl.gz",
         | 
| 1179 | 
            +
                    "lines": 155414,
         | 
| 1180 | 
            +
                    "weight": 13
         | 
| 1181 | 
            +
                },
         | 
| 1182 | 
            +
                {
         | 
| 1183 | 
            +
                    "name": "stackexchange_TitleBody_Answer/tex.stackexchange.com.jsonl.gz",
         | 
| 1184 | 
            +
                    "lines": 171628,
         | 
| 1185 | 
            +
                    "weight": 15
         | 
| 1186 | 
            +
                },
         | 
| 1187 | 
            +
                {
         | 
| 1188 | 
            +
                    "name": "stackexchange_Title_Answer/tex.stackexchange.com.jsonl.gz",
         | 
| 1189 | 
            +
                    "lines": 171628,
         | 
| 1190 | 
            +
                    "weight": 15
         | 
| 1191 | 
            +
                },
         | 
| 1192 | 
            +
                {
         | 
| 1193 | 
            +
                    "name": "stackexchange_title_body/physics.stackexchange.com.jsonl.gz",
         | 
| 1194 | 
            +
                    "lines": 173307,
         | 
| 1195 | 
            +
                    "weight": 15
         | 
| 1196 | 
            +
                },
         | 
| 1197 | 
            +
                {
         | 
| 1198 | 
            +
                    "name": "stackexchange_title_body/stats.stackexchange.com.jsonl.gz",
         | 
| 1199 | 
            +
                    "lines": 173466,
         | 
| 1200 | 
            +
                    "weight": 15
         | 
| 1201 | 
            +
                },
         | 
| 1202 | 
            +
                {
         | 
| 1203 | 
            +
                    "name": "stackexchange_title_body/unix.stackexchange.com.jsonl.gz",
         | 
| 1204 | 
            +
                    "lines": 185997,
         | 
| 1205 | 
            +
                    "weight": 16
         | 
| 1206 | 
            +
                },
         | 
| 1207 | 
            +
                {
         | 
| 1208 | 
            +
                    "name": "stackexchange_title_body/tex.stackexchange.com.jsonl.gz",
         | 
| 1209 | 
            +
                    "lines": 202954,
         | 
| 1210 | 
            +
                    "weight": 17
         | 
| 1211 | 
            +
                },
         | 
| 1212 | 
            +
                {
         | 
| 1213 | 
            +
                    "name": "TriviaQA_pairs.jsonl.gz",
         | 
| 1214 | 
            +
                    "lines": 73346,
         | 
| 1215 | 
            +
                    "weight": 19
         | 
| 1216 | 
            +
                },
         | 
| 1217 | 
            +
                {
         | 
| 1218 | 
            +
                    "name": "stackexchange_TitleBody_Answer/serverfault.com.jsonl.gz",
         | 
| 1219 | 
            +
                    "lines": 238507,
         | 
| 1220 | 
            +
                    "weight": 20
         | 
| 1221 | 
            +
                },
         | 
| 1222 | 
            +
                {
         | 
| 1223 | 
            +
                    "name": "stackexchange_Title_Answer/serverfault.com.jsonl.gz",
         | 
| 1224 | 
            +
                    "lines": 238507,
         | 
| 1225 | 
            +
                    "weight": 20
         | 
| 1226 | 
            +
                },
         | 
| 1227 | 
            +
                {
         | 
| 1228 | 
            +
                    "name": "stackexchange_duplicate_questions_title-body_title-body.jsonl.gz",
         | 
| 1229 | 
            +
                    "lines": 250460,
         | 
| 1230 | 
            +
                    "weight": 21
         | 
| 1231 | 
            +
                },
         | 
| 1232 | 
            +
                {
         | 
| 1233 | 
            +
                    "name": "stackexchange_duplicate_questions_body_body.jsonl.gz",
         | 
| 1234 | 
            +
                    "lines": 250519,
         | 
| 1235 | 
            +
                    "weight": 21
         | 
| 1236 | 
            +
                },
         | 
| 1237 | 
            +
                {
         | 
| 1238 | 
            +
                    "name": "squad_pairs.jsonl.gz",
         | 
| 1239 | 
            +
                    "lines": 87599,
         | 
| 1240 | 
            +
                    "weight": 22
         | 
| 1241 | 
            +
                },
         | 
| 1242 | 
            +
                {
         | 
| 1243 | 
            +
                    "name": "stackexchange_TitleBody_Answer/askubuntu.com.jsonl.gz",
         | 
| 1244 | 
            +
                    "lines": 267135,
         | 
| 1245 | 
            +
                    "weight": 22
         | 
| 1246 | 
            +
                },
         | 
| 1247 | 
            +
                {
         | 
| 1248 | 
            +
                    "name": "stackexchange_Title_Answer/askubuntu.com.jsonl.gz",
         | 
| 1249 | 
            +
                    "lines": 267135,
         | 
| 1250 | 
            +
                    "weight": 22
         | 
| 1251 | 
            +
                },
         | 
| 1252 | 
            +
                {
         | 
| 1253 | 
            +
                    "name": "stackexchange_title_body/serverfault.com.jsonl.gz",
         | 
| 1254 | 
            +
                    "lines": 270904,
         | 
| 1255 | 
            +
                    "weight": 23
         | 
| 1256 | 
            +
                },
         | 
| 1257 | 
            +
                {
         | 
| 1258 | 
            +
                    "name": "NQ-train_pairs.jsonl.gz",
         | 
| 1259 | 
            +
                    "lines": 100231,
         | 
| 1260 | 
            +
                    "weight": 25
         | 
| 1261 | 
            +
                },
         | 
| 1262 | 
            +
                {
         | 
| 1263 | 
            +
                    "name": "SimpleWiki.jsonl.gz",
         | 
| 1264 | 
            +
                    "lines": 102225,
         | 
| 1265 | 
            +
                    "weight": 26
         | 
| 1266 | 
            +
                },
         | 
| 1267 | 
            +
                {
         | 
| 1268 | 
            +
                    "name": "quora_duplicates_triplets.jsonl.gz",
         | 
| 1269 | 
            +
                    "lines": 103663,
         | 
| 1270 | 
            +
                    "weight": 26
         | 
| 1271 | 
            +
                },
         | 
| 1272 | 
            +
                {
         | 
| 1273 | 
            +
                    "name": "stackexchange_duplicate_questions_title_title.jsonl.gz",
         | 
| 1274 | 
            +
                    "lines": 304525,
         | 
| 1275 | 
            +
                    "weight": 26
         | 
| 1276 | 
            +
                },
         | 
| 1277 | 
            +
                {
         | 
| 1278 | 
            +
                    "name": "altlex.jsonl.gz",
         | 
| 1279 | 
            +
                    "lines": 112696,
         | 
| 1280 | 
            +
                    "weight": 28
         | 
| 1281 | 
            +
                },
         | 
| 1282 | 
            +
                {
         | 
| 1283 | 
            +
                    "name": "stackexchange_title_body/askubuntu.com.jsonl.gz",
         | 
| 1284 | 
            +
                    "lines": 347925,
         | 
| 1285 | 
            +
                    "weight": 29
         | 
| 1286 | 
            +
                },
         | 
| 1287 | 
            +
                {
         | 
| 1288 | 
            +
                    "name": "stackexchange_TitleBody_Answer/superuser.com.jsonl.gz",
         | 
| 1289 | 
            +
                    "lines": 352610,
         | 
| 1290 | 
            +
                    "weight": 30
         | 
| 1291 | 
            +
                },
         | 
| 1292 | 
            +
                {
         | 
| 1293 | 
            +
                    "name": "stackexchange_Title_Answer/superuser.com.jsonl.gz",
         | 
| 1294 | 
            +
                    "lines": 352610,
         | 
| 1295 | 
            +
                    "weight": 30
         | 
| 1296 | 
            +
                },
         | 
| 1297 | 
            +
                {
         | 
| 1298 | 
            +
                    "name": "wikihow.jsonl.gz",
         | 
| 1299 | 
            +
                    "lines": 128542,
         | 
| 1300 | 
            +
                    "weight": 32
         | 
| 1301 | 
            +
                },
         | 
| 1302 | 
            +
                {
         | 
| 1303 | 
            +
                    "name": "stackexchange_title_body/superuser.com.jsonl.gz",
         | 
| 1304 | 
            +
                    "lines": 435463,
         | 
| 1305 | 
            +
                    "weight": 36
         | 
| 1306 | 
            +
                },
         | 
| 1307 | 
            +
                {
         | 
| 1308 | 
            +
                    "name": "stackexchange_title_body/small_stackexchanges.jsonl.gz",
         | 
| 1309 | 
            +
                    "lines": 448146,
         | 
| 1310 | 
            +
                    "weight": 37
         | 
| 1311 | 
            +
                },
         | 
| 1312 | 
            +
                {
         | 
| 1313 | 
            +
                    "name": "stackexchange_TitleBody_Answer/small_stackexchanges.jsonl.gz",
         | 
| 1314 | 
            +
                    "lines": 460256,
         | 
| 1315 | 
            +
                    "weight": 38
         | 
| 1316 | 
            +
                },
         | 
| 1317 | 
            +
                {
         | 
| 1318 | 
            +
                    "name": "stackexchange_Title_Answer/small_stackexchanges.jsonl.gz",
         | 
| 1319 | 
            +
                    "lines": 460256,
         | 
| 1320 | 
            +
                    "weight": 38
         | 
| 1321 | 
            +
                },
         | 
| 1322 | 
            +
                {
         | 
| 1323 | 
            +
                    "name": "sentence-compression.jsonl.gz",
         | 
| 1324 | 
            +
                    "lines": 180000,
         | 
| 1325 | 
            +
                    "weight": 45
         | 
| 1326 | 
            +
                },
         | 
| 1327 | 
            +
                {
         | 
| 1328 | 
            +
                    "name": "AllNLI.jsonl.gz",
         | 
| 1329 | 
            +
                    "lines": 277230,
         | 
| 1330 | 
            +
                    "weight": 69
         | 
| 1331 | 
            +
                },
         | 
| 1332 | 
            +
                {
         | 
| 1333 | 
            +
                    "name": "eli5_question_answer.jsonl.gz",
         | 
| 1334 | 
            +
                    "lines": 325475,
         | 
| 1335 | 
            +
                    "weight": 81
         | 
| 1336 | 
            +
                },
         | 
| 1337 | 
            +
                {
         | 
| 1338 | 
            +
                    "name": "reddit/reddit_2015.jsonl.gz",
         | 
| 1339 | 
            +
                    "lines": 135108166,
         | 
| 1340 | 
            +
                    "weight": 82
         | 
| 1341 | 
            +
                },
         | 
| 1342 | 
            +
                {
         | 
| 1343 | 
            +
                    "name": "reddit/reddit_2016.jsonl.gz",
         | 
| 1344 | 
            +
                    "lines": 159164386,
         | 
| 1345 | 
            +
                    "weight": 82
         | 
| 1346 | 
            +
                },
         | 
| 1347 | 
            +
                {
         | 
| 1348 | 
            +
                    "name": "reddit/reddit_2017.jsonl.gz",
         | 
| 1349 | 
            +
                    "lines": 191485219,
         | 
| 1350 | 
            +
                    "weight": 82
         | 
| 1351 | 
            +
                },
         | 
| 1352 | 
            +
                {
         | 
| 1353 | 
            +
                    "name": "reddit/reddit_2018.jsonl.gz",
         | 
| 1354 | 
            +
                    "lines": 240726659,
         | 
| 1355 | 
            +
                    "weight": 82
         | 
| 1356 | 
            +
                },
         | 
| 1357 | 
            +
                {
         | 
| 1358 | 
            +
                    "name": "stackexchange_TitleBody_Answer/math.stackexchange.com.jsonl.gz",
         | 
| 1359 | 
            +
                    "lines": 1100953,
         | 
| 1360 | 
            +
                    "weight": 83
         | 
| 1361 | 
            +
                },
         | 
| 1362 | 
            +
                {
         | 
| 1363 | 
            +
                    "name": "stackexchange_Title_Answer/math.stackexchange.com.jsonl.gz",
         | 
| 1364 | 
            +
                    "lines": 1100953,
         | 
| 1365 | 
            +
                    "weight": 83
         | 
| 1366 | 
            +
                },
         | 
| 1367 | 
            +
                {
         | 
| 1368 | 
            +
                    "name": "stackexchange_title_body/math.stackexchange.com.jsonl.gz",
         | 
| 1369 | 
            +
                    "lines": 1338443,
         | 
| 1370 | 
            +
                    "weight": 83
         | 
| 1371 | 
            +
                },
         | 
| 1372 | 
            +
                {
         | 
| 1373 | 
            +
                    "name": "stackexchange_TitleBody_Answer/stackoverflow.com-Posts.jsonl.gz",
         | 
| 1374 | 
            +
                    "lines": 15768211,
         | 
| 1375 | 
            +
                    "weight": 83
         | 
| 1376 | 
            +
                },
         | 
| 1377 | 
            +
                {
         | 
| 1378 | 
            +
                    "name": "stackexchange_Title_Answer/stackoverflow.com-Posts.jsonl.gz",
         | 
| 1379 | 
            +
                    "lines": 15768211,
         | 
| 1380 | 
            +
                    "weight": 83
         | 
| 1381 | 
            +
                },
         | 
| 1382 | 
            +
                {
         | 
| 1383 | 
            +
                    "name": "stackexchange_title_body/stackoverflow.com-Posts.jsonl.gz",
         | 
| 1384 | 
            +
                    "lines": 18562443,
         | 
| 1385 | 
            +
                    "weight": 83
         | 
| 1386 | 
            +
                },
         | 
| 1387 | 
            +
                {
         | 
| 1388 | 
            +
                    "name": "specter_train_triples.jsonl.gz",
         | 
| 1389 | 
            +
                    "lines": 684100,
         | 
| 1390 | 
            +
                    "weight": 84
         | 
| 1391 | 
            +
                },
         | 
| 1392 | 
            +
                {
         | 
| 1393 | 
            +
                    "name": "S2ORC_title_abstract.jsonl.gz",
         | 
| 1394 | 
            +
                    "lines": 41769185,
         | 
| 1395 | 
            +
                    "weight": 123
         | 
| 1396 | 
            +
                },
         | 
| 1397 | 
            +
                {
         | 
| 1398 | 
            +
                    "name": "S2ORC_citation_pairs.jsonl.gz",
         | 
| 1399 | 
            +
                    "lines": 52603982,
         | 
| 1400 | 
            +
                    "weight": 123
         | 
| 1401 | 
            +
                },
         | 
| 1402 | 
            +
                {
         | 
| 1403 | 
            +
                    "name": "PAQ_pairs.jsonl.gz",
         | 
| 1404 | 
            +
                    "lines": 64371441,
         | 
| 1405 | 
            +
                    "weight": 123
         | 
| 1406 | 
            +
                },
         | 
| 1407 | 
            +
                {
         | 
| 1408 | 
            +
                    "name": "WikiAnswers_pairs.jsonl.gz",
         | 
| 1409 | 
            +
                    "lines": 77427422,
         | 
| 1410 | 
            +
                    "weight": 123
         | 
| 1411 | 
            +
                },
         | 
| 1412 | 
            +
                {
         | 
| 1413 | 
            +
                    "name": "S2ORC_citation_pairs_abstract.jsonl.gz",
         | 
| 1414 | 
            +
                    "lines": 116288806,
         | 
| 1415 | 
            +
                    "weight": 123
         | 
| 1416 | 
            +
                },
         | 
| 1417 | 
            +
                {
         | 
| 1418 | 
            +
                    "name": "searchQA_question_top5_snippets_merged.jsonl.gz",
         | 
| 1419 | 
            +
                    "lines": 582261,
         | 
| 1420 | 
            +
                    "weight": 144
         | 
| 1421 | 
            +
                },
         | 
| 1422 | 
            +
                {
         | 
| 1423 | 
            +
                    "name": "yahoo_answers_title_question.jsonl.gz",
         | 
| 1424 | 
            +
                    "lines": 659896,
         | 
| 1425 | 
            +
                    "weight": 163
         | 
| 1426 | 
            +
                },
         | 
| 1427 | 
            +
                {
         | 
| 1428 | 
            +
                    "name": "yahoo_answers_question_answer.jsonl.gz",
         | 
| 1429 | 
            +
                    "lines": 681164,
         | 
| 1430 | 
            +
                    "weight": 169
         | 
| 1431 | 
            +
                },
         | 
| 1432 | 
            +
                {
         | 
| 1433 | 
            +
                    "name": "yahoo_answers_title_answer.jsonl.gz",
         | 
| 1434 | 
            +
                    "lines": 1198260,
         | 
| 1435 | 
            +
                    "weight": 247
         | 
| 1436 | 
            +
                },
         | 
| 1437 | 
            +
                {
         | 
| 1438 | 
            +
                    "name": "amazon-qa-train-pairs.jsonl.gz",
         | 
| 1439 | 
            +
                    "lines": 2448839,
         | 
| 1440 | 
            +
                    "weight": 247
         | 
| 1441 | 
            +
                },
         | 
| 1442 | 
            +
                {
         | 
| 1443 | 
            +
                    "name": "gooaq_pairs.jsonl.gz",
         | 
| 1444 | 
            +
                    "lines": 3012496,
         | 
| 1445 | 
            +
                    "weight": 247
         | 
| 1446 | 
            +
                },
         | 
| 1447 | 
            +
                {
         | 
| 1448 | 
            +
                    "name": "msmarco-query_passage_negative.jsonl.gz",
         | 
| 1449 | 
            +
                    "lines": 9144553,
         | 
| 1450 | 
            +
                    "weight": 247
         | 
| 1451 | 
            +
                }
         | 
| 1452 | 
            +
            ]
         | 
    	
        models/all-MiniLM-L6-v2/model.safetensors
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
         | 
| 3 | 
            +
            size 90868376
         | 
    	
        models/all-MiniLM-L6-v2/modules.json
    ADDED
    
    | @@ -0,0 +1,20 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            [
         | 
| 2 | 
            +
              {
         | 
| 3 | 
            +
                "idx": 0,
         | 
| 4 | 
            +
                "name": "0",
         | 
| 5 | 
            +
                "path": "",
         | 
| 6 | 
            +
                "type": "sentence_transformers.models.Transformer"
         | 
| 7 | 
            +
              },
         | 
| 8 | 
            +
              {
         | 
| 9 | 
            +
                "idx": 1,
         | 
| 10 | 
            +
                "name": "1",
         | 
| 11 | 
            +
                "path": "1_Pooling",
         | 
| 12 | 
            +
                "type": "sentence_transformers.models.Pooling"
         | 
| 13 | 
            +
              },
         | 
| 14 | 
            +
              {
         | 
| 15 | 
            +
                "idx": 2,
         | 
| 16 | 
            +
                "name": "2",
         | 
| 17 | 
            +
                "path": "2_Normalize",
         | 
| 18 | 
            +
                "type": "sentence_transformers.models.Normalize"
         | 
| 19 | 
            +
              }
         | 
| 20 | 
            +
            ]
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452
         | 
| 3 | 
            +
            size 90405214
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_O1.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1391c6fc20b5530250bc15cbe1f47578ffeca55ab0551d335cc668b6299a88ec
         | 
| 3 | 
            +
            size 90360328
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_O2.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1de3905029190b398c7d300b530e320cf4b5e7d3dfb9af1429ebd73fd9a16faf
         | 
| 3 | 
            +
            size 90326566
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_O3.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:a44f671e364dddbac31f203f07b91be6b0a35e51936e5ebfab65b6d9538b83ff
         | 
| 3 | 
            +
            size 90326497
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_O4.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:1667d7f3ba669048b13a96ee3a44456d5e42c8f44588ae8b603430e16160c485
         | 
| 3 | 
            +
            size 45212349
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
         | 
| 3 | 
            +
            size 23026053
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
         | 
| 3 | 
            +
            size 23026053
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
         | 
| 3 | 
            +
            size 23026053
         | 
    	
        models/all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:b941bf19f1f1283680f449fa6a7336bb5600bdcd5f84d10ddc5cd72218a0fd21
         | 
| 3 | 
            +
            size 23046789
         | 
    	
        models/all-MiniLM-L6-v2/openvino/openvino_model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:8b86cab4722e2aefab310cf96d4d5a9eb3b187f7d9670a082afc55c7fa0d392a
         | 
| 3 | 
            +
            size 90265744
         | 
    	
        models/all-MiniLM-L6-v2/openvino/openvino_model.xml
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        models/all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:c92ea4af3c6bc7b4a0f3b3d61b147c850f4dbdd7c9e7beee0c0c70dc12da289b
         | 
| 3 | 
            +
            size 22933664
         | 
    	
        models/all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        models/all-MiniLM-L6-v2/pytorch_model.bin
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:c3a85f238711653950f6a79ece63eb0ea93d76f6a6284be04019c53733baf256
         | 
| 3 | 
            +
            size 90888945
         | 
    	
        models/all-MiniLM-L6-v2/rust_model.ot
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:2d98d96d278348988f2744e6445b8bc16d921c3f6e17c667362f3cb353007aea
         | 
| 3 | 
            +
            size 90887379
         | 
    	
        models/all-MiniLM-L6-v2/sentence_bert_config.json
    ADDED
    
    | @@ -0,0 +1,4 @@ | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "max_seq_length": 256,
         | 
| 3 | 
            +
              "do_lower_case": false
         | 
| 4 | 
            +
            }
         | 
    	
        models/all-MiniLM-L6-v2/special_tokens_map.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
         | 
    	
        models/all-MiniLM-L6-v2/tf_model.h5
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:24c06a7429b843d46e40c6b167122053921bf94dce2e5550ea5c07fabc597646
         | 
| 3 | 
            +
            size 91005696
         | 
    	
        models/all-MiniLM-L6-v2/tokenizer.json
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
    	
        models/all-MiniLM-L6-v2/tokenizer_config.json
    ADDED
    
    | @@ -0,0 +1 @@ | |
|  | 
|  | |
| 1 | 
            +
            {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
         | 
    	
        models/all-MiniLM-L6-v2/train_script.py
    ADDED
    
    | @@ -0,0 +1,344 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            """
         | 
| 2 | 
            +
            Train script for a single file
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            Need to set the TPU address first:
         | 
| 5 | 
            +
            export XRT_TPU_CONFIG="localservice;0;localhost:51011"
         | 
| 6 | 
            +
            """
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            import torch.multiprocessing as mp
         | 
| 9 | 
            +
            import threading
         | 
| 10 | 
            +
            import time
         | 
| 11 | 
            +
            import random
         | 
| 12 | 
            +
            import sys
         | 
| 13 | 
            +
            import argparse
         | 
| 14 | 
            +
            import gzip
         | 
| 15 | 
            +
            import json
         | 
| 16 | 
            +
            import logging
         | 
| 17 | 
            +
            import tqdm
         | 
| 18 | 
            +
            import torch
         | 
| 19 | 
            +
            from torch import nn
         | 
| 20 | 
            +
            from torch.utils.data import DataLoader
         | 
| 21 | 
            +
            import torch
         | 
| 22 | 
            +
            import torch_xla
         | 
| 23 | 
            +
            import torch_xla.core
         | 
| 24 | 
            +
            import torch_xla.core.functions
         | 
| 25 | 
            +
            import torch_xla.core.xla_model as xm
         | 
| 26 | 
            +
            import torch_xla.distributed.xla_multiprocessing as xmp
         | 
| 27 | 
            +
            import torch_xla.distributed.parallel_loader as pl
         | 
| 28 | 
            +
            import os
         | 
| 29 | 
            +
            from shutil import copyfile
         | 
| 30 | 
            +
             | 
| 31 | 
            +
             | 
| 32 | 
            +
            from transformers import (
         | 
| 33 | 
            +
                AdamW,
         | 
| 34 | 
            +
                AutoModel,
         | 
| 35 | 
            +
                AutoTokenizer,
         | 
| 36 | 
            +
                get_linear_schedule_with_warmup,
         | 
| 37 | 
            +
                set_seed,
         | 
| 38 | 
            +
            )
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            class AutoModelForSentenceEmbedding(nn.Module):
                """Sentence encoder built on top of ``AutoModel.from_pretrained``.

                Calling the module runs the wrapped model, averages its token
                embeddings under the attention mask and, when ``normalize`` is
                true, L2-normalizes the pooled vectors along dim 1.
                """

                def __init__(self, model_name, tokenizer, normalize=True):
                    """Load the wrapped model and remember the tokenizer and normalize flag.

                    Args:
                        model_name: Identifier passed to ``AutoModel.from_pretrained``.
                        tokenizer: Tokenizer object; only used by ``save_pretrained``.
                        normalize: Whether ``forward`` L2-normalizes its output.
                    """
                    super().__init__()

                    self.model = AutoModel.from_pretrained(model_name)
                    self.tokenizer = tokenizer
                    self.normalize = normalize

                def forward(self, **kwargs):
                    """Return pooled embeddings for the tokenized inputs in ``kwargs``.

                    ``kwargs`` is forwarded unchanged to the wrapped model and must
                    contain an ``attention_mask`` entry, which is used for pooling.
                    """
                    pooled = self.mean_pooling(self.model(**kwargs), kwargs['attention_mask'])
                    if not self.normalize:
                        return pooled

                    return torch.nn.functional.normalize(pooled, p=2, dim=1)

                def mean_pooling(self, model_output, attention_mask):
                    """Average the token embeddings over dim 1, weighted by the mask.

                    Args:
                        model_output: Indexable whose element 0 holds the token embeddings.
                        attention_mask: Mask that is expanded to the token-embedding size.

                    Returns:
                        Masked sum over dim 1 divided by the mask count; the count is
                        clamped to at least 1e-9 so fully masked rows do not divide by zero.
                    """
                    token_embeddings = model_output[0]
                    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                    masked_sum = (token_embeddings * mask).sum(dim=1)
                    mask_count = mask.sum(dim=1).clamp(min=1e-9)
                    return masked_sum / mask_count

                def save_pretrained(self, output_path):
                    """Write tokenizer, model config and model weights below ``output_path``.

                    Tokenizer and config are written only when ``xm.is_master_ordinal()``
                    is true; ``xm.save`` is invoked by every caller.
                    """
                    if xm.is_master_ordinal():
                        self.tokenizer.save_pretrained(output_path)
                        self.model.config.save_pretrained(output_path)

                    weights_path = os.path.join(output_path, "pytorch_model.bin")
                    xm.save(self.model.state_dict(), weights_path)
         | 
| 67 | 
            +
                   
         | 
| 68 | 
            +
             | 
| 69 | 
            +
             | 
| 70 | 
            +
             | 
| 71 | 
            +
            def train_function(index, args, queue):
                """Run the contrastive training loop of one spawned process.

                Each step takes one batch of text tuples from ``queue``, embeds every
                column, gathers the embeddings with ``all_gather`` and minimizes a
                cross-entropy loss over the scaled similarity matrix, where row ``i``
                is expected to match column ``i``.

                Args:
                    index: Process index supplied by the spawner; not used in the body.
                    args: Parsed arguments; reads ``model``, ``steps``, ``max_length``,
                        ``scale``, ``save_steps`` and ``output``.
                    queue: Queue delivering one list of text tuples per ``get()``.
                """
                tokenizer = AutoTokenizer.from_pretrained(args.model)
                model = AutoModelForSentenceEmbedding(args.model, tokenizer)

                ### Train Loop
                device = xm.xla_device()
                model = model.to(device)

                # Optimizer and linear warmup/decay schedule
                optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=True)
                lr_scheduler = get_linear_schedule_with_warmup(
                    optimizer=optimizer,
                    num_warmup_steps=500,
                    num_training_steps=args.steps,
                )

                cross_entropy_loss = nn.CrossEntropyLoss()
                max_grad_norm = 1

                def encode_column(examples, column):
                    # Tokenize one column of the batch, always padded to max_length
                    return tokenizer([example[column] for example in examples], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")

                model.train()

                progress = tqdm.trange(args.steps, disable=not xm.is_master_ordinal())
                for global_step in progress:
                    #### Get the batch data
                    batch = queue.get()

                    # Two entries per example: (anchor, positive); otherwise (anchor, positive, negative)
                    is_pair_batch = len(batch[0]) == 2
                    columns = (0, 1) if is_pair_batch else (0, 1, 2)

                    # Tokenize all columns, then embed all, then gather all (in that order)
                    encoded = [encode_column(batch, column) for column in columns]
                    local_embeddings = [model(**features.to(device)) for features in encoded]
                    gathered = [torch_xla.core.functions.all_gather(emb) for emb in local_embeddings]

                    embeddings_a = gathered[0]
                    if is_pair_batch:
                        embeddings_b = gathered[1]
                    else:
                        # Positives first, negatives appended behind them
                        embeddings_b = torch.cat(gathered[1:])

                    ### Compute similarity scores
                    scores = torch.mm(embeddings_a, embeddings_b.t()) * args.scale

                    ### Compute cross-entropy loss; example a[i] should match with b[i]
                    labels = torch.arange(len(scores), dtype=torch.long, device=embeddings_a.device)

                    if is_pair_batch:
                        ## Symmetric loss as in CLIP
                        loss = (cross_entropy_loss(scores, labels) + cross_entropy_loss(scores.t(), labels)) / 2
                    else:
                        ## One-way loss
                        loss = cross_entropy_loss(scores, labels)

                    # Backward pass
                    optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                    xm.optimizer_step(optimizer, barrier=True)
                    lr_scheduler.step()

                    # Save a checkpoint every args.save_steps completed steps
                    completed_steps = global_step + 1
                    if completed_steps % args.save_steps == 0:
                        output_path = os.path.join(args.output, str(completed_steps))
                        xm.master_print("save model: "+output_path)
                        model.save_pretrained(output_path)

                output_path = os.path.join(args.output, "final")
                xm.master_print("save model final: "+ output_path)
                model.save_pretrained(output_path)
         | 
| 166 | 
            +
             | 
| 167 | 
            +
             | 
| 168 | 
            +
            def produce_data(args, queue, filepaths, dataset_indices):
                """Endlessly fill ``queue`` with per-device batches drawn from the datasets.

                Args:
                    args: Parsed arguments; reads ``batch_size``, ``nprocs`` and
                        ``datasets_per_batch``.
                    queue: Queue receiving lists of ``args.batch_size`` samples.
                    filepaths: Dataset files; paths containing ``"reddit_"`` are read
                        with ``RedditDataset``, all others with ``Dataset``.
                    dataset_indices: Indices into ``filepaths`` to sample from with
                        ``random.choice``; repeated entries are drawn more often.

                This function never returns.
                """
                global_batch_size = args.batch_size*args.nprocs    #Global batch size
                size_per_dataset = int(global_batch_size / args.datasets_per_batch)    #Samples per chosen dataset
                num_same_dataset = int(size_per_dataset / args.batch_size)
                for label, value in (
                    ("global_batch_size", global_batch_size),
                    ("size_per_dataset", size_per_dataset),
                    ("num_same_dataset", num_same_dataset),
                ):
                    print("producer", label, value)

                # One endless iterator per file
                datasets = [
                    iter(RedditDataset(path) if "reddit_" in path else Dataset(path))
                    for path in filepaths
                ]

                # Number of columns (2 or 3) per dataset, taken from its first sample
                num_cols = {idx: len(next(stream)) for idx, stream in enumerate(datasets)}

                while True:
                    texts_in_batch = set()
                    batch_format = None     #Column count shared by this round

                    for _ in range(args.datasets_per_batch):
                        # Draw until a dataset with the round's column count comes up;
                        # the first draw of a round defines that column count.
                        while True:
                            data_idx = random.choice(dataset_indices)
                            if batch_format is None:
                                batch_format = num_cols[data_idx]
                            if num_cols[data_idx] == batch_format:
                                break

                        stream = datasets[data_idx]
                        for _ in range(num_same_dataset):
                            for _ in range(args.nprocs):
                                batch_device = []   #A batch for one device
                                while len(batch_device) < args.batch_size:
                                    sample = next(stream)
                                    # Skip samples sharing any text with one already used this round
                                    if any(text in texts_in_batch for text in sample):
                                        continue
                                    texts_in_batch.update(sample)
                                    batch_device.append(sample)

                                queue.put(batch_device)
         | 
| 221 | 
            +
                                  
         | 
| 222 | 
            +
             | 
| 223 | 
            +
            class RedditDataset:
                """
                A class that handles the reddit data files

                Each line of the gzip file is parsed as JSON; records containing both
                a "response" and a "context" entry are yielded as
                ``[response, context]``. Iteration restarts from the top of the file
                whenever its end is reached, so the iterator never stops by itself.
                """
                def __init__(self, filepath):
                    self.filepath = filepath

                def __iter__(self):
                    """Yield ``[response, context]`` pairs, cycling over the file forever.

                    Raises:
                        ValueError: If a full pass over the file yields no record.
                            Without this check such a file would be re-read forever
                            without ever producing a sample.
                    """
                    while True:
                        yielded_any = False
                        with gzip.open(self.filepath, "rt") as fIn:
                            for line in fIn:
                                data = json.loads(line)

                                if "response" in data and "context" in data:
                                    yielded_any = True
                                    yield [data["response"], data["context"]]

                        if not yielded_any:
                            raise ValueError("No records with 'response' and 'context' found in " + str(self.filepath))
         | 
| 238 | 
            +
             | 
| 239 | 
            +
            class Dataset:
                """
                A class that handles one dataset

                Each line of the gzip file is parsed as JSON. A dict record is replaced
                by its 'texts' entry; every record must have the same length as the
                first one. The iterator never stops by itself.
                """
                def __init__(self, filepath, max_dataset_size=10*1000*1000):
                    """
                    Args:
                        filepath: Path of the gzip-compressed JSON-lines file.
                        max_dataset_size: Number of records at which in-memory caching
                            is abandoned and the file is re-read on every pass instead.
                    """
                    self.filepath = filepath
                    self.max_dataset_size = max_dataset_size

                def __iter__(self):
                    """Yield records forever.

                    The first pass yields records in file order while caching them.
                    If the cache stays below ``max_dataset_size``, later passes yield
                    the cached records in a freshly shuffled order; otherwise the
                    cache is dropped and the file is streamed in file order again
                    and again.

                    Raises:
                        ValueError: If the file contains no records. Without this
                            check an empty file would be re-read forever without
                            ever producing a sample.
                        AssertionError: If a record's length differs from the first one.
                    """
                    dataset = []
                    data_format = None

                    # dataset is None  -> too large to cache, keep streaming the file
                    # dataset is empty -> nothing has been read yet
                    while dataset is None or len(dataset) == 0:
                        with gzip.open(self.filepath, "rt") as fIn:
                            for line in fIn:
                                data = json.loads(line)
                                if isinstance(data, dict):
                                    data = data['texts']

                                if data_format is None:
                                    data_format = len(data)

                                #Ensure that all entries are of the same 2/3 col format
                                assert len(data) == data_format, "Inconsistent number of columns in " + str(self.filepath)

                                if dataset is not None:
                                    dataset.append(data)
                                    if len(dataset) >= self.max_dataset_size:
                                        dataset = None

                                yield data

                        if data_format is None:
                            raise ValueError("No records found in " + str(self.filepath))

                    # Data loaded. Now stream to the queue
                    # Shuffle for each epoch
                    while True:
                        random.shuffle(dataset)
                        for data in dataset:
                            yield data
         | 
| 277 | 
            +
                            
         | 
| 278 | 
            +
                           
         | 
| 279 | 
            +
             | 
| 280 | 
            +
            if __name__ == "__main__":
                # Command-line entry point: parse arguments, snapshot the configuration
                # into the output folder, start the data producer and spawn the trainers.
                parser = argparse.ArgumentParser()
                parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
                parser.add_argument('--steps', type=int, default=2000)
                parser.add_argument('--save_steps', type=int, default=10000)
                parser.add_argument('--batch_size', type=int, default=64)
                parser.add_argument('--max_length', type=int, default=128)
                parser.add_argument('--nprocs', type=int, default=8)
                parser.add_argument('--datasets_per_batch', type=int, default=2, help="Number of datasets per batch")
                parser.add_argument('--scale', type=float, default=20, help="Use 20 for cossim, and 1 when you work with unnormalized embeddings with dot product")
                parser.add_argument('--data_folder', default="/data", help="Folder with your dataset files")
                parser.add_argument('data_config', help="A data_config.json file")
                parser.add_argument('output')
                args = parser.parse_args()

                # Ensure global batch size is divisible by datasets_per_batch
                assert (args.batch_size*args.nprocs) % args.datasets_per_batch == 0, "batch_size*nprocs must be divisible by datasets_per_batch"

                logging.info("Output: "+args.output)
                if os.path.exists(args.output):
                    print("Output folder already exists.")
                    input("Continue?")

                # Write train script to output path
                os.makedirs(args.output, exist_ok=True)

                data_config_path = os.path.join(args.output, 'data_config.json')
                copyfile(args.data_config, data_config_path)

                train_script_path = os.path.join(args.output, 'train_script.py')
                copyfile(__file__, train_script_path)
                # Append the exact invocation to the copied script
                with open(train_script_path, 'a') as fOut:
                    fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))

                #Load data config
                with open(args.data_config) as fIn:
                    data_config = json.load(fIn)

                queue = mp.Queue(maxsize=100*args.nprocs)

                # Each config entry contributes its index 'weight' times, so that
                # random.choice over dataset_indices draws it proportionally often.
                filepaths = []
                dataset_indices = []
                for idx, data in enumerate(data_config):
                    filepaths.append(os.path.join(os.path.expanduser(args.data_folder), data['name']))
                    dataset_indices.extend([idx]*data['weight'])

                # Start producer
                p = mp.Process(target=produce_data, args=(args, queue, filepaths, dataset_indices))
                p.start()

                try:
                    # Run training
                    print("Start processes:", args.nprocs)
                    xmp.spawn(train_function, args=(args, queue), nprocs=args.nprocs, start_method='fork')
                    print("Training done")
                    print("It might be that not all processes exit automatically. In that case you must manually kill this process.")
                    print("With 'pkill python' you can kill all remaining python processes")
                finally:
                    # The producer loops forever; stop it even when training fails
                    p.kill()
                sys.exit()
         | 
| 340 | 
            +
             | 
| 341 | 
            +
             | 
| 342 | 
            +
             | 
| 343 | 
            +
            # Script was called via:
         | 
| 344 | 
            +
            #python train_many_data_files_v2.py --steps 1000000 --batch_size 128 --model nreimers/MiniLM-L6-H384-uncased train_data_configs/all_datasets_v4.json output/all_datasets_v4_MiniLM-L6-H384-uncased-batch128
         | 
    	
        models/all-MiniLM-L6-v2/vocab.txt
    ADDED
    
    | The diff for this file is too large to render. 
		See raw diff | 
|  | 
