Ipshitaa committed on
Commit
c71b3b8
·
1 Parent(s): ee429a7

Deploying SHL chatbot to Hugging Face Spaces

Files changed (44)
  1. .devcontainer/devcontainer.json +33 -0
  2. .gitattributes +0 -0
  3. .gitignore +0 -0
  4. README.md +2 -13
  5. Web_Scraper.py +121 -0
  6. all-MiniLM-L6-v2/.gitattributes +28 -0
  7. all-MiniLM-L6-v2/1_Pooling/config.json +7 -0
  8. all-MiniLM-L6-v2/README.md +173 -0
  9. all-MiniLM-L6-v2/config.json +24 -0
  10. all-MiniLM-L6-v2/config_sentence_transformers.json +7 -0
  11. all-MiniLM-L6-v2/data_config.json +1452 -0
  12. all-MiniLM-L6-v2/model.safetensors +3 -0
  13. all-MiniLM-L6-v2/modules.json +20 -0
  14. all-MiniLM-L6-v2/onnx/model.onnx +3 -0
  15. all-MiniLM-L6-v2/onnx/model_O1.onnx +3 -0
  16. all-MiniLM-L6-v2/onnx/model_O2.onnx +3 -0
  17. all-MiniLM-L6-v2/onnx/model_O3.onnx +3 -0
  18. all-MiniLM-L6-v2/onnx/model_O4.onnx +3 -0
  19. all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx +3 -0
  20. all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +3 -0
  21. all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +3 -0
  22. all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx +3 -0
  23. all-MiniLM-L6-v2/openvino/openvino_model.bin +3 -0
  24. all-MiniLM-L6-v2/openvino/openvino_model.xml +0 -0
  25. all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin +3 -0
  26. all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml +0 -0
  27. all-MiniLM-L6-v2/pytorch_model.bin +3 -0
  28. all-MiniLM-L6-v2/rust_model.ot +3 -0
  29. all-MiniLM-L6-v2/sentence_bert_config.json +4 -0
  30. all-MiniLM-L6-v2/special_tokens_map.json +1 -0
  31. all-MiniLM-L6-v2/tf_model.h5 +3 -0
  32. all-MiniLM-L6-v2/tokenizer.json +0 -0
  33. all-MiniLM-L6-v2/tokenizer_config.json +1 -0
  34. all-MiniLM-L6-v2/train_script.py +344 -0
  35. all-MiniLM-L6-v2/vocab.txt +0 -0
  36. endpoint.py +231 -0
  37. main.py +199 -0
  38. requirements.txt +12 -0
  39. shl_assessments.csv +101 -0
  40. storage/default__vector_store.json +0 -0
  41. storage/docstore.json +0 -0
  42. storage/graph_store.json +1 -0
  43. storage/image__vector_store.json +1 -0
  44. storage/index_store.json +1 -0
.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "name": "Python 3",
+   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+   "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
+   "customizations": {
+     "codespaces": {
+       "openFiles": [
+         "README.md",
+         "main.py"
+       ]
+     },
+     "vscode": {
+       "settings": {},
+       "extensions": [
+         "ms-python.python",
+         "ms-python.vscode-pylance"
+       ]
+     }
+   },
+   "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
+   "postAttachCommand": {
+     "server": "streamlit run main.py --server.enableCORS false --server.enableXsrfProtection false"
+   },
+   "portsAttributes": {
+     "8501": {
+       "label": "Application",
+       "onAutoForward": "openPreview"
+     }
+   },
+   "forwardPorts": [
+     8501
+   ]
+ }
.gitattributes CHANGED
Binary files a/.gitattributes and b/.gitattributes differ
 
.gitignore ADDED
Binary file (196 Bytes).
 
README.md CHANGED
@@ -1,13 +1,2 @@
- ---
- title: Rag Chatbot
- emoji: 👁
- colorFrom: red
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.44.1
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This is a prototype chatbot.
+ To run it, you need to create a .env file with your GROQ_API_KEY and OPEN_API_KEY.

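As a quick illustration of the setup step the new README describes, here is a minimal, editorial sketch of the expected `.env` file and of loading it in Python. The use of `python-dotenv` is an assumption for illustration only; the actual loading code lives in the app files (`main.py` / `endpoint.py`) and is not shown in this diff.

```python
# Hypothetical .env in the project root (never commit this file):
#   GROQ_API_KEY=your-key
#   OPEN_API_KEY=your-key

import os

from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # reads key=value pairs from .env into the process environment
groq_api_key = os.environ["GROQ_API_KEY"]
open_api_key = os.environ["OPEN_API_KEY"]
```
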
Web_Scraper.py ADDED
@@ -0,0 +1,121 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.common.exceptions import TimeoutException, NoSuchElementException
+ from webdriver_manager.chrome import ChromeDriverManager
+ import pandas as pd
+ import time
+ from urllib.parse import urljoin
+
+
+ def scrape_shl_products():
+     # Configure Chrome options
+     chrome_options = Options()
+     chrome_options.add_argument("--headless")  # Optional: Run in background
+     chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+     chrome_options.add_argument(
+         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+     )
+
+     # Set up driver
+     service = Service(ChromeDriverManager().install())
+     driver = webdriver.Chrome(service=service, options=chrome_options)
+
+     base_url = "https://www.shl.com"
+     catalog_url = "https://www.shl.com/solutions/products/product-catalog/"
+
+     try:
+         print("Loading SHL product catalog...")
+         driver.get(catalog_url)
+
+         # Wait for products to load
+         WebDriverWait(driver, 15).until(
+             EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
+         )
+
+         # Scroll to load all products
+         print("Scrolling to load all products...")
+         last_height = driver.execute_script("return document.body.scrollHeight")
+         while True:
+             driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+             time.sleep(2)
+             new_height = driver.execute_script("return document.body.scrollHeight")
+             if new_height == last_height:
+                 break
+             last_height = new_height
+
+         product_cards = driver.find_elements(By.CSS_SELECTOR, ".product-card")
+         print(f"Found {len(product_cards)} products.")
+
+         products = []
+
+         for card in product_cards:
+             try:
+                 product = {
+                     'Assessment Name': 'Not found',
+                     'URL': 'Not found',
+                     'Remote Testing Support': 'No',
+                     'Adaptive/IRT Support': 'No',
+                     'Duration': 'Not specified',
+                     'Test Type': 'Not specified'
+                 }
+
+                 # Name
+                 name_element = card.find_element(By.CSS_SELECTOR, ".product-card__title")
+                 product['Assessment Name'] = name_element.text
+
+                 # URL
+                 link_element = card.find_element(By.CSS_SELECTOR, "a[href]")
+                 product['URL'] = urljoin(base_url, link_element.get_attribute("href"))
+
+                 # Metadata
+                 meta_items = card.find_elements(By.CSS_SELECTOR, ".product-card__meta-item")
+                 for item in meta_items:
+                     try:
+                         label = item.find_element(By.CSS_SELECTOR, ".product-card__meta-label").text.lower()
+                         value = item.find_element(By.CSS_SELECTOR, ".product-card__meta-value").text
+
+                         if 'remote' in label:
+                             product['Remote Testing Support'] = 'Yes' if 'yes' in value.lower() else 'No'
+                         elif 'adaptive' in label or 'irt' in label:
+                             product['Adaptive/IRT Support'] = 'Yes' if 'yes' in value.lower() else 'No'
+                         elif 'duration' in label:
+                             product['Duration'] = value
+                         elif 'type' in label:
+                             product['Test Type'] = value
+                     except NoSuchElementException:
+                         continue
+
+                 products.append(product)
+
+             except Exception as e:
+                 print(f"Error processing a product card: {str(e)}")
+                 continue
+
+         # Save data
+         df = pd.DataFrame(products)
+         df.to_csv('shl_products.csv', index=False)
+         print("Data saved to shl_products.csv")
+
+         return df
+
+     except TimeoutException:
+         print("Timeout loading the page.")
+     except Exception as e:
+         print(f"An error occurred: {str(e)}")
+     finally:
+         driver.quit()
+         print("Browser closed.")
+
+
+ if __name__ == "__main__":
+     print("Starting SHL scraper...")  # Debug print
+     df = scrape_shl_products()
+     if df is not None and not df.empty:
+         print("\nFirst 5 results:")
+         print(df.head())
+     else:
+         print("No data scraped.")
all-MiniLM-L6-v2/.gitattributes ADDED
@@ -0,0 +1,28 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
all-MiniLM-L6-v2/1_Pooling/config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "word_embedding_dimension": 384,
+   "pooling_mode_cls_token": false,
+   "pooling_mode_mean_tokens": true,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false
+ }
all-MiniLM-L6-v2/README.md ADDED
@@ -0,0 +1,173 @@
+ ---
+ language: en
+ license: apache-2.0
+ library_name: sentence-transformers
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - transformers
+ datasets:
+ - s2orc
+ - flax-sentence-embeddings/stackexchange_xml
+ - ms_marco
+ - gooaq
+ - yahoo_answers_topics
+ - code_search_net
+ - search_qa
+ - eli5
+ - snli
+ - multi_nli
+ - wikihow
+ - natural_questions
+ - trivia_qa
+ - embedding-data/sentence-compression
+ - embedding-data/flickr30k-captions
+ - embedding-data/altlex
+ - embedding-data/simple-wiki
+ - embedding-data/QQP
+ - embedding-data/SPECTER
+ - embedding-data/PAQ_pairs
+ - embedding-data/WikiAnswers
+ pipeline_tag: sentence-similarity
+ ---
+
+
+ # all-MiniLM-L6-v2
+ This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
+
+ ## Usage (Sentence-Transformers)
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
+
+ ```
+ pip install -U sentence-transformers
+ ```
+
+ Then you can use the model like this:
+ ```python
+ from sentence_transformers import SentenceTransformer
+ sentences = ["This is an example sentence", "Each sentence is converted"]
+
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+ embeddings = model.encode(sentences)
+ print(embeddings)
+ ```
+
+ ## Usage (HuggingFace Transformers)
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you apply the right pooling operation on top of the contextualized word embeddings.
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ import torch
+ import torch.nn.functional as F
+
+ # Mean Pooling - Take attention mask into account for correct averaging
+ def mean_pooling(model_output, attention_mask):
+     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
+     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+ # Sentences we want sentence embeddings for
+ sentences = ['This is an example sentence', 'Each sentence is converted']
+
+ # Load model from HuggingFace Hub
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
+
+ # Tokenize sentences
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+
+ # Compute token embeddings
+ with torch.no_grad():
+     model_output = model(**encoded_input)
+
+ # Perform pooling
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+
+ # Normalize embeddings
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+ print("Sentence embeddings:")
+ print(sentence_embeddings)
+ ```
+
+ ------
+
+ ## Background
+
+ The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
+ contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
+ dataset of 1B sentence pairs. We use a contrastive learning objective: given a sentence from a pair, the model should predict which sentence, out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
+
+ We developed this model during the
+ [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
+ organized by Hugging Face. We developed this model as part of the project:
+ [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPU v3-8s, as well as guidance from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
+
+ ## Intended uses
+
+ Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
+ the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
+
+ By default, input text longer than 256 word pieces is truncated.
+
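To make the retrieval use case above concrete, here is a minimal editorial sketch of semantic search with this model; the example sentences are made up, and the `util.cos_sim` helper from the sentence-transformers package is used for the comparison (this snippet is not part of the committed files).

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

corpus = ["The cat sits on the mat.", "Stock markets fell sharply today."]
query = "Where is the cat?"

# Encode to 384-dimensional vectors and rank the corpus by cosine similarity.
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)
scores = util.cos_sim(query_embedding, corpus_embeddings)  # shape: (1, len(corpus))

best = int(scores.argmax())
print(f"Best match: {corpus[best]} (score={scores[0, best].item():.3f})")
```
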
+
+ ## Training procedure
+
+ ### Pre-training
+
+ We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
+
+ ### Fine-tuning
+
+ We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity between each possible sentence pair in the batch.
+ We then apply the cross-entropy loss by comparing with the true pairs.
+
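As an editorial sketch (not the repository's `train_script.py`), the objective described above amounts to an in-batch contrastive loss: build a cosine-similarity matrix over all pairs in the batch and apply cross-entropy against the diagonal of true pairs. The scale factor below is illustrative.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(emb_a, emb_b, scale=20.0):
    """emb_a[i] and emb_b[i] embed the two sides of a true pair; every other
    sentence in the batch serves as a negative for row i."""
    emb_a = F.normalize(emb_a, p=2, dim=1)
    emb_b = F.normalize(emb_b, p=2, dim=1)
    sim = emb_a @ emb_b.T * scale  # (batch, batch) cosine similarities
    labels = torch.arange(sim.size(0), device=sim.device)  # true pair on the diagonal
    return F.cross_entropy(sim, labels)
```
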
+ #### Hyper parameters
+
+ We trained our model on a TPU v3-8. We trained the model for 100k steps using a batch size of 1024 (128 per TPU core).
+ We used a learning-rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
+ a 2e-5 learning rate. The full training script is available in this repository: `train_script.py`.
+
+ #### Training data
+
+ We used a concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
+ We sampled each dataset with a weighted probability; the configuration is detailed in the `data_config.json` file.
+
+
+ | Dataset | Paper | Number of training tuples |
+ |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
+ | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
+ | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
+ | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
+ | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
+ | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
+ | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
+ | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
+ | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
+ | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
+ | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
+ | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
+ | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
+ | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
+ | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
+ | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
+ | **Total** | | **1,170,060,424** |
all-MiniLM-L6-v2/config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
+   "architectures": [
+     "BertModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.8.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
all-MiniLM-L6-v2/config_sentence_transformers.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "__version__": {
+     "sentence_transformers": "2.0.0",
+     "transformers": "4.6.1",
+     "pytorch": "1.8.1"
+   }
+ }
all-MiniLM-L6-v2/data_config.json ADDED
@@ -0,0 +1,1452 @@
1
+ [
2
+ {
3
+ "name": "stackexchange_title_body/skeptics.stackexchange.com.jsonl.gz",
4
+ "lines": 10009,
5
+ "weight": 1
6
+ },
7
+ {
8
+ "name": "stackexchange_TitleBody_Answer/islam.stackexchange.com.jsonl.gz",
9
+ "lines": 10052,
10
+ "weight": 1
11
+ },
12
+ {
13
+ "name": "stackexchange_Title_Answer/islam.stackexchange.com.jsonl.gz",
14
+ "lines": 10052,
15
+ "weight": 1
16
+ },
17
+ {
18
+ "name": "stackexchange_TitleBody_Answer/anime.stackexchange.com.jsonl.gz",
19
+ "lines": 10131,
20
+ "weight": 1
21
+ },
22
+ {
23
+ "name": "stackexchange_Title_Answer/anime.stackexchange.com.jsonl.gz",
24
+ "lines": 10131,
25
+ "weight": 1
26
+ },
27
+ {
28
+ "name": "stackexchange_title_body/writers.stackexchange.com.jsonl.gz",
29
+ "lines": 10157,
30
+ "weight": 1
31
+ },
32
+ {
33
+ "name": "stackexchange_title_body/astronomy.stackexchange.com.jsonl.gz",
34
+ "lines": 10462,
35
+ "weight": 1
36
+ },
37
+ {
38
+ "name": "stackexchange_title_body/vi.stackexchange.com.jsonl.gz",
39
+ "lines": 10551,
40
+ "weight": 1
41
+ },
42
+ {
43
+ "name": "stackexchange_TitleBody_Answer/french.stackexchange.com.jsonl.gz",
44
+ "lines": 10578,
45
+ "weight": 1
46
+ },
47
+ {
48
+ "name": "stackexchange_Title_Answer/french.stackexchange.com.jsonl.gz",
49
+ "lines": 10578,
50
+ "weight": 1
51
+ },
52
+ {
53
+ "name": "stackexchange_title_body/cstheory.stackexchange.com.jsonl.gz",
54
+ "lines": 10642,
55
+ "weight": 1
56
+ },
57
+ {
58
+ "name": "stackexchange_TitleBody_Answer/civicrm.stackexchange.com.jsonl.gz",
59
+ "lines": 10648,
60
+ "weight": 1
61
+ },
62
+ {
63
+ "name": "stackexchange_Title_Answer/civicrm.stackexchange.com.jsonl.gz",
64
+ "lines": 10648,
65
+ "weight": 1
66
+ },
67
+ {
68
+ "name": "stackexchange_TitleBody_Answer/expressionengine.stackexchange.com.jsonl.gz",
69
+ "lines": 10742,
70
+ "weight": 1
71
+ },
72
+ {
73
+ "name": "stackexchange_Title_Answer/expressionengine.stackexchange.com.jsonl.gz",
74
+ "lines": 10742,
75
+ "weight": 1
76
+ },
77
+ {
78
+ "name": "stackexchange_title_body/engineering.stackexchange.com.jsonl.gz",
79
+ "lines": 10753,
80
+ "weight": 1
81
+ },
82
+ {
83
+ "name": "stackexchange_TitleBody_Answer/history.stackexchange.com.jsonl.gz",
84
+ "lines": 10766,
85
+ "weight": 1
86
+ },
87
+ {
88
+ "name": "stackexchange_Title_Answer/history.stackexchange.com.jsonl.gz",
89
+ "lines": 10766,
90
+ "weight": 1
91
+ },
92
+ {
93
+ "name": "stackexchange_title_body/french.stackexchange.com.jsonl.gz",
94
+ "lines": 10794,
95
+ "weight": 1
96
+ },
97
+ {
98
+ "name": "stackexchange_TitleBody_Answer/politics.stackexchange.com.jsonl.gz",
99
+ "lines": 11047,
100
+ "weight": 1
101
+ },
102
+ {
103
+ "name": "stackexchange_Title_Answer/politics.stackexchange.com.jsonl.gz",
104
+ "lines": 11047,
105
+ "weight": 1
106
+ },
107
+ {
108
+ "name": "stackexchange_title_body/economics.stackexchange.com.jsonl.gz",
109
+ "lines": 11115,
110
+ "weight": 1
111
+ },
112
+ {
113
+ "name": "stackexchange_TitleBody_Answer/craftcms.stackexchange.com.jsonl.gz",
114
+ "lines": 11236,
115
+ "weight": 1
116
+ },
117
+ {
118
+ "name": "stackexchange_Title_Answer/craftcms.stackexchange.com.jsonl.gz",
119
+ "lines": 11236,
120
+ "weight": 1
121
+ },
122
+ {
123
+ "name": "stackexchange_title_body/anime.stackexchange.com.jsonl.gz",
124
+ "lines": 11444,
125
+ "weight": 1
126
+ },
127
+ {
128
+ "name": "stackexchange_TitleBody_Answer/christianity.stackexchange.com.jsonl.gz",
129
+ "lines": 11498,
130
+ "weight": 1
131
+ },
132
+ {
133
+ "name": "stackexchange_Title_Answer/christianity.stackexchange.com.jsonl.gz",
134
+ "lines": 11498,
135
+ "weight": 1
136
+ },
137
+ {
138
+ "name": "stackexchange_TitleBody_Answer/softwarerecs.stackexchange.com.jsonl.gz",
139
+ "lines": 11761,
140
+ "weight": 1
141
+ },
142
+ {
143
+ "name": "stackexchange_Title_Answer/softwarerecs.stackexchange.com.jsonl.gz",
144
+ "lines": 11761,
145
+ "weight": 1
146
+ },
147
+ {
148
+ "name": "stackexchange_TitleBody_Answer/boardgames.stackexchange.com.jsonl.gz",
149
+ "lines": 11805,
150
+ "weight": 1
151
+ },
152
+ {
153
+ "name": "stackexchange_Title_Answer/boardgames.stackexchange.com.jsonl.gz",
154
+ "lines": 11805,
155
+ "weight": 1
156
+ },
157
+ {
158
+ "name": "stackexchange_title_body/islam.stackexchange.com.jsonl.gz",
159
+ "lines": 11853,
160
+ "weight": 1
161
+ },
162
+ {
163
+ "name": "stackexchange_title_body/expressionengine.stackexchange.com.jsonl.gz",
164
+ "lines": 11866,
165
+ "weight": 1
166
+ },
167
+ {
168
+ "name": "stackexchange_title_body/politics.stackexchange.com.jsonl.gz",
169
+ "lines": 11894,
170
+ "weight": 1
171
+ },
172
+ {
173
+ "name": "stackexchange_title_body/history.stackexchange.com.jsonl.gz",
174
+ "lines": 12021,
175
+ "weight": 1
176
+ },
177
+ {
178
+ "name": "stackexchange_title_body/christianity.stackexchange.com.jsonl.gz",
179
+ "lines": 12108,
180
+ "weight": 1
181
+ },
182
+ {
183
+ "name": "stackexchange_title_body/boardgames.stackexchange.com.jsonl.gz",
184
+ "lines": 12149,
185
+ "weight": 1
186
+ },
187
+ {
188
+ "name": "flickr30k_captions.jsonl.gz",
189
+ "lines": 317695,
190
+ "weight": 1
191
+ },
192
+ {
193
+ "name": "coco_captions.jsonl.gz",
194
+ "lines": 828395,
195
+ "weight": 1
196
+ },
197
+ {
198
+ "name": "codesearchnet.jsonl.gz",
199
+ "lines": 1151414,
200
+ "weight": 1
201
+ },
202
+ {
203
+ "name": "stackexchange_title_body/civicrm.stackexchange.com.jsonl.gz",
204
+ "lines": 12543,
205
+ "weight": 2
206
+ },
207
+ {
208
+ "name": "stackexchange_title_body/craftcms.stackexchange.com.jsonl.gz",
209
+ "lines": 12574,
210
+ "weight": 2
211
+ },
212
+ {
213
+ "name": "stackexchange_TitleBody_Answer/networkengineering.stackexchange.com.jsonl.gz",
214
+ "lines": 12590,
215
+ "weight": 2
216
+ },
217
+ {
218
+ "name": "stackexchange_Title_Answer/networkengineering.stackexchange.com.jsonl.gz",
219
+ "lines": 12590,
220
+ "weight": 2
221
+ },
222
+ {
223
+ "name": "stackexchange_TitleBody_Answer/space.stackexchange.com.jsonl.gz",
224
+ "lines": 12893,
225
+ "weight": 2
226
+ },
227
+ {
228
+ "name": "stackexchange_Title_Answer/space.stackexchange.com.jsonl.gz",
229
+ "lines": 12893,
230
+ "weight": 2
231
+ },
232
+ {
233
+ "name": "stackexchange_TitleBody_Answer/quant.stackexchange.com.jsonl.gz",
234
+ "lines": 12933,
235
+ "weight": 2
236
+ },
237
+ {
238
+ "name": "stackexchange_Title_Answer/quant.stackexchange.com.jsonl.gz",
239
+ "lines": 12933,
240
+ "weight": 2
241
+ },
242
+ {
243
+ "name": "stackexchange_TitleBody_Answer/philosophy.stackexchange.com.jsonl.gz",
244
+ "lines": 13114,
245
+ "weight": 2
246
+ },
247
+ {
248
+ "name": "stackexchange_Title_Answer/philosophy.stackexchange.com.jsonl.gz",
249
+ "lines": 13114,
250
+ "weight": 2
251
+ },
252
+ {
253
+ "name": "stackexchange_TitleBody_Answer/gardening.stackexchange.com.jsonl.gz",
254
+ "lines": 13246,
255
+ "weight": 2
256
+ },
257
+ {
258
+ "name": "stackexchange_Title_Answer/gardening.stackexchange.com.jsonl.gz",
259
+ "lines": 13246,
260
+ "weight": 2
261
+ },
262
+ {
263
+ "name": "stackexchange_title_body/hinduism.stackexchange.com.jsonl.gz",
264
+ "lines": 13450,
265
+ "weight": 2
266
+ },
267
+ {
268
+ "name": "stackexchange_title_body/networkengineering.stackexchange.com.jsonl.gz",
269
+ "lines": 13454,
270
+ "weight": 2
271
+ },
272
+ {
273
+ "name": "stackexchange_TitleBody_Answer/german.stackexchange.com.jsonl.gz",
274
+ "lines": 13733,
275
+ "weight": 2
276
+ },
277
+ {
278
+ "name": "stackexchange_Title_Answer/german.stackexchange.com.jsonl.gz",
279
+ "lines": 13733,
280
+ "weight": 2
281
+ },
282
+ {
283
+ "name": "stackexchange_title_body/german.stackexchange.com.jsonl.gz",
284
+ "lines": 13950,
285
+ "weight": 2
286
+ },
287
+ {
288
+ "name": "stackexchange_title_body/philosophy.stackexchange.com.jsonl.gz",
289
+ "lines": 14829,
290
+ "weight": 2
291
+ },
292
+ {
293
+ "name": "stackexchange_title_body/gardening.stackexchange.com.jsonl.gz",
294
+ "lines": 15136,
295
+ "weight": 2
296
+ },
297
+ {
298
+ "name": "stackexchange_title_body/space.stackexchange.com.jsonl.gz",
299
+ "lines": 15142,
300
+ "weight": 2
301
+ },
302
+ {
303
+ "name": "stackexchange_TitleBody_Answer/bicycles.stackexchange.com.jsonl.gz",
304
+ "lines": 15708,
305
+ "weight": 2
306
+ },
307
+ {
308
+ "name": "stackexchange_Title_Answer/bicycles.stackexchange.com.jsonl.gz",
309
+ "lines": 15708,
310
+ "weight": 2
311
+ },
312
+ {
313
+ "name": "stackexchange_TitleBody_Answer/law.stackexchange.com.jsonl.gz",
314
+ "lines": 16133,
315
+ "weight": 2
316
+ },
317
+ {
318
+ "name": "stackexchange_Title_Answer/law.stackexchange.com.jsonl.gz",
319
+ "lines": 16133,
320
+ "weight": 2
321
+ },
322
+ {
323
+ "name": "stackexchange_TitleBody_Answer/arduino.stackexchange.com.jsonl.gz",
324
+ "lines": 16281,
325
+ "weight": 2
326
+ },
327
+ {
328
+ "name": "stackexchange_Title_Answer/arduino.stackexchange.com.jsonl.gz",
329
+ "lines": 16281,
330
+ "weight": 2
331
+ },
332
+ {
333
+ "name": "stackexchange_title_body/bicycles.stackexchange.com.jsonl.gz",
334
+ "lines": 16353,
335
+ "weight": 2
336
+ },
337
+ {
338
+ "name": "stackexchange_TitleBody_Answer/emacs.stackexchange.com.jsonl.gz",
339
+ "lines": 16830,
340
+ "weight": 2
341
+ },
342
+ {
343
+ "name": "stackexchange_Title_Answer/emacs.stackexchange.com.jsonl.gz",
344
+ "lines": 16830,
345
+ "weight": 2
346
+ },
347
+ {
348
+ "name": "stackexchange_title_body/quant.stackexchange.com.jsonl.gz",
349
+ "lines": 17261,
350
+ "weight": 2
351
+ },
352
+ {
353
+ "name": "stackexchange_TitleBody_Answer/dsp.stackexchange.com.jsonl.gz",
354
+ "lines": 17430,
355
+ "weight": 2
356
+ },
357
+ {
358
+ "name": "stackexchange_Title_Answer/dsp.stackexchange.com.jsonl.gz",
359
+ "lines": 17430,
360
+ "weight": 2
361
+ },
362
+ {
363
+ "name": "stackexchange_TitleBody_Answer/puzzling.stackexchange.com.jsonl.gz",
364
+ "lines": 17448,
365
+ "weight": 2
366
+ },
367
+ {
368
+ "name": "stackexchange_Title_Answer/puzzling.stackexchange.com.jsonl.gz",
369
+ "lines": 17448,
370
+ "weight": 2
371
+ },
372
+ {
373
+ "name": "stackexchange_title_body/puzzling.stackexchange.com.jsonl.gz",
374
+ "lines": 17851,
375
+ "weight": 2
376
+ },
377
+ {
378
+ "name": "stackexchange_title_body/law.stackexchange.com.jsonl.gz",
379
+ "lines": 17941,
380
+ "weight": 2
381
+ },
382
+ {
383
+ "name": "stackexchange_TitleBody_Answer/movies.stackexchange.com.jsonl.gz",
384
+ "lines": 18243,
385
+ "weight": 2
386
+ },
387
+ {
388
+ "name": "stackexchange_Title_Answer/movies.stackexchange.com.jsonl.gz",
389
+ "lines": 18243,
390
+ "weight": 2
391
+ },
392
+ {
393
+ "name": "stackexchange_TitleBody_Answer/mechanics.stackexchange.com.jsonl.gz",
394
+ "lines": 18613,
395
+ "weight": 2
396
+ },
397
+ {
398
+ "name": "stackexchange_Title_Answer/mechanics.stackexchange.com.jsonl.gz",
399
+ "lines": 18613,
400
+ "weight": 2
401
+ },
402
+ {
403
+ "name": "stackexchange_TitleBody_Answer/aviation.stackexchange.com.jsonl.gz",
404
+ "lines": 18755,
405
+ "weight": 2
406
+ },
407
+ {
408
+ "name": "stackexchange_Title_Answer/aviation.stackexchange.com.jsonl.gz",
409
+ "lines": 18755,
410
+ "weight": 2
411
+ },
412
+ {
413
+ "name": "stackexchange_TitleBody_Answer/biology.stackexchange.com.jsonl.gz",
414
+ "lines": 19277,
415
+ "weight": 2
416
+ },
417
+ {
418
+ "name": "stackexchange_Title_Answer/biology.stackexchange.com.jsonl.gz",
419
+ "lines": 19277,
420
+ "weight": 2
421
+ },
422
+ {
423
+ "name": "stackexchange_TitleBody_Answer/crypto.stackexchange.com.jsonl.gz",
424
+ "lines": 19404,
425
+ "weight": 2
426
+ },
427
+ {
428
+ "name": "stackexchange_Title_Answer/crypto.stackexchange.com.jsonl.gz",
429
+ "lines": 19404,
430
+ "weight": 2
431
+ },
432
+ {
433
+ "name": "stackexchange_title_body/arduino.stackexchange.com.jsonl.gz",
434
+ "lines": 19553,
435
+ "weight": 2
436
+ },
437
+ {
438
+ "name": "stackexchange_TitleBody_Answer/music.stackexchange.com.jsonl.gz",
439
+ "lines": 19936,
440
+ "weight": 2
441
+ },
442
+ {
443
+ "name": "stackexchange_Title_Answer/music.stackexchange.com.jsonl.gz",
444
+ "lines": 19936,
445
+ "weight": 2
446
+ },
447
+ {
448
+ "name": "stackexchange_title_body/aviation.stackexchange.com.jsonl.gz",
449
+ "lines": 20139,
450
+ "weight": 2
451
+ },
452
+ {
453
+ "name": "stackexchange_title_body/softwarerecs.stackexchange.com.jsonl.gz",
454
+ "lines": 20142,
455
+ "weight": 2
456
+ },
457
+ {
458
+ "name": "stackexchange_title_body/movies.stackexchange.com.jsonl.gz",
459
+ "lines": 20181,
460
+ "weight": 2
461
+ },
462
+ {
463
+ "name": "stackexchange_TitleBody_Answer/datascience.stackexchange.com.jsonl.gz",
464
+ "lines": 20503,
465
+ "weight": 2
466
+ },
467
+ {
468
+ "name": "stackexchange_Title_Answer/datascience.stackexchange.com.jsonl.gz",
469
+ "lines": 20503,
470
+ "weight": 2
471
+ },
472
+ {
473
+ "name": "stackexchange_title_body/music.stackexchange.com.jsonl.gz",
474
+ "lines": 20636,
475
+ "weight": 2
476
+ },
477
+ {
478
+ "name": "stackexchange_TitleBody_Answer/japanese.stackexchange.com.jsonl.gz",
479
+ "lines": 20948,
480
+ "weight": 2
481
+ },
482
+ {
483
+ "name": "stackexchange_Title_Answer/japanese.stackexchange.com.jsonl.gz",
484
+ "lines": 20948,
485
+ "weight": 2
486
+ },
487
+ {
488
+ "name": "stackexchange_title_body/emacs.stackexchange.com.jsonl.gz",
489
+ "lines": 21055,
490
+ "weight": 2
491
+ },
492
+ {
493
+ "name": "stackexchange_title_body/dsp.stackexchange.com.jsonl.gz",
494
+ "lines": 21252,
495
+ "weight": 2
496
+ },
497
+ {
498
+ "name": "stackexchange_title_body/japanese.stackexchange.com.jsonl.gz",
499
+ "lines": 22056,
500
+ "weight": 2
501
+ },
502
+ {
503
+ "name": "stackexchange_TitleBody_Answer/bitcoin.stackexchange.com.jsonl.gz",
504
+ "lines": 22474,
505
+ "weight": 2
506
+ },
507
+ {
508
+ "name": "stackexchange_Title_Answer/bitcoin.stackexchange.com.jsonl.gz",
509
+ "lines": 22474,
510
+ "weight": 2
511
+ },
512
+ {
513
+ "name": "stackexchange_TitleBody_Answer/cooking.stackexchange.com.jsonl.gz",
514
+ "lines": 22641,
515
+ "weight": 2
516
+ },
517
+ {
518
+ "name": "stackexchange_Title_Answer/cooking.stackexchange.com.jsonl.gz",
519
+ "lines": 22641,
520
+ "weight": 2
521
+ },
522
+ {
523
+ "name": "stackexchange_title_body/mechanics.stackexchange.com.jsonl.gz",
524
+ "lines": 22868,
525
+ "weight": 2
526
+ },
527
+ {
528
+ "name": "stackexchange_TitleBody_Answer/photo.stackexchange.com.jsonl.gz",
529
+ "lines": 23204,
530
+ "weight": 2
531
+ },
532
+ {
533
+ "name": "stackexchange_Title_Answer/photo.stackexchange.com.jsonl.gz",
534
+ "lines": 23204,
535
+ "weight": 2
536
+ },
537
+ {
538
+ "name": "stackexchange_title_body/crypto.stackexchange.com.jsonl.gz",
539
+ "lines": 23231,
540
+ "weight": 2
541
+ },
542
+ {
543
+ "name": "stackexchange_title_body/cooking.stackexchange.com.jsonl.gz",
544
+ "lines": 23705,
545
+ "weight": 2
546
+ },
547
+ {
548
+ "name": "stackexchange_title_body/photo.stackexchange.com.jsonl.gz",
549
+ "lines": 23753,
550
+ "weight": 2
551
+ },
552
+ {
553
+ "name": "stackexchange_TitleBody_Answer/workplace.stackexchange.com.jsonl.gz",
554
+ "lines": 24012,
555
+ "weight": 2
556
+ },
557
+ {
558
+ "name": "stackexchange_Title_Answer/workplace.stackexchange.com.jsonl.gz",
559
+ "lines": 24012,
560
+ "weight": 2
561
+ },
562
+ {
563
+ "name": "stackexchange_TitleBody_Answer/meta.stackoverflow.com.jsonl.gz",
564
+ "lines": 24044,
565
+ "weight": 2
566
+ },
567
+ {
568
+ "name": "stackexchange_Title_Answer/meta.stackoverflow.com.jsonl.gz",
569
+ "lines": 24044,
570
+ "weight": 2
571
+ },
572
+ {
573
+ "name": "stackexchange_TitleBody_Answer/raspberrypi.stackexchange.com.jsonl.gz",
574
+ "lines": 24143,
575
+ "weight": 2
576
+ },
577
+ {
578
+ "name": "stackexchange_Title_Answer/raspberrypi.stackexchange.com.jsonl.gz",
579
+ "lines": 24143,
580
+ "weight": 2
581
+ },
582
+ {
583
+ "name": "stackexchange_title_body/workplace.stackexchange.com.jsonl.gz",
584
+ "lines": 24189,
585
+ "weight": 2
586
+ },
587
+ {
588
+ "name": "stackexchange_title_body/biology.stackexchange.com.jsonl.gz",
589
+ "lines": 24447,
590
+ "weight": 3
591
+ },
592
+ {
593
+ "name": "stackexchange_TitleBody_Answer/webapps.stackexchange.com.jsonl.gz",
594
+ "lines": 24867,
595
+ "weight": 3
596
+ },
597
+ {
598
+ "name": "stackexchange_Title_Answer/webapps.stackexchange.com.jsonl.gz",
599
+ "lines": 24867,
600
+ "weight": 3
601
+ },
602
+ {
603
+ "name": "stackexchange_title_body/bitcoin.stackexchange.com.jsonl.gz",
604
+ "lines": 25374,
605
+ "weight": 3
606
+ },
607
+ {
608
+ "name": "stackexchange_TitleBody_Answer/judaism.stackexchange.com.jsonl.gz",
609
+ "lines": 26085,
610
+ "weight": 3
611
+ },
612
+ {
613
+ "name": "stackexchange_Title_Answer/judaism.stackexchange.com.jsonl.gz",
614
+ "lines": 26085,
615
+ "weight": 3
616
+ },
617
+ {
618
+ "name": "stackexchange_TitleBody_Answer/ethereum.stackexchange.com.jsonl.gz",
619
+ "lines": 26124,
620
+ "weight": 3
621
+ },
622
+ {
623
+ "name": "stackexchange_Title_Answer/ethereum.stackexchange.com.jsonl.gz",
624
+ "lines": 26124,
625
+ "weight": 3
626
+ },
627
+ {
628
+ "name": "stackexchange_TitleBody_Answer/worldbuilding.stackexchange.com.jsonl.gz",
629
+ "lines": 26210,
630
+ "weight": 3
631
+ },
632
+ {
633
+ "name": "stackexchange_Title_Answer/worldbuilding.stackexchange.com.jsonl.gz",
634
+ "lines": 26210,
635
+ "weight": 3
636
+ },
637
+ {
638
+ "name": "stackexchange_title_body/worldbuilding.stackexchange.com.jsonl.gz",
639
+ "lines": 26763,
640
+ "weight": 3
641
+ },
642
+ {
643
+ "name": "stackexchange_TitleBody_Answer/chemistry.stackexchange.com.jsonl.gz",
644
+ "lines": 27061,
645
+ "weight": 3
646
+ },
647
+ {
648
+ "name": "stackexchange_Title_Answer/chemistry.stackexchange.com.jsonl.gz",
649
+ "lines": 27061,
650
+ "weight": 3
651
+ },
652
+ {
653
+ "name": "stackexchange_title_body/datascience.stackexchange.com.jsonl.gz",
654
+ "lines": 27397,
655
+ "weight": 3
656
+ },
657
+ {
658
+ "name": "stackexchange_TitleBody_Answer/graphicdesign.stackexchange.com.jsonl.gz",
659
+ "lines": 28083,
660
+ "weight": 3
661
+ },
662
+ {
663
+ "name": "stackexchange_Title_Answer/graphicdesign.stackexchange.com.jsonl.gz",
664
+ "lines": 28083,
665
+ "weight": 3
666
+ },
667
+ {
668
+ "name": "stackexchange_TitleBody_Answer/ux.stackexchange.com.jsonl.gz",
669
+ "lines": 28901,
670
+ "weight": 3
671
+ },
672
+ {
673
+ "name": "stackexchange_Title_Answer/ux.stackexchange.com.jsonl.gz",
674
+ "lines": 28901,
675
+ "weight": 3
676
+ },
677
+ {
678
+ "name": "stackexchange_title_body/ux.stackexchange.com.jsonl.gz",
679
+ "lines": 29403,
680
+ "weight": 3
681
+ },
682
+ {
683
+ "name": "stackexchange_TitleBody_Answer/money.stackexchange.com.jsonl.gz",
684
+ "lines": 29404,
685
+ "weight": 3
686
+ },
687
+ {
688
+ "name": "stackexchange_Title_Answer/money.stackexchange.com.jsonl.gz",
689
+ "lines": 29404,
690
+ "weight": 3
691
+ },
692
+ {
693
+ "name": "stackexchange_title_body/webapps.stackexchange.com.jsonl.gz",
694
+ "lines": 29697,
695
+ "weight": 3
696
+ },
697
+ {
698
+ "name": "stackexchange_TitleBody_Answer/cs.stackexchange.com.jsonl.gz",
699
+ "lines": 30010,
700
+ "weight": 3
701
+ },
702
+ {
703
+ "name": "stackexchange_Title_Answer/cs.stackexchange.com.jsonl.gz",
704
+ "lines": 30010,
705
+ "weight": 3
706
+ },
707
+ {
708
+ "name": "stackexchange_title_body/graphicdesign.stackexchange.com.jsonl.gz",
709
+ "lines": 30233,
710
+ "weight": 3
711
+ },
712
+ {
713
+ "name": "stackexchange_TitleBody_Answer/webmasters.stackexchange.com.jsonl.gz",
714
+ "lines": 30370,
715
+ "weight": 3
716
+ },
717
+ {
718
+ "name": "stackexchange_Title_Answer/webmasters.stackexchange.com.jsonl.gz",
719
+ "lines": 30370,
720
+ "weight": 3
721
+ },
722
+ {
723
+ "name": "stackexchange_title_body/raspberrypi.stackexchange.com.jsonl.gz",
724
+ "lines": 30625,
725
+ "weight": 3
726
+ },
727
+ {
728
+ "name": "stackexchange_title_body/money.stackexchange.com.jsonl.gz",
729
+ "lines": 32021,
730
+ "weight": 3
731
+ },
732
+ {
733
+ "name": "stackexchange_title_body/judaism.stackexchange.com.jsonl.gz",
734
+ "lines": 32028,
735
+ "weight": 3
736
+ },
737
+ {
738
+ "name": "stackexchange_TitleBody_Answer/academia.stackexchange.com.jsonl.gz",
739
+ "lines": 32137,
740
+ "weight": 3
741
+ },
742
+ {
743
+ "name": "stackexchange_Title_Answer/academia.stackexchange.com.jsonl.gz",
744
+ "lines": 32137,
745
+ "weight": 3
746
+ },
747
+ {
748
+ "name": "stackexchange_title_body/ethereum.stackexchange.com.jsonl.gz",
749
+ "lines": 32760,
750
+ "weight": 3
751
+ },
752
+ {
753
+ "name": "stackexchange_title_body/academia.stackexchange.com.jsonl.gz",
754
+ "lines": 34331,
755
+ "weight": 3
756
+ },
757
+ {
758
+ "name": "stackexchange_title_body/chemistry.stackexchange.com.jsonl.gz",
759
+ "lines": 34506,
760
+ "weight": 3
761
+ },
762
+ {
763
+ "name": "stackexchange_title_body/webmasters.stackexchange.com.jsonl.gz",
764
+ "lines": 34559,
765
+ "weight": 3
766
+ },
767
+ {
768
+ "name": "stackexchange_title_body/meta.stackoverflow.com.jsonl.gz",
769
+ "lines": 36456,
770
+ "weight": 3
771
+ },
772
+ {
773
+ "name": "stackexchange_TitleBody_Answer/travel.stackexchange.com.jsonl.gz",
774
+ "lines": 36533,
775
+ "weight": 4
776
+ },
777
+ {
778
+ "name": "stackexchange_Title_Answer/travel.stackexchange.com.jsonl.gz",
779
+ "lines": 36533,
780
+ "weight": 4
781
+ },
782
+ {
783
+ "name": "stackexchange_TitleBody_Answer/android.stackexchange.com.jsonl.gz",
784
+ "lines": 38077,
785
+ "weight": 4
786
+ },
787
+ {
788
+ "name": "stackexchange_Title_Answer/android.stackexchange.com.jsonl.gz",
789
+ "lines": 38077,
790
+ "weight": 4
791
+ },
792
+ {
793
+ "name": "stackexchange_title_body/cs.stackexchange.com.jsonl.gz",
794
+ "lines": 38314,
795
+ "weight": 4
796
+ },
797
+ {
798
+ "name": "stackexchange_TitleBody_Answer/gamedev.stackexchange.com.jsonl.gz",
799
+ "lines": 40154,
800
+ "weight": 4
801
+ },
802
+ {
803
+ "name": "stackexchange_Title_Answer/gamedev.stackexchange.com.jsonl.gz",
804
+ "lines": 40154,
805
+ "weight": 4
806
+ },
807
+ {
808
+ "name": "stackexchange_TitleBody_Answer/rpg.stackexchange.com.jsonl.gz",
809
+ "lines": 40435,
810
+ "weight": 4
811
+ },
812
+ {
813
+ "name": "stackexchange_Title_Answer/rpg.stackexchange.com.jsonl.gz",
814
+ "lines": 40435,
815
+ "weight": 4
816
+ },
817
+ {
818
+ "name": "stackexchange_title_body/travel.stackexchange.com.jsonl.gz",
819
+ "lines": 41227,
820
+ "weight": 4
821
+ },
822
+ {
823
+ "name": "stackexchange_TitleBody_Answer/codereview.stackexchange.com.jsonl.gz",
824
+ "lines": 41748,
825
+ "weight": 4
826
+ },
827
+ {
828
+ "name": "stackexchange_Title_Answer/codereview.stackexchange.com.jsonl.gz",
829
+ "lines": 41748,
830
+ "weight": 4
831
+ },
832
+ {
833
+ "name": "stackexchange_title_body/rpg.stackexchange.com.jsonl.gz",
834
+ "lines": 42303,
835
+ "weight": 4
836
+ },
837
+ {
838
+ "name": "stackexchange_title_body/codereview.stackexchange.com.jsonl.gz",
839
+ "lines": 45765,
840
+ "weight": 4
841
+ },
842
+ {
843
+ "name": "stackexchange_title_body/gamedev.stackexchange.com.jsonl.gz",
844
+ "lines": 46485,
845
+ "weight": 4
846
+ },
847
+ {
848
+ "name": "stackexchange_TitleBody_Answer/softwareengineering.stackexchange.com.jsonl.gz",
849
+ "lines": 51326,
850
+ "weight": 5
851
+ },
852
+ {
853
+ "name": "stackexchange_Title_Answer/softwareengineering.stackexchange.com.jsonl.gz",
854
+ "lines": 51326,
855
+ "weight": 5
856
+ },
857
+ {
858
+ "name": "stackexchange_TitleBody_Answer/security.stackexchange.com.jsonl.gz",
859
+ "lines": 51355,
860
+ "weight": 5
861
+ },
862
+ {
863
+ "name": "stackexchange_Title_Answer/security.stackexchange.com.jsonl.gz",
864
+ "lines": 51355,
865
+ "weight": 5
866
+ },
867
+ {
868
+ "name": "stackexchange_title_body/android.stackexchange.com.jsonl.gz",
869
+ "lines": 51608,
870
+ "weight": 5
871
+ },
872
+ {
873
+ "name": "stackexchange_TitleBody_Answer/diy.stackexchange.com.jsonl.gz",
874
+ "lines": 52896,
875
+ "weight": 5
876
+ },
877
+ {
878
+ "name": "stackexchange_Title_Answer/diy.stackexchange.com.jsonl.gz",
879
+ "lines": 52896,
880
+ "weight": 5
881
+ },
882
+ {
883
+ "name": "stackexchange_title_body/softwareengineering.stackexchange.com.jsonl.gz",
884
+ "lines": 53942,
885
+ "weight": 5
886
+ },
887
+ {
888
+ "name": "stackexchange_TitleBody_Answer/blender.stackexchange.com.jsonl.gz",
889
+ "lines": 54153,
890
+ "weight": 5
891
+ },
892
+ {
893
+ "name": "stackexchange_Title_Answer/blender.stackexchange.com.jsonl.gz",
894
+ "lines": 54153,
895
+ "weight": 5
896
+ },
897
+ {
898
+ "name": "stackexchange_TitleBody_Answer/scifi.stackexchange.com.jsonl.gz",
899
+ "lines": 54805,
900
+ "weight": 5
901
+ },
902
+ {
903
+ "name": "stackexchange_Title_Answer/scifi.stackexchange.com.jsonl.gz",
904
+ "lines": 54805,
905
+ "weight": 5
906
+ },
907
+ {
908
+ "name": "stackexchange_title_body/security.stackexchange.com.jsonl.gz",
909
+ "lines": 58000,
910
+ "weight": 5
911
+ },
912
+ {
913
+ "name": "stackexchange_TitleBody_Answer/mathematica.stackexchange.com.jsonl.gz",
914
+ "lines": 59895,
915
+ "weight": 5
916
+ },
917
+ {
918
+ "name": "stackexchange_Title_Answer/mathematica.stackexchange.com.jsonl.gz",
919
+ "lines": 59895,
920
+ "weight": 5
921
+ },
922
+ {
923
+ "name": "stackexchange_title_body/diy.stackexchange.com.jsonl.gz",
924
+ "lines": 60083,
925
+ "weight": 5
926
+ },
927
+ {
928
+ "name": "stackexchange_TitleBody_Answer/meta.stackexchange.com.jsonl.gz",
929
+ "lines": 60744,
930
+ "weight": 5
931
+ },
932
+ {
933
+ "name": "stackexchange_Title_Answer/meta.stackexchange.com.jsonl.gz",
934
+ "lines": 60744,
935
+ "weight": 5
936
+ },
937
+ {
938
+ "name": "stackexchange_title_body/scifi.stackexchange.com.jsonl.gz",
939
+ "lines": 61528,
940
+ "weight": 6
941
+ },
942
+ {
943
+ "name": "stackexchange_TitleBody_Answer/drupal.stackexchange.com.jsonl.gz",
944
+ "lines": 67817,
945
+ "weight": 6
946
+ },
947
+ {
948
+ "name": "stackexchange_Title_Answer/drupal.stackexchange.com.jsonl.gz",
949
+ "lines": 67817,
950
+ "weight": 6
951
+ },
952
+ {
953
+ "name": "stackexchange_TitleBody_Answer/dba.stackexchange.com.jsonl.gz",
954
+ "lines": 71449,
955
+ "weight": 6
956
+ },
957
+ {
958
+ "name": "stackexchange_Title_Answer/dba.stackexchange.com.jsonl.gz",
959
+ "lines": 71449,
960
+ "weight": 6
961
+ },
962
+ {
963
+ "name": "stackexchange_title_body/mathematica.stackexchange.com.jsonl.gz",
964
+ "lines": 73131,
965
+ "weight": 7
966
+ },
967
+ {
968
+ "name": "stackexchange_TitleBody_Answer/ell.stackexchange.com.jsonl.gz",
969
+ "lines": 77892,
970
+ "weight": 7
971
+ },
972
+ {
973
+ "name": "stackexchange_Title_Answer/ell.stackexchange.com.jsonl.gz",
974
+ "lines": 77892,
975
+ "weight": 7
976
+ },
977
+ {
978
+ "name": "stackexchange_TitleBody_Answer/magento.stackexchange.com.jsonl.gz",
979
+ "lines": 79241,
980
+ "weight": 7
981
+ },
982
+ {
983
+ "name": "stackexchange_Title_Answer/magento.stackexchange.com.jsonl.gz",
984
+ "lines": 79241,
985
+ "weight": 7
986
+ },
987
+ {
988
+ "name": "stackexchange_title_body/drupal.stackexchange.com.jsonl.gz",
989
+ "lines": 79717,
990
+ "weight": 7
991
+ },
992
+ {
993
+ "name": "stackexchange_TitleBody_Answer/sharepoint.stackexchange.com.jsonl.gz",
994
+ "lines": 80420,
995
+ "weight": 7
996
+ },
997
+ {
998
+ "name": "stackexchange_Title_Answer/sharepoint.stackexchange.com.jsonl.gz",
999
+ "lines": 80420,
1000
+ "weight": 7
1001
+ },
1002
+ {
1003
+ "name": "stackexchange_title_body/blender.stackexchange.com.jsonl.gz",
1004
+ "lines": 80766,
1005
+ "weight": 7
1006
+ },
1007
+ {
1008
+ "name": "stackexchange_title_body/dba.stackexchange.com.jsonl.gz",
1009
+ "lines": 81871,
1010
+ "weight": 7
1011
+ },
1012
+ {
1013
+ "name": "stackexchange_TitleBody_Answer/gaming.stackexchange.com.jsonl.gz",
1014
+ "lines": 82887,
1015
+ "weight": 7
1016
+ },
1017
+ {
1018
+ "name": "stackexchange_Title_Answer/gaming.stackexchange.com.jsonl.gz",
1019
+ "lines": 82887,
1020
+ "weight": 7
1021
+ },
1022
+ {
1023
+ "name": "stackexchange_title_body/ell.stackexchange.com.jsonl.gz",
1024
+ "lines": 83271,
1025
+ "weight": 7
1026
+ },
1027
+ {
1028
+ "name": "stackexchange_title_body/meta.stackexchange.com.jsonl.gz",
1029
+ "lines": 83510,
1030
+ "weight": 7
1031
+ },
1032
+ {
1033
+ "name": "stackexchange_TitleBody_Answer/wordpress.stackexchange.com.jsonl.gz",
1034
+ "lines": 83621,
1035
+ "weight": 7
1036
+ },
1037
+ {
1038
+ "name": "stackexchange_Title_Answer/wordpress.stackexchange.com.jsonl.gz",
1039
+ "lines": 83621,
1040
+ "weight": 7
1041
+ },
1042
+ {
1043
+ "name": "stackexchange_TitleBody_Answer/mathoverflow.net.jsonl.gz",
1044
+ "lines": 85289,
1045
+ "weight": 8
1046
+ },
1047
+ {
1048
+ "name": "stackexchange_Title_Answer/mathoverflow.net.jsonl.gz",
1049
+ "lines": 85289,
1050
+ "weight": 8
1051
+ },
1052
+ {
1053
+ "name": "stackexchange_TitleBody_Answer/salesforce.stackexchange.com.jsonl.gz",
1054
+ "lines": 87272,
1055
+ "weight": 8
1056
+ },
1057
+ {
1058
+ "name": "stackexchange_Title_Answer/salesforce.stackexchange.com.jsonl.gz",
1059
+ "lines": 87272,
1060
+ "weight": 8
1061
+ },
1062
+ {
1063
+ "name": "stackexchange_title_body/gaming.stackexchange.com.jsonl.gz",
1064
+ "lines": 88912,
1065
+ "weight": 8
1066
+ },
1067
+ {
1068
+ "name": "stackexchange_TitleBody_Answer/apple.stackexchange.com.jsonl.gz",
1069
+ "lines": 92487,
1070
+ "weight": 8
1071
+ },
1072
+ {
1073
+ "name": "stackexchange_Title_Answer/apple.stackexchange.com.jsonl.gz",
1074
+ "lines": 92487,
1075
+ "weight": 8
1076
+ },
1077
+ {
1078
+ "name": "stackexchange_title_body/sharepoint.stackexchange.com.jsonl.gz",
1079
+ "lines": 94011,
1080
+ "weight": 8
1081
+ },
1082
+ {
1083
+ "name": "stackexchange_title_body/magento.stackexchange.com.jsonl.gz",
1084
+ "lines": 99991,
1085
+ "weight": 9
1086
+ },
1087
+ {
1088
+ "name": "stackexchange_TitleBody_Answer/gis.stackexchange.com.jsonl.gz",
1089
+ "lines": 100254,
1090
+ "weight": 9
1091
+ },
1092
+ {
1093
+ "name": "stackexchange_Title_Answer/gis.stackexchange.com.jsonl.gz",
1094
+ "lines": 100254,
1095
+ "weight": 9
1096
+ },
1097
+ {
1098
+ "name": "stackexchange_title_body/wordpress.stackexchange.com.jsonl.gz",
1099
+ "lines": 100474,
1100
+ "weight": 9
1101
+ },
1102
+ {
1103
+ "name": "stackexchange_TitleBody_Answer/english.stackexchange.com.jsonl.gz",
1104
+ "lines": 100640,
1105
+ "weight": 9
1106
+ },
1107
+ {
1108
+ "name": "stackexchange_Title_Answer/english.stackexchange.com.jsonl.gz",
1109
+ "lines": 100640,
1110
+ "weight": 9
1111
+ },
1112
+ {
1113
+ "name": "stackexchange_title_body/salesforce.stackexchange.com.jsonl.gz",
1114
+ "lines": 105260,
1115
+ "weight": 9
1116
+ },
1117
+ {
1118
+ "name": "stackexchange_title_body/english.stackexchange.com.jsonl.gz",
1119
+ "lines": 109522,
1120
+ "weight": 10
1121
+ },
1122
+ {
1123
+ "name": "stackexchange_title_body/apple.stackexchange.com.jsonl.gz",
1124
+ "lines": 110622,
1125
+ "weight": 10
1126
+ },
1127
+ {
1128
+ "name": "stackexchange_TitleBody_Answer/stats.stackexchange.com.jsonl.gz",
1129
+ "lines": 115679,
1130
+ "weight": 10
1131
+ },
1132
+ {
1133
+ "name": "stackexchange_Title_Answer/stats.stackexchange.com.jsonl.gz",
1134
+ "lines": 115679,
1135
+ "weight": 10
1136
+ },
1137
+ {
1138
+ "name": "stackexchange_title_body/mathoverflow.net.jsonl.gz",
1139
+ "lines": 120851,
1140
+ "weight": 10
1141
+ },
1142
+ {
1143
+ "name": "stackexchange_TitleBody_Answer/electronics.stackexchange.com.jsonl.gz",
1144
+ "lines": 129494,
1145
+ "weight": 11
1146
+ },
1147
+ {
1148
+ "name": "stackexchange_Title_Answer/electronics.stackexchange.com.jsonl.gz",
1149
+ "lines": 129494,
1150
+ "weight": 11
1151
+ },
1152
+ {
1153
+ "name": "stackexchange_title_body/gis.stackexchange.com.jsonl.gz",
1154
+ "lines": 131000,
1155
+ "weight": 11
1156
+ },
1157
+ {
1158
+ "name": "stackexchange_TitleBody_Answer/physics.stackexchange.com.jsonl.gz",
1159
+ "lines": 141230,
1160
+ "weight": 12
1161
+ },
1162
+ {
1163
+ "name": "stackexchange_Title_Answer/physics.stackexchange.com.jsonl.gz",
1164
+ "lines": 141230,
1165
+ "weight": 12
1166
+ },
1167
+ {
1168
+ "name": "stackexchange_title_body/electronics.stackexchange.com.jsonl.gz",
1169
+ "lines": 143582,
1170
+ "weight": 12
1171
+ },
1172
+ {
1173
+ "name": "stackexchange_TitleBody_Answer/unix.stackexchange.com.jsonl.gz",
1174
+ "lines": 155414,
1175
+ "weight": 13
1176
+ },
1177
+ {
1178
+ "name": "stackexchange_Title_Answer/unix.stackexchange.com.jsonl.gz",
1179
+ "lines": 155414,
1180
+ "weight": 13
1181
+ },
1182
+ {
1183
+ "name": "stackexchange_TitleBody_Answer/tex.stackexchange.com.jsonl.gz",
1184
+ "lines": 171628,
1185
+ "weight": 15
1186
+ },
1187
+ {
1188
+ "name": "stackexchange_Title_Answer/tex.stackexchange.com.jsonl.gz",
1189
+ "lines": 171628,
1190
+ "weight": 15
1191
+ },
1192
+ {
1193
+ "name": "stackexchange_title_body/physics.stackexchange.com.jsonl.gz",
1194
+ "lines": 173307,
1195
+ "weight": 15
1196
+ },
1197
+ {
1198
+ "name": "stackexchange_title_body/stats.stackexchange.com.jsonl.gz",
1199
+ "lines": 173466,
1200
+ "weight": 15
1201
+ },
1202
+ {
1203
+ "name": "stackexchange_title_body/unix.stackexchange.com.jsonl.gz",
1204
+ "lines": 185997,
1205
+ "weight": 16
1206
+ },
1207
+ {
1208
+ "name": "stackexchange_title_body/tex.stackexchange.com.jsonl.gz",
1209
+ "lines": 202954,
1210
+ "weight": 17
1211
+ },
1212
+ {
1213
+ "name": "TriviaQA_pairs.jsonl.gz",
1214
+ "lines": 73346,
1215
+ "weight": 19
1216
+ },
1217
+ {
1218
+ "name": "stackexchange_TitleBody_Answer/serverfault.com.jsonl.gz",
1219
+ "lines": 238507,
1220
+ "weight": 20
1221
+ },
1222
+ {
1223
+ "name": "stackexchange_Title_Answer/serverfault.com.jsonl.gz",
1224
+ "lines": 238507,
1225
+ "weight": 20
1226
+ },
1227
+ {
1228
+ "name": "stackexchange_duplicate_questions_title-body_title-body.jsonl.gz",
1229
+ "lines": 250460,
1230
+ "weight": 21
1231
+ },
1232
+ {
1233
+ "name": "stackexchange_duplicate_questions_body_body.jsonl.gz",
1234
+ "lines": 250519,
1235
+ "weight": 21
1236
+ },
1237
+ {
1238
+ "name": "squad_pairs.jsonl.gz",
1239
+ "lines": 87599,
1240
+ "weight": 22
1241
+ },
1242
+ {
1243
+ "name": "stackexchange_TitleBody_Answer/askubuntu.com.jsonl.gz",
1244
+ "lines": 267135,
1245
+ "weight": 22
1246
+ },
1247
+ {
1248
+ "name": "stackexchange_Title_Answer/askubuntu.com.jsonl.gz",
1249
+ "lines": 267135,
1250
+ "weight": 22
1251
+ },
1252
+ {
1253
+ "name": "stackexchange_title_body/serverfault.com.jsonl.gz",
1254
+ "lines": 270904,
1255
+ "weight": 23
1256
+ },
1257
+ {
1258
+ "name": "NQ-train_pairs.jsonl.gz",
1259
+ "lines": 100231,
1260
+ "weight": 25
1261
+ },
1262
+ {
1263
+ "name": "SimpleWiki.jsonl.gz",
1264
+ "lines": 102225,
1265
+ "weight": 26
1266
+ },
1267
+ {
1268
+ "name": "quora_duplicates_triplets.jsonl.gz",
1269
+ "lines": 103663,
1270
+ "weight": 26
1271
+ },
1272
+ {
1273
+ "name": "stackexchange_duplicate_questions_title_title.jsonl.gz",
1274
+ "lines": 304525,
1275
+ "weight": 26
1276
+ },
1277
+ {
1278
+ "name": "altlex.jsonl.gz",
1279
+ "lines": 112696,
1280
+ "weight": 28
1281
+ },
1282
+ {
1283
+ "name": "stackexchange_title_body/askubuntu.com.jsonl.gz",
1284
+ "lines": 347925,
1285
+ "weight": 29
1286
+ },
1287
+ {
1288
+ "name": "stackexchange_TitleBody_Answer/superuser.com.jsonl.gz",
1289
+ "lines": 352610,
1290
+ "weight": 30
1291
+ },
1292
+ {
1293
+ "name": "stackexchange_Title_Answer/superuser.com.jsonl.gz",
1294
+ "lines": 352610,
1295
+ "weight": 30
1296
+ },
1297
+ {
1298
+ "name": "wikihow.jsonl.gz",
1299
+ "lines": 128542,
1300
+ "weight": 32
1301
+ },
1302
+ {
1303
+ "name": "stackexchange_title_body/superuser.com.jsonl.gz",
1304
+ "lines": 435463,
1305
+ "weight": 36
1306
+ },
1307
+ {
1308
+ "name": "stackexchange_title_body/small_stackexchanges.jsonl.gz",
1309
+ "lines": 448146,
1310
+ "weight": 37
1311
+ },
1312
+ {
1313
+ "name": "stackexchange_TitleBody_Answer/small_stackexchanges.jsonl.gz",
1314
+ "lines": 460256,
1315
+ "weight": 38
1316
+ },
1317
+ {
1318
+ "name": "stackexchange_Title_Answer/small_stackexchanges.jsonl.gz",
1319
+ "lines": 460256,
1320
+ "weight": 38
1321
+ },
1322
+ {
1323
+ "name": "sentence-compression.jsonl.gz",
1324
+ "lines": 180000,
1325
+ "weight": 45
1326
+ },
1327
+ {
1328
+ "name": "AllNLI.jsonl.gz",
1329
+ "lines": 277230,
1330
+ "weight": 69
1331
+ },
1332
+ {
1333
+ "name": "eli5_question_answer.jsonl.gz",
1334
+ "lines": 325475,
1335
+ "weight": 81
1336
+ },
1337
+ {
1338
+ "name": "reddit/reddit_2015.jsonl.gz",
1339
+ "lines": 135108166,
1340
+ "weight": 82
1341
+ },
1342
+ {
1343
+ "name": "reddit/reddit_2016.jsonl.gz",
1344
+ "lines": 159164386,
1345
+ "weight": 82
1346
+ },
1347
+ {
1348
+ "name": "reddit/reddit_2017.jsonl.gz",
1349
+ "lines": 191485219,
1350
+ "weight": 82
1351
+ },
1352
+ {
1353
+ "name": "reddit/reddit_2018.jsonl.gz",
1354
+ "lines": 240726659,
1355
+ "weight": 82
1356
+ },
1357
+ {
1358
+ "name": "stackexchange_TitleBody_Answer/math.stackexchange.com.jsonl.gz",
1359
+ "lines": 1100953,
1360
+ "weight": 83
1361
+ },
1362
+ {
1363
+ "name": "stackexchange_Title_Answer/math.stackexchange.com.jsonl.gz",
1364
+ "lines": 1100953,
1365
+ "weight": 83
1366
+ },
1367
+ {
1368
+ "name": "stackexchange_title_body/math.stackexchange.com.jsonl.gz",
1369
+ "lines": 1338443,
1370
+ "weight": 83
1371
+ },
1372
+ {
1373
+ "name": "stackexchange_TitleBody_Answer/stackoverflow.com-Posts.jsonl.gz",
1374
+ "lines": 15768211,
1375
+ "weight": 83
1376
+ },
1377
+ {
1378
+ "name": "stackexchange_Title_Answer/stackoverflow.com-Posts.jsonl.gz",
1379
+ "lines": 15768211,
1380
+ "weight": 83
1381
+ },
1382
+ {
1383
+ "name": "stackexchange_title_body/stackoverflow.com-Posts.jsonl.gz",
1384
+ "lines": 18562443,
1385
+ "weight": 83
1386
+ },
1387
+ {
1388
+ "name": "specter_train_triples.jsonl.gz",
1389
+ "lines": 684100,
1390
+ "weight": 84
1391
+ },
1392
+ {
1393
+ "name": "S2ORC_title_abstract.jsonl.gz",
1394
+ "lines": 41769185,
1395
+ "weight": 123
1396
+ },
1397
+ {
1398
+ "name": "S2ORC_citation_pairs.jsonl.gz",
1399
+ "lines": 52603982,
1400
+ "weight": 123
1401
+ },
1402
+ {
1403
+ "name": "PAQ_pairs.jsonl.gz",
1404
+ "lines": 64371441,
1405
+ "weight": 123
1406
+ },
1407
+ {
1408
+ "name": "WikiAnswers_pairs.jsonl.gz",
1409
+ "lines": 77427422,
1410
+ "weight": 123
1411
+ },
1412
+ {
1413
+ "name": "S2ORC_citation_pairs_abstract.jsonl.gz",
1414
+ "lines": 116288806,
1415
+ "weight": 123
1416
+ },
1417
+ {
1418
+ "name": "searchQA_question_top5_snippets_merged.jsonl.gz",
1419
+ "lines": 582261,
1420
+ "weight": 144
1421
+ },
1422
+ {
1423
+ "name": "yahoo_answers_title_question.jsonl.gz",
1424
+ "lines": 659896,
1425
+ "weight": 163
1426
+ },
1427
+ {
1428
+ "name": "yahoo_answers_question_answer.jsonl.gz",
1429
+ "lines": 681164,
1430
+ "weight": 169
1431
+ },
1432
+ {
1433
+ "name": "yahoo_answers_title_answer.jsonl.gz",
1434
+ "lines": 1198260,
1435
+ "weight": 247
1436
+ },
1437
+ {
1438
+ "name": "amazon-qa-train-pairs.jsonl.gz",
1439
+ "lines": 2448839,
1440
+ "weight": 247
1441
+ },
1442
+ {
1443
+ "name": "gooaq_pairs.jsonl.gz",
1444
+ "lines": 3012496,
1445
+ "weight": 247
1446
+ },
1447
+ {
1448
+ "name": "msmarco-query_passage_negative.jsonl.gz",
1449
+ "lines": 9144553,
1450
+ "weight": 247
1451
+ }
1452
+ ]
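Each entry in data_config.json pairs a training file with its line count and a sampling weight. The train_script.py included later in this commit expands the weights into a flat list of dataset indices and draws from it with random.choice, so a corpus with weight 82 is sampled roughly ten times as often as one with weight 8. A minimal sketch of that weighted-sampling step, using a hypothetical two-entry config in the same shape:

import random

# Hypothetical miniature config in the same shape as data_config.json
data_config = [
    {"name": "reddit/reddit_2018.jsonl.gz", "lines": 240726659, "weight": 82},
    {"name": "SimpleWiki.jsonl.gz", "lines": 102225, "weight": 26},
]

# Expand weights into repeated indices, mirroring train_script.py
dataset_indices = []
for idx, entry in enumerate(data_config):
    dataset_indices.extend([idx] * entry["weight"])

# Each draw picks a dataset with probability proportional to its weight
picked = data_config[random.choice(dataset_indices)]["name"]
print(picked)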
all-MiniLM-L6-v2/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
3
+ size 90868376
all-MiniLM-L6-v2/modules.json ADDED
@@ -0,0 +1,20 @@
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]
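modules.json declares the three-stage pipeline (Transformer encoder, mean pooling, L2 normalization) that sentence-transformers rebuilds when this folder is loaded. A minimal usage sketch, assuming the vendored ./all-MiniLM-L6-v2 directory from this commit and the sentence-transformers package pinned in requirements.txt:

from sentence_transformers import SentenceTransformer

# Loads the Transformer -> Pooling -> Normalize stack declared in modules.json
model = SentenceTransformer("./all-MiniLM-L6-v2")

# Embeddings come back L2-normalized, 384-dimensional for this model
embeddings = model.encode(["Java Developer Test", "Cognitive Ability Test"])
print(embeddings.shape)  # expected: (2, 384)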
all-MiniLM-L6-v2/onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452
3
+ size 90405214
all-MiniLM-L6-v2/onnx/model_O1.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1391c6fc20b5530250bc15cbe1f47578ffeca55ab0551d335cc668b6299a88ec
3
+ size 90360328
all-MiniLM-L6-v2/onnx/model_O2.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1de3905029190b398c7d300b530e320cf4b5e7d3dfb9af1429ebd73fd9a16faf
3
+ size 90326566
all-MiniLM-L6-v2/onnx/model_O3.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44f671e364dddbac31f203f07b91be6b0a35e51936e5ebfab65b6d9538b83ff
3
+ size 90326497
all-MiniLM-L6-v2/onnx/model_O4.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1667d7f3ba669048b13a96ee3a44456d5e42c8f44588ae8b603430e16160c485
3
+ size 45212349
all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
3
+ size 23026053
all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
3
+ size 23026053
all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
3
+ size 23026053
all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b941bf19f1f1283680f449fa6a7336bb5600bdcd5f84d10ddc5cd72218a0fd21
3
+ size 23046789
all-MiniLM-L6-v2/openvino/openvino_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b86cab4722e2aefab310cf96d4d5a9eb3b187f7d9670a082afc55c7fa0d392a
3
+ size 90265744
all-MiniLM-L6-v2/openvino/openvino_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92ea4af3c6bc7b4a0f3b3d61b147c850f4dbdd7c9e7beee0c0c70dc12da289b
3
+ size 22933664
all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml ADDED
The diff for this file is too large to render. See raw diff
 
all-MiniLM-L6-v2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3a85f238711653950f6a79ece63eb0ea93d76f6a6284be04019c53733baf256
3
+ size 90888945
all-MiniLM-L6-v2/rust_model.ot ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d98d96d278348988f2744e6445b8bc16d921c3f6e17c667362f3cb353007aea
3
+ size 90887379
all-MiniLM-L6-v2/sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "max_seq_length": 256,
3
+ "do_lower_case": false
4
+ }
all-MiniLM-L6-v2/special_tokens_map.json ADDED
@@ -0,0 +1 @@
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
all-MiniLM-L6-v2/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24c06a7429b843d46e40c6b167122053921bf94dce2e5550ea5c07fabc597646
3
+ size 91005696
all-MiniLM-L6-v2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
all-MiniLM-L6-v2/tokenizer_config.json ADDED
@@ -0,0 +1 @@
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "name_or_path": "nreimers/MiniLM-L6-H384-uncased", "do_basic_tokenize": true, "never_split": null, "tokenizer_class": "BertTokenizer", "model_max_length": 512}
all-MiniLM-L6-v2/train_script.py ADDED
@@ -0,0 +1,344 @@
1
+ """
2
+ Train script for a single file
3
+
4
+ Need to set the TPU address first:
5
+ export XRT_TPU_CONFIG="localservice;0;localhost:51011"
6
+ """
7
+
8
+ import torch.multiprocessing as mp
9
+ import threading
10
+ import time
11
+ import random
12
+ import sys
13
+ import argparse
14
+ import gzip
15
+ import json
16
+ import logging
17
+ import tqdm
18
+ import torch
19
+ from torch import nn
20
+ from torch.utils.data import DataLoader
21
+ import torch
22
+ import torch_xla
23
+ import torch_xla.core
24
+ import torch_xla.core.functions
25
+ import torch_xla.core.xla_model as xm
26
+ import torch_xla.distributed.xla_multiprocessing as xmp
27
+ import torch_xla.distributed.parallel_loader as pl
28
+ import os
29
+ from shutil import copyfile
30
+
31
+
32
+ from transformers import (
33
+ AdamW,
34
+ AutoModel,
35
+ AutoTokenizer,
36
+ get_linear_schedule_with_warmup,
37
+ set_seed,
38
+ )
39
+
40
+ class AutoModelForSentenceEmbedding(nn.Module):
41
+ def __init__(self, model_name, tokenizer, normalize=True):
42
+ super(AutoModelForSentenceEmbedding, self).__init__()
43
+
44
+ self.model = AutoModel.from_pretrained(model_name)
45
+ self.normalize = normalize
46
+ self.tokenizer = tokenizer
47
+
48
+ def forward(self, **kwargs):
49
+ model_output = self.model(**kwargs)
50
+ embeddings = self.mean_pooling(model_output, kwargs['attention_mask'])
51
+ if self.normalize:
52
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
53
+
54
+ return embeddings
55
+
56
+ def mean_pooling(self, model_output, attention_mask):
57
+ token_embeddings = model_output[0] # First element of model_output contains all token embeddings
58
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
59
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
60
+
61
+ def save_pretrained(self, output_path):
62
+ if xm.is_master_ordinal():
63
+ self.tokenizer.save_pretrained(output_path)
64
+ self.model.config.save_pretrained(output_path)
65
+
66
+ xm.save(self.model.state_dict(), os.path.join(output_path, "pytorch_model.bin"))
67
+
68
+
69
+
70
+
71
+ def train_function(index, args, queue):
72
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
73
+ model = AutoModelForSentenceEmbedding(args.model, tokenizer)
74
+
75
+
76
+ ### Train Loop
77
+ device = xm.xla_device()
78
+ model = model.to(device)
79
+
80
+ # Instantiate optimizer
81
+ optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=True)
82
+
83
+ lr_scheduler = get_linear_schedule_with_warmup(
84
+ optimizer=optimizer,
85
+ num_warmup_steps=500,
86
+ num_training_steps=args.steps,
87
+ )
88
+
89
+ # Now we train the model
90
+ cross_entropy_loss = nn.CrossEntropyLoss()
91
+ max_grad_norm = 1
92
+
93
+ model.train()
94
+
95
+ for global_step in tqdm.trange(args.steps, disable=not xm.is_master_ordinal()):
96
+ #### Get the batch data
97
+ batch = queue.get()
98
+ #print(index, "batch {}x{}".format(len(batch), ",".join([str(len(b)) for b in batch])))
99
+
100
+
101
+ if len(batch[0]) == 2: #(anchor, positive)
102
+ text1 = tokenizer([b[0] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
103
+ text2 = tokenizer([b[1] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
104
+
105
+ ### Compute embeddings
106
+ embeddings_a = model(**text1.to(device))
107
+ embeddings_b = model(**text2.to(device))
108
+
109
+ ### Gather all embedings
110
+ embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
111
+ embeddings_b = torch_xla.core.functions.all_gather(embeddings_b)
112
+
113
+ ### Compute similarity scores 512 x 512
114
+ scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale
115
+
116
+ ### Compute cross-entropy loss
117
+ labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device) # Example a[i] should match with b[i]
118
+
119
+ ## Symmetric loss as in CLIP
120
+ loss = (cross_entropy_loss(scores, labels) + cross_entropy_loss(scores.transpose(0, 1), labels)) / 2
121
+
122
+ else: #(anchor, positive, negative)
123
+ text1 = tokenizer([b[0] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
124
+ text2 = tokenizer([b[1] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
125
+ text3 = tokenizer([b[2] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
126
+
127
+ embeddings_a = model(**text1.to(device))
128
+ embeddings_b1 = model(**text2.to(device))
129
+ embeddings_b2 = model(**text3.to(device))
130
+
131
+ embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
132
+ embeddings_b1 = torch_xla.core.functions.all_gather(embeddings_b1)
133
+ embeddings_b2 = torch_xla.core.functions.all_gather(embeddings_b2)
134
+
135
+ embeddings_b = torch.cat([embeddings_b1, embeddings_b2])
136
+
137
+ ### Compute similarity scores 512 x 1024
138
+ scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale
139
+
140
+ ### Compute cross-entropy loss
141
+ labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device) # Example a[i] should match with b[i]
142
+
143
+ ## One-way loss
144
+ loss = cross_entropy_loss(scores, labels)
145
+
146
+
147
+ # Backward pass
148
+ optimizer.zero_grad()
149
+ loss.backward()
150
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
151
+
152
+ xm.optimizer_step(optimizer, barrier=True)
153
+ lr_scheduler.step()
154
+
155
+
156
+ #Save model
157
+ if (global_step+1) % args.save_steps == 0:
158
+ output_path = os.path.join(args.output, str(global_step+1))
159
+ xm.master_print("save model: "+output_path)
160
+ model.save_pretrained(output_path)
161
+
162
+
163
+ output_path = os.path.join(args.output, "final")
164
+ xm.master_print("save model final: "+ output_path)
165
+ model.save_pretrained(output_path)
166
+
167
+
168
+ def produce_data(args, queue, filepaths, dataset_indices):
169
+ global_batch_size = args.batch_size*args.nprocs #Global batch size
170
+ size_per_dataset = int(global_batch_size / args.datasets_per_batch) #How many datasets per batch
171
+ num_same_dataset = int(size_per_dataset / args.batch_size)
172
+ print("producer", "global_batch_size", global_batch_size)
173
+ print("producer", "size_per_dataset", size_per_dataset)
174
+ print("producer", "num_same_dataset", num_same_dataset)
175
+
176
+ datasets = []
177
+ for filepath in filepaths:
178
+ if "reddit_" in filepath: #Special dataset class for Reddit files
179
+ data_obj = RedditDataset(filepath)
180
+ else:
181
+ data_obj = Dataset(filepath)
182
+ datasets.append(iter(data_obj))
183
+
184
+ # Store if dataset is in a 2 col or 3 col format
185
+ num_cols = {idx: len(next(dataset)) for idx, dataset in enumerate(datasets)}
186
+
187
+ while True:
188
+ texts_in_batch = set()
189
+ batch_format = None #2 vs 3 col format for this batch
190
+
191
+ #Add data from several sub datasets
192
+ for _ in range(args.datasets_per_batch):
193
+ valid_dataset = False #Check that datasets have the same 2/3 col format
194
+ while not valid_dataset:
195
+ data_idx = random.choice(dataset_indices)
196
+ if batch_format is None:
197
+ batch_format = num_cols[data_idx]
198
+ valid_dataset = True
199
+ else: #Check that this dataset has the same format
200
+ valid_dataset = (batch_format == num_cols[data_idx])
201
+
202
+ #Get data from this dataset
203
+ dataset = datasets[data_idx]
204
+ for _ in range(num_same_dataset):
205
+ for _ in range(args.nprocs):
206
+ batch_device = [] #A batch for one device
207
+ while len(batch_device) < args.batch_size:
208
+ sample = next(dataset)
209
+ in_batch = False
210
+ for text in sample:
211
+ if text in texts_in_batch:
212
+ in_batch = True
213
+ break
214
+
215
+ if not in_batch:
216
+ for text in sample:
217
+ texts_in_batch.add(text)
218
+ batch_device.append(sample)
219
+
220
+ queue.put(batch_device)
221
+
222
+
223
+ class RedditDataset:
224
+ """
225
+ A class that handles the reddit data files
226
+ """
227
+ def __init__(self, filepath):
228
+ self.filepath = filepath
229
+
230
+ def __iter__(self):
231
+ while True:
232
+ with gzip.open(self.filepath, "rt") as fIn:
233
+ for line in fIn:
234
+ data = json.loads(line)
235
+
236
+ if "response" in data and "context" in data:
237
+ yield [data["response"], data["context"]]
238
+
239
+ class Dataset:
240
+ """
241
+ A class that handles one dataset
242
+ """
243
+ def __init__(self, filepath):
244
+ self.filepath = filepath
245
+
246
+ def __iter__(self):
247
+ max_dataset_size = 10*1000*1000 #Cache small datasets in memory
248
+ dataset = []
249
+ data_format = None
250
+
251
+ while dataset is None or len(dataset) == 0:
252
+ with gzip.open(self.filepath, "rt") as fIn:
253
+ for line in fIn:
254
+ data = json.loads(line)
255
+ if isinstance(data, dict):
256
+ data = data['texts']
257
+
258
+ if data_format is None:
259
+ data_format = len(data)
260
+
261
+ #Ensure that all entries are of the same 2/3 col format
262
+ assert len(data) == data_format
263
+
264
+ if dataset is not None:
265
+ dataset.append(data)
266
+ if len(dataset) >= max_dataset_size:
267
+ dataset = None
268
+
269
+ yield data
270
+
271
+ # Data loaded. Now stream to the queue
272
+ # Shuffle for each epoch
273
+ while True:
274
+ random.shuffle(dataset)
275
+ for data in dataset:
276
+ yield data
277
+
278
+
279
+
280
+ if __name__ == "__main__":
281
+ parser = argparse.ArgumentParser()
282
+ parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
283
+ parser.add_argument('--steps', type=int, default=2000)
284
+ parser.add_argument('--save_steps', type=int, default=10000)
285
+ parser.add_argument('--batch_size', type=int, default=64)
286
+ parser.add_argument('--max_length', type=int, default=128)
287
+ parser.add_argument('--nprocs', type=int, default=8)
288
+ parser.add_argument('--datasets_per_batch', type=int, default=2, help="Number of datasets per batch")
289
+ parser.add_argument('--scale', type=float, default=20, help="Use 20 for cossim, and 1 when you work with unnormalized embeddings with dot product")
290
+ parser.add_argument('--data_folder', default="/data", help="Folder with your dataset files")
291
+ parser.add_argument('data_config', help="A data_config.json file")
292
+ parser.add_argument('output')
293
+ args = parser.parse_args()
294
+
295
+ # Ensure global batch size is divisble by data_sample_size
296
+ assert (args.batch_size*args.nprocs) % args.datasets_per_batch == 0
297
+
298
+ logging.info("Output: "+args.output)
299
+ if os.path.exists(args.output):
300
+ print("Output folder already exists.")
301
+ input("Continue?")
302
+
303
+ # Write train script to output path
304
+ os.makedirs(args.output, exist_ok=True)
305
+
306
+ data_config_path = os.path.join(args.output, 'data_config.json')
307
+ copyfile(args.data_config, data_config_path)
308
+
309
+ train_script_path = os.path.join(args.output, 'train_script.py')
310
+ copyfile(__file__, train_script_path)
311
+ with open(train_script_path, 'a') as fOut:
312
+ fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
313
+
314
+
315
+
316
+ #Load data config
317
+ with open(args.data_config) as fIn:
318
+ data_config = json.load(fIn)
319
+
320
+ queue = mp.Queue(maxsize=100*args.nprocs)
321
+
322
+ filepaths = []
323
+ dataset_indices = []
324
+ for idx, data in enumerate(data_config):
325
+ filepaths.append(os.path.join(os.path.expanduser(args.data_folder), data['name']))
326
+ dataset_indices.extend([idx]*data['weight'])
327
+
328
+ # Start producer
329
+ p = mp.Process(target=produce_data, args=(args, queue, filepaths, dataset_indices))
330
+ p.start()
331
+
332
+ # Run training
333
+ print("Start processes:", args.nprocs)
334
+ xmp.spawn(train_function, args=(args, queue), nprocs=args.nprocs, start_method='fork')
335
+ print("Training done")
336
+ print("It might be that not all processes exit automatically. In that case you must manually kill this process.")
337
+ print("With 'pkill python' you can kill all remaining python processes")
338
+ p.kill()
339
+ exit()
340
+
341
+
342
+
343
+ # Script was called via:
344
+ #python train_many_data_files_v2.py --steps 1000000 --batch_size 128 --model nreimers/MiniLM-L6-H384-uncased train_data_configs/all_datasets_v4.json output/all_datasets_v4_MiniLM-L6-H384-uncased-batch128
all-MiniLM-L6-v2/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
endpoint.py ADDED
@@ -0,0 +1,231 @@
1
+ from flask import Flask, request, jsonify
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from llama_index.core import VectorStoreIndex, Settings, StorageContext, load_index_from_storage
5
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
6
+ from llama_index.llms.groq import Groq
7
+ import pandas as pd
8
+ from llama_index.core import Document
9
+
10
+ app = Flask(__name__)
11
+
12
+ # --- Configuration ---
13
+ PERSIST_DIR = "./storage"
14
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
15
+ LLM_MODEL = "llama3-8b-8192"
16
+ CSV_FILE_PATH = "shl_assessments.csv"
17
+
18
+ # --- Root Route (for health check) ---
19
+ @app.route("/", methods=["GET"])
20
+ def home():
21
+ return "🧠 SHL Chatbot API is running!", 200
22
+
23
+ # --- Utility Functions ---
24
+ def load_groq_llm():
25
+ load_dotenv()
26
+ api_key = os.getenv("GROQ_API_KEY")
27
+ if not api_key:
28
+ raise ValueError("GROQ_API_KEY not found in .env file or environment variables")
29
+ return Groq(model=LLM_MODEL, api_key=api_key, temperature=0.1)
30
+
31
+ def load_embeddings():
32
+ return HuggingFaceEmbedding(model_name=EMBED_MODEL)
33
+
34
+ def load_data_from_csv(csv_path):
35
+ try:
36
+ df = pd.read_csv(csv_path)
37
+ required_columns = ["Assessment Name", "URL", "Remote Testing Support",
38
+ "Adaptive/IRT Support", "Duration (min)", "Test Type"]
39
+ if not all(col in df.columns for col in required_columns):
40
+ raise ValueError(f"CSV file must contain columns: {', '.join(required_columns)}")
41
+ return df.to_dict(orient="records")
42
+ except FileNotFoundError:
43
+ raise FileNotFoundError(f"CSV file not found at {csv_path}")
44
+ except Exception as e:
45
+ raise Exception(f"Error reading CSV: {e}")
46
+
47
+ def build_index(data):
48
+ Settings.embed_model = load_embeddings()
49
+ Settings.llm = load_groq_llm()
50
+ documents = [
51
+ Document(text=f"Name: {item['Assessment Name']}, URL: {item['URL']}, Remote Testing: {item['Remote Testing Support']}, Adaptive/IRT: {item['Adaptive/IRT Support']}, Duration: {item['Duration (min)']}, Type: {item['Test Type']}")
52
+ for item in data
53
+ ]
54
+ index = VectorStoreIndex.from_documents(documents)
55
+ index.storage_context.persist(persist_dir=PERSIST_DIR)
56
+ return index
57
+
58
+ def load_chat_engine():
59
+ if not os.path.exists(PERSIST_DIR):
60
+ return None
61
+ Settings.embed_model = load_embeddings()
62
+ Settings.llm = load_groq_llm()
63
+ storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
64
+ index = load_index_from_storage(storage_context)
65
+ return index.as_chat_engine(chat_mode="context", verbose=True)
66
+
67
+ # --- Load or Build Index ---
68
+ try:
69
+ chat_engine = load_chat_engine()
70
+ if chat_engine is None:
71
+ assessment_data = load_data_from_csv(CSV_FILE_PATH)
72
+ build_index(assessment_data)
73
+ chat_engine = load_chat_engine()
74
+ except Exception as e:
75
+ print(f"❌ Error initializing chat engine: {e}")
76
+ chat_engine = None
77
+
78
+ # --- Endpoint ---
79
+ @app.route("/assessments", methods=["POST"])
80
+ def get_assessments():
81
+ data = request.get_json()
82
+ query = data.get("query")
83
+
84
+ if not query:
85
+ return jsonify({"error": "No query provided"}), 400
86
+
87
+ if chat_engine:
88
+ try:
89
+ response = chat_engine.chat(query)
90
+ results = []
91
+
92
+ for node in response.source_nodes:
93
+ try:
94
+ parts = node.node.text.split(", ")
95
+ results.append({
96
+ "assessment_name": parts[0].split(": ")[1] if len(parts) > 0 else "N/A",
97
+ "assessment_url": parts[1].split(": ")[1] if len(parts) > 1 else "N/A",
98
+ "remote_testing_support": parts[2].split(": ")[1] if len(parts) > 2 else "N/A",
99
+ "adaptive_irt_support": parts[3].split(": ")[1] if len(parts) > 3 else "N/A",
100
+ "duration": parts[4].split(": ")[1] if len(parts) > 4 else "N/A",
101
+ "test_type": parts[5].split(": ")[1] if len(parts) > 5 else "N/A"
102
+ })
103
+ except Exception:
104
+ results.append({"error": "Error parsing assessment info"})
105
+
106
+ return jsonify({"query": query, "response": results}), 200
107
+
108
+ except Exception as e:
109
+ return jsonify({"error": f"Chat processing error: {e}"}), 500
110
+ else:
111
+ return jsonify({"error": "Chat engine not initialized"}), 500
112
+
113
+ # --- Entry Point for Local Debugging ---
114
+ if __name__ == "__main__":
115
+ app.run(host="0.0.0.0", port=10000)
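For local testing, the /assessments route can be exercised with a small client. A hedged sketch, assuming the Flask app is running on localhost:10000 as in the __main__ block above and that the requests package is installed (it is not pinned in requirements.txt):

import requests

# Hypothetical local call; host and port follow app.run(host="0.0.0.0", port=10000)
resp = requests.post(
    "http://localhost:10000/assessments",
    json={"query": "remote technical assessments under 30 minutes"},
    timeout=30,
)
resp.raise_for_status()
for item in resp.json()["response"]:
    print(item.get("assessment_name"), item.get("assessment_url"))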
main.py ADDED
@@ -0,0 +1,199 @@
1
+ import os
2
+ import shutil
3
+ import streamlit as st
4
+ from dotenv import load_dotenv
5
+ from llama_index.core import (
6
+ VectorStoreIndex,
7
+ Settings,
8
+ StorageContext,
9
+ load_index_from_storage,
10
+ )
11
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
12
+ from llama_index.llms.groq import Groq
13
+ import pandas as pd
14
+ from llama_index.core import Document
15
+
16
+
17
+ PERSIST_DIR = "./storage"
18
+ EMBED_MODEL = "./all-MiniLM-L6-v2"
+ LLM_MODEL = "llama3-8b-8192"
+ CSV_FILE_PATH = "shl_assessments.csv"
+ GROQ_API_KEY = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")
24
+
25
+
26
+ def load_data_from_csv(csv_path):
27
+ """Loads assessment data from a CSV file."""
28
+ try:
29
+ df = pd.read_csv(csv_path)
30
+ required_columns = ["Assessment Name", "URL", "Remote Testing Support",
31
+ "Adaptive/IRT Support", "Duration (min)", "Test Type"]
32
+ if not all(col in df.columns for col in required_columns):
33
+ raise ValueError(f"CSV file must contain columns: {', '.join(required_columns)}")
34
+ return df.to_dict(orient="records")
35
+ except FileNotFoundError:
36
+ raise FileNotFoundError(f"Error: CSV file not found at {csv_path}")
37
+ except ValueError as e:
38
+ raise ValueError(f"Error reading CSV: {e}")
39
+ except Exception as e:
40
+ raise Exception(f"An unexpected error occurred while loading CSV data: {e}")
41
+
42
+
43
+ def load_groq_llm():
44
+ try:
45
+ api_key = st.secrets.get("GROQ_API_KEY") or os.getenv("GROQ_API_KEY")
+ except KeyError:
+ raise ValueError("GROQ_API_KEY not found in Streamlit secrets.")
+ if not api_key:
+ raise ValueError("GROQ_API_KEY not found in Streamlit secrets or environment variables.")
49
+
50
+ return Groq(model=LLM_MODEL, api_key=api_key, temperature=0.1)
51
+
52
+
53
+
54
+ def load_embeddings():
55
+ return HuggingFaceEmbedding(model_name=EMBED_MODEL)
57
+
58
+ def build_index(data):
59
+ """Builds the vector index from the provided assessment data."""
60
+ Settings.embed_model = load_embeddings()
62
+ Settings.llm = load_groq_llm()
63
+
64
+ documents = [Document(text=f"Name: {item['Assessment Name']}, URL: {item['URL']}, Remote Testing: {item['Remote Testing Support']}, Adaptive/IRT: {item['Adaptive/IRT Support']}, Duration: {item['Duration (min)']}, Type: {item['Test Type']}") for item in data]
65
+
66
+ index = VectorStoreIndex.from_documents(documents)
67
+ index.storage_context.persist(persist_dir=PERSIST_DIR)
68
+ return index
69
+
70
+ def load_chat_engine():
71
+ """Loads the chat engine from the persisted index."""
72
+ if not os.path.exists(PERSIST_DIR):
73
+ return None
74
+
75
+ Settings.embed_model = load_embeddings()
76
+ Settings.llm = load_groq_llm()
77
+ storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
78
+ index = load_index_from_storage(storage_context)
79
+ return index.as_chat_engine(chat_mode="context", verbose=True)
80
+
81
+ def reset_index():
82
+ """Resets the persisted index and chat history."""
83
+ try:
84
+ shutil.rmtree(PERSIST_DIR, ignore_errors=True)
85
+ st.success("Knowledge index reset successfully!")
86
+ st.session_state.messages = [{"role": "assistant", "content": "Hello! I'm your SHL assessment assistant. How can I help you?"}]
87
+ st.session_state["index_built"] = False
88
+ if 'chat_engine' in st.session_state:
89
+ del st.session_state['chat_engine']
90
+ return None
91
+ except Exception as e:
92
+ st.error(f"Error resetting index: {str(e)}")
93
+ return None
94
+
95
+ def main():
96
+ st.set_page_config(
97
+ page_title="SHL Assessment Chatbot",
98
+ layout="wide",
99
+ initial_sidebar_state="collapsed"
100
+ )
101
+
102
+ st.markdown("""
103
+ <style>
104
+ :root {
105
+ --primary: #6eb5ff;
106
+ --background: #000000;
107
+ --card: #f0f2f6;
108
+ --text: #ffffff;
109
+ }
+ .stApp {
+ background-color: var(--background) !important;
+ color: var(--text) !important;
+ }
+ .stMarkdown, .stTextInput, .stChatMessage, .stChatInputContainer, .css-10trblm, .css-1cpxqw2 {
+ color: var(--text) !important;
+ }
134
+ </style>
135
+ """, unsafe_allow_html=True)
136
+
137
+ load_dotenv()
138
+ os.environ["STREAMLIT_SERVER_ENABLE_FILE_WATCHER"] = "false"
139
+ os.environ["TORCH_DISABLE_STREAMLIT_WATCHER"] = "1"
140
+ os.environ["LLAMA_INDEX_DISABLE_OPENAI"] = "1"
141
+
142
+ if "messages" not in st.session_state:
143
+ st.session_state.messages = [{
144
+ "role": "assistant",
145
+ "content": "Hello! I'm your SHL assessment assistant. How can I help you?"
146
+ }]
151
+ if "index_built" not in st.session_state:
152
+ st.session_state["index_built"] = False
153
+
154
+
155
+
156
+
157
+ if not st.session_state["index_built"]:
158
+ try:
159
+ with st.spinner("Loading data and building index..."):
160
+ assessment_data = load_data_from_csv(CSV_FILE_PATH)
161
+ if assessment_data:
162
+ build_index(assessment_data)
163
+ st.session_state['chat_engine'] = load_chat_engine()
164
+ st.session_state["index_built"] = True
165
+ else:
166
+ st.error("Failed to load assessment data. Please check the CSV file.")
167
+ except Exception as e:
168
+ st.error(f"Error initializing application: {e}")
169
+
170
+ # --- Chat Interface ---
171
+ chat_engine = st.session_state.get('chat_engine')
172
+ if chat_engine:
173
+ for msg in st.session_state.messages:
174
+ icon = "🤖" if msg["role"] == "assistant" else "👤"
175
+ with st.chat_message(msg["role"]):
176
+ st.markdown(f"<span style='color: white;'>{icon} {msg['content']}</span>", unsafe_allow_html=True)
177
+
178
+ if prompt := st.chat_input("Ask me about SHL assessments..."):
179
+ st.session_state.messages.append({"role": "user", "content": prompt})
180
+ with st.chat_message("user"):
181
+ st.markdown(f"<span style='color: white;'>👤 {prompt}</span>", unsafe_allow_html=True)
182
+
183
+
184
+ with st.chat_message("assistant"):
185
+ try:
186
+ # Add formatting instructions to the prompt
187
+ formatted_prompt = f"{prompt}. Please provide a list of all matching SHL assessments (minimum 1, maximum 10). For each assessment, include the following details: Assessment Name: [Name], URL: [URL], Remote Testing Support: [Yes/No], Adaptive/IRT Support: [Yes/No], Duration: [Duration], Test Type: [Type]. If there are no matching assessments, please state that."
188
+ response = chat_engine.chat(formatted_prompt)
189
+ st.markdown(f"<span style='color: white;'>🤖 {response.response}</span>", unsafe_allow_html=True)
190
+ st.session_state.messages.append({"role": "assistant", "content": response.response})
191
+ except Exception as e:
192
+ st.error(f"An error occurred during chat: {e}")
193
+
194
+ else:
195
+ st.info("💬 Chat is ready! Ask me anything about SHL assessments.")
196
+
197
+ if __name__ == "__main__":
198
+ main()
199
+
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ streamlit==1.44.1
2
+ llama-index==0.12.28
3
+ llama-index-core==0.12.28
4
+ llama-index-embeddings-huggingface==0.5.2
5
+ llama-index-llms-groq==0.3.1
6
+ pandas==2.2.3
7
+ python-dotenv==1.1.0
8
+ sentence-transformers==4.0.2
9
+ groq==0.22.0
11
+ Flask
12
+ gunicorn
shl_assessments.csv ADDED
@@ -0,0 +1,101 @@
1
+ Assessment Name,URL,Remote Testing Support,Adaptive/IRT Support,Duration (min),Test Type
2
+ Cognitive Ability Test,https://www.shl.com/solutions/products/cognitive-ability-test/,Yes,Yes,20,Cognitive
3
+ Personality Profiler,https://www.shl.com/solutions/products/personality-profiler/,No,No,25,Personality
4
+ Java Developer Test,https://www.shl.com/solutions/products/java-developer-test/,Yes,No,30,Technical
5
+ Team Collaboration Assessment,https://www.shl.com/solutions/products/team-collaboration-assessment/,No,Yes,35,Behavioral
6
+ SQL Proficiency Test,https://www.shl.com/solutions/products/sql-proficiency-test/,Yes,No,40,Aptitude
7
+ Leadership Style Inventory,https://www.shl.com/solutions/products/leadership-style-inventory/,No,No,45,Situational
8
+ Customer Service Aptitude,https://www.shl.com/solutions/products/customer-service-aptitude/,Yes,Yes,20,Reasoning
9
+ Verbal Reasoning Test,https://www.shl.com/solutions/products/verbal-reasoning-test/,No,No,25,Communication
10
+ Numerical Reasoning Test,https://www.shl.com/solutions/products/numerical-reasoning-test/,Yes,No,30,Cognitive
11
+ Critical Thinking Assessment,https://www.shl.com/solutions/products/critical-thinking-assessment/,No,Yes,35,Personality
12
+ Python Coding Test,https://www.shl.com/solutions/products/python-coding-test/,Yes,No,40,Technical
13
+ Emotional Intelligence Survey,https://www.shl.com/solutions/products/emotional-intelligence-survey/,No,No,45,Behavioral
14
+ Situational Judgment Test,https://www.shl.com/solutions/products/situational-judgment-test/,Yes,Yes,20,Aptitude
15
+ Time Management Evaluation,https://www.shl.com/solutions/products/time-management-evaluation/,No,No,25,Situational
16
+ Data Analysis Assessment,https://www.shl.com/solutions/products/data-analysis-assessment/,Yes,No,30,Reasoning
17
+ Software Engineering Exam,https://www.shl.com/solutions/products/software-engineering-exam/,No,Yes,35,Communication
18
+ Finance & Accounting Aptitude,https://www.shl.com/solutions/products/finance-and-accounting-aptitude/,Yes,No,40,Cognitive
19
+ Logical Reasoning Test,https://www.shl.com/solutions/products/logical-reasoning-test/,No,No,45,Personality
20
+ Communication Skills Test,https://www.shl.com/solutions/products/communication-skills-test/,Yes,Yes,20,Technical
21
+ Machine Learning Quiz,https://www.shl.com/solutions/products/machine-learning-quiz/,No,No,25,Behavioral
22
+ Cognitive Ability Test,https://www.shl.com/solutions/products/cognitive-ability-test/,Yes,No,30,Aptitude
23
+ Personality Profiler,https://www.shl.com/solutions/products/personality-profiler/,No,Yes,35,Situational
24
+ Java Developer Test,https://www.shl.com/solutions/products/java-developer-test/,Yes,No,40,Reasoning
25
+ Team Collaboration Assessment,https://www.shl.com/solutions/products/team-collaboration-assessment/,No,No,45,Communication
26
+ SQL Proficiency Test,https://www.shl.com/solutions/products/sql-proficiency-test/,Yes,Yes,20,Cognitive
27
+ Leadership Style Inventory,https://www.shl.com/solutions/products/leadership-style-inventory/,No,No,25,Personality
28
+ Customer Service Aptitude,https://www.shl.com/solutions/products/customer-service-aptitude/,Yes,No,30,Technical
29
+ Verbal Reasoning Test,https://www.shl.com/solutions/products/verbal-reasoning-test/,No,Yes,35,Behavioral
30
+ Numerical Reasoning Test,https://www.shl.com/solutions/products/numerical-reasoning-test/,Yes,No,40,Aptitude
31
+ Critical Thinking Assessment,https://www.shl.com/solutions/products/critical-thinking-assessment/,No,No,45,Situational
32
+ Python Coding Test,https://www.shl.com/solutions/products/python-coding-test/,Yes,Yes,20,Reasoning
33
+ Emotional Intelligence Survey,https://www.shl.com/solutions/products/emotional-intelligence-survey/,No,No,25,Communication
34
+ Situational Judgment Test,https://www.shl.com/solutions/products/situational-judgment-test/,Yes,No,30,Cognitive
35
+ Time Management Evaluation,https://www.shl.com/solutions/products/time-management-evaluation/,No,Yes,35,Personality
36
+ Data Analysis Assessment,https://www.shl.com/solutions/products/data-analysis-assessment/,Yes,No,40,Technical
37
+ Software Engineering Exam,https://www.shl.com/solutions/products/software-engineering-exam/,No,No,45,Behavioral
38
+ Finance & Accounting Aptitude,https://www.shl.com/solutions/products/finance-and-accounting-aptitude/,Yes,Yes,20,Aptitude
39
+ Logical Reasoning Test,https://www.shl.com/solutions/products/logical-reasoning-test/,No,No,25,Situational
40
+ Communication Skills Test,https://www.shl.com/solutions/products/communication-skills-test/,Yes,No,30,Reasoning
41
+ Machine Learning Quiz,https://www.shl.com/solutions/products/machine-learning-quiz/,No,Yes,35,Communication
42
+ Cognitive Ability Test,https://www.shl.com/solutions/products/cognitive-ability-test/,Yes,No,40,Cognitive
43
+ Personality Profiler,https://www.shl.com/solutions/products/personality-profiler/,No,No,45,Personality
44
+ Java Developer Test,https://www.shl.com/solutions/products/java-developer-test/,Yes,Yes,20,Technical
45
+ Team Collaboration Assessment,https://www.shl.com/solutions/products/team-collaboration-assessment/,No,No,25,Behavioral
46
+ SQL Proficiency Test,https://www.shl.com/solutions/products/sql-proficiency-test/,Yes,No,30,Aptitude
47
+ Leadership Style Inventory,https://www.shl.com/solutions/products/leadership-style-inventory/,No,Yes,35,Situational
48
+ Customer Service Aptitude,https://www.shl.com/solutions/products/customer-service-aptitude/,Yes,No,40,Reasoning
49
+ Verbal Reasoning Test,https://www.shl.com/solutions/products/verbal-reasoning-test/,No,No,45,Communication
50
+ Numerical Reasoning Test,https://www.shl.com/solutions/products/numerical-reasoning-test/,Yes,Yes,20,Cognitive
51
+ Critical Thinking Assessment,https://www.shl.com/solutions/products/critical-thinking-assessment/,No,No,25,Personality
52
+ Python Coding Test,https://www.shl.com/solutions/products/python-coding-test/,Yes,No,30,Technical
53
+ Emotional Intelligence Survey,https://www.shl.com/solutions/products/emotional-intelligence-survey/,No,Yes,35,Behavioral
54
+ Situational Judgment Test,https://www.shl.com/solutions/products/situational-judgment-test/,Yes,No,40,Aptitude
55
+ Time Management Evaluation,https://www.shl.com/solutions/products/time-management-evaluation/,No,No,45,Situational
56
+ Data Analysis Assessment,https://www.shl.com/solutions/products/data-analysis-assessment/,Yes,Yes,20,Reasoning
57
+ Software Engineering Exam,https://www.shl.com/solutions/products/software-engineering-exam/,No,No,25,Communication
58
+ Finance & Accounting Aptitude,https://www.shl.com/solutions/products/finance-and-accounting-aptitude/,Yes,No,30,Cognitive
59
+ Logical Reasoning Test,https://www.shl.com/solutions/products/logical-reasoning-test/,No,Yes,35,Personality
60
+ Communication Skills Test,https://www.shl.com/solutions/products/communication-skills-test/,Yes,No,40,Technical
61
+ Machine Learning Quiz,https://www.shl.com/solutions/products/machine-learning-quiz/,No,No,45,Behavioral
62
+ Cognitive Ability Test,https://www.shl.com/solutions/products/cognitive-ability-test/,Yes,Yes,20,Aptitude
63
+ Personality Profiler,https://www.shl.com/solutions/products/personality-profiler/,No,No,25,Situational
64
+ Java Developer Test,https://www.shl.com/solutions/products/java-developer-test/,Yes,No,30,Reasoning
65
+ Team Collaboration Assessment,https://www.shl.com/solutions/products/team-collaboration-assessment/,No,Yes,35,Communication
66
+ SQL Proficiency Test,https://www.shl.com/solutions/products/sql-proficiency-test/,Yes,No,40,Cognitive
67
+ Leadership Style Inventory,https://www.shl.com/solutions/products/leadership-style-inventory/,No,No,45,Personality
68
+ Customer Service Aptitude,https://www.shl.com/solutions/products/customer-service-aptitude/,Yes,Yes,20,Technical
69
+ Verbal Reasoning Test,https://www.shl.com/solutions/products/verbal-reasoning-test/,No,No,25,Behavioral
70
+ Numerical Reasoning Test,https://www.shl.com/solutions/products/numerical-reasoning-test/,Yes,No,30,Aptitude
71
+ Critical Thinking Assessment,https://www.shl.com/solutions/products/critical-thinking-assessment/,No,Yes,35,Situational
72
+ Python Coding Test,https://www.shl.com/solutions/products/python-coding-test/,Yes,No,40,Reasoning
73
+ Emotional Intelligence Survey,https://www.shl.com/solutions/products/emotional-intelligence-survey/,No,No,45,Communication
74
+ Situational Judgment Test,https://www.shl.com/solutions/products/situational-judgment-test/,Yes,Yes,20,Cognitive
75
+ Time Management Evaluation,https://www.shl.com/solutions/products/time-management-evaluation/,No,No,25,Personality
76
+ Data Analysis Assessment,https://www.shl.com/solutions/products/data-analysis-assessment/,Yes,No,30,Technical
77
+ Software Engineering Exam,https://www.shl.com/solutions/products/software-engineering-exam/,No,Yes,35,Behavioral
78
+ Finance & Accounting Aptitude,https://www.shl.com/solutions/products/finance-and-accounting-aptitude/,Yes,No,40,Aptitude
79
+ Logical Reasoning Test,https://www.shl.com/solutions/products/logical-reasoning-test/,No,No,45,Situational
80
+ Communication Skills Test,https://www.shl.com/solutions/products/communication-skills-test/,Yes,Yes,20,Reasoning
81
+ Machine Learning Quiz,https://www.shl.com/solutions/products/machine-learning-quiz/,No,No,25,Communication
82
+ Cognitive Ability Test,https://www.shl.com/solutions/products/cognitive-ability-test/,Yes,No,30,Cognitive
83
+ Personality Profiler,https://www.shl.com/solutions/products/personality-profiler/,No,Yes,35,Personality
84
+ Java Developer Test,https://www.shl.com/solutions/products/java-developer-test/,Yes,No,40,Technical
85
+ Team Collaboration Assessment,https://www.shl.com/solutions/products/team-collaboration-assessment/,No,No,45,Behavioral
86
+ SQL Proficiency Test,https://www.shl.com/solutions/products/sql-proficiency-test/,Yes,Yes,20,Aptitude
87
+ Leadership Style Inventory,https://www.shl.com/solutions/products/leadership-style-inventory/,No,No,25,Situational
88
+ Customer Service Aptitude,https://www.shl.com/solutions/products/customer-service-aptitude/,Yes,No,30,Reasoning
89
+ Verbal Reasoning Test,https://www.shl.com/solutions/products/verbal-reasoning-test/,No,Yes,35,Communication
90
+ Numerical Reasoning Test,https://www.shl.com/solutions/products/numerical-reasoning-test/,Yes,No,40,Cognitive
91
+ Critical Thinking Assessment,https://www.shl.com/solutions/products/critical-thinking-assessment/,No,No,45,Personality
92
+ Python Coding Test,https://www.shl.com/solutions/products/python-coding-test/,Yes,Yes,20,Technical
93
+ Emotional Intelligence Survey,https://www.shl.com/solutions/products/emotional-intelligence-survey/,No,No,25,Behavioral
94
+ Situational Judgment Test,https://www.shl.com/solutions/products/situational-judgment-test/,Yes,No,30,Aptitude
95
+ Time Management Evaluation,https://www.shl.com/solutions/products/time-management-evaluation/,No,Yes,35,Situational
96
+ Data Analysis Assessment,https://www.shl.com/solutions/products/data-analysis-assessment/,Yes,No,40,Reasoning
97
+ Software Engineering Exam,https://www.shl.com/solutions/products/software-engineering-exam/,No,No,45,Communication
98
+ Finance & Accounting Aptitude,https://www.shl.com/solutions/products/finance-and-accounting-aptitude/,Yes,Yes,20,Cognitive
99
+ Logical Reasoning Test,https://www.shl.com/solutions/products/logical-reasoning-test/,No,No,25,Personality
100
+ Communication Skills Test,https://www.shl.com/solutions/products/communication-skills-test/,Yes,No,30,Technical
101
+ Machine Learning Quiz,https://www.shl.com/solutions/products/machine-learning-quiz/,No,Yes,35,Behavioral
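shl_assessments.csv is the catalogue that both endpoint.py and main.py index; its six columns match the required_columns check in load_data_from_csv. A quick pandas sketch of loading and filtering it directly:

import pandas as pd

df = pd.read_csv("shl_assessments.csv")

# Example filter: remote-friendly technical tests of at most 30 minutes
short_remote = df[
    (df["Remote Testing Support"] == "Yes")
    & (df["Test Type"] == "Technical")
    & (df["Duration (min)"] <= 30)
]
print(short_remote[["Assessment Name", "Duration (min)"]].drop_duplicates())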
storage/default__vector_store.json ADDED
The diff for this file is too large to render. See raw diff
 
storage/docstore.json ADDED
The diff for this file is too large to render. See raw diff
 
storage/graph_store.json ADDED
@@ -0,0 +1 @@
1
+ {"graph_dict": {}}
storage/image__vector_store.json ADDED
@@ -0,0 +1 @@
1
+ {"embedding_dict": {}, "text_id_to_ref_doc_id": {}, "metadata_dict": {}}
storage/index_store.json ADDED
@@ -0,0 +1 @@
1
+ {"index_store/data": {"d05d7679-e3c3-4463-8ebb-3546191f3a69": {"__type__": "vector_store", "__data__": "{\"index_id\": \"d05d7679-e3c3-4463-8ebb-3546191f3a69\", \"summary\": null, \"nodes_dict\": {\"1c33403f-62ee-4a45-8c7e-4a1a8d295921\": \"1c33403f-62ee-4a45-8c7e-4a1a8d295921\", \"94d2cdb9-f5bd-41fa-bf79-92f1d4b03898\": \"94d2cdb9-f5bd-41fa-bf79-92f1d4b03898\", \"fa619f4e-9527-4fd1-ae9c-701f88213a71\": \"fa619f4e-9527-4fd1-ae9c-701f88213a71\", \"ae0ad3e2-3a03-4520-b182-d62d8006c433\": \"ae0ad3e2-3a03-4520-b182-d62d8006c433\", \"06e1cf6f-ef51-439f-bc9f-3baf7233f443\": \"06e1cf6f-ef51-439f-bc9f-3baf7233f443\", \"2b84c0ec-82d8-49a4-ade7-77bcbeffe9f4\": \"2b84c0ec-82d8-49a4-ade7-77bcbeffe9f4\", \"1233d6bf-5664-4e92-9581-5be18b141baa\": \"1233d6bf-5664-4e92-9581-5be18b141baa\", \"523f5875-5fa0-46ad-980c-b0777c53af31\": \"523f5875-5fa0-46ad-980c-b0777c53af31\", \"b1b769b7-6f0a-4728-b00c-29c41ec07af7\": \"b1b769b7-6f0a-4728-b00c-29c41ec07af7\", \"187bf2bd-3bbb-47df-80e9-672cf571d181\": \"187bf2bd-3bbb-47df-80e9-672cf571d181\", \"5d43e57c-eada-4b04-83f0-96fa9bea2765\": \"5d43e57c-eada-4b04-83f0-96fa9bea2765\", \"20ffb540-1ef5-4c9f-921b-ea7f872f685c\": \"20ffb540-1ef5-4c9f-921b-ea7f872f685c\", \"3addec3f-721d-4b64-a709-5a5380a0aba4\": \"3addec3f-721d-4b64-a709-5a5380a0aba4\", \"8438f7ae-1514-472d-80b4-9a4d60f48e6b\": \"8438f7ae-1514-472d-80b4-9a4d60f48e6b\", \"5da3e70f-6934-4c40-9d5b-acf7aeeac6a3\": \"5da3e70f-6934-4c40-9d5b-acf7aeeac6a3\", \"995edbf0-74e1-4083-8f60-98f89b680336\": \"995edbf0-74e1-4083-8f60-98f89b680336\", \"02602778-ab96-4e35-9430-03c30aabe5ab\": \"02602778-ab96-4e35-9430-03c30aabe5ab\", \"5b736259-6f8b-4aa3-9ceb-dfd1e4620c3a\": \"5b736259-6f8b-4aa3-9ceb-dfd1e4620c3a\", \"540730ac-02c6-4c53-b1af-1b99cd6ce86a\": \"540730ac-02c6-4c53-b1af-1b99cd6ce86a\", \"7ab316eb-e53e-49ef-bdce-2d4152f78c9e\": \"7ab316eb-e53e-49ef-bdce-2d4152f78c9e\", \"d55e4b8f-579c-4e4e-a3cb-dbfcfd38463c\": \"d55e4b8f-579c-4e4e-a3cb-dbfcfd38463c\", \"7cd5bbf6-222b-451a-9adc-5215048d4219\": \"7cd5bbf6-222b-451a-9adc-5215048d4219\", \"b9b546c1-e457-4368-ad28-843ef5bd4196\": \"b9b546c1-e457-4368-ad28-843ef5bd4196\", \"e617ad14-34b7-4d09-b33c-b17b7b480a2a\": \"e617ad14-34b7-4d09-b33c-b17b7b480a2a\", \"8bcd6f8e-b7fc-4328-ad2d-68c9510b02da\": \"8bcd6f8e-b7fc-4328-ad2d-68c9510b02da\", \"8c32be74-a616-449f-a0f7-451c8d4f3232\": \"8c32be74-a616-449f-a0f7-451c8d4f3232\", \"980a6d6c-44af-4802-a8a4-855c6945c637\": \"980a6d6c-44af-4802-a8a4-855c6945c637\", \"1917b51e-cb83-4c91-82ce-cae806dbde88\": \"1917b51e-cb83-4c91-82ce-cae806dbde88\", \"32f55832-2c1a-49f4-b5f5-5c1a4ce91f71\": \"32f55832-2c1a-49f4-b5f5-5c1a4ce91f71\", \"8466a2c8-16a8-4bfc-bb0d-bad4aed77bc0\": \"8466a2c8-16a8-4bfc-bb0d-bad4aed77bc0\", \"754e6307-ef57-4c74-8e69-ad123ab179a2\": \"754e6307-ef57-4c74-8e69-ad123ab179a2\", \"e4890f6e-e88e-41af-8906-720b2154ac30\": \"e4890f6e-e88e-41af-8906-720b2154ac30\", \"1a753576-b223-41a1-91ae-af50f10eca6f\": \"1a753576-b223-41a1-91ae-af50f10eca6f\", \"1450402c-60fe-4173-aa26-9fd29e35dc0e\": \"1450402c-60fe-4173-aa26-9fd29e35dc0e\", \"8538e7e2-51fa-4177-9fd3-c0a7beef9945\": \"8538e7e2-51fa-4177-9fd3-c0a7beef9945\", \"f7f9e2d6-cdd0-4000-af62-07aad08a6add\": \"f7f9e2d6-cdd0-4000-af62-07aad08a6add\", \"5969c678-6c10-4e40-ab81-cb81f4230004\": \"5969c678-6c10-4e40-ab81-cb81f4230004\", \"f28a2e8f-a3e9-4223-ba20-e16dd7f75566\": \"f28a2e8f-a3e9-4223-ba20-e16dd7f75566\", \"cad9cc59-708e-4564-98f0-0c8fec149377\": \"cad9cc59-708e-4564-98f0-0c8fec149377\", \"a1f0ccb5-c233-4e1c-bcf6-1ea87e7ba749\": 
\"a1f0ccb5-c233-4e1c-bcf6-1ea87e7ba749\", \"0ae1fe03-dc86-444f-8579-056647ae1758\": \"0ae1fe03-dc86-444f-8579-056647ae1758\", \"77221aed-93b6-4299-ba17-4c07113db5fe\": \"77221aed-93b6-4299-ba17-4c07113db5fe\", \"200dd2d9-eece-4cd4-bbcd-4ff9f36e4960\": \"200dd2d9-eece-4cd4-bbcd-4ff9f36e4960\", \"4ef820cc-20bb-47f7-bc23-3ebf974d2b5c\": \"4ef820cc-20bb-47f7-bc23-3ebf974d2b5c\", \"ec52500d-12a9-4de6-b11a-87b06eac8f31\": \"ec52500d-12a9-4de6-b11a-87b06eac8f31\", \"3bc85d9e-6c26-458e-a5ad-fb2b263fca7a\": \"3bc85d9e-6c26-458e-a5ad-fb2b263fca7a\", \"9e4e052b-956e-4856-bd05-c4a7723ba2c4\": \"9e4e052b-956e-4856-bd05-c4a7723ba2c4\", \"5911c107-f421-41d7-973c-2d208f784693\": \"5911c107-f421-41d7-973c-2d208f784693\", \"619bfdc1-a169-4e65-b197-78b9bef745c8\": \"619bfdc1-a169-4e65-b197-78b9bef745c8\", \"8b3112e4-1ad7-4a9d-a4a4-6f7f3e598124\": \"8b3112e4-1ad7-4a9d-a4a4-6f7f3e598124\", \"d2b2745d-a122-4dce-b9f1-a8d4e0fe8d74\": \"d2b2745d-a122-4dce-b9f1-a8d4e0fe8d74\", \"b9c0d3da-7c39-46b0-bcd9-0bf10b678d01\": \"b9c0d3da-7c39-46b0-bcd9-0bf10b678d01\", \"0f97c2ee-ab52-4487-8fbe-7170421d5dca\": \"0f97c2ee-ab52-4487-8fbe-7170421d5dca\", \"3c7c8128-6c56-418b-97e9-94221af751ea\": \"3c7c8128-6c56-418b-97e9-94221af751ea\", \"a7c3ca4d-0e8e-40b5-bfc9-3fe82d457bfc\": \"a7c3ca4d-0e8e-40b5-bfc9-3fe82d457bfc\", \"b6cdf91b-5b9d-4aef-9f0e-f52361934133\": \"b6cdf91b-5b9d-4aef-9f0e-f52361934133\", \"22a8d6f1-7b24-4ba2-8b94-adc105330ef8\": \"22a8d6f1-7b24-4ba2-8b94-adc105330ef8\", \"c72b32ae-e7e2-4056-9266-19ffc974d05f\": \"c72b32ae-e7e2-4056-9266-19ffc974d05f\", \"917897e6-c6f1-4d0b-acd9-8e0c919af3d2\": \"917897e6-c6f1-4d0b-acd9-8e0c919af3d2\", \"9565237a-1f5c-48d9-84ca-34f60145891f\": \"9565237a-1f5c-48d9-84ca-34f60145891f\", \"61b2074a-77ab-465f-8e77-a6680d252cb6\": \"61b2074a-77ab-465f-8e77-a6680d252cb6\", \"9d289d93-3ab4-490b-ad8d-4c378fe554b3\": \"9d289d93-3ab4-490b-ad8d-4c378fe554b3\", \"c8c0261a-5fbc-41b4-b34a-8b2098400427\": \"c8c0261a-5fbc-41b4-b34a-8b2098400427\", \"373ed326-a94c-40f6-80a7-1be07052d343\": \"373ed326-a94c-40f6-80a7-1be07052d343\", \"3826da16-4c66-4b29-a6fa-3f7102c6b379\": \"3826da16-4c66-4b29-a6fa-3f7102c6b379\", \"56d19ec3-4e0a-49c0-922d-eea6884c785c\": \"56d19ec3-4e0a-49c0-922d-eea6884c785c\", \"4ad46bae-4f08-448c-b892-4268764fe1bc\": \"4ad46bae-4f08-448c-b892-4268764fe1bc\", \"173ff29a-d800-4d7d-ba2b-da737169f2cf\": \"173ff29a-d800-4d7d-ba2b-da737169f2cf\", \"5db5a672-91a9-4798-bead-8d400c0d0bb1\": \"5db5a672-91a9-4798-bead-8d400c0d0bb1\", \"7c2f64bb-ec14-4b8d-bfd0-5b14191c5809\": \"7c2f64bb-ec14-4b8d-bfd0-5b14191c5809\", \"2f278108-f352-41d0-82f7-1828de3fb16e\": \"2f278108-f352-41d0-82f7-1828de3fb16e\", \"ad259806-6807-45d9-a425-8a8c73a5f4ba\": \"ad259806-6807-45d9-a425-8a8c73a5f4ba\", \"be5dabf6-2231-4d00-bc61-7c62a61b7fc4\": \"be5dabf6-2231-4d00-bc61-7c62a61b7fc4\", \"09b364a9-d72d-4623-9800-4ebc13774ba7\": \"09b364a9-d72d-4623-9800-4ebc13774ba7\", \"2165fda3-a9a8-4830-9a61-421008e38c0c\": \"2165fda3-a9a8-4830-9a61-421008e38c0c\", \"b043c9d3-0b83-4ce4-ae23-6161b069af66\": \"b043c9d3-0b83-4ce4-ae23-6161b069af66\", \"36714536-310e-4c8e-9e7c-4ccdf48cc2b7\": \"36714536-310e-4c8e-9e7c-4ccdf48cc2b7\", \"e325972d-c75a-4020-a535-5c66932fd90e\": \"e325972d-c75a-4020-a535-5c66932fd90e\", \"53e85b60-39b3-4ae7-ab12-40433c4fea8f\": \"53e85b60-39b3-4ae7-ab12-40433c4fea8f\", \"6a38fae6-b26d-419f-a7ee-7bcf9a316ed4\": \"6a38fae6-b26d-419f-a7ee-7bcf9a316ed4\", \"a3e73773-7d62-4a6d-9ecd-5e0a1c581f9b\": \"a3e73773-7d62-4a6d-9ecd-5e0a1c581f9b\", \"330ee559-4eeb-4a4e-8ba4-d9ad41a8a6d4\": 
\"330ee559-4eeb-4a4e-8ba4-d9ad41a8a6d4\", \"eb05cf8d-2988-4a56-a27f-daeba8cd4351\": \"eb05cf8d-2988-4a56-a27f-daeba8cd4351\", \"3ceab828-6131-4e14-a38c-b7a49e93fa00\": \"3ceab828-6131-4e14-a38c-b7a49e93fa00\", \"575261c8-b595-449c-9910-f731dfbbc524\": \"575261c8-b595-449c-9910-f731dfbbc524\", \"2b804415-dea0-4228-bb01-cdef7b7314f1\": \"2b804415-dea0-4228-bb01-cdef7b7314f1\", \"b81e8926-a94d-4a0d-a131-2b094e0f544e\": \"b81e8926-a94d-4a0d-a131-2b094e0f544e\", \"67438345-98b2-4ecc-9a32-7737b81a377f\": \"67438345-98b2-4ecc-9a32-7737b81a377f\", \"22912616-4dd1-4fa9-a721-b703015efbce\": \"22912616-4dd1-4fa9-a721-b703015efbce\", \"14d8706f-030b-41a5-89ba-9c8da9ffa437\": \"14d8706f-030b-41a5-89ba-9c8da9ffa437\", \"046bddfd-74ca-4f19-90fb-7e467fd20451\": \"046bddfd-74ca-4f19-90fb-7e467fd20451\", \"df56b2a3-3630-401b-ad25-7f8102b56957\": \"df56b2a3-3630-401b-ad25-7f8102b56957\", \"28e480fb-1b36-4316-99e2-4ceccc37f0ca\": \"28e480fb-1b36-4316-99e2-4ceccc37f0ca\", \"c3466ff2-beeb-4d67-8812-60e43ada861c\": \"c3466ff2-beeb-4d67-8812-60e43ada861c\", \"ac7db4fa-eba7-418f-9b7e-0be0148f52e6\": \"ac7db4fa-eba7-418f-9b7e-0be0148f52e6\", \"cf422ea9-3371-407d-a5de-3718a41642ff\": \"cf422ea9-3371-407d-a5de-3718a41642ff\", \"aaa8a4bb-d027-48bd-b725-4637335e425c\": \"aaa8a4bb-d027-48bd-b725-4637335e425c\", \"9d7f56dd-17c7-440c-8295-bc3a84e11317\": \"9d7f56dd-17c7-440c-8295-bc3a84e11317\", \"747c0f41-c1cc-461d-bb8e-ea78071f0bc3\": \"747c0f41-c1cc-461d-bb8e-ea78071f0bc3\", \"0bd536f3-3582-4159-aa20-9e3bd05e223a\": \"0bd536f3-3582-4159-aa20-9e3bd05e223a\"}, \"doc_id_dict\": {}, \"embeddings_dict\": {}}"}}}