Update main.py
Browse files
main.py
CHANGED
|
@@ -123,10 +123,15 @@ def main():
|
|
| 123 |
),
|
| 124 |
)
|
| 125 |
|
| 126 |
-
intro_text = P(
|
| 127 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
-
intro_list = P("
|
| 130 |
|
| 131 |
intro_list1 = Ol(
|
| 132 |
Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
|
|
|
|
| 123 |
),
|
| 124 |
)
|
| 125 |
|
| 126 |
+
intro_text = P("Pretraining performant large language models (LLMs) requires trillions of tokens of high-quality data. Many prior works, including our previous pretraining projects ",
|
| 127 |
+
A("Amber-7B", href = "https://huggingface.co/LLM360/Amber"),
|
| 128 |
+
", ",
|
| 129 |
+
A("Crystal-7B", href = "https://huggingface.co/LLM360/CrystalCoder"),
|
| 130 |
+
", ",
|
| 131 |
+
A("K2-65B", href = "https://huggingface.co/LLM360/K2"),
|
| 132 |
+
" have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.",)
|
| 133 |
|
| 134 |
+
intro_list = P("We present TxT360, the Trillion eXtracted Text corpus, a 5.7T token dataset for pretraining projects that:")
|
| 135 |
|
| 136 |
intro_list1 = Ol(
|
| 137 |
Li("Curates commonly used pretraining datasets, including all CommonCrawl"),
|