# lit_gpt/config.py
import json
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal, Optional, Type, Union

import torch
from typing_extensions import Self

import lit_gpt.model
from lit_gpt.utils import find_multiple


@dataclass
class Config:
    name: str = ""
    hf_config: dict = field(default_factory=dict)
    block_size: int = 4096
    vocab_size: int = 50254
    padding_multiple: int = 512
    padded_vocab_size: Optional[int] = None
    n_layer: int = 16
    n_head: int = 32
    n_embd: int = 4096
    rotary_percentage: float = 0.25
    parallel_residual: bool = True
    bias: bool = True
    lm_head_bias: bool = False
    # to use multi-head attention (MHA), set this to `n_head` (default)
    # to use multi-query attention (MQA), set this to 1
    # to use grouped-query attention (GQA), set this to a value in between
    # Example with `n_head=4`
    # ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    # │ v ││ v ││ v ││ v │     │ v │    │ v │             │ v │
    # └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    #   │    │    │    │         │        │                 │
    # ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    # │ k ││ k ││ k ││ k │     │ k │    │ k │             │ k │
    # └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    #   │    │    │    │      ┌──┴──┐  ┌──┴──┐      ┌────┬──┴─┬────┐
    # ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐
    # │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │
    # └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘
    # ◀──────────────────▶  ◀──────────────────▶  ◀──────────────────▶
    #         MHA                    GQA                   MQA
    #   n_query_groups=4       n_query_groups=2      n_query_groups=1
    #
    # credit https://arxiv.org/pdf/2305.13245.pdf
    n_query_groups: Optional[int] = None
    shared_attention_norm: bool = False
    _norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
    norm_eps: float = 1e-5
    _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
    gelu_approximate: str = "none"
    intermediate_size: Optional[int] = None
    rope_condense_ratio: int = 1
    rope_base: int = 10000

    def __post_init__(self):
        if not self.name:
            self.name = self.hf_config.get("name", self.name)

        assert self.n_embd % self.n_head == 0
        self.head_size = self.n_embd // self.n_head

        # the vocab size is padded to a multiple of `padding_multiple` for hardware efficiency. compute the closest value
        if self.padded_vocab_size is None:
            self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
        else:
            # vocab size shouldn't be larger than padded vocab size
            self.vocab_size = min(self.vocab_size, self.padded_vocab_size)

        # compute the number of query groups
        if self.n_query_groups is not None:
            assert self.n_head % self.n_query_groups == 0
        else:
            self.n_query_groups = self.n_head

        # compute the intermediate size for MLP if not set
        if self.intermediate_size is None:
            if self._mlp_class == "LLaMAMLP":
                raise ValueError("The config needs to set the `intermediate_size`")
            self.intermediate_size = 4 * self.n_embd

        self.rope_n_elem = int(self.rotary_percentage * self.head_size)

    @classmethod
    def from_name(cls, name: str, **kwargs: Any) -> Self:
        if name not in name_to_config:
            # search through all `config['hf_config']['name']`
            try:
                conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
            except StopIteration:
                raise ValueError(f"{name!r} is not a supported config name")
        else:
            conf_dict = name_to_config[name]

        conf_dict = conf_dict.copy()
        if "condense_ratio" in kwargs:  # legacy name
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        conf_dict.update(kwargs)
        return cls(**conf_dict)

    @classmethod
    def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
        with open(path, encoding="utf-8") as fp:
            json_kwargs = json.load(fp)
        if "condense_ratio" in json_kwargs:  # legacy name
            json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
        if "condense_ratio" in kwargs:  # legacy name
            kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
        if "org" in json_kwargs:  # legacy name
            json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
        if "org" in kwargs:  # legacy name
            kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
        json_kwargs.update(kwargs)
        return cls(**json_kwargs)

    @classmethod
    def from_checkpoint(cls, path: Path, **kwargs: Any) -> Self:
        """Automatically load `lit_config.json` and, if it doesn't exist, a matching config from `lit_gpt/config.py`."""
        if (config_path := path / "lit_config.json").is_file():
            return cls.from_json(config_path, **kwargs)
        if (model_name := path.name) in name_to_config:
            return cls.from_name(model_name, **kwargs)
        raise FileNotFoundError(f"For {str(path)!r} neither 'lit_config.json' nor a matching config exists.")

    @property
    def mlp_class(self) -> Type:
        # `self._mlp_class` cannot be the type to keep the config json serializable
        return getattr(lit_gpt.model, self._mlp_class)

    @property
    def norm_class(self) -> Type:
        # `self._norm_class` cannot be the type to keep the config json serializable
        if self._norm_class == "RMSNorm":
            from lit_gpt.rmsnorm import RMSNorm

            return RMSNorm
        return getattr(torch.nn, self._norm_class)
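

# Example usage (a minimal sketch; "pythia-70m" is one of the configs registered
# below, and the JSON path is hypothetical):
#
#   config = Config.from_name("pythia-70m")                    # look up a predefined config
#   config = Config.from_name("pythia-70m", block_size=1024)   # override a field
#   config = Config.from_name("pythia-70m", n_query_groups=4)  # GQA: 8 heads share 4 KV groups
#   config = Config.from_json("some/dir/lit_config.json")      # load from a JSON file
#
# `config.mlp_class` and `config.norm_class` resolve the serializable string fields
# `_mlp_class` and `_norm_class` to the actual classes (`torch.nn.LayerNorm` by default).
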
########################
# Stability AI StableLM
########################
configs = [
    # https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
    dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
    # https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
    dict(
        name="stablelm-base-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
    # https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
    dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
    # https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
    dict(
        name="stablelm-tuned-alpha-7b",
        hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
        n_head=48,
        n_embd=6144,
        padding_multiple=256,
    ),
]

####################
# EleutherAI Pythia
####################
pythia = [
    # https://huggingface.co/EleutherAI/pythia-14m/blob/main/config.json
    dict(
        name="pythia-14m",
        hf_config=dict(org="EleutherAI", name="pythia-14m"),
        block_size=512,
        n_layer=6,
        n_embd=128,
        n_head=4,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-31m/blob/main/config.json
    dict(
        name="pythia-31m",
        hf_config=dict(org="EleutherAI", name="pythia-31m"),
        block_size=1024,
        n_layer=6,
        n_embd=256,
        n_head=8,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
    dict(
        name="pythia-70m",
        hf_config=dict(org="EleutherAI", name="pythia-70m"),
        block_size=2048,
        n_layer=6,
        n_embd=512,
        n_head=8,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
    dict(
        name="pythia-160m",
        hf_config=dict(org="EleutherAI", name="pythia-160m"),
        block_size=2048,
        n_layer=12,
        n_embd=768,
        n_head=12,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
    dict(
        name="pythia-410m",
        hf_config=dict(org="EleutherAI", name="pythia-410m"),
        block_size=2048,
        n_layer=24,
        n_embd=1024,
        n_head=16,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
    dict(
        name="pythia-1b",
        hf_config=dict(org="EleutherAI", name="pythia-1b"),
        block_size=2048,
        n_embd=2048,
        n_head=8,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
    dict(
        name="pythia-1.4b",
        hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
        block_size=2048,
        n_layer=24,
        n_embd=2048,
        n_head=16,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
    dict(
        name="pythia-2.8b",
        hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=128,
    ),
    # https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
    dict(
        name="pythia-6.9b",
        hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
    ),
    # https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
    dict(
        name="pythia-12b",
        hf_config=dict(org="EleutherAI", name="pythia-12b"),
        block_size=2048,
        n_layer=36,
        n_embd=5120,
        n_head=40,
    ),
]
configs.extend(pythia)
for c in pythia:
    # "pythia-14m" and "pythia-31m" don't have deduped versions
    if c["name"] in ("pythia-14m", "pythia-31m"):
        continue
    copy = deepcopy(c)
    copy["name"] = f"{c['name']}-deduped"
    copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
    configs.append(copy)


####################################
# togethercomputer RedPajama INCITE
####################################
redpajama_incite = [
    # https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
    dict(
        name="RedPajama-INCITE-{}-3B-v1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
        block_size=2048,
        n_layer=32,
        n_embd=2560,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    # https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
    dict(
        name="RedPajama-INCITE-7B-{}",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
    # this redirects to the checkpoint above. kept for those who had the old weights already downloaded
    dict(
        name="RedPajama-INCITE-{}-7B-v0.1",
        hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
        block_size=2048,
        n_layer=32,
        padding_multiple=256,
        rotary_percentage=1.0,
        parallel_residual=False,
    ),
]
for c in redpajama_incite:
    for kind in ("Base", "Chat", "Instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)
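
# Each `{}` name template above is expanded into one concrete config per variant,
# e.g. "RedPajama-INCITE-{}-3B-v1" yields "RedPajama-INCITE-Base-3B-v1",
# "RedPajama-INCITE-Chat-3B-v1" and "RedPajama-INCITE-Instruct-3B-v1". The same
# expansion pattern is reused for the Falcon, Llama 2, Mistral and TinyLlama
# variants below.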


#################
# TII UAE Falcon
#################
falcon = [
    # https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
    dict(
        name="falcon-7b{}",
        hf_config=dict(org="tiiuae", name="falcon-7b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=32,
        n_head=71,
        n_embd=4544,
        rotary_percentage=1.0,
        n_query_groups=1,
        bias=False,
        # this is not in the HF config, but it is in the original model implementation; it applies only to this config
        shared_attention_norm=True,
    ),
    # https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
    dict(
        name="falcon-40b{}",
        hf_config=dict(org="tiiuae", name="falcon-40b{}"),
        block_size=2048,
        vocab_size=65024,
        padded_vocab_size=65024,
        n_layer=60,
        n_head=128,
        n_embd=8192,
        rotary_percentage=1.0,
        n_query_groups=8,
        bias=False,
    ),
]
for c in falcon:
    for kind in ("", "-instruct"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)

# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
falcon180b = dict(
    name="falcon-180B{}",
    hf_config=dict(org="tiiuae", name="falcon-180B{}"),
    block_size=2048,
    vocab_size=65024,
    padded_vocab_size=65024,
    n_layer=80,
    n_head=232,
    n_embd=14848,
    rotary_percentage=1.0,
    n_query_groups=8,
    bias=False,
)

for kind in ("", "-chat"):
    copy = deepcopy(falcon180b)
    copy["name"] = falcon180b["name"].format(kind)
    copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
    configs.append(copy)


#############################
# OpenLM Research Open LLaMA
#############################
open_LLaMA = [
    # https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
    dict(
        name="open_llama_3b",
        hf_config=dict(org="openlm-research", name="open_llama_3b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=26,
        n_embd=3200,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=8640,
    ),
    # https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
    dict(
        name="open_llama_7b",
        hf_config=dict(org="openlm-research", name="open_llama_7b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
    dict(
        name="open_llama_13b",
        hf_config=dict(org="openlm-research", name="open_llama_13b"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(open_LLaMA)


###############
# LMSYS Vicuna
###############
vicuna = [
    # https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
    dict(
        name="vicuna-7b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
    dict(
        name="vicuna-13b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
    dict(
        name="vicuna-33b-v1.3",
        hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    # https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
    dict(
        name="vicuna-7b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
    dict(
        name="vicuna-7b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=4,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
    dict(
        name="vicuna-13b-v1.5",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
    dict(
        name="vicuna-13b-v1.5-16k",
        hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=4,
    ),
]
configs.extend(vicuna)


#################
# LMSYS LongChat
#################
long_chat = [
    # https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
    dict(
        name="longchat-7b-16k",
        hf_config=dict(org="lmsys", name="longchat-7b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    ),
    # https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
    dict(
        name="longchat-13b-16k",
        hf_config=dict(org="lmsys", name="longchat-13b-16k"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_condense_ratio=8,
    ),
]
configs.extend(long_chat)


######################
# NousResearch Hermes
######################
nous_research = [
    # https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
    dict(
        name="Nous-Hermes-llama-2-7b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
    dict(
        name="Nous-Hermes-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
        block_size=2048,
        vocab_size=32000,
        padded_vocab_size=32001,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-6,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
    dict(
        name="Nous-Hermes-Llama2-13b",
        hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
        vocab_size=32000,
        padded_vocab_size=32032,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
]
configs.extend(nous_research)


###############
# Meta LLaMA 2
###############
llama_2 = [
    # https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
    dict(
        name="Llama-2-7b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
    dict(
        name="Llama-2-13b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
    dict(
        name="Llama-2-70b{}-hf",
        hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
for c in llama_2:
    for kind in ("", "-chat"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


##########################
# Stability AI FreeWilly2
##########################
freewilly_2 = [
    # https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
    dict(
        name="FreeWilly2",
        hf_config=dict(org="stabilityai", name="FreeWilly2"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    )
]
configs.extend(freewilly_2)


##################
# Meta Code Llama
##################
code_llama = [
    # https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
    dict(
        name="CodeLlama-7b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
    dict(
        name="CodeLlama-13b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
    dict(
        name="CodeLlama-34b-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
    dict(
        name="CodeLlama-7b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
    dict(
        name="CodeLlama-13b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
    dict(
        name="CodeLlama-34b-Python-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/tree/main/config.json
    dict(
        name="CodeLlama-7b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
        block_size=16384,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
    dict(
        name="CodeLlama-13b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
        block_size=2048,
        vocab_size=32016,
        padding_multiple=16,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
        rope_base=1000000,
    ),
    # https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
    dict(
        name="CodeLlama-34b-Instruct-hf",
        hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
        block_size=16384,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=48,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=22016,
        rope_base=1000000,
    ),
]
configs.extend(code_llama)


########################
# garage-bAInd Platypus
########################
platypus = [
    # https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
    dict(
        name="Platypus-30B",
        hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
        block_size=2048,
        padded_vocab_size=32000,
        n_layer=60,
        n_head=52,
        n_embd=6656,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-06,
        _mlp_class="LLaMAMLP",
        intermediate_size=17920,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
    dict(
        name="Platypus2-7B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
        padded_vocab_size=32000,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
    dict(
        name="Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
    dict(
        name="Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    # https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
    dict(
        name="Camel-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
    dict(
        name="Camel-Platypus2-70B",
        hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
    # https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
    dict(
        name="Stable-Platypus2-13B",
        hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
        padded_vocab_size=32000,
        n_layer=40,
        n_head=40,
        n_embd=5120,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=13824,
    ),
    # https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
    dict(
        name="Platypus2-70B-instruct",
        hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
        padded_vocab_size=32000,
        n_layer=80,
        n_head=64,
        n_embd=8192,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=28672,
    ),
]
configs.extend(platypus)


##########################
# Stability AI StableCode
##########################
stablecode = [
    # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
    dict(
        name="stablecode-completion-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
        block_size=16384,
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    # https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
    dict(
        name="stablecode-completion-alpha-3b-4k",
        hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
    # https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
    dict(
        name="stablecode-instruct-alpha-3b",
        hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
        vocab_size=49152,
        n_layer=32,
        n_embd=2560,
    ),
]
configs.extend(stablecode)


##################################
# togethercomputer LLaMA-2-7B-32K
##################################
together_llama2_32k = [
    # https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
    dict(
        name="LLaMA-2-7B-32K",
        hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
        vocab_size=32000,
        padding_multiple=64,
        n_layer=32,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=11008,
        rope_condense_ratio=8,
    )
]
configs.extend(together_llama2_32k)


################
# Microsoft Phi
################
phi = [
    # https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
    dict(
        name="phi-1_5",
        hf_config=dict(org="microsoft", name="phi-1_5"),
        vocab_size=50257,
        padded_vocab_size=51200,
        block_size=2048,
        n_embd=2048,
        n_layer=24,
        rotary_percentage=0.5,  # 32 / (n_embd / n_head) = 32 / 64
        shared_attention_norm=True,
        lm_head_bias=True,
        gelu_approximate="tanh",
    )
]
configs.extend(phi)


#############
# Mistral AI
#############
mistral = [
    # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
    dict(
        name="Mistral-7B-{}v0.1",
        hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
        padded_vocab_size=32000,
        block_size=4096,  # should be 32768 but sliding window attention is not implemented
        n_layer=32,
        n_query_groups=8,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",
        norm_eps=1e-05,
        _mlp_class="LLaMAMLP",
        intermediate_size=14336,
    )
]
for c in mistral:
    for kind in ("", "Instruct-"):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
        configs.append(copy)


############
# TinyLlama
############
tiny_llama = [
    dict(
        name="tiny-llama-1.1b{}",
        hf_config=dict(org="TinyLlama", name="TinyLlama-1.1B{}"),
        block_size=2048,
        vocab_size=32000,
        padding_multiple=64,
        n_layer=22,
        n_head=32,
        n_embd=2048,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="RMSNorm",  # original TinyLlama uses FusedRMSNorm
        norm_eps=1e-5,
        _mlp_class="LLaMAMLP",
        intermediate_size=5632,
        n_query_groups=4,
    ),
]
for c in tiny_llama:
    for kind, hf_postfix in (("", "-intermediate-step-955k-token-2T"), ("chat", "-Chat-v0.6")):
        copy = deepcopy(c)
        copy["name"] = c["name"].format(kind)
        copy["hf_config"]["name"] = c["hf_config"]["name"].format(hf_postfix)
        configs.append(copy)


name_to_config = {config["name"]: config for config in configs}
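
# Example lookup (a minimal sketch; the checkpoint path is hypothetical):
#
#   from pathlib import Path
#
#   config = Config.from_checkpoint(Path("checkpoints/EleutherAI/pythia-70m"))
#
# `from_checkpoint` prefers an existing `lit_config.json` in the directory and
# otherwise falls back to `name_to_config`, keyed by the directory name.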