|
15 | 15 | },
|
16 | 16 | {
|
17 | 17 | "cell_type": "code",
|
18 |
| - "execution_count": 8, |
| 18 | + "execution_count": 1, |
19 | 19 | "metadata": {},
|
20 | 20 | "outputs": [],
|
21 | 21 | "source": [
|
|
29 | 29 | },
|
30 | 30 | {
|
31 | 31 | "cell_type": "code",
|
32 |
| - "execution_count": 9, |
| 32 | + "execution_count": 2, |
33 | 33 | "metadata": {},
|
34 | 34 | "outputs": [
|
35 | 35 | {
|
|
40 | 40 | " (embed_tokens): Embedding(32000, 768)\n",
|
41 | 41 | " (layers): ModuleList(\n",
|
42 | 42 | " (0-11): 12 x LlamaDecoderLayer(\n",
|
43 |
| - " (self_attn): LlamaSdpaAttention(\n", |
| 43 | + " (self_attn): LlamaAttention(\n", |
44 | 44 | " (q_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
45 | 45 | " (k_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
46 | 46 | " (v_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
47 | 47 | " (o_proj): Linear(in_features=768, out_features=768, bias=False)\n",
|
48 |
| - " (rotary_emb): LlamaRotaryEmbedding()\n", |
49 | 48 | " )\n",
|
50 | 49 | " (mlp): LlamaMLP(\n",
|
51 | 50 | " (gate_proj): Linear(in_features=768, out_features=2048, bias=False)\n",
|
52 | 51 | " (up_proj): Linear(in_features=768, out_features=2048, bias=False)\n",
|
53 | 52 | " (down_proj): Linear(in_features=2048, out_features=768, bias=False)\n",
|
54 | 53 | " (act_fn): SiLU()\n",
|
55 | 54 | " )\n",
|
56 |
| - " (input_layernorm): LlamaRMSNorm()\n", |
57 |
| - " (post_attention_layernorm): LlamaRMSNorm()\n", |
| 55 | + " (input_layernorm): LlamaRMSNorm((768,), eps=1e-05)\n", |
| 56 | + " (post_attention_layernorm): LlamaRMSNorm((768,), eps=1e-05)\n", |
58 | 57 | " )\n",
|
59 | 58 | " )\n",
|
60 |
| - " (norm): LlamaRMSNorm()\n", |
| 59 | + " (norm): LlamaRMSNorm((768,), eps=1e-05)\n", |
| 60 | + " (rotary_emb): LlamaRotaryEmbedding()\n", |
61 | 61 | " )\n",
|
62 | 62 | " (lm_head): Linear(in_features=768, out_features=32000, bias=False)\n",
|
63 | 63 | ")"
|
64 | 64 | ]
|
65 | 65 | },
|
66 |
| - "execution_count": 9, |
| 66 | + "execution_count": 2, |
67 | 67 | "metadata": {},
|
68 | 68 | "output_type": "execute_result"
|
69 | 69 | }
|
|
77 | 77 | },
|
78 | 78 | {
|
79 | 79 | "cell_type": "code",
|
80 |
| - "execution_count": 10, |
| 80 | + "execution_count": 3, |
81 | 81 | "metadata": {},
|
82 | 82 | "outputs": [
|
83 | 83 | {
|
84 | 84 | "name": "stdout",
|
85 | 85 | "output_type": "stream",
|
86 | 86 | "text": [
|
87 |
| - "The example layer model.layers.0.self_attn.q_proj.weight has sparsity 0.50%\n" |
| 87 | + "The example layer model.layers.0.self_attn.q_proj.weight has sparsity 50%\n" |
88 | 88 | ]
|
89 | 89 | }
|
90 | 90 | ],
|
|
93 | 93 | "state_dict = model.state_dict()\n",
|
94 | 94 | "state_dict.keys()\n",
|
95 | 95 | "example_layer = \"model.layers.0.self_attn.q_proj.weight\"\n",
|
96 |
| - "print(f\"The example layer {example_layer} has sparsity {torch.sum(state_dict[example_layer] == 0).item() / state_dict[example_layer].numel():.2f}%\")" |
| 96 | + "print(f\"The example layer {example_layer} has sparsity {100 * state_dict[example_layer].eq(0).sum().item() / state_dict[example_layer].numel():.0f}%\")" |
97 | 97 | ]
|
98 | 98 | },
|
99 | 99 | {
|
100 | 100 | "cell_type": "code",
|
101 |
| - "execution_count": 11, |
| 101 | + "execution_count": 4, |
102 | 102 | "metadata": {},
|
103 | 103 | "outputs": [
|
104 | 104 | {
|
105 | 105 | "name": "stdout",
|
106 | 106 | "output_type": "stream",
|
107 | 107 | "text": [
|
108 |
| - "The model is 31.67% sparse overall\n" |
| 108 | + "The model is 32% sparse overall\n" |
109 | 109 | ]
|
110 | 110 | }
|
111 | 111 | ],
|
112 | 112 | "source": [
|
113 |
| - "# we can inspect to total sparisity of the state_dict\n", |
| 113 | + "# we can inspect the total sparsity of the state_dict\n", |
114 | 114 | "total_num_parameters = 0\n",
|
115 | 115 | "total_num_zero_parameters = 0\n",
|
116 | 116 | "for key in state_dict:\n",
|
117 | 117 | " total_num_parameters += state_dict[key].numel()\n",
|
118 | 118 | " total_num_zero_parameters += state_dict[key].eq(0).sum().item()\n",
|
119 |
| - "print(f\"The model is {total_num_zero_parameters/total_num_parameters*100:.2f}% sparse overall\")" |
| 119 | + "print(f\"The model is {total_num_zero_parameters/total_num_parameters*100:.0f}% sparse overall\")" |
120 | 120 | ]
|
121 | 121 | },
|
122 | 122 | {
|
123 | 123 | "cell_type": "code",
|
124 |
| - "execution_count": 12, |
| 124 | + "execution_count": 5, |
125 | 125 | "metadata": {},
|
126 | 126 | "outputs": [
|
127 | 127 | {
|
128 | 128 | "name": "stderr",
|
129 | 129 | "output_type": "stream",
|
130 | 130 | "text": [
|
131 |
| - "Compressing model: 100%|██████████| 111/111 [00:06<00:00, 17.92it/s]\n" |
| 131 | + "Compressing model: 100%|██████████| 111/111 [00:00<00:00, 313.39it/s]\n" |
132 | 132 | ]
|
133 | 133 | },
|
134 | 134 | {
|
|
168 | 168 | },
|
169 | 169 | {
|
170 | 170 | "cell_type": "code",
|
171 |
| - "execution_count": 13, |
| 171 | + "execution_count": 6, |
172 | 172 | "metadata": {},
|
173 | 173 | "outputs": [
|
174 | 174 | {
|
|
185 | 185 | "## load the uncompressed safetensors to memory ##\n",
|
186 | 186 | "state_dict_1 = {}\n",
|
187 | 187 | "with safe_open('model.safetensors', framework=\"pt\") as f:\n",
|
188 |
| - " for key in f.keys():\n", |
189 |
| - " state_dict_1[key] = f.get_tensor(key)\n", |
| 188 | + " for key in f.keys():\n", |
| 189 | + " state_dict_1[key] = f.get_tensor(key)\n", |
190 | 190 | "\n",
|
191 | 191 | "## load the compressed-tensors to memory ##\n",
|
192 | 192 | "config = BitmaskConfig() # we need to specify the method for decompression\n",
|
|
0 commit comments