<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<!-- RSS 2.0 feed for the "llms" tag listing (see the atom:link self reference below). -->
<rss version="2.0"
     xmlns:atom="http://www.w3.org/2005/Atom"
     xmlns:content="http://purl.org/rss/1.0/modules/content/"
     xmlns:media="http://search.yahoo.com/mrss/"
     xmlns:dc="http://purl.org/dc/elements/1.1/">
  <channel>
    <title>Ajay Walia</title>
    <link>https://curiousbit.netlify.app/</link>
    <description>Digital workplace, artificial intelligence, cloud, security, automation, and enterprise technology notes by Ajay Walia.</description>
    <language>en-au</language>
    <!-- NOTE(review): RSS 2.0 defines managingEditor and webMaster as e-mail addresses, e.g. "name@example.com (Full Name)". Only a display name is available here, so the generated values are kept; confirm the address and fix it in the feed template. -->
    <managingEditor>Ajay Walia</managingEditor>
    <webMaster>Ajay Walia</webMaster>
    <copyright>Copyright 2026 Ajay Walia</copyright>
    <lastBuildDate>Sun, 21 Jun 2026 05:46:10 +0000</lastBuildDate>
    <atom:link href="https://curiousbit.netlify.app/tags/llms/index.xml" rel="self" type="application/rss+xml"/>
    <image>
      <url>https://curiousbit.netlify.app/images/og-default.png</url>
      <title>Ajay Walia</title>
      <link>https://curiousbit.netlify.app/</link>
    </image>
    <item>
      <title>Memory Management in LLMs</title>
      <link>https://curiousbit.netlify.app/memory-management-in-llms/</link>
      <guid isPermaLink="true">https://curiousbit.netlify.app/memory-management-in-llms/</guid>
      <pubDate>Sun, 14 Jun 2026 00:00:00 +0000</pubDate>
      <dc:creator>Ajay Walia</dc:creator>
      <!-- Synopsis only. The description is entity-encoded HTML and must be balanced on its own; the stylesheet and the interactive markup belong to content:encoded, which carries the full article. -->
      <description>&lt;p&gt;Everything about how large language models use, store, and optimize memory — from the bytes that hold model weights on a GPU to how an agent remembers a conversation across sessions. Each topic is tagged by &lt;em&gt;when&lt;/em&gt; the memory is consumed. Pick a card to open the article.&lt;/p&gt;</description>
      <!-- NOTE(review): the CDATA opener below is entity-escaped, so it is literal text rather than the start of a CDATA section. Its terminator lies outside this part of the file; confirm how it is written and fix both ends together (presumably in the feed template). -->
      <content:encoded>&lt;![CDATA[<img src="https://curiousbit.netlify.app/images/memory-llms/hero.png" alt="LLMs" style="max-width:100%;height:auto;margin-bottom:1.5em;"/><style>
/* Scope root: colour tokens and base type size. Token declarations stay unspaced after the colon so the stored token text is exactly what it was. */
.mm-kb { --mm-accent:#00c853; --mm-static:#5b8cff; --mm-runtime:#3ecf8e; --mm-training:#f5a623; --mm-overview:#9aa4b2; --mm-app:#7c5cff; color: inherit; font-size: clamp(1.2rem,1.15rem + 0.25vw,1.45rem); line-height: 1.85; }
.mm-kb * { box-sizing: border-box; }
/* Intro paragraph */
.mm-kb .mm-intro { font-size: 1em; line-height: 1.85; margin: 0 0 1.6rem; opacity: .9; }
/* Legend: wrapping flex row of swatch + label pairs */
.mm-kb .mm-legend { display: flex; flex-wrap: wrap; gap: 14px; font-size: .82rem; opacity: .85; margin: 0 0 2rem; padding: .9rem 1rem; border: 1px solid rgba(127,127,127,.22); border-radius: 12px; background: rgba(127,127,127,.05); }
.mm-kb .mm-legend span { display: inline-flex; align-items: center; gap: 7px; }
.mm-kb .mm-legend i { width: 11px; height: 11px; border-radius: 3px; display: inline-block; }
/* Domain heading row: round dot, h2 and pill-shaped tag */
.mm-kb .mm-domain { display: flex; align-items: center; gap: 12px; margin: 2.2rem 0 1rem; }
.mm-kb .mm-domain .mm-dot { width: 11px; height: 11px; border-radius: 50%; }
.mm-kb .mm-domain h2 { font-size: 1.15rem; margin: 0; font-weight: 800; }
.mm-kb .mm-domain .mm-tag { font-size: .72rem; opacity: .7; border: 1px solid rgba(127,127,127,.3); padding: 3px 10px; border-radius: 999px; }
/* Card grid: two equal columns, one column at 640px and below */
.mm-kb .mm-grid { display: grid; grid-template-columns: repeat(2,1fr); gap: 20px; }
@media (max-width:640px) {
  .mm-kb .mm-grid { grid-template-columns: 1fr; }
}
/* Card surface and hover lift */
.mm-kb .mm-card { position: relative; border: 1px solid rgba(127,127,127,.22); border-radius: 16px; padding: 24px 24px 22px; cursor: pointer; background: rgba(127,127,127,.04); transition: .18s; overflow: hidden; }
.mm-kb .mm-card:hover { transform: translateY(-3px); border-color: var(--mm-accent); box-shadow: 0 8px 26px rgba(0,0,0,.18); }
/* Thumbnail: the negative top and side margins equal the card's 24px padding */
.mm-kb .mm-thumb { margin: -24px -24px 16px; aspect-ratio: 3/2; overflow: hidden; background: rgba(127,127,127,.08); border-bottom: 1px solid rgba(127,127,127,.18); }
.mm-kb .mm-thumb img { width: 100%; height: 100%; object-fit: cover; display: block; }
/* Card text: faint corner number, title, summary */
.mm-kb .mm-card .mm-num { position: absolute; top: 12px; right: 16px; font-size: 2.1rem; font-weight: 800; opacity: .08; }
.mm-kb .mm-card h3 { margin: 0 0 10px; font-size: 1.15em; font-weight: 800; padding-right: 30px; line-height: 1.25; }
.mm-kb .mm-card p { margin: 0 0 14px; font-size: .92em; opacity: .8; line-height: 1.7; }
/* Chips and the accent-coloured open label */
.mm-kb .mm-chips { display: flex; flex-wrap: wrap; gap: 6px; }
.mm-kb .mm-chip { font-size: .78rem; opacity: .8; background: rgba(127,127,127,.1); border: 1px solid rgba(127,127,127,.2); padding: 3px 8px; border-radius: 999px; }
.mm-kb .mm-open { margin-top: 15px; font-size: .92rem; color: var(--mm-accent); font-weight: 700; }
/* Badge base plus one colour variant per phase class */
.mm-kb .mm-badge { display: inline-block; font-size: .66rem; font-weight: 800; letter-spacing: .4px; text-transform: uppercase; padding: 3px 9px; border-radius: 999px; margin-bottom: 10px; border: 1px solid transparent; }
.mm-kb .mm-badge.static { color: #3f6fd8; background: rgba(91,140,255,.13); border-color: rgba(91,140,255,.4); }
.mm-kb .mm-badge.runtime { color: #1f9d63; background: rgba(62,207,142,.14); border-color: rgba(62,207,142,.42); }
.mm-kb .mm-badge.training { color: #c47e10; background: rgba(245,166,35,.16); border-color: rgba(245,166,35,.45); }
.mm-kb .mm-badge.overview { color: #6b7280; background: rgba(154,164,178,.14); border-color: rgba(154,164,178,.4); }
.mm-kb .mm-badge.app { color: #6a4dd6; background: rgba(124,92,255,.14); border-color: rgba(124,92,255,.42); }
/* Hero frame: 16/9 box whose img or video covers it */
.mm-kb .mm-hero { margin: 0 0 1.4rem; border-radius: 16px; overflow: hidden; border: 1px solid rgba(127,127,127,.2); background: rgba(127,127,127,.06); aspect-ratio: 16/9; }
.mm-kb .mm-hero img,
.mm-kb .mm-hero video { width: 100%; height: 100%; object-fit: cover; display: block; }
/* Articles are hidden by this rule; nothing in this stylesheet shows them again */
.mm-kb .mm-article { display: none; }
/* Back control styled as a bare text link */
.mm-kb .mm-back { display: inline-flex; align-items: center; gap: 7px; font-size: .9rem; color: var(--mm-accent); margin-bottom: 18px; cursor: pointer; background: none; border: none; padding: 0; font-family: inherit; font-weight: 700; }
.mm-kb .mm-back:hover { text-decoration: underline; }
/* Article heading and lead */
.mm-kb .mm-article h1 { font-size: 1.6em; margin: .2rem 0 .5rem; font-weight: 900; line-height: 1.1; }
.mm-kb .mm-article .mm-lead { opacity: .8; font-size: 1em; line-height: 1.85; margin: 0 0 1.8rem; }
/* Section panel with its heading and bullet list */
.mm-kb .mm-sec { border: 1px solid rgba(127,127,127,.22); border-radius: 14px; padding: 18px 22px; margin-bottom: 14px; background: rgba(127,127,127,.04); }
.mm-kb .mm-sec h3 { margin: 0 0 10px; font-size: 1.12em; font-weight: 800; line-height: 1.3; }
.mm-kb .mm-sec ul { margin: 0; padding-left: 20px; }
.mm-kb .mm-sec li { margin: 8px 0; opacity: .92; font-size: .95em; line-height: 1.75; }
/* Note with an accent-coloured left rule */
.mm-kb .mm-note { font-size: .8rem; opacity: .7; border-left: 3px solid var(--mm-accent); padding: 10px 14px; border-radius: 8px; background: rgba(127,127,127,.05); margin-top: 14px; }
/* ==== Rich article body and interactive widgets (Fundamentals) ==== */
/* Paragraphs inside a section; the last one drops its bottom margin */
.mm-kb .mm-sec p { margin: 0 0 1.1rem; font-size: 1.22em; line-height: 1.9; opacity: .92; }
.mm-kb .mm-sec p:last-child { margin-bottom: 0; }
.mm-kb .mmf-cap { font-size: 1rem; opacity: .65; margin: .6rem 0 0; line-height: 1.7; }
.mm-kb .mmf-kbd { font-family: ui-monospace,SFMono-Regular,Menlo,monospace; font-size: .85em; background: rgba(127,127,127,.14); padding: 1px 6px; border-radius: 5px; }
/* Widget shell: bordered panel, title and subtitle */
.mm-kb .mmf-w { border: 1px solid rgba(127,127,127,.22); border-radius: 14px; padding: 18px; margin: 1.1rem 0 .3rem; background: rgba(127,127,127,.05); }
.mm-kb .mmf-w h4 { margin: 0 0 3px; font-size: 1.28rem; font-weight: 800; }
.mm-kb .mmf-w .mmf-sub { font-size: 1.02rem; opacity: .6; margin: 0 0 16px; }
/* Segmented mode switch; the button carrying class "on" gets the accent fill */
.mm-kb .mmf-switch { display: inline-flex; border: 1px solid rgba(127,127,127,.3); border-radius: 999px; overflow: hidden; margin-bottom: 14px; }
/* font:inherit must stay ahead of font-size and font-weight, which override it */
.mm-kb .mmf-switch button { font: inherit; font-size: 1.1rem; font-weight: 700; border: none; background: none; color: inherit; padding: 9px 20px; cursor: pointer; opacity: .6; transition: .15s; }
.mm-kb .mmf-switch button.on { background: var(--mm-accent); color: #06231a; opacity: 1; }
/* Component map: four columns, two at 560px and below */
.mm-kb .mmf-blocks { display: grid; grid-template-columns: repeat(4,1fr); gap: 10px; }
@media (max-width:560px) {
  .mm-kb .mmf-blocks { grid-template-columns: repeat(2,1fr); }
}
.mm-kb .mmf-block { border: 1px solid rgba(127,127,127,.25); border-radius: 11px; padding: 13px 12px; cursor: pointer; background: rgba(127,127,127,.04); transition: .18s; position: relative; text-align: left; }
.mm-kb .mmf-block .mmf-bt { font-weight: 800; font-size: 1.15rem; display: block; margin-bottom: 6px; }
/* Bar colour comes from the per-block custom property c (grey fallback); width starts at 30% and is animated */
.mm-kb .mmf-block .mmf-bar { height: 9px; border-radius: 5px; background: var(--c,#888); transition: width .4s ease,opacity .3s; width: 30%; }
.mm-kb .mmf-block .mmf-tag { font-size: .9rem; opacity: .7; display: block; margin-top: 8px; }
/* State classes: "off" dims and desaturates, "sel" outlines */
.mm-kb .mmf-block.off { opacity: .28; filter: grayscale(.6); }
.mm-kb .mmf-block.sel { border-color: var(--c,var(--mm-accent)); box-shadow: 0 0 0 2px rgba(0,200,83,.15); }
/* Detail panel under the blocks */
.mm-kb .mmf-detail { margin-top: 14px; font-size: 1.12rem; line-height: 1.8; border-left: 3px solid var(--mm-accent); padding: 13px 16px; background: rgba(127,127,127,.05); border-radius: 8px; min-height: 1.5em; }
/* Trade-off slider: range input, end labels, meters and readout */
.mm-kb .mmf-slider { width: 100%; accent-color: var(--mm-accent); margin: 6px 0 4px; }
.mm-kb .mmf-ends { display: flex; justify-content: space-between; font-size: 1rem; opacity: .7; font-weight: 700; }
.mm-kb .mmf-meters { margin-top: 14px; display: grid; gap: 10px; }
.mm-kb .mmf-meter .mmf-ml { display: flex; justify-content: space-between; font-size: 1.05rem; margin-bottom: 5px; opacity: .85; }
/* Track clips its fill; the fill's width is animated */
.mm-kb .mmf-track { height: 14px; border-radius: 7px; background: rgba(127,127,127,.16); overflow: hidden; }
.mm-kb .mmf-fill { height: 100%; border-radius: 7px; transition: width .25s ease; }
.mm-kb .mmf-readout { margin-top: 14px; font-size: 1.12rem; line-height: 1.8; opacity: .9; }
/* Estimator: control row */
.mm-kb .mmf-ctl { display: flex; flex-wrap: wrap; gap: 18px; align-items: flex-end; margin-bottom: 8px; }
.mm-kb .mmf-ctl label { font-size: 1.02rem; font-weight: 700; opacity: .8; display: block; margin-bottom: 7px; }
/* Precision buttons; the one carrying class "on" is filled with the static colour token */
.mm-kb .mmf-prec button { font: inherit; font-size: 1.05rem; font-weight: 700; border: 1px solid rgba(127,127,127,.3); background: none; color: inherit; padding: 8px 15px; border-radius: 8px; cursor: pointer; opacity: .65; margin-right: 6px; transition: .15s; }
.mm-kb .mmf-prec button.on { background: var(--mm-static); color: #fff; border-color: var(--mm-static); opacity: 1; }
.mm-kb .mmf-pval { font-weight: 800; font-size: 1.3rem; }
/* Estimator: stacked bars with their label rows */
.mm-kb .mmf-est-bars { margin-top: 16px; display: grid; gap: 16px; }
.mm-kb .mmf-eb .mmf-ebl { display: flex; justify-content: space-between; align-items: baseline; font-size: 1.1rem; font-weight: 700; margin-bottom: 6px; }
.mm-kb .mmf-eb .mmf-ebl small { font-weight: 600; opacity: .65; }
.mm-kb .mmf-stack { display: flex; height: 34px; border-radius: 8px; overflow: hidden; background: rgba(127,127,127,.12); }
/* min-width:0 lets a flex segment shrink all the way to zero width */
.mm-kb .mmf-seg { height: 100%; transition: width .3s ease; min-width: 0; }
/* Estimator: legend and ratio callout */
.mm-kb .mmf-legend { display: flex; flex-wrap: wrap; gap: 16px; margin-top: 14px; font-size: 1rem; opacity: .82; }
.mm-kb .mmf-legend span { display: inline-flex; align-items: center; gap: 6px; }
.mm-kb .mmf-legend i { width: 12px; height: 12px; border-radius: 3px; display: inline-block; }
.mm-kb .mmf-ratio { margin-top: 14px; font-size: 1.2rem; font-weight: 700; text-align: center; padding: 13px; border-radius: 9px; background: rgba(0,200,83,.1); border: 1px solid rgba(0,200,83,.25); }
/* ==== Extra widgets for the remaining articles ==== */
/* Large statistic and the line beneath it */
.mm-kb .mmf-stat { font-size: 2.2rem; font-weight: 800; margin: 8px 0 4px; letter-spacing: -.5px; line-height: 1.1; }
.mm-kb .mmf-fitline { font-size: 1.02rem; opacity: .85; margin-bottom: 14px; line-height: 1.65; }
/* Square-cell grids: no column template is set for the matrix here, the memory grid has 20 columns */
.mm-kb .mmf-matrix { display: grid; gap: 2px; margin: 10px auto 18px; max-width: 420px; }
.mm-kb .mmf-cell { aspect-ratio: 1; background: #f5a623; border-radius: 2px; opacity: .82; }
.mm-kb .mmf-memgrid { display: grid; grid-template-columns: repeat(20,1fr); gap: 3px; margin: 8px 0 14px; }
.mm-kb .mmf-mcell { aspect-ratio: 1; border-radius: 3px; background: rgba(127,127,127,.16); transition: .2s; }
/* Hierarchy rows: hover and the "sel" class share one highlight */
.mm-kb .mmf-hier { display: grid; gap: 11px; margin: 8px 0 4px; }
.mm-kb .mmf-hrow { cursor: pointer; border: 1px solid rgba(127,127,127,.22); border-radius: 11px; padding: 12px 15px; background: rgba(127,127,127,.04); transition: .15s; }
.mm-kb .mmf-hrow:hover,
.mm-kb .mmf-hrow.sel { border-color: var(--mm-accent); box-shadow: 0 0 0 2px rgba(0,200,83,.12); }
.mm-kb .mmf-hrow .mmf-htop { display: flex; justify-content: space-between; align-items: baseline; font-weight: 800; font-size: 1.08rem; margin-bottom: 8px; }
.mm-kb .mmf-hrow .mmf-htop small { font-weight: 600; opacity: .7; font-size: .92rem; }
.mm-kb .mmf-hbar { height: 12px; border-radius: 6px; background: var(--c,#888); }
/* Accent-tinted button and its hover state */
.mm-kb .mmf-btn { font: inherit; font-size: 1.02rem; font-weight: 700; border: 1px solid var(--mm-accent); background: rgba(0,200,83,.1); color: inherit; padding: 9px 16px; border-radius: 9px; cursor: pointer; transition: .15s; }
.mm-kb .mmf-btn:hover { background: rgba(0,200,83,.2); }
.mm-kb .mmf-btn:disabled{opacity:.4;cursor:not-allowed;}</style>
<!-- Home view: intro, colour legend, and one heading plus one empty grid per domain; an empty article container follows. Empty elements carry explicit end tags because HTML parsers ignore a trailing slash on non-void elements such as div, span and i, which would leave them open and pull the following siblings inside. NOTE(review): if the self-closing forms are produced by an XML minifier in the build, exclude the feed body from it, otherwise they will return. -->
<div class="mm-kb not-prose" id="mm-kb"><div id="mm-home"><p class="mm-intro">Everything about how large language models use, store, and optimize memory — from the bytes that hold model weights on a GPU to how an agent remembers a conversation across sessions. Each topic is tagged by <em>when</em> the memory is consumed. Pick a card to open the article.</p><div class="mm-legend"><span><i style="background:#5b8cff"></i>Static — set before you run</span><span><i style="background:#3ecf8e"></i>Runtime — scales with workload</span><span><i style="background:#f5a623"></i>Training — only during training</span><span><i style="background:#9aa4b2"></i>Overview</span><span><i style="background:#7c5cff"></i>App-level memory</span></div><div class="mm-domain"><span class="mm-dot" style="background:#5b8cff"></span><h2>System &amp; Runtime Memory</h2><span class="mm-tag">How the model physically uses hardware</span></div><div class="mm-grid" id="mm-grid-system"></div><div class="mm-domain"><span class="mm-dot" style="background:#7c5cff"></span><h2>Agent &amp; Long-Term Memory</h2><span class="mm-tag">How the model "remembers" across turns &amp; sessions</span></div><div class="mm-grid" id="mm-grid-agent"></div></div><div id="mm-article-container"></div></div>
<script>
(function(){
/* Maps each phase key (the values used by the phase field of the TOPICS records) to a capitalised label string. */
var PHASE = {
  "static": "Static",
  "runtime": "Runtime",
  "training": "Training",
  "overview": "Overview",
  "app": "App-level"
};
/*
 * TOPICS: one record per article, in display order.
 *   id, num    numeric id and its zero-padded two-digit label
 *   domain     "system" or "agent" (these match the suffixes of the two grid ids
 *              in the page markup; presumably used to pick the grid, confirm)
 *   phase      one of the PHASE keys
 *   img        image path; video is an optional clip path present on some records
 *   title, summary, chips, lead    copy for the record
 *   sections   list of {h: heading, li: [bullet strings]}
 *   body       key naming a richer body (BODIES defines at least "fundamentals")
 * NOTE(review): the strings contain inline tags and character entities, so they
 * are presumably inserted as HTML by code outside this view; confirm before
 * changing how they are escaped.
 * Keep a literal space on both sides of an inline tag that sits mid-sentence,
 * otherwise the neighbouring words run together once rendered.
 */
var TOPICS=[
{id:1,domain:"system",num:"01",phase:"overview",img:"/images/memory-llms/card-01.png",title:"Fundamentals of LLM Memory",summary:"The mental model: what consumes memory in an LLM and the core trade-offs.",chips:["Weights","Activations","KV cache","Compute trade-off"],lead:"Before optimizing anything, you need a clear picture of what actually lives in memory when a model runs — and how that differs between training and inference.",sections:[
{h:"Types of memory in an LLM",li:["<b>Model weights</b> — the learned parameters; the largest fixed cost.","<b>Activations</b> — intermediate tensors produced during a forward pass.","<b>KV cache</b> — cached keys/values that grow with generated tokens (inference).","<b>Optimizer states &amp; gradients</b> — only during training."]},
{h:"Memory vs compute trade-offs",li:["Recomputation trades compute for memory (activation checkpointing).","Caching trades memory for compute (KV cache, prefix caching).","Memory <b>bandwidth</b> vs <b>capacity</b> — decode is usually bandwidth-bound."]},
{h:"Training vs inference memory",li:["Training holds weights + gradients + optimizer states + activations.","Inference holds weights + a much smaller activation set + KV cache.","Rule of thumb: training needs several× the memory of inference."]}],body:"fundamentals"},
{id:2,domain:"system",num:"02",phase:"static",img:"/images/memory-llms/card-02.png",title:"Model Weights Memory",summary:"How parameter count and numeric precision determine the model's fixed footprint.",chips:["Parameters","FP16/BF16","INT8/INT4","Quantization","LoRA/QLoRA"],lead:"Weights are the baseline memory cost — fixed by how many parameters the model has and how many bytes each one takes.",sections:[
{h:"Parameter count &amp; footprint",li:["Footprint ≈ params × bytes-per-param. A 7B model in FP16 ≈ 14 GB.","Embedding/vocab and LM-head matrices can be a non-trivial slice."]},
{h:"Data types &amp; precision",li:["<b>FP32</b> (4B), <b>FP16/BF16</b> (2B), <b>INT8</b> (1B), <b>INT4</b> (~0.5B).","BF16 vs FP16: same size, different exponent/mantissa trade-off.","Lower precision cuts memory and bandwidth but can cost accuracy."]},
{h:"Quantization basics &amp; impact",li:["Post-training quantization (PTQ) vs quantization-aware training (QAT).","Schemes: GPTQ, AWQ, GGUF/llama.cpp, bitsandbytes.","Accuracy vs size vs speed; per-channel / group scaling."]},
{h:"Memory-efficient fine-tuning",li:["<b>LoRA / PEFT</b> — train small adapters instead of full weights.","<b>QLoRA</b> — fine-tune on top of a 4-bit quantized base model."]}],body:"weights"},
{id:3,domain:"system",num:"03",phase:"runtime",img:"/images/memory-llms/card-03.png",title:"KV Cache &amp; Context Management",summary:"Why the KV cache exists, how it scales with context, and how to shrink it.",chips:["KV cache","Context length","GQA/MQA","MLA","Window limits"],lead:"The KV cache is what makes generation fast — and what makes long context expensive. It often dominates inference memory.",sections:[
{h:"What the KV cache is &amp; why it matters",li:["Stores attention keys/values so past tokens aren't recomputed each step.","Without it, every new token would re-attend over the whole sequence."]},
{h:"How it scales",li:["Size ≈ 2 × layers × heads × head_dim × seq_len × batch × bytes.","Grows linearly with context length and batch size — the long-context tax."]},
{h:"Reducing the KV cache",li:["<b>MQA / GQA</b> — share K/V across heads to cut cache size dramatically.","<b>MLA</b> (Multi-head Latent Attention, DeepSeek) — compress KV into a latent.","KV-cache quantization (store K/V in INT8/INT4)."]},
{h:"Context window limitations",li:["Practical limits driven by KV memory and attention cost, not just training.","Position encodings (RoPE scaling, ALiBi) and their effect on length."]}],body:"kv"},
{id:4,domain:"system",num:"04",phase:"runtime",img:"/images/memory-llms/card-04.png",video:"/images/memory-llms/card-04.mp4",title:"Attention Mechanisms &amp; Memory Efficiency",summary:"The memory cost of attention and the algorithms that tame it.",chips:["O(n²)","FlashAttention","Sparse","Sliding window"],lead:"Naive attention materializes a sequence-length-squared matrix. The fixes are some of the most important systems work in modern LLMs.",sections:[
{h:"Standard attention memory cost",li:["The score matrix is O(n²) in sequence length — the core bottleneck.","Memory, not just FLOPs, is what blows up at long context."]},
{h:"FlashAttention &amp; IO-aware methods",li:["Tiles the computation so the full n×n matrix is never materialized.","Cuts memory to O(n) and improves speed via better HBM/SRAM use.","Versions and what changed (FA-2, FA-3)."]},
{h:"Sparse &amp; windowed attention",li:["<b>Sliding-window</b> attention (Mistral) bounds the attended span.","Sparse / block-sparse patterns; local + global token schemes.","Trade-offs between coverage and memory savings."]}],body:"attention"},
{id:5,domain:"system",num:"05",phase:"runtime",img:"/images/memory-llms/card-05.png",video:"/images/memory-llms/card-05.mp4",title:"Inference Memory Optimization",summary:"Serving-time techniques that pack more requests into the same GPU.",chips:["PagedAttention","Continuous batching","Prefix cache","Speculative decoding"],lead:"Production inference is largely a memory-management problem. These techniques are why modern serving engines achieve high throughput.",sections:[
{h:"Paging &amp; batching",li:["<b>PagedAttention</b> (vLLM) — page the KV cache to kill fragmentation.","<b>Continuous batching</b> — add/remove requests mid-flight for high utilization."]},
{h:"Caching across requests",li:["<b>Prefix / prompt caching</b> — reuse KV for shared prefixes (system prompts, RAG).","Cache hit-rate economics and eviction policy."]},
{h:"KV compression &amp; eviction",li:["Token eviction (H2O, attention-sink / StreamingLLM ideas).","Quantized KV; selective retention of important tokens."]},
{h:"Other techniques",li:["<b>Speculative decoding</b> — draft model + verify; memory implications.","<b>Disaggregated serving</b> — separate prefill and decode across GPUs."]}],body:"inference"},
{id:6,domain:"system",num:"06",phase:"training",img:"/images/memory-llms/card-06.png",video:"/images/memory-llms/card-06.mp4",title:"Training-Time Memory",summary:"Where memory actually goes during training — and how to fit bigger models.",chips:["Optimizer states","Gradients","Checkpointing","ZeRO/FSDP"],lead:"Training memory is dominated by things inference never sees. This is the part most people's mental model is missing.",sections:[
{h:"What consumes training memory",li:["<b>Optimizer states</b> — Adam stores 2 extra tensors per param (~2× weights).","<b>Gradients</b> — one per parameter.","<b>Activations</b> — scale with batch size, sequence length, and depth."]},
{h:"Reducing activation memory",li:["<b>Activation checkpointing / recomputation</b> — recompute instead of store.","Mixed-precision training (FP16/BF16) with FP32 master weights."]},
{h:"Sharding &amp; distribution",li:["<b>ZeRO / FSDP</b> — shard optimizer states, gradients, and params across devices.","Offloading optimizer states to CPU/NVMe (ZeRO-Offload/Infinity).","Gradient accumulation to emulate larger batches within memory limits."]}],body:"training"},
{id:7,domain:"system",num:"07",phase:"static",img:"/images/memory-llms/card-07.png",title:"Hardware &amp; System-Level Memory",summary:"The physical memory hierarchy and how models are spread across it.",chips:["HBM/VRAM","Bandwidth","Unified memory","Parallelism","Offloading"],lead:"All of the above ultimately lands on real silicon. Understanding the hierarchy explains why certain optimizations matter.",sections:[
{h:"The memory hierarchy",li:["<b>HBM / VRAM</b> vs on-chip SRAM vs system RAM vs disk.","<b>Bandwidth vs capacity</b> — decode is typically bandwidth-bound.","Why FlashAttention's IO-awareness pays off here."]},
{h:"Unified &amp; alternative memory",li:["Apple Silicon unified memory — CPU/GPU share one pool.","Implications for running large models on consumer hardware."]},
{h:"Distribution &amp; offloading",li:["Tensor / pipeline / data parallelism and their memory profiles.","<b>CPU offloading</b> and <b>disk offloading</b> for oversized models.","PCIe/NVLink transfer costs as the hidden bottleneck."]}],body:"hardware"},
{id:8,domain:"agent",num:"08",phase:"app",img:"/images/memory-llms/card-08.png",video:"/images/memory-llms/card-08.mp4",title:"Agent &amp; Long-Term Memory Systems",summary:"How an LLM application remembers beyond a single context window.",chips:["Working memory","RAG","Vector stores","Summarization","Episodic/semantic"],lead:"A different sense of 'memory': not bytes on a GPU, but how an app retains and recalls information across turns and sessions.",sections:[
{h:"The context window as working memory",li:["Short-term memory = whatever fits in the current context window.","Limits, cost, and the 'lost in the middle' recall problem."]},
{h:"External / long-term memory",li:["<b>RAG</b> + vector databases as retrievable long-term storage.","Episodic (events) vs semantic (facts) vs procedural memory.","Memory writes: what to store, when, and how to index it."]},
{h:"Managing growing memory",li:["Conversation <b>summarization / compaction</b> to stay within context.","Eviction and relevance ranking of stored memories.","Frameworks &amp; patterns (memory stores, scratchpads, profiles)."]},
{h:"Multimodal memory",li:["Storing and recalling images/audio alongside text.","Embedding and retrieval across modalities."]}],body:"agent"},
{id:9,domain:"agent",num:"09",phase:"app",img:"/images/memory-llms/card-09.png",title:"Advanced &amp; Emerging Architectures",summary:"Architectures and techniques rethinking memory from the ground up.",chips:["Mamba","RWKV","Long context","SSMs"],lead:"Beyond transformers: approaches that change the fundamental memory–sequence relationship.",sections:[
{h:"Memory-efficient architectures",li:["<b>State-space models</b> (Mamba) — constant-size state instead of KV cache.","<b>RWKV</b> — RNN-style recurrence with transformer-level quality.","Linear / sub-quadratic attention variants."]},
{h:"Long-context techniques",li:["Context extension (RoPE scaling, position interpolation).","Retrieval-augmented and memory-augmented long context.","Hybrid architectures mixing attention and recurrence."]}],body:"advanced"}
];
/* ===== rich, written-out bodies for selected topics ===== */
/*
 * BODIES — long-form HTML for each topic, one zero-argument function per key.
 * Each function returns an HTML string; the keys line up with the `body`
 * field on the card entries (e.g. "agent", "advanced") and with the
 * same-named entry in INITS, which finds the widget by its id (#mmf-…)
 * and attaches the interactive behaviour.
 *
 * Markup rules kept throughout:
 *  - Empty non-void elements are written with an explicit end tag
 *    (<div class="mmf-bar"></div>). The HTML parser ignores "/>" on
 *    non-void elements, so <div …/> would swallow its following siblings.
 *  - <input> is void and gets no end tag; the wrapper <div> around a
 *    label + slider is closed explicitly right after the input.
 *  - A space is kept before inline tags (<b>, <em>, <span>, <small>) so
 *    words do not run together when rendered.
 */
var BODIES={
/* Topic: what occupies GPU memory, memory/compute trade-offs, training vs inference.
   Widgets: #mmf-map, #mmf-trade, #mmf-est. */
fundamentals:function(){return `<div class="mm-sec"><h3>Types of memory in an LLM</h3><p>Think of GPU memory as a desk. Before the model can do any useful work, several different things have to fit on that desk <em>at the same time</em>. Almost every optimization you'll meet later is just a way to shrink one of these piles or remove it entirely — so it pays to know what they are.</p><p>There are four buckets. Two are always there, one shows up only when you're generating text, and one only when you're training. Toggle the mode below and tap each block to see what it holds.</p><div class="mmf-w" id="mmf-map"><h4>What's on the GPU right now</h4><p class="mmf-sub">Switch the mode, then tap a block. Bar length shows roughly how big each pile tends to be.</p><div class="mmf-switch" data-role="mode"><button data-mode="inference" class="on">Inference</button><button data-mode="training">Training</button></div><div class="mmf-blocks"><div class="mmf-block" data-k="weights" style="--c:#5b8cff"><span class="mmf-bt">Weights</span><div class="mmf-bar"></div><span class="mmf-tag">always present</span></div><div class="mmf-block" data-k="act" style="--c:#3ecf8e"><span class="mmf-bt">Activations</span><div class="mmf-bar"></div><span class="mmf-tag">both modes</span></div><div class="mmf-block" data-k="kv" style="--c:#00c853"><span class="mmf-bt">KV cache</span><div class="mmf-bar"></div><span class="mmf-tag">inference only</span></div><div class="mmf-block" data-k="opt" style="--c:#f5a623"><span class="mmf-bt">Optimizer + grads</span><div class="mmf-bar"></div><span class="mmf-tag">training only</span></div></div><div class="mmf-detail">Tap a block above to see what it stores.</div></div></div><div class="mm-sec"><h3>Memory vs compute trade-offs</h3><p>You rarely get memory savings for free. Most techniques are a trade against <em>compute</em> (time): you decide which resource you'd rather spend. 
Two moves show up over and over.</p><p><b>Recomputation</b> throws activations away and rebuilds them later when they're needed — saving memory but paying extra compute (this is "activation checkpointing"). <b>Caching</b> is the opposite: keep results around so you never redo them — saving compute but spending memory (the KV cache and prefix caching both do this).</p><p>There's a third axis people forget: <b>bandwidth vs capacity</b>. Capacity is how much fits on the desk; bandwidth is how fast you can move things on and off it. When a model generates text one token at a time, the GPU often sits idle waiting to <em>read the weights</em> from memory — so decoding is usually bandwidth-bound, not compute-bound.</p><div class="mmf-w" id="mmf-trade"><h4>Spend memory or spend time?</h4><p class="mmf-sub">Drag the slider to choose how much you store vs recompute.</p><input type="range" class="mmf-slider" min="0" max="100" value="50"><div class="mmf-ends"><span>◀ Store / cache everything</span><span>Recompute everything ▶</span></div><div class="mmf-meters"><div class="mmf-meter"><div class="mmf-ml"><span>Memory used</span><span class="mmf-mv" data-m="mem"></span></div><div class="mmf-track"><div class="mmf-fill" data-f="mem" style="background:#5b8cff"></div></div></div><div class="mmf-meter"><div class="mmf-ml"><span>Compute / latency</span><span class="mmf-mv" data-m="cpu"></span></div><div class="mmf-track"><div class="mmf-fill" data-f="cpu" style="background:#f5a623"></div></div></div></div><div class="mmf-readout"></div></div></div><div class="mm-sec"><h3>Training vs inference memory</h3><p>The same model has a very different appetite depending on what you're doing with it. <b>Training</b> has to hold everything at once: the weights, a gradient for every weight, the optimizer's running averages, and all the activations needed to run backpropagation. <b>Inference</b> drops the gradients and optimizer entirely, keeps only a slim set of activations, and adds the KV cache.</p><p>The rule of thumb: 
training a model takes <em>several times</em> the memory of simply running it. That gap is exactly why you can comfortably run a model on hardware that could never have trained it. Move the slider and switch the inference precision to feel the difference.</p><div class="mmf-w" id="mmf-est"><h4>Training vs inference memory estimator</h4><p class="mmf-sub">Rough rule-of-thumb breakdown — orders of magnitude, not exact figures.</p><div class="mmf-ctl"><div style="flex:1 1 240px;"><label>Model size — <span class="mmf-pval">7B</span> parameters</label><input type="range" class="mmf-slider" id="mmf-params" min="1" max="70" value="7"></div><div class="mmf-prec"><label>Inference precision</label><button data-b="4">FP32</button><button data-b="2" class="on">FP16</button><button data-b="1">INT8</button><button data-b="0.5">INT4</button></div></div><div class="mmf-est-bars"><div class="mmf-eb" data-row="train"><div class="mmf-ebl"><span>Training <small>(FP16 mixed precision)</small></span><span class="mmf-tot"></span></div><div class="mmf-stack"></div></div><div class="mmf-eb" data-row="inf"><div class="mmf-ebl"><span>Inference</span><span class="mmf-tot"></span></div><div class="mmf-stack"></div></div></div><div class="mmf-legend"><span><i style="background:#5b8cff"></i>Weights</span><span><i style="background:#f5a623"></i>Optimizer + grads</span><span><i style="background:#3ecf8e"></i>Activations</span><span><i style="background:#00c853"></i>KV cache</span></div><div class="mmf-ratio"></div><p class="mmf-cap">Approximate, per common rules of thumb: weights = params × bytes-per-param; Adam optimizer + gradients ≈ 14 bytes/param; activations and KV cache depend on batch size and context length and are shown as representative values.</p></div></div>`;},
/* Topic: weight footprint, numeric precision, quantization, LoRA/QLoRA.
   Widget: #mmf-wt. */
weights:function(){return `<div class="mm-sec"><h3>Parameter count &amp; footprint</h3><p>A model's weights are simply a giant pile of numbers — the parameters it learned during training. The memory they take is pleasantly easy to estimate: <span class="mmf-kbd">footprint ≈ parameters × bytes per parameter</span>. Nothing else about the architecture changes this number. A 7-billion-parameter model at 2 bytes each is about 14&nbsp;GB.</p><p>For smaller models the embedding and output (vocabulary) matrices can be a surprisingly large slice, because they scale with vocabulary size rather than with depth.</p></div><div class="mm-sec"><h3>Data types &amp; precision</h3><p>How many bytes each parameter takes depends on its numeric type: <b>FP32</b> uses 4 bytes, <b>FP16/BF16</b> use 2, <b>INT8</b> uses 1, and <b>INT4</b> about half a byte. BF16 and FP16 are the same size but split their bits differently — BF16 keeps FP32's wide range (good for training stability) at the cost of precision.</p><p>Halving the bytes halves both the memory and the bandwidth needed to read the weights, which is why lower precision usually runs faster too. Try it below.</p><div class="mmf-w" id="mmf-wt"><h4>Weights footprint calculator</h4><p class="mmf-sub">Pick a size and precision — see how many GB the raw weights need.</p><div class="mmf-ctl"><div style="flex:1 1 240px;"><label>Parameters — <span class="mmf-pval">7B</span></label><input type="range" class="mmf-slider" id="mmf-wparams" min="1" max="180" value="7"></div><div class="mmf-prec"><label>Precision</label><button data-b="4">FP32</button><button data-b="2" class="on">FP16</button><button data-b="1">INT8</button><button data-b="0.5">INT4</button></div></div><div class="mmf-stat" data-s="gb"></div><div class="mmf-fitline" data-s="fit"></div><div class="mmf-est-bars" data-s="cmp"></div><p class="mmf-cap">footprint = parameters × bytes-per-param. 
FP32 = 4B, FP16/BF16 = 2B, INT8 = 1B, INT4 ≈ 0.5B.</p></div></div><div class="mm-sec"><h3>Quantization basics &amp; impact</h3><p>Quantization is the art of squeezing weights into fewer bits. <b>Post-training quantization (PTQ)</b> compresses an already-trained model; <b>quantization-aware training (QAT)</b> trains with the rounding in mind for better accuracy. Popular schemes — GPTQ, AWQ, GGUF/llama.cpp, bitsandbytes — differ mainly in how they choose scaling factors (per-channel or per-group) to limit the accuracy hit.</p><p>The trade-off is always the same triangle: <b>size vs accuracy vs speed</b>. Going from FP16 to INT4 roughly quarters the footprint — letting a model that needed a data-center GPU run on a laptop — usually with only a small quality drop.</p></div><div class="mm-sec"><h3>Memory-efficient fine-tuning</h3><p>You don't have to retrain all the weights to adapt a model. <b>LoRA / PEFT</b> freezes the big base model and trains tiny add-on matrices, so you store and update a few million parameters instead of billions. <b>QLoRA</b> goes further: it keeps the base model quantized to 4-bit and trains the LoRA adapters on top — letting you fine-tune large models on a single consumer GPU.</p></div>`;},
/* Topic: the KV cache, how it scales, and how to shrink it.
   Widget: #mmf-kv. */
kv:function(){return `<div class="mm-sec"><h3>What the KV cache is &amp; why it matters</h3><p>When a model generates text, each new token must "look back" at every previous token through attention. Re-deriving the keys and values for all those past tokens at every step would be hugely wasteful, so the model caches them — that's the <b>KV cache</b>. It's what makes generation fast: each new token computes only its own key and value and reuses the rest.</p></div><div class="mm-sec"><h3>How it scales</h3><p>The catch is that the cache grows with every token. Its size is roughly <span class="mmf-kbd">2 × layers × heads × head_dim × tokens × batch × bytes</span> (the 2 is for keys and values). It grows <b>linearly</b> with context length and batch size — double the conversation and you double the cache. At long context this often dwarfs the weights themselves: the long-context tax.</p><div class="mmf-w" id="mmf-kv"><h4>KV cache size — watch it grow</h4><p class="mmf-sub">Based on a 7B-class model: 32 layers, 32 query heads, head dim 128, FP16.</p><div class="mmf-ctl"><div style="flex:1 1 220px;"><label>Context length — <span class="mmf-pval" data-p="ctx">8K</span> tokens</label><input type="range" class="mmf-slider" id="mmf-ctx" min="1" max="128" value="8"></div><div style="flex:1 1 150px;"><label>Batch — <span class="mmf-pval" data-p="bs">1</span></label><input type="range" class="mmf-slider" id="mmf-bs" min="1" max="32" value="1"></div></div><div class="mmf-prec" data-role="attn" style="margin-bottom:6px;"><label>Attention type (K/V heads)</label><button data-h="32">MHA · 32</button><button data-h="8" class="on">GQA · 8</button><button data-h="1">MQA · 1</button></div><div class="mmf-stat" data-s="gb"></div><div class="mmf-readout" data-s="cmp"></div><p class="mmf-cap">cache = 2 × layers × kv-heads × head_dim × tokens × batch × 2 bytes. 
GQA/MQA shrink it by sharing K/V across heads.</p></div></div><div class="mm-sec"><h3>Reducing the KV cache</h3><p>Most savings come from making heads share their keys and values. <b>Multi-Query Attention (MQA)</b> uses one shared K/V for all heads; <b>Grouped-Query Attention (GQA)</b> is the popular middle ground, sharing across small groups. DeepSeek's <b>Multi-head Latent Attention (MLA)</b> compresses K/V into a small latent vector. You can also <b>quantize the cache</b> to INT8/INT4. Flip the attention type above to feel the difference.</p></div><div class="mm-sec"><h3>Context window limitations</h3><p>Context limits are usually set by this memory cost and attention's compute, not by anything fundamental from training. Position-encoding tricks like <b>RoPE scaling</b> and <b>ALiBi</b> let models stretch to longer contexts than they were trained on — with some quality cost.</p></div>`;},
/* Topic: quadratic attention memory, FlashAttention, sparse/windowed attention.
   Widget: #mmf-attn. */
attention:function(){return `<div class="mm-sec"><h3>Standard attention memory cost</h3><p>Attention compares every token with every other token. That comparison produces a score matrix of size <b>sequence × sequence</b> — so its memory grows with the <em>square</em> of the sequence length. Double the context and the matrix quadruples. For long sequences it's the memory of this matrix, not the raw compute, that blows up first.</p><div class="mmf-w" id="mmf-attn"><h4>Why long context explodes: n² vs n</h4><p class="mmf-sub">Slide the sequence length and watch the attention matrix grow.</p><div class="mmf-ctl"><div style="flex:1 1 100%;"><label>Sequence length — <span class="mmf-pval">16</span> tokens</label><input type="range" class="mmf-slider" id="mmf-n" min="4" max="40" value="16"></div></div><div class="mmf-matrix" data-s="grid"></div><div class="mmf-est-bars"><div class="mmf-eb"><div class="mmf-ebl"><span>Standard attention <small>O(n²)</small></span><span data-s="std"></span></div><div class="mmf-stack"><div class="mmf-seg" data-f="std" style="background:#f5a623"></div></div></div><div class="mmf-eb"><div class="mmf-ebl"><span>FlashAttention <small>O(n)</small></span><span data-s="fa"></span></div><div class="mmf-stack"><div class="mmf-seg" data-f="fa" style="background:#00c853"></div></div></div></div><div class="mmf-readout" data-s="note"></div></div></div><div class="mm-sec"><h3>FlashAttention &amp; IO-aware methods</h3><p><b>FlashAttention</b> is the key fix. Instead of building the whole n×n matrix in slow GPU memory, it processes attention in small <b>tiles</b> that fit in fast on-chip SRAM, computing the result without ever materializing the full matrix. This drops memory from O(n²) to <b>O(n)</b> and runs faster by moving less data. 
Later versions (FA-2, FA-3) tuned the GPU work further.</p></div><div class="mm-sec"><h3>Sparse &amp; windowed attention</h3><p>The other approach is to simply not attend to everything. <b>Sliding-window</b> attention (used by Mistral) limits each token to a fixed nearby window. <b>Sparse</b> and block-sparse patterns mix local attention with a few global tokens. You trade some ability to connect very distant tokens for large memory savings.</p></div>`;},
/* Topic: serving-time memory management (paging, batching, caching, eviction).
   Widget: #mmf-pag. */
inference:function(){return `<div class="mm-sec"><h3>Paging &amp; batching</h3><p>Production serving is mostly a memory-management problem. <b>PagedAttention</b> (from vLLM) treats the KV cache the way an operating system treats RAM: it splits memory into fixed-size pages so requests of different lengths pack together without leaving wasted gaps (fragmentation). <b>Continuous batching</b> adds and removes requests from the running batch on the fly instead of waiting for a whole batch to finish, keeping the GPU busy.</p><div class="mmf-w" id="mmf-pag"><h4>PagedAttention: stop wasting GPU memory</h4><p class="mmf-sub">Toggle paging and watch fragmentation disappear. Each colour is one request.</p><div class="mmf-switch" data-role="pg"><button data-pg="0" class="on">No paging</button><button data-pg="1">PagedAttention</button></div><div class="mmf-memgrid" data-s="grid"></div><div class="mmf-readout" data-s="note"></div></div></div><div class="mm-sec"><h3>Caching across requests</h3><p>Different requests often share a prefix — the same long system prompt, or the same retrieved documents in RAG. <b>Prefix / prompt caching</b> stores the KV for that shared part once and reuses it across requests, so you pay to process it only the first time. Hit rate and a sensible eviction policy decide how much you save.</p></div><div class="mm-sec"><h3>KV compression &amp; eviction</h3><p>You can also shrink the cache while serving: <b>evict</b> tokens that no longer matter (H2O and the attention-sink / StreamingLLM idea keep just the important and most-recent tokens), or store the cache in <b>INT8/INT4</b>.</p></div><div class="mm-sec"><h3>Other techniques</h3><p><b>Speculative decoding</b> uses a small draft model to guess several tokens that the big model then verifies in one pass. <b>Disaggregated serving</b> splits the prefill and decode phases onto different GPUs so each runs on hardware suited to it.</p></div>`;},
/* Topic: training memory, activation checkpointing, ZeRO/FSDP sharding.
   Widget: #mmf-zero. */
training:function(){return `<div class="mm-sec"><h3>What consumes training memory</h3><p>Training memory is dominated by things inference never sees. For every parameter you keep three things: the <b>weight</b> itself, its <b>gradient</b>, and the optimizer's bookkeeping. Adam stores two extra values per parameter (a running mean and variance), and mixed-precision training also keeps an FP32 master copy — together often <b>~12–16 bytes per parameter</b>, several times the weights alone.</p><p>On top of that sit the <b>activations</b>, which scale with batch size, sequence length, and depth. Use the widget below to see how sharding and checkpointing claw the memory back.</p><div class="mmf-w" id="mmf-zero"><h4>ZeRO / FSDP: fitting a model across GPUs</h4><p class="mmf-sub">7B model, FP16 mixed precision. Watch per-GPU memory shrink as you shard.</p><div class="mmf-prec" data-role="stage" style="margin-bottom:12px;"><label>ZeRO stage</label><button data-z="0" class="on">0 · none</button><button data-z="1">1 · optim</button><button data-z="2">2 · +grads</button><button data-z="3">3 · +params</button></div><div class="mmf-ctl"><div style="flex:1 1 240px;"><label>GPUs — <span class="mmf-pval">8</span></label><input type="range" class="mmf-slider" id="mmf-gpus" min="1" max="16" value="8"></div><div><label>Activation checkpointing</label><div class="mmf-switch" data-role="ckpt"><button data-ck="0" class="on">Off</button><button data-ck="1">On</button></div></div></div><div class="mmf-est-bars"><div class="mmf-eb"><div class="mmf-ebl"><span>Memory per GPU</span><span class="mmf-tot"></span></div><div class="mmf-stack"></div></div></div><div class="mmf-legend"><span><i style="background:#5b8cff"></i>Weights</span><span><i style="background:#f5a623"></i>Optimizer</span><span><i style="background:#e0556b"></i>Gradients</span><span><i style="background:#3ecf8e"></i>Activations</span></div><div class="mmf-ratio"></div></div></div><div class="mm-sec"><h3>Reducing activation memory</h3><p>The biggest lever on 
activation memory is <b>recomputation</b>, a.k.a. activation checkpointing: instead of storing every layer's activations for the backward pass, you keep a few checkpoints and recompute the rest on demand — trading extra compute for large memory savings. <b>Mixed-precision</b> (FP16/BF16) with an FP32 master copy also keeps activations and gradients small.</p></div><div class="mm-sec"><h3>Sharding &amp; distribution</h3><p>To train models too big for one GPU, you <b>shard</b>. <b>ZeRO</b> (DeepSpeed) and <b>FSDP</b> (PyTorch) split the optimizer states, gradients, and even the parameters across GPUs so no single device holds the whole thing. You can <b>offload</b> optimizer state to CPU or NVMe (ZeRO-Offload / Infinity), and use <b>gradient accumulation</b> to simulate big batches within a tight memory budget.</p></div>`;},
/* Topic: the physical memory hierarchy, unified memory, offloading.
   Widget: #mmf-hier. */
hardware:function(){return `<div class="mm-sec"><h3>The memory hierarchy</h3><p>Every byte we've discussed ultimately lives on real silicon, arranged in a hierarchy: tiny, blazing-fast on-chip <b>SRAM</b>; the GPU's <b>HBM/VRAM</b> (tens of GB, very fast); <b>system RAM</b> (hundreds of GB, slower); and <b>disk/SSD</b> (huge, slow). The higher you go, the faster and smaller; the lower, the bigger and slower. Two numbers matter at each level — <b>capacity</b> (how much fits) and <b>bandwidth</b> (how fast you can move it).</p><div class="mmf-w" id="mmf-hier"><h4>The memory hierarchy — speed vs size</h4><p class="mmf-sub">Tap a tier. Bar length shows relative bandwidth (how fast); the label shows capacity (how much).</p><div class="mmf-hier" data-s="rows"></div><div class="mmf-detail">Tap a tier to see its trade-off.</div></div></div><div class="mm-sec"><h3>Unified &amp; alternative memory</h3><p>During token-by-token decoding the GPU often finishes its math and then waits to read weights from HBM — it's <b>bandwidth-bound</b>, not compute-bound. That's exactly why FlashAttention's trick of keeping data in fast SRAM pays off. Some systems blur the levels: <b>Apple Silicon</b> uses unified memory, where CPU and GPU share one pool — letting consumer machines run surprisingly large models.</p></div><div class="mm-sec"><h3>Distribution &amp; offloading</h3><p>When a model still doesn't fit, you can <b>offload</b> parts to CPU RAM or disk, or spread it across GPUs with <b>tensor</b>, <b>pipeline</b>, or <b>data</b> parallelism — each with a different memory profile. The catch: the <b>PCIe/NVLink</b> links between devices then become the hidden bottleneck, since moving data between them is far slower than reading local memory.</p></div>`;},
/* Topic: application-level memory (context window, RAG, summarization).
   Widget: #mmf-ctxwin. */
agent:function(){return `<div class="mm-sec"><h3>The context window as working memory</h3><p>Here "memory" means something different — not bytes on a GPU, but what an application remembers across turns. The simplest memory is the <b>context window</b> itself: whatever you can fit in the prompt is the model's short-term, working memory. It's fast and direct, but limited in size, costs money per token, and suffers the <b>"lost in the middle"</b> problem where facts buried in a long context get overlooked.</p><div class="mmf-w" id="mmf-ctxwin"><h4>Working memory fills up — manage it</h4><p class="mmf-sub">Add turns until the window fills, then try summarizing. Limit: 8,000 tokens.</p><div style="display:flex;gap:10px;flex-wrap:wrap;margin-bottom:14px;"><button class="mmf-btn" data-a="add">+ Add a turn</button><button class="mmf-btn" data-a="sum">Summarize old turns</button><button class="mmf-btn" data-a="reset">Reset</button></div><div style="display:flex;justify-content:space-between;font-size:1.05rem;margin-bottom:6px;opacity:.85;"><span data-s="lbl">0 / 8000 tokens</span><span data-s="turns">0 turns</span></div><div class="mmf-track" style="height:24px;"><div class="mmf-fill" data-f="ctx" style="background:#7c5cff;width:0%"></div></div><div class="mmf-readout" data-s="note">Empty context. Each turn adds about 800 tokens.</div></div></div><div class="mm-sec"><h3>External / long-term memory</h3><p>For anything longer-lived, you store information outside the model and retrieve it when needed — usually with <b>RAG</b> and a vector database. It helps to think in human terms: <b>episodic</b> memory (what happened), <b>semantic</b> memory (facts), and <b>procedural</b> memory (how to do things). 
The hard design questions are what to write down, when, and how to index it for retrieval.</p></div><div class="mm-sec"><h3>Managing growing memory</h3><p>Because conversations grow without bound, you must manage what stays in context. <b>Summarization / compaction</b> folds old turns into a short recap; <b>relevance ranking</b> and <b>eviction</b> keep only what matters. Common patterns include memory stores, scratchpads, and user profiles.</p></div><div class="mm-sec"><h3>Multimodal memory</h3><p>Memory isn't only text. <b>Multimodal memory</b> stores and recalls images and audio alongside words, embedding everything into a shared space so a single query can retrieve across modalities.</p></div>`;},
/* Topic: architectures with constant-size state and long-context techniques.
   Widget: #mmf-arch. */
advanced:function(){return `<div class="mm-sec"><h3>Memory-efficient architectures</h3><p>Transformers pay for their power with a KV cache that grows with every token. A new family of architectures avoids that. <b>State-space models</b> like <b>Mamba</b> carry a fixed-size "state" that summarizes everything so far — so memory stays <b>constant</b> no matter how long the sequence, instead of growing linearly. <b>RWKV</b> blends RNN-style recurrence with transformer-level quality, also keeping memory flat. Various linear / sub-quadratic attention variants chase the same goal.</p><div class="mmf-w" id="mmf-arch"><h4>Growing cache vs constant state</h4><p class="mmf-sub">Slide the sequence length. The transformer's memory climbs; an SSM stays flat.</p><div class="mmf-ctl"><div style="flex:1 1 100%;"><label>Sequence length — <span class="mmf-pval">8K</span> tokens</label><input type="range" class="mmf-slider" id="mmf-seq" min="1" max="256" value="8"></div></div><div class="mmf-est-bars"><div class="mmf-eb"><div class="mmf-ebl"><span>Transformer <small>KV cache, grows with length</small></span><span data-s="tf"></span></div><div class="mmf-stack"><div class="mmf-seg" data-f="tf" style="background:#f5a623"></div></div></div><div class="mmf-eb"><div class="mmf-ebl"><span>State-space / Mamba <small>constant state</small></span><span data-s="ssm"></span></div><div class="mmf-stack"><div class="mmf-seg" data-f="ssm" style="background:#00c853"></div></div></div></div><div class="mmf-readout" data-s="note"></div></div></div><div class="mm-sec"><h3>Long-context techniques</h3><p>Even within transformers, several techniques stretch usable context: <b>RoPE scaling</b> and <b>position interpolation</b> extend a model past its trained length; <b>retrieval-</b> and <b>memory-augmented</b> methods pull in only the relevant slices instead of attending to everything; and <b>hybrid</b> architectures interleave attention layers with recurrent or state-space layers to get the best of both.</p></div>`;}
};
var INITS={
fundamentals:function(c){
var map=c.querySelector("#mmf-map");
if(map){
var DET={
weights:"Model weights — the learned parameters. Always present, training or inference. A 7B model in FP16 is about 14 GB before anything else loads.",
act:"Activations — the intermediate results from each layer during a forward pass. They scale with batch size and sequence length. Training must keep them around for the backward pass (large); inference can discard them almost immediately (small).",
kv:"KV cache — the keys and values for every token so far, saved so they aren't recomputed each step. Inference only. It starts tiny and grows with every generated token — the long-context tax.",
opt:"Optimizer states & gradients — only exist during training. Every parameter gets a gradient, and an optimizer like Adam keeps two extra running averages per parameter. Usually the single biggest chunk of training memory."
};
var SIZ={inference:{weights:72,act:22,kv:38,opt:0},training:{weights:42,act:78,kv:0,opt:92}};
var blocks=map.querySelectorAll(".mmf-block");
var det=map.querySelector(".mmf-detail");
function applyMode(m){var s=SIZ[m];blocks.forEach(function(b){var k=b.getAttribute("data-k");var bar=b.querySelector(".mmf-bar");var w=s[k];if(w<=0){b.classList.add("off");bar.style.width="0%";}else{b.classList.remove("off");bar.style.width=w+"%";}});} map.querySelectorAll('[data-role="mode" ]= button').forEach(function(btn){btn.addEventListener("click",function(){map.querySelectorAll('[data-role="mode" ]= button').forEach(function(x){x.classList.remove("on");});btn.classList.add("on");applyMode(btn.getAttribute("data-mode"));});});= blocks.forEach(function(b){b.addEventListener("click",function(){blocks.forEach(function(x){x.classList.remove("sel");});b.classList.add("sel");det.textContent=DET[b.getAttribute("data-k")];});}); applyMode("inference");= }= var= trade=c.querySelector("#mmf-trade"); if(trade){= var= sl=trade.querySelector(".mmf-slider"); var= fmem=trade.querySelector('[data-f="mem"]'),fcpu=trade.querySelector('[data-f="cpu"]'); var= vmem=trade.querySelector('[data-m="mem"]'),vcpu=trade.querySelector('[data-m="cpu"]'); var= ro=trade.querySelector(".mmf-readout"); function= upd(){var= v=+sl.value;var mem=Math.round(95-0.7*v),cpu=Math.round(25+0.7*v);fmem.style.width=mem+"%";fcpu.style.width=cpu+"%";vmem.textContent=mem+"% used";vcpu.textContent=cpu+"% of= max";var= t;if(v<33)t="<b>Store everything</b> — full KV cache, no recomputation. Fastest, but memory fills up quickly." ;else= if(v=>66)t="<b>Recompute aggressively</b> — activation checkpointing, minimal caching. Fits in far less memory, but each step costs extra compute.";else t="<b>Balanced</b> — keep the hot data, rebuild the cheap-to-recompute parts. The usual real-world setting.";ro.innerHTML=t;}
sl.addEventListener("input",upd);upd();
}
var est=c.querySelector("#mmf-est");
if(est){
var ps=est.querySelector("#mmf-params");var pv=est.querySelector(".mmf-pval");var bytes=2;
function gb(x){return x>=10?Math.round(x)+" GB":(Math.round(x*10)/10)+" GB";}
function seg(color,val,scale){return '<div class="mmf-seg" style="background:'+color+';width:'+(val/scale*100)+'%"/>';}
function calc(){var P=+ps.value;pv.textContent=P+"B";var tw=2*P,to=14*P,ta=2*P,tt=tw+to+ta;var iw=bytes*P,ia=0.5*P,ik=1*P,it=iw+ia+ik;var scale=tt;var tr=est.querySelector('[data-row="train"]'),inf=est.querySelector('[data-row="inf"]');tr.querySelector(".mmf-stack").innerHTML=seg("#5b8cff",tw,scale)+seg("#f5a623",to,scale)+seg("#3ecf8e",ta,scale);tr.querySelector(".mmf-tot").textContent=gb(tt);inf.querySelector(".mmf-stack").innerHTML=seg("#5b8cff",iw,scale)+seg("#3ecf8e",ia,scale)+seg("#00c853",ik,scale);inf.querySelector(".mmf-tot").textContent=gb(it);est.querySelector(".mmf-ratio").innerHTML="Training needs ≈<b>"+(Math.round(tt/it*10)/10)+"×</b> the memory of inference here.";}
ps.addEventListener("input",calc);
est.querySelectorAll(".mmf-prec button").forEach(function(b){b.addEventListener("click",function(){est.querySelectorAll(".mmf-prec button").forEach(function(x){x.classList.remove("on");});b.classList.add("on");bytes=parseFloat(b.getAttribute("data-b"));calc();});});
calc();
}
},
weights:function(c){
var w=c.querySelector("#mmf-wt");if(!w)return;
var ps=w.querySelector("#mmf-wparams"),pv=w.querySelector(".mmf-pval"),bytes=2;
var PREC=[{n:"FP32",b:4,col:"#9aa4b2"},{n:"FP16",b:2,col:"#5b8cff"},{n:"INT8",b:1,col:"#3ecf8e"},{n:"INT4",b:0.5,col:"#00c853"}];
function gb(x){return x>=10?Math.round(x)+" GB":(Math.round(x*10)/10)+" GB";}
function calc(){var P=+ps.value;pv.textContent=P+"B";var g=P*bytes;
w.querySelector('[data-s="gb"]').textContent=gb(g)+" · "+bytes+" bytes/param";
var fit=w.querySelector('[data-s="fit"]');
if(g<=24)fit.innerHTML="✅ Fits= on= a= 24= GB= GPU= (e.g.= RTX= 4090).";= else= if(g<=80)fit.innerHTML="✅ Fits= on= an= 80= GB= GPU= (A100/H100),= but= not= a= 24= GB= card.";= else= fit.innerHTML="⚠️ Too big for a single 80 GB GPU — needs multiple GPUs or heavier quantization." ;= var= maxg=P*4; w.querySelector('[data-s="cmp" ]').innerHTML=PREC.map(function(p){var v=P*p.b;return '<div= class="mmf-eb"><div class="mmf-ebl"><span>'+p.n+'</span><span>'+gb(v)+'</span></div><div class="mmf-stack"><div class="mmf-seg" style="background:'+p.col+';width:'+(v/maxg*100)+'%"/></div></div>';}).join("");
}
ps.addEventListener("input",calc);
w.querySelectorAll(".mmf-prec button").forEach(function(b){b.addEventListener("click",function(){w.querySelectorAll(".mmf-prec button").forEach(function(x){x.classList.remove("on");});b.classList.add("on");bytes=parseFloat(b.getAttribute("data-b"));calc();});});
calc();
},
kv:function(c){
var w=c.querySelector("#mmf-kv");if(!w)return;
var ctx=w.querySelector("#mmf-ctx"),bs=w.querySelector("#mmf-bs"),kvh=8,layers=32,hd=128,bytes=2;
function gb(x){return x>=10?Math.round(x)+" GB":(Math.round(x*100)/100)+" GB";}
function calc(){var seqK=+ctx.value,batch=+bs.value,tokens=seqK*1024;
w.querySelector('[data-p="ctx"]').textContent=seqK+"K";w.querySelector('[data-p="bs"]').textContent=batch;
var g=(2*layers*kvh*hd*tokens*batch*bytes)/1e9;
w.querySelector('[data-s="gb"]').textContent=gb(g);
var pct=Math.round(g/14*100);
w.querySelector('[data-s="cmp"]').innerHTML="That's about<b>"+pct+"%</b> of the 14 GB the 7B weights take. "+(g>14?"The cache now exceeds the model itself.":"Stretch the context further and it overtakes the weights.");
}
ctx.addEventListener("input",calc);bs.addEventListener("input",calc);
w.querySelectorAll('[data-role="attn"] button').forEach(function(b){b.addEventListener("click",function(){w.querySelectorAll('[data-role="attn"] button').forEach(function(x){x.classList.remove("on");});b.classList.add("on");kvh=parseInt(b.getAttribute("data-h"),10);calc();});});
calc();
},
attention:function(c){
var w=c.querySelector("#mmf-attn");if(!w)return;
var n=w.querySelector("#mmf-n"),pv=w.querySelector(".mmf-pval"),grid=w.querySelector('[data-s="grid"]'),maxN=40;
function calc(){var v=+n.value;pv.textContent=v;
grid.style.gridTemplateColumns="repeat("+v+",1fr)";
grid.innerHTML=new Array(v*v).fill('<div class="mmf-cell"/>').join("");
var std=v*v,fa=v,scale=maxN*maxN;
w.querySelector('[data-f="std"]').style.width=(std/scale*100)+"%";
w.querySelector('[data-f="fa"]').style.width=Math.max(fa/scale*100,1)+"%";
w.querySelector('[data-s="std"]').textContent=std.toLocaleString()+" cells";
w.querySelector('[data-s="fa"]').textContent="~"+fa+" units";
w.querySelector('[data-s="note"]').innerHTML="At<b>"+v+"</b> tokens the full score matrix is<b>"+std.toLocaleString()+"</b> cells. FlashAttention never stores it — it streams tiles, so its memory grows with<b>"+v+"</b>, not "+std.toLocaleString()+".";
}
n.addEventListener("input",calc);calc();
},
inference:function(c){
var w=c.querySelector("#mmf-pag");if(!w)return;
var grid=w.querySelector('[data-s="grid"]'),note=w.querySelector('[data-s="note"]'),mode=0;
var COLS=["#5b8cff","#3ecf8e","#f5a623","#7c5cff"],reqs=[18,27,9,14],slot=25;
function render(){var cells=new Array(100).fill(null);
if(mode===0){var pos=0;reqs.forEach(function(len,i){for(var j=0;j<slot;j++){if(pos<100)cells[pos]=(j<len)?{c:COLS[i]}:{w:true};pos++;}}); var= wasted=cells.filter(function(x){return x&&x.w;}).length;= note.innerHTML="<b>No paging:</b> each request reserves a full-size slot up front. <b>" +wasted+"%= wasted</b=> on gaps, and the grid is full — no room for a 5th request.";}
else{var pos=0;reqs.forEach(function(len,i){for(var j=0;j<len;j++){cells[pos++]={c:COLS[i]};}}); var= free=cells.filter(function(x){return !x;}).length;= note.innerHTML="<b>PagedAttention:</b> requests are packed into pages with almost no waste. <b>" +free+"%= free</b=> — enough to admit more requests on the same GPU.";}
grid.innerHTML=cells.map(function(x){var bg=x?(x.w?"repeating-linear-gradient(45deg,rgba(127,127,127,.3),rgba(127,127,127,.3) 3px,transparent 3px,transparent 6px)":x.c):"rgba(127,127,127,.16)";return '<div class="mmf-mcell" style="background:'+bg+'"/>';}).join("");
}
w.querySelectorAll('[data-role="pg"] button').forEach(function(b){b.addEventListener("click",function(){w.querySelectorAll('[data-role="pg"] button').forEach(function(x){x.classList.remove("on");});b.classList.add("on");mode=parseInt(b.getAttribute("data-pg"),10);render();});});
render();
},
training:function(c){
var w=c.querySelector("#mmf-zero");if(!w)return;
var gpus=w.querySelector("#mmf-gpus"),pv=w.querySelector(".mmf-pval"),stage=0,ckpt=0;
var P=7,W=2*P,G=2*P,O=12*P;
function gb(x){return x>=10?Math.round(x)+" GB":(Math.round(x*10)/10)+" GB";}
function calc(){var N=+gpus.value;pv.textContent=N;var A=ckpt?4:16;var wp=W,gp=G,op=O;
if(stage>=1)op=O/N;if(stage>=2)gp=G/N;if(stage>=3)wp=W/N;
var tot=wp+gp+op+A,maxT=W+G+O+16;
var segs=[["#5b8cff",wp],["#f5a623",op],["#e0556b",gp],["#3ecf8e",A]];
w.querySelector(".mmf-stack").innerHTML=segs.map(function(s){return '<div class="mmf-seg" style="background:'+s[0]+';width:'+(s[1]/maxT*100)+'%"/>';}).join("");
w.querySelector(".mmf-tot").textContent=gb(tot);
w.querySelector(".mmf-ratio").innerHTML="Stage "+stage+" on "+N+" GPUs"+(ckpt?" + checkpointing":"")+":<b>"+gb(tot)+"</b> per GPU vs "+gb(maxT)+" unsharded.";
}
gpus.addEventListener("input",calc);
w.querySelectorAll('[data-role="stage"] button').forEach(function(b){b.addEventListener("click",function(){w.querySelectorAll('[data-role="stage"] button').forEach(function(x){x.classList.remove("on");});b.classList.add("on");stage=parseInt(b.getAttribute("data-z"),10);calc();});});
w.querySelectorAll('[data-role="ckpt"] button').forEach(function(b){b.addEventListener("click",function(){w.querySelectorAll('[data-role="ckpt"] button').forEach(function(x){x.classList.remove("on");});b.classList.add("on");ckpt=parseInt(b.getAttribute("data-ck"),10);calc();});});
calc();
},
hardware:function(c){
var w=c.querySelector("#mmf-hier");if(!w)return;
var rows=w.querySelector('[data-s="rows"]'),det=w.querySelector(".mmf-detail");
var TIERS=[
{n:"On-chip SRAM",cap:"~50 MB",bw:19000,col:"#00c853",d:"On-chip SRAM — the GPU's scratchpad. Astonishingly fast (~19 TB/s) but tiny. FlashAttention works by keeping its tiles here instead of in HBM."},
{n:"HBM / VRAM",cap:"~80 GB",bw:3350,col:"#3ecf8e",d:"High-Bandwidth Memory — the GPU's main memory, holding weights and the KV cache. Fast (~3.3 TB/s on an H100) but limited capacity. This is what 'GPU memory' usually means."},
{n:"System RAM",cap:"~512 GB",bw:100,col:"#5b8cff",d:"CPU system memory. Much bigger, but ~30x slower than HBM and reached over PCIe. Used for offloading optimizer state or layers that don't fit on the GPU."},
{n:"SSD / NVMe",cap:"~8 TB",bw:7,col:"#f5a623",d:"Disk. Effectively unlimited capacity but very slow (~7 GB/s). A last-resort offload (ZeRO-Infinity) for models far larger than RAM."}
];
var maxLog=Math.log(19000);
rows.innerHTML=TIERS.map(function(t,i){var wd=Math.max(Math.log(t.bw)/maxLog*100,6);var bwl=t.bw>=1000?(t.bw/1000)+" TB/s":t.bw+" GB/s";return '<div class="mmf-hrow" data-i="'+i+'" style="--c:'+t.col+'"><div class="mmf-htop"><span>'+t.n+'</span><small>'+t.cap+' · '+bwl+'</small></div><div class="mmf-hbar" style="width:'+wd+'%"/></div>';}).join("");
rows.querySelectorAll(".mmf-hrow").forEach(function(r){r.addEventListener("click",function(){rows.querySelectorAll(".mmf-hrow").forEach(function(x){x.classList.remove("sel");});r.classList.add("sel");det.textContent=TIERS[+r.getAttribute("data-i")].d;});});
},
agent:function(c){
var w=c.querySelector("#mmf-ctxwin");if(!w)return;
var fill=w.querySelector('[data-f="ctx"]'),lbl=w.querySelector('[data-s="lbl"]'),tn=w.querySelector('[data-s="turns"]'),note=w.querySelector('[data-s="note"]');
var LIMIT=8000,used=0,turns=0;
function render(msg){var pct=Math.min(used/LIMIT*100,100);fill.style.width=pct+"%";fill.style.background=used>=LIMIT?"#e0556b":(used>LIMIT*0.8?"#f5a623":"#7c5cff");lbl.textContent=Math.min(used,LIMIT)+" / "+LIMIT+" tokens";tn.textContent=turns+" turns";if(msg)note.innerHTML=msg;}
w.querySelectorAll(".mmf-btn").forEach(function(b){b.addEventListener("click",function(){var a=b.getAttribute("data-a");
if(a==="add"){turns++;used+=800;if(used>=LIMIT)render("<b>Context full.</b> New turns push the oldest ones out — the model forgets the start of the conversation unless you store it elsewhere.");else if(used>LIMIT*0.8)render("Getting full. Time to summarize or offload to long-term memory.");else render("Added a turn. Working memory holds the recent conversation verbatim.");}
else if(a==="sum"){if(turns<=1){render("Nothing to= summarize= yet= —= add= a= few= turns= first.");return;}used=800+Math.round((used-800)*0.25);render("<b>Summarized.</b> Old turns compacted to about 25% of their size — same gist, far fewer tokens. The standard way to stay under the limit.");}
else{used=0;turns=0;render("Reset. Each turn adds about 800 tokens.");}
});});
render();
},
advanced:function(c){
var w=c.querySelector("#mmf-arch");if(!w)return;
var seq=w.querySelector("#mmf-seq"),pv=w.querySelector(".mmf-pval"),perTok=0.125/1024;
function gb(x){return x>=10?Math.round(x)+" GB":(Math.round(x*100)/100)+" GB";}
function calc(){var k=+seq.value,tokens=k*1024;pv.textContent=k+"K";var tf=tokens*perTok,ssm=0.4,max=256*1024*perTok;
w.querySelector('[data-f="tf"]').style.width=Math.min(tf/max*100,100)+"%";
w.querySelector('[data-f="ssm"]').style.width=Math.max(ssm/max*100,1.5)+"%";
w.querySelector('[data-s="tf"]').textContent=gb(tf);
w.querySelector('[data-s="ssm"]').textContent=gb(ssm)+" (flat)";
w.querySelector('[data-s="note"]').innerHTML="At<b>"+k+"K</b> tokens the transformer's cache is<b>"+gb(tf)+"</b> and still climbing; the SSM holds a fixed<b>"+gb(ssm)+"</b> state no matter the length.";
}
seq.addEventListener("input",calc);calc();
}
};
/* Root element of the widget: receives the delegated click handler and is the
   scroll target when switching between the topic list and an article. */
var root=document.getElementById("mm-kb");
/* NOTE(review): identity function - despite its name it performs no escaping.
   Any caller relying on it for HTML safety gets the input back unchanged; confirm
   whether real escaping was intended before using it on untrusted text. */
function esc(s){return s;}
/**
 * Builds the HTML for one topic card.
 * @param {Object} t topic record; reads id, img, num, phase, title, summary and
 *                   chips (array of strings). Values are inserted unescaped.
 * @returns {string} markup for a single, fully closed ".mm-card" element
 */
function cardHTML(t){
  var chips=t.chips.map(function(c){return '<span class="mm-chip">'+c+'</span>';}).join("");
  /* Both wrappers must be closed: with only one end tag the card stays open and
     every following card is parsed as a child of the previous one.
     NOTE(review): the thumb wrapper is closed right after the image because the
     onerror handler hides the image's parent; confirm against the stylesheet
     that the number and badge are not meant to sit inside ".mm-thumb". */
  return '<div class="mm-card" data-id="'+t.id+'">'+
    '<div class="mm-thumb"><img src="'+t.img+'" alt="" loading="lazy" onerror="this.parentNode.style.display=\'none\'"/></div>'+
    '<span class="mm-num">'+t.num+'</span>'+
    '<span class="mm-badge '+t.phase+'">'+PHASE[t.phase]+'</span>'+
    '<h3>'+t.title+'</h3>'+
    '<p>'+t.summary+'</p>'+
    '<div class="mm-chips">'+chips+'</div>'+
    '<div class="mm-open">Open article →</div>'+
    '</div>';
}
/**
 * Renders the cards of every topic in one domain into a grid element.
 * @param {string} id  element id of the grid to populate
 * @param {string} dom domain value to match against each topic's `domain`
 */
function fill(id,dom){
  var grid=document.getElementById(id);
  var cards=[];
  TOPICS.forEach(function(topic){
    if(topic.domain===dom){
      cards.push(cardHTML(topic));
    }
  });
  grid.innerHTML=cards.join("");
}
/* populate both home-page grids */
fill("mm-grid-system","system");
fill("mm-grid-agent","agent");
/**
 * Builds the hero media block shown at the top of an article.
 * @param {Object} t topic record; `video` takes precedence over `img`
 * @returns {string} a closed ".mm-hero" wrapper containing a video or an image,
 *                   or an empty string when the topic has neither
 */
function heroHTML(t){
  if(t.video){
    /* Boolean attributes are written bare, and <video> is not a void element,
       so it needs a real end tag rather than "/>". */
    return '<div class="mm-hero"><video src="'+t.video+'" autoplay loop muted playsinline></video></div>';
  }
  if(t.img){
    /* close the wrapper so the rest of the article is not parsed inside it;
       onerror hides that wrapper when the image fails to load */
    return '<div class="mm-hero"><img src="'+t.img+'" alt="" onerror="this.parentNode.style.display=\'none\'"/></div>';
  }
  return '';
}
/**
 * Builds the full article view for a topic.
 * Uses the registered body builder BODIES[t.body] when there is one; otherwise
 * falls back to an outline generated from t.sections (each with heading `h`
 * and bullet list `li`) followed by a placeholder note.
 * @param {Object} t topic record
 * @returns {string} markup for the ".mm-article" element, including back button,
 *                   hero media, phase badge, domain chip, title and lead
 */
function articleHTML(t){
  function bulletHTML(text){
    return '<li>'+text+'</li>';
  }
  function sectionHTML(sec){
    return '<div class="mm-sec"><h3>'+sec.h+'</h3><ul>'+sec.li.map(function(x){return bulletHTML(x);}).join("")+'</ul></div>';
  }
  var content;
  if(t.body&&BODIES[t.body]){
    content=BODIES[t.body]();
  }else{
    var outline=t.sections.map(function(s){return sectionHTML(s);}).join("");
    content=outline+'<div class="mm-note">This is the article outline. Each bullet is a section to be written out in full.</div>';
  }
  var domainLabel;
  if(t.domain==="system"){
    domainLabel="System &amp; Runtime Memory";
  }else{
    domainLabel="Agent &amp; Long-Term Memory";
  }
  var html='<div class="mm-article" style="display:block"><button class="mm-back" id="mm-back">← Back to all topics</button>';
  html+=heroHTML(t);
  html+='<span class="mm-badge '+t.phase+'">'+PHASE[t.phase]+'</span>';
  html+='<span class="mm-chip">'+domainLabel+'</span>';
  html+='<h1>'+t.title+'</h1>';
  html+='<p class="mm-lead">'+t.lead+'</p>';
  html+=content;
  html+='</div>';
  return html;
}
/**
 * Opens the article for the topic with the given id: hides the home view,
 * renders the article, runs the topic's interactive initialiser if one is
 * registered in INITS, and scrolls the widget into view.
 * Does nothing when no topic has that id.
 * @param {number} id topic id (compared with ===)
 */
function show(id){
  var homeEl=document.getElementById("mm-home");
  var container=document.getElementById("mm-article-container");
  var matches=TOPICS.filter(function(x){return x.id===id;});
  var topic=matches[0];
  if(!topic){
    return;
  }
  homeEl.style.display="none";
  container.innerHTML=articleHTML(topic);
  if(topic.body&&INITS[topic.body]){
    INITS[topic.body](container);
  }
  root.scrollIntoView({behavior:"smooth",block:"start"});
}
/**
 * Returns to the topic list: empties the article container, shows the home
 * view again and scrolls the widget into view.
 */
function goHome(){
  var container=document.getElementById("mm-article-container");
  container.innerHTML="";
  var homeEl=document.getElementById("mm-home");
  homeEl.style.display="block";
  root.scrollIntoView({behavior:"smooth",block:"start"});
}
/* Single delegated click handler on the widget root: a click inside a card
   opens that card's article; a click on the back button returns home. */
root.addEventListener("click",function(evt){
  var target=evt.target;
  var card=target.closest(".mm-card");
  if(card){
    var topicId=parseInt(card.getAttribute("data-id"),10);
    show(topicId);
    return;
  }
  if(target.closest("#mm-back")){
    goHome();
  }
});
})();</script>
]]></content:encoded><media:content url="https://curiousbit.netlify.app/images/memory-llms/hero.png" medium="image"><media:title type="plain">LLMs</media:title></media:content><category>LLMs</category><category>Memory</category><category>Inference</category><category>GPU</category></item></channel></rss>