Skip to content

Commit b7322b2

Browse files
add handcalcs
1 parent da7e4d5 commit b7322b2

12 files changed

+824
-911
lines changed

Chapter5/feature_engineer.ipynb

Lines changed: 81 additions & 237 deletions
Large diffs are not rendered by default.

Chapter5/machine_learning.ipynb

Lines changed: 9 additions & 162 deletions
Large diffs are not rendered by default.

Chapter5/time_series.ipynb

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -464,37 +464,45 @@
464464
},
465465
{
466466
"cell_type": "code",
467-
"execution_count": 17,
468-
"id": "9563e0f4",
467+
"execution_count": 15,
468+
"id": "f7bafb95",
469469
"metadata": {},
470470
"outputs": [
471471
{
472472
"name": "stdout",
473473
"output_type": "stream",
474474
"text": [
475-
"Toronto Time: 2024-03-26 10:19:01.843320-04:00\n",
476-
"Paris Time: 2024-03-26 15:19:01.843320+01:00\n",
477-
"Future datetime (after adding two days): 2024-03-28 10:19:01.843320-04:00\n"
475+
"Current time: 2024-09-02 19:18:13.589774\n",
476+
"7 days from now: 2024-09-09 19:18:13.589774\n",
477+
"Time in Tokyo: 2024-09-03 09:18:13.590063+09:00\n",
478+
"Difference: -477 days, 19:11:46.410226\n"
478479
]
479480
}
480481
],
481482
"source": [
482483
"from datetime import datetime, timedelta\n",
483484
"import pytz\n",
484485
"\n",
485-
"# Get current time in Paris\n",
486-
"paris_time = datetime.now(pytz.timezone(\"Europe/Paris\"))\n",
486+
"# Creating a datetime\n",
487+
"now = datetime.now()\n",
488+
"print(f\"Current time: {now}\")\n",
489+
"\n",
490+
"# Date arithmetic\n",
491+
"future = now + timedelta(days=7)\n",
492+
"print(f\"7 days from now: {future}\")\n",
487493
"\n",
488-
"# Convert Paris time to Toronto time\n",
489-
"toronto_timezone = pytz.timezone(\"America/Toronto\")\n",
490-
"toronto_time = paris_time.astimezone(toronto_timezone)\n",
494+
"# Timezone handling\n",
495+
"utc_now = datetime.now(pytz.UTC)\n",
496+
"tokyo_tz = pytz.timezone('Asia/Tokyo')\n",
497+
"tokyo_time = utc_now.astimezone(tokyo_tz)\n",
498+
"print(f\"Time in Tokyo: {tokyo_time}\")\n",
491499
"\n",
492-
"# Add two days\n",
493-
"future_datetime = toronto_time + timedelta(days=2)\n",
500+
"# Parsing (requires exact format)\n",
501+
"parsed = datetime.strptime(\"2023-05-15 14:30:00\", \"%Y-%m-%d %H:%M:%S\")\n",
494502
"\n",
495-
"print(\"Toronto Time:\", toronto_time)\n",
496-
"print(\"Paris Time:\", paris_time)\n",
497-
"print(\"Future datetime (after adding two days):\", future_datetime)"
503+
"# Time difference (not human-readable)\n",
504+
"diff = parsed - now \n",
505+
"print(f\"Difference: {diff}\")"
498506
]
499507
},
500508
{
@@ -507,35 +515,44 @@
507515
},
508516
{
509517
"cell_type": "code",
510-
"execution_count": 18,
511-
"id": "9158e7f6",
518+
"execution_count": 4,
519+
"id": "7ea99b14",
512520
"metadata": {},
513521
"outputs": [
514522
{
515523
"name": "stdout",
516524
"output_type": "stream",
517525
"text": [
518-
"Toronto Time: 2024-03-26 10:19:03.398059-04:00\n",
519-
"Paris Time: 2024-03-26 15:19:03.398059+01:00\n",
520-
"Future datetime (after adding two days): 2024-03-28 10:19:03.398059-04:00\n"
526+
"Current time: 2024-09-02 18:58:20.467988-05:00\n",
527+
"7 days from now: 2024-09-09 18:58:20.467988-05:00\n",
528+
"Time in Tokyo: 2024-09-03 08:58:20.467988+09:00\n",
529+
"Parsed date: 2023-05-15 14:30:00+00:00\n",
530+
"Difference: -1 year -3 months -2 weeks -4 days -9 hours -28 minutes -20 seconds\n"
521531
]
522532
}
523533
],
524534
"source": [
525535
"import pendulum\n",
526536
"\n",
527-
"# Get current time in Paris\n",
528-
"paris_time = pendulum.now(\"Europe/Paris\")\n",
537+
"# Creating a datetime\n",
538+
"now = pendulum.now()\n",
539+
"print(f\"Current time: {now}\")\n",
540+
"\n",
541+
"# Date arithmetic (more intuitive than datetime)\n",
542+
"future = now.add(days=7)\n",
543+
"print(f\"7 days from now: {future}\")\n",
529544
"\n",
530-
"# Convert Paris time to Toronto time\n",
531-
"toronto_time = paris_time.in_timezone(\"America/Toronto\")\n",
545+
"# Timezone handling\n",
546+
"tokyo_time = now.in_timezone(\"Asia/Tokyo\")\n",
547+
"print(f\"Time in Tokyo: {tokyo_time}\")\n",
532548
"\n",
533-
"# Add two days\n",
534-
"future_datetime = toronto_time.add(days=2)\n",
549+
"# Parsing without specifying format\n",
550+
"parsed = pendulum.parse(\"2023-05-15 14:30:00\")\n",
551+
"print(f\"Parsed date: {parsed}\")\n",
535552
"\n",
536-
"print(\"Toronto Time:\", toronto_time)\n",
537-
"print(\"Paris Time:\", paris_time)\n",
538-
"print(\"Future datetime (after adding two days):\", future_datetime)"
553+
"# Human-readable differences\n",
554+
"diff = parsed - now\n",
555+
"print(f\"Difference: {diff.in_words()}\")"
539556
]
540557
},
541558
{

Chapter6/better_outputs.ipynb

Lines changed: 169 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,174 @@
741741
"[Link to latexify_py](https://github.com/google/latexify_py)."
742742
]
743743
},
744+
{
745+
"cell_type": "markdown",
746+
"id": "4c8c79c1-23df-4b11-ae56-d9050e03317d",
747+
"metadata": {},
748+
"source": [
749+
"### From Python to Paper: Visualizing Calculations with Handcalcs"
750+
]
751+
},
752+
{
753+
"cell_type": "code",
754+
"execution_count": null,
755+
"id": "aaab648d-549f-404c-84d9-73139fa1495e",
756+
"metadata": {
757+
"tags": [
758+
"hide-cell"
759+
]
760+
},
761+
"outputs": [],
762+
"source": [
763+
"!pip install handcalcs"
764+
]
765+
},
766+
{
767+
"cell_type": "markdown",
768+
"id": "4c1b38f2-edb2-4a36-9f68-7e62a8f3d4b9",
769+
"metadata": {},
770+
"source": [
771+
"Python calculations often lack transparency when only showing final results. \n",
772+
"\n",
773+
"Handcalcs addresses this by generating LaTeX output that mimics handwritten calculations. It displays symbolic formulas, numeric substitutions, and results, providing a clear step-by-step breakdown. \n",
774+
"\n",
775+
"This approach makes calculations more intuitive, readable, and easier to verify manually.\n",
776+
"\n",
777+
"Handcalcs can be used in two main ways:\n",
778+
"\n",
779+
"1. As a cell magic in Jupyter notebooks using `%%render`:\n"
780+
]
781+
},
782+
{
783+
"cell_type": "code",
784+
"execution_count": 5,
785+
"id": "dbaac238-05b1-4a8a-8cb8-658f7e39bcff",
786+
"metadata": {},
787+
"outputs": [],
788+
"source": [
789+
"import handcalcs.render\n",
790+
"from handcalcs.decorator import handcalc"
791+
]
792+
},
793+
{
794+
"cell_type": "code",
795+
"execution_count": 3,
796+
"id": "f3521696-92ba-437e-a552-e090698c676c",
797+
"metadata": {},
798+
"outputs": [
799+
{
800+
"data": {
801+
"text/latex": [
802+
"\\[\n",
803+
"\\begin{aligned}\n",
804+
"a &= 2 \\; \n",
805+
"\\\\[10pt]\n",
806+
"b &= 3 \\; \n",
807+
"\\\\[10pt]\n",
808+
"c &= 2 \\cdot a + \\frac{ b }{ 3 } = 2 \\cdot 2 + \\frac{ 3 }{ 3 } &= 5.000 \n",
809+
"\\end{aligned}\n",
810+
"\\]"
811+
],
812+
"text/plain": [
813+
"<IPython.core.display.Latex object>"
814+
]
815+
},
816+
"metadata": {},
817+
"output_type": "display_data"
818+
}
819+
],
820+
"source": [
821+
"%%render\n",
822+
"a = 2\n",
823+
"b = 3\n",
824+
"c = 2*a + b/3"
825+
]
826+
},
827+
{
828+
"cell_type": "markdown",
829+
"id": "bdb7d726-a5d6-47e7-a05e-3559b4265f0c",
830+
"metadata": {},
831+
"source": [
832+
"2. As a decorator for functions:"
833+
]
834+
},
835+
{
836+
"cell_type": "code",
837+
"execution_count": 9,
838+
"id": "f605eb86-1eb0-4579-8a57-4a8ff50406af",
839+
"metadata": {},
840+
"outputs": [],
841+
"source": [
842+
"from math import sqrt\n",
843+
"\n",
844+
"@handcalc(jupyter_display=True)\n",
845+
"def my_calc(x: float, y: float, z: float):\n",
846+
" a = 2*x\n",
847+
" b = 3*a/z + sqrt(a+y/2)\n",
848+
" c = a + b\n",
849+
" return c"
850+
]
851+
},
852+
{
853+
"cell_type": "code",
854+
"execution_count": 10,
855+
"id": "5e0f4471-b2b3-44a0-a462-52dc2a58a1bd",
856+
"metadata": {},
857+
"outputs": [
858+
{
859+
"data": {
860+
"text/latex": [
861+
"\\[\n",
862+
"\\begin{aligned}\n",
863+
"a &= 2 \\cdot x = 2 \\cdot 2.300 &= 4.600 \n",
864+
"\\\\[10pt]\n",
865+
"b &= 3 \\cdot \\frac{ a }{ z } + \\sqrt { a + \\frac{ y }{ 2 } } = 3 \\cdot \\frac{ 4.600 }{ 1.200 } + \\sqrt { 4.600 + \\frac{ 3.200 }{ 2 } } &= 13.990 \n",
866+
"\\\\[10pt]\n",
867+
"c &= a + b = 4.600 + 13.990 &= 18.590 \n",
868+
"\\end{aligned}\n",
869+
"\\]"
870+
],
871+
"text/plain": [
872+
"<IPython.core.display.Latex object>"
873+
]
874+
},
875+
"metadata": {},
876+
"output_type": "display_data"
877+
}
878+
],
879+
"source": [
880+
"result = my_calc(2.3, 3.2, 1.2)"
881+
]
882+
},
883+
{
884+
"cell_type": "code",
885+
"execution_count": 11,
886+
"id": "abca39c7-c8b9-4ef2-b56d-115638cc9316",
887+
"metadata": {},
888+
"outputs": [
889+
{
890+
"data": {
891+
"text/plain": [
892+
"18.589979919597745"
893+
]
894+
},
895+
"execution_count": 11,
896+
"metadata": {},
897+
"output_type": "execute_result"
898+
}
899+
],
900+
"source": [
901+
"result"
902+
]
903+
},
904+
{
905+
"cell_type": "markdown",
906+
"id": "ae2c0e92-292d-426f-9cc0-b90aed8ee193",
907+
"metadata": {},
908+
"source": [
909+
"[Link to handcalcs](https://github.com/connorferster/handcalcs)."
910+
]
911+
},
744912
{
745913
"attachments": {},
746914
"cell_type": "markdown",
@@ -1475,7 +1643,7 @@
14751643
"celltoolbar": "Tags",
14761644
"hide_input": false,
14771645
"kernelspec": {
1478-
"display_name": "venv",
1646+
"display_name": "Python 3 (ipykernel)",
14791647
"language": "python",
14801648
"name": "python3"
14811649
},

docs/Chapter5/feature_engineer.html

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,48 @@ <h2><span class="section-number">6.2.2. </span>Strategy to Prevent Data Leakage
705705
</div>
706706
</div>
707707
</div>
708+
<p>Time series data is unique because it has a temporal order. This means that data from the future shouldn’t influence predictions about the past. However, standard cross-validation techniques like K-Fold ignore this order: whether or not the data is shuffled, some folds train on observations that come after the test samples, so future information can leak into predictions of past events.</p>
709+
<p>scikit-learn provides us with a powerful tool designed specifically for time series data: TimeSeriesSplit. This clever cross-validator respects the temporal order of our data, ensuring that we always train on past data and test on future data.</p>
710+
<p>Let’s explore how to use TimeSeriesSplit with a simple example:</p>
711+
<div class="cell docutils container">
712+
<div class="cell_input docutils container">
713+
<div class="highlight-ipython3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
714+
<span class="kn">from</span> <span class="nn">sklearn.model_selection</span> <span class="kn">import</span> <span class="n">TimeSeriesSplit</span>
715+
716+
<span class="n">X</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">]])</span>
717+
<span class="n">y</span> <span class="o">=</span> <span class="n">np</span><span class="o">.</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">6</span><span class="p">])</span>
718+
719+
<span class="n">tscv</span> <span class="o">=</span> <span class="n">TimeSeriesSplit</span><span class="p">(</span><span class="n">n_splits</span><span class="o">=</span><span class="mi">3</span><span class="p">)</span>
720+
721+
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">train_index</span><span class="p">,</span> <span class="n">test_index</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">tscv</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="n">X</span><span class="p">)):</span>
722+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Fold </span><span class="si">{</span><span class="n">i</span><span class="si">}</span><span class="s2">:&quot;</span><span class="p">)</span>
723+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; Train: index=</span><span class="si">{</span><span class="n">train_index</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
724+
<span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot; Test: index=</span><span class="si">{</span><span class="n">test_index</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
725+
</pre></div>
726+
</div>
727+
</div>
728+
<div class="cell_output docutils container">
729+
<div class="output stream highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>Fold 0:
730+
Train: index=[0 1 2]
731+
Test: index=[3]
732+
Fold 1:
733+
Train: index=[0 1 2 3]
734+
Test: index=[4]
735+
Fold 2:
736+
Train: index=[0 1 2 3 4]
737+
Test: index=[5]
738+
</pre></div>
739+
</div>
740+
</div>
741+
</div>
742+
<p>From the outputs, we can see that:</p>
743+
<ol class="arabic simple">
744+
<li><p>Temporal Integrity: The split always respects the original order of the data.</p></li>
745+
<li><p>Growing Training Set: With each fold, the training set expands to include more historical data.</p></li>
746+
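<li><p>Forward-Moving Test Set: The test set always comes after the training set and moves forward with each fold. In this example it is a single sample because 6 samples split into 3 folds gives a default test size of 6 // (3 + 1) = 1.</p></li>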
747+
<li><p>No Data Leakage: Future information is never used to predict past events.</p></li>
748+
</ol>
749+
<p>This approach mimics real-world forecasting scenarios, where models use historical data to predict future outcomes.</p>
708750
</section>
709751
<section id="enhancing-data-handling-with-scikit-learn-s-dataframe-support">
710752
<h2><span class="section-number">6.2.3. </span>Enhancing Data Handling with scikit-learn’s DataFrame Support<a class="headerlink" href="#enhancing-data-handling-with-scikit-learn-s-dataframe-support" title="Permalink to this heading">#</a></h2>
@@ -3763,16 +3805,16 @@ <h2><span class="section-number">6.2.14. </span>sketch: AI Code-Writing Assistan
37633805
},
37643806
codeMirrorConfig: {
37653807
theme: "abcdef",
3766-
mode: "python"
3808+
mode: "data-science"
37673809
},
37683810
kernelOptions: {
3769-
name: "python3",
3811+
name: "data-science",
37703812
path: "./Chapter5"
37713813
},
37723814
predefinedOutput: true
37733815
}
37743816
</script>
3775-
<script>kernelName = 'python3'</script>
3817+
<script>kernelName = 'data-science'</script>
37763818

37773819
</article>
37783820

0 commit comments

Comments
 (0)