|
991 | 991 | "source": [
|
992 | 992 | "[Link to Great Tables](https://bit.ly/3U58fvP)."
|
993 | 993 | ]
|
| 994 | + }, |
| 995 | + { |
| 996 | + "cell_type": "markdown", |
| 997 | + "id": "01df33c0", |
| 998 | + "metadata": {}, |
| 999 | + "source": [ |
| 1000 | + "### Camelot: PDF Table Extraction for Humans" |
| 1001 | + ] |
| 1002 | + }, |
| 1003 | + { |
| 1004 | + "cell_type": "code", |
| 1005 | + "execution_count": null, |
| 1006 | + "id": "90e6b894", |
| 1007 | + "metadata": { |
| 1008 | + "tags": [ |
| 1009 | + "hide-cell" |
| 1010 | + ] |
| 1011 | + }, |
| 1012 | + "outputs": [], |
| 1013 | + "source": [ |
| 1014 | + "!pip install \"camelot-py[base]\" \"opencv-python\" \"pypdf2<3\"" |
| 1015 | + ] |
| 1016 | + }, |
| 1017 | + { |
| 1018 | + "cell_type": "markdown", |
| 1019 | + "id": "ec590463", |
| 1020 | + "metadata": {}, |
| 1021 | + "source": [ |
| 1022 | + "With Camelot, you can extract tables from PDFs using Python and convert the data into a more structured format, such as a pandas DataFrame or a CSV file for efficient analysis, manipulation, and integration." |
| 1023 | + ] |
| 1024 | + }, |
| 1025 | + { |
| 1026 | + "cell_type": "markdown", |
| 1027 | + "id": "cce6ec0b", |
| 1028 | + "metadata": {}, |
| 1029 | + "source": [ |
| 1030 | + "To see how Camelot works, start by reading the PDF file named 'foo.pdf' and extracting all the tables present in the file." |
| 1031 | + ] |
| 1032 | + }, |
| 1033 | + { |
| 1034 | + "cell_type": "code", |
| 1035 | + "execution_count": 1, |
| 1036 | + "id": "60cbbdf3", |
| 1037 | + "metadata": {}, |
| 1038 | + "outputs": [ |
| 1039 | + { |
| 1040 | + "data": { |
| 1041 | + "text/plain": [ |
| 1042 | + "<TableList n=1>" |
| 1043 | + ] |
| 1044 | + }, |
| 1045 | + "execution_count": 1, |
| 1046 | + "metadata": {}, |
| 1047 | + "output_type": "execute_result" |
| 1048 | + } |
| 1049 | + ], |
| 1050 | + "source": [ |
| 1051 | + "import camelot\n", |
| 1052 | + "tables = camelot.read_pdf('foo.pdf')\n", |
| 1053 | + "tables" |
| 1054 | + ] |
| 1055 | + }, |
| 1056 | + { |
| 1057 | + "cell_type": "markdown", |
| 1058 | + "id": "f5125a51", |
| 1059 | + "metadata": {}, |
| 1060 | + "source": [ |
| 1061 | + "The output shows that there is one table extracted from the PDF file." |
| 1062 | + ] |
| 1063 | + }, |
| 1064 | + { |
| 1065 | + "cell_type": "markdown", |
| 1066 | + "id": "26edcb5c", |
| 1067 | + "metadata": {}, |
| 1068 | + "source": [ |
| 1069 | + "Export the extracted table to a CSV file named 'foo.csv'. Camelot also supports exporting tables to other formats like JSON, Excel, HTML, Markdown, and SQLite databases." |
| 1070 | + ] |
| 1071 | + }, |
| 1072 | + { |
| 1073 | + "cell_type": "code", |
| 1074 | + "execution_count": 3, |
| 1075 | + "id": "611f33ff", |
| 1076 | + "metadata": {}, |
| 1077 | + "outputs": [ |
| 1078 | + { |
| 1079 | + "data": { |
| 1080 | + "text/html": [ |
| 1081 | + "<div>\n", |
| 1082 | + "<style scoped>\n", |
| 1083 | + " .dataframe tbody tr th:only-of-type {\n", |
| 1084 | + " vertical-align: middle;\n", |
| 1085 | + " }\n", |
| 1086 | + "\n", |
| 1087 | + " .dataframe tbody tr th {\n", |
| 1088 | + " vertical-align: top;\n", |
| 1089 | + " }\n", |
| 1090 | + "\n", |
| 1091 | + " .dataframe thead th {\n", |
| 1092 | + " text-align: right;\n", |
| 1093 | + " }\n", |
| 1094 | + "</style>\n", |
| 1095 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 1096 | + " <thead>\n", |
| 1097 | + " <tr style=\"text-align: right;\">\n", |
| 1098 | + " <th></th>\n", |
| 1099 | + " <th>0</th>\n", |
| 1100 | + " <th>1</th>\n", |
| 1101 | + " <th>2</th>\n", |
| 1102 | + " <th>3</th>\n", |
| 1103 | + " <th>4</th>\n", |
| 1104 | + " <th>5</th>\n", |
| 1105 | + " <th>6</th>\n", |
| 1106 | + " </tr>\n", |
| 1107 | + " </thead>\n", |
| 1108 | + " <tbody>\n", |
| 1109 | + " <tr>\n", |
| 1110 | + " <th>0</th>\n", |
| 1111 | + " <td>Cycle \\nName</td>\n", |
| 1112 | + " <td>KI \\n(1/km)</td>\n", |
| 1113 | + " <td>Distance \\n(mi)</td>\n", |
| 1114 | + " <td>Percent Fuel Savings</td>\n", |
| 1115 | + " <td></td>\n", |
| 1116 | + " <td></td>\n", |
| 1117 | + " <td></td>\n", |
| 1118 | + " </tr>\n", |
| 1119 | + " <tr>\n", |
| 1120 | + " <th>1</th>\n", |
| 1121 | + " <td></td>\n", |
| 1122 | + " <td></td>\n", |
| 1123 | + " <td></td>\n", |
| 1124 | + " <td>Improved \\nSpeed</td>\n", |
| 1125 | + " <td>Decreased \\nAccel</td>\n", |
| 1126 | + " <td>Eliminate \\nStops</td>\n", |
| 1127 | + " <td>Decreased \\nIdle</td>\n", |
| 1128 | + " </tr>\n", |
| 1129 | + " <tr>\n", |
| 1130 | + " <th>2</th>\n", |
| 1131 | + " <td>2012_2</td>\n", |
| 1132 | + " <td>3.30</td>\n", |
| 1133 | + " <td>1.3</td>\n", |
| 1134 | + " <td>5.9%</td>\n", |
| 1135 | + " <td>9.5%</td>\n", |
| 1136 | + " <td>29.2%</td>\n", |
| 1137 | + " <td>17.4%</td>\n", |
| 1138 | + " </tr>\n", |
| 1139 | + " <tr>\n", |
| 1140 | + " <th>3</th>\n", |
| 1141 | + " <td>2145_1</td>\n", |
| 1142 | + " <td>0.68</td>\n", |
| 1143 | + " <td>11.2</td>\n", |
| 1144 | + " <td>2.4%</td>\n", |
| 1145 | + " <td>0.1%</td>\n", |
| 1146 | + " <td>9.5%</td>\n", |
| 1147 | + " <td>2.7%</td>\n", |
| 1148 | + " </tr>\n", |
| 1149 | + " <tr>\n", |
| 1150 | + " <th>4</th>\n", |
| 1151 | + " <td>4234_1</td>\n", |
| 1152 | + " <td>0.59</td>\n", |
| 1153 | + " <td>58.7</td>\n", |
| 1154 | + " <td>8.5%</td>\n", |
| 1155 | + " <td>1.3%</td>\n", |
| 1156 | + " <td>8.5%</td>\n", |
| 1157 | + " <td>3.3%</td>\n", |
| 1158 | + " </tr>\n", |
| 1159 | + " <tr>\n", |
| 1160 | + " <th>5</th>\n", |
| 1161 | + " <td>2032_2</td>\n", |
| 1162 | + " <td>0.17</td>\n", |
| 1163 | + " <td>57.8</td>\n", |
| 1164 | + " <td>21.7%</td>\n", |
| 1165 | + " <td>0.3%</td>\n", |
| 1166 | + " <td>2.7%</td>\n", |
| 1167 | + " <td>1.2%</td>\n", |
| 1168 | + " </tr>\n", |
| 1169 | + " <tr>\n", |
| 1170 | + " <th>6</th>\n", |
| 1171 | + " <td>4171_1</td>\n", |
| 1172 | + " <td>0.07</td>\n", |
| 1173 | + " <td>173.9</td>\n", |
| 1174 | + " <td>58.1%</td>\n", |
| 1175 | + " <td>1.6%</td>\n", |
| 1176 | + " <td>2.1%</td>\n", |
| 1177 | + " <td>0.5%</td>\n", |
| 1178 | + " </tr>\n", |
| 1179 | + " </tbody>\n", |
| 1180 | + "</table>\n", |
| 1181 | + "</div>" |
| 1182 | + ], |
| 1183 | + "text/plain": [ |
| 1184 | + " 0 1 2 3 \\\n", |
| 1185 | + "0 Cycle \\nName KI \\n(1/km) Distance \\n(mi) Percent Fuel Savings \n", |
| 1186 | + "1 Improved \\nSpeed \n", |
| 1187 | + "2 2012_2 3.30 1.3 5.9% \n", |
| 1188 | + "3 2145_1 0.68 11.2 2.4% \n", |
| 1189 | + "4 4234_1 0.59 58.7 8.5% \n", |
| 1190 | + "5 2032_2 0.17 57.8 21.7% \n", |
| 1191 | + "6 4171_1 0.07 173.9 58.1% \n", |
| 1192 | + "\n", |
| 1193 | + " 4 5 6 \n", |
| 1194 | + "0 \n", |
| 1195 | + "1 Decreased \\nAccel Eliminate \\nStops Decreased \\nIdle \n", |
| 1196 | + "2 9.5% 29.2% 17.4% \n", |
| 1197 | + "3 0.1% 9.5% 2.7% \n", |
| 1198 | + "4 1.3% 8.5% 3.3% \n", |
| 1199 | + "5 0.3% 2.7% 1.2% \n", |
| 1200 | + "6 1.6% 2.1% 0.5% " |
| 1201 | + ] |
| 1202 | + }, |
| 1203 | + "execution_count": 3, |
| 1204 | + "metadata": {}, |
| 1205 | + "output_type": "execute_result" |
| 1206 | + } |
| 1207 | + ], |
| 1208 | + "source": [ |
| 1209 | + "tables[0].parsing_report\n", |
| 1210 | + "{\n", |
| 1211 | + " 'accuracy': 99.02,\n", |
| 1212 | + " 'whitespace': 12.24,\n", |
| 1213 | + " 'order': 1,\n", |
| 1214 | + " 'page': 1\n", |
| 1215 | + "}\n", |
| 1216 | + "tables[0].to_csv('foo.csv') # to_json, to_excel, to_html, to_markdown, to_sqlite\n", |
| 1217 | + "tables[0].df # get a pandas DataFrame!" |
| 1218 | + ] |
| 1219 | + }, |
| 1220 | + { |
| 1221 | + "cell_type": "markdown", |
| 1222 | + "id": "9c914e50", |
| 1223 | + "metadata": {}, |
| 1224 | + "source": [ |
| 1225 | + "[Link to Camelot](https://bit.ly/3xPBw6L)." |
| 1226 | + ] |
994 | 1227 | }
|
995 | 1228 | ],
|
996 | 1229 | "metadata": {
|
|
0 commit comments