Feature/fix (#71)

WinChua · web-flow · commit 7ad606c046d9 · 2025-04-25T17:47:52.000+08:00
* refa: refactor auto

* refa: refact the cli:main

* doc

* refa: 4294967295 -> const.FFFFFFFF

* refa: show undo hist

* feat: infer datadir when not specify

* bump 0.0.19
diff --git a/README.md b/README.md
@@ -24,26 +24,26 @@ $ ./pyinnodb.sh --help
 ### dump the ddl from ibd file
 
 ```bash
-./pyinnodb.sh ${your_ibd_path} tosql --mode ddl
+./pyinnodb.sh --fn ${your_ibd_path} tosql --mode ddl
 ```
 
 ### dump sql script to insert data
 
 ```bash
-./pyinnodb.sh ${your_ibd_path} tosql --mode sql
+./pyinnodb.sh --fn ${your_ibd_path} tosql --mode sql
 ```
 
 ### search data with primary key(only support for int primary key now)
 
 ```bash
-./pyinnodb.sh ${your_ibd_path} search --primary-key 42
+./pyinnodb.sh --fn ${your_ibd_path} search --primary-key 42
 ```
 
 ## Mysql 5.7
 
 ### view data in ibd file, require .frm as well
 
 ```bash
-./pyinnodb.sh ${your_ibd_path} frm ${your_frm_path}
+./pyinnodb.sh --fn ${your_ibd_path} frm ${your_frm_path}
 ```
 
diff --git a/README_zh.md b/README_zh.md
@@ -34,7 +34,7 @@ python 3.8 以上
 
 #### 1. 验证.ibd文件
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd validate
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd validate
 
 page[1], fil.checksum[0x20fa5081], calculate checksum[0x20fa5081], eq[True]
 page[2], fil.checksum[0x18395c50], calculate checksum[0x18395c50], eq[True]
@@ -45,27 +45,27 @@ page[3], fil.checksum[0x1493810c], calculate checksum[0x1493810c], eq[True]
 
 #### 2. 输出表结构DDL语句
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd tosql --mode ddl
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd tosql --mode ddl
 ```
 
 #### 3. 查看sdi
 8.0之后, mysql新增了一种page用于存储表结构数据,将表结构存储在.ibd文件中,一般
 称为SDI,通过以下命令查看表结构的sdi数据
 
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd tosql --mode sdi
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd tosql --mode sdi
 ```
 SDI页中每一条记录都是一个JSON串, 可以通过 ` | jnv ` 交互式查看json数据
 
 #### 4. 导出ibd文件中的数据
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd tosql --mode dump
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd tosql --mode dump
 ```
 命令会将ibd文件中每一条记录导出成SQL语句, 通过 ` > data.sql`
 
 #### 5. 搜索指定主键的记录
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd search --primary-key 1
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd search --primary-key 1
 ```
 <details>
 <summary>展开输出以及解释</summary>
@@ -77,7 +77,7 @@ search命令通过--primary-key选项指定主键的值, 将会在ibd文件中
 
 此外,search命令还包括--hidden-col, 指定后将会解析,记录的隐藏字段, 如:
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd search --primary-key 2 --hidden-col
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd search --primary-key 2 --hidden-col
 ```
 
 <details>
@@ -88,7 +88,7 @@ $ ./pyinnodb.sh datadir/test/all_type.ibd search --primary-key 2 --hidden-col
 
 如果进一步查看数据的修改记录, 可以指定 --with-hist 以及--datadir指定mysql的数据目录来查看, 如:
 ```bash
-$ ./pyinnodb.sh datadir/test/all_type.ibd search --primary-key 2 --hidden-col --with-hist --datadir datadir
+$ ./pyinnodb.sh --fn datadir/test/all_type.ibd search --primary-key 2 --hidden-col --with-hist --datadir datadir
 ```
 
 <details>
@@ -110,5 +110,5 @@ $ ./pyinnodb.sh datadir/test/all_type.ibd search --primary-key 2 --hidden-col --
 mysql 5.7的文件组织方式与mysql8.0不同,表结构存储在.frm文件,而数据存储在.ibd,对ibd文件的解析需要使用:
 
 ```
-./pyinnodb.sh datadir/test/all_type.ibd frm datadir/test/all_type.frm
+./pyinnodb.sh --fn datadir/test/all_type.ibd frm datadir/test/all_type.frm
 ```
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pyinnodb"
-version = "0.0.18"
+version = "0.0.19"
 description = "A parser for InnoDB file formats, in Python"
 authors = [
     { name = "WinChua", email = "winchua@foxmail.com" }
diff --git a/src/pyinnodb/cli/__init__.py b/src/pyinnodb/cli/__init__.py
@@ -1,3 +1,3 @@
-from .main import *
+from .main import *  # noqa: F403
 
-from . import frm, iter_record, parse, sdi, sql, static_usage, systab, undo, validate
+from . import frm, iter_record, parse, sdi, sql, static_usage, systab, undo, validate  # noqa: F401
diff --git a/src/pyinnodb/cli/iter_record.py b/src/pyinnodb/cli/iter_record.py
@@ -3,8 +3,12 @@
 from pyinnodb.disk_struct.index import MSDIPage
 from pyinnodb.disk_struct.record import MRecordHeader
 from pyinnodb.sdi.table import Table
+from pyinnodb.disk_struct.rollback import History
+from pathlib import Path
+import os
+import typing as t
 
-from . import *
+from . import *  # noqa: F403
 
 
 @main.command()
@@ -32,7 +36,7 @@ def list_first_page(ctx, pageno):
 @click.option("--datadir", type=click.Path(exists=False), default=None)
 def search(ctx, primary_key, pageno, hidden_col, with_hist, datadir):
     """search the primary-key(int support only now)"""
-    f = ctx.obj["fn"]
+    f: t.IO[t.Any] = ctx.obj["fn"]
     # print("search start cost:", time.time() - ctx.obj["start_time"])
     fsp_page: MFspPage = ctx.obj["fsp_page"]
     f.seek(fsp_page.sdi_page_no * const.PAGE_SIZE)
@@ -58,26 +62,23 @@ def search(ctx, primary_key, pageno, hidden_col, with_hist, datadir):
         return
 
     if datadir is None:
-        print("--datadir should be specified to view the history")
-        return
+        fpath = Path(f.name)
+        if not (fpath.parent.parent/"mysql.ibd").exists():
+            print("--datadir should be specified to view the history")
+            return
+        datadir = fpath.parent.parent
+
     if not os.path.exists(datadir):
         print(f"--datadir {datadir} not exists")
         return
 
-    rptr = result.DB_ROLL_PTR
     primary_key_col = dd_object.get_primary_key_col()
     disk_data_layout = dd_object.get_disk_data_layout()
     undo_map = const.util.get_undo_tablespacefile(f"{datadir}/mysql.ibd")
-    history = []
-    while rptr is not None:
-        hist, rptr = rptr.last_version(
-            undo_map,
-            primary_key_col,
-            disk_data_layout,
-        )
-        history.append(hist)
-    for h in history:
-        print(h)
+
+    history = History(result)
+    history.parse(primary_key_col, disk_data_layout, undo_map)
+    history.show()
 
     return
 
diff --git a/src/pyinnodb/cli/main.py b/src/pyinnodb/cli/main.py
@@ -1,5 +1,6 @@
 import logging
 import sys
+import typing as t
 from importlib_metadata import version as meta_version
 
 import click
@@ -11,8 +12,24 @@
 logger = logging.getLogger(__name__)
 
 
-@click.group()
-@click.argument("fn", type=click.File("rb"))
+def validate_ibd(fsp_page: MFspPage, fn: t.IO[t.Any]):
+    for pn in range(fsp_page.fsp_header.highest_page_number):
+        fn.seek(const.PAGE_SIZE * pn)
+        page_data = fn.read(const.PAGE_SIZE)
+        fil = MFil.parse(page_data)
+        if fil.page_type == const.FIL_PAGE_TYPE_ALLOCATED:
+            continue
+        checksum = const.page_checksum_crc32c(page_data)
+        if checksum != fil.checksum:
+            print(
+                f"PAGE {pn}'s checksum is invalid, stored[{hex(fil.checksum)}] != calculate[{hex(checksum)}]"
+            )
+            print("use validate to get a more detail output of the validation")
+            return False
+    return True
+
+@click.group(invoke_without_command=True)
+@click.option("--fn", type=click.File("rb"), default=None)
 @click.option(
     "--log-level", type=click.Choice(["DEBUG", "ERROR", "INFO"]), default="ERROR"
 )
@@ -22,45 +39,38 @@
 def main(ctx, fn, log_level, validate_first, version):
     """A ibd file parser for MySQL 8.0 above, help you to know innodb better.
 
+    \b
     It offer several function bellow:
-    a) validate the checksum of your ibd file;
-    b) output the DDL of table;
-    c) dump the data in ibd file as INSERT statments;
-    d) search record by primary key;
-    e) show the undo log history
+        a) validate the checksum of your ibd file;
+        b) output the DDL of table;
+        c) dump the data in ibd file as INSERT statments;
+        d) search record by primary key;
+        e) show the undo log history
 
     many other function to explore your ibd file
 
     """
-    if version:
+
+    if version and not ctx.invoked_subcommand:
         print(meta_version("pyinnodb"))
         sys.exit(0)
-    # pid = os.getpid()
-    # start_time = os.stat(f"/proc/{pid}").st_ctime
-    # print("cost to startup:", time.time() - start_time)
-    # ctx.obj["start_time"] = start_time
+    if fn is None:
+        print("use --fn to specify ibd file")
+        sys.exit(0)
+
     logging.basicConfig(
         format="[%(levelname)s]-[%(filename)s:%(lineno)d] %(message)s", level=log_level
     )
     ctx.ensure_object(dict)
     ctx.obj["fn"] = fn
+
     try:
         fsp_page = MFspPage.parse_stream(fn)
-        ctx.obj["fsp_page"] = fsp_page
-        if validate_first:
-            for pn in range(fsp_page.fsp_header.highest_page_number):
-                fn.seek(const.PAGE_SIZE * pn)
-                page_data = fn.read(const.PAGE_SIZE)
-                fil = MFil.parse(page_data)
-                if fil.page_type == const.FIL_PAGE_TYPE_ALLOCATED:
-                    continue
-                checksum = const.page_checksum_crc32c(page_data)
-                if checksum != fil.checksum:
-                    print(
-                        f"PAGE {pn}'s checksum is invalid, stored[{hex(fil.checksum)}] != calculate[{hex(checksum)}]"
-                    )
-                    print("use validate to get a more detail output of the validation")
-                    sys.exit(1)
     except Exception as e:
         print(e)
         print("the file parse faile")
+        sys.exit(1)
+
+    ctx.obj["fsp_page"] = fsp_page
+    if validate_first and not validate_ibd(fsp_page, fn):
+        sys.exit(1)
diff --git a/src/pyinnodb/cli/sql.py b/src/pyinnodb/cli/sql.py
@@ -55,7 +55,7 @@ def dump_ibd(table_object, f, oneline=True):
     )
 
     values = []
-    while first_leaf_page_no != 4294967295:
+    while first_leaf_page_no != const.FFFFFFFF:
         f.seek(first_leaf_page_no * const.PAGE_SIZE)
         index_page = MIndexPage.parse_stream(f)
         values.extend(
diff --git a/src/pyinnodb/cli/undo.py b/src/pyinnodb/cli/undo.py
@@ -84,6 +84,6 @@ def rseg_array(ctx, pageno):
     for pageno in page.header.pagenos:
         f.seek(pageno * const.PAGE_SIZE)
         rseg_page = MRSEGPage.parse_stream(f)
-        pages = [f for f in rseg_page.slots if f != 4294967295]
+        pages = [f for f in rseg_page.slots if f != const.FFFFFFFF]
         if 150 in pages:
             print(rseg_page, pageno)
diff --git a/src/pyinnodb/const/define.py b/src/pyinnodb/const/define.py
@@ -1,4 +1,5 @@
 PAGE_SIZE = 16 * 1024
+FFFFFFFF = 0xFFFFFFFF
 
 FIL_PAGE_INDEX = 17855  # B-tree node */
 FIL_PAGE_RTREE = 17854  # R-tree node */
diff --git a/src/pyinnodb/disk_struct/data.py b/src/pyinnodb/disk_struct/data.py
@@ -1,11 +1,10 @@
-from datetime import timedelta, datetime, date
+from datetime import datetime, date
 from ..mconstruct import cs, cfield, CC
 
 try:
     from datetime import UTC
-except:
+except ImportError:
     from datetime import timezone
-
     UTC = timezone.utc
 
 TIMEF_INT_OFS = 0x800000
diff --git a/src/pyinnodb/disk_struct/first_page.py b/src/pyinnodb/disk_struct/first_page.py
@@ -68,7 +68,7 @@ def get_data(self, stream):
             stream.seek(ie.page_no * const.PAGE_SIZE)
             dp = MDataPage.parse_stream(stream)
             data += stream.read(dp.data_len)
-            if ie.node.next.page_number == 4294967295:
+            if ie.node.next.page_number == const.FFFFFFFF:
                 break
             stream.seek(ie.node.next.seek_loc())
             ie = MIndexEntryNode.parse_stream(stream)
diff --git a/src/pyinnodb/disk_struct/index.py b/src/pyinnodb/disk_struct/index.py
@@ -50,7 +50,7 @@ class MFsegHeader(CC):
     # should not use this way to determine the first leaf page number
     # as off-page may allocate first
     # def get_first_leaf_page(self, f):
-    #     if self.leaf_pointer.page_number != 4294967295:
+    #     if self.leaf_pointer.page_number != const.FFFFFFFF:
     #         f.seek(self.leaf_pointer.seek_loc())
     #         inode_entry = MInodeEntry.parse_stream(f)
     #         fp = inode_entry.first_page()
@@ -163,7 +163,7 @@ def value_parser(rh: MRecordHeader, f):
             nullable_cols = [
                 d[0]
                 for d in cols_disk_layout
-                if d[1] == 4294967295 and d[0].is_nullable
+                if d[1] == const.FFFFFFFF and d[0].is_nullable
             ]
 
             logger.debug(
@@ -256,16 +256,15 @@ def value_parser(rh: MRecordHeader, f):
                 disk_data_parsed[col.name] = col_value
 
             for col in dd_object.columns:
-                if (
-                    col.name in ["DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"]
-                    and not hidden_col
-                ) or col.private_data.get("version_dropped", 0) != 0 or col.is_hidden_from_user:
+                if col.name in ["DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"]:
+                    if not hidden_col and col.name in disk_data_parsed:
+                        disk_data_parsed.pop(col.name)
+                elif col.private_data.get("version_dropped", 0) != 0 or col.is_hidden_from_user:
                     if col.name in disk_data_parsed:
                         disk_data_parsed.pop(col.name)
+                elif col.is_virtual or col.generation_expression_utf8 != "":
                     continue
-                if col.is_virtual or col.generation_expression_utf8 != "":
-                    continue
-                if col.name not in disk_data_parsed:
+                elif col.name not in disk_data_parsed:
                     disk_data_parsed[col.name] = col.get_instant_default()
 
             klass = dd_object.DataClassHiddenCol if hidden_col else dd_object.DataClass
@@ -407,7 +406,7 @@ def iterate_sdi_record(self, stream):
             stream.seek(-8 + infimum.next_record_offset + 12, 1)
             cur_page_num = int.from_bytes(stream.read(4), byteorder="big")
 
-        while cur_page_num != 4294967295:
+        while cur_page_num != const.FFFFFFFF:
             stream.seek(cur_page_num * const.PAGE_SIZE)
             sdi_page = MSDIPage.parse_stream(stream)
             stream.seek(
diff --git a/src/pyinnodb/disk_struct/rollback.py b/src/pyinnodb/disk_struct/rollback.py
diff --git a/src/pyinnodb/frm/frm.py b/src/pyinnodb/frm/frm.py
diff --git a/src/pyinnodb/sdi/column.py b/src/pyinnodb/sdi/column.py
diff --git a/src/pyinnodb/sdi/table.py b/src/pyinnodb/sdi/table.py

Original file line number	Diff line number	Diff line change
`@@ -55,7 +55,7 @@ def dump_ibd(table_object, f, oneline=True):`
`55`	`55`	`)`
`56`	`56`
`57`	`57`	`values = []`
`58`		`- while first_leaf_page_no != 4294967295:`
	`58`	`+ while first_leaf_page_no != const.FFFFFFFF:`
`59`	`59`	`f.seek(first_leaf_page_no * const.PAGE_SIZE)`
`60`	`60`	`index_page = MIndexPage.parse_stream(f)`
`61`	`61`	`values.extend(`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`PAGE_SIZE = 16 * 1024`
	`2`	`+FFFFFFFF = 0xFFFFFFFF`
`2`	`3`
`3`	`4`	`FIL_PAGE_INDEX = 17855 # B-tree node */`
`4`	`5`	`FIL_PAGE_RTREE = 17854 # R-tree node */`