Update create_statistics.sgml

chegong18 · chegong18 · commit b3872b9d9be7 · 2022-11-18T21:19:36.000+08:00
diff --git a/postgresql/doc/src/sgml/ref/create_statistics.sgml b/postgresql/doc/src/sgml/ref/create_statistics.sgml
@@ -40,16 +40,24 @@ ____________________________________________________________________________-->
  <refsynopsisdiv>
 <!--==========================orignal english content==========================
 <synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
+    ON ( <replaceable class="parameter">expression</replaceable> )
+    FROM <replaceable class="parameter">table_name</replaceable>
+
 CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
     [ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
-    ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
+    ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
     FROM <replaceable class="parameter">table_name</replaceable>
 </synopsis>
 ____________________________________________________________________________-->
 <synopsis>
+CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
+    ON ( <replaceable class="parameter">expression</replaceable> )
+    FROM <replaceable class="parameter">table_name</replaceable>
+
 CREATE STATISTICS [ IF NOT EXISTS ] <replaceable class="parameter">statistics_name</replaceable>
     [ ( <replaceable class="parameter">statistics_kind</replaceable> [, ... ] ) ]
-    ON <replaceable class="parameter">column_name</replaceable>, <replaceable class="parameter">column_name</replaceable> [, ...]
+    ON { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) }, { <replaceable class="parameter">column_name</replaceable> | ( <replaceable class="parameter">expression</replaceable> ) } [, ...]
     FROM <replaceable class="parameter">table_name</replaceable>
 </synopsis>
 
@@ -75,6 +83,28 @@ ____________________________________________________________________________-->
    被发出该命令的用户所有。
   </para>
 
+<!--==========================orignal english content==========================
+  <para>
+   The <command>CREATE STATISTICS</command> command has two basic forms. The
+   first form allows univariate statistics for a single expression to be
+   collected, providing benefits similar to an expression index without the
+   overhead of index maintenance.  This form does not allow the statistics
+   kind to be specified, since the various statistics kinds refer only to
+   multivariate statistics.  The second form of the command allows
+   multivariate statistics on multiple columns and/or expressions to be
+   collected, optionally specifying which statistics kinds to include.  This
+   form will also automatically cause univariate statistics to be collected on
+   any expressions included in the list.
+  </para>
+____________________________________________________________________________-->
+  <para>
+   <command>CREATE STATISTICS</command>命令有两种基本形式。
+   第一种形式允许收集单个表达式的单变量统计信息，提供了类似于表达式索引的好处，而不需要索引维护的开销。
+   这种形式不允许指定统计类型，因为各种统计类型仅适用于多变量统计信息。
+   此命令的第二种形式允许收集多个列和/或表达式的多变量统计信息，可选地指定需要包括的统计信息类型。
+   这种形式还会自动收集列表中包含的任何表达式上的单变量统计信息。
+  </para>
+
 <!--==========================orignal english content==========================
   <para>
    If a schema name is given (for example, <literal>CREATE STATISTICS
@@ -146,24 +176,26 @@ ____________________________________________________________________________-->
     <listitem>
 <!--==========================orignal english content==========================
      <para>
-      A statistics kind to be computed in this statistics object.
+      A multivariate statistics kind to be computed in this statistics object.
       Currently supported kinds are
       <literal>ndistinct</literal>, which enables n-distinct statistics,
       <literal>dependencies</literal>, which enables functional
       dependency statistics, and <literal>mcv</literal> which enables
       most-common values lists.
       If this clause is omitted, all supported statistics kinds are
-      included in the statistics object.
+      included in the statistics object. Univariate expression statistics are
+      built automatically if the statistics definition includes any complex
+      expressions rather than just simple column references.
       For more information, see <xref linkend="planner-stats-extended"/>
       and <xref linkend="multivariate-statistics-examples"/>.
      </para>
 ____________________________________________________________________________-->
      <para>
-      在此统计对象中计算的统计种类。目前支持的种类是启用n-distinct统计的
-      <literal>ndistinct</literal>，启用功能依赖性统计的<literal>dependencies</literal>，以及启用最常见的值列表的<literal>mcv</literal>。
+      在此统计对象中计算的多变量统计种类。
+      目前支持的种类是启用n-distinct统计的<literal>ndistinct</literal>，启用功能依赖性统计的<literal>dependencies</literal>，以及启用最常见的值列表的<literal>mcv</literal>。
       如果省略该子句，则统计对象中将包含所有支持的统计类型。
-      有关更多信息，请参阅<xref linkend="planner-stats-extended"/>和
-      <xref linkend="multivariate-statistics-examples"/>。
+      如果统计信息定义包含任何复杂表达式而不仅仅是简单的列引用，单变量表达式统计会自动构建。
+      有关更多信息，请参阅<xref linkend="planner-stats-extended"/>和<xref linkend="multivariate-statistics-examples"/>。
      </para>
     </listitem>
    </varlistentry>
@@ -177,16 +209,43 @@ ____________________________________________________________________________-->
 <!--==========================orignal english content==========================
      <para>
       The name of a table column to be covered by the computed statistics.
-      At least two column names must be given;  the order of the column names
-      is insignificant.
+      This is only allowed when building multivariate statistics.  At least
+      two column names or expressions must be specified, and their order is
+      not significant.
      </para>
 ____________________________________________________________________________-->
      <para>
-      被计算的统计信息包含的表格列的名称。至少必须给出两个列名，列名的顺序可以忽略。
+      被计算的统计信息包含的表格列的名称。
+      这仅在构建多变量统计信息时才被允许。
+      至少必须指定两个列名或表达式，它们的顺序是不重要的。
      </para>
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+<!--==========================orignal english content==========================
+    <term><replaceable class="parameter">expression</replaceable></term>
+____________________________________________________________________________-->
+    <term><replaceable class="parameter">expression</replaceable></term>
+    <listitem>
+<!--==========================orignal english content==========================
+     <para>
+      An expression to be covered by the computed statistics.  This may be
+      used to build univariate statistics on a single expression, or as part
+      of a list of multiple column names and/or expressions to build
+      multivariate statistics.  In the latter case, separate univariate
+      statistics are built automatically for each expression in the list.
+     </para>
+____________________________________________________________________________-->
+     <para>
+      被计算的统计信息所包含的表达式。
+      这可以用于在单个表达式上构建单变量统计信息，或者作为多个列名和/或表达式的列表的一部分来构建多变量统计信息。
+      在后一种情况中，将为列表中的每个表达式自动构建单独的单变量统计信息。
+     </para>
+    </listitem>
+   </varlistentry>
+
+
    <varlistentry>
 <!--==========================orignal english content==========================
     <term><replaceable class="parameter">table_name</replaceable></term>
@@ -225,6 +284,19 @@ ____________________________________________________________________________-->
    你必须是表的所有者才能创建读取它的统计对象。不过，一旦创建，
    统计对象的所有权与基础表无关。
   </para>
+
+<!--==========================orignal english content==========================
+  <para>
+   Expression statistics are per-expression and are similar to creating an
+   index on the expression, except that they avoid the overhead of index
+   maintenance. Expression statistics are built automatically for each
+   expression in the statistics object definition.
+  </para>
+____________________________________________________________________________-->
+  <para>
+   表达式统计信息是针对每个表达式的，类似于在表达式上创建索引，只是它们避免了索引维护的开销。
+   表达式统计信息是为统计对象定义中的每个表达式自动构建的。
+  </para>
  </refsect1>
 
  <refsect1 id="sql-createstatistics-examples">
@@ -305,7 +377,7 @@ EXPLAIN ANALYZE SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
 <!--==========================orignal english content==========================
   <para>
    Create table <structname>t2</structname> with two perfectly correlated columns
-   (containing identical data), and a MCV list on those columns:
+   (containing identical data), and an MCV list on those columns:
 
 <programlisting>
 CREATE TABLE t2 (
@@ -359,6 +431,128 @@ EXPLAIN ANALYZE SELECT * FROM t2 WHERE (a = 1) AND (b = 2);
    MCV列表为计划器提供了关于表中普遍出现的特定值的更详细的信息，以及表中未显示的值组合的选择性上限，允许它在这两种情况下产生更好的估计值。
   </para>
 
+<!--==========================orignal english content==========================
+  <para>
+   Create table <structname>t3</structname> with a single timestamp column,
+   and run queries using expressions on that column.  Without extended
+   statistics, the planner has no information about the data distribution for
+   the expressions, and uses default estimates.  The planner also does not
+   realize that the value of the date truncated to the month is fully
+   determined by the value of the date truncated to the day. Then expression
+   and ndistinct statistics are built on those two expressions:
+
+<programlisting>
+CREATE TABLE t3 (
+    a   timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+                                             '2020-12-31'::timestamp,
+                                             '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-&minus; the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+
+-&minus; build ndistinct statistics on the pair of expressions (per-expression
+-&minus; statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-&minus; now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+</programlisting>
+
+   Without expression and ndistinct statistics, the planner has no information
+   about the number of distinct values for the expressions, and has to rely
+   on default estimates. The equality and range conditions are assumed to have
+   0.5% selectivity, and the number of distinct values in the expression is
+   assumed to be the same as for the column (i.e. unique). This results in a
+   significant underestimate of the row count in the first two queries. Moreover,
+   the planner has no information about the relationship between the expressions,
+   so it assumes the two <literal>WHERE</literal> and <literal>GROUP BY</literal>
+   conditions are independent, and multiplies their selectivities together to
+   arrive at a severe overestimate of the group count in the aggregate query.
+   This is further exacerbated by the lack of accurate statistics for the
+   expressions, forcing the planner to use a default ndistinct estimate for the
+   expression derived from ndistinct for the column. With such statistics, the
+   planner recognizes that the conditions are correlated, and arrives at much
+   more accurate estimates.
+  </para>
+____________________________________________________________________________-->
+  <para>
+   使用单个时间戳列创建表<structname>t3</structname>，并用该列上的表达式运行查询。
+   没有扩展的统计信息，计划器无法获知表达式数据分布的相关信息，只能使用默认的估计值。
+   计划器也没有认识到按月截断的日期值完全取决于按天截断的日期值。
+   然后在这两个表达式上构建表达式统计信息和ndistinct统计信息:
+
+
+<programlisting>
+CREATE TABLE t3 (
+    a   timestamp
+);
+
+INSERT INTO t3 SELECT i FROM generate_series('2020-01-01'::timestamp,
+                                             '2020-12-31'::timestamp,
+                                             '1 minute'::interval) s(i);
+
+ANALYZE t3;
+
+-- the number of matching rows will be drastically underestimated:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+
+-- build ndistinct statistics on the pair of expressions (per-expression
+-- statistics are built automatically)
+CREATE STATISTICS s3 (ndistinct) ON date_trunc('month', a), date_trunc('day', a) FROM t3;
+
+ANALYZE t3;
+
+-- now the row count estimates are more accurate:
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
+
+EXPLAIN ANALYZE SELECT * FROM t3
+  WHERE date_trunc('day', a) BETWEEN '2020-01-01'::timestamp
+                                 AND '2020-06-30'::timestamp;
+
+EXPLAIN ANALYZE SELECT date_trunc('month', a), date_trunc('day', a)
+   FROM t3 GROUP BY 1, 2;
+</programlisting>
+
+   没有表达式和ndistinct统计信息，计划器就没有关于表达式的不同值数量的信息，并且不得不依赖默认估计值。
+   相等和范围条件被假设有0.5%的选择度，并且表达式中不同值的数量被假设为与列相同(也就是唯一的)。
+   这将导致前两个查询中的行数被严重低估。
+   此外，计划器没有关于表达式之间关系的信息，所以它假设两个<literal>WHERE</literal>和<literal>GROUP BY</literal>条件是独立的，并将它们的选择度相乘，以得到对聚合查询中的组数的严重高估。
+   由于缺乏表达式的准确统计信息，这种情况进一步加剧，迫使计划器对表达式使用从列的ndistinct派生出的默认ndistinct估计。
+   有了这些统计信息，计划器就能认识到这些条件是相关的，并得出更准确的估计。
+  </para>
+
  </refsect1>
 
  <refsect1>