postgresql 處理 like 'xxoo'、like 'xxoo%'、like '%xxoo'、like '%xxoo%' 時的索引使用

os: centos 7.4
db: postgresql 10.11

版本

# cat /etc/centos-release
CentOS Linux release 7.4.1708 (Core) 
# 
# 
# yum list installed |grep -i postgresql
postgresql10.x86_64                10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-contrib.x86_64        10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-debuginfo.x86_64      10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-devel.x86_64          10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-docs.x86_64           10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-libs.x86_64           10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-odbc.x86_64           12.00.0000-1PGDG.rhel7              @pgdg10  
postgresql10-plperl.x86_64         10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-plpython.x86_64       10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-pltcl.x86_64          10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-server.x86_64         10.11-2PGDG.rhel7                   @pgdg10  
postgresql10-tcl.x86_64            2.4.0-1.rhel7                       @pgdg10  
postgresql10-tcl-debuginfo.x86_64  2.3.1-1.rhel7                       @pgdg10  
postgresql10-test.x86_64           10.11-2PGDG.rhel7                   @pgdg10 

# su - postgres
Last login: Wed Jan 15 18:34:12 CST 2020 on pts/0
$
$
$ psql -c "select version();"
                                                 version                                                  
----------------------------------------------------------------------------------------------------------
 PostgreSQL 10.11 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.8.5 20150623 (Red Hat 4.8.5-39), 64-bit
(1 row)

測試數據

postgres=# create table tmp_t0(id varchar(100),name1 varchar(100),name2 varchar(100));

postgres=# create table tmp_t1(id varchar(100),name1 varchar(100),name2 varchar(100));

postgres=# 
postgres=# insert into tmp_t0 
select id::varchar,md5(id::varchar),md5(md5(id::varchar)) from generate_series(1,5000000) as id;

postgres=# insert into tmp_t1 
select id::varchar,md5(id::varchar),md5(md5(id::varchar)) from generate_series(1,5000000) as id;

tmp_t0,tmp_t1 的 id 列上創建普通 btree 索引

postgres=# create index idx_tmp_t0_id on tmp_t0(id);

postgres=# create index idx_tmp_t1_id on tmp_t1(id);

postgres=# vacuum analyze tmp_t0;

postgres=# vacuum analyze tmp_t1;

排除並行

postgres=# set max_parallel_workers_per_gather=0;

like ‘xxoo’

like 'xxoo'(模式中不含 % 或 _ 通配符)等價於 = 'xxoo',可以直接使用普通 btree 索引

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and t0.id like '1000000';
                                      QUERY PLAN                                      
--------------------------------------------------------------------------------------
 Nested Loop  (cost=0.86..16.91 rows=1 width=146)
   ->  Index Scan using idx_tmp_t0_id on tmp_t0 t0  (cost=0.43..8.45 rows=1 width=73)
         Index Cond: ((id)::text = '1000000'::text)
         Filter: ((id)::text ~~ '1000000'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.45 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(6 rows)

like ‘xxoo%’

前模糊(有前綴的模糊,即前綴固定、後面任意)

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and t0.id like '1000000%';
                                      QUERY PLAN                                      
--------------------------------------------------------------------------------------
 Nested Loop  (cost=0.43..133262.84 rows=500 width=146)
   ->  Seq Scan on tmp_t0 t0  (cost=0.00..129068.84 rows=500 width=73)
         Filter: ((id)::text ~~ '1000000%'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.38 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(5 rows)

可以看到 tmp_t0 走的是全表掃描(Seq Scan),並沒有使用到索引。

查看文檔(http://postgres.cn/docs/10/indexes-opclass.html )後發現:當數據庫的 locale 不是 C 時,postgresql 不會對 like 'xxoo%' 使用默認操作符類的普通 btree 索引,需要用 varchar_pattern_ops 這類操作符類另建索引(locale 爲 C 時普通 btree 索引即可使用),看起來和 oracle 有差異。

新建個索引

postgres=# create index idx_tmp_t0_id_2 on tmp_t0(id varchar_pattern_ops);

postgres=# vacuum analyze tmp_t0;

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and t0.id like '1000000%';
                                          QUERY PLAN                                          
----------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.86..4202.45 rows=500 width=146)
   ->  Index Scan using idx_tmp_t0_id_2 on tmp_t0 t0  (cost=0.43..8.45 rows=500 width=73)
         Index Cond: (((id)::text ~>=~ '1000000'::text) AND ((id)::text ~<~ '1000001'::text))
         Filter: ((id)::text ~~ '1000000%'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.38 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(6 rows)

可以看到已經使用了新建的 idx_tmp_t0_id_2 索引

like ‘%xxoo’

後模糊(有後綴的模糊,即後綴固定、前面任意)

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and t0.id like '%1000000';
                                      QUERY PLAN                                      
--------------------------------------------------------------------------------------
 Nested Loop  (cost=0.43..133264.25 rows=500 width=146)
   ->  Seq Scan on tmp_t0 t0  (cost=0.00..129070.25 rows=500 width=73)
         Filter: ((id)::text ~~ '%1000000'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.38 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(5 rows)

沒有使用到之前創建的 idx_tmp_t0_id,idx_tmp_t0_id_2 這兩個索引。

這時需要做兩個調整
1,新建個反轉函數索引
2,把條件改寫成對 reverse(id) 的前綴匹配,即 id like '%1000000' 改爲 reverse(id) like '0000001%'

postgres=# create index idx_tmp_t0_id_3 on tmp_t0(reverse(id) varchar_pattern_ops);

postgres=# vacuum analyze tmp_t0;

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and reverse(t0.id) like '0000001%';
                                                   QUERY PLAN                                                   
----------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.86..4202.46 rows=500 width=146)
   ->  Index Scan using idx_tmp_t0_id_3 on tmp_t0 t0  (cost=0.43..8.46 rows=500 width=73)
         Index Cond: ((reverse((id)::text) ~>=~ '0000001'::text) AND (reverse((id)::text) ~<~ '0000002'::text))
         Filter: (reverse((id)::text) ~~ '0000001%'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.38 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(6 rows)

或者下面形式更好看些

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and reverse(t0.id) like reverse('1000000')||'%';
                                                   QUERY PLAN                                                   
----------------------------------------------------------------------------------------------------------------
 Nested Loop  (cost=0.86..4202.46 rows=500 width=146)
   ->  Index Scan using idx_tmp_t0_id_3 on tmp_t0 t0  (cost=0.43..8.46 rows=500 width=73)
         Index Cond: ((reverse((id)::text) ~>=~ '0000001'::text) AND (reverse((id)::text) ~<~ '0000002'::text))
         Filter: (reverse((id)::text) ~~ '0000001%'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.38 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(6 rows)

like ‘%xxoo%’

前後模糊(無前後綴的模糊)
3字或以上模糊查詢,使用pg_trgm可以很好的解決。參考<<pg_trgm 處理中間匹配 like ‘%xxoo%’>>
pg_trgm 也能很好處理前後綴模糊的場景.

postgres=# create extension pg_trgm;

postgres=# create index idx_tmp_t0_id_4 on tmp_t0 using gin(id gin_trgm_ops);

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and t0.id like '%1000000%';
                                      QUERY PLAN                                       
---------------------------------------------------------------------------------------
 Nested Loop  (cost=28.31..6094.51 rows=500 width=146)
   ->  Bitmap Heap Scan on tmp_t0 t0  (cost=27.88..1900.51 rows=500 width=73)
         Recheck Cond: ((id)::text ~~ '%1000000%'::text)
         ->  Bitmap Index Scan on idx_tmp_t0_id_4  (cost=0.00..27.75 rows=500 width=0)
               Index Cond: ((id)::text ~~ '%1000000%'::text)
   ->  Index Scan using idx_tmp_t1_id on tmp_t1 t1  (cost=0.43..8.38 rows=1 width=73)
         Index Cond: ((id)::text = (t0.id)::text)
(7 rows)

postgres=# \d+ tmp_t0
                                          Table "public.tmp_t0"
 Column |          Type          | Collation | Nullable | Default | Storage  | Stats target | Description 
--------+------------------------+-----------+----------+---------+----------+--------------+-------------
 id     | character varying(100) |           |          |         | extended |              | 
 name1  | character varying(100) |           |          |         | extended |              | 
 name2  | character varying(100) |           |          |         | extended |              | 
Indexes:
    "idx_tmp_t0_id" btree (id)
    "idx_tmp_t0_id_2" btree (id varchar_pattern_ops)
    "idx_tmp_t0_id_3" btree (reverse(id::text) varchar_pattern_ops)
    "idx_tmp_t0_id_4" gin (id gin_trgm_ops)
	

like ‘%xxoo%’

1-2個字的模糊查詢(例如 like '%99%'),優化器不會使用 pg_trgm 類型索引,可以創建個自定義函數索引,並把查詢條件改寫爲對函數結果的數組包含判斷(@>)。

postgres=# create or replace function f_user_split(text)
returns text[] as
$$
-- Tokenise the input for GIN indexing of 1- and 2-character substring search.
-- Returns every single character of the input followed by every adjacent
-- 2-character substring, in left-to-right order (duplicates are kept).
-- Declared STRICT, so a NULL argument yields NULL without running the body.
declare
  src    alias for $1;
  -- Splitting on the empty pattern yields one array element per character.
  tokens text[] := regexp_split_to_array(src, '');
  pos    integer := 1;
begin
  -- Slide a 2-character window over the string; the last window starts at
  -- the second-to-last character, so inputs shorter than 2 add nothing.
  while pos <= char_length(src) - 1 loop
    tokens := array_append(tokens, substr(src, pos, 2));
    pos := pos + 1;
  end loop;
  return tokens;
end;
$$
language plpgsql strict immutable;

postgres=# create index idx_tmp_t0_id_5 on tmp_t0 using gin(f_user_split(id));

postgres=# explain select t0.*,t1.* from tmp_t0 t0,tmp_t1 t1 where t0.id=t1.id and f_user_split(t0.id)  @> array['99'];
                                           QUERY PLAN                                           
------------------------------------------------------------------------------------------------
 Hash Join  (cost=55802.50..191371.63 rows=25000 width=146)
   Hash Cond: ((t1.id)::text = (t0.id)::text)
   ->  Seq Scan on tmp_t1 t1  (cost=0.00..116568.82 rows=5000082 width=73)
   ->  Hash  (cost=55490.00..55490.00 rows=25000 width=73)
         ->  Bitmap Heap Scan on tmp_t0 t0  (cost=241.75..55490.00 rows=25000 width=73)
               Recheck Cond: (f_user_split((id)::text) @> '{99}'::text[])
               ->  Bitmap Index Scan on idx_tmp_t0_id_5  (cost=0.00..235.50 rows=25000 width=0)
                     Index Cond: (f_user_split((id)::text) @> '{99}'::text[])
(8 rows)

Time: 83.097 ms

參考:
http://postgres.cn/docs/10/indexes-opclass.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章