Check tp search en for explanation of fts technique implemented in this project.
Database: PostgreSQL 8.0x (also, tested with PostgreSQL 8.1dev) Default tsearch2 configuration is 'tps'.
Example of right configuration: (data dir - /home/olegb/tmp/data,tsearch2 installed into /home/olegb/tmp/lib/)
initdb --locale=en_US /home/olegb/tmp/data (assume, postmaster started with en_US locale) pg_ctl -D /home/olegb/tmp/data -o "-p 5433" start createdb -U olegb -p 5433 --encoding=SQL_ASCII tp sed -e 's|$libdir|/home/olegb/tmp/lib/|g' < tsearch2.sql | psql -p 5433 -U oleg psql -p 5433 -U olegb tp < ~/app/dump/tables/sql/tscfg-tps.sql tp=# select * from pg_ts_cfg; ts_name | prs_name | locale -----------------+----------+-------------- default | default | C default_russian | default | ru_RU.KOI8-R simple | default | tps | default | en_US (4 rows) tp=# select to_tsquery('paris'); to_tsquery ------------ 'paris' (1 row)
Addtional modules:
We use two different configurations of fts for better flexibility (different sets of stop words).
Installation:
Please, check configuration files (tscfg-tps.cfg,tscfg-tpsh.cfg) for location of dictionary and stop-words files. There are 3 example files provided:
paris paris united united states states orlean orleans las las los los
Capital letters (A,B,C,D) denotes lexem class (by importance). D is a default class.
Following indices should exist:
-- check indices below !!! -- create index admin_searchalias_placeid on admin.searchalias(place_id); -- create index placedetail_us_state_abbrev on placedetail(us_state_abbrev); -- create index usastate_state_abbrev on usastate(state_abbrev); -- create index name_idx on place(name);
PLACE SEARCH:
place.fts_index = place.name_A || usa.state_name_B || usa.state_abbrev_B || country.name_C || mk_place_aliases(place.place_id)
UPDATE QUERY:
begin; select place.*, mk_tsvector_place(coalesce(place.name,''),coalesce(country.name,''),coalesce(usa.state_name,'')||' '|| coalesce(usa.state_abbrev,'')) || to_tsvector('tps',coalesce(mk_place_aliases(place.place_id),'')) as fts_index into place_new from place inner join placedetail on (place.place_id = placedetail.place_id) left outer join country on (placedetail.country_id = country.country_id) LEFT OUTER JOIN usastate usa on ( placedetail.us_state_abbrev = usa.state_abbrev); drop table place; alter table place_new rename to place; create index fts_index_idx on place using gist(fts_index);
HOTEL SEARCH:
property.fts_index = property.property_name_A || place.name_B || usa.state_name_B || usa.state_abbrev_B country.name_C || mk_place_aliases(place.place_id)
UPDATE QUERY:
update property.property set fts_index = CASE WHEN property.property.status_type_id is null OR property.property.status_type_id > 0 then '' ELSE mk_tsvector_hotel(coalesce(place.name,''), coalesce(country.name,''), coalesce(usa.state_name,'')||' '||coalesce(usa.state_abbrev,''), coalesce(property.property.property_name,'') ) || to_tsvector('tpsh',coalesce(mk_place_aliases(place.place_id),'')) END from place inner join placedetail on (place.place_id = placedetail.place_id) inner join country on (placedetail.country_id = country.country_id) inner JOIN usastate usa on ( placedetail.us_state_abbrev = usa.state_abbrev) where place.place_id = property.property.place_id; create index ftsp_index_idx on property.property using gist(fts_index);
LOG SEARCH:
vw_log.fts_notes = notes || caption_text_A || place.name_qualified_B
UPDATE QUERY:
alter table vw_log add column fti_notes tsvector; update vw_log set fti_notes = coalesce(to_tsvector('tps',notes),'')||' '|| coalesce( setweight( to_tsvector('tps',caption_text),'A'),'')||' '|| coalesce( setweight( to_tsvector('tps',p.name_qualified),'B'),'') from place p where p.place_id = vw_log.place_id; create index vw_log_fti_notes_idx on vw_log using gist(fti_notes);
SUPPLEMENTARY TABLE 'names' used for query rewriting.
begin; drop table names; select foo.place_id, foo.cname as cname, to_tsquery('tps',replace(foo.txt,chr(176),chr(32))) as name_tsquery into names from ( select distinct search_term as cname, place_id, '\'\\\'' || replace(search_term,'\'','\\\'') || '\\\'\'' as txt from admin.searchalias where admin.searchalias.search_term ~ ' ' union select distinct name as cname, place_id, '\'\\\'' || replace(place.name,'\'','\\\'') || '\\\'\'' as txt from place where name ~ ' ') as foo; -- create index for names create index cnames_idx on names using gist(name_tsquery gist_tp_tsquery_ops); end;
-- use 'tps' tsearch2 configuration ! create or replace function mk_tsvector_place(text, text, text) returns tsvector as $$ DECLARE res tsvector; place_name ALIAS FOR $1; country_name ALIAS FOR $2; state_name ALIAS FOR $3; BEGIN -- IF country_name = 'US' THEN res = setweight(to_tsvector('tps',place_name),'A'); IF country_name = 'United States' THEN res = res || setweight(to_tsvector('tps',state_name),'B'); END IF; res = res || setweight(to_tsvector('tps',country_name),'C'); RETURN res; END; $$ LANGUAGE plpgsql; --------------------------------------------------------------------------- -- returns aliases (no original name!) separated by '|' create or replace function mk_place_aliases(integer) returns text as $$ DECLARE res text; placeid ALIAS FOR $1; rec record; BEGIN res = ''; FOR rec IN SELECT replace(a.search_term,'\'','') AS place_alias FROM place p, admin.searchalias a WHERE a.place_id = p.place_id and p.place_id = placeid -- from place p left outer join admin.searchalias a on -- ( p.place_id = a.place_id) where p.name = place_name LOOP IF res != '' THEN res = res || '\|'; END IF; res = res || '\'' || rec.place_alias || '\''; END LOOP; IF res = '' THEN RETURN NULL; END IF; RETURN res; END; $$ LANGUAGE plpgsql; --------------------------------------------------------------------------- create or replace function mk_tsvector_hotel(text, text, text,text) returns tsvector as $$ DECLARE res tsvector; place_name ALIAS FOR $1; country_name ALIAS FOR $2; state_name ALIAS FOR $3; property_name ALIAS FOR $4; BEGIN res = setweight(to_tsvector('tpsh',property_name),'A'); res = res || setweight(to_tsvector('tpsh',place_name),'B'); IF country_name = 'United States' THEN res = res || setweight(to_tsvector('tpsh',state_name),'B'); END IF; res = res || setweight(to_tsvector('tpsh',country_name),'C'); RETURN res; END; $$ LANGUAGE plpgsql;
NOTICE: logarithm slowdown search, use something another if performance is not satisfactory !!!
select p.population,p.name_qualified, (rank('{1,0.2,0.4,1}',p.fts_index, query.r_query,0) + p.type_id/90. + log(coalesce(p.population,1000.0)+1)/7 ) as rank from place p, (select tp_rewrite(ARRAY[query, n.name_tsquery]) as r_query from names n, to_tsquery('new&orleans&hilton') as query where query @ n.name_tsquery) as query where p.fts_index @@ query.r_query and p.type_id != 40 order by rank desc limit 10; population | name_qualified | rank ------------+------------------------------------+------------------- 484670 | New Orleans, Louisiana | 1.62535215409395 5860 | Hilton, New York | 1.10576071576761 33860 | Hilton Head Island, South Carolina | 1.09939228000737 | Orleans Four Corners, New York | 0.977777779764599 | Orleans, New York | 0.977777779764599 | Hilton, New Jersey | 0.977777779764599 | Hilton, New Zealand | 0.911111117071576 | Hilton, Namibia | 0.844444454378552 | Hilton, Oklahoma | 0.844444454378552 | Hilton, Kansas | 0.844444454378552 (10 rows) Time: 385.579 ms
We use query rewriting here and return results ordered by their full text rank and property rating
tp=# select place.name_qualified,pd.property_name,pr.rating_count, (rank('{1,1,1,1}',pd.fts_index, query.r_query,0) + coalesce(pr.rating_count/500.0,0.0)) as rank from place, property.property pd, property.propertycounts pr, (select tp_rewrite(ARRAY[query, n.name_tsquery]) as r_query from names n, to_tsquery('tpsh','hilton&new&york') as query where query @ n.name_tsquery) as query where pd.fts_index @@ query.r_query and pd.place_id = place.place_id and pr.property_id = pd.property_id order by rank desc limit 10; name_qualified | property_name | rating_count | rank ------------------------+----------------------------------+--------------+------------------ Las Vegas, Nevada | New York-New York Hotel & Casino | 597 | 2.02733331346512 New York, New York | Hilton New York | 111 | 1.347 New York, New York | Hilton New York Times Square | 59 | 1.243 New York, New York | Millenium Hilton | 26 | 1.20014817905426 New York, New York | New York Marriott Marquis | 195 | 1.18166668653488 New York, New York | Hilton Waldorf Towers | 4 | 1.15614817905426 New York, New York | Hilton Waldorf=Astoria | 0 | 1.14814817905426 New Orleans, Louisiana | Hilton New Orleans Riverside | 151 | 1.052 Melville, New York | Hilton Huntington/Melville | 22 | 1.044 Rye Brook, New York | Hilton Rye Town | 16 | 1.032 (10 rows) Time: 213.734 ms
tp=# select place.name_qualified,pd.property_name,pr.rating_count, (rank('{1,1,1,1}',pd.fts_index,query,0) + coalesce(pr.rating_count/500.0,0.0)) as rank from place, property.property pd, property.propertycounts pr, to_tsquery('tpsh','hilton&new&york') as query where pd.fts_index @@ query and pd.place_id = place.place_id and pr.property_id = pd.property_id order by rank desc limit 10; name_qualified | property_name | rating_count | rank ----------------------------+------------------------------------+--------------+------------------ New York, New York | Hilton New York | 111 | 1.222 New York, New York | Hilton New York Times Square | 59 | 1.118 New York, New York | Millenium Hilton | 26 | 1.052 Melville, New York | Hilton Huntington/Melville | 22 | 1.04399642372131 Rye Brook, New York | Hilton Rye Town | 16 | 1.03188776445389 Lake Placid, New York | Hilton Lake Placid | 15 | 1.02988776445389 Tarrytown, New York | Hilton Tarrytown | 10 | 1.01998873472214 Liverpool, New York | Homewood Suites by Hilton Syracuse | 9 | 1.01798873472214 Saratoga Springs, New York | Hilton Garden Inn | 7 | 1.01396370077133 Pearl River, New York | Hilton Pearl River | 7 | 1.01388776445389 (10 rows) Time: 57.542 ms
Rewrite query and order results by fts rank and updated_date
select foo.log_id, v.caption_text, headline(v.notes,foo.query),foo.rank from vw_log v, ( select l.log_id, query.r_query as query, (rank('{0.1,1,0.5,1}', fti_notes,query.r_query,0) + 1 - log(8,(current_date-coalesce(updated_date,current_date-1)))/10) as rank from vw_log l, ( select tp_rewrite(ARRAY[query, n.name_tsquery]) as r_query from names n, to_tsquery('tps','new&york&hilton') as query where query @ n.name_tsquery ) as query where l.fti_notes @@ query.r_query order by rank desc limit 10 ) as foo where foo.log_id = v.log_id;