How do I implement weighting on fulltext search results ?

How do I implement weighting on fulltext search results ?

I'm working on Nuxeo EP 5.6HF29 with Postgresql 9.1. I'm looking for a way to implement weighting on search results based on a multi criteria query. The NXQL query looks like this :

Select * from Document where ecm:fulltext = 'Discipline' or ecm:fulltext_title = 'Discipline'

Nuxeo converts the NXQL query to Postgresql SQL and it looks like that :

SELECT "_C1" FROM(
    SELECT "hierarchy"."id" AS "_C1"
    FROM "hierarchy" JOIN "fulltext" ON "fulltext"."id" = "hierarchy"."id"
    WHERE(
        ("hierarchy"."primarytype" IN ('SectionRoot', 'concours', 'Workspace', 'RelationSearch', 'BlogSite', 'ManagementRoot', 'Tag', 'AdministrativeStatusContainer', 'DocumentRoute', 'Thread', 'search_results', 'Space', 'documentrh', 'TemplateRoot', 'DocumentRouteStep', 'Domain', 'WorkspaceRoot', 'Comment', 'QueryNav', 'ConditionalStepFolder', 'SimpleTask', 'WebPage', 'ConditionalTask', 'StepFolder', 'WCOpenSocial', 'MailMessage', 'Forum', 'ContextualLink', 'PictureBook', 'Unit', 'TaskRoot', 'Post', 'WCHtml', 'DocumentRouteInstancesRoot', 'FollowLifeCycleTransitionTask', 'PublishTask', 'DocumentRouteModelsRoot', 'AdministrativeStatus', 'UserWorkspacesRoot', 'Picture', 'Document', 'Folder', 'WCPicture', 'FacetedSearch', 'RouteNode', 'File', 'AdvancedSearch', 'WebSite', 'FacetedSearchDefault', 'Gadget', 'Note', 'TaskDoc', 'HiddenFolder', 'Section', 'UserProfile', 'MailFolder', 'OrderedFolder', 'BasicAuditSearch', 'CommentRoot', 'BlogPost')
    )
    AND (((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext"))) OR ((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext_title"))))
)
UNION ALL
    SELECT "_H"."id" AS "_C1"
    FROM "hierarchy" "_H"
    JOIN "proxies" ON "_H"."id" = "proxies"."id"
    JOIN "hierarchy" ON "proxies"."targetid" = "hierarchy"."id"
    JOIN "fulltext" ON "fulltext"."id" = "hierarchy"."id"
    WHERE (
        ("hierarchy"."primarytype" IN ('SectionRoot', 'concours', 'Workspace', 'RelationSearch', 'BlogSite', 'ManagementRoot', 'Tag', 'AdministrativeStatusContainer', 'DocumentRoute', 'Thread', 'search_results', 'Space', 'documentrh', 'TemplateRoot', 'DocumentRouteStep', 'Domain', 'WorkspaceRoot', 'Comment', 'QueryNav', 'ConditionalStepFolder', 'SimpleTask', 'WebPage', 'ConditionalTask', 'StepFolder', 'WCOpenSocial', 'MailMessage', 'Forum', 'ContextualLink', 'PictureBook', 'Unit', 'TaskRoot', 'Post', 'WCHtml', 'DocumentRouteInstancesRoot', 'FollowLifeCycleTransitionTask', 'PublishTask', 'DocumentRouteModelsRoot', 'AdministrativeStatus', 'UserWorkspacesRoot', 'Picture', 'Document', 'Folder', 'WCPicture', 'FacetedSearch', 'RouteNode', 'File', 'AdvancedSearch', 'WebSite', 'FacetedSearchDefault', 'Gadget', 'Note', 'TaskDoc', 'HiddenFolder', 'Section', 'UserProfile', 'MailFolder', 'OrderedFolder', 'BasicAuditSearch', 'CommentRoot', 'BlogPost'))
        AND (((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext"))) OR ((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext_title"))))
    )) AS "_T"
    LIMIT 201 OFFSET 0

I want to rank search results based on the weighting applied on the title and fulltext columns. By specifying two criterias in the query, the Postgresql ranking function disapears (and there is no way to apply the desired weight on each column) For example : Select * from Document where ecm:fulltext = 'Discipline' generates

SELECT "_C1" FROM(
    SELECT "hierarchy"."id" AS "_C1", TS_RANK_CD(NX_TO_TSVECTOR("fulltext"."fulltext"), TO_TSQUERY('fr', '(Discipline)'), 32) AS _nxscore
    FROM "hierarchy" JOIN "fulltext" ON "fulltext"."id" = "hierarchy"."id"
    WHERE(
        ("hierarchy"."primarytype" IN ('SectionRoot', 'concours', 'Workspace', 'RelationSearch', 'BlogSite', 'ManagementRoot', 'Tag', 'AdministrativeStatusContainer', 'DocumentRoute', 'Thread', 'search_results', 'Space', 'documentrh', 'TemplateRoot', 'DocumentRouteStep', 'Domain', 'WorkspaceRoot', 'Comment', 'QueryNav', 'ConditionalStepFolder', 'SimpleTask', 'WebPage', 'ConditionalTask', 'StepFolder', 'WCOpenSocial', 'MailMessage', 'Forum', 'ContextualLink', 'PictureBook', 'Unit', 'TaskRoot', 'Post', 'WCHtml', 'DocumentRouteInstancesRoot', 'FollowLifeCycleTransitionTask', 'PublishTask', 'DocumentRouteModelsRoot', 'AdministrativeStatus', 'UserWorkspacesRoot', 'Picture', 'Document', 'Folder', 'WCPicture', 'FacetedSearch', 'RouteNode', 'File', 'AdvancedSearch', 'WebSite', 'FacetedSearchDefault', 'Gadget', 'Note', 'TaskDoc', 'HiddenFolder', 'Section', 'UserProfile', 'MailFolder', 'OrderedFolder', 'BasicAuditSearch', 'CommentRoot', 'BlogPost')
    )
    AND (((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext"))))
)
UNION ALL
    SELECT "_H"."id" AS "_C1", TS_RANK_CD(NX_TO_TSVECTOR("fulltext"."fulltext"), TO_TSQUERY('fr', '(Discipline)'), 32) AS _nxscore
    FROM "hierarchy" "_H"
    JOIN "proxies" ON "_H"."id" = "proxies"."id"
    JOIN "hierarchy" ON "proxies"."targetid" = "hierarchy"."id"
    JOIN "fulltext" ON "fulltext"."id" = "hierarchy"."id"
    WHERE (
        ("hierarchy"."primarytype" IN ('SectionRoot', 'concours', 'Workspace', 'RelationSearch', 'BlogSite', 'ManagementRoot', 'Tag', 'AdministrativeStatusContainer', 'DocumentRoute', 'Thread', 'search_results', 'Space', 'documentrh', 'TemplateRoot', 'DocumentRouteStep', 'Domain', 'WorkspaceRoot', 'Comment', 'QueryNav', 'ConditionalStepFolder', 'SimpleTask', 'WebPage', 'ConditionalTask', 'StepFolder', 'WCOpenSocial', 'MailMessage', 'Forum', 'ContextualLink', 'PictureBook', 'Unit', 'TaskRoot', 'Post', 'WCHtml', 'DocumentRouteInstancesRoot', 'FollowLifeCycleTransitionTask', 'PublishTask', 'DocumentRouteModelsRoot', 'AdministrativeStatus', 'UserWorkspacesRoot', 'Picture', 'Document', 'Folder', 'WCPicture', 'FacetedSearch', 'RouteNode', 'File', 'AdvancedSearch', 'WebSite', 'FacetedSearchDefault', 'Gadget', 'Note', 'TaskDoc', 'HiddenFolder', 'Section', 'UserProfile', 'MailFolder', 'OrderedFolder', 'BasicAuditSearch', 'CommentRoot', 'BlogPost'))
        AND (((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext"))))
    )) AS "_T"
    ORDER BY _nxscore
    LIMIT 201 OFFSET 0

I want to implement a multi criteria query (with custom weights) directly in the Java source code (query based on the official documentation of Postgresql available here : Controlling Text Search) :

SELECT "_C1" FROM(
    SELECT "hierarchy"."id" AS "_C1", TS_RANK_CD('{0.1,0.4,0.6,1.0}', setweight(to_tsvector(coalesce(fulltext_title,'')), 'A') || setweight(to_tsvector(coalesce(fulltext,'')), 'D'), TO_TSQUERY('fr', '(Discipline)')) AS _nxscore
    FROM "hierarchy" JOIN "fulltext" ON "fulltext"."id" = "hierarchy"."id"
    LEFT JOIN "misc" "_F1" ON "hierarchy"."id" = "_F1"."id"
    WHERE (
        ("hierarchy"."primarytype" IN ('SectionRoot', 'concours', 'Workspace', 'RelationSearch', 'BlogSite', 'ManagementRoot', 'Tag', 'AdministrativeStatusContainer', 'DocumentRoute', 'Thread', 'search_results', 'Space', 'documentrh', 'TemplateRoot', 'DocumentRouteStep', 'Domain', 'WorkspaceRoot', 'Comment', 'QueryNav', 'ConditionalStepFolder', 'SimpleTask', 'WebPage', 'ConditionalTask', 'StepFolder', 'WCOpenSocial', 'MailMessage', 'Forum', 'ContextualLink', 'PictureBook', 'Unit', 'TaskRoot', 'Post', 'WCHtml', 'DocumentRouteInstancesRoot', 'FollowLifeCycleTransitionTask', 'PublishTask', 'DocumentRouteModelsRoot', 'AdministrativeStatus', 'UserWorkspacesRoot', 'Picture', 'Document', 'Folder', 'WCPicture', 'FacetedSearch', 'RouteNode', 'File', 'AdvancedSearch', 'WebSite', 'FacetedSearchDefault', 'Gadget', 'Note', 'TaskDoc', 'HiddenFolder', 'Section', 'UserProfile', 'MailFolder', 'OrderedFolder', 'BasicAuditSearch', 'CommentRoot', 'BlogPost'))
        AND ((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext_title")) OR (TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext")))
)
UNION ALL
    SELECT "_H"."id" AS "_C1", TS_RANK_CD('{0.1,0.4,0.6,1.0}',setweight(to_tsvector(coalesce(fulltext_title,'')), 'A') || setweight(to_tsvector(coalesce(fulltext,'')), 'D'), TO_TSQUERY('fr', '(Discipline)')) AS _nxscore
    FROM "hierarchy" "_H"
    JOIN "proxies" ON "_H"."id" = "proxies"."id"
    JOIN "hierarchy" ON "proxies"."targetid" = "hierarchy"."id"
    JOIN "fulltext" ON "fulltext"."id" = "hierarchy"."id"
    LEFT JOIN "misc" "_F1" ON "hierarchy"."id" = "_F1"."id"
    WHERE (
        ("hierarchy"."primarytype" IN ('SectionRoot', 'concours', 'Workspace', 'RelationSearch', 'BlogSite', 'ManagementRoot', 'Tag', 'AdministrativeStatusContainer', 'DocumentRoute', 'Thread', 'search_results', 'Space', 'documentrh', 'TemplateRoot', 'DocumentRouteStep', 'Domain', 'WorkspaceRoot', 'Comment', 'QueryNav', 'ConditionalStepFolder', 'SimpleTask', 'WebPage', 'ConditionalTask', 'StepFolder', 'WCOpenSocial', 'MailMessage', 'Forum', 'ContextualLink', 'PictureBook', 'Unit', 'TaskRoot', 'Post', 'WCHtml', 'DocumentRouteInstancesRoot', 'FollowLifeCycleTransitionTask', 'PublishTask', 'DocumentRouteModelsRoot', 'AdministrativeStatus', 'UserWorkspacesRoot', 'Picture', 'Document', 'Folder', 'WCPicture', 'FacetedSearch', 'RouteNode', 'File', 'AdvancedSearch', 'WebSite', 'FacetedSearchDefault', 'Gadget', 'Note', 'TaskDoc', 'HiddenFolder', 'Section', 'UserProfile', 'MailFolder', 'OrderedFolder', 'BasicAuditSearch', 'CommentRoot', 'BlogPost'))
        AND ((TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext_title")) OR (TO_TSQUERY('fr', '(Discipline)') @@ NX_TO_TSVECTOR("fulltext"."fulltext")))
    )) AS "_T"
JOIN hierarchy ON hierarchy.id = "_C1"
ORDER BY _nxscore DESC LIMIT 201 OFFSET 0

How do I proceed ?

Thank you for your help. David

2 votes

1 answers

1208 views

ANSWER



You'll want to use your own datasource and JDBC connection to the database and do the query entirely by hand.

Since Nuxeo 5.8 if you're using single-datasource mode (the default), you can use ConnectionHelper.getConnection(null) to get to the low-level connection already used by VCS.

As a side not, if it's of no interest to you if you don't use proxies, you can drop the UNION ALL part of the query by adding AND ecm:isProxy = 0 to the NXQL query.

0 votes