NorthCat - 3 months ago
Python Question

ElasticSearch and special characters

I cannot figure out how to search for words that contain special characters.

For example, I have two documents:

1) We are looking for C++ and C# developers

2) We are looking for C developers

I want to find only the document that contains C++.

Code for creating an index, documents and searching:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

ELASTIC_SEARCH_NODES = ['http://localhost:9200']

INDEX = 'my_index'
DOC_TYPE = 'material'


def create_index():
    data = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_analyzer": {
                        "type": "custom",
                        "filter": [
                            "lowercase"
                        ],
                        "tokenizer": "whitespace"
                    }
                }
            }
        }
    }

    print es_client.indices.create(index=INDEX, body=data)


def create_doc(body):
    # Replace the document if it already exists
    if es_client.exists(INDEX, DOC_TYPE, body['docid']):
        es_client.delete(INDEX, DOC_TYPE, body['docid'])

    print es_client.create(index=INDEX, doc_type=DOC_TYPE, body=body, id=body['docid'])


def find_doc(value):
    results_generator = scan(es_client,
                             query={
                                 "query": {
                                     "match_phrase": {
                                         "text": value
                                     }
                                 }
                             },
                             index=INDEX)
    return results_generator


if __name__ == '__main__':
    es_client = Elasticsearch(ELASTIC_SEARCH_NODES, verify_certs=True)

    # create_index()
    doc1 = {"docid": 1, 'text': u"We are looking for C developers"}
    doc2 = {"docid": 2, 'text': u"We are looking for C++ and C# developers"}

    # create_doc(doc1)
    # create_doc(doc2)

    for r in find_doc("C++"):
        print r


Search result (if I escape the plus signs, i.e. "C\+\+", the result is the same):

{u'_score': 0.0, u'_type': u'material', u'_id': u'2', u'_source': {u'text': u'We are looking for C++ and C# developers', u'docid': 2}, u'_index': u'my_index'}
{u'_score': 0.0, u'_type': u'material', u'_id': u'1', u'_source': {u'text': u'We are looking for C developers', u'docid': 1}, u'_index': u'my_index'}


It seems this result occurs because symbols such as + and # are dropped during tokenization and never indexed, so the query effectively looks for documents that contain the token c. The indexed terms can be listed with a script_fields request:

curl 'http://localhost:9200/my_index/material/_search?pretty=true' -d '{
  "query" : {
    "match_all" : { }
  },
  "script_fields": {
    "terms" : {
      "script": "doc[field].values",
      "params": {
        "field": "text"
      }
    }
  }
}'


Result:

{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "my_index",
      "_type" : "material",
      "_id" : "2",
      "_score" : 1.0,
      "fields" : {
        "terms" : [ "and", "are", "c", "developers", "for", "looking", "we" ]
      }
    }, {
      "_index" : "my_index",
      "_type" : "material",
      "_id" : "1",
      "_score" : 1.0,
      "fields" : {
        "terms" : [ "are", "c", "developers", "for", "looking", "we" ]
      }
    } ]
  }
}
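A quicker way to confirm this is the _analyze API, which shows what the standard analyzer (the one actually applied here, since no mapping was defined) produces for the sentence; the request mirrors the same index used above:

curl -XPOST "http://localhost:9200/my_index/_analyze?analyzer=standard&pretty=true" -d 'We are looking for C++ and C# developers'

This should return the tokens we, are, looking, for, c, and, c, developers, i.e. both C++ and C# are reduced to the bare token c.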


How can this problem be solved? A second question, related to the first: is it possible to search for non-alphanumeric characters only, such as % or + ?

P.S. I am using Elasticsearch 2.3.2 and the Python client elasticsearch==2.3.0.

Answer

Thanks Andrew, I solved the problem. The problem was that the standard analyzer was being used for indexing instead of my_analyzer, because I had forgotten to add a mapping. The correct version:

data = {
    "settings": {
        "analysis": {
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "filter": [
                        "lowercase"
                    ],
                    "tokenizer": "whitespace"
                }
            }
        }
    },
    "mappings": {
        "material": {
            "properties": {
                "docid": {
                    "type": "integer"
                },
                "text": {
                    "type": "string",
                    "analyzer": "my_analyzer"
                }
            }
        }
    }
}
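Once the index has been recreated with this body, the mapping can be read back to confirm it was applied (an optional check; get_mapping is a standard elasticsearch-py call). The text field should report "analyzer": "my_analyzer":

# Optional sanity check: the "text" field should now show "analyzer": "my_analyzer"
print es_client.indices.get_mapping(index=INDEX, doc_type=DOC_TYPE)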

In addition, it was necessary to recreate the index and re-add the documents (a sketch of that step follows the find_doc code below). To search for special characters, I am using query_string. Code of the find_doc function:

def find_doc(value):
    # ANALYZER refers to the custom analyzer name defined in the index
    # settings, e.g. ANALYZER = 'my_analyzer'
    results_generator = scan(es_client,
                             query={
                                 "query": {
                                     "filtered": {
                                         "query": {
                                             "query_string": {
                                                 "query": value,
                                                 "fields": ["text"],
                                                 "analyzer": ANALYZER,
                                                 "default_operator": "AND"
                                             }
                                         }
                                     }
                                 }
                             },
                             index=INDEX)
    return results_generator
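For completeness, a minimal sketch of the recreation step mentioned above, reusing the create_index and create_doc helpers and the doc1/doc2 dictionaries from the question (indices.exists and indices.delete are standard elasticsearch-py calls):

# Drop the old index (built with the standard analyzer) and rebuild it with
# the new settings and mapping, then re-add both documents.
if es_client.indices.exists(INDEX):
    es_client.indices.delete(index=INDEX)

create_index()   # now sends the settings + mappings body shown above
create_doc(doc1)
create_doc(doc2)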

Examples of queries (wildcard characters can now be used as well):

for r in find_doc("*#"):
    print r

for r in find_doc(u"%"):
    print r

for r in find_doc("looking fo*"):
    print r

Request to verify how the analyzer splits a string into tokens:

curl -XPOST "http://localhost:9200/my_index/_analyze?analyzer=my_analyzer&pretty=true" -d 'We are looking for C++ and C# developers'
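Since my_analyzer only splits on whitespace and then lowercases, the expected tokens for this sentence are (roughly, ignoring the offset and position details in the actual response):

we, are, looking, for, c++, and, c#, developers

so C++ and C# survive as searchable terms, and only document 2 contains the term c++.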