ElasticSearch(3)

導入tmdb

TMDB是電影數據集,數據量很大,非常適合用作es實踐。直接在谷歌搜索「kaggle tmdb」即可下載(下文導入用的文件是tmdb_5000_movies.csv)。

首先還是要在es上建立mapping:

PUT /movie
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 1
  },
  "mappings": {
    "properties": {
      "title":{"type": "text","analyzer": "english"},
      "tagline":{"type": "text","analyzer": "english"},
      "release_date":{"type": "date","format": "8yyyy/MM/dd||yyyy/M/dd||yyyy/MM/d||yyyy/M/d"},
      "popularity":{"type": "double"},
      "overview":{"type": "text","analyzer": "english"},
      "cast":{
        "type": "object",
        "properties": {
          "character":{"type":"text","analyzer":"standard"},
          "name":{"type":"text","analyzer":"standard"}
        }
        
      }
    }
  }
}

接下來創建一個程序

        <dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter</artifactId>
		</dependency>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-web</artifactId>
		</dependency>
		<dependency>
			<groupId>org.elasticsearch</groupId>
			<artifactId>elasticsearch</artifactId>
			<version>7.6.1</version>
		</dependency>
		<dependency>
			<groupId>org.elasticsearch.client</groupId>
			<artifactId>transport</artifactId>
			<version>7.6.1</version>
		</dependency>
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.elasticsearch.plugin</groupId>
			<artifactId>transport-netty4-client</artifactId>
			<version>7.6.1</version>
		</dependency>
		<dependency>
			<groupId>com.opencsv</groupId>
			<artifactId>opencsv</artifactId>
			<version>4.2</version>
		</dependency>
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.58</version>
		</dependency>
/**
 * Spring configuration that exposes the Elasticsearch {@link TransportClient} bean.
 */
@Configuration
public class ESConfig {

    /**
     * Creates a transport client for the cluster named {@code dianping-app} and registers
     * the transport addresses 127.0.0.1:9300, 127.0.0.1:9301 and 127.0.0.1:9302.
     *
     * <p>Any failure is reported immediately instead of handing back a {@code null} or
     * partially configured client, so that a broken setup surfaces at startup with its cause.
     *
     * @return a client with all three addresses registered; never {@code null}
     * @throws IllegalStateException if the client cannot be created or an address cannot be resolved
     */
    @Bean
    public TransportClient getClient(){
        TransportClient transportClient = null;
        try {
            Settings settings = Settings.builder()
                    .put("cluster.name","dianping-app").build();
            transportClient = new PreBuiltTransportClient(settings);
            for (int port : new int[]{9300, 9301, 9302}) {
                transportClient.addTransportAddress(
                        new TransportAddress(InetAddress.getByName("127.0.0.1"), port));
            }
            return transportClient;
        } catch (Exception e) {
            // Release the client if it was already created before the failure.
            if (transportClient != null) {
                transportClient.close();
            }
            throw new IllegalStateException("Failed to create Elasticsearch TransportClient", e);
        }
    }
}
/**
 * HTTP endpoints under {@code /es} for reading a document from the {@code movie} index
 * and for bulk-importing rows of {@code ./tmdb_5000_movies.csv} into it.
 */
@Controller("/es")
@RequestMapping("/es")
public class ESController {

    @Autowired
    private TransportClient transportClient;

    /**
     * Fetches one document from the {@code movie} index by id.
     *
     * @param id document id; converted to its decimal string form for the lookup
     * @return HTTP 200 whose body is the source map returned by the get response
     */
    @RequestMapping("/get")
    @ResponseBody
    public ResponseEntity get(@RequestParam(name="id")Integer id){
        GetResponse getResponse = transportClient.prepareGet("movie",null,id.toString()).get();
        return new ResponseEntity<>(getResponse.getSource(), HttpStatus.OK);
    }


    /**
     * Reads {@code ./tmdb_5000_movies.csv} and submits all convertible rows to the
     * {@code movie} index as a single asynchronous bulk request.
     *
     * <p>The first CSV line is treated as a header and skipped. A row that cannot be
     * converted is skipped and reported on stderr; the import continues with the
     * remaining rows. The HTTP response is returned without waiting for the bulk
     * request to finish; its outcome is printed by the listener.
     *
     * @return HTTP 200 with an empty string body
     * @throws IOException if the CSV file cannot be opened, read or closed
     */
    @RequestMapping("/importdata")
    @ResponseBody
    public ResponseEntity importdata() throws IOException {
        List<String[]> allRecords;
        // try-with-resources closes the reader even when readAll() fails.
        try (InputStreamReader in = new InputStreamReader(new FileInputStream("./tmdb_5000_movies.csv"), Charset.forName("UTF-8"));
             CSVReader reader = new CSVReader(in, ',')) {
            allRecords = reader.readAll();
        }

        // Batch insert: collect one index request per usable row.
        BulkRequest bulkRequest = new BulkRequest();
        // Index 0 is the header line; the document id equals the 0-based row index.
        for (int row = 1; row < allRecords.size(); row++) {
            try {
                bulkRequest.add(toIndexRequest(allRecords.get(row), row));
            } catch (Exception ex) {
                // Best effort: keep importing, but do not hide which rows were dropped.
                System.err.println("Skipping CSV line " + (row + 1) + ": " + ex);
            }
        }

        // Nothing to send (e.g. header-only file or every row rejected).
        if (bulkRequest.numberOfActions() == 0) {
            System.err.println("No rows to import from ./tmdb_5000_movies.csv");
            return new ResponseEntity<>("", HttpStatus.OK);
        }

        transportClient.bulk(bulkRequest, new ActionListener<BulkResponse>() {
            @Override
            public void onResponse(BulkResponse bulkItemResponses) {
                System.out.println(bulkItemResponses);
            }

            @Override
            public void onFailure(Exception e) {
                System.out.println(e);
            }
        });
        return new ResponseEntity<>("", HttpStatus.OK);
    }

    /**
     * Converts one CSV row into an index request (one index record) for the {@code movie} index.
     *
     * <p>Columns read: 7 overview, 8 popularity, 11 release date, 16 tagline, 17 title,
     * 20 cast (a JSON array, of which only the first element's {@code character} and
     * {@code name} are kept). An empty release date is replaced by {@code 1970/01/01}.
     *
     * @param records the CSV columns of the row
     * @param docId   document id to assign
     * @return the index request for this row
     * @throws RuntimeException if a column is missing or the cast column is not a non-empty JSON array
     */
    private IndexRequest toIndexRequest(String[] records, int docId) {
        JSONObject firstCastMember = JSONArray.parseArray(records[20]).getJSONObject(0);
        JSONObject cast = new JSONObject();
        cast.put("character", (String) firstCastMember.get("character"));
        cast.put("name", (String) firstCastMember.get("name"));

        String date = records[11];
        if (date == null || date.equals("")) {
            date = "1970/01/01";
        }

        return new IndexRequest("movie", "_doc", String.valueOf(docId)).source(XContentType.JSON,
                "title", records[17],
                "tagline", records[16],
                "release_date", date,
                "popularity", records[8],
                "cast", cast,
                "overview", records[7]);
    }
}

將csv放到項目目錄下,運行一下,數據就導進來了。

一些查詢

之前說過一些語句,這裏再說一些,match之前說過,適用於關鍵詞匹配的,邏輯是or,例如:

GET /movie/_search
{
  "query": {
    "match": {
      "title": "basketball with cartoom aliens"
    }
  }
}

那麼換成and查詢的話,代碼:

GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball with cartoom aliens",
        "operator":"and"
      }
    }
  }
}

最小詞匹配項minimum_should_match,意思就是最少有幾個詞匹配,默認or是1個詞匹配。

GET /movie/_search
{
  "query": {
    "match": {
      "title": {
        "query": "basketball love aliens",
        "operator":"or",
        "minimum_should_match": 2
      }
    }
  }
}

短語查詢:match_phrase,這樣,兩個詞就不會分開了。這跟term的區別在於,match_phrase會先對查詢文本做分詞處理(例如統一轉成小寫)再匹配,而term不做任何分析,直接按原詞匹配。

GET /movie/_search
{
  "query": {
    "match_phrase": {
      "title": "steve zissou"
    }
  }
}

多字段查詢:multi_match,這個可以同時查多個字段

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"]
    }
  }
}

我們在查詢的時候,每個查詢都會有一個score打分,

這個打分我們之前說過一個TF/IDF,再補充一個TFNORM(token frequency normalized,詞頻歸一化)。例如搜索steve jobs,結果:

jobs這個詞在第一個結果裏佔比50%,在第二個結果裏佔比33.3%,所以第一個結果的詞頻更高,分數也更高。

那麼打分的過程是什麼樣的呢?

GET /movie/_search
{
  "explain": true, 
  "query": {
    "match": {
      "title": "steve"
    }
  }
}

結果:截取其中一個看下

{
        "_shard" : "[movie][0]",
        "_node" : "WQDVMY19QXOuANmMQ0yWWg",
        "_index" : "movie",
        "_type" : "_doc",
        "_id" : "2340",
        "_score" : 7.4039927,
        "_source" : {
          "title" : "Steve Jobs",
          "tagline" : "Can a great man be a good man?",
          "release_date" : "2015/10/9",
          "popularity" : "53.670525",
          "cast" : {
            "character" : "Burke",
            "name" : "Aaron Eckhart"
          },
          "overview" : "Set backstage at three iconic product launches and ending in 1998 with the unveiling of the iMac, Steve Jobs takes us behind the scenes of the digital revolution to paint an intimate portrait of the brilliant man at its epicenter."
        },
        "_explanation" : {
          "value" : 7.4039927,
          "description" : "weight(title:steve in 2183) [PerFieldSimilarity], result of:",
          "details" : [
            {
              "value" : 7.4039927,
              "description" : "score(freq=1.0), computed as boost * idf * tf from:",
              "details" : [
                {
                  "value" : 2.2,
                    //默認放大係數
                  "description" : "boost",
                  "details" : [ ]
                },
                {
                  "value" : 7.1592917,
                  "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                  "details" : [
                    {
                      "value" : 3,
                      "description" : "n, number of documents containing term",
                      "details" : [ ]
                    },
                    {
                      "value" : 4500,
                      "description" : "N, total number of documents with field",
                      "details" : [ ]
                    }
                  ]
                },
                {
                  "value" : 0.47008157,
                  "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                  "details" : [
                    {
                      "value" : 1.0,
                      "description" : "freq, occurrences of term within document",
                      "details" : [ ]
                    },
                    {
                      "value" : 1.2,
                      "description" : "k1, term saturation parameter",
                      "details" : [ ]
                    },
                    {
                      "value" : 0.75,
                      "description" : "b, length normalization parameter",
                      "details" : [ ]
                    },
                    {
                      "value" : 2.0,
                      "description" : "dl, length of field",
                      "details" : [ ]
                    },
                    {
                      "value" : 2.1757777,
                      "description" : "avgdl, average length of field",
                      "details" : [ ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }

結果可以看出,分數是tf分數*idf分數*放大係數(boost)得來的,例如上面的2.2 * 7.1592917 * 0.47008157 ≈ 7.4039927。其中tf的公式freq / (freq + k1 * (1 - b + b * dl / avgdl))來自BM25算法,分母中的k1(詞頻飽和參數)和b(字段長度歸一化參數)用來解決單純按詞頻打分的問題。

對於多字段查詢還有個問題,

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"]
    }
  }
}

查詢結果:

{
        "_shard" : "[movie][0]",
        "_node" : "WQDVMY19QXOuANmMQ0yWWg",
        "_index" : "movie",
        "_type" : "_doc",
        "_id" : "453",
        "_score" : 8.579647,
        "_source" : {
          "title" : "Space Jam",
          "tagline" : "Get ready to jam.",
          "release_date" : "1996/11/15",
          "popularity" : "36.125715",
          "cast" : {
            "character" : "Cameron Poe",
            "name" : "Nicolas Cage"
          },
          "overview" : "In a desperate attempt to win a basketball match and earn their freedom, the Looney Tunes seek the aid of retired basketball champion, Michael Jordan."
        },
        "_explanation" : {
          "value" : 8.579647,
          "description" : "max of:",
          "details" : [
            {
              "value" : 8.579647,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 8.579647,
                  "description" : "weight(overview:basketbal in 396) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 8.579647,
                      "description" : "score(freq=2.0), computed as boost * idf * tf from:",
                      "details" : [
                        {
                          "value" : 2.2,
                          "description" : "boost",
                          "details" : [ ]
                        },
                        {
                          "value" : 5.25461,
                          "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 23,
                              "description" : "n, number of documents containing term",
                              "details" : [ ]
                            },
                            {
                              "value" : 4498,
                              "description" : "N, total number of documents with field",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 0.74217486,
                          "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                          "details" : [
                            {
                              "value" : 2.0,
                              "description" : "freq, occurrences of term within document",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "k1, term saturation parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "b, length normalization parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 16.0,
                              "description" : "dl, length of field",
                              "details" : [ ]
                            },
                            {
                              "value" : 36.475765,
                              "description" : "avgdl, average length of field",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard" : "[movie][0]",
        "_node" : "WQDVMY19QXOuANmMQ0yWWg",
        "_index" : "movie",
        "_type" : "_doc",
        "_id" : "2550",
        "_score" : 8.280251,
        "_source" : {
          "title" : "Love & Basketball",
          "tagline" : "All's fair in love and basketball.",
          "release_date" : "2000/4/21",
          "popularity" : "2.027393",
          "cast" : {
            "character" : "Laurie Strode",
            "name" : "Jamie Lee Curtis"
          },
          "overview" : "A young African-American couple navigates the tricky paths of romance and athletics in this drama. Quincy McCall (Omar Epps) and Monica Wright (Sanaa Lathan) grew up in the same neighborhood and have known each other since childhood. As they grow into adulthood, they fall in love, but they also share another all-consuming passion: basketball. They've followed the game all their lives and have no small amount of talent on the court. As Quincy and Monica struggle to make their relationship work, they follow separate career paths though high school and college basketball and, they hope, into stardom in big-league professional ball."
        },
        "_explanation" : {
          "value" : 8.280251,
          "description" : "max of:",
          "details" : [
            {
              "value" : 5.812291,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 5.812291,
                  "description" : "weight(overview:basketbal in 2376) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 5.812291,
                      "description" : "score(freq=2.0), computed as boost * idf * tf from:",
                      "details" : [
                        {
                          "value" : 2.2,
                          "description" : "boost",
                          "details" : [ ]
                        },
                        {
                          "value" : 5.25461,
                          "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 23,
                              "description" : "n, number of documents containing term",
                              "details" : [ ]
                            },
                            {
                              "value" : 4498,
                              "description" : "N, total number of documents with field",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 0.5027872,
                          "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                          "details" : [
                            {
                              "value" : 2.0,
                              "description" : "freq, occurrences of term within document",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "k1, term saturation parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "b, length normalization parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 68.0,
                              "description" : "dl, length of field (approximate)",
                              "details" : [ ]
                            },
                            {
                              "value" : 36.475765,
                              "description" : "avgdl, average length of field",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            },
            {
              "value" : 8.280251,
              "description" : "sum of:",
              "details" : [
                {
                  "value" : 8.280251,
                  "description" : "weight(title:basketbal in 2376) [PerFieldSimilarity], result of:",
                  "details" : [
                    {
                      "value" : 8.280251,
                      "description" : "score(freq=1.0), computed as boost * idf * tf from:",
                      "details" : [
                        {
                          "value" : 2.2,
                          "description" : "boost",
                          "details" : [ ]
                        },
                        {
                          "value" : 8.00659,
                          "description" : "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
                          "details" : [
                            {
                              "value" : 1,
                              "description" : "n, number of documents containing term",
                              "details" : [ ]
                            },
                            {
                              "value" : 4500,
                              "description" : "N, total number of documents with field",
                              "details" : [ ]
                            }
                          ]
                        },
                        {
                          "value" : 0.47008157,
                          "description" : "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
                          "details" : [
                            {
                              "value" : 1.0,
                              "description" : "freq, occurrences of term within document",
                              "details" : [ ]
                            },
                            {
                              "value" : 1.2,
                              "description" : "k1, term saturation parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 0.75,
                              "description" : "b, length normalization parameter",
                              "details" : [ ]
                            },
                            {
                              "value" : 2.0,
                              "description" : "dl, length of field",
                              "details" : [ ]
                            },
                            {
                              "value" : 2.1757777,
                              "description" : "avgdl, average length of field",
                              "details" : [ ]
                            }
                          ]
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }

通過查看評分可以看出,第一個結果的title是Space Jam,第二個是Love & Basketball。我們搜索basketball時,通常希望標題中出現該詞的結果排在前面,但是這裏卻不是這樣的。原因在於評分機制:多字段查詢的評分會取各字段得分的最大值。第一個結果的title中並沒有basketball這個詞,所以取的就是overview的分數8.579647;第二個結果的title得分8.280251高於它的overview得分5.812291,所以取title的分數,但仍然低於第一個結果的8.579647,所以排到了第二位。這就是原因。

如何優化這種情況?

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball",
      "fields": ["title^10","overview"]
    }
  }
}

title^10的意思是title這個字段的分數的放大係數放大了10倍,從原來的2.2變成了22,自然排到了第一位。

這種樣子還是有些侷限,有很多時候並不只是根據一個字段判斷,因此:

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball",
      "fields": ["title^10","overview"],
      "tie_breaker": 0.3
    }
  }
}

 關於tie_breaker,先看下數據:

解釋也很清楚:最終得分 = 最高的字段得分 + 其他字段得分之和 * 0.3(即tie_breaker的值)。

接下來講一下布爾查詢,首先有幾個關鍵詞:

must:必須都爲true

must_not:必須都爲false

should:其中一個爲true即可

還要注意的是,這裏的打分是爲true的越多,得分越高。例如:

GET /movie/_search
{
  "query": {
    "bool": {
      "should": [
        {"match": {"title": "basketball with cartoom aliens"}},
        {"match": {"overview": "basketball with cartoom aliens"}}
      ]
    }
  }
}

關於打分機制,其實有很多種,multi_match查詢有很多不同的type:

best_fields:默認得分方式,取得最高的分數作爲對應文檔的分數,是“最匹配模式”,也叫dis_max模式

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"],
      "type": "best_fields"
    }
  }
}

也可以寫成:

GET /movie/_search
{
  "query": {
    "dis_max": {
      "queries": [
        {"match": {"title": "basketball with cartoom aliens"}},
        {"match": {"overview": "basketball with cartoom aliens"}}
      ]
    }
  }
}

如果想看打分公式:

GET /movie/_validate/query?explain
{
  "query": {
    "multi_match": {
      "query": "basketball with cartoom aliens",
      "fields": ["title","overview"],
      "type": "best_fields"
    }
  }
}

結果:

most_fields:考慮所有的文檔字段得分相加,來獲得結果

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "basketball",
      "fields": ["title","overview"],
      "type": "most_fields"
    }
  }
}

cross_fields:以分詞爲單位計算欄位的總分

GET /movie/_search
{
  "query": {
    "multi_match": {
      "query": "steve jobs",
      "fields": ["title","overview"],
      "type": "cross_fields"
    }
  }
}

看下結構,這種是以詞爲單位來匹配的:先算出steve在title和overview中的分數並取最大值,再算出jobs在這兩個字段中的分數並取最大值,兩者相加就是最終的分數。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章