搜索重要指標:召回率、準確率。
核心指標:relevance相關性、freshness時效性、quality質量、ctr點擊率、confidence權威度、cold_start冷啓動。最終的排序依賴這幾個字段計算分數
業務接口返回6000-9000個字段
搜索接口查了7個搜索鏈路
PM需求:
選取某一天人物的query,標識出同時出type=12及type=3的query,計算同時出現的概率,另外把同時出的query給出來,分析使用
query選取:人物top1000 、隨機1000
引擎接口傳參加上調試信息cmd=xx後的接口返回43846個字段
引擎接口不加調試信息cmd=xx的接口返回23944個字段
如果從業務接口去請求,不可,因爲業務接口過濾了引擎返回的大量字段
所以只能測引擎接口
難點:如何在上萬個返回字段裏取到這個“特殊的字段”。查了很多個query,看json格式看到吐了,貼到在線json解析工具裏直接無響應。。。
最終:取resultList這個jsonArray下的jsonObject的元素即可
QA測試設計:
數據源準備:去FBI撈取或去ODPS讀取top1000個人物query、任意1000個人物query
請求引擎接口(對內),判斷接口返回,取標誌性字段(type類型,代碼中對應doc_source字段),for循環遍歷resultList下每個JSONObject的value,再通過分支判斷組合各種場景,實現需求要的數據統計
發現開發代碼的bug:引擎接口召回不穩定,同一個query請求2次,時而返回3 和12類型,時而僅返回12類型
寫給自己的bug:跑完數據才發現,召回比率應該改爲百分比
修改後:
代碼實現:
模塊劃分:
1. HTTPCommonMethod爲拼接http請求的工具類
package com.xx.searchRecall.utils;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.util.EncodingUtil;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * Helper for assembling and sending HTTP GET requests through commons-httpclient.
 */
public class HTTPCommonMethod {
    /** URL (including the encoded query string) assembled by the most recent {@link #doGet} call. */
    public static String requestURL;

    /**
     * Sends a GET request; only the varying parameters need to be passed in {@code params}.
     *
     * @param url_pre base URL, which may already carry a query string; when {@code null} no
     *                request is sent
     * @param params  query parameters appended to {@code url_pre}, form-url-encoded as UTF-8;
     *                may be {@code null} or empty
     * @param count   index of the record being replayed; only used in the error log line
     * @return the response body decoded as UTF-8; an empty string when {@code url_pre} is
     *         {@code null} or the status code is not 200; {@code null} when an exception was
     *         thrown while building or executing the request
     */
    public static String doGet(String url_pre, Map<String, String> params, int count) {
        String response = "";
        if (url_pre == null) {
            return response;
        }
        GetMethod getMethod = null;
        try {
            List<NameValuePair> queryParams = getParamsList_pre(params);
            if (queryParams != null && !queryParams.isEmpty()) {
                String formatParams = EncodingUtil.formUrlEncode(
                        queryParams.toArray(new NameValuePair[0]), "utf-8");
                // Append with '?' or '&' depending on whether the base URL already has a query string.
                url_pre = url_pre.indexOf("?") < 0 ? url_pre + "?" + formatParams : url_pre + "&" + formatParams;
            }
            requestURL = url_pre;

            getMethod = new GetMethod(url_pre);
            getMethod.addRequestHeader(new Header("Content-type", "application/json"));
            int statusCode = new HttpClient().executeMethod(getMethod);
            // On failure, log the status code and hand back the empty response.
            if (statusCode != 200) {
                // NOTE(review): the status code is printed twice (once right after "第"); the
                // message is kept byte-identical in case log consumers depend on it.
                System.out.println("第" + statusCode + "【" + count + "】條日誌,預發環境請求出錯,錯誤碼爲=======" + statusCode);
                return response;
            }
            return new String(getMethod.getResponseBody(), "utf-8");
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // The connection must always be handed back, otherwise it leaks on every call.
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
    }

    /**
     * Converts the parameter map into name/value pairs for URL encoding.
     *
     * @param paramsMap parameters to convert; may be {@code null}
     * @return one pair per map entry, or {@code null} when the map is {@code null} or empty
     */
    private static List<NameValuePair> getParamsList_pre(Map<String, String> paramsMap) {
        if (paramsMap == null || paramsMap.isEmpty()) {
            return null;
        }
        List<NameValuePair> params = new ArrayList<>(paramsMap.size());
        for (Map.Entry<String, String> entry : paramsMap.entrySet()) {
            // The "" concatenation turns a null key or value into the literal "null".
            params.add(new NameValuePair(entry.getKey() + "", entry.getValue() + ""));
        }
        return params;
    }
}
2. OdpsUtil爲連接數據庫的工具類
package com.xx.searchRecall.utils;
import com.aliyun.odps.Instance;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.task.SQLTask;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Helper for running SQL against ODPS and extracting values from the returned records.
 */
public class OdpsUtil {
    // Connection parameters for the development database; per the original note they are
    // obtained from https://xx.dw.xx-inc.com/ by clicking the user avatar.
    // NOTE(review): credentials are hard-coded in source — consider loading them from the
    // environment or a config file instead.
    private static final String accessId = "xx";
    private static final String accessKey = "xx";
    private static final String odpsUrl = "http://xx.com/api";
    // "xx" is the ODPS dev (test) project; the production project is "xx".
    private static final String project = "xx";

    /**
     * Runs {@code sql} using the credentials configured in this class.
     *
     * @param sql the SQL statement to execute
     * @return the result records; an empty list when an {@link OdpsException} occurred
     */
    public static List<Record> getSQLResult(String sql) {
        return getSQLResult(sql, accessId, accessKey);
    }

    /**
     * Runs {@code sql} using caller-supplied credentials against the configured endpoint and project.
     *
     * @param sql           the SQL statement to execute
     * @param accessSelfId  access id used to build the account
     * @param accessSelfKey access key used to build the account
     * @return the result records; an empty list when an {@link OdpsException} occurred
     */
    public static List<Record> getSQLResult(String sql, String accessSelfId, String accessSelfKey) {
        Account account = new AliyunAccount(accessSelfId, accessSelfKey);
        Odps odps = new Odps(account);
        odps.setEndpoint(odpsUrl);
        odps.setDefaultProject(project);

        List<Record> records = new ArrayList<>();
        try {
            Instance instance = SQLTask.run(odps, sql);
            instance.waitForSuccess();
            records = SQLTask.getResult(instance);
        } catch (OdpsException e) {
            // Best effort: the failure is logged and the caller receives the empty list.
            e.printStackTrace();
        }
        return records;
    }

    /**
     * Collects the first column of every record as a string.
     *
     * @param list records to read; may be {@code null} or empty
     * @return the first-column values in record order; empty when {@code list} is {@code null} or empty
     */
    public static List<String> record2wordList(List<Record> list) {
        List<String> listFile = new ArrayList<>();
        if (list == null) {
            return listFile;
        }
        for (Record record : list) {
            listFile.add(record.getString(0));
        }
        return listFile;
    }
}
3. RunProcess爲程序入口方法,傳入要查詢的SQL,調用odps工具類按行讀取數據源(top1000的query)
package com.xx.searchRecall;
import com.xx.searchRecall.utils.OdpsUtil;
import com.xx.searchRecall.utils.TimeTransfer;
import com.xx.searchRecall.utils.logOnlineReadODPS;
import com.xx.odps.data.Record;
import java.text.SimpleDateFormat;
import java.util.List;
/**
 * Program entry point: runs the person-card query SQL on ODPS, feeds the resulting records
 * to {@code logOnlineReadODPS.startSearch} and prints start/end timestamps and the elapsed time.
 */
public class RunProcess {
    private static String accessId = "xx";
    private static String accessKey = "xx";

    // SQL for the person-card queries in the date window; the column alias f1 is what
    // the downstream search step reads from each record.
    private static final String PERSON_CARD_SQL =
            "SELECT t0t.query AS f1 FROM( \n\n"
            + "SELECT ftbl_1t.type AS type\n"
            + " , ftbl_1t.xx AS xx\n"
            + " , ftbl_1t.xx AS xx\n"
            + " , ftbl_1t.ctr AS ctr\n"
            + " , ftbl_1t.xx AS xx\n"
            + " , ftbl_1t.xx AS xx\n"
            + " , ftbl_1t.xx AS xx\n"
            + " , ftbl_1t.vv AS vv\n"
            + " , ftbl_1t.ts AS ts\n"
            + " , ftbl_1t.xx AS xx\n"
            + " , ftbl_1t.query AS query\n"
            + " , ftbl_1t.ds AS ds\n"
            + "FROM xx.xx ftbl_1t\n"
            + "\n\n"
            + " )t0t WHERE ((t0t.ds >= '20200419') AND(t0t.ds < '20200519')) AND(t0t.type = '人物卡片') ORDER BY TO_DATE(t0t.ds,'yyyymmdd') DESC LIMIT 1000;";

    /**
     * Executes the query, replays the results and reports timing.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // NOTE(review): the value is never used; the call is kept because TimeTransfer is not
        // visible here and may have side effects — confirm before removing.
        String currentDay = TimeTransfer.getCurrentTime();

        // User-defined date/time pattern for the log timestamps.
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");

        long startTime = System.currentTimeMillis();
        // The millisecond timestamp is formatted directly.
        System.out.println("===============查詢客戶端傳參日誌SQL開始執行了,startTimeRun爲=================【"
                + timestampFormat.format(startTime) + "】");

        List<Record> list = OdpsUtil.getSQLResult(PERSON_CARD_SQL, accessId, accessKey);
        logOnlineReadODPS.startSearch(list);

        long endTime = System.currentTimeMillis();
        System.out.println("==========讀取odps當前日期的傳參日誌完畢,endTimeRun爲===========【"
                + timestampFormat.format(endTime) + "】");

        // Whole seconds spent between connecting and finishing the replay.
        long elapsedSeconds = (endTime - startTime) / 1000;
        System.out.println("==========從連接到讀取數據庫日誌的時長,ReadTime爲===========【" + elapsedSeconds + "】秒");
        System.err.println("list.size=" + list.size());
    }
}
4. ReadFiles爲讀取本地數據的方法(百度的一段代碼),本地文件格式:以","分隔,例如:
趙露思,周星馳,陳芊芊,林正英,迪麗熱巴,楊爍,劉德華,吳亦凡
package com.alibaba.searchRecall.utils;
import java.io.*;
import java.util.Arrays;
/**
 * Reads a local, comma-separated keyword file.
 */
public class ReadFiles {
    /**
     * Reads a UTF-8 text file and returns its comma-separated tokens.
     *
     * <p>Every line is echoed to standard output, lines are joined with "," and the result is
     * split on ",". Problems are reported on the console rather than thrown.
     *
     * @param filePath path of the file to read
     * @return the tokens in file order; an empty array when the file does not exist, cannot
     *         be read or contains no content (previously a single empty string was returned)
     */
    public static String[] readTxt(String filePath) {
        StringBuilder builder = new StringBuilder();
        try {
            File file = new File(filePath);
            if (!file.isFile()) {
                System.out.println("文件不存在!");
                return new String[0];
            }
            // try-with-resources closes the reader even when reading fails half-way.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), "utf-8"))) {
                String lineTxt;
                while ((lineTxt = br.readLine()) != null) {
                    System.out.println(lineTxt);
                    builder.append(lineTxt).append(",");
                }
            }
        } catch (Exception e) {
            // Best effort: keep whatever was read so far, but do not hide the cause.
            System.out.println("文件讀取錯誤!");
            e.printStackTrace();
        }
        if (builder.length() == 0) {
            // "".split(",") would yield one empty token, i.e. a bogus empty keyword.
            return new String[0];
        }
        return builder.toString().split(",");
    }

    /**
     * Prints the tokens of a keyword file.
     *
     * @param args optional: {@code args[0]} overrides the default file path
     */
    public static void main(String[] args) {
        String filePath = args != null && args.length > 0
                ? args[0]
                : "/Users/xx/searchRecall/utils/person.txt";
        System.out.println(filePath);
        String[] strings = readTxt(filePath);
        System.out.println("strings:" + Arrays.toString(strings));
    }
}
5. logOnlineReadODPS:從數據庫工具類取到源數據後請求接口,並解析接口返回
package com.xx.searchRecall.utils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.aliyun.odps.data.Record;
import java.util.*;
/**
 * Replays queries loaded from ODPS against the engine endpoint and reports how many of them
 * recall doc_source 3, doc_source 12, both, or neither.
 */
public class logOnlineReadODPS {
    // Classification codes returned by responseToParse; they double as array indexes.
    private static final int TYPE_NONE = 0;
    private static final int TYPE_ONLY_3 = 1;
    private static final int TYPE_ONLY_12 = 2;
    private static final int TYPE_BOTH = 3;

    // Unused by this class; kept because both fields are public.
    public static List<String> list = new ArrayList<>();
    public static JSONObject jsonObject = new JSONObject();

    public static void main(String[] args) {
        // Intentionally empty: startSearch needs the records supplied by the caller.
    }

    /**
     * Requests the engine endpoint once per record and prints counts, percentages and the
     * matching queries for each of the four classifications.
     *
     * @param list records whose column {@code f1} holds the search keyword (this parameter
     *             shadows the static field of the same name); {@code null} is treated as empty
     */
    public static void startSearch(List<Record> list) {
        // Loop-invariant endpoint; the keyword is appended by doGet as a query parameter.
        final String url_pre = "http://xx/query?noqc=0&xx=xx&pg=1&nocache=1&sdkver=xx";
        final int totalCount = list == null ? 0 : list.size();

        int[] counts = new int[4];
        StringBuilder[] queries = new StringBuilder[4];
        for (int t = 0; t < queries.length; t++) {
            queries[t] = new StringBuilder();
        }

        for (int i = 0; i < totalCount; i++) {
            // Keyword of the current record.
            String keywords = list.get(i).getString("f1");
            Map<String, String> query = new HashMap<>();
            query.put("keyword", keywords);
            System.out.println("第" + (i + 1) + "條數據==" + query);

            String response = HTTPCommonMethod.doGet(url_pre, query, i);
            int type = responseToParse(i, keywords, parseResponse(response));
            counts[type]++;
            // Trailing comma per entry, same output shape as before.
            queries[type].append(keywords).append(",");
        }

        String separator = "------------------------------------------------------------------------------------------------";
        System.out.println("totalCount==" + totalCount);
        System.out.println(separator);
        System.out.println(separator + "g");
        System.out.println("only3---只召回自頻道==【" + counts[TYPE_ONLY_3] + "】---比率爲==【" + percent(counts[TYPE_ONLY_3], totalCount) + "】---query3==【" + queries[TYPE_ONLY_3] + "】");
        System.out.println("only12---只召回人物==【" + counts[TYPE_ONLY_12] + "】---比率爲==【" + percent(counts[TYPE_ONLY_12], totalCount) + "】---query12==【" + queries[TYPE_ONLY_12] + "】");
        System.out.println("both3-12---同時召回自頻道和人物==【" + counts[TYPE_BOTH] + "】---比率爲==【" + percent(counts[TYPE_BOTH], totalCount) + "】---query3_12==【" + queries[TYPE_BOTH] + "】");
        System.out.println("no3-12---均未召回自頻道和人物==【" + counts[TYPE_NONE] + "】---比率爲==【" + percent(counts[TYPE_NONE], totalCount) + "】---queryNO3_12==【" + queries[TYPE_NONE] + "】");
    }

    /**
     * Classifies one engine response by the {@code doc_source} values found under {@code resultList}.
     *
     * @param count    index of the record, used only in the log line
     * @param query    the search keyword, used only in the log line
     * @param response parsed engine response; {@code null} or empty is reported and counts as "neither"
     * @return 1: only doc_source 3; 2: only doc_source 12; 3: both 3 and 12; 0: neither
     *         (also returned when the response is empty or cannot be read)
     */
    public static int responseToParse(int count, String query, JSONObject response) {
        if (response == null || response.isEmpty()) {
            System.err.println("第【" + count + "】條日誌,搜索query爲==【" + query + "】,接口返回爲空");
            return TYPE_NONE;
        }
        boolean docSource3 = false;
        boolean docSource12 = false;
        try {
            JSONArray jsonArray = response.getJSONArray("resultList");
            int size = jsonArray == null ? 0 : jsonArray.size();
            for (int i = 0; i < size; i++) {
                JSONObject item = jsonArray.getJSONObject(i);
                Integer docSource = item == null ? null : item.getInteger("doc_source");
                if (docSource == null) {
                    // An entry without doc_source is neither 3 nor 12; it must not abort the scan.
                    continue;
                }
                if (docSource.intValue() == 3) {
                    docSource3 = true;
                } else if (docSource.intValue() == 12) {
                    docSource12 = true;
                }
            }
        } catch (Exception e) {
            // Unreadable structure: same outcome as before, the query counts as "neither".
            e.printStackTrace();
            return TYPE_NONE;
        }
        if (docSource3 && docSource12) {
            return TYPE_BOTH;
        }
        if (docSource3) {
            return TYPE_ONLY_3;
        }
        return docSource12 ? TYPE_ONLY_12 : TYPE_NONE;
    }

    /** Parses the response body; returns null for a null/empty body or malformed JSON. */
    private static JSONObject parseResponse(String response) {
        if (response == null || response.isEmpty()) {
            return null;
        }
        try {
            return JSONObject.parseObject(response);
        } catch (RuntimeException e) {
            // One malformed body must not abort the whole replay.
            e.printStackTrace();
            return null;
        }
    }

    /** Formats part/total as a percentage string; a zero total yields "0.0%" instead of "NaN%". */
    private static String percent(int part, int total) {
        float value = total == 0 ? 0f : ((float) part / (float) total) * 100;
        return value + "%";
    }
}
6. logOnlineReadFiles:讀取本地數據源後請求接口,並解析接口返回
package com.alibaba.searchRecall.utils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.util.*;
import static com.alibaba.searchRecall.utils.ReadFiles.readTxt;
/**
 * Replays keywords read from a local file against the engine endpoint and reports how many
 * of them recall doc_source 3, doc_source 12, both, or neither.
 */
public class logOnlineReadFiles {
    // Classification codes returned by responseToParse; they double as array indexes.
    private static final int TYPE_NONE = 0;
    private static final int TYPE_ONLY_3 = 1;
    private static final int TYPE_ONLY_12 = 2;
    private static final int TYPE_BOTH = 3;

    // Unused by this class; kept because both fields are public.
    public static List<String> list = new ArrayList<>();
    public static JSONObject jsonObject = new JSONObject();

    public static void main(String[] args) {
        startSearch();
    }

    /**
     * Reads the keyword file, requests the engine endpoint once per non-blank keyword and
     * prints counts, percentages and the matching queries for each of the four classifications.
     */
    public static void startSearch() {
        // Approach 1: the top queries were exported from ODPS to a local file and are replayed from there.
        // Approach 2 (reading ODPS directly) lives in logOnlineReadODPS.
        String filePath = "/Users/lishan/Desktop/xx/xx/src/main/java/com/xx/searchRecall/person.txt";
        System.out.println(filePath);
        String[] keywords = readTxt(filePath);
        System.out.println("strings:" + Arrays.toString(keywords));

        // Loop-invariant endpoint; the keyword is appended by doGet as a query parameter.
        final String url_pre = "http://xx/query?noqc=0&pg=1&nocache=1&xx=308";

        int[] counts = new int[4];
        StringBuilder[] queries = new StringBuilder[4];
        for (int t = 0; t < queries.length; t++) {
            queries[t] = new StringBuilder();
        }

        int totalCount = 0;
        for (int i = 0; i < keywords.length; i++) {
            String keyword = keywords[i];
            if (keyword == null || keyword.trim().isEmpty()) {
                // Blank tokens (empty file, blank lines, doubled commas) are not real queries.
                continue;
            }
            Map<String, String> query = new HashMap<>();
            query.put("keyword", keyword);
            System.out.println("第" + (i + 1) + "條數據==" + query);

            String response = HTTPCommonMethod.doGet(url_pre, query, i);
            int type = responseToParse(i, keyword, parseResponse(response));
            counts[type]++;
            // Trailing comma per entry, same output shape as before.
            queries[type].append(keyword).append(",");
            totalCount++;
        }

        String separator = "------------------------------------------------------------------------------------------------";
        System.out.println("totalCount==" + totalCount);
        System.out.println(separator);
        System.out.println(separator + "g");
        // Rates are printed as percentages, matching logOnlineReadODPS.
        System.out.println("only3---只召回自頻道==【" + counts[TYPE_ONLY_3] + "】---比率爲==【" + percent(counts[TYPE_ONLY_3], totalCount) + "】---query3==【" + queries[TYPE_ONLY_3] + "】");
        System.out.println("only12---只召回人物==【" + counts[TYPE_ONLY_12] + "】---比率爲==【" + percent(counts[TYPE_ONLY_12], totalCount) + "】---query12==【" + queries[TYPE_ONLY_12] + "】");
        System.out.println("both3-12---同時召回自頻道和人物==【" + counts[TYPE_BOTH] + "】---比率爲==【" + percent(counts[TYPE_BOTH], totalCount) + "】---query3_12==【" + queries[TYPE_BOTH] + "】");
        System.out.println("no3-12---均未召回自頻道和人物==【" + counts[TYPE_NONE] + "】---比率爲==【" + percent(counts[TYPE_NONE], totalCount) + "】---queryNO3_12==【" + queries[TYPE_NONE] + "】");
    }

    /**
     * Classifies one engine response by the {@code doc_source} values found under {@code resultList}.
     *
     * @param count    index of the keyword, used only in the log line
     * @param query    the search keyword, used only in the log line
     * @param response parsed engine response; {@code null} or empty is reported and counts as "neither"
     * @return 1: only doc_source 3; 2: only doc_source 12; 3: both 3 and 12; 0: neither
     *         (also returned when the response is empty or cannot be read)
     */
    public static int responseToParse(int count, String query, JSONObject response) {
        if (response == null || response.isEmpty()) {
            System.err.println("第【" + count + "】條日誌,搜索query爲==【" + query + "】,接口返回爲空");
            return TYPE_NONE;
        }
        boolean docSource3 = false;
        boolean docSource12 = false;
        try {
            JSONArray jsonArray = response.getJSONArray("resultList");
            int size = jsonArray == null ? 0 : jsonArray.size();
            for (int i = 0; i < size; i++) {
                JSONObject item = jsonArray.getJSONObject(i);
                Integer docSource = item == null ? null : item.getInteger("doc_source");
                if (docSource == null) {
                    // An entry without doc_source is neither 3 nor 12; it must not abort the scan.
                    continue;
                }
                if (docSource.intValue() == 3) {
                    docSource3 = true;
                } else if (docSource.intValue() == 12) {
                    docSource12 = true;
                }
            }
        } catch (Exception e) {
            // Unreadable structure: same outcome as before, the query counts as "neither".
            e.printStackTrace();
            return TYPE_NONE;
        }
        if (docSource3 && docSource12) {
            return TYPE_BOTH;
        }
        if (docSource3) {
            return TYPE_ONLY_3;
        }
        return docSource12 ? TYPE_ONLY_12 : TYPE_NONE;
    }

    /** Parses the response body; returns null for a null/empty body or malformed JSON. */
    private static JSONObject parseResponse(String response) {
        if (response == null || response.isEmpty()) {
            return null;
        }
        try {
            return JSONObject.parseObject(response);
        } catch (RuntimeException e) {
            // One malformed body must not abort the whole replay.
            e.printStackTrace();
            return null;
        }
    }

    /** Formats part/total as a percentage string; a zero total yields "0.0%" instead of "NaN%". */
    private static String percent(int part, int total) {
        float value = total == 0 ? 0f : ((float) part / (float) total) * 100;
        return value + "%";
    }
}
。。。