C++對中文字符的處理

前言

1.C++的string對中文的查找、替換之類的基本操作並不友好,如果要對中文進行操作,要把中文轉成寬字符(wstring)來解決。因爲在string中每個字符所佔的字節數是不確定的:每個中文字符佔2個字節(例如GBK等多字節編碼下),而字符串中有時還可能有英文、數字等字符,這些只佔1個字節,所以查找時返回的往往是字節偏移,而不是字符實際所在的位置。
2.如果要操作中文字符串,比較好的辦法是先把string轉成wstring,進行查找、匹配等操作之後,再轉回來。
3.這裏我定義了一個類,把它們之間的互相轉換都封裝成了函數。

代碼

Chinese.h

#pragma once
#include <string>
#include <iostream>

/// Helper that converts between narrow (char-based, multi-byte) text and
/// wide (wchar_t-based) text so that Chinese strings can be searched one
/// character at a time. The class declares no data members.
class Chinese
{
public:
	Chinese();
	~Chinese();

	// ---- std::string <-> std::wstring -------------------------------

	/// string -> wstring.
	std::wstring strToWstr(std::string& input);

	/// wstring -> string.
	std::string wstrToStr(std::wstring& wstr);

	// ---- raw, caller-supplied buffers -------------------------------

	/// char* -> wchar_t*: converts the NUL-terminated `str` into `buff`
	/// and returns `buff`.
	wchar_t* MBCSToUnicode(wchar_t* buff, const char* str);

	/// wchar_t* -> char*: converts the NUL-terminated `str` into `buff`
	/// and returns `buff`.
	char* unicodeToMBCS(char* buff, const wchar_t* str);

	// ---- wide text -> char* -----------------------------------------

	/// Converts a NUL-terminated wide string to a char*. The lifetime and
	/// ownership of the returned pointer are decided by the definition in
	/// Chinese.cpp.
	char* wstrToChar(const wchar_t* wstr);

	/// Same conversion, taking the wide text as a std::wstring.
	char* wstrToChar(std::wstring& wstr);
};

Chinese.cpp

#include "Chinese.h"

// Default constructor: performs no initialisation work.
Chinese::Chinese() {}

// Destructor: performs no clean-up work.
Chinese::~Chinese() {}

// Converts a NUL-terminated double-byte (MBCS) string into a NUL-terminated
// wide string, producing exactly one wchar_t per character.
//
// Every byte with its high bit set is treated as the lead byte of a two-byte
// character. The pair is packed into a single wchar_t with the lead byte in
// bits 0-7 and the trail byte in bits 8-15, i.e. the value obtained by
// reading the two bytes as a little-endian 16-bit integer. The value is
// built with explicit byte arithmetic instead of a pointer cast, so there is
// no unaligned or type-punned load and no bytes beyond the pair are read
// when wchar_t is wider than 16 bits.
//
// NOTE: the result is the packed multi-byte code, not a real Unicode code
// point; it is only meant for per-character searching and for converting
// back with unicodeToMBCS.
//
// buff: destination buffer; needs room for strlen(str) + 1 wchar_t elements.
// str:  NUL-terminated source string.
// Returns buff.
wchar_t* Chinese::MBCSToUnicode(wchar_t* buff, const char* str)
{
	wchar_t* out = buff;
	// Work on unsigned bytes so that values >= 0x80 are not sign-extended.
	const unsigned char* in = reinterpret_cast<const unsigned char*>(str);
	while (*in)
	{
		if ((*in & 0x80) && in[1] != 0)
		{
			// Two-byte character: lead byte low, trail byte high.
			*out = static_cast<wchar_t>(in[0] | (in[1] << 8));
			in += 2;
		}
		else
		{
			// Single-byte character, or a dangling lead byte directly in
			// front of the terminator. Consume one byte only, so the scan
			// can never step over the terminating NUL.
			*out = static_cast<wchar_t>(*in);
			++in;
		}
		++out;
	}
	*out = 0;
	return buff;
}

// Converts a NUL-terminated wide string into a NUL-terminated char string.
//
// For every wide character the first byte of its in-memory representation
// is copied; when bits 8-15 of the character are non-zero the second byte
// of its in-memory representation is copied as well. Intended as the
// counterpart of MBCSToUnicode (byte order follows the host's memory
// layout).
//
// buff: destination buffer; up to two bytes are written per wide character,
//       plus one terminating NUL.
// str:  NUL-terminated wide source string.
// Returns buff.
char* Chinese::unicodeToMBCS(char* buff, const wchar_t* str)
{
	char* out = buff;
	for (const wchar_t* cur = str; *cur != 0; ++cur)
	{
		// View the current wide character as raw bytes in memory order.
		const char* raw = reinterpret_cast<const char*>(cur);

		// The first byte is always emitted.
		*out++ = raw[0];

		// The second byte is emitted only for "double-byte" characters.
		if (*cur & 0xFF00)
		{
			*out++ = raw[1];
		}
	}
	*out = 0x00;
	return buff;
}

// Converts a narrow multi-byte string to a wide string via MBCSToUnicode.
//
// The scratch space is a std::wstring rather than a malloc'ed array, so an
// allocation failure is reported as std::bad_alloc instead of a write
// through a null pointer, and nothing leaks if an exception is thrown.
//
// input: narrow source text; conversion stops at the first NUL byte.
// Returns the converted wide string.
std::wstring Chinese::strToWstr(std::string &input)
{
	// At most one wide character is produced per input byte, plus the
	// terminator written by MBCSToUnicode.
	std::wstring wide(input.size() + 1, L'\0');
	MBCSToUnicode(&wide[0], input.c_str());

	// Shrink to the characters actually written (up to the first NUL).
	wide.resize(std::char_traits<wchar_t>::length(wide.c_str()));
	return wide;
}

// Convenience overload: forwards the wide string's NUL-terminated data to
// the const wchar_t* overload and returns that overload's result unchanged.
char* Chinese::wstrToChar(std::wstring &wstr)
{
	return wstrToChar(wstr.c_str());
}

// Converts a NUL-terminated wide string into a newly allocated,
// NUL-terminated char string.
//
// The buffer is allocated with malloc and handed to the caller, who must
// release it with free(). It must not be freed here: doing so would return
// a dangling pointer to the caller.
//
// wstr: NUL-terminated wide source string.
// Returns the converted string, or a null pointer if the allocation failed.
char* Chinese::wstrToChar(const wchar_t* wstr)
{
	// size_t avoids narrowing the length returned by wcslen.
	const size_t len = wcslen(wstr);

	// unicodeToMBCS writes at most two bytes per wide character, plus the
	// terminating NUL.
	char* buff = static_cast<char*>(malloc((len * 2 + 1) * sizeof(char)));
	if (!buff)
	{
		// Allocation failed: hand the null pointer back to the caller.
		return buff;
	}
	return unicodeToMBCS(buff, wstr);
}

// Converts a wide string back to a narrow string via unicodeToMBCS.
//
// The scratch space is a std::string rather than a malloc'ed array, so an
// allocation failure is reported as std::bad_alloc instead of a write
// through a null pointer, and nothing leaks if an exception is thrown.
//
// wstr: wide source text; conversion stops at the first NUL character.
// Returns the converted narrow string.
std::string Chinese::wstrToStr(std::wstring &wstr)
{
	// unicodeToMBCS writes at most two bytes per wide character, plus the
	// terminating NUL.
	std::string narrow(2 * wstr.size() + 1, '\0');
	unicodeToMBCS(&narrow[0], wstr.c_str());

	// Shrink to the bytes actually written (up to the first NUL).
	narrow.resize(std::char_traits<char>::length(narrow.c_str()));
	return narrow;
}

main.cpp

#include <iostream>
#include <string>
#include "Chinese.h"

int main()
{
	//輸入層:接收char*輸入,並將其轉換爲wchar*
	std::string input = "於老師的k父親王老爺子是蒙古的海軍司令!yes";
	std::string temp = "王";
	
	Chinese ch;
	std::wstring w_str = ch.strToWstr(input);
	std::wstring w_tem = ch.strToWstr(temp);
	int index = w_str.find(w_tem);
	std::cout << index << std::endl;
	
	return 0;
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章