C++爬虫原理(五):编码和解码URL_UTF-8
C++爬虫原理(五):编码和解码URL(UTF-8方式)。网上大多数示例是ANSI方式的编码,这里给出UTF-8方式的UrlEncode编码/UrlDecode解码:
一个CString版的代码如下(项目需要随手写了一个):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
// code by: cplusplus.me
//
// Percent-encodes `str` for use in a URL, using UTF-8 as the byte encoding.
//
// The input is interpreted as text in the system ANSI code page (CP_ACP),
// widened to UTF-16 with MultiByteToWideChar, and then re-encoded as UTF-8
// with WideCharToMultiByte. Every UTF-8 byte that is not an ASCII letter or
// digit is emitted as "%XX" with two upper-case hex digits, so a space
// becomes "%20" and each byte of a multi-byte character gets its own escape.
//
// Because the buffer is handed to MultiByteToWideChar as a narrow string,
// this only builds when CString is a narrow (ANSI/MBCS) string.
//
// Returns the encoded string, or an empty string if any conversion fails.
// The conversion runs up to the first NUL character of `str`.
CString CTestDlg::URLEncode(CString str)
{
    LPCTSTR source = str;

    // Step 1: ANSI -> UTF-16. With a source length of -1 the reported size
    // includes the terminating NUL, so a result <= 0 always means failure.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, source, -1, NULL, 0);
    if (wideLen <= 0)
        return CString();

    wchar_t *wide = new wchar_t[wideLen];
    if (MultiByteToWideChar(CP_ACP, 0, source, -1, wide, wideLen) <= 0)
    {
        delete[] wide;
        return CString();
    }

    // Step 2: UTF-16 -> UTF-8.
    const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
    if (utf8Len <= 0)
    {
        delete[] wide;
        return CString();
    }

    unsigned char *utf8 = new unsigned char[utf8Len];
    const int written = WideCharToMultiByte(CP_UTF8, 0, wide, -1,
                                            reinterpret_cast<LPSTR>(utf8), utf8Len, NULL, NULL);
    delete[] wide; // the UTF-16 copy is no longer needed
    if (written <= 0)
    {
        delete[] utf8;
        return CString();
    }

    // Step 3: escape byte by byte. `written` counts the trailing NUL, which
    // must not be encoded, hence the "- 1". A signed index is used so that
    // the bound can never wrap around.
    static const char kHexDigits[] = "0123456789ABCDEF";
    CString encoded;
    for (int i = 0; i < written - 1; ++i)
    {
        const unsigned char octet = utf8[i];

        // Explicit ASCII ranges: isalnum() depends on the C locale and could
        // accept bytes >= 0x80 that belong to a multi-byte UTF-8 sequence.
        const bool isAsciiAlnum = (octet >= '0' && octet <= '9') ||
                                  (octet >= 'A' && octet <= 'Z') ||
                                  (octet >= 'a' && octet <= 'z');
        if (isAsciiAlnum)
        {
            encoded += static_cast<char>(octet);
        }
        else
        {
            // Always two hex digits, so bytes below 0x10 (e.g. tab -> "%09")
            // still produce a valid escape; space naturally becomes "%20".
            encoded += '%';
            encoded += kHexDigits[octet >> 4];
            encoded += kHexDigits[octet & 0x0F];
        }
    }

    delete[] utf8;
    return encoded;
}
另外一个实用、靠谱的版本如下(代码出处已不可考):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#include <string> #include <vector> inline BYTE toHex(const BYTE x) { return x>9?x+55:x+48; } std::string WC2UT(const wchar_t* buf) { int len=WideCharToMultiByte(CP_UTF8,0,buf,-1,NULL,0,NULL,NULL); std::vector<char> utf8(len); WideCharToMultiByte(CP_UTF8,0,buf,-1,&utf8[0],len,NULL,NULL); return std::string(&utf8[0]); } std::wstring MB2WC(const char* buf) { int len=MultiByteToWideChar(CP_ACP,0,buf,-1,NULL,0); std::vector<wchar_t> unicode(len); MultiByteToWideChar(CP_ACP,0,buf,-1,&unicode[0],len); return std::wstring(&unicode[0]); } //参数要用指针。 void URLEncode(CString* str) { std::string sln=str->GetBuffer(0); sln=WC2UT(MB2WC(sln.c_str()).c_str()); std::string sOut; for (size_t ix=0;ix<sln.size();ix++) { BYTE buf[4]; memset(buf,0,4); if (isalnum((BYTE)sln[ix])) buf[0]=sln[ix]; else if (isspace((BYTE)sln[ix])) buf[0]='+'; else { buf[0]='%'; buf[1]=toHex((BYTE)sln[ix]>>4); buf[2]=toHex((BYTE)sln[ix]); } sOut+=(char*)buf; } CString out=sOut.c_str(); *str=out; } |
这里仅仅给出了编码方式,解码方式可自行百度。
Copyright:cpp.cloudcpp.com Share、Open- C/C++程序员之家