詞頻計(jì)算部分
原數(shù)據(jù)從mysql中獲取. 我要統(tǒng)計(jì)返回行tag屬性中包含的tag詞頻扛吞。
sql_tags_all = "select tags from wbdata;"
cursor.execute(sql_tags_all)
sql_tags_all_result = cursor.fetchall()
返回?cái)?shù)據(jù)的每一個(gè)row都是一個(gè)微博對(duì)應(yīng)的5個(gè)tag厢钧,由空格分開.
接下來把查詢到的所有行的tag合并
def turn_tags_tostring(sql_result):
outputstring = ""
for row in sql_result:
longlist = row[0].split(" ")
for i in range(len(longlist)-1):
outputstring = outputstring + longlist[i+1] +","
return outputstring
接下來是對(duì)輸出的這個(gè)string進(jìn)行詞頻統(tǒng)計(jì).
這里涉及到后面詞云的生成,因此輸出上有規(guī)范.
已知目標(biāo)輸出格式為一個(gè)list睹逃,包含了所有出現(xiàn)在詞云中的詞語盗扇,每個(gè)詞語是一個(gè)字典,分別為文字部分和大小沉填,樣例為:
[{"text":"德國","size":120},{"text":"motion","size":15},{"text":"forces","size":10}]
因此疗隶,在統(tǒng)計(jì)完詞頻后,要將頻率轉(zhuǎn)化為size.
在這里翼闹,詞云詞語的大小范圍定位10到120.
線性轉(zhuǎn)化方法:
def linear_scale(inputmin,inputmax,outputmin,outputmax,item):
a = float(outputmax-outputmin)/float(inputmax-inputmin)
b = outputmax - a*inputmax
output = a*item +b
return output
接下來就是輸入sting和希望取的前多少位詞語斑鼻,(下面的代碼塊兒都為一個(gè)函數(shù))
def wordscounter(text, n):
wordDict = {}
wordlist =text.split(",")
for word in wordlist:
if word in wordDict:
wordDict[word] = wordDict[word] + 1
else:
wordDict[word] = 1
然后加入了無關(guān)詞語過濾,手動(dòng)去掉高頻無關(guān)詞匯:
removelist = ["秒拍", "視頻", "網(wǎng)頁", "分享", "全文", "鏈接", "00", "01","1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11",
"12", "13", "14", "15","16","17","18","19","20", "21", "22", "23", "24", "25", "26", "27", "28","29", "30","31","100", "20", "40", "50",
"60", "70", "80","90"]//這里添加一個(gè)list去掉無關(guān)但是頻率較高的詞語
for word in removelist:
try:
del wordDict[word.decode("utf-8")]
print "delet", word
except Exception:
pass
接下來使用counter方法來進(jìn)行詞頻統(tǒng)計(jì)猎荠,并取得前n位最頻繁的詞語:
count = Counter(wordDict)
rank = count.most_common()[:n]
接下來要使用最開始定義的線性變換方法來計(jì)算每個(gè)單詞的大小了坚弱,
因?yàn)闀?huì)有出現(xiàn)數(shù)據(jù)量小的時(shí)候,詞頻最高的第一位由于是關(guān)鍵詞把第二位甩得老遠(yuǎn)关摇,做出來的詞云沒有層次不好看荒叶,所以在這里,我設(shè)定詞頻最高的第一個(gè)詞大小恒定位120输虱,從第二名開始到第120名再從110到10進(jìn)行線性變換.
countmax = rank[1][1] //取得最大詞頻
countmin = rank[-1][1] //取得最小詞頻
diclist = []
for item in rank:
rankdic = {}
rankdic['text'] = item[0] //設(shè)定文字部分
rankdic['size'] = functions.linear_scale(countmin,countmax,10,110,item[1])
diclist.append(rankdic) //調(diào)用線性變換方法根據(jù)詞頻得到每個(gè)單詞的大小
diclist[0]['size'] = 120 // 把首詞大小設(shè)為120
print json.dumps(diclist, encoding="UTF-8", ensure_ascii=False) //打印中文檢查結(jié)果
詞云部分
包含以下script
<script src="http://d3js.org/d3.v3.min.js"></script>
<script src="http://d3js.org/topojson.v1.min.js"></script>
<script src="./static/lib/js/d3.layout.cloud.js"></script>
d3.layout.cloud.js在文章最后給出
frequency_list
就是上面返回的格式為[{"text":"text", "size":120},{"text":"text", "size":100}]
的列表.
//獲得數(shù)據(jù)
var frequency_list ={{frequency_list}};
//設(shè)定一個(gè)線性非連貫比例尺來進(jìn)行給不同大小的詞賦顏色.
var color = d3.scale.linear()
.domain([0,1,2,3,4,5,6,10,15,20,100])
.range(["#ddd", "#ccc", "#bbb", "#aaa", "#999", "#888", "#777", "#666", "#555", "#444", "#333", "#222"]);
d3.layout.cloud().size([800, 300])
.words(frequency_list)
.rotate(0)
.fontSize(function(d) { return d.size; })
.on("end", draw)
.start();
function draw(words) {
d3.select("#word_cloud").append("svg")//根據(jù)id選擇父對(duì)象插入svg
.attr("width", "100%")
.attr("height", "100%")
.attr("viewBox","0 0 900 400")
.attr("style", "border: 1px solid black")
.attr("preserveAspectRatio","xMaxYMax meet")
.attr("class", "wordcloud")
.append("g")
.attr("transform", "translate(400,200)")
.selectAll("text")
.data(words)
.enter().append("text")
.style("font-size", function(d) { return d.size + "px"; })
.style("fill", function(d, i) { return color(i); })
.attr("transform", function(d) {
return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
})
.text(function(d) { return d.text; });
}
到此些楣,首圖上的詞云就應(yīng)該可以實(shí)現(xiàn)啦~
:)
d3.layout.cloud.js
// Word cloud layout by Jason Davies, http://www.jasondavies.com/word-cloud/
// Algorithm due to Jonathan Feinberg, http://static.mrfeinberg.com/bv_ch03.pdf
(function(exports) {
function cloud() {
var size = [256, 256],
text = cloudText,
font = cloudFont,
fontSize = cloudFontSize,
fontStyle = cloudFontNormal,
fontWeight = cloudFontNormal,
rotate = cloudRotate,
padding = cloudPadding,
spiral = archimedeanSpiral,
words = [],
timeInterval = Infinity,
event = d3.dispatch("word", "end"),
timer = null,
cloud = {};
cloud.start = function() {
var board = zeroArray((size[0] >> 5) * size[1]),
bounds = null,
n = words.length,
i = -1,
tags = [],
data = words.map(function(d, i) {
d.text = text.call(this, d, i);
d.font = font.call(this, d, i);
d.style = fontStyle.call(this, d, i);
d.weight = fontWeight.call(this, d, i);
d.rotate = rotate.call(this, d, i);
d.size = ~~fontSize.call(this, d, i);
d.padding = padding.call(this, d, i);
return d;
}).sort(function(a, b) { return b.size - a.size; });
if (timer) clearInterval(timer);
timer = setInterval(step, 0);
step();
return cloud;
function step() {
var start = +new Date,
d;
while (+new Date - start < timeInterval && ++i < n && timer) {
d = data[i];
d.x = (size[0] * (Math.random() + .5)) >> 1;
d.y = (size[1] * (Math.random() + .5)) >> 1;
cloudSprite(d, data, i);
if (d.hasText && place(board, d, bounds)) {
tags.push(d);
event.word(d);
if (bounds) cloudBounds(bounds, d);
else bounds = [{x: d.x + d.x0, y: d.y + d.y0}, {x: d.x + d.x1, y: d.y + d.y1}];
// Temporary hack
d.x -= size[0] >> 1;
d.y -= size[1] >> 1;
}
}
if (i >= n) {
cloud.stop();
event.end(tags, bounds);
}
}
}
cloud.stop = function() {
if (timer) {
clearInterval(timer);
timer = null;
}
return cloud;
};
cloud.timeInterval = function(x) {
if (!arguments.length) return timeInterval;
timeInterval = x == null ? Infinity : x;
return cloud;
};
function place(board, tag, bounds) {
var perimeter = [{x: 0, y: 0}, {x: size[0], y: size[1]}],
startX = tag.x,
startY = tag.y,
maxDelta = Math.sqrt(size[0] * size[0] + size[1] * size[1]),
s = spiral(size),
dt = Math.random() < .5 ? 1 : -1,
t = -dt,
dxdy,
dx,
dy;
while (dxdy = s(t += dt)) {
dx = ~~dxdy[0];
dy = ~~dxdy[1];
if (Math.min(dx, dy) > maxDelta) break;
tag.x = startX + dx;
tag.y = startY + dy;
if (tag.x + tag.x0 < 0 || tag.y + tag.y0 < 0 ||
tag.x + tag.x1 > size[0] || tag.y + tag.y1 > size[1]) continue;
// TODO only check for collisions within current bounds.
if (!bounds || !cloudCollide(tag, board, size[0])) {
if (!bounds || collideRects(tag, bounds)) {
var sprite = tag.sprite,
w = tag.width >> 5,
sw = size[0] >> 5,
lx = tag.x - (w << 4),
sx = lx & 0x7f,
msx = 32 - sx,
h = tag.y1 - tag.y0,
x = (tag.y + tag.y0) * sw + (lx >> 5),
last;
for (var j = 0; j < h; j++) {
last = 0;
for (var i = 0; i <= w; i++) {
board[x + i] |= (last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0);
}
x += sw;
}
delete tag.sprite;
return true;
}
}
}
return false;
}
cloud.words = function(x) {
if (!arguments.length) return words;
words = x;
return cloud;
};
cloud.size = function(x) {
if (!arguments.length) return size;
size = [+x[0], +x[1]];
return cloud;
};
cloud.font = function(x) {
if (!arguments.length) return font;
font = d3.functor(x);
return cloud;
};
cloud.fontStyle = function(x) {
if (!arguments.length) return fontStyle;
fontStyle = d3.functor(x);
return cloud;
};
cloud.fontWeight = function(x) {
if (!arguments.length) return fontWeight;
fontWeight = d3.functor(x);
return cloud;
};
cloud.rotate = function(x) {
if (!arguments.length) return rotate;
rotate = d3.functor(x);
return cloud;
};
cloud.text = function(x) {
if (!arguments.length) return text;
text = d3.functor(x);
return cloud;
};
cloud.spiral = function(x) {
if (!arguments.length) return spiral;
spiral = spirals[x + ""] || x;
return cloud;
};
cloud.fontSize = function(x) {
if (!arguments.length) return fontSize;
fontSize = d3.functor(x);
return cloud;
};
cloud.padding = function(x) {
if (!arguments.length) return padding;
padding = d3.functor(x);
return cloud;
};
return d3.rebind(cloud, event, "on");
}
function cloudText(d) {
return d.text;
}
function cloudFont() {
return "serif";
}
function cloudFontNormal() {
return "normal";
}
function cloudFontSize(d) {
return Math.sqrt(d.value);
}
function cloudRotate() {
return (~~(Math.random() * 6) - 3) * 30;
}
function cloudPadding() {
return 1;
}
// Fetches a monochrome sprite bitmap for the specified text.
// Load in batches for speed.
function cloudSprite(d, data, di) {
if (d.sprite) return;
c.clearRect(0, 0, (cw << 5) / ratio, ch / ratio);
var x = 0,
y = 0,
maxh = 0,
n = data.length;
--di;
while (++di < n) {
d = data[di];
c.save();
c.font = d.style + " " + d.weight + " " + ~~((d.size + 1) / ratio) + "px " + d.font;
var w = c.measureText(d.text + "m").width * ratio,
h = d.size << 1;
if (d.rotate) {
var sr = Math.sin(d.rotate * cloudRadians),
cr = Math.cos(d.rotate * cloudRadians),
wcr = w * cr,
wsr = w * sr,
hcr = h * cr,
hsr = h * sr;
w = (Math.max(Math.abs(wcr + hsr), Math.abs(wcr - hsr)) + 0x1f) >> 5 << 5;
h = ~~Math.max(Math.abs(wsr + hcr), Math.abs(wsr - hcr));
} else {
w = (w + 0x1f) >> 5 << 5;
}
if (h > maxh) maxh = h;
if (x + w >= (cw << 5)) {
x = 0;
y += maxh;
maxh = 0;
}
if (y + h >= ch) break;
c.translate((x + (w >> 1)) / ratio, (y + (h >> 1)) / ratio);
if (d.rotate) c.rotate(d.rotate * cloudRadians);
c.fillText(d.text, 0, 0);
if (d.padding) c.lineWidth = 2 * d.padding, c.strokeText(d.text, 0, 0);
c.restore();
d.width = w;
d.height = h;
d.xoff = x;
d.yoff = y;
d.x1 = w >> 1;
d.y1 = h >> 1;
d.x0 = -d.x1;
d.y0 = -d.y1;
d.hasText = true;
x += w;
}
var pixels = c.getImageData(0, 0, (cw << 5) / ratio, ch / ratio).data,
sprite = [];
while (--di >= 0) {
d = data[di];
if (!d.hasText) continue;
var w = d.width,
w32 = w >> 5,
h = d.y1 - d.y0;
// Zero the buffer
for (var i = 0; i < h * w32; i++) sprite[i] = 0;
x = d.xoff;
if (x == null) return;
y = d.yoff;
var seen = 0,
seenRow = -1;
for (var j = 0; j < h; j++) {
for (var i = 0; i < w; i++) {
var k = w32 * j + (i >> 5),
m = pixels[((y + j) * (cw << 5) + (x + i)) << 2] ? 1 << (31 - (i % 32)) : 0;
sprite[k] |= m;
seen |= m;
}
if (seen) seenRow = j;
else {
d.y0++;
h--;
j--;
y++;
}
}
d.y1 = d.y0 + seenRow;
d.sprite = sprite.slice(0, (d.y1 - d.y0) * w32);
}
}
// Use mask-based collision detection.
function cloudCollide(tag, board, sw) {
sw >>= 5;
var sprite = tag.sprite,
w = tag.width >> 5,
lx = tag.x - (w << 4),
sx = lx & 0x7f,
msx = 32 - sx,
h = tag.y1 - tag.y0,
x = (tag.y + tag.y0) * sw + (lx >> 5),
last;
for (var j = 0; j < h; j++) {
last = 0;
for (var i = 0; i <= w; i++) {
if (((last << msx) | (i < w ? (last = sprite[j * w + i]) >>> sx : 0))
& board[x + i]) return true;
}
x += sw;
}
return false;
}
function cloudBounds(bounds, d) {
var b0 = bounds[0],
b1 = bounds[1];
if (d.x + d.x0 < b0.x) b0.x = d.x + d.x0;
if (d.y + d.y0 < b0.y) b0.y = d.y + d.y0;
if (d.x + d.x1 > b1.x) b1.x = d.x + d.x1;
if (d.y + d.y1 > b1.y) b1.y = d.y + d.y1;
}
function collideRects(a, b) {
return a.x + a.x1 > b[0].x && a.x + a.x0 < b[1].x && a.y + a.y1 > b[0].y && a.y + a.y0 < b[1].y;
}
function archimedeanSpiral(size) {
var e = size[0] / size[1];
return function(t) {
return [e * (t *= .1) * Math.cos(t), t * Math.sin(t)];
};
}
function rectangularSpiral(size) {
var dy = 4,
dx = dy * size[0] / size[1],
x = 0,
y = 0;
return function(t) {
var sign = t < 0 ? -1 : 1;
// See triangular numbers: T_n = n * (n + 1) / 2.
switch ((Math.sqrt(1 + 4 * sign * t) - sign) & 3) {
case 0: x += dx; break;
case 1: y += dy; break;
case 2: x -= dx; break;
default: y -= dy; break;
}
return [x, y];
};
}
// TODO reuse arrays?
function zeroArray(n) {
var a = [],
i = -1;
while (++i < n) a[i] = 0;
return a;
}
var cloudRadians = Math.PI / 180,
cw = 1 << 11 >> 5,
ch = 1 << 11,
canvas,
ratio = 1;
if (typeof document !== "undefined") {
canvas = document.createElement("canvas");
canvas.width = 1;
canvas.height = 1;
ratio = Math.sqrt(canvas.getContext("2d").getImageData(0, 0, 1, 1).data.length >> 2);
canvas.width = (cw << 5) / ratio;
canvas.height = ch / ratio;
} else {
// node-canvas support
var Canvas = require("canvas");
canvas = new Canvas(cw << 5, ch);
}
var c = canvas.getContext("2d"),
spirals = {
archimedean: archimedeanSpiral,
rectangular: rectangularSpiral
};
c.fillStyle = c.strokeStyle = "red";
c.textAlign = "center";
exports.cloud = cloud;
})(typeof exports === "undefined" ? d3.layout || (d3.layout = {}) : exports);