将文本拆分为等长的字符串，保持单词完整答案

【问题标题】：Split text into equal length strings keeping words intact（将文本拆分为等长的字符串，保持单词完整）
【发布时间】：2015-12-26 20:08:28
【问题描述】：

我有一段代码，可以在保持单词完整的前提下，把较长的行拆分成等长的字符串数组。它还会处理 [[u;#fff;]some text] 这样的格式标记，并保证拆分后的每个字符串都可以独立地转换为 HTML：

// Matches one formatting span of the shape `[[<spec>]<text>]`.
// Group 1 is the spec: flags drawn from `!gbiuso`, then `;`-separated fields.
// Group 2 is the text, which may contain an escaped `\]`.
// The final closing `]` is optional, so an unterminated span also matches.
var format_re = /\[\[([!gbiuso]*;[^;\]]*;[^;\]]*(?:;|[^\]()]*);?[^\]]*)\]([^\]]*\\\][^\]]*|[^\]]*|[^\[]*\[[^\]]*)\]?/gi;
// Captures (group 1) the opening `[[<spec>]` part of a formatting span.
var format_begin_re = /(\[\[[!gbiuso]*;[^;]*;[^\]]*\])/i;
// Matches a formatting opener `[[<spec>` (its `]` optional) located at the
// very end of a string.
var format_last_re = /\[\[[!gbiuso]*;[^;]*;[^\]]*\]?$/i;
/**
 * Split `str` into chunks of at most `length` counted characters, optionally
 * breaking only at spaces, while re-opening/closing `[[...]...]` formatting
 * so that every chunk carries its own format markup.
 *
 * @param {string} str - text to split; may contain `[[format]text]` spans
 *     and HTML entities
 * @param {number} length - number of counted characters per chunk
 * @param {boolean} words - when truthy, prefer to break at the last seen space
 * @returns {string[]} the chunks; an empty input line yields an empty string
 * @throws {Error} when an `&` is not followed by a `;`-terminated entity
 */
$.terminal.split_equal = function(str, length, words) {
  // formatting: inside a `[[...]...]` span; in_text: past the first `]` of
  // that span, i.e. in its text part. Both persist across input lines.
  var formatting = false;
  var in_text = false;
  // format opener to prepend to the next chunk when a span was cut in two
  var prev_format = '';
  var result = [];
  // add format text as 5th parameter to formatting; it's used for
  // data attribute in format function
  var array = str.replace(format_re, function(_, format, text) {
    var semicolons = format.match(/;/g).length;
    // pad missing semicolons so the appended text lands in the 5th field
    if (semicolons == 2) {
      semicolons = ';;';
    } else if (semicolons == 3) {
      semicolons = ';';
    } else {
      semicolons = '';
    }
    // return '[[' + format + ']' + text + ']';
    // closing bracket will break formatting so we need to escape
    // those using html entity equivalent
    return '[[' + format + semicolons +
      text.replace(/\\\]/g, '&#93;').replace(/\n/g, '\\n') + ']' +
      text + ']';
  }).split(/\n/g);
  for (var i = 0, len = array.length; i < len; ++i) {
    // keep empty lines as empty chunks
    if (array[i] === '') {
      result.push('');
      continue;
    }
    var line = array[i];
    // index in `line` where the chunk being built starts
    var first_index = 0;
    // counted characters in the chunk being built
    var count = 0;
    // index just after the last space seen in text, or -1 if none
    var space = -1;
    for (var j=0, jlen=line.length; j<jlen; ++j) {
      if (line[j] === '[' && line[j+1] === '[') {
        // NOTE(review): only `formatting` is set here; `in_text` keeps
        // whatever value it had before this `[[` - confirm this is intended
        formatting = true;
      } else if (formatting && line[j] === ']') {
        // first `]` ends the spec and starts the text, second `]` ends the span
        if (in_text) {
          formatting = false;
          in_text = false;
        } else {
          in_text = true;
        }
      } else if ((formatting && in_text) || !formatting) {
        if (line[j] === '&') { // treat entity as one character
          var m = line.substring(j).match(/^(&[^;]+;)/);
          if (!m) {
            // should never happen if used by terminal,
            // because it always calls $.terminal.encode
            // before this function
            throw new Error("Unclosed html entity in line " +
                            (i+1) + ' at char ' + (j+1));
          }
          // after `continue` j lands on the entity's final `;`, which is
          // then counted as the single character of the whole entity
          j+=m[1].length-2; // because continue adds 1 to j
          // if entity is at the end there is no next loop
          // issue #77
          // `output` is the function-scoped var declared further down, so
          // here it holds whatever an earlier chunk left in it.
          // NOTE(review): j now points one before the entity's last char,
          // so j === jlen-1 looks unreachable - confirm
          if (j === jlen-1) {
            result.push(output + m[1]);
          }
          continue;
        } else if (line[j] === ']' && line[j-1] === '\\') {
          // escape \] counts as one character
          --count;
        } else {
          ++count;
        }
      }
      // true when the character(s) right before index j are a space,
      // either a literal ' ' or the `&nbsp;` entity
      function is_space() {
        return line.substring(j-6, j) == '&nbsp;' ||
          line.substring(j-1, j) == ' ';
      }
      // remember possible break points, but only inside text
      if (is_space() && ((formatting && in_text) || !formatting)) {
        space = j;
      }
      // emit a chunk when it is full or the line ends (only while in text)
      if ((count === length || j === jlen-1) &&
          ((formatting && in_text) || !formatting)) {
        var output;
        // look from the last space up to one chunk ahead and strip markup
        // via jQuery to see whether there is whitespace to break on
        var after = line.substring(space, j+length+1);
        var text = $('<span>' + after + '</span>').text();
        var can_break = text.match(/\s/);
        if (words && space != -1 && j !== jlen-1 && can_break) {
          // get text to last space
          output = line.substring(first_index, space);
          // rewind so scanning resumes at the remembered space
          j = space-1;
          space = -1;
        } else {
          output = line.substring(first_index, j+1);
        }
        if (words) {
          // drop leading/trailing spaces of the chunk
          output = output.replace(/^(&nbsp;|\s)+|(&nbsp;|\s)+$/g, '');
        }
        first_index = j+1;
        count = 0;
        if (prev_format) {
          // re-open the format that the previous chunk had to cut
          output = prev_format + output;
          // a `]` in the chunk means prev_format is not carried further
          if (output.match(']')) {
            prev_format = '';
          }
        }
        // Fix output if formatting not closed
        var matched = output.match(format_re);
        if (matched) {
          var last = matched[matched.length-1];
          if (last[last.length-1] !== ']') {
            // last span is unterminated: close it here and remember its
            // opener for the next chunk
            prev_format = last.match(format_begin_re)[1];
            output += ']';
          } else if (output.match(format_last_re)) {
            // chunk ends with a bare opener: move it to the next chunk
            var line_len = output.length;
            // why this line ???
            //var f_len = line_len-last[last.length-1].length;
            output = output.replace(format_last_re, '');
            prev_format = last.match(format_begin_re)[1];
          }
        }
        result.push(output);
      }
    }
  }
  return result;
};

它基本可以正常工作，但有些行比应有的长度要短，例如：

is cracker.The term

在这个 FIDDLE 中，当您去掉格式并选中复选框时，它可以正常工作。我已经为此折腾了几个小时，仍然不知道为什么这一行会变短，非常感谢任何帮助。

【问题讨论】：

如果text 在下一行换行，[[u;#fff;]some text] 会发生什么？你会重复像'[[u;#fff;]some]', '[[u;#fff;]text]'这样的格式吗？
@WouterHuysentruit 是的。
@Valijon 你试过这个 fiddle（jsfiddle.net/bjrdamgg/2）吗？您需要引入 jQuery 和 jQuery Terminal。
Chrome 可以，FF 不行
是否需要支持嵌套格式[[u]like [[#fff]this] example]？

标签： javascript jquery

【解决方案1】：

以下是修复原始代码的方法：

在第 40 行之后添加以下内容：

in_text = false;

代码使用 in_text 标志来判断当前位置是否处于常规文本中。但是，当它进入格式标记区域时，并没有清除该标志。这正是问题中所描述的主要问题（某些行过短）的原因。

将第 76/77 行的 if 语句更改为：

if (is_space() && ((formatting && in_text) || !formatting || (line[j] === '[' && line[j+1] === '['))) {

这解决了一个较小的问题：当空格位于常规文本和格式化文本之间时，不会在该空格处换行。

在这里工作：https://jsfiddle.net/2w10xp3m/1/

【讨论】：

非常感谢您在这方面的帮助，我们将在 22 小时后奖励赏金。
是的，我刚刚注意到，这就是我删除评论的原因。
@jcubic 我在答案中添加了一些解释以供将来参考。

【解决方案2】：

我想我已经用一种更简单的方法解决了这个问题。首先分解所有单词，然后在跟踪当前格式的同时重新组合行。见JsFiddle。

JavaScript

/**
 * Split text into lines of at most `length` visible characters without
 * breaking words. The active `[[...]` format is re-applied to every word so
 * that each returned line can be rendered on its own.
 *
 * Words are delimited by the `&nbsp;` entity in the input and are joined with
 * a plain space in the output. A single word longer than `length` is kept
 * whole on a line of its own.
 *
 * @param {string} str - text whose words are separated by `&nbsp;`
 * @param {number} length - maximum number of visible characters per line
 * @param {boolean} [words] - accepted for interface compatibility only; this
 *     implementation always breaks between words and never inside one
 * @returns {string[]} the assembled lines
 */
$.terminal.split_equal = function(str, length, words) {
  var result = [];
  var currentFormat = null;
  var currentLine = '';
  // visible characters on the current line, format markup excluded
  var currentLineLength = 0;

  // 1. Split words on &nbsp;. The tokens get their own variable so the
  //    caller's `words` flag is not overwritten.
  var tokens = str.split(/&nbsp;/g);

  // 2. Re-assemble lines while keeping track of the current format
  tokens.forEach(function(token) {
    var word = token;
    var wordLength;
    var wordWithFormatting;
    var separatorLength;

    // A word starting with `[[...]` opens a new format
    var opening = word.match(/^\[\[[^\]]+\]/);
    if (opening !== null) {
      currentFormat = opening[0];
      word = word.slice(currentFormat.length);
    }

    // Apply the current format to each word separately
    wordLength = word.length;
    wordWithFormatting = (currentFormat || '') + word;
    if (currentFormat) {
      if (word.indexOf(']') !== -1) {
        // the word carries the closing bracket, which is not visible text
        wordLength--;
        currentFormat = null;
      } else {
        // the format continues on the next word; close it for this one
        wordWithFormatting += ']';
      }
    }

    // The joining space takes one column unless the line is still empty;
    // it must be part of the fit check or a line could exceed `length`.
    separatorLength = currentLineLength > 0 ? 1 : 0;
    if (currentLineLength + separatorLength + wordLength <= length) {
      // Word still fits on current line
      if (separatorLength > 0) {
        currentLine += ' ';
        currentLineLength += separatorLength;
      }
    } else {
      // Need to start new line; skip the push when nothing has been
      // collected yet (over-long word at the start of a line)
      if (currentLine !== '') {
        result.push(currentLine);
      }
      currentLine = '';
      currentLineLength = 0;
    }

    currentLine += wordWithFormatting;
    currentLineLength += wordLength;
  });

  if (currentLineLength > 0) {
    result.push(currentLine);
  }

  return result;
};

【讨论】：

words 是一个禁用自动换行的标志，如果它设置为 false，它可能会在单词中间中断。
您没有在问题中指定这一点。无论如何，这可能是一个简单的修改，我想我可以留给你:)

【解决方案3】：

npm 包paragraph-builder 将连续文本分割成所谓的段落，这些段落均匀分布，并且在字数上大致相同。这个段落的概念似乎就是你要搜索的。

您可以定义段落的字数。考虑到每页平均包含的字符数（含空格）大致相同，您还可以把段落的原则推广到页面。

这个 paragraph-builder Node 脚本从连续文本生成段落。它输出的文本中每个段落的大小大致相同，从而使段落在文本中均匀分布。它不会在“1.2”这样的数字处拆分文本。

有一个选项可以定义段落之间的分隔符，或者您可以将段落提取到一个字符串数组中，您可以从中应用 html 标记 <p>。检查其文档以获得进一步说明。

【讨论】：